aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--compiler/parser/error.go36
-rw-r--r--compiler/parser/fragment.go72
-rw-r--r--compiler/parser/lexer.go594
-rw-r--r--compiler/parser/lexer_test.go524
-rw-r--r--compiler/parser/parser.go531
-rw-r--r--compiler/parser/parser_test.go1389
-rw-r--r--compiler/parser/tree.go459
-rw-r--r--src/tre.go1661
-rw-r--r--tests/tre.go1904
9 files changed, 3565 insertions, 3605 deletions
diff --git a/compiler/parser/error.go b/compiler/parser/error.go
deleted file mode 100644
index be81da4..0000000
--- a/compiler/parser/error.go
+++ /dev/null
@@ -1,36 +0,0 @@
-package parser
-
-import "fmt"
-
-var (
- ParseErr = fmt.Errorf("parse error")
-
- // lexical errors
- synErrIncompletedEscSeq = fmt.Errorf("incompleted escape sequence; unexpected EOF following \\")
- synErrInvalidEscSeq = fmt.Errorf("invalid escape sequence")
- synErrInvalidCodePoint = fmt.Errorf("code points must consist of just 4 or 6 hex digits")
- synErrCharPropInvalidSymbol = fmt.Errorf("invalid character property symbol")
- SynErrFragmentInvalidSymbol = fmt.Errorf("invalid fragment symbol")
-
- // syntax errors
- synErrUnexpectedToken = fmt.Errorf("unexpected token")
- synErrNullPattern = fmt.Errorf("a pattern must be a non-empty byte sequence")
- synErrUnmatchablePattern = fmt.Errorf("a pattern cannot match any characters")
- synErrAltLackOfOperand = fmt.Errorf("an alternation expression must have operands")
- synErrRepNoTarget = fmt.Errorf("a repeat expression must have an operand")
- synErrGroupNoElem = fmt.Errorf("a grouping expression must include at least one character")
- synErrGroupUnclosed = fmt.Errorf("unclosed grouping expression")
- synErrGroupNoInitiator = fmt.Errorf(") needs preceding (")
- synErrGroupInvalidForm = fmt.Errorf("invalid grouping expression")
- synErrBExpNoElem = fmt.Errorf("a bracket expression must include at least one character")
- synErrBExpUnclosed = fmt.Errorf("unclosed bracket expression")
- synErrBExpInvalidForm = fmt.Errorf("invalid bracket expression")
- synErrRangeInvalidOrder = fmt.Errorf("a range expression with invalid order")
- synErrRangePropIsUnavailable = fmt.Errorf("a property expression is unavailable in a range expression")
- synErrRangeInvalidForm = fmt.Errorf("invalid range expression")
- synErrCPExpInvalidForm = fmt.Errorf("invalid code point expression")
- synErrCPExpOutOfRange = fmt.Errorf("a code point must be between U+0000 to U+10FFFF")
- synErrCharPropExpInvalidForm = fmt.Errorf("invalid character property expression")
- synErrCharPropUnsupported = fmt.Errorf("unsupported character property")
- synErrFragmentExpInvalidForm = fmt.Errorf("invalid fragment expression")
-)
diff --git a/compiler/parser/fragment.go b/compiler/parser/fragment.go
deleted file mode 100644
index 5680b55..0000000
--- a/compiler/parser/fragment.go
+++ /dev/null
@@ -1,72 +0,0 @@
-package parser
-
-import (
- "fmt"
-
- "github.com/nihei9/maleeni/spec"
-)
-
-type incompleteFragment struct {
- kind spec.LexKindName
- root *rootNode
-}
-
-func CompleteFragments(fragments map[spec.LexKindName]CPTree) error {
- if len(fragments) == 0 {
- return nil
- }
-
- completeFragments := map[spec.LexKindName]CPTree{}
- incompleteFragments := []*incompleteFragment{}
- for kind, tree := range fragments {
- root, ok := tree.(*rootNode)
- if !ok {
- return fmt.Errorf("CompleteFragments can take only *rootNode: %T", tree)
- }
- if root.incomplete() {
- incompleteFragments = append(incompleteFragments, &incompleteFragment{
- kind: kind,
- root: root,
- })
- } else {
- completeFragments[kind] = root
- }
- }
- for len(incompleteFragments) > 0 {
- lastIncompCount := len(incompleteFragments)
- remainingFragments := []*incompleteFragment{}
- for _, e := range incompleteFragments {
- complete, err := ApplyFragments(e.root, completeFragments)
- if err != nil {
- return err
- }
- if !complete {
- remainingFragments = append(remainingFragments, e)
- } else {
- completeFragments[e.kind] = e.root
- }
- }
- incompleteFragments = remainingFragments
- if len(incompleteFragments) == lastIncompCount {
- return ParseErr
- }
- }
-
- return nil
-}
-
-func ApplyFragments(t CPTree, fragments map[spec.LexKindName]CPTree) (bool, error) {
- root, ok := t.(*rootNode)
- if !ok {
- return false, fmt.Errorf("ApplyFragments can take only *rootNode type: %T", t)
- }
-
- for name, frag := range fragments {
- err := root.applyFragment(name, frag)
- if err != nil {
- return false, err
- }
- }
-
- return !root.incomplete(), nil
-}
diff --git a/compiler/parser/lexer.go b/compiler/parser/lexer.go
deleted file mode 100644
index 3861825..0000000
--- a/compiler/parser/lexer.go
+++ /dev/null
@@ -1,594 +0,0 @@
-package parser
-
-import (
- "bufio"
- "fmt"
- "io"
- "strings"
-)
-
-type tokenKind string
-
-const (
- tokenKindChar tokenKind = "char"
- tokenKindAnyChar tokenKind = "."
- tokenKindRepeat tokenKind = "*"
- tokenKindRepeatOneOrMore tokenKind = "+"
- tokenKindOption tokenKind = "?"
- tokenKindAlt tokenKind = "|"
- tokenKindGroupOpen tokenKind = "("
- tokenKindGroupClose tokenKind = ")"
- tokenKindBExpOpen tokenKind = "["
- tokenKindInverseBExpOpen tokenKind = "[^"
- tokenKindBExpClose tokenKind = "]"
- tokenKindCharRange tokenKind = "-"
- tokenKindCodePointLeader tokenKind = "\\u"
- tokenKindCharPropLeader tokenKind = "\\p"
- tokenKindFragmentLeader tokenKind = "\\f"
- tokenKindLBrace tokenKind = "{"
- tokenKindRBrace tokenKind = "}"
- tokenKindEqual tokenKind = "="
- tokenKindCodePoint tokenKind = "code point"
- tokenKindCharPropSymbol tokenKind = "character property symbol"
- tokenKindFragmentSymbol tokenKind = "fragment symbol"
- tokenKindEOF tokenKind = "eof"
-)
-
-type token struct {
- kind tokenKind
- char rune
- propSymbol string
- codePoint string
- fragmentSymbol string
-}
-
-const nullChar = '\u0000'
-
-func newToken(kind tokenKind, char rune) *token {
- return &token{
- kind: kind,
- char: char,
- }
-}
-
-func newCodePointToken(codePoint string) *token {
- return &token{
- kind: tokenKindCodePoint,
- codePoint: codePoint,
- }
-}
-
-func newCharPropSymbolToken(propSymbol string) *token {
- return &token{
- kind: tokenKindCharPropSymbol,
- propSymbol: propSymbol,
- }
-}
-
-func newFragmentSymbolToken(fragmentSymbol string) *token {
- return &token{
- kind: tokenKindFragmentSymbol,
- fragmentSymbol: fragmentSymbol,
- }
-}
-
-type lexerMode string
-
-const (
- lexerModeDefault lexerMode = "default"
- lexerModeBExp lexerMode = "bracket expression"
- lexerModeCPExp lexerMode = "code point expression"
- lexerModeCharPropExp lexerMode = "character property expression"
- lexerModeFragmentExp lexerMode = "fragment expression"
-)
-
-type lexerModeStack struct {
- stack []lexerMode
-}
-
-func newLexerModeStack() *lexerModeStack {
- return &lexerModeStack{
- stack: []lexerMode{
- lexerModeDefault,
- },
- }
-}
-
-func (s *lexerModeStack) top() lexerMode {
- return s.stack[len(s.stack)-1]
-}
-
-func (s *lexerModeStack) push(m lexerMode) {
- s.stack = append(s.stack, m)
-}
-
-func (s *lexerModeStack) pop() {
- s.stack = s.stack[:len(s.stack)-1]
-}
-
-type rangeState string
-
-// [a-z]
-// ^^^^
-// |||`-- ready
-// ||`-- expect range terminator
-// |`-- read range initiator
-// `-- ready
-const (
- rangeStateReady rangeState = "ready"
- rangeStateReadRangeInitiator rangeState = "read range initiator"
- rangeStateExpectRangeTerminator rangeState = "expect range terminator"
-)
-
-type lexer struct {
- src *bufio.Reader
- peekChar2 rune
- peekEOF2 bool
- peekChar1 rune
- peekEOF1 bool
- lastChar rune
- reachedEOF bool
- prevChar1 rune
- prevEOF1 bool
- prevChar2 rune
- pervEOF2 bool
- modeStack *lexerModeStack
- rangeState rangeState
-
- errCause error
- errDetail string
-}
-
-func newLexer(src io.Reader) *lexer {
- return &lexer{
- src: bufio.NewReader(src),
- peekChar2: nullChar,
- peekEOF2: false,
- peekChar1: nullChar,
- peekEOF1: false,
- lastChar: nullChar,
- reachedEOF: false,
- prevChar1: nullChar,
- prevEOF1: false,
- prevChar2: nullChar,
- pervEOF2: false,
- modeStack: newLexerModeStack(),
- rangeState: rangeStateReady,
- }
-}
-
-func (l *lexer) error() (string, error) {
- return l.errDetail, l.errCause
-}
-
-func (l *lexer) next() (*token, error) {
- c, eof, err := l.read()
- if err != nil {
- return nil, err
- }
- if eof {
- return newToken(tokenKindEOF, nullChar), nil
- }
-
- switch l.modeStack.top() {
- case lexerModeBExp:
- tok, err := l.nextInBExp(c)
- if err != nil {
- return nil, err
- }
- if tok.kind == tokenKindChar || tok.kind == tokenKindCodePointLeader || tok.kind == tokenKindCharPropLeader {
- switch l.rangeState {
- case rangeStateReady:
- l.rangeState = rangeStateReadRangeInitiator
- case rangeStateExpectRangeTerminator:
- l.rangeState = rangeStateReady
- }
- }
- switch tok.kind {
- case tokenKindBExpClose:
- l.modeStack.pop()
- case tokenKindCharRange:
- l.rangeState = rangeStateExpectRangeTerminator
- case tokenKindCodePointLeader:
- l.modeStack.push(lexerModeCPExp)
- case tokenKindCharPropLeader:
- l.modeStack.push(lexerModeCharPropExp)
- }
- return tok, nil
- case lexerModeCPExp:
- tok, err := l.nextInCodePoint(c)
- if err != nil {
- return nil, err
- }
- switch tok.kind {
- case tokenKindRBrace:
- l.modeStack.pop()
- }
- return tok, nil
- case lexerModeCharPropExp:
- tok, err := l.nextInCharProp(c)
- if err != nil {
- return nil, err
- }
- switch tok.kind {
- case tokenKindRBrace:
- l.modeStack.pop()
- }
- return tok, nil
- case lexerModeFragmentExp:
- tok, err := l.nextInFragment(c)
- if err != nil {
- return nil, err
- }
- switch tok.kind {
- case tokenKindRBrace:
- l.modeStack.pop()
- }
- return tok, nil
- default:
- tok, err := l.nextInDefault(c)
- if err != nil {
- return nil, err
- }
- switch tok.kind {
- case tokenKindBExpOpen:
- l.modeStack.push(lexerModeBExp)
- l.rangeState = rangeStateReady
- case tokenKindInverseBExpOpen:
- l.modeStack.push(lexerModeBExp)
- l.rangeState = rangeStateReady
- case tokenKindCodePointLeader:
- l.modeStack.push(lexerModeCPExp)
- case tokenKindCharPropLeader:
- l.modeStack.push(lexerModeCharPropExp)
- case tokenKindFragmentLeader:
- l.modeStack.push(lexerModeFragmentExp)
- }
- return tok, nil
- }
-}
-
-func (l *lexer) nextInDefault(c rune) (*token, error) {
- switch c {
- case '*':
- return newToken(tokenKindRepeat, nullChar), nil
- case '+':
- return newToken(tokenKindRepeatOneOrMore, nullChar), nil
- case '?':
- return newToken(tokenKindOption, nullChar), nil
- case '.':
- return newToken(tokenKindAnyChar, nullChar), nil
- case '|':
- return newToken(tokenKindAlt, nullChar), nil
- case '(':
- return newToken(tokenKindGroupOpen, nullChar), nil
- case ')':
- return newToken(tokenKindGroupClose, nullChar), nil
- case '[':
- c1, eof, err := l.read()
- if err != nil {
- return nil, err
- }
- if eof {
- err := l.restore()
- if err != nil {
- return nil, err
- }
- return newToken(tokenKindBExpOpen, nullChar), nil
- }
- if c1 != '^' {
- err := l.restore()
- if err != nil {
- return nil, err
- }
- return newToken(tokenKindBExpOpen, nullChar), nil
- }
- c2, eof, err := l.read()
- if err != nil {
- return nil, err
- }
- if eof {
- err := l.restore()
- if err != nil {
- return nil, err
- }
- return newToken(tokenKindInverseBExpOpen, nullChar), nil
- }
- if c2 != ']' {
- err := l.restore()
- if err != nil {
- return nil, err
- }
- return newToken(tokenKindInverseBExpOpen, nullChar), nil
- }
- err = l.restore()
- if err != nil {
- return nil, err
- }
- err = l.restore()
- if err != nil {
- return nil, err
- }
- return newToken(tokenKindBExpOpen, nullChar), nil
- case '\\':
- c, eof, err := l.read()
- if err != nil {
- return nil, err
- }
- if eof {
- l.errCause = synErrIncompletedEscSeq
- return nil, ParseErr
- }
- if c == 'u' {
- return newToken(tokenKindCodePointLeader, nullChar), nil
- }
- if c == 'p' {
- return newToken(tokenKindCharPropLeader, nullChar), nil
- }
- if c == 'f' {
- return newToken(tokenKindFragmentLeader, nullChar), nil
- }
- if c == '\\' || c == '.' || c == '*' || c == '+' || c == '?' || c == '|' || c == '(' || c == ')' || c == '[' || c == ']' {
- return newToken(tokenKindChar, c), nil
- }
- l.errCause = synErrInvalidEscSeq
- l.errDetail = fmt.Sprintf("\\%v is not supported", string(c))
- return nil, ParseErr
- default:
- return newToken(tokenKindChar, c), nil
- }
-}
-
-func (l *lexer) nextInBExp(c rune) (*token, error) {
- switch c {
- case '-':
- if l.rangeState != rangeStateReadRangeInitiator {
- return newToken(tokenKindChar, c), nil
- }
- c1, eof, err := l.read()
- if err != nil {
- return nil, err
- }
- if eof {
- err := l.restore()
- if err != nil {
- return nil, err
- }
- return newToken(tokenKindChar, c), nil
- }
- if c1 != ']' {
- err := l.restore()
- if err != nil {
- return nil, err
- }
- return newToken(tokenKindCharRange, nullChar), nil
- }
- err = l.restore()
- if err != nil {
- return nil, err
- }
- return newToken(tokenKindChar, c), nil
- case ']':
- return newToken(tokenKindBExpClose, nullChar), nil
- case '\\':
- c, eof, err := l.read()
- if err != nil {
- return nil, err
- }
- if eof {
- l.errCause = synErrIncompletedEscSeq
- return nil, ParseErr
- }
- if c == 'u' {
- return newToken(tokenKindCodePointLeader, nullChar), nil
- }
- if c == 'p' {
- return newToken(tokenKindCharPropLeader, nullChar), nil
- }
- if c == '\\' || c == '^' || c == '-' || c == ']' {
- return newToken(tokenKindChar, c), nil
- }
- l.errCause = synErrInvalidEscSeq
- l.errDetail = fmt.Sprintf("\\%v is not supported in a bracket expression", string(c))
- return nil, ParseErr
- default:
- return newToken(tokenKindChar, c), nil
- }
-}
-
-func (l *lexer) nextInCodePoint(c rune) (*token, error) {
- switch c {
- case '{':
- return newToken(tokenKindLBrace, nullChar), nil
- case '}':
- return newToken(tokenKindRBrace, nullChar), nil
- default:
- if !isHexDigit(c) {
- l.errCause = synErrInvalidCodePoint
- return nil, ParseErr
- }
- var b strings.Builder
- fmt.Fprint(&b, string(c))
- n := 1
- for {
- c, eof, err := l.read()
- if err != nil {
- return nil, err
- }
- if eof {
- err := l.restore()
- if err != nil {
- return nil, err
- }
- break
- }
- if c == '}' {
- err := l.restore()
- if err != nil {
- return nil, err
- }
- break
- }
- if !isHexDigit(c) || n >= 6 {
- l.errCause = synErrInvalidCodePoint
- return nil, ParseErr
- }
- fmt.Fprint(&b, string(c))
- n++
- }
- cp := b.String()
- cpLen := len(cp)
- if !(cpLen == 4 || cpLen == 6) {
- l.errCause = synErrInvalidCodePoint
- return nil, ParseErr
- }
- return newCodePointToken(b.String()), nil
- }
-}
-
-func isHexDigit(c rune) bool {
- if c >= '0' && c <= '9' || c >= 'A' && c <= 'Z' || c >= 'a' && c <= 'z' {
- return true
- }
- return false
-}
-
-func (l *lexer) nextInCharProp(c rune) (*token, error) {
- switch c {
- case '{':
- return newToken(tokenKindLBrace, nullChar), nil
- case '}':
- return newToken(tokenKindRBrace, nullChar), nil
- case '=':
- return newToken(tokenKindEqual, nullChar), nil
- default:
- var b strings.Builder
- fmt.Fprint(&b, string(c))
- n := 1
- for {
- c, eof, err := l.read()
- if err != nil {
- return nil, err
- }
- if eof {
- err := l.restore()
- if err != nil {
- return nil, err
- }
- break
- }
- if c == '}' || c == '=' {
- err := l.restore()
- if err != nil {
- return nil, err
- }
- break
- }
- fmt.Fprint(&b, string(c))
- n++
- }
- sym := strings.TrimSpace(b.String())
- if len(sym) == 0 {
- l.errCause = synErrCharPropInvalidSymbol
- return nil, ParseErr
- }
- return newCharPropSymbolToken(sym), nil
- }
-}
-
-func (l *lexer) nextInFragment(c rune) (*token, error) {
- switch c {
- case '{':
- return newToken(tokenKindLBrace, nullChar), nil
- case '}':
- return newToken(tokenKindRBrace, nullChar), nil
- default:
- var b strings.Builder
- fmt.Fprint(&b, string(c))
- n := 1
- for {
- c, eof, err := l.read()
- if err != nil {
- return nil, err
- }
- if eof {
- err := l.restore()
- if err != nil {
- return nil, err
- }
- break
- }
- if c == '}' {
- err := l.restore()
- if err != nil {
- return nil, err
- }
- break
- }
- fmt.Fprint(&b, string(c))
- n++
- }
- sym := strings.TrimSpace(b.String())
- if len(sym) == 0 {
- l.errCause = SynErrFragmentInvalidSymbol
- return nil, ParseErr
- }
- return newFragmentSymbolToken(sym), nil
- }
-}
-
-func (l *lexer) read() (rune, bool, error) {
- if l.reachedEOF {
- return l.lastChar, l.reachedEOF, nil
- }
- if l.peekChar1 != nullChar || l.peekEOF1 {
- l.prevChar2 = l.prevChar1
- l.pervEOF2 = l.prevEOF1
- l.prevChar1 = l.lastChar
- l.prevEOF1 = l.reachedEOF
- l.lastChar = l.peekChar1
- l.reachedEOF = l.peekEOF1
- l.peekChar1 = l.peekChar2
- l.peekEOF1 = l.peekEOF2
- l.peekChar2 = nullChar
- l.peekEOF2 = false
- return l.lastChar, l.reachedEOF, nil
- }
- c, _, err := l.src.ReadRune()
- if err != nil {
- if err == io.EOF {
- l.prevChar2 = l.prevChar1
- l.pervEOF2 = l.prevEOF1
- l.prevChar1 = l.lastChar
- l.prevEOF1 = l.reachedEOF
- l.lastChar = nullChar
- l.reachedEOF = true
- return l.lastChar, l.reachedEOF, nil
- }
- return nullChar, false, err
- }
- l.prevChar2 = l.prevChar1
- l.pervEOF2 = l.prevEOF1
- l.prevChar1 = l.lastChar
- l.prevEOF1 = l.reachedEOF
- l.lastChar = c
- l.reachedEOF = false
- return l.lastChar, l.reachedEOF, nil
-}
-
-func (l *lexer) restore() error {
- if l.lastChar == nullChar && !l.reachedEOF {
- return fmt.Errorf("failed to call restore() because the last character is null")
- }
- l.peekChar2 = l.peekChar1
- l.peekEOF2 = l.peekEOF1
- l.peekChar1 = l.lastChar
- l.peekEOF1 = l.reachedEOF
- l.lastChar = l.prevChar1
- l.reachedEOF = l.prevEOF1
- l.prevChar1 = l.prevChar2
- l.prevEOF1 = l.pervEOF2
- l.prevChar2 = nullChar
- l.pervEOF2 = false
- return nil
-}
diff --git a/compiler/parser/lexer_test.go b/compiler/parser/lexer_test.go
deleted file mode 100644
index 055466e..0000000
--- a/compiler/parser/lexer_test.go
+++ /dev/null
@@ -1,524 +0,0 @@
-package parser
-
-import (
- "strings"
- "testing"
-)
-
-func TestLexer(t *testing.T) {
- tests := []struct {
- caption string
- src string
- tokens []*token
- err error
- }{
- {
- caption: "lexer can recognize ordinaly characters",
- src: "123abcいろは",
- tokens: []*token{
- newToken(tokenKindChar, '1'),
- newToken(tokenKindChar, '2'),
- newToken(tokenKindChar, '3'),
- newToken(tokenKindChar, 'a'),
- newToken(tokenKindChar, 'b'),
- newToken(tokenKindChar, 'c'),
- newToken(tokenKindChar, 'い'),
- newToken(tokenKindChar, 'ろ'),
- newToken(tokenKindChar, 'は'),
- newToken(tokenKindEOF, nullChar),
- },
- },
- {
- caption: "lexer can recognize the special characters in default mode",
- src: ".*+?|()[\\u",
- tokens: []*token{
- newToken(tokenKindAnyChar, nullChar),
- newToken(tokenKindRepeat, nullChar),
- newToken(tokenKindRepeatOneOrMore, nullChar),
- newToken(tokenKindOption, nullChar),
- newToken(tokenKindAlt, nullChar),
- newToken(tokenKindGroupOpen, nullChar),
- newToken(tokenKindGroupClose, nullChar),
- newToken(tokenKindBExpOpen, nullChar),
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindEOF, nullChar),
- },
- },
- {
- caption: "lexer can recognize the escape sequences in default mode",
- src: "\\\\\\.\\*\\+\\?\\|\\(\\)\\[",
- tokens: []*token{
- newToken(tokenKindChar, '\\'),
- newToken(tokenKindChar, '.'),
- newToken(tokenKindChar, '*'),
- newToken(tokenKindChar, '+'),
- newToken(tokenKindChar, '?'),
- newToken(tokenKindChar, '|'),
- newToken(tokenKindChar, '('),
- newToken(tokenKindChar, ')'),
- newToken(tokenKindChar, '['),
- newToken(tokenKindEOF, nullChar),
- },
- },
- {
- caption: "], {, and } are treated as an ordinary character in default mode",
- src: "]{}",
- tokens: []*token{
- newToken(tokenKindChar, ']'),
- newToken(tokenKindChar, '{'),
- newToken(tokenKindChar, '}'),
- newToken(tokenKindEOF, nullChar),
- },
- },
- {
- caption: "lexer can recognize the special characters in bracket expression mode",
- src: "[a-z\\u{09AF}][^a-z\\u{09abcf}]",
- tokens: []*token{
- newToken(tokenKindBExpOpen, nullChar),
- newToken(tokenKindChar, 'a'),
- newToken(tokenKindCharRange, nullChar),
- newToken(tokenKindChar, 'z'),
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCodePointToken("09AF"),
- newToken(tokenKindRBrace, nullChar),
- newToken(tokenKindBExpClose, nullChar),
- newToken(tokenKindInverseBExpOpen, nullChar),
- newToken(tokenKindChar, 'a'),
- newToken(tokenKindCharRange, nullChar),
- newToken(tokenKindChar, 'z'),
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCodePointToken("09abcf"),
- newToken(tokenKindRBrace, nullChar),
- newToken(tokenKindBExpClose, nullChar),
- newToken(tokenKindEOF, nullChar),
- },
- },
- {
- caption: "lexer can recognize the escape sequences in bracket expression mode",
- src: "[\\^a\\-z]",
- tokens: []*token{
- newToken(tokenKindBExpOpen, nullChar),
- newToken(tokenKindChar, '^'),
- newToken(tokenKindChar, 'a'),
- newToken(tokenKindChar, '-'),
- newToken(tokenKindChar, 'z'),
- newToken(tokenKindBExpClose, nullChar),
- newToken(tokenKindEOF, nullChar),
- },
- },
- {
- caption: "in a bracket expression, the special characters are also handled as normal characters",
- src: "[\\\\.*+?|()[",
- tokens: []*token{
- newToken(tokenKindBExpOpen, nullChar),
- newToken(tokenKindChar, '\\'),
- newToken(tokenKindChar, '.'),
- newToken(tokenKindChar, '*'),
- newToken(tokenKindChar, '+'),
- newToken(tokenKindChar, '?'),
- newToken(tokenKindChar, '|'),
- newToken(tokenKindChar, '('),
- newToken(tokenKindChar, ')'),
- newToken(tokenKindChar, '['),
- newToken(tokenKindEOF, nullChar),
- },
- },
- {
- caption: "hyphen symbols that appear in bracket expressions are handled as the character range symbol or ordinary characters",
- // [...-...][...-][-...][-]
- // ~~~~~~~ ~ ~ ~
- // ^ ^ ^ ^
- // | | | `-- Ordinary Character (b)
- // | | `-- Ordinary Character (b)
- // | `-- Ordinary Character (b)
- // `-- Character Range (a)
- //
- // a. *-* is handled as a character-range expression.
- // b. *-, -*, or - are handled as ordinary characters.
- src: "[a-z][a-][-z][-][--][---][^a-z][^a-][^-z][^-][^--][^---]",
- tokens: []*token{
- newToken(tokenKindBExpOpen, nullChar),
- newToken(tokenKindChar, 'a'),
- newToken(tokenKindCharRange, nullChar),
- newToken(tokenKindChar, 'z'),
- newToken(tokenKindBExpClose, nullChar),
- newToken(tokenKindBExpOpen, nullChar),
- newToken(tokenKindChar, 'a'),
- newToken(tokenKindChar, '-'),
- newToken(tokenKindBExpClose, nullChar),
- newToken(tokenKindBExpOpen, nullChar),
- newToken(tokenKindChar, '-'),
- newToken(tokenKindChar, 'z'),
- newToken(tokenKindBExpClose, nullChar),
- newToken(tokenKindBExpOpen, nullChar),
- newToken(tokenKindChar, '-'),
- newToken(tokenKindBExpClose, nullChar),
- newToken(tokenKindBExpOpen, nullChar),
- newToken(tokenKindChar, '-'),
- newToken(tokenKindChar, '-'),
- newToken(tokenKindBExpClose, nullChar),
- newToken(tokenKindBExpOpen, nullChar),
- newToken(tokenKindChar, '-'),
- newToken(tokenKindCharRange, nullChar),
- newToken(tokenKindChar, '-'),
- newToken(tokenKindBExpClose, nullChar),
-
- newToken(tokenKindInverseBExpOpen, nullChar),
- newToken(tokenKindChar, 'a'),
- newToken(tokenKindCharRange, nullChar),
- newToken(tokenKindChar, 'z'),
- newToken(tokenKindBExpClose, nullChar),
- newToken(tokenKindInverseBExpOpen, nullChar),
- newToken(tokenKindChar, 'a'),
- newToken(tokenKindChar, '-'),
- newToken(tokenKindBExpClose, nullChar),
- newToken(tokenKindInverseBExpOpen, nullChar),
- newToken(tokenKindChar, '-'),
- newToken(tokenKindChar, 'z'),
- newToken(tokenKindBExpClose, nullChar),
- newToken(tokenKindInverseBExpOpen, nullChar),
- newToken(tokenKindChar, '-'),
- newToken(tokenKindBExpClose, nullChar),
- newToken(tokenKindInverseBExpOpen, nullChar),
- newToken(tokenKindChar, '-'),
- newToken(tokenKindChar, '-'),
- newToken(tokenKindBExpClose, nullChar),
- newToken(tokenKindInverseBExpOpen, nullChar),
- newToken(tokenKindChar, '-'),
- newToken(tokenKindCharRange, nullChar),
- newToken(tokenKindChar, '-'),
- newToken(tokenKindBExpClose, nullChar),
-
- newToken(tokenKindEOF, nullChar),
- },
- },
- {
- caption: "caret symbols that appear in bracket expressions are handled as the logical inverse symbol or ordinary characters",
- // [^...^...][^]
- // ~~ ~ ~~
- // ^ ^ ^^
- // | | |`-- Ordinary Character (c)
- // | | `-- Bracket Expression
- // | `-- Ordinary Character (b)
- // `-- Inverse Bracket Expression (a)
- //
- // a. Bracket expressions that have a caret symbol at the beginning are handled as logical inverse expressions.
- // b. caret symbols that appear as the second and the subsequent symbols are handled as ordinary symbols.
- // c. When a bracket expression has just one symbol, a caret symbol at the beginning is handled as an ordinary character.
- src: "[^^][^]",
- tokens: []*token{
- newToken(tokenKindInverseBExpOpen, nullChar),
- newToken(tokenKindChar, '^'),
- newToken(tokenKindBExpClose, nullChar),
- newToken(tokenKindBExpOpen, nullChar),
- newToken(tokenKindChar, '^'),
- newToken(tokenKindBExpClose, nullChar),
- newToken(tokenKindEOF, nullChar),
- },
- },
- {
- caption: "lexer raises an error when an invalid escape sequence appears",
- src: "\\@",
- err: synErrInvalidEscSeq,
- },
- {
- caption: "lexer raises an error when the incomplete escape sequence (EOF following \\) appears",
- src: "\\",
- err: synErrIncompletedEscSeq,
- },
- {
- caption: "lexer raises an error when an invalid escape sequence appears",
- src: "[\\@",
- tokens: []*token{
- newToken(tokenKindBExpOpen, nullChar),
- },
- err: synErrInvalidEscSeq,
- },
- {
- caption: "lexer raises an error when the incomplete escape sequence (EOF following \\) appears",
- src: "[\\",
- tokens: []*token{
- newToken(tokenKindBExpOpen, nullChar),
- },
- err: synErrIncompletedEscSeq,
- },
- {
- caption: "lexer can recognize the special characters and code points in code point expression mode",
- src: "\\u{0123}\\u{4567}\\u{89abcd}\\u{efAB}\\u{CDEF01}[\\u{0123}\\u{4567}\\u{89abcd}\\u{efAB}\\u{CDEF01}][^\\u{0123}\\u{4567}\\u{89abcd}\\u{efAB}\\u{CDEF01}]",
- tokens: []*token{
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCodePointToken("0123"),
- newToken(tokenKindRBrace, nullChar),
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCodePointToken("4567"),
- newToken(tokenKindRBrace, nullChar),
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCodePointToken("89abcd"),
- newToken(tokenKindRBrace, nullChar),
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCodePointToken("efAB"),
- newToken(tokenKindRBrace, nullChar),
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCodePointToken("CDEF01"),
- newToken(tokenKindRBrace, nullChar),
-
- newToken(tokenKindBExpOpen, nullChar),
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCodePointToken("0123"),
- newToken(tokenKindRBrace, nullChar),
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCodePointToken("4567"),
- newToken(tokenKindRBrace, nullChar),
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCodePointToken("89abcd"),
- newToken(tokenKindRBrace, nullChar),
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCodePointToken("efAB"),
- newToken(tokenKindRBrace, nullChar),
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCodePointToken("CDEF01"),
- newToken(tokenKindRBrace, nullChar),
- newToken(tokenKindBExpClose, nullChar),
-
- newToken(tokenKindInverseBExpOpen, nullChar),
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCodePointToken("0123"),
- newToken(tokenKindRBrace, nullChar),
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCodePointToken("4567"),
- newToken(tokenKindRBrace, nullChar),
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCodePointToken("89abcd"),
- newToken(tokenKindRBrace, nullChar),
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCodePointToken("efAB"),
- newToken(tokenKindRBrace, nullChar),
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCodePointToken("CDEF01"),
- newToken(tokenKindRBrace, nullChar),
- newToken(tokenKindBExpClose, nullChar),
-
- newToken(tokenKindEOF, nullChar),
- },
- },
- {
- caption: "a one digit hex string isn't a valid code point",
- src: "\\u{0",
- tokens: []*token{
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- },
- err: synErrInvalidCodePoint,
- },
- {
- caption: "a two digits hex string isn't a valid code point",
- src: "\\u{01",
- tokens: []*token{
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- },
- err: synErrInvalidCodePoint,
- },
- {
- caption: "a three digits hex string isn't a valid code point",
- src: "\\u{012",
- tokens: []*token{
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- },
- err: synErrInvalidCodePoint,
- },
- {
- caption: "a four digits hex string is a valid code point",
- src: "\\u{0123}",
- tokens: []*token{
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCodePointToken("0123"),
- newToken(tokenKindRBrace, nullChar),
- },
- },
- {
- caption: "a five digits hex string isn't a valid code point",
- src: "\\u{01234",
- tokens: []*token{
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- },
- err: synErrInvalidCodePoint,
- },
- {
- caption: "a six digits hex string is a valid code point",
- src: "\\u{012345}",
- tokens: []*token{
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCodePointToken("012345"),
- newToken(tokenKindRBrace, nullChar),
- },
- },
- {
- caption: "a seven digits hex string isn't a valid code point",
- src: "\\u{0123456",
- tokens: []*token{
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- },
- err: synErrInvalidCodePoint,
- },
- {
- caption: "a code point must be hex digits",
- src: "\\u{g",
- tokens: []*token{
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- },
- err: synErrInvalidCodePoint,
- },
- {
- caption: "a code point must be hex digits",
- src: "\\u{G",
- tokens: []*token{
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- },
- err: synErrInvalidCodePoint,
- },
- {
- caption: "lexer can recognize the special characters and symbols in character property expression mode",
- src: "\\p{Letter}\\p{General_Category=Letter}[\\p{Letter}\\p{General_Category=Letter}][^\\p{Letter}\\p{General_Category=Letter}]",
- tokens: []*token{
- newToken(tokenKindCharPropLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCharPropSymbolToken("Letter"),
- newToken(tokenKindRBrace, nullChar),
- newToken(tokenKindCharPropLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCharPropSymbolToken("General_Category"),
- newToken(tokenKindEqual, nullChar),
- newCharPropSymbolToken("Letter"),
- newToken(tokenKindRBrace, nullChar),
-
- newToken(tokenKindBExpOpen, nullChar),
- newToken(tokenKindCharPropLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCharPropSymbolToken("Letter"),
- newToken(tokenKindRBrace, nullChar),
- newToken(tokenKindCharPropLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCharPropSymbolToken("General_Category"),
- newToken(tokenKindEqual, nullChar),
- newCharPropSymbolToken("Letter"),
- newToken(tokenKindRBrace, nullChar),
- newToken(tokenKindBExpClose, nullChar),
-
- newToken(tokenKindInverseBExpOpen, nullChar),
- newToken(tokenKindCharPropLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCharPropSymbolToken("Letter"),
- newToken(tokenKindRBrace, nullChar),
- newToken(tokenKindCharPropLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCharPropSymbolToken("General_Category"),
- newToken(tokenKindEqual, nullChar),
- newCharPropSymbolToken("Letter"),
- newToken(tokenKindRBrace, nullChar),
- newToken(tokenKindBExpClose, nullChar),
-
- newToken(tokenKindEOF, nullChar),
- },
- },
- {
- caption: "lexer can recognize the special characters and symbols in fragment expression mode",
- src: "\\f{integer}",
- tokens: []*token{
- newToken(tokenKindFragmentLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newFragmentSymbolToken("integer"),
- newToken(tokenKindRBrace, nullChar),
-
- newToken(tokenKindEOF, nullChar),
- },
- },
- {
- caption: "a fragment expression is not supported in a bracket expression",
- src: "[\\f",
- tokens: []*token{
- newToken(tokenKindBExpOpen, nullChar),
- },
- err: synErrInvalidEscSeq,
- },
- {
- caption: "a fragment expression is not supported in an inverse bracket expression",
- src: "[^\\f",
- tokens: []*token{
- newToken(tokenKindInverseBExpOpen, nullChar),
- },
- err: synErrInvalidEscSeq,
- },
- }
- for _, tt := range tests {
- t.Run(tt.caption, func(t *testing.T) {
- lex := newLexer(strings.NewReader(tt.src))
- var err error
- var tok *token
- i := 0
- for {
- tok, err = lex.next()
- if err != nil {
- break
- }
- if i >= len(tt.tokens) {
- break
- }
- eTok := tt.tokens[i]
- i++
- testToken(t, tok, eTok)
-
- if tok.kind == tokenKindEOF {
- break
- }
- }
- if tt.err != nil {
- if err != ParseErr {
- t.Fatalf("unexpected error: want: %v, got: %v", ParseErr, err)
- }
- detail, cause := lex.error()
- if cause != tt.err {
- t.Fatalf("unexpected error: want: %v, got: %v (%v)", tt.err, cause, detail)
- }
- } else {
- if err != nil {
- t.Fatalf("unexpected error: %v", err)
- }
- }
- if i < len(tt.tokens) {
- t.Fatalf("expecte more tokens")
- }
- })
- }
-}
-
-func testToken(t *testing.T, a, e *token) {
- t.Helper()
- if e.kind != a.kind || e.char != a.char || e.codePoint != a.codePoint {
- t.Fatalf("unexpected token: want: %+v, got: %+v", e, a)
- }
-}
diff --git a/compiler/parser/parser.go b/compiler/parser/parser.go
deleted file mode 100644
index b7f8c04..0000000
--- a/compiler/parser/parser.go
+++ /dev/null
@@ -1,531 +0,0 @@
-package parser
-
-import (
- "bytes"
- "fmt"
- "io"
- "strconv"
-
- "github.com/nihei9/maleeni/spec"
- "github.com/nihei9/maleeni/ucd"
-)
-
-type PatternEntry struct {
- ID spec.LexModeKindID
- Pattern []byte
-}
-
-type parser struct {
- kind spec.LexKindName
- lex *lexer
- peekedTok *token
- lastTok *token
-
- // If and only if isContributoryPropertyExposed is true, the parser interprets contributory properties that
- // appear in property expressions.
- //
- // The contributory properties are not exposed, and users cannot use those properties because the parser
- // follows [UAX #44 5.13 Property APIs]. For instance, \p{Other_Alphabetic} is invalid.
- //
- // isContributoryPropertyExposed is set to true when the parser is generated recursively. The parser needs to
- // interpret derived properties internally because the derived properties consist of other properties that
- // may contain the contributory properties.
- //
- // [UAX #44 5.13 Property APIs] says:
- // > The following subtypes of Unicode character properties should generally not be exposed in APIs,
- // > except in limited circumstances. They may not be useful, particularly in public API collections,
- // > and may instead prove misleading to the users of such API collections.
- // > * Contributory properties are not recommended for public APIs.
- // > ...
- // https://unicode.org/reports/tr44/#Property_APIs
- isContributoryPropertyExposed bool
-
- errCause error
- errDetail string
-}
-
-func NewParser(kind spec.LexKindName, src io.Reader) *parser {
- return &parser{
- kind: kind,
- lex: newLexer(src),
- isContributoryPropertyExposed: false,
- }
-}
-
-func (p *parser) exposeContributoryProperty() {
- p.isContributoryPropertyExposed = true
-}
-
-func (p *parser) Error() (string, error) {
- return p.errDetail, p.errCause
-}
-
-func (p *parser) Parse() (root CPTree, retErr error) {
- defer func() {
- err := recover()
- if err != nil {
- var ok bool
- retErr, ok = err.(error)
- if !ok {
- panic(err)
- }
- return
- }
- }()
-
- return newRootNode(p.kind, p.parseRegexp()), nil
-}
-
-func (p *parser) parseRegexp() CPTree {
- alt := p.parseAlt()
- if alt == nil {
- if p.consume(tokenKindGroupClose) {
- p.raiseParseError(synErrGroupNoInitiator, "")
- }
- p.raiseParseError(synErrNullPattern, "")
- }
- if p.consume(tokenKindGroupClose) {
- p.raiseParseError(synErrGroupNoInitiator, "")
- }
- p.expect(tokenKindEOF)
- return alt
-}
-
-func (p *parser) parseAlt() CPTree {
- left := p.parseConcat()
- if left == nil {
- if p.consume(tokenKindAlt) {
- p.raiseParseError(synErrAltLackOfOperand, "")
- }
- return nil
- }
- for {
- if !p.consume(tokenKindAlt) {
- break
- }
- right := p.parseConcat()
- if right == nil {
- p.raiseParseError(synErrAltLackOfOperand, "")
- }
- left = newAltNode(left, right)
- }
- return left
-}
-
-func (p *parser) parseConcat() CPTree {
- left := p.parseRepeat()
- for {
- right := p.parseRepeat()
- if right == nil {
- break
- }
- left = newConcatNode(left, right)
- }
- return left
-}
-
-func (p *parser) parseRepeat() CPTree {
- group := p.parseGroup()
- if group == nil {
- if p.consume(tokenKindRepeat) {
- p.raiseParseError(synErrRepNoTarget, "* needs an operand")
- }
- if p.consume(tokenKindRepeatOneOrMore) {
- p.raiseParseError(synErrRepNoTarget, "+ needs an operand")
- }
- if p.consume(tokenKindOption) {
- p.raiseParseError(synErrRepNoTarget, "? needs an operand")
- }
- return nil
- }
- if p.consume(tokenKindRepeat) {
- return newRepeatNode(group)
- }
- if p.consume(tokenKindRepeatOneOrMore) {
- return newRepeatOneOrMoreNode(group)
- }
- if p.consume(tokenKindOption) {
- return newOptionNode(group)
- }
- return group
-}
-
-func (p *parser) parseGroup() CPTree {
- if p.consume(tokenKindGroupOpen) {
- alt := p.parseAlt()
- if alt == nil {
- if p.consume(tokenKindEOF) {
- p.raiseParseError(synErrGroupUnclosed, "")
- }
- p.raiseParseError(synErrGroupNoElem, "")
- }
- if p.consume(tokenKindEOF) {
- p.raiseParseError(synErrGroupUnclosed, "")
- }
- if !p.consume(tokenKindGroupClose) {
- p.raiseParseError(synErrGroupInvalidForm, "")
- }
- return alt
- }
- return p.parseSingleChar()
-}
-
-func (p *parser) parseSingleChar() CPTree {
- if p.consume(tokenKindAnyChar) {
- return genAnyCharAST()
- }
- if p.consume(tokenKindBExpOpen) {
- left := p.parseBExpElem()
- if left == nil {
- if p.consume(tokenKindEOF) {
- p.raiseParseError(synErrBExpUnclosed, "")
- }
- p.raiseParseError(synErrBExpNoElem, "")
- }
- for {
- right := p.parseBExpElem()
- if right == nil {
- break
- }
- left = newAltNode(left, right)
- }
- if p.consume(tokenKindEOF) {
- p.raiseParseError(synErrBExpUnclosed, "")
- }
- p.expect(tokenKindBExpClose)
- return left
- }
- if p.consume(tokenKindInverseBExpOpen) {
- elem := p.parseBExpElem()
- if elem == nil {
- if p.consume(tokenKindEOF) {
- p.raiseParseError(synErrBExpUnclosed, "")
- }
- p.raiseParseError(synErrBExpNoElem, "")
- }
- inverse := exclude(elem, genAnyCharAST())
- if inverse == nil {
- p.raiseParseError(synErrUnmatchablePattern, "")
- }
- for {
- elem := p.parseBExpElem()
- if elem == nil {
- break
- }
- inverse = exclude(elem, inverse)
- if inverse == nil {
- p.raiseParseError(synErrUnmatchablePattern, "")
- }
- }
- if p.consume(tokenKindEOF) {
- p.raiseParseError(synErrBExpUnclosed, "")
- }
- p.expect(tokenKindBExpClose)
- return inverse
- }
- if p.consume(tokenKindCodePointLeader) {
- return p.parseCodePoint()
- }
- if p.consume(tokenKindCharPropLeader) {
- return p.parseCharProp()
- }
- if p.consume(tokenKindFragmentLeader) {
- return p.parseFragment()
- }
- c := p.parseNormalChar()
- if c == nil {
- if p.consume(tokenKindBExpClose) {
- p.raiseParseError(synErrBExpInvalidForm, "")
- }
- return nil
- }
- return c
-}
-
-func (p *parser) parseBExpElem() CPTree {
- var left CPTree
- switch {
- case p.consume(tokenKindCodePointLeader):
- left = p.parseCodePoint()
- case p.consume(tokenKindCharPropLeader):
- left = p.parseCharProp()
- if p.consume(tokenKindCharRange) {
- p.raiseParseError(synErrRangePropIsUnavailable, "")
- }
- default:
- left = p.parseNormalChar()
- }
- if left == nil {
- return nil
- }
- if !p.consume(tokenKindCharRange) {
- return left
- }
- var right CPTree
- switch {
- case p.consume(tokenKindCodePointLeader):
- right = p.parseCodePoint()
- case p.consume(tokenKindCharPropLeader):
- p.raiseParseError(synErrRangePropIsUnavailable, "")
- default:
- right = p.parseNormalChar()
- }
- if right == nil {
- p.raiseParseError(synErrRangeInvalidForm, "")
- }
- from, _, _ := left.Range()
- _, to, _ := right.Range()
- if !isValidOrder(from, to) {
- p.raiseParseError(synErrRangeInvalidOrder, fmt.Sprintf("%X..%X", from, to))
- }
- return newRangeSymbolNode(from, to)
-}
-
-func (p *parser) parseCodePoint() CPTree {
- if !p.consume(tokenKindLBrace) {
- p.raiseParseError(synErrCPExpInvalidForm, "")
- }
- if !p.consume(tokenKindCodePoint) {
- p.raiseParseError(synErrCPExpInvalidForm, "")
- }
-
- n, err := strconv.ParseInt(p.lastTok.codePoint, 16, 64)
- if err != nil {
- panic(fmt.Errorf("failed to decode a code point (%v) into a int: %v", p.lastTok.codePoint, err))
- }
- if n < 0x0000 || n > 0x10FFFF {
- p.raiseParseError(synErrCPExpOutOfRange, "")
- }
-
- sym := newSymbolNode(rune(n))
-
- if !p.consume(tokenKindRBrace) {
- p.raiseParseError(synErrCPExpInvalidForm, "")
- }
-
- return sym
-}
-
-func (p *parser) parseCharProp() CPTree {
- if !p.consume(tokenKindLBrace) {
- p.raiseParseError(synErrCharPropExpInvalidForm, "")
- }
- var sym1, sym2 string
- if !p.consume(tokenKindCharPropSymbol) {
- p.raiseParseError(synErrCharPropExpInvalidForm, "")
- }
- sym1 = p.lastTok.propSymbol
- if p.consume(tokenKindEqual) {
- if !p.consume(tokenKindCharPropSymbol) {
- p.raiseParseError(synErrCharPropExpInvalidForm, "")
- }
- sym2 = p.lastTok.propSymbol
- }
-
- var alt CPTree
- var propName, propVal string
- if sym2 != "" {
- propName = sym1
- propVal = sym2
- } else {
- propName = ""
- propVal = sym1
- }
- if !p.isContributoryPropertyExposed && ucd.IsContributoryProperty(propName) {
- p.raiseParseError(synErrCharPropUnsupported, propName)
- }
- pat, err := ucd.NormalizeCharacterProperty(propName, propVal)
- if err != nil {
- p.raiseParseError(synErrCharPropUnsupported, err.Error())
- }
- if pat != "" {
- p := NewParser(p.kind, bytes.NewReader([]byte(pat)))
- p.exposeContributoryProperty()
- ast, err := p.Parse()
- if err != nil {
- panic(err)
- }
- alt = ast
- } else {
- cpRanges, inverse, err := ucd.FindCodePointRanges(propName, propVal)
- if err != nil {
- p.raiseParseError(synErrCharPropUnsupported, err.Error())
- }
- if inverse {
- r := cpRanges[0]
- alt = exclude(newRangeSymbolNode(r.From, r.To), genAnyCharAST())
- if alt == nil {
- p.raiseParseError(synErrUnmatchablePattern, "")
- }
- for _, r := range cpRanges[1:] {
- alt = exclude(newRangeSymbolNode(r.From, r.To), alt)
- if alt == nil {
- p.raiseParseError(synErrUnmatchablePattern, "")
- }
- }
- } else {
- for _, r := range cpRanges {
- alt = genAltNode(
- alt,
- newRangeSymbolNode(r.From, r.To),
- )
- }
- }
- }
-
- if !p.consume(tokenKindRBrace) {
- p.raiseParseError(synErrCharPropExpInvalidForm, "")
- }
-
- return alt
-}
-
-func (p *parser) parseFragment() CPTree {
- if !p.consume(tokenKindLBrace) {
- p.raiseParseError(synErrFragmentExpInvalidForm, "")
- }
- if !p.consume(tokenKindFragmentSymbol) {
- p.raiseParseError(synErrFragmentExpInvalidForm, "")
- }
- sym := p.lastTok.fragmentSymbol
-
- if !p.consume(tokenKindRBrace) {
- p.raiseParseError(synErrFragmentExpInvalidForm, "")
- }
-
- return newFragmentNode(spec.LexKindName(sym), nil)
-}
-
-func (p *parser) parseNormalChar() CPTree {
- if !p.consume(tokenKindChar) {
- return nil
- }
- return newSymbolNode(p.lastTok.char)
-}
-
-func exclude(symbol, base CPTree) CPTree {
- if left, right, ok := symbol.Alternatives(); ok {
- return exclude(right, exclude(left, base))
- }
-
- if left, right, ok := base.Alternatives(); ok {
- return genAltNode(
- exclude(symbol, left),
- exclude(symbol, right),
- )
- }
-
- if bFrom, bTo, ok := base.Range(); ok {
- sFrom, sTo, ok := symbol.Range()
- if !ok {
- panic(fmt.Errorf("invalid symbol tree: %T", symbol))
- }
-
- switch {
- case sFrom > bFrom && sTo < bTo:
- return genAltNode(
- newRangeSymbolNode(bFrom, sFrom-1),
- newRangeSymbolNode(sTo+1, bTo),
- )
- case sFrom <= bFrom && sTo >= bFrom && sTo < bTo:
- return newRangeSymbolNode(sTo+1, bTo)
- case sFrom > bFrom && sFrom <= bTo && sTo >= bTo:
- return newRangeSymbolNode(bFrom, sFrom-1)
- case sFrom <= bFrom && sTo >= bTo:
- return nil
- default:
- return base
- }
- }
-
- panic(fmt.Errorf("invalid base tree: %T", base))
-}
-
-func genAnyCharAST() CPTree {
- return newRangeSymbolNode(0x0, 0x10FFFF)
-}
-
-func isValidOrder(from, to rune) bool {
- return from <= to
-}
-
-func genConcatNode(cs ...CPTree) CPTree {
- nonNilNodes := []CPTree{}
- for _, c := range cs {
- if c == nil {
- continue
- }
- nonNilNodes = append(nonNilNodes, c)
- }
- if len(nonNilNodes) <= 0 {
- return nil
- }
- if len(nonNilNodes) == 1 {
- return nonNilNodes[0]
- }
- concat := newConcatNode(nonNilNodes[0], nonNilNodes[1])
- for _, c := range nonNilNodes[2:] {
- concat = newConcatNode(concat, c)
- }
- return concat
-}
-
-func genAltNode(cs ...CPTree) CPTree {
- nonNilNodes := []CPTree{}
- for _, c := range cs {
- if c == nil {
- continue
- }
- nonNilNodes = append(nonNilNodes, c)
- }
- if len(nonNilNodes) <= 0 {
- return nil
- }
- if len(nonNilNodes) == 1 {
- return nonNilNodes[0]
- }
- alt := newAltNode(nonNilNodes[0], nonNilNodes[1])
- for _, c := range nonNilNodes[2:] {
- alt = newAltNode(alt, c)
- }
- return alt
-}
-
-func (p *parser) expect(expected tokenKind) {
- if !p.consume(expected) {
- tok := p.peekedTok
- p.raiseParseError(synErrUnexpectedToken, fmt.Sprintf("expected: %v, actual: %v", expected, tok.kind))
- }
-}
-
-func (p *parser) consume(expected tokenKind) bool {
- var tok *token
- var err error
- if p.peekedTok != nil {
- tok = p.peekedTok
- p.peekedTok = nil
- } else {
- tok, err = p.lex.next()
- if err != nil {
- if err == ParseErr {
- detail, cause := p.lex.error()
- p.raiseParseError(cause, detail)
- }
- panic(err)
- }
- }
- p.lastTok = tok
- if tok.kind == expected {
- return true
- }
- p.peekedTok = tok
- p.lastTok = nil
-
- return false
-}
-
-func (p *parser) raiseParseError(err error, detail string) {
- p.errCause = err
- p.errDetail = detail
- panic(ParseErr)
-}
diff --git a/compiler/parser/parser_test.go b/compiler/parser/parser_test.go
deleted file mode 100644
index 57c130e..0000000
--- a/compiler/parser/parser_test.go
+++ /dev/null
@@ -1,1389 +0,0 @@
-package parser
-
-import (
- "fmt"
- "reflect"
- "strings"
- "testing"
-
- "github.com/nihei9/maleeni/spec"
- "github.com/nihei9/maleeni/ucd"
-)
-
-func TestParse(t *testing.T) {
- tests := []struct {
- pattern string
- fragments map[spec.LexKindName]string
- ast CPTree
- syntaxError error
-
- // When an AST is large, as patterns containing a character property expression, this test only checks
- // that the pattern is parsable. The check of the validity of such AST is performed by checking that it
- // can be matched correctly using the driver.
- skipTestAST bool
- }{
- {
- pattern: "a",
- ast: newSymbolNode('a'),
- },
- {
- pattern: "abc",
- ast: genConcatNode(
- newSymbolNode('a'),
- newSymbolNode('b'),
- newSymbolNode('c'),
- ),
- },
- {
- pattern: "a?",
- ast: newOptionNode(
- newSymbolNode('a'),
- ),
- },
- {
- pattern: "[abc]?",
- ast: newOptionNode(
- genAltNode(
- newSymbolNode('a'),
- newSymbolNode('b'),
- newSymbolNode('c'),
- ),
- ),
- },
- {
- pattern: "\\u{3042}?",
- ast: newOptionNode(
- newSymbolNode('\u3042'),
- ),
- },
- {
- pattern: "\\p{Letter}?",
- skipTestAST: true,
- },
- {
- pattern: "\\f{a2c}?",
- fragments: map[spec.LexKindName]string{
- "a2c": "abc",
- },
- ast: newOptionNode(
- newFragmentNode("a2c",
- genConcatNode(
- newSymbolNode('a'),
- newSymbolNode('b'),
- newSymbolNode('c'),
- ),
- ),
- ),
- },
- {
- pattern: "(a)?",
- ast: newOptionNode(
- newSymbolNode('a'),
- ),
- },
- {
- pattern: "((a?)?)?",
- ast: newOptionNode(
- newOptionNode(
- newOptionNode(
- newSymbolNode('a'),
- ),
- ),
- ),
- },
- {
- pattern: "(abc)?",
- ast: newOptionNode(
- genConcatNode(
- newSymbolNode('a'),
- newSymbolNode('b'),
- newSymbolNode('c'),
- ),
- ),
- },
- {
- pattern: "(a|b)?",
- ast: newOptionNode(
- genAltNode(
- newSymbolNode('a'),
- newSymbolNode('b'),
- ),
- ),
- },
- {
- pattern: "?",
- syntaxError: synErrRepNoTarget,
- },
- {
- pattern: "(?)",
- syntaxError: synErrRepNoTarget,
- },
- {
- pattern: "a|?",
- syntaxError: synErrRepNoTarget,
- },
- {
- pattern: "?|b",
- syntaxError: synErrRepNoTarget,
- },
- {
- pattern: "a??",
- syntaxError: synErrRepNoTarget,
- },
- {
- pattern: "a*",
- ast: newRepeatNode(
- newSymbolNode('a'),
- ),
- },
- {
- pattern: "[abc]*",
- ast: newRepeatNode(
- genAltNode(
- newSymbolNode('a'),
- newSymbolNode('b'),
- newSymbolNode('c'),
- ),
- ),
- },
- {
- pattern: "\\u{3042}*",
- ast: newRepeatNode(
- newSymbolNode('\u3042'),
- ),
- },
- {
- pattern: "\\p{Letter}*",
- skipTestAST: true,
- },
- {
- pattern: "\\f{a2c}*",
- fragments: map[spec.LexKindName]string{
- "a2c": "abc",
- },
- ast: newRepeatNode(
- newFragmentNode("a2c",
- genConcatNode(
- newSymbolNode('a'),
- newSymbolNode('b'),
- newSymbolNode('c'),
- ),
- ),
- ),
- },
- {
- pattern: "((a*)*)*",
- ast: newRepeatNode(
- newRepeatNode(
- newRepeatNode(
- newSymbolNode('a'),
- ),
- ),
- ),
- },
- {
- pattern: "(abc)*",
- ast: newRepeatNode(
- genConcatNode(
- newSymbolNode('a'),
- newSymbolNode('b'),
- newSymbolNode('c'),
- ),
- ),
- },
- {
- pattern: "(a|b)*",
- ast: newRepeatNode(
- genAltNode(
- newSymbolNode('a'),
- newSymbolNode('b'),
- ),
- ),
- },
- {
- pattern: "*",
- syntaxError: synErrRepNoTarget,
- },
- {
- pattern: "(*)",
- syntaxError: synErrRepNoTarget,
- },
- {
- pattern: "a|*",
- syntaxError: synErrRepNoTarget,
- },
- {
- pattern: "*|b",
- syntaxError: synErrRepNoTarget,
- },
- {
- pattern: "a**",
- syntaxError: synErrRepNoTarget,
- },
- {
- pattern: "a+",
- ast: genConcatNode(
- newSymbolNode('a'),
- newRepeatNode(
- newSymbolNode('a'),
- ),
- ),
- },
- {
- pattern: "[abc]+",
- ast: genConcatNode(
- genAltNode(
- newSymbolNode('a'),
- newSymbolNode('b'),
- newSymbolNode('c'),
- ),
- newRepeatNode(
- genAltNode(
- newSymbolNode('a'),
- newSymbolNode('b'),
- newSymbolNode('c'),
- ),
- ),
- ),
- },
- {
- pattern: "\\u{3042}+",
- ast: genConcatNode(
- newSymbolNode('\u3042'),
- newRepeatNode(
- newSymbolNode('\u3042'),
- ),
- ),
- },
- {
- pattern: "\\p{Letter}+",
- skipTestAST: true,
- },
- {
- pattern: "\\f{a2c}+",
- fragments: map[spec.LexKindName]string{
- "a2c": "abc",
- },
- ast: genConcatNode(
- newFragmentNode("a2c",
- genConcatNode(
- newSymbolNode('a'),
- newSymbolNode('b'),
- newSymbolNode('c'),
- ),
- ),
- newRepeatNode(
- newFragmentNode("a2c",
- genConcatNode(
- newSymbolNode('a'),
- newSymbolNode('b'),
- newSymbolNode('c'),
- ),
- ),
- ),
- ),
- },
- {
- pattern: "((a+)+)+",
- ast: genConcatNode(
- genConcatNode(
- genConcatNode(
- genConcatNode(
- newSymbolNode('a'),
- newRepeatNode(
- newSymbolNode('a'),
- ),
- ),
- newRepeatNode(
- genConcatNode(
- newSymbolNode('a'),
- newRepeatNode(
- newSymbolNode('a'),
- ),
- ),
- ),
- ),
- newRepeatNode(
- genConcatNode(
- genConcatNode(
- newSymbolNode('a'),
- newRepeatNode(
- newSymbolNode('a'),
- ),
- ),
- newRepeatNode(
- genConcatNode(
- newSymbolNode('a'),
- newRepeatNode(
- newSymbolNode('a'),
- ),
- ),
- ),
- ),
- ),
- ),
- ),
- },
- {
- pattern: "(abc)+",
- ast: genConcatNode(
- genConcatNode(
- newSymbolNode('a'),
- newSymbolNode('b'),
- newSymbolNode('c'),
- ),
- newRepeatNode(
- genConcatNode(
- newSymbolNode('a'),
- newSymbolNode('b'),
- newSymbolNode('c'),
- ),
- ),
- ),
- },
- {
- pattern: "(a|b)+",
- ast: genConcatNode(
- genAltNode(
- newSymbolNode('a'),
- newSymbolNode('b'),
- ),
- newRepeatNode(
- genAltNode(
- newSymbolNode('a'),
- newSymbolNode('b'),
- ),
- ),
- ),
- },
- {
- pattern: "+",
- syntaxError: synErrRepNoTarget,
- },
- {
- pattern: "(+)",
- syntaxError: synErrRepNoTarget,
- },
- {
- pattern: "a|+",
- syntaxError: synErrRepNoTarget,
- },
- {
- pattern: "+|b",
- syntaxError: synErrRepNoTarget,
- },
- {
- pattern: "a++",
- syntaxError: synErrRepNoTarget,
- },
- {
- pattern: ".",
- ast: newRangeSymbolNode(0x00, 0x10FFFF),
- },
- {
- pattern: "[a]",
- ast: newSymbolNode('a'),
- },
- {
- pattern: "[abc]",
- ast: genAltNode(
- newSymbolNode('a'),
- newSymbolNode('b'),
- newSymbolNode('c'),
- ),
- },
- {
- pattern: "[a-z]",
- ast: newRangeSymbolNode('a', 'z'),
- },
- {
- pattern: "[A-Za-z]",
- ast: genAltNode(
- newRangeSymbolNode('A', 'Z'),
- newRangeSymbolNode('a', 'z'),
- ),
- },
- {
- pattern: "[\\u{004E}]",
- ast: newSymbolNode('N'),
- },
- {
- pattern: "[\\u{0061}-\\u{007A}]",
- ast: newRangeSymbolNode('a', 'z'),
- },
- {
- pattern: "[\\p{Lu}]",
- skipTestAST: true,
- },
- {
- pattern: "[a-\\p{Lu}]",
- syntaxError: synErrRangePropIsUnavailable,
- },
- {
- pattern: "[\\p{Lu}-z]",
- syntaxError: synErrRangePropIsUnavailable,
- },
- {
- pattern: "[\\p{Lu}-\\p{Ll}]",
- syntaxError: synErrRangePropIsUnavailable,
- },
- {
- pattern: "[z-a]",
- syntaxError: synErrRangeInvalidOrder,
- },
- {
- pattern: "a[]",
- syntaxError: synErrBExpNoElem,
- },
- {
- pattern: "[]a",
- syntaxError: synErrBExpNoElem,
- },
- {
- pattern: "[]",
- syntaxError: synErrBExpNoElem,
- },
- {
- pattern: "[^\\u{004E}]",
- ast: genAltNode(
- newRangeSymbolNode(0x00, '\u004E'-1),
- newRangeSymbolNode('\u004E'+1, 0x10FFFF),
- ),
- },
- {
- pattern: "[^\\u{0061}-\\u{007A}]",
- ast: genAltNode(
- newRangeSymbolNode(0x00, '\u0061'-1),
- newRangeSymbolNode('\u007A'+1, 0x10FFFF),
- ),
- },
- {
- pattern: "[^\\p{Lu}]",
- skipTestAST: true,
- },
- {
- pattern: "[^a-\\p{Lu}]",
- syntaxError: synErrRangePropIsUnavailable,
- },
- {
- pattern: "[^\\p{Lu}-z]",
- syntaxError: synErrRangePropIsUnavailable,
- },
- {
- pattern: "[^\\p{Lu}-\\p{Ll}]",
- syntaxError: synErrRangePropIsUnavailable,
- },
- {
- pattern: "[^\\u{0000}-\\u{10FFFF}]",
- syntaxError: synErrUnmatchablePattern,
- },
- {
- pattern: "[^\\u{0000}-\\u{FFFF}\\u{010000}-\\u{10FFFF}]",
- syntaxError: synErrUnmatchablePattern,
- },
- {
- pattern: "[^]",
- ast: newSymbolNode('^'),
- },
- {
- pattern: "[",
- syntaxError: synErrBExpUnclosed,
- },
- {
- pattern: "([",
- syntaxError: synErrBExpUnclosed,
- },
- {
- pattern: "[a",
- syntaxError: synErrBExpUnclosed,
- },
- {
- pattern: "([a",
- syntaxError: synErrBExpUnclosed,
- },
- {
- pattern: "[a-",
- syntaxError: synErrBExpUnclosed,
- },
- {
- pattern: "([a-",
- syntaxError: synErrBExpUnclosed,
- },
- {
- pattern: "[^",
- syntaxError: synErrBExpUnclosed,
- },
- {
- pattern: "([^",
- syntaxError: synErrBExpUnclosed,
- },
- {
- pattern: "[^a",
- syntaxError: synErrBExpUnclosed,
- },
- {
- pattern: "([^a",
- syntaxError: synErrBExpUnclosed,
- },
- {
- pattern: "[^a-",
- syntaxError: synErrBExpUnclosed,
- },
- {
- pattern: "([^a-",
- syntaxError: synErrBExpUnclosed,
- },
- {
- pattern: "]",
- ast: newSymbolNode(']'),
- },
- {
- pattern: "(]",
- syntaxError: synErrGroupUnclosed,
- },
- {
- pattern: "a]",
- ast: genConcatNode(
- newSymbolNode('a'),
- newSymbolNode(']'),
- ),
- },
- {
- pattern: "(a]",
- syntaxError: synErrGroupUnclosed,
- },
- {
- pattern: "([)",
- syntaxError: synErrBExpUnclosed,
- },
- {
- pattern: "([a)",
- syntaxError: synErrBExpUnclosed,
- },
- {
- pattern: "[a-]",
- ast: genAltNode(
- newSymbolNode('a'),
- newSymbolNode('-'),
- ),
- },
- {
- pattern: "[^a-]",
- ast: genAltNode(
- newRangeSymbolNode(0x00, 0x2C),
- newRangeSymbolNode(0x2E, 0x60),
- newRangeSymbolNode(0x62, 0x10FFFF),
- ),
- },
- {
- pattern: "[-z]",
- ast: genAltNode(
- newSymbolNode('-'),
- newSymbolNode('z'),
- ),
- },
- {
- pattern: "[^-z]",
- ast: newAltNode(
- newRangeSymbolNode(0x00, 0x2C),
- newAltNode(
- newRangeSymbolNode(0x2E, 0x79),
- newRangeSymbolNode(0x7B, 0x10FFFF),
- ),
- ),
- },
- {
- pattern: "[-]",
- ast: newSymbolNode('-'),
- },
- {
- pattern: "[^-]",
- ast: genAltNode(
- newRangeSymbolNode(0x00, 0x2C),
- newRangeSymbolNode(0x2E, 0x10FFFF),
- ),
- },
- {
- pattern: "[^01]",
- ast: genAltNode(
- newRangeSymbolNode(0x00, '0'-1),
- newRangeSymbolNode('1'+1, 0x10FFFF),
- ),
- },
- {
- pattern: "[^10]",
- ast: genAltNode(
- newRangeSymbolNode(0x00, '0'-1),
- newRangeSymbolNode('1'+1, 0x10FFFF),
- ),
- },
- {
- pattern: "[^a-z]",
- ast: genAltNode(
- newRangeSymbolNode(0x00, 'a'-1),
- newRangeSymbolNode('z'+1, 0x10FFFF),
- ),
- },
- {
- pattern: "[^az]",
- ast: genAltNode(
- newRangeSymbolNode(0x00, 'a'-1),
- genAltNode(
- newRangeSymbolNode('a'+1, 'z'-1),
- newRangeSymbolNode('z'+1, 0x10FFFF),
- ),
- ),
- },
- {
- pattern: "\\u{006E}",
- ast: newSymbolNode('\u006E'),
- },
- {
- pattern: "\\u{03BD}",
- ast: newSymbolNode('\u03BD'),
- },
- {
- pattern: "\\u{306B}",
- ast: newSymbolNode('\u306B'),
- },
- {
- pattern: "\\u{01F638}",
- ast: newSymbolNode('\U0001F638'),
- },
- {
- pattern: "\\u{0000}",
- ast: newSymbolNode('\u0000'),
- },
- {
- pattern: "\\u{10FFFF}",
- ast: newSymbolNode('\U0010FFFF'),
- },
- {
- pattern: "\\u{110000}",
- syntaxError: synErrCPExpOutOfRange,
- },
- {
- pattern: "\\u",
- syntaxError: synErrCPExpInvalidForm,
- },
- {
- pattern: "\\u{",
- syntaxError: synErrCPExpInvalidForm,
- },
- {
- pattern: "\\u{03BD",
- syntaxError: synErrCPExpInvalidForm,
- },
- {
- pattern: "\\u{}",
- syntaxError: synErrCPExpInvalidForm,
- },
- {
- pattern: "\\p{Letter}",
- skipTestAST: true,
- },
- {
- pattern: "\\p{General_Category=Letter}",
- skipTestAST: true,
- },
- {
- pattern: "\\p{ Letter }",
- skipTestAST: true,
- },
- {
- pattern: "\\p{ General_Category = Letter }",
- skipTestAST: true,
- },
- {
- pattern: "\\p",
- syntaxError: synErrCharPropExpInvalidForm,
- },
- {
- pattern: "\\p{",
- syntaxError: synErrCharPropExpInvalidForm,
- },
- {
- pattern: "\\p{Letter",
- syntaxError: synErrCharPropExpInvalidForm,
- },
- {
- pattern: "\\p{General_Category=}",
- syntaxError: synErrCharPropExpInvalidForm,
- },
- {
- pattern: "\\p{General_Category= }",
- syntaxError: synErrCharPropInvalidSymbol,
- },
- {
- pattern: "\\p{=Letter}",
- syntaxError: synErrCharPropExpInvalidForm,
- },
- {
- pattern: "\\p{ =Letter}",
- syntaxError: synErrCharPropInvalidSymbol,
- },
- {
- pattern: "\\p{=}",
- syntaxError: synErrCharPropExpInvalidForm,
- },
- {
- pattern: "\\p{}",
- syntaxError: synErrCharPropExpInvalidForm,
- },
- {
- pattern: "\\f{a2c}",
- fragments: map[spec.LexKindName]string{
- "a2c": "abc",
- },
- ast: newFragmentNode("a2c",
- genConcatNode(
- newSymbolNode('a'),
- newSymbolNode('b'),
- newSymbolNode('c'),
- ),
- ),
- },
- {
- pattern: "\\f{ a2c }",
- fragments: map[spec.LexKindName]string{
- "a2c": "abc",
- },
- ast: newFragmentNode("a2c",
- genConcatNode(
- newSymbolNode('a'),
- newSymbolNode('b'),
- newSymbolNode('c'),
- ),
- ),
- },
- {
- pattern: "\\f",
- syntaxError: synErrFragmentExpInvalidForm,
- },
- {
- pattern: "\\f{",
- syntaxError: synErrFragmentExpInvalidForm,
- },
- {
- pattern: "\\f{a2c",
- fragments: map[spec.LexKindName]string{
- "a2c": "abc",
- },
- syntaxError: synErrFragmentExpInvalidForm,
- },
- {
- pattern: "(a)",
- ast: newSymbolNode('a'),
- },
- {
- pattern: "(((a)))",
- ast: newSymbolNode('a'),
- },
- {
- pattern: "a()",
- syntaxError: synErrGroupNoElem,
- },
- {
- pattern: "()a",
- syntaxError: synErrGroupNoElem,
- },
- {
- pattern: "()",
- syntaxError: synErrGroupNoElem,
- },
- {
- pattern: "(",
- syntaxError: synErrGroupUnclosed,
- },
- {
- pattern: "a(",
- syntaxError: synErrGroupUnclosed,
- },
- {
- pattern: "(a",
- syntaxError: synErrGroupUnclosed,
- },
- {
- pattern: "((",
- syntaxError: synErrGroupUnclosed,
- },
- {
- pattern: "((a)",
- syntaxError: synErrGroupUnclosed,
- },
- {
- pattern: ")",
- syntaxError: synErrGroupNoInitiator,
- },
- {
- pattern: "a)",
- syntaxError: synErrGroupNoInitiator,
- },
- {
- pattern: ")a",
- syntaxError: synErrGroupNoInitiator,
- },
- {
- pattern: "))",
- syntaxError: synErrGroupNoInitiator,
- },
- {
- pattern: "(a))",
- syntaxError: synErrGroupNoInitiator,
- },
- {
- pattern: "Mulder|Scully",
- ast: genAltNode(
- genConcatNode(
- newSymbolNode('M'),
- newSymbolNode('u'),
- newSymbolNode('l'),
- newSymbolNode('d'),
- newSymbolNode('e'),
- newSymbolNode('r'),
- ),
- genConcatNode(
- newSymbolNode('S'),
- newSymbolNode('c'),
- newSymbolNode('u'),
- newSymbolNode('l'),
- newSymbolNode('l'),
- newSymbolNode('y'),
- ),
- ),
- },
- {
- pattern: "Langly|Frohike|Byers",
- ast: genAltNode(
- genConcatNode(
- newSymbolNode('L'),
- newSymbolNode('a'),
- newSymbolNode('n'),
- newSymbolNode('g'),
- newSymbolNode('l'),
- newSymbolNode('y'),
- ),
- genConcatNode(
- newSymbolNode('F'),
- newSymbolNode('r'),
- newSymbolNode('o'),
- newSymbolNode('h'),
- newSymbolNode('i'),
- newSymbolNode('k'),
- newSymbolNode('e'),
- ),
- genConcatNode(
- newSymbolNode('B'),
- newSymbolNode('y'),
- newSymbolNode('e'),
- newSymbolNode('r'),
- newSymbolNode('s'),
- ),
- ),
- },
- {
- pattern: "|",
- syntaxError: synErrAltLackOfOperand,
- },
- {
- pattern: "||",
- syntaxError: synErrAltLackOfOperand,
- },
- {
- pattern: "Mulder|",
- syntaxError: synErrAltLackOfOperand,
- },
- {
- pattern: "|Scully",
- syntaxError: synErrAltLackOfOperand,
- },
- {
- pattern: "Langly|Frohike|",
- syntaxError: synErrAltLackOfOperand,
- },
- {
- pattern: "Langly||Byers",
- syntaxError: synErrAltLackOfOperand,
- },
- {
- pattern: "|Frohike|Byers",
- syntaxError: synErrAltLackOfOperand,
- },
- {
- pattern: "|Frohike|",
- syntaxError: synErrAltLackOfOperand,
- },
- {
- pattern: "Fox(|)Mulder",
- syntaxError: synErrAltLackOfOperand,
- },
- {
- pattern: "(Fox|)Mulder",
- syntaxError: synErrAltLackOfOperand,
- },
- {
- pattern: "Fox(|Mulder)",
- syntaxError: synErrAltLackOfOperand,
- },
- }
- for i, tt := range tests {
- t.Run(fmt.Sprintf("#%v %v", i, tt.pattern), func(t *testing.T) {
- fragmentTrees := map[spec.LexKindName]CPTree{}
- for kind, pattern := range tt.fragments {
- p := NewParser(kind, strings.NewReader(pattern))
- root, err := p.Parse()
- if err != nil {
- t.Fatal(err)
- }
-
- fragmentTrees[kind] = root
- }
- err := CompleteFragments(fragmentTrees)
- if err != nil {
- t.Fatal(err)
- }
-
- p := NewParser(spec.LexKindName("test"), strings.NewReader(tt.pattern))
- root, err := p.Parse()
- if tt.syntaxError != nil {
- // printCPTree(os.Stdout, root, "", "")
- if err != ParseErr {
- t.Fatalf("unexpected error: want: %v, got: %v", ParseErr, err)
- }
- _, synErr := p.Error()
- if synErr != tt.syntaxError {
- t.Fatalf("unexpected syntax error: want: %v, got: %v", tt.syntaxError, synErr)
- }
- if root != nil {
- t.Fatalf("tree must be nil")
- }
- } else {
- if err != nil {
- detail, cause := p.Error()
- t.Fatalf("%v: %v: %v", err, cause, detail)
- }
- if root == nil {
- t.Fatal("tree must be non-nil")
- }
-
- complete, err := ApplyFragments(root, fragmentTrees)
- if err != nil {
- t.Fatal(err)
- }
- if !complete {
- t.Fatalf("incomplete fragments")
- }
-
- // printCPTree(os.Stdout, root, "", "")
- if !tt.skipTestAST {
- r := root.(*rootNode)
- testAST(t, tt.ast, r.tree)
- }
- }
- })
- }
-}
-
-func TestParse_ContributoryPropertyIsNotExposed(t *testing.T) {
- for _, cProp := range ucd.ContributoryProperties() {
- t.Run(fmt.Sprintf("%v", cProp), func(t *testing.T) {
- p := NewParser(spec.LexKindName("test"), strings.NewReader(fmt.Sprintf(`\p{%v=yes}`, cProp)))
- root, err := p.Parse()
- if err == nil {
- t.Fatalf("expected syntax error: got: nil")
- }
- _, synErr := p.Error()
- if synErr != synErrCharPropUnsupported {
- t.Fatalf("unexpected syntax error: want: %v, got: %v", synErrCharPropUnsupported, synErr)
- }
- if root != nil {
- t.Fatalf("tree is not nil")
- }
- })
- }
-}
-
-func TestExclude(t *testing.T) {
- for _, test := range []struct {
- caption string
- target CPTree
- base CPTree
- result CPTree
- }{
- // t.From > b.From && t.To < b.To
-
- // |t.From - b.From| = 1
- // |b.To - t.To| = 1
- //
- // Target (t): +--+
- // Base (b): +--+--+--+
- // Result (b - t): +--+ +--+
- {
- caption: "|t.From - b.From| = 1 && |b.To - t.To| = 1",
- target: newSymbolNode('1'),
- base: newRangeSymbolNode('0', '2'),
- result: newAltNode(
- newSymbolNode('0'),
- newSymbolNode('2'),
- ),
- },
- // |t.From - b.From| > 1
- // |b.To - t.To| > 1
- //
- // Target (t): +--+
- // Base (b): +--+--+--+--+--+
- // Result (b - t): +--+--+ +--+--+
- {
- caption: "|t.From - b.From| > 1 && |b.To - t.To| > 1",
- target: newSymbolNode('2'),
- base: newRangeSymbolNode('0', '4'),
- result: newAltNode(
- newRangeSymbolNode('0', '1'),
- newRangeSymbolNode('3', '4'),
- ),
- },
-
- // t.From <= b.From && t.To >= b.From && t.To < b.To
-
- // |b.From - t.From| = 0
- // |t.To - b.From| = 0
- // |b.To - t.To| = 1
- //
- // Target (t): +--+
- // Base (b): +--+--+
- // Result (b - t): +--+
- {
- caption: "|b.From - t.From| = 0 && |t.To - b.From| = 0 && |b.To - t.To| = 1",
- target: newSymbolNode('0'),
- base: newRangeSymbolNode('0', '1'),
- result: newSymbolNode('1'),
- },
- // |b.From - t.From| = 0
- // |t.To - b.From| = 0
- // |b.To - t.To| > 1
- //
- // Target (t): +--+
- // Base (b): +--+--+--+
- // Result (b - t): +--+--+
- {
- caption: "|b.From - t.From| = 0 && |t.To - b.From| = 0 && |b.To - t.To| > 1",
- target: newSymbolNode('0'),
- base: newRangeSymbolNode('0', '2'),
- result: newRangeSymbolNode('1', '2'),
- },
- // |b.From - t.From| = 0
- // |t.To - b.From| > 0
- // |b.To - t.To| = 1
- //
- // Target (t): +--+--+
- // Base (b): +--+--+--+
- // Result (b - t): +--+
- {
- caption: "|b.From - t.From| = 0 && |t.To - b.From| > 0 && |b.To - t.To| = 1",
- target: newRangeSymbolNode('0', '1'),
- base: newRangeSymbolNode('0', '2'),
- result: newSymbolNode('2'),
- },
- // |b.From - t.From| = 0
- // |t.To - b.From| > 0
- // |b.To - t.To| > 1
- //
- // Target (t): +--+--+
- // Base (b): +--+--+--+--+
- // Result (b - t): +--+--+
- {
- caption: "|b.From - t.From| = 0 && |t.To - b.From| > 0 && |b.To - t.To| > 1",
- target: newRangeSymbolNode('0', '1'),
- base: newRangeSymbolNode('0', '3'),
- result: newRangeSymbolNode('2', '3'),
- },
- // |b.From - t.From| > 0
- // |t.To - b.From| = 0
- // |b.To - t.To| = 1
- //
- // Target (t): +--+--+
- // Base (b): +--+--+
- // Result (b - t): +--+
- {
- caption: "|b.From - t.From| > 0 && |t.To - b.From| = 0 && |b.To - t.To| = 1",
- target: newRangeSymbolNode('0', '1'),
- base: newRangeSymbolNode('1', '2'),
- result: newSymbolNode('2'),
- },
- // |b.From - t.From| > 0
- // |t.To - b.From| = 0
- // |b.To - t.To| > 1
- //
- // Target (t): +--+--+
- // Base (b): +--+--+--+
- // Result (b - t): +--+--+
- {
- caption: "|b.From - t.From| > 0 && |t.To - b.From| = 0 && |b.To - t.To| > 1",
- target: newRangeSymbolNode('0', '1'),
- base: newRangeSymbolNode('1', '3'),
- result: newRangeSymbolNode('2', '3'),
- },
- // |b.From - t.From| > 0
- // |t.To - b.From| > 0
- // |b.To - t.To| = 1
- //
- // Target (t): +--+--+--+
- // Base (b): +--+--+--+
- // Result (b - t): +--+
- {
- caption: "|b.From - t.From| > 0 && |t.To - b.From| > 0 && |b.To - t.To| = 1",
- target: newRangeSymbolNode('0', '2'),
- base: newRangeSymbolNode('1', '3'),
- result: newSymbolNode('3'),
- },
- // |b.From - t.From| > 0
- // |t.To - b.From| > 0
- // |b.To - t.To| > 1
- //
- // Target (t): +--+--+--+
- // Base (b): +--+--+--+--+
- // Result (b - t): +--+--+
- {
- caption: "|b.From - t.From| > 0 && |t.To - b.From| > 0 && |b.To - t.To| > 1",
- target: newRangeSymbolNode('0', '2'),
- base: newRangeSymbolNode('1', '4'),
- result: newRangeSymbolNode('3', '4'),
- },
-
- // t.From > b.From && t.From <= b.To && t.To >= b.To
-
- // |t.From - b.From| = 1
- // |b.To - t.From| = 0
- // |t.To - b.To| = 0
- //
- // Target (t): +--+
- // Base (b): +--+--+
- // Result (b - t): +--+
- {
- caption: "|t.From - b.From| = 1 && |b.To - t.From| = 0 && |t.To - b.To| = 0",
- target: newSymbolNode('1'),
- base: newRangeSymbolNode('0', '1'),
- result: newSymbolNode('0'),
- },
- // |t.From - b.From| = 1
- // |b.To - t.From| = 0
- // |t.To - b.To| > 0
- //
- // Target (t): +--+--+
- // Base (b): +--+--+
- // Result (b - t): +--+
- {
- caption: "|t.From - b.From| = 1 && |b.To - t.From| = 0 && |t.To - b.To| > 0",
- target: newRangeSymbolNode('1', '2'),
- base: newRangeSymbolNode('0', '1'),
- result: newSymbolNode('0'),
- },
- // |t.From - b.From| = 1
- // |b.To - t.From| > 0
- // |t.To - b.To| = 0
- //
- // Target (t): +--+--+
- // Base (b): +--+--+--+
- // Result (b - t): +--+
- {
- caption: "|t.From - b.From| = 1 && |b.To - t.From| > 0 && |t.To - b.To| = 0",
- target: newRangeSymbolNode('1', '2'),
- base: newRangeSymbolNode('0', '2'),
- result: newSymbolNode('0'),
- },
- // |t.From - b.From| = 1
- // |b.To - t.From| > 0
- // |t.To - b.To| > 0
- //
- // Target (t): +--+--+--+
- // Base (b): +--+--+--+
- // Result (b - t): +--+
- {
- caption: "|t.From - b.From| = 1 && |b.To - t.From| > 0 && |t.To - b.To| > 0",
- target: newRangeSymbolNode('1', '3'),
- base: newRangeSymbolNode('0', '2'),
- result: newSymbolNode('0'),
- },
- // |t.From - b.From| > 1
- // |b.To - t.From| = 0
- // |t.To - b.To| = 0
- //
- // Target (t): +--+
- // Base (b): +--+--+--+
- // Result (b - t): +--+--+
- {
- caption: "|t.From - b.From| > 1 && |b.To - t.From| = 0 && |t.To - b.To| = 0",
- target: newSymbolNode('2'),
- base: newRangeSymbolNode('0', '2'),
- result: newRangeSymbolNode('0', '1'),
- },
- // |t.From - b.From| > 1
- // |b.To - t.From| = 0
- // |t.To - b.To| > 0
- //
- // Target (t): +--+--+
- // Base (b): +--+--+--+
- // Result (b - t): +--+--+
- {
- caption: "|t.From - b.From| > 1 && |b.To - t.From| = 0 && |t.To - b.To| > 0",
- target: newRangeSymbolNode('2', '3'),
- base: newRangeSymbolNode('0', '2'),
- result: newRangeSymbolNode('0', '1'),
- },
- // |t.From - b.From| > 1
- // |b.To - t.From| > 0
- // |t.To - b.To| = 0
- //
- // Target (t): +--+--+
- // Base (b): +--+--+--+--+
- // Result (b - t): +--+--+
- {
- caption: "|t.From - b.From| > 1 && |b.To - t.From| > 0 && |t.To - b.To| = 0",
- target: newRangeSymbolNode('2', '3'),
- base: newRangeSymbolNode('0', '3'),
- result: newRangeSymbolNode('0', '1'),
- },
- // |t.From - b.From| > 1
- // |b.To - t.From| > 0
- // |t.To - b.To| > 0
- //
- // Target (t): +--+--+--+
- // Base (b): +--+--+--+--+
- // Result (b - t): +--+--+
- {
- caption: "|t.From - b.From| > 1 && |b.To - t.From| > 0 && |t.To - b.To| > 0",
- target: newRangeSymbolNode('2', '4'),
- base: newRangeSymbolNode('0', '3'),
- result: newRangeSymbolNode('0', '1'),
- },
-
- // t.From <= b.From && t.To >= b.To
-
- // |b.From - t.From| = 0
- // |t.To - b.To| = 0
- //
- // Target (t): +--+
- // Base (b): +--+
- // Result (b - t): N/A
- {
- caption: "|b.From - t.From| = 0 && |t.To - b.To| = 0",
- target: newSymbolNode('0'),
- base: newSymbolNode('0'),
- result: nil,
- },
- // |b.From - t.From| = 0
- // |t.To - b.To| > 0
- //
- // Target (t): +--+--+
- // Base (b): +--+
- // Result (b - t): N/A
- {
- caption: "|b.From - t.From| = 0 && |t.To - b.To| > 0",
- target: newRangeSymbolNode('0', '1'),
- base: newSymbolNode('0'),
- result: nil,
- },
- // |b.From - t.From| > 0
- // |t.To - b.To| = 0
- //
- // Target (t): +--+--+
- // Base (b): +--+
- // Result (b - t): N/A
- {
- caption: "|b.From - t.From| > 0 && |t.To - b.To| = 0",
- target: newRangeSymbolNode('0', '1'),
- base: newSymbolNode('1'),
- result: nil,
- },
- // |b.From - t.From| > 0
- // |t.To - b.To| > 0
- //
- // Target (t): +--+--+--+
- // Base (b): +--+
- // Result (b - t): N/A
- {
- caption: "|b.From - t.From| > 0 && |t.To - b.To| > 0",
- target: newRangeSymbolNode('0', '2'),
- base: newSymbolNode('1'),
- result: nil,
- },
-
- // Others
-
- // |b.From - t.From| = 1
- //
- // Target (t): +--+
- // Base (b): +--+
- // Result (b - t): +--+
- {
- caption: "|b.From - t.From| = 1",
- target: newSymbolNode('0'),
- base: newSymbolNode('1'),
- result: newSymbolNode('1'),
- },
- // |b.From - t.From| > 1
- //
- // Target (t): +--+
- // Base (b): +--+
- // Result (b - t): +--+
- {
- caption: "|b.From - t.From| > 1",
- target: newSymbolNode('0'),
- base: newSymbolNode('2'),
- result: newSymbolNode('2'),
- },
- // |t.To - b.To| = 1
- //
- // Target (t): +--+
- // Base (b): +--+
- // Result (b - t): +--+
- {
- caption: "|t.To - b.To| = 1",
- target: newSymbolNode('1'),
- base: newSymbolNode('0'),
- result: newSymbolNode('0'),
- },
- // |t.To - b.To| > 1
- //
- // Target (t): +--+
- // Base (b): +--+
- // Result (b - t): +--+
- {
- caption: "|t.To - b.To| > 1",
- target: newSymbolNode('2'),
- base: newSymbolNode('0'),
- result: newSymbolNode('0'),
- },
- } {
- t.Run(test.caption, func(t *testing.T) {
- r := exclude(test.target, test.base)
- testAST(t, test.result, r)
- })
- }
-}
-
-func testAST(t *testing.T, expected, actual CPTree) {
- t.Helper()
-
- aTy := reflect.TypeOf(actual)
- eTy := reflect.TypeOf(expected)
- if eTy != aTy {
- t.Fatalf("unexpected node: want: %+v, got: %+v", eTy, aTy)
- }
-
- if actual == nil {
- return
- }
-
- switch e := expected.(type) {
- case *symbolNode:
- a := actual.(*symbolNode)
- if a.From != e.From || a.To != e.To {
- t.Fatalf("unexpected node: want: %+v, got: %+v", e, a)
- }
- }
- eLeft, eRight := expected.children()
- aLeft, aRight := actual.children()
- testAST(t, eLeft, aLeft)
- testAST(t, eRight, aRight)
-}
diff --git a/compiler/parser/tree.go b/compiler/parser/tree.go
deleted file mode 100644
index 04ba723..0000000
--- a/compiler/parser/tree.go
+++ /dev/null
@@ -1,459 +0,0 @@
-package parser
-
-import (
- "fmt"
- "io"
- "sort"
-
- "github.com/nihei9/maleeni/spec"
-)
-
-type CPRange struct {
- From rune
- To rune
-}
-
-type CPTree interface {
- fmt.Stringer
- Range() (rune, rune, bool)
- Optional() (CPTree, bool)
- Repeatable() (CPTree, bool)
- Concatenation() (CPTree, CPTree, bool)
- Alternatives() (CPTree, CPTree, bool)
- Describe() (spec.LexKindName, []spec.LexKindName, error)
-
- children() (CPTree, CPTree)
- clone() CPTree
-}
-
-var (
- _ CPTree = &rootNode{}
- _ CPTree = &symbolNode{}
- _ CPTree = &concatNode{}
- _ CPTree = &altNode{}
- _ CPTree = &quantifierNode{}
- _ CPTree = &fragmentNode{}
-)
-
-type rootNode struct {
- kind spec.LexKindName
- tree CPTree
- fragments map[spec.LexKindName][]*fragmentNode
-}
-
-func newRootNode(kind spec.LexKindName, t CPTree) *rootNode {
- fragments := map[spec.LexKindName][]*fragmentNode{}
- collectFragments(t, fragments)
-
- return &rootNode{
- kind: kind,
- tree: t,
- fragments: fragments,
- }
-}
-
-func collectFragments(n CPTree, fragments map[spec.LexKindName][]*fragmentNode) {
- if n == nil {
- return
- }
-
- if f, ok := n.(*fragmentNode); ok {
- fragments[f.kind] = append(fragments[f.kind], f)
- return
- }
-
- l, r := n.children()
- collectFragments(l, fragments)
- collectFragments(r, fragments)
-}
-
-func (n *rootNode) String() string {
- return fmt.Sprintf("root: %v: %v fragments", n.kind, len(n.fragments))
-}
-
-func (n *rootNode) Range() (rune, rune, bool) {
- return n.tree.Range()
-}
-
-func (n *rootNode) Optional() (CPTree, bool) {
- return n.tree.Optional()
-}
-
-func (n *rootNode) Repeatable() (CPTree, bool) {
- return n.tree.Repeatable()
-}
-
-func (n *rootNode) Concatenation() (CPTree, CPTree, bool) {
- return n.tree.Concatenation()
-}
-
-func (n *rootNode) Alternatives() (CPTree, CPTree, bool) {
- return n.tree.Alternatives()
-}
-
-func (n *rootNode) Describe() (spec.LexKindName, []spec.LexKindName, error) {
- var frags []spec.LexKindName
- for f := range n.fragments {
- frags = append(frags, spec.LexKindName(f))
- }
- sort.Slice(frags, func(i, j int) bool {
- return frags[i] < frags[j]
- })
-
- return n.kind, frags, nil
-}
-
-func (n *rootNode) children() (CPTree, CPTree) {
- return n.tree.children()
-}
-
-func (n *rootNode) clone() CPTree {
- return n.tree.clone()
-}
-
-func (n *rootNode) incomplete() bool {
- return len(n.fragments) > 0
-}
-
-func (n *rootNode) applyFragment(kind spec.LexKindName, fragment CPTree) error {
- root, ok := fragment.(*rootNode)
- if !ok {
- return fmt.Errorf("applyFragment can take only *rootNode: %T", fragment)
- }
- if root.incomplete() {
- return fmt.Errorf("fragment is incomplete")
- }
-
- fs, ok := n.fragments[kind]
- if !ok {
- return nil
- }
- for _, f := range fs {
- f.tree = root.clone()
- }
- delete(n.fragments, kind)
-
- return nil
-}
-
-type symbolNode struct {
- CPRange
-}
-
-func newSymbolNode(cp rune) *symbolNode {
- return &symbolNode{
- CPRange: CPRange{
- From: cp,
- To: cp,
- },
- }
-}
-
-func newRangeSymbolNode(from, to rune) *symbolNode {
- return &symbolNode{
- CPRange: CPRange{
- From: from,
- To: to,
- },
- }
-}
-
-func (n *symbolNode) String() string {
- return fmt.Sprintf("symbol: %X..%X", n.From, n.To)
-}
-
-func (n *symbolNode) Range() (rune, rune, bool) {
- return n.From, n.To, true
-}
-
-func (n *symbolNode) Optional() (CPTree, bool) {
- return nil, false
-}
-
-func (n *symbolNode) Repeatable() (CPTree, bool) {
- return nil, false
-}
-
-func (n *symbolNode) Concatenation() (CPTree, CPTree, bool) {
- return nil, nil, false
-}
-
-func (n *symbolNode) Alternatives() (CPTree, CPTree, bool) {
- return nil, nil, false
-}
-
-func (n *symbolNode) Describe() (spec.LexKindName, []spec.LexKindName, error) {
- return spec.LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n)
-}
-
-func (n *symbolNode) children() (CPTree, CPTree) {
- return nil, nil
-}
-
-func (n *symbolNode) clone() CPTree {
- return newRangeSymbolNode(n.From, n.To)
-}
-
-type concatNode struct {
- left CPTree
- right CPTree
-}
-
-func newConcatNode(left, right CPTree) *concatNode {
- return &concatNode{
- left: left,
- right: right,
- }
-}
-
-func (n *concatNode) String() string {
- return "concat"
-}
-
-func (n *concatNode) Range() (rune, rune, bool) {
- return 0, 0, false
-}
-
-func (n *concatNode) Optional() (CPTree, bool) {
- return nil, false
-}
-
-func (n *concatNode) Repeatable() (CPTree, bool) {
- return nil, false
-}
-
-func (n *concatNode) Concatenation() (CPTree, CPTree, bool) {
- return n.left, n.right, true
-}
-
-func (n *concatNode) Alternatives() (CPTree, CPTree, bool) {
- return nil, nil, false
-}
-
-func (n *concatNode) Describe() (spec.LexKindName, []spec.LexKindName, error) {
- return spec.LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n)
-}
-
-func (n *concatNode) children() (CPTree, CPTree) {
- return n.left, n.right
-}
-
-func (n *concatNode) clone() CPTree {
- if n == nil {
- return nil
- }
- return newConcatNode(n.left.clone(), n.right.clone())
-}
-
-type altNode struct {
- left CPTree
- right CPTree
-}
-
-func newAltNode(left, right CPTree) *altNode {
- return &altNode{
- left: left,
- right: right,
- }
-}
-
-func (n *altNode) String() string {
- return "alt"
-}
-
-func (n *altNode) Range() (rune, rune, bool) {
- return 0, 0, false
-}
-
-func (n *altNode) Optional() (CPTree, bool) {
- return nil, false
-}
-
-func (n *altNode) Repeatable() (CPTree, bool) {
- return nil, false
-}
-
-func (n *altNode) Concatenation() (CPTree, CPTree, bool) {
- return nil, nil, false
-}
-
-func (n *altNode) Alternatives() (CPTree, CPTree, bool) {
- return n.left, n.right, true
-}
-
-func (n *altNode) Describe() (spec.LexKindName, []spec.LexKindName, error) {
- return spec.LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n)
-}
-
-func (n *altNode) children() (CPTree, CPTree) {
- return n.left, n.right
-}
-
-func (n *altNode) clone() CPTree {
- return newAltNode(n.left.clone(), n.right.clone())
-}
-
-type quantifierNode struct {
- optional bool
- repeatable bool
- tree CPTree
-}
-
-func (n *quantifierNode) String() string {
- switch {
- case n.repeatable:
- return "repeatable (>= 0 times)"
- case n.optional:
- return "optional (0 or 1 times)"
- default:
- return "invalid quantifier"
- }
-}
-
-func newRepeatNode(t CPTree) *quantifierNode {
- return &quantifierNode{
- repeatable: true,
- tree: t,
- }
-}
-
-func newRepeatOneOrMoreNode(t CPTree) *concatNode {
- return newConcatNode(
- t,
- &quantifierNode{
- repeatable: true,
- tree: t.clone(),
- })
-}
-
-func newOptionNode(t CPTree) *quantifierNode {
- return &quantifierNode{
- optional: true,
- tree: t,
- }
-}
-
-func (n *quantifierNode) Range() (rune, rune, bool) {
- return 0, 0, false
-}
-
-func (n *quantifierNode) Optional() (CPTree, bool) {
- return n.tree, n.optional
-}
-
-func (n *quantifierNode) Repeatable() (CPTree, bool) {
- return n.tree, n.repeatable
-}
-
-func (n *quantifierNode) Concatenation() (CPTree, CPTree, bool) {
- return nil, nil, false
-}
-
-func (n *quantifierNode) Alternatives() (CPTree, CPTree, bool) {
- return nil, nil, false
-}
-
-func (n *quantifierNode) Describe() (spec.LexKindName, []spec.LexKindName, error) {
- return spec.LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n)
-}
-
-func (n *quantifierNode) children() (CPTree, CPTree) {
- return n.tree, nil
-}
-
-func (n *quantifierNode) clone() CPTree {
- if n.repeatable {
- return newRepeatNode(n.tree.clone())
- }
- return newOptionNode(n.tree.clone())
-}
-
-type fragmentNode struct {
- kind spec.LexKindName
- tree CPTree
-}
-
-func newFragmentNode(kind spec.LexKindName, t CPTree) *fragmentNode {
- return &fragmentNode{
- kind: kind,
- tree: t,
- }
-}
-
-func (n *fragmentNode) String() string {
- return fmt.Sprintf("fragment: %v", n.kind)
-}
-
-func (n *fragmentNode) Range() (rune, rune, bool) {
- return n.tree.Range()
-}
-
-func (n *fragmentNode) Optional() (CPTree, bool) {
- return n.tree.Optional()
-}
-
-func (n *fragmentNode) Repeatable() (CPTree, bool) {
- return n.tree.Repeatable()
-}
-
-func (n *fragmentNode) Concatenation() (CPTree, CPTree, bool) {
- return n.tree.Concatenation()
-}
-
-func (n *fragmentNode) Alternatives() (CPTree, CPTree, bool) {
- return n.tree.Alternatives()
-}
-
-func (n *fragmentNode) Describe() (spec.LexKindName, []spec.LexKindName, error) {
- return spec.LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n)
-}
-
-func (n *fragmentNode) children() (CPTree, CPTree) {
- return n.tree.children()
-}
-
-func (n *fragmentNode) clone() CPTree {
- if n.tree == nil {
- return newFragmentNode(n.kind, nil)
- }
- return newFragmentNode(n.kind, n.tree.clone())
-}
-
-//nolint:unused
-func printCPTree(w io.Writer, t CPTree, ruledLine string, childRuledLinePrefix string) {
- if t == nil {
- return
- }
- fmt.Fprintf(w, "%v%v\n", ruledLine, t)
- children := []CPTree{}
- switch n := t.(type) {
- case *rootNode:
- children = append(children, n.tree)
- case *fragmentNode:
- children = append(children, n.tree)
- default:
- left, right := t.children()
- if left != nil {
- children = append(children, left)
- }
- if right != nil {
- children = append(children, right)
- }
- }
- num := len(children)
- for i, child := range children {
- line := "└─ "
- if num > 1 {
- if i == 0 {
- line = "├─ "
- } else if i < num-1 {
- line = "│ "
- }
- }
- prefix := "│ "
- if i >= num-1 {
- prefix = " "
- }
- printCPTree(w, child, childRuledLinePrefix+line, childRuledLinePrefix+prefix)
- }
-}
diff --git a/src/tre.go b/src/tre.go
index 2350a52..ff1047e 100644
--- a/src/tre.go
+++ b/src/tre.go
@@ -1,12 +1,17 @@
package tre
import (
+ "bufio"
+ "bytes"
"encoding/binary"
"fmt"
+ "io"
"regexp"
"strconv"
"strings"
"sort"
+
+ "ucd"
)
@@ -22,6 +27,7 @@ type cpRange struct {
}
+// "github.com/nihei9/maleeni/spec"
func (b *CharBlock) String() string {
var s strings.Builder
@@ -747,3 +753,1658 @@ type CompiledLexSpec struct {
CompressionLevel int `json:"compression_level"`
Specs []*CompiledLexModeSpec `json:"specs"`
}
+
+var (
+ ParseErr = fmt.Errorf("parse error") // generic sentinel; the lexer returns it after recording the specific cause in lexer.errCause
+
+ // lexical errors: assigned to lexer.errCause by the lexer's nextIn* methods
+ synErrIncompletedEscSeq = fmt.Errorf("incompleted escape sequence; unexpected EOF following \\")
+ synErrInvalidEscSeq = fmt.Errorf("invalid escape sequence")
+ synErrInvalidCodePoint = fmt.Errorf("code points must consist of just 4 or 6 hex digits")
+ synErrCharPropInvalidSymbol = fmt.Errorf("invalid character property symbol")
+ SynErrFragmentInvalidSymbol = fmt.Errorf("invalid fragment symbol")
+
+ // syntax errors (presumably reported by the parser; their use sites are outside this section)
+ synErrUnexpectedToken = fmt.Errorf("unexpected token")
+ synErrNullPattern = fmt.Errorf("a pattern must be a non-empty byte sequence")
+ synErrUnmatchablePattern = fmt.Errorf("a pattern cannot match any characters")
+ synErrAltLackOfOperand = fmt.Errorf("an alternation expression must have operands")
+ synErrRepNoTarget = fmt.Errorf("a repeat expression must have an operand")
+ synErrGroupNoElem = fmt.Errorf("a grouping expression must include at least one character")
+ synErrGroupUnclosed = fmt.Errorf("unclosed grouping expression")
+ synErrGroupNoInitiator = fmt.Errorf(") needs preceding (")
+ synErrGroupInvalidForm = fmt.Errorf("invalid grouping expression")
+ synErrBExpNoElem = fmt.Errorf("a bracket expression must include at least one character")
+ synErrBExpUnclosed = fmt.Errorf("unclosed bracket expression")
+ synErrBExpInvalidForm = fmt.Errorf("invalid bracket expression")
+ synErrRangeInvalidOrder = fmt.Errorf("a range expression with invalid order")
+ synErrRangePropIsUnavailable = fmt.Errorf("a property expression is unavailable in a range expression")
+ synErrRangeInvalidForm = fmt.Errorf("invalid range expression")
+ synErrCPExpInvalidForm = fmt.Errorf("invalid code point expression")
+ synErrCPExpOutOfRange = fmt.Errorf("a code point must be between U+0000 to U+10FFFF")
+ synErrCharPropExpInvalidForm = fmt.Errorf("invalid character property expression")
+ synErrCharPropUnsupported = fmt.Errorf("unsupported character property")
+ synErrFragmentExpInvalidForm = fmt.Errorf("invalid fragment expression")
+)
+
+
+type incompleteFragment struct { // work-list entry used by CompleteFragments for a fragment that still has unresolved references
+ kind LexKindName // name under which the fragment is registered
+ root *rootNode // the fragment's tree; root.incomplete() is true while references remain
+}
+
+// CompleteFragments resolves references between the given fragment trees.
+//
+// Every value in fragments must be a *rootNode; anything else yields an
+// error. Fragments that are already complete are used to fill in the
+// incomplete ones, and each newly completed fragment becomes available to the
+// remaining ones. The passes repeat until nothing is left to resolve. If a
+// whole pass completes no fragment, ParseErr is returned.
+func CompleteFragments(fragments map[LexKindName]CPTree) error {
+	if len(fragments) == 0 {
+		return nil
+	}
+
+	// Partition the input into fragments that are ready to use and
+	// fragments that still wait for others.
+	resolved := map[LexKindName]CPTree{}
+	pending := []*incompleteFragment{}
+	for kind, tree := range fragments {
+		root, isRoot := tree.(*rootNode)
+		if !isRoot {
+			return fmt.Errorf("CompleteFragments can take only *rootNode: %T", tree)
+		}
+		if !root.incomplete() {
+			resolved[kind] = root
+			continue
+		}
+		pending = append(pending, &incompleteFragment{
+			kind: kind,
+			root: root,
+		})
+	}
+
+	for len(pending) > 0 {
+		before := len(pending)
+		stillPending := []*incompleteFragment{}
+		for _, frag := range pending {
+			done, err := ApplyFragments(frag.root, resolved)
+			if err != nil {
+				return err
+			}
+			if done {
+				resolved[frag.kind] = frag.root
+				continue
+			}
+			stillPending = append(stillPending, frag)
+		}
+		// No progress in a full pass: the rest can never be completed.
+		if len(stillPending) == before {
+			return ParseErr
+		}
+		pending = stillPending
+	}
+
+	return nil
+}
+
+// ApplyFragments applies every fragment in fragments to the tree t, which
+// must be a *rootNode. It reports whether t has no unresolved fragment
+// references left afterwards. The first error from applying a fragment is
+// returned as is.
+func ApplyFragments(t CPTree, fragments map[LexKindName]CPTree) (bool, error) {
+	root, isRoot := t.(*rootNode)
+	if !isRoot {
+		return false, fmt.Errorf("ApplyFragments can take only *rootNode type: %T", t)
+	}
+
+	for kind, tree := range fragments {
+		if err := root.applyFragment(kind, tree); err != nil {
+			return false, err
+		}
+	}
+
+	complete := !root.incomplete()
+	return complete, nil
+}
+
+type tokenKind string // tokenKind names the category of a token produced by the lexer
+
+const (
+ tokenKindChar tokenKind = "char" // a literal character; the rune is carried in token.char
+ tokenKindAnyChar tokenKind = "."
+ tokenKindRepeat tokenKind = "*"
+ tokenKindRepeatOneOrMore tokenKind = "+"
+ tokenKindOption tokenKind = "?"
+ tokenKindAlt tokenKind = "|"
+ tokenKindGroupOpen tokenKind = "("
+ tokenKindGroupClose tokenKind = ")"
+ tokenKindBExpOpen tokenKind = "[" // switches the lexer into bracket-expression mode
+ tokenKindInverseBExpOpen tokenKind = "[^" // emitted for '[' directly followed by '^', unless the character after that is ']'
+ tokenKindBExpClose tokenKind = "]"
+ tokenKindCharRange tokenKind = "-" // only emitted inside a bracket expression, right after a range initiator
+ tokenKindCodePointLeader tokenKind = "\\u"
+ tokenKindCharPropLeader tokenKind = "\\p"
+ tokenKindFragmentLeader tokenKind = "\\f"
+ tokenKindLBrace tokenKind = "{"
+ tokenKindRBrace tokenKind = "}" // ends a code point, character property, or fragment expression
+ tokenKindEqual tokenKind = "="
+ tokenKindCodePoint tokenKind = "code point" // the hex digits are carried in token.codePoint
+ tokenKindCharPropSymbol tokenKind = "character property symbol" // the text is carried in token.propSymbol
+ tokenKindFragmentSymbol tokenKind = "fragment symbol" // the text is carried in token.fragmentSymbol
+ tokenKindEOF tokenKind = "eof" // returned by lexer.next once the input is exhausted
+)
+
+type token struct { // token is one lexical unit returned by lexer.next
+ kind tokenKind
+ char rune // set by newToken; nullChar when the token carries no character
+ propSymbol string // set by newCharPropSymbolToken
+ codePoint string // hex-digit text, set by newCodePointToken
+ fragmentSymbol string // set by newFragmentSymbolToken
+}
+
+const nullChar = '\u0000' // placeholder rune: used for tokens without a character and for empty look-ahead/history slots in the lexer
+
+// newToken builds a token of the given kind that carries char. Callers pass
+// nullChar when the token has no associated character.
+func newToken(kind tokenKind, char rune) *token {
+	tok := new(token)
+	tok.kind = kind
+	tok.char = char
+	return tok
+}
+
+// newCodePointToken builds a tokenKindCodePoint token holding the given
+// code point text.
+func newCodePointToken(codePoint string) *token {
+	tok := new(token)
+	tok.kind = tokenKindCodePoint
+	tok.codePoint = codePoint
+	return tok
+}
+
+// newCharPropSymbolToken builds a tokenKindCharPropSymbol token holding the
+// given property symbol.
+func newCharPropSymbolToken(propSymbol string) *token {
+	tok := new(token)
+	tok.kind = tokenKindCharPropSymbol
+	tok.propSymbol = propSymbol
+	return tok
+}
+
+// newFragmentSymbolToken builds a tokenKindFragmentSymbol token holding the
+// given fragment symbol.
+func newFragmentSymbolToken(fragmentSymbol string) *token {
+	tok := new(token)
+	tok.kind = tokenKindFragmentSymbol
+	tok.fragmentSymbol = fragmentSymbol
+	return tok
+}
+
+type lexerMode string // lexerMode selects which nextIn* method lexer.next dispatches to
+
+const (
+ lexerModeDefault lexerMode = "default" // handled by nextInDefault
+ lexerModeBExp lexerMode = "bracket expression" // handled by nextInBExp
+ lexerModeCPExp lexerMode = "code point expression" // handled by nextInCodePoint
+ lexerModeCharPropExp lexerMode = "character property expression" // handled by nextInCharProp
+ lexerModeFragmentExp lexerMode = "fragment expression" // handled by nextInFragment
+)
+
+type lexerModeStack struct { // lexerModeStack tracks nested lexer modes; the last element is the current mode
+ stack []lexerMode
+}
+
+// newLexerModeStack returns a stack whose only entry is lexerModeDefault.
+func newLexerModeStack() *lexerModeStack {
+	initial := []lexerMode{lexerModeDefault}
+	return &lexerModeStack{stack: initial}
+}
+
+// top returns the most recently pushed mode without removing it.
+func (s *lexerModeStack) top() lexerMode {
+	last := len(s.stack) - 1
+	return s.stack[last]
+}
+
+// push makes m the current mode.
+func (s *lexerModeStack) push(m lexerMode) {
+	grown := append(s.stack, m)
+	s.stack = grown
+}
+
+// pop discards the current mode, making the previously pushed one current.
+// Calling it on an empty stack panics.
+func (s *lexerModeStack) pop() {
+	depth := len(s.stack)
+	s.stack = s.stack[:depth-1]
+}
+
+type rangeState string // rangeState tracks where the lexer is within a possible "a-z" range inside a bracket expression
+
+// [a-z]
+// ^^^^
+// |||`-- ready
+// ||`-- expect range terminator
+// |`-- read range initiator
+// `-- ready
+const (
+ rangeStateReady rangeState = "ready" // no range in progress; the next character may start one
+ rangeStateReadRangeInitiator rangeState = "read range initiator" // a character was read; only in this state is '-' lexed as a range operator
+ rangeStateExpectRangeTerminator rangeState = "expect range terminator" // a range operator was read; the next character returns the state to ready
+)
+
+type lexer struct { // lexer turns a pattern into tokens, with up to two characters of push-back via restore()
+ src *bufio.Reader
+ peekChar2 rune // second pushed-back character; moves into peekChar1 when read() consumes peekChar1
+ peekEOF2 bool
+ peekChar1 rune // next pushed-back character; read() returns it before touching src
+ peekEOF1 bool
+ lastChar rune // result of the most recent read()
+ reachedEOF bool // true once read() has hit EOF; read() then keeps returning EOF
+ prevChar1 rune // value of lastChar before the most recent read(); restore() moves it back into lastChar
+ prevEOF1 bool
+ prevChar2 rune // value of prevChar1 before the most recent read()
+ pervEOF2 bool // NOTE(review): spelled "perv", apparently a typo for prevEOF2; it is the EOF flag paired with prevChar2
+ modeStack *lexerModeStack
+ rangeState rangeState // only meaningful while in bracket-expression mode
+
+ errCause error // specific cause recorded before returning ParseErr; exposed through error()
+ errDetail string // optional human-readable detail accompanying errCause
+}
+
+// newLexer returns a lexer that reads from src, starting in the default mode
+// with no range in progress.
+func newLexer(src io.Reader) *lexer {
+	// All look-ahead and history slots start out empty. nullChar is the zero
+	// rune and the EOF flags default to false, so the zero values of those
+	// fields are exactly the required initial state.
+	return &lexer{
+		src:        bufio.NewReader(src),
+		modeStack:  newLexerModeStack(),
+		rangeState: rangeStateReady,
+	}
+}
+
+// error returns the detail message and the cause recorded for the most
+// recent lexical error.
+func (l *lexer) error() (string, error) {
+	detail, cause := l.errDetail, l.errCause
+	return detail, cause
+}
+
+func (l *lexer) next() (*token, error) {
+ c, eof, err := l.read()
+ if err != nil {
+ return nil, err
+ }
+ if eof {
+ return newToken(tokenKindEOF, nullChar), nil
+ }
+
+ switch l.modeStack.top() {
+ case lexerModeBExp:
+ tok, err := l.nextInBExp(c)
+ if err != nil {
+ return nil, err
+ }
+ if tok.kind == tokenKindChar || tok.kind == tokenKindCodePointLeader || tok.kind == tokenKindCharPropLeader {
+ switch l.rangeState {
+ case rangeStateReady:
+ l.rangeState = rangeStateReadRangeInitiator
+ case rangeStateExpectRangeTerminator:
+ l.rangeState = rangeStateReady
+ }
+ }
+ switch tok.kind {
+ case tokenKindBExpClose:
+ l.modeStack.pop()
+ case tokenKindCharRange:
+ l.rangeState = rangeStateExpectRangeTerminator
+ case tokenKindCodePointLeader:
+ l.modeStack.push(lexerModeCPExp)
+ case tokenKindCharPropLeader:
+ l.modeStack.push(lexerModeCharPropExp)
+ }
+ return tok, nil
+ case lexerModeCPExp:
+ tok, err := l.nextInCodePoint(c)
+ if err != nil {
+ return nil, err
+ }
+ switch tok.kind {
+ case tokenKindRBrace:
+ l.modeStack.pop()
+ }
+ return tok, nil
+ case lexerModeCharPropExp:
+ tok, err := l.nextInCharProp(c)
+ if err != nil {
+ return nil, err
+ }
+ switch tok.kind {
+ case tokenKindRBrace:
+ l.modeStack.pop()
+ }
+ return tok, nil
+ case lexerModeFragmentExp:
+ tok, err := l.nextInFragment(c)
+ if err != nil {
+ return nil, err
+ }
+ switch tok.kind {
+ case tokenKindRBrace:
+ l.modeStack.pop()
+ }
+ return tok, nil
+ default:
+ tok, err := l.nextInDefault(c)
+ if err != nil {
+ return nil, err
+ }
+ switch tok.kind {
+ case tokenKindBExpOpen:
+ l.modeStack.push(lexerModeBExp)
+ l.rangeState = rangeStateReady
+ case tokenKindInverseBExpOpen:
+ l.modeStack.push(lexerModeBExp)
+ l.rangeState = rangeStateReady
+ case tokenKindCodePointLeader:
+ l.modeStack.push(lexerModeCPExp)
+ case tokenKindCharPropLeader:
+ l.modeStack.push(lexerModeCharPropExp)
+ case tokenKindFragmentLeader:
+ l.modeStack.push(lexerModeFragmentExp)
+ }
+ return tok, nil
+ }
+}
+
+func (l *lexer) nextInDefault(c rune) (*token, error) {
+ switch c {
+ case '*':
+ return newToken(tokenKindRepeat, nullChar), nil
+ case '+':
+ return newToken(tokenKindRepeatOneOrMore, nullChar), nil
+ case '?':
+ return newToken(tokenKindOption, nullChar), nil
+ case '.':
+ return newToken(tokenKindAnyChar, nullChar), nil
+ case '|':
+ return newToken(tokenKindAlt, nullChar), nil
+ case '(':
+ return newToken(tokenKindGroupOpen, nullChar), nil
+ case ')':
+ return newToken(tokenKindGroupClose, nullChar), nil
+ case '[':
+ c1, eof, err := l.read()
+ if err != nil {
+ return nil, err
+ }
+ if eof {
+ err := l.restore()
+ if err != nil {
+ return nil, err
+ }
+ return newToken(tokenKindBExpOpen, nullChar), nil
+ }
+ if c1 != '^' {
+ err := l.restore()
+ if err != nil {
+ return nil, err
+ }
+ return newToken(tokenKindBExpOpen, nullChar), nil
+ }
+ c2, eof, err := l.read()
+ if err != nil {
+ return nil, err
+ }
+ if eof {
+ err := l.restore()
+ if err != nil {
+ return nil, err
+ }
+ return newToken(tokenKindInverseBExpOpen, nullChar), nil
+ }
+ if c2 != ']' {
+ err := l.restore()
+ if err != nil {
+ return nil, err
+ }
+ return newToken(tokenKindInverseBExpOpen, nullChar), nil
+ }
+ err = l.restore()
+ if err != nil {
+ return nil, err
+ }
+ err = l.restore()
+ if err != nil {
+ return nil, err
+ }
+ return newToken(tokenKindBExpOpen, nullChar), nil
+ case '\\':
+ c, eof, err := l.read()
+ if err != nil {
+ return nil, err
+ }
+ if eof {
+ l.errCause = synErrIncompletedEscSeq
+ return nil, ParseErr
+ }
+ if c == 'u' {
+ return newToken(tokenKindCodePointLeader, nullChar), nil
+ }
+ if c == 'p' {
+ return newToken(tokenKindCharPropLeader, nullChar), nil
+ }
+ if c == 'f' {
+ return newToken(tokenKindFragmentLeader, nullChar), nil
+ }
+ if c == '\\' || c == '.' || c == '*' || c == '+' || c == '?' || c == '|' || c == '(' || c == ')' || c == '[' || c == ']' {
+ return newToken(tokenKindChar, c), nil
+ }
+ l.errCause = synErrInvalidEscSeq
+ l.errDetail = fmt.Sprintf("\\%v is not supported", string(c))
+ return nil, ParseErr
+ default:
+ return newToken(tokenKindChar, c), nil
+ }
+}
+
+// nextInBExp lexes the character c inside a bracket expression.
+//
+// '-' is a range operator only when a range initiator has just been read and
+// the '-' is not the last character before ']' or the end of the input;
+// otherwise it is an ordinary character. A backslash starts "\u", "\p", or
+// an escaped '\\', '^', '-', or ']'.
+func (l *lexer) nextInBExp(c rune) (*token, error) {
+	switch c {
+	case '-':
+		if l.rangeState != rangeStateReadRangeInitiator {
+			return newToken(tokenKindChar, c), nil
+		}
+
+		// Peek at the following character; it is pushed back in every case.
+		following, eof, err := l.read()
+		if err != nil {
+			return nil, err
+		}
+		if err := l.restore(); err != nil {
+			return nil, err
+		}
+		if eof || following == ']' {
+			return newToken(tokenKindChar, c), nil
+		}
+		return newToken(tokenKindCharRange, nullChar), nil
+	case ']':
+		return newToken(tokenKindBExpClose, nullChar), nil
+	case '\\':
+		esc, eof, err := l.read()
+		if err != nil {
+			return nil, err
+		}
+		if eof {
+			l.errCause = synErrIncompletedEscSeq
+			return nil, ParseErr
+		}
+		switch esc {
+		case 'u':
+			return newToken(tokenKindCodePointLeader, nullChar), nil
+		case 'p':
+			return newToken(tokenKindCharPropLeader, nullChar), nil
+		case '\\', '^', '-', ']':
+			return newToken(tokenKindChar, esc), nil
+		}
+		l.errCause = synErrInvalidEscSeq
+		l.errDetail = fmt.Sprintf("\\%v is not supported in a bracket expression", string(esc))
+		return nil, ParseErr
+	}
+
+	return newToken(tokenKindChar, c), nil
+}
+
+// nextInCodePoint lexes the character c inside a code point expression
+// ("\u{...}").
+//
+// Braces become their own tokens. Otherwise c must start a run of characters
+// accepted by isHexDigit, which is collected up to (but not including) the
+// closing '}' or the end of the input. The run must be exactly 4 or 6
+// characters long; any violation records synErrInvalidCodePoint and returns
+// ParseErr.
+func (l *lexer) nextInCodePoint(c rune) (*token, error) {
+	switch c {
+	case '{':
+		return newToken(tokenKindLBrace, nullChar), nil
+	case '}':
+		return newToken(tokenKindRBrace, nullChar), nil
+	}
+
+	if !isHexDigit(c) {
+		l.errCause = synErrInvalidCodePoint
+		return nil, ParseErr
+	}
+
+	var digits strings.Builder
+	digits.WriteRune(c)
+	for count := 1; ; count++ {
+		next, eof, err := l.read()
+		if err != nil {
+			return nil, err
+		}
+		if eof || next == '}' {
+			// Leave the terminator for the caller to lex.
+			if err := l.restore(); err != nil {
+				return nil, err
+			}
+			break
+		}
+		if count >= 6 || !isHexDigit(next) {
+			l.errCause = synErrInvalidCodePoint
+			return nil, ParseErr
+		}
+		digits.WriteRune(next)
+	}
+
+	cp := digits.String()
+	if size := len(cp); size != 4 && size != 6 {
+		l.errCause = synErrInvalidCodePoint
+		return nil, ParseErr
+	}
+	return newCodePointToken(cp), nil
+}
+
+// isHexDigit reports whether c is an ASCII hexadecimal digit: '0'-'9',
+// 'A'-'F', or 'a'-'f'.
+//
+// The letter ranges stop at 'F'/'f'. Accepting every ASCII letter would let
+// text such as "ZZZZ" through as a code point.
+func isHexDigit(c rune) bool {
+	switch {
+	case c >= '0' && c <= '9':
+		return true
+	case c >= 'A' && c <= 'F':
+		return true
+	case c >= 'a' && c <= 'f':
+		return true
+	}
+	return false
+}
+
+// nextInCharProp lexes the character c inside a character property
+// expression ("\p{...}").
+//
+// '{', '}' and '=' become their own tokens. Any other character starts a
+// symbol that extends up to (but not including) the next '}' or '=' or the
+// end of the input. Surrounding white space is trimmed from the symbol; if
+// nothing remains, synErrCharPropInvalidSymbol is recorded and ParseErr is
+// returned.
+func (l *lexer) nextInCharProp(c rune) (*token, error) {
+	switch c {
+	case '{':
+		return newToken(tokenKindLBrace, nullChar), nil
+	case '}':
+		return newToken(tokenKindRBrace, nullChar), nil
+	case '=':
+		return newToken(tokenKindEqual, nullChar), nil
+	}
+
+	var b strings.Builder
+	b.WriteRune(c)
+	for {
+		next, eof, err := l.read()
+		if err != nil {
+			return nil, err
+		}
+		if eof || next == '}' || next == '=' {
+			// Leave the terminator for the caller to lex.
+			if err := l.restore(); err != nil {
+				return nil, err
+			}
+			break
+		}
+		b.WriteRune(next)
+	}
+
+	sym := strings.TrimSpace(b.String())
+	if len(sym) == 0 {
+		l.errCause = synErrCharPropInvalidSymbol
+		return nil, ParseErr
+	}
+	return newCharPropSymbolToken(sym), nil
+}
+
+// nextInFragment lexes the character c inside a fragment expression
+// ("\f{...}").
+//
+// Braces become their own tokens. Any other character starts a symbol that
+// extends up to (but not including) the next '}' or the end of the input.
+// Surrounding white space is trimmed from the symbol; if nothing remains,
+// SynErrFragmentInvalidSymbol is recorded and ParseErr is returned.
+func (l *lexer) nextInFragment(c rune) (*token, error) {
+	switch c {
+	case '{':
+		return newToken(tokenKindLBrace, nullChar), nil
+	case '}':
+		return newToken(tokenKindRBrace, nullChar), nil
+	}
+
+	var b strings.Builder
+	b.WriteRune(c)
+	for {
+		next, eof, err := l.read()
+		if err != nil {
+			return nil, err
+		}
+		if eof || next == '}' {
+			// Leave the terminator for the caller to lex.
+			if err := l.restore(); err != nil {
+				return nil, err
+			}
+			break
+		}
+		b.WriteRune(next)
+	}
+
+	sym := strings.TrimSpace(b.String())
+	if len(sym) == 0 {
+		l.errCause = SynErrFragmentInvalidSymbol
+		return nil, ParseErr
+	}
+	return newFragmentSymbolToken(sym), nil
+}
+
+// read returns the next character, whether EOF was reached, and any read
+// error other than io.EOF.
+//
+// While l.reachedEOF is set the current state is returned without reading.
+// Characters pushed back by restore() (the peek slots) are served before
+// l.src is consulted. Every successful step shifts the previous two
+// characters into the prev slots so that restore() can rewind.
+func (l *lexer) read() (rune, bool, error) {
+	if l.reachedEOF {
+		return l.lastChar, l.reachedEOF, nil
+	}
+
+	// advance shifts the history by one slot and records (c, eof) as the
+	// most recently read character.
+	advance := func(c rune, eof bool) {
+		l.prevChar2, l.pervEOF2 = l.prevChar1, l.prevEOF1
+		l.prevChar1, l.prevEOF1 = l.lastChar, l.reachedEOF
+		l.lastChar, l.reachedEOF = c, eof
+	}
+
+	if l.peekChar1 != nullChar || l.peekEOF1 {
+		advance(l.peekChar1, l.peekEOF1)
+		l.peekChar1, l.peekEOF1 = l.peekChar2, l.peekEOF2
+		l.peekChar2, l.peekEOF2 = nullChar, false
+		return l.lastChar, l.reachedEOF, nil
+	}
+
+	c, _, err := l.src.ReadRune()
+	switch {
+	case err == nil:
+		advance(c, false)
+	case err == io.EOF:
+		// EOF is not an error for the lexer; it is recorded as state.
+		advance(nullChar, true)
+	default:
+		return nullChar, false, err
+	}
+	return l.lastChar, l.reachedEOF, nil
+}
+
+// restore pushes the most recently read character back so that the next
+// read() returns it again.
+//
+// The last character moves into the first peek slot (the former first peek
+// moves to the second), and the previous characters move one slot towards
+// "last". It fails when there is no last character to push back, i.e. the
+// last character is nullChar and EOF has not been reached.
+func (l *lexer) restore() error {
+	if l.lastChar == nullChar && !l.reachedEOF {
+		return fmt.Errorf("failed to call restore() because the last character is null")
+	}
+	l.peekChar2, l.peekEOF2 = l.peekChar1, l.peekEOF1
+	l.peekChar1, l.peekEOF1 = l.lastChar, l.reachedEOF
+	l.lastChar, l.reachedEOF = l.prevChar1, l.prevEOF1
+	l.prevChar1, l.prevEOF1 = l.prevChar2, l.pervEOF2
+	l.prevChar2, l.pervEOF2 = nullChar, false
+	return nil
+}
+
+// PatternEntry pairs a pattern, given as raw bytes, with a LexModeKindID.
+type PatternEntry struct {
+ // ID is the LexModeKindID that goes with Pattern.
+ ID LexModeKindID
+ // Pattern is the pattern source as a byte sequence.
+ Pattern []byte
+}
+
+// parser is a recursive-descent parser that turns a pattern into a CPTree.
+// Parse errors are raised by panicking with ParseErr (see raiseParseError) and
+// are recovered in Parse; the cause and detail are then available via Error.
+type parser struct {
+ // kind is the name given to the root node that Parse produces.
+ kind LexKindName
+ // lex supplies the tokens.
+ lex *lexer
+ // peekedTok holds a token that consume() fetched but did not accept.
+ peekedTok *token
+ // lastTok is the token most recently accepted by consume(); it is nil
+ // after a rejected consume().
+ lastTok *token
+
+ // If and only if isContributoryPropertyExposed is true, the parser interprets contributory properties that
+ // appear in property expressions.
+ //
+ // The contributory properties are not exposed, and users cannot use those properties because the parser
+ // follows [UAX #44 5.13 Property APIs]. For instance, \p{Other_Alphabetic} is invalid.
+ //
+ // isContributoryPropertyExposed is set to true when the parser is generated recursively. The parser needs to
+ // interpret derived properties internally because the derived properties consist of other properties that
+ // may contain the contributory properties.
+ //
+ // [UAX #44 5.13 Property APIs] says:
+ // > The following subtypes of Unicode character properties should generally not be exposed in APIs,
+ // > except in limited circumstances. They may not be useful, particularly in public API collections,
+ // > and may instead prove misleading to the users of such API collections.
+ // > * Contributory properties are not recommended for public APIs.
+ // > ...
+ // https://unicode.org/reports/tr44/#Property_APIs
+ isContributoryPropertyExposed bool
+
+ // errCause and errDetail describe the last error raised by raiseParseError.
+ errCause error
+ errDetail string
+}
+
+// NewParser returns a parser that reads a pattern from src and labels the
+// resulting tree with kind. Contributory properties are not exposed by
+// default.
+func NewParser(kind LexKindName, src io.Reader) *parser {
+	p := new(parser)
+	p.kind = kind
+	p.lex = newLexer(src)
+	p.isContributoryPropertyExposed = false
+	return p
+}
+
+// exposeContributoryProperty lets the parser accept contributory properties in
+// property expressions. parseCharProp enables it on the nested parser it
+// creates for a normalized property pattern.
+func (p *parser) exposeContributoryProperty() {
+ p.isContributoryPropertyExposed = true
+}
+
+// Error returns the detail text and the cause recorded by the most recent
+// raiseParseError call, in that order.
+func (p *parser) Error() (string, error) {
+ return p.errDetail, p.errCause
+}
+
+func (p *parser) Parse() (root CPTree, retErr error) {
+ defer func() {
+ err := recover()
+ if err != nil {
+ var ok bool
+ retErr, ok = err.(error)
+ if !ok {
+ panic(err)
+ }
+ return
+ }
+ }()
+
+ return newRootNode(p.kind, p.parseRegexp()), nil
+}
+
+// parseRegexp parses a complete pattern: one alternation followed by EOF.
+//
+// A ')' found where the alternation ends is reported as
+// synErrGroupNoInitiator; an empty pattern is reported as synErrNullPattern.
+func (p *parser) parseRegexp() CPTree {
+	tree := p.parseAlt()
+	// A dangling ')' takes precedence over the empty-pattern error.
+	if p.consume(tokenKindGroupClose) {
+		p.raiseParseError(synErrGroupNoInitiator, "")
+	}
+	if tree == nil {
+		p.raiseParseError(synErrNullPattern, "")
+	}
+	p.expect(tokenKindEOF)
+	return tree
+}
+
+// parseAlt parses concatenations separated by '|' and combines them into a
+// left-leaning chain of alternation nodes. It returns nil when nothing could
+// be parsed. A '|' that lacks an operand on either side is reported as
+// synErrAltLackOfOperand.
+func (p *parser) parseAlt() CPTree {
+	tree := p.parseConcat()
+	if tree == nil {
+		if p.consume(tokenKindAlt) {
+			p.raiseParseError(synErrAltLackOfOperand, "")
+		}
+		return nil
+	}
+	for p.consume(tokenKindAlt) {
+		operand := p.parseConcat()
+		if operand == nil {
+			p.raiseParseError(synErrAltLackOfOperand, "")
+		}
+		tree = newAltNode(tree, operand)
+	}
+	return tree
+}
+
+// parseConcat parses a run of repeat expressions and combines them into a
+// left-leaning chain of concatenation nodes. The first expression is
+// returned as is when it is the only one.
+func (p *parser) parseConcat() CPTree {
+	tree := p.parseRepeat()
+	for next := p.parseRepeat(); next != nil; next = p.parseRepeat() {
+		tree = newConcatNode(tree, next)
+	}
+	return tree
+}
+
+// parseRepeat parses a group optionally followed by one of the quantifiers
+// '*', '+' or '?'. It returns nil when no group is present; a quantifier
+// without an operand is reported as synErrRepNoTarget.
+func (p *parser) parseRepeat() CPTree {
+	operand := p.parseGroup()
+	if operand == nil {
+		// raiseParseError panics, so at most one of these cases fires.
+		switch {
+		case p.consume(tokenKindRepeat):
+			p.raiseParseError(synErrRepNoTarget, "* needs an operand")
+		case p.consume(tokenKindRepeatOneOrMore):
+			p.raiseParseError(synErrRepNoTarget, "+ needs an operand")
+		case p.consume(tokenKindOption):
+			p.raiseParseError(synErrRepNoTarget, "? needs an operand")
+		}
+		return nil
+	}
+
+	switch {
+	case p.consume(tokenKindRepeat):
+		return newRepeatNode(operand)
+	case p.consume(tokenKindRepeatOneOrMore):
+		return newRepeatOneOrMoreNode(operand)
+	case p.consume(tokenKindOption):
+		return newOptionNode(operand)
+	default:
+		return operand
+	}
+}
+
+// parseGroup parses a parenthesized alternation, or falls back to a single
+// character expression when the next token is not '('.
+//
+// Reaching EOF inside the group is reported as synErrGroupUnclosed, an empty
+// group as synErrGroupNoElem, and a group not terminated by ')' as
+// synErrGroupInvalidForm.
+func (p *parser) parseGroup() CPTree {
+	if !p.consume(tokenKindGroupOpen) {
+		return p.parseSingleChar()
+	}
+
+	inner := p.parseAlt()
+	// An unexpected EOF takes precedence over the empty-group error.
+	if p.consume(tokenKindEOF) {
+		p.raiseParseError(synErrGroupUnclosed, "")
+	}
+	if inner == nil {
+		p.raiseParseError(synErrGroupNoElem, "")
+	}
+	if !p.consume(tokenKindGroupClose) {
+		p.raiseParseError(synErrGroupInvalidForm, "")
+	}
+	return inner
+}
+
+// parseSingleChar parses an expression that matches exactly one character:
+// the any-character symbol, a bracket expression, an inverse bracket
+// expression, a code point expression, a character property expression, a
+// fragment expression or an ordinary character.
+//
+// It returns nil when the next token starts none of these. A stray ']' in
+// that position is reported as synErrBExpInvalidForm.
+func (p *parser) parseSingleChar() CPTree {
+	switch {
+	case p.consume(tokenKindAnyChar):
+		return genAnyCharAST()
+
+	case p.consume(tokenKindBExpOpen):
+		// [...] is the alternation of all of its elements.
+		union := p.parseBExpElem()
+		if union == nil {
+			if p.consume(tokenKindEOF) {
+				p.raiseParseError(synErrBExpUnclosed, "")
+			}
+			p.raiseParseError(synErrBExpNoElem, "")
+		}
+		for elem := p.parseBExpElem(); elem != nil; elem = p.parseBExpElem() {
+			union = newAltNode(union, elem)
+		}
+		if p.consume(tokenKindEOF) {
+			p.raiseParseError(synErrBExpUnclosed, "")
+		}
+		p.expect(tokenKindBExpClose)
+		return union
+
+	case p.consume(tokenKindInverseBExpOpen):
+		// [^...] starts from every code point and removes each element.
+		first := p.parseBExpElem()
+		if first == nil {
+			if p.consume(tokenKindEOF) {
+				p.raiseParseError(synErrBExpUnclosed, "")
+			}
+			p.raiseParseError(synErrBExpNoElem, "")
+		}
+		rest := exclude(first, genAnyCharAST())
+		if rest == nil {
+			p.raiseParseError(synErrUnmatchablePattern, "")
+		}
+		for elem := p.parseBExpElem(); elem != nil; elem = p.parseBExpElem() {
+			rest = exclude(elem, rest)
+			if rest == nil {
+				p.raiseParseError(synErrUnmatchablePattern, "")
+			}
+		}
+		if p.consume(tokenKindEOF) {
+			p.raiseParseError(synErrBExpUnclosed, "")
+		}
+		p.expect(tokenKindBExpClose)
+		return rest
+
+	case p.consume(tokenKindCodePointLeader):
+		return p.parseCodePoint()
+
+	case p.consume(tokenKindCharPropLeader):
+		return p.parseCharProp()
+
+	case p.consume(tokenKindFragmentLeader):
+		return p.parseFragment()
+	}
+
+	char := p.parseNormalChar()
+	if char != nil {
+		return char
+	}
+	if p.consume(tokenKindBExpClose) {
+		p.raiseParseError(synErrBExpInvalidForm, "")
+	}
+	return nil
+}
+
+// parseBExpElem parses one element of a bracket expression: a single
+// character, code point or character property, or a range "from-to" whose
+// ends are characters or code points.
+//
+// It returns nil when no element is present. A character property on either
+// side of a range is reported as synErrRangePropIsUnavailable, a missing
+// upper end as synErrRangeInvalidForm, and an upper end that is less than
+// the lower end as synErrRangeInvalidOrder.
+func (p *parser) parseBExpElem() CPTree {
+	var lower CPTree
+	if p.consume(tokenKindCodePointLeader) {
+		lower = p.parseCodePoint()
+	} else if p.consume(tokenKindCharPropLeader) {
+		lower = p.parseCharProp()
+		if p.consume(tokenKindCharRange) {
+			p.raiseParseError(synErrRangePropIsUnavailable, "")
+		}
+	} else {
+		lower = p.parseNormalChar()
+	}
+	if lower == nil {
+		return nil
+	}
+
+	// Without a range operator the element stands for itself.
+	if !p.consume(tokenKindCharRange) {
+		return lower
+	}
+
+	var upper CPTree
+	if p.consume(tokenKindCodePointLeader) {
+		upper = p.parseCodePoint()
+	} else if p.consume(tokenKindCharPropLeader) {
+		p.raiseParseError(synErrRangePropIsUnavailable, "")
+	} else {
+		upper = p.parseNormalChar()
+	}
+	if upper == nil {
+		p.raiseParseError(synErrRangeInvalidForm, "")
+	}
+
+	from, _, _ := lower.Range()
+	_, to, _ := upper.Range()
+	if !isValidOrder(from, to) {
+		p.raiseParseError(synErrRangeInvalidOrder, fmt.Sprintf("%X..%X", from, to))
+	}
+	return newRangeSymbolNode(from, to)
+}
+
+// parseCodePoint parses the "{XXXX}" part of a code point expression (the
+// leader has already been consumed) and returns a symbol node for that code
+// point.
+//
+// A malformed expression is reported as synErrCPExpInvalidForm and a value
+// outside 0..0x10FFFF as synErrCPExpOutOfRange.
+func (p *parser) parseCodePoint() CPTree {
+	if !p.consume(tokenKindLBrace) || !p.consume(tokenKindCodePoint) {
+		p.raiseParseError(synErrCPExpInvalidForm, "")
+	}
+
+	digits := p.lastTok.codePoint
+	cp, err := strconv.ParseInt(digits, 16, 64)
+	if err != nil {
+		panic(fmt.Errorf("failed to decode a code point (%v) into a int: %v", digits, err))
+	}
+	if cp < 0x0000 || cp > 0x10FFFF {
+		p.raiseParseError(synErrCPExpOutOfRange, "")
+	}
+	node := newSymbolNode(rune(cp))
+
+	if !p.consume(tokenKindRBrace) {
+		p.raiseParseError(synErrCPExpInvalidForm, "")
+	}
+	return node
+}
+
+// parseCharProp parses the "{name=value}" or "{value}" part of a character
+// property expression (the leader has already been consumed) and returns a
+// tree matching the code points that have the property.
+//
+// When ucd.NormalizeCharacterProperty yields a pattern, that pattern is parsed
+// by a nested parser with contributory properties exposed. Otherwise the code
+// point ranges come from ucd.FindCodePointRanges; if it marks them as
+// inverse, the result is every code point except those ranges.
+//
+// A malformed expression is reported as synErrCharPropExpInvalidForm, an
+// unsupported property as synErrCharPropUnsupported, and an inverse that
+// leaves no code point as synErrUnmatchablePattern.
+func (p *parser) parseCharProp() CPTree {
+	if !p.consume(tokenKindLBrace) {
+		p.raiseParseError(synErrCharPropExpInvalidForm, "")
+	}
+	var sym1, sym2 string
+	if !p.consume(tokenKindCharPropSymbol) {
+		p.raiseParseError(synErrCharPropExpInvalidForm, "")
+	}
+	sym1 = p.lastTok.propSymbol
+	if p.consume(tokenKindEqual) {
+		if !p.consume(tokenKindCharPropSymbol) {
+			p.raiseParseError(synErrCharPropExpInvalidForm, "")
+		}
+		sym2 = p.lastTok.propSymbol
+	}
+
+	// A lone symbol is a property value with an empty property name.
+	propName, propVal := "", sym1
+	if sym2 != "" {
+		propName, propVal = sym1, sym2
+	}
+	if !p.isContributoryPropertyExposed && ucd.IsContributoryProperty(propName) {
+		p.raiseParseError(synErrCharPropUnsupported, propName)
+	}
+	pat, err := ucd.NormalizeCharacterProperty(propName, propVal)
+	if err != nil {
+		p.raiseParseError(synErrCharPropUnsupported, err.Error())
+	}
+
+	var alt CPTree
+	if pat != "" {
+		// The nested parser gets its own name so that it does not shadow the
+		// receiver.
+		sub := NewParser(p.kind, bytes.NewReader([]byte(pat)))
+		sub.exposeContributoryProperty()
+		ast, err := sub.Parse()
+		if err != nil {
+			panic(err)
+		}
+		alt = ast
+	} else {
+		cpRanges, inverse, err := ucd.FindCodePointRanges(propName, propVal)
+		if err != nil {
+			p.raiseParseError(synErrCharPropUnsupported, err.Error())
+		}
+		if inverse {
+			// Start from every code point and remove each range in turn.
+			// Folding from the full range avoids indexing cpRanges[0], so an
+			// empty list no longer causes an index-out-of-range panic.
+			alt = genAnyCharAST()
+			for _, r := range cpRanges {
+				alt = exclude(newRangeSymbolNode(r.From, r.To), alt)
+				if alt == nil {
+					p.raiseParseError(synErrUnmatchablePattern, "")
+				}
+			}
+		} else {
+			for _, r := range cpRanges {
+				alt = genAltNode(
+					alt,
+					newRangeSymbolNode(r.From, r.To),
+				)
+			}
+		}
+	}
+
+	if !p.consume(tokenKindRBrace) {
+		p.raiseParseError(synErrCharPropExpInvalidForm, "")
+	}
+
+	return alt
+}
+
+// parseFragment parses the "{name}" part of a fragment expression (the
+// leader has already been consumed) and returns a fragment node that refers
+// to name and has no tree attached yet. A malformed expression is reported
+// as synErrFragmentExpInvalidForm.
+func (p *parser) parseFragment() CPTree {
+	if !p.consume(tokenKindLBrace) || !p.consume(tokenKindFragmentSymbol) {
+		p.raiseParseError(synErrFragmentExpInvalidForm, "")
+	}
+	name := p.lastTok.fragmentSymbol
+	if !p.consume(tokenKindRBrace) {
+		p.raiseParseError(synErrFragmentExpInvalidForm, "")
+	}
+	return newFragmentNode(LexKindName(name), nil)
+}
+
+// parseNormalChar returns a symbol node for the next token when it is an
+// ordinary character, and nil otherwise.
+func (p *parser) parseNormalChar() CPTree {
+	if p.consume(tokenKindChar) {
+		return newSymbolNode(p.lastTok.char)
+	}
+	return nil
+}
+
+// exclude returns a tree that matches every code point matched by base except
+// those matched by symbol. It returns nil when nothing remains.
+//
+// symbol and base must be built from range symbols and alternations only; any
+// other node causes a panic. A nil base means "nothing left to match" and
+// yields nil. This happens when symbol is an alternation and excluding its
+// left side already consumed all of base; without this guard the recursive
+// call would dereference the nil base.
+func exclude(symbol, base CPTree) CPTree {
+	if base == nil {
+		return nil
+	}
+
+	// Excluding (a|b) is excluding a and then b from what remains.
+	if left, right, ok := symbol.Alternatives(); ok {
+		return exclude(right, exclude(left, base))
+	}
+
+	// Excluding from (x|y) is excluding from each side; genAltNode drops a
+	// side that became empty.
+	if left, right, ok := base.Alternatives(); ok {
+		return genAltNode(
+			exclude(symbol, left),
+			exclude(symbol, right),
+		)
+	}
+
+	if bFrom, bTo, ok := base.Range(); ok {
+		sFrom, sTo, ok := symbol.Range()
+		if !ok {
+			panic(fmt.Errorf("invalid symbol tree: %T", symbol))
+		}
+
+		switch {
+		case sFrom > bFrom && sTo < bTo:
+			// symbol lies strictly inside base: base splits in two.
+			return genAltNode(
+				newRangeSymbolNode(bFrom, sFrom-1),
+				newRangeSymbolNode(sTo+1, bTo),
+			)
+		case sFrom <= bFrom && sTo >= bFrom && sTo < bTo:
+			// symbol covers the lower part of base.
+			return newRangeSymbolNode(sTo+1, bTo)
+		case sFrom > bFrom && sFrom <= bTo && sTo >= bTo:
+			// symbol covers the upper part of base.
+			return newRangeSymbolNode(bFrom, sFrom-1)
+		case sFrom <= bFrom && sTo >= bTo:
+			// symbol covers base entirely.
+			return nil
+		default:
+			// The ranges do not overlap.
+			return base
+		}
+	}
+
+	panic(fmt.Errorf("invalid base tree: %T", base))
+}
+
+// genAnyCharAST returns a range symbol covering every code point from 0x0 to
+// 0x10FFFF.
+func genAnyCharAST() CPTree {
+ return newRangeSymbolNode(0x0, 0x10FFFF)
+}
+
+// isValidOrder reports whether from..to is a well-ordered range, i.e. from is
+// not greater than to.
+func isValidOrder(from, to rune) bool {
+ return from <= to
+}
+
+func genConcatNode(cs ...CPTree) CPTree {
+ nonNilNodes := []CPTree{}
+ for _, c := range cs {
+ if c == nil {
+ continue
+ }
+ nonNilNodes = append(nonNilNodes, c)
+ }
+ if len(nonNilNodes) <= 0 {
+ return nil
+ }
+ if len(nonNilNodes) == 1 {
+ return nonNilNodes[0]
+ }
+ concat := newConcatNode(nonNilNodes[0], nonNilNodes[1])
+ for _, c := range nonNilNodes[2:] {
+ concat = newConcatNode(concat, c)
+ }
+ return concat
+}
+
+func genAltNode(cs ...CPTree) CPTree {
+ nonNilNodes := []CPTree{}
+ for _, c := range cs {
+ if c == nil {
+ continue
+ }
+ nonNilNodes = append(nonNilNodes, c)
+ }
+ if len(nonNilNodes) <= 0 {
+ return nil
+ }
+ if len(nonNilNodes) == 1 {
+ return nonNilNodes[0]
+ }
+ alt := newAltNode(nonNilNodes[0], nonNilNodes[1])
+ for _, c := range nonNilNodes[2:] {
+ alt = newAltNode(alt, c)
+ }
+ return alt
+}
+
+// expect consumes a token of the expected kind or raises
+// synErrUnexpectedToken naming both the expected and the actual kind.
+func (p *parser) expect(expected tokenKind) {
+	if p.consume(expected) {
+		return
+	}
+	// A rejected token is left in peekedTok by consume.
+	detail := fmt.Sprintf("expected: %v, actual: %v", expected, p.peekedTok.kind)
+	p.raiseParseError(synErrUnexpectedToken, detail)
+}
+
+// consume accepts the next token when it is of the expected kind.
+//
+// On success the token becomes lastTok and true is returned. Otherwise the
+// token is kept in peekedTok for the next call, lastTok is cleared and false
+// is returned. A lexer error of ParseErr is re-raised as a parse error with
+// the lexer's cause and detail; any other lexer error causes a panic with
+// that error.
+func (p *parser) consume(expected tokenKind) bool {
+	tok := p.peekedTok
+	if tok != nil {
+		p.peekedTok = nil
+	} else {
+		var err error
+		tok, err = p.lex.next()
+		if err != nil {
+			if err == ParseErr {
+				detail, cause := p.lex.error()
+				p.raiseParseError(cause, detail)
+			}
+			panic(err)
+		}
+	}
+
+	if tok.kind != expected {
+		p.peekedTok = tok
+		p.lastTok = nil
+		return false
+	}
+	p.lastTok = tok
+	return true
+}
+
+// raiseParseError records err and detail as the cause of the failure and then
+// panics with ParseErr. It never returns; the panic is recovered in Parse.
+func (p *parser) raiseParseError(err error, detail string) {
+ p.errCause = err
+ p.errDetail = detail
+ panic(ParseErr)
+}
+
+// CPRange is a range of code points from From to To.
+// NOTE(review): both ends appear to be inclusive (see exclude, which uses
+// From-1 and To+1 as the neighbours of a range) — confirm.
+type CPRange struct {
+ From rune
+ To rune
+}
+
+// CPTree is a node of the tree that the parser produces from a pattern.
+//
+// Each of Range, Optional, Repeatable, Concatenation and Alternatives returns
+// the node's operands together with a boolean that is true only when the node
+// is of the corresponding kind.
+type CPTree interface {
+ fmt.Stringer
+ // Range returns the first and last code point of a symbol node.
+ Range() (rune, rune, bool)
+ // Optional returns the operand of an optional ("?") node.
+ Optional() (CPTree, bool)
+ // Repeatable returns the operand of a repeatable ("*") node.
+ Repeatable() (CPTree, bool)
+ // Concatenation returns the left and right operands of a concatenation.
+ Concatenation() (CPTree, CPTree, bool)
+ // Alternatives returns the left and right operands of an alternation.
+ Alternatives() (CPTree, CPTree, bool)
+ // Describe returns the kind name of a root node and the names of the
+ // fragments it still refers to; other nodes return an error.
+ Describe() (LexKindName, []LexKindName, error)
+
+ // children returns the node's direct children (nil where absent).
+ children() (CPTree, CPTree)
+ // clone returns a copy of the node.
+ clone() CPTree
+}
+
+// Compile-time checks that every node type implements CPTree.
+var (
+ _ CPTree = &rootNode{}
+ _ CPTree = &symbolNode{}
+ _ CPTree = &concatNode{}
+ _ CPTree = &altNode{}
+ _ CPTree = &quantifierNode{}
+ _ CPTree = &fragmentNode{}
+)
+
+// rootNode is the top of a parsed pattern.
+type rootNode struct {
+ // kind is the name of the pattern.
+ kind LexKindName
+ // tree is the parsed pattern itself.
+ tree CPTree
+ // fragments maps a fragment name to the fragment nodes in tree that refer
+ // to it; applyFragment removes an entry once it has been filled in.
+ fragments map[LexKindName][]*fragmentNode
+}
+
+// newRootNode wraps t in a root node named kind and indexes the fragment
+// nodes found in t by their name.
+func newRootNode(kind LexKindName, t CPTree) *rootNode {
+	root := &rootNode{
+		kind:      kind,
+		tree:      t,
+		fragments: map[LexKindName][]*fragmentNode{},
+	}
+	collectFragments(t, root.fragments)
+	return root
+}
+
+func collectFragments(n CPTree, fragments map[LexKindName][]*fragmentNode) {
+ if n == nil {
+ return
+ }
+
+ if f, ok := n.(*fragmentNode); ok {
+ fragments[f.kind] = append(fragments[f.kind], f)
+ return
+ }
+
+ l, r := n.children()
+ collectFragments(l, fragments)
+ collectFragments(r, fragments)
+}
+
+// String describes the root by its kind and the number of fragment names it
+// still refers to.
+func (n *rootNode) String() string {
+ return fmt.Sprintf("root: %v: %v fragments", n.kind, len(n.fragments))
+}
+
+// Range delegates to the wrapped tree.
+func (n *rootNode) Range() (rune, rune, bool) {
+ return n.tree.Range()
+}
+
+// Optional delegates to the wrapped tree.
+func (n *rootNode) Optional() (CPTree, bool) {
+ return n.tree.Optional()
+}
+
+// Repeatable delegates to the wrapped tree.
+func (n *rootNode) Repeatable() (CPTree, bool) {
+ return n.tree.Repeatable()
+}
+
+// Concatenation delegates to the wrapped tree.
+func (n *rootNode) Concatenation() (CPTree, CPTree, bool) {
+ return n.tree.Concatenation()
+}
+
+// Alternatives delegates to the wrapped tree.
+func (n *rootNode) Alternatives() (CPTree, CPTree, bool) {
+ return n.tree.Alternatives()
+}
+
+// Describe returns the kind of the root and, sorted in ascending order, the
+// names of the fragments it still refers to. The error is always nil.
+func (n *rootNode) Describe() (LexKindName, []LexKindName, error) {
+	var names []LexKindName
+	for name := range n.fragments {
+		names = append(names, name)
+	}
+	// Map iteration order is random; sort for a stable result.
+	sort.Slice(names, func(a, b int) bool {
+		return names[a] < names[b]
+	})
+	return n.kind, names, nil
+}
+
+// children returns the children of the wrapped tree, not the tree itself.
+func (n *rootNode) children() (CPTree, CPTree) {
+ return n.tree.children()
+}
+
+// clone returns a copy of the wrapped tree; the root node itself is not part
+// of the copy.
+func (n *rootNode) clone() CPTree {
+ return n.tree.clone()
+}
+
+// incomplete reports whether the root still refers to fragments that have not
+// been applied.
+func (n *rootNode) incomplete() bool {
+ return len(n.fragments) > 0
+}
+
+// applyFragment attaches a copy of fragment to every fragment node of n that
+// refers to kind and then forgets that name.
+//
+// fragment must be a *rootNode that itself has no unresolved fragments;
+// otherwise an error is returned. It is not an error for n to contain no
+// reference to kind.
+func (n *rootNode) applyFragment(kind LexKindName, fragment CPTree) error {
+	def, isRoot := fragment.(*rootNode)
+	switch {
+	case !isRoot:
+		return fmt.Errorf("applyFragment can take only *rootNode: %T", fragment)
+	case def.incomplete():
+		return fmt.Errorf("fragment is incomplete")
+	}
+
+	if refs, found := n.fragments[kind]; found {
+		for _, ref := range refs {
+			// Each reference gets its own copy of the definition.
+			ref.tree = def.clone()
+		}
+		delete(n.fragments, kind)
+	}
+	return nil
+}
+
+// symbolNode is a leaf that matches one code point within its CPRange.
+type symbolNode struct {
+ CPRange
+}
+
+// newSymbolNode returns a symbol node matching exactly cp.
+func newSymbolNode(cp rune) *symbolNode {
+ return &symbolNode{
+ CPRange: CPRange{
+ From: cp,
+ To: cp,
+ },
+ }
+}
+
+// newRangeSymbolNode returns a symbol node for the range from..to.
+func newRangeSymbolNode(from, to rune) *symbolNode {
+ return &symbolNode{
+ CPRange: CPRange{
+ From: from,
+ To: to,
+ },
+ }
+}
+
+// String shows the range in upper-case hexadecimal.
+func (n *symbolNode) String() string {
+ return fmt.Sprintf("symbol: %X..%X", n.From, n.To)
+}
+
+// Range returns both ends of the range; the boolean is always true.
+func (n *symbolNode) Range() (rune, rune, bool) {
+ return n.From, n.To, true
+}
+
+// Optional reports false: a symbol is not an optional node.
+func (n *symbolNode) Optional() (CPTree, bool) {
+ return nil, false
+}
+
+// Repeatable reports false: a symbol is not a repeatable node.
+func (n *symbolNode) Repeatable() (CPTree, bool) {
+ return nil, false
+}
+
+// Concatenation reports false: a symbol is not a concatenation.
+func (n *symbolNode) Concatenation() (CPTree, CPTree, bool) {
+ return nil, nil, false
+}
+
+// Alternatives reports false: a symbol is not an alternation.
+func (n *symbolNode) Alternatives() (CPTree, CPTree, bool) {
+ return nil, nil, false
+}
+
+// Describe always fails; only a root node can be described.
+func (n *symbolNode) Describe() (LexKindName, []LexKindName, error) {
+ return LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n)
+}
+
+// children returns nil, nil: a symbol is a leaf.
+func (n *symbolNode) children() (CPTree, CPTree) {
+ return nil, nil
+}
+
+// clone returns a new symbol node with the same range.
+func (n *symbolNode) clone() CPTree {
+ return newRangeSymbolNode(n.From, n.To)
+}
+
+// concatNode matches left followed by right.
+type concatNode struct {
+ left CPTree
+ right CPTree
+}
+
+// newConcatNode returns the concatenation of left and right.
+func newConcatNode(left, right CPTree) *concatNode {
+ return &concatNode{
+ left: left,
+ right: right,
+ }
+}
+
+// String returns the fixed label "concat".
+func (n *concatNode) String() string {
+ return "concat"
+}
+
+// Range reports false: a concatenation is not a symbol.
+func (n *concatNode) Range() (rune, rune, bool) {
+ return 0, 0, false
+}
+
+// Optional reports false: a concatenation is not an optional node.
+func (n *concatNode) Optional() (CPTree, bool) {
+ return nil, false
+}
+
+// Repeatable reports false: a concatenation is not a repeatable node.
+func (n *concatNode) Repeatable() (CPTree, bool) {
+ return nil, false
+}
+
+// Concatenation returns both operands; the boolean is always true.
+func (n *concatNode) Concatenation() (CPTree, CPTree, bool) {
+ return n.left, n.right, true
+}
+
+// Alternatives reports false: a concatenation is not an alternation.
+func (n *concatNode) Alternatives() (CPTree, CPTree, bool) {
+ return nil, nil, false
+}
+
+// Describe always fails; only a root node can be described.
+func (n *concatNode) Describe() (LexKindName, []LexKindName, error) {
+ return LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n)
+}
+
+// children returns the left and right operands.
+func (n *concatNode) children() (CPTree, CPTree) {
+ return n.left, n.right
+}
+
+// clone returns a deep copy; a nil receiver yields nil. Both operands must be
+// non-nil because clone is called on each of them.
+func (n *concatNode) clone() CPTree {
+ if n == nil {
+ return nil
+ }
+ return newConcatNode(n.left.clone(), n.right.clone())
+}
+
+// altNode matches either left or right.
+type altNode struct {
+ left CPTree
+ right CPTree
+}
+
+// newAltNode returns the alternation of left and right.
+func newAltNode(left, right CPTree) *altNode {
+ return &altNode{
+ left: left,
+ right: right,
+ }
+}
+
+// String returns the fixed label "alt".
+func (n *altNode) String() string {
+ return "alt"
+}
+
+// Range reports false: an alternation is not a symbol.
+func (n *altNode) Range() (rune, rune, bool) {
+ return 0, 0, false
+}
+
+// Optional reports false: an alternation is not an optional node.
+func (n *altNode) Optional() (CPTree, bool) {
+ return nil, false
+}
+
+// Repeatable reports false: an alternation is not a repeatable node.
+func (n *altNode) Repeatable() (CPTree, bool) {
+ return nil, false
+}
+
+// Concatenation reports false: an alternation is not a concatenation.
+func (n *altNode) Concatenation() (CPTree, CPTree, bool) {
+ return nil, nil, false
+}
+
+// Alternatives returns both operands; the boolean is always true.
+func (n *altNode) Alternatives() (CPTree, CPTree, bool) {
+ return n.left, n.right, true
+}
+
+// Describe always fails; only a root node can be described.
+func (n *altNode) Describe() (LexKindName, []LexKindName, error) {
+ return LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n)
+}
+
+// children returns the left and right operands.
+func (n *altNode) children() (CPTree, CPTree) {
+ return n.left, n.right
+}
+
+// clone returns a deep copy. Both operands must be non-nil because clone is
+// called on each of them.
+func (n *altNode) clone() CPTree {
+ return newAltNode(n.left.clone(), n.right.clone())
+}
+
+// quantifierNode applies a quantifier to tree. The constructors in this file
+// set exactly one of optional and repeatable.
+type quantifierNode struct {
+ // optional marks a "0 or 1 times" quantifier.
+ optional bool
+ // repeatable marks a ">= 0 times" quantifier.
+ repeatable bool
+ // tree is the quantified operand.
+ tree CPTree
+}
+
+// String names the quantifier; repeatable takes precedence over optional.
+func (n *quantifierNode) String() string {
+ switch {
+ case n.repeatable:
+ return "repeatable (>= 0 times)"
+ case n.optional:
+ return "optional (0 or 1 times)"
+ default:
+ return "invalid quantifier"
+ }
+}
+
+// newRepeatNode returns a node matching t zero or more times.
+func newRepeatNode(t CPTree) *quantifierNode {
+ return &quantifierNode{
+ repeatable: true,
+ tree: t,
+ }
+}
+
+// newRepeatOneOrMoreNode expresses "t one or more times" as t followed by zero
+// or more repetitions of a clone of t.
+func newRepeatOneOrMoreNode(t CPTree) *concatNode {
+ return newConcatNode(
+ t,
+ &quantifierNode{
+ repeatable: true,
+ tree: t.clone(),
+ })
+}
+
+// newOptionNode returns a node matching t zero times or once.
+func newOptionNode(t CPTree) *quantifierNode {
+ return &quantifierNode{
+ optional: true,
+ tree: t,
+ }
+}
+
+// Range reports false: a quantifier is not a symbol.
+func (n *quantifierNode) Range() (rune, rune, bool) {
+ return 0, 0, false
+}
+
+// Optional returns the operand and whether this is an optional quantifier.
+// The operand is returned even when the boolean is false.
+func (n *quantifierNode) Optional() (CPTree, bool) {
+ return n.tree, n.optional
+}
+
+// Repeatable returns the operand and whether this is a repeatable quantifier.
+// The operand is returned even when the boolean is false.
+func (n *quantifierNode) Repeatable() (CPTree, bool) {
+ return n.tree, n.repeatable
+}
+
+// Concatenation reports false: a quantifier is not a concatenation.
+func (n *quantifierNode) Concatenation() (CPTree, CPTree, bool) {
+ return nil, nil, false
+}
+
+// Alternatives reports false: a quantifier is not an alternation.
+func (n *quantifierNode) Alternatives() (CPTree, CPTree, bool) {
+ return nil, nil, false
+}
+
+// Describe always fails; only a root node can be described.
+func (n *quantifierNode) Describe() (LexKindName, []LexKindName, error) {
+ return LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n)
+}
+
+// children returns the operand as the only child.
+func (n *quantifierNode) children() (CPTree, CPTree) {
+ return n.tree, nil
+}
+
+// clone returns a deep copy. A node that is not repeatable is copied as an
+// optional node.
+func (n *quantifierNode) clone() CPTree {
+ if n.repeatable {
+ return newRepeatNode(n.tree.clone())
+ }
+ return newOptionNode(n.tree.clone())
+}
+
+// fragmentNode is a reference to the fragment named kind.
+//
+// tree is nil when the node is created by the parser and is filled in by
+// rootNode.applyFragment. Except for String, Describe and clone, the methods
+// delegate to tree and therefore must not be called while tree is nil.
+type fragmentNode struct {
+ kind LexKindName
+ tree CPTree
+}
+
+// newFragmentNode returns a reference to the fragment kind whose definition
+// is t (which may be nil).
+func newFragmentNode(kind LexKindName, t CPTree) *fragmentNode {
+ return &fragmentNode{
+ kind: kind,
+ tree: t,
+ }
+}
+
+// String shows the name of the referenced fragment.
+func (n *fragmentNode) String() string {
+ return fmt.Sprintf("fragment: %v", n.kind)
+}
+
+// Range delegates to the attached tree.
+func (n *fragmentNode) Range() (rune, rune, bool) {
+ return n.tree.Range()
+}
+
+// Optional delegates to the attached tree.
+func (n *fragmentNode) Optional() (CPTree, bool) {
+ return n.tree.Optional()
+}
+
+// Repeatable delegates to the attached tree.
+func (n *fragmentNode) Repeatable() (CPTree, bool) {
+ return n.tree.Repeatable()
+}
+
+// Concatenation delegates to the attached tree.
+func (n *fragmentNode) Concatenation() (CPTree, CPTree, bool) {
+ return n.tree.Concatenation()
+}
+
+// Alternatives delegates to the attached tree.
+func (n *fragmentNode) Alternatives() (CPTree, CPTree, bool) {
+ return n.tree.Alternatives()
+}
+
+// Describe always fails; only a root node can be described.
+func (n *fragmentNode) Describe() (LexKindName, []LexKindName, error) {
+ return LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n)
+}
+
+// children returns the children of the attached tree, not the tree itself.
+func (n *fragmentNode) children() (CPTree, CPTree) {
+ return n.tree.children()
+}
+
+// clone returns a copy of the node; an attached tree is cloned as well.
+func (n *fragmentNode) clone() CPTree {
+ if n.tree == nil {
+ return newFragmentNode(n.kind, nil)
+ }
+ return newFragmentNode(n.kind, n.tree.clone())
+}
+
+// printCPTree writes t and its descendants to w, one node per line, as a tree
+// drawn with box-drawing characters.
+//
+// ruledLine is printed in front of t itself; childRuledLinePrefix is the
+// prefix from which the lines of t's children are built. A nil t prints
+// nothing.
+//
+//nolint:unused
+func printCPTree(w io.Writer, t CPTree, ruledLine string, childRuledLinePrefix string) {
+ if t == nil {
+ return
+ }
+ fmt.Fprintf(w, "%v%v\n", ruledLine, t)
+ children := []CPTree{}
+ switch n := t.(type) {
+ case *rootNode:
+ // Root and fragment nodes delegate children() to their tree, so the
+ // tree itself is taken as the only child here.
+ children = append(children, n.tree)
+ case *fragmentNode:
+ children = append(children, n.tree)
+ default:
+ left, right := t.children()
+ if left != nil {
+ children = append(children, left)
+ }
+ if right != nil {
+ children = append(children, right)
+ }
+ }
+ num := len(children)
+ for i, child := range children {
+ // line is the connector drawn in front of this child.
+ line := "└─ "
+ if num > 1 {
+ if i == 0 {
+ line = "├─ "
+ } else if i < num-1 {
+ line = "│ "
+ }
+ }
+ // prefix is what the child's own descendants are indented with.
+ prefix := "│ "
+ if i >= num-1 {
+ prefix = " "
+ }
+ printCPTree(w, child, childRuledLinePrefix+line, childRuledLinePrefix+prefix)
+ }
+}
diff --git a/tests/tre.go b/tests/tre.go
index 8c14feb..2b6fe26 100644
--- a/tests/tre.go
+++ b/tests/tre.go
@@ -3,8 +3,12 @@ package tre
import (
"fmt"
"os"
+ "reflect"
+ "strings"
"testing"
"testing/internal/testdeps"
+
+ "ucd"
)
@@ -569,6 +573,1902 @@ func TestLexSpec_Validate(t *testing.T) {
}
}
+// TestLexer feeds each src to a fresh lexer and compares the tokens it yields,
+// in order, with the expected tokens. When err is set, the lexer must stop
+// with ParseErr and report err as the cause after producing the listed tokens.
+// Captions are unique so that every subtest gets a distinct name.
+func TestLexer(t *testing.T) {
+ tests := []struct {
+ caption string
+ src string
+ tokens []*token
+ err error
+ }{
+ {
+ caption: "lexer can recognize ordinary characters",
+ src: "123abcいろは",
+ tokens: []*token{
+ newToken(tokenKindChar, '1'),
+ newToken(tokenKindChar, '2'),
+ newToken(tokenKindChar, '3'),
+ newToken(tokenKindChar, 'a'),
+ newToken(tokenKindChar, 'b'),
+ newToken(tokenKindChar, 'c'),
+ newToken(tokenKindChar, 'い'),
+ newToken(tokenKindChar, 'ろ'),
+ newToken(tokenKindChar, 'は'),
+ newToken(tokenKindEOF, nullChar),
+ },
+ },
+ {
+ caption: "lexer can recognize the special characters in default mode",
+ src: ".*+?|()[\\u",
+ tokens: []*token{
+ newToken(tokenKindAnyChar, nullChar),
+ newToken(tokenKindRepeat, nullChar),
+ newToken(tokenKindRepeatOneOrMore, nullChar),
+ newToken(tokenKindOption, nullChar),
+ newToken(tokenKindAlt, nullChar),
+ newToken(tokenKindGroupOpen, nullChar),
+ newToken(tokenKindGroupClose, nullChar),
+ newToken(tokenKindBExpOpen, nullChar),
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindEOF, nullChar),
+ },
+ },
+ {
+ caption: "lexer can recognize the escape sequences in default mode",
+ src: "\\\\\\.\\*\\+\\?\\|\\(\\)\\[",
+ tokens: []*token{
+ newToken(tokenKindChar, '\\'),
+ newToken(tokenKindChar, '.'),
+ newToken(tokenKindChar, '*'),
+ newToken(tokenKindChar, '+'),
+ newToken(tokenKindChar, '?'),
+ newToken(tokenKindChar, '|'),
+ newToken(tokenKindChar, '('),
+ newToken(tokenKindChar, ')'),
+ newToken(tokenKindChar, '['),
+ newToken(tokenKindEOF, nullChar),
+ },
+ },
+ {
+ caption: "], {, and } are treated as an ordinary character in default mode",
+ src: "]{}",
+ tokens: []*token{
+ newToken(tokenKindChar, ']'),
+ newToken(tokenKindChar, '{'),
+ newToken(tokenKindChar, '}'),
+ newToken(tokenKindEOF, nullChar),
+ },
+ },
+ {
+ caption: "lexer can recognize the special characters in bracket expression mode",
+ src: "[a-z\\u{09AF}][^a-z\\u{09abcf}]",
+ tokens: []*token{
+ newToken(tokenKindBExpOpen, nullChar),
+ newToken(tokenKindChar, 'a'),
+ newToken(tokenKindCharRange, nullChar),
+ newToken(tokenKindChar, 'z'),
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ newCodePointToken("09AF"),
+ newToken(tokenKindRBrace, nullChar),
+ newToken(tokenKindBExpClose, nullChar),
+ newToken(tokenKindInverseBExpOpen, nullChar),
+ newToken(tokenKindChar, 'a'),
+ newToken(tokenKindCharRange, nullChar),
+ newToken(tokenKindChar, 'z'),
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ newCodePointToken("09abcf"),
+ newToken(tokenKindRBrace, nullChar),
+ newToken(tokenKindBExpClose, nullChar),
+ newToken(tokenKindEOF, nullChar),
+ },
+ },
+ {
+ caption: "lexer can recognize the escape sequences in bracket expression mode",
+ src: "[\\^a\\-z]",
+ tokens: []*token{
+ newToken(tokenKindBExpOpen, nullChar),
+ newToken(tokenKindChar, '^'),
+ newToken(tokenKindChar, 'a'),
+ newToken(tokenKindChar, '-'),
+ newToken(tokenKindChar, 'z'),
+ newToken(tokenKindBExpClose, nullChar),
+ newToken(tokenKindEOF, nullChar),
+ },
+ },
+ {
+ caption: "in a bracket expression, the special characters are also handled as normal characters",
+ src: "[\\\\.*+?|()[",
+ tokens: []*token{
+ newToken(tokenKindBExpOpen, nullChar),
+ newToken(tokenKindChar, '\\'),
+ newToken(tokenKindChar, '.'),
+ newToken(tokenKindChar, '*'),
+ newToken(tokenKindChar, '+'),
+ newToken(tokenKindChar, '?'),
+ newToken(tokenKindChar, '|'),
+ newToken(tokenKindChar, '('),
+ newToken(tokenKindChar, ')'),
+ newToken(tokenKindChar, '['),
+ newToken(tokenKindEOF, nullChar),
+ },
+ },
+ {
+ caption: "hyphen symbols that appear in bracket expressions are handled as the character range symbol or ordinary characters",
+ // [...-...][...-][-...][-]
+ // ~~~~~~~ ~ ~ ~
+ // ^ ^ ^ ^
+ // | | | `-- Ordinary Character (b)
+ // | | `-- Ordinary Character (b)
+ // | `-- Ordinary Character (b)
+ // `-- Character Range (a)
+ //
+ // a. *-* is handled as a character-range expression.
+ // b. *-, -*, or - are handled as ordinary characters.
+ src: "[a-z][a-][-z][-][--][---][^a-z][^a-][^-z][^-][^--][^---]",
+ tokens: []*token{
+ newToken(tokenKindBExpOpen, nullChar),
+ newToken(tokenKindChar, 'a'),
+ newToken(tokenKindCharRange, nullChar),
+ newToken(tokenKindChar, 'z'),
+ newToken(tokenKindBExpClose, nullChar),
+ newToken(tokenKindBExpOpen, nullChar),
+ newToken(tokenKindChar, 'a'),
+ newToken(tokenKindChar, '-'),
+ newToken(tokenKindBExpClose, nullChar),
+ newToken(tokenKindBExpOpen, nullChar),
+ newToken(tokenKindChar, '-'),
+ newToken(tokenKindChar, 'z'),
+ newToken(tokenKindBExpClose, nullChar),
+ newToken(tokenKindBExpOpen, nullChar),
+ newToken(tokenKindChar, '-'),
+ newToken(tokenKindBExpClose, nullChar),
+ newToken(tokenKindBExpOpen, nullChar),
+ newToken(tokenKindChar, '-'),
+ newToken(tokenKindChar, '-'),
+ newToken(tokenKindBExpClose, nullChar),
+ newToken(tokenKindBExpOpen, nullChar),
+ newToken(tokenKindChar, '-'),
+ newToken(tokenKindCharRange, nullChar),
+ newToken(tokenKindChar, '-'),
+ newToken(tokenKindBExpClose, nullChar),
+
+ newToken(tokenKindInverseBExpOpen, nullChar),
+ newToken(tokenKindChar, 'a'),
+ newToken(tokenKindCharRange, nullChar),
+ newToken(tokenKindChar, 'z'),
+ newToken(tokenKindBExpClose, nullChar),
+ newToken(tokenKindInverseBExpOpen, nullChar),
+ newToken(tokenKindChar, 'a'),
+ newToken(tokenKindChar, '-'),
+ newToken(tokenKindBExpClose, nullChar),
+ newToken(tokenKindInverseBExpOpen, nullChar),
+ newToken(tokenKindChar, '-'),
+ newToken(tokenKindChar, 'z'),
+ newToken(tokenKindBExpClose, nullChar),
+ newToken(tokenKindInverseBExpOpen, nullChar),
+ newToken(tokenKindChar, '-'),
+ newToken(tokenKindBExpClose, nullChar),
+ newToken(tokenKindInverseBExpOpen, nullChar),
+ newToken(tokenKindChar, '-'),
+ newToken(tokenKindChar, '-'),
+ newToken(tokenKindBExpClose, nullChar),
+ newToken(tokenKindInverseBExpOpen, nullChar),
+ newToken(tokenKindChar, '-'),
+ newToken(tokenKindCharRange, nullChar),
+ newToken(tokenKindChar, '-'),
+ newToken(tokenKindBExpClose, nullChar),
+
+ newToken(tokenKindEOF, nullChar),
+ },
+ },
+ {
+ caption: "caret symbols that appear in bracket expressions are handled as the logical inverse symbol or ordinary characters",
+ // [^...^...][^]
+ // ~~ ~ ~~
+ // ^ ^ ^^
+ // | | |`-- Ordinary Character (c)
+ // | | `-- Bracket Expression
+ // | `-- Ordinary Character (b)
+ // `-- Inverse Bracket Expression (a)
+ //
+ // a. Bracket expressions that have a caret symbol at the beginning are handled as logical inverse expressions.
+ // b. caret symbols that appear as the second and the subsequent symbols are handled as ordinary symbols.
+ // c. When a bracket expression has just one symbol, a caret symbol at the beginning is handled as an ordinary character.
+ src: "[^^][^]",
+ tokens: []*token{
+ newToken(tokenKindInverseBExpOpen, nullChar),
+ newToken(tokenKindChar, '^'),
+ newToken(tokenKindBExpClose, nullChar),
+ newToken(tokenKindBExpOpen, nullChar),
+ newToken(tokenKindChar, '^'),
+ newToken(tokenKindBExpClose, nullChar),
+ newToken(tokenKindEOF, nullChar),
+ },
+ },
+ {
+ caption: "lexer raises an error when an invalid escape sequence appears in default mode",
+ src: "\\@",
+ err: synErrInvalidEscSeq,
+ },
+ {
+ caption: "lexer raises an error when the incomplete escape sequence (EOF following \\) appears in default mode",
+ src: "\\",
+ err: synErrIncompletedEscSeq,
+ },
+ {
+ caption: "lexer raises an error when an invalid escape sequence appears in bracket expression mode",
+ src: "[\\@",
+ tokens: []*token{
+ newToken(tokenKindBExpOpen, nullChar),
+ },
+ err: synErrInvalidEscSeq,
+ },
+ {
+ caption: "lexer raises an error when the incomplete escape sequence (EOF following \\) appears in bracket expression mode",
+ src: "[\\",
+ tokens: []*token{
+ newToken(tokenKindBExpOpen, nullChar),
+ },
+ err: synErrIncompletedEscSeq,
+ },
+ {
+ caption: "lexer can recognize the special characters and code points in code point expression mode",
+ src: "\\u{0123}\\u{4567}\\u{89abcd}\\u{efAB}\\u{CDEF01}[\\u{0123}\\u{4567}\\u{89abcd}\\u{efAB}\\u{CDEF01}][^\\u{0123}\\u{4567}\\u{89abcd}\\u{efAB}\\u{CDEF01}]",
+ tokens: []*token{
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ newCodePointToken("0123"),
+ newToken(tokenKindRBrace, nullChar),
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ newCodePointToken("4567"),
+ newToken(tokenKindRBrace, nullChar),
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ newCodePointToken("89abcd"),
+ newToken(tokenKindRBrace, nullChar),
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ newCodePointToken("efAB"),
+ newToken(tokenKindRBrace, nullChar),
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ newCodePointToken("CDEF01"),
+ newToken(tokenKindRBrace, nullChar),
+
+ newToken(tokenKindBExpOpen, nullChar),
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ newCodePointToken("0123"),
+ newToken(tokenKindRBrace, nullChar),
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ newCodePointToken("4567"),
+ newToken(tokenKindRBrace, nullChar),
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ newCodePointToken("89abcd"),
+ newToken(tokenKindRBrace, nullChar),
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ newCodePointToken("efAB"),
+ newToken(tokenKindRBrace, nullChar),
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ newCodePointToken("CDEF01"),
+ newToken(tokenKindRBrace, nullChar),
+ newToken(tokenKindBExpClose, nullChar),
+
+ newToken(tokenKindInverseBExpOpen, nullChar),
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ newCodePointToken("0123"),
+ newToken(tokenKindRBrace, nullChar),
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ newCodePointToken("4567"),
+ newToken(tokenKindRBrace, nullChar),
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ newCodePointToken("89abcd"),
+ newToken(tokenKindRBrace, nullChar),
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ newCodePointToken("efAB"),
+ newToken(tokenKindRBrace, nullChar),
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ newCodePointToken("CDEF01"),
+ newToken(tokenKindRBrace, nullChar),
+ newToken(tokenKindBExpClose, nullChar),
+
+ newToken(tokenKindEOF, nullChar),
+ },
+ },
+ {
+ caption: "a one digit hex string isn't a valid code point",
+ src: "\\u{0",
+ tokens: []*token{
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ },
+ err: synErrInvalidCodePoint,
+ },
+ {
+ caption: "a two digits hex string isn't a valid code point",
+ src: "\\u{01",
+ tokens: []*token{
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ },
+ err: synErrInvalidCodePoint,
+ },
+ {
+ caption: "a three digits hex string isn't a valid code point",
+ src: "\\u{012",
+ tokens: []*token{
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ },
+ err: synErrInvalidCodePoint,
+ },
+ {
+ caption: "a four digits hex string is a valid code point",
+ src: "\\u{0123}",
+ tokens: []*token{
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ newCodePointToken("0123"),
+ newToken(tokenKindRBrace, nullChar),
+ },
+ },
+ {
+ caption: "a five digits hex string isn't a valid code point",
+ src: "\\u{01234",
+ tokens: []*token{
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ },
+ err: synErrInvalidCodePoint,
+ },
+ {
+ caption: "a six digits hex string is a valid code point",
+ src: "\\u{012345}",
+ tokens: []*token{
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ newCodePointToken("012345"),
+ newToken(tokenKindRBrace, nullChar),
+ },
+ },
+ {
+ caption: "a seven digits hex string isn't a valid code point",
+ src: "\\u{0123456",
+ tokens: []*token{
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ },
+ err: synErrInvalidCodePoint,
+ },
+ {
+ caption: "a code point must be hex digits (lowercase non-hex letter)",
+ src: "\\u{g",
+ tokens: []*token{
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ },
+ err: synErrInvalidCodePoint,
+ },
+ {
+ caption: "a code point must be hex digits (uppercase non-hex letter)",
+ src: "\\u{G",
+ tokens: []*token{
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ },
+ err: synErrInvalidCodePoint,
+ },
+ {
+ caption: "lexer can recognize the special characters and symbols in character property expression mode",
+ src: "\\p{Letter}\\p{General_Category=Letter}[\\p{Letter}\\p{General_Category=Letter}][^\\p{Letter}\\p{General_Category=Letter}]",
+ tokens: []*token{
+ newToken(tokenKindCharPropLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ newCharPropSymbolToken("Letter"),
+ newToken(tokenKindRBrace, nullChar),
+ newToken(tokenKindCharPropLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ newCharPropSymbolToken("General_Category"),
+ newToken(tokenKindEqual, nullChar),
+ newCharPropSymbolToken("Letter"),
+ newToken(tokenKindRBrace, nullChar),
+
+ newToken(tokenKindBExpOpen, nullChar),
+ newToken(tokenKindCharPropLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ newCharPropSymbolToken("Letter"),
+ newToken(tokenKindRBrace, nullChar),
+ newToken(tokenKindCharPropLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ newCharPropSymbolToken("General_Category"),
+ newToken(tokenKindEqual, nullChar),
+ newCharPropSymbolToken("Letter"),
+ newToken(tokenKindRBrace, nullChar),
+ newToken(tokenKindBExpClose, nullChar),
+
+ newToken(tokenKindInverseBExpOpen, nullChar),
+ newToken(tokenKindCharPropLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ newCharPropSymbolToken("Letter"),
+ newToken(tokenKindRBrace, nullChar),
+ newToken(tokenKindCharPropLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ newCharPropSymbolToken("General_Category"),
+ newToken(tokenKindEqual, nullChar),
+ newCharPropSymbolToken("Letter"),
+ newToken(tokenKindRBrace, nullChar),
+ newToken(tokenKindBExpClose, nullChar),
+
+ newToken(tokenKindEOF, nullChar),
+ },
+ },
+ {
+ caption: "lexer can recognize the special characters and symbols in fragment expression mode",
+ src: "\\f{integer}",
+ tokens: []*token{
+ newToken(tokenKindFragmentLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ newFragmentSymbolToken("integer"),
+ newToken(tokenKindRBrace, nullChar),
+
+ newToken(tokenKindEOF, nullChar),
+ },
+ },
+ {
+ caption: "a fragment expression is not supported in a bracket expression",
+ src: "[\\f",
+ tokens: []*token{
+ newToken(tokenKindBExpOpen, nullChar),
+ },
+ err: synErrInvalidEscSeq,
+ },
+ {
+ caption: "a fragment expression is not supported in an inverse bracket expression",
+ src: "[^\\f",
+ tokens: []*token{
+ newToken(tokenKindInverseBExpOpen, nullChar),
+ },
+ err: synErrInvalidEscSeq,
+ },
+ }
+ for _, tt := range tests {
+ t.Run(tt.caption, func(t *testing.T) {
+ lex := newLexer(strings.NewReader(tt.src))
+ var err error
+ var tok *token
+ i := 0
+ // Pull tokens until the lexer fails, the expected list is used up, or
+ // EOF is reached. Tokens produced after the expected list is exhausted
+ // are not inspected.
+ for {
+ tok, err = lex.next()
+ if err != nil {
+ break
+ }
+ if i >= len(tt.tokens) {
+ break
+ }
+ eTok := tt.tokens[i]
+ i++
+ testToken(t, tok, eTok)
+
+ if tok.kind == tokenKindEOF {
+ break
+ }
+ }
+ if tt.err != nil {
+ // The lexer signals failure with ParseErr; the concrete cause is
+ // read back from lex.error().
+ if err != ParseErr {
+ t.Fatalf("unexpected error: want: %v, got: %v", ParseErr, err)
+ }
+ detail, cause := lex.error()
+ if cause != tt.err {
+ t.Fatalf("unexpected error: want: %v, got: %v (%v)", tt.err, cause, detail)
+ }
+ } else {
+ if err != nil {
+ t.Fatalf("unexpected error: %v", err)
+ }
+ }
+ // Every expected token must have been consumed.
+ if i < len(tt.tokens) {
+ t.Fatalf("expected more tokens")
+ }
+ })
+ }
+}
+
+// testToken fails the calling test when the actual token a differs from the
+// expected token e in kind, char, or codePoint.
+//
+// NOTE(review): only these three fields are compared; if token carries other
+// fields (e.g. whatever newCharPropSymbolToken or newFragmentSymbolToken
+// store), they are not checked here — confirm that this is intended.
+func testToken(t *testing.T, a, e *token) {
+ t.Helper()
+ same := e.kind == a.kind && e.char == a.char && e.codePoint == a.codePoint
+ if same {
+ return
+ }
+ t.Fatalf("unexpected token: want: %+v, got: %+v", e, a)
+}
+
+// TestParse parses each pattern and checks the outcome. A case with
+// syntaxError set must fail with ParseErr, report that syntax error via
+// p.Error, and return no tree. Any other case must parse, have its fragments
+// applied completely, and (unless skipTestAST is set) produce a tree that
+// testAST accepts as matching ast. Subtest names embed the case index, so the
+// order of the table is significant.
+func TestParse(t *testing.T) {
+ tests := []struct {
+ pattern string
+ fragments map[LexKindName]string
+ ast CPTree
+ syntaxError error
+
+ // When an AST is large, as patterns containing a character property expression, this test only checks
+ // that the pattern is parsable. The check of the validity of such AST is performed by checking that it
+ // can be matched correctly using the driver.
+ skipTestAST bool
+ }{
+ {
+ pattern: "a",
+ ast: newSymbolNode('a'),
+ },
+ {
+ pattern: "abc",
+ ast: genConcatNode(
+ newSymbolNode('a'),
+ newSymbolNode('b'),
+ newSymbolNode('c'),
+ ),
+ },
+ {
+ pattern: "a?",
+ ast: newOptionNode(
+ newSymbolNode('a'),
+ ),
+ },
+ {
+ pattern: "[abc]?",
+ ast: newOptionNode(
+ genAltNode(
+ newSymbolNode('a'),
+ newSymbolNode('b'),
+ newSymbolNode('c'),
+ ),
+ ),
+ },
+ {
+ pattern: "\\u{3042}?",
+ ast: newOptionNode(
+ newSymbolNode('\u3042'),
+ ),
+ },
+ {
+ pattern: "\\p{Letter}?",
+ skipTestAST: true,
+ },
+ {
+ pattern: "\\f{a2c}?",
+ fragments: map[LexKindName]string{
+ "a2c": "abc",
+ },
+ ast: newOptionNode(
+ newFragmentNode("a2c",
+ genConcatNode(
+ newSymbolNode('a'),
+ newSymbolNode('b'),
+ newSymbolNode('c'),
+ ),
+ ),
+ ),
+ },
+ {
+ pattern: "(a)?",
+ ast: newOptionNode(
+ newSymbolNode('a'),
+ ),
+ },
+ {
+ pattern: "((a?)?)?",
+ ast: newOptionNode(
+ newOptionNode(
+ newOptionNode(
+ newSymbolNode('a'),
+ ),
+ ),
+ ),
+ },
+ {
+ pattern: "(abc)?",
+ ast: newOptionNode(
+ genConcatNode(
+ newSymbolNode('a'),
+ newSymbolNode('b'),
+ newSymbolNode('c'),
+ ),
+ ),
+ },
+ {
+ pattern: "(a|b)?",
+ ast: newOptionNode(
+ genAltNode(
+ newSymbolNode('a'),
+ newSymbolNode('b'),
+ ),
+ ),
+ },
+ {
+ pattern: "?",
+ syntaxError: synErrRepNoTarget,
+ },
+ {
+ pattern: "(?)",
+ syntaxError: synErrRepNoTarget,
+ },
+ {
+ pattern: "a|?",
+ syntaxError: synErrRepNoTarget,
+ },
+ {
+ pattern: "?|b",
+ syntaxError: synErrRepNoTarget,
+ },
+ {
+ pattern: "a??",
+ syntaxError: synErrRepNoTarget,
+ },
+ {
+ pattern: "a*",
+ ast: newRepeatNode(
+ newSymbolNode('a'),
+ ),
+ },
+ {
+ pattern: "[abc]*",
+ ast: newRepeatNode(
+ genAltNode(
+ newSymbolNode('a'),
+ newSymbolNode('b'),
+ newSymbolNode('c'),
+ ),
+ ),
+ },
+ {
+ pattern: "\\u{3042}*",
+ ast: newRepeatNode(
+ newSymbolNode('\u3042'),
+ ),
+ },
+ {
+ pattern: "\\p{Letter}*",
+ skipTestAST: true,
+ },
+ {
+ pattern: "\\f{a2c}*",
+ fragments: map[LexKindName]string{
+ "a2c": "abc",
+ },
+ ast: newRepeatNode(
+ newFragmentNode("a2c",
+ genConcatNode(
+ newSymbolNode('a'),
+ newSymbolNode('b'),
+ newSymbolNode('c'),
+ ),
+ ),
+ ),
+ },
+ {
+ pattern: "((a*)*)*",
+ ast: newRepeatNode(
+ newRepeatNode(
+ newRepeatNode(
+ newSymbolNode('a'),
+ ),
+ ),
+ ),
+ },
+ {
+ pattern: "(abc)*",
+ ast: newRepeatNode(
+ genConcatNode(
+ newSymbolNode('a'),
+ newSymbolNode('b'),
+ newSymbolNode('c'),
+ ),
+ ),
+ },
+ {
+ pattern: "(a|b)*",
+ ast: newRepeatNode(
+ genAltNode(
+ newSymbolNode('a'),
+ newSymbolNode('b'),
+ ),
+ ),
+ },
+ {
+ pattern: "*",
+ syntaxError: synErrRepNoTarget,
+ },
+ {
+ pattern: "(*)",
+ syntaxError: synErrRepNoTarget,
+ },
+ {
+ pattern: "a|*",
+ syntaxError: synErrRepNoTarget,
+ },
+ {
+ pattern: "*|b",
+ syntaxError: synErrRepNoTarget,
+ },
+ {
+ pattern: "a**",
+ syntaxError: synErrRepNoTarget,
+ },
+ {
+ pattern: "a+",
+ ast: genConcatNode(
+ newSymbolNode('a'),
+ newRepeatNode(
+ newSymbolNode('a'),
+ ),
+ ),
+ },
+ {
+ pattern: "[abc]+",
+ ast: genConcatNode(
+ genAltNode(
+ newSymbolNode('a'),
+ newSymbolNode('b'),
+ newSymbolNode('c'),
+ ),
+ newRepeatNode(
+ genAltNode(
+ newSymbolNode('a'),
+ newSymbolNode('b'),
+ newSymbolNode('c'),
+ ),
+ ),
+ ),
+ },
+ {
+ pattern: "\\u{3042}+",
+ ast: genConcatNode(
+ newSymbolNode('\u3042'),
+ newRepeatNode(
+ newSymbolNode('\u3042'),
+ ),
+ ),
+ },
+ {
+ pattern: "\\p{Letter}+",
+ skipTestAST: true,
+ },
+ {
+ pattern: "\\f{a2c}+",
+ fragments: map[LexKindName]string{
+ "a2c": "abc",
+ },
+ ast: genConcatNode(
+ newFragmentNode("a2c",
+ genConcatNode(
+ newSymbolNode('a'),
+ newSymbolNode('b'),
+ newSymbolNode('c'),
+ ),
+ ),
+ newRepeatNode(
+ newFragmentNode("a2c",
+ genConcatNode(
+ newSymbolNode('a'),
+ newSymbolNode('b'),
+ newSymbolNode('c'),
+ ),
+ ),
+ ),
+ ),
+ },
+ {
+ pattern: "((a+)+)+",
+ ast: genConcatNode(
+ genConcatNode(
+ genConcatNode(
+ genConcatNode(
+ newSymbolNode('a'),
+ newRepeatNode(
+ newSymbolNode('a'),
+ ),
+ ),
+ newRepeatNode(
+ genConcatNode(
+ newSymbolNode('a'),
+ newRepeatNode(
+ newSymbolNode('a'),
+ ),
+ ),
+ ),
+ ),
+ newRepeatNode(
+ genConcatNode(
+ genConcatNode(
+ newSymbolNode('a'),
+ newRepeatNode(
+ newSymbolNode('a'),
+ ),
+ ),
+ newRepeatNode(
+ genConcatNode(
+ newSymbolNode('a'),
+ newRepeatNode(
+ newSymbolNode('a'),
+ ),
+ ),
+ ),
+ ),
+ ),
+ ),
+ ),
+ },
+ {
+ pattern: "(abc)+",
+ ast: genConcatNode(
+ genConcatNode(
+ newSymbolNode('a'),
+ newSymbolNode('b'),
+ newSymbolNode('c'),
+ ),
+ newRepeatNode(
+ genConcatNode(
+ newSymbolNode('a'),
+ newSymbolNode('b'),
+ newSymbolNode('c'),
+ ),
+ ),
+ ),
+ },
+ {
+ pattern: "(a|b)+",
+ ast: genConcatNode(
+ genAltNode(
+ newSymbolNode('a'),
+ newSymbolNode('b'),
+ ),
+ newRepeatNode(
+ genAltNode(
+ newSymbolNode('a'),
+ newSymbolNode('b'),
+ ),
+ ),
+ ),
+ },
+ {
+ pattern: "+",
+ syntaxError: synErrRepNoTarget,
+ },
+ {
+ pattern: "(+)",
+ syntaxError: synErrRepNoTarget,
+ },
+ {
+ pattern: "a|+",
+ syntaxError: synErrRepNoTarget,
+ },
+ {
+ pattern: "+|b",
+ syntaxError: synErrRepNoTarget,
+ },
+ {
+ pattern: "a++",
+ syntaxError: synErrRepNoTarget,
+ },
+ {
+ pattern: ".",
+ ast: newRangeSymbolNode(0x00, 0x10FFFF),
+ },
+ {
+ pattern: "[a]",
+ ast: newSymbolNode('a'),
+ },
+ {
+ pattern: "[abc]",
+ ast: genAltNode(
+ newSymbolNode('a'),
+ newSymbolNode('b'),
+ newSymbolNode('c'),
+ ),
+ },
+ {
+ pattern: "[a-z]",
+ ast: newRangeSymbolNode('a', 'z'),
+ },
+ {
+ pattern: "[A-Za-z]",
+ ast: genAltNode(
+ newRangeSymbolNode('A', 'Z'),
+ newRangeSymbolNode('a', 'z'),
+ ),
+ },
+ {
+ pattern: "[\\u{004E}]",
+ ast: newSymbolNode('N'),
+ },
+ {
+ pattern: "[\\u{0061}-\\u{007A}]",
+ ast: newRangeSymbolNode('a', 'z'),
+ },
+ {
+ pattern: "[\\p{Lu}]",
+ skipTestAST: true,
+ },
+ {
+ pattern: "[a-\\p{Lu}]",
+ syntaxError: synErrRangePropIsUnavailable,
+ },
+ {
+ pattern: "[\\p{Lu}-z]",
+ syntaxError: synErrRangePropIsUnavailable,
+ },
+ {
+ pattern: "[\\p{Lu}-\\p{Ll}]",
+ syntaxError: synErrRangePropIsUnavailable,
+ },
+ {
+ pattern: "[z-a]",
+ syntaxError: synErrRangeInvalidOrder,
+ },
+ {
+ pattern: "a[]",
+ syntaxError: synErrBExpNoElem,
+ },
+ {
+ pattern: "[]a",
+ syntaxError: synErrBExpNoElem,
+ },
+ {
+ pattern: "[]",
+ syntaxError: synErrBExpNoElem,
+ },
+ {
+ pattern: "[^\\u{004E}]",
+ ast: genAltNode(
+ newRangeSymbolNode(0x00, '\u004E'-1),
+ newRangeSymbolNode('\u004E'+1, 0x10FFFF),
+ ),
+ },
+ {
+ pattern: "[^\\u{0061}-\\u{007A}]",
+ ast: genAltNode(
+ newRangeSymbolNode(0x00, '\u0061'-1),
+ newRangeSymbolNode('\u007A'+1, 0x10FFFF),
+ ),
+ },
+ {
+ pattern: "[^\\p{Lu}]",
+ skipTestAST: true,
+ },
+ {
+ pattern: "[^a-\\p{Lu}]",
+ syntaxError: synErrRangePropIsUnavailable,
+ },
+ {
+ pattern: "[^\\p{Lu}-z]",
+ syntaxError: synErrRangePropIsUnavailable,
+ },
+ {
+ pattern: "[^\\p{Lu}-\\p{Ll}]",
+ syntaxError: synErrRangePropIsUnavailable,
+ },
+ {
+ pattern: "[^\\u{0000}-\\u{10FFFF}]",
+ syntaxError: synErrUnmatchablePattern,
+ },
+ {
+ pattern: "[^\\u{0000}-\\u{FFFF}\\u{010000}-\\u{10FFFF}]",
+ syntaxError: synErrUnmatchablePattern,
+ },
+ {
+ pattern: "[^]",
+ ast: newSymbolNode('^'),
+ },
+ {
+ pattern: "[",
+ syntaxError: synErrBExpUnclosed,
+ },
+ {
+ pattern: "([",
+ syntaxError: synErrBExpUnclosed,
+ },
+ {
+ pattern: "[a",
+ syntaxError: synErrBExpUnclosed,
+ },
+ {
+ pattern: "([a",
+ syntaxError: synErrBExpUnclosed,
+ },
+ {
+ pattern: "[a-",
+ syntaxError: synErrBExpUnclosed,
+ },
+ {
+ pattern: "([a-",
+ syntaxError: synErrBExpUnclosed,
+ },
+ {
+ pattern: "[^",
+ syntaxError: synErrBExpUnclosed,
+ },
+ {
+ pattern: "([^",
+ syntaxError: synErrBExpUnclosed,
+ },
+ {
+ pattern: "[^a",
+ syntaxError: synErrBExpUnclosed,
+ },
+ {
+ pattern: "([^a",
+ syntaxError: synErrBExpUnclosed,
+ },
+ {
+ pattern: "[^a-",
+ syntaxError: synErrBExpUnclosed,
+ },
+ {
+ pattern: "([^a-",
+ syntaxError: synErrBExpUnclosed,
+ },
+ {
+ pattern: "]",
+ ast: newSymbolNode(']'),
+ },
+ {
+ pattern: "(]",
+ syntaxError: synErrGroupUnclosed,
+ },
+ {
+ pattern: "a]",
+ ast: genConcatNode(
+ newSymbolNode('a'),
+ newSymbolNode(']'),
+ ),
+ },
+ {
+ pattern: "(a]",
+ syntaxError: synErrGroupUnclosed,
+ },
+ {
+ pattern: "([)",
+ syntaxError: synErrBExpUnclosed,
+ },
+ {
+ pattern: "([a)",
+ syntaxError: synErrBExpUnclosed,
+ },
+ {
+ pattern: "[a-]",
+ ast: genAltNode(
+ newSymbolNode('a'),
+ newSymbolNode('-'),
+ ),
+ },
+ {
+ pattern: "[^a-]",
+ ast: genAltNode(
+ newRangeSymbolNode(0x00, 0x2C),
+ newRangeSymbolNode(0x2E, 0x60),
+ newRangeSymbolNode(0x62, 0x10FFFF),
+ ),
+ },
+ {
+ pattern: "[-z]",
+ ast: genAltNode(
+ newSymbolNode('-'),
+ newSymbolNode('z'),
+ ),
+ },
+ {
+ pattern: "[^-z]",
+ ast: newAltNode(
+ newRangeSymbolNode(0x00, 0x2C),
+ newAltNode(
+ newRangeSymbolNode(0x2E, 0x79),
+ newRangeSymbolNode(0x7B, 0x10FFFF),
+ ),
+ ),
+ },
+ {
+ pattern: "[-]",
+ ast: newSymbolNode('-'),
+ },
+ {
+ pattern: "[^-]",
+ ast: genAltNode(
+ newRangeSymbolNode(0x00, 0x2C),
+ newRangeSymbolNode(0x2E, 0x10FFFF),
+ ),
+ },
+ {
+ pattern: "[^01]",
+ ast: genAltNode(
+ newRangeSymbolNode(0x00, '0'-1),
+ newRangeSymbolNode('1'+1, 0x10FFFF),
+ ),
+ },
+ {
+ pattern: "[^10]",
+ ast: genAltNode(
+ newRangeSymbolNode(0x00, '0'-1),
+ newRangeSymbolNode('1'+1, 0x10FFFF),
+ ),
+ },
+ {
+ pattern: "[^a-z]",
+ ast: genAltNode(
+ newRangeSymbolNode(0x00, 'a'-1),
+ newRangeSymbolNode('z'+1, 0x10FFFF),
+ ),
+ },
+ {
+ pattern: "[^az]",
+ ast: genAltNode(
+ newRangeSymbolNode(0x00, 'a'-1),
+ genAltNode(
+ newRangeSymbolN­ode('a'+1, 'z'-1),
+ newRangeSymbolNode('z'+1, 0x10FFFF),
+ ),
+ ),
+ },
+ {
+ pattern: "\\u{006E}",
+ ast: newSymbolNode('\u006E'),
+ },
+ {
+ pattern: "\\u{03BD}",
+ ast: newSymbolNode('\u03BD'),
+ },
+ {
+ pattern: "\\u{306B}",
+ ast: newSymbolNode('\u306B'),
+ },
+ {
+ pattern: "\\u{01F638}",
+ ast: newSymbolNode('\U0001F638'),
+ },
+ {
+ pattern: "\\u{0000}",
+ ast: newSymbolNode('\u0000'),
+ },
+ {
+ pattern: "\\u{10FFFF}",
+ ast: newSymbolNode('\U0010FFFF'),
+ },
+ {
+ pattern: "\\u{110000}",
+ syntaxError: synErrCPExpOutOfRange,
+ },
+ {
+ pattern: "\\u",
+ syntaxError: synErrCPExpInvalidForm,
+ },
+ {
+ pattern: "\\u{",
+ syntaxError: synErrCPExpInvalidForm,
+ },
+ {
+ pattern: "\\u{03BD",
+ syntaxError: synErrCPExpInvalidForm,
+ },
+ {
+ pattern: "\\u{}",
+ syntaxError: synErrCPExpInvalidForm,
+ },
+ {
+ pattern: "\\p{Letter}",
+ skipTestAST: true,
+ },
+ {
+ pattern: "\\p{General_Category=Letter}",
+ skipTestAST: true,
+ },
+ {
+ pattern: "\\p{ Letter }",
+ skipTestAST: true,
+ },
+ {
+ pattern: "\\p{ General_Category = Letter }",
+ skipTestAST: true,
+ },
+ {
+ pattern: "\\p",
+ syntaxError: synErrCharPropExpInvalidForm,
+ },
+ {
+ pattern: "\\p{",
+ syntaxError: synErrCharPropExpInvalidForm,
+ },
+ {
+ pattern: "\\p{Letter",
+ syntaxError: synErrCharPropExpInvalidForm,
+ },
+ {
+ pattern: "\\p{General_Category=}",
+ syntaxError: synErrCharPropExpInvalidForm,
+ },
+ {
+ pattern: "\\p{General_Category= }",
+ syntaxError: synErrCharPropInvalidSymbol,
+ },
+ {
+ pattern: "\\p{=Letter}",
+ syntaxError: synErrCharPropExpInvalidForm,
+ },
+ {
+ pattern: "\\p{ =Letter}",
+ syntaxError: synErrCharPropInvalidSymbol,
+ },
+ {
+ pattern: "\\p{=}",
+ syntaxError: synErrCharPropExpInvalidForm,
+ },
+ {
+ pattern: "\\p{}",
+ syntaxError: synErrCharPropExpInvalidForm,
+ },
+ {
+ pattern: "\\f{a2c}",
+ fragments: map[LexKindName]string{
+ "a2c": "abc",
+ },
+ ast: newFragmentNode("a2c",
+ genConcatNode(
+ newSymbolNode('a'),
+ newSymbolNode('b'),
+ newSymbolNode('c'),
+ ),
+ ),
+ },
+ {
+ pattern: "\\f{ a2c }",
+ fragments: map[LexKindName]string{
+ "a2c": "abc",
+ },
+ ast: newFragmentNode("a2c",
+ genConcatNode(
+ newSymbolNode('a'),
+ newSymbolNode('b'),
+ newSymbolNode('c'),
+ ),
+ ),
+ },
+ {
+ pattern: "\\f",
+ syntaxError: synErrFragmentExpInvalidForm,
+ },
+ {
+ pattern: "\\f{",
+ syntaxError: synErrFragmentExpInvalidForm,
+ },
+ {
+ pattern: "\\f{a2c",
+ fragments: map[LexKindName]string{
+ "a2c": "abc",
+ },
+ syntaxError: synErrFragmentExpInvalidForm,
+ },
+ {
+ pattern: "(a)",
+ ast: newSymbolNode('a'),
+ },
+ {
+ pattern: "(((a)))",
+ ast: newSymbolNode('a'),
+ },
+ {
+ pattern: "a()",
+ syntaxError: synErrGroupNoElem,
+ },
+ {
+ pattern: "()a",
+ syntaxError: synErrGroupNoElem,
+ },
+ {
+ pattern: "()",
+ syntaxError: synErrGroupNoElem,
+ },
+ {
+ pattern: "(",
+ syntaxError: synErrGroupUnclosed,
+ },
+ {
+ pattern: "a(",
+ syntaxError: synErrGroupUnclosed,
+ },
+ {
+ pattern: "(a",
+ syntaxError: synErrGroupUnclosed,
+ },
+ {
+ pattern: "((",
+ syntaxError: synErrGroupUnclosed,
+ },
+ {
+ pattern: "((a)",
+ syntaxError: synErrGroupUnclosed,
+ },
+ {
+ pattern: ")",
+ syntaxError: synErrGroupNoInitiator,
+ },
+ {
+ pattern: "a)",
+ syntaxError: synErrGroupNoInitiator,
+ },
+ {
+ pattern: ")a",
+ syntaxError: synErrGroupNoInitiator,
+ },
+ {
+ pattern: "))",
+ syntaxError: synErrGroupNoInitiator,
+ },
+ {
+ pattern: "(a))",
+ syntaxError: synErrGroupNoInitiator,
+ },
+ {
+ pattern: "Mulder|Scully",
+ ast: genAltNode(
+ genConcatNode(
+ newSymbolNode('M'),
+ newSymbolNode('u'),
+ newSymbolNode('l'),
+ newSymbolNode('d'),
+ newSymbolNode('e'),
+ newSymbolNode('r'),
+ ),
+ genConcatNode(
+ newSymbolNode('S'),
+ newSymbolNode('c'),
+ newSymbolNode('u'),
+ newSymbolNode('l'),
+ newSymbolNode('l'),
+ newSymbolNode('y'),
+ ),
+ ),
+ },
+ {
+ pattern: "Langly|Frohike|Byers",
+ ast: genAltNode(
+ genConcatNode(
+ newSymbolNode('L'),
+ newSymbolNode('a'),
+ newSymbolNode('n'),
+ newSymbolNode('g'),
+ newSymbolNode('l'),
+ newSymbolNode('y'),
+ ),
+ genConcatNode(
+ newSymbolNode('F'),
+ newSymbolNode('r'),
+ newSymbolNode('o'),
+ newSymbolNode('h'),
+ newSymbolNode('i'),
+ newSymbolNode('k'),
+ newSymbolNode('e'),
+ ),
+ genConcatNode(
+ newSymbolNode('B'),
+ newSymbolNode('y'),
+ newSymbolNode('e'),
+ newSymbolNode('r'),
+ newSymbolNode('s'),
+ ),
+ ),
+ },
+ {
+ pattern: "|",
+ syntaxError: synErrAltLackOfOperand,
+ },
+ {
+ pattern: "||",
+ syntaxError: synErrAltLackOfOperand,
+ },
+ {
+ pattern: "Mulder|",
+ syntaxError: synErrAltLackOfOperand,
+ },
+ {
+ pattern: "|Scully",
+ syntaxError: synErrAltLackOfOperand,
+ },
+ {
+ pattern: "Langly|Frohike|",
+ syntaxError: synErrAltLackOfOperand,
+ },
+ {
+ pattern: "Langly||Byers",
+ syntaxError: synErrAltLackOfOperand,
+ },
+ {
+ pattern: "|Frohike|Byers",
+ syntaxError: synErrAltLackOfOperand,
+ },
+ {
+ pattern: "|Frohike|",
+ syntaxError: synErrAltLackOfOperand,
+ },
+ {
+ pattern: "Fox(|)Mulder",
+ syntaxError: synErrAltLackOfOperand,
+ },
+ {
+ pattern: "(Fox|)Mulder",
+ syntaxError: synErrAltLackOfOperand,
+ },
+ {
+ pattern: "Fox(|Mulder)",
+ syntaxError: synErrAltLackOfOperand,
+ },
+ }
+ for i, tt := range tests {
+ t.Run(fmt.Sprintf("#%v %v", i, tt.pattern), func(t *testing.T) {
+ // Parse every fragment pattern into its own tree, keyed by fragment kind.
+ fragmentTrees := map[LexKindName]CPTree{}
+ for kind, pattern := range tt.fragments {
+ p := NewParser(kind, strings.NewReader(pattern))
+ root, err := p.Parse()
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ fragmentTrees[kind] = root
+ }
+ // CompleteFragments runs over the fragment trees before the pattern
+ // under test is parsed.
+ err := CompleteFragments(fragmentTrees)
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ p := NewParser(LexKindName("test"), strings.NewReader(tt.pattern))
+ root, err := p.Parse()
+ if tt.syntaxError != nil {
+ // A syntax error must surface as ParseErr, with the expected cause
+ // available from p.Error and no tree returned.
+ // printCPTree(os.Stdout, root, "", "")
+ if err != ParseErr {
+ t.Fatalf("unexpected error: want: %v, got: %v", ParseErr, err)
+ }
+ _, synErr := p.Error()
+ if synErr != tt.syntaxError {
+ t.Fatalf("unexpected syntax error: want: %v, got: %v", tt.syntaxError, synErr)
+ }
+ if root != nil {
+ t.Fatalf("tree must be nil")
+ }
+ } else {
+ if err != nil {
+ detail, cause := p.Error()
+ t.Fatalf("%v: %v: %v", err, cause, detail)
+ }
+ if root == nil {
+ t.Fatal("tree must be non-nil")
+ }
+
+ // Every fragment referenced by the pattern must be resolvable.
+ complete, err := ApplyFragments(root, fragmentTrees)
+ if err != nil {
+ t.Fatal(err)
+ }
+ if !complete {
+ t.Fatalf("incomplete fragments")
+ }
+
+ // printCPTree(os.Stdout, root, "", "")
+ if !tt.skipTestAST {
+ // The expected AST is compared with the tree held by the root node.
+ r := root.(*rootNode)
+ testAST(t, tt.ast, r.tree)
+ }
+ }
+ })
+ }
+}
+
+// TestParse_ContributoryPropertyIsNotExposed runs one subtest per property
+// returned by ucd.ContributoryProperties. Each subtest parses the pattern
+// `\p{<property>=yes}` and requires that parsing fails, that the parser
+// reports synErrCharPropUnsupported as the syntax error, and that no tree is
+// returned.
+func TestParse_ContributoryPropertyIsNotExposed(t *testing.T) {
+	for _, prop := range ucd.ContributoryProperties() {
+		name := fmt.Sprintf("%v", prop)
+		t.Run(name, func(t *testing.T) {
+			src := fmt.Sprintf(`\p{%v=yes}`, prop)
+			parser := NewParser(LexKindName("test"), strings.NewReader(src))
+
+			tree, parseErr := parser.Parse()
+			if parseErr == nil {
+				t.Fatalf("expected syntax error: got: nil")
+			}
+
+			// The first value returned by Error is not needed here; only the
+			// syntax error is checked.
+			if _, cause := parser.Error(); cause != synErrCharPropUnsupported {
+				t.Fatalf("unexpected syntax error: want: %v, got: %v", synErrCharPropUnsupported, cause)
+			}
+
+			if tree != nil {
+				t.Fatalf("tree is not nil")
+			}
+		})
+	}
+}
+
+func testAST(t *testing.T, expected, actual CPTree) {
+ t.Helper()
+
+ aTy := reflect.TypeOf(actual)
+ eTy := reflect.TypeOf(expected)
+ if eTy != aTy {
+ t.Fatalf("unexpected node: want: %+v, got: %+v", eTy, aTy)
+ }
+
+ if actual == nil {
+ return
+ }
+
+ switch e := expected.(type) {
+ case *symbolNode:
+ a := actual.(*symbolNode)
+ if a.From != e.From || a.To != e.To {
+ t.Fatalf("unexpected node: want: %+v, got: %+v", e, a)
+ }
+ }
+ eLeft, eRight := expected.children()
+ aLeft, aRight := actual.children()
+ testAST(t, eLeft, aLeft)
+ testAST(t, eRight, aRight)
+}
+
+// TestExclude is a table-driven test for exclude. For every case it calls
+// exclude(target, base) and compares the returned tree with the expected
+// result using testAST. The cases are grouped by how the target range (t)
+// overlaps the base range (b); the expected result is written as "b - t" and
+// is nil when the target covers the whole base.
+//
+// In the diagrams below each +--+ cell stands for one symbol, and the columns
+// are aligned so that the leftmost cell position corresponds to '0'.
+func TestExclude(t *testing.T) {
+ for _, test := range []struct {
+ caption string
+ target CPTree
+ base CPTree
+ result CPTree
+ }{
+ // t.From > b.From && t.To < b.To
+
+ // |t.From - b.From| = 1
+ // |b.To - t.To| = 1
+ //
+ // Target (t):        +--+
+ // Base (b):       +--+--+--+
+ // Result (b - t): +--+  +--+
+ {
+ caption: "|t.From - b.From| = 1 && |b.To - t.To| = 1",
+ target: newSymbolNode('1'),
+ base: newRangeSymbolNode('0', '2'),
+ result: newAltNode(
+ newSymbolNode('0'),
+ newSymbolNode('2'),
+ ),
+ },
+ // |t.From - b.From| > 1
+ // |b.To - t.To| > 1
+ //
+ // Target (t):           +--+
+ // Base (b):       +--+--+--+--+--+
+ // Result (b - t): +--+--+  +--+--+
+ {
+ caption: "|t.From - b.From| > 1 && |b.To - t.To| > 1",
+ target: newSymbolNode('2'),
+ base: newRangeSymbolNode('0', '4'),
+ result: newAltNode(
+ newRangeSymbolNode('0', '1'),
+ newRangeSymbolNode('3', '4'),
+ ),
+ },
+
+ // t.From <= b.From && t.To >= b.From && t.To < b.To
+
+ // |b.From - t.From| = 0
+ // |t.To - b.From| = 0
+ // |b.To - t.To| = 1
+ //
+ // Target (t):     +--+
+ // Base (b):       +--+--+
+ // Result (b - t):    +--+
+ {
+ caption: "|b.From - t.From| = 0 && |t.To - b.From| = 0 && |b.To - t.To| = 1",
+ target: newSymbolNode('0'),
+ base: newRangeSymbolNode('0', '1'),
+ result: newSymbolNode('1'),
+ },
+ // |b.From - t.From| = 0
+ // |t.To - b.From| = 0
+ // |b.To - t.To| > 1
+ //
+ // Target (t):     +--+
+ // Base (b):       +--+--+--+
+ // Result (b - t):    +--+--+
+ {
+ caption: "|b.From - t.From| = 0 && |t.To - b.From| = 0 && |b.To - t.To| > 1",
+ target: newSymbolNode('0'),
+ base: newRangeSymbolNode('0', '2'),
+ result: newRangeSymbolNode('1', '2'),
+ },
+ // |b.From - t.From| = 0
+ // |t.To - b.From| > 0
+ // |b.To - t.To| = 1
+ //
+ // Target (t):     +--+--+
+ // Base (b):       +--+--+--+
+ // Result (b - t):       +--+
+ {
+ caption: "|b.From - t.From| = 0 && |t.To - b.From| > 0 && |b.To - t.To| = 1",
+ target: newRangeSymbolNode('0', '1'),
+ base: newRangeSymbolNode('0', '2'),
+ result: newSymbolNode('2'),
+ },
+ // |b.From - t.From| = 0
+ // |t.To - b.From| > 0
+ // |b.To - t.To| > 1
+ //
+ // Target (t):     +--+--+
+ // Base (b):       +--+--+--+--+
+ // Result (b - t):       +--+--+
+ {
+ caption: "|b.From - t.From| = 0 && |t.To - b.From| > 0 && |b.To - t.To| > 1",
+ target: newRangeSymbolNode('0', '1'),
+ base: newRangeSymbolNode('0', '3'),
+ result: newRangeSymbolNode('2', '3'),
+ },
+ // |b.From - t.From| > 0
+ // |t.To - b.From| = 0
+ // |b.To - t.To| = 1
+ //
+ // Target (t):     +--+--+
+ // Base (b):          +--+--+
+ // Result (b - t):       +--+
+ {
+ caption: "|b.From - t.From| > 0 && |t.To - b.From| = 0 && |b.To - t.To| = 1",
+ target: newRangeSymbolNode('0', '1'),
+ base: newRangeSymbolNode('1', '2'),
+ result: newSymbolNode('2'),
+ },
+ // |b.From - t.From| > 0
+ // |t.To - b.From| = 0
+ // |b.To - t.To| > 1
+ //
+ // Target (t):     +--+--+
+ // Base (b):          +--+--+--+
+ // Result (b - t):       +--+--+
+ {
+ caption: "|b.From - t.From| > 0 && |t.To - b.From| = 0 && |b.To - t.To| > 1",
+ target: newRangeSymbolNode('0', '1'),
+ base: newRangeSymbolNode('1', '3'),
+ result: newRangeSymbolNode('2', '3'),
+ },
+ // |b.From - t.From| > 0
+ // |t.To - b.From| > 0
+ // |b.To - t.To| = 1
+ //
+ // Target (t):     +--+--+--+
+ // Base (b):          +--+--+--+
+ // Result (b - t):          +--+
+ {
+ caption: "|b.From - t.From| > 0 && |t.To - b.From| > 0 && |b.To - t.To| = 1",
+ target: newRangeSymbolNode('0', '2'),
+ base: newRangeSymbolNode('1', '3'),
+ result: newSymbolNode('3'),
+ },
+ // |b.From - t.From| > 0
+ // |t.To - b.From| > 0
+ // |b.To - t.To| > 1
+ //
+ // Target (t):     +--+--+--+
+ // Base (b):          +--+--+--+--+
+ // Result (b - t):          +--+--+
+ {
+ caption: "|b.From - t.From| > 0 && |t.To - b.From| > 0 && |b.To - t.To| > 1",
+ target: newRangeSymbolNode('0', '2'),
+ base: newRangeSymbolNode('1', '4'),
+ result: newRangeSymbolNode('3', '4'),
+ },
+
+ // t.From > b.From && t.From <= b.To && t.To >= b.To
+
+ // |t.From - b.From| = 1
+ // |b.To - t.From| = 0
+ // |t.To - b.To| = 0
+ //
+ // Target (t):        +--+
+ // Base (b):       +--+--+
+ // Result (b - t): +--+
+ {
+ caption: "|t.From - b.From| = 1 && |b.To - t.From| = 0 && |t.To - b.To| = 0",
+ target: newSymbolNode('1'),
+ base: newRangeSymbolNode('0', '1'),
+ result: newSymbolNode('0'),
+ },
+ // |t.From - b.From| = 1
+ // |b.To - t.From| = 0
+ // |t.To - b.To| > 0
+ //
+ // Target (t):        +--+--+
+ // Base (b):       +--+--+
+ // Result (b - t): +--+
+ {
+ caption: "|t.From - b.From| = 1 && |b.To - t.From| = 0 && |t.To - b.To| > 0",
+ target: newRangeSymbolNode('1', '2'),
+ base: newRangeSymbolNode('0', '1'),
+ result: newSymbolNode('0'),
+ },
+ // |t.From - b.From| = 1
+ // |b.To - t.From| > 0
+ // |t.To - b.To| = 0
+ //
+ // Target (t):        +--+--+
+ // Base (b):       +--+--+--+
+ // Result (b - t): +--+
+ {
+ caption: "|t.From - b.From| = 1 && |b.To - t.From| > 0 && |t.To - b.To| = 0",
+ target: newRangeSymbolNode('1', '2'),
+ base: newRangeSymbolNode('0', '2'),
+ result: newSymbolNode('0'),
+ },
+ // |t.From - b.From| = 1
+ // |b.To - t.From| > 0
+ // |t.To - b.To| > 0
+ //
+ // Target (t):        +--+--+--+
+ // Base (b):       +--+--+--+
+ // Result (b - t): +--+
+ {
+ caption: "|t.From - b.From| = 1 && |b.To - t.From| > 0 && |t.To - b.To| > 0",
+ target: newRangeSymbolNode('1', '3'),
+ base: newRangeSymbolNode('0', '2'),
+ result: newSymbolNode('0'),
+ },
+ // |t.From - b.From| > 1
+ // |b.To - t.From| = 0
+ // |t.To - b.To| = 0
+ //
+ // Target (t):           +--+
+ // Base (b):       +--+--+--+
+ // Result (b - t): +--+--+
+ {
+ caption: "|t.From - b.From| > 1 && |b.To - t.From| = 0 && |t.To - b.To| = 0",
+ target: newSymbolNode('2'),
+ base: newRangeSymbolNode('0', '2'),
+ result: newRangeSymbolNode('0', '1'),
+ },
+ // |t.From - b.From| > 1
+ // |b.To - t.From| = 0
+ // |t.To - b.To| > 0
+ //
+ // Target (t):           +--+--+
+ // Base (b):       +--+--+--+
+ // Result (b - t): +--+--+
+ {
+ caption: "|t.From - b.From| > 1 && |b.To - t.From| = 0 && |t.To - b.To| > 0",
+ target: newRangeSymbolNode('2', '3'),
+ base: newRangeSymbolNode('0', '2'),
+ result: newRangeSymbolNode('0', '1'),
+ },
+ // |t.From - b.From| > 1
+ // |b.To - t.From| > 0
+ // |t.To - b.To| = 0
+ //
+ // Target (t):           +--+--+
+ // Base (b):       +--+--+--+--+
+ // Result (b - t): +--+--+
+ {
+ caption: "|t.From - b.From| > 1 && |b.To - t.From| > 0 && |t.To - b.To| = 0",
+ target: newRangeSymbolNode('2', '3'),
+ base: newRangeSymbolNode('0', '3'),
+ result: newRangeSymbolNode('0', '1'),
+ },
+ // |t.From - b.From| > 1
+ // |b.To - t.From| > 0
+ // |t.To - b.To| > 0
+ //
+ // Target (t):           +--+--+--+
+ // Base (b):       +--+--+--+--+
+ // Result (b - t): +--+--+
+ {
+ caption: "|t.From - b.From| > 1 && |b.To - t.From| > 0 && |t.To - b.To| > 0",
+ target: newRangeSymbolNode('2', '4'),
+ base: newRangeSymbolNode('0', '3'),
+ result: newRangeSymbolNode('0', '1'),
+ },
+
+ // t.From <= b.From && t.To >= b.To
+
+ // |b.From - t.From| = 0
+ // |t.To - b.To| = 0
+ //
+ // Target (t):     +--+
+ // Base (b):       +--+
+ // Result (b - t): N/A
+ {
+ caption: "|b.From - t.From| = 0 && |t.To - b.To| = 0",
+ target: newSymbolNode('0'),
+ base: newSymbolNode('0'),
+ result: nil,
+ },
+ // |b.From - t.From| = 0
+ // |t.To - b.To| > 0
+ //
+ // Target (t):     +--+--+
+ // Base (b):       +--+
+ // Result (b - t): N/A
+ {
+ caption: "|b.From - t.From| = 0 && |t.To - b.To| > 0",
+ target: newRangeSymbolNode('0', '1'),
+ base: newSymbolNode('0'),
+ result: nil,
+ },
+ // |b.From - t.From| > 0
+ // |t.To - b.To| = 0
+ //
+ // Target (t):     +--+--+
+ // Base (b):          +--+
+ // Result (b - t): N/A
+ {
+ caption: "|b.From - t.From| > 0 && |t.To - b.To| = 0",
+ target: newRangeSymbolNode('0', '1'),
+ base: newSymbolNode('1'),
+ result: nil,
+ },
+ // |b.From - t.From| > 0
+ // |t.To - b.To| > 0
+ //
+ // Target (t):     +--+--+--+
+ // Base (b):          +--+
+ // Result (b - t): N/A
+ {
+ caption: "|b.From - t.From| > 0 && |t.To - b.To| > 0",
+ target: newRangeSymbolNode('0', '2'),
+ base: newSymbolNode('1'),
+ result: nil,
+ },
+
+ // Others
+
+ // |b.From - t.From| = 1
+ //
+ // Target (t):     +--+
+ // Base (b):          +--+
+ // Result (b - t):    +--+
+ {
+ caption: "|b.From - t.From| = 1",
+ target: newSymbolNode('0'),
+ base: newSymbolNode('1'),
+ result: newSymbolNode('1'),
+ },
+ // |b.From - t.From| > 1
+ //
+ // Target (t):     +--+
+ // Base (b):             +--+
+ // Result (b - t):       +--+
+ {
+ caption: "|b.From - t.From| > 1",
+ target: newSymbolNode('0'),
+ base: newSymbolNode('2'),
+ result: newSymbolNode('2'),
+ },
+ // |t.To - b.To| = 1
+ //
+ // Target (t):        +--+
+ // Base (b):       +--+
+ // Result (b - t): +--+
+ {
+ caption: "|t.To - b.To| = 1",
+ target: newSymbolNode('1'),
+ base: newSymbolNode('0'),
+ result: newSymbolNode('0'),
+ },
+ // |t.To - b.To| > 1
+ //
+ // Target (t):           +--+
+ // Base (b):       +--+
+ // Result (b - t): +--+
+ {
+ caption: "|t.To - b.To| > 1",
+ target: newSymbolNode('2'),
+ base: newSymbolNode('0'),
+ result: newSymbolNode('0'),
+ },
+ } {
+ t.Run(test.caption, func(t *testing.T) {
+ // Compute base minus target and compare it structurally with the expected tree.
+ r := exclude(test.target, test.base)
+ testAST(t, test.result, r)
+ })
+ }
+}
+
func MainTest() {
@@ -582,6 +2482,10 @@ func MainTest() {
{ "TestSnakeCaseToUpperCamelCase", TestSnakeCaseToUpperCamelCase },
{ "TestFindSpellingInconsistencies", TestFindSpellingInconsistencies },
{ "TestLexSpec_Validate", TestLexSpec_Validate },
+ { "TestLexer", TestLexer },
+ { "TestParse", TestParse },
+ { "TestParse_ContributoryPropertyIsNotExposed", TestParse_ContributoryPropertyIsNotExposed },
+ { "TestExclude", TestExclude },
}
deps := testdeps.TestDeps{}