diff options
-rw-r--r-- | compiler/parser/error.go | 36 | ||||
-rw-r--r-- | compiler/parser/fragment.go | 72 | ||||
-rw-r--r-- | compiler/parser/lexer.go | 594 | ||||
-rw-r--r-- | compiler/parser/lexer_test.go | 524 | ||||
-rw-r--r-- | compiler/parser/parser.go | 531 | ||||
-rw-r--r-- | compiler/parser/parser_test.go | 1389 | ||||
-rw-r--r-- | compiler/parser/tree.go | 459 | ||||
-rw-r--r-- | src/tre.go | 1661 | ||||
-rw-r--r-- | tests/tre.go | 1904 |
9 files changed, 3565 insertions, 3605 deletions
diff --git a/compiler/parser/error.go b/compiler/parser/error.go deleted file mode 100644 index be81da4..0000000 --- a/compiler/parser/error.go +++ /dev/null @@ -1,36 +0,0 @@ -package parser - -import "fmt" - -var ( - ParseErr = fmt.Errorf("parse error") - - // lexical errors - synErrIncompletedEscSeq = fmt.Errorf("incompleted escape sequence; unexpected EOF following \\") - synErrInvalidEscSeq = fmt.Errorf("invalid escape sequence") - synErrInvalidCodePoint = fmt.Errorf("code points must consist of just 4 or 6 hex digits") - synErrCharPropInvalidSymbol = fmt.Errorf("invalid character property symbol") - SynErrFragmentInvalidSymbol = fmt.Errorf("invalid fragment symbol") - - // syntax errors - synErrUnexpectedToken = fmt.Errorf("unexpected token") - synErrNullPattern = fmt.Errorf("a pattern must be a non-empty byte sequence") - synErrUnmatchablePattern = fmt.Errorf("a pattern cannot match any characters") - synErrAltLackOfOperand = fmt.Errorf("an alternation expression must have operands") - synErrRepNoTarget = fmt.Errorf("a repeat expression must have an operand") - synErrGroupNoElem = fmt.Errorf("a grouping expression must include at least one character") - synErrGroupUnclosed = fmt.Errorf("unclosed grouping expression") - synErrGroupNoInitiator = fmt.Errorf(") needs preceding (") - synErrGroupInvalidForm = fmt.Errorf("invalid grouping expression") - synErrBExpNoElem = fmt.Errorf("a bracket expression must include at least one character") - synErrBExpUnclosed = fmt.Errorf("unclosed bracket expression") - synErrBExpInvalidForm = fmt.Errorf("invalid bracket expression") - synErrRangeInvalidOrder = fmt.Errorf("a range expression with invalid order") - synErrRangePropIsUnavailable = fmt.Errorf("a property expression is unavailable in a range expression") - synErrRangeInvalidForm = fmt.Errorf("invalid range expression") - synErrCPExpInvalidForm = fmt.Errorf("invalid code point expression") - synErrCPExpOutOfRange = fmt.Errorf("a code point must be between U+0000 to U+10FFFF") - synErrCharPropExpInvalidForm = fmt.Errorf("invalid character property expression") - synErrCharPropUnsupported = fmt.Errorf("unsupported character property") - synErrFragmentExpInvalidForm = fmt.Errorf("invalid fragment expression") -) diff --git a/compiler/parser/fragment.go b/compiler/parser/fragment.go deleted file mode 100644 index 5680b55..0000000 --- a/compiler/parser/fragment.go +++ /dev/null @@ -1,72 +0,0 @@ -package parser - -import ( - "fmt" - - "github.com/nihei9/maleeni/spec" -) - -type incompleteFragment struct { - kind spec.LexKindName - root *rootNode -} - -func CompleteFragments(fragments map[spec.LexKindName]CPTree) error { - if len(fragments) == 0 { - return nil - } - - completeFragments := map[spec.LexKindName]CPTree{} - incompleteFragments := []*incompleteFragment{} - for kind, tree := range fragments { - root, ok := tree.(*rootNode) - if !ok { - return fmt.Errorf("CompleteFragments can take only *rootNode: %T", tree) - } - if root.incomplete() { - incompleteFragments = append(incompleteFragments, &incompleteFragment{ - kind: kind, - root: root, - }) - } else { - completeFragments[kind] = root - } - } - for len(incompleteFragments) > 0 { - lastIncompCount := len(incompleteFragments) - remainingFragments := []*incompleteFragment{} - for _, e := range incompleteFragments { - complete, err := ApplyFragments(e.root, completeFragments) - if err != nil { - return err - } - if !complete { - remainingFragments = append(remainingFragments, e) - } else { - completeFragments[e.kind] = e.root - } - } - incompleteFragments = remainingFragments - if len(incompleteFragments) == lastIncompCount { - return ParseErr - } - } - - return nil -} - -func ApplyFragments(t CPTree, fragments map[spec.LexKindName]CPTree) (bool, error) { - root, ok := t.(*rootNode) - if !ok { - return false, fmt.Errorf("ApplyFragments can take only *rootNode type: %T", t) - } - - for name, frag := range fragments { - err := root.applyFragment(name, frag) - if err != nil { - return false, err - } - } - - return !root.incomplete(), nil -} diff --git a/compiler/parser/lexer.go b/compiler/parser/lexer.go deleted file mode 100644 index 3861825..0000000 --- a/compiler/parser/lexer.go +++ /dev/null @@ -1,594 +0,0 @@ -package parser - -import ( - "bufio" - "fmt" - "io" - "strings" -) - -type tokenKind string - -const ( - tokenKindChar tokenKind = "char" - tokenKindAnyChar tokenKind = "." - tokenKindRepeat tokenKind = "*" - tokenKindRepeatOneOrMore tokenKind = "+" - tokenKindOption tokenKind = "?" - tokenKindAlt tokenKind = "|" - tokenKindGroupOpen tokenKind = "(" - tokenKindGroupClose tokenKind = ")" - tokenKindBExpOpen tokenKind = "[" - tokenKindInverseBExpOpen tokenKind = "[^" - tokenKindBExpClose tokenKind = "]" - tokenKindCharRange tokenKind = "-" - tokenKindCodePointLeader tokenKind = "\\u" - tokenKindCharPropLeader tokenKind = "\\p" - tokenKindFragmentLeader tokenKind = "\\f" - tokenKindLBrace tokenKind = "{" - tokenKindRBrace tokenKind = "}" - tokenKindEqual tokenKind = "=" - tokenKindCodePoint tokenKind = "code point" - tokenKindCharPropSymbol tokenKind = "character property symbol" - tokenKindFragmentSymbol tokenKind = "fragment symbol" - tokenKindEOF tokenKind = "eof" -) - -type token struct { - kind tokenKind - char rune - propSymbol string - codePoint string - fragmentSymbol string -} - -const nullChar = '\u0000' - -func newToken(kind tokenKind, char rune) *token { - return &token{ - kind: kind, - char: char, - } -} - -func newCodePointToken(codePoint string) *token { - return &token{ - kind: tokenKindCodePoint, - codePoint: codePoint, - } -} - -func newCharPropSymbolToken(propSymbol string) *token { - return &token{ - kind: tokenKindCharPropSymbol, - propSymbol: propSymbol, - } -} - -func newFragmentSymbolToken(fragmentSymbol string) *token { - return &token{ - kind: tokenKindFragmentSymbol, - fragmentSymbol: fragmentSymbol, - } -} - -type lexerMode string - -const ( - lexerModeDefault lexerMode = "default" - lexerModeBExp lexerMode = "bracket expression" - lexerModeCPExp lexerMode = "code point expression" - lexerModeCharPropExp lexerMode = "character property expression" - lexerModeFragmentExp lexerMode = "fragment expression" -) - -type lexerModeStack struct { - stack []lexerMode -} - -func newLexerModeStack() *lexerModeStack { - return &lexerModeStack{ - stack: []lexerMode{ - lexerModeDefault, - }, - } -} - -func (s *lexerModeStack) top() lexerMode { - return s.stack[len(s.stack)-1] -} - -func (s *lexerModeStack) push(m lexerMode) { - s.stack = append(s.stack, m) -} - -func (s *lexerModeStack) pop() { - s.stack = s.stack[:len(s.stack)-1] -} - -type rangeState string - -// [a-z] -// ^^^^ -// |||`-- ready -// ||`-- expect range terminator -// |`-- read range initiator -// `-- ready -const ( - rangeStateReady rangeState = "ready" - rangeStateReadRangeInitiator rangeState = "read range initiator" - rangeStateExpectRangeTerminator rangeState = "expect range terminator" -) - -type lexer struct { - src *bufio.Reader - peekChar2 rune - peekEOF2 bool - peekChar1 rune - peekEOF1 bool - lastChar rune - reachedEOF bool - prevChar1 rune - prevEOF1 bool - prevChar2 rune - pervEOF2 bool - modeStack *lexerModeStack - rangeState rangeState - - errCause error - errDetail string -} - -func newLexer(src io.Reader) *lexer { - return &lexer{ - src: bufio.NewReader(src), - peekChar2: nullChar, - peekEOF2: false, - peekChar1: nullChar, - peekEOF1: false, - lastChar: nullChar, - reachedEOF: false, - prevChar1: nullChar, - prevEOF1: false, - prevChar2: nullChar, - pervEOF2: false, - modeStack: newLexerModeStack(), - rangeState: rangeStateReady, - } -} - -func (l *lexer) error() (string, error) { - return l.errDetail, l.errCause -} - -func (l *lexer) next() (*token, error) { - c, eof, err := l.read() - if err != nil { - return nil, err - } - if eof { - return newToken(tokenKindEOF, nullChar), nil - } - - switch l.modeStack.top() { - case lexerModeBExp: - tok, err := l.nextInBExp(c) - if err != nil { - return nil, err - } - if tok.kind == tokenKindChar || tok.kind == tokenKindCodePointLeader || tok.kind == tokenKindCharPropLeader { - switch l.rangeState { - case rangeStateReady: - l.rangeState = rangeStateReadRangeInitiator - case rangeStateExpectRangeTerminator: - l.rangeState = rangeStateReady - } - } - switch tok.kind { - case tokenKindBExpClose: - l.modeStack.pop() - case tokenKindCharRange: - l.rangeState = rangeStateExpectRangeTerminator - case tokenKindCodePointLeader: - l.modeStack.push(lexerModeCPExp) - case tokenKindCharPropLeader: - l.modeStack.push(lexerModeCharPropExp) - } - return tok, nil - case lexerModeCPExp: - tok, err := l.nextInCodePoint(c) - if err != nil { - return nil, err - } - switch tok.kind { - case tokenKindRBrace: - l.modeStack.pop() - } - return tok, nil - case lexerModeCharPropExp: - tok, err := l.nextInCharProp(c) - if err != nil { - return nil, err - } - switch tok.kind { - case tokenKindRBrace: - l.modeStack.pop() - } - return tok, nil - case lexerModeFragmentExp: - tok, err := l.nextInFragment(c) - if err != nil { - return nil, err - } - switch tok.kind { - case tokenKindRBrace: - l.modeStack.pop() - } - return tok, nil - default: - tok, err := l.nextInDefault(c) - if err != nil { - return nil, err - } - switch tok.kind { - case tokenKindBExpOpen: - l.modeStack.push(lexerModeBExp) - l.rangeState = rangeStateReady - case tokenKindInverseBExpOpen: - l.modeStack.push(lexerModeBExp) - l.rangeState = rangeStateReady - case tokenKindCodePointLeader: - l.modeStack.push(lexerModeCPExp) - case tokenKindCharPropLeader: - l.modeStack.push(lexerModeCharPropExp) - case tokenKindFragmentLeader: - l.modeStack.push(lexerModeFragmentExp) - } - return tok, nil - } -} - -func (l *lexer) nextInDefault(c rune) (*token, error) { - switch c { - case '*': - return newToken(tokenKindRepeat, nullChar), nil - case '+': - return newToken(tokenKindRepeatOneOrMore, nullChar), nil - case '?': - return newToken(tokenKindOption, nullChar), nil - case '.': - return newToken(tokenKindAnyChar, nullChar), nil - case '|': - return newToken(tokenKindAlt, nullChar), nil - case '(': - return newToken(tokenKindGroupOpen, nullChar), nil - case ')': - return newToken(tokenKindGroupClose, nullChar), nil - case '[': - c1, eof, err := l.read() - if err != nil { - return nil, err - } - if eof { - err := l.restore() - if err != nil { - return nil, err - } - return newToken(tokenKindBExpOpen, nullChar), nil - } - if c1 != '^' { - err := l.restore() - if err != nil { - return nil, err - } - return newToken(tokenKindBExpOpen, nullChar), nil - } - c2, eof, err := l.read() - if err != nil { - return nil, err - } - if eof { - err := l.restore() - if err != nil { - return nil, err - } - return newToken(tokenKindInverseBExpOpen, nullChar), nil - } - if c2 != ']' { - err := l.restore() - if err != nil { - return nil, err - } - return newToken(tokenKindInverseBExpOpen, nullChar), nil - } - err = l.restore() - if err != nil { - return nil, err - } - err = l.restore() - if err != nil { - return nil, err - } - return newToken(tokenKindBExpOpen, nullChar), nil - case '\\': - c, eof, err := l.read() - if err != nil { - return nil, err - } - if eof { - l.errCause = synErrIncompletedEscSeq - return nil, ParseErr - } - if c == 'u' { - return newToken(tokenKindCodePointLeader, nullChar), nil - } - if c == 'p' { - return newToken(tokenKindCharPropLeader, nullChar), nil - } - if c == 'f' { - return newToken(tokenKindFragmentLeader, nullChar), nil - } - if c == '\\' || c == '.' || c == '*' || c == '+' || c == '?' || c == '|' || c == '(' || c == ')' || c == '[' || c == ']' { - return newToken(tokenKindChar, c), nil - } - l.errCause = synErrInvalidEscSeq - l.errDetail = fmt.Sprintf("\\%v is not supported", string(c)) - return nil, ParseErr - default: - return newToken(tokenKindChar, c), nil - } -} - -func (l *lexer) nextInBExp(c rune) (*token, error) { - switch c { - case '-': - if l.rangeState != rangeStateReadRangeInitiator { - return newToken(tokenKindChar, c), nil - } - c1, eof, err := l.read() - if err != nil { - return nil, err - } - if eof { - err := l.restore() - if err != nil { - return nil, err - } - return newToken(tokenKindChar, c), nil - } - if c1 != ']' { - err := l.restore() - if err != nil { - return nil, err - } - return newToken(tokenKindCharRange, nullChar), nil - } - err = l.restore() - if err != nil { - return nil, err - } - return newToken(tokenKindChar, c), nil - case ']': - return newToken(tokenKindBExpClose, nullChar), nil - case '\\': - c, eof, err := l.read() - if err != nil { - return nil, err - } - if eof { - l.errCause = synErrIncompletedEscSeq - return nil, ParseErr - } - if c == 'u' { - return newToken(tokenKindCodePointLeader, nullChar), nil - } - if c == 'p' { - return newToken(tokenKindCharPropLeader, nullChar), nil - } - if c == '\\' || c == '^' || c == '-' || c == ']' { - return newToken(tokenKindChar, c), nil - } - l.errCause = synErrInvalidEscSeq - l.errDetail = fmt.Sprintf("\\%v is not supported in a bracket expression", string(c)) - return nil, ParseErr - default: - return newToken(tokenKindChar, c), nil - } -} - -func (l *lexer) nextInCodePoint(c rune) (*token, error) { - switch c { - case '{': - return newToken(tokenKindLBrace, nullChar), nil - case '}': - return newToken(tokenKindRBrace, nullChar), nil - default: - if !isHexDigit(c) { - l.errCause = synErrInvalidCodePoint - return nil, ParseErr - } - var b strings.Builder - fmt.Fprint(&b, string(c)) - n := 1 - for { - c, eof, err := l.read() - if err != nil { - return nil, err - } - if eof { - err := l.restore() - if err != nil { - return nil, err - } - break - } - if c == '}' { - err := l.restore() - if err != nil { - return nil, err - } - break - } - if !isHexDigit(c) || n >= 6 { - l.errCause = synErrInvalidCodePoint - return nil, ParseErr - } - fmt.Fprint(&b, string(c)) - n++ - } - cp := b.String() - cpLen := len(cp) - if !(cpLen == 4 || cpLen == 6) { - l.errCause = synErrInvalidCodePoint - return nil, ParseErr - } - return newCodePointToken(b.String()), nil - } -} - -func isHexDigit(c rune) bool { - if c >= '0' && c <= '9' || c >= 'A' && c <= 'Z' || c >= 'a' && c <= 'z' { - return true - } - return false -} - -func (l *lexer) nextInCharProp(c rune) (*token, error) { - switch c { - case '{': - return newToken(tokenKindLBrace, nullChar), nil - case '}': - return newToken(tokenKindRBrace, nullChar), nil - case '=': - return newToken(tokenKindEqual, nullChar), nil - default: - var b strings.Builder - fmt.Fprint(&b, string(c)) - n := 1 - for { - c, eof, err := l.read() - if err != nil { - return nil, err - } - if eof { - err := l.restore() - if err != nil { - return nil, err - } - break - } - if c == '}' || c == '=' { - err := l.restore() - if err != nil { - return nil, err - } - break - } - fmt.Fprint(&b, string(c)) - n++ - } - sym := strings.TrimSpace(b.String()) - if len(sym) == 0 { - l.errCause = synErrCharPropInvalidSymbol - return nil, ParseErr - } - return newCharPropSymbolToken(sym), nil - } -} - -func (l *lexer) nextInFragment(c rune) (*token, error) { - switch c { - case '{': - return newToken(tokenKindLBrace, nullChar), nil - case '}': - return newToken(tokenKindRBrace, nullChar), nil - default: - var b strings.Builder - fmt.Fprint(&b, string(c)) - n := 1 - for { - c, eof, err := l.read() - if err != nil { - return nil, err - } - if eof { - err := l.restore() - if err != nil { - return nil, err - } - break - } - if c == '}' { - err := l.restore() - if err != nil { - return nil, err - } - break - } - fmt.Fprint(&b, string(c)) - n++ - } - sym := strings.TrimSpace(b.String()) - if len(sym) == 0 { - l.errCause = SynErrFragmentInvalidSymbol - return nil, ParseErr - } - return newFragmentSymbolToken(sym), nil - } -} - -func (l *lexer) read() (rune, bool, error) { - if l.reachedEOF { - return l.lastChar, l.reachedEOF, nil - } - if l.peekChar1 != nullChar || l.peekEOF1 { - l.prevChar2 = l.prevChar1 - l.pervEOF2 = l.prevEOF1 - l.prevChar1 = l.lastChar - l.prevEOF1 = l.reachedEOF - l.lastChar = l.peekChar1 - l.reachedEOF = l.peekEOF1 - l.peekChar1 = l.peekChar2 - l.peekEOF1 = l.peekEOF2 - l.peekChar2 = nullChar - l.peekEOF2 = false - return l.lastChar, l.reachedEOF, nil - } - c, _, err := l.src.ReadRune() - if err != nil { - if err == io.EOF { - l.prevChar2 = l.prevChar1 - l.pervEOF2 = l.prevEOF1 - l.prevChar1 = l.lastChar - l.prevEOF1 = l.reachedEOF - l.lastChar = nullChar - l.reachedEOF = true - return l.lastChar, l.reachedEOF, nil - } - return nullChar, false, err - } - l.prevChar2 = l.prevChar1 - l.pervEOF2 = l.prevEOF1 - l.prevChar1 = l.lastChar - l.prevEOF1 = l.reachedEOF - l.lastChar = c - l.reachedEOF = false - return l.lastChar, l.reachedEOF, nil -} - -func (l *lexer) restore() error { - if l.lastChar == nullChar && !l.reachedEOF { - return fmt.Errorf("failed to call restore() because the last character is null") - } - l.peekChar2 = l.peekChar1 - l.peekEOF2 = l.peekEOF1 - l.peekChar1 = l.lastChar - l.peekEOF1 = l.reachedEOF - l.lastChar = l.prevChar1 - l.reachedEOF = l.prevEOF1 - l.prevChar1 = l.prevChar2 - l.prevEOF1 = l.pervEOF2 - l.prevChar2 = nullChar - l.pervEOF2 = false - return nil -} diff --git a/compiler/parser/lexer_test.go b/compiler/parser/lexer_test.go deleted file mode 100644 index 055466e..0000000 --- a/compiler/parser/lexer_test.go +++ /dev/null @@ -1,524 +0,0 @@ -package parser - -import ( - "strings" - "testing" -) - -func TestLexer(t *testing.T) { - tests := []struct { - caption string - src string - tokens []*token - err error - }{ - { - caption: "lexer can recognize ordinaly characters", - src: "123abcいろは", - tokens: []*token{ - newToken(tokenKindChar, '1'), - newToken(tokenKindChar, '2'), - newToken(tokenKindChar, '3'), - newToken(tokenKindChar, 'a'), - newToken(tokenKindChar, 'b'), - newToken(tokenKindChar, 'c'), - newToken(tokenKindChar, 'い'), - newToken(tokenKindChar, 'ろ'), - newToken(tokenKindChar, 'は'), - newToken(tokenKindEOF, nullChar), - }, - }, - { - caption: "lexer can recognize the special characters in default mode", - src: ".*+?|()[\\u", - tokens: []*token{ - newToken(tokenKindAnyChar, nullChar), - newToken(tokenKindRepeat, nullChar), - newToken(tokenKindRepeatOneOrMore, nullChar), - newToken(tokenKindOption, nullChar), - newToken(tokenKindAlt, nullChar), - newToken(tokenKindGroupOpen, nullChar), - newToken(tokenKindGroupClose, nullChar), - newToken(tokenKindBExpOpen, nullChar), - newToken(tokenKindCodePointLeader, nullChar), - newToken(tokenKindEOF, nullChar), - }, - }, - { - caption: "lexer can recognize the escape sequences in default mode", - src: "\\\\\\.\\*\\+\\?\\|\\(\\)\\[", - tokens: []*token{ - newToken(tokenKindChar, '\\'), - newToken(tokenKindChar, '.'), - newToken(tokenKindChar, '*'), - newToken(tokenKindChar, '+'), - newToken(tokenKindChar, '?'), - newToken(tokenKindChar, '|'), - newToken(tokenKindChar, '('), - newToken(tokenKindChar, ')'), - newToken(tokenKindChar, '['), - newToken(tokenKindEOF, nullChar), - }, - }, - { - caption: "], {, and } are treated as an ordinary character in default mode", - src: "]{}", - tokens: []*token{ - newToken(tokenKindChar, ']'), - newToken(tokenKindChar, '{'), - newToken(tokenKindChar, '}'), - newToken(tokenKindEOF, nullChar), - }, - }, - { - caption: "lexer can recognize the special characters in bracket expression mode", - src: "[a-z\\u{09AF}][^a-z\\u{09abcf}]", - tokens: []*token{ - newToken(tokenKindBExpOpen, nullChar), - newToken(tokenKindChar, 'a'), - newToken(tokenKindCharRange, nullChar), - newToken(tokenKindChar, 'z'), - newToken(tokenKindCodePointLeader, nullChar), - newToken(tokenKindLBrace, nullChar), - newCodePointToken("09AF"), - newToken(tokenKindRBrace, nullChar), - newToken(tokenKindBExpClose, nullChar), - newToken(tokenKindInverseBExpOpen, nullChar), - newToken(tokenKindChar, 'a'), - newToken(tokenKindCharRange, nullChar), - newToken(tokenKindChar, 'z'), - newToken(tokenKindCodePointLeader, nullChar), - newToken(tokenKindLBrace, nullChar), - newCodePointToken("09abcf"), - newToken(tokenKindRBrace, nullChar), - newToken(tokenKindBExpClose, nullChar), - newToken(tokenKindEOF, nullChar), - }, - }, - { - caption: "lexer can recognize the escape sequences in bracket expression mode", - src: "[\\^a\\-z]", - tokens: []*token{ - newToken(tokenKindBExpOpen, nullChar), - newToken(tokenKindChar, '^'), - newToken(tokenKindChar, 'a'), - newToken(tokenKindChar, '-'), - newToken(tokenKindChar, 'z'), - newToken(tokenKindBExpClose, nullChar), - newToken(tokenKindEOF, nullChar), - }, - }, - { - caption: "in a bracket expression, the special characters are also handled as normal characters", - src: "[\\\\.*+?|()[", - tokens: []*token{ - newToken(tokenKindBExpOpen, nullChar), - newToken(tokenKindChar, '\\'), - newToken(tokenKindChar, '.'), - newToken(tokenKindChar, '*'), - newToken(tokenKindChar, '+'), - newToken(tokenKindChar, '?'), - newToken(tokenKindChar, '|'), - newToken(tokenKindChar, '('), - newToken(tokenKindChar, ')'), - newToken(tokenKindChar, '['), - newToken(tokenKindEOF, nullChar), - }, - }, - { - caption: "hyphen symbols that appear in bracket expressions are handled as the character range symbol or ordinary characters", - // [...-...][...-][-...][-] - // ~~~~~~~ ~ ~ ~ - // ^ ^ ^ ^ - // | | | `-- Ordinary Character (b) - // | | `-- Ordinary Character (b) - // | `-- Ordinary Character (b) - // `-- Character Range (a) - // - // a. *-* is handled as a character-range expression. - // b. *-, -*, or - are handled as ordinary characters. - src: "[a-z][a-][-z][-][--][---][^a-z][^a-][^-z][^-][^--][^---]", - tokens: []*token{ - newToken(tokenKindBExpOpen, nullChar), - newToken(tokenKindChar, 'a'), - newToken(tokenKindCharRange, nullChar), - newToken(tokenKindChar, 'z'), - newToken(tokenKindBExpClose, nullChar), - newToken(tokenKindBExpOpen, nullChar), - newToken(tokenKindChar, 'a'), - newToken(tokenKindChar, '-'), - newToken(tokenKindBExpClose, nullChar), - newToken(tokenKindBExpOpen, nullChar), - newToken(tokenKindChar, '-'), - newToken(tokenKindChar, 'z'), - newToken(tokenKindBExpClose, nullChar), - newToken(tokenKindBExpOpen, nullChar), - newToken(tokenKindChar, '-'), - newToken(tokenKindBExpClose, nullChar), - newToken(tokenKindBExpOpen, nullChar), - newToken(tokenKindChar, '-'), - newToken(tokenKindChar, '-'), - newToken(tokenKindBExpClose, nullChar), - newToken(tokenKindBExpOpen, nullChar), - newToken(tokenKindChar, '-'), - newToken(tokenKindCharRange, nullChar), - newToken(tokenKindChar, '-'), - newToken(tokenKindBExpClose, nullChar), - - newToken(tokenKindInverseBExpOpen, nullChar), - newToken(tokenKindChar, 'a'), - newToken(tokenKindCharRange, nullChar), - newToken(tokenKindChar, 'z'), - newToken(tokenKindBExpClose, nullChar), - newToken(tokenKindInverseBExpOpen, nullChar), - newToken(tokenKindChar, 'a'), - newToken(tokenKindChar, '-'), - newToken(tokenKindBExpClose, nullChar), - newToken(tokenKindInverseBExpOpen, nullChar), - newToken(tokenKindChar, '-'), - newToken(tokenKindChar, 'z'), - newToken(tokenKindBExpClose, nullChar), - newToken(tokenKindInverseBExpOpen, nullChar), - newToken(tokenKindChar, '-'), - newToken(tokenKindBExpClose, nullChar), - newToken(tokenKindInverseBExpOpen, nullChar), - newToken(tokenKindChar, '-'), - newToken(tokenKindChar, '-'), - newToken(tokenKindBExpClose, nullChar), - newToken(tokenKindInverseBExpOpen, nullChar), - newToken(tokenKindChar, '-'), - newToken(tokenKindCharRange, nullChar), - newToken(tokenKindChar, '-'), - newToken(tokenKindBExpClose, nullChar), - - newToken(tokenKindEOF, nullChar), - }, - }, - { - caption: "caret symbols that appear in bracket expressions are handled as the logical inverse symbol or ordinary characters", - // [^...^...][^] - // ~~ ~ ~~ - // ^ ^ ^^ - // | | |`-- Ordinary Character (c) - // | | `-- Bracket Expression - // | `-- Ordinary Character (b) - // `-- Inverse Bracket Expression (a) - // - // a. Bracket expressions that have a caret symbol at the beginning are handled as logical inverse expressions. - // b. caret symbols that appear as the second and the subsequent symbols are handled as ordinary symbols. - // c. When a bracket expression has just one symbol, a caret symbol at the beginning is handled as an ordinary character. - src: "[^^][^]", - tokens: []*token{ - newToken(tokenKindInverseBExpOpen, nullChar), - newToken(tokenKindChar, '^'), - newToken(tokenKindBExpClose, nullChar), - newToken(tokenKindBExpOpen, nullChar), - newToken(tokenKindChar, '^'), - newToken(tokenKindBExpClose, nullChar), - newToken(tokenKindEOF, nullChar), - }, - }, - { - caption: "lexer raises an error when an invalid escape sequence appears", - src: "\\@", - err: synErrInvalidEscSeq, - }, - { - caption: "lexer raises an error when the incomplete escape sequence (EOF following \\) appears", - src: "\\", - err: synErrIncompletedEscSeq, - }, - { - caption: "lexer raises an error when an invalid escape sequence appears", - src: "[\\@", - tokens: []*token{ - newToken(tokenKindBExpOpen, nullChar), - }, - err: synErrInvalidEscSeq, - }, - { - caption: "lexer raises an error when the incomplete escape sequence (EOF following \\) appears", - src: "[\\", - tokens: []*token{ - newToken(tokenKindBExpOpen, nullChar), - }, - err: synErrIncompletedEscSeq, - }, - { - caption: "lexer can recognize the special characters and code points in code point expression mode", - src: "\\u{0123}\\u{4567}\\u{89abcd}\\u{efAB}\\u{CDEF01}[\\u{0123}\\u{4567}\\u{89abcd}\\u{efAB}\\u{CDEF01}][^\\u{0123}\\u{4567}\\u{89abcd}\\u{efAB}\\u{CDEF01}]", - tokens: []*token{ - newToken(tokenKindCodePointLeader, nullChar), - newToken(tokenKindLBrace, nullChar), - newCodePointToken("0123"), - newToken(tokenKindRBrace, nullChar), - newToken(tokenKindCodePointLeader, nullChar), - newToken(tokenKindLBrace, nullChar), - newCodePointToken("4567"), - newToken(tokenKindRBrace, nullChar), - newToken(tokenKindCodePointLeader, nullChar), - newToken(tokenKindLBrace, nullChar), - newCodePointToken("89abcd"), - newToken(tokenKindRBrace, nullChar), - newToken(tokenKindCodePointLeader, nullChar), - newToken(tokenKindLBrace, nullChar), - newCodePointToken("efAB"), - newToken(tokenKindRBrace, nullChar), - newToken(tokenKindCodePointLeader, nullChar), - newToken(tokenKindLBrace, nullChar), - newCodePointToken("CDEF01"), - newToken(tokenKindRBrace, nullChar), - - newToken(tokenKindBExpOpen, nullChar), - newToken(tokenKindCodePointLeader, nullChar), - newToken(tokenKindLBrace, nullChar), - newCodePointToken("0123"), - newToken(tokenKindRBrace, nullChar), - newToken(tokenKindCodePointLeader, nullChar), - newToken(tokenKindLBrace, nullChar), - newCodePointToken("4567"), - newToken(tokenKindRBrace, nullChar), - newToken(tokenKindCodePointLeader, nullChar), - newToken(tokenKindLBrace, nullChar), - newCodePointToken("89abcd"), - newToken(tokenKindRBrace, nullChar), - newToken(tokenKindCodePointLeader, nullChar), - newToken(tokenKindLBrace, nullChar), - newCodePointToken("efAB"), - newToken(tokenKindRBrace, nullChar), - newToken(tokenKindCodePointLeader, nullChar), - newToken(tokenKindLBrace, nullChar), - newCodePointToken("CDEF01"), - newToken(tokenKindRBrace, nullChar), - newToken(tokenKindBExpClose, nullChar), - - newToken(tokenKindInverseBExpOpen, nullChar), - newToken(tokenKindCodePointLeader, nullChar), - newToken(tokenKindLBrace, nullChar), - newCodePointToken("0123"), - newToken(tokenKindRBrace, nullChar), - newToken(tokenKindCodePointLeader, nullChar), - newToken(tokenKindLBrace, nullChar), - newCodePointToken("4567"), - newToken(tokenKindRBrace, nullChar), - newToken(tokenKindCodePointLeader, nullChar), - newToken(tokenKindLBrace, nullChar), - newCodePointToken("89abcd"), - newToken(tokenKindRBrace, nullChar), - newToken(tokenKindCodePointLeader, nullChar), - newToken(tokenKindLBrace, nullChar), - newCodePointToken("efAB"), - newToken(tokenKindRBrace, nullChar), - newToken(tokenKindCodePointLeader, nullChar), - newToken(tokenKindLBrace, nullChar), - newCodePointToken("CDEF01"), - newToken(tokenKindRBrace, nullChar), - newToken(tokenKindBExpClose, nullChar), - - newToken(tokenKindEOF, nullChar), - }, - }, - { - caption: "a one digit hex string isn't a valid code point", - src: "\\u{0", - tokens: []*token{ - newToken(tokenKindCodePointLeader, nullChar), - newToken(tokenKindLBrace, nullChar), - }, - err: synErrInvalidCodePoint, - }, - { - caption: "a two digits hex string isn't a valid code point", - src: "\\u{01", - tokens: []*token{ - newToken(tokenKindCodePointLeader, nullChar), - newToken(tokenKindLBrace, nullChar), - }, - err: synErrInvalidCodePoint, - }, - { - caption: "a three digits hex string isn't a valid code point", - src: "\\u{012", - tokens: []*token{ - newToken(tokenKindCodePointLeader, nullChar), - newToken(tokenKindLBrace, nullChar), - }, - err: synErrInvalidCodePoint, - }, - { - caption: "a four digits hex string is a valid code point", - src: "\\u{0123}", - tokens: []*token{ - newToken(tokenKindCodePointLeader, nullChar), - newToken(tokenKindLBrace, nullChar), - newCodePointToken("0123"), - newToken(tokenKindRBrace, nullChar), - }, - }, - { - caption: "a five digits hex string isn't a valid code point", - src: "\\u{01234", - tokens: []*token{ - newToken(tokenKindCodePointLeader, nullChar), - newToken(tokenKindLBrace, nullChar), - }, - err: synErrInvalidCodePoint, - }, - { - caption: "a six digits hex string is a valid code point", - src: "\\u{012345}", - tokens: []*token{ - newToken(tokenKindCodePointLeader, nullChar), - newToken(tokenKindLBrace, nullChar), - newCodePointToken("012345"), - newToken(tokenKindRBrace, nullChar), - }, - }, - { - caption: "a seven digits hex string isn't a valid code point", - src: "\\u{0123456", - tokens: []*token{ - newToken(tokenKindCodePointLeader, nullChar), - newToken(tokenKindLBrace, nullChar), - }, - err: synErrInvalidCodePoint, - }, - { - caption: "a code point must be hex digits", - src: "\\u{g", - tokens: []*token{ - newToken(tokenKindCodePointLeader, nullChar), - newToken(tokenKindLBrace, nullChar), - }, - err: synErrInvalidCodePoint, - }, - { - caption: "a code point must be hex digits", - src: "\\u{G", - tokens: []*token{ - newToken(tokenKindCodePointLeader, nullChar), - newToken(tokenKindLBrace, nullChar), - }, - err: synErrInvalidCodePoint, - }, - { - caption: "lexer can recognize the special characters and symbols in character property expression mode", - src: "\\p{Letter}\\p{General_Category=Letter}[\\p{Letter}\\p{General_Category=Letter}][^\\p{Letter}\\p{General_Category=Letter}]", - tokens: []*token{ - newToken(tokenKindCharPropLeader, nullChar), - newToken(tokenKindLBrace, nullChar), - newCharPropSymbolToken("Letter"), - newToken(tokenKindRBrace, nullChar), - newToken(tokenKindCharPropLeader, nullChar), - newToken(tokenKindLBrace, nullChar), - newCharPropSymbolToken("General_Category"), - newToken(tokenKindEqual, nullChar), - newCharPropSymbolToken("Letter"), - newToken(tokenKindRBrace, nullChar), - - newToken(tokenKindBExpOpen, nullChar), - newToken(tokenKindCharPropLeader, nullChar), - newToken(tokenKindLBrace, nullChar), - newCharPropSymbolToken("Letter"), - newToken(tokenKindRBrace, nullChar), - newToken(tokenKindCharPropLeader, nullChar), - newToken(tokenKindLBrace, nullChar), - newCharPropSymbolToken("General_Category"), - newToken(tokenKindEqual, nullChar), - newCharPropSymbolToken("Letter"), - newToken(tokenKindRBrace, nullChar), - newToken(tokenKindBExpClose, nullChar), - - newToken(tokenKindInverseBExpOpen, nullChar), - newToken(tokenKindCharPropLeader, nullChar), - newToken(tokenKindLBrace, nullChar), - newCharPropSymbolToken("Letter"), - newToken(tokenKindRBrace, nullChar), - newToken(tokenKindCharPropLeader, nullChar), - newToken(tokenKindLBrace, nullChar), - newCharPropSymbolToken("General_Category"), - newToken(tokenKindEqual, nullChar), - newCharPropSymbolToken("Letter"), - newToken(tokenKindRBrace, nullChar), - newToken(tokenKindBExpClose, nullChar), - - newToken(tokenKindEOF, nullChar), - }, - }, - { - caption: "lexer can recognize the special characters and symbols in fragment expression mode", - src: "\\f{integer}", - tokens: []*token{ - newToken(tokenKindFragmentLeader, nullChar), - newToken(tokenKindLBrace, nullChar), - newFragmentSymbolToken("integer"), - newToken(tokenKindRBrace, nullChar), - - newToken(tokenKindEOF, nullChar), - }, - }, - { - caption: "a fragment expression is not supported in a bracket expression", - src: "[\\f", - tokens: []*token{ - newToken(tokenKindBExpOpen, nullChar), - }, - err: synErrInvalidEscSeq, - }, - { - caption: "a fragment expression is not supported in an inverse bracket expression", - src: "[^\\f", - tokens: []*token{ - newToken(tokenKindInverseBExpOpen, nullChar), - }, - err: synErrInvalidEscSeq, - }, - } - for _, tt := range tests { - t.Run(tt.caption, func(t *testing.T) { - lex := newLexer(strings.NewReader(tt.src)) - var err error - var tok *token - i := 0 - for { - tok, err = lex.next() - if err != nil { - break - } - if i >= len(tt.tokens) { - break - } - eTok := tt.tokens[i] - i++ - testToken(t, tok, eTok) - - if tok.kind == tokenKindEOF { - break - } - } - if tt.err != nil { - if err != ParseErr { - t.Fatalf("unexpected error: want: %v, got: %v", ParseErr, err) - } - detail, cause := lex.error() - if cause != tt.err { - t.Fatalf("unexpected error: want: %v, got: %v (%v)", tt.err, cause, detail) - } - } else { - if err != nil { - t.Fatalf("unexpected error: %v", err) - } - } - if i < len(tt.tokens) { - t.Fatalf("expecte more tokens") - } - }) - } -} - -func testToken(t *testing.T, a, e *token) { - t.Helper() - if e.kind != a.kind || e.char != a.char || e.codePoint != a.codePoint { - t.Fatalf("unexpected token: want: %+v, got: %+v", e, a) - } -} diff --git a/compiler/parser/parser.go b/compiler/parser/parser.go deleted file mode 100644 index b7f8c04..0000000 --- a/compiler/parser/parser.go +++ /dev/null @@ -1,531 +0,0 @@ -package parser - -import ( - "bytes" - "fmt" - "io" - "strconv" - - "github.com/nihei9/maleeni/spec" - "github.com/nihei9/maleeni/ucd" -) - -type PatternEntry struct { - ID spec.LexModeKindID - Pattern []byte -} - -type parser struct { - kind spec.LexKindName - lex *lexer - peekedTok *token - lastTok *token - - // If and only if isContributoryPropertyExposed is true, the parser interprets contributory properties that - // appear in property expressions. - // - // The contributory properties are not exposed, and users cannot use those properties because the parser - // follows [UAX #44 5.13 Property APIs]. For instance, \p{Other_Alphabetic} is invalid. - // - // isContributoryPropertyExposed is set to true when the parser is generated recursively. The parser needs to - // interpret derived properties internally because the derived properties consist of other properties that - // may contain the contributory properties. - // - // [UAX #44 5.13 Property APIs] says: - // > The following subtypes of Unicode character properties should generally not be exposed in APIs, - // > except in limited circumstances. They may not be useful, particularly in public API collections, - // > and may instead prove misleading to the users of such API collections. - // > * Contributory properties are not recommended for public APIs. - // > ... - // https://unicode.org/reports/tr44/#Property_APIs - isContributoryPropertyExposed bool - - errCause error - errDetail string -} - -func NewParser(kind spec.LexKindName, src io.Reader) *parser { - return &parser{ - kind: kind, - lex: newLexer(src), - isContributoryPropertyExposed: false, - } -} - -func (p *parser) exposeContributoryProperty() { - p.isContributoryPropertyExposed = true -} - -func (p *parser) Error() (string, error) { - return p.errDetail, p.errCause -} - -func (p *parser) Parse() (root CPTree, retErr error) { - defer func() { - err := recover() - if err != nil { - var ok bool - retErr, ok = err.(error) - if !ok { - panic(err) - } - return - } - }() - - return newRootNode(p.kind, p.parseRegexp()), nil -} - -func (p *parser) parseRegexp() CPTree { - alt := p.parseAlt() - if alt == nil { - if p.consume(tokenKindGroupClose) { - p.raiseParseError(synErrGroupNoInitiator, "") - } - p.raiseParseError(synErrNullPattern, "") - } - if p.consume(tokenKindGroupClose) { - p.raiseParseError(synErrGroupNoInitiator, "") - } - p.expect(tokenKindEOF) - return alt -} - -func (p *parser) parseAlt() CPTree { - left := p.parseConcat() - if left == nil { - if p.consume(tokenKindAlt) { - p.raiseParseError(synErrAltLackOfOperand, "") - } - return nil - } - for { - if !p.consume(tokenKindAlt) { - break - } - right := p.parseConcat() - if right == nil { - p.raiseParseError(synErrAltLackOfOperand, "") - } - left = newAltNode(left, right) - } - return left -} - -func (p *parser) parseConcat() CPTree { - left := p.parseRepeat() - for { - right := p.parseRepeat() - if right == nil { - break - } - left = newConcatNode(left, right) - } - return left -} - -func (p *parser) parseRepeat() CPTree { - group := p.parseGroup() - if group == nil { - if p.consume(tokenKindRepeat) { - p.raiseParseError(synErrRepNoTarget, "* needs an operand") - } - if p.consume(tokenKindRepeatOneOrMore) { - p.raiseParseError(synErrRepNoTarget, "+ needs an operand") - } - if p.consume(tokenKindOption) { - p.raiseParseError(synErrRepNoTarget, "? needs an operand") - } - return nil - } - if p.consume(tokenKindRepeat) { - return newRepeatNode(group) - } - if p.consume(tokenKindRepeatOneOrMore) { - return newRepeatOneOrMoreNode(group) - } - if p.consume(tokenKindOption) { - return newOptionNode(group) - } - return group -} - -func (p *parser) parseGroup() CPTree { - if p.consume(tokenKindGroupOpen) { - alt := p.parseAlt() - if alt == nil { - if p.consume(tokenKindEOF) { - p.raiseParseError(synErrGroupUnclosed, "") - } - p.raiseParseError(synErrGroupNoElem, "") - } - if p.consume(tokenKindEOF) { - p.raiseParseError(synErrGroupUnclosed, "") - } - if !p.consume(tokenKindGroupClose) { - p.raiseParseError(synErrGroupInvalidForm, "") - } - return alt - } - return p.parseSingleChar() -} - -func (p *parser) parseSingleChar() CPTree { - if p.consume(tokenKindAnyChar) { - return genAnyCharAST() - } - if p.consume(tokenKindBExpOpen) { - left := p.parseBExpElem() - if left == nil { - if p.consume(tokenKindEOF) { - p.raiseParseError(synErrBExpUnclosed, "") - } - p.raiseParseError(synErrBExpNoElem, "") - } - for { - right := p.parseBExpElem() - if right == nil { - break - } - left = newAltNode(left, right) - } - if p.consume(tokenKindEOF) { - p.raiseParseError(synErrBExpUnclosed, "") - } - p.expect(tokenKindBExpClose) - return left - } - if p.consume(tokenKindInverseBExpOpen) { - elem := p.parseBExpElem() - if elem == nil { - if p.consume(tokenKindEOF) { - p.raiseParseError(synErrBExpUnclosed, "") - } - p.raiseParseError(synErrBExpNoElem, "") - } - inverse := exclude(elem, genAnyCharAST()) - if inverse == nil { - p.raiseParseError(synErrUnmatchablePattern, "") - } - for { - elem := p.parseBExpElem() - if elem == nil { - break - } - inverse = exclude(elem, inverse) - if inverse == nil { - p.raiseParseError(synErrUnmatchablePattern, "") - } - } - if p.consume(tokenKindEOF) { - p.raiseParseError(synErrBExpUnclosed, "") - } - p.expect(tokenKindBExpClose) - return inverse - } - if p.consume(tokenKindCodePointLeader) { - return p.parseCodePoint() - } - if p.consume(tokenKindCharPropLeader) { - return p.parseCharProp() - } - if p.consume(tokenKindFragmentLeader) { - return p.parseFragment() - } - c := p.parseNormalChar() - if c == nil { - if p.consume(tokenKindBExpClose) { - p.raiseParseError(synErrBExpInvalidForm, "") - } - return nil - } - return c -} - -func (p *parser) parseBExpElem() CPTree { - var left CPTree - switch { - case p.consume(tokenKindCodePointLeader): - left = p.parseCodePoint() - case p.consume(tokenKindCharPropLeader): - left = p.parseCharProp() - if p.consume(tokenKindCharRange) { - p.raiseParseError(synErrRangePropIsUnavailable, "") - } - default: - left = p.parseNormalChar() - } - if left == nil { - return nil - } - if !p.consume(tokenKindCharRange) { - return left - } - var right CPTree - switch { - case p.consume(tokenKindCodePointLeader): - right = p.parseCodePoint() - case p.consume(tokenKindCharPropLeader): - p.raiseParseError(synErrRangePropIsUnavailable, "") - default: - right = p.parseNormalChar() - } - if right == nil { - p.raiseParseError(synErrRangeInvalidForm, "") - } - from, _, _ := left.Range() - _, to, _ := right.Range() - if !isValidOrder(from, to) { - p.raiseParseError(synErrRangeInvalidOrder, fmt.Sprintf("%X..%X", from, to)) - } - return newRangeSymbolNode(from, to) -} - -func (p *parser) parseCodePoint() CPTree { - if !p.consume(tokenKindLBrace) { - p.raiseParseError(synErrCPExpInvalidForm, "") - } - if !p.consume(tokenKindCodePoint) { - p.raiseParseError(synErrCPExpInvalidForm, "") - } - - n, err := strconv.ParseInt(p.lastTok.codePoint, 16, 64) - if err != nil { - panic(fmt.Errorf("failed to decode a code point (%v) into a int: %v", p.lastTok.codePoint, err)) - } - if n < 0x0000 || n > 0x10FFFF { - p.raiseParseError(synErrCPExpOutOfRange, "") - } - - sym := newSymbolNode(rune(n)) - - if !p.consume(tokenKindRBrace) { - p.raiseParseError(synErrCPExpInvalidForm, "") - } - - return sym -} - -func (p *parser) parseCharProp() CPTree { - if !p.consume(tokenKindLBrace) { - p.raiseParseError(synErrCharPropExpInvalidForm, "") - } - var sym1, sym2 string - if !p.consume(tokenKindCharPropSymbol) { - p.raiseParseError(synErrCharPropExpInvalidForm, "") - } - sym1 = p.lastTok.propSymbol - if p.consume(tokenKindEqual) { - if !p.consume(tokenKindCharPropSymbol) { - p.raiseParseError(synErrCharPropExpInvalidForm, "") - } - sym2 = p.lastTok.propSymbol - } - - var alt CPTree - var propName, propVal string - if sym2 != "" { - propName = sym1 - propVal = sym2 - } else { - propName = "" - propVal = sym1 - } - if !p.isContributoryPropertyExposed && ucd.IsContributoryProperty(propName) { - p.raiseParseError(synErrCharPropUnsupported, propName) - } - pat, err := ucd.NormalizeCharacterProperty(propName, propVal) - if err != nil { - p.raiseParseError(synErrCharPropUnsupported, err.Error()) - } - if pat != "" { - p := NewParser(p.kind, bytes.NewReader([]byte(pat))) - p.exposeContributoryProperty() - ast, err := p.Parse() - if err != nil { - panic(err) - } - alt = ast - } else { - cpRanges, inverse, err := ucd.FindCodePointRanges(propName, propVal) - if err != nil { - p.raiseParseError(synErrCharPropUnsupported, err.Error()) - } - if inverse { - r := cpRanges[0] - alt = exclude(newRangeSymbolNode(r.From, r.To), genAnyCharAST()) - if alt == nil { - p.raiseParseError(synErrUnmatchablePattern, "") - } - for _, r := range cpRanges[1:] { - alt = exclude(newRangeSymbolNode(r.From, r.To), alt) - if alt == nil { - p.raiseParseError(synErrUnmatchablePattern, "") - } - } - } else { - for _, r := range cpRanges { - alt = genAltNode( - alt, - newRangeSymbolNode(r.From, r.To), - ) - } - } - } - - if !p.consume(tokenKindRBrace) { - p.raiseParseError(synErrCharPropExpInvalidForm, "") - } - - return alt -} - -func (p *parser) parseFragment() CPTree { - if !p.consume(tokenKindLBrace) { - p.raiseParseError(synErrFragmentExpInvalidForm, "") - } - if !p.consume(tokenKindFragmentSymbol) { - p.raiseParseError(synErrFragmentExpInvalidForm, "") - } - sym := p.lastTok.fragmentSymbol - - if !p.consume(tokenKindRBrace) { - p.raiseParseError(synErrFragmentExpInvalidForm, "") - } - - return newFragmentNode(spec.LexKindName(sym), nil) -} - -func (p *parser) parseNormalChar() CPTree { - if !p.consume(tokenKindChar) { - return nil - } - return newSymbolNode(p.lastTok.char) -} - -func exclude(symbol, base CPTree) CPTree { - if left, right, ok := symbol.Alternatives(); ok { - return exclude(right, exclude(left, base)) - } - - if left, right, ok := base.Alternatives(); ok { - return genAltNode( - exclude(symbol, left), - exclude(symbol, right), - ) - } - - if bFrom, bTo, ok := base.Range(); ok { - sFrom, sTo, ok := symbol.Range() - if !ok { - panic(fmt.Errorf("invalid symbol tree: %T", symbol)) - } - - switch { - case sFrom > bFrom && sTo < bTo: - return genAltNode( - newRangeSymbolNode(bFrom, sFrom-1), - newRangeSymbolNode(sTo+1, bTo), - ) - case sFrom <= bFrom && sTo >= bFrom && sTo < bTo: - return newRangeSymbolNode(sTo+1, bTo) - case sFrom > bFrom && sFrom <= bTo && sTo >= bTo: - return newRangeSymbolNode(bFrom, sFrom-1) - case sFrom <= bFrom && sTo >= bTo: - return nil - default: - return base - } - } - - panic(fmt.Errorf("invalid base tree: %T", base)) -} - -func genAnyCharAST() CPTree { - return newRangeSymbolNode(0x0, 0x10FFFF) -} - -func isValidOrder(from, to rune) bool { - return from <= to -} - -func genConcatNode(cs ...CPTree) CPTree { - nonNilNodes := []CPTree{} - for _, c := range cs { - if c == nil { - continue - } - nonNilNodes = append(nonNilNodes, c) - } - if len(nonNilNodes) <= 0 { - return nil - } - if len(nonNilNodes) == 1 { - return nonNilNodes[0] - } - concat := newConcatNode(nonNilNodes[0], nonNilNodes[1]) - for _, c := range nonNilNodes[2:] { - concat = newConcatNode(concat, c) - } - return concat -} - -func genAltNode(cs ...CPTree) CPTree { - nonNilNodes := []CPTree{} - for _, c := range cs { - if c == nil { - continue - } - nonNilNodes = append(nonNilNodes, c) - } - if len(nonNilNodes) <= 0 { - return nil - } - if len(nonNilNodes) == 1 { - return nonNilNodes[0] - } - alt := newAltNode(nonNilNodes[0], nonNilNodes[1]) - for _, c := range nonNilNodes[2:] { - alt = newAltNode(alt, c) - } - return alt -} - -func (p *parser) expect(expected tokenKind) { - if !p.consume(expected) { - tok := p.peekedTok - p.raiseParseError(synErrUnexpectedToken, fmt.Sprintf("expected: %v, actual: %v", expected, tok.kind)) - } -} - -func (p *parser) consume(expected tokenKind) bool { - var tok *token - var err error - if p.peekedTok != nil { - tok = p.peekedTok - p.peekedTok = nil - } else { - tok, err = p.lex.next() - if err != nil { - if err == ParseErr { - detail, cause := p.lex.error() - p.raiseParseError(cause, detail) - } - panic(err) - } - } - p.lastTok = tok - if tok.kind == expected { - return true - } - p.peekedTok = tok - p.lastTok = nil - - return false -} - -func (p *parser) raiseParseError(err error, detail string) { - p.errCause = err - p.errDetail = detail - panic(ParseErr) -} diff --git a/compiler/parser/parser_test.go b/compiler/parser/parser_test.go deleted file mode 100644 index 57c130e..0000000 --- a/compiler/parser/parser_test.go +++ /dev/null @@ -1,1389 +0,0 @@ -package parser - -import ( - "fmt" - "reflect" - "strings" - "testing" - - "github.com/nihei9/maleeni/spec" - "github.com/nihei9/maleeni/ucd" -) - -func TestParse(t *testing.T) { - tests := []struct { - pattern string - fragments map[spec.LexKindName]string - ast CPTree - syntaxError error - - // When an AST is large, as patterns containing a character property expression, this test only checks - // that the pattern is parsable. The check of the validity of such AST is performed by checking that it - // can be matched correctly using the driver. - skipTestAST bool - }{ - { - pattern: "a", - ast: newSymbolNode('a'), - }, - { - pattern: "abc", - ast: genConcatNode( - newSymbolNode('a'), - newSymbolNode('b'), - newSymbolNode('c'), - ), - }, - { - pattern: "a?", - ast: newOptionNode( - newSymbolNode('a'), - ), - }, - { - pattern: "[abc]?", - ast: newOptionNode( - genAltNode( - newSymbolNode('a'), - newSymbolNode('b'), - newSymbolNode('c'), - ), - ), - }, - { - pattern: "\\u{3042}?", - ast: newOptionNode( - newSymbolNode('\u3042'), - ), - }, - { - pattern: "\\p{Letter}?", - skipTestAST: true, - }, - { - pattern: "\\f{a2c}?", - fragments: map[spec.LexKindName]string{ - "a2c": "abc", - }, - ast: newOptionNode( - newFragmentNode("a2c", - genConcatNode( - newSymbolNode('a'), - newSymbolNode('b'), - newSymbolNode('c'), - ), - ), - ), - }, - { - pattern: "(a)?", - ast: newOptionNode( - newSymbolNode('a'), - ), - }, - { - pattern: "((a?)?)?", - ast: newOptionNode( - newOptionNode( - newOptionNode( - newSymbolNode('a'), - ), - ), - ), - }, - { - pattern: "(abc)?", - ast: newOptionNode( - genConcatNode( - newSymbolNode('a'), - newSymbolNode('b'), - newSymbolNode('c'), - ), - ), - }, - { - pattern: "(a|b)?", - ast: newOptionNode( - genAltNode( - newSymbolNode('a'), - newSymbolNode('b'), - ), - ), - }, - { - pattern: "?", - syntaxError: synErrRepNoTarget, - }, - { - pattern: "(?)", - syntaxError: synErrRepNoTarget, - }, - { - pattern: "a|?", - syntaxError: synErrRepNoTarget, - }, - { - pattern: "?|b", - syntaxError: synErrRepNoTarget, - }, - { - pattern: "a??", - syntaxError: synErrRepNoTarget, - }, - { - pattern: "a*", - ast: newRepeatNode( - newSymbolNode('a'), - ), - }, - { - pattern: "[abc]*", - ast: newRepeatNode( - genAltNode( - newSymbolNode('a'), - newSymbolNode('b'), - newSymbolNode('c'), - ), - ), - }, - { - pattern: "\\u{3042}*", - ast: newRepeatNode( - newSymbolNode('\u3042'), - ), - }, - { - pattern: "\\p{Letter}*", - skipTestAST: true, - }, - { - pattern: "\\f{a2c}*", - fragments: map[spec.LexKindName]string{ - "a2c": "abc", - }, - ast: newRepeatNode( - newFragmentNode("a2c", - genConcatNode( - newSymbolNode('a'), - newSymbolNode('b'), - newSymbolNode('c'), - ), - ), - ), - }, - { - pattern: "((a*)*)*", - ast: newRepeatNode( - newRepeatNode( - newRepeatNode( - newSymbolNode('a'), - ), - ), - ), - }, - { - pattern: "(abc)*", - ast: newRepeatNode( - genConcatNode( - newSymbolNode('a'), - newSymbolNode('b'), - newSymbolNode('c'), - ), - ), - }, - { - pattern: "(a|b)*", - ast: newRepeatNode( - genAltNode( - newSymbolNode('a'), - newSymbolNode('b'), - ), - ), - }, - { - pattern: "*", - syntaxError: synErrRepNoTarget, - }, - { - pattern: "(*)", - syntaxError: synErrRepNoTarget, - }, - { - pattern: "a|*", - syntaxError: synErrRepNoTarget, - }, - { - pattern: "*|b", - syntaxError: synErrRepNoTarget, - }, - { - pattern: "a**", - syntaxError: synErrRepNoTarget, - }, - { - pattern: "a+", - ast: genConcatNode( - newSymbolNode('a'), - newRepeatNode( - newSymbolNode('a'), - ), - ), - }, - { - pattern: "[abc]+", - ast: genConcatNode( - genAltNode( - newSymbolNode('a'), - newSymbolNode('b'), - newSymbolNode('c'), - ), - newRepeatNode( - genAltNode( - newSymbolNode('a'), - newSymbolNode('b'), - newSymbolNode('c'), - ), - ), - ), - }, - { - pattern: "\\u{3042}+", - ast: genConcatNode( - newSymbolNode('\u3042'), - newRepeatNode( - newSymbolNode('\u3042'), - ), - ), - }, - { - pattern: "\\p{Letter}+", - skipTestAST: true, - }, - { - pattern: "\\f{a2c}+", - fragments: map[spec.LexKindName]string{ - "a2c": "abc", - }, - ast: genConcatNode( - newFragmentNode("a2c", - genConcatNode( - newSymbolNode('a'), - newSymbolNode('b'), - newSymbolNode('c'), - ), - ), - newRepeatNode( - newFragmentNode("a2c", - genConcatNode( - newSymbolNode('a'), - newSymbolNode('b'), - newSymbolNode('c'), - ), - ), - ), - ), - }, - { - pattern: "((a+)+)+", - ast: genConcatNode( - genConcatNode( - genConcatNode( - genConcatNode( - newSymbolNode('a'), - newRepeatNode( - newSymbolNode('a'), - ), - ), - newRepeatNode( - genConcatNode( - newSymbolNode('a'), - newRepeatNode( - newSymbolNode('a'), - ), - ), - ), - ), - newRepeatNode( - genConcatNode( - genConcatNode( - newSymbolNode('a'), - newRepeatNode( - newSymbolNode('a'), - ), - ), - newRepeatNode( - genConcatNode( - newSymbolNode('a'), - newRepeatNode( - newSymbolNode('a'), - ), - ), - ), - ), - ), - ), - ), - }, - { - pattern: "(abc)+", - ast: genConcatNode( - genConcatNode( - newSymbolNode('a'), - newSymbolNode('b'), - newSymbolNode('c'), - ), - newRepeatNode( - genConcatNode( - newSymbolNode('a'), - newSymbolNode('b'), - newSymbolNode('c'), - ), - ), - ), - }, - { - pattern: "(a|b)+", - ast: genConcatNode( - genAltNode( - newSymbolNode('a'), - newSymbolNode('b'), - ), - newRepeatNode( - genAltNode( - newSymbolNode('a'), - newSymbolNode('b'), - ), - ), - ), - }, - { - pattern: "+", - syntaxError: synErrRepNoTarget, - }, - { - pattern: "(+)", - syntaxError: synErrRepNoTarget, - }, - { - pattern: "a|+", - syntaxError: synErrRepNoTarget, - }, - { - pattern: "+|b", - syntaxError: synErrRepNoTarget, - }, - { - pattern: "a++", - syntaxError: synErrRepNoTarget, - }, - { - pattern: ".", - ast: newRangeSymbolNode(0x00, 0x10FFFF), - }, - { - pattern: "[a]", - ast: newSymbolNode('a'), - }, - { - pattern: "[abc]", - ast: genAltNode( - newSymbolNode('a'), - newSymbolNode('b'), - newSymbolNode('c'), - ), - }, - { - pattern: "[a-z]", - ast: newRangeSymbolNode('a', 'z'), - }, - { - pattern: "[A-Za-z]", - ast: genAltNode( - newRangeSymbolNode('A', 'Z'), - newRangeSymbolNode('a', 'z'), - ), - }, - { - pattern: "[\\u{004E}]", - ast: newSymbolNode('N'), - }, - { - pattern: "[\\u{0061}-\\u{007A}]", - ast: newRangeSymbolNode('a', 'z'), - }, - { - pattern: "[\\p{Lu}]", - skipTestAST: true, - }, - { - pattern: "[a-\\p{Lu}]", - syntaxError: synErrRangePropIsUnavailable, - }, - { - pattern: "[\\p{Lu}-z]", - syntaxError: synErrRangePropIsUnavailable, - }, - { - pattern: "[\\p{Lu}-\\p{Ll}]", - syntaxError: synErrRangePropIsUnavailable, - }, - { - pattern: "[z-a]", - syntaxError: synErrRangeInvalidOrder, - }, - { - pattern: "a[]", - syntaxError: synErrBExpNoElem, - }, - { - pattern: "[]a", - syntaxError: synErrBExpNoElem, - }, - { - pattern: "[]", - syntaxError: synErrBExpNoElem, - }, - { - pattern: "[^\\u{004E}]", - ast: genAltNode( - newRangeSymbolNode(0x00, '\u004E'-1), - newRangeSymbolNode('\u004E'+1, 0x10FFFF), - ), - }, - { - pattern: "[^\\u{0061}-\\u{007A}]", - ast: genAltNode( - newRangeSymbolNode(0x00, '\u0061'-1), - newRangeSymbolNode('\u007A'+1, 0x10FFFF), - ), - }, - { - pattern: "[^\\p{Lu}]", - skipTestAST: true, - }, - { - pattern: "[^a-\\p{Lu}]", - syntaxError: synErrRangePropIsUnavailable, - }, - { - pattern: "[^\\p{Lu}-z]", - syntaxError: synErrRangePropIsUnavailable, - }, - { - pattern: "[^\\p{Lu}-\\p{Ll}]", - syntaxError: synErrRangePropIsUnavailable, - }, - { - pattern: "[^\\u{0000}-\\u{10FFFF}]", - syntaxError: synErrUnmatchablePattern, - }, - { - pattern: "[^\\u{0000}-\\u{FFFF}\\u{010000}-\\u{10FFFF}]", - syntaxError: synErrUnmatchablePattern, - }, - { - pattern: "[^]", - ast: newSymbolNode('^'), - }, - { - pattern: "[", - syntaxError: synErrBExpUnclosed, - }, - { - pattern: "([", - syntaxError: synErrBExpUnclosed, - }, - { - pattern: "[a", - syntaxError: synErrBExpUnclosed, - }, - { - pattern: "([a", - syntaxError: synErrBExpUnclosed, - }, - { - pattern: "[a-", - syntaxError: synErrBExpUnclosed, - }, - { - pattern: "([a-", - syntaxError: synErrBExpUnclosed, - }, - { - pattern: "[^", - syntaxError: synErrBExpUnclosed, - }, - { - pattern: "([^", - syntaxError: synErrBExpUnclosed, - }, - { - pattern: "[^a", - syntaxError: synErrBExpUnclosed, - }, - { - pattern: "([^a", - syntaxError: synErrBExpUnclosed, - }, - { - pattern: "[^a-", - syntaxError: synErrBExpUnclosed, - }, - { - pattern: "([^a-", - syntaxError: synErrBExpUnclosed, - }, - { - pattern: "]", - ast: newSymbolNode(']'), - }, - { - pattern: "(]", - syntaxError: synErrGroupUnclosed, - }, - { - pattern: "a]", - ast: genConcatNode( - newSymbolNode('a'), - newSymbolNode(']'), - ), - }, - { - pattern: "(a]", - syntaxError: synErrGroupUnclosed, - }, - { - pattern: "([)", - syntaxError: synErrBExpUnclosed, - }, - { - pattern: "([a)", - syntaxError: synErrBExpUnclosed, - }, - { - pattern: "[a-]", - ast: genAltNode( - newSymbolNode('a'), - newSymbolNode('-'), - ), - }, - { - pattern: "[^a-]", - ast: genAltNode( - newRangeSymbolNode(0x00, 0x2C), - newRangeSymbolNode(0x2E, 0x60), - newRangeSymbolNode(0x62, 0x10FFFF), - ), - }, - { - pattern: "[-z]", - ast: genAltNode( - newSymbolNode('-'), - newSymbolNode('z'), - ), - }, - { - pattern: "[^-z]", - ast: newAltNode( - newRangeSymbolNode(0x00, 0x2C), - newAltNode( - newRangeSymbolNode(0x2E, 0x79), - newRangeSymbolNode(0x7B, 0x10FFFF), - ), - ), - }, - { - pattern: "[-]", - ast: newSymbolNode('-'), - }, - { - pattern: "[^-]", - ast: genAltNode( - newRangeSymbolNode(0x00, 0x2C), - newRangeSymbolNode(0x2E, 0x10FFFF), - ), - }, - { - pattern: "[^01]", - ast: genAltNode( - newRangeSymbolNode(0x00, '0'-1), - newRangeSymbolNode('1'+1, 0x10FFFF), - ), - }, - { - pattern: "[^10]", - ast: genAltNode( - newRangeSymbolNode(0x00, '0'-1), - newRangeSymbolNode('1'+1, 0x10FFFF), - ), - }, - { - pattern: "[^a-z]", - ast: genAltNode( - newRangeSymbolNode(0x00, 'a'-1), - newRangeSymbolNode('z'+1, 0x10FFFF), - ), - }, - { - pattern: "[^az]", - ast: genAltNode( - newRangeSymbolNode(0x00, 'a'-1), - genAltNode( - newRangeSymbolNode('a'+1, 'z'-1), - newRangeSymbolNode('z'+1, 0x10FFFF), - ), - ), - }, - { - pattern: "\\u{006E}", - ast: newSymbolNode('\u006E'), - }, - { - pattern: "\\u{03BD}", - ast: newSymbolNode('\u03BD'), - }, - { - pattern: "\\u{306B}", - ast: newSymbolNode('\u306B'), - }, - { - pattern: "\\u{01F638}", - ast: newSymbolNode('\U0001F638'), - }, - { - pattern: "\\u{0000}", - ast: newSymbolNode('\u0000'), - }, - { - pattern: "\\u{10FFFF}", - ast: newSymbolNode('\U0010FFFF'), - }, - { - pattern: "\\u{110000}", - syntaxError: synErrCPExpOutOfRange, - }, - { - pattern: "\\u", - syntaxError: synErrCPExpInvalidForm, - }, - { - pattern: "\\u{", - syntaxError: synErrCPExpInvalidForm, - }, - { - pattern: "\\u{03BD", - syntaxError: synErrCPExpInvalidForm, - }, - { - pattern: "\\u{}", - syntaxError: synErrCPExpInvalidForm, - }, - { - pattern: "\\p{Letter}", - skipTestAST: true, - }, - { - pattern: "\\p{General_Category=Letter}", - skipTestAST: true, - }, - { - pattern: "\\p{ Letter }", - skipTestAST: true, - }, - { - pattern: "\\p{ General_Category = Letter }", - skipTestAST: true, - }, - { - pattern: "\\p", - syntaxError: synErrCharPropExpInvalidForm, - }, - { - pattern: "\\p{", - syntaxError: synErrCharPropExpInvalidForm, - }, - { - pattern: "\\p{Letter", - syntaxError: synErrCharPropExpInvalidForm, - }, - { - pattern: "\\p{General_Category=}", - syntaxError: synErrCharPropExpInvalidForm, - }, - { - pattern: "\\p{General_Category= }", - syntaxError: synErrCharPropInvalidSymbol, - }, - { - pattern: "\\p{=Letter}", - syntaxError: synErrCharPropExpInvalidForm, - }, - { - pattern: "\\p{ =Letter}", - syntaxError: synErrCharPropInvalidSymbol, - }, - { - pattern: "\\p{=}", - syntaxError: synErrCharPropExpInvalidForm, - }, - { - pattern: "\\p{}", - syntaxError: synErrCharPropExpInvalidForm, - }, - { - pattern: "\\f{a2c}", - fragments: map[spec.LexKindName]string{ - "a2c": "abc", - }, - ast: newFragmentNode("a2c", - genConcatNode( - newSymbolNode('a'), - newSymbolNode('b'), - newSymbolNode('c'), - ), - ), - }, - { - pattern: "\\f{ a2c }", - fragments: map[spec.LexKindName]string{ - "a2c": "abc", - }, - ast: newFragmentNode("a2c", - genConcatNode( - newSymbolNode('a'), - newSymbolNode('b'), - newSymbolNode('c'), - ), - ), - }, - { - pattern: "\\f", - syntaxError: synErrFragmentExpInvalidForm, - }, - { - pattern: "\\f{", - syntaxError: synErrFragmentExpInvalidForm, - }, - { - pattern: "\\f{a2c", - fragments: map[spec.LexKindName]string{ - "a2c": "abc", - }, - syntaxError: synErrFragmentExpInvalidForm, - }, - { - pattern: "(a)", - ast: newSymbolNode('a'), - }, - { - pattern: "(((a)))", - ast: newSymbolNode('a'), - }, - { - pattern: "a()", - syntaxError: synErrGroupNoElem, - }, - { - pattern: "()a", - syntaxError: synErrGroupNoElem, - }, - { - pattern: "()", - syntaxError: synErrGroupNoElem, - }, - { - pattern: "(", - syntaxError: synErrGroupUnclosed, - }, - { - pattern: "a(", - syntaxError: synErrGroupUnclosed, - }, - { - pattern: "(a", - syntaxError: synErrGroupUnclosed, - }, - { - pattern: "((", - syntaxError: synErrGroupUnclosed, - }, - { - pattern: "((a)", - syntaxError: synErrGroupUnclosed, - }, - { - pattern: ")", - syntaxError: synErrGroupNoInitiator, - }, - { - pattern: "a)", - syntaxError: synErrGroupNoInitiator, - }, - { - pattern: ")a", - syntaxError: synErrGroupNoInitiator, - }, - { - pattern: "))", - syntaxError: synErrGroupNoInitiator, - }, - { - pattern: "(a))", - syntaxError: synErrGroupNoInitiator, - }, - { - pattern: "Mulder|Scully", - ast: genAltNode( - genConcatNode( - newSymbolNode('M'), - newSymbolNode('u'), - newSymbolNode('l'), - newSymbolNode('d'), - newSymbolNode('e'), - newSymbolNode('r'), - ), - genConcatNode( - newSymbolNode('S'), - newSymbolNode('c'), - newSymbolNode('u'), - newSymbolNode('l'), - newSymbolNode('l'), - newSymbolNode('y'), - ), - ), - }, - { - pattern: "Langly|Frohike|Byers", - ast: genAltNode( - genConcatNode( - newSymbolNode('L'), - newSymbolNode('a'), - newSymbolNode('n'), - newSymbolNode('g'), - newSymbolNode('l'), - newSymbolNode('y'), - ), - genConcatNode( - newSymbolNode('F'), - newSymbolNode('r'), - newSymbolNode('o'), - newSymbolNode('h'), - newSymbolNode('i'), - newSymbolNode('k'), - newSymbolNode('e'), - ), - genConcatNode( - newSymbolNode('B'), - newSymbolNode('y'), - newSymbolNode('e'), - newSymbolNode('r'), - newSymbolNode('s'), - ), - ), - }, - { - pattern: "|", - syntaxError: synErrAltLackOfOperand, - }, - { - pattern: "||", - syntaxError: synErrAltLackOfOperand, - }, - { - pattern: "Mulder|", - syntaxError: synErrAltLackOfOperand, - }, - { - pattern: "|Scully", - syntaxError: synErrAltLackOfOperand, - }, - { - pattern: "Langly|Frohike|", - syntaxError: synErrAltLackOfOperand, - }, - { - pattern: "Langly||Byers", - syntaxError: synErrAltLackOfOperand, - }, - { - pattern: "|Frohike|Byers", - syntaxError: synErrAltLackOfOperand, - }, - { - pattern: "|Frohike|", - syntaxError: synErrAltLackOfOperand, - }, - { - pattern: "Fox(|)Mulder", - syntaxError: synErrAltLackOfOperand, - }, - { - pattern: "(Fox|)Mulder", - syntaxError: synErrAltLackOfOperand, - }, - { - pattern: "Fox(|Mulder)", - syntaxError: synErrAltLackOfOperand, - }, - } - for i, tt := range tests { - t.Run(fmt.Sprintf("#%v %v", i, tt.pattern), func(t *testing.T) { - fragmentTrees := map[spec.LexKindName]CPTree{} - for kind, pattern := range tt.fragments { - p := NewParser(kind, strings.NewReader(pattern)) - root, err := p.Parse() - if err != nil { - t.Fatal(err) - } - - fragmentTrees[kind] = root - } - err := CompleteFragments(fragmentTrees) - if err != nil { - t.Fatal(err) - } - - p := NewParser(spec.LexKindName("test"), strings.NewReader(tt.pattern)) - root, err := p.Parse() - if tt.syntaxError != nil { - // printCPTree(os.Stdout, root, "", "") - if err != ParseErr { - t.Fatalf("unexpected error: want: %v, got: %v", ParseErr, err) - } - _, synErr := p.Error() - if synErr != tt.syntaxError { - t.Fatalf("unexpected syntax error: want: %v, got: %v", tt.syntaxError, synErr) - } - if root != nil { - t.Fatalf("tree must be nil") - } - } else { - if err != nil { - detail, cause := p.Error() - t.Fatalf("%v: %v: %v", err, cause, detail) - } - if root == nil { - t.Fatal("tree must be non-nil") - } - - complete, err := ApplyFragments(root, fragmentTrees) - if err != nil { - t.Fatal(err) - } - if !complete { - t.Fatalf("incomplete fragments") - } - - // printCPTree(os.Stdout, root, "", "") - if !tt.skipTestAST { - r := root.(*rootNode) - testAST(t, tt.ast, r.tree) - } - } - }) - } -} - -func TestParse_ContributoryPropertyIsNotExposed(t *testing.T) { - for _, cProp := range ucd.ContributoryProperties() { - t.Run(fmt.Sprintf("%v", cProp), func(t *testing.T) { - p := NewParser(spec.LexKindName("test"), strings.NewReader(fmt.Sprintf(`\p{%v=yes}`, cProp))) - root, err := p.Parse() - if err == nil { - t.Fatalf("expected syntax error: got: nil") - } - _, synErr := p.Error() - if synErr != synErrCharPropUnsupported { - t.Fatalf("unexpected syntax error: want: %v, got: %v", synErrCharPropUnsupported, synErr) - } - if root != nil { - t.Fatalf("tree is not nil") - } - }) - } -} - -func TestExclude(t *testing.T) { - for _, test := range []struct { - caption string - target CPTree - base CPTree - result CPTree - }{ - // t.From > b.From && t.To < b.To - - // |t.From - b.From| = 1 - // |b.To - t.To| = 1 - // - // Target (t): +--+ - // Base (b): +--+--+--+ - // Result (b - t): +--+ +--+ - { - caption: "|t.From - b.From| = 1 && |b.To - t.To| = 1", - target: newSymbolNode('1'), - base: newRangeSymbolNode('0', '2'), - result: newAltNode( - newSymbolNode('0'), - newSymbolNode('2'), - ), - }, - // |t.From - b.From| > 1 - // |b.To - t.To| > 1 - // - // Target (t): +--+ - // Base (b): +--+--+--+--+--+ - // Result (b - t): +--+--+ +--+--+ - { - caption: "|t.From - b.From| > 1 && |b.To - t.To| > 1", - target: newSymbolNode('2'), - base: newRangeSymbolNode('0', '4'), - result: newAltNode( - newRangeSymbolNode('0', '1'), - newRangeSymbolNode('3', '4'), - ), - }, - - // t.From <= b.From && t.To >= b.From && t.To < b.To - - // |b.From - t.From| = 0 - // |t.To - b.From| = 0 - // |b.To - t.To| = 1 - // - // Target (t): +--+ - // Base (b): +--+--+ - // Result (b - t): +--+ - { - caption: "|b.From - t.From| = 0 && |t.To - b.From| = 0 && |b.To - t.To| = 1", - target: newSymbolNode('0'), - base: newRangeSymbolNode('0', '1'), - result: newSymbolNode('1'), - }, - // |b.From - t.From| = 0 - // |t.To - b.From| = 0 - // |b.To - t.To| > 1 - // - // Target (t): +--+ - // Base (b): +--+--+--+ - // Result (b - t): +--+--+ - { - caption: "|b.From - t.From| = 0 && |t.To - b.From| = 0 && |b.To - t.To| > 1", - target: newSymbolNode('0'), - base: newRangeSymbolNode('0', '2'), - result: newRangeSymbolNode('1', '2'), - }, - // |b.From - t.From| = 0 - // |t.To - b.From| > 0 - // |b.To - t.To| = 1 - // - // Target (t): +--+--+ - // Base (b): +--+--+--+ - // Result (b - t): +--+ - { - caption: "|b.From - t.From| = 0 && |t.To - b.From| > 0 && |b.To - t.To| = 1", - target: newRangeSymbolNode('0', '1'), - base: newRangeSymbolNode('0', '2'), - result: newSymbolNode('2'), - }, - // |b.From - t.From| = 0 - // |t.To - b.From| > 0 - // |b.To - t.To| > 1 - // - // Target (t): +--+--+ - // Base (b): +--+--+--+--+ - // Result (b - t): +--+--+ - { - caption: "|b.From - t.From| = 0 && |t.To - b.From| > 0 && |b.To - t.To| > 1", - target: newRangeSymbolNode('0', '1'), - base: newRangeSymbolNode('0', '3'), - result: newRangeSymbolNode('2', '3'), - }, - // |b.From - t.From| > 0 - // |t.To - b.From| = 0 - // |b.To - t.To| = 1 - // - // Target (t): +--+--+ - // Base (b): +--+--+ - // Result (b - t): +--+ - { - caption: "|b.From - t.From| > 0 && |t.To - b.From| = 0 && |b.To - t.To| = 1", - target: newRangeSymbolNode('0', '1'), - base: newRangeSymbolNode('1', '2'), - result: newSymbolNode('2'), - }, - // |b.From - t.From| > 0 - // |t.To - b.From| = 0 - // |b.To - t.To| > 1 - // - // Target (t): +--+--+ - // Base (b): +--+--+--+ - // Result (b - t): +--+--+ - { - caption: "|b.From - t.From| > 0 && |t.To - b.From| = 0 && |b.To - t.To| > 1", - target: newRangeSymbolNode('0', '1'), - base: newRangeSymbolNode('1', '3'), - result: newRangeSymbolNode('2', '3'), - }, - // |b.From - t.From| > 0 - // |t.To - b.From| > 0 - // |b.To - t.To| = 1 - // - // Target (t): +--+--+--+ - // Base (b): +--+--+--+ - // Result (b - t): +--+ - { - caption: "|b.From - t.From| > 0 && |t.To - b.From| > 0 && |b.To - t.To| = 1", - target: newRangeSymbolNode('0', '2'), - base: newRangeSymbolNode('1', '3'), - result: newSymbolNode('3'), - }, - // |b.From - t.From| > 0 - // |t.To - b.From| > 0 - // |b.To - t.To| > 1 - // - // Target (t): +--+--+--+ - // Base (b): +--+--+--+--+ - // Result (b - t): +--+--+ - { - caption: "|b.From - t.From| > 0 && |t.To - b.From| > 0 && |b.To - t.To| > 1", - target: newRangeSymbolNode('0', '2'), - base: newRangeSymbolNode('1', '4'), - result: newRangeSymbolNode('3', '4'), - }, - - // t.From > b.From && t.From <= b.To && t.To >= b.To - - // |t.From - b.From| = 1 - // |b.To - t.From| = 0 - // |t.To - b.To| = 0 - // - // Target (t): +--+ - // Base (b): +--+--+ - // Result (b - t): +--+ - { - caption: "|t.From - b.From| = 1 && |b.To - t.From| = 0 && |t.To - b.To| = 0", - target: newSymbolNode('1'), - base: newRangeSymbolNode('0', '1'), - result: newSymbolNode('0'), - }, - // |t.From - b.From| = 1 - // |b.To - t.From| = 0 - // |t.To - b.To| > 0 - // - // Target (t): +--+--+ - // Base (b): +--+--+ - // Result (b - t): +--+ - { - caption: "|t.From - b.From| = 1 && |b.To - t.From| = 0 && |t.To - b.To| > 0", - target: newRangeSymbolNode('1', '2'), - base: newRangeSymbolNode('0', '1'), - result: newSymbolNode('0'), - }, - // |t.From - b.From| = 1 - // |b.To - t.From| > 0 - // |t.To - b.To| = 0 - // - // Target (t): +--+--+ - // Base (b): +--+--+--+ - // Result (b - t): +--+ - { - caption: "|t.From - b.From| = 1 && |b.To - t.From| > 0 && |t.To - b.To| = 0", - target: newRangeSymbolNode('1', '2'), - base: newRangeSymbolNode('0', '2'), - result: newSymbolNode('0'), - }, - // |t.From - b.From| = 1 - // |b.To - t.From| > 0 - // |t.To - b.To| > 0 - // - // Target (t): +--+--+--+ - // Base (b): +--+--+--+ - // Result (b - t): +--+ - { - caption: "|t.From - b.From| = 1 && |b.To - t.From| > 0 && |t.To - b.To| > 0", - target: newRangeSymbolNode('1', '3'), - base: newRangeSymbolNode('0', '2'), - result: newSymbolNode('0'), - }, - // |t.From - b.From| > 1 - // |b.To - t.From| = 0 - // |t.To - b.To| = 0 - // - // Target (t): +--+ - // Base (b): +--+--+--+ - // Result (b - t): +--+--+ - { - caption: "|t.From - b.From| > 1 && |b.To - t.From| = 0 && |t.To - b.To| = 0", - target: newSymbolNode('2'), - base: newRangeSymbolNode('0', '2'), - result: newRangeSymbolNode('0', '1'), - }, - // |t.From - b.From| > 1 - // |b.To - t.From| = 0 - // |t.To - b.To| > 0 - // - // Target (t): +--+--+ - // Base (b): +--+--+--+ - // Result (b - t): +--+--+ - { - caption: "|t.From - b.From| > 1 && |b.To - t.From| = 0 && |t.To - b.To| > 0", - target: newRangeSymbolNode('2', '3'), - base: newRangeSymbolNode('0', '2'), - result: newRangeSymbolNode('0', '1'), - }, - // |t.From - b.From| > 1 - // |b.To - t.From| > 0 - // |t.To - b.To| = 0 - // - // Target (t): +--+--+ - // Base (b): +--+--+--+--+ - // Result (b - t): +--+--+ - { - caption: "|t.From - b.From| > 1 && |b.To - t.From| > 0 && |t.To - b.To| = 0", - target: newRangeSymbolNode('2', '3'), - base: newRangeSymbolNode('0', '3'), - result: newRangeSymbolNode('0', '1'), - }, - // |t.From - b.From| > 1 - // |b.To - t.From| > 0 - // |t.To - b.To| > 0 - // - // Target (t): +--+--+--+ - // Base (b): +--+--+--+--+ - // Result (b - t): +--+--+ - { - caption: "|t.From - b.From| > 1 && |b.To - t.From| > 0 && |t.To - b.To| > 0", - target: newRangeSymbolNode('2', '4'), - base: newRangeSymbolNode('0', '3'), - result: newRangeSymbolNode('0', '1'), - }, - - // t.From <= b.From && t.To >= b.To - - // |b.From - t.From| = 0 - // |t.To - b.To| = 0 - // - // Target (t): +--+ - // Base (b): +--+ - // Result (b - t): N/A - { - caption: "|b.From - t.From| = 0 && |t.To - b.To| = 0", - target: newSymbolNode('0'), - base: newSymbolNode('0'), - result: nil, - }, - // |b.From - t.From| = 0 - // |t.To - b.To| > 0 - // - // Target (t): +--+--+ - // Base (b): +--+ - // Result (b - t): N/A - { - caption: "|b.From - t.From| = 0 && |t.To - b.To| > 0", - target: newRangeSymbolNode('0', '1'), - base: newSymbolNode('0'), - result: nil, - }, - // |b.From - t.From| > 0 - // |t.To - b.To| = 0 - // - // Target (t): +--+--+ - // Base (b): +--+ - // Result (b - t): N/A - { - caption: "|b.From - t.From| > 0 && |t.To - b.To| = 0", - target: newRangeSymbolNode('0', '1'), - base: newSymbolNode('1'), - result: nil, - }, - // |b.From - t.From| > 0 - // |t.To - b.To| > 0 - // - // Target (t): +--+--+--+ - // Base (b): +--+ - // Result (b - t): N/A - { - caption: "|b.From - t.From| > 0 && |t.To - b.To| > 0", - target: newRangeSymbolNode('0', '2'), - base: newSymbolNode('1'), - result: nil, - }, - - // Others - - // |b.From - t.From| = 1 - // - // Target (t): +--+ - // Base (b): +--+ - // Result (b - t): +--+ - { - caption: "|b.From - t.From| = 1", - target: newSymbolNode('0'), - base: newSymbolNode('1'), - result: newSymbolNode('1'), - }, - // |b.From - t.From| > 1 - // - // Target (t): +--+ - // Base (b): +--+ - // Result (b - t): +--+ - { - caption: "|b.From - t.From| > 1", - target: newSymbolNode('0'), - base: newSymbolNode('2'), - result: newSymbolNode('2'), - }, - // |t.To - b.To| = 1 - // - // Target (t): +--+ - // Base (b): +--+ - // Result (b - t): +--+ - { - caption: "|t.To - b.To| = 1", - target: newSymbolNode('1'), - base: newSymbolNode('0'), - result: newSymbolNode('0'), - }, - // |t.To - b.To| > 1 - // - // Target (t): +--+ - // Base (b): +--+ - // Result (b - t): +--+ - { - caption: "|t.To - b.To| > 1", - target: newSymbolNode('2'), - base: newSymbolNode('0'), - result: newSymbolNode('0'), - }, - } { - t.Run(test.caption, func(t *testing.T) { - r := exclude(test.target, test.base) - testAST(t, test.result, r) - }) - } -} - -func testAST(t *testing.T, expected, actual CPTree) { - t.Helper() - - aTy := reflect.TypeOf(actual) - eTy := reflect.TypeOf(expected) - if eTy != aTy { - t.Fatalf("unexpected node: want: %+v, got: %+v", eTy, aTy) - } - - if actual == nil { - return - } - - switch e := expected.(type) { - case *symbolNode: - a := actual.(*symbolNode) - if a.From != e.From || a.To != e.To { - t.Fatalf("unexpected node: want: %+v, got: %+v", e, a) - } - } - eLeft, eRight := expected.children() - aLeft, aRight := actual.children() - testAST(t, eLeft, aLeft) - testAST(t, eRight, aRight) -} diff --git a/compiler/parser/tree.go b/compiler/parser/tree.go deleted file mode 100644 index 04ba723..0000000 --- a/compiler/parser/tree.go +++ /dev/null @@ -1,459 +0,0 @@ -package parser - -import ( - "fmt" - "io" - "sort" - - "github.com/nihei9/maleeni/spec" -) - -type CPRange struct { - From rune - To rune -} - -type CPTree interface { - fmt.Stringer - Range() (rune, rune, bool) - Optional() (CPTree, bool) - Repeatable() (CPTree, bool) - Concatenation() (CPTree, CPTree, bool) - Alternatives() (CPTree, CPTree, bool) - Describe() (spec.LexKindName, []spec.LexKindName, error) - - children() (CPTree, CPTree) - clone() CPTree -} - -var ( - _ CPTree = &rootNode{} - _ CPTree = &symbolNode{} - _ CPTree = &concatNode{} - _ CPTree = &altNode{} - _ CPTree = &quantifierNode{} - _ CPTree = &fragmentNode{} -) - -type rootNode struct { - kind spec.LexKindName - tree CPTree - fragments map[spec.LexKindName][]*fragmentNode -} - -func newRootNode(kind spec.LexKindName, t CPTree) *rootNode { - fragments := map[spec.LexKindName][]*fragmentNode{} - collectFragments(t, fragments) - - return &rootNode{ - kind: kind, - tree: t, - fragments: fragments, - } -} - -func collectFragments(n CPTree, fragments map[spec.LexKindName][]*fragmentNode) { - if n == nil { - return - } - - if f, ok := n.(*fragmentNode); ok { - fragments[f.kind] = append(fragments[f.kind], f) - return - } - - l, r := n.children() - collectFragments(l, fragments) - collectFragments(r, fragments) -} - -func (n *rootNode) String() string { - return fmt.Sprintf("root: %v: %v fragments", n.kind, len(n.fragments)) -} - -func (n *rootNode) Range() (rune, rune, bool) { - return n.tree.Range() -} - -func (n *rootNode) Optional() (CPTree, bool) { - return n.tree.Optional() -} - -func (n *rootNode) Repeatable() (CPTree, bool) { - return n.tree.Repeatable() -} - -func (n *rootNode) Concatenation() (CPTree, CPTree, bool) { - return n.tree.Concatenation() -} - -func (n *rootNode) Alternatives() (CPTree, CPTree, bool) { - return n.tree.Alternatives() -} - -func (n *rootNode) Describe() (spec.LexKindName, []spec.LexKindName, error) { - var frags []spec.LexKindName - for f := range n.fragments { - frags = append(frags, spec.LexKindName(f)) - } - sort.Slice(frags, func(i, j int) bool { - return frags[i] < frags[j] - }) - - return n.kind, frags, nil -} - -func (n *rootNode) children() (CPTree, CPTree) { - return n.tree.children() -} - -func (n *rootNode) clone() CPTree { - return n.tree.clone() -} - -func (n *rootNode) incomplete() bool { - return len(n.fragments) > 0 -} - -func (n *rootNode) applyFragment(kind spec.LexKindName, fragment CPTree) error { - root, ok := fragment.(*rootNode) - if !ok { - return fmt.Errorf("applyFragment can take only *rootNode: %T", fragment) - } - if root.incomplete() { - return fmt.Errorf("fragment is incomplete") - } - - fs, ok := n.fragments[kind] - if !ok { - return nil - } - for _, f := range fs { - f.tree = root.clone() - } - delete(n.fragments, kind) - - return nil -} - -type symbolNode struct { - CPRange -} - -func newSymbolNode(cp rune) *symbolNode { - return &symbolNode{ - CPRange: CPRange{ - From: cp, - To: cp, - }, - } -} - -func newRangeSymbolNode(from, to rune) *symbolNode { - return &symbolNode{ - CPRange: CPRange{ - From: from, - To: to, - }, - } -} - -func (n *symbolNode) String() string { - return fmt.Sprintf("symbol: %X..%X", n.From, n.To) -} - -func (n *symbolNode) Range() (rune, rune, bool) { - return n.From, n.To, true -} - -func (n *symbolNode) Optional() (CPTree, bool) { - return nil, false -} - -func (n *symbolNode) Repeatable() (CPTree, bool) { - return nil, false -} - -func (n *symbolNode) Concatenation() (CPTree, CPTree, bool) { - return nil, nil, false -} - -func (n *symbolNode) Alternatives() (CPTree, CPTree, bool) { - return nil, nil, false -} - -func (n *symbolNode) Describe() (spec.LexKindName, []spec.LexKindName, error) { - return spec.LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n) -} - -func (n *symbolNode) children() (CPTree, CPTree) { - return nil, nil -} - -func (n *symbolNode) clone() CPTree { - return newRangeSymbolNode(n.From, n.To) -} - -type concatNode struct { - left CPTree - right CPTree -} - -func newConcatNode(left, right CPTree) *concatNode { - return &concatNode{ - left: left, - right: right, - } -} - -func (n *concatNode) String() string { - return "concat" -} - -func (n *concatNode) Range() (rune, rune, bool) { - return 0, 0, false -} - -func (n *concatNode) Optional() (CPTree, bool) { - return nil, false -} - -func (n *concatNode) Repeatable() (CPTree, bool) { - return nil, false -} - -func (n *concatNode) Concatenation() (CPTree, CPTree, bool) { - return n.left, n.right, true -} - -func (n *concatNode) Alternatives() (CPTree, CPTree, bool) { - return nil, nil, false -} - -func (n *concatNode) Describe() (spec.LexKindName, []spec.LexKindName, error) { - return spec.LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n) -} - -func (n *concatNode) children() (CPTree, CPTree) { - return n.left, n.right -} - -func (n *concatNode) clone() CPTree { - if n == nil { - return nil - } - return newConcatNode(n.left.clone(), n.right.clone()) -} - -type altNode struct { - left CPTree - right CPTree -} - -func newAltNode(left, right CPTree) *altNode { - return &altNode{ - left: left, - right: right, - } -} - -func (n *altNode) String() string { - return "alt" -} - -func (n *altNode) Range() (rune, rune, bool) { - return 0, 0, false -} - -func (n *altNode) Optional() (CPTree, bool) { - return nil, false -} - -func (n *altNode) Repeatable() (CPTree, bool) { - return nil, false -} - -func (n *altNode) Concatenation() (CPTree, CPTree, bool) { - return nil, nil, false -} - -func (n *altNode) Alternatives() (CPTree, CPTree, bool) { - return n.left, n.right, true -} - -func (n *altNode) Describe() (spec.LexKindName, []spec.LexKindName, error) { - return spec.LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n) -} - -func (n *altNode) children() (CPTree, CPTree) { - return n.left, n.right -} - -func (n *altNode) clone() CPTree { - return newAltNode(n.left.clone(), n.right.clone()) -} - -type quantifierNode struct { - optional bool - repeatable bool - tree CPTree -} - -func (n *quantifierNode) String() string { - switch { - case n.repeatable: - return "repeatable (>= 0 times)" - case n.optional: - return "optional (0 or 1 times)" - default: - return "invalid quantifier" - } -} - -func newRepeatNode(t CPTree) *quantifierNode { - return &quantifierNode{ - repeatable: true, - tree: t, - } -} - -func newRepeatOneOrMoreNode(t CPTree) *concatNode { - return newConcatNode( - t, - &quantifierNode{ - repeatable: true, - tree: t.clone(), - }) -} - -func newOptionNode(t CPTree) *quantifierNode { - return &quantifierNode{ - optional: true, - tree: t, - } -} - -func (n *quantifierNode) Range() (rune, rune, bool) { - return 0, 0, false -} - -func (n *quantifierNode) Optional() (CPTree, bool) { - return n.tree, n.optional -} - -func (n *quantifierNode) Repeatable() (CPTree, bool) { - return n.tree, n.repeatable -} - -func (n *quantifierNode) Concatenation() (CPTree, CPTree, bool) { - return nil, nil, false -} - -func (n *quantifierNode) Alternatives() (CPTree, CPTree, bool) { - return nil, nil, false -} - -func (n *quantifierNode) Describe() (spec.LexKindName, []spec.LexKindName, error) { - return spec.LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n) -} - -func (n *quantifierNode) children() (CPTree, CPTree) { - return n.tree, nil -} - -func (n *quantifierNode) clone() CPTree { - if n.repeatable { - return newRepeatNode(n.tree.clone()) - } - return newOptionNode(n.tree.clone()) -} - -type fragmentNode struct { - kind spec.LexKindName - tree CPTree -} - -func newFragmentNode(kind spec.LexKindName, t CPTree) *fragmentNode { - return &fragmentNode{ - kind: kind, - tree: t, - } -} - -func (n *fragmentNode) String() string { - return fmt.Sprintf("fragment: %v", n.kind) -} - -func (n *fragmentNode) Range() (rune, rune, bool) { - return n.tree.Range() -} - -func (n *fragmentNode) Optional() (CPTree, bool) { - return n.tree.Optional() -} - -func (n *fragmentNode) Repeatable() (CPTree, bool) { - return n.tree.Repeatable() -} - -func (n *fragmentNode) Concatenation() (CPTree, CPTree, bool) { - return n.tree.Concatenation() -} - -func (n *fragmentNode) Alternatives() (CPTree, CPTree, bool) { - return n.tree.Alternatives() -} - -func (n *fragmentNode) Describe() (spec.LexKindName, []spec.LexKindName, error) { - return spec.LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n) -} - -func (n *fragmentNode) children() (CPTree, CPTree) { - return n.tree.children() -} - -func (n *fragmentNode) clone() CPTree { - if n.tree == nil { - return newFragmentNode(n.kind, nil) - } - return newFragmentNode(n.kind, n.tree.clone()) -} - -//nolint:unused -func printCPTree(w io.Writer, t CPTree, ruledLine string, childRuledLinePrefix string) { - if t == nil { - return - } - fmt.Fprintf(w, "%v%v\n", ruledLine, t) - children := []CPTree{} - switch n := t.(type) { - case *rootNode: - children = append(children, n.tree) - case *fragmentNode: - children = append(children, n.tree) - default: - left, right := t.children() - if left != nil { - children = append(children, left) - } - if right != nil { - children = append(children, right) - } - } - num := len(children) - for i, child := range children { - line := "└─ " - if num > 1 { - if i == 0 { - line = "├─ " - } else if i < num-1 { - line = "│ " - } - } - prefix := "│ " - if i >= num-1 { - prefix = " " - } - printCPTree(w, child, childRuledLinePrefix+line, childRuledLinePrefix+prefix) - } -} @@ -1,12 +1,17 @@ package tre import ( + "bufio" + "bytes" "encoding/binary" "fmt" + "io" "regexp" "strconv" "strings" "sort" + + "ucd" ) @@ -22,6 +27,7 @@ type cpRange struct { } +// "github.com/nihei9/maleeni/spec" func (b *CharBlock) String() string { var s strings.Builder @@ -747,3 +753,1658 @@ type CompiledLexSpec struct { CompressionLevel int `json:"compression_level"` Specs []*CompiledLexModeSpec `json:"specs"` } + +var ( + ParseErr = fmt.Errorf("parse error") + + // lexical errors + synErrIncompletedEscSeq = fmt.Errorf("incompleted escape sequence; unexpected EOF following \\") + synErrInvalidEscSeq = fmt.Errorf("invalid escape sequence") + synErrInvalidCodePoint = fmt.Errorf("code points must consist of just 4 or 6 hex digits") + synErrCharPropInvalidSymbol = fmt.Errorf("invalid character property symbol") + SynErrFragmentInvalidSymbol = fmt.Errorf("invalid fragment symbol") + + // syntax errors + synErrUnexpectedToken = fmt.Errorf("unexpected token") + synErrNullPattern = fmt.Errorf("a pattern must be a non-empty byte sequence") + synErrUnmatchablePattern = fmt.Errorf("a pattern cannot match any characters") + synErrAltLackOfOperand = fmt.Errorf("an alternation expression must have operands") + synErrRepNoTarget = fmt.Errorf("a repeat expression must have an operand") + synErrGroupNoElem = fmt.Errorf("a grouping expression must include at least one character") + synErrGroupUnclosed = fmt.Errorf("unclosed grouping expression") + synErrGroupNoInitiator = fmt.Errorf(") needs preceding (") + synErrGroupInvalidForm = fmt.Errorf("invalid grouping expression") + synErrBExpNoElem = fmt.Errorf("a bracket expression must include at least one character") + synErrBExpUnclosed = fmt.Errorf("unclosed bracket expression") + synErrBExpInvalidForm = fmt.Errorf("invalid bracket expression") + synErrRangeInvalidOrder = fmt.Errorf("a range expression with invalid order") + synErrRangePropIsUnavailable = fmt.Errorf("a property expression is unavailable in a range expression") + synErrRangeInvalidForm = fmt.Errorf("invalid range expression") + synErrCPExpInvalidForm = fmt.Errorf("invalid code point expression") + synErrCPExpOutOfRange = fmt.Errorf("a code point must be between U+0000 to U+10FFFF") + synErrCharPropExpInvalidForm = fmt.Errorf("invalid character property expression") + synErrCharPropUnsupported = fmt.Errorf("unsupported character property") + synErrFragmentExpInvalidForm = fmt.Errorf("invalid fragment expression") +) + + +type incompleteFragment struct { + kind LexKindName + root *rootNode +} + +func CompleteFragments(fragments map[LexKindName]CPTree) error { + if len(fragments) == 0 { + return nil + } + + completeFragments := map[LexKindName]CPTree{} + incompleteFragments := []*incompleteFragment{} + for kind, tree := range fragments { + root, ok := tree.(*rootNode) + if !ok { + return fmt.Errorf("CompleteFragments can take only *rootNode: %T", tree) + } + if root.incomplete() { + incompleteFragments = append(incompleteFragments, &incompleteFragment{ + kind: kind, + root: root, + }) + } else { + completeFragments[kind] = root + } + } + for len(incompleteFragments) > 0 { + lastIncompCount := len(incompleteFragments) + remainingFragments := []*incompleteFragment{} + for _, e := range incompleteFragments { + complete, err := ApplyFragments(e.root, completeFragments) + if err != nil { + return err + } + if !complete { + remainingFragments = append(remainingFragments, e) + } else { + completeFragments[e.kind] = e.root + } + } + incompleteFragments = remainingFragments + if len(incompleteFragments) == lastIncompCount { + return ParseErr + } + } + + return nil +} + +func ApplyFragments(t CPTree, fragments map[LexKindName]CPTree) (bool, error) { + root, ok := t.(*rootNode) + if !ok { + return false, fmt.Errorf("ApplyFragments can take only *rootNode type: %T", t) + } + + for name, frag := range fragments { + err := root.applyFragment(name, frag) + if err != nil { + return false, err + } + } + + return !root.incomplete(), nil +} + +type tokenKind string + +const ( + tokenKindChar tokenKind = "char" + tokenKindAnyChar tokenKind = "." + tokenKindRepeat tokenKind = "*" + tokenKindRepeatOneOrMore tokenKind = "+" + tokenKindOption tokenKind = "?" + tokenKindAlt tokenKind = "|" + tokenKindGroupOpen tokenKind = "(" + tokenKindGroupClose tokenKind = ")" + tokenKindBExpOpen tokenKind = "[" + tokenKindInverseBExpOpen tokenKind = "[^" + tokenKindBExpClose tokenKind = "]" + tokenKindCharRange tokenKind = "-" + tokenKindCodePointLeader tokenKind = "\\u" + tokenKindCharPropLeader tokenKind = "\\p" + tokenKindFragmentLeader tokenKind = "\\f" + tokenKindLBrace tokenKind = "{" + tokenKindRBrace tokenKind = "}" + tokenKindEqual tokenKind = "=" + tokenKindCodePoint tokenKind = "code point" + tokenKindCharPropSymbol tokenKind = "character property symbol" + tokenKindFragmentSymbol tokenKind = "fragment symbol" + tokenKindEOF tokenKind = "eof" +) + +type token struct { + kind tokenKind + char rune + propSymbol string + codePoint string + fragmentSymbol string +} + +const nullChar = '\u0000' + +func newToken(kind tokenKind, char rune) *token { + return &token{ + kind: kind, + char: char, + } +} + +func newCodePointToken(codePoint string) *token { + return &token{ + kind: tokenKindCodePoint, + codePoint: codePoint, + } +} + +func newCharPropSymbolToken(propSymbol string) *token { + return &token{ + kind: tokenKindCharPropSymbol, + propSymbol: propSymbol, + } +} + +func newFragmentSymbolToken(fragmentSymbol string) *token { + return &token{ + kind: tokenKindFragmentSymbol, + fragmentSymbol: fragmentSymbol, + } +} + +type lexerMode string + +const ( + lexerModeDefault lexerMode = "default" + lexerModeBExp lexerMode = "bracket expression" + lexerModeCPExp lexerMode = "code point expression" + lexerModeCharPropExp lexerMode = "character property expression" + lexerModeFragmentExp lexerMode = "fragment expression" +) + +type lexerModeStack struct { + stack []lexerMode +} + +func newLexerModeStack() *lexerModeStack { + return &lexerModeStack{ + stack: []lexerMode{ + lexerModeDefault, + }, + } +} + +func (s *lexerModeStack) top() lexerMode { + return s.stack[len(s.stack)-1] +} + +func (s *lexerModeStack) push(m lexerMode) { + s.stack = append(s.stack, m) +} + +func (s *lexerModeStack) pop() { + s.stack = s.stack[:len(s.stack)-1] +} + +type rangeState string + +// [a-z] +// ^^^^ +// |||`-- ready +// ||`-- expect range terminator +// |`-- read range initiator +// `-- ready +const ( + rangeStateReady rangeState = "ready" + rangeStateReadRangeInitiator rangeState = "read range initiator" + rangeStateExpectRangeTerminator rangeState = "expect range terminator" +) + +type lexer struct { + src *bufio.Reader + peekChar2 rune + peekEOF2 bool + peekChar1 rune + peekEOF1 bool + lastChar rune + reachedEOF bool + prevChar1 rune + prevEOF1 bool + prevChar2 rune + pervEOF2 bool + modeStack *lexerModeStack + rangeState rangeState + + errCause error + errDetail string +} + +func newLexer(src io.Reader) *lexer { + return &lexer{ + src: bufio.NewReader(src), + peekChar2: nullChar, + peekEOF2: false, + peekChar1: nullChar, + peekEOF1: false, + lastChar: nullChar, + reachedEOF: false, + prevChar1: nullChar, + prevEOF1: false, + prevChar2: nullChar, + pervEOF2: false, + modeStack: newLexerModeStack(), + rangeState: rangeStateReady, + } +} + +func (l *lexer) error() (string, error) { + return l.errDetail, l.errCause +} + +func (l *lexer) next() (*token, error) { + c, eof, err := l.read() + if err != nil { + return nil, err + } + if eof { + return newToken(tokenKindEOF, nullChar), nil + } + + switch l.modeStack.top() { + case lexerModeBExp: + tok, err := l.nextInBExp(c) + if err != nil { + return nil, err + } + if tok.kind == tokenKindChar || tok.kind == tokenKindCodePointLeader || tok.kind == tokenKindCharPropLeader { + switch l.rangeState { + case rangeStateReady: + l.rangeState = rangeStateReadRangeInitiator + case rangeStateExpectRangeTerminator: + l.rangeState = rangeStateReady + } + } + switch tok.kind { + case tokenKindBExpClose: + l.modeStack.pop() + case tokenKindCharRange: + l.rangeState = rangeStateExpectRangeTerminator + case tokenKindCodePointLeader: + l.modeStack.push(lexerModeCPExp) + case tokenKindCharPropLeader: + l.modeStack.push(lexerModeCharPropExp) + } + return tok, nil + case lexerModeCPExp: + tok, err := l.nextInCodePoint(c) + if err != nil { + return nil, err + } + switch tok.kind { + case tokenKindRBrace: + l.modeStack.pop() + } + return tok, nil + case lexerModeCharPropExp: + tok, err := l.nextInCharProp(c) + if err != nil { + return nil, err + } + switch tok.kind { + case tokenKindRBrace: + l.modeStack.pop() + } + return tok, nil + case lexerModeFragmentExp: + tok, err := l.nextInFragment(c) + if err != nil { + return nil, err + } + switch tok.kind { + case tokenKindRBrace: + l.modeStack.pop() + } + return tok, nil + default: + tok, err := l.nextInDefault(c) + if err != nil { + return nil, err + } + switch tok.kind { + case tokenKindBExpOpen: + l.modeStack.push(lexerModeBExp) + l.rangeState = rangeStateReady + case tokenKindInverseBExpOpen: + l.modeStack.push(lexerModeBExp) + l.rangeState = rangeStateReady + case tokenKindCodePointLeader: + l.modeStack.push(lexerModeCPExp) + case tokenKindCharPropLeader: + l.modeStack.push(lexerModeCharPropExp) + case tokenKindFragmentLeader: + l.modeStack.push(lexerModeFragmentExp) + } + return tok, nil + } +} + +func (l *lexer) nextInDefault(c rune) (*token, error) { + switch c { + case '*': + return newToken(tokenKindRepeat, nullChar), nil + case '+': + return newToken(tokenKindRepeatOneOrMore, nullChar), nil + case '?': + return newToken(tokenKindOption, nullChar), nil + case '.': + return newToken(tokenKindAnyChar, nullChar), nil + case '|': + return newToken(tokenKindAlt, nullChar), nil + case '(': + return newToken(tokenKindGroupOpen, nullChar), nil + case ')': + return newToken(tokenKindGroupClose, nullChar), nil + case '[': + c1, eof, err := l.read() + if err != nil { + return nil, err + } + if eof { + err := l.restore() + if err != nil { + return nil, err + } + return newToken(tokenKindBExpOpen, nullChar), nil + } + if c1 != '^' { + err := l.restore() + if err != nil { + return nil, err + } + return newToken(tokenKindBExpOpen, nullChar), nil + } + c2, eof, err := l.read() + if err != nil { + return nil, err + } + if eof { + err := l.restore() + if err != nil { + return nil, err + } + return newToken(tokenKindInverseBExpOpen, nullChar), nil + } + if c2 != ']' { + err := l.restore() + if err != nil { + return nil, err + } + return newToken(tokenKindInverseBExpOpen, nullChar), nil + } + err = l.restore() + if err != nil { + return nil, err + } + err = l.restore() + if err != nil { + return nil, err + } + return newToken(tokenKindBExpOpen, nullChar), nil + case '\\': + c, eof, err := l.read() + if err != nil { + return nil, err + } + if eof { + l.errCause = synErrIncompletedEscSeq + return nil, ParseErr + } + if c == 'u' { + return newToken(tokenKindCodePointLeader, nullChar), nil + } + if c == 'p' { + return newToken(tokenKindCharPropLeader, nullChar), nil + } + if c == 'f' { + return newToken(tokenKindFragmentLeader, nullChar), nil + } + if c == '\\' || c == '.' || c == '*' || c == '+' || c == '?' || c == '|' || c == '(' || c == ')' || c == '[' || c == ']' { + return newToken(tokenKindChar, c), nil + } + l.errCause = synErrInvalidEscSeq + l.errDetail = fmt.Sprintf("\\%v is not supported", string(c)) + return nil, ParseErr + default: + return newToken(tokenKindChar, c), nil + } +} + +func (l *lexer) nextInBExp(c rune) (*token, error) { + switch c { + case '-': + if l.rangeState != rangeStateReadRangeInitiator { + return newToken(tokenKindChar, c), nil + } + c1, eof, err := l.read() + if err != nil { + return nil, err + } + if eof { + err := l.restore() + if err != nil { + return nil, err + } + return newToken(tokenKindChar, c), nil + } + if c1 != ']' { + err := l.restore() + if err != nil { + return nil, err + } + return newToken(tokenKindCharRange, nullChar), nil + } + err = l.restore() + if err != nil { + return nil, err + } + return newToken(tokenKindChar, c), nil + case ']': + return newToken(tokenKindBExpClose, nullChar), nil + case '\\': + c, eof, err := l.read() + if err != nil { + return nil, err + } + if eof { + l.errCause = synErrIncompletedEscSeq + return nil, ParseErr + } + if c == 'u' { + return newToken(tokenKindCodePointLeader, nullChar), nil + } + if c == 'p' { + return newToken(tokenKindCharPropLeader, nullChar), nil + } + if c == '\\' || c == '^' || c == '-' || c == ']' { + return newToken(tokenKindChar, c), nil + } + l.errCause = synErrInvalidEscSeq + l.errDetail = fmt.Sprintf("\\%v is not supported in a bracket expression", string(c)) + return nil, ParseErr + default: + return newToken(tokenKindChar, c), nil + } +} + +func (l *lexer) nextInCodePoint(c rune) (*token, error) { + switch c { + case '{': + return newToken(tokenKindLBrace, nullChar), nil + case '}': + return newToken(tokenKindRBrace, nullChar), nil + default: + if !isHexDigit(c) { + l.errCause = synErrInvalidCodePoint + return nil, ParseErr + } + var b strings.Builder + fmt.Fprint(&b, string(c)) + n := 1 + for { + c, eof, err := l.read() + if err != nil { + return nil, err + } + if eof { + err := l.restore() + if err != nil { + return nil, err + } + break + } + if c == '}' { + err := l.restore() + if err != nil { + return nil, err + } + break + } + if !isHexDigit(c) || n >= 6 { + l.errCause = synErrInvalidCodePoint + return nil, ParseErr + } + fmt.Fprint(&b, string(c)) + n++ + } + cp := b.String() + cpLen := len(cp) + if !(cpLen == 4 || cpLen == 6) { + l.errCause = synErrInvalidCodePoint + return nil, ParseErr + } + return newCodePointToken(b.String()), nil + } +} + +func isHexDigit(c rune) bool { + if c >= '0' && c <= '9' || c >= 'A' && c <= 'Z' || c >= 'a' && c <= 'z' { + return true + } + return false +} + +func (l *lexer) nextInCharProp(c rune) (*token, error) { + switch c { + case '{': + return newToken(tokenKindLBrace, nullChar), nil + case '}': + return newToken(tokenKindRBrace, nullChar), nil + case '=': + return newToken(tokenKindEqual, nullChar), nil + default: + var b strings.Builder + fmt.Fprint(&b, string(c)) + n := 1 + for { + c, eof, err := l.read() + if err != nil { + return nil, err + } + if eof { + err := l.restore() + if err != nil { + return nil, err + } + break + } + if c == '}' || c == '=' { + err := l.restore() + if err != nil { + return nil, err + } + break + } + fmt.Fprint(&b, string(c)) + n++ + } + sym := strings.TrimSpace(b.String()) + if len(sym) == 0 { + l.errCause = synErrCharPropInvalidSymbol + return nil, ParseErr + } + return newCharPropSymbolToken(sym), nil + } +} + +func (l *lexer) nextInFragment(c rune) (*token, error) { + switch c { + case '{': + return newToken(tokenKindLBrace, nullChar), nil + case '}': + return newToken(tokenKindRBrace, nullChar), nil + default: + var b strings.Builder + fmt.Fprint(&b, string(c)) + n := 1 + for { + c, eof, err := l.read() + if err != nil { + return nil, err + } + if eof { + err := l.restore() + if err != nil { + return nil, err + } + break + } + if c == '}' { + err := l.restore() + if err != nil { + return nil, err + } + break + } + fmt.Fprint(&b, string(c)) + n++ + } + sym := strings.TrimSpace(b.String()) + if len(sym) == 0 { + l.errCause = SynErrFragmentInvalidSymbol + return nil, ParseErr + } + return newFragmentSymbolToken(sym), nil + } +} + +func (l *lexer) read() (rune, bool, error) { + if l.reachedEOF { + return l.lastChar, l.reachedEOF, nil + } + if l.peekChar1 != nullChar || l.peekEOF1 { + l.prevChar2 = l.prevChar1 + l.pervEOF2 = l.prevEOF1 + l.prevChar1 = l.lastChar + l.prevEOF1 = l.reachedEOF + l.lastChar = l.peekChar1 + l.reachedEOF = l.peekEOF1 + l.peekChar1 = l.peekChar2 + l.peekEOF1 = l.peekEOF2 + l.peekChar2 = nullChar + l.peekEOF2 = false + return l.lastChar, l.reachedEOF, nil + } + c, _, err := l.src.ReadRune() + if err != nil { + if err == io.EOF { + l.prevChar2 = l.prevChar1 + l.pervEOF2 = l.prevEOF1 + l.prevChar1 = l.lastChar + l.prevEOF1 = l.reachedEOF + l.lastChar = nullChar + l.reachedEOF = true + return l.lastChar, l.reachedEOF, nil + } + return nullChar, false, err + } + l.prevChar2 = l.prevChar1 + l.pervEOF2 = l.prevEOF1 + l.prevChar1 = l.lastChar + l.prevEOF1 = l.reachedEOF + l.lastChar = c + l.reachedEOF = false + return l.lastChar, l.reachedEOF, nil +} + +func (l *lexer) restore() error { + if l.lastChar == nullChar && !l.reachedEOF { + return fmt.Errorf("failed to call restore() because the last character is null") + } + l.peekChar2 = l.peekChar1 + l.peekEOF2 = l.peekEOF1 + l.peekChar1 = l.lastChar + l.peekEOF1 = l.reachedEOF + l.lastChar = l.prevChar1 + l.reachedEOF = l.prevEOF1 + l.prevChar1 = l.prevChar2 + l.prevEOF1 = l.pervEOF2 + l.prevChar2 = nullChar + l.pervEOF2 = false + return nil +} + +type PatternEntry struct { + ID LexModeKindID + Pattern []byte +} + +type parser struct { + kind LexKindName + lex *lexer + peekedTok *token + lastTok *token + + // If and only if isContributoryPropertyExposed is true, the parser interprets contributory properties that + // appear in property expressions. + // + // The contributory properties are not exposed, and users cannot use those properties because the parser + // follows [UAX #44 5.13 Property APIs]. For instance, \p{Other_Alphabetic} is invalid. + // + // isContributoryPropertyExposed is set to true when the parser is generated recursively. The parser needs to + // interpret derived properties internally because the derived properties consist of other properties that + // may contain the contributory properties. + // + // [UAX #44 5.13 Property APIs] says: + // > The following subtypes of Unicode character properties should generally not be exposed in APIs, + // > except in limited circumstances. They may not be useful, particularly in public API collections, + // > and may instead prove misleading to the users of such API collections. + // > * Contributory properties are not recommended for public APIs. + // > ... + // https://unicode.org/reports/tr44/#Property_APIs + isContributoryPropertyExposed bool + + errCause error + errDetail string +} + +func NewParser(kind LexKindName, src io.Reader) *parser { + return &parser{ + kind: kind, + lex: newLexer(src), + isContributoryPropertyExposed: false, + } +} + +func (p *parser) exposeContributoryProperty() { + p.isContributoryPropertyExposed = true +} + +func (p *parser) Error() (string, error) { + return p.errDetail, p.errCause +} + +func (p *parser) Parse() (root CPTree, retErr error) { + defer func() { + err := recover() + if err != nil { + var ok bool + retErr, ok = err.(error) + if !ok { + panic(err) + } + return + } + }() + + return newRootNode(p.kind, p.parseRegexp()), nil +} + +func (p *parser) parseRegexp() CPTree { + alt := p.parseAlt() + if alt == nil { + if p.consume(tokenKindGroupClose) { + p.raiseParseError(synErrGroupNoInitiator, "") + } + p.raiseParseError(synErrNullPattern, "") + } + if p.consume(tokenKindGroupClose) { + p.raiseParseError(synErrGroupNoInitiator, "") + } + p.expect(tokenKindEOF) + return alt +} + +func (p *parser) parseAlt() CPTree { + left := p.parseConcat() + if left == nil { + if p.consume(tokenKindAlt) { + p.raiseParseError(synErrAltLackOfOperand, "") + } + return nil + } + for { + if !p.consume(tokenKindAlt) { + break + } + right := p.parseConcat() + if right == nil { + p.raiseParseError(synErrAltLackOfOperand, "") + } + left = newAltNode(left, right) + } + return left +} + +func (p *parser) parseConcat() CPTree { + left := p.parseRepeat() + for { + right := p.parseRepeat() + if right == nil { + break + } + left = newConcatNode(left, right) + } + return left +} + +func (p *parser) parseRepeat() CPTree { + group := p.parseGroup() + if group == nil { + if p.consume(tokenKindRepeat) { + p.raiseParseError(synErrRepNoTarget, "* needs an operand") + } + if p.consume(tokenKindRepeatOneOrMore) { + p.raiseParseError(synErrRepNoTarget, "+ needs an operand") + } + if p.consume(tokenKindOption) { + p.raiseParseError(synErrRepNoTarget, "? needs an operand") + } + return nil + } + if p.consume(tokenKindRepeat) { + return newRepeatNode(group) + } + if p.consume(tokenKindRepeatOneOrMore) { + return newRepeatOneOrMoreNode(group) + } + if p.consume(tokenKindOption) { + return newOptionNode(group) + } + return group +} + +func (p *parser) parseGroup() CPTree { + if p.consume(tokenKindGroupOpen) { + alt := p.parseAlt() + if alt == nil { + if p.consume(tokenKindEOF) { + p.raiseParseError(synErrGroupUnclosed, "") + } + p.raiseParseError(synErrGroupNoElem, "") + } + if p.consume(tokenKindEOF) { + p.raiseParseError(synErrGroupUnclosed, "") + } + if !p.consume(tokenKindGroupClose) { + p.raiseParseError(synErrGroupInvalidForm, "") + } + return alt + } + return p.parseSingleChar() +} + +func (p *parser) parseSingleChar() CPTree { + if p.consume(tokenKindAnyChar) { + return genAnyCharAST() + } + if p.consume(tokenKindBExpOpen) { + left := p.parseBExpElem() + if left == nil { + if p.consume(tokenKindEOF) { + p.raiseParseError(synErrBExpUnclosed, "") + } + p.raiseParseError(synErrBExpNoElem, "") + } + for { + right := p.parseBExpElem() + if right == nil { + break + } + left = newAltNode(left, right) + } + if p.consume(tokenKindEOF) { + p.raiseParseError(synErrBExpUnclosed, "") + } + p.expect(tokenKindBExpClose) + return left + } + if p.consume(tokenKindInverseBExpOpen) { + elem := p.parseBExpElem() + if elem == nil { + if p.consume(tokenKindEOF) { + p.raiseParseError(synErrBExpUnclosed, "") + } + p.raiseParseError(synErrBExpNoElem, "") + } + inverse := exclude(elem, genAnyCharAST()) + if inverse == nil { + p.raiseParseError(synErrUnmatchablePattern, "") + } + for { + elem := p.parseBExpElem() + if elem == nil { + break + } + inverse = exclude(elem, inverse) + if inverse == nil { + p.raiseParseError(synErrUnmatchablePattern, "") + } + } + if p.consume(tokenKindEOF) { + p.raiseParseError(synErrBExpUnclosed, "") + } + p.expect(tokenKindBExpClose) + return inverse + } + if p.consume(tokenKindCodePointLeader) { + return p.parseCodePoint() + } + if p.consume(tokenKindCharPropLeader) { + return p.parseCharProp() + } + if p.consume(tokenKindFragmentLeader) { + return p.parseFragment() + } + c := p.parseNormalChar() + if c == nil { + if p.consume(tokenKindBExpClose) { + p.raiseParseError(synErrBExpInvalidForm, "") + } + return nil + } + return c +} + +func (p *parser) parseBExpElem() CPTree { + var left CPTree + switch { + case p.consume(tokenKindCodePointLeader): + left = p.parseCodePoint() + case p.consume(tokenKindCharPropLeader): + left = p.parseCharProp() + if p.consume(tokenKindCharRange) { + p.raiseParseError(synErrRangePropIsUnavailable, "") + } + default: + left = p.parseNormalChar() + } + if left == nil { + return nil + } + if !p.consume(tokenKindCharRange) { + return left + } + var right CPTree + switch { + case p.consume(tokenKindCodePointLeader): + right = p.parseCodePoint() + case p.consume(tokenKindCharPropLeader): + p.raiseParseError(synErrRangePropIsUnavailable, "") + default: + right = p.parseNormalChar() + } + if right == nil { + p.raiseParseError(synErrRangeInvalidForm, "") + } + from, _, _ := left.Range() + _, to, _ := right.Range() + if !isValidOrder(from, to) { + p.raiseParseError(synErrRangeInvalidOrder, fmt.Sprintf("%X..%X", from, to)) + } + return newRangeSymbolNode(from, to) +} + +func (p *parser) parseCodePoint() CPTree { + if !p.consume(tokenKindLBrace) { + p.raiseParseError(synErrCPExpInvalidForm, "") + } + if !p.consume(tokenKindCodePoint) { + p.raiseParseError(synErrCPExpInvalidForm, "") + } + + n, err := strconv.ParseInt(p.lastTok.codePoint, 16, 64) + if err != nil { + panic(fmt.Errorf("failed to decode a code point (%v) into a int: %v", p.lastTok.codePoint, err)) + } + if n < 0x0000 || n > 0x10FFFF { + p.raiseParseError(synErrCPExpOutOfRange, "") + } + + sym := newSymbolNode(rune(n)) + + if !p.consume(tokenKindRBrace) { + p.raiseParseError(synErrCPExpInvalidForm, "") + } + + return sym +} + +func (p *parser) parseCharProp() CPTree { + if !p.consume(tokenKindLBrace) { + p.raiseParseError(synErrCharPropExpInvalidForm, "") + } + var sym1, sym2 string + if !p.consume(tokenKindCharPropSymbol) { + p.raiseParseError(synErrCharPropExpInvalidForm, "") + } + sym1 = p.lastTok.propSymbol + if p.consume(tokenKindEqual) { + if !p.consume(tokenKindCharPropSymbol) { + p.raiseParseError(synErrCharPropExpInvalidForm, "") + } + sym2 = p.lastTok.propSymbol + } + + var alt CPTree + var propName, propVal string + if sym2 != "" { + propName = sym1 + propVal = sym2 + } else { + propName = "" + propVal = sym1 + } + if !p.isContributoryPropertyExposed && ucd.IsContributoryProperty(propName) { + p.raiseParseError(synErrCharPropUnsupported, propName) + } + pat, err := ucd.NormalizeCharacterProperty(propName, propVal) + if err != nil { + p.raiseParseError(synErrCharPropUnsupported, err.Error()) + } + if pat != "" { + p := NewParser(p.kind, bytes.NewReader([]byte(pat))) + p.exposeContributoryProperty() + ast, err := p.Parse() + if err != nil { + panic(err) + } + alt = ast + } else { + cpRanges, inverse, err := ucd.FindCodePointRanges(propName, propVal) + if err != nil { + p.raiseParseError(synErrCharPropUnsupported, err.Error()) + } + if inverse { + r := cpRanges[0] + alt = exclude(newRangeSymbolNode(r.From, r.To), genAnyCharAST()) + if alt == nil { + p.raiseParseError(synErrUnmatchablePattern, "") + } + for _, r := range cpRanges[1:] { + alt = exclude(newRangeSymbolNode(r.From, r.To), alt) + if alt == nil { + p.raiseParseError(synErrUnmatchablePattern, "") + } + } + } else { + for _, r := range cpRanges { + alt = genAltNode( + alt, + newRangeSymbolNode(r.From, r.To), + ) + } + } + } + + if !p.consume(tokenKindRBrace) { + p.raiseParseError(synErrCharPropExpInvalidForm, "") + } + + return alt +} + +func (p *parser) parseFragment() CPTree { + if !p.consume(tokenKindLBrace) { + p.raiseParseError(synErrFragmentExpInvalidForm, "") + } + if !p.consume(tokenKindFragmentSymbol) { + p.raiseParseError(synErrFragmentExpInvalidForm, "") + } + sym := p.lastTok.fragmentSymbol + + if !p.consume(tokenKindRBrace) { + p.raiseParseError(synErrFragmentExpInvalidForm, "") + } + + return newFragmentNode(LexKindName(sym), nil) +} + +func (p *parser) parseNormalChar() CPTree { + if !p.consume(tokenKindChar) { + return nil + } + return newSymbolNode(p.lastTok.char) +} + +func exclude(symbol, base CPTree) CPTree { + if left, right, ok := symbol.Alternatives(); ok { + return exclude(right, exclude(left, base)) + } + + if left, right, ok := base.Alternatives(); ok { + return genAltNode( + exclude(symbol, left), + exclude(symbol, right), + ) + } + + if bFrom, bTo, ok := base.Range(); ok { + sFrom, sTo, ok := symbol.Range() + if !ok { + panic(fmt.Errorf("invalid symbol tree: %T", symbol)) + } + + switch { + case sFrom > bFrom && sTo < bTo: + return genAltNode( + newRangeSymbolNode(bFrom, sFrom-1), + newRangeSymbolNode(sTo+1, bTo), + ) + case sFrom <= bFrom && sTo >= bFrom && sTo < bTo: + return newRangeSymbolNode(sTo+1, bTo) + case sFrom > bFrom && sFrom <= bTo && sTo >= bTo: + return newRangeSymbolNode(bFrom, sFrom-1) + case sFrom <= bFrom && sTo >= bTo: + return nil + default: + return base + } + } + + panic(fmt.Errorf("invalid base tree: %T", base)) +} + +func genAnyCharAST() CPTree { + return newRangeSymbolNode(0x0, 0x10FFFF) +} + +func isValidOrder(from, to rune) bool { + return from <= to +} + +func genConcatNode(cs ...CPTree) CPTree { + nonNilNodes := []CPTree{} + for _, c := range cs { + if c == nil { + continue + } + nonNilNodes = append(nonNilNodes, c) + } + if len(nonNilNodes) <= 0 { + return nil + } + if len(nonNilNodes) == 1 { + return nonNilNodes[0] + } + concat := newConcatNode(nonNilNodes[0], nonNilNodes[1]) + for _, c := range nonNilNodes[2:] { + concat = newConcatNode(concat, c) + } + return concat +} + +func genAltNode(cs ...CPTree) CPTree { + nonNilNodes := []CPTree{} + for _, c := range cs { + if c == nil { + continue + } + nonNilNodes = append(nonNilNodes, c) + } + if len(nonNilNodes) <= 0 { + return nil + } + if len(nonNilNodes) == 1 { + return nonNilNodes[0] + } + alt := newAltNode(nonNilNodes[0], nonNilNodes[1]) + for _, c := range nonNilNodes[2:] { + alt = newAltNode(alt, c) + } + return alt +} + +func (p *parser) expect(expected tokenKind) { + if !p.consume(expected) { + tok := p.peekedTok + p.raiseParseError(synErrUnexpectedToken, fmt.Sprintf("expected: %v, actual: %v", expected, tok.kind)) + } +} + +func (p *parser) consume(expected tokenKind) bool { + var tok *token + var err error + if p.peekedTok != nil { + tok = p.peekedTok + p.peekedTok = nil + } else { + tok, err = p.lex.next() + if err != nil { + if err == ParseErr { + detail, cause := p.lex.error() + p.raiseParseError(cause, detail) + } + panic(err) + } + } + p.lastTok = tok + if tok.kind == expected { + return true + } + p.peekedTok = tok + p.lastTok = nil + + return false +} + +func (p *parser) raiseParseError(err error, detail string) { + p.errCause = err + p.errDetail = detail + panic(ParseErr) +} + +type CPRange struct { + From rune + To rune +} + +type CPTree interface { + fmt.Stringer + Range() (rune, rune, bool) + Optional() (CPTree, bool) + Repeatable() (CPTree, bool) + Concatenation() (CPTree, CPTree, bool) + Alternatives() (CPTree, CPTree, bool) + Describe() (LexKindName, []LexKindName, error) + + children() (CPTree, CPTree) + clone() CPTree +} + +var ( + _ CPTree = &rootNode{} + _ CPTree = &symbolNode{} + _ CPTree = &concatNode{} + _ CPTree = &altNode{} + _ CPTree = &quantifierNode{} + _ CPTree = &fragmentNode{} +) + +type rootNode struct { + kind LexKindName + tree CPTree + fragments map[LexKindName][]*fragmentNode +} + +func newRootNode(kind LexKindName, t CPTree) *rootNode { + fragments := map[LexKindName][]*fragmentNode{} + collectFragments(t, fragments) + + return &rootNode{ + kind: kind, + tree: t, + fragments: fragments, + } +} + +func collectFragments(n CPTree, fragments map[LexKindName][]*fragmentNode) { + if n == nil { + return + } + + if f, ok := n.(*fragmentNode); ok { + fragments[f.kind] = append(fragments[f.kind], f) + return + } + + l, r := n.children() + collectFragments(l, fragments) + collectFragments(r, fragments) +} + +func (n *rootNode) String() string { + return fmt.Sprintf("root: %v: %v fragments", n.kind, len(n.fragments)) +} + +func (n *rootNode) Range() (rune, rune, bool) { + return n.tree.Range() +} + +func (n *rootNode) Optional() (CPTree, bool) { + return n.tree.Optional() +} + +func (n *rootNode) Repeatable() (CPTree, bool) { + return n.tree.Repeatable() +} + +func (n *rootNode) Concatenation() (CPTree, CPTree, bool) { + return n.tree.Concatenation() +} + +func (n *rootNode) Alternatives() (CPTree, CPTree, bool) { + return n.tree.Alternatives() +} + +func (n *rootNode) Describe() (LexKindName, []LexKindName, error) { + var frags []LexKindName + for f := range n.fragments { + frags = append(frags, LexKindName(f)) + } + sort.Slice(frags, func(i, j int) bool { + return frags[i] < frags[j] + }) + + return n.kind, frags, nil +} + +func (n *rootNode) children() (CPTree, CPTree) { + return n.tree.children() +} + +func (n *rootNode) clone() CPTree { + return n.tree.clone() +} + +func (n *rootNode) incomplete() bool { + return len(n.fragments) > 0 +} + +func (n *rootNode) applyFragment(kind LexKindName, fragment CPTree) error { + root, ok := fragment.(*rootNode) + if !ok { + return fmt.Errorf("applyFragment can take only *rootNode: %T", fragment) + } + if root.incomplete() { + return fmt.Errorf("fragment is incomplete") + } + + fs, ok := n.fragments[kind] + if !ok { + return nil + } + for _, f := range fs { + f.tree = root.clone() + } + delete(n.fragments, kind) + + return nil +} + +type symbolNode struct { + CPRange +} + +func newSymbolNode(cp rune) *symbolNode { + return &symbolNode{ + CPRange: CPRange{ + From: cp, + To: cp, + }, + } +} + +func newRangeSymbolNode(from, to rune) *symbolNode { + return &symbolNode{ + CPRange: CPRange{ + From: from, + To: to, + }, + } +} + +func (n *symbolNode) String() string { + return fmt.Sprintf("symbol: %X..%X", n.From, n.To) +} + +func (n *symbolNode) Range() (rune, rune, bool) { + return n.From, n.To, true +} + +func (n *symbolNode) Optional() (CPTree, bool) { + return nil, false +} + +func (n *symbolNode) Repeatable() (CPTree, bool) { + return nil, false +} + +func (n *symbolNode) Concatenation() (CPTree, CPTree, bool) { + return nil, nil, false +} + +func (n *symbolNode) Alternatives() (CPTree, CPTree, bool) { + return nil, nil, false +} + +func (n *symbolNode) Describe() (LexKindName, []LexKindName, error) { + return LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n) +} + +func (n *symbolNode) children() (CPTree, CPTree) { + return nil, nil +} + +func (n *symbolNode) clone() CPTree { + return newRangeSymbolNode(n.From, n.To) +} + +type concatNode struct { + left CPTree + right CPTree +} + +func newConcatNode(left, right CPTree) *concatNode { + return &concatNode{ + left: left, + right: right, + } +} + +func (n *concatNode) String() string { + return "concat" +} + +func (n *concatNode) Range() (rune, rune, bool) { + return 0, 0, false +} + +func (n *concatNode) Optional() (CPTree, bool) { + return nil, false +} + +func (n *concatNode) Repeatable() (CPTree, bool) { + return nil, false +} + +func (n *concatNode) Concatenation() (CPTree, CPTree, bool) { + return n.left, n.right, true +} + +func (n *concatNode) Alternatives() (CPTree, CPTree, bool) { + return nil, nil, false +} + +func (n *concatNode) Describe() (LexKindName, []LexKindName, error) { + return LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n) +} + +func (n *concatNode) children() (CPTree, CPTree) { + return n.left, n.right +} + +func (n *concatNode) clone() CPTree { + if n == nil { + return nil + } + return newConcatNode(n.left.clone(), n.right.clone()) +} + +type altNode struct { + left CPTree + right CPTree +} + +func newAltNode(left, right CPTree) *altNode { + return &altNode{ + left: left, + right: right, + } +} + +func (n *altNode) String() string { + return "alt" +} + +func (n *altNode) Range() (rune, rune, bool) { + return 0, 0, false +} + +func (n *altNode) Optional() (CPTree, bool) { + return nil, false +} + +func (n *altNode) Repeatable() (CPTree, bool) { + return nil, false +} + +func (n *altNode) Concatenation() (CPTree, CPTree, bool) { + return nil, nil, false +} + +func (n *altNode) Alternatives() (CPTree, CPTree, bool) { + return n.left, n.right, true +} + +func (n *altNode) Describe() (LexKindName, []LexKindName, error) { + return LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n) +} + +func (n *altNode) children() (CPTree, CPTree) { + return n.left, n.right +} + +func (n *altNode) clone() CPTree { + return newAltNode(n.left.clone(), n.right.clone()) +} + +type quantifierNode struct { + optional bool + repeatable bool + tree CPTree +} + +func (n *quantifierNode) String() string { + switch { + case n.repeatable: + return "repeatable (>= 0 times)" + case n.optional: + return "optional (0 or 1 times)" + default: + return "invalid quantifier" + } +} + +func newRepeatNode(t CPTree) *quantifierNode { + return &quantifierNode{ + repeatable: true, + tree: t, + } +} + +func newRepeatOneOrMoreNode(t CPTree) *concatNode { + return newConcatNode( + t, + &quantifierNode{ + repeatable: true, + tree: t.clone(), + }) +} + +func newOptionNode(t CPTree) *quantifierNode { + return &quantifierNode{ + optional: true, + tree: t, + } +} + +func (n *quantifierNode) Range() (rune, rune, bool) { + return 0, 0, false +} + +func (n *quantifierNode) Optional() (CPTree, bool) { + return n.tree, n.optional +} + +func (n *quantifierNode) Repeatable() (CPTree, bool) { + return n.tree, n.repeatable +} + +func (n *quantifierNode) Concatenation() (CPTree, CPTree, bool) { + return nil, nil, false +} + +func (n *quantifierNode) Alternatives() (CPTree, CPTree, bool) { + return nil, nil, false +} + +func (n *quantifierNode) Describe() (LexKindName, []LexKindName, error) { + return LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n) +} + +func (n *quantifierNode) children() (CPTree, CPTree) { + return n.tree, nil +} + +func (n *quantifierNode) clone() CPTree { + if n.repeatable { + return newRepeatNode(n.tree.clone()) + } + return newOptionNode(n.tree.clone()) +} + +type fragmentNode struct { + kind LexKindName + tree CPTree +} + +func newFragmentNode(kind LexKindName, t CPTree) *fragmentNode { + return &fragmentNode{ + kind: kind, + tree: t, + } +} + +func (n *fragmentNode) String() string { + return fmt.Sprintf("fragment: %v", n.kind) +} + +func (n *fragmentNode) Range() (rune, rune, bool) { + return n.tree.Range() +} + +func (n *fragmentNode) Optional() (CPTree, bool) { + return n.tree.Optional() +} + +func (n *fragmentNode) Repeatable() (CPTree, bool) { + return n.tree.Repeatable() +} + +func (n *fragmentNode) Concatenation() (CPTree, CPTree, bool) { + return n.tree.Concatenation() +} + +func (n *fragmentNode) Alternatives() (CPTree, CPTree, bool) { + return n.tree.Alternatives() +} + +func (n *fragmentNode) Describe() (LexKindName, []LexKindName, error) { + return LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n) +} + +func (n *fragmentNode) children() (CPTree, CPTree) { + return n.tree.children() +} + +func (n *fragmentNode) clone() CPTree { + if n.tree == nil { + return newFragmentNode(n.kind, nil) + } + return newFragmentNode(n.kind, n.tree.clone()) +} + +//nolint:unused +func printCPTree(w io.Writer, t CPTree, ruledLine string, childRuledLinePrefix string) { + if t == nil { + return + } + fmt.Fprintf(w, "%v%v\n", ruledLine, t) + children := []CPTree{} + switch n := t.(type) { + case *rootNode: + children = append(children, n.tree) + case *fragmentNode: + children = append(children, n.tree) + default: + left, right := t.children() + if left != nil { + children = append(children, left) + } + if right != nil { + children = append(children, right) + } + } + num := len(children) + for i, child := range children { + line := "└─ " + if num > 1 { + if i == 0 { + line = "├─ " + } else if i < num-1 { + line = "│ " + } + } + prefix := "│ " + if i >= num-1 { + prefix = " " + } + printCPTree(w, child, childRuledLinePrefix+line, childRuledLinePrefix+prefix) + } +} diff --git a/tests/tre.go b/tests/tre.go index 8c14feb..2b6fe26 100644 --- a/tests/tre.go +++ b/tests/tre.go @@ -3,8 +3,12 @@ package tre import ( "fmt" "os" + "reflect" + "strings" "testing" "testing/internal/testdeps" + + "ucd" ) @@ -569,6 +573,1902 @@ func TestLexSpec_Validate(t *testing.T) { } } +func TestLexer(t *testing.T) { + tests := []struct { + caption string + src string + tokens []*token + err error + }{ + { + caption: "lexer can recognize ordinaly characters", + src: "123abcいろは", + tokens: []*token{ + newToken(tokenKindChar, '1'), + newToken(tokenKindChar, '2'), + newToken(tokenKindChar, '3'), + newToken(tokenKindChar, 'a'), + newToken(tokenKindChar, 'b'), + newToken(tokenKindChar, 'c'), + newToken(tokenKindChar, 'い'), + newToken(tokenKindChar, 'ろ'), + newToken(tokenKindChar, 'は'), + newToken(tokenKindEOF, nullChar), + }, + }, + { + caption: "lexer can recognize the special characters in default mode", + src: ".*+?|()[\\u", + tokens: []*token{ + newToken(tokenKindAnyChar, nullChar), + newToken(tokenKindRepeat, nullChar), + newToken(tokenKindRepeatOneOrMore, nullChar), + newToken(tokenKindOption, nullChar), + newToken(tokenKindAlt, nullChar), + newToken(tokenKindGroupOpen, nullChar), + newToken(tokenKindGroupClose, nullChar), + newToken(tokenKindBExpOpen, nullChar), + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindEOF, nullChar), + }, + }, + { + caption: "lexer can recognize the escape sequences in default mode", + src: "\\\\\\.\\*\\+\\?\\|\\(\\)\\[", + tokens: []*token{ + newToken(tokenKindChar, '\\'), + newToken(tokenKindChar, '.'), + newToken(tokenKindChar, '*'), + newToken(tokenKindChar, '+'), + newToken(tokenKindChar, '?'), + newToken(tokenKindChar, '|'), + newToken(tokenKindChar, '('), + newToken(tokenKindChar, ')'), + newToken(tokenKindChar, '['), + newToken(tokenKindEOF, nullChar), + }, + }, + { + caption: "], {, and } are treated as an ordinary character in default mode", + src: "]{}", + tokens: []*token{ + newToken(tokenKindChar, ']'), + newToken(tokenKindChar, '{'), + newToken(tokenKindChar, '}'), + newToken(tokenKindEOF, nullChar), + }, + }, + { + caption: "lexer can recognize the special characters in bracket expression mode", + src: "[a-z\\u{09AF}][^a-z\\u{09abcf}]", + tokens: []*token{ + newToken(tokenKindBExpOpen, nullChar), + newToken(tokenKindChar, 'a'), + newToken(tokenKindCharRange, nullChar), + newToken(tokenKindChar, 'z'), + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("09AF"), + newToken(tokenKindRBrace, nullChar), + newToken(tokenKindBExpClose, nullChar), + newToken(tokenKindInverseBExpOpen, nullChar), + newToken(tokenKindChar, 'a'), + newToken(tokenKindCharRange, nullChar), + newToken(tokenKindChar, 'z'), + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("09abcf"), + newToken(tokenKindRBrace, nullChar), + newToken(tokenKindBExpClose, nullChar), + newToken(tokenKindEOF, nullChar), + }, + }, + { + caption: "lexer can recognize the escape sequences in bracket expression mode", + src: "[\\^a\\-z]", + tokens: []*token{ + newToken(tokenKindBExpOpen, nullChar), + newToken(tokenKindChar, '^'), + newToken(tokenKindChar, 'a'), + newToken(tokenKindChar, '-'), + newToken(tokenKindChar, 'z'), + newToken(tokenKindBExpClose, nullChar), + newToken(tokenKindEOF, nullChar), + }, + }, + { + caption: "in a bracket expression, the special characters are also handled as normal characters", + src: "[\\\\.*+?|()[", + tokens: []*token{ + newToken(tokenKindBExpOpen, nullChar), + newToken(tokenKindChar, '\\'), + newToken(tokenKindChar, '.'), + newToken(tokenKindChar, '*'), + newToken(tokenKindChar, '+'), + newToken(tokenKindChar, '?'), + newToken(tokenKindChar, '|'), + newToken(tokenKindChar, '('), + newToken(tokenKindChar, ')'), + newToken(tokenKindChar, '['), + newToken(tokenKindEOF, nullChar), + }, + }, + { + caption: "hyphen symbols that appear in bracket expressions are handled as the character range symbol or ordinary characters", + // [...-...][...-][-...][-] + // ~~~~~~~ ~ ~ ~ + // ^ ^ ^ ^ + // | | | `-- Ordinary Character (b) + // | | `-- Ordinary Character (b) + // | `-- Ordinary Character (b) + // `-- Character Range (a) + // + // a. *-* is handled as a character-range expression. + // b. *-, -*, or - are handled as ordinary characters. + src: "[a-z][a-][-z][-][--][---][^a-z][^a-][^-z][^-][^--][^---]", + tokens: []*token{ + newToken(tokenKindBExpOpen, nullChar), + newToken(tokenKindChar, 'a'), + newToken(tokenKindCharRange, nullChar), + newToken(tokenKindChar, 'z'), + newToken(tokenKindBExpClose, nullChar), + newToken(tokenKindBExpOpen, nullChar), + newToken(tokenKindChar, 'a'), + newToken(tokenKindChar, '-'), + newToken(tokenKindBExpClose, nullChar), + newToken(tokenKindBExpOpen, nullChar), + newToken(tokenKindChar, '-'), + newToken(tokenKindChar, 'z'), + newToken(tokenKindBExpClose, nullChar), + newToken(tokenKindBExpOpen, nullChar), + newToken(tokenKindChar, '-'), + newToken(tokenKindBExpClose, nullChar), + newToken(tokenKindBExpOpen, nullChar), + newToken(tokenKindChar, '-'), + newToken(tokenKindChar, '-'), + newToken(tokenKindBExpClose, nullChar), + newToken(tokenKindBExpOpen, nullChar), + newToken(tokenKindChar, '-'), + newToken(tokenKindCharRange, nullChar), + newToken(tokenKindChar, '-'), + newToken(tokenKindBExpClose, nullChar), + + newToken(tokenKindInverseBExpOpen, nullChar), + newToken(tokenKindChar, 'a'), + newToken(tokenKindCharRange, nullChar), + newToken(tokenKindChar, 'z'), + newToken(tokenKindBExpClose, nullChar), + newToken(tokenKindInverseBExpOpen, nullChar), + newToken(tokenKindChar, 'a'), + newToken(tokenKindChar, '-'), + newToken(tokenKindBExpClose, nullChar), + newToken(tokenKindInverseBExpOpen, nullChar), + newToken(tokenKindChar, '-'), + newToken(tokenKindChar, 'z'), + newToken(tokenKindBExpClose, nullChar), + newToken(tokenKindInverseBExpOpen, nullChar), + newToken(tokenKindChar, '-'), + newToken(tokenKindBExpClose, nullChar), + newToken(tokenKindInverseBExpOpen, nullChar), + newToken(tokenKindChar, '-'), + newToken(tokenKindChar, '-'), + newToken(tokenKindBExpClose, nullChar), + newToken(tokenKindInverseBExpOpen, nullChar), + newToken(tokenKindChar, '-'), + newToken(tokenKindCharRange, nullChar), + newToken(tokenKindChar, '-'), + newToken(tokenKindBExpClose, nullChar), + + newToken(tokenKindEOF, nullChar), + }, + }, + { + caption: "caret symbols that appear in bracket expressions are handled as the logical inverse symbol or ordinary characters", + // [^...^...][^] + // ~~ ~ ~~ + // ^ ^ ^^ + // | | |`-- Ordinary Character (c) + // | | `-- Bracket Expression + // | `-- Ordinary Character (b) + // `-- Inverse Bracket Expression (a) + // + // a. Bracket expressions that have a caret symbol at the beginning are handled as logical inverse expressions. + // b. caret symbols that appear as the second and the subsequent symbols are handled as ordinary symbols. + // c. When a bracket expression has just one symbol, a caret symbol at the beginning is handled as an ordinary character. + src: "[^^][^]", + tokens: []*token{ + newToken(tokenKindInverseBExpOpen, nullChar), + newToken(tokenKindChar, '^'), + newToken(tokenKindBExpClose, nullChar), + newToken(tokenKindBExpOpen, nullChar), + newToken(tokenKindChar, '^'), + newToken(tokenKindBExpClose, nullChar), + newToken(tokenKindEOF, nullChar), + }, + }, + { + caption: "lexer raises an error when an invalid escape sequence appears", + src: "\\@", + err: synErrInvalidEscSeq, + }, + { + caption: "lexer raises an error when the incomplete escape sequence (EOF following \\) appears", + src: "\\", + err: synErrIncompletedEscSeq, + }, + { + caption: "lexer raises an error when an invalid escape sequence appears", + src: "[\\@", + tokens: []*token{ + newToken(tokenKindBExpOpen, nullChar), + }, + err: synErrInvalidEscSeq, + }, + { + caption: "lexer raises an error when the incomplete escape sequence (EOF following \\) appears", + src: "[\\", + tokens: []*token{ + newToken(tokenKindBExpOpen, nullChar), + }, + err: synErrIncompletedEscSeq, + }, + { + caption: "lexer can recognize the special characters and code points in code point expression mode", + src: "\\u{0123}\\u{4567}\\u{89abcd}\\u{efAB}\\u{CDEF01}[\\u{0123}\\u{4567}\\u{89abcd}\\u{efAB}\\u{CDEF01}][^\\u{0123}\\u{4567}\\u{89abcd}\\u{efAB}\\u{CDEF01}]", + tokens: []*token{ + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("0123"), + newToken(tokenKindRBrace, nullChar), + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("4567"), + newToken(tokenKindRBrace, nullChar), + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("89abcd"), + newToken(tokenKindRBrace, nullChar), + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("efAB"), + newToken(tokenKindRBrace, nullChar), + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("CDEF01"), + newToken(tokenKindRBrace, nullChar), + + newToken(tokenKindBExpOpen, nullChar), + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("0123"), + newToken(tokenKindRBrace, nullChar), + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("4567"), + newToken(tokenKindRBrace, nullChar), + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("89abcd"), + newToken(tokenKindRBrace, nullChar), + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("efAB"), + newToken(tokenKindRBrace, nullChar), + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("CDEF01"), + newToken(tokenKindRBrace, nullChar), + newToken(tokenKindBExpClose, nullChar), + + newToken(tokenKindInverseBExpOpen, nullChar), + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("0123"), + newToken(tokenKindRBrace, nullChar), + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("4567"), + newToken(tokenKindRBrace, nullChar), + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("89abcd"), + newToken(tokenKindRBrace, nullChar), + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("efAB"), + newToken(tokenKindRBrace, nullChar), + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("CDEF01"), + newToken(tokenKindRBrace, nullChar), + newToken(tokenKindBExpClose, nullChar), + + newToken(tokenKindEOF, nullChar), + }, + }, + { + caption: "a one digit hex string isn't a valid code point", + src: "\\u{0", + tokens: []*token{ + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + }, + err: synErrInvalidCodePoint, + }, + { + caption: "a two digits hex string isn't a valid code point", + src: "\\u{01", + tokens: []*token{ + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + }, + err: synErrInvalidCodePoint, + }, + { + caption: "a three digits hex string isn't a valid code point", + src: "\\u{012", + tokens: []*token{ + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + }, + err: synErrInvalidCodePoint, + }, + { + caption: "a four digits hex string is a valid code point", + src: "\\u{0123}", + tokens: []*token{ + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("0123"), + newToken(tokenKindRBrace, nullChar), + }, + }, + { + caption: "a five digits hex string isn't a valid code point", + src: "\\u{01234", + tokens: []*token{ + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + }, + err: synErrInvalidCodePoint, + }, + { + caption: "a six digits hex string is a valid code point", + src: "\\u{012345}", + tokens: []*token{ + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("012345"), + newToken(tokenKindRBrace, nullChar), + }, + }, + { + caption: "a seven digits hex string isn't a valid code point", + src: "\\u{0123456", + tokens: []*token{ + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + }, + err: synErrInvalidCodePoint, + }, + { + caption: "a code point must be hex digits", + src: "\\u{g", + tokens: []*token{ + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + }, + err: synErrInvalidCodePoint, + }, + { + caption: "a code point must be hex digits", + src: "\\u{G", + tokens: []*token{ + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + }, + err: synErrInvalidCodePoint, + }, + { + caption: "lexer can recognize the special characters and symbols in character property expression mode", + src: "\\p{Letter}\\p{General_Category=Letter}[\\p{Letter}\\p{General_Category=Letter}][^\\p{Letter}\\p{General_Category=Letter}]", + tokens: []*token{ + newToken(tokenKindCharPropLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCharPropSymbolToken("Letter"), + newToken(tokenKindRBrace, nullChar), + newToken(tokenKindCharPropLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCharPropSymbolToken("General_Category"), + newToken(tokenKindEqual, nullChar), + newCharPropSymbolToken("Letter"), + newToken(tokenKindRBrace, nullChar), + + newToken(tokenKindBExpOpen, nullChar), + newToken(tokenKindCharPropLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCharPropSymbolToken("Letter"), + newToken(tokenKindRBrace, nullChar), + newToken(tokenKindCharPropLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCharPropSymbolToken("General_Category"), + newToken(tokenKindEqual, nullChar), + newCharPropSymbolToken("Letter"), + newToken(tokenKindRBrace, nullChar), + newToken(tokenKindBExpClose, nullChar), + + newToken(tokenKindInverseBExpOpen, nullChar), + newToken(tokenKindCharPropLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCharPropSymbolToken("Letter"), + newToken(tokenKindRBrace, nullChar), + newToken(tokenKindCharPropLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCharPropSymbolToken("General_Category"), + newToken(tokenKindEqual, nullChar), + newCharPropSymbolToken("Letter"), + newToken(tokenKindRBrace, nullChar), + newToken(tokenKindBExpClose, nullChar), + + newToken(tokenKindEOF, nullChar), + }, + }, + { + caption: "lexer can recognize the special characters and symbols in fragment expression mode", + src: "\\f{integer}", + tokens: []*token{ + newToken(tokenKindFragmentLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newFragmentSymbolToken("integer"), + newToken(tokenKindRBrace, nullChar), + + newToken(tokenKindEOF, nullChar), + }, + }, + { + caption: "a fragment expression is not supported in a bracket expression", + src: "[\\f", + tokens: []*token{ + newToken(tokenKindBExpOpen, nullChar), + }, + err: synErrInvalidEscSeq, + }, + { + caption: "a fragment expression is not supported in an inverse bracket expression", + src: "[^\\f", + tokens: []*token{ + newToken(tokenKindInverseBExpOpen, nullChar), + }, + err: synErrInvalidEscSeq, + }, + } + for _, tt := range tests { + t.Run(tt.caption, func(t *testing.T) { + lex := newLexer(strings.NewReader(tt.src)) + var err error + var tok *token + i := 0 + for { + tok, err = lex.next() + if err != nil { + break + } + if i >= len(tt.tokens) { + break + } + eTok := tt.tokens[i] + i++ + testToken(t, tok, eTok) + + if tok.kind == tokenKindEOF { + break + } + } + if tt.err != nil { + if err != ParseErr { + t.Fatalf("unexpected error: want: %v, got: %v", ParseErr, err) + } + detail, cause := lex.error() + if cause != tt.err { + t.Fatalf("unexpected error: want: %v, got: %v (%v)", tt.err, cause, detail) + } + } else { + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + } + if i < len(tt.tokens) { + t.Fatalf("expecte more tokens") + } + }) + } +} + +func testToken(t *testing.T, a, e *token) { + t.Helper() + if e.kind != a.kind || e.char != a.char || e.codePoint != a.codePoint { + t.Fatalf("unexpected token: want: %+v, got: %+v", e, a) + } +} + +func TestParse(t *testing.T) { + tests := []struct { + pattern string + fragments map[LexKindName]string + ast CPTree + syntaxError error + + // When an AST is large, as patterns containing a character property expression, this test only checks + // that the pattern is parsable. The check of the validity of such AST is performed by checking that it + // can be matched correctly using the driver. + skipTestAST bool + }{ + { + pattern: "a", + ast: newSymbolNode('a'), + }, + { + pattern: "abc", + ast: genConcatNode( + newSymbolNode('a'), + newSymbolNode('b'), + newSymbolNode('c'), + ), + }, + { + pattern: "a?", + ast: newOptionNode( + newSymbolNode('a'), + ), + }, + { + pattern: "[abc]?", + ast: newOptionNode( + genAltNode( + newSymbolNode('a'), + newSymbolNode('b'), + newSymbolNode('c'), + ), + ), + }, + { + pattern: "\\u{3042}?", + ast: newOptionNode( + newSymbolNode('\u3042'), + ), + }, + { + pattern: "\\p{Letter}?", + skipTestAST: true, + }, + { + pattern: "\\f{a2c}?", + fragments: map[LexKindName]string{ + "a2c": "abc", + }, + ast: newOptionNode( + newFragmentNode("a2c", + genConcatNode( + newSymbolNode('a'), + newSymbolNode('b'), + newSymbolNode('c'), + ), + ), + ), + }, + { + pattern: "(a)?", + ast: newOptionNode( + newSymbolNode('a'), + ), + }, + { + pattern: "((a?)?)?", + ast: newOptionNode( + newOptionNode( + newOptionNode( + newSymbolNode('a'), + ), + ), + ), + }, + { + pattern: "(abc)?", + ast: newOptionNode( + genConcatNode( + newSymbolNode('a'), + newSymbolNode('b'), + newSymbolNode('c'), + ), + ), + }, + { + pattern: "(a|b)?", + ast: newOptionNode( + genAltNode( + newSymbolNode('a'), + newSymbolNode('b'), + ), + ), + }, + { + pattern: "?", + syntaxError: synErrRepNoTarget, + }, + { + pattern: "(?)", + syntaxError: synErrRepNoTarget, + }, + { + pattern: "a|?", + syntaxError: synErrRepNoTarget, + }, + { + pattern: "?|b", + syntaxError: synErrRepNoTarget, + }, + { + pattern: "a??", + syntaxError: synErrRepNoTarget, + }, + { + pattern: "a*", + ast: newRepeatNode( + newSymbolNode('a'), + ), + }, + { + pattern: "[abc]*", + ast: newRepeatNode( + genAltNode( + newSymbolNode('a'), + newSymbolNode('b'), + newSymbolNode('c'), + ), + ), + }, + { + pattern: "\\u{3042}*", + ast: newRepeatNode( + newSymbolNode('\u3042'), + ), + }, + { + pattern: "\\p{Letter}*", + skipTestAST: true, + }, + { + pattern: "\\f{a2c}*", + fragments: map[LexKindName]string{ + "a2c": "abc", + }, + ast: newRepeatNode( + newFragmentNode("a2c", + genConcatNode( + newSymbolNode('a'), + newSymbolNode('b'), + newSymbolNode('c'), + ), + ), + ), + }, + { + pattern: "((a*)*)*", + ast: newRepeatNode( + newRepeatNode( + newRepeatNode( + newSymbolNode('a'), + ), + ), + ), + }, + { + pattern: "(abc)*", + ast: newRepeatNode( + genConcatNode( + newSymbolNode('a'), + newSymbolNode('b'), + newSymbolNode('c'), + ), + ), + }, + { + pattern: "(a|b)*", + ast: newRepeatNode( + genAltNode( + newSymbolNode('a'), + newSymbolNode('b'), + ), + ), + }, + { + pattern: "*", + syntaxError: synErrRepNoTarget, + }, + { + pattern: "(*)", + syntaxError: synErrRepNoTarget, + }, + { + pattern: "a|*", + syntaxError: synErrRepNoTarget, + }, + { + pattern: "*|b", + syntaxError: synErrRepNoTarget, + }, + { + pattern: "a**", + syntaxError: synErrRepNoTarget, + }, + { + pattern: "a+", + ast: genConcatNode( + newSymbolNode('a'), + newRepeatNode( + newSymbolNode('a'), + ), + ), + }, + { + pattern: "[abc]+", + ast: genConcatNode( + genAltNode( + newSymbolNode('a'), + newSymbolNode('b'), + newSymbolNode('c'), + ), + newRepeatNode( + genAltNode( + newSymbolNode('a'), + newSymbolNode('b'), + newSymbolNode('c'), + ), + ), + ), + }, + { + pattern: "\\u{3042}+", + ast: genConcatNode( + newSymbolNode('\u3042'), + newRepeatNode( + newSymbolNode('\u3042'), + ), + ), + }, + { + pattern: "\\p{Letter}+", + skipTestAST: true, + }, + { + pattern: "\\f{a2c}+", + fragments: map[LexKindName]string{ + "a2c": "abc", + }, + ast: genConcatNode( + newFragmentNode("a2c", + genConcatNode( + newSymbolNode('a'), + newSymbolNode('b'), + newSymbolNode('c'), + ), + ), + newRepeatNode( + newFragmentNode("a2c", + genConcatNode( + newSymbolNode('a'), + newSymbolNode('b'), + newSymbolNode('c'), + ), + ), + ), + ), + }, + { + pattern: "((a+)+)+", + ast: genConcatNode( + genConcatNode( + genConcatNode( + genConcatNode( + newSymbolNode('a'), + newRepeatNode( + newSymbolNode('a'), + ), + ), + newRepeatNode( + genConcatNode( + newSymbolNode('a'), + newRepeatNode( + newSymbolNode('a'), + ), + ), + ), + ), + newRepeatNode( + genConcatNode( + genConcatNode( + newSymbolNode('a'), + newRepeatNode( + newSymbolNode('a'), + ), + ), + newRepeatNode( + genConcatNode( + newSymbolNode('a'), + newRepeatNode( + newSymbolNode('a'), + ), + ), + ), + ), + ), + ), + ), + }, + { + pattern: "(abc)+", + ast: genConcatNode( + genConcatNode( + newSymbolNode('a'), + newSymbolNode('b'), + newSymbolNode('c'), + ), + newRepeatNode( + genConcatNode( + newSymbolNode('a'), + newSymbolNode('b'), + newSymbolNode('c'), + ), + ), + ), + }, + { + pattern: "(a|b)+", + ast: genConcatNode( + genAltNode( + newSymbolNode('a'), + newSymbolNode('b'), + ), + newRepeatNode( + genAltNode( + newSymbolNode('a'), + newSymbolNode('b'), + ), + ), + ), + }, + { + pattern: "+", + syntaxError: synErrRepNoTarget, + }, + { + pattern: "(+)", + syntaxError: synErrRepNoTarget, + }, + { + pattern: "a|+", + syntaxError: synErrRepNoTarget, + }, + { + pattern: "+|b", + syntaxError: synErrRepNoTarget, + }, + { + pattern: "a++", + syntaxError: synErrRepNoTarget, + }, + { + pattern: ".", + ast: newRangeSymbolNode(0x00, 0x10FFFF), + }, + { + pattern: "[a]", + ast: newSymbolNode('a'), + }, + { + pattern: "[abc]", + ast: genAltNode( + newSymbolNode('a'), + newSymbolNode('b'), + newSymbolNode('c'), + ), + }, + { + pattern: "[a-z]", + ast: newRangeSymbolNode('a', 'z'), + }, + { + pattern: "[A-Za-z]", + ast: genAltNode( + newRangeSymbolNode('A', 'Z'), + newRangeSymbolNode('a', 'z'), + ), + }, + { + pattern: "[\\u{004E}]", + ast: newSymbolNode('N'), + }, + { + pattern: "[\\u{0061}-\\u{007A}]", + ast: newRangeSymbolNode('a', 'z'), + }, + { + pattern: "[\\p{Lu}]", + skipTestAST: true, + }, + { + pattern: "[a-\\p{Lu}]", + syntaxError: synErrRangePropIsUnavailable, + }, + { + pattern: "[\\p{Lu}-z]", + syntaxError: synErrRangePropIsUnavailable, + }, + { + pattern: "[\\p{Lu}-\\p{Ll}]", + syntaxError: synErrRangePropIsUnavailable, + }, + { + pattern: "[z-a]", + syntaxError: synErrRangeInvalidOrder, + }, + { + pattern: "a[]", + syntaxError: synErrBExpNoElem, + }, + { + pattern: "[]a", + syntaxError: synErrBExpNoElem, + }, + { + pattern: "[]", + syntaxError: synErrBExpNoElem, + }, + { + pattern: "[^\\u{004E}]", + ast: genAltNode( + newRangeSymbolNode(0x00, '\u004E'-1), + newRangeSymbolNode('\u004E'+1, 0x10FFFF), + ), + }, + { + pattern: "[^\\u{0061}-\\u{007A}]", + ast: genAltNode( + newRangeSymbolNode(0x00, '\u0061'-1), + newRangeSymbolNode('\u007A'+1, 0x10FFFF), + ), + }, + { + pattern: "[^\\p{Lu}]", + skipTestAST: true, + }, + { + pattern: "[^a-\\p{Lu}]", + syntaxError: synErrRangePropIsUnavailable, + }, + { + pattern: "[^\\p{Lu}-z]", + syntaxError: synErrRangePropIsUnavailable, + }, + { + pattern: "[^\\p{Lu}-\\p{Ll}]", + syntaxError: synErrRangePropIsUnavailable, + }, + { + pattern: "[^\\u{0000}-\\u{10FFFF}]", + syntaxError: synErrUnmatchablePattern, + }, + { + pattern: "[^\\u{0000}-\\u{FFFF}\\u{010000}-\\u{10FFFF}]", + syntaxError: synErrUnmatchablePattern, + }, + { + pattern: "[^]", + ast: newSymbolNode('^'), + }, + { + pattern: "[", + syntaxError: synErrBExpUnclosed, + }, + { + pattern: "([", + syntaxError: synErrBExpUnclosed, + }, + { + pattern: "[a", + syntaxError: synErrBExpUnclosed, + }, + { + pattern: "([a", + syntaxError: synErrBExpUnclosed, + }, + { + pattern: "[a-", + syntaxError: synErrBExpUnclosed, + }, + { + pattern: "([a-", + syntaxError: synErrBExpUnclosed, + }, + { + pattern: "[^", + syntaxError: synErrBExpUnclosed, + }, + { + pattern: "([^", + syntaxError: synErrBExpUnclosed, + }, + { + pattern: "[^a", + syntaxError: synErrBExpUnclosed, + }, + { + pattern: "([^a", + syntaxError: synErrBExpUnclosed, + }, + { + pattern: "[^a-", + syntaxError: synErrBExpUnclosed, + }, + { + pattern: "([^a-", + syntaxError: synErrBExpUnclosed, + }, + { + pattern: "]", + ast: newSymbolNode(']'), + }, + { + pattern: "(]", + syntaxError: synErrGroupUnclosed, + }, + { + pattern: "a]", + ast: genConcatNode( + newSymbolNode('a'), + newSymbolNode(']'), + ), + }, + { + pattern: "(a]", + syntaxError: synErrGroupUnclosed, + }, + { + pattern: "([)", + syntaxError: synErrBExpUnclosed, + }, + { + pattern: "([a)", + syntaxError: synErrBExpUnclosed, + }, + { + pattern: "[a-]", + ast: genAltNode( + newSymbolNode('a'), + newSymbolNode('-'), + ), + }, + { + pattern: "[^a-]", + ast: genAltNode( + newRangeSymbolNode(0x00, 0x2C), + newRangeSymbolNode(0x2E, 0x60), + newRangeSymbolNode(0x62, 0x10FFFF), + ), + }, + { + pattern: "[-z]", + ast: genAltNode( + newSymbolNode('-'), + newSymbolNode('z'), + ), + }, + { + pattern: "[^-z]", + ast: newAltNode( + newRangeSymbolNode(0x00, 0x2C), + newAltNode( + newRangeSymbolNode(0x2E, 0x79), + newRangeSymbolNode(0x7B, 0x10FFFF), + ), + ), + }, + { + pattern: "[-]", + ast: newSymbolNode('-'), + }, + { + pattern: "[^-]", + ast: genAltNode( + newRangeSymbolNode(0x00, 0x2C), + newRangeSymbolNode(0x2E, 0x10FFFF), + ), + }, + { + pattern: "[^01]", + ast: genAltNode( + newRangeSymbolNode(0x00, '0'-1), + newRangeSymbolNode('1'+1, 0x10FFFF), + ), + }, + { + pattern: "[^10]", + ast: genAltNode( + newRangeSymbolNode(0x00, '0'-1), + newRangeSymbolNode('1'+1, 0x10FFFF), + ), + }, + { + pattern: "[^a-z]", + ast: genAltNode( + newRangeSymbolNode(0x00, 'a'-1), + newRangeSymbolNode('z'+1, 0x10FFFF), + ), + }, + { + pattern: "[^az]", + ast: genAltNode( + newRangeSymbolNode(0x00, 'a'-1), + genAltNode( + newRangeSymbolNode('a'+1, 'z'-1), + newRangeSymbolNode('z'+1, 0x10FFFF), + ), + ), + }, + { + pattern: "\\u{006E}", + ast: newSymbolNode('\u006E'), + }, + { + pattern: "\\u{03BD}", + ast: newSymbolNode('\u03BD'), + }, + { + pattern: "\\u{306B}", + ast: newSymbolNode('\u306B'), + }, + { + pattern: "\\u{01F638}", + ast: newSymbolNode('\U0001F638'), + }, + { + pattern: "\\u{0000}", + ast: newSymbolNode('\u0000'), + }, + { + pattern: "\\u{10FFFF}", + ast: newSymbolNode('\U0010FFFF'), + }, + { + pattern: "\\u{110000}", + syntaxError: synErrCPExpOutOfRange, + }, + { + pattern: "\\u", + syntaxError: synErrCPExpInvalidForm, + }, + { + pattern: "\\u{", + syntaxError: synErrCPExpInvalidForm, + }, + { + pattern: "\\u{03BD", + syntaxError: synErrCPExpInvalidForm, + }, + { + pattern: "\\u{}", + syntaxError: synErrCPExpInvalidForm, + }, + { + pattern: "\\p{Letter}", + skipTestAST: true, + }, + { + pattern: "\\p{General_Category=Letter}", + skipTestAST: true, + }, + { + pattern: "\\p{ Letter }", + skipTestAST: true, + }, + { + pattern: "\\p{ General_Category = Letter }", + skipTestAST: true, + }, + { + pattern: "\\p", + syntaxError: synErrCharPropExpInvalidForm, + }, + { + pattern: "\\p{", + syntaxError: synErrCharPropExpInvalidForm, + }, + { + pattern: "\\p{Letter", + syntaxError: synErrCharPropExpInvalidForm, + }, + { + pattern: "\\p{General_Category=}", + syntaxError: synErrCharPropExpInvalidForm, + }, + { + pattern: "\\p{General_Category= }", + syntaxError: synErrCharPropInvalidSymbol, + }, + { + pattern: "\\p{=Letter}", + syntaxError: synErrCharPropExpInvalidForm, + }, + { + pattern: "\\p{ =Letter}", + syntaxError: synErrCharPropInvalidSymbol, + }, + { + pattern: "\\p{=}", + syntaxError: synErrCharPropExpInvalidForm, + }, + { + pattern: "\\p{}", + syntaxError: synErrCharPropExpInvalidForm, + }, + { + pattern: "\\f{a2c}", + fragments: map[LexKindName]string{ + "a2c": "abc", + }, + ast: newFragmentNode("a2c", + genConcatNode( + newSymbolNode('a'), + newSymbolNode('b'), + newSymbolNode('c'), + ), + ), + }, + { + pattern: "\\f{ a2c }", + fragments: map[LexKindName]string{ + "a2c": "abc", + }, + ast: newFragmentNode("a2c", + genConcatNode( + newSymbolNode('a'), + newSymbolNode('b'), + newSymbolNode('c'), + ), + ), + }, + { + pattern: "\\f", + syntaxError: synErrFragmentExpInvalidForm, + }, + { + pattern: "\\f{", + syntaxError: synErrFragmentExpInvalidForm, + }, + { + pattern: "\\f{a2c", + fragments: map[LexKindName]string{ + "a2c": "abc", + }, + syntaxError: synErrFragmentExpInvalidForm, + }, + { + pattern: "(a)", + ast: newSymbolNode('a'), + }, + { + pattern: "(((a)))", + ast: newSymbolNode('a'), + }, + { + pattern: "a()", + syntaxError: synErrGroupNoElem, + }, + { + pattern: "()a", + syntaxError: synErrGroupNoElem, + }, + { + pattern: "()", + syntaxError: synErrGroupNoElem, + }, + { + pattern: "(", + syntaxError: synErrGroupUnclosed, + }, + { + pattern: "a(", + syntaxError: synErrGroupUnclosed, + }, + { + pattern: "(a", + syntaxError: synErrGroupUnclosed, + }, + { + pattern: "((", + syntaxError: synErrGroupUnclosed, + }, + { + pattern: "((a)", + syntaxError: synErrGroupUnclosed, + }, + { + pattern: ")", + syntaxError: synErrGroupNoInitiator, + }, + { + pattern: "a)", + syntaxError: synErrGroupNoInitiator, + }, + { + pattern: ")a", + syntaxError: synErrGroupNoInitiator, + }, + { + pattern: "))", + syntaxError: synErrGroupNoInitiator, + }, + { + pattern: "(a))", + syntaxError: synErrGroupNoInitiator, + }, + { + pattern: "Mulder|Scully", + ast: genAltNode( + genConcatNode( + newSymbolNode('M'), + newSymbolNode('u'), + newSymbolNode('l'), + newSymbolNode('d'), + newSymbolNode('e'), + newSymbolNode('r'), + ), + genConcatNode( + newSymbolNode('S'), + newSymbolNode('c'), + newSymbolNode('u'), + newSymbolNode('l'), + newSymbolNode('l'), + newSymbolNode('y'), + ), + ), + }, + { + pattern: "Langly|Frohike|Byers", + ast: genAltNode( + genConcatNode( + newSymbolNode('L'), + newSymbolNode('a'), + newSymbolNode('n'), + newSymbolNode('g'), + newSymbolNode('l'), + newSymbolNode('y'), + ), + genConcatNode( + newSymbolNode('F'), + newSymbolNode('r'), + newSymbolNode('o'), + newSymbolNode('h'), + newSymbolNode('i'), + newSymbolNode('k'), + newSymbolNode('e'), + ), + genConcatNode( + newSymbolNode('B'), + newSymbolNode('y'), + newSymbolNode('e'), + newSymbolNode('r'), + newSymbolNode('s'), + ), + ), + }, + { + pattern: "|", + syntaxError: synErrAltLackOfOperand, + }, + { + pattern: "||", + syntaxError: synErrAltLackOfOperand, + }, + { + pattern: "Mulder|", + syntaxError: synErrAltLackOfOperand, + }, + { + pattern: "|Scully", + syntaxError: synErrAltLackOfOperand, + }, + { + pattern: "Langly|Frohike|", + syntaxError: synErrAltLackOfOperand, + }, + { + pattern: "Langly||Byers", + syntaxError: synErrAltLackOfOperand, + }, + { + pattern: "|Frohike|Byers", + syntaxError: synErrAltLackOfOperand, + }, + { + pattern: "|Frohike|", + syntaxError: synErrAltLackOfOperand, + }, + { + pattern: "Fox(|)Mulder", + syntaxError: synErrAltLackOfOperand, + }, + { + pattern: "(Fox|)Mulder", + syntaxError: synErrAltLackOfOperand, + }, + { + pattern: "Fox(|Mulder)", + syntaxError: synErrAltLackOfOperand, + }, + } + for i, tt := range tests { + t.Run(fmt.Sprintf("#%v %v", i, tt.pattern), func(t *testing.T) { + fragmentTrees := map[LexKindName]CPTree{} + for kind, pattern := range tt.fragments { + p := NewParser(kind, strings.NewReader(pattern)) + root, err := p.Parse() + if err != nil { + t.Fatal(err) + } + + fragmentTrees[kind] = root + } + err := CompleteFragments(fragmentTrees) + if err != nil { + t.Fatal(err) + } + + p := NewParser(LexKindName("test"), strings.NewReader(tt.pattern)) + root, err := p.Parse() + if tt.syntaxError != nil { + // printCPTree(os.Stdout, root, "", "") + if err != ParseErr { + t.Fatalf("unexpected error: want: %v, got: %v", ParseErr, err) + } + _, synErr := p.Error() + if synErr != tt.syntaxError { + t.Fatalf("unexpected syntax error: want: %v, got: %v", tt.syntaxError, synErr) + } + if root != nil { + t.Fatalf("tree must be nil") + } + } else { + if err != nil { + detail, cause := p.Error() + t.Fatalf("%v: %v: %v", err, cause, detail) + } + if root == nil { + t.Fatal("tree must be non-nil") + } + + complete, err := ApplyFragments(root, fragmentTrees) + if err != nil { + t.Fatal(err) + } + if !complete { + t.Fatalf("incomplete fragments") + } + + // printCPTree(os.Stdout, root, "", "") + if !tt.skipTestAST { + r := root.(*rootNode) + testAST(t, tt.ast, r.tree) + } + } + }) + } +} + +func TestParse_ContributoryPropertyIsNotExposed(t *testing.T) { + for _, cProp := range ucd.ContributoryProperties() { + t.Run(fmt.Sprintf("%v", cProp), func(t *testing.T) { + p := NewParser(LexKindName("test"), strings.NewReader(fmt.Sprintf(`\p{%v=yes}`, cProp))) + root, err := p.Parse() + if err == nil { + t.Fatalf("expected syntax error: got: nil") + } + _, synErr := p.Error() + if synErr != synErrCharPropUnsupported { + t.Fatalf("unexpected syntax error: want: %v, got: %v", synErrCharPropUnsupported, synErr) + } + if root != nil { + t.Fatalf("tree is not nil") + } + }) + } +} + +func testAST(t *testing.T, expected, actual CPTree) { + t.Helper() + + aTy := reflect.TypeOf(actual) + eTy := reflect.TypeOf(expected) + if eTy != aTy { + t.Fatalf("unexpected node: want: %+v, got: %+v", eTy, aTy) + } + + if actual == nil { + return + } + + switch e := expected.(type) { + case *symbolNode: + a := actual.(*symbolNode) + if a.From != e.From || a.To != e.To { + t.Fatalf("unexpected node: want: %+v, got: %+v", e, a) + } + } + eLeft, eRight := expected.children() + aLeft, aRight := actual.children() + testAST(t, eLeft, aLeft) + testAST(t, eRight, aRight) +} + +func TestExclude(t *testing.T) { + for _, test := range []struct { + caption string + target CPTree + base CPTree + result CPTree + }{ + // t.From > b.From && t.To < b.To + + // |t.From - b.From| = 1 + // |b.To - t.To| = 1 + // + // Target (t): +--+ + // Base (b): +--+--+--+ + // Result (b - t): +--+ +--+ + { + caption: "|t.From - b.From| = 1 && |b.To - t.To| = 1", + target: newSymbolNode('1'), + base: newRangeSymbolNode('0', '2'), + result: newAltNode( + newSymbolNode('0'), + newSymbolNode('2'), + ), + }, + // |t.From - b.From| > 1 + // |b.To - t.To| > 1 + // + // Target (t): +--+ + // Base (b): +--+--+--+--+--+ + // Result (b - t): +--+--+ +--+--+ + { + caption: "|t.From - b.From| > 1 && |b.To - t.To| > 1", + target: newSymbolNode('2'), + base: newRangeSymbolNode('0', '4'), + result: newAltNode( + newRangeSymbolNode('0', '1'), + newRangeSymbolNode('3', '4'), + ), + }, + + // t.From <= b.From && t.To >= b.From && t.To < b.To + + // |b.From - t.From| = 0 + // |t.To - b.From| = 0 + // |b.To - t.To| = 1 + // + // Target (t): +--+ + // Base (b): +--+--+ + // Result (b - t): +--+ + { + caption: "|b.From - t.From| = 0 && |t.To - b.From| = 0 && |b.To - t.To| = 1", + target: newSymbolNode('0'), + base: newRangeSymbolNode('0', '1'), + result: newSymbolNode('1'), + }, + // |b.From - t.From| = 0 + // |t.To - b.From| = 0 + // |b.To - t.To| > 1 + // + // Target (t): +--+ + // Base (b): +--+--+--+ + // Result (b - t): +--+--+ + { + caption: "|b.From - t.From| = 0 && |t.To - b.From| = 0 && |b.To - t.To| > 1", + target: newSymbolNode('0'), + base: newRangeSymbolNode('0', '2'), + result: newRangeSymbolNode('1', '2'), + }, + // |b.From - t.From| = 0 + // |t.To - b.From| > 0 + // |b.To - t.To| = 1 + // + // Target (t): +--+--+ + // Base (b): +--+--+--+ + // Result (b - t): +--+ + { + caption: "|b.From - t.From| = 0 && |t.To - b.From| > 0 && |b.To - t.To| = 1", + target: newRangeSymbolNode('0', '1'), + base: newRangeSymbolNode('0', '2'), + result: newSymbolNode('2'), + }, + // |b.From - t.From| = 0 + // |t.To - b.From| > 0 + // |b.To - t.To| > 1 + // + // Target (t): +--+--+ + // Base (b): +--+--+--+--+ + // Result (b - t): +--+--+ + { + caption: "|b.From - t.From| = 0 && |t.To - b.From| > 0 && |b.To - t.To| > 1", + target: newRangeSymbolNode('0', '1'), + base: newRangeSymbolNode('0', '3'), + result: newRangeSymbolNode('2', '3'), + }, + // |b.From - t.From| > 0 + // |t.To - b.From| = 0 + // |b.To - t.To| = 1 + // + // Target (t): +--+--+ + // Base (b): +--+--+ + // Result (b - t): +--+ + { + caption: "|b.From - t.From| > 0 && |t.To - b.From| = 0 && |b.To - t.To| = 1", + target: newRangeSymbolNode('0', '1'), + base: newRangeSymbolNode('1', '2'), + result: newSymbolNode('2'), + }, + // |b.From - t.From| > 0 + // |t.To - b.From| = 0 + // |b.To - t.To| > 1 + // + // Target (t): +--+--+ + // Base (b): +--+--+--+ + // Result (b - t): +--+--+ + { + caption: "|b.From - t.From| > 0 && |t.To - b.From| = 0 && |b.To - t.To| > 1", + target: newRangeSymbolNode('0', '1'), + base: newRangeSymbolNode('1', '3'), + result: newRangeSymbolNode('2', '3'), + }, + // |b.From - t.From| > 0 + // |t.To - b.From| > 0 + // |b.To - t.To| = 1 + // + // Target (t): +--+--+--+ + // Base (b): +--+--+--+ + // Result (b - t): +--+ + { + caption: "|b.From - t.From| > 0 && |t.To - b.From| > 0 && |b.To - t.To| = 1", + target: newRangeSymbolNode('0', '2'), + base: newRangeSymbolNode('1', '3'), + result: newSymbolNode('3'), + }, + // |b.From - t.From| > 0 + // |t.To - b.From| > 0 + // |b.To - t.To| > 1 + // + // Target (t): +--+--+--+ + // Base (b): +--+--+--+--+ + // Result (b - t): +--+--+ + { + caption: "|b.From - t.From| > 0 && |t.To - b.From| > 0 && |b.To - t.To| > 1", + target: newRangeSymbolNode('0', '2'), + base: newRangeSymbolNode('1', '4'), + result: newRangeSymbolNode('3', '4'), + }, + + // t.From > b.From && t.From <= b.To && t.To >= b.To + + // |t.From - b.From| = 1 + // |b.To - t.From| = 0 + // |t.To - b.To| = 0 + // + // Target (t): +--+ + // Base (b): +--+--+ + // Result (b - t): +--+ + { + caption: "|t.From - b.From| = 1 && |b.To - t.From| = 0 && |t.To - b.To| = 0", + target: newSymbolNode('1'), + base: newRangeSymbolNode('0', '1'), + result: newSymbolNode('0'), + }, + // |t.From - b.From| = 1 + // |b.To - t.From| = 0 + // |t.To - b.To| > 0 + // + // Target (t): +--+--+ + // Base (b): +--+--+ + // Result (b - t): +--+ + { + caption: "|t.From - b.From| = 1 && |b.To - t.From| = 0 && |t.To - b.To| > 0", + target: newRangeSymbolNode('1', '2'), + base: newRangeSymbolNode('0', '1'), + result: newSymbolNode('0'), + }, + // |t.From - b.From| = 1 + // |b.To - t.From| > 0 + // |t.To - b.To| = 0 + // + // Target (t): +--+--+ + // Base (b): +--+--+--+ + // Result (b - t): +--+ + { + caption: "|t.From - b.From| = 1 && |b.To - t.From| > 0 && |t.To - b.To| = 0", + target: newRangeSymbolNode('1', '2'), + base: newRangeSymbolNode('0', '2'), + result: newSymbolNode('0'), + }, + // |t.From - b.From| = 1 + // |b.To - t.From| > 0 + // |t.To - b.To| > 0 + // + // Target (t): +--+--+--+ + // Base (b): +--+--+--+ + // Result (b - t): +--+ + { + caption: "|t.From - b.From| = 1 && |b.To - t.From| > 0 && |t.To - b.To| > 0", + target: newRangeSymbolNode('1', '3'), + base: newRangeSymbolNode('0', '2'), + result: newSymbolNode('0'), + }, + // |t.From - b.From| > 1 + // |b.To - t.From| = 0 + // |t.To - b.To| = 0 + // + // Target (t): +--+ + // Base (b): +--+--+--+ + // Result (b - t): +--+--+ + { + caption: "|t.From - b.From| > 1 && |b.To - t.From| = 0 && |t.To - b.To| = 0", + target: newSymbolNode('2'), + base: newRangeSymbolNode('0', '2'), + result: newRangeSymbolNode('0', '1'), + }, + // |t.From - b.From| > 1 + // |b.To - t.From| = 0 + // |t.To - b.To| > 0 + // + // Target (t): +--+--+ + // Base (b): +--+--+--+ + // Result (b - t): +--+--+ + { + caption: "|t.From - b.From| > 1 && |b.To - t.From| = 0 && |t.To - b.To| > 0", + target: newRangeSymbolNode('2', '3'), + base: newRangeSymbolNode('0', '2'), + result: newRangeSymbolNode('0', '1'), + }, + // |t.From - b.From| > 1 + // |b.To - t.From| > 0 + // |t.To - b.To| = 0 + // + // Target (t): +--+--+ + // Base (b): +--+--+--+--+ + // Result (b - t): +--+--+ + { + caption: "|t.From - b.From| > 1 && |b.To - t.From| > 0 && |t.To - b.To| = 0", + target: newRangeSymbolNode('2', '3'), + base: newRangeSymbolNode('0', '3'), + result: newRangeSymbolNode('0', '1'), + }, + // |t.From - b.From| > 1 + // |b.To - t.From| > 0 + // |t.To - b.To| > 0 + // + // Target (t): +--+--+--+ + // Base (b): +--+--+--+--+ + // Result (b - t): +--+--+ + { + caption: "|t.From - b.From| > 1 && |b.To - t.From| > 0 && |t.To - b.To| > 0", + target: newRangeSymbolNode('2', '4'), + base: newRangeSymbolNode('0', '3'), + result: newRangeSymbolNode('0', '1'), + }, + + // t.From <= b.From && t.To >= b.To + + // |b.From - t.From| = 0 + // |t.To - b.To| = 0 + // + // Target (t): +--+ + // Base (b): +--+ + // Result (b - t): N/A + { + caption: "|b.From - t.From| = 0 && |t.To - b.To| = 0", + target: newSymbolNode('0'), + base: newSymbolNode('0'), + result: nil, + }, + // |b.From - t.From| = 0 + // |t.To - b.To| > 0 + // + // Target (t): +--+--+ + // Base (b): +--+ + // Result (b - t): N/A + { + caption: "|b.From - t.From| = 0 && |t.To - b.To| > 0", + target: newRangeSymbolNode('0', '1'), + base: newSymbolNode('0'), + result: nil, + }, + // |b.From - t.From| > 0 + // |t.To - b.To| = 0 + // + // Target (t): +--+--+ + // Base (b): +--+ + // Result (b - t): N/A + { + caption: "|b.From - t.From| > 0 && |t.To - b.To| = 0", + target: newRangeSymbolNode('0', '1'), + base: newSymbolNode('1'), + result: nil, + }, + // |b.From - t.From| > 0 + // |t.To - b.To| > 0 + // + // Target (t): +--+--+--+ + // Base (b): +--+ + // Result (b - t): N/A + { + caption: "|b.From - t.From| > 0 && |t.To - b.To| > 0", + target: newRangeSymbolNode('0', '2'), + base: newSymbolNode('1'), + result: nil, + }, + + // Others + + // |b.From - t.From| = 1 + // + // Target (t): +--+ + // Base (b): +--+ + // Result (b - t): +--+ + { + caption: "|b.From - t.From| = 1", + target: newSymbolNode('0'), + base: newSymbolNode('1'), + result: newSymbolNode('1'), + }, + // |b.From - t.From| > 1 + // + // Target (t): +--+ + // Base (b): +--+ + // Result (b - t): +--+ + { + caption: "|b.From - t.From| > 1", + target: newSymbolNode('0'), + base: newSymbolNode('2'), + result: newSymbolNode('2'), + }, + // |t.To - b.To| = 1 + // + // Target (t): +--+ + // Base (b): +--+ + // Result (b - t): +--+ + { + caption: "|t.To - b.To| = 1", + target: newSymbolNode('1'), + base: newSymbolNode('0'), + result: newSymbolNode('0'), + }, + // |t.To - b.To| > 1 + // + // Target (t): +--+ + // Base (b): +--+ + // Result (b - t): +--+ + { + caption: "|t.To - b.To| > 1", + target: newSymbolNode('2'), + base: newSymbolNode('0'), + result: newSymbolNode('0'), + }, + } { + t.Run(test.caption, func(t *testing.T) { + r := exclude(test.target, test.base) + testAST(t, test.result, r) + }) + } +} + func MainTest() { @@ -582,6 +2482,10 @@ func MainTest() { { "TestSnakeCaseToUpperCamelCase", TestSnakeCaseToUpperCamelCase }, { "TestFindSpellingInconsistencies", TestFindSpellingInconsistencies }, { "TestLexSpec_Validate", TestLexSpec_Validate }, + { "TestLexer", TestLexer }, + { "TestParse", TestParse }, + { "TestParse_ContributoryPropertyIsNotExposed", TestParse_ContributoryPropertyIsNotExposed }, + { "TestExclude", TestExclude }, } deps := testdeps.TestDeps{} |