aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRyo Nihei <nihei.dev@gmail.com>2021-12-05 13:17:32 +0900
committerRyo Nihei <nihei.dev@gmail.com>2021-12-10 01:50:24 +0900
commit19b68a5ca013c1ff7562e608db7964973fd691b2 (patch)
treee43ac01d0d6f41b4d18e4adf357b3662517806f9
parentMove UTF8-related processes to utf8 package (diff)
downloadtre-19b68a5ca013c1ff7562e608db7964973fd691b2.tar.gz
tre-19b68a5ca013c1ff7562e608db7964973fd691b2.tar.xz
Add a new parser that constructs a tree representing characters as code points, not byte sequences
Diffstat (limited to '')
-rw-r--r--compiler/parser/error.go33
-rw-r--r--compiler/parser/fragment.go72
-rw-r--r--compiler/parser/lexer.go593
-rw-r--r--compiler/parser/lexer_test.go524
-rw-r--r--compiler/parser/parser.go521
-rw-r--r--compiler/parser/parser_test.go940
-rw-r--r--compiler/parser/tree.go451
7 files changed, 3134 insertions, 0 deletions
diff --git a/compiler/parser/error.go b/compiler/parser/error.go
new file mode 100644
index 0000000..a90e3b3
--- /dev/null
+++ b/compiler/parser/error.go
@@ -0,0 +1,33 @@
+package parser
+
+import "fmt"
+
+// Error values used by the lexer and the parser of this package.
+var (
+ // ParseErr is the generic error value returned when a pattern cannot be processed. The lexer
+ // returns it together with a more specific cause (one of the values below) stored in its
+ // errCause field.
+ ParseErr = fmt.Errorf("parse error")
+
+ // lexical errors
+ synErrIncompletedEscSeq = fmt.Errorf("incompleted escape sequence; unexpected EOF following \\")
+ synErrInvalidEscSeq = fmt.Errorf("invalid escape sequence")
+ synErrInvalidCodePoint = fmt.Errorf("code points must consist of just 4 or 6 hex digits")
+ synErrCharPropInvalidSymbol = fmt.Errorf("invalid character property symbol")
+ // NOTE(review): this is the only exported cause in the group; presumably other packages
+ // compare against it — confirm before changing its visibility.
+ SynErrFragmentInvalidSymbol = fmt.Errorf("invalid fragment symbol")
+
+ // syntax errors
+ synErrUnexpectedToken = fmt.Errorf("unexpected token")
+ synErrNullPattern = fmt.Errorf("a pattern must be a non-empty byte sequence")
+ synErrAltLackOfOperand = fmt.Errorf("an alternation expression must have operands")
+ synErrRepNoTarget = fmt.Errorf("a repeat expression must have an operand")
+ synErrGroupNoElem = fmt.Errorf("a grouping expression must include at least one character")
+ synErrGroupUnclosed = fmt.Errorf("unclosed grouping expression")
+ synErrGroupNoInitiator = fmt.Errorf(") needs preceding (")
+ synErrGroupInvalidForm = fmt.Errorf("invalid grouping expression")
+ synErrBExpNoElem = fmt.Errorf("a bracket expression must include at least one character")
+ synErrBExpUnclosed = fmt.Errorf("unclosed bracket expression")
+ synErrBExpInvalidForm = fmt.Errorf("invalid bracket expression")
+ synErrRangeInvalidOrder = fmt.Errorf("a range expression with invalid order")
+ synErrCPExpInvalidForm = fmt.Errorf("invalid code point expression")
+ synErrCPExpOutOfRange = fmt.Errorf("a code point must be between U+0000 to U+10FFFF")
+ synErrCharPropExpInvalidForm = fmt.Errorf("invalid character property expression")
+ synErrCharPropUnsupported = fmt.Errorf("unsupported character property")
+ synErrFragmentExpInvalidForm = fmt.Errorf("invalid fragment expression")
+)
diff --git a/compiler/parser/fragment.go b/compiler/parser/fragment.go
new file mode 100644
index 0000000..5680b55
--- /dev/null
+++ b/compiler/parser/fragment.go
@@ -0,0 +1,72 @@
+package parser
+
+import (
+ "fmt"
+
+ "github.com/nihei9/maleeni/spec"
+)
+
+// incompleteFragment remembers a fragment whose tree still reports incomplete(), together with
+// the kind name it is registered under.
+type incompleteFragment struct {
+	kind spec.LexKindName
+	root *rootNode
+}
+
+// CompleteFragments resolves the fragments against each other in place.
+//
+// Every value of fragments must be a *rootNode; otherwise an error is returned. Fragments that are
+// already complete are applied to the incomplete ones, pass after pass, and each fragment that
+// becomes complete is made available to the remaining ones. If a whole pass completes nothing,
+// ParseErr is returned.
+func CompleteFragments(fragments map[spec.LexKindName]CPTree) error {
+	if len(fragments) == 0 {
+		return nil
+	}
+
+	// Split the input into fragments that are ready to use and those that are still pending.
+	done := map[spec.LexKindName]CPTree{}
+	pending := []*incompleteFragment{}
+	for kind, tree := range fragments {
+		root, isRoot := tree.(*rootNode)
+		if !isRoot {
+			return fmt.Errorf("CompleteFragments can take only *rootNode: %T", tree)
+		}
+		if !root.incomplete() {
+			done[kind] = root
+			continue
+		}
+		pending = append(pending, &incompleteFragment{
+			kind: kind,
+			root: root,
+		})
+	}
+
+	for len(pending) > 0 {
+		unresolved := []*incompleteFragment{}
+		for _, frag := range pending {
+			complete, err := ApplyFragments(frag.root, done)
+			if err != nil {
+				return err
+			}
+			if complete {
+				done[frag.kind] = frag.root
+				continue
+			}
+			unresolved = append(unresolved, frag)
+		}
+		// No fragment was completed in this pass, so another pass cannot make progress either.
+		if len(unresolved) == len(pending) {
+			return ParseErr
+		}
+		pending = unresolved
+	}
+
+	return nil
+}
+
+// ApplyFragments applies every entry of fragments to the tree t, which must be a *rootNode.
+//
+// It reports whether the tree is complete afterwards (that is, root.incomplete() is false). An
+// error is returned when t is not a *rootNode or when applying a fragment fails.
+func ApplyFragments(t CPTree, fragments map[spec.LexKindName]CPTree) (bool, error) {
+	root, isRoot := t.(*rootNode)
+	if !isRoot {
+		return false, fmt.Errorf("ApplyFragments can take only *rootNode type: %T", t)
+	}
+
+	for kind, fragment := range fragments {
+		if err := root.applyFragment(kind, fragment); err != nil {
+			return false, err
+		}
+	}
+
+	complete := !root.incomplete()
+	return complete, nil
+}
diff --git a/compiler/parser/lexer.go b/compiler/parser/lexer.go
new file mode 100644
index 0000000..d2237a5
--- /dev/null
+++ b/compiler/parser/lexer.go
@@ -0,0 +1,593 @@
+package parser
+
+import (
+ "bufio"
+ "fmt"
+ "io"
+ "strings"
+)
+
+// tokenKind identifies the category of a token produced by the lexer.
+type tokenKind string
+
+// Token kinds. Kinds that stand for fixed input text use that text as their value; the remaining
+// kinds use a descriptive name.
+const (
+ tokenKindChar tokenKind = "char"
+ tokenKindAnyChar tokenKind = "."
+ tokenKindRepeat tokenKind = "*"
+ tokenKindRepeatOneOrMore tokenKind = "+"
+ tokenKindOption tokenKind = "?"
+ tokenKindAlt tokenKind = "|"
+ tokenKindGroupOpen tokenKind = "("
+ tokenKindGroupClose tokenKind = ")"
+ tokenKindBExpOpen tokenKind = "["
+ tokenKindInverseBExpOpen tokenKind = "[^"
+ tokenKindBExpClose tokenKind = "]"
+ tokenKindCharRange tokenKind = "-"
+ tokenKindCodePointLeader tokenKind = "\\u"
+ tokenKindCharPropLeader tokenKind = "\\p"
+ tokenKindFragmentLeader tokenKind = "\\f"
+ tokenKindLBrace tokenKind = "{"
+ tokenKindRBrace tokenKind = "}"
+ tokenKindEqual tokenKind = "="
+ tokenKindCodePoint tokenKind = "code point"
+ tokenKindCharPropSymbol tokenKind = "character property symbol"
+ tokenKindFragmentSymbol tokenKind = "fragment symbol"
+ tokenKindEOF tokenKind = "eof"
+)
+
+// token is a single lexical unit. Which payload field is meaningful depends on kind: the
+// constructors below fill in char, codePoint, propSymbol or fragmentSymbol respectively and leave
+// the others at their zero values.
+type token struct {
+	kind           tokenKind
+	char           rune
+	propSymbol     string
+	codePoint      string
+	fragmentSymbol string
+}
+
+// nullChar is the placeholder rune used where no character applies.
+const nullChar = '\u0000'
+
+// newToken builds a token of the given kind that carries the character char.
+func newToken(kind tokenKind, char rune) *token {
+	tok := new(token)
+	tok.kind = kind
+	tok.char = char
+	return tok
+}
+
+// newCodePointToken builds a tokenKindCodePoint token carrying the hex digit string codePoint.
+func newCodePointToken(codePoint string) *token {
+	tok := new(token)
+	tok.kind = tokenKindCodePoint
+	tok.codePoint = codePoint
+	return tok
+}
+
+// newCharPropSymbolToken builds a tokenKindCharPropSymbol token carrying propSymbol.
+func newCharPropSymbolToken(propSymbol string) *token {
+	tok := new(token)
+	tok.kind = tokenKindCharPropSymbol
+	tok.propSymbol = propSymbol
+	return tok
+}
+
+// newFragmentSymbolToken builds a tokenKindFragmentSymbol token carrying fragmentSymbol.
+func newFragmentSymbolToken(fragmentSymbol string) *token {
+	tok := new(token)
+	tok.kind = tokenKindFragmentSymbol
+	tok.fragmentSymbol = fragmentSymbol
+	return tok
+}
+
+// lexerMode names the context the lexer is currently scanning in.
+type lexerMode string
+
+const (
+	lexerModeDefault     lexerMode = "default"
+	lexerModeBExp        lexerMode = "bracket expression"
+	lexerModeCPExp       lexerMode = "code point expression"
+	lexerModeCharPropExp lexerMode = "character property expression"
+	lexerModeFragmentExp lexerMode = "fragment expression"
+)
+
+// lexerModeStack is a LIFO stack of lexer modes; the last element of stack is the active mode.
+type lexerModeStack struct {
+	stack []lexerMode
+}
+
+// newLexerModeStack returns a stack whose only element is lexerModeDefault.
+func newLexerModeStack() *lexerModeStack {
+	s := &lexerModeStack{}
+	s.push(lexerModeDefault)
+	return s
+}
+
+// top returns the active mode without removing it.
+func (s *lexerModeStack) top() lexerMode {
+	last := len(s.stack) - 1
+	return s.stack[last]
+}
+
+// push makes m the active mode.
+func (s *lexerModeStack) push(m lexerMode) {
+	s.stack = append(s.stack, m)
+}
+
+// pop discards the active mode, re-activating the one below it.
+func (s *lexerModeStack) pop() {
+	last := len(s.stack) - 1
+	s.stack = s.stack[:last]
+}
+
+// rangeState tracks, while the lexer is inside a bracket expression, whether a '-' should be
+// read as the character range operator or as an ordinary character.
+type rangeState string
+
+// The states the lexer is in after reading each character of a range:
+//
+// [a-z]
+//  ^^^^
+//  |||`-- ready
+//  ||`-- expect range terminator
+//  |`-- read range initiator
+//  `-- ready
+const (
+ rangeStateReady rangeState = "ready"
+ rangeStateReadRangeInitiator rangeState = "read range initiator"
+ rangeStateExpectRangeTerminator rangeState = "expect range terminator"
+)
+
+// lexer splits a pattern read from src into tokens. It keeps a two-character history and a
+// two-character push-back buffer so that read() can be undone by restore() up to twice in a row.
+type lexer struct {
+ src *bufio.Reader
+ // peekChar1/peekEOF1 hold the most recently pushed-back character (returned by the next
+ // read()); peekChar2/peekEOF2 hold the one pushed back before it.
+ peekChar2 rune
+ peekEOF2 bool
+ peekChar1 rune
+ peekEOF1 bool
+ // lastChar/reachedEOF describe the character most recently returned by read().
+ lastChar rune
+ reachedEOF bool
+ // prevChar1/prevEOF1 and prevChar2/pervEOF2 are the two characters read before lastChar;
+ // restore() moves them back into lastChar.
+ prevChar1 rune
+ prevEOF1 bool
+ prevChar2 rune
+ pervEOF2 bool
+ // modeStack selects which next* method scans the next token.
+ modeStack *lexerModeStack
+ // rangeState decides how '-' is interpreted inside a bracket expression.
+ rangeState rangeState
+
+ // errCause and errDetail describe the failure when a next* method returns ParseErr.
+ errCause error
+ errDetail string
+}
+
+// newLexer returns a lexer that reads the pattern from src. The lexer starts in the default mode
+// with an empty history and an empty push-back buffer.
+func newLexer(src io.Reader) *lexer {
+	// Every character slot starts as nullChar (the zero rune) and every EOF flag as false, which
+	// are exactly the zero values of the struct, so only the remaining fields are set here.
+	l := new(lexer)
+	l.src = bufio.NewReader(src)
+	l.modeStack = newLexerModeStack()
+	l.rangeState = rangeStateReady
+	return l
+}
+
+// error returns the detail message and the cause recorded for the last lexical error.
+func (l *lexer) error() (string, error) {
+	detail, cause := l.errDetail, l.errCause
+	return detail, cause
+}
+
+func (l *lexer) next() (*token, error) {
+ c, eof, err := l.read()
+ if err != nil {
+ return nil, err
+ }
+ if eof {
+ return newToken(tokenKindEOF, nullChar), nil
+ }
+
+ switch l.modeStack.top() {
+ case lexerModeBExp:
+ tok, err := l.nextInBExp(c)
+ if err != nil {
+ return nil, err
+ }
+ switch tok.kind {
+ case tokenKindBExpClose:
+ l.modeStack.pop()
+ case tokenKindCharRange:
+ l.rangeState = rangeStateExpectRangeTerminator
+ case tokenKindChar:
+ switch l.rangeState {
+ case rangeStateReady:
+ l.rangeState = rangeStateReadRangeInitiator
+ case rangeStateExpectRangeTerminator:
+ l.rangeState = rangeStateReady
+ }
+ case tokenKindCodePointLeader:
+ l.modeStack.push(lexerModeCPExp)
+ case tokenKindCharPropLeader:
+ l.modeStack.push(lexerModeCharPropExp)
+ }
+ return tok, nil
+ case lexerModeCPExp:
+ tok, err := l.nextInCodePoint(c)
+ if err != nil {
+ return nil, err
+ }
+ switch tok.kind {
+ case tokenKindRBrace:
+ l.modeStack.pop()
+ }
+ return tok, nil
+ case lexerModeCharPropExp:
+ tok, err := l.nextInCharProp(c)
+ if err != nil {
+ return nil, err
+ }
+ switch tok.kind {
+ case tokenKindRBrace:
+ l.modeStack.pop()
+ }
+ return tok, nil
+ case lexerModeFragmentExp:
+ tok, err := l.nextInFragment(c)
+ if err != nil {
+ return nil, err
+ }
+ switch tok.kind {
+ case tokenKindRBrace:
+ l.modeStack.pop()
+ }
+ return tok, nil
+ default:
+ tok, err := l.nextInDefault(c)
+ if err != nil {
+ return nil, err
+ }
+ switch tok.kind {
+ case tokenKindBExpOpen:
+ l.modeStack.push(lexerModeBExp)
+ l.rangeState = rangeStateReady
+ case tokenKindInverseBExpOpen:
+ l.modeStack.push(lexerModeBExp)
+ l.rangeState = rangeStateReady
+ case tokenKindCodePointLeader:
+ l.modeStack.push(lexerModeCPExp)
+ case tokenKindCharPropLeader:
+ l.modeStack.push(lexerModeCharPropExp)
+ case tokenKindFragmentLeader:
+ l.modeStack.push(lexerModeFragmentExp)
+ }
+ return tok, nil
+ }
+}
+
+// nextInDefault scans the token that starts with c in the default mode.
+//
+// Operators map to their own token kinds, '[' opens a (possibly inverse) bracket expression, a
+// backslash starts an escape sequence, and every other character is an ordinary character. On an
+// incomplete or unsupported escape sequence the cause is recorded and ParseErr is returned.
+func (l *lexer) nextInDefault(c rune) (*token, error) {
+	switch c {
+	case '*':
+		return newToken(tokenKindRepeat, nullChar), nil
+	case '+':
+		return newToken(tokenKindRepeatOneOrMore, nullChar), nil
+	case '?':
+		return newToken(tokenKindOption, nullChar), nil
+	case '.':
+		return newToken(tokenKindAnyChar, nullChar), nil
+	case '|':
+		return newToken(tokenKindAlt, nullChar), nil
+	case '(':
+		return newToken(tokenKindGroupOpen, nullChar), nil
+	case ')':
+		return newToken(tokenKindGroupClose, nullChar), nil
+	case '[':
+		// Look ahead up to two characters to tell "[", "[^" and "[^]" apart.
+		first, eof, err := l.read()
+		if err != nil {
+			return nil, err
+		}
+		if eof || first != '^' {
+			if err := l.restore(); err != nil {
+				return nil, err
+			}
+			return newToken(tokenKindBExpOpen, nullChar), nil
+		}
+		second, eof, err := l.read()
+		if err != nil {
+			return nil, err
+		}
+		// Whatever follows '^' is never part of the opening token, so it is always pushed back.
+		if err := l.restore(); err != nil {
+			return nil, err
+		}
+		if eof || second != ']' {
+			return newToken(tokenKindInverseBExpOpen, nullChar), nil
+		}
+		// "[^]": the caret is the sole element of an ordinary bracket expression, so it is
+		// pushed back as well and scanned as a character later.
+		if err := l.restore(); err != nil {
+			return nil, err
+		}
+		return newToken(tokenKindBExpOpen, nullChar), nil
+	case '\\':
+		esc, eof, err := l.read()
+		if err != nil {
+			return nil, err
+		}
+		if eof {
+			l.errCause = synErrIncompletedEscSeq
+			return nil, ParseErr
+		}
+		switch esc {
+		case 'u':
+			return newToken(tokenKindCodePointLeader, nullChar), nil
+		case 'p':
+			return newToken(tokenKindCharPropLeader, nullChar), nil
+		case 'f':
+			return newToken(tokenKindFragmentLeader, nullChar), nil
+		case '\\', '.', '*', '+', '?', '|', '(', ')', '[', ']':
+			return newToken(tokenKindChar, esc), nil
+		}
+		l.errCause = synErrInvalidEscSeq
+		l.errDetail = fmt.Sprintf("\\%v is not supported", string(esc))
+		return nil, ParseErr
+	}
+	return newToken(tokenKindChar, c), nil
+}
+
+// nextInBExp scans the token that starts with c inside a bracket expression.
+//
+// A '-' is the range operator only when a range initiator has just been read and the hyphen is
+// followed by something other than ']' or the end of input; otherwise it is an ordinary
+// character. On an incomplete or unsupported escape sequence the cause is recorded and ParseErr
+// is returned.
+func (l *lexer) nextInBExp(c rune) (*token, error) {
+	switch c {
+	case '-':
+		if l.rangeState != rangeStateReadRangeInitiator {
+			return newToken(tokenKindChar, c), nil
+		}
+		// Peek at the following character; it is pushed back in every case.
+		following, eof, err := l.read()
+		if err != nil {
+			return nil, err
+		}
+		if err := l.restore(); err != nil {
+			return nil, err
+		}
+		if eof || following == ']' {
+			return newToken(tokenKindChar, c), nil
+		}
+		return newToken(tokenKindCharRange, nullChar), nil
+	case ']':
+		return newToken(tokenKindBExpClose, nullChar), nil
+	case '\\':
+		esc, eof, err := l.read()
+		if err != nil {
+			return nil, err
+		}
+		if eof {
+			l.errCause = synErrIncompletedEscSeq
+			return nil, ParseErr
+		}
+		switch esc {
+		case 'u':
+			return newToken(tokenKindCodePointLeader, nullChar), nil
+		case 'p':
+			return newToken(tokenKindCharPropLeader, nullChar), nil
+		case '\\', '^', '-', ']':
+			return newToken(tokenKindChar, esc), nil
+		}
+		l.errCause = synErrInvalidEscSeq
+		l.errDetail = fmt.Sprintf("\\%v is not supported in a bracket expression", string(esc))
+		return nil, ParseErr
+	}
+	return newToken(tokenKindChar, c), nil
+}
+
+// nextInCodePoint scans the token that starts with c inside a code point expression (\u{...}).
+//
+// '{' and '}' yield brace tokens. Any other character must start a run of hex digits, which is
+// read up to (but not including) the next '}' or the end of input and must be exactly 4 or 6
+// digits long. Otherwise synErrInvalidCodePoint is recorded as the cause and ParseErr is returned.
+func (l *lexer) nextInCodePoint(c rune) (*token, error) {
+	switch c {
+	case '{':
+		return newToken(tokenKindLBrace, nullChar), nil
+	case '}':
+		return newToken(tokenKindRBrace, nullChar), nil
+	}
+
+	if !isHexDigit(c) {
+		l.errCause = synErrInvalidCodePoint
+		return nil, ParseErr
+	}
+	var b strings.Builder
+	b.WriteRune(c)
+	n := 1
+	for {
+		c, eof, err := l.read()
+		if err != nil {
+			return nil, err
+		}
+		if eof || c == '}' {
+			// Leave the terminator for the next call. The error returned by restore is checked
+			// on both paths; previously the EOF path discarded it and tested a stale err.
+			if err := l.restore(); err != nil {
+				return nil, err
+			}
+			break
+		}
+		if !isHexDigit(c) || n >= 6 {
+			l.errCause = synErrInvalidCodePoint
+			return nil, ParseErr
+		}
+		b.WriteRune(c)
+		n++
+	}
+	cp := b.String()
+	if len(cp) != 4 && len(cp) != 6 {
+		l.errCause = synErrInvalidCodePoint
+		return nil, ParseErr
+	}
+	return newCodePointToken(cp), nil
+}
+
+// isHexDigit reports whether c is an ASCII hexadecimal digit: 0-9, A-F or a-f.
+//
+// The letter ranges stop at F/f. Accepting every ASCII letter would let strings such as "GGGG"
+// pass as code points although they are not hex digits.
+func isHexDigit(c rune) bool {
+	switch {
+	case '0' <= c && c <= '9', 'A' <= c && c <= 'F', 'a' <= c && c <= 'f':
+		return true
+	}
+	return false
+}
+
+// nextInCharProp scans the token that starts with c inside a character property expression
+// (\p{...}).
+//
+// '{', '}' and '=' yield their own tokens. Any other character starts a symbol that extends up to
+// (but not including) the next '}' or '=' or the end of input. Surrounding white space is trimmed;
+// if nothing remains, synErrCharPropInvalidSymbol is recorded and ParseErr is returned.
+func (l *lexer) nextInCharProp(c rune) (*token, error) {
+	switch c {
+	case '{':
+		return newToken(tokenKindLBrace, nullChar), nil
+	case '}':
+		return newToken(tokenKindRBrace, nullChar), nil
+	case '=':
+		return newToken(tokenKindEqual, nullChar), nil
+	}
+
+	var b strings.Builder
+	b.WriteRune(c)
+	for {
+		c, eof, err := l.read()
+		if err != nil {
+			return nil, err
+		}
+		if eof || c == '}' || c == '=' {
+			// Leave the terminator for the next call. The error returned by restore is checked
+			// on both paths; previously the EOF path discarded it and tested a stale err.
+			if err := l.restore(); err != nil {
+				return nil, err
+			}
+			break
+		}
+		b.WriteRune(c)
+	}
+	sym := strings.TrimSpace(b.String())
+	if len(sym) == 0 {
+		l.errCause = synErrCharPropInvalidSymbol
+		return nil, ParseErr
+	}
+	return newCharPropSymbolToken(sym), nil
+}
+
+// nextInFragment scans the token that starts with c inside a fragment expression (\f{...}).
+//
+// '{' and '}' yield brace tokens. Any other character starts a symbol that extends up to (but not
+// including) the next '}' or the end of input. Surrounding white space is trimmed; if nothing
+// remains, SynErrFragmentInvalidSymbol is recorded and ParseErr is returned.
+func (l *lexer) nextInFragment(c rune) (*token, error) {
+	switch c {
+	case '{':
+		return newToken(tokenKindLBrace, nullChar), nil
+	case '}':
+		return newToken(tokenKindRBrace, nullChar), nil
+	}
+
+	var b strings.Builder
+	b.WriteRune(c)
+	for {
+		c, eof, err := l.read()
+		if err != nil {
+			return nil, err
+		}
+		if eof || c == '}' {
+			// Leave the terminator for the next call. The error returned by restore is checked
+			// on both paths; previously the EOF path discarded it and tested a stale err.
+			if err := l.restore(); err != nil {
+				return nil, err
+			}
+			break
+		}
+		b.WriteRune(c)
+	}
+	sym := strings.TrimSpace(b.String())
+	if len(sym) == 0 {
+		l.errCause = SynErrFragmentInvalidSymbol
+		return nil, ParseErr
+	}
+	return newFragmentSymbolToken(sym), nil
+}
+
+func (l *lexer) read() (rune, bool, error) {
+ if l.reachedEOF {
+ return l.lastChar, l.reachedEOF, nil
+ }
+ if l.peekChar1 != nullChar || l.peekEOF1 {
+ l.prevChar2 = l.prevChar1
+ l.pervEOF2 = l.prevEOF1
+ l.prevChar1 = l.lastChar
+ l.prevEOF1 = l.reachedEOF
+ l.lastChar = l.peekChar1
+ l.reachedEOF = l.peekEOF1
+ l.peekChar1 = l.peekChar2
+ l.peekEOF1 = l.peekEOF2
+ l.peekChar2 = nullChar
+ l.peekEOF2 = false
+ return l.lastChar, l.reachedEOF, nil
+ }
+ c, _, err := l.src.ReadRune()
+ if err != nil {
+ if err == io.EOF {
+ l.prevChar2 = l.prevChar1
+ l.pervEOF2 = l.prevEOF1
+ l.prevChar1 = l.lastChar
+ l.prevEOF1 = l.reachedEOF
+ l.lastChar = nullChar
+ l.reachedEOF = true
+ return l.lastChar, l.reachedEOF, nil
+ }
+ return nullChar, false, err
+ }
+ l.prevChar2 = l.prevChar1
+ l.pervEOF2 = l.prevEOF1
+ l.prevChar1 = l.lastChar
+ l.prevEOF1 = l.reachedEOF
+ l.lastChar = c
+ l.reachedEOF = false
+ return l.lastChar, l.reachedEOF, nil
+}
+
+// restore undoes the most recent read(): the last character (or the EOF marker) moves to the
+// front of the push-back buffer and the history shifts back by one entry.
+//
+// It returns an error when the last character is nullChar and the end of input has not been
+// reached.
+func (l *lexer) restore() error {
+	nothingRead := l.lastChar == nullChar && !l.reachedEOF
+	if nothingRead {
+		return fmt.Errorf("failed to call restore() because the last character is null")
+	}
+
+	// Push the last character onto the push-back buffer.
+	l.peekChar2, l.peekEOF2 = l.peekChar1, l.peekEOF1
+	l.peekChar1, l.peekEOF1 = l.lastChar, l.reachedEOF
+
+	// Step the history back by one entry.
+	l.lastChar, l.reachedEOF = l.prevChar1, l.prevEOF1
+	l.prevChar1, l.prevEOF1 = l.prevChar2, l.pervEOF2
+	l.prevChar2, l.pervEOF2 = nullChar, false
+	return nil
+}
diff --git a/compiler/parser/lexer_test.go b/compiler/parser/lexer_test.go
new file mode 100644
index 0000000..055466e
--- /dev/null
+++ b/compiler/parser/lexer_test.go
@@ -0,0 +1,524 @@
+package parser
+
+import (
+ "strings"
+ "testing"
+)
+
+func TestLexer(t *testing.T) {
+ tests := []struct {
+ caption string
+ src string
+ tokens []*token
+ err error
+ }{
+ {
+ caption: "lexer can recognize ordinaly characters",
+ src: "123abcいろは",
+ tokens: []*token{
+ newToken(tokenKindChar, '1'),
+ newToken(tokenKindChar, '2'),
+ newToken(tokenKindChar, '3'),
+ newToken(tokenKindChar, 'a'),
+ newToken(tokenKindChar, 'b'),
+ newToken(tokenKindChar, 'c'),
+ newToken(tokenKindChar, 'い'),
+ newToken(tokenKindChar, 'ろ'),
+ newToken(tokenKindChar, 'は'),
+ newToken(tokenKindEOF, nullChar),
+ },
+ },
+ {
+ caption: "lexer can recognize the special characters in default mode",
+ src: ".*+?|()[\\u",
+ tokens: []*token{
+ newToken(tokenKindAnyChar, nullChar),
+ newToken(tokenKindRepeat, nullChar),
+ newToken(tokenKindRepeatOneOrMore, nullChar),
+ newToken(tokenKindOption, nullChar),
+ newToken(tokenKindAlt, nullChar),
+ newToken(tokenKindGroupOpen, nullChar),
+ newToken(tokenKindGroupClose, nullChar),
+ newToken(tokenKindBExpOpen, nullChar),
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindEOF, nullChar),
+ },
+ },
+ {
+ caption: "lexer can recognize the escape sequences in default mode",
+ src: "\\\\\\.\\*\\+\\?\\|\\(\\)\\[",
+ tokens: []*token{
+ newToken(tokenKindChar, '\\'),
+ newToken(tokenKindChar, '.'),
+ newToken(tokenKindChar, '*'),
+ newToken(tokenKindChar, '+'),
+ newToken(tokenKindChar, '?'),
+ newToken(tokenKindChar, '|'),
+ newToken(tokenKindChar, '('),
+ newToken(tokenKindChar, ')'),
+ newToken(tokenKindChar, '['),
+ newToken(tokenKindEOF, nullChar),
+ },
+ },
+ {
+ caption: "], {, and } are treated as an ordinary character in default mode",
+ src: "]{}",
+ tokens: []*token{
+ newToken(tokenKindChar, ']'),
+ newToken(tokenKindChar, '{'),
+ newToken(tokenKindChar, '}'),
+ newToken(tokenKindEOF, nullChar),
+ },
+ },
+ {
+ caption: "lexer can recognize the special characters in bracket expression mode",
+ src: "[a-z\\u{09AF}][^a-z\\u{09abcf}]",
+ tokens: []*token{
+ newToken(tokenKindBExpOpen, nullChar),
+ newToken(tokenKindChar, 'a'),
+ newToken(tokenKindCharRange, nullChar),
+ newToken(tokenKindChar, 'z'),
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ newCodePointToken("09AF"),
+ newToken(tokenKindRBrace, nullChar),
+ newToken(tokenKindBExpClose, nullChar),
+ newToken(tokenKindInverseBExpOpen, nullChar),
+ newToken(tokenKindChar, 'a'),
+ newToken(tokenKindCharRange, nullChar),
+ newToken(tokenKindChar, 'z'),
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ newCodePointToken("09abcf"),
+ newToken(tokenKindRBrace, nullChar),
+ newToken(tokenKindBExpClose, nullChar),
+ newToken(tokenKindEOF, nullChar),
+ },
+ },
+ {
+ caption: "lexer can recognize the escape sequences in bracket expression mode",
+ src: "[\\^a\\-z]",
+ tokens: []*token{
+ newToken(tokenKindBExpOpen, nullChar),
+ newToken(tokenKindChar, '^'),
+ newToken(tokenKindChar, 'a'),
+ newToken(tokenKindChar, '-'),
+ newToken(tokenKindChar, 'z'),
+ newToken(tokenKindBExpClose, nullChar),
+ newToken(tokenKindEOF, nullChar),
+ },
+ },
+ {
+ caption: "in a bracket expression, the special characters are also handled as normal characters",
+ src: "[\\\\.*+?|()[",
+ tokens: []*token{
+ newToken(tokenKindBExpOpen, nullChar),
+ newToken(tokenKindChar, '\\'),
+ newToken(tokenKindChar, '.'),
+ newToken(tokenKindChar, '*'),
+ newToken(tokenKindChar, '+'),
+ newToken(tokenKindChar, '?'),
+ newToken(tokenKindChar, '|'),
+ newToken(tokenKindChar, '('),
+ newToken(tokenKindChar, ')'),
+ newToken(tokenKindChar, '['),
+ newToken(tokenKindEOF, nullChar),
+ },
+ },
+ {
+ caption: "hyphen symbols that appear in bracket expressions are handled as the character range symbol or ordinary characters",
+ // [...-...][...-][-...][-]
+ // ~~~~~~~ ~ ~ ~
+ // ^ ^ ^ ^
+ // | | | `-- Ordinary Character (b)
+ // | | `-- Ordinary Character (b)
+ // | `-- Ordinary Character (b)
+ // `-- Character Range (a)
+ //
+ // a. *-* is handled as a character-range expression.
+ // b. *-, -*, or - are handled as ordinary characters.
+ src: "[a-z][a-][-z][-][--][---][^a-z][^a-][^-z][^-][^--][^---]",
+ tokens: []*token{
+ newToken(tokenKindBExpOpen, nullChar),
+ newToken(tokenKindChar, 'a'),
+ newToken(tokenKindCharRange, nullChar),
+ newToken(tokenKindChar, 'z'),
+ newToken(tokenKindBExpClose, nullChar),
+ newToken(tokenKindBExpOpen, nullChar),
+ newToken(tokenKindChar, 'a'),
+ newToken(tokenKindChar, '-'),
+ newToken(tokenKindBExpClose, nullChar),
+ newToken(tokenKindBExpOpen, nullChar),
+ newToken(tokenKindChar, '-'),
+ newToken(tokenKindChar, 'z'),
+ newToken(tokenKindBExpClose, nullChar),
+ newToken(tokenKindBExpOpen, nullChar),
+ newToken(tokenKindChar, '-'),
+ newToken(tokenKindBExpClose, nullChar),
+ newToken(tokenKindBExpOpen, nullChar),
+ newToken(tokenKindChar, '-'),
+ newToken(tokenKindChar, '-'),
+ newToken(tokenKindBExpClose, nullChar),
+ newToken(tokenKindBExpOpen, nullChar),
+ newToken(tokenKindChar, '-'),
+ newToken(tokenKindCharRange, nullChar),
+ newToken(tokenKindChar, '-'),
+ newToken(tokenKindBExpClose, nullChar),
+
+ newToken(tokenKindInverseBExpOpen, nullChar),
+ newToken(tokenKindChar, 'a'),
+ newToken(tokenKindCharRange, nullChar),
+ newToken(tokenKindChar, 'z'),
+ newToken(tokenKindBExpClose, nullChar),
+ newToken(tokenKindInverseBExpOpen, nullChar),
+ newToken(tokenKindChar, 'a'),
+ newToken(tokenKindChar, '-'),
+ newToken(tokenKindBExpClose, nullChar),
+ newToken(tokenKindInverseBExpOpen, nullChar),
+ newToken(tokenKindChar, '-'),
+ newToken(tokenKindChar, 'z'),
+ newToken(tokenKindBExpClose, nullChar),
+ newToken(tokenKindInverseBExpOpen, nullChar),
+ newToken(tokenKindChar, '-'),
+ newToken(tokenKindBExpClose, nullChar),
+ newToken(tokenKindInverseBExpOpen, nullChar),
+ newToken(tokenKindChar, '-'),
+ newToken(tokenKindChar, '-'),
+ newToken(tokenKindBExpClose, nullChar),
+ newToken(tokenKindInverseBExpOpen, nullChar),
+ newToken(tokenKindChar, '-'),
+ newToken(tokenKindCharRange, nullChar),
+ newToken(tokenKindChar, '-'),
+ newToken(tokenKindBExpClose, nullChar),
+
+ newToken(tokenKindEOF, nullChar),
+ },
+ },
+ {
+ caption: "caret symbols that appear in bracket expressions are handled as the logical inverse symbol or ordinary characters",
+ // [^...^...][^]
+ // ~~ ~ ~~
+ // ^ ^ ^^
+ // | | |`-- Ordinary Character (c)
+ // | | `-- Bracket Expression
+ // | `-- Ordinary Character (b)
+ // `-- Inverse Bracket Expression (a)
+ //
+ // a. Bracket expressions that have a caret symbol at the beginning are handled as logical inverse expressions.
+ // b. caret symbols that appear as the second and the subsequent symbols are handled as ordinary symbols.
+ // c. When a bracket expression has just one symbol, a caret symbol at the beginning is handled as an ordinary character.
+ src: "[^^][^]",
+ tokens: []*token{
+ newToken(tokenKindInverseBExpOpen, nullChar),
+ newToken(tokenKindChar, '^'),
+ newToken(tokenKindBExpClose, nullChar),
+ newToken(tokenKindBExpOpen, nullChar),
+ newToken(tokenKindChar, '^'),
+ newToken(tokenKindBExpClose, nullChar),
+ newToken(tokenKindEOF, nullChar),
+ },
+ },
+ {
+ caption: "lexer raises an error when an invalid escape sequence appears",
+ src: "\\@",
+ err: synErrInvalidEscSeq,
+ },
+ {
+ caption: "lexer raises an error when the incomplete escape sequence (EOF following \\) appears",
+ src: "\\",
+ err: synErrIncompletedEscSeq,
+ },
+ {
+ caption: "lexer raises an error when an invalid escape sequence appears",
+ src: "[\\@",
+ tokens: []*token{
+ newToken(tokenKindBExpOpen, nullChar),
+ },
+ err: synErrInvalidEscSeq,
+ },
+ {
+ caption: "lexer raises an error when the incomplete escape sequence (EOF following \\) appears",
+ src: "[\\",
+ tokens: []*token{
+ newToken(tokenKindBExpOpen, nullChar),
+ },
+ err: synErrIncompletedEscSeq,
+ },
+ {
+ caption: "lexer can recognize the special characters and code points in code point expression mode",
+ src: "\\u{0123}\\u{4567}\\u{89abcd}\\u{efAB}\\u{CDEF01}[\\u{0123}\\u{4567}\\u{89abcd}\\u{efAB}\\u{CDEF01}][^\\u{0123}\\u{4567}\\u{89abcd}\\u{efAB}\\u{CDEF01}]",
+ tokens: []*token{
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ newCodePointToken("0123"),
+ newToken(tokenKindRBrace, nullChar),
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ newCodePointToken("4567"),
+ newToken(tokenKindRBrace, nullChar),
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ newCodePointToken("89abcd"),
+ newToken(tokenKindRBrace, nullChar),
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ newCodePointToken("efAB"),
+ newToken(tokenKindRBrace, nullChar),
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ newCodePointToken("CDEF01"),
+ newToken(tokenKindRBrace, nullChar),
+
+ newToken(tokenKindBExpOpen, nullChar),
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ newCodePointToken("0123"),
+ newToken(tokenKindRBrace, nullChar),
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ newCodePointToken("4567"),
+ newToken(tokenKindRBrace, nullChar),
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ newCodePointToken("89abcd"),
+ newToken(tokenKindRBrace, nullChar),
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ newCodePointToken("efAB"),
+ newToken(tokenKindRBrace, nullChar),
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ newCodePointToken("CDEF01"),
+ newToken(tokenKindRBrace, nullChar),
+ newToken(tokenKindBExpClose, nullChar),
+
+ newToken(tokenKindInverseBExpOpen, nullChar),
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ newCodePointToken("0123"),
+ newToken(tokenKindRBrace, nullChar),
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ newCodePointToken("4567"),
+ newToken(tokenKindRBrace, nullChar),
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ newCodePointToken("89abcd"),
+ newToken(tokenKindRBrace, nullChar),
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ newCodePointToken("efAB"),
+ newToken(tokenKindRBrace, nullChar),
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ newCodePointToken("CDEF01"),
+ newToken(tokenKindRBrace, nullChar),
+ newToken(tokenKindBExpClose, nullChar),
+
+ newToken(tokenKindEOF, nullChar),
+ },
+ },
+ {
+ caption: "a one digit hex string isn't a valid code point",
+ src: "\\u{0",
+ tokens: []*token{
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ },
+ err: synErrInvalidCodePoint,
+ },
+ {
+ caption: "a two digits hex string isn't a valid code point",
+ src: "\\u{01",
+ tokens: []*token{
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ },
+ err: synErrInvalidCodePoint,
+ },
+ {
+ caption: "a three digits hex string isn't a valid code point",
+ src: "\\u{012",
+ tokens: []*token{
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ },
+ err: synErrInvalidCodePoint,
+ },
+ {
+ caption: "a four digits hex string is a valid code point",
+ src: "\\u{0123}",
+ tokens: []*token{
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ newCodePointToken("0123"),
+ newToken(tokenKindRBrace, nullChar),
+ },
+ },
+ {
+ caption: "a five digits hex string isn't a valid code point",
+ src: "\\u{01234",
+ tokens: []*token{
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ },
+ err: synErrInvalidCodePoint,
+ },
+ {
+ caption: "a six digits hex string is a valid code point",
+ src: "\\u{012345}",
+ tokens: []*token{
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ newCodePointToken("012345"),
+ newToken(tokenKindRBrace, nullChar),
+ },
+ },
+ {
+ caption: "a seven digits hex string isn't a valid code point",
+ src: "\\u{0123456",
+ tokens: []*token{
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ },
+ err: synErrInvalidCodePoint,
+ },
+ {
+ caption: "a code point must be hex digits",
+ src: "\\u{g",
+ tokens: []*token{
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ },
+ err: synErrInvalidCodePoint,
+ },
+ {
+ caption: "a code point must be hex digits",
+ src: "\\u{G",
+ tokens: []*token{
+ newToken(tokenKindCodePointLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ },
+ err: synErrInvalidCodePoint,
+ },
+ {
+ caption: "lexer can recognize the special characters and symbols in character property expression mode",
+ src: "\\p{Letter}\\p{General_Category=Letter}[\\p{Letter}\\p{General_Category=Letter}][^\\p{Letter}\\p{General_Category=Letter}]",
+ tokens: []*token{
+ newToken(tokenKindCharPropLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ newCharPropSymbolToken("Letter"),
+ newToken(tokenKindRBrace, nullChar),
+ newToken(tokenKindCharPropLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ newCharPropSymbolToken("General_Category"),
+ newToken(tokenKindEqual, nullChar),
+ newCharPropSymbolToken("Letter"),
+ newToken(tokenKindRBrace, nullChar),
+
+ newToken(tokenKindBExpOpen, nullChar),
+ newToken(tokenKindCharPropLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ newCharPropSymbolToken("Letter"),
+ newToken(tokenKindRBrace, nullChar),
+ newToken(tokenKindCharPropLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ newCharPropSymbolToken("General_Category"),
+ newToken(tokenKindEqual, nullChar),
+ newCharPropSymbolToken("Letter"),
+ newToken(tokenKindRBrace, nullChar),
+ newToken(tokenKindBExpClose, nullChar),
+
+ newToken(tokenKindInverseBExpOpen, nullChar),
+ newToken(tokenKindCharPropLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ newCharPropSymbolToken("Letter"),
+ newToken(tokenKindRBrace, nullChar),
+ newToken(tokenKindCharPropLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ newCharPropSymbolToken("General_Category"),
+ newToken(tokenKindEqual, nullChar),
+ newCharPropSymbolToken("Letter"),
+ newToken(tokenKindRBrace, nullChar),
+ newToken(tokenKindBExpClose, nullChar),
+
+ newToken(tokenKindEOF, nullChar),
+ },
+ },
+ {
+ caption: "lexer can recognize the special characters and symbols in fragment expression mode",
+ src: "\\f{integer}",
+ tokens: []*token{
+ newToken(tokenKindFragmentLeader, nullChar),
+ newToken(tokenKindLBrace, nullChar),
+ newFragmentSymbolToken("integer"),
+ newToken(tokenKindRBrace, nullChar),
+
+ newToken(tokenKindEOF, nullChar),
+ },
+ },
+ {
+ caption: "a fragment expression is not supported in a bracket expression",
+ src: "[\\f",
+ tokens: []*token{
+ newToken(tokenKindBExpOpen, nullChar),
+ },
+ err: synErrInvalidEscSeq,
+ },
+ {
+ caption: "a fragment expression is not supported in an inverse bracket expression",
+ src: "[^\\f",
+ tokens: []*token{
+ newToken(tokenKindInverseBExpOpen, nullChar),
+ },
+ err: synErrInvalidEscSeq,
+ },
+ }
+ for _, tt := range tests {
+ t.Run(tt.caption, func(t *testing.T) {
+ lex := newLexer(strings.NewReader(tt.src))
+ var err error
+ var tok *token
+ i := 0
+ for {
+ tok, err = lex.next()
+ if err != nil {
+ break
+ }
+ if i >= len(tt.tokens) {
+ break
+ }
+ eTok := tt.tokens[i]
+ i++
+ testToken(t, tok, eTok)
+
+ if tok.kind == tokenKindEOF {
+ break
+ }
+ }
+ if tt.err != nil {
+ if err != ParseErr {
+ t.Fatalf("unexpected error: want: %v, got: %v", ParseErr, err)
+ }
+ detail, cause := lex.error()
+ if cause != tt.err {
+ t.Fatalf("unexpected error: want: %v, got: %v (%v)", tt.err, cause, detail)
+ }
+ } else {
+ if err != nil {
+ t.Fatalf("unexpected error: %v", err)
+ }
+ }
+ if i < len(tt.tokens) {
+ t.Fatalf("expecte more tokens")
+ }
+ })
+ }
+}
+
+// testToken fails the test immediately unless the actual token a equals the expected token e in
+// every field.
+//
+// propSymbol and fragmentSymbol are compared as well; without them a character property or
+// fragment symbol token would match any expected symbol of the same kind.
+func testToken(t *testing.T, a, e *token) {
+	t.Helper()
+	if e.kind != a.kind ||
+		e.char != a.char ||
+		e.codePoint != a.codePoint ||
+		e.propSymbol != a.propSymbol ||
+		e.fragmentSymbol != a.fragmentSymbol {
+		t.Fatalf("unexpected token: want: %+v, got: %+v", e, a)
+	}
+}
diff --git a/compiler/parser/parser.go b/compiler/parser/parser.go
new file mode 100644
index 0000000..4cf0e66
--- /dev/null
+++ b/compiler/parser/parser.go
@@ -0,0 +1,521 @@
+package parser
+
+import (
+ "bytes"
+ "fmt"
+ "io"
+ "strconv"
+
+ "github.com/nihei9/maleeni/spec"
+ "github.com/nihei9/maleeni/ucd"
+)
+
+// PatternEntry pairs an ID of type spec.LexModeKindID with a pattern held as a byte slice.
+// NOTE(review): nothing in this part of the file uses PatternEntry; confirm the intended caller.
+type PatternEntry struct {
+ ID spec.LexModeKindID
+ Pattern []byte
+}
+
+// parser is a recursive-descent parser that turns the token stream of a lexer into a CPTree.
+// Syntax errors are reported by panicking with ParseErr (see raiseParseError); Parse recovers
+// the panic and Error exposes the recorded cause and detail.
+type parser struct {
+ // kind is the lexical kind name handed to newRootNode for the resulting tree.
+ kind spec.LexKindName
+ // lex is the token source.
+ lex *lexer
+ // peekedTok holds a token that was read but did not match in consume; the next consume
+ // call examines it again before reading from lex.
+ peekedTok *token
+ // lastTok is the token accepted by the most recent successful consume, or nil when the
+ // most recent consume did not match.
+ lastTok *token
+
+ // If and only if isContributoryPropertyExposed is true, the parser interprets contributory properties that
+ // appear in property expressions.
+ //
+ // The contributory properties are not exposed, and users cannot use those properties because the parser
+ // follows [UAX #44 5.13 Property APIs]. For instance, \p{Other_Alphabetic} is invalid.
+ //
+ // isContributoryPropertyExposed is set to true when the parser is generated recursively. The parser needs to
+ // interpret derived properties internally because the derived properties consist of other properties that
+ // may contain the contributory properties.
+ //
+ // [UAX #44 5.13 Property APIs] says:
+ // > The following subtypes of Unicode character properties should generally not be exposed in APIs,
+ // > except in limited circumstances. They may not be useful, particularly in public API collections,
+ // > and may instead prove misleading to the users of such API collections.
+ // > * Contributory properties are not recommended for public APIs.
+ // > ...
+ // https://unicode.org/reports/tr44/#Property_APIs
+ isContributoryPropertyExposed bool
+
+ // errCause and errDetail are filled in by raiseParseError and returned by Error.
+ errCause error
+ errDetail string
+}
+
+// NewParser returns a parser for the lexical kind `kind` that reads its pattern from src.
+// Contributory properties stay hidden until exposeContributoryProperty is called.
+func NewParser(kind spec.LexKindName, src io.Reader) *parser {
+	p := new(parser)
+	p.kind = kind
+	p.lex = newLexer(src)
+	p.isContributoryPropertyExposed = false
+	return p
+}
+
+// exposeContributoryProperty lets this parser accept contributory properties in property
+// expressions. parseCharProp calls it on the parser it creates for a normalized pattern.
+func (p *parser) exposeContributoryProperty() {
+ p.isContributoryPropertyExposed = true
+}
+
+// Error returns the detail message and the cause recorded by the last raiseParseError call.
+// Both are zero values when no parse error has been raised on this parser.
+func (p *parser) Error() (string, error) {
+	detail, cause := p.errDetail, p.errCause
+	return detail, cause
+}
+
+// Parse reads the whole pattern and returns its tree wrapped in a root node.
+//
+// The parsing routines signal failure by panicking. A panic whose value is an error is
+// recovered here and returned as retErr (root stays nil); any other panic value is re-raised.
+// When retErr is ParseErr, Error reports the cause and detail.
+func (p *parser) Parse() (root CPTree, retErr error) {
+	defer func() {
+		recovered := recover()
+		if recovered == nil {
+			return
+		}
+		asErr, isErr := recovered.(error)
+		if !isErr {
+			panic(recovered)
+		}
+		retErr = asErr
+	}()
+
+	root = newRootNode(p.kind, p.parseRegexp())
+	return root, nil
+}
+
+// parseRegexp parses a complete pattern: one alternation followed by EOF.
+// It raises synErrGroupNoInitiator for a ')' without a matching '(' and synErrNullPattern
+// for an empty pattern.
+func (p *parser) parseRegexp() CPTree {
+	tree := p.parseAlt()
+	// A stray ')' is reported first, whether or not anything was parsed before it.
+	if p.consume(tokenKindGroupClose) {
+		p.raiseParseError(synErrGroupNoInitiator, "")
+	}
+	if tree == nil {
+		p.raiseParseError(synErrNullPattern, "")
+	}
+	p.expect(tokenKindEOF)
+	return tree
+}
+
+// parseAlt parses concatenations separated by '|' into a left-nested chain of alt nodes.
+// It returns nil when no concatenation starts at the current token and raises
+// synErrAltLackOfOperand when either side of a '|' is missing.
+func (p *parser) parseAlt() CPTree {
+	tree := p.parseConcat()
+	if tree == nil {
+		if p.consume(tokenKindAlt) {
+			p.raiseParseError(synErrAltLackOfOperand, "")
+		}
+		return nil
+	}
+	for p.consume(tokenKindAlt) {
+		operand := p.parseConcat()
+		if operand == nil {
+			p.raiseParseError(synErrAltLackOfOperand, "")
+		}
+		tree = newAltNode(tree, operand)
+	}
+	return tree
+}
+
+// parseConcat parses consecutive repeat expressions into a left-nested chain of concat nodes.
+// It returns nil when no repeat expression starts at the current token.
+func (p *parser) parseConcat() CPTree {
+	tree := p.parseRepeat()
+	for next := p.parseRepeat(); next != nil; next = p.parseRepeat() {
+		tree = newConcatNode(tree, next)
+	}
+	return tree
+}
+
+// parseRepeat parses a group optionally followed by one of the operators '*', '+', or '?'.
+// It returns nil when no group starts at the current token, and raises synErrRepNoTarget
+// when an operator appears without an operand.
+func (p *parser) parseRepeat() CPTree {
+	operand := p.parseGroup()
+	if operand != nil {
+		// At most one operator is applied; a second one is left for the next call.
+		switch {
+		case p.consume(tokenKindRepeat):
+			return newRepeatNode(operand)
+		case p.consume(tokenKindRepeatOneOrMore):
+			return newRepeatOneOrMoreNode(operand)
+		case p.consume(tokenKindOption):
+			return newOptionNode(operand)
+		}
+		return operand
+	}
+
+	switch {
+	case p.consume(tokenKindRepeat):
+		p.raiseParseError(synErrRepNoTarget, "* needs an operand")
+	case p.consume(tokenKindRepeatOneOrMore):
+		p.raiseParseError(synErrRepNoTarget, "+ needs an operand")
+	case p.consume(tokenKindOption):
+		p.raiseParseError(synErrRepNoTarget, "? needs an operand")
+	}
+	return nil
+}
+
+// parseGroup parses a parenthesized alternation, or falls back to a single character
+// expression when the current token is not '('.
+// Inside a group it raises synErrGroupUnclosed at EOF, synErrGroupNoElem for an empty group,
+// and synErrGroupInvalidForm when the group is not terminated by ')'.
+func (p *parser) parseGroup() CPTree {
+	if !p.consume(tokenKindGroupOpen) {
+		return p.parseSingleChar()
+	}
+
+	inner := p.parseAlt()
+	// Hitting EOF is reported as an unclosed group even when the group is still empty.
+	if p.consume(tokenKindEOF) {
+		p.raiseParseError(synErrGroupUnclosed, "")
+	}
+	if inner == nil {
+		p.raiseParseError(synErrGroupNoElem, "")
+	}
+	if !p.consume(tokenKindGroupClose) {
+		p.raiseParseError(synErrGroupInvalidForm, "")
+	}
+	return inner
+}
+
+// parseSingleChar parses one expression that matches a single character: '.', a bracket
+// expression, an inverse bracket expression, a code point expression, a character property
+// expression, a fragment expression, or a normal character.
+// It returns nil when none of them starts at the current token.
+func (p *parser) parseSingleChar() CPTree {
+	switch {
+	case p.consume(tokenKindAnyChar):
+		return genAnyCharAST()
+
+	case p.consume(tokenKindBExpOpen):
+		first := p.parseBExpElem()
+		if first == nil {
+			if p.consume(tokenKindEOF) {
+				p.raiseParseError(synErrBExpUnclosed, "")
+			}
+			p.raiseParseError(synErrBExpNoElem, "")
+		}
+		// The elements of a bracket expression are alternatives of each other.
+		union := first
+		for elem := p.parseBExpElem(); elem != nil; elem = p.parseBExpElem() {
+			union = newAltNode(union, elem)
+		}
+		if p.consume(tokenKindEOF) {
+			p.raiseParseError(synErrBExpUnclosed, "")
+		}
+		p.expect(tokenKindBExpClose)
+		return union
+
+	case p.consume(tokenKindInverseBExpOpen):
+		elem := p.parseBExpElem()
+		if elem == nil {
+			if p.consume(tokenKindEOF) {
+				p.raiseParseError(synErrBExpUnclosed, "")
+			}
+			p.raiseParseError(synErrBExpNoElem, "")
+		}
+		// Start from every code point and remove each listed element in turn.
+		remaining := genAnyCharAST()
+		for ; elem != nil; elem = p.parseBExpElem() {
+			remaining = exclude(elem, remaining)
+			if remaining == nil {
+				panic(fmt.Errorf("a pattern that isn't matching any symbols"))
+			}
+		}
+		if p.consume(tokenKindEOF) {
+			p.raiseParseError(synErrBExpUnclosed, "")
+		}
+		p.expect(tokenKindBExpClose)
+		return remaining
+
+	case p.consume(tokenKindCodePointLeader):
+		return p.parseCodePoint()
+	case p.consume(tokenKindCharPropLeader):
+		return p.parseCharProp()
+	case p.consume(tokenKindFragmentLeader):
+		return p.parseFragment()
+	}
+
+	if c := p.parseNormalChar(); c != nil {
+		return c
+	}
+	if p.consume(tokenKindBExpClose) {
+		p.raiseParseError(synErrBExpInvalidForm, "")
+	}
+	return nil
+}
+
+// parseBExpElem parses one element of a bracket expression: a code point expression, a
+// character property expression, a single character, or a range of the form `from-to`.
+// It returns nil when no element starts at the current token, raises
+// synErrRangeInvalidOrder when from is greater than to, and panics with a plain error when
+// the range operator is not followed by a normal character.
+func (p *parser) parseBExpElem() CPTree {
+	switch {
+	case p.consume(tokenKindCodePointLeader):
+		return p.parseCodePoint()
+	case p.consume(tokenKindCharPropLeader):
+		return p.parseCharProp()
+	}
+
+	lower := p.parseNormalChar()
+	if lower == nil || !p.consume(tokenKindCharRange) {
+		return lower
+	}
+
+	upper := p.parseNormalChar()
+	if upper == nil {
+		panic(fmt.Errorf("invalid range expression"))
+	}
+	from, _, _ := lower.Range()
+	_, to, _ := upper.Range()
+	if !isValidOrder(from, to) {
+		p.raiseParseError(synErrRangeInvalidOrder, fmt.Sprintf("[%v-%v]", from, to))
+	}
+	return newRangeSymbolNode(from, to)
+}
+
+// parseCodePoint parses the `{XXXX}` part of a code point expression (the leader has already
+// been consumed) and returns a symbol node for that code point.
+// It raises synErrCPExpInvalidForm for a malformed expression and synErrCPExpOutOfRange for
+// a value above U+10FFFF.
+func (p *parser) parseCodePoint() CPTree {
+	if !p.consume(tokenKindLBrace) || !p.consume(tokenKindCodePoint) {
+		p.raiseParseError(synErrCPExpInvalidForm, "")
+	}
+
+	// lastTok is the code point token here; read it before the next consume replaces it.
+	digits := p.lastTok.codePoint
+	cp, err := strconv.ParseInt(digits, 16, 64)
+	if err != nil {
+		panic(fmt.Errorf("failed to decode a code point (%v) into a int: %v", digits, err))
+	}
+	if cp < 0x0000 || cp > 0x10FFFF {
+		p.raiseParseError(synErrCPExpOutOfRange, "")
+	}
+
+	sym := newSymbolNode(rune(cp))
+
+	if !p.consume(tokenKindRBrace) {
+		p.raiseParseError(synErrCPExpInvalidForm, "")
+	}
+	return sym
+}
+
+// parseCharProp parses the `{value}` or `{name=value}` part of a character property
+// expression (the leader has already been consumed) and returns a tree matching the code
+// points that have the property.
+//
+// When ucd.NormalizeCharacterProperty yields a pattern, that pattern is parsed by a nested
+// parser with contributory properties exposed. Otherwise the code point ranges reported by
+// ucd.FindCodePointRanges are combined: as alternatives, or, when the result is flagged as
+// inverse, by removing them from the set of all code points.
+//
+// It raises synErrCharPropExpInvalidForm for a malformed expression and
+// synErrCharPropUnsupported for a hidden contributory property or a property the ucd
+// package rejects. It panics with a plain error when the nested parse fails or when an
+// inverse property leaves no code point at all.
+func (p *parser) parseCharProp() CPTree {
+	if !p.consume(tokenKindLBrace) {
+		p.raiseParseError(synErrCharPropExpInvalidForm, "")
+	}
+	if !p.consume(tokenKindCharPropSymbol) {
+		p.raiseParseError(synErrCharPropExpInvalidForm, "")
+	}
+	sym1 := p.lastTok.propSymbol
+	var sym2 string
+	if p.consume(tokenKindEqual) {
+		if !p.consume(tokenKindCharPropSymbol) {
+			p.raiseParseError(synErrCharPropExpInvalidForm, "")
+		}
+		sym2 = p.lastTok.propSymbol
+	}
+
+	// A lone symbol is a property value with an empty property name.
+	propName, propVal := "", sym1
+	if sym2 != "" {
+		propName, propVal = sym1, sym2
+	}
+	if !p.isContributoryPropertyExposed && ucd.IsContributoryProperty(propName) {
+		p.raiseParseError(synErrCharPropUnsupported, propName)
+	}
+	pat, err := ucd.NormalizeCharacterProperty(propName, propVal)
+	if err != nil {
+		p.raiseParseError(synErrCharPropUnsupported, err.Error())
+	}
+
+	var alt CPTree
+	if pat != "" {
+		// Use a distinct name for the nested parser so the receiver is not shadowed.
+		sub := NewParser(p.kind, bytes.NewReader([]byte(pat)))
+		sub.exposeContributoryProperty()
+		ast, err := sub.Parse()
+		if err != nil {
+			panic(err)
+		}
+		alt = ast
+	} else {
+		cpRanges, inverse, err := ucd.FindCodePointRanges(propName, propVal)
+		if err != nil {
+			p.raiseParseError(synErrCharPropUnsupported, err.Error())
+		}
+		if inverse {
+			// Start from every code point and carve out each range. Seeding with the full
+			// set also covers an empty cpRanges, which previously panicked on cpRanges[0].
+			alt = genAnyCharAST()
+			for _, r := range cpRanges {
+				alt = exclude(newRangeSymbolNode(r.From, r.To), alt)
+				if alt == nil {
+					panic(fmt.Errorf("a pattern that isn't matching any symbols"))
+				}
+			}
+		} else {
+			for _, r := range cpRanges {
+				alt = genAltNode(
+					alt,
+					newRangeSymbolNode(r.From, r.To),
+				)
+			}
+		}
+	}
+
+	if !p.consume(tokenKindRBrace) {
+		p.raiseParseError(synErrCharPropExpInvalidForm, "")
+	}
+
+	return alt
+}
+
+// parseFragment parses the `{name}` part of a fragment expression (the leader has already
+// been consumed) and returns a fragment node for that name with a nil second argument.
+// It raises synErrFragmentExpInvalidForm for a malformed expression.
+func (p *parser) parseFragment() CPTree {
+	if !p.consume(tokenKindLBrace) || !p.consume(tokenKindFragmentSymbol) {
+		p.raiseParseError(synErrFragmentExpInvalidForm, "")
+	}
+	// Take the name now; the next consume replaces lastTok.
+	name := spec.LexKindName(p.lastTok.fragmentSymbol)
+
+	if !p.consume(tokenKindRBrace) {
+		p.raiseParseError(synErrFragmentExpInvalidForm, "")
+	}
+	return newFragmentNode(name, nil)
+}
+
+// parseNormalChar returns a symbol node for the current token when it is a plain character,
+// and nil otherwise.
+func (p *parser) parseNormalChar() CPTree {
+	if p.consume(tokenKindChar) {
+		return newSymbolNode(p.lastTok.char)
+	}
+	return nil
+}
+
+// exclude returns a tree that matches the code points of base that symbol does not match.
+// It returns nil when nothing remains.
+//
+// Both trees are expected to consist of alternatives and ranges only: alternatives on
+// either side are handled recursively, and the leaf case subtracts one range from another.
+// It panics when a leaf of symbol or base reports no range.
+func exclude(symbol, base CPTree) CPTree {
+	// Nothing is left to remove from. This happens when an earlier alternative of symbol
+	// already consumed the whole base; without the guard the calls below would dereference nil.
+	if base == nil {
+		return nil
+	}
+
+	if left, right, ok := symbol.Alternatives(); ok {
+		return exclude(right, exclude(left, base))
+	}
+
+	if left, right, ok := base.Alternatives(); ok {
+		return genAltNode(
+			exclude(symbol, left),
+			exclude(symbol, right),
+		)
+	}
+
+	if bFrom, bTo, ok := base.Range(); ok {
+		sFrom, sTo, ok := symbol.Range()
+		if !ok {
+			panic(fmt.Errorf("invalid symbol tree: %T", symbol))
+		}
+
+		switch {
+		// symbol lies strictly inside base: base splits into two ranges.
+		case sFrom > bFrom && sTo < bTo:
+			return genAltNode(
+				newRangeSymbolNode(bFrom, sFrom-1),
+				newRangeSymbolNode(sTo+1, bTo),
+			)
+		// symbol covers the head of base, including the case where it ends exactly at bFrom.
+		case sFrom <= bFrom && sTo >= bFrom && sTo < bTo:
+			return newRangeSymbolNode(sTo+1, bTo)
+		// symbol covers the tail of base, including the case where it starts exactly at bTo.
+		case sFrom > bFrom && sFrom <= bTo && sTo >= bTo:
+			return newRangeSymbolNode(bFrom, sFrom-1)
+		// symbol covers base entirely.
+		case sFrom <= bFrom && sTo >= bTo:
+			return nil
+		// The ranges do not overlap.
+		default:
+			return base
+		}
+	}
+
+	panic(fmt.Errorf("invalid base tree: %T", base))
+}
+
+// genAnyCharAST returns a range node spanning U+0000 through U+10FFFF. It is used for '.'
+// and as the starting set of inverse expressions.
+func genAnyCharAST() CPTree {
+ return newRangeSymbolNode(0x0, 0x10FFFF)
+}
+
+// isValidOrder reports whether from does not come after to, i.e. whether from..to is a
+// well-formed range. A range of a single code point (from == to) is valid.
+func isValidOrder(from, to rune) bool {
+	return from <= to
+}
+
+// genConcatNode concatenates the non-nil trees in cs from left to right, nesting to the left.
+// It returns nil when cs holds no non-nil tree and the tree itself when there is exactly one.
+func genConcatNode(cs ...CPTree) CPTree {
+	var concat CPTree
+	for _, c := range cs {
+		switch {
+		case c == nil:
+			// Skip absent operands.
+		case concat == nil:
+			concat = c
+		default:
+			concat = newConcatNode(concat, c)
+		}
+	}
+	return concat
+}
+
+// genAltNode combines the non-nil trees in cs as alternatives, nesting to the left.
+// It returns nil when cs holds no non-nil tree and the tree itself when there is exactly one.
+func genAltNode(cs ...CPTree) CPTree {
+	var alt CPTree
+	for _, c := range cs {
+		switch {
+		case c == nil:
+			// Skip absent operands.
+		case alt == nil:
+			alt = c
+		default:
+			alt = newAltNode(alt, c)
+		}
+	}
+	return alt
+}
+
+// expect consumes a token of the expected kind or raises synErrUnexpectedToken with a
+// detail naming both the expected and the actual kind.
+func (p *parser) expect(expected tokenKind) {
+	if p.consume(expected) {
+		return
+	}
+	// A failed consume leaves the offending token in peekedTok.
+	actual := p.peekedTok.kind
+	p.raiseParseError(synErrUnexpectedToken, fmt.Sprintf("unexpected token; expected: %v, actual: %v", expected, actual))
+}
+
+// consume looks at the next token and reports whether it has the expected kind.
+//
+// On a match the token is accepted and stored in lastTok. Otherwise it is kept in peekedTok
+// for the next call and lastTok is cleared. A lexer error equal to ParseErr is re-raised as
+// a parse error carrying the lexer's cause and detail; any other lexer error is panicked as is.
+func (p *parser) consume(expected tokenKind) bool {
+	tok := p.peekedTok
+	p.peekedTok = nil
+	if tok == nil {
+		var err error
+		tok, err = p.lex.next()
+		if err == ParseErr {
+			detail, cause := p.lex.error()
+			p.raiseParseError(cause, detail)
+		}
+		if err != nil {
+			panic(err)
+		}
+	}
+
+	if tok.kind != expected {
+		p.peekedTok = tok
+		p.lastTok = nil
+		return false
+	}
+	p.lastTok = tok
+	return true
+}
+
+// raiseParseError records err and detail for later retrieval through Error and then panics
+// with ParseErr. It never returns; Parse recovers the panic.
+func (p *parser) raiseParseError(err error, detail string) {
+ p.errCause = err
+ p.errDetail = detail
+ panic(ParseErr)
+}
diff --git a/compiler/parser/parser_test.go b/compiler/parser/parser_test.go
new file mode 100644
index 0000000..beeef1b
--- /dev/null
+++ b/compiler/parser/parser_test.go
@@ -0,0 +1,940 @@
+package parser
+
+import (
+ "fmt"
+ "reflect"
+ "strings"
+ "testing"
+
+ "github.com/nihei9/maleeni/spec"
+ "github.com/nihei9/maleeni/ucd"
+)
+
+// TestParse parses each pattern in the table and checks the outcome: either the expected
+// syntax error cause, or, after the fragments have been applied, the expected tree.
+func TestParse(t *testing.T) {
+ tests := []struct {
+ pattern string
+ fragments map[spec.LexKindName]string
+ ast CPTree
+ syntaxError error
+
+ // When an AST is large, as patterns containing a character property expression, this test only checks
+ // that the pattern is parsable. The check of the validity of such AST is performed by checking that it
+ // can be matched correctly using the driver.
+ skipTestAST bool
+ }{
+ // Single symbols and concatenation.
+ {
+ pattern: "a",
+ ast: newSymbolNode('a'),
+ },
+ {
+ pattern: "abc",
+ ast: genConcatNode(
+ newSymbolNode('a'),
+ newSymbolNode('b'),
+ newSymbolNode('c'),
+ ),
+ },
+ // Option operator (?).
+ {
+ pattern: "a?",
+ ast: newOptionNode(
+ newSymbolNode('a'),
+ ),
+ },
+ {
+ pattern: "[abc]?",
+ ast: newOptionNode(
+ genAltNode(
+ newSymbolNode('a'),
+ newSymbolNode('b'),
+ newSymbolNode('c'),
+ ),
+ ),
+ },
+ {
+ pattern: "\\u{3042}?",
+ ast: newOptionNode(
+ newSymbolNode('\u3042'),
+ ),
+ },
+ {
+ pattern: "\\p{Letter}?",
+ skipTestAST: true,
+ },
+ {
+ pattern: "\\f{a2c}?",
+ fragments: map[spec.LexKindName]string{
+ "a2c": "abc",
+ },
+ ast: newOptionNode(
+ newFragmentNode("a2c",
+ genConcatNode(
+ newSymbolNode('a'),
+ newSymbolNode('b'),
+ newSymbolNode('c'),
+ ),
+ ),
+ ),
+ },
+ {
+ pattern: "(a)?",
+ ast: newOptionNode(
+ newSymbolNode('a'),
+ ),
+ },
+ {
+ pattern: "((a?)?)?",
+ ast: newOptionNode(
+ newOptionNode(
+ newOptionNode(
+ newSymbolNode('a'),
+ ),
+ ),
+ ),
+ },
+ {
+ pattern: "(abc)?",
+ ast: newOptionNode(
+ genConcatNode(
+ newSymbolNode('a'),
+ newSymbolNode('b'),
+ newSymbolNode('c'),
+ ),
+ ),
+ },
+ {
+ pattern: "(a|b)?",
+ ast: newOptionNode(
+ genAltNode(
+ newSymbolNode('a'),
+ newSymbolNode('b'),
+ ),
+ ),
+ },
+ {
+ pattern: "?",
+ syntaxError: synErrRepNoTarget,
+ },
+ {
+ pattern: "(?)",
+ syntaxError: synErrRepNoTarget,
+ },
+ {
+ pattern: "a|?",
+ syntaxError: synErrRepNoTarget,
+ },
+ {
+ pattern: "?|b",
+ syntaxError: synErrRepNoTarget,
+ },
+ {
+ pattern: "a??",
+ syntaxError: synErrRepNoTarget,
+ },
+ // Repeat operator (*).
+ {
+ pattern: "a*",
+ ast: newRepeatNode(
+ newSymbolNode('a'),
+ ),
+ },
+ {
+ pattern: "[abc]*",
+ ast: newRepeatNode(
+ genAltNode(
+ newSymbolNode('a'),
+ newSymbolNode('b'),
+ newSymbolNode('c'),
+ ),
+ ),
+ },
+ {
+ pattern: "\\u{3042}*",
+ ast: newRepeatNode(
+ newSymbolNode('\u3042'),
+ ),
+ },
+ {
+ pattern: "\\p{Letter}*",
+ skipTestAST: true,
+ },
+ {
+ pattern: "\\f{a2c}*",
+ fragments: map[spec.LexKindName]string{
+ "a2c": "abc",
+ },
+ ast: newRepeatNode(
+ newFragmentNode("a2c",
+ genConcatNode(
+ newSymbolNode('a'),
+ newSymbolNode('b'),
+ newSymbolNode('c'),
+ ),
+ ),
+ ),
+ },
+ {
+ pattern: "((a*)*)*",
+ ast: newRepeatNode(
+ newRepeatNode(
+ newRepeatNode(
+ newSymbolNode('a'),
+ ),
+ ),
+ ),
+ },
+ {
+ pattern: "(abc)*",
+ ast: newRepeatNode(
+ genConcatNode(
+ newSymbolNode('a'),
+ newSymbolNode('b'),
+ newSymbolNode('c'),
+ ),
+ ),
+ },
+ {
+ pattern: "(a|b)*",
+ ast: newRepeatNode(
+ genAltNode(
+ newSymbolNode('a'),
+ newSymbolNode('b'),
+ ),
+ ),
+ },
+ {
+ pattern: "*",
+ syntaxError: synErrRepNoTarget,
+ },
+ {
+ pattern: "(*)",
+ syntaxError: synErrRepNoTarget,
+ },
+ {
+ pattern: "a|*",
+ syntaxError: synErrRepNoTarget,
+ },
+ {
+ pattern: "*|b",
+ syntaxError: synErrRepNoTarget,
+ },
+ {
+ pattern: "a**",
+ syntaxError: synErrRepNoTarget,
+ },
+ // One-or-more operator (+); the expected trees are the operand followed by its repetition.
+ {
+ pattern: "a+",
+ ast: genConcatNode(
+ newSymbolNode('a'),
+ newRepeatNode(
+ newSymbolNode('a'),
+ ),
+ ),
+ },
+ {
+ pattern: "[abc]+",
+ ast: genConcatNode(
+ genAltNode(
+ newSymbolNode('a'),
+ newSymbolNode('b'),
+ newSymbolNode('c'),
+ ),
+ newRepeatNode(
+ genAltNode(
+ newSymbolNode('a'),
+ newSymbolNode('b'),
+ newSymbolNode('c'),
+ ),
+ ),
+ ),
+ },
+ {
+ pattern: "\\u{3042}+",
+ ast: genConcatNode(
+ newSymbolNode('\u3042'),
+ newRepeatNode(
+ newSymbolNode('\u3042'),
+ ),
+ ),
+ },
+ {
+ pattern: "\\p{Letter}+",
+ skipTestAST: true,
+ },
+ {
+ pattern: "\\f{a2c}+",
+ fragments: map[spec.LexKindName]string{
+ "a2c": "abc",
+ },
+ ast: genConcatNode(
+ newFragmentNode("a2c",
+ genConcatNode(
+ newSymbolNode('a'),
+ newSymbolNode('b'),
+ newSymbolNode('c'),
+ ),
+ ),
+ newRepeatNode(
+ newFragmentNode("a2c",
+ genConcatNode(
+ newSymbolNode('a'),
+ newSymbolNode('b'),
+ newSymbolNode('c'),
+ ),
+ ),
+ ),
+ ),
+ },
+ {
+ pattern: "((a+)+)+",
+ ast: genConcatNode(
+ genConcatNode(
+ genConcatNode(
+ genConcatNode(
+ newSymbolNode('a'),
+ newRepeatNode(
+ newSymbolNode('a'),
+ ),
+ ),
+ newRepeatNode(
+ genConcatNode(
+ newSymbolNode('a'),
+ newRepeatNode(
+ newSymbolNode('a'),
+ ),
+ ),
+ ),
+ ),
+ newRepeatNode(
+ genConcatNode(
+ genConcatNode(
+ newSymbolNode('a'),
+ newRepeatNode(
+ newSymbolNode('a'),
+ ),
+ ),
+ newRepeatNode(
+ genConcatNode(
+ newSymbolNode('a'),
+ newRepeatNode(
+ newSymbolNode('a'),
+ ),
+ ),
+ ),
+ ),
+ ),
+ ),
+ ),
+ },
+ {
+ pattern: "(abc)+",
+ ast: genConcatNode(
+ genConcatNode(
+ newSymbolNode('a'),
+ newSymbolNode('b'),
+ newSymbolNode('c'),
+ ),
+ newRepeatNode(
+ genConcatNode(
+ newSymbolNode('a'),
+ newSymbolNode('b'),
+ newSymbolNode('c'),
+ ),
+ ),
+ ),
+ },
+ {
+ pattern: "(a|b)+",
+ ast: genConcatNode(
+ genAltNode(
+ newSymbolNode('a'),
+ newSymbolNode('b'),
+ ),
+ newRepeatNode(
+ genAltNode(
+ newSymbolNode('a'),
+ newSymbolNode('b'),
+ ),
+ ),
+ ),
+ },
+ {
+ pattern: "+",
+ syntaxError: synErrRepNoTarget,
+ },
+ {
+ pattern: "(+)",
+ syntaxError: synErrRepNoTarget,
+ },
+ {
+ pattern: "a|+",
+ syntaxError: synErrRepNoTarget,
+ },
+ {
+ pattern: "+|b",
+ syntaxError: synErrRepNoTarget,
+ },
+ {
+ pattern: "a++",
+ syntaxError: synErrRepNoTarget,
+ },
+ // Any character and bracket expressions.
+ {
+ pattern: ".",
+ ast: newRangeSymbolNode(0x00, 0x10FFFF),
+ },
+ {
+ pattern: "[a]",
+ ast: newSymbolNode('a'),
+ },
+ {
+ pattern: "[abc]",
+ ast: genAltNode(
+ newSymbolNode('a'),
+ newSymbolNode('b'),
+ newSymbolNode('c'),
+ ),
+ },
+ {
+ pattern: "[a-z]",
+ ast: newRangeSymbolNode('a', 'z'),
+ },
+ {
+ pattern: "[A-Za-z]",
+ ast: genAltNode(
+ newRangeSymbolNode('A', 'Z'),
+ newRangeSymbolNode('a', 'z'),
+ ),
+ },
+ {
+ pattern: "[\\u{004E}]",
+ ast: newSymbolNode('N'),
+ },
+ {
+ pattern: "[\\p{Lu}]",
+ skipTestAST: true,
+ },
+ {
+ pattern: "a[]",
+ syntaxError: synErrBExpNoElem,
+ },
+ {
+ pattern: "[]a",
+ syntaxError: synErrBExpNoElem,
+ },
+ {
+ pattern: "[]",
+ syntaxError: synErrBExpNoElem,
+ },
+ {
+ pattern: "[^]",
+ ast: newSymbolNode('^'),
+ },
+ {
+ pattern: "[",
+ syntaxError: synErrBExpUnclosed,
+ },
+ {
+ pattern: "([",
+ syntaxError: synErrBExpUnclosed,
+ },
+ {
+ pattern: "[a",
+ syntaxError: synErrBExpUnclosed,
+ },
+ {
+ pattern: "([a",
+ syntaxError: synErrBExpUnclosed,
+ },
+ {
+ pattern: "[a-",
+ syntaxError: synErrBExpUnclosed,
+ },
+ {
+ pattern: "([a-",
+ syntaxError: synErrBExpUnclosed,
+ },
+ {
+ pattern: "[^",
+ syntaxError: synErrBExpUnclosed,
+ },
+ {
+ pattern: "([^",
+ syntaxError: synErrBExpUnclosed,
+ },
+ {
+ pattern: "[^a",
+ syntaxError: synErrBExpUnclosed,
+ },
+ {
+ pattern: "([^a",
+ syntaxError: synErrBExpUnclosed,
+ },
+ {
+ pattern: "[^a-",
+ syntaxError: synErrBExpUnclosed,
+ },
+ {
+ pattern: "([^a-",
+ syntaxError: synErrBExpUnclosed,
+ },
+ {
+ pattern: "]",
+ ast: newSymbolNode(']'),
+ },
+ {
+ pattern: "(]",
+ syntaxError: synErrGroupUnclosed,
+ },
+ {
+ pattern: "a]",
+ ast: genConcatNode(
+ newSymbolNode('a'),
+ newSymbolNode(']'),
+ ),
+ },
+ {
+ pattern: "(a]",
+ syntaxError: synErrGroupUnclosed,
+ },
+ {
+ pattern: "([)",
+ syntaxError: synErrBExpUnclosed,
+ },
+ {
+ pattern: "([a)",
+ syntaxError: synErrBExpUnclosed,
+ },
+ {
+ pattern: "[a-]",
+ ast: genAltNode(
+ newSymbolNode('a'),
+ newSymbolNode('-'),
+ ),
+ },
+ {
+ pattern: "[^a-]",
+ ast: genAltNode(
+ newRangeSymbolNode(0x00, 0x2C),
+ newRangeSymbolNode(0x2E, 0x60),
+ newRangeSymbolNode(0x62, 0x10FFFF),
+ ),
+ },
+ {
+ pattern: "[-z]",
+ ast: genAltNode(
+ newSymbolNode('-'),
+ newSymbolNode('z'),
+ ),
+ },
+ {
+ pattern: "[^-z]",
+ ast: newAltNode(
+ newRangeSymbolNode(0x00, 0x2C),
+ newAltNode(
+ newRangeSymbolNode(0x2E, 0x79),
+ newRangeSymbolNode(0x7B, 0x10FFFF),
+ ),
+ ),
+ },
+ {
+ pattern: "[-]",
+ ast: newSymbolNode('-'),
+ },
+ {
+ pattern: "[^-]",
+ ast: genAltNode(
+ newRangeSymbolNode(0x00, 0x2C),
+ newRangeSymbolNode(0x2E, 0x10FFFF),
+ ),
+ },
+ // Code point expressions.
+ {
+ pattern: "\\u{006E}",
+ ast: newSymbolNode('\u006E'),
+ },
+ {
+ pattern: "\\u{03BD}",
+ ast: newSymbolNode('\u03BD'),
+ },
+ {
+ pattern: "\\u{306B}",
+ ast: newSymbolNode('\u306B'),
+ },
+ {
+ pattern: "\\u{01F638}",
+ ast: newSymbolNode('\U0001F638'),
+ },
+ {
+ pattern: "\\u{0000}",
+ ast: newSymbolNode('\u0000'),
+ },
+ {
+ pattern: "\\u{10FFFF}",
+ ast: newSymbolNode('\U0010FFFF'),
+ },
+ {
+ pattern: "\\u{110000}",
+ syntaxError: synErrCPExpOutOfRange,
+ },
+ {
+ pattern: "\\u",
+ syntaxError: synErrCPExpInvalidForm,
+ },
+ {
+ pattern: "\\u{",
+ syntaxError: synErrCPExpInvalidForm,
+ },
+ {
+ pattern: "\\u{03BD",
+ syntaxError: synErrCPExpInvalidForm,
+ },
+ {
+ pattern: "\\u{}",
+ syntaxError: synErrCPExpInvalidForm,
+ },
+ // Character property expressions.
+ {
+ pattern: "\\p{Letter}",
+ skipTestAST: true,
+ },
+ {
+ pattern: "\\p{General_Category=Letter}",
+ skipTestAST: true,
+ },
+ {
+ pattern: "\\p{ Letter }",
+ skipTestAST: true,
+ },
+ {
+ pattern: "\\p{ General_Category = Letter }",
+ skipTestAST: true,
+ },
+ {
+ pattern: "\\p",
+ syntaxError: synErrCharPropExpInvalidForm,
+ },
+ {
+ pattern: "\\p{",
+ syntaxError: synErrCharPropExpInvalidForm,
+ },
+ {
+ pattern: "\\p{Letter",
+ syntaxError: synErrCharPropExpInvalidForm,
+ },
+ {
+ pattern: "\\p{General_Category=}",
+ syntaxError: synErrCharPropExpInvalidForm,
+ },
+ {
+ pattern: "\\p{General_Category= }",
+ syntaxError: synErrCharPropInvalidSymbol,
+ },
+ {
+ pattern: "\\p{=Letter}",
+ syntaxError: synErrCharPropExpInvalidForm,
+ },
+ {
+ pattern: "\\p{ =Letter}",
+ syntaxError: synErrCharPropInvalidSymbol,
+ },
+ {
+ pattern: "\\p{=}",
+ syntaxError: synErrCharPropExpInvalidForm,
+ },
+ {
+ pattern: "\\p{}",
+ syntaxError: synErrCharPropExpInvalidForm,
+ },
+ // Fragment expressions.
+ {
+ pattern: "\\f{a2c}",
+ fragments: map[spec.LexKindName]string{
+ "a2c": "abc",
+ },
+ ast: newFragmentNode("a2c",
+ genConcatNode(
+ newSymbolNode('a'),
+ newSymbolNode('b'),
+ newSymbolNode('c'),
+ ),
+ ),
+ },
+ {
+ pattern: "\\f{ a2c }",
+ fragments: map[spec.LexKindName]string{
+ "a2c": "abc",
+ },
+ ast: newFragmentNode("a2c",
+ genConcatNode(
+ newSymbolNode('a'),
+ newSymbolNode('b'),
+ newSymbolNode('c'),
+ ),
+ ),
+ },
+ {
+ pattern: "\\f",
+ syntaxError: synErrFragmentExpInvalidForm,
+ },
+ {
+ pattern: "\\f{",
+ syntaxError: synErrFragmentExpInvalidForm,
+ },
+ {
+ pattern: "\\f{a2c",
+ fragments: map[spec.LexKindName]string{
+ "a2c": "abc",
+ },
+ syntaxError: synErrFragmentExpInvalidForm,
+ },
+ // Grouping.
+ {
+ pattern: "(a)",
+ ast: newSymbolNode('a'),
+ },
+ {
+ pattern: "(((a)))",
+ ast: newSymbolNode('a'),
+ },
+ {
+ pattern: "a()",
+ syntaxError: synErrGroupNoElem,
+ },
+ {
+ pattern: "()a",
+ syntaxError: synErrGroupNoElem,
+ },
+ {
+ pattern: "()",
+ syntaxError: synErrGroupNoElem,
+ },
+ {
+ pattern: "(",
+ syntaxError: synErrGroupUnclosed,
+ },
+ {
+ pattern: "a(",
+ syntaxError: synErrGroupUnclosed,
+ },
+ {
+ pattern: "(a",
+ syntaxError: synErrGroupUnclosed,
+ },
+ {
+ pattern: "((",
+ syntaxError: synErrGroupUnclosed,
+ },
+ {
+ pattern: "((a)",
+ syntaxError: synErrGroupUnclosed,
+ },
+ {
+ pattern: ")",
+ syntaxError: synErrGroupNoInitiator,
+ },
+ {
+ pattern: "a)",
+ syntaxError: synErrGroupNoInitiator,
+ },
+ {
+ pattern: ")a",
+ syntaxError: synErrGroupNoInitiator,
+ },
+ {
+ pattern: "))",
+ syntaxError: synErrGroupNoInitiator,
+ },
+ {
+ pattern: "(a))",
+ syntaxError: synErrGroupNoInitiator,
+ },
+ // Alternation.
+ {
+ pattern: "Mulder|Scully",
+ ast: genAltNode(
+ genConcatNode(
+ newSymbolNode('M'),
+ newSymbolNode('u'),
+ newSymbolNode('l'),
+ newSymbolNode('d'),
+ newSymbolNode('e'),
+ newSymbolNode('r'),
+ ),
+ genConcatNode(
+ newSymbolNode('S'),
+ newSymbolNode('c'),
+ newSymbolNode('u'),
+ newSymbolNode('l'),
+ newSymbolNode('l'),
+ newSymbolNode('y'),
+ ),
+ ),
+ },
+ {
+ pattern: "Langly|Frohike|Byers",
+ ast: genAltNode(
+ genConcatNode(
+ newSymbolNode('L'),
+ newSymbolNode('a'),
+ newSymbolNode('n'),
+ newSymbolNode('g'),
+ newSymbolNode('l'),
+ newSymbolNode('y'),
+ ),
+ genConcatNode(
+ newSymbolNode('F'),
+ newSymbolNode('r'),
+ newSymbolNode('o'),
+ newSymbolNode('h'),
+ newSymbolNode('i'),
+ newSymbolNode('k'),
+ newSymbolNode('e'),
+ ),
+ genConcatNode(
+ newSymbolNode('B'),
+ newSymbolNode('y'),
+ newSymbolNode('e'),
+ newSymbolNode('r'),
+ newSymbolNode('s'),
+ ),
+ ),
+ },
+ {
+ pattern: "|",
+ syntaxError: synErrAltLackOfOperand,
+ },
+ {
+ pattern: "||",
+ syntaxError: synErrAltLackOfOperand,
+ },
+ {
+ pattern: "Mulder|",
+ syntaxError: synErrAltLackOfOperand,
+ },
+ {
+ pattern: "|Scully",
+ syntaxError: synErrAltLackOfOperand,
+ },
+ {
+ pattern: "Langly|Frohike|",
+ syntaxError: synErrAltLackOfOperand,
+ },
+ {
+ pattern: "Langly||Byers",
+ syntaxError: synErrAltLackOfOperand,
+ },
+ {
+ pattern: "|Frohike|Byers",
+ syntaxError: synErrAltLackOfOperand,
+ },
+ {
+ pattern: "|Frohike|",
+ syntaxError: synErrAltLackOfOperand,
+ },
+ {
+ pattern: "Fox(|)Mulder",
+ syntaxError: synErrAltLackOfOperand,
+ },
+ {
+ pattern: "(Fox|)Mulder",
+ syntaxError: synErrAltLackOfOperand,
+ },
+ {
+ pattern: "Fox(|Mulder)",
+ syntaxError: synErrAltLackOfOperand,
+ },
+ }
+ for i, tt := range tests {
+ t.Run(fmt.Sprintf("#%v %v", i, tt.pattern), func(t *testing.T) {
+ // Parse every fragment pattern first so that the main tree can be completed with them.
+ fragmentTrees := map[spec.LexKindName]CPTree{}
+ for kind, pattern := range tt.fragments {
+ p := NewParser(kind, strings.NewReader(pattern))
+ root, err := p.Parse()
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ fragmentTrees[kind] = root
+ }
+ err := CompleteFragments(fragmentTrees)
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ p := NewParser(spec.LexKindName("test"), strings.NewReader(tt.pattern))
+ root, err := p.Parse()
+ if tt.syntaxError != nil {
+ // printCPTree(os.Stdout, ast, "", "")
+ if err == nil {
+ t.Fatalf("expected syntax error: got: nil")
+ }
+ _, synErr := p.Error()
+ if synErr != tt.syntaxError {
+ t.Fatalf("unexpected syntax error: want: %v, got: %v", tt.syntaxError, synErr)
+ }
+ if root != nil {
+ t.Fatalf("tree is not nil")
+ }
+ } else {
+ if err != nil {
+ t.Fatal(err)
+ }
+ if root == nil {
+ t.Fatal("tree is nil")
+ }
+
+ complete, err := ApplyFragments(root, fragmentTrees)
+ if err != nil {
+ t.Fatal(err)
+ }
+ if !complete {
+ t.Fatalf("incomplete fragments")
+ }
+
+ // printCPTree(os.Stdout, ast, "", "")
+ if !tt.skipTestAST {
+ // Compare against the tree below the root node; the expected trees have no root.
+ r := root.(*rootNode)
+ testAST(t, tt.ast, r.tree)
+ }
+ }
+ })
+ }
+}
+
+// TestParse_ContributoryPropertyIsNotExposed checks that a parser created with NewParser
+// rejects `\p{<property>=yes}` with synErrCharPropUnsupported for every property listed by
+// ucd.ContributoryProperties.
+func TestParse_ContributoryPropertyIsNotExposed(t *testing.T) {
+	for _, cProp := range ucd.ContributoryProperties() {
+		name := fmt.Sprintf("%v", cProp)
+		pattern := fmt.Sprintf(`\p{%v=yes}`, cProp)
+		t.Run(name, func(t *testing.T) {
+			p := NewParser(spec.LexKindName("test"), strings.NewReader(pattern))
+			root, err := p.Parse()
+			if err == nil {
+				t.Fatalf("expected syntax error: got: nil")
+			}
+			if _, synErr := p.Error(); synErr != synErrCharPropUnsupported {
+				t.Fatalf("unexpected syntax error: want: %v, got: %v", synErrCharPropUnsupported, synErr)
+			}
+			if root != nil {
+				t.Fatalf("tree is not nil")
+			}
+		})
+	}
+}
+
+// testAST fails the test unless actual has the same shape as expected: at every position
+// the dynamic node types must be equal, and symbol nodes must cover the same range.
+// Both trees are walked in parallel through children().
+func testAST(t *testing.T, expected, actual CPTree) {
+	t.Helper()
+
+	if eTy, aTy := reflect.TypeOf(expected), reflect.TypeOf(actual); eTy != aTy {
+		t.Fatalf("unexpected node: want: %+v, got: %+v", eTy, aTy)
+	}
+	if actual == nil {
+		return
+	}
+
+	// The types are known to be equal here, so the assertion on actual cannot fail.
+	if e, ok := expected.(*symbolNode); ok {
+		a := actual.(*symbolNode)
+		if a.From != e.From || a.To != e.To {
+			t.Fatalf("unexpected node: want: %+v, got: %+v", e, a)
+		}
+	}
+
+	eLeft, eRight := expected.children()
+	aLeft, aRight := actual.children()
+	testAST(t, eLeft, aLeft)
+	testAST(t, eRight, aRight)
+}
diff --git a/compiler/parser/tree.go b/compiler/parser/tree.go
new file mode 100644
index 0000000..77d2e46
--- /dev/null
+++ b/compiler/parser/tree.go
@@ -0,0 +1,451 @@
+package parser
+
+import (
+ "fmt"
+ "io"
+ "sort"
+
+ "github.com/nihei9/maleeni/spec"
+)
+
+ // CPRange is a range of code points. Both ends are inclusive: a single code
+ // point is represented with From equal to To.
+ type CPRange struct {
+ // From is the first code point of the range.
+ From rune
+ // To is the last code point of the range.
+ To rune
+ }
+
+ // CPTree is a node of a pattern tree whose leaves are code point ranges.
+ //
+ // The exported accessors let a caller inspect a node without knowing its
+ // concrete type: each one reports through its final boolean result whether the
+ // node is of the corresponding kind.
+ type CPTree interface {
+ fmt.Stringer
+ // Range returns the code point range and true when the node is a symbol.
+ Range() (rune, rune, bool)
+ // Optional returns the quantified subtree and true when the node is an
+ // optional (0 or 1 times) quantifier.
+ Optional() (CPTree, bool)
+ // Repeatable returns the quantified subtree and true when the node is a
+ // repeatable (>= 0 times) quantifier.
+ Repeatable() (CPTree, bool)
+ // Concatenation returns both operands and true when the node is a
+ // concatenation.
+ Concatenation() (CPTree, CPTree, bool)
+ // Alternatives returns both operands and true when the node is an
+ // alternation.
+ Alternatives() (CPTree, CPTree, bool)
+ // Describe returns the kind name of the tree and the sorted names of the
+ // fragments it still references. Only root nodes support it; the other
+ // node types return an error.
+ Describe() (spec.LexKindName, []spec.LexKindName, error)
+
+ // children returns the left and right subtrees; an absent subtree is nil.
+ children() (CPTree, CPTree)
+ // clone returns a copy of the subtree rooted at the node.
+ clone() CPTree
+ }
+
+ // Compile-time checks that every node type implements CPTree.
+ var (
+ 	_ CPTree = (*rootNode)(nil)
+ 	_ CPTree = (*symbolNode)(nil)
+ 	_ CPTree = (*concatNode)(nil)
+ 	_ CPTree = (*altNode)(nil)
+ 	_ CPTree = (*quantifierNode)(nil)
+ 	_ CPTree = (*fragmentNode)(nil)
+ )
+
+ // rootNode is the top of a pattern tree. Besides the tree itself it keeps the
+ // kind name the pattern belongs to and an index of the fragment nodes found
+ // in the tree, grouped by fragment name.
+ type rootNode struct {
+ 	kind      spec.LexKindName
+ 	tree      CPTree
+ 	fragments map[spec.LexKindName][]*fragmentNode
+ }
+
+ // newRootNode wraps t in a root node for kind and indexes every fragment node
+ // reachable from t.
+ func newRootNode(kind spec.LexKindName, t CPTree) *rootNode {
+ 	root := &rootNode{
+ 		kind:      kind,
+ 		tree:      t,
+ 		fragments: map[spec.LexKindName][]*fragmentNode{},
+ 	}
+ 	collectFragments(t, root.fragments)
+
+ 	return root
+ }
+
+func collectFragments(n CPTree, fragments map[spec.LexKindName][]*fragmentNode) {
+ if n == nil {
+ return
+ }
+
+ if f, ok := n.(*fragmentNode); ok {
+ fragments[f.kind] = append(fragments[f.kind], f)
+ return
+ }
+
+ l, r := n.children()
+ collectFragments(l, fragments)
+ collectFragments(r, fragments)
+}
+
+ // String reports the kind name and the number of distinct fragment names
+ // that are still indexed on the root.
+ func (n *rootNode) String() string {
+ 	return fmt.Sprintf("root: %v: %v fragments", n.kind, len(n.fragments))
+ }
+
+ // Range forwards to the wrapped tree.
+ func (n *rootNode) Range() (rune, rune, bool) {
+ 	return n.tree.Range()
+ }
+
+ // Optional forwards to the wrapped tree.
+ func (n *rootNode) Optional() (CPTree, bool) {
+ 	return n.tree.Optional()
+ }
+
+ // Repeatable forwards to the wrapped tree.
+ func (n *rootNode) Repeatable() (CPTree, bool) {
+ 	return n.tree.Repeatable()
+ }
+
+ // Concatenation forwards to the wrapped tree.
+ func (n *rootNode) Concatenation() (CPTree, CPTree, bool) {
+ 	return n.tree.Concatenation()
+ }
+
+ // Alternatives forwards to the wrapped tree.
+ func (n *rootNode) Alternatives() (CPTree, CPTree, bool) {
+ 	return n.tree.Alternatives()
+ }
+
+ // Describe returns the kind name of the root and the names of the fragments
+ // still indexed on it, in ascending order. The error is always nil.
+ func (n *rootNode) Describe() (spec.LexKindName, []spec.LexKindName, error) {
+ 	var names []spec.LexKindName
+ 	for name := range n.fragments {
+ 		names = append(names, name)
+ 	}
+ 	// Map iteration order is random; sort to get a stable result.
+ 	sort.Slice(names, func(a, b int) bool {
+ 		return names[a] < names[b]
+ 	})
+
+ 	return n.kind, names, nil
+ }
+
+ // children returns the children of the wrapped tree, not the tree itself.
+ func (n *rootNode) children() (CPTree, CPTree) {
+ 	return n.tree.children()
+ }
+
+ // clone returns a copy of the wrapped tree; the result is not a root node.
+ func (n *rootNode) clone() CPTree {
+ 	return n.tree.clone()
+ }
+
+ // incomplete reports whether any fragment is still indexed on the root.
+ func (n *rootNode) incomplete() bool {
+ 	return len(n.fragments) != 0
+ }
+
+ // applyFragment gives every fragment node named kind in this tree its own
+ // copy of the given fragment tree and removes kind from the index of pending
+ // fragments.
+ //
+ // fragment must be a *rootNode that has no pending fragments of its own;
+ // otherwise an error is returned. When the tree does not reference kind the
+ // call is a no-op.
+ func (n *rootNode) applyFragment(kind spec.LexKindName, fragment CPTree) error {
+ 	src, isRoot := fragment.(*rootNode)
+ 	switch {
+ 	case !isRoot:
+ 		return fmt.Errorf("applyFragment can take only *rootNode: %T", fragment)
+ 	case src.incomplete():
+ 		return fmt.Errorf("fragment is incomplete")
+ 	}
+
+ 	if targets, found := n.fragments[kind]; found {
+ 		for _, target := range targets {
+ 			// Each reference receives a separate copy of the fragment tree.
+ 			target.tree = src.clone()
+ 		}
+ 		delete(n.fragments, kind)
+ 	}
+
+ 	return nil
+ }
+
+ // symbolNode is a leaf of the tree: an inclusive range of code points.
+ type symbolNode struct {
+ 	CPRange
+ }
+
+ // newSymbolNode returns a symbol node matching the single code point cp.
+ func newSymbolNode(cp rune) *symbolNode {
+ 	return newRangeSymbolNode(cp, cp)
+ }
+
+ // newRangeSymbolNode returns a symbol node matching the range from..to.
+ func newRangeSymbolNode(from, to rune) *symbolNode {
+ 	rng := CPRange{From: from, To: to}
+ 	return &symbolNode{CPRange: rng}
+ }
+
+ // String shows both ends of the range.
+ func (n *symbolNode) String() string {
+ 	return fmt.Sprintf("symbol: %v - %v", n.From, n.To)
+ }
+
+ // Range returns both ends of the range and true.
+ func (n *symbolNode) Range() (rune, rune, bool) {
+ 	return n.From, n.To, true
+ }
+
+ // Optional reports that a symbol is not an optional quantifier.
+ func (n *symbolNode) Optional() (CPTree, bool) {
+ 	return nil, false
+ }
+
+ // Repeatable reports that a symbol is not a repeatable quantifier.
+ func (n *symbolNode) Repeatable() (CPTree, bool) {
+ 	return nil, false
+ }
+
+ // Concatenation reports that a symbol is not a concatenation.
+ func (n *symbolNode) Concatenation() (CPTree, CPTree, bool) {
+ 	return nil, nil, false
+ }
+
+ // Alternatives reports that a symbol is not an alternation.
+ func (n *symbolNode) Alternatives() (CPTree, CPTree, bool) {
+ 	return nil, nil, false
+ }
+
+ // Describe is not supported on symbol nodes and always returns an error.
+ func (n *symbolNode) Describe() (spec.LexKindName, []spec.LexKindName, error) {
+ 	return spec.LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n)
+ }
+
+ // children returns nil for both sides: a symbol is a leaf.
+ func (n *symbolNode) children() (CPTree, CPTree) {
+ 	return nil, nil
+ }
+
+ // clone returns a new symbol node covering the same range.
+ func (n *symbolNode) clone() CPTree {
+ 	return &symbolNode{CPRange: n.CPRange}
+ }
+
+ // concatNode is the concatenation of two subtrees: left followed by right.
+ type concatNode struct {
+ 	left  CPTree
+ 	right CPTree
+ }
+
+ // newConcatNode returns a node that concatenates left and right.
+ func newConcatNode(left, right CPTree) *concatNode {
+ 	return &concatNode{
+ 		left:  left,
+ 		right: right,
+ 	}
+ }
+
+ // String returns the fixed label of the node kind.
+ func (n *concatNode) String() string {
+ 	return "concat"
+ }
+
+ // Range reports that a concatenation is not a symbol.
+ func (n *concatNode) Range() (rune, rune, bool) {
+ 	return 0, 0, false
+ }
+
+ // Optional reports that a concatenation is not an optional quantifier.
+ func (n *concatNode) Optional() (CPTree, bool) {
+ 	return nil, false
+ }
+
+ // Repeatable reports that a concatenation is not a repeatable quantifier.
+ func (n *concatNode) Repeatable() (CPTree, bool) {
+ 	return nil, false
+ }
+
+ // Concatenation returns both operands and true.
+ func (n *concatNode) Concatenation() (CPTree, CPTree, bool) {
+ 	return n.left, n.right, true
+ }
+
+ // Alternatives reports that a concatenation is not an alternation.
+ func (n *concatNode) Alternatives() (CPTree, CPTree, bool) {
+ 	return nil, nil, false
+ }
+
+ // Describe is not supported on concatenation nodes and always returns an
+ // error.
+ func (n *concatNode) Describe() (spec.LexKindName, []spec.LexKindName, error) {
+ 	return spec.LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n)
+ }
+
+ // children returns the two operands.
+ func (n *concatNode) children() (CPTree, CPTree) {
+ 	return n.left, n.right
+ }
+
+ // clone returns a copy of the node with both operands cloned. A nil receiver
+ // yields a nil tree.
+ func (n *concatNode) clone() CPTree {
+ 	if n == nil {
+ 		return nil
+ 	}
+ 	return newConcatNode(n.left.clone(), n.right.clone())
+ }
+
+ // altNode is the alternation of two subtrees: either left or right.
+ type altNode struct {
+ 	left  CPTree
+ 	right CPTree
+ }
+
+ // newAltNode returns a node that alternates between left and right.
+ func newAltNode(left, right CPTree) *altNode {
+ 	return &altNode{
+ 		left:  left,
+ 		right: right,
+ 	}
+ }
+
+ // String returns the fixed label of the node kind.
+ func (n *altNode) String() string {
+ 	return "alt"
+ }
+
+ // Range reports that an alternation is not a symbol.
+ func (n *altNode) Range() (rune, rune, bool) {
+ 	return 0, 0, false
+ }
+
+ // Optional reports that an alternation is not an optional quantifier.
+ func (n *altNode) Optional() (CPTree, bool) {
+ 	return nil, false
+ }
+
+ // Repeatable reports that an alternation is not a repeatable quantifier.
+ func (n *altNode) Repeatable() (CPTree, bool) {
+ 	return nil, false
+ }
+
+ // Concatenation reports that an alternation is not a concatenation.
+ func (n *altNode) Concatenation() (CPTree, CPTree, bool) {
+ 	return nil, nil, false
+ }
+
+ // Alternatives returns both operands and true.
+ func (n *altNode) Alternatives() (CPTree, CPTree, bool) {
+ 	return n.left, n.right, true
+ }
+
+ // Describe is not supported on alternation nodes and always returns an error.
+ func (n *altNode) Describe() (spec.LexKindName, []spec.LexKindName, error) {
+ 	return spec.LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n)
+ }
+
+ // children returns the two operands.
+ func (n *altNode) children() (CPTree, CPTree) {
+ 	return n.left, n.right
+ }
+
+ // clone returns a copy of the node with both operands cloned. A nil receiver
+ // yields a nil tree, matching concatNode.clone.
+ func (n *altNode) clone() CPTree {
+ 	if n == nil {
+ 		return nil
+ 	}
+ 	return newAltNode(n.left.clone(), n.right.clone())
+ }
+
+ // quantifierNode applies a quantifier to a subtree. The constructors in this
+ // file set exactly one of the two flags: repeatable (>= 0 times) or optional
+ // (0 or 1 times).
+ type quantifierNode struct {
+ 	optional   bool
+ 	repeatable bool
+ 	tree       CPTree
+ }
+
+ // String describes the quantifier; repeatable takes precedence over optional.
+ func (n *quantifierNode) String() string {
+ 	if n.repeatable {
+ 		return "repeatable (>= 0 times)"
+ 	}
+ 	if n.optional {
+ 		return "optional (0 or 1 times)"
+ 	}
+ 	return "invalid quantifier"
+ }
+
+ // newRepeatNode returns a node matching t zero or more times.
+ func newRepeatNode(t CPTree) *quantifierNode {
+ 	q := &quantifierNode{tree: t}
+ 	q.repeatable = true
+ 	return q
+ }
+
+ // newRepeatOneOrMoreNode returns t followed by zero or more repetitions of a
+ // copy of t.
+ func newRepeatOneOrMoreNode(t CPTree) *concatNode {
+ 	return newConcatNode(t, newRepeatNode(t.clone()))
+ }
+
+ // newOptionNode returns a node matching t zero or one time.
+ func newOptionNode(t CPTree) *quantifierNode {
+ 	q := &quantifierNode{tree: t}
+ 	q.optional = true
+ 	return q
+ }
+
+ // Range reports that a quantifier is not a symbol.
+ func (n *quantifierNode) Range() (rune, rune, bool) {
+ 	return 0, 0, false
+ }
+
+ // Optional returns the quantified subtree and whether the node is optional.
+ func (n *quantifierNode) Optional() (CPTree, bool) {
+ 	return n.tree, n.optional
+ }
+
+ // Repeatable returns the quantified subtree and whether the node is
+ // repeatable.
+ func (n *quantifierNode) Repeatable() (CPTree, bool) {
+ 	return n.tree, n.repeatable
+ }
+
+ // Concatenation reports that a quantifier is not a concatenation.
+ func (n *quantifierNode) Concatenation() (CPTree, CPTree, bool) {
+ 	return nil, nil, false
+ }
+
+ // Alternatives reports that a quantifier is not an alternation.
+ func (n *quantifierNode) Alternatives() (CPTree, CPTree, bool) {
+ 	return nil, nil, false
+ }
+
+ // Describe is not supported on quantifier nodes and always returns an error.
+ func (n *quantifierNode) Describe() (spec.LexKindName, []spec.LexKindName, error) {
+ 	return spec.LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n)
+ }
+
+ // children returns the quantified subtree on the left and nil on the right.
+ func (n *quantifierNode) children() (CPTree, CPTree) {
+ 	return n.tree, nil
+ }
+
+ // clone returns a repeat node when the receiver is repeatable and an option
+ // node otherwise, wrapping a copy of the quantified subtree.
+ func (n *quantifierNode) clone() CPTree {
+ 	inner := n.tree.clone()
+ 	if !n.repeatable {
+ 		return newOptionNode(inner)
+ 	}
+ 	return newRepeatNode(inner)
+ }
+
+ // fragmentNode is a reference to a named fragment. tree holds the fragment's
+ // pattern and may be nil while the reference is unresolved; every method
+ // therefore tolerates a nil tree instead of dereferencing it.
+ type fragmentNode struct {
+ 	kind spec.LexKindName
+ 	tree CPTree
+ }
+
+ // newFragmentNode returns a reference to the fragment named kind. t is the
+ // fragment's tree, or nil when it is not known yet.
+ func newFragmentNode(kind spec.LexKindName, t CPTree) *fragmentNode {
+ 	return &fragmentNode{
+ 		kind: kind,
+ 		tree: t,
+ 	}
+ }
+
+ // String shows the name of the referenced fragment.
+ func (n *fragmentNode) String() string {
+ 	return fmt.Sprintf("fragment: %v", n.kind)
+ }
+
+ // Range forwards to the fragment's tree; an unresolved fragment is not a
+ // symbol.
+ func (n *fragmentNode) Range() (rune, rune, bool) {
+ 	if n.tree == nil {
+ 		return 0, 0, false
+ 	}
+ 	return n.tree.Range()
+ }
+
+ // Optional forwards to the fragment's tree; an unresolved fragment is not an
+ // optional quantifier.
+ func (n *fragmentNode) Optional() (CPTree, bool) {
+ 	if n.tree == nil {
+ 		return nil, false
+ 	}
+ 	return n.tree.Optional()
+ }
+
+ // Repeatable forwards to the fragment's tree; an unresolved fragment is not a
+ // repeatable quantifier.
+ func (n *fragmentNode) Repeatable() (CPTree, bool) {
+ 	if n.tree == nil {
+ 		return nil, false
+ 	}
+ 	return n.tree.Repeatable()
+ }
+
+ // Concatenation forwards to the fragment's tree; an unresolved fragment is
+ // not a concatenation.
+ func (n *fragmentNode) Concatenation() (CPTree, CPTree, bool) {
+ 	if n.tree == nil {
+ 		return nil, nil, false
+ 	}
+ 	return n.tree.Concatenation()
+ }
+
+ // Alternatives forwards to the fragment's tree; an unresolved fragment is not
+ // an alternation.
+ func (n *fragmentNode) Alternatives() (CPTree, CPTree, bool) {
+ 	if n.tree == nil {
+ 		return nil, nil, false
+ 	}
+ 	return n.tree.Alternatives()
+ }
+
+ // Describe is not supported on fragment nodes and always returns an error.
+ func (n *fragmentNode) Describe() (spec.LexKindName, []spec.LexKindName, error) {
+ 	return spec.LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n)
+ }
+
+ // children returns the children of the fragment's tree, or nil for both
+ // sides while the fragment is unresolved.
+ func (n *fragmentNode) children() (CPTree, CPTree) {
+ 	if n.tree == nil {
+ 		return nil, nil
+ 	}
+ 	return n.tree.children()
+ }
+
+ // clone returns a new reference to the same fragment with a copy of the
+ // tree, or with a nil tree when the fragment is unresolved.
+ func (n *fragmentNode) clone() CPTree {
+ 	if n.tree == nil {
+ 		return newFragmentNode(n.kind, nil)
+ 	}
+ 	return newFragmentNode(n.kind, n.tree.clone())
+ }
+
+func printCPTree(w io.Writer, t CPTree, ruledLine string, childRuledLinePrefix string) {
+ if t == nil {
+ return
+ }
+ fmt.Fprintf(w, "%v%v\n", ruledLine, t)
+ left, right := t.children()
+ children := []CPTree{}
+ if left != nil {
+ children = append(children, left)
+ }
+ if right != nil {
+ children = append(children, right)
+ }
+ num := len(children)
+ for i, child := range children {
+ line := "└─ "
+ if num > 1 {
+ if i == 0 {
+ line = "├─ "
+ } else if i < num-1 {
+ line = "│ "
+ }
+ }
+ prefix := "│ "
+ if i >= num-1 {
+ prefix = " "
+ }
+ printCPTree(w, child, childRuledLinePrefix+line, childRuledLinePrefix+prefix)
+ }
+}