aboutsummaryrefslogtreecommitdiff
path: root/compiler/parser
diff options
context:
space:
mode:
authorEuAndreh <eu@euandre.org>2024-11-29 16:53:50 -0300
committerEuAndreh <eu@euandre.org>2024-11-29 16:53:50 -0300
commitb987d9ec5b7f73aa90ab8d966c313ef34ce150ef (patch)
treea678c49e609d241e70f63e65e563bd3028c01865 /compiler/parser
parentrm example/* (diff)
downloadtre-b987d9ec5b7f73aa90ab8d966c313ef34ce150ef.tar.gz
tre-b987d9ec5b7f73aa90ab8d966c313ef34ce150ef.tar.xz
Absorb compiler/parser/
Diffstat (limited to 'compiler/parser')
-rw-r--r--compiler/parser/error.go36
-rw-r--r--compiler/parser/fragment.go72
-rw-r--r--compiler/parser/lexer.go594
-rw-r--r--compiler/parser/lexer_test.go524
-rw-r--r--compiler/parser/parser.go531
-rw-r--r--compiler/parser/parser_test.go1389
-rw-r--r--compiler/parser/tree.go459
7 files changed, 0 insertions, 3605 deletions
diff --git a/compiler/parser/error.go b/compiler/parser/error.go
deleted file mode 100644
index be81da4..0000000
--- a/compiler/parser/error.go
+++ /dev/null
@@ -1,36 +0,0 @@
-package parser
-
-import "fmt"
-
-var (
- ParseErr = fmt.Errorf("parse error")
-
- // lexical errors
- synErrIncompletedEscSeq = fmt.Errorf("incompleted escape sequence; unexpected EOF following \\")
- synErrInvalidEscSeq = fmt.Errorf("invalid escape sequence")
- synErrInvalidCodePoint = fmt.Errorf("code points must consist of just 4 or 6 hex digits")
- synErrCharPropInvalidSymbol = fmt.Errorf("invalid character property symbol")
- SynErrFragmentInvalidSymbol = fmt.Errorf("invalid fragment symbol")
-
- // syntax errors
- synErrUnexpectedToken = fmt.Errorf("unexpected token")
- synErrNullPattern = fmt.Errorf("a pattern must be a non-empty byte sequence")
- synErrUnmatchablePattern = fmt.Errorf("a pattern cannot match any characters")
- synErrAltLackOfOperand = fmt.Errorf("an alternation expression must have operands")
- synErrRepNoTarget = fmt.Errorf("a repeat expression must have an operand")
- synErrGroupNoElem = fmt.Errorf("a grouping expression must include at least one character")
- synErrGroupUnclosed = fmt.Errorf("unclosed grouping expression")
- synErrGroupNoInitiator = fmt.Errorf(") needs preceding (")
- synErrGroupInvalidForm = fmt.Errorf("invalid grouping expression")
- synErrBExpNoElem = fmt.Errorf("a bracket expression must include at least one character")
- synErrBExpUnclosed = fmt.Errorf("unclosed bracket expression")
- synErrBExpInvalidForm = fmt.Errorf("invalid bracket expression")
- synErrRangeInvalidOrder = fmt.Errorf("a range expression with invalid order")
- synErrRangePropIsUnavailable = fmt.Errorf("a property expression is unavailable in a range expression")
- synErrRangeInvalidForm = fmt.Errorf("invalid range expression")
- synErrCPExpInvalidForm = fmt.Errorf("invalid code point expression")
- synErrCPExpOutOfRange = fmt.Errorf("a code point must be between U+0000 to U+10FFFF")
- synErrCharPropExpInvalidForm = fmt.Errorf("invalid character property expression")
- synErrCharPropUnsupported = fmt.Errorf("unsupported character property")
- synErrFragmentExpInvalidForm = fmt.Errorf("invalid fragment expression")
-)
diff --git a/compiler/parser/fragment.go b/compiler/parser/fragment.go
deleted file mode 100644
index 5680b55..0000000
--- a/compiler/parser/fragment.go
+++ /dev/null
@@ -1,72 +0,0 @@
-package parser
-
-import (
- "fmt"
-
- "github.com/nihei9/maleeni/spec"
-)
-
-type incompleteFragment struct {
- kind spec.LexKindName
- root *rootNode
-}
-
-func CompleteFragments(fragments map[spec.LexKindName]CPTree) error {
- if len(fragments) == 0 {
- return nil
- }
-
- completeFragments := map[spec.LexKindName]CPTree{}
- incompleteFragments := []*incompleteFragment{}
- for kind, tree := range fragments {
- root, ok := tree.(*rootNode)
- if !ok {
- return fmt.Errorf("CompleteFragments can take only *rootNode: %T", tree)
- }
- if root.incomplete() {
- incompleteFragments = append(incompleteFragments, &incompleteFragment{
- kind: kind,
- root: root,
- })
- } else {
- completeFragments[kind] = root
- }
- }
- for len(incompleteFragments) > 0 {
- lastIncompCount := len(incompleteFragments)
- remainingFragments := []*incompleteFragment{}
- for _, e := range incompleteFragments {
- complete, err := ApplyFragments(e.root, completeFragments)
- if err != nil {
- return err
- }
- if !complete {
- remainingFragments = append(remainingFragments, e)
- } else {
- completeFragments[e.kind] = e.root
- }
- }
- incompleteFragments = remainingFragments
- if len(incompleteFragments) == lastIncompCount {
- return ParseErr
- }
- }
-
- return nil
-}
-
-func ApplyFragments(t CPTree, fragments map[spec.LexKindName]CPTree) (bool, error) {
- root, ok := t.(*rootNode)
- if !ok {
- return false, fmt.Errorf("ApplyFragments can take only *rootNode type: %T", t)
- }
-
- for name, frag := range fragments {
- err := root.applyFragment(name, frag)
- if err != nil {
- return false, err
- }
- }
-
- return !root.incomplete(), nil
-}
diff --git a/compiler/parser/lexer.go b/compiler/parser/lexer.go
deleted file mode 100644
index 3861825..0000000
--- a/compiler/parser/lexer.go
+++ /dev/null
@@ -1,594 +0,0 @@
-package parser
-
-import (
- "bufio"
- "fmt"
- "io"
- "strings"
-)
-
-type tokenKind string
-
-const (
- tokenKindChar tokenKind = "char"
- tokenKindAnyChar tokenKind = "."
- tokenKindRepeat tokenKind = "*"
- tokenKindRepeatOneOrMore tokenKind = "+"
- tokenKindOption tokenKind = "?"
- tokenKindAlt tokenKind = "|"
- tokenKindGroupOpen tokenKind = "("
- tokenKindGroupClose tokenKind = ")"
- tokenKindBExpOpen tokenKind = "["
- tokenKindInverseBExpOpen tokenKind = "[^"
- tokenKindBExpClose tokenKind = "]"
- tokenKindCharRange tokenKind = "-"
- tokenKindCodePointLeader tokenKind = "\\u"
- tokenKindCharPropLeader tokenKind = "\\p"
- tokenKindFragmentLeader tokenKind = "\\f"
- tokenKindLBrace tokenKind = "{"
- tokenKindRBrace tokenKind = "}"
- tokenKindEqual tokenKind = "="
- tokenKindCodePoint tokenKind = "code point"
- tokenKindCharPropSymbol tokenKind = "character property symbol"
- tokenKindFragmentSymbol tokenKind = "fragment symbol"
- tokenKindEOF tokenKind = "eof"
-)
-
-type token struct {
- kind tokenKind
- char rune
- propSymbol string
- codePoint string
- fragmentSymbol string
-}
-
-const nullChar = '\u0000'
-
-func newToken(kind tokenKind, char rune) *token {
- return &token{
- kind: kind,
- char: char,
- }
-}
-
-func newCodePointToken(codePoint string) *token {
- return &token{
- kind: tokenKindCodePoint,
- codePoint: codePoint,
- }
-}
-
-func newCharPropSymbolToken(propSymbol string) *token {
- return &token{
- kind: tokenKindCharPropSymbol,
- propSymbol: propSymbol,
- }
-}
-
-func newFragmentSymbolToken(fragmentSymbol string) *token {
- return &token{
- kind: tokenKindFragmentSymbol,
- fragmentSymbol: fragmentSymbol,
- }
-}
-
-type lexerMode string
-
-const (
- lexerModeDefault lexerMode = "default"
- lexerModeBExp lexerMode = "bracket expression"
- lexerModeCPExp lexerMode = "code point expression"
- lexerModeCharPropExp lexerMode = "character property expression"
- lexerModeFragmentExp lexerMode = "fragment expression"
-)
-
-type lexerModeStack struct {
- stack []lexerMode
-}
-
-func newLexerModeStack() *lexerModeStack {
- return &lexerModeStack{
- stack: []lexerMode{
- lexerModeDefault,
- },
- }
-}
-
-func (s *lexerModeStack) top() lexerMode {
- return s.stack[len(s.stack)-1]
-}
-
-func (s *lexerModeStack) push(m lexerMode) {
- s.stack = append(s.stack, m)
-}
-
-func (s *lexerModeStack) pop() {
- s.stack = s.stack[:len(s.stack)-1]
-}
-
-type rangeState string
-
-// [a-z]
-// ^^^^
-// |||`-- ready
-// ||`-- expect range terminator
-// |`-- read range initiator
-// `-- ready
-const (
- rangeStateReady rangeState = "ready"
- rangeStateReadRangeInitiator rangeState = "read range initiator"
- rangeStateExpectRangeTerminator rangeState = "expect range terminator"
-)
-
-type lexer struct {
- src *bufio.Reader
- peekChar2 rune
- peekEOF2 bool
- peekChar1 rune
- peekEOF1 bool
- lastChar rune
- reachedEOF bool
- prevChar1 rune
- prevEOF1 bool
- prevChar2 rune
- pervEOF2 bool
- modeStack *lexerModeStack
- rangeState rangeState
-
- errCause error
- errDetail string
-}
-
-func newLexer(src io.Reader) *lexer {
- return &lexer{
- src: bufio.NewReader(src),
- peekChar2: nullChar,
- peekEOF2: false,
- peekChar1: nullChar,
- peekEOF1: false,
- lastChar: nullChar,
- reachedEOF: false,
- prevChar1: nullChar,
- prevEOF1: false,
- prevChar2: nullChar,
- pervEOF2: false,
- modeStack: newLexerModeStack(),
- rangeState: rangeStateReady,
- }
-}
-
-func (l *lexer) error() (string, error) {
- return l.errDetail, l.errCause
-}
-
-func (l *lexer) next() (*token, error) {
- c, eof, err := l.read()
- if err != nil {
- return nil, err
- }
- if eof {
- return newToken(tokenKindEOF, nullChar), nil
- }
-
- switch l.modeStack.top() {
- case lexerModeBExp:
- tok, err := l.nextInBExp(c)
- if err != nil {
- return nil, err
- }
- if tok.kind == tokenKindChar || tok.kind == tokenKindCodePointLeader || tok.kind == tokenKindCharPropLeader {
- switch l.rangeState {
- case rangeStateReady:
- l.rangeState = rangeStateReadRangeInitiator
- case rangeStateExpectRangeTerminator:
- l.rangeState = rangeStateReady
- }
- }
- switch tok.kind {
- case tokenKindBExpClose:
- l.modeStack.pop()
- case tokenKindCharRange:
- l.rangeState = rangeStateExpectRangeTerminator
- case tokenKindCodePointLeader:
- l.modeStack.push(lexerModeCPExp)
- case tokenKindCharPropLeader:
- l.modeStack.push(lexerModeCharPropExp)
- }
- return tok, nil
- case lexerModeCPExp:
- tok, err := l.nextInCodePoint(c)
- if err != nil {
- return nil, err
- }
- switch tok.kind {
- case tokenKindRBrace:
- l.modeStack.pop()
- }
- return tok, nil
- case lexerModeCharPropExp:
- tok, err := l.nextInCharProp(c)
- if err != nil {
- return nil, err
- }
- switch tok.kind {
- case tokenKindRBrace:
- l.modeStack.pop()
- }
- return tok, nil
- case lexerModeFragmentExp:
- tok, err := l.nextInFragment(c)
- if err != nil {
- return nil, err
- }
- switch tok.kind {
- case tokenKindRBrace:
- l.modeStack.pop()
- }
- return tok, nil
- default:
- tok, err := l.nextInDefault(c)
- if err != nil {
- return nil, err
- }
- switch tok.kind {
- case tokenKindBExpOpen:
- l.modeStack.push(lexerModeBExp)
- l.rangeState = rangeStateReady
- case tokenKindInverseBExpOpen:
- l.modeStack.push(lexerModeBExp)
- l.rangeState = rangeStateReady
- case tokenKindCodePointLeader:
- l.modeStack.push(lexerModeCPExp)
- case tokenKindCharPropLeader:
- l.modeStack.push(lexerModeCharPropExp)
- case tokenKindFragmentLeader:
- l.modeStack.push(lexerModeFragmentExp)
- }
- return tok, nil
- }
-}
-
-func (l *lexer) nextInDefault(c rune) (*token, error) {
- switch c {
- case '*':
- return newToken(tokenKindRepeat, nullChar), nil
- case '+':
- return newToken(tokenKindRepeatOneOrMore, nullChar), nil
- case '?':
- return newToken(tokenKindOption, nullChar), nil
- case '.':
- return newToken(tokenKindAnyChar, nullChar), nil
- case '|':
- return newToken(tokenKindAlt, nullChar), nil
- case '(':
- return newToken(tokenKindGroupOpen, nullChar), nil
- case ')':
- return newToken(tokenKindGroupClose, nullChar), nil
- case '[':
- c1, eof, err := l.read()
- if err != nil {
- return nil, err
- }
- if eof {
- err := l.restore()
- if err != nil {
- return nil, err
- }
- return newToken(tokenKindBExpOpen, nullChar), nil
- }
- if c1 != '^' {
- err := l.restore()
- if err != nil {
- return nil, err
- }
- return newToken(tokenKindBExpOpen, nullChar), nil
- }
- c2, eof, err := l.read()
- if err != nil {
- return nil, err
- }
- if eof {
- err := l.restore()
- if err != nil {
- return nil, err
- }
- return newToken(tokenKindInverseBExpOpen, nullChar), nil
- }
- if c2 != ']' {
- err := l.restore()
- if err != nil {
- return nil, err
- }
- return newToken(tokenKindInverseBExpOpen, nullChar), nil
- }
- err = l.restore()
- if err != nil {
- return nil, err
- }
- err = l.restore()
- if err != nil {
- return nil, err
- }
- return newToken(tokenKindBExpOpen, nullChar), nil
- case '\\':
- c, eof, err := l.read()
- if err != nil {
- return nil, err
- }
- if eof {
- l.errCause = synErrIncompletedEscSeq
- return nil, ParseErr
- }
- if c == 'u' {
- return newToken(tokenKindCodePointLeader, nullChar), nil
- }
- if c == 'p' {
- return newToken(tokenKindCharPropLeader, nullChar), nil
- }
- if c == 'f' {
- return newToken(tokenKindFragmentLeader, nullChar), nil
- }
- if c == '\\' || c == '.' || c == '*' || c == '+' || c == '?' || c == '|' || c == '(' || c == ')' || c == '[' || c == ']' {
- return newToken(tokenKindChar, c), nil
- }
- l.errCause = synErrInvalidEscSeq
- l.errDetail = fmt.Sprintf("\\%v is not supported", string(c))
- return nil, ParseErr
- default:
- return newToken(tokenKindChar, c), nil
- }
-}
-
-func (l *lexer) nextInBExp(c rune) (*token, error) {
- switch c {
- case '-':
- if l.rangeState != rangeStateReadRangeInitiator {
- return newToken(tokenKindChar, c), nil
- }
- c1, eof, err := l.read()
- if err != nil {
- return nil, err
- }
- if eof {
- err := l.restore()
- if err != nil {
- return nil, err
- }
- return newToken(tokenKindChar, c), nil
- }
- if c1 != ']' {
- err := l.restore()
- if err != nil {
- return nil, err
- }
- return newToken(tokenKindCharRange, nullChar), nil
- }
- err = l.restore()
- if err != nil {
- return nil, err
- }
- return newToken(tokenKindChar, c), nil
- case ']':
- return newToken(tokenKindBExpClose, nullChar), nil
- case '\\':
- c, eof, err := l.read()
- if err != nil {
- return nil, err
- }
- if eof {
- l.errCause = synErrIncompletedEscSeq
- return nil, ParseErr
- }
- if c == 'u' {
- return newToken(tokenKindCodePointLeader, nullChar), nil
- }
- if c == 'p' {
- return newToken(tokenKindCharPropLeader, nullChar), nil
- }
- if c == '\\' || c == '^' || c == '-' || c == ']' {
- return newToken(tokenKindChar, c), nil
- }
- l.errCause = synErrInvalidEscSeq
- l.errDetail = fmt.Sprintf("\\%v is not supported in a bracket expression", string(c))
- return nil, ParseErr
- default:
- return newToken(tokenKindChar, c), nil
- }
-}
-
-func (l *lexer) nextInCodePoint(c rune) (*token, error) {
- switch c {
- case '{':
- return newToken(tokenKindLBrace, nullChar), nil
- case '}':
- return newToken(tokenKindRBrace, nullChar), nil
- default:
- if !isHexDigit(c) {
- l.errCause = synErrInvalidCodePoint
- return nil, ParseErr
- }
- var b strings.Builder
- fmt.Fprint(&b, string(c))
- n := 1
- for {
- c, eof, err := l.read()
- if err != nil {
- return nil, err
- }
- if eof {
- err := l.restore()
- if err != nil {
- return nil, err
- }
- break
- }
- if c == '}' {
- err := l.restore()
- if err != nil {
- return nil, err
- }
- break
- }
- if !isHexDigit(c) || n >= 6 {
- l.errCause = synErrInvalidCodePoint
- return nil, ParseErr
- }
- fmt.Fprint(&b, string(c))
- n++
- }
- cp := b.String()
- cpLen := len(cp)
- if !(cpLen == 4 || cpLen == 6) {
- l.errCause = synErrInvalidCodePoint
- return nil, ParseErr
- }
- return newCodePointToken(b.String()), nil
- }
-}
-
-func isHexDigit(c rune) bool {
- if c >= '0' && c <= '9' || c >= 'A' && c <= 'Z' || c >= 'a' && c <= 'z' {
- return true
- }
- return false
-}
-
-func (l *lexer) nextInCharProp(c rune) (*token, error) {
- switch c {
- case '{':
- return newToken(tokenKindLBrace, nullChar), nil
- case '}':
- return newToken(tokenKindRBrace, nullChar), nil
- case '=':
- return newToken(tokenKindEqual, nullChar), nil
- default:
- var b strings.Builder
- fmt.Fprint(&b, string(c))
- n := 1
- for {
- c, eof, err := l.read()
- if err != nil {
- return nil, err
- }
- if eof {
- err := l.restore()
- if err != nil {
- return nil, err
- }
- break
- }
- if c == '}' || c == '=' {
- err := l.restore()
- if err != nil {
- return nil, err
- }
- break
- }
- fmt.Fprint(&b, string(c))
- n++
- }
- sym := strings.TrimSpace(b.String())
- if len(sym) == 0 {
- l.errCause = synErrCharPropInvalidSymbol
- return nil, ParseErr
- }
- return newCharPropSymbolToken(sym), nil
- }
-}
-
-func (l *lexer) nextInFragment(c rune) (*token, error) {
- switch c {
- case '{':
- return newToken(tokenKindLBrace, nullChar), nil
- case '}':
- return newToken(tokenKindRBrace, nullChar), nil
- default:
- var b strings.Builder
- fmt.Fprint(&b, string(c))
- n := 1
- for {
- c, eof, err := l.read()
- if err != nil {
- return nil, err
- }
- if eof {
- err := l.restore()
- if err != nil {
- return nil, err
- }
- break
- }
- if c == '}' {
- err := l.restore()
- if err != nil {
- return nil, err
- }
- break
- }
- fmt.Fprint(&b, string(c))
- n++
- }
- sym := strings.TrimSpace(b.String())
- if len(sym) == 0 {
- l.errCause = SynErrFragmentInvalidSymbol
- return nil, ParseErr
- }
- return newFragmentSymbolToken(sym), nil
- }
-}
-
-func (l *lexer) read() (rune, bool, error) {
- if l.reachedEOF {
- return l.lastChar, l.reachedEOF, nil
- }
- if l.peekChar1 != nullChar || l.peekEOF1 {
- l.prevChar2 = l.prevChar1
- l.pervEOF2 = l.prevEOF1
- l.prevChar1 = l.lastChar
- l.prevEOF1 = l.reachedEOF
- l.lastChar = l.peekChar1
- l.reachedEOF = l.peekEOF1
- l.peekChar1 = l.peekChar2
- l.peekEOF1 = l.peekEOF2
- l.peekChar2 = nullChar
- l.peekEOF2 = false
- return l.lastChar, l.reachedEOF, nil
- }
- c, _, err := l.src.ReadRune()
- if err != nil {
- if err == io.EOF {
- l.prevChar2 = l.prevChar1
- l.pervEOF2 = l.prevEOF1
- l.prevChar1 = l.lastChar
- l.prevEOF1 = l.reachedEOF
- l.lastChar = nullChar
- l.reachedEOF = true
- return l.lastChar, l.reachedEOF, nil
- }
- return nullChar, false, err
- }
- l.prevChar2 = l.prevChar1
- l.pervEOF2 = l.prevEOF1
- l.prevChar1 = l.lastChar
- l.prevEOF1 = l.reachedEOF
- l.lastChar = c
- l.reachedEOF = false
- return l.lastChar, l.reachedEOF, nil
-}
-
-func (l *lexer) restore() error {
- if l.lastChar == nullChar && !l.reachedEOF {
- return fmt.Errorf("failed to call restore() because the last character is null")
- }
- l.peekChar2 = l.peekChar1
- l.peekEOF2 = l.peekEOF1
- l.peekChar1 = l.lastChar
- l.peekEOF1 = l.reachedEOF
- l.lastChar = l.prevChar1
- l.reachedEOF = l.prevEOF1
- l.prevChar1 = l.prevChar2
- l.prevEOF1 = l.pervEOF2
- l.prevChar2 = nullChar
- l.pervEOF2 = false
- return nil
-}
diff --git a/compiler/parser/lexer_test.go b/compiler/parser/lexer_test.go
deleted file mode 100644
index 055466e..0000000
--- a/compiler/parser/lexer_test.go
+++ /dev/null
@@ -1,524 +0,0 @@
-package parser
-
-import (
- "strings"
- "testing"
-)
-
-func TestLexer(t *testing.T) {
- tests := []struct {
- caption string
- src string
- tokens []*token
- err error
- }{
- {
- caption: "lexer can recognize ordinaly characters",
- src: "123abcいろは",
- tokens: []*token{
- newToken(tokenKindChar, '1'),
- newToken(tokenKindChar, '2'),
- newToken(tokenKindChar, '3'),
- newToken(tokenKindChar, 'a'),
- newToken(tokenKindChar, 'b'),
- newToken(tokenKindChar, 'c'),
- newToken(tokenKindChar, 'い'),
- newToken(tokenKindChar, 'ろ'),
- newToken(tokenKindChar, 'は'),
- newToken(tokenKindEOF, nullChar),
- },
- },
- {
- caption: "lexer can recognize the special characters in default mode",
- src: ".*+?|()[\\u",
- tokens: []*token{
- newToken(tokenKindAnyChar, nullChar),
- newToken(tokenKindRepeat, nullChar),
- newToken(tokenKindRepeatOneOrMore, nullChar),
- newToken(tokenKindOption, nullChar),
- newToken(tokenKindAlt, nullChar),
- newToken(tokenKindGroupOpen, nullChar),
- newToken(tokenKindGroupClose, nullChar),
- newToken(tokenKindBExpOpen, nullChar),
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindEOF, nullChar),
- },
- },
- {
- caption: "lexer can recognize the escape sequences in default mode",
- src: "\\\\\\.\\*\\+\\?\\|\\(\\)\\[",
- tokens: []*token{
- newToken(tokenKindChar, '\\'),
- newToken(tokenKindChar, '.'),
- newToken(tokenKindChar, '*'),
- newToken(tokenKindChar, '+'),
- newToken(tokenKindChar, '?'),
- newToken(tokenKindChar, '|'),
- newToken(tokenKindChar, '('),
- newToken(tokenKindChar, ')'),
- newToken(tokenKindChar, '['),
- newToken(tokenKindEOF, nullChar),
- },
- },
- {
- caption: "], {, and } are treated as an ordinary character in default mode",
- src: "]{}",
- tokens: []*token{
- newToken(tokenKindChar, ']'),
- newToken(tokenKindChar, '{'),
- newToken(tokenKindChar, '}'),
- newToken(tokenKindEOF, nullChar),
- },
- },
- {
- caption: "lexer can recognize the special characters in bracket expression mode",
- src: "[a-z\\u{09AF}][^a-z\\u{09abcf}]",
- tokens: []*token{
- newToken(tokenKindBExpOpen, nullChar),
- newToken(tokenKindChar, 'a'),
- newToken(tokenKindCharRange, nullChar),
- newToken(tokenKindChar, 'z'),
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCodePointToken("09AF"),
- newToken(tokenKindRBrace, nullChar),
- newToken(tokenKindBExpClose, nullChar),
- newToken(tokenKindInverseBExpOpen, nullChar),
- newToken(tokenKindChar, 'a'),
- newToken(tokenKindCharRange, nullChar),
- newToken(tokenKindChar, 'z'),
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCodePointToken("09abcf"),
- newToken(tokenKindRBrace, nullChar),
- newToken(tokenKindBExpClose, nullChar),
- newToken(tokenKindEOF, nullChar),
- },
- },
- {
- caption: "lexer can recognize the escape sequences in bracket expression mode",
- src: "[\\^a\\-z]",
- tokens: []*token{
- newToken(tokenKindBExpOpen, nullChar),
- newToken(tokenKindChar, '^'),
- newToken(tokenKindChar, 'a'),
- newToken(tokenKindChar, '-'),
- newToken(tokenKindChar, 'z'),
- newToken(tokenKindBExpClose, nullChar),
- newToken(tokenKindEOF, nullChar),
- },
- },
- {
- caption: "in a bracket expression, the special characters are also handled as normal characters",
- src: "[\\\\.*+?|()[",
- tokens: []*token{
- newToken(tokenKindBExpOpen, nullChar),
- newToken(tokenKindChar, '\\'),
- newToken(tokenKindChar, '.'),
- newToken(tokenKindChar, '*'),
- newToken(tokenKindChar, '+'),
- newToken(tokenKindChar, '?'),
- newToken(tokenKindChar, '|'),
- newToken(tokenKindChar, '('),
- newToken(tokenKindChar, ')'),
- newToken(tokenKindChar, '['),
- newToken(tokenKindEOF, nullChar),
- },
- },
- {
- caption: "hyphen symbols that appear in bracket expressions are handled as the character range symbol or ordinary characters",
- // [...-...][...-][-...][-]
- // ~~~~~~~ ~ ~ ~
- // ^ ^ ^ ^
- // | | | `-- Ordinary Character (b)
- // | | `-- Ordinary Character (b)
- // | `-- Ordinary Character (b)
- // `-- Character Range (a)
- //
- // a. *-* is handled as a character-range expression.
- // b. *-, -*, or - are handled as ordinary characters.
- src: "[a-z][a-][-z][-][--][---][^a-z][^a-][^-z][^-][^--][^---]",
- tokens: []*token{
- newToken(tokenKindBExpOpen, nullChar),
- newToken(tokenKindChar, 'a'),
- newToken(tokenKindCharRange, nullChar),
- newToken(tokenKindChar, 'z'),
- newToken(tokenKindBExpClose, nullChar),
- newToken(tokenKindBExpOpen, nullChar),
- newToken(tokenKindChar, 'a'),
- newToken(tokenKindChar, '-'),
- newToken(tokenKindBExpClose, nullChar),
- newToken(tokenKindBExpOpen, nullChar),
- newToken(tokenKindChar, '-'),
- newToken(tokenKindChar, 'z'),
- newToken(tokenKindBExpClose, nullChar),
- newToken(tokenKindBExpOpen, nullChar),
- newToken(tokenKindChar, '-'),
- newToken(tokenKindBExpClose, nullChar),
- newToken(tokenKindBExpOpen, nullChar),
- newToken(tokenKindChar, '-'),
- newToken(tokenKindChar, '-'),
- newToken(tokenKindBExpClose, nullChar),
- newToken(tokenKindBExpOpen, nullChar),
- newToken(tokenKindChar, '-'),
- newToken(tokenKindCharRange, nullChar),
- newToken(tokenKindChar, '-'),
- newToken(tokenKindBExpClose, nullChar),
-
- newToken(tokenKindInverseBExpOpen, nullChar),
- newToken(tokenKindChar, 'a'),
- newToken(tokenKindCharRange, nullChar),
- newToken(tokenKindChar, 'z'),
- newToken(tokenKindBExpClose, nullChar),
- newToken(tokenKindInverseBExpOpen, nullChar),
- newToken(tokenKindChar, 'a'),
- newToken(tokenKindChar, '-'),
- newToken(tokenKindBExpClose, nullChar),
- newToken(tokenKindInverseBExpOpen, nullChar),
- newToken(tokenKindChar, '-'),
- newToken(tokenKindChar, 'z'),
- newToken(tokenKindBExpClose, nullChar),
- newToken(tokenKindInverseBExpOpen, nullChar),
- newToken(tokenKindChar, '-'),
- newToken(tokenKindBExpClose, nullChar),
- newToken(tokenKindInverseBExpOpen, nullChar),
- newToken(tokenKindChar, '-'),
- newToken(tokenKindChar, '-'),
- newToken(tokenKindBExpClose, nullChar),
- newToken(tokenKindInverseBExpOpen, nullChar),
- newToken(tokenKindChar, '-'),
- newToken(tokenKindCharRange, nullChar),
- newToken(tokenKindChar, '-'),
- newToken(tokenKindBExpClose, nullChar),
-
- newToken(tokenKindEOF, nullChar),
- },
- },
- {
- caption: "caret symbols that appear in bracket expressions are handled as the logical inverse symbol or ordinary characters",
- // [^...^...][^]
- // ~~ ~ ~~
- // ^ ^ ^^
- // | | |`-- Ordinary Character (c)
- // | | `-- Bracket Expression
- // | `-- Ordinary Character (b)
- // `-- Inverse Bracket Expression (a)
- //
- // a. Bracket expressions that have a caret symbol at the beginning are handled as logical inverse expressions.
- // b. caret symbols that appear as the second and the subsequent symbols are handled as ordinary symbols.
- // c. When a bracket expression has just one symbol, a caret symbol at the beginning is handled as an ordinary character.
- src: "[^^][^]",
- tokens: []*token{
- newToken(tokenKindInverseBExpOpen, nullChar),
- newToken(tokenKindChar, '^'),
- newToken(tokenKindBExpClose, nullChar),
- newToken(tokenKindBExpOpen, nullChar),
- newToken(tokenKindChar, '^'),
- newToken(tokenKindBExpClose, nullChar),
- newToken(tokenKindEOF, nullChar),
- },
- },
- {
- caption: "lexer raises an error when an invalid escape sequence appears",
- src: "\\@",
- err: synErrInvalidEscSeq,
- },
- {
- caption: "lexer raises an error when the incomplete escape sequence (EOF following \\) appears",
- src: "\\",
- err: synErrIncompletedEscSeq,
- },
- {
- caption: "lexer raises an error when an invalid escape sequence appears",
- src: "[\\@",
- tokens: []*token{
- newToken(tokenKindBExpOpen, nullChar),
- },
- err: synErrInvalidEscSeq,
- },
- {
- caption: "lexer raises an error when the incomplete escape sequence (EOF following \\) appears",
- src: "[\\",
- tokens: []*token{
- newToken(tokenKindBExpOpen, nullChar),
- },
- err: synErrIncompletedEscSeq,
- },
- {
- caption: "lexer can recognize the special characters and code points in code point expression mode",
- src: "\\u{0123}\\u{4567}\\u{89abcd}\\u{efAB}\\u{CDEF01}[\\u{0123}\\u{4567}\\u{89abcd}\\u{efAB}\\u{CDEF01}][^\\u{0123}\\u{4567}\\u{89abcd}\\u{efAB}\\u{CDEF01}]",
- tokens: []*token{
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCodePointToken("0123"),
- newToken(tokenKindRBrace, nullChar),
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCodePointToken("4567"),
- newToken(tokenKindRBrace, nullChar),
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCodePointToken("89abcd"),
- newToken(tokenKindRBrace, nullChar),
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCodePointToken("efAB"),
- newToken(tokenKindRBrace, nullChar),
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCodePointToken("CDEF01"),
- newToken(tokenKindRBrace, nullChar),
-
- newToken(tokenKindBExpOpen, nullChar),
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCodePointToken("0123"),
- newToken(tokenKindRBrace, nullChar),
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCodePointToken("4567"),
- newToken(tokenKindRBrace, nullChar),
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCodePointToken("89abcd"),
- newToken(tokenKindRBrace, nullChar),
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCodePointToken("efAB"),
- newToken(tokenKindRBrace, nullChar),
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCodePointToken("CDEF01"),
- newToken(tokenKindRBrace, nullChar),
- newToken(tokenKindBExpClose, nullChar),
-
- newToken(tokenKindInverseBExpOpen, nullChar),
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCodePointToken("0123"),
- newToken(tokenKindRBrace, nullChar),
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCodePointToken("4567"),
- newToken(tokenKindRBrace, nullChar),
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCodePointToken("89abcd"),
- newToken(tokenKindRBrace, nullChar),
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCodePointToken("efAB"),
- newToken(tokenKindRBrace, nullChar),
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCodePointToken("CDEF01"),
- newToken(tokenKindRBrace, nullChar),
- newToken(tokenKindBExpClose, nullChar),
-
- newToken(tokenKindEOF, nullChar),
- },
- },
- {
- caption: "a one digit hex string isn't a valid code point",
- src: "\\u{0",
- tokens: []*token{
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- },
- err: synErrInvalidCodePoint,
- },
- {
- caption: "a two digits hex string isn't a valid code point",
- src: "\\u{01",
- tokens: []*token{
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- },
- err: synErrInvalidCodePoint,
- },
- {
- caption: "a three digits hex string isn't a valid code point",
- src: "\\u{012",
- tokens: []*token{
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- },
- err: synErrInvalidCodePoint,
- },
- {
- caption: "a four digits hex string is a valid code point",
- src: "\\u{0123}",
- tokens: []*token{
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCodePointToken("0123"),
- newToken(tokenKindRBrace, nullChar),
- },
- },
- {
- caption: "a five digits hex string isn't a valid code point",
- src: "\\u{01234",
- tokens: []*token{
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- },
- err: synErrInvalidCodePoint,
- },
- {
- caption: "a six digits hex string is a valid code point",
- src: "\\u{012345}",
- tokens: []*token{
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCodePointToken("012345"),
- newToken(tokenKindRBrace, nullChar),
- },
- },
- {
- caption: "a seven digits hex string isn't a valid code point",
- src: "\\u{0123456",
- tokens: []*token{
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- },
- err: synErrInvalidCodePoint,
- },
- {
- caption: "a code point must be hex digits",
- src: "\\u{g",
- tokens: []*token{
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- },
- err: synErrInvalidCodePoint,
- },
- {
- caption: "a code point must be hex digits",
- src: "\\u{G",
- tokens: []*token{
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- },
- err: synErrInvalidCodePoint,
- },
- {
- caption: "lexer can recognize the special characters and symbols in character property expression mode",
- src: "\\p{Letter}\\p{General_Category=Letter}[\\p{Letter}\\p{General_Category=Letter}][^\\p{Letter}\\p{General_Category=Letter}]",
- tokens: []*token{
- newToken(tokenKindCharPropLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCharPropSymbolToken("Letter"),
- newToken(tokenKindRBrace, nullChar),
- newToken(tokenKindCharPropLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCharPropSymbolToken("General_Category"),
- newToken(tokenKindEqual, nullChar),
- newCharPropSymbolToken("Letter"),
- newToken(tokenKindRBrace, nullChar),
-
- newToken(tokenKindBExpOpen, nullChar),
- newToken(tokenKindCharPropLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCharPropSymbolToken("Letter"),
- newToken(tokenKindRBrace, nullChar),
- newToken(tokenKindCharPropLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCharPropSymbolToken("General_Category"),
- newToken(tokenKindEqual, nullChar),
- newCharPropSymbolToken("Letter"),
- newToken(tokenKindRBrace, nullChar),
- newToken(tokenKindBExpClose, nullChar),
-
- newToken(tokenKindInverseBExpOpen, nullChar),
- newToken(tokenKindCharPropLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCharPropSymbolToken("Letter"),
- newToken(tokenKindRBrace, nullChar),
- newToken(tokenKindCharPropLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCharPropSymbolToken("General_Category"),
- newToken(tokenKindEqual, nullChar),
- newCharPropSymbolToken("Letter"),
- newToken(tokenKindRBrace, nullChar),
- newToken(tokenKindBExpClose, nullChar),
-
- newToken(tokenKindEOF, nullChar),
- },
- },
- {
- caption: "lexer can recognize the special characters and symbols in fragment expression mode",
- src: "\\f{integer}",
- tokens: []*token{
- newToken(tokenKindFragmentLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newFragmentSymbolToken("integer"),
- newToken(tokenKindRBrace, nullChar),
-
- newToken(tokenKindEOF, nullChar),
- },
- },
- {
- caption: "a fragment expression is not supported in a bracket expression",
- src: "[\\f",
- tokens: []*token{
- newToken(tokenKindBExpOpen, nullChar),
- },
- err: synErrInvalidEscSeq,
- },
- {
- caption: "a fragment expression is not supported in an inverse bracket expression",
- src: "[^\\f",
- tokens: []*token{
- newToken(tokenKindInverseBExpOpen, nullChar),
- },
- err: synErrInvalidEscSeq,
- },
- }
- for _, tt := range tests {
- t.Run(tt.caption, func(t *testing.T) {
- lex := newLexer(strings.NewReader(tt.src))
- var err error
- var tok *token
- i := 0
- for {
- tok, err = lex.next()
- if err != nil {
- break
- }
- if i >= len(tt.tokens) {
- break
- }
- eTok := tt.tokens[i]
- i++
- testToken(t, tok, eTok)
-
- if tok.kind == tokenKindEOF {
- break
- }
- }
- if tt.err != nil {
- if err != ParseErr {
- t.Fatalf("unexpected error: want: %v, got: %v", ParseErr, err)
- }
- detail, cause := lex.error()
- if cause != tt.err {
- t.Fatalf("unexpected error: want: %v, got: %v (%v)", tt.err, cause, detail)
- }
- } else {
- if err != nil {
- t.Fatalf("unexpected error: %v", err)
- }
- }
- if i < len(tt.tokens) {
- t.Fatalf("expecte more tokens")
- }
- })
- }
-}
-
-func testToken(t *testing.T, a, e *token) {
- t.Helper()
- if e.kind != a.kind || e.char != a.char || e.codePoint != a.codePoint {
- t.Fatalf("unexpected token: want: %+v, got: %+v", e, a)
- }
-}
diff --git a/compiler/parser/parser.go b/compiler/parser/parser.go
deleted file mode 100644
index b7f8c04..0000000
--- a/compiler/parser/parser.go
+++ /dev/null
@@ -1,531 +0,0 @@
-package parser
-
-import (
- "bytes"
- "fmt"
- "io"
- "strconv"
-
- "github.com/nihei9/maleeni/spec"
- "github.com/nihei9/maleeni/ucd"
-)
-
-type PatternEntry struct {
- ID spec.LexModeKindID
- Pattern []byte
-}
-
-type parser struct {
- kind spec.LexKindName
- lex *lexer
- peekedTok *token
- lastTok *token
-
- // If and only if isContributoryPropertyExposed is true, the parser interprets contributory properties that
- // appear in property expressions.
- //
- // The contributory properties are not exposed, and users cannot use those properties because the parser
- // follows [UAX #44 5.13 Property APIs]. For instance, \p{Other_Alphabetic} is invalid.
- //
- // isContributoryPropertyExposed is set to true when the parser is generated recursively. The parser needs to
- // interpret derived properties internally because the derived properties consist of other properties that
- // may contain the contributory properties.
- //
- // [UAX #44 5.13 Property APIs] says:
- // > The following subtypes of Unicode character properties should generally not be exposed in APIs,
- // > except in limited circumstances. They may not be useful, particularly in public API collections,
- // > and may instead prove misleading to the users of such API collections.
- // > * Contributory properties are not recommended for public APIs.
- // > ...
- // https://unicode.org/reports/tr44/#Property_APIs
- isContributoryPropertyExposed bool
-
- errCause error
- errDetail string
-}
-
-func NewParser(kind spec.LexKindName, src io.Reader) *parser {
- return &parser{
- kind: kind,
- lex: newLexer(src),
- isContributoryPropertyExposed: false,
- }
-}
-
-func (p *parser) exposeContributoryProperty() {
- p.isContributoryPropertyExposed = true
-}
-
-func (p *parser) Error() (string, error) {
- return p.errDetail, p.errCause
-}
-
-func (p *parser) Parse() (root CPTree, retErr error) {
- defer func() {
- err := recover()
- if err != nil {
- var ok bool
- retErr, ok = err.(error)
- if !ok {
- panic(err)
- }
- return
- }
- }()
-
- return newRootNode(p.kind, p.parseRegexp()), nil
-}
-
-func (p *parser) parseRegexp() CPTree {
- alt := p.parseAlt()
- if alt == nil {
- if p.consume(tokenKindGroupClose) {
- p.raiseParseError(synErrGroupNoInitiator, "")
- }
- p.raiseParseError(synErrNullPattern, "")
- }
- if p.consume(tokenKindGroupClose) {
- p.raiseParseError(synErrGroupNoInitiator, "")
- }
- p.expect(tokenKindEOF)
- return alt
-}
-
-func (p *parser) parseAlt() CPTree {
- left := p.parseConcat()
- if left == nil {
- if p.consume(tokenKindAlt) {
- p.raiseParseError(synErrAltLackOfOperand, "")
- }
- return nil
- }
- for {
- if !p.consume(tokenKindAlt) {
- break
- }
- right := p.parseConcat()
- if right == nil {
- p.raiseParseError(synErrAltLackOfOperand, "")
- }
- left = newAltNode(left, right)
- }
- return left
-}
-
-func (p *parser) parseConcat() CPTree {
- left := p.parseRepeat()
- for {
- right := p.parseRepeat()
- if right == nil {
- break
- }
- left = newConcatNode(left, right)
- }
- return left
-}
-
-func (p *parser) parseRepeat() CPTree {
- group := p.parseGroup()
- if group == nil {
- if p.consume(tokenKindRepeat) {
- p.raiseParseError(synErrRepNoTarget, "* needs an operand")
- }
- if p.consume(tokenKindRepeatOneOrMore) {
- p.raiseParseError(synErrRepNoTarget, "+ needs an operand")
- }
- if p.consume(tokenKindOption) {
- p.raiseParseError(synErrRepNoTarget, "? needs an operand")
- }
- return nil
- }
- if p.consume(tokenKindRepeat) {
- return newRepeatNode(group)
- }
- if p.consume(tokenKindRepeatOneOrMore) {
- return newRepeatOneOrMoreNode(group)
- }
- if p.consume(tokenKindOption) {
- return newOptionNode(group)
- }
- return group
-}
-
-func (p *parser) parseGroup() CPTree {
- if p.consume(tokenKindGroupOpen) {
- alt := p.parseAlt()
- if alt == nil {
- if p.consume(tokenKindEOF) {
- p.raiseParseError(synErrGroupUnclosed, "")
- }
- p.raiseParseError(synErrGroupNoElem, "")
- }
- if p.consume(tokenKindEOF) {
- p.raiseParseError(synErrGroupUnclosed, "")
- }
- if !p.consume(tokenKindGroupClose) {
- p.raiseParseError(synErrGroupInvalidForm, "")
- }
- return alt
- }
- return p.parseSingleChar()
-}
-
-func (p *parser) parseSingleChar() CPTree {
- if p.consume(tokenKindAnyChar) {
- return genAnyCharAST()
- }
- if p.consume(tokenKindBExpOpen) {
- left := p.parseBExpElem()
- if left == nil {
- if p.consume(tokenKindEOF) {
- p.raiseParseError(synErrBExpUnclosed, "")
- }
- p.raiseParseError(synErrBExpNoElem, "")
- }
- for {
- right := p.parseBExpElem()
- if right == nil {
- break
- }
- left = newAltNode(left, right)
- }
- if p.consume(tokenKindEOF) {
- p.raiseParseError(synErrBExpUnclosed, "")
- }
- p.expect(tokenKindBExpClose)
- return left
- }
- if p.consume(tokenKindInverseBExpOpen) {
- elem := p.parseBExpElem()
- if elem == nil {
- if p.consume(tokenKindEOF) {
- p.raiseParseError(synErrBExpUnclosed, "")
- }
- p.raiseParseError(synErrBExpNoElem, "")
- }
- inverse := exclude(elem, genAnyCharAST())
- if inverse == nil {
- p.raiseParseError(synErrUnmatchablePattern, "")
- }
- for {
- elem := p.parseBExpElem()
- if elem == nil {
- break
- }
- inverse = exclude(elem, inverse)
- if inverse == nil {
- p.raiseParseError(synErrUnmatchablePattern, "")
- }
- }
- if p.consume(tokenKindEOF) {
- p.raiseParseError(synErrBExpUnclosed, "")
- }
- p.expect(tokenKindBExpClose)
- return inverse
- }
- if p.consume(tokenKindCodePointLeader) {
- return p.parseCodePoint()
- }
- if p.consume(tokenKindCharPropLeader) {
- return p.parseCharProp()
- }
- if p.consume(tokenKindFragmentLeader) {
- return p.parseFragment()
- }
- c := p.parseNormalChar()
- if c == nil {
- if p.consume(tokenKindBExpClose) {
- p.raiseParseError(synErrBExpInvalidForm, "")
- }
- return nil
- }
- return c
-}
-
-func (p *parser) parseBExpElem() CPTree {
- var left CPTree
- switch {
- case p.consume(tokenKindCodePointLeader):
- left = p.parseCodePoint()
- case p.consume(tokenKindCharPropLeader):
- left = p.parseCharProp()
- if p.consume(tokenKindCharRange) {
- p.raiseParseError(synErrRangePropIsUnavailable, "")
- }
- default:
- left = p.parseNormalChar()
- }
- if left == nil {
- return nil
- }
- if !p.consume(tokenKindCharRange) {
- return left
- }
- var right CPTree
- switch {
- case p.consume(tokenKindCodePointLeader):
- right = p.parseCodePoint()
- case p.consume(tokenKindCharPropLeader):
- p.raiseParseError(synErrRangePropIsUnavailable, "")
- default:
- right = p.parseNormalChar()
- }
- if right == nil {
- p.raiseParseError(synErrRangeInvalidForm, "")
- }
- from, _, _ := left.Range()
- _, to, _ := right.Range()
- if !isValidOrder(from, to) {
- p.raiseParseError(synErrRangeInvalidOrder, fmt.Sprintf("%X..%X", from, to))
- }
- return newRangeSymbolNode(from, to)
-}
-
-func (p *parser) parseCodePoint() CPTree {
- if !p.consume(tokenKindLBrace) {
- p.raiseParseError(synErrCPExpInvalidForm, "")
- }
- if !p.consume(tokenKindCodePoint) {
- p.raiseParseError(synErrCPExpInvalidForm, "")
- }
-
- n, err := strconv.ParseInt(p.lastTok.codePoint, 16, 64)
- if err != nil {
- panic(fmt.Errorf("failed to decode a code point (%v) into a int: %v", p.lastTok.codePoint, err))
- }
- if n < 0x0000 || n > 0x10FFFF {
- p.raiseParseError(synErrCPExpOutOfRange, "")
- }
-
- sym := newSymbolNode(rune(n))
-
- if !p.consume(tokenKindRBrace) {
- p.raiseParseError(synErrCPExpInvalidForm, "")
- }
-
- return sym
-}
-
-func (p *parser) parseCharProp() CPTree {
- if !p.consume(tokenKindLBrace) {
- p.raiseParseError(synErrCharPropExpInvalidForm, "")
- }
- var sym1, sym2 string
- if !p.consume(tokenKindCharPropSymbol) {
- p.raiseParseError(synErrCharPropExpInvalidForm, "")
- }
- sym1 = p.lastTok.propSymbol
- if p.consume(tokenKindEqual) {
- if !p.consume(tokenKindCharPropSymbol) {
- p.raiseParseError(synErrCharPropExpInvalidForm, "")
- }
- sym2 = p.lastTok.propSymbol
- }
-
- var alt CPTree
- var propName, propVal string
- if sym2 != "" {
- propName = sym1
- propVal = sym2
- } else {
- propName = ""
- propVal = sym1
- }
- if !p.isContributoryPropertyExposed && ucd.IsContributoryProperty(propName) {
- p.raiseParseError(synErrCharPropUnsupported, propName)
- }
- pat, err := ucd.NormalizeCharacterProperty(propName, propVal)
- if err != nil {
- p.raiseParseError(synErrCharPropUnsupported, err.Error())
- }
- if pat != "" {
- p := NewParser(p.kind, bytes.NewReader([]byte(pat)))
- p.exposeContributoryProperty()
- ast, err := p.Parse()
- if err != nil {
- panic(err)
- }
- alt = ast
- } else {
- cpRanges, inverse, err := ucd.FindCodePointRanges(propName, propVal)
- if err != nil {
- p.raiseParseError(synErrCharPropUnsupported, err.Error())
- }
- if inverse {
- r := cpRanges[0]
- alt = exclude(newRangeSymbolNode(r.From, r.To), genAnyCharAST())
- if alt == nil {
- p.raiseParseError(synErrUnmatchablePattern, "")
- }
- for _, r := range cpRanges[1:] {
- alt = exclude(newRangeSymbolNode(r.From, r.To), alt)
- if alt == nil {
- p.raiseParseError(synErrUnmatchablePattern, "")
- }
- }
- } else {
- for _, r := range cpRanges {
- alt = genAltNode(
- alt,
- newRangeSymbolNode(r.From, r.To),
- )
- }
- }
- }
-
- if !p.consume(tokenKindRBrace) {
- p.raiseParseError(synErrCharPropExpInvalidForm, "")
- }
-
- return alt
-}
-
-func (p *parser) parseFragment() CPTree {
- if !p.consume(tokenKindLBrace) {
- p.raiseParseError(synErrFragmentExpInvalidForm, "")
- }
- if !p.consume(tokenKindFragmentSymbol) {
- p.raiseParseError(synErrFragmentExpInvalidForm, "")
- }
- sym := p.lastTok.fragmentSymbol
-
- if !p.consume(tokenKindRBrace) {
- p.raiseParseError(synErrFragmentExpInvalidForm, "")
- }
-
- return newFragmentNode(spec.LexKindName(sym), nil)
-}
-
-func (p *parser) parseNormalChar() CPTree {
- if !p.consume(tokenKindChar) {
- return nil
- }
- return newSymbolNode(p.lastTok.char)
-}
-
-func exclude(symbol, base CPTree) CPTree {
- if left, right, ok := symbol.Alternatives(); ok {
- return exclude(right, exclude(left, base))
- }
-
- if left, right, ok := base.Alternatives(); ok {
- return genAltNode(
- exclude(symbol, left),
- exclude(symbol, right),
- )
- }
-
- if bFrom, bTo, ok := base.Range(); ok {
- sFrom, sTo, ok := symbol.Range()
- if !ok {
- panic(fmt.Errorf("invalid symbol tree: %T", symbol))
- }
-
- switch {
- case sFrom > bFrom && sTo < bTo:
- return genAltNode(
- newRangeSymbolNode(bFrom, sFrom-1),
- newRangeSymbolNode(sTo+1, bTo),
- )
- case sFrom <= bFrom && sTo >= bFrom && sTo < bTo:
- return newRangeSymbolNode(sTo+1, bTo)
- case sFrom > bFrom && sFrom <= bTo && sTo >= bTo:
- return newRangeSymbolNode(bFrom, sFrom-1)
- case sFrom <= bFrom && sTo >= bTo:
- return nil
- default:
- return base
- }
- }
-
- panic(fmt.Errorf("invalid base tree: %T", base))
-}
-
-func genAnyCharAST() CPTree {
- return newRangeSymbolNode(0x0, 0x10FFFF)
-}
-
-func isValidOrder(from, to rune) bool {
- return from <= to
-}
-
-func genConcatNode(cs ...CPTree) CPTree {
- nonNilNodes := []CPTree{}
- for _, c := range cs {
- if c == nil {
- continue
- }
- nonNilNodes = append(nonNilNodes, c)
- }
- if len(nonNilNodes) <= 0 {
- return nil
- }
- if len(nonNilNodes) == 1 {
- return nonNilNodes[0]
- }
- concat := newConcatNode(nonNilNodes[0], nonNilNodes[1])
- for _, c := range nonNilNodes[2:] {
- concat = newConcatNode(concat, c)
- }
- return concat
-}
-
-func genAltNode(cs ...CPTree) CPTree {
- nonNilNodes := []CPTree{}
- for _, c := range cs {
- if c == nil {
- continue
- }
- nonNilNodes = append(nonNilNodes, c)
- }
- if len(nonNilNodes) <= 0 {
- return nil
- }
- if len(nonNilNodes) == 1 {
- return nonNilNodes[0]
- }
- alt := newAltNode(nonNilNodes[0], nonNilNodes[1])
- for _, c := range nonNilNodes[2:] {
- alt = newAltNode(alt, c)
- }
- return alt
-}
-
-func (p *parser) expect(expected tokenKind) {
- if !p.consume(expected) {
- tok := p.peekedTok
- p.raiseParseError(synErrUnexpectedToken, fmt.Sprintf("expected: %v, actual: %v", expected, tok.kind))
- }
-}
-
-func (p *parser) consume(expected tokenKind) bool {
- var tok *token
- var err error
- if p.peekedTok != nil {
- tok = p.peekedTok
- p.peekedTok = nil
- } else {
- tok, err = p.lex.next()
- if err != nil {
- if err == ParseErr {
- detail, cause := p.lex.error()
- p.raiseParseError(cause, detail)
- }
- panic(err)
- }
- }
- p.lastTok = tok
- if tok.kind == expected {
- return true
- }
- p.peekedTok = tok
- p.lastTok = nil
-
- return false
-}
-
-func (p *parser) raiseParseError(err error, detail string) {
- p.errCause = err
- p.errDetail = detail
- panic(ParseErr)
-}
diff --git a/compiler/parser/parser_test.go b/compiler/parser/parser_test.go
deleted file mode 100644
index 57c130e..0000000
--- a/compiler/parser/parser_test.go
+++ /dev/null
@@ -1,1389 +0,0 @@
-package parser
-
-import (
- "fmt"
- "reflect"
- "strings"
- "testing"
-
- "github.com/nihei9/maleeni/spec"
- "github.com/nihei9/maleeni/ucd"
-)
-
-func TestParse(t *testing.T) {
- tests := []struct {
- pattern string
- fragments map[spec.LexKindName]string
- ast CPTree
- syntaxError error
-
- // When an AST is large, as patterns containing a character property expression, this test only checks
- // that the pattern is parsable. The check of the validity of such AST is performed by checking that it
- // can be matched correctly using the driver.
- skipTestAST bool
- }{
- {
- pattern: "a",
- ast: newSymbolNode('a'),
- },
- {
- pattern: "abc",
- ast: genConcatNode(
- newSymbolNode('a'),
- newSymbolNode('b'),
- newSymbolNode('c'),
- ),
- },
- {
- pattern: "a?",
- ast: newOptionNode(
- newSymbolNode('a'),
- ),
- },
- {
- pattern: "[abc]?",
- ast: newOptionNode(
- genAltNode(
- newSymbolNode('a'),
- newSymbolNode('b'),
- newSymbolNode('c'),
- ),
- ),
- },
- {
- pattern: "\\u{3042}?",
- ast: newOptionNode(
- newSymbolNode('\u3042'),
- ),
- },
- {
- pattern: "\\p{Letter}?",
- skipTestAST: true,
- },
- {
- pattern: "\\f{a2c}?",
- fragments: map[spec.LexKindName]string{
- "a2c": "abc",
- },
- ast: newOptionNode(
- newFragmentNode("a2c",
- genConcatNode(
- newSymbolNode('a'),
- newSymbolNode('b'),
- newSymbolNode('c'),
- ),
- ),
- ),
- },
- {
- pattern: "(a)?",
- ast: newOptionNode(
- newSymbolNode('a'),
- ),
- },
- {
- pattern: "((a?)?)?",
- ast: newOptionNode(
- newOptionNode(
- newOptionNode(
- newSymbolNode('a'),
- ),
- ),
- ),
- },
- {
- pattern: "(abc)?",
- ast: newOptionNode(
- genConcatNode(
- newSymbolNode('a'),
- newSymbolNode('b'),
- newSymbolNode('c'),
- ),
- ),
- },
- {
- pattern: "(a|b)?",
- ast: newOptionNode(
- genAltNode(
- newSymbolNode('a'),
- newSymbolNode('b'),
- ),
- ),
- },
- {
- pattern: "?",
- syntaxError: synErrRepNoTarget,
- },
- {
- pattern: "(?)",
- syntaxError: synErrRepNoTarget,
- },
- {
- pattern: "a|?",
- syntaxError: synErrRepNoTarget,
- },
- {
- pattern: "?|b",
- syntaxError: synErrRepNoTarget,
- },
- {
- pattern: "a??",
- syntaxError: synErrRepNoTarget,
- },
- {
- pattern: "a*",
- ast: newRepeatNode(
- newSymbolNode('a'),
- ),
- },
- {
- pattern: "[abc]*",
- ast: newRepeatNode(
- genAltNode(
- newSymbolNode('a'),
- newSymbolNode('b'),
- newSymbolNode('c'),
- ),
- ),
- },
- {
- pattern: "\\u{3042}*",
- ast: newRepeatNode(
- newSymbolNode('\u3042'),
- ),
- },
- {
- pattern: "\\p{Letter}*",
- skipTestAST: true,
- },
- {
- pattern: "\\f{a2c}*",
- fragments: map[spec.LexKindName]string{
- "a2c": "abc",
- },
- ast: newRepeatNode(
- newFragmentNode("a2c",
- genConcatNode(
- newSymbolNode('a'),
- newSymbolNode('b'),
- newSymbolNode('c'),
- ),
- ),
- ),
- },
- {
- pattern: "((a*)*)*",
- ast: newRepeatNode(
- newRepeatNode(
- newRepeatNode(
- newSymbolNode('a'),
- ),
- ),
- ),
- },
- {
- pattern: "(abc)*",
- ast: newRepeatNode(
- genConcatNode(
- newSymbolNode('a'),
- newSymbolNode('b'),
- newSymbolNode('c'),
- ),
- ),
- },
- {
- pattern: "(a|b)*",
- ast: newRepeatNode(
- genAltNode(
- newSymbolNode('a'),
- newSymbolNode('b'),
- ),
- ),
- },
- {
- pattern: "*",
- syntaxError: synErrRepNoTarget,
- },
- {
- pattern: "(*)",
- syntaxError: synErrRepNoTarget,
- },
- {
- pattern: "a|*",
- syntaxError: synErrRepNoTarget,
- },
- {
- pattern: "*|b",
- syntaxError: synErrRepNoTarget,
- },
- {
- pattern: "a**",
- syntaxError: synErrRepNoTarget,
- },
- {
- pattern: "a+",
- ast: genConcatNode(
- newSymbolNode('a'),
- newRepeatNode(
- newSymbolNode('a'),
- ),
- ),
- },
- {
- pattern: "[abc]+",
- ast: genConcatNode(
- genAltNode(
- newSymbolNode('a'),
- newSymbolNode('b'),
- newSymbolNode('c'),
- ),
- newRepeatNode(
- genAltNode(
- newSymbolNode('a'),
- newSymbolNode('b'),
- newSymbolNode('c'),
- ),
- ),
- ),
- },
- {
- pattern: "\\u{3042}+",
- ast: genConcatNode(
- newSymbolNode('\u3042'),
- newRepeatNode(
- newSymbolNode('\u3042'),
- ),
- ),
- },
- {
- pattern: "\\p{Letter}+",
- skipTestAST: true,
- },
- {
- pattern: "\\f{a2c}+",
- fragments: map[spec.LexKindName]string{
- "a2c": "abc",
- },
- ast: genConcatNode(
- newFragmentNode("a2c",
- genConcatNode(
- newSymbolNode('a'),
- newSymbolNode('b'),
- newSymbolNode('c'),
- ),
- ),
- newRepeatNode(
- newFragmentNode("a2c",
- genConcatNode(
- newSymbolNode('a'),
- newSymbolNode('b'),
- newSymbolNode('c'),
- ),
- ),
- ),
- ),
- },
- {
- pattern: "((a+)+)+",
- ast: genConcatNode(
- genConcatNode(
- genConcatNode(
- genConcatNode(
- newSymbolNode('a'),
- newRepeatNode(
- newSymbolNode('a'),
- ),
- ),
- newRepeatNode(
- genConcatNode(
- newSymbolNode('a'),
- newRepeatNode(
- newSymbolNode('a'),
- ),
- ),
- ),
- ),
- newRepeatNode(
- genConcatNode(
- genConcatNode(
- newSymbolNode('a'),
- newRepeatNode(
- newSymbolNode('a'),
- ),
- ),
- newRepeatNode(
- genConcatNode(
- newSymbolNode('a'),
- newRepeatNode(
- newSymbolNode('a'),
- ),
- ),
- ),
- ),
- ),
- ),
- ),
- },
- {
- pattern: "(abc)+",
- ast: genConcatNode(
- genConcatNode(
- newSymbolNode('a'),
- newSymbolNode('b'),
- newSymbolNode('c'),
- ),
- newRepeatNode(
- genConcatNode(
- newSymbolNode('a'),
- newSymbolNode('b'),
- newSymbolNode('c'),
- ),
- ),
- ),
- },
- {
- pattern: "(a|b)+",
- ast: genConcatNode(
- genAltNode(
- newSymbolNode('a'),
- newSymbolNode('b'),
- ),
- newRepeatNode(
- genAltNode(
- newSymbolNode('a'),
- newSymbolNode('b'),
- ),
- ),
- ),
- },
- {
- pattern: "+",
- syntaxError: synErrRepNoTarget,
- },
- {
- pattern: "(+)",
- syntaxError: synErrRepNoTarget,
- },
- {
- pattern: "a|+",
- syntaxError: synErrRepNoTarget,
- },
- {
- pattern: "+|b",
- syntaxError: synErrRepNoTarget,
- },
- {
- pattern: "a++",
- syntaxError: synErrRepNoTarget,
- },
- {
- pattern: ".",
- ast: newRangeSymbolNode(0x00, 0x10FFFF),
- },
- {
- pattern: "[a]",
- ast: newSymbolNode('a'),
- },
- {
- pattern: "[abc]",
- ast: genAltNode(
- newSymbolNode('a'),
- newSymbolNode('b'),
- newSymbolNode('c'),
- ),
- },
- {
- pattern: "[a-z]",
- ast: newRangeSymbolNode('a', 'z'),
- },
- {
- pattern: "[A-Za-z]",
- ast: genAltNode(
- newRangeSymbolNode('A', 'Z'),
- newRangeSymbolNode('a', 'z'),
- ),
- },
- {
- pattern: "[\\u{004E}]",
- ast: newSymbolNode('N'),
- },
- {
- pattern: "[\\u{0061}-\\u{007A}]",
- ast: newRangeSymbolNode('a', 'z'),
- },
- {
- pattern: "[\\p{Lu}]",
- skipTestAST: true,
- },
- {
- pattern: "[a-\\p{Lu}]",
- syntaxError: synErrRangePropIsUnavailable,
- },
- {
- pattern: "[\\p{Lu}-z]",
- syntaxError: synErrRangePropIsUnavailable,
- },
- {
- pattern: "[\\p{Lu}-\\p{Ll}]",
- syntaxError: synErrRangePropIsUnavailable,
- },
- {
- pattern: "[z-a]",
- syntaxError: synErrRangeInvalidOrder,
- },
- {
- pattern: "a[]",
- syntaxError: synErrBExpNoElem,
- },
- {
- pattern: "[]a",
- syntaxError: synErrBExpNoElem,
- },
- {
- pattern: "[]",
- syntaxError: synErrBExpNoElem,
- },
- {
- pattern: "[^\\u{004E}]",
- ast: genAltNode(
- newRangeSymbolNode(0x00, '\u004E'-1),
- newRangeSymbolNode('\u004E'+1, 0x10FFFF),
- ),
- },
- {
- pattern: "[^\\u{0061}-\\u{007A}]",
- ast: genAltNode(
- newRangeSymbolNode(0x00, '\u0061'-1),
- newRangeSymbolNode('\u007A'+1, 0x10FFFF),
- ),
- },
- {
- pattern: "[^\\p{Lu}]",
- skipTestAST: true,
- },
- {
- pattern: "[^a-\\p{Lu}]",
- syntaxError: synErrRangePropIsUnavailable,
- },
- {
- pattern: "[^\\p{Lu}-z]",
- syntaxError: synErrRangePropIsUnavailable,
- },
- {
- pattern: "[^\\p{Lu}-\\p{Ll}]",
- syntaxError: synErrRangePropIsUnavailable,
- },
- {
- pattern: "[^\\u{0000}-\\u{10FFFF}]",
- syntaxError: synErrUnmatchablePattern,
- },
- {
- pattern: "[^\\u{0000}-\\u{FFFF}\\u{010000}-\\u{10FFFF}]",
- syntaxError: synErrUnmatchablePattern,
- },
- {
- pattern: "[^]",
- ast: newSymbolNode('^'),
- },
- {
- pattern: "[",
- syntaxError: synErrBExpUnclosed,
- },
- {
- pattern: "([",
- syntaxError: synErrBExpUnclosed,
- },
- {
- pattern: "[a",
- syntaxError: synErrBExpUnclosed,
- },
- {
- pattern: "([a",
- syntaxError: synErrBExpUnclosed,
- },
- {
- pattern: "[a-",
- syntaxError: synErrBExpUnclosed,
- },
- {
- pattern: "([a-",
- syntaxError: synErrBExpUnclosed,
- },
- {
- pattern: "[^",
- syntaxError: synErrBExpUnclosed,
- },
- {
- pattern: "([^",
- syntaxError: synErrBExpUnclosed,
- },
- {
- pattern: "[^a",
- syntaxError: synErrBExpUnclosed,
- },
- {
- pattern: "([^a",
- syntaxError: synErrBExpUnclosed,
- },
- {
- pattern: "[^a-",
- syntaxError: synErrBExpUnclosed,
- },
- {
- pattern: "([^a-",
- syntaxError: synErrBExpUnclosed,
- },
- {
- pattern: "]",
- ast: newSymbolNode(']'),
- },
- {
- pattern: "(]",
- syntaxError: synErrGroupUnclosed,
- },
- {
- pattern: "a]",
- ast: genConcatNode(
- newSymbolNode('a'),
- newSymbolNode(']'),
- ),
- },
- {
- pattern: "(a]",
- syntaxError: synErrGroupUnclosed,
- },
- {
- pattern: "([)",
- syntaxError: synErrBExpUnclosed,
- },
- {
- pattern: "([a)",
- syntaxError: synErrBExpUnclosed,
- },
- {
- pattern: "[a-]",
- ast: genAltNode(
- newSymbolNode('a'),
- newSymbolNode('-'),
- ),
- },
- {
- pattern: "[^a-]",
- ast: genAltNode(
- newRangeSymbolNode(0x00, 0x2C),
- newRangeSymbolNode(0x2E, 0x60),
- newRangeSymbolNode(0x62, 0x10FFFF),
- ),
- },
- {
- pattern: "[-z]",
- ast: genAltNode(
- newSymbolNode('-'),
- newSymbolNode('z'),
- ),
- },
- {
- pattern: "[^-z]",
- ast: newAltNode(
- newRangeSymbolNode(0x00, 0x2C),
- newAltNode(
- newRangeSymbolNode(0x2E, 0x79),
- newRangeSymbolNode(0x7B, 0x10FFFF),
- ),
- ),
- },
- {
- pattern: "[-]",
- ast: newSymbolNode('-'),
- },
- {
- pattern: "[^-]",
- ast: genAltNode(
- newRangeSymbolNode(0x00, 0x2C),
- newRangeSymbolNode(0x2E, 0x10FFFF),
- ),
- },
- {
- pattern: "[^01]",
- ast: genAltNode(
- newRangeSymbolNode(0x00, '0'-1),
- newRangeSymbolNode('1'+1, 0x10FFFF),
- ),
- },
- {
- pattern: "[^10]",
- ast: genAltNode(
- newRangeSymbolNode(0x00, '0'-1),
- newRangeSymbolNode('1'+1, 0x10FFFF),
- ),
- },
- {
- pattern: "[^a-z]",
- ast: genAltNode(
- newRangeSymbolNode(0x00, 'a'-1),
- newRangeSymbolNode('z'+1, 0x10FFFF),
- ),
- },
- {
- pattern: "[^az]",
- ast: genAltNode(
- newRangeSymbolNode(0x00, 'a'-1),
- genAltNode(
- newRangeSymbolNode('a'+1, 'z'-1),
- newRangeSymbolNode('z'+1, 0x10FFFF),
- ),
- ),
- },
- {
- pattern: "\\u{006E}",
- ast: newSymbolNode('\u006E'),
- },
- {
- pattern: "\\u{03BD}",
- ast: newSymbolNode('\u03BD'),
- },
- {
- pattern: "\\u{306B}",
- ast: newSymbolNode('\u306B'),
- },
- {
- pattern: "\\u{01F638}",
- ast: newSymbolNode('\U0001F638'),
- },
- {
- pattern: "\\u{0000}",
- ast: newSymbolNode('\u0000'),
- },
- {
- pattern: "\\u{10FFFF}",
- ast: newSymbolNode('\U0010FFFF'),
- },
- {
- pattern: "\\u{110000}",
- syntaxError: synErrCPExpOutOfRange,
- },
- {
- pattern: "\\u",
- syntaxError: synErrCPExpInvalidForm,
- },
- {
- pattern: "\\u{",
- syntaxError: synErrCPExpInvalidForm,
- },
- {
- pattern: "\\u{03BD",
- syntaxError: synErrCPExpInvalidForm,
- },
- {
- pattern: "\\u{}",
- syntaxError: synErrCPExpInvalidForm,
- },
- {
- pattern: "\\p{Letter}",
- skipTestAST: true,
- },
- {
- pattern: "\\p{General_Category=Letter}",
- skipTestAST: true,
- },
- {
- pattern: "\\p{ Letter }",
- skipTestAST: true,
- },
- {
- pattern: "\\p{ General_Category = Letter }",
- skipTestAST: true,
- },
- {
- pattern: "\\p",
- syntaxError: synErrCharPropExpInvalidForm,
- },
- {
- pattern: "\\p{",
- syntaxError: synErrCharPropExpInvalidForm,
- },
- {
- pattern: "\\p{Letter",
- syntaxError: synErrCharPropExpInvalidForm,
- },
- {
- pattern: "\\p{General_Category=}",
- syntaxError: synErrCharPropExpInvalidForm,
- },
- {
- pattern: "\\p{General_Category= }",
- syntaxError: synErrCharPropInvalidSymbol,
- },
- {
- pattern: "\\p{=Letter}",
- syntaxError: synErrCharPropExpInvalidForm,
- },
- {
- pattern: "\\p{ =Letter}",
- syntaxError: synErrCharPropInvalidSymbol,
- },
- {
- pattern: "\\p{=}",
- syntaxError: synErrCharPropExpInvalidForm,
- },
- {
- pattern: "\\p{}",
- syntaxError: synErrCharPropExpInvalidForm,
- },
- {
- pattern: "\\f{a2c}",
- fragments: map[spec.LexKindName]string{
- "a2c": "abc",
- },
- ast: newFragmentNode("a2c",
- genConcatNode(
- newSymbolNode('a'),
- newSymbolNode('b'),
- newSymbolNode('c'),
- ),
- ),
- },
- {
- pattern: "\\f{ a2c }",
- fragments: map[spec.LexKindName]string{
- "a2c": "abc",
- },
- ast: newFragmentNode("a2c",
- genConcatNode(
- newSymbolNode('a'),
- newSymbolNode('b'),
- newSymbolNode('c'),
- ),
- ),
- },
- {
- pattern: "\\f",
- syntaxError: synErrFragmentExpInvalidForm,
- },
- {
- pattern: "\\f{",
- syntaxError: synErrFragmentExpInvalidForm,
- },
- {
- pattern: "\\f{a2c",
- fragments: map[spec.LexKindName]string{
- "a2c": "abc",
- },
- syntaxError: synErrFragmentExpInvalidForm,
- },
- {
- pattern: "(a)",
- ast: newSymbolNode('a'),
- },
- {
- pattern: "(((a)))",
- ast: newSymbolNode('a'),
- },
- {
- pattern: "a()",
- syntaxError: synErrGroupNoElem,
- },
- {
- pattern: "()a",
- syntaxError: synErrGroupNoElem,
- },
- {
- pattern: "()",
- syntaxError: synErrGroupNoElem,
- },
- {
- pattern: "(",
- syntaxError: synErrGroupUnclosed,
- },
- {
- pattern: "a(",
- syntaxError: synErrGroupUnclosed,
- },
- {
- pattern: "(a",
- syntaxError: synErrGroupUnclosed,
- },
- {
- pattern: "((",
- syntaxError: synErrGroupUnclosed,
- },
- {
- pattern: "((a)",
- syntaxError: synErrGroupUnclosed,
- },
- {
- pattern: ")",
- syntaxError: synErrGroupNoInitiator,
- },
- {
- pattern: "a)",
- syntaxError: synErrGroupNoInitiator,
- },
- {
- pattern: ")a",
- syntaxError: synErrGroupNoInitiator,
- },
- {
- pattern: "))",
- syntaxError: synErrGroupNoInitiator,
- },
- {
- pattern: "(a))",
- syntaxError: synErrGroupNoInitiator,
- },
- {
- pattern: "Mulder|Scully",
- ast: genAltNode(
- genConcatNode(
- newSymbolNode('M'),
- newSymbolNode('u'),
- newSymbolNode('l'),
- newSymbolNode('d'),
- newSymbolNode('e'),
- newSymbolNode('r'),
- ),
- genConcatNode(
- newSymbolNode('S'),
- newSymbolNode('c'),
- newSymbolNode('u'),
- newSymbolNode('l'),
- newSymbolNode('l'),
- newSymbolNode('y'),
- ),
- ),
- },
- {
- pattern: "Langly|Frohike|Byers",
- ast: genAltNode(
- genConcatNode(
- newSymbolNode('L'),
- newSymbolNode('a'),
- newSymbolNode('n'),
- newSymbolNode('g'),
- newSymbolNode('l'),
- newSymbolNode('y'),
- ),
- genConcatNode(
- newSymbolNode('F'),
- newSymbolNode('r'),
- newSymbolNode('o'),
- newSymbolNode('h'),
- newSymbolNode('i'),
- newSymbolNode('k'),
- newSymbolNode('e'),
- ),
- genConcatNode(
- newSymbolNode('B'),
- newSymbolNode('y'),
- newSymbolNode('e'),
- newSymbolNode('r'),
- newSymbolNode('s'),
- ),
- ),
- },
- {
- pattern: "|",
- syntaxError: synErrAltLackOfOperand,
- },
- {
- pattern: "||",
- syntaxError: synErrAltLackOfOperand,
- },
- {
- pattern: "Mulder|",
- syntaxError: synErrAltLackOfOperand,
- },
- {
- pattern: "|Scully",
- syntaxError: synErrAltLackOfOperand,
- },
- {
- pattern: "Langly|Frohike|",
- syntaxError: synErrAltLackOfOperand,
- },
- {
- pattern: "Langly||Byers",
- syntaxError: synErrAltLackOfOperand,
- },
- {
- pattern: "|Frohike|Byers",
- syntaxError: synErrAltLackOfOperand,
- },
- {
- pattern: "|Frohike|",
- syntaxError: synErrAltLackOfOperand,
- },
- {
- pattern: "Fox(|)Mulder",
- syntaxError: synErrAltLackOfOperand,
- },
- {
- pattern: "(Fox|)Mulder",
- syntaxError: synErrAltLackOfOperand,
- },
- {
- pattern: "Fox(|Mulder)",
- syntaxError: synErrAltLackOfOperand,
- },
- }
- for i, tt := range tests {
- t.Run(fmt.Sprintf("#%v %v", i, tt.pattern), func(t *testing.T) {
- fragmentTrees := map[spec.LexKindName]CPTree{}
- for kind, pattern := range tt.fragments {
- p := NewParser(kind, strings.NewReader(pattern))
- root, err := p.Parse()
- if err != nil {
- t.Fatal(err)
- }
-
- fragmentTrees[kind] = root
- }
- err := CompleteFragments(fragmentTrees)
- if err != nil {
- t.Fatal(err)
- }
-
- p := NewParser(spec.LexKindName("test"), strings.NewReader(tt.pattern))
- root, err := p.Parse()
- if tt.syntaxError != nil {
- // printCPTree(os.Stdout, root, "", "")
- if err != ParseErr {
- t.Fatalf("unexpected error: want: %v, got: %v", ParseErr, err)
- }
- _, synErr := p.Error()
- if synErr != tt.syntaxError {
- t.Fatalf("unexpected syntax error: want: %v, got: %v", tt.syntaxError, synErr)
- }
- if root != nil {
- t.Fatalf("tree must be nil")
- }
- } else {
- if err != nil {
- detail, cause := p.Error()
- t.Fatalf("%v: %v: %v", err, cause, detail)
- }
- if root == nil {
- t.Fatal("tree must be non-nil")
- }
-
- complete, err := ApplyFragments(root, fragmentTrees)
- if err != nil {
- t.Fatal(err)
- }
- if !complete {
- t.Fatalf("incomplete fragments")
- }
-
- // printCPTree(os.Stdout, root, "", "")
- if !tt.skipTestAST {
- r := root.(*rootNode)
- testAST(t, tt.ast, r.tree)
- }
- }
- })
- }
-}
-
-func TestParse_ContributoryPropertyIsNotExposed(t *testing.T) {
- for _, cProp := range ucd.ContributoryProperties() {
- t.Run(fmt.Sprintf("%v", cProp), func(t *testing.T) {
- p := NewParser(spec.LexKindName("test"), strings.NewReader(fmt.Sprintf(`\p{%v=yes}`, cProp)))
- root, err := p.Parse()
- if err == nil {
- t.Fatalf("expected syntax error: got: nil")
- }
- _, synErr := p.Error()
- if synErr != synErrCharPropUnsupported {
- t.Fatalf("unexpected syntax error: want: %v, got: %v", synErrCharPropUnsupported, synErr)
- }
- if root != nil {
- t.Fatalf("tree is not nil")
- }
- })
- }
-}
-
-func TestExclude(t *testing.T) {
- for _, test := range []struct {
- caption string
- target CPTree
- base CPTree
- result CPTree
- }{
- // t.From > b.From && t.To < b.To
-
- // |t.From - b.From| = 1
- // |b.To - t.To| = 1
- //
- // Target (t): +--+
- // Base (b): +--+--+--+
- // Result (b - t): +--+ +--+
- {
- caption: "|t.From - b.From| = 1 && |b.To - t.To| = 1",
- target: newSymbolNode('1'),
- base: newRangeSymbolNode('0', '2'),
- result: newAltNode(
- newSymbolNode('0'),
- newSymbolNode('2'),
- ),
- },
- // |t.From - b.From| > 1
- // |b.To - t.To| > 1
- //
- // Target (t): +--+
- // Base (b): +--+--+--+--+--+
- // Result (b - t): +--+--+ +--+--+
- {
- caption: "|t.From - b.From| > 1 && |b.To - t.To| > 1",
- target: newSymbolNode('2'),
- base: newRangeSymbolNode('0', '4'),
- result: newAltNode(
- newRangeSymbolNode('0', '1'),
- newRangeSymbolNode('3', '4'),
- ),
- },
-
- // t.From <= b.From && t.To >= b.From && t.To < b.To
-
- // |b.From - t.From| = 0
- // |t.To - b.From| = 0
- // |b.To - t.To| = 1
- //
- // Target (t): +--+
- // Base (b): +--+--+
- // Result (b - t): +--+
- {
- caption: "|b.From - t.From| = 0 && |t.To - b.From| = 0 && |b.To - t.To| = 1",
- target: newSymbolNode('0'),
- base: newRangeSymbolNode('0', '1'),
- result: newSymbolNode('1'),
- },
- // |b.From - t.From| = 0
- // |t.To - b.From| = 0
- // |b.To - t.To| > 1
- //
- // Target (t): +--+
- // Base (b): +--+--+--+
- // Result (b - t): +--+--+
- {
- caption: "|b.From - t.From| = 0 && |t.To - b.From| = 0 && |b.To - t.To| > 1",
- target: newSymbolNode('0'),
- base: newRangeSymbolNode('0', '2'),
- result: newRangeSymbolNode('1', '2'),
- },
- // |b.From - t.From| = 0
- // |t.To - b.From| > 0
- // |b.To - t.To| = 1
- //
- // Target (t): +--+--+
- // Base (b): +--+--+--+
- // Result (b - t): +--+
- {
- caption: "|b.From - t.From| = 0 && |t.To - b.From| > 0 && |b.To - t.To| = 1",
- target: newRangeSymbolNode('0', '1'),
- base: newRangeSymbolNode('0', '2'),
- result: newSymbolNode('2'),
- },
- // |b.From - t.From| = 0
- // |t.To - b.From| > 0
- // |b.To - t.To| > 1
- //
- // Target (t): +--+--+
- // Base (b): +--+--+--+--+
- // Result (b - t): +--+--+
- {
- caption: "|b.From - t.From| = 0 && |t.To - b.From| > 0 && |b.To - t.To| > 1",
- target: newRangeSymbolNode('0', '1'),
- base: newRangeSymbolNode('0', '3'),
- result: newRangeSymbolNode('2', '3'),
- },
- // |b.From - t.From| > 0
- // |t.To - b.From| = 0
- // |b.To - t.To| = 1
- //
- // Target (t): +--+--+
- // Base (b): +--+--+
- // Result (b - t): +--+
- {
- caption: "|b.From - t.From| > 0 && |t.To - b.From| = 0 && |b.To - t.To| = 1",
- target: newRangeSymbolNode('0', '1'),
- base: newRangeSymbolNode('1', '2'),
- result: newSymbolNode('2'),
- },
- // |b.From - t.From| > 0
- // |t.To - b.From| = 0
- // |b.To - t.To| > 1
- //
- // Target (t): +--+--+
- // Base (b): +--+--+--+
- // Result (b - t): +--+--+
- {
- caption: "|b.From - t.From| > 0 && |t.To - b.From| = 0 && |b.To - t.To| > 1",
- target: newRangeSymbolNode('0', '1'),
- base: newRangeSymbolNode('1', '3'),
- result: newRangeSymbolNode('2', '3'),
- },
- // |b.From - t.From| > 0
- // |t.To - b.From| > 0
- // |b.To - t.To| = 1
- //
- // Target (t): +--+--+--+
- // Base (b): +--+--+--+
- // Result (b - t): +--+
- {
- caption: "|b.From - t.From| > 0 && |t.To - b.From| > 0 && |b.To - t.To| = 1",
- target: newRangeSymbolNode('0', '2'),
- base: newRangeSymbolNode('1', '3'),
- result: newSymbolNode('3'),
- },
- // |b.From - t.From| > 0
- // |t.To - b.From| > 0
- // |b.To - t.To| > 1
- //
- // Target (t): +--+--+--+
- // Base (b): +--+--+--+--+
- // Result (b - t): +--+--+
- {
- caption: "|b.From - t.From| > 0 && |t.To - b.From| > 0 && |b.To - t.To| > 1",
- target: newRangeSymbolNode('0', '2'),
- base: newRangeSymbolNode('1', '4'),
- result: newRangeSymbolNode('3', '4'),
- },
-
- // t.From > b.From && t.From <= b.To && t.To >= b.To
-
- // |t.From - b.From| = 1
- // |b.To - t.From| = 0
- // |t.To - b.To| = 0
- //
- // Target (t): +--+
- // Base (b): +--+--+
- // Result (b - t): +--+
- {
- caption: "|t.From - b.From| = 1 && |b.To - t.From| = 0 && |t.To - b.To| = 0",
- target: newSymbolNode('1'),
- base: newRangeSymbolNode('0', '1'),
- result: newSymbolNode('0'),
- },
- // |t.From - b.From| = 1
- // |b.To - t.From| = 0
- // |t.To - b.To| > 0
- //
- // Target (t): +--+--+
- // Base (b): +--+--+
- // Result (b - t): +--+
- {
- caption: "|t.From - b.From| = 1 && |b.To - t.From| = 0 && |t.To - b.To| > 0",
- target: newRangeSymbolNode('1', '2'),
- base: newRangeSymbolNode('0', '1'),
- result: newSymbolNode('0'),
- },
- // |t.From - b.From| = 1
- // |b.To - t.From| > 0
- // |t.To - b.To| = 0
- //
- // Target (t): +--+--+
- // Base (b): +--+--+--+
- // Result (b - t): +--+
- {
- caption: "|t.From - b.From| = 1 && |b.To - t.From| > 0 && |t.To - b.To| = 0",
- target: newRangeSymbolNode('1', '2'),
- base: newRangeSymbolNode('0', '2'),
- result: newSymbolNode('0'),
- },
- // |t.From - b.From| = 1
- // |b.To - t.From| > 0
- // |t.To - b.To| > 0
- //
- // Target (t): +--+--+--+
- // Base (b): +--+--+--+
- // Result (b - t): +--+
- {
- caption: "|t.From - b.From| = 1 && |b.To - t.From| > 0 && |t.To - b.To| > 0",
- target: newRangeSymbolNode('1', '3'),
- base: newRangeSymbolNode('0', '2'),
- result: newSymbolNode('0'),
- },
- // |t.From - b.From| > 1
- // |b.To - t.From| = 0
- // |t.To - b.To| = 0
- //
- // Target (t): +--+
- // Base (b): +--+--+--+
- // Result (b - t): +--+--+
- {
- caption: "|t.From - b.From| > 1 && |b.To - t.From| = 0 && |t.To - b.To| = 0",
- target: newSymbolNode('2'),
- base: newRangeSymbolNode('0', '2'),
- result: newRangeSymbolNode('0', '1'),
- },
- // |t.From - b.From| > 1
- // |b.To - t.From| = 0
- // |t.To - b.To| > 0
- //
- // Target (t): +--+--+
- // Base (b): +--+--+--+
- // Result (b - t): +--+--+
- {
- caption: "|t.From - b.From| > 1 && |b.To - t.From| = 0 && |t.To - b.To| > 0",
- target: newRangeSymbolNode('2', '3'),
- base: newRangeSymbolNode('0', '2'),
- result: newRangeSymbolNode('0', '1'),
- },
- // |t.From - b.From| > 1
- // |b.To - t.From| > 0
- // |t.To - b.To| = 0
- //
- // Target (t): +--+--+
- // Base (b): +--+--+--+--+
- // Result (b - t): +--+--+
- {
- caption: "|t.From - b.From| > 1 && |b.To - t.From| > 0 && |t.To - b.To| = 0",
- target: newRangeSymbolNode('2', '3'),
- base: newRangeSymbolNode('0', '3'),
- result: newRangeSymbolNode('0', '1'),
- },
- // |t.From - b.From| > 1
- // |b.To - t.From| > 0
- // |t.To - b.To| > 0
- //
- // Target (t): +--+--+--+
- // Base (b): +--+--+--+--+
- // Result (b - t): +--+--+
- {
- caption: "|t.From - b.From| > 1 && |b.To - t.From| > 0 && |t.To - b.To| > 0",
- target: newRangeSymbolNode('2', '4'),
- base: newRangeSymbolNode('0', '3'),
- result: newRangeSymbolNode('0', '1'),
- },
-
- // t.From <= b.From && t.To >= b.To
-
- // |b.From - t.From| = 0
- // |t.To - b.To| = 0
- //
- // Target (t): +--+
- // Base (b): +--+
- // Result (b - t): N/A
- {
- caption: "|b.From - t.From| = 0 && |t.To - b.To| = 0",
- target: newSymbolNode('0'),
- base: newSymbolNode('0'),
- result: nil,
- },
- // |b.From - t.From| = 0
- // |t.To - b.To| > 0
- //
- // Target (t): +--+--+
- // Base (b): +--+
- // Result (b - t): N/A
- {
- caption: "|b.From - t.From| = 0 && |t.To - b.To| > 0",
- target: newRangeSymbolNode('0', '1'),
- base: newSymbolNode('0'),
- result: nil,
- },
- // |b.From - t.From| > 0
- // |t.To - b.To| = 0
- //
- // Target (t): +--+--+
- // Base (b): +--+
- // Result (b - t): N/A
- {
- caption: "|b.From - t.From| > 0 && |t.To - b.To| = 0",
- target: newRangeSymbolNode('0', '1'),
- base: newSymbolNode('1'),
- result: nil,
- },
- // |b.From - t.From| > 0
- // |t.To - b.To| > 0
- //
- // Target (t): +--+--+--+
- // Base (b): +--+
- // Result (b - t): N/A
- {
- caption: "|b.From - t.From| > 0 && |t.To - b.To| > 0",
- target: newRangeSymbolNode('0', '2'),
- base: newSymbolNode('1'),
- result: nil,
- },
-
- // Others
-
- // |b.From - t.From| = 1
- //
- // Target (t): +--+
- // Base (b): +--+
- // Result (b - t): +--+
- {
- caption: "|b.From - t.From| = 1",
- target: newSymbolNode('0'),
- base: newSymbolNode('1'),
- result: newSymbolNode('1'),
- },
- // |b.From - t.From| > 1
- //
- // Target (t): +--+
- // Base (b): +--+
- // Result (b - t): +--+
- {
- caption: "|b.From - t.From| > 1",
- target: newSymbolNode('0'),
- base: newSymbolNode('2'),
- result: newSymbolNode('2'),
- },
- // |t.To - b.To| = 1
- //
- // Target (t): +--+
- // Base (b): +--+
- // Result (b - t): +--+
- {
- caption: "|t.To - b.To| = 1",
- target: newSymbolNode('1'),
- base: newSymbolNode('0'),
- result: newSymbolNode('0'),
- },
- // |t.To - b.To| > 1
- //
- // Target (t): +--+
- // Base (b): +--+
- // Result (b - t): +--+
- {
- caption: "|t.To - b.To| > 1",
- target: newSymbolNode('2'),
- base: newSymbolNode('0'),
- result: newSymbolNode('0'),
- },
- } {
- t.Run(test.caption, func(t *testing.T) {
- r := exclude(test.target, test.base)
- testAST(t, test.result, r)
- })
- }
-}
-
-func testAST(t *testing.T, expected, actual CPTree) {
- t.Helper()
-
- aTy := reflect.TypeOf(actual)
- eTy := reflect.TypeOf(expected)
- if eTy != aTy {
- t.Fatalf("unexpected node: want: %+v, got: %+v", eTy, aTy)
- }
-
- if actual == nil {
- return
- }
-
- switch e := expected.(type) {
- case *symbolNode:
- a := actual.(*symbolNode)
- if a.From != e.From || a.To != e.To {
- t.Fatalf("unexpected node: want: %+v, got: %+v", e, a)
- }
- }
- eLeft, eRight := expected.children()
- aLeft, aRight := actual.children()
- testAST(t, eLeft, aLeft)
- testAST(t, eRight, aRight)
-}
diff --git a/compiler/parser/tree.go b/compiler/parser/tree.go
deleted file mode 100644
index 04ba723..0000000
--- a/compiler/parser/tree.go
+++ /dev/null
@@ -1,459 +0,0 @@
-package parser
-
-import (
- "fmt"
- "io"
- "sort"
-
- "github.com/nihei9/maleeni/spec"
-)
-
-type CPRange struct {
- From rune
- To rune
-}
-
-type CPTree interface {
- fmt.Stringer
- Range() (rune, rune, bool)
- Optional() (CPTree, bool)
- Repeatable() (CPTree, bool)
- Concatenation() (CPTree, CPTree, bool)
- Alternatives() (CPTree, CPTree, bool)
- Describe() (spec.LexKindName, []spec.LexKindName, error)
-
- children() (CPTree, CPTree)
- clone() CPTree
-}
-
-var (
- _ CPTree = &rootNode{}
- _ CPTree = &symbolNode{}
- _ CPTree = &concatNode{}
- _ CPTree = &altNode{}
- _ CPTree = &quantifierNode{}
- _ CPTree = &fragmentNode{}
-)
-
-type rootNode struct {
- kind spec.LexKindName
- tree CPTree
- fragments map[spec.LexKindName][]*fragmentNode
-}
-
-func newRootNode(kind spec.LexKindName, t CPTree) *rootNode {
- fragments := map[spec.LexKindName][]*fragmentNode{}
- collectFragments(t, fragments)
-
- return &rootNode{
- kind: kind,
- tree: t,
- fragments: fragments,
- }
-}
-
-func collectFragments(n CPTree, fragments map[spec.LexKindName][]*fragmentNode) {
- if n == nil {
- return
- }
-
- if f, ok := n.(*fragmentNode); ok {
- fragments[f.kind] = append(fragments[f.kind], f)
- return
- }
-
- l, r := n.children()
- collectFragments(l, fragments)
- collectFragments(r, fragments)
-}
-
-func (n *rootNode) String() string {
- return fmt.Sprintf("root: %v: %v fragments", n.kind, len(n.fragments))
-}
-
-func (n *rootNode) Range() (rune, rune, bool) {
- return n.tree.Range()
-}
-
-func (n *rootNode) Optional() (CPTree, bool) {
- return n.tree.Optional()
-}
-
-func (n *rootNode) Repeatable() (CPTree, bool) {
- return n.tree.Repeatable()
-}
-
-func (n *rootNode) Concatenation() (CPTree, CPTree, bool) {
- return n.tree.Concatenation()
-}
-
-func (n *rootNode) Alternatives() (CPTree, CPTree, bool) {
- return n.tree.Alternatives()
-}
-
-func (n *rootNode) Describe() (spec.LexKindName, []spec.LexKindName, error) {
- var frags []spec.LexKindName
- for f := range n.fragments {
- frags = append(frags, spec.LexKindName(f))
- }
- sort.Slice(frags, func(i, j int) bool {
- return frags[i] < frags[j]
- })
-
- return n.kind, frags, nil
-}
-
-func (n *rootNode) children() (CPTree, CPTree) {
- return n.tree.children()
-}
-
-func (n *rootNode) clone() CPTree {
- return n.tree.clone()
-}
-
-func (n *rootNode) incomplete() bool {
- return len(n.fragments) > 0
-}
-
-func (n *rootNode) applyFragment(kind spec.LexKindName, fragment CPTree) error {
- root, ok := fragment.(*rootNode)
- if !ok {
- return fmt.Errorf("applyFragment can take only *rootNode: %T", fragment)
- }
- if root.incomplete() {
- return fmt.Errorf("fragment is incomplete")
- }
-
- fs, ok := n.fragments[kind]
- if !ok {
- return nil
- }
- for _, f := range fs {
- f.tree = root.clone()
- }
- delete(n.fragments, kind)
-
- return nil
-}
-
-type symbolNode struct {
- CPRange
-}
-
-func newSymbolNode(cp rune) *symbolNode {
- return &symbolNode{
- CPRange: CPRange{
- From: cp,
- To: cp,
- },
- }
-}
-
-func newRangeSymbolNode(from, to rune) *symbolNode {
- return &symbolNode{
- CPRange: CPRange{
- From: from,
- To: to,
- },
- }
-}
-
-func (n *symbolNode) String() string {
- return fmt.Sprintf("symbol: %X..%X", n.From, n.To)
-}
-
-func (n *symbolNode) Range() (rune, rune, bool) {
- return n.From, n.To, true
-}
-
-func (n *symbolNode) Optional() (CPTree, bool) {
- return nil, false
-}
-
-func (n *symbolNode) Repeatable() (CPTree, bool) {
- return nil, false
-}
-
-func (n *symbolNode) Concatenation() (CPTree, CPTree, bool) {
- return nil, nil, false
-}
-
-func (n *symbolNode) Alternatives() (CPTree, CPTree, bool) {
- return nil, nil, false
-}
-
-func (n *symbolNode) Describe() (spec.LexKindName, []spec.LexKindName, error) {
- return spec.LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n)
-}
-
-func (n *symbolNode) children() (CPTree, CPTree) {
- return nil, nil
-}
-
-func (n *symbolNode) clone() CPTree {
- return newRangeSymbolNode(n.From, n.To)
-}
-
-type concatNode struct {
- left CPTree
- right CPTree
-}
-
-func newConcatNode(left, right CPTree) *concatNode {
- return &concatNode{
- left: left,
- right: right,
- }
-}
-
-func (n *concatNode) String() string {
- return "concat"
-}
-
-func (n *concatNode) Range() (rune, rune, bool) {
- return 0, 0, false
-}
-
-func (n *concatNode) Optional() (CPTree, bool) {
- return nil, false
-}
-
-func (n *concatNode) Repeatable() (CPTree, bool) {
- return nil, false
-}
-
-func (n *concatNode) Concatenation() (CPTree, CPTree, bool) {
- return n.left, n.right, true
-}
-
-func (n *concatNode) Alternatives() (CPTree, CPTree, bool) {
- return nil, nil, false
-}
-
-func (n *concatNode) Describe() (spec.LexKindName, []spec.LexKindName, error) {
- return spec.LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n)
-}
-
-func (n *concatNode) children() (CPTree, CPTree) {
- return n.left, n.right
-}
-
-func (n *concatNode) clone() CPTree {
- if n == nil {
- return nil
- }
- return newConcatNode(n.left.clone(), n.right.clone())
-}
-
-type altNode struct {
- left CPTree
- right CPTree
-}
-
-func newAltNode(left, right CPTree) *altNode {
- return &altNode{
- left: left,
- right: right,
- }
-}
-
-func (n *altNode) String() string {
- return "alt"
-}
-
-func (n *altNode) Range() (rune, rune, bool) {
- return 0, 0, false
-}
-
-func (n *altNode) Optional() (CPTree, bool) {
- return nil, false
-}
-
-func (n *altNode) Repeatable() (CPTree, bool) {
- return nil, false
-}
-
-func (n *altNode) Concatenation() (CPTree, CPTree, bool) {
- return nil, nil, false
-}
-
-func (n *altNode) Alternatives() (CPTree, CPTree, bool) {
- return n.left, n.right, true
-}
-
-func (n *altNode) Describe() (spec.LexKindName, []spec.LexKindName, error) {
- return spec.LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n)
-}
-
-func (n *altNode) children() (CPTree, CPTree) {
- return n.left, n.right
-}
-
-func (n *altNode) clone() CPTree {
- return newAltNode(n.left.clone(), n.right.clone())
-}
-
-type quantifierNode struct {
- optional bool
- repeatable bool
- tree CPTree
-}
-
-func (n *quantifierNode) String() string {
- switch {
- case n.repeatable:
- return "repeatable (>= 0 times)"
- case n.optional:
- return "optional (0 or 1 times)"
- default:
- return "invalid quantifier"
- }
-}
-
-func newRepeatNode(t CPTree) *quantifierNode {
- return &quantifierNode{
- repeatable: true,
- tree: t,
- }
-}
-
-func newRepeatOneOrMoreNode(t CPTree) *concatNode {
- return newConcatNode(
- t,
- &quantifierNode{
- repeatable: true,
- tree: t.clone(),
- })
-}
-
-func newOptionNode(t CPTree) *quantifierNode {
- return &quantifierNode{
- optional: true,
- tree: t,
- }
-}
-
-func (n *quantifierNode) Range() (rune, rune, bool) {
- return 0, 0, false
-}
-
-func (n *quantifierNode) Optional() (CPTree, bool) {
- return n.tree, n.optional
-}
-
-func (n *quantifierNode) Repeatable() (CPTree, bool) {
- return n.tree, n.repeatable
-}
-
-func (n *quantifierNode) Concatenation() (CPTree, CPTree, bool) {
- return nil, nil, false
-}
-
-func (n *quantifierNode) Alternatives() (CPTree, CPTree, bool) {
- return nil, nil, false
-}
-
-func (n *quantifierNode) Describe() (spec.LexKindName, []spec.LexKindName, error) {
- return spec.LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n)
-}
-
-func (n *quantifierNode) children() (CPTree, CPTree) {
- return n.tree, nil
-}
-
-func (n *quantifierNode) clone() CPTree {
- if n.repeatable {
- return newRepeatNode(n.tree.clone())
- }
- return newOptionNode(n.tree.clone())
-}
-
-type fragmentNode struct {
- kind spec.LexKindName
- tree CPTree
-}
-
-func newFragmentNode(kind spec.LexKindName, t CPTree) *fragmentNode {
- return &fragmentNode{
- kind: kind,
- tree: t,
- }
-}
-
-func (n *fragmentNode) String() string {
- return fmt.Sprintf("fragment: %v", n.kind)
-}
-
-func (n *fragmentNode) Range() (rune, rune, bool) {
- return n.tree.Range()
-}
-
-func (n *fragmentNode) Optional() (CPTree, bool) {
- return n.tree.Optional()
-}
-
-func (n *fragmentNode) Repeatable() (CPTree, bool) {
- return n.tree.Repeatable()
-}
-
-func (n *fragmentNode) Concatenation() (CPTree, CPTree, bool) {
- return n.tree.Concatenation()
-}
-
-func (n *fragmentNode) Alternatives() (CPTree, CPTree, bool) {
- return n.tree.Alternatives()
-}
-
-func (n *fragmentNode) Describe() (spec.LexKindName, []spec.LexKindName, error) {
- return spec.LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n)
-}
-
-func (n *fragmentNode) children() (CPTree, CPTree) {
- return n.tree.children()
-}
-
-func (n *fragmentNode) clone() CPTree {
- if n.tree == nil {
- return newFragmentNode(n.kind, nil)
- }
- return newFragmentNode(n.kind, n.tree.clone())
-}
-
-//nolint:unused
-func printCPTree(w io.Writer, t CPTree, ruledLine string, childRuledLinePrefix string) {
- if t == nil {
- return
- }
- fmt.Fprintf(w, "%v%v\n", ruledLine, t)
- children := []CPTree{}
- switch n := t.(type) {
- case *rootNode:
- children = append(children, n.tree)
- case *fragmentNode:
- children = append(children, n.tree)
- default:
- left, right := t.children()
- if left != nil {
- children = append(children, left)
- }
- if right != nil {
- children = append(children, right)
- }
- }
- num := len(children)
- for i, child := range children {
- line := "└─ "
- if num > 1 {
- if i == 0 {
- line = "├─ "
- } else if i < num-1 {
- line = "│ "
- }
- }
- prefix := "│ "
- if i >= num-1 {
- prefix = " "
- }
- printCPTree(w, child, childRuledLinePrefix+line, childRuledLinePrefix+prefix)
- }
-}