diff options
Diffstat (limited to 'src/urubu/grammar/lexical/parser')
-rw-r--r-- | src/urubu/grammar/lexical/parser/error.go | 36 | ||||
-rw-r--r-- | src/urubu/grammar/lexical/parser/fragment.go | 72 | ||||
-rw-r--r-- | src/urubu/grammar/lexical/parser/lexer.go | 594 | ||||
-rw-r--r-- | src/urubu/grammar/lexical/parser/parser.go | 531 | ||||
-rw-r--r-- | src/urubu/grammar/lexical/parser/tree.go | 459 |
5 files changed, 0 insertions, 1692 deletions
diff --git a/src/urubu/grammar/lexical/parser/error.go b/src/urubu/grammar/lexical/parser/error.go deleted file mode 100644 index be81da4..0000000 --- a/src/urubu/grammar/lexical/parser/error.go +++ /dev/null @@ -1,36 +0,0 @@ -package parser - -import "fmt" - -var ( - ParseErr = fmt.Errorf("parse error") - - // lexical errors - synErrIncompletedEscSeq = fmt.Errorf("incompleted escape sequence; unexpected EOF following \\") - synErrInvalidEscSeq = fmt.Errorf("invalid escape sequence") - synErrInvalidCodePoint = fmt.Errorf("code points must consist of just 4 or 6 hex digits") - synErrCharPropInvalidSymbol = fmt.Errorf("invalid character property symbol") - SynErrFragmentInvalidSymbol = fmt.Errorf("invalid fragment symbol") - - // syntax errors - synErrUnexpectedToken = fmt.Errorf("unexpected token") - synErrNullPattern = fmt.Errorf("a pattern must be a non-empty byte sequence") - synErrUnmatchablePattern = fmt.Errorf("a pattern cannot match any characters") - synErrAltLackOfOperand = fmt.Errorf("an alternation expression must have operands") - synErrRepNoTarget = fmt.Errorf("a repeat expression must have an operand") - synErrGroupNoElem = fmt.Errorf("a grouping expression must include at least one character") - synErrGroupUnclosed = fmt.Errorf("unclosed grouping expression") - synErrGroupNoInitiator = fmt.Errorf(") needs preceding (") - synErrGroupInvalidForm = fmt.Errorf("invalid grouping expression") - synErrBExpNoElem = fmt.Errorf("a bracket expression must include at least one character") - synErrBExpUnclosed = fmt.Errorf("unclosed bracket expression") - synErrBExpInvalidForm = fmt.Errorf("invalid bracket expression") - synErrRangeInvalidOrder = fmt.Errorf("a range expression with invalid order") - synErrRangePropIsUnavailable = fmt.Errorf("a property expression is unavailable in a range expression") - synErrRangeInvalidForm = fmt.Errorf("invalid range expression") - synErrCPExpInvalidForm = fmt.Errorf("invalid code point expression") - synErrCPExpOutOfRange = fmt.Errorf("a code point must be between U+0000 to U+10FFFF") - synErrCharPropExpInvalidForm = fmt.Errorf("invalid character property expression") - synErrCharPropUnsupported = fmt.Errorf("unsupported character property") - synErrFragmentExpInvalidForm = fmt.Errorf("invalid fragment expression") -) diff --git a/src/urubu/grammar/lexical/parser/fragment.go b/src/urubu/grammar/lexical/parser/fragment.go deleted file mode 100644 index 196c00b..0000000 --- a/src/urubu/grammar/lexical/parser/fragment.go +++ /dev/null @@ -1,72 +0,0 @@ -package parser - -import ( - "fmt" - - spec "urubu/spec/grammar" -) - -type incompleteFragment struct { - kind spec.LexKindName - root *rootNode -} - -func CompleteFragments(fragments map[spec.LexKindName]CPTree) error { - if len(fragments) == 0 { - return nil - } - - completeFragments := map[spec.LexKindName]CPTree{} - incompleteFragments := []*incompleteFragment{} - for kind, tree := range fragments { - root, ok := tree.(*rootNode) - if !ok { - return fmt.Errorf("CompleteFragments can take only *rootNode: %T", tree) - } - if root.incomplete() { - incompleteFragments = append(incompleteFragments, &incompleteFragment{ - kind: kind, - root: root, - }) - } else { - completeFragments[kind] = root - } - } - for len(incompleteFragments) > 0 { - lastIncompCount := len(incompleteFragments) - remainingFragments := []*incompleteFragment{} - for _, e := range incompleteFragments { - complete, err := ApplyFragments(e.root, completeFragments) - if err != nil { - return err - } - if !complete { - remainingFragments = append(remainingFragments, e) - } else { - completeFragments[e.kind] = e.root - } - } - incompleteFragments = remainingFragments - if len(incompleteFragments) == lastIncompCount { - return ParseErr - } - } - - return nil -} - -func ApplyFragments(t CPTree, fragments map[spec.LexKindName]CPTree) (bool, error) { - root, ok := t.(*rootNode) - if !ok { - return false, fmt.Errorf("ApplyFragments can take only *rootNode type: %T", t) - } - - for name, frag := range fragments { - err := root.applyFragment(name, frag) - if err != nil { - return false, err - } - } - - return !root.incomplete(), nil -} diff --git a/src/urubu/grammar/lexical/parser/lexer.go b/src/urubu/grammar/lexical/parser/lexer.go deleted file mode 100644 index 3861825..0000000 --- a/src/urubu/grammar/lexical/parser/lexer.go +++ /dev/null @@ -1,594 +0,0 @@ -package parser - -import ( - "bufio" - "fmt" - "io" - "strings" -) - -type tokenKind string - -const ( - tokenKindChar tokenKind = "char" - tokenKindAnyChar tokenKind = "." - tokenKindRepeat tokenKind = "*" - tokenKindRepeatOneOrMore tokenKind = "+" - tokenKindOption tokenKind = "?" - tokenKindAlt tokenKind = "|" - tokenKindGroupOpen tokenKind = "(" - tokenKindGroupClose tokenKind = ")" - tokenKindBExpOpen tokenKind = "[" - tokenKindInverseBExpOpen tokenKind = "[^" - tokenKindBExpClose tokenKind = "]" - tokenKindCharRange tokenKind = "-" - tokenKindCodePointLeader tokenKind = "\\u" - tokenKindCharPropLeader tokenKind = "\\p" - tokenKindFragmentLeader tokenKind = "\\f" - tokenKindLBrace tokenKind = "{" - tokenKindRBrace tokenKind = "}" - tokenKindEqual tokenKind = "=" - tokenKindCodePoint tokenKind = "code point" - tokenKindCharPropSymbol tokenKind = "character property symbol" - tokenKindFragmentSymbol tokenKind = "fragment symbol" - tokenKindEOF tokenKind = "eof" -) - -type token struct { - kind tokenKind - char rune - propSymbol string - codePoint string - fragmentSymbol string -} - -const nullChar = '\u0000' - -func newToken(kind tokenKind, char rune) *token { - return &token{ - kind: kind, - char: char, - } -} - -func newCodePointToken(codePoint string) *token { - return &token{ - kind: tokenKindCodePoint, - codePoint: codePoint, - } -} - -func newCharPropSymbolToken(propSymbol string) *token { - return &token{ - kind: tokenKindCharPropSymbol, - propSymbol: propSymbol, - } -} - -func newFragmentSymbolToken(fragmentSymbol string) *token { - return &token{ - kind: tokenKindFragmentSymbol, - fragmentSymbol: fragmentSymbol, - } -} - -type lexerMode string - -const ( - lexerModeDefault lexerMode = "default" - lexerModeBExp lexerMode = "bracket expression" - lexerModeCPExp lexerMode = "code point expression" - lexerModeCharPropExp lexerMode = "character property expression" - lexerModeFragmentExp lexerMode = "fragment expression" -) - -type lexerModeStack struct { - stack []lexerMode -} - -func newLexerModeStack() *lexerModeStack { - return &lexerModeStack{ - stack: []lexerMode{ - lexerModeDefault, - }, - } -} - -func (s *lexerModeStack) top() lexerMode { - return s.stack[len(s.stack)-1] -} - -func (s *lexerModeStack) push(m lexerMode) { - s.stack = append(s.stack, m) -} - -func (s *lexerModeStack) pop() { - s.stack = s.stack[:len(s.stack)-1] -} - -type rangeState string - -// [a-z] -// ^^^^ -// |||`-- ready -// ||`-- expect range terminator -// |`-- read range initiator -// `-- ready -const ( - rangeStateReady rangeState = "ready" - rangeStateReadRangeInitiator rangeState = "read range initiator" - rangeStateExpectRangeTerminator rangeState = "expect range terminator" -) - -type lexer struct { - src *bufio.Reader - peekChar2 rune - peekEOF2 bool - peekChar1 rune - peekEOF1 bool - lastChar rune - reachedEOF bool - prevChar1 rune - prevEOF1 bool - prevChar2 rune - pervEOF2 bool - modeStack *lexerModeStack - rangeState rangeState - - errCause error - errDetail string -} - -func newLexer(src io.Reader) *lexer { - return &lexer{ - src: bufio.NewReader(src), - peekChar2: nullChar, - peekEOF2: false, - peekChar1: nullChar, - peekEOF1: false, - lastChar: nullChar, - reachedEOF: false, - prevChar1: nullChar, - prevEOF1: false, - prevChar2: nullChar, - pervEOF2: false, - modeStack: newLexerModeStack(), - rangeState: rangeStateReady, - } -} - -func (l *lexer) error() (string, error) { - return l.errDetail, l.errCause -} - -func (l *lexer) next() (*token, error) { - c, eof, err := l.read() - if err != nil { - return nil, err - } - if eof { - return newToken(tokenKindEOF, nullChar), nil - } - - switch l.modeStack.top() { - case lexerModeBExp: - tok, err := l.nextInBExp(c) - if err != nil { - return nil, err - } - if tok.kind == tokenKindChar || tok.kind == tokenKindCodePointLeader || tok.kind == tokenKindCharPropLeader { - switch l.rangeState { - case rangeStateReady: - l.rangeState = rangeStateReadRangeInitiator - case rangeStateExpectRangeTerminator: - l.rangeState = rangeStateReady - } - } - switch tok.kind { - case tokenKindBExpClose: - l.modeStack.pop() - case tokenKindCharRange: - l.rangeState = rangeStateExpectRangeTerminator - case tokenKindCodePointLeader: - l.modeStack.push(lexerModeCPExp) - case tokenKindCharPropLeader: - l.modeStack.push(lexerModeCharPropExp) - } - return tok, nil - case lexerModeCPExp: - tok, err := l.nextInCodePoint(c) - if err != nil { - return nil, err - } - switch tok.kind { - case tokenKindRBrace: - l.modeStack.pop() - } - return tok, nil - case lexerModeCharPropExp: - tok, err := l.nextInCharProp(c) - if err != nil { - return nil, err - } - switch tok.kind { - case tokenKindRBrace: - l.modeStack.pop() - } - return tok, nil - case lexerModeFragmentExp: - tok, err := l.nextInFragment(c) - if err != nil { - return nil, err - } - switch tok.kind { - case tokenKindRBrace: - l.modeStack.pop() - } - return tok, nil - default: - tok, err := l.nextInDefault(c) - if err != nil { - return nil, err - } - switch tok.kind { - case tokenKindBExpOpen: - l.modeStack.push(lexerModeBExp) - l.rangeState = rangeStateReady - case tokenKindInverseBExpOpen: - l.modeStack.push(lexerModeBExp) - l.rangeState = rangeStateReady - case tokenKindCodePointLeader: - l.modeStack.push(lexerModeCPExp) - case tokenKindCharPropLeader: - l.modeStack.push(lexerModeCharPropExp) - case tokenKindFragmentLeader: - l.modeStack.push(lexerModeFragmentExp) - } - return tok, nil - } -} - -func (l *lexer) nextInDefault(c rune) (*token, error) { - switch c { - case '*': - return newToken(tokenKindRepeat, nullChar), nil - case '+': - return newToken(tokenKindRepeatOneOrMore, nullChar), nil - case '?': - return newToken(tokenKindOption, nullChar), nil - case '.': - return newToken(tokenKindAnyChar, nullChar), nil - case '|': - return newToken(tokenKindAlt, nullChar), nil - case '(': - return newToken(tokenKindGroupOpen, nullChar), nil - case ')': - return newToken(tokenKindGroupClose, nullChar), nil - case '[': - c1, eof, err := l.read() - if err != nil { - return nil, err - } - if eof { - err := l.restore() - if err != nil { - return nil, err - } - return newToken(tokenKindBExpOpen, nullChar), nil - } - if c1 != '^' { - err := l.restore() - if err != nil { - return nil, err - } - return newToken(tokenKindBExpOpen, nullChar), nil - } - c2, eof, err := l.read() - if err != nil { - return nil, err - } - if eof { - err := l.restore() - if err != nil { - return nil, err - } - return newToken(tokenKindInverseBExpOpen, nullChar), nil - } - if c2 != ']' { - err := l.restore() - if err != nil { - return nil, err - } - return newToken(tokenKindInverseBExpOpen, nullChar), nil - } - err = l.restore() - if err != nil { - return nil, err - } - err = l.restore() - if err != nil { - return nil, err - } - return newToken(tokenKindBExpOpen, nullChar), nil - case '\\': - c, eof, err := l.read() - if err != nil { - return nil, err - } - if eof { - l.errCause = synErrIncompletedEscSeq - return nil, ParseErr - } - if c == 'u' { - return newToken(tokenKindCodePointLeader, nullChar), nil - } - if c == 'p' { - return newToken(tokenKindCharPropLeader, nullChar), nil - } - if c == 'f' { - return newToken(tokenKindFragmentLeader, nullChar), nil - } - if c == '\\' || c == '.' || c == '*' || c == '+' || c == '?' || c == '|' || c == '(' || c == ')' || c == '[' || c == ']' { - return newToken(tokenKindChar, c), nil - } - l.errCause = synErrInvalidEscSeq - l.errDetail = fmt.Sprintf("\\%v is not supported", string(c)) - return nil, ParseErr - default: - return newToken(tokenKindChar, c), nil - } -} - -func (l *lexer) nextInBExp(c rune) (*token, error) { - switch c { - case '-': - if l.rangeState != rangeStateReadRangeInitiator { - return newToken(tokenKindChar, c), nil - } - c1, eof, err := l.read() - if err != nil { - return nil, err - } - if eof { - err := l.restore() - if err != nil { - return nil, err - } - return newToken(tokenKindChar, c), nil - } - if c1 != ']' { - err := l.restore() - if err != nil { - return nil, err - } - return newToken(tokenKindCharRange, nullChar), nil - } - err = l.restore() - if err != nil { - return nil, err - } - return newToken(tokenKindChar, c), nil - case ']': - return newToken(tokenKindBExpClose, nullChar), nil - case '\\': - c, eof, err := l.read() - if err != nil { - return nil, err - } - if eof { - l.errCause = synErrIncompletedEscSeq - return nil, ParseErr - } - if c == 'u' { - return newToken(tokenKindCodePointLeader, nullChar), nil - } - if c == 'p' { - return newToken(tokenKindCharPropLeader, nullChar), nil - } - if c == '\\' || c == '^' || c == '-' || c == ']' { - return newToken(tokenKindChar, c), nil - } - l.errCause = synErrInvalidEscSeq - l.errDetail = fmt.Sprintf("\\%v is not supported in a bracket expression", string(c)) - return nil, ParseErr - default: - return newToken(tokenKindChar, c), nil - } -} - -func (l *lexer) nextInCodePoint(c rune) (*token, error) { - switch c { - case '{': - return newToken(tokenKindLBrace, nullChar), nil - case '}': - return newToken(tokenKindRBrace, nullChar), nil - default: - if !isHexDigit(c) { - l.errCause = synErrInvalidCodePoint - return nil, ParseErr - } - var b strings.Builder - fmt.Fprint(&b, string(c)) - n := 1 - for { - c, eof, err := l.read() - if err != nil { - return nil, err - } - if eof { - err := l.restore() - if err != nil { - return nil, err - } - break - } - if c == '}' { - err := l.restore() - if err != nil { - return nil, err - } - break - } - if !isHexDigit(c) || n >= 6 { - l.errCause = synErrInvalidCodePoint - return nil, ParseErr - } - fmt.Fprint(&b, string(c)) - n++ - } - cp := b.String() - cpLen := len(cp) - if !(cpLen == 4 || cpLen == 6) { - l.errCause = synErrInvalidCodePoint - return nil, ParseErr - } - return newCodePointToken(b.String()), nil - } -} - -func isHexDigit(c rune) bool { - if c >= '0' && c <= '9' || c >= 'A' && c <= 'Z' || c >= 'a' && c <= 'z' { - return true - } - return false -} - -func (l *lexer) nextInCharProp(c rune) (*token, error) { - switch c { - case '{': - return newToken(tokenKindLBrace, nullChar), nil - case '}': - return newToken(tokenKindRBrace, nullChar), nil - case '=': - return newToken(tokenKindEqual, nullChar), nil - default: - var b strings.Builder - fmt.Fprint(&b, string(c)) - n := 1 - for { - c, eof, err := l.read() - if err != nil { - return nil, err - } - if eof { - err := l.restore() - if err != nil { - return nil, err - } - break - } - if c == '}' || c == '=' { - err := l.restore() - if err != nil { - return nil, err - } - break - } - fmt.Fprint(&b, string(c)) - n++ - } - sym := strings.TrimSpace(b.String()) - if len(sym) == 0 { - l.errCause = synErrCharPropInvalidSymbol - return nil, ParseErr - } - return newCharPropSymbolToken(sym), nil - } -} - -func (l *lexer) nextInFragment(c rune) (*token, error) { - switch c { - case '{': - return newToken(tokenKindLBrace, nullChar), nil - case '}': - return newToken(tokenKindRBrace, nullChar), nil - default: - var b strings.Builder - fmt.Fprint(&b, string(c)) - n := 1 - for { - c, eof, err := l.read() - if err != nil { - return nil, err - } - if eof { - err := l.restore() - if err != nil { - return nil, err - } - break - } - if c == '}' { - err := l.restore() - if err != nil { - return nil, err - } - break - } - fmt.Fprint(&b, string(c)) - n++ - } - sym := strings.TrimSpace(b.String()) - if len(sym) == 0 { - l.errCause = SynErrFragmentInvalidSymbol - return nil, ParseErr - } - return newFragmentSymbolToken(sym), nil - } -} - -func (l *lexer) read() (rune, bool, error) { - if l.reachedEOF { - return l.lastChar, l.reachedEOF, nil - } - if l.peekChar1 != nullChar || l.peekEOF1 { - l.prevChar2 = l.prevChar1 - l.pervEOF2 = l.prevEOF1 - l.prevChar1 = l.lastChar - l.prevEOF1 = l.reachedEOF - l.lastChar = l.peekChar1 - l.reachedEOF = l.peekEOF1 - l.peekChar1 = l.peekChar2 - l.peekEOF1 = l.peekEOF2 - l.peekChar2 = nullChar - l.peekEOF2 = false - return l.lastChar, l.reachedEOF, nil - } - c, _, err := l.src.ReadRune() - if err != nil { - if err == io.EOF { - l.prevChar2 = l.prevChar1 - l.pervEOF2 = l.prevEOF1 - l.prevChar1 = l.lastChar - l.prevEOF1 = l.reachedEOF - l.lastChar = nullChar - l.reachedEOF = true - return l.lastChar, l.reachedEOF, nil - } - return nullChar, false, err - } - l.prevChar2 = l.prevChar1 - l.pervEOF2 = l.prevEOF1 - l.prevChar1 = l.lastChar - l.prevEOF1 = l.reachedEOF - l.lastChar = c - l.reachedEOF = false - return l.lastChar, l.reachedEOF, nil -} - -func (l *lexer) restore() error { - if l.lastChar == nullChar && !l.reachedEOF { - return fmt.Errorf("failed to call restore() because the last character is null") - } - l.peekChar2 = l.peekChar1 - l.peekEOF2 = l.peekEOF1 - l.peekChar1 = l.lastChar - l.peekEOF1 = l.reachedEOF - l.lastChar = l.prevChar1 - l.reachedEOF = l.prevEOF1 - l.prevChar1 = l.prevChar2 - l.prevEOF1 = l.pervEOF2 - l.prevChar2 = nullChar - l.pervEOF2 = false - return nil -} diff --git a/src/urubu/grammar/lexical/parser/parser.go b/src/urubu/grammar/lexical/parser/parser.go deleted file mode 100644 index 425b553..0000000 --- a/src/urubu/grammar/lexical/parser/parser.go +++ /dev/null @@ -1,531 +0,0 @@ -package parser - -import ( - "bytes" - "fmt" - "io" - "strconv" - - spec "urubu/spec/grammar" - "urubu/ucd" -) - -type PatternEntry struct { - ID spec.LexModeKindID - Pattern []byte -} - -type parser struct { - kind spec.LexKindName - lex *lexer - peekedTok *token - lastTok *token - - // If and only if isContributoryPropertyExposed is true, the parser interprets contributory properties that - // appear in property expressions. - // - // The contributory properties are not exposed, and users cannot use those properties because the parser - // follows [UAX #44 5.13 Property APIs]. For instance, \p{Other_Alphabetic} is invalid. - // - // isContributoryPropertyExposed is set to true when the parser is generated recursively. The parser needs to - // interpret derived properties internally because the derived properties consist of other properties that - // may contain the contributory properties. - // - // [UAX #44 5.13 Property APIs] says: - // > The following subtypes of Unicode character properties should generally not be exposed in APIs, - // > except in limited circumstances. They may not be useful, particularly in public API collections, - // > and may instead prove misleading to the users of such API collections. - // > * Contributory properties are not recommended for public APIs. - // > ... - // https://unicode.org/reports/tr44/#Property_APIs - isContributoryPropertyExposed bool - - errCause error - errDetail string -} - -func NewParser(kind spec.LexKindName, src io.Reader) *parser { - return &parser{ - kind: kind, - lex: newLexer(src), - isContributoryPropertyExposed: false, - } -} - -func (p *parser) exposeContributoryProperty() { - p.isContributoryPropertyExposed = true -} - -func (p *parser) Error() (string, error) { - return p.errDetail, p.errCause -} - -func (p *parser) Parse() (root CPTree, retErr error) { - defer func() { - err := recover() - if err != nil { - var ok bool - retErr, ok = err.(error) - if !ok { - panic(err) - } - return - } - }() - - return newRootNode(p.kind, p.parseRegexp()), nil -} - -func (p *parser) parseRegexp() CPTree { - alt := p.parseAlt() - if alt == nil { - if p.consume(tokenKindGroupClose) { - p.raiseParseError(synErrGroupNoInitiator, "") - } - p.raiseParseError(synErrNullPattern, "") - } - if p.consume(tokenKindGroupClose) { - p.raiseParseError(synErrGroupNoInitiator, "") - } - p.expect(tokenKindEOF) - return alt -} - -func (p *parser) parseAlt() CPTree { - left := p.parseConcat() - if left == nil { - if p.consume(tokenKindAlt) { - p.raiseParseError(synErrAltLackOfOperand, "") - } - return nil - } - for { - if !p.consume(tokenKindAlt) { - break - } - right := p.parseConcat() - if right == nil { - p.raiseParseError(synErrAltLackOfOperand, "") - } - left = newAltNode(left, right) - } - return left -} - -func (p *parser) parseConcat() CPTree { - left := p.parseRepeat() - for { - right := p.parseRepeat() - if right == nil { - break - } - left = newConcatNode(left, right) - } - return left -} - -func (p *parser) parseRepeat() CPTree { - group := p.parseGroup() - if group == nil { - if p.consume(tokenKindRepeat) { - p.raiseParseError(synErrRepNoTarget, "* needs an operand") - } - if p.consume(tokenKindRepeatOneOrMore) { - p.raiseParseError(synErrRepNoTarget, "+ needs an operand") - } - if p.consume(tokenKindOption) { - p.raiseParseError(synErrRepNoTarget, "? needs an operand") - } - return nil - } - if p.consume(tokenKindRepeat) { - return newRepeatNode(group) - } - if p.consume(tokenKindRepeatOneOrMore) { - return newRepeatOneOrMoreNode(group) - } - if p.consume(tokenKindOption) { - return newOptionNode(group) - } - return group -} - -func (p *parser) parseGroup() CPTree { - if p.consume(tokenKindGroupOpen) { - alt := p.parseAlt() - if alt == nil { - if p.consume(tokenKindEOF) { - p.raiseParseError(synErrGroupUnclosed, "") - } - p.raiseParseError(synErrGroupNoElem, "") - } - if p.consume(tokenKindEOF) { - p.raiseParseError(synErrGroupUnclosed, "") - } - if !p.consume(tokenKindGroupClose) { - p.raiseParseError(synErrGroupInvalidForm, "") - } - return alt - } - return p.parseSingleChar() -} - -func (p *parser) parseSingleChar() CPTree { - if p.consume(tokenKindAnyChar) { - return genAnyCharAST() - } - if p.consume(tokenKindBExpOpen) { - left := p.parseBExpElem() - if left == nil { - if p.consume(tokenKindEOF) { - p.raiseParseError(synErrBExpUnclosed, "") - } - p.raiseParseError(synErrBExpNoElem, "") - } - for { - right := p.parseBExpElem() - if right == nil { - break - } - left = newAltNode(left, right) - } - if p.consume(tokenKindEOF) { - p.raiseParseError(synErrBExpUnclosed, "") - } - p.expect(tokenKindBExpClose) - return left - } - if p.consume(tokenKindInverseBExpOpen) { - elem := p.parseBExpElem() - if elem == nil { - if p.consume(tokenKindEOF) { - p.raiseParseError(synErrBExpUnclosed, "") - } - p.raiseParseError(synErrBExpNoElem, "") - } - inverse := exclude(elem, genAnyCharAST()) - if inverse == nil { - p.raiseParseError(synErrUnmatchablePattern, "") - } - for { - elem := p.parseBExpElem() - if elem == nil { - break - } - inverse = exclude(elem, inverse) - if inverse == nil { - p.raiseParseError(synErrUnmatchablePattern, "") - } - } - if p.consume(tokenKindEOF) { - p.raiseParseError(synErrBExpUnclosed, "") - } - p.expect(tokenKindBExpClose) - return inverse - } - if p.consume(tokenKindCodePointLeader) { - return p.parseCodePoint() - } - if p.consume(tokenKindCharPropLeader) { - return p.parseCharProp() - } - if p.consume(tokenKindFragmentLeader) { - return p.parseFragment() - } - c := p.parseNormalChar() - if c == nil { - if p.consume(tokenKindBExpClose) { - p.raiseParseError(synErrBExpInvalidForm, "") - } - return nil - } - return c -} - -func (p *parser) parseBExpElem() CPTree { - var left CPTree - switch { - case p.consume(tokenKindCodePointLeader): - left = p.parseCodePoint() - case p.consume(tokenKindCharPropLeader): - left = p.parseCharProp() - if p.consume(tokenKindCharRange) { - p.raiseParseError(synErrRangePropIsUnavailable, "") - } - default: - left = p.parseNormalChar() - } - if left == nil { - return nil - } - if !p.consume(tokenKindCharRange) { - return left - } - var right CPTree - switch { - case p.consume(tokenKindCodePointLeader): - right = p.parseCodePoint() - case p.consume(tokenKindCharPropLeader): - p.raiseParseError(synErrRangePropIsUnavailable, "") - default: - right = p.parseNormalChar() - } - if right == nil { - p.raiseParseError(synErrRangeInvalidForm, "") - } - from, _, _ := left.Range() - _, to, _ := right.Range() - if !isValidOrder(from, to) { - p.raiseParseError(synErrRangeInvalidOrder, fmt.Sprintf("%X..%X", from, to)) - } - return newRangeSymbolNode(from, to) -} - -func (p *parser) parseCodePoint() CPTree { - if !p.consume(tokenKindLBrace) { - p.raiseParseError(synErrCPExpInvalidForm, "") - } - if !p.consume(tokenKindCodePoint) { - p.raiseParseError(synErrCPExpInvalidForm, "") - } - - n, err := strconv.ParseInt(p.lastTok.codePoint, 16, 64) - if err != nil { - panic(fmt.Errorf("failed to decode a code point (%v) into a int: %v", p.lastTok.codePoint, err)) - } - if n < 0x0000 || n > 0x10FFFF { - p.raiseParseError(synErrCPExpOutOfRange, "") - } - - sym := newSymbolNode(rune(n)) - - if !p.consume(tokenKindRBrace) { - p.raiseParseError(synErrCPExpInvalidForm, "") - } - - return sym -} - -func (p *parser) parseCharProp() CPTree { - if !p.consume(tokenKindLBrace) { - p.raiseParseError(synErrCharPropExpInvalidForm, "") - } - var sym1, sym2 string - if !p.consume(tokenKindCharPropSymbol) { - p.raiseParseError(synErrCharPropExpInvalidForm, "") - } - sym1 = p.lastTok.propSymbol - if p.consume(tokenKindEqual) { - if !p.consume(tokenKindCharPropSymbol) { - p.raiseParseError(synErrCharPropExpInvalidForm, "") - } - sym2 = p.lastTok.propSymbol - } - - var alt CPTree - var propName, propVal string - if sym2 != "" { - propName = sym1 - propVal = sym2 - } else { - propName = "" - propVal = sym1 - } - if !p.isContributoryPropertyExposed && ucd.IsContributoryProperty(propName) { - p.raiseParseError(synErrCharPropUnsupported, propName) - } - pat, err := ucd.NormalizeCharacterProperty(propName, propVal) - if err != nil { - p.raiseParseError(synErrCharPropUnsupported, err.Error()) - } - if pat != "" { - p := NewParser(p.kind, bytes.NewReader([]byte(pat))) - p.exposeContributoryProperty() - ast, err := p.Parse() - if err != nil { - panic(err) - } - alt = ast - } else { - cpRanges, inverse, err := ucd.FindCodePointRanges(propName, propVal) - if err != nil { - p.raiseParseError(synErrCharPropUnsupported, err.Error()) - } - if inverse { - r := cpRanges[0] - alt = exclude(newRangeSymbolNode(r.From, r.To), genAnyCharAST()) - if alt == nil { - p.raiseParseError(synErrUnmatchablePattern, "") - } - for _, r := range cpRanges[1:] { - alt = exclude(newRangeSymbolNode(r.From, r.To), alt) - if alt == nil { - p.raiseParseError(synErrUnmatchablePattern, "") - } - } - } else { - for _, r := range cpRanges { - alt = genAltNode( - alt, - newRangeSymbolNode(r.From, r.To), - ) - } - } - } - - if !p.consume(tokenKindRBrace) { - p.raiseParseError(synErrCharPropExpInvalidForm, "") - } - - return alt -} - -func (p *parser) parseFragment() CPTree { - if !p.consume(tokenKindLBrace) { - p.raiseParseError(synErrFragmentExpInvalidForm, "") - } - if !p.consume(tokenKindFragmentSymbol) { - p.raiseParseError(synErrFragmentExpInvalidForm, "") - } - sym := p.lastTok.fragmentSymbol - - if !p.consume(tokenKindRBrace) { - p.raiseParseError(synErrFragmentExpInvalidForm, "") - } - - return newFragmentNode(spec.LexKindName(sym), nil) -} - -func (p *parser) parseNormalChar() CPTree { - if !p.consume(tokenKindChar) { - return nil - } - return newSymbolNode(p.lastTok.char) -} - -func exclude(symbol, base CPTree) CPTree { - if left, right, ok := symbol.Alternatives(); ok { - return exclude(right, exclude(left, base)) - } - - if left, right, ok := base.Alternatives(); ok { - return genAltNode( - exclude(symbol, left), - exclude(symbol, right), - ) - } - - if bFrom, bTo, ok := base.Range(); ok { - sFrom, sTo, ok := symbol.Range() - if !ok { - panic(fmt.Errorf("invalid symbol tree: %T", symbol)) - } - - switch { - case sFrom > bFrom && sTo < bTo: - return genAltNode( - newRangeSymbolNode(bFrom, sFrom-1), - newRangeSymbolNode(sTo+1, bTo), - ) - case sFrom <= bFrom && sTo >= bFrom && sTo < bTo: - return newRangeSymbolNode(sTo+1, bTo) - case sFrom > bFrom && sFrom <= bTo && sTo >= bTo: - return newRangeSymbolNode(bFrom, sFrom-1) - case sFrom <= bFrom && sTo >= bTo: - return nil - default: - return base - } - } - - panic(fmt.Errorf("invalid base tree: %T", base)) -} - -func genAnyCharAST() CPTree { - return newRangeSymbolNode(0x0, 0x10FFFF) -} - -func isValidOrder(from, to rune) bool { - return from <= to -} - -func genConcatNode(cs ...CPTree) CPTree { - nonNilNodes := []CPTree{} - for _, c := range cs { - if c == nil { - continue - } - nonNilNodes = append(nonNilNodes, c) - } - if len(nonNilNodes) <= 0 { - return nil - } - if len(nonNilNodes) == 1 { - return nonNilNodes[0] - } - concat := newConcatNode(nonNilNodes[0], nonNilNodes[1]) - for _, c := range nonNilNodes[2:] { - concat = newConcatNode(concat, c) - } - return concat -} - -func genAltNode(cs ...CPTree) CPTree { - nonNilNodes := []CPTree{} - for _, c := range cs { - if c == nil { - continue - } - nonNilNodes = append(nonNilNodes, c) - } - if len(nonNilNodes) <= 0 { - return nil - } - if len(nonNilNodes) == 1 { - return nonNilNodes[0] - } - alt := newAltNode(nonNilNodes[0], nonNilNodes[1]) - for _, c := range nonNilNodes[2:] { - alt = newAltNode(alt, c) - } - return alt -} - -func (p *parser) expect(expected tokenKind) { - if !p.consume(expected) { - tok := p.peekedTok - p.raiseParseError(synErrUnexpectedToken, fmt.Sprintf("expected: %v, actual: %v", expected, tok.kind)) - } -} - -func (p *parser) consume(expected tokenKind) bool { - var tok *token - var err error - if p.peekedTok != nil { - tok = p.peekedTok - p.peekedTok = nil - } else { - tok, err = p.lex.next() - if err != nil { - if err == ParseErr { - detail, cause := p.lex.error() - p.raiseParseError(cause, detail) - } - panic(err) - } - } - p.lastTok = tok - if tok.kind == expected { - return true - } - p.peekedTok = tok - p.lastTok = nil - - return false -} - -func (p *parser) raiseParseError(err error, detail string) { - p.errCause = err - p.errDetail = detail - panic(ParseErr) -} diff --git a/src/urubu/grammar/lexical/parser/tree.go b/src/urubu/grammar/lexical/parser/tree.go deleted file mode 100644 index df03d37..0000000 --- a/src/urubu/grammar/lexical/parser/tree.go +++ /dev/null @@ -1,459 +0,0 @@ -package parser - -import ( - "fmt" - "io" - "sort" - - spec "urubu/spec/grammar" -) - -type CPRange struct { - From rune - To rune -} - -type CPTree interface { - fmt.Stringer - Range() (rune, rune, bool) - Optional() (CPTree, bool) - Repeatable() (CPTree, bool) - Concatenation() (CPTree, CPTree, bool) - Alternatives() (CPTree, CPTree, bool) - Describe() (spec.LexKindName, []spec.LexKindName, error) - - children() (CPTree, CPTree) - clone() CPTree -} - -var ( - _ CPTree = &rootNode{} - _ CPTree = &symbolNode{} - _ CPTree = &concatNode{} - _ CPTree = &altNode{} - _ CPTree = &quantifierNode{} - _ CPTree = &fragmentNode{} -) - -type rootNode struct { - kind spec.LexKindName - tree CPTree - fragments map[spec.LexKindName][]*fragmentNode -} - -func newRootNode(kind spec.LexKindName, t CPTree) *rootNode { - fragments := map[spec.LexKindName][]*fragmentNode{} - collectFragments(t, fragments) - - return &rootNode{ - kind: kind, - tree: t, - fragments: fragments, - } -} - -func collectFragments(n CPTree, fragments map[spec.LexKindName][]*fragmentNode) { - if n == nil { - return - } - - if f, ok := n.(*fragmentNode); ok { - fragments[f.kind] = append(fragments[f.kind], f) - return - } - - l, r := n.children() - collectFragments(l, fragments) - collectFragments(r, fragments) -} - -func (n *rootNode) String() string { - return fmt.Sprintf("root: %v: %v fragments", n.kind, len(n.fragments)) -} - -func (n *rootNode) Range() (rune, rune, bool) { - return n.tree.Range() -} - -func (n *rootNode) Optional() (CPTree, bool) { - return n.tree.Optional() -} - -func (n *rootNode) Repeatable() (CPTree, bool) { - return n.tree.Repeatable() -} - -func (n *rootNode) Concatenation() (CPTree, CPTree, bool) { - return n.tree.Concatenation() -} - -func (n *rootNode) Alternatives() (CPTree, CPTree, bool) { - return n.tree.Alternatives() -} - -func (n *rootNode) Describe() (spec.LexKindName, []spec.LexKindName, error) { - var frags []spec.LexKindName - for f := range n.fragments { - frags = append(frags, spec.LexKindName(f)) - } - sort.Slice(frags, func(i, j int) bool { - return frags[i] < frags[j] - }) - - return n.kind, frags, nil -} - -func (n *rootNode) children() (CPTree, CPTree) { - return n.tree.children() -} - -func (n *rootNode) clone() CPTree { - return n.tree.clone() -} - -func (n *rootNode) incomplete() bool { - return len(n.fragments) > 0 -} - -func (n *rootNode) applyFragment(kind spec.LexKindName, fragment CPTree) error { - root, ok := fragment.(*rootNode) - if !ok { - return fmt.Errorf("applyFragment can take only *rootNode: %T", fragment) - } - if root.incomplete() { - return fmt.Errorf("fragment is incomplete") - } - - fs, ok := n.fragments[kind] - if !ok { - return nil - } - for _, f := range fs { - f.tree = root.clone() - } - delete(n.fragments, kind) - - return nil -} - -type symbolNode struct { - CPRange -} - -func newSymbolNode(cp rune) *symbolNode { - return &symbolNode{ - CPRange: CPRange{ - From: cp, - To: cp, - }, - } -} - -func newRangeSymbolNode(from, to rune) *symbolNode { - return &symbolNode{ - CPRange: CPRange{ - From: from, - To: to, - }, - } -} - -func (n *symbolNode) String() string { - return fmt.Sprintf("symbol: %X..%X", n.From, n.To) -} - -func (n *symbolNode) Range() (rune, rune, bool) { - return n.From, n.To, true -} - -func (n *symbolNode) Optional() (CPTree, bool) { - return nil, false -} - -func (n *symbolNode) Repeatable() (CPTree, bool) { - return nil, false -} - -func (n *symbolNode) Concatenation() (CPTree, CPTree, bool) { - return nil, nil, false -} - -func (n *symbolNode) Alternatives() (CPTree, CPTree, bool) { - return nil, nil, false -} - -func (n *symbolNode) Describe() (spec.LexKindName, []spec.LexKindName, error) { - return spec.LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n) -} - -func (n *symbolNode) children() (CPTree, CPTree) { - return nil, nil -} - -func (n *symbolNode) clone() CPTree { - return newRangeSymbolNode(n.From, n.To) -} - -type concatNode struct { - left CPTree - right CPTree -} - -func newConcatNode(left, right CPTree) *concatNode { - return &concatNode{ - left: left, - right: right, - } -} - -func (n *concatNode) String() string { - return "concat" -} - -func (n *concatNode) Range() (rune, rune, bool) { - return 0, 0, false -} - -func (n *concatNode) Optional() (CPTree, bool) { - return nil, false -} - -func (n *concatNode) Repeatable() (CPTree, bool) { - return nil, false -} - -func (n *concatNode) Concatenation() (CPTree, CPTree, bool) { - return n.left, n.right, true -} - -func (n *concatNode) Alternatives() (CPTree, CPTree, bool) { - return nil, nil, false -} - -func (n *concatNode) Describe() (spec.LexKindName, []spec.LexKindName, error) { - return spec.LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n) -} - -func (n *concatNode) children() (CPTree, CPTree) { - return n.left, n.right -} - -func (n *concatNode) clone() CPTree { - if n == nil { - return nil - } - return newConcatNode(n.left.clone(), n.right.clone()) -} - -type altNode struct { - left CPTree - right CPTree -} - -func newAltNode(left, right CPTree) *altNode { - return &altNode{ - left: left, - right: right, - } -} - -func (n *altNode) String() string { - return "alt" -} - -func (n *altNode) Range() (rune, rune, bool) { - return 0, 0, false -} - -func (n *altNode) Optional() (CPTree, bool) { - return nil, false -} - -func (n *altNode) Repeatable() (CPTree, bool) { - return nil, false -} - -func (n *altNode) Concatenation() (CPTree, CPTree, bool) { - return nil, nil, false -} - -func (n *altNode) Alternatives() (CPTree, CPTree, bool) { - return n.left, n.right, true -} - -func (n *altNode) Describe() (spec.LexKindName, []spec.LexKindName, error) { - return spec.LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n) -} - -func (n *altNode) children() (CPTree, CPTree) { - return n.left, n.right -} - -func (n *altNode) clone() CPTree { - return newAltNode(n.left.clone(), n.right.clone()) -} - -type quantifierNode struct { - optional bool - repeatable bool - tree CPTree -} - -func (n *quantifierNode) String() string { - switch { - case n.repeatable: - return "repeatable (>= 0 times)" - case n.optional: - return "optional (0 or 1 times)" - default: - return "invalid quantifier" - } -} - -func newRepeatNode(t CPTree) *quantifierNode { - return &quantifierNode{ - repeatable: true, - tree: t, - } -} - -func newRepeatOneOrMoreNode(t CPTree) *concatNode { - return newConcatNode( - t, - &quantifierNode{ - repeatable: true, - tree: t.clone(), - }) -} - -func newOptionNode(t CPTree) *quantifierNode { - return &quantifierNode{ - optional: true, - tree: t, - } -} - -func (n *quantifierNode) Range() (rune, rune, bool) { - return 0, 0, false -} - -func (n *quantifierNode) Optional() (CPTree, bool) { - return n.tree, n.optional -} - -func (n *quantifierNode) Repeatable() (CPTree, bool) { - return n.tree, n.repeatable -} - -func (n *quantifierNode) Concatenation() (CPTree, CPTree, bool) { - return nil, nil, false -} - -func (n *quantifierNode) Alternatives() (CPTree, CPTree, bool) { - return nil, nil, false -} - -func (n *quantifierNode) Describe() (spec.LexKindName, []spec.LexKindName, error) { - return spec.LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n) -} - -func (n *quantifierNode) children() (CPTree, CPTree) { - return n.tree, nil -} - -func (n *quantifierNode) clone() CPTree { - if n.repeatable { - return newRepeatNode(n.tree.clone()) - } - return newOptionNode(n.tree.clone()) -} - -type fragmentNode struct { - kind spec.LexKindName - tree CPTree -} - -func newFragmentNode(kind spec.LexKindName, t CPTree) *fragmentNode { - return &fragmentNode{ - kind: kind, - tree: t, - } -} - -func (n *fragmentNode) String() string { - return fmt.Sprintf("fragment: %v", n.kind) -} - -func (n *fragmentNode) Range() (rune, rune, bool) { - return n.tree.Range() -} - -func (n *fragmentNode) Optional() (CPTree, bool) { - return n.tree.Optional() -} - -func (n *fragmentNode) Repeatable() (CPTree, bool) { - return n.tree.Repeatable() -} - -func (n *fragmentNode) Concatenation() (CPTree, CPTree, bool) { - return n.tree.Concatenation() -} - -func (n *fragmentNode) Alternatives() (CPTree, CPTree, bool) { - return n.tree.Alternatives() -} - -func (n *fragmentNode) Describe() (spec.LexKindName, []spec.LexKindName, error) { - return spec.LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n) -} - -func (n *fragmentNode) children() (CPTree, CPTree) { - return n.tree.children() -} - -func (n *fragmentNode) clone() CPTree { - if n.tree == nil { - return newFragmentNode(n.kind, nil) - } - return newFragmentNode(n.kind, n.tree.clone()) -} - -//nolint:unused -func printCPTree(w io.Writer, t CPTree, ruledLine string, childRuledLinePrefix string) { - if t == nil { - return - } - fmt.Fprintf(w, "%v%v\n", ruledLine, t) - children := []CPTree{} - switch n := t.(type) { - case *rootNode: - children = append(children, n.tree) - case *fragmentNode: - children = append(children, n.tree) - default: - left, right := t.children() - if left != nil { - children = append(children, left) - } - if right != nil { - children = append(children, right) - } - } - num := len(children) - for i, child := range children { - line := "└─ " - if num > 1 { - if i == 0 { - line = "├─ " - } else if i < num-1 { - line = "│ " - } - } - prefix := "│ " - if i >= num-1 { - prefix = " " - } - printCPTree(w, child, childRuledLinePrefix+line, childRuledLinePrefix+prefix) - } -} |