aboutsummaryrefslogtreecommitdiff
path: root/src/urubu/grammar/lexical/parser/parser.go
diff options
context:
space:
mode:
Diffstat (limited to 'src/urubu/grammar/lexical/parser/parser.go')
-rw-r--r--src/urubu/grammar/lexical/parser/parser.go531
1 files changed, 0 insertions, 531 deletions
diff --git a/src/urubu/grammar/lexical/parser/parser.go b/src/urubu/grammar/lexical/parser/parser.go
deleted file mode 100644
index 425b553..0000000
--- a/src/urubu/grammar/lexical/parser/parser.go
+++ /dev/null
@@ -1,531 +0,0 @@
-package parser
-
-import (
- "bytes"
- "fmt"
- "io"
- "strconv"
-
- spec "urubu/spec/grammar"
- "urubu/ucd"
-)
-
-type PatternEntry struct {
- ID spec.LexModeKindID
- Pattern []byte
-}
-
-type parser struct {
- kind spec.LexKindName
- lex *lexer
- peekedTok *token
- lastTok *token
-
- // If and only if isContributoryPropertyExposed is true, the parser interprets contributory properties that
- // appear in property expressions.
- //
- // The contributory properties are not exposed, and users cannot use those properties because the parser
- // follows [UAX #44 5.13 Property APIs]. For instance, \p{Other_Alphabetic} is invalid.
- //
- // isContributoryPropertyExposed is set to true when the parser is generated recursively. The parser needs to
- // interpret derived properties internally because the derived properties consist of other properties that
- // may contain the contributory properties.
- //
- // [UAX #44 5.13 Property APIs] says:
- // > The following subtypes of Unicode character properties should generally not be exposed in APIs,
- // > except in limited circumstances. They may not be useful, particularly in public API collections,
- // > and may instead prove misleading to the users of such API collections.
- // > * Contributory properties are not recommended for public APIs.
- // > ...
- // https://unicode.org/reports/tr44/#Property_APIs
- isContributoryPropertyExposed bool
-
- errCause error
- errDetail string
-}
-
-func NewParser(kind spec.LexKindName, src io.Reader) *parser {
- return &parser{
- kind: kind,
- lex: newLexer(src),
- isContributoryPropertyExposed: false,
- }
-}
-
-func (p *parser) exposeContributoryProperty() {
- p.isContributoryPropertyExposed = true
-}
-
-func (p *parser) Error() (string, error) {
- return p.errDetail, p.errCause
-}
-
-func (p *parser) Parse() (root CPTree, retErr error) {
- defer func() {
- err := recover()
- if err != nil {
- var ok bool
- retErr, ok = err.(error)
- if !ok {
- panic(err)
- }
- return
- }
- }()
-
- return newRootNode(p.kind, p.parseRegexp()), nil
-}
-
-func (p *parser) parseRegexp() CPTree {
- alt := p.parseAlt()
- if alt == nil {
- if p.consume(tokenKindGroupClose) {
- p.raiseParseError(synErrGroupNoInitiator, "")
- }
- p.raiseParseError(synErrNullPattern, "")
- }
- if p.consume(tokenKindGroupClose) {
- p.raiseParseError(synErrGroupNoInitiator, "")
- }
- p.expect(tokenKindEOF)
- return alt
-}
-
-func (p *parser) parseAlt() CPTree {
- left := p.parseConcat()
- if left == nil {
- if p.consume(tokenKindAlt) {
- p.raiseParseError(synErrAltLackOfOperand, "")
- }
- return nil
- }
- for {
- if !p.consume(tokenKindAlt) {
- break
- }
- right := p.parseConcat()
- if right == nil {
- p.raiseParseError(synErrAltLackOfOperand, "")
- }
- left = newAltNode(left, right)
- }
- return left
-}
-
-func (p *parser) parseConcat() CPTree {
- left := p.parseRepeat()
- for {
- right := p.parseRepeat()
- if right == nil {
- break
- }
- left = newConcatNode(left, right)
- }
- return left
-}
-
-func (p *parser) parseRepeat() CPTree {
- group := p.parseGroup()
- if group == nil {
- if p.consume(tokenKindRepeat) {
- p.raiseParseError(synErrRepNoTarget, "* needs an operand")
- }
- if p.consume(tokenKindRepeatOneOrMore) {
- p.raiseParseError(synErrRepNoTarget, "+ needs an operand")
- }
- if p.consume(tokenKindOption) {
- p.raiseParseError(synErrRepNoTarget, "? needs an operand")
- }
- return nil
- }
- if p.consume(tokenKindRepeat) {
- return newRepeatNode(group)
- }
- if p.consume(tokenKindRepeatOneOrMore) {
- return newRepeatOneOrMoreNode(group)
- }
- if p.consume(tokenKindOption) {
- return newOptionNode(group)
- }
- return group
-}
-
-func (p *parser) parseGroup() CPTree {
- if p.consume(tokenKindGroupOpen) {
- alt := p.parseAlt()
- if alt == nil {
- if p.consume(tokenKindEOF) {
- p.raiseParseError(synErrGroupUnclosed, "")
- }
- p.raiseParseError(synErrGroupNoElem, "")
- }
- if p.consume(tokenKindEOF) {
- p.raiseParseError(synErrGroupUnclosed, "")
- }
- if !p.consume(tokenKindGroupClose) {
- p.raiseParseError(synErrGroupInvalidForm, "")
- }
- return alt
- }
- return p.parseSingleChar()
-}
-
-func (p *parser) parseSingleChar() CPTree {
- if p.consume(tokenKindAnyChar) {
- return genAnyCharAST()
- }
- if p.consume(tokenKindBExpOpen) {
- left := p.parseBExpElem()
- if left == nil {
- if p.consume(tokenKindEOF) {
- p.raiseParseError(synErrBExpUnclosed, "")
- }
- p.raiseParseError(synErrBExpNoElem, "")
- }
- for {
- right := p.parseBExpElem()
- if right == nil {
- break
- }
- left = newAltNode(left, right)
- }
- if p.consume(tokenKindEOF) {
- p.raiseParseError(synErrBExpUnclosed, "")
- }
- p.expect(tokenKindBExpClose)
- return left
- }
- if p.consume(tokenKindInverseBExpOpen) {
- elem := p.parseBExpElem()
- if elem == nil {
- if p.consume(tokenKindEOF) {
- p.raiseParseError(synErrBExpUnclosed, "")
- }
- p.raiseParseError(synErrBExpNoElem, "")
- }
- inverse := exclude(elem, genAnyCharAST())
- if inverse == nil {
- p.raiseParseError(synErrUnmatchablePattern, "")
- }
- for {
- elem := p.parseBExpElem()
- if elem == nil {
- break
- }
- inverse = exclude(elem, inverse)
- if inverse == nil {
- p.raiseParseError(synErrUnmatchablePattern, "")
- }
- }
- if p.consume(tokenKindEOF) {
- p.raiseParseError(synErrBExpUnclosed, "")
- }
- p.expect(tokenKindBExpClose)
- return inverse
- }
- if p.consume(tokenKindCodePointLeader) {
- return p.parseCodePoint()
- }
- if p.consume(tokenKindCharPropLeader) {
- return p.parseCharProp()
- }
- if p.consume(tokenKindFragmentLeader) {
- return p.parseFragment()
- }
- c := p.parseNormalChar()
- if c == nil {
- if p.consume(tokenKindBExpClose) {
- p.raiseParseError(synErrBExpInvalidForm, "")
- }
- return nil
- }
- return c
-}
-
-func (p *parser) parseBExpElem() CPTree {
- var left CPTree
- switch {
- case p.consume(tokenKindCodePointLeader):
- left = p.parseCodePoint()
- case p.consume(tokenKindCharPropLeader):
- left = p.parseCharProp()
- if p.consume(tokenKindCharRange) {
- p.raiseParseError(synErrRangePropIsUnavailable, "")
- }
- default:
- left = p.parseNormalChar()
- }
- if left == nil {
- return nil
- }
- if !p.consume(tokenKindCharRange) {
- return left
- }
- var right CPTree
- switch {
- case p.consume(tokenKindCodePointLeader):
- right = p.parseCodePoint()
- case p.consume(tokenKindCharPropLeader):
- p.raiseParseError(synErrRangePropIsUnavailable, "")
- default:
- right = p.parseNormalChar()
- }
- if right == nil {
- p.raiseParseError(synErrRangeInvalidForm, "")
- }
- from, _, _ := left.Range()
- _, to, _ := right.Range()
- if !isValidOrder(from, to) {
- p.raiseParseError(synErrRangeInvalidOrder, fmt.Sprintf("%X..%X", from, to))
- }
- return newRangeSymbolNode(from, to)
-}
-
-func (p *parser) parseCodePoint() CPTree {
- if !p.consume(tokenKindLBrace) {
- p.raiseParseError(synErrCPExpInvalidForm, "")
- }
- if !p.consume(tokenKindCodePoint) {
- p.raiseParseError(synErrCPExpInvalidForm, "")
- }
-
- n, err := strconv.ParseInt(p.lastTok.codePoint, 16, 64)
- if err != nil {
- panic(fmt.Errorf("failed to decode a code point (%v) into a int: %v", p.lastTok.codePoint, err))
- }
- if n < 0x0000 || n > 0x10FFFF {
- p.raiseParseError(synErrCPExpOutOfRange, "")
- }
-
- sym := newSymbolNode(rune(n))
-
- if !p.consume(tokenKindRBrace) {
- p.raiseParseError(synErrCPExpInvalidForm, "")
- }
-
- return sym
-}
-
-func (p *parser) parseCharProp() CPTree {
- if !p.consume(tokenKindLBrace) {
- p.raiseParseError(synErrCharPropExpInvalidForm, "")
- }
- var sym1, sym2 string
- if !p.consume(tokenKindCharPropSymbol) {
- p.raiseParseError(synErrCharPropExpInvalidForm, "")
- }
- sym1 = p.lastTok.propSymbol
- if p.consume(tokenKindEqual) {
- if !p.consume(tokenKindCharPropSymbol) {
- p.raiseParseError(synErrCharPropExpInvalidForm, "")
- }
- sym2 = p.lastTok.propSymbol
- }
-
- var alt CPTree
- var propName, propVal string
- if sym2 != "" {
- propName = sym1
- propVal = sym2
- } else {
- propName = ""
- propVal = sym1
- }
- if !p.isContributoryPropertyExposed && ucd.IsContributoryProperty(propName) {
- p.raiseParseError(synErrCharPropUnsupported, propName)
- }
- pat, err := ucd.NormalizeCharacterProperty(propName, propVal)
- if err != nil {
- p.raiseParseError(synErrCharPropUnsupported, err.Error())
- }
- if pat != "" {
- p := NewParser(p.kind, bytes.NewReader([]byte(pat)))
- p.exposeContributoryProperty()
- ast, err := p.Parse()
- if err != nil {
- panic(err)
- }
- alt = ast
- } else {
- cpRanges, inverse, err := ucd.FindCodePointRanges(propName, propVal)
- if err != nil {
- p.raiseParseError(synErrCharPropUnsupported, err.Error())
- }
- if inverse {
- r := cpRanges[0]
- alt = exclude(newRangeSymbolNode(r.From, r.To), genAnyCharAST())
- if alt == nil {
- p.raiseParseError(synErrUnmatchablePattern, "")
- }
- for _, r := range cpRanges[1:] {
- alt = exclude(newRangeSymbolNode(r.From, r.To), alt)
- if alt == nil {
- p.raiseParseError(synErrUnmatchablePattern, "")
- }
- }
- } else {
- for _, r := range cpRanges {
- alt = genAltNode(
- alt,
- newRangeSymbolNode(r.From, r.To),
- )
- }
- }
- }
-
- if !p.consume(tokenKindRBrace) {
- p.raiseParseError(synErrCharPropExpInvalidForm, "")
- }
-
- return alt
-}
-
-func (p *parser) parseFragment() CPTree {
- if !p.consume(tokenKindLBrace) {
- p.raiseParseError(synErrFragmentExpInvalidForm, "")
- }
- if !p.consume(tokenKindFragmentSymbol) {
- p.raiseParseError(synErrFragmentExpInvalidForm, "")
- }
- sym := p.lastTok.fragmentSymbol
-
- if !p.consume(tokenKindRBrace) {
- p.raiseParseError(synErrFragmentExpInvalidForm, "")
- }
-
- return newFragmentNode(spec.LexKindName(sym), nil)
-}
-
-func (p *parser) parseNormalChar() CPTree {
- if !p.consume(tokenKindChar) {
- return nil
- }
- return newSymbolNode(p.lastTok.char)
-}
-
-func exclude(symbol, base CPTree) CPTree {
- if left, right, ok := symbol.Alternatives(); ok {
- return exclude(right, exclude(left, base))
- }
-
- if left, right, ok := base.Alternatives(); ok {
- return genAltNode(
- exclude(symbol, left),
- exclude(symbol, right),
- )
- }
-
- if bFrom, bTo, ok := base.Range(); ok {
- sFrom, sTo, ok := symbol.Range()
- if !ok {
- panic(fmt.Errorf("invalid symbol tree: %T", symbol))
- }
-
- switch {
- case sFrom > bFrom && sTo < bTo:
- return genAltNode(
- newRangeSymbolNode(bFrom, sFrom-1),
- newRangeSymbolNode(sTo+1, bTo),
- )
- case sFrom <= bFrom && sTo >= bFrom && sTo < bTo:
- return newRangeSymbolNode(sTo+1, bTo)
- case sFrom > bFrom && sFrom <= bTo && sTo >= bTo:
- return newRangeSymbolNode(bFrom, sFrom-1)
- case sFrom <= bFrom && sTo >= bTo:
- return nil
- default:
- return base
- }
- }
-
- panic(fmt.Errorf("invalid base tree: %T", base))
-}
-
-func genAnyCharAST() CPTree {
- return newRangeSymbolNode(0x0, 0x10FFFF)
-}
-
-func isValidOrder(from, to rune) bool {
- return from <= to
-}
-
-func genConcatNode(cs ...CPTree) CPTree {
- nonNilNodes := []CPTree{}
- for _, c := range cs {
- if c == nil {
- continue
- }
- nonNilNodes = append(nonNilNodes, c)
- }
- if len(nonNilNodes) <= 0 {
- return nil
- }
- if len(nonNilNodes) == 1 {
- return nonNilNodes[0]
- }
- concat := newConcatNode(nonNilNodes[0], nonNilNodes[1])
- for _, c := range nonNilNodes[2:] {
- concat = newConcatNode(concat, c)
- }
- return concat
-}
-
-func genAltNode(cs ...CPTree) CPTree {
- nonNilNodes := []CPTree{}
- for _, c := range cs {
- if c == nil {
- continue
- }
- nonNilNodes = append(nonNilNodes, c)
- }
- if len(nonNilNodes) <= 0 {
- return nil
- }
- if len(nonNilNodes) == 1 {
- return nonNilNodes[0]
- }
- alt := newAltNode(nonNilNodes[0], nonNilNodes[1])
- for _, c := range nonNilNodes[2:] {
- alt = newAltNode(alt, c)
- }
- return alt
-}
-
-func (p *parser) expect(expected tokenKind) {
- if !p.consume(expected) {
- tok := p.peekedTok
- p.raiseParseError(synErrUnexpectedToken, fmt.Sprintf("expected: %v, actual: %v", expected, tok.kind))
- }
-}
-
-func (p *parser) consume(expected tokenKind) bool {
- var tok *token
- var err error
- if p.peekedTok != nil {
- tok = p.peekedTok
- p.peekedTok = nil
- } else {
- tok, err = p.lex.next()
- if err != nil {
- if err == ParseErr {
- detail, cause := p.lex.error()
- p.raiseParseError(cause, detail)
- }
- panic(err)
- }
- }
- p.lastTok = tok
- if tok.kind == expected {
- return true
- }
- p.peekedTok = tok
- p.lastTok = nil
-
- return false
-}
-
-func (p *parser) raiseParseError(err error, detail string) {
- p.errCause = err
- p.errDetail = detail
- panic(ParseErr)
-}