diff options
Diffstat (limited to 'src/urubu/grammar/lexical/parser/parser.go')
-rw-r--r-- | src/urubu/grammar/lexical/parser/parser.go | 531 |
1 files changed, 0 insertions, 531 deletions
diff --git a/src/urubu/grammar/lexical/parser/parser.go b/src/urubu/grammar/lexical/parser/parser.go deleted file mode 100644 index 425b553..0000000 --- a/src/urubu/grammar/lexical/parser/parser.go +++ /dev/null @@ -1,531 +0,0 @@ -package parser - -import ( - "bytes" - "fmt" - "io" - "strconv" - - spec "urubu/spec/grammar" - "urubu/ucd" -) - -type PatternEntry struct { - ID spec.LexModeKindID - Pattern []byte -} - -type parser struct { - kind spec.LexKindName - lex *lexer - peekedTok *token - lastTok *token - - // If and only if isContributoryPropertyExposed is true, the parser interprets contributory properties that - // appear in property expressions. - // - // The contributory properties are not exposed, and users cannot use those properties because the parser - // follows [UAX #44 5.13 Property APIs]. For instance, \p{Other_Alphabetic} is invalid. - // - // isContributoryPropertyExposed is set to true when the parser is generated recursively. The parser needs to - // interpret derived properties internally because the derived properties consist of other properties that - // may contain the contributory properties. - // - // [UAX #44 5.13 Property APIs] says: - // > The following subtypes of Unicode character properties should generally not be exposed in APIs, - // > except in limited circumstances. They may not be useful, particularly in public API collections, - // > and may instead prove misleading to the users of such API collections. - // > * Contributory properties are not recommended for public APIs. - // > ... - // https://unicode.org/reports/tr44/#Property_APIs - isContributoryPropertyExposed bool - - errCause error - errDetail string -} - -func NewParser(kind spec.LexKindName, src io.Reader) *parser { - return &parser{ - kind: kind, - lex: newLexer(src), - isContributoryPropertyExposed: false, - } -} - -func (p *parser) exposeContributoryProperty() { - p.isContributoryPropertyExposed = true -} - -func (p *parser) Error() (string, error) { - return p.errDetail, p.errCause -} - -func (p *parser) Parse() (root CPTree, retErr error) { - defer func() { - err := recover() - if err != nil { - var ok bool - retErr, ok = err.(error) - if !ok { - panic(err) - } - return - } - }() - - return newRootNode(p.kind, p.parseRegexp()), nil -} - -func (p *parser) parseRegexp() CPTree { - alt := p.parseAlt() - if alt == nil { - if p.consume(tokenKindGroupClose) { - p.raiseParseError(synErrGroupNoInitiator, "") - } - p.raiseParseError(synErrNullPattern, "") - } - if p.consume(tokenKindGroupClose) { - p.raiseParseError(synErrGroupNoInitiator, "") - } - p.expect(tokenKindEOF) - return alt -} - -func (p *parser) parseAlt() CPTree { - left := p.parseConcat() - if left == nil { - if p.consume(tokenKindAlt) { - p.raiseParseError(synErrAltLackOfOperand, "") - } - return nil - } - for { - if !p.consume(tokenKindAlt) { - break - } - right := p.parseConcat() - if right == nil { - p.raiseParseError(synErrAltLackOfOperand, "") - } - left = newAltNode(left, right) - } - return left -} - -func (p *parser) parseConcat() CPTree { - left := p.parseRepeat() - for { - right := p.parseRepeat() - if right == nil { - break - } - left = newConcatNode(left, right) - } - return left -} - -func (p *parser) parseRepeat() CPTree { - group := p.parseGroup() - if group == nil { - if p.consume(tokenKindRepeat) { - p.raiseParseError(synErrRepNoTarget, "* needs an operand") - } - if p.consume(tokenKindRepeatOneOrMore) { - p.raiseParseError(synErrRepNoTarget, "+ needs an operand") - } - if p.consume(tokenKindOption) { - p.raiseParseError(synErrRepNoTarget, "? needs an operand") - } - return nil - } - if p.consume(tokenKindRepeat) { - return newRepeatNode(group) - } - if p.consume(tokenKindRepeatOneOrMore) { - return newRepeatOneOrMoreNode(group) - } - if p.consume(tokenKindOption) { - return newOptionNode(group) - } - return group -} - -func (p *parser) parseGroup() CPTree { - if p.consume(tokenKindGroupOpen) { - alt := p.parseAlt() - if alt == nil { - if p.consume(tokenKindEOF) { - p.raiseParseError(synErrGroupUnclosed, "") - } - p.raiseParseError(synErrGroupNoElem, "") - } - if p.consume(tokenKindEOF) { - p.raiseParseError(synErrGroupUnclosed, "") - } - if !p.consume(tokenKindGroupClose) { - p.raiseParseError(synErrGroupInvalidForm, "") - } - return alt - } - return p.parseSingleChar() -} - -func (p *parser) parseSingleChar() CPTree { - if p.consume(tokenKindAnyChar) { - return genAnyCharAST() - } - if p.consume(tokenKindBExpOpen) { - left := p.parseBExpElem() - if left == nil { - if p.consume(tokenKindEOF) { - p.raiseParseError(synErrBExpUnclosed, "") - } - p.raiseParseError(synErrBExpNoElem, "") - } - for { - right := p.parseBExpElem() - if right == nil { - break - } - left = newAltNode(left, right) - } - if p.consume(tokenKindEOF) { - p.raiseParseError(synErrBExpUnclosed, "") - } - p.expect(tokenKindBExpClose) - return left - } - if p.consume(tokenKindInverseBExpOpen) { - elem := p.parseBExpElem() - if elem == nil { - if p.consume(tokenKindEOF) { - p.raiseParseError(synErrBExpUnclosed, "") - } - p.raiseParseError(synErrBExpNoElem, "") - } - inverse := exclude(elem, genAnyCharAST()) - if inverse == nil { - p.raiseParseError(synErrUnmatchablePattern, "") - } - for { - elem := p.parseBExpElem() - if elem == nil { - break - } - inverse = exclude(elem, inverse) - if inverse == nil { - p.raiseParseError(synErrUnmatchablePattern, "") - } - } - if p.consume(tokenKindEOF) { - p.raiseParseError(synErrBExpUnclosed, "") - } - p.expect(tokenKindBExpClose) - return inverse - } - if p.consume(tokenKindCodePointLeader) { - return p.parseCodePoint() - } - if p.consume(tokenKindCharPropLeader) { - return p.parseCharProp() - } - if p.consume(tokenKindFragmentLeader) { - return p.parseFragment() - } - c := p.parseNormalChar() - if c == nil { - if p.consume(tokenKindBExpClose) { - p.raiseParseError(synErrBExpInvalidForm, "") - } - return nil - } - return c -} - -func (p *parser) parseBExpElem() CPTree { - var left CPTree - switch { - case p.consume(tokenKindCodePointLeader): - left = p.parseCodePoint() - case p.consume(tokenKindCharPropLeader): - left = p.parseCharProp() - if p.consume(tokenKindCharRange) { - p.raiseParseError(synErrRangePropIsUnavailable, "") - } - default: - left = p.parseNormalChar() - } - if left == nil { - return nil - } - if !p.consume(tokenKindCharRange) { - return left - } - var right CPTree - switch { - case p.consume(tokenKindCodePointLeader): - right = p.parseCodePoint() - case p.consume(tokenKindCharPropLeader): - p.raiseParseError(synErrRangePropIsUnavailable, "") - default: - right = p.parseNormalChar() - } - if right == nil { - p.raiseParseError(synErrRangeInvalidForm, "") - } - from, _, _ := left.Range() - _, to, _ := right.Range() - if !isValidOrder(from, to) { - p.raiseParseError(synErrRangeInvalidOrder, fmt.Sprintf("%X..%X", from, to)) - } - return newRangeSymbolNode(from, to) -} - -func (p *parser) parseCodePoint() CPTree { - if !p.consume(tokenKindLBrace) { - p.raiseParseError(synErrCPExpInvalidForm, "") - } - if !p.consume(tokenKindCodePoint) { - p.raiseParseError(synErrCPExpInvalidForm, "") - } - - n, err := strconv.ParseInt(p.lastTok.codePoint, 16, 64) - if err != nil { - panic(fmt.Errorf("failed to decode a code point (%v) into a int: %v", p.lastTok.codePoint, err)) - } - if n < 0x0000 || n > 0x10FFFF { - p.raiseParseError(synErrCPExpOutOfRange, "") - } - - sym := newSymbolNode(rune(n)) - - if !p.consume(tokenKindRBrace) { - p.raiseParseError(synErrCPExpInvalidForm, "") - } - - return sym -} - -func (p *parser) parseCharProp() CPTree { - if !p.consume(tokenKindLBrace) { - p.raiseParseError(synErrCharPropExpInvalidForm, "") - } - var sym1, sym2 string - if !p.consume(tokenKindCharPropSymbol) { - p.raiseParseError(synErrCharPropExpInvalidForm, "") - } - sym1 = p.lastTok.propSymbol - if p.consume(tokenKindEqual) { - if !p.consume(tokenKindCharPropSymbol) { - p.raiseParseError(synErrCharPropExpInvalidForm, "") - } - sym2 = p.lastTok.propSymbol - } - - var alt CPTree - var propName, propVal string - if sym2 != "" { - propName = sym1 - propVal = sym2 - } else { - propName = "" - propVal = sym1 - } - if !p.isContributoryPropertyExposed && ucd.IsContributoryProperty(propName) { - p.raiseParseError(synErrCharPropUnsupported, propName) - } - pat, err := ucd.NormalizeCharacterProperty(propName, propVal) - if err != nil { - p.raiseParseError(synErrCharPropUnsupported, err.Error()) - } - if pat != "" { - p := NewParser(p.kind, bytes.NewReader([]byte(pat))) - p.exposeContributoryProperty() - ast, err := p.Parse() - if err != nil { - panic(err) - } - alt = ast - } else { - cpRanges, inverse, err := ucd.FindCodePointRanges(propName, propVal) - if err != nil { - p.raiseParseError(synErrCharPropUnsupported, err.Error()) - } - if inverse { - r := cpRanges[0] - alt = exclude(newRangeSymbolNode(r.From, r.To), genAnyCharAST()) - if alt == nil { - p.raiseParseError(synErrUnmatchablePattern, "") - } - for _, r := range cpRanges[1:] { - alt = exclude(newRangeSymbolNode(r.From, r.To), alt) - if alt == nil { - p.raiseParseError(synErrUnmatchablePattern, "") - } - } - } else { - for _, r := range cpRanges { - alt = genAltNode( - alt, - newRangeSymbolNode(r.From, r.To), - ) - } - } - } - - if !p.consume(tokenKindRBrace) { - p.raiseParseError(synErrCharPropExpInvalidForm, "") - } - - return alt -} - -func (p *parser) parseFragment() CPTree { - if !p.consume(tokenKindLBrace) { - p.raiseParseError(synErrFragmentExpInvalidForm, "") - } - if !p.consume(tokenKindFragmentSymbol) { - p.raiseParseError(synErrFragmentExpInvalidForm, "") - } - sym := p.lastTok.fragmentSymbol - - if !p.consume(tokenKindRBrace) { - p.raiseParseError(synErrFragmentExpInvalidForm, "") - } - - return newFragmentNode(spec.LexKindName(sym), nil) -} - -func (p *parser) parseNormalChar() CPTree { - if !p.consume(tokenKindChar) { - return nil - } - return newSymbolNode(p.lastTok.char) -} - -func exclude(symbol, base CPTree) CPTree { - if left, right, ok := symbol.Alternatives(); ok { - return exclude(right, exclude(left, base)) - } - - if left, right, ok := base.Alternatives(); ok { - return genAltNode( - exclude(symbol, left), - exclude(symbol, right), - ) - } - - if bFrom, bTo, ok := base.Range(); ok { - sFrom, sTo, ok := symbol.Range() - if !ok { - panic(fmt.Errorf("invalid symbol tree: %T", symbol)) - } - - switch { - case sFrom > bFrom && sTo < bTo: - return genAltNode( - newRangeSymbolNode(bFrom, sFrom-1), - newRangeSymbolNode(sTo+1, bTo), - ) - case sFrom <= bFrom && sTo >= bFrom && sTo < bTo: - return newRangeSymbolNode(sTo+1, bTo) - case sFrom > bFrom && sFrom <= bTo && sTo >= bTo: - return newRangeSymbolNode(bFrom, sFrom-1) - case sFrom <= bFrom && sTo >= bTo: - return nil - default: - return base - } - } - - panic(fmt.Errorf("invalid base tree: %T", base)) -} - -func genAnyCharAST() CPTree { - return newRangeSymbolNode(0x0, 0x10FFFF) -} - -func isValidOrder(from, to rune) bool { - return from <= to -} - -func genConcatNode(cs ...CPTree) CPTree { - nonNilNodes := []CPTree{} - for _, c := range cs { - if c == nil { - continue - } - nonNilNodes = append(nonNilNodes, c) - } - if len(nonNilNodes) <= 0 { - return nil - } - if len(nonNilNodes) == 1 { - return nonNilNodes[0] - } - concat := newConcatNode(nonNilNodes[0], nonNilNodes[1]) - for _, c := range nonNilNodes[2:] { - concat = newConcatNode(concat, c) - } - return concat -} - -func genAltNode(cs ...CPTree) CPTree { - nonNilNodes := []CPTree{} - for _, c := range cs { - if c == nil { - continue - } - nonNilNodes = append(nonNilNodes, c) - } - if len(nonNilNodes) <= 0 { - return nil - } - if len(nonNilNodes) == 1 { - return nonNilNodes[0] - } - alt := newAltNode(nonNilNodes[0], nonNilNodes[1]) - for _, c := range nonNilNodes[2:] { - alt = newAltNode(alt, c) - } - return alt -} - -func (p *parser) expect(expected tokenKind) { - if !p.consume(expected) { - tok := p.peekedTok - p.raiseParseError(synErrUnexpectedToken, fmt.Sprintf("expected: %v, actual: %v", expected, tok.kind)) - } -} - -func (p *parser) consume(expected tokenKind) bool { - var tok *token - var err error - if p.peekedTok != nil { - tok = p.peekedTok - p.peekedTok = nil - } else { - tok, err = p.lex.next() - if err != nil { - if err == ParseErr { - detail, cause := p.lex.error() - p.raiseParseError(cause, detail) - } - panic(err) - } - } - p.lastTok = tok - if tok.kind == expected { - return true - } - p.peekedTok = tok - p.lastTok = nil - - return false -} - -func (p *parser) raiseParseError(err error, detail string) { - p.errCause = err - p.errDetail = detail - panic(ParseErr) -} |