Diffstat
-rw-r--r-- | compiler/parser.go | 862
1 file changed, 0 insertions, 862 deletions
diff --git a/compiler/parser.go b/compiler/parser.go
deleted file mode 100644
index ce481e3..0000000
--- a/compiler/parser.go
+++ /dev/null
@@ -1,862 +0,0 @@
-package compiler
-
-import (
-	"bytes"
-	"encoding/binary"
-	"encoding/hex"
-	"fmt"
-	"io"
-	"strings"
-
-	"github.com/nihei9/maleeni/spec"
-	"github.com/nihei9/maleeni/ucd"
-	"github.com/nihei9/maleeni/utf8"
-)
-
-type ParseErrors struct {
-	Errors []*ParseError
-}
-
-func (e *ParseErrors) Error() string {
-	var b strings.Builder
-	fmt.Fprintf(&b, "%v", e.Errors[0])
-	for _, err := range e.Errors[1:] {
-		fmt.Fprintf(&b, "\n%v", err)
-	}
-	return b.String()
-}
-
-type ParseError struct {
-	ID      spec.LexModeKindID
-	Pattern []byte
-	Cause   error
-	Details string
-}
-
-func (e *ParseError) Error() string {
-	var b strings.Builder
-	fmt.Fprintf(&b, "#%v %v: %v", e.ID, string(e.Pattern), e.Cause)
-	if e.Details != "" {
-		fmt.Fprintf(&b, ": %v", e.Details)
-	}
-	return b.String()
-}
-
-func raiseSyntaxError(synErr *SyntaxError) {
-	panic(synErr)
-}
-
-type symbolTable struct {
-	symPos2Byte map[symbolPosition]byteRange
-	endPos2ID   map[symbolPosition]spec.LexModeKindID
-}
-
-func genSymbolTable(root astNode) *symbolTable {
-	symTab := &symbolTable{
-		symPos2Byte: map[symbolPosition]byteRange{},
-		endPos2ID:   map[symbolPosition]spec.LexModeKindID{},
-	}
-	return genSymTab(symTab, root)
-}
-
-func genSymTab(symTab *symbolTable, node astNode) *symbolTable {
-	if node == nil {
-		return symTab
-	}
-
-	switch n := node.(type) {
-	case *symbolNode:
-		symTab.symPos2Byte[n.pos] = byteRange{
-			from: n.from,
-			to:   n.to,
-		}
-	case *endMarkerNode:
-		symTab.endPos2ID[n.pos] = n.id
-	default:
-		left, right := node.children()
-		genSymTab(symTab, left)
-		genSymTab(symTab, right)
-	}
-	return symTab
-}
-
-type patternEntry struct {
-	id      spec.LexModeKindID
-	pattern []byte
-}
-
-func parse(pats []*patternEntry, fragments map[string][]byte) (astNode, *symbolTable, error) {
-	if len(pats) == 0 {
-		return nil, nil, fmt.Errorf("parse() needs at least one token entry")
-	}
-
-	fragmentASTs, err := parseFragments(fragments)
-	if err != nil {
-		return nil, nil, err
-	}
-	if fragmentASTs == nil {
-		fragmentASTs = map[string]astNode{}
-	}
-
-	root, err := parseRegexp(pats, fragmentASTs)
-	if err != nil {
-		return nil, nil, err
-	}
-
-	return root, genSymbolTable(root), nil
-}
-
-type incompleteFragment struct {
-	kind string
-	ast  astNode
-}
-
-func parseFragments(fragments map[string][]byte) (map[string]astNode, error) {
-	if len(fragments) == 0 {
-		return nil, nil
-	}
-	fragmentASTs := map[string]astNode{}
-	incompleteFragments := []*incompleteFragment{}
-	var perrs []*ParseError
-	for kind, pattern := range fragments {
-		p := newParser(bytes.NewReader(pattern))
-		ast, err := p.parse()
-		if err != nil {
-			perrs = append(perrs, &ParseError{
-				Pattern: pattern,
-				Cause:   err,
-				Details: p.errMsgDetails,
-			})
-			continue
-		}
-		if p.incomplete {
-			incompleteFragments = append(incompleteFragments, &incompleteFragment{
-				kind: kind,
-				ast:  ast,
-			})
-		} else {
-			fragmentASTs[kind] = ast
-		}
-	}
-	for len(incompleteFragments) > 0 {
-		lastIncompCount := len(incompleteFragments)
-		remainingFragments := []*incompleteFragment{}
-		for _, e := range incompleteFragments {
-			remains := applyFragments(e.ast, fragmentASTs)
-			if len(remains) > 0 {
-				remainingFragments = append(remainingFragments, e)
-			} else {
-				fragmentASTs[e.kind] = e.ast
-			}
-		}
-		incompleteFragments = remainingFragments
-		if len(incompleteFragments) == lastIncompCount {
-			for _, e := range incompleteFragments {
-				perrs = append(perrs, &ParseError{
-					Cause: fmt.Errorf("%v has an undefined fragment or a cycle", e.kind),
-				})
-			}
-			break
-		}
-	}
-	if len(perrs) > 0 {
-		return nil, &ParseErrors{
-			Errors: perrs,
-		}
-	}
-
-	return fragmentASTs, nil
-}
-
-func parseRegexp(pats []*patternEntry, fragmentASTs map[string]astNode) (astNode, error) {
-	symPos := symbolPositionMin
-	var root astNode
-	var perrs []*ParseError
-
-	for _, pat := range pats {
-		if pat.id == spec.LexModeKindIDNil {
-			continue
-		}
-
-		p := newParser(bytes.NewReader(pat.pattern))
-		ast, err := p.parse()
-		if err != nil {
-			perrs = append(perrs, &ParseError{
-				ID:      pat.id,
-				Pattern: pat.pattern,
-				Cause:   err,
-				Details: p.errMsgDetails,
-			})
-			continue
-		}
-		remains := applyFragments(ast, fragmentASTs)
-		if len(remains) > 0 {
-			perrs = append(perrs, &ParseError{
-				ID:      pat.id,
-				Pattern: pat.pattern,
-				Cause:   fmt.Errorf("undefined fragment: %+v", remains),
-			})
-			continue
-		}
-		ast = newConcatNode(ast, newEndMarkerNode(pat.id))
-		symPos, err = positionSymbols(ast, symPos)
-		if err != nil {
-			perrs = append(perrs, &ParseError{
-				ID:      pat.id,
-				Pattern: pat.pattern,
-				Cause:   err,
-				Details: p.errMsgDetails,
-			})
-			continue
-		}
-		root = genAltNode(root, ast)
-	}
-	if len(perrs) > 0 {
-		return nil, &ParseErrors{
-			Errors: perrs,
-		}
-	}
-
-	return root, nil
-}
-
-func applyFragments(ast astNode, fragments map[string]astNode) []string {
-	if ast == nil {
-		return nil
-	}
-	n, ok := ast.(*fragmentNode)
-	if !ok {
-		var remains []string
-		left, right := ast.children()
-		r := applyFragments(left, fragments)
-		if len(r) > 0 {
-			remains = append(remains, r...)
-		}
-		r = applyFragments(right, fragments)
-		if len(r) > 0 {
-			remains = append(remains, r...)
-		}
-		return remains
-	}
-	f, ok := fragments[n.symbol]
-	if !ok {
-		return []string{n.symbol}
-	}
-	n.left = copyAST(f)
-	return nil
-}
-
-type parser struct {
-	lex           *lexer
-	peekedTok     *token
-	lastTok       *token
-	incomplete    bool
-	errMsgDetails string
-
-	// If and only if isContributoryPropertyExposed is true, the parser interprets contributory properties that
-	// appear in property expressions.
-	//
-	// The contributory properties are not exposed, and users cannot use those properties because the parser
-	// follows [UAX #44 5.13 Property APIs]. For instance, \p{Other_Alphabetic} is invalid.
-	//
-	// isContributoryPropertyExposed is set to true when the parser is generated recursively. The parser needs to
-	// interpret derived properties internally because the derived properties consist of other properties that
-	// may contain the contributory properties.
-	//
-	// [UAX #44 5.13 Property APIs] says:
-	// > The following subtypes of Unicode character properties should generally not be exposed in APIs,
-	// > except in limited circumstances. They may not be useful, particularly in public API collections,
-	// > and may instead prove misleading to the users of such API collections.
-	// > * Contributory properties are not recommended for public APIs.
-	// > ...
-	// https://unicode.org/reports/tr44/#Property_APIs
-	isContributoryPropertyExposed bool
-}
-
-func newParser(src io.Reader) *parser {
-	return &parser{
-		lex:                           newLexer(src),
-		isContributoryPropertyExposed: false,
-	}
-}
-
-func (p *parser) exposeContributoryProperty() {
-	p.isContributoryPropertyExposed = true
-}
-
-func (p *parser) parse() (ast astNode, retErr error) {
-	defer func() {
-		err := recover()
-		if err != nil {
-			var ok bool
-			retErr, ok = err.(error)
-			if !ok {
-				retErr = fmt.Errorf("%v", err)
-			}
-			return
-		}
-	}()
-
-	ast, err := p.parseRegexp()
-	if err != nil {
-		return nil, err
-	}
-
-	return ast, nil
-}
-
-func (p *parser) parseRegexp() (astNode, error) {
-	alt := p.parseAlt()
-	if alt == nil {
-		if p.consume(tokenKindGroupClose) {
-			raiseSyntaxError(synErrGroupNoInitiator)
-		}
-		raiseSyntaxError(synErrNullPattern)
-	}
-	if p.consume(tokenKindGroupClose) {
-		raiseSyntaxError(synErrGroupNoInitiator)
-	}
-	p.expect(tokenKindEOF)
-	return alt, nil
-}
-
-func (p *parser) parseAlt() astNode {
-	left := p.parseConcat()
-	if left == nil {
-		if p.consume(tokenKindAlt) {
-			raiseSyntaxError(synErrAltLackOfOperand)
-		}
-		return nil
-	}
-	for {
-		if !p.consume(tokenKindAlt) {
-			break
-		}
-		right := p.parseConcat()
-		if right == nil {
-			raiseSyntaxError(synErrAltLackOfOperand)
-		}
-		left = newAltNode(left, right)
-	}
-	return left
-}
-
-func (p *parser) parseConcat() astNode {
-	left := p.parseRepeat()
-	for {
-		right := p.parseRepeat()
-		if right == nil {
-			break
-		}
-		left = newConcatNode(left, right)
-	}
-	return left
-}
-
-func (p *parser) parseRepeat() astNode {
-	group := p.parseGroup()
-	if group == nil {
-		if p.consume(tokenKindRepeat) {
-			p.errMsgDetails = "* needs an operand"
-			raiseSyntaxError(synErrRepNoTarget)
-		}
-		if p.consume(tokenKindRepeatOneOrMore) {
-			p.errMsgDetails = "+ needs an operand"
-			raiseSyntaxError(synErrRepNoTarget)
-		}
-		if p.consume(tokenKindOption) {
-			p.errMsgDetails = "? needs an operand"
-			raiseSyntaxError(synErrRepNoTarget)
-		}
-		return nil
-	}
-	if p.consume(tokenKindRepeat) {
-		return newRepeatNode(group)
-	}
-	if p.consume(tokenKindRepeatOneOrMore) {
-		return newRepeatOneOrMoreNode(group)
-	}
-	if p.consume(tokenKindOption) {
-		return newOptionNode(group)
-	}
-	return group
-}
-
-func (p *parser) parseGroup() astNode {
-	if p.consume(tokenKindGroupOpen) {
-		alt := p.parseAlt()
-		if alt == nil {
-			if p.consume(tokenKindEOF) {
-				raiseSyntaxError(synErrGroupUnclosed)
-			}
-			raiseSyntaxError(synErrGroupNoElem)
-		}
-		if p.consume(tokenKindEOF) {
-			raiseSyntaxError(synErrGroupUnclosed)
-		}
-		if !p.consume(tokenKindGroupClose) {
-			raiseSyntaxError(synErrGroupInvalidForm)
-		}
-		return alt
-	}
-	return p.parseSingleChar()
-}
-
-func (p *parser) parseSingleChar() astNode {
-	if p.consume(tokenKindAnyChar) {
-		return genAnyCharAST()
-	}
-	if p.consume(tokenKindBExpOpen) {
-		left := p.parseBExpElem()
-		if left == nil {
-			if p.consume(tokenKindEOF) {
-				raiseSyntaxError(synErrBExpUnclosed)
-			}
-			raiseSyntaxError(synErrBExpNoElem)
-		}
-		for {
-			right := p.parseBExpElem()
-			if right == nil {
-				break
-			}
-			left = newAltNode(left, right)
-		}
-		if p.consume(tokenKindEOF) {
-			raiseSyntaxError(synErrBExpUnclosed)
-		}
-		p.expect(tokenKindBExpClose)
-		return left
-	}
-	if p.consume(tokenKindInverseBExpOpen) {
-		elem := p.parseBExpElem()
-		if elem == nil {
-			if p.consume(tokenKindEOF) {
-				raiseSyntaxError(synErrBExpUnclosed)
-			}
-			raiseSyntaxError(synErrBExpNoElem)
-		}
-		inverse := exclude(elem, genAnyCharAST())
-		if inverse == nil {
-			panic(fmt.Errorf("a pattern that isn't matching any symbols"))
-		}
-		for {
-			elem := p.parseBExpElem()
-			if elem == nil {
-				break
-			}
-			inverse = exclude(elem, inverse)
-			if inverse == nil {
-				panic(fmt.Errorf("a pattern that isn't matching any symbols"))
-			}
-		}
-		if p.consume(tokenKindEOF) {
-			raiseSyntaxError(synErrBExpUnclosed)
-		}
-		p.expect(tokenKindBExpClose)
-		return inverse
-	}
-	if p.consume(tokenKindCodePointLeader) {
-		return p.parseCodePoint()
-	}
-	if p.consume(tokenKindCharPropLeader) {
-		return p.parseCharProp()
-	}
-	if p.consume(tokenKindFragmentLeader) {
-		return p.parseFragment()
-	}
-	c := p.parseNormalChar()
-	if c == nil {
-		if p.consume(tokenKindBExpClose) {
-			raiseSyntaxError(synErrBExpInvalidForm)
-		}
-		return nil
-	}
-	return c
-}
-
-func (p *parser) parseBExpElem() astNode {
-	if p.consume(tokenKindCodePointLeader) {
-		return p.parseCodePoint()
-	}
-	if p.consume(tokenKindCharPropLeader) {
-		return p.parseCharProp()
-	}
-	left := p.parseNormalChar()
-	if left == nil {
-		return nil
-	}
-	if !p.consume(tokenKindCharRange) {
-		return left
-	}
-	right := p.parseNormalChar()
-	if right == nil {
-		panic(fmt.Errorf("invalid range expression"))
-	}
-	from := genByteSeq(left)
-	to := genByteSeq(right)
-	if !isValidOrder(from, to) {
-		p.errMsgDetails = fmt.Sprintf("[%s-%s] ([%v-%v])", string(from), string(to), from, to)
-		raiseSyntaxError(synErrRangeInvalidOrder)
-	}
-	return genRangeAST(left, right)
-}
-
-func (p *parser) parseCodePoint() astNode {
-	if !p.consume(tokenKindLBrace) {
-		raiseSyntaxError(synErrCPExpInvalidForm)
-	}
-	if !p.consume(tokenKindCodePoint) {
-		raiseSyntaxError(synErrCPExpInvalidForm)
-	}
-
-	var cp []byte
-	{
-		// Although hex.DecodeString method can handle only a hex string that has even length,
-		// `codePoint` always has even length by the lexical specification.
-		b, err := hex.DecodeString(p.lastTok.codePoint)
-		if err != nil {
-			panic(fmt.Errorf("failed to decode a code point (%v) into a byte slice: %v", p.lastTok.codePoint, err))
-		}
-		// `b` must be 4 bytes to convert it into a 32-bit integer.
-		l := len(b)
-		for i := 0; i < 4-l; i++ {
-			b = append([]byte{0}, b...)
-		}
-		n := binary.BigEndian.Uint32(b)
-		if n < 0x0000 || n > 0x10FFFF {
-			raiseSyntaxError(synErrCPExpOutOfRange)
-		}
-
-		cp = []byte(string(rune(n)))
-	}
-
-	var concat astNode
-	{
-		concat = newSymbolNode(cp[0])
-		for _, b := range cp[1:] {
-			concat = genConcatNode(
-				concat,
-				newSymbolNode(b),
-			)
-		}
-	}
-
-	if !p.consume(tokenKindRBrace) {
-		raiseSyntaxError(synErrCPExpInvalidForm)
-	}
-
-	return concat
-}
-
-func (p *parser) parseCharProp() astNode {
-	if !p.consume(tokenKindLBrace) {
-		raiseSyntaxError(synErrCharPropExpInvalidForm)
-	}
-	var sym1, sym2 string
-	if !p.consume(tokenKindCharPropSymbol) {
-		raiseSyntaxError(synErrCharPropExpInvalidForm)
-	}
-	sym1 = p.lastTok.propSymbol
-	if p.consume(tokenKindEqual) {
-		if !p.consume(tokenKindCharPropSymbol) {
-			raiseSyntaxError(synErrCharPropExpInvalidForm)
-		}
-		sym2 = p.lastTok.propSymbol
-	}
-
-	var alt astNode
-	var propName, propVal string
-	if sym2 != "" {
-		propName = sym1
-		propVal = sym2
-	} else {
-		propName = ""
-		propVal = sym1
-	}
-	if !p.isContributoryPropertyExposed && ucd.IsContributoryProperty(propName) {
-		p.errMsgDetails = propName
-		raiseSyntaxError(synErrCharPropUnsupported)
-	}
-	pat, err := ucd.NormalizeCharacterProperty(propName, propVal)
-	if err != nil {
-		p.errMsgDetails = fmt.Sprintf("%v", err)
-		raiseSyntaxError(synErrCharPropUnsupported)
-	}
-	if pat != "" {
-		p := newParser(bytes.NewReader([]byte(pat)))
-		p.exposeContributoryProperty()
-		ast, err := p.parse()
-		if err != nil {
-			panic(err)
-		}
-		alt = ast
-	} else {
-		cpRanges, inverse, err := ucd.FindCodePointRanges(propName, propVal)
-		if err != nil {
-			p.errMsgDetails = fmt.Sprintf("%v", err)
-			raiseSyntaxError(synErrCharPropUnsupported)
-		}
-		if inverse {
-			r := cpRanges[0]
-			from := genNormalCharAST(r.From)
-			to := genNormalCharAST(r.To)
-			alt = exclude(genRangeAST(from, to), genAnyCharAST())
-			if alt == nil {
-				panic(fmt.Errorf("a pattern that isn't matching any symbols"))
-			}
-			for _, r := range cpRanges[1:] {
-				from := genNormalCharAST(r.From)
-				to := genNormalCharAST(r.To)
-				alt = exclude(genRangeAST(from, to), alt)
-				if alt == nil {
-					panic(fmt.Errorf("a pattern that isn't matching any symbols"))
-				}
-			}
-		} else {
-			for _, r := range cpRanges {
-				from := genNormalCharAST(r.From)
-				to := genNormalCharAST(r.To)
-				alt = genAltNode(
-					alt,
-					genRangeAST(from, to),
-				)
-			}
-		}
-	}
-
-	if !p.consume(tokenKindRBrace) {
-		raiseSyntaxError(synErrCharPropExpInvalidForm)
-	}
-
-	return alt
-}
-
-func (p *parser) parseFragment() astNode {
-	if !p.consume(tokenKindLBrace) {
-		raiseSyntaxError(synErrFragmentExpInvalidForm)
-	}
-	if !p.consume(tokenKindFragmentSymbol) {
-		raiseSyntaxError(synErrFragmentExpInvalidForm)
-	}
-	sym := p.lastTok.fragmentSymbol
-
-	if !p.consume(tokenKindRBrace) {
-		raiseSyntaxError(synErrFragmentExpInvalidForm)
-	}
-
-	p.incomplete = true
-
-	return newFragmentNode(sym, nil)
-}
-
-func (p *parser) parseNormalChar() astNode {
-	if !p.consume(tokenKindChar) {
-		return nil
-	}
-	return genNormalCharAST(p.lastTok.char)
-}
-
-func genNormalCharAST(c rune) astNode {
-	b := []byte(string(c))
-	switch len(b) {
-	case 1:
-		return newSymbolNode(b[0])
-	case 2:
-		return genConcatNode(
-			newSymbolNode(b[0]),
-			newSymbolNode(b[1]),
-		)
-	case 3:
-		return genConcatNode(
-			newSymbolNode(b[0]),
-			newSymbolNode(b[1]),
-			newSymbolNode(b[2]),
-		)
-	default: // is equivalent to case 4
-		return genConcatNode(
-			newSymbolNode(b[0]),
-			newSymbolNode(b[1]),
-			newSymbolNode(b[2]),
-			newSymbolNode(b[3]),
-		)
-	}
-}
-
-func exclude(symbol, base astNode) astNode {
-	if alt, ok := symbol.(*altNode); ok {
-		return exclude(alt.right, exclude(alt.left, base))
-	}
-
-	switch base.(type) {
-	case *altNode:
-		left, right := base.children()
-		return genAltNode(
-			exclude(symbol, left),
-			exclude(symbol, right),
-		)
-	case *concatNode:
-		baseSeq := genByteRangeSeq(base)
-		symSeq := genByteRangeSeq(symbol)
-		excluded := excludeByteRangeSequence(symSeq, baseSeq)
-		if len(excluded) <= 0 {
-			return nil
-		}
-		return convertByteRangeSeqsToAST(excluded)
-	case *symbolNode:
-		baseSeq := genByteRangeSeq(base)
-		symSeq := genByteRangeSeq(symbol)
-		excluded := excludeByteRangeSequence(symSeq, baseSeq)
-		if len(excluded) <= 0 {
-			return nil
-		}
-		return convertByteRangeSeqsToAST(excluded)
-	}
-	return nil
-}
-
-func convertByteRangeSeqsToAST(seqs [][]byteRange) astNode {
-	concats := []astNode{}
-	for _, seq := range seqs {
-		syms := []astNode{}
-		for _, elem := range seq {
-			syms = append(syms, newRangeSymbolNode(elem.from, elem.to))
-		}
-		concats = append(concats, genConcatNode(syms...))
-	}
-	return genAltNode(concats...)
-}
-
-func genAnyCharAST() astNode {
-	return convertCharBlocksToAST(utf8.AllCharBlocks())
-}
-
-func genRangeAST(fromNode, toNode astNode) astNode {
-	from := genByteSeq(fromNode)
-	to := genByteSeq(toNode)
-	blks, err := utf8.GenCharBlocks(from, to)
-	if err != nil {
-		panic(err)
-	}
-	return convertCharBlocksToAST(blks)
-}
-
-func convertCharBlocksToAST(blks []*utf8.CharBlock) astNode {
-	var alt astNode
-	for _, blk := range blks {
-		r := make([]astNode, len(blk.From))
-		for i := 0; i < len(blk.From); i++ {
-			r[i] = newRangeSymbolNode(blk.From[i], blk.To[i])
-		}
-		alt = genAltNode(alt, genConcatNode(r...))
-	}
-	return alt
-}
-
-func genByteSeq(node astNode) []byte {
-	switch n := node.(type) {
-	case *symbolNode:
-		return []byte{n.from}
-	case *concatNode:
-		seq := genByteSeq(n.left)
-		seq = append(seq, genByteSeq(n.right)...)
-		return seq
-	}
-	panic(fmt.Errorf("genByteSeq() cannot handle %T: %v", node, node))
-}
-
-func genByteRangeSeq(node astNode) []byteRange {
-	switch n := node.(type) {
-	case *symbolNode:
-		return []byteRange{{from: n.from, to: n.to}}
-	case *concatNode:
-		seq := genByteRangeSeq(n.left)
-		seq = append(seq, genByteRangeSeq(n.right)...)
-		return seq
-	}
-	panic(fmt.Errorf("genByteRangeSeq() cannot handle %T: %v", node, node))
-}
-
-func isValidOrder(from, to []byte) bool {
-	if len(from) > len(to) {
-		return false
-	}
-	if len(from) < len(to) {
-		return true
-	}
-	for i, f := range from {
-		t := to[i]
-		if f > t {
-			return false
-		}
-		if f < t {
-			return true
-		}
-	}
-	return true
-}
-
-func genConcatNode(cs ...astNode) astNode {
-	if len(cs) <= 0 {
-		return nil
-	}
-	if len(cs) == 1 {
-		return cs[0]
-	}
-	concat := newConcatNode(cs[0], cs[1])
-	for _, c := range cs[2:] {
-		concat = newConcatNode(concat, c)
-	}
-	return concat
-}
-
-func genAltNode(cs ...astNode) astNode {
-	nonNilNodes := []astNode{}
-	for _, c := range cs {
-		if c == nil {
-			continue
-		}
-		nonNilNodes = append(nonNilNodes, c)
-	}
-	if len(nonNilNodes) <= 0 {
-		return nil
-	}
-	if len(nonNilNodes) == 1 {
-		return nonNilNodes[0]
-	}
-	alt := newAltNode(nonNilNodes[0], nonNilNodes[1])
-	for _, c := range nonNilNodes[2:] {
-		alt = newAltNode(alt, c)
-	}
-	return alt
-}
-
-func (p *parser) expect(expected tokenKind) {
-	if !p.consume(expected) {
-		tok := p.peekedTok
-		p.errMsgDetails = fmt.Sprintf("unexpected token; expected: %v, actual: %v", expected, tok.kind)
-		raiseSyntaxError(synErrUnexpectedToken)
-	}
-}
-
-func (p *parser) consume(expected tokenKind) bool {
-	var tok *token
-	var err error
-	if p.peekedTok != nil {
-		tok = p.peekedTok
-		p.peekedTok = nil
-	} else {
-		tok, err = p.lex.next()
-		if err != nil {
-			p.errMsgDetails = p.lex.errMsgDetails
-			panic(err)
-		}
-	}
-	p.lastTok = tok
-	if tok.kind == expected {
-		return true
-	}
-	p.peekedTok = tok
-	p.lastTok = nil
-
-	return false
-}
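Note: the deleted parser reports syntax errors by panicking via raiseSyntaxError inside the recursive-descent methods and converting the panic back into an ordinary error in the deferred recover block of (*parser).parse. The standalone Go sketch below illustrates that pattern under simplified assumptions; syntaxError, parseTopLevel, and parseOperand are illustrative placeholders, not identifiers from this repository.

package main

import "fmt"

// syntaxError stands in for the parser's SyntaxError type; it is a
// placeholder for illustration only.
type syntaxError struct{ msg string }

func (e *syntaxError) Error() string { return e.msg }

// raiseSyntaxError mirrors the deleted helper: it aborts the recursive
// descent by panicking with the error value.
func raiseSyntaxError(msg string) {
	panic(&syntaxError{msg: msg})
}

// parseTopLevel mirrors the defer/recover block in (*parser).parse: any panic
// raised during the descent is converted back into a returned error.
func parseTopLevel(src string) (ast string, retErr error) {
	defer func() {
		if err := recover(); err != nil {
			var ok bool
			retErr, ok = err.(error)
			if !ok {
				retErr = fmt.Errorf("%v", err)
			}
		}
	}()
	return parseOperand(src), nil
}

// parseOperand stands in for the recursive parse* methods, which call
// raiseSyntaxError instead of threading an error through every return value.
func parseOperand(src string) string {
	if src == "" {
		raiseSyntaxError("a pattern must not be empty")
	}
	return "ast(" + src + ")"
}

func main() {
	if _, err := parseTopLevel(""); err != nil {
		fmt.Println("syntax error:", err) // syntax error: a pattern must not be empty
	}
	ast, _ := parseTopLevel("a|b")
	fmt.Println(ast) // ast(a|b)
}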