diff options
author | Ryo Nihei <nihei.dev@gmail.com> | 2021-12-09 02:38:12 +0900 |
---|---|---|
committer | Ryo Nihei <nihei.dev@gmail.com> | 2021-12-10 01:50:32 +0900 |
commit | d595194791483a71c5afaff2aa3f4b575a9d22b7 (patch) | |
tree | 65477d9fab1db2b9ded5eeda8a14ce0b235718b5 /compiler/parser | |
parent | Add a new DFA compiler that generates DFA from a set of CPTree (diff) | |
download | tre-d595194791483a71c5afaff2aa3f4b575a9d22b7.tar.gz tre-d595194791483a71c5afaff2aa3f4b575a9d22b7.tar.xz |
Use new parser and DFA compiler
Diffstat (limited to '')
-rw-r--r-- | compiler/parser.go | 862 | ||||
-rw-r--r-- | compiler/parser_test.go | 1422 |
2 files changed, 0 insertions, 2284 deletions
diff --git a/compiler/parser.go b/compiler/parser.go deleted file mode 100644 index ce481e3..0000000 --- a/compiler/parser.go +++ /dev/null @@ -1,862 +0,0 @@ -package compiler - -import ( - "bytes" - "encoding/binary" - "encoding/hex" - "fmt" - "io" - "strings" - - "github.com/nihei9/maleeni/spec" - "github.com/nihei9/maleeni/ucd" - "github.com/nihei9/maleeni/utf8" -) - -type ParseErrors struct { - Errors []*ParseError -} - -func (e *ParseErrors) Error() string { - var b strings.Builder - fmt.Fprintf(&b, "%v", e.Errors[0]) - for _, err := range e.Errors[1:] { - fmt.Fprintf(&b, "\n%v", err) - } - return b.String() -} - -type ParseError struct { - ID spec.LexModeKindID - Pattern []byte - Cause error - Details string -} - -func (e *ParseError) Error() string { - var b strings.Builder - fmt.Fprintf(&b, "#%v %v: %v", e.ID, string(e.Pattern), e.Cause) - if e.Details != "" { - fmt.Fprintf(&b, ": %v", e.Details) - } - return b.String() -} - -func raiseSyntaxError(synErr *SyntaxError) { - panic(synErr) -} - -type symbolTable struct { - symPos2Byte map[symbolPosition]byteRange - endPos2ID map[symbolPosition]spec.LexModeKindID -} - -func genSymbolTable(root astNode) *symbolTable { - symTab := &symbolTable{ - symPos2Byte: map[symbolPosition]byteRange{}, - endPos2ID: map[symbolPosition]spec.LexModeKindID{}, - } - return genSymTab(symTab, root) -} - -func genSymTab(symTab *symbolTable, node astNode) *symbolTable { - if node == nil { - return symTab - } - - switch n := node.(type) { - case *symbolNode: - symTab.symPos2Byte[n.pos] = byteRange{ - from: n.from, - to: n.to, - } - case *endMarkerNode: - symTab.endPos2ID[n.pos] = n.id - default: - left, right := node.children() - genSymTab(symTab, left) - genSymTab(symTab, right) - } - return symTab -} - -type patternEntry struct { - id spec.LexModeKindID - pattern []byte -} - -func parse(pats []*patternEntry, fragments map[string][]byte) (astNode, *symbolTable, error) { - if len(pats) == 0 { - return nil, nil, fmt.Errorf("parse() 
needs at least one token entry") - } - - fragmentASTs, err := parseFragments(fragments) - if err != nil { - return nil, nil, err - } - if fragmentASTs == nil { - fragmentASTs = map[string]astNode{} - } - - root, err := parseRegexp(pats, fragmentASTs) - if err != nil { - return nil, nil, err - } - - return root, genSymbolTable(root), nil -} - -type incompleteFragment struct { - kind string - ast astNode -} - -func parseFragments(fragments map[string][]byte) (map[string]astNode, error) { - if len(fragments) == 0 { - return nil, nil - } - fragmentASTs := map[string]astNode{} - incompleteFragments := []*incompleteFragment{} - var perrs []*ParseError - for kind, pattern := range fragments { - p := newParser(bytes.NewReader(pattern)) - ast, err := p.parse() - if err != nil { - perrs = append(perrs, &ParseError{ - Pattern: pattern, - Cause: err, - Details: p.errMsgDetails, - }) - continue - } - if p.incomplete { - incompleteFragments = append(incompleteFragments, &incompleteFragment{ - kind: kind, - ast: ast, - }) - } else { - fragmentASTs[kind] = ast - } - } - for len(incompleteFragments) > 0 { - lastIncompCount := len(incompleteFragments) - remainingFragments := []*incompleteFragment{} - for _, e := range incompleteFragments { - remains := applyFragments(e.ast, fragmentASTs) - if len(remains) > 0 { - remainingFragments = append(remainingFragments, e) - } else { - fragmentASTs[e.kind] = e.ast - } - } - incompleteFragments = remainingFragments - if len(incompleteFragments) == lastIncompCount { - for _, e := range incompleteFragments { - perrs = append(perrs, &ParseError{ - Cause: fmt.Errorf("%v has an undefined fragment or a cycle", e.kind), - }) - } - break - } - } - if len(perrs) > 0 { - return nil, &ParseErrors{ - Errors: perrs, - } - } - - return fragmentASTs, nil -} - -func parseRegexp(pats []*patternEntry, fragmentASTs map[string]astNode) (astNode, error) { - symPos := symbolPositionMin - var root astNode - var perrs []*ParseError - - for _, pat := range pats { - if 
pat.id == spec.LexModeKindIDNil { - continue - } - - p := newParser(bytes.NewReader(pat.pattern)) - ast, err := p.parse() - if err != nil { - perrs = append(perrs, &ParseError{ - ID: pat.id, - Pattern: pat.pattern, - Cause: err, - Details: p.errMsgDetails, - }) - continue - } - remains := applyFragments(ast, fragmentASTs) - if len(remains) > 0 { - perrs = append(perrs, &ParseError{ - ID: pat.id, - Pattern: pat.pattern, - Cause: fmt.Errorf("undefined fragment: %+v", remains), - }) - continue - } - ast = newConcatNode(ast, newEndMarkerNode(pat.id)) - symPos, err = positionSymbols(ast, symPos) - if err != nil { - perrs = append(perrs, &ParseError{ - ID: pat.id, - Pattern: pat.pattern, - Cause: err, - Details: p.errMsgDetails, - }) - continue - } - root = genAltNode(root, ast) - } - if len(perrs) > 0 { - return nil, &ParseErrors{ - Errors: perrs, - } - } - - return root, nil -} - -func applyFragments(ast astNode, fragments map[string]astNode) []string { - if ast == nil { - return nil - } - n, ok := ast.(*fragmentNode) - if !ok { - var remains []string - left, right := ast.children() - r := applyFragments(left, fragments) - if len(r) > 0 { - remains = append(remains, r...) - } - r = applyFragments(right, fragments) - if len(r) > 0 { - remains = append(remains, r...) - } - return remains - } - f, ok := fragments[n.symbol] - if !ok { - return []string{n.symbol} - } - n.left = copyAST(f) - return nil -} - -type parser struct { - lex *lexer - peekedTok *token - lastTok *token - incomplete bool - errMsgDetails string - - // If and only if isContributoryPropertyExposed is true, the parser interprets contributory properties that - // appear in property expressions. - // - // The contributory properties are not exposed, and users cannot use those properties because the parser - // follows [UAX #44 5.13 Property APIs]. For instance, \p{Other_Alphabetic} is invalid. - // - // isContributoryPropertyExposed is set to true when the parser is generated recursively. 
The parser needs to - // interpret derived properties internally because the derived properties consist of other properties that - // may contain the contributory properties. - // - // [UAX #44 5.13 Property APIs] says: - // > The following subtypes of Unicode character properties should generally not be exposed in APIs, - // > except in limited circumstances. They may not be useful, particularly in public API collections, - // > and may instead prove misleading to the users of such API collections. - // > * Contributory properties are not recommended for public APIs. - // > ... - // https://unicode.org/reports/tr44/#Property_APIs - isContributoryPropertyExposed bool -} - -func newParser(src io.Reader) *parser { - return &parser{ - lex: newLexer(src), - isContributoryPropertyExposed: false, - } -} - -func (p *parser) exposeContributoryProperty() { - p.isContributoryPropertyExposed = true -} - -func (p *parser) parse() (ast astNode, retErr error) { - defer func() { - err := recover() - if err != nil { - var ok bool - retErr, ok = err.(error) - if !ok { - retErr = fmt.Errorf("%v", err) - } - return - } - }() - - ast, err := p.parseRegexp() - if err != nil { - return nil, err - } - - return ast, nil -} - -func (p *parser) parseRegexp() (astNode, error) { - alt := p.parseAlt() - if alt == nil { - if p.consume(tokenKindGroupClose) { - raiseSyntaxError(synErrGroupNoInitiator) - } - raiseSyntaxError(synErrNullPattern) - } - if p.consume(tokenKindGroupClose) { - raiseSyntaxError(synErrGroupNoInitiator) - } - p.expect(tokenKindEOF) - return alt, nil -} - -func (p *parser) parseAlt() astNode { - left := p.parseConcat() - if left == nil { - if p.consume(tokenKindAlt) { - raiseSyntaxError(synErrAltLackOfOperand) - } - return nil - } - for { - if !p.consume(tokenKindAlt) { - break - } - right := p.parseConcat() - if right == nil { - raiseSyntaxError(synErrAltLackOfOperand) - } - left = newAltNode(left, right) - } - return left -} - -func (p *parser) parseConcat() astNode { - 
left := p.parseRepeat() - for { - right := p.parseRepeat() - if right == nil { - break - } - left = newConcatNode(left, right) - } - return left -} - -func (p *parser) parseRepeat() astNode { - group := p.parseGroup() - if group == nil { - if p.consume(tokenKindRepeat) { - p.errMsgDetails = "* needs an operand" - raiseSyntaxError(synErrRepNoTarget) - } - if p.consume(tokenKindRepeatOneOrMore) { - p.errMsgDetails = "+ needs an operand" - raiseSyntaxError(synErrRepNoTarget) - } - if p.consume(tokenKindOption) { - p.errMsgDetails = "? needs an operand" - raiseSyntaxError(synErrRepNoTarget) - } - return nil - } - if p.consume(tokenKindRepeat) { - return newRepeatNode(group) - } - if p.consume(tokenKindRepeatOneOrMore) { - return newRepeatOneOrMoreNode(group) - } - if p.consume(tokenKindOption) { - return newOptionNode(group) - } - return group -} - -func (p *parser) parseGroup() astNode { - if p.consume(tokenKindGroupOpen) { - alt := p.parseAlt() - if alt == nil { - if p.consume(tokenKindEOF) { - raiseSyntaxError(synErrGroupUnclosed) - } - raiseSyntaxError(synErrGroupNoElem) - } - if p.consume(tokenKindEOF) { - raiseSyntaxError(synErrGroupUnclosed) - } - if !p.consume(tokenKindGroupClose) { - raiseSyntaxError(synErrGroupInvalidForm) - } - return alt - } - return p.parseSingleChar() -} - -func (p *parser) parseSingleChar() astNode { - if p.consume(tokenKindAnyChar) { - return genAnyCharAST() - } - if p.consume(tokenKindBExpOpen) { - left := p.parseBExpElem() - if left == nil { - if p.consume(tokenKindEOF) { - raiseSyntaxError(synErrBExpUnclosed) - } - raiseSyntaxError(synErrBExpNoElem) - } - for { - right := p.parseBExpElem() - if right == nil { - break - } - left = newAltNode(left, right) - } - if p.consume(tokenKindEOF) { - raiseSyntaxError(synErrBExpUnclosed) - } - p.expect(tokenKindBExpClose) - return left - } - if p.consume(tokenKindInverseBExpOpen) { - elem := p.parseBExpElem() - if elem == nil { - if p.consume(tokenKindEOF) { - 
raiseSyntaxError(synErrBExpUnclosed) - } - raiseSyntaxError(synErrBExpNoElem) - } - inverse := exclude(elem, genAnyCharAST()) - if inverse == nil { - panic(fmt.Errorf("a pattern that isn't matching any symbols")) - } - for { - elem := p.parseBExpElem() - if elem == nil { - break - } - inverse = exclude(elem, inverse) - if inverse == nil { - panic(fmt.Errorf("a pattern that isn't matching any symbols")) - } - } - if p.consume(tokenKindEOF) { - raiseSyntaxError(synErrBExpUnclosed) - } - p.expect(tokenKindBExpClose) - return inverse - } - if p.consume(tokenKindCodePointLeader) { - return p.parseCodePoint() - } - if p.consume(tokenKindCharPropLeader) { - return p.parseCharProp() - } - if p.consume(tokenKindFragmentLeader) { - return p.parseFragment() - } - c := p.parseNormalChar() - if c == nil { - if p.consume(tokenKindBExpClose) { - raiseSyntaxError(synErrBExpInvalidForm) - } - return nil - } - return c -} - -func (p *parser) parseBExpElem() astNode { - if p.consume(tokenKindCodePointLeader) { - return p.parseCodePoint() - } - if p.consume(tokenKindCharPropLeader) { - return p.parseCharProp() - } - left := p.parseNormalChar() - if left == nil { - return nil - } - if !p.consume(tokenKindCharRange) { - return left - } - right := p.parseNormalChar() - if right == nil { - panic(fmt.Errorf("invalid range expression")) - } - from := genByteSeq(left) - to := genByteSeq(right) - if !isValidOrder(from, to) { - p.errMsgDetails = fmt.Sprintf("[%s-%s] ([%v-%v])", string(from), string(to), from, to) - raiseSyntaxError(synErrRangeInvalidOrder) - } - return genRangeAST(left, right) -} - -func (p *parser) parseCodePoint() astNode { - if !p.consume(tokenKindLBrace) { - raiseSyntaxError(synErrCPExpInvalidForm) - } - if !p.consume(tokenKindCodePoint) { - raiseSyntaxError(synErrCPExpInvalidForm) - } - - var cp []byte - { - // Although hex.DecodeString method can handle only a hex string that has even length, - // `codePoint` always has even length by the lexical specification. 
- b, err := hex.DecodeString(p.lastTok.codePoint) - if err != nil { - panic(fmt.Errorf("failed to decode a code point (%v) into a byte slice: %v", p.lastTok.codePoint, err)) - } - // `b` must be 4 bytes to convert it into a 32-bit integer. - l := len(b) - for i := 0; i < 4-l; i++ { - b = append([]byte{0}, b...) - } - n := binary.BigEndian.Uint32(b) - if n < 0x0000 || n > 0x10FFFF { - raiseSyntaxError(synErrCPExpOutOfRange) - } - - cp = []byte(string(rune(n))) - } - - var concat astNode - { - concat = newSymbolNode(cp[0]) - for _, b := range cp[1:] { - concat = genConcatNode( - concat, - newSymbolNode(b), - ) - } - } - - if !p.consume(tokenKindRBrace) { - raiseSyntaxError(synErrCPExpInvalidForm) - } - - return concat -} - -func (p *parser) parseCharProp() astNode { - if !p.consume(tokenKindLBrace) { - raiseSyntaxError(synErrCharPropExpInvalidForm) - } - var sym1, sym2 string - if !p.consume(tokenKindCharPropSymbol) { - raiseSyntaxError(synErrCharPropExpInvalidForm) - } - sym1 = p.lastTok.propSymbol - if p.consume(tokenKindEqual) { - if !p.consume(tokenKindCharPropSymbol) { - raiseSyntaxError(synErrCharPropExpInvalidForm) - } - sym2 = p.lastTok.propSymbol - } - - var alt astNode - var propName, propVal string - if sym2 != "" { - propName = sym1 - propVal = sym2 - } else { - propName = "" - propVal = sym1 - } - if !p.isContributoryPropertyExposed && ucd.IsContributoryProperty(propName) { - p.errMsgDetails = propName - raiseSyntaxError(synErrCharPropUnsupported) - } - pat, err := ucd.NormalizeCharacterProperty(propName, propVal) - if err != nil { - p.errMsgDetails = fmt.Sprintf("%v", err) - raiseSyntaxError(synErrCharPropUnsupported) - } - if pat != "" { - p := newParser(bytes.NewReader([]byte(pat))) - p.exposeContributoryProperty() - ast, err := p.parse() - if err != nil { - panic(err) - } - alt = ast - } else { - cpRanges, inverse, err := ucd.FindCodePointRanges(propName, propVal) - if err != nil { - p.errMsgDetails = fmt.Sprintf("%v", err) - 
raiseSyntaxError(synErrCharPropUnsupported) - } - if inverse { - r := cpRanges[0] - from := genNormalCharAST(r.From) - to := genNormalCharAST(r.To) - alt = exclude(genRangeAST(from, to), genAnyCharAST()) - if alt == nil { - panic(fmt.Errorf("a pattern that isn't matching any symbols")) - } - for _, r := range cpRanges[1:] { - from := genNormalCharAST(r.From) - to := genNormalCharAST(r.To) - alt = exclude(genRangeAST(from, to), alt) - if alt == nil { - panic(fmt.Errorf("a pattern that isn't matching any symbols")) - } - } - } else { - for _, r := range cpRanges { - from := genNormalCharAST(r.From) - to := genNormalCharAST(r.To) - alt = genAltNode( - alt, - genRangeAST(from, to), - ) - } - } - } - - if !p.consume(tokenKindRBrace) { - raiseSyntaxError(synErrCharPropExpInvalidForm) - } - - return alt -} - -func (p *parser) parseFragment() astNode { - if !p.consume(tokenKindLBrace) { - raiseSyntaxError(synErrFragmentExpInvalidForm) - } - if !p.consume(tokenKindFragmentSymbol) { - raiseSyntaxError(synErrFragmentExpInvalidForm) - } - sym := p.lastTok.fragmentSymbol - - if !p.consume(tokenKindRBrace) { - raiseSyntaxError(synErrFragmentExpInvalidForm) - } - - p.incomplete = true - - return newFragmentNode(sym, nil) -} - -func (p *parser) parseNormalChar() astNode { - if !p.consume(tokenKindChar) { - return nil - } - return genNormalCharAST(p.lastTok.char) -} - -func genNormalCharAST(c rune) astNode { - b := []byte(string(c)) - switch len(b) { - case 1: - return newSymbolNode(b[0]) - case 2: - return genConcatNode( - newSymbolNode(b[0]), - newSymbolNode(b[1]), - ) - case 3: - return genConcatNode( - newSymbolNode(b[0]), - newSymbolNode(b[1]), - newSymbolNode(b[2]), - ) - default: // is equivalent to case 4 - return genConcatNode( - newSymbolNode(b[0]), - newSymbolNode(b[1]), - newSymbolNode(b[2]), - newSymbolNode(b[3]), - ) - } -} - -func exclude(symbol, base astNode) astNode { - if alt, ok := symbol.(*altNode); ok { - return exclude(alt.right, exclude(alt.left, base)) - } - 
- switch base.(type) { - case *altNode: - left, right := base.children() - return genAltNode( - exclude(symbol, left), - exclude(symbol, right), - ) - case *concatNode: - baseSeq := genByteRangeSeq(base) - symSeq := genByteRangeSeq(symbol) - excluded := excludeByteRangeSequence(symSeq, baseSeq) - if len(excluded) <= 0 { - return nil - } - return convertByteRangeSeqsToAST(excluded) - case *symbolNode: - baseSeq := genByteRangeSeq(base) - symSeq := genByteRangeSeq(symbol) - excluded := excludeByteRangeSequence(symSeq, baseSeq) - if len(excluded) <= 0 { - return nil - } - return convertByteRangeSeqsToAST(excluded) - } - return nil -} - -func convertByteRangeSeqsToAST(seqs [][]byteRange) astNode { - concats := []astNode{} - for _, seq := range seqs { - syms := []astNode{} - for _, elem := range seq { - syms = append(syms, newRangeSymbolNode(elem.from, elem.to)) - } - concats = append(concats, genConcatNode(syms...)) - } - return genAltNode(concats...) -} - -func genAnyCharAST() astNode { - return convertCharBlocksToAST(utf8.AllCharBlocks()) -} - -func genRangeAST(fromNode, toNode astNode) astNode { - from := genByteSeq(fromNode) - to := genByteSeq(toNode) - blks, err := utf8.GenCharBlocks(from, to) - if err != nil { - panic(err) - } - return convertCharBlocksToAST(blks) -} - -func convertCharBlocksToAST(blks []*utf8.CharBlock) astNode { - var alt astNode - for _, blk := range blks { - r := make([]astNode, len(blk.From)) - for i := 0; i < len(blk.From); i++ { - r[i] = newRangeSymbolNode(blk.From[i], blk.To[i]) - } - alt = genAltNode(alt, genConcatNode(r...)) - } - return alt -} - -func genByteSeq(node astNode) []byte { - switch n := node.(type) { - case *symbolNode: - return []byte{n.from} - case *concatNode: - seq := genByteSeq(n.left) - seq = append(seq, genByteSeq(n.right)...) 
- return seq - } - panic(fmt.Errorf("genByteSeq() cannot handle %T: %v", node, node)) -} - -func genByteRangeSeq(node astNode) []byteRange { - switch n := node.(type) { - case *symbolNode: - return []byteRange{{from: n.from, to: n.to}} - case *concatNode: - seq := genByteRangeSeq(n.left) - seq = append(seq, genByteRangeSeq(n.right)...) - return seq - } - panic(fmt.Errorf("genByteRangeSeq() cannot handle %T: %v", node, node)) -} - -func isValidOrder(from, to []byte) bool { - if len(from) > len(to) { - return false - } - if len(from) < len(to) { - return true - } - for i, f := range from { - t := to[i] - if f > t { - return false - } - if f < t { - return true - } - } - return true -} - -func genConcatNode(cs ...astNode) astNode { - if len(cs) <= 0 { - return nil - } - if len(cs) == 1 { - return cs[0] - } - concat := newConcatNode(cs[0], cs[1]) - for _, c := range cs[2:] { - concat = newConcatNode(concat, c) - } - return concat -} - -func genAltNode(cs ...astNode) astNode { - nonNilNodes := []astNode{} - for _, c := range cs { - if c == nil { - continue - } - nonNilNodes = append(nonNilNodes, c) - } - if len(nonNilNodes) <= 0 { - return nil - } - if len(nonNilNodes) == 1 { - return nonNilNodes[0] - } - alt := newAltNode(nonNilNodes[0], nonNilNodes[1]) - for _, c := range nonNilNodes[2:] { - alt = newAltNode(alt, c) - } - return alt -} - -func (p *parser) expect(expected tokenKind) { - if !p.consume(expected) { - tok := p.peekedTok - p.errMsgDetails = fmt.Sprintf("unexpected token; expected: %v, actual: %v", expected, tok.kind) - raiseSyntaxError(synErrUnexpectedToken) - } -} - -func (p *parser) consume(expected tokenKind) bool { - var tok *token - var err error - if p.peekedTok != nil { - tok = p.peekedTok - p.peekedTok = nil - } else { - tok, err = p.lex.next() - if err != nil { - p.errMsgDetails = p.lex.errMsgDetails - panic(err) - } - } - p.lastTok = tok - if tok.kind == expected { - return true - } - p.peekedTok = tok - p.lastTok = nil - - return false -} diff 
--git a/compiler/parser_test.go b/compiler/parser_test.go deleted file mode 100644 index b0bc67a..0000000 --- a/compiler/parser_test.go +++ /dev/null @@ -1,1422 +0,0 @@ -package compiler - -import ( - "fmt" - "reflect" - "testing" - - "github.com/nihei9/maleeni/spec" - "github.com/nihei9/maleeni/ucd" -) - -func symPos(n uint16) symbolPosition { - pos, err := newSymbolPosition(n, false) - if err != nil { - panic(err) - } - return pos -} - -func endPos(n uint16) symbolPosition { - pos, err := newSymbolPosition(n, true) - if err != nil { - panic(err) - } - return pos -} - -func TestParse(t *testing.T) { - tests := []struct { - pattern string - fragments map[string]string - ast astNode - syntaxError *SyntaxError - - // When an AST is large, as patterns containing a character property expression, - // this test only checks that the pattern is parsable. - // The check of the validity of such AST is performed by checking that it can be matched correctly using the driver. - skipTestAST bool - }{ - { - pattern: "a", - ast: genConcatNode( - newSymbolNodeWithPos(byte('a'), symPos(1)), - newEndMarkerNodeWithPos(1, endPos(2)), - ), - }, - { - pattern: "abc", - ast: genConcatNode( - genConcatNode( - newSymbolNodeWithPos(byte('a'), symPos(1)), - newSymbolNodeWithPos(byte('b'), symPos(2)), - newSymbolNodeWithPos(byte('c'), symPos(3)), - ), - newEndMarkerNodeWithPos(1, endPos(4)), - ), - }, - { - pattern: "a?", - ast: genConcatNode( - newOptionNode( - newSymbolNodeWithPos(byte('a'), symPos(1)), - ), - newEndMarkerNodeWithPos(1, endPos(2)), - ), - }, - { - pattern: "[abc]?", - ast: genConcatNode( - newOptionNode( - genAltNode( - newSymbolNodeWithPos(byte('a'), symPos(1)), - newSymbolNodeWithPos(byte('b'), symPos(2)), - newSymbolNodeWithPos(byte('c'), symPos(3)), - ), - ), - newEndMarkerNodeWithPos(1, endPos(4)), - ), - }, - { - pattern: "\\u{3042}?", - ast: genConcatNode( - newOptionNode( - genConcatNode( - newSymbolNodeWithPos(0xE3, symPos(1)), - newSymbolNodeWithPos(0x81, 
symPos(2)), - newSymbolNodeWithPos(0x82, symPos(3)), - ), - ), - newEndMarkerNodeWithPos(1, endPos(4)), - ), - }, - { - pattern: "\\p{Letter}?", - skipTestAST: true, - }, - { - pattern: "\\f{a2c}?", - fragments: map[string]string{ - "a2c": "abc", - }, - ast: genConcatNode( - newOptionNode( - newFragmentNode("a2c", - genConcatNode( - newSymbolNodeWithPos(byte('a'), symPos(1)), - newSymbolNodeWithPos(byte('b'), symPos(2)), - newSymbolNodeWithPos(byte('c'), symPos(3)), - ), - ), - ), - newEndMarkerNodeWithPos(1, endPos(4)), - ), - }, - { - pattern: "(a)?", - ast: genConcatNode( - newOptionNode( - newSymbolNodeWithPos(byte('a'), symPos(1)), - ), - newEndMarkerNodeWithPos(1, endPos(2)), - ), - }, - { - pattern: "((a?)?)?", - ast: genConcatNode( - newOptionNode( - newOptionNode( - newOptionNode( - newSymbolNodeWithPos(byte('a'), symPos(1)), - ), - ), - ), - newEndMarkerNodeWithPos(1, endPos(2)), - ), - }, - { - pattern: "(abc)?", - ast: genConcatNode( - newOptionNode( - genConcatNode( - newSymbolNodeWithPos(byte('a'), symPos(1)), - newSymbolNodeWithPos(byte('b'), symPos(2)), - newSymbolNodeWithPos(byte('c'), symPos(3)), - ), - ), - newEndMarkerNodeWithPos(1, endPos(4)), - ), - }, - { - pattern: "(a|b)?", - ast: genConcatNode( - newOptionNode( - genAltNode( - newSymbolNodeWithPos(byte('a'), symPos(1)), - newSymbolNodeWithPos(byte('b'), symPos(2)), - ), - ), - newEndMarkerNodeWithPos(1, endPos(3)), - ), - }, - { - pattern: "?", - syntaxError: synErrRepNoTarget, - }, - { - pattern: "(?)", - syntaxError: synErrRepNoTarget, - }, - { - pattern: "a|?", - syntaxError: synErrRepNoTarget, - }, - { - pattern: "?|b", - syntaxError: synErrRepNoTarget, - }, - { - pattern: "a??", - syntaxError: synErrRepNoTarget, - }, - { - pattern: "a*", - ast: genConcatNode( - newRepeatNode( - newSymbolNodeWithPos(byte('a'), 1), - ), - newEndMarkerNodeWithPos(1, endPos(2)), - ), - }, - { - pattern: "[abc]*", - ast: genConcatNode( - newRepeatNode( - genAltNode( - newSymbolNodeWithPos(byte('a'), 1), - 
newSymbolNodeWithPos(byte('b'), 2), - newSymbolNodeWithPos(byte('c'), 3), - ), - ), - newEndMarkerNodeWithPos(1, endPos(4)), - ), - }, - { - pattern: "\\u{3042}*", - ast: genConcatNode( - newRepeatNode( - genConcatNode( - newSymbolNodeWithPos(0xE3, symPos(1)), - newSymbolNodeWithPos(0x81, symPos(2)), - newSymbolNodeWithPos(0x82, symPos(3)), - ), - ), - newEndMarkerNodeWithPos(1, endPos(4)), - ), - }, - { - pattern: "\\p{Letter}*", - skipTestAST: true, - }, - { - pattern: "\\f{a2c}*", - fragments: map[string]string{ - "a2c": "abc", - }, - ast: genConcatNode( - newRepeatNode( - newFragmentNode("a2c", - genConcatNode( - newSymbolNodeWithPos(byte('a'), symPos(1)), - newSymbolNodeWithPos(byte('b'), symPos(2)), - newSymbolNodeWithPos(byte('c'), symPos(3)), - ), - ), - ), - newEndMarkerNodeWithPos(1, endPos(4)), - ), - }, - { - pattern: "((a*)*)*", - ast: genConcatNode( - newRepeatNode( - newRepeatNode( - newRepeatNode( - newSymbolNodeWithPos(byte('a'), 1), - ), - ), - ), - newEndMarkerNodeWithPos(1, endPos(2)), - ), - }, - { - pattern: "(abc)*", - ast: genConcatNode( - newRepeatNode( - genConcatNode( - newSymbolNodeWithPos(byte('a'), 1), - newSymbolNodeWithPos(byte('b'), 2), - newSymbolNodeWithPos(byte('c'), 3), - ), - ), - newEndMarkerNodeWithPos(1, endPos(4)), - ), - }, - { - pattern: "(a|b)*", - ast: genConcatNode( - newRepeatNode( - genAltNode( - newSymbolNodeWithPos(byte('a'), 1), - newSymbolNodeWithPos(byte('b'), 2), - ), - ), - newEndMarkerNodeWithPos(1, endPos(3)), - ), - }, - { - pattern: "*", - syntaxError: synErrRepNoTarget, - }, - { - pattern: "(*)", - syntaxError: synErrRepNoTarget, - }, - { - pattern: "a|*", - syntaxError: synErrRepNoTarget, - }, - { - pattern: "*|b", - syntaxError: synErrRepNoTarget, - }, - { - pattern: "a**", - syntaxError: synErrRepNoTarget, - }, - { - pattern: "a+", - ast: genConcatNode( - newSymbolNodeWithPos(byte('a'), symPos(1)), - newRepeatNode( - newSymbolNodeWithPos(byte('a'), symPos(2)), - ), - newEndMarkerNodeWithPos(1, 
endPos(3)), - ), - }, - { - pattern: "[abc]+", - ast: genConcatNode( - genAltNode( - newSymbolNodeWithPos(byte('a'), symPos(1)), - newSymbolNodeWithPos(byte('b'), symPos(2)), - newSymbolNodeWithPos(byte('c'), symPos(3)), - ), - newRepeatNode( - genAltNode( - newSymbolNodeWithPos(byte('a'), symPos(4)), - newSymbolNodeWithPos(byte('b'), symPos(5)), - newSymbolNodeWithPos(byte('c'), symPos(6)), - ), - ), - newEndMarkerNodeWithPos(1, endPos(7)), - ), - }, - { - pattern: "\\u{3042}+", - ast: genConcatNode( - genConcatNode( - newSymbolNodeWithPos(0xE3, symPos(1)), - newSymbolNodeWithPos(0x81, symPos(2)), - newSymbolNodeWithPos(0x82, symPos(3)), - ), - newRepeatNode( - genConcatNode( - newSymbolNodeWithPos(0xE3, symPos(4)), - newSymbolNodeWithPos(0x81, symPos(5)), - newSymbolNodeWithPos(0x82, symPos(6)), - ), - ), - newEndMarkerNodeWithPos(1, endPos(7)), - ), - }, - { - pattern: "\\p{Letter}+", - skipTestAST: true, - }, - { - pattern: "\\f{a2c}+", - fragments: map[string]string{ - "a2c": "abc", - }, - ast: genConcatNode( - newFragmentNode("a2c", - genConcatNode( - newSymbolNodeWithPos(byte('a'), symPos(1)), - newSymbolNodeWithPos(byte('b'), symPos(2)), - newSymbolNodeWithPos(byte('c'), symPos(3)), - ), - ), - newRepeatNode( - newFragmentNode("a2c", - genConcatNode( - newSymbolNodeWithPos(byte('a'), symPos(4)), - newSymbolNodeWithPos(byte('b'), symPos(5)), - newSymbolNodeWithPos(byte('c'), symPos(6)), - ), - ), - ), - newEndMarkerNodeWithPos(1, endPos(7)), - ), - }, - { - pattern: "((a+)+)+", - ast: genConcatNode( - genConcatNode( - genConcatNode( - genConcatNode( - newSymbolNodeWithPos(byte('a'), symPos(1)), - newRepeatNode( - newSymbolNodeWithPos(byte('a'), symPos(2)), - ), - ), - newRepeatNode( - genConcatNode( - newSymbolNodeWithPos(byte('a'), symPos(3)), - newRepeatNode( - newSymbolNodeWithPos(byte('a'), symPos(4)), - ), - ), - ), - ), - newRepeatNode( - genConcatNode( - genConcatNode( - newSymbolNodeWithPos(byte('a'), symPos(5)), - newRepeatNode( - 
newSymbolNodeWithPos(byte('a'), symPos(6)), - ), - ), - newRepeatNode( - genConcatNode( - newSymbolNodeWithPos(byte('a'), symPos(7)), - newRepeatNode( - newSymbolNodeWithPos(byte('a'), symPos(8)), - ), - ), - ), - ), - ), - ), - newEndMarkerNodeWithPos(1, endPos(9)), - ), - }, - { - pattern: "(abc)+", - ast: genConcatNode( - genConcatNode( - newSymbolNodeWithPos(byte('a'), symPos(1)), - newSymbolNodeWithPos(byte('b'), symPos(2)), - newSymbolNodeWithPos(byte('c'), symPos(3)), - ), - newRepeatNode( - genConcatNode( - newSymbolNodeWithPos(byte('a'), symPos(4)), - newSymbolNodeWithPos(byte('b'), symPos(5)), - newSymbolNodeWithPos(byte('c'), symPos(6)), - ), - ), - newEndMarkerNodeWithPos(1, endPos(7)), - ), - }, - { - pattern: "(a|b)+", - ast: genConcatNode( - genAltNode( - newSymbolNodeWithPos(byte('a'), symPos(1)), - newSymbolNodeWithPos(byte('b'), symPos(2)), - ), - newRepeatNode( - genAltNode( - newSymbolNodeWithPos(byte('a'), symPos(3)), - newSymbolNodeWithPos(byte('b'), symPos(4)), - ), - ), - newEndMarkerNodeWithPos(1, endPos(5)), - ), - }, - { - pattern: "+", - syntaxError: synErrRepNoTarget, - }, - { - pattern: "(+)", - syntaxError: synErrRepNoTarget, - }, - { - pattern: "a|+", - syntaxError: synErrRepNoTarget, - }, - { - pattern: "+|b", - syntaxError: synErrRepNoTarget, - }, - { - pattern: "a++", - syntaxError: synErrRepNoTarget, - }, - { - pattern: ".", - ast: newConcatNode( - genAltNode( - newRangeSymbolNodeWithPos(0x00, 0x7f, symPos(1)), - genConcatNode( - newRangeSymbolNodeWithPos(0xc2, 0xdf, symPos(2)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(3)), - ), - genConcatNode( - newRangeSymbolNodeWithPos(0xe0, 0xe0, symPos(4)), - newRangeSymbolNodeWithPos(0xa0, 0xbf, symPos(5)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(6)), - ), - genConcatNode( - newRangeSymbolNodeWithPos(0xe1, 0xec, symPos(7)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(8)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(9)), - ), - genConcatNode( - 
newRangeSymbolNodeWithPos(0xed, 0xed, symPos(10)), - newRangeSymbolNodeWithPos(0x80, 0x9f, symPos(11)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(12)), - ), - genConcatNode( - newRangeSymbolNodeWithPos(0xee, 0xef, symPos(13)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(14)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(15)), - ), - genConcatNode( - newRangeSymbolNodeWithPos(0xf0, 0xf0, symPos(16)), - newRangeSymbolNodeWithPos(0x90, 0xbf, symPos(17)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(18)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(19)), - ), - genConcatNode( - newRangeSymbolNodeWithPos(0xf1, 0xf3, symPos(20)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(21)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(22)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(23)), - ), - genConcatNode( - newRangeSymbolNodeWithPos(0xf4, 0xf4, symPos(24)), - newRangeSymbolNodeWithPos(0x80, 0x8f, symPos(25)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(26)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(27)), - ), - ), - newEndMarkerNodeWithPos(1, endPos(28)), - ), - }, - { - pattern: "[a]", - ast: newConcatNode( - newSymbolNodeWithPos(byte('a'), symPos(1)), - newEndMarkerNodeWithPos(1, endPos(2)), - ), - }, - { - pattern: "[abc]", - ast: newConcatNode( - genAltNode( - newSymbolNodeWithPos(byte('a'), symPos(1)), - newSymbolNodeWithPos(byte('b'), symPos(2)), - newSymbolNodeWithPos(byte('c'), symPos(3)), - ), - newEndMarkerNodeWithPos(1, endPos(4)), - ), - }, - { - pattern: "[a-z]", - ast: newConcatNode( - newRangeSymbolNodeWithPos(byte('a'), byte('z'), symPos(1)), - newEndMarkerNodeWithPos(1, endPos(2)), - ), - }, - { - pattern: "[A-Za-z]", - ast: newConcatNode( - genAltNode( - newRangeSymbolNodeWithPos(byte('A'), byte('Z'), symPos(1)), - newRangeSymbolNodeWithPos(byte('a'), byte('z'), symPos(2)), - ), - newEndMarkerNodeWithPos(1, endPos(3)), - ), - }, - { - pattern: "[\\u{004E}]", - ast: newConcatNode( - newSymbolNodeWithPos(byte('N'), symPos(1)), - 
newEndMarkerNodeWithPos(1, endPos(2)), - ), - }, - { - pattern: "[\\p{Lu}]", - skipTestAST: true, - }, - { - pattern: "a[]", - syntaxError: synErrBExpNoElem, - }, - { - pattern: "[]a", - syntaxError: synErrBExpNoElem, - }, - { - pattern: "[]", - syntaxError: synErrBExpNoElem, - }, - { - pattern: "[^]", - ast: newConcatNode( - newSymbolNodeWithPos(byte('^'), symPos(1)), - newEndMarkerNodeWithPos(1, endPos(2)), - ), - }, - { - pattern: "[", - syntaxError: synErrBExpUnclosed, - }, - { - pattern: "([", - syntaxError: synErrBExpUnclosed, - }, - { - pattern: "[a", - syntaxError: synErrBExpUnclosed, - }, - { - pattern: "([a", - syntaxError: synErrBExpUnclosed, - }, - { - pattern: "[a-", - syntaxError: synErrBExpUnclosed, - }, - { - pattern: "([a-", - syntaxError: synErrBExpUnclosed, - }, - { - pattern: "[^", - syntaxError: synErrBExpUnclosed, - }, - { - pattern: "([^", - syntaxError: synErrBExpUnclosed, - }, - { - pattern: "[^a", - syntaxError: synErrBExpUnclosed, - }, - { - pattern: "([^a", - syntaxError: synErrBExpUnclosed, - }, - { - pattern: "[^a-", - syntaxError: synErrBExpUnclosed, - }, - { - pattern: "([^a-", - syntaxError: synErrBExpUnclosed, - }, - { - pattern: "]", - ast: newConcatNode( - newSymbolNodeWithPos(byte(']'), symPos(1)), - newEndMarkerNodeWithPos(1, endPos(2)), - ), - }, - { - pattern: "(]", - syntaxError: synErrGroupUnclosed, - }, - { - pattern: "a]", - ast: newConcatNode( - genConcatNode( - newSymbolNodeWithPos(byte('a'), symPos(1)), - newSymbolNodeWithPos(byte(']'), symPos(2)), - ), - newEndMarkerNodeWithPos(1, endPos(3)), - ), - }, - { - pattern: "(a]", - syntaxError: synErrGroupUnclosed, - }, - { - pattern: "([)", - syntaxError: synErrBExpUnclosed, - }, - { - pattern: "([a)", - syntaxError: synErrBExpUnclosed, - }, - { - pattern: "[a-]", - ast: newConcatNode( - genAltNode( - newSymbolNodeWithPos(byte('a'), symPos(1)), - newSymbolNodeWithPos(byte('-'), symPos(2)), - ), - newEndMarkerNodeWithPos(1, endPos(3)), - ), - }, - { - pattern: "[^a-]", - 
ast: newConcatNode( - genAltNode( - newRangeSymbolNodeWithPos(0x00, byte(44), symPos(1)), - newRangeSymbolNodeWithPos(byte(46), byte(96), symPos(2)), - newRangeSymbolNodeWithPos(byte(98), 0x7f, symPos(3)), - genConcatNode( - newRangeSymbolNodeWithPos(0xc2, 0xdf, symPos(4)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(5)), - ), - genConcatNode( - newRangeSymbolNodeWithPos(0xe0, 0xe0, symPos(6)), - newRangeSymbolNodeWithPos(0xa0, 0xbf, symPos(7)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(8)), - ), - genConcatNode( - newRangeSymbolNodeWithPos(0xe1, 0xec, symPos(9)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(10)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(11)), - ), - genConcatNode( - newRangeSymbolNodeWithPos(0xed, 0xed, symPos(12)), - newRangeSymbolNodeWithPos(0x80, 0x9f, symPos(13)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(14)), - ), - genConcatNode( - newRangeSymbolNodeWithPos(0xee, 0xef, symPos(15)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(16)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(17)), - ), - genConcatNode( - newRangeSymbolNodeWithPos(0xf0, 0xf0, symPos(18)), - newRangeSymbolNodeWithPos(0x90, 0xbf, symPos(19)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(20)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(21)), - ), - genConcatNode( - newRangeSymbolNodeWithPos(0xf1, 0xf3, symPos(22)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(23)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(24)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(25)), - ), - genConcatNode( - newRangeSymbolNodeWithPos(0xf4, 0xf4, symPos(26)), - newRangeSymbolNodeWithPos(0x80, 0x8f, symPos(27)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(28)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(29)), - ), - ), - newEndMarkerNodeWithPos(1, endPos(30)), - ), - }, - { - pattern: "[-z]", - ast: newConcatNode( - genAltNode( - newSymbolNodeWithPos(byte('-'), symPos(1)), - newSymbolNodeWithPos(byte('z'), symPos(2)), - ), - newEndMarkerNodeWithPos(1, 
endPos(3)), - ), - }, - { - pattern: "[^-z]", - ast: newConcatNode( - genAltNode( - newRangeSymbolNodeWithPos(0x00, byte(44), symPos(1)), - genAltNode( - newRangeSymbolNodeWithPos(byte(46), byte(121), symPos(2)), - newRangeSymbolNodeWithPos(byte(123), 0x7f, symPos(3)), - ), - genConcatNode( - newRangeSymbolNodeWithPos(0xc2, 0xdf, symPos(4)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(5)), - ), - genConcatNode( - newRangeSymbolNodeWithPos(0xe0, 0xe0, symPos(6)), - newRangeSymbolNodeWithPos(0xa0, 0xbf, symPos(7)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(8)), - ), - genConcatNode( - newRangeSymbolNodeWithPos(0xe1, 0xec, symPos(9)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(10)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(11)), - ), - genConcatNode( - newRangeSymbolNodeWithPos(0xed, 0xed, symPos(12)), - newRangeSymbolNodeWithPos(0x80, 0x9f, symPos(13)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(14)), - ), - genConcatNode( - newRangeSymbolNodeWithPos(0xee, 0xef, symPos(15)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(16)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(17)), - ), - genConcatNode( - newRangeSymbolNodeWithPos(0xf0, 0xf0, symPos(18)), - newRangeSymbolNodeWithPos(0x90, 0xbf, symPos(19)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(20)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(21)), - ), - genConcatNode( - newRangeSymbolNodeWithPos(0xf1, 0xf3, symPos(22)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(23)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(24)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(25)), - ), - genConcatNode( - newRangeSymbolNodeWithPos(0xf4, 0xf4, symPos(26)), - newRangeSymbolNodeWithPos(0x80, 0x8f, symPos(27)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(28)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(29)), - ), - ), - newEndMarkerNodeWithPos(1, endPos(30)), - ), - }, - { - pattern: "[-]", - ast: newConcatNode( - newSymbolNodeWithPos(byte('-'), symPos(1)), - newEndMarkerNodeWithPos(1, 
endPos(2)), - ), - }, - { - pattern: "[^-]", - ast: newConcatNode( - genAltNode( - newRangeSymbolNodeWithPos(0x00, byte(44), symPos(1)), - newRangeSymbolNodeWithPos(byte(46), 0x7f, symPos(2)), - genConcatNode( - newRangeSymbolNodeWithPos(0xc2, 0xdf, symPos(3)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(4)), - ), - genConcatNode( - newRangeSymbolNodeWithPos(0xe0, 0xe0, symPos(5)), - newRangeSymbolNodeWithPos(0xa0, 0xbf, symPos(6)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(7)), - ), - genConcatNode( - newRangeSymbolNodeWithPos(0xe1, 0xec, symPos(8)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(9)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(10)), - ), - genConcatNode( - newRangeSymbolNodeWithPos(0xed, 0xed, symPos(11)), - newRangeSymbolNodeWithPos(0x80, 0x9f, symPos(12)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(13)), - ), - genConcatNode( - newRangeSymbolNodeWithPos(0xee, 0xef, symPos(14)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(15)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(16)), - ), - genConcatNode( - newRangeSymbolNodeWithPos(0xf0, 0xf0, symPos(17)), - newRangeSymbolNodeWithPos(0x90, 0xbf, symPos(18)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(19)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(20)), - ), - genConcatNode( - newRangeSymbolNodeWithPos(0xf1, 0xf3, symPos(21)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(22)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(23)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(24)), - ), - genConcatNode( - newRangeSymbolNodeWithPos(0xf4, 0xf4, symPos(25)), - newRangeSymbolNodeWithPos(0x80, 0x8f, symPos(26)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(27)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(28)), - ), - ), - newEndMarkerNodeWithPos(1, endPos(29)), - ), - }, - { - pattern: "\\u{006E}", - ast: genConcatNode( - newSymbolNodeWithPos(0x6E, symPos(1)), - newEndMarkerNodeWithPos(1, endPos(2)), - ), - }, - { - pattern: "\\u{03BD}", - ast: genConcatNode( - 
genConcatNode( - newSymbolNodeWithPos(0xCE, symPos(1)), - newSymbolNodeWithPos(0xBD, symPos(2)), - ), - newEndMarkerNodeWithPos(1, endPos(3)), - ), - }, - { - pattern: "\\u{306B}", - ast: genConcatNode( - genConcatNode( - newSymbolNodeWithPos(0xE3, symPos(1)), - newSymbolNodeWithPos(0x81, symPos(2)), - newSymbolNodeWithPos(0xAB, symPos(3)), - ), - newEndMarkerNodeWithPos(1, endPos(4)), - ), - }, - { - pattern: "\\u{01F638}", - ast: genConcatNode( - genConcatNode( - newSymbolNodeWithPos(0xF0, symPos(1)), - newSymbolNodeWithPos(0x9F, symPos(2)), - newSymbolNodeWithPos(0x98, symPos(3)), - newSymbolNodeWithPos(0xB8, symPos(4)), - ), - newEndMarkerNodeWithPos(1, endPos(5)), - ), - }, - { - pattern: "\\u{0000}", - ast: genConcatNode( - newSymbolNodeWithPos(0x00, symPos(1)), - newEndMarkerNodeWithPos(1, endPos(2)), - ), - }, - { - pattern: "\\u{10FFFF}", - ast: genConcatNode( - genConcatNode( - newSymbolNodeWithPos(0xF4, symPos(1)), - newSymbolNodeWithPos(0x8F, symPos(2)), - newSymbolNodeWithPos(0xBF, symPos(3)), - newSymbolNodeWithPos(0xBF, symPos(4)), - ), - newEndMarkerNodeWithPos(1, endPos(5)), - ), - }, - { - pattern: "\\u{110000}", - syntaxError: synErrCPExpOutOfRange, - }, - { - pattern: "\\u", - syntaxError: synErrCPExpInvalidForm, - }, - { - pattern: "\\u{", - syntaxError: synErrCPExpInvalidForm, - }, - { - pattern: "\\u{03BD", - syntaxError: synErrCPExpInvalidForm, - }, - { - pattern: "\\u{}", - syntaxError: synErrCPExpInvalidForm, - }, - { - pattern: "\\p{Letter}", - skipTestAST: true, - }, - { - pattern: "\\p{General_Category=Letter}", - skipTestAST: true, - }, - { - pattern: "\\p{ Letter }", - skipTestAST: true, - }, - { - pattern: "\\p{ General_Category = Letter }", - skipTestAST: true, - }, - { - pattern: "\\p", - syntaxError: synErrCharPropExpInvalidForm, - }, - { - pattern: "\\p{", - syntaxError: synErrCharPropExpInvalidForm, - }, - { - pattern: "\\p{Letter", - syntaxError: synErrCharPropExpInvalidForm, - }, - { - pattern: "\\p{General_Category=}", - 
syntaxError: synErrCharPropExpInvalidForm, - }, - { - pattern: "\\p{General_Category= }", - syntaxError: synErrCharPropInvalidSymbol, - }, - { - pattern: "\\p{=Letter}", - syntaxError: synErrCharPropExpInvalidForm, - }, - { - pattern: "\\p{ =Letter}", - syntaxError: synErrCharPropInvalidSymbol, - }, - { - pattern: "\\p{=}", - syntaxError: synErrCharPropExpInvalidForm, - }, - { - pattern: "\\p{}", - syntaxError: synErrCharPropExpInvalidForm, - }, - { - pattern: "\\f{a2c}", - fragments: map[string]string{ - "a2c": "abc", - }, - ast: genConcatNode( - newFragmentNode("a2c", - genConcatNode( - newSymbolNodeWithPos(byte('a'), symPos(1)), - newSymbolNodeWithPos(byte('b'), symPos(2)), - newSymbolNodeWithPos(byte('c'), symPos(3)), - ), - ), - newEndMarkerNodeWithPos(1, endPos(4)), - ), - }, - { - pattern: "\\f{ a2c }", - fragments: map[string]string{ - "a2c": "abc", - }, - ast: genConcatNode( - newFragmentNode("a2c", - genConcatNode( - newSymbolNodeWithPos(byte('a'), symPos(1)), - newSymbolNodeWithPos(byte('b'), symPos(2)), - newSymbolNodeWithPos(byte('c'), symPos(3)), - ), - ), - newEndMarkerNodeWithPos(1, endPos(4)), - ), - }, - { - pattern: "\\f", - syntaxError: synErrFragmentExpInvalidForm, - }, - { - pattern: "\\f{", - syntaxError: synErrFragmentExpInvalidForm, - }, - { - pattern: "\\f{a2c", - fragments: map[string]string{ - "a2c": "abc", - }, - syntaxError: synErrFragmentExpInvalidForm, - }, - { - pattern: "(a)", - ast: newConcatNode( - newSymbolNodeWithPos(byte('a'), symPos(1)), - newEndMarkerNodeWithPos(1, endPos(2)), - ), - }, - { - pattern: "(((a)))", - ast: newConcatNode( - newSymbolNodeWithPos(byte('a'), symPos(1)), - newEndMarkerNodeWithPos(1, endPos(2)), - ), - }, - { - pattern: "a()", - syntaxError: synErrGroupNoElem, - }, - { - pattern: "()a", - syntaxError: synErrGroupNoElem, - }, - { - pattern: "()", - syntaxError: synErrGroupNoElem, - }, - { - pattern: "(", - syntaxError: synErrGroupUnclosed, - }, - { - pattern: "a(", - syntaxError: synErrGroupUnclosed, - 
}, - { - pattern: "(a", - syntaxError: synErrGroupUnclosed, - }, - { - pattern: "((", - syntaxError: synErrGroupUnclosed, - }, - { - pattern: "((a)", - syntaxError: synErrGroupUnclosed, - }, - { - pattern: ")", - syntaxError: synErrGroupNoInitiator, - }, - { - pattern: "a)", - syntaxError: synErrGroupNoInitiator, - }, - { - pattern: ")a", - syntaxError: synErrGroupNoInitiator, - }, - { - pattern: "))", - syntaxError: synErrGroupNoInitiator, - }, - { - pattern: "(a))", - syntaxError: synErrGroupNoInitiator, - }, - { - pattern: "Mulder|Scully", - ast: newConcatNode( - genAltNode( - genConcatNode( - newSymbolNodeWithPos(byte('M'), symPos(1)), - newSymbolNodeWithPos(byte('u'), symPos(2)), - newSymbolNodeWithPos(byte('l'), symPos(3)), - newSymbolNodeWithPos(byte('d'), symPos(4)), - newSymbolNodeWithPos(byte('e'), symPos(5)), - newSymbolNodeWithPos(byte('r'), symPos(6)), - ), - genConcatNode( - newSymbolNodeWithPos(byte('S'), symPos(7)), - newSymbolNodeWithPos(byte('c'), symPos(8)), - newSymbolNodeWithPos(byte('u'), symPos(9)), - newSymbolNodeWithPos(byte('l'), symPos(10)), - newSymbolNodeWithPos(byte('l'), symPos(11)), - newSymbolNodeWithPos(byte('y'), symPos(12)), - ), - ), - newEndMarkerNodeWithPos(1, endPos(13)), - ), - }, - { - pattern: "Langly|Frohike|Byers", - ast: newConcatNode( - genAltNode( - genConcatNode( - newSymbolNodeWithPos(byte('L'), symPos(1)), - newSymbolNodeWithPos(byte('a'), symPos(2)), - newSymbolNodeWithPos(byte('n'), symPos(3)), - newSymbolNodeWithPos(byte('g'), symPos(4)), - newSymbolNodeWithPos(byte('l'), symPos(5)), - newSymbolNodeWithPos(byte('y'), symPos(6)), - ), - genConcatNode( - newSymbolNodeWithPos(byte('F'), symPos(7)), - newSymbolNodeWithPos(byte('r'), symPos(8)), - newSymbolNodeWithPos(byte('o'), symPos(9)), - newSymbolNodeWithPos(byte('h'), symPos(10)), - newSymbolNodeWithPos(byte('i'), symPos(11)), - newSymbolNodeWithPos(byte('k'), symPos(12)), - newSymbolNodeWithPos(byte('e'), symPos(13)), - ), - genConcatNode( - 
newSymbolNodeWithPos(byte('B'), symPos(14)), - newSymbolNodeWithPos(byte('y'), symPos(15)), - newSymbolNodeWithPos(byte('e'), symPos(16)), - newSymbolNodeWithPos(byte('r'), symPos(17)), - newSymbolNodeWithPos(byte('s'), symPos(18)), - ), - ), - newEndMarkerNodeWithPos(1, endPos(19)), - ), - }, - { - pattern: "|", - syntaxError: synErrAltLackOfOperand, - }, - { - pattern: "||", - syntaxError: synErrAltLackOfOperand, - }, - { - pattern: "Mulder|", - syntaxError: synErrAltLackOfOperand, - }, - { - pattern: "|Scully", - syntaxError: synErrAltLackOfOperand, - }, - { - pattern: "Langly|Frohike|", - syntaxError: synErrAltLackOfOperand, - }, - { - pattern: "Langly||Byers", - syntaxError: synErrAltLackOfOperand, - }, - { - pattern: "|Frohike|Byers", - syntaxError: synErrAltLackOfOperand, - }, - { - pattern: "|Frohike|", - syntaxError: synErrAltLackOfOperand, - }, - { - pattern: "Fox(|)Mulder", - syntaxError: synErrAltLackOfOperand, - }, - { - pattern: "(Fox|)Mulder", - syntaxError: synErrAltLackOfOperand, - }, - { - pattern: "Fox(|Mulder)", - syntaxError: synErrAltLackOfOperand, - }, - } - for i, tt := range tests { - t.Run(fmt.Sprintf("#%v %v", i, tt.pattern), func(t *testing.T) { - fragments := map[string][]byte{} - for kind, pattern := range tt.fragments { - fragments[kind] = []byte(pattern) - } - ast, _, err := parse([]*patternEntry{ - { - id: spec.LexModeKindIDMin, - pattern: []byte(tt.pattern), - }, - }, fragments) - if tt.syntaxError != nil { - // printAST(os.Stdout, ast, "", "", false) - if err == nil { - t.Fatalf("expected syntax error; got: nil") - } - parseErrs, ok := err.(*ParseErrors) - if !ok { - t.Fatalf("expected ParseErrors; got: %v (type: %T)", err, err) - } - parseErr := parseErrs.Errors[0].Cause - synErr, ok := parseErr.(*SyntaxError) - if !ok { - t.Fatalf("expected SyntaxError; got: %v (type: %T)", parseErr, parseErr) - } - if synErr != tt.syntaxError { - t.Fatalf("unexpected syntax error; want: %v, got: %v", tt.syntaxError, synErr) - } - if ast != nil 
{
					t.Fatalf("ast is not nil")
				}
			} else {
				if err != nil {
					t.Fatal(err)
				}
				if ast == nil {
					t.Fatal("AST is nil")
				}
				// printAST(os.Stdout, ast, "", "", false)
				if !tt.skipTestAST {
					testAST(t, tt.ast, ast)
				}
			}
		})
	}
}

// TestParse_ContributoryPropertyIsNotExposed checks that every property
// returned by ucd.ContributoryProperties is rejected when it is used in a
// `\p{<property>=yes}` expression: parse must return a *ParseErrors whose first
// cause is synErrCharPropUnsupported, together with a nil AST.
func TestParse_ContributoryPropertyIsNotExposed(t *testing.T) {
	for _, cProp := range ucd.ContributoryProperties() {
		t.Run(fmt.Sprintf("%v", cProp), func(t *testing.T) {
			ast, _, err := parse([]*patternEntry{
				{
					id:      spec.LexModeKindIDMin,
					pattern: []byte(fmt.Sprintf(`\p{%v=yes}`, cProp)),
				},
			}, nil)
			if err == nil {
				t.Fatalf("expected syntax error; got: nil")
			}
			parseErrs, ok := err.(*ParseErrors)
			if !ok {
				t.Fatalf("expected ParseErrors; got: %v (type: %T)", err, err)
			}
			// Only the first reported error is inspected.
			parseErr := parseErrs.Errors[0].Cause
			synErr, ok := parseErr.(*SyntaxError)
			if !ok {
				t.Fatalf("expected SyntaxError; got: %v (type: %T)", parseErr, parseErr)
			}
			// Syntax errors are compared by pointer identity.
			if synErr != synErrCharPropUnsupported {
				t.Fatalf("unexpected syntax error; want: %v, got: %v", synErrCharPropUnsupported, synErr)
			}
			if ast != nil {
				t.Fatalf("ast is not nil")
			}
		})
	}
}

// TestParse_FollowAndSymbolTable parses the single pattern "(a|b)*abb" and
// verifies three results: the shape of the returned AST, the follow table
// generated from that AST by genFollowTable, and the symbol table returned by
// parse.
func TestParse_FollowAndSymbolTable(t *testing.T) {
	root, symTab, err := parse([]*patternEntry{
		{
			id:      spec.LexModeKindIDMin,
			pattern: []byte("(a|b)*abb"),
		},
	}, nil)
	if err != nil {
		t.Fatal(err)
	}
	if root == nil {
		t.Fatal("root of AST is nil")
	}
	// printAST(os.Stdout, root, "", "", false)

	// AST: (a|b)* followed by a, b, b and the end marker.
	{
		expectedAST := genConcatNode(
			newRepeatNode(
				newAltNode(
					newSymbolNodeWithPos(byte('a'), symPos(1)),
					newSymbolNodeWithPos(byte('b'), symPos(2)),
				),
			),
			newSymbolNodeWithPos(byte('a'), symPos(3)),
			newSymbolNodeWithPos(byte('b'), symPos(4)),
			newSymbolNodeWithPos(byte('b'), symPos(5)),
			newEndMarkerNodeWithPos(1, endPos(6)),
		)
		testAST(t, expectedAST, root)
	}

	// Follow table: for each symbol position, the positions that may follow it.
	{
		followTab := genFollowTable(root)
		if followTab == nil {
			t.Fatal("follow table is nil")
		}
		expectedFollowTab := followTable{
			1: newSymbolPositionSet().add(symPos(1)).add(symPos(2)).add(symPos(3)),
			2: newSymbolPositionSet().add(symPos(1)).add(symPos(2)).add(symPos(3)),
			3: newSymbolPositionSet().add(symPos(4)),
			4: newSymbolPositionSet().add(symPos(5)),
			5: newSymbolPositionSet().add(endPos(6)),
		}
		testFollowTable(t, expectedFollowTab, followTab)
	}

	// Symbol table: position -> byte range, and end position -> kind ID.
	{
		// entry builds a byteRange that covers exactly one byte value.
		entry := func(v byte) byteRange {
			return byteRange{
				from: v,
				to:   v,
			}
		}

		expectedSymTab := &symbolTable{
			symPos2Byte: map[symbolPosition]byteRange{
				symPos(1): entry(byte('a')),
				symPos(2): entry(byte('b')),
				symPos(3): entry(byte('a')),
				symPos(4): entry(byte('b')),
				symPos(5): entry(byte('b')),
			},
			endPos2ID: map[symbolPosition]spec.LexModeKindID{
				endPos(6): 1,
			},
		}
		testSymbolTable(t, expectedSymTab, symTab)
	}
}

// testAST recursively compares two AST nodes and fails the test at the first
// difference.
//
// The dynamic types of both nodes must be equal. For *symbolNode the pos, from
// and to fields are compared; for *endMarkerNode only pos is compared. Other
// node types are compared by type only. After the node itself is checked, the
// left and right children of both nodes are compared the same way.
func testAST(t *testing.T, expected, actual astNode) {
	t.Helper()

	aTy := reflect.TypeOf(actual)
	eTy := reflect.TypeOf(expected)
	if eTy != aTy {
		t.Fatalf("AST node type is mismatched; want: %v, got: %v", eTy, aTy)
	}

	// The types matched, so a nil actual node means both sides are nil and the
	// recursion ends here.
	if actual == nil {
		return
	}

	switch e := expected.(type) {
	case *symbolNode:
		a := actual.(*symbolNode)
		if a.pos != e.pos || a.from != e.from || a.to != e.to {
			t.Fatalf("unexpected node; want: %+v, got: %+v", e, a)
		}
	case *endMarkerNode:
		a := actual.(*endMarkerNode)
		if a.pos != e.pos {
			t.Fatalf("symbol position is mismatched; want: %v, got: %v", e.pos, a.pos)
		}
	}
	eLeft, eRight := expected.children()
	aLeft, aRight := actual.children()
	testAST(t, eLeft, aLeft)
	testAST(t, eRight, aRight)
}

// testFollowTable verifies that actual holds an entry for every position in
// expected and that each pair of position sets has the same hash() value. A
// difference in the number of entries is reported as a non-fatal error; a
// missing or mismatched entry fails the test immediately.
func testFollowTable(t *testing.T, expected, actual followTable) {
	t.Helper()

	if len(actual) != len(expected) {
		t.Errorf("unexpected number of the follow table entries; want: %v, got: %v", len(expected), len(actual))
	}
	for ePos, eSet := range expected {
		aSet, ok := actual[ePos]
		if !ok {
			t.Fatalf("follow entry is not found; position: %v, follow: %v", ePos, eSet)
		}
		if aSet.hash() != eSet.hash() {
			// "want" is the expected set and "got" is the actual set.
			t.Fatalf("follow entry of position %v is mismatched; want: %v, got: %v", ePos, eSet, aSet)
		}
	}
}

// testSymbolTable verifies that actual contains every entry of expected in both
// the symPos2Byte and endPos2ID maps, with equal values. All differences are
// reported as non-fatal errors so that one run shows every mismatch.
func testSymbolTable(t *testing.T, expected, actual *symbolTable) {
	t.Helper()

	if len(actual.symPos2Byte) != len(expected.symPos2Byte) {
		t.Errorf("unexpected symPos2Byte entries; want: %v entries, got: %v entries", len(expected.symPos2Byte), len(actual.symPos2Byte))
	}
	for ePos, eByte := range expected.symPos2Byte {
		aByte, ok := actual.symPos2Byte[ePos]
		if !ok {
			t.Errorf("a symbol position entry was not found: %v -> %v", ePos, eByte)
			continue
		}
		if aByte.from != eByte.from || aByte.to != eByte.to {
			t.Errorf("unexpected symbol position entry; want: %v -> %v, got: %v -> %v", ePos, eByte, ePos, aByte)
		}
	}

	if len(actual.endPos2ID) != len(expected.endPos2ID) {
		t.Errorf("unexpected endPos2ID entries; want: %v entries, got: %v entries", len(expected.endPos2ID), len(actual.endPos2ID))
	}
	for ePos, eID := range expected.endPos2ID {
		aID, ok := actual.endPos2ID[ePos]
		if !ok {
			t.Errorf("an end position entry was not found: %v -> %v", ePos, eID)
			continue
		}
		if aID != eID {
			t.Errorf("unexpected end position entry; want: %v -> %v, got: %v -> %v", ePos, eID, ePos, aID)
		}
	}
}