about | summary | refs | log | tree | commit | diff
path: root/compiler/parser
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r-- compiler/parser.go 862
-rw-r--r-- compiler/parser_test.go 1422
2 files changed, 0 insertions, 2284 deletions
diff --git a/compiler/parser.go b/compiler/parser.go
deleted file mode 100644
index ce481e3..0000000
--- a/compiler/parser.go
+++ /dev/null
@@ -1,862 +0,0 @@
-package compiler
-
-import (
- "bytes"
- "encoding/binary"
- "encoding/hex"
- "fmt"
- "io"
- "strings"
-
- "github.com/nihei9/maleeni/spec"
- "github.com/nihei9/maleeni/ucd"
- "github.com/nihei9/maleeni/utf8"
-)
-
-type ParseErrors struct {
- Errors []*ParseError
-}
-
-func (e *ParseErrors) Error() string {
- var b strings.Builder
- fmt.Fprintf(&b, "%v", e.Errors[0])
- for _, err := range e.Errors[1:] {
- fmt.Fprintf(&b, "\n%v", err)
- }
- return b.String()
-}
-
-type ParseError struct {
- ID spec.LexModeKindID
- Pattern []byte
- Cause error
- Details string
-}
-
-func (e *ParseError) Error() string {
- var b strings.Builder
- fmt.Fprintf(&b, "#%v %v: %v", e.ID, string(e.Pattern), e.Cause)
- if e.Details != "" {
- fmt.Fprintf(&b, ": %v", e.Details)
- }
- return b.String()
-}
-
-func raiseSyntaxError(synErr *SyntaxError) {
- panic(synErr)
-}
-
-type symbolTable struct {
- symPos2Byte map[symbolPosition]byteRange
- endPos2ID map[symbolPosition]spec.LexModeKindID
-}
-
-func genSymbolTable(root astNode) *symbolTable {
- symTab := &symbolTable{
- symPos2Byte: map[symbolPosition]byteRange{},
- endPos2ID: map[symbolPosition]spec.LexModeKindID{},
- }
- return genSymTab(symTab, root)
-}
-
-func genSymTab(symTab *symbolTable, node astNode) *symbolTable {
- if node == nil {
- return symTab
- }
-
- switch n := node.(type) {
- case *symbolNode:
- symTab.symPos2Byte[n.pos] = byteRange{
- from: n.from,
- to: n.to,
- }
- case *endMarkerNode:
- symTab.endPos2ID[n.pos] = n.id
- default:
- left, right := node.children()
- genSymTab(symTab, left)
- genSymTab(symTab, right)
- }
- return symTab
-}
-
-type patternEntry struct {
- id spec.LexModeKindID
- pattern []byte
-}
-
-func parse(pats []*patternEntry, fragments map[string][]byte) (astNode, *symbolTable, error) {
- if len(pats) == 0 {
- return nil, nil, fmt.Errorf("parse() needs at least one token entry")
- }
-
- fragmentASTs, err := parseFragments(fragments)
- if err != nil {
- return nil, nil, err
- }
- if fragmentASTs == nil {
- fragmentASTs = map[string]astNode{}
- }
-
- root, err := parseRegexp(pats, fragmentASTs)
- if err != nil {
- return nil, nil, err
- }
-
- return root, genSymbolTable(root), nil
-}
-
-type incompleteFragment struct {
- kind string
- ast astNode
-}
-
-func parseFragments(fragments map[string][]byte) (map[string]astNode, error) {
- if len(fragments) == 0 {
- return nil, nil
- }
- fragmentASTs := map[string]astNode{}
- incompleteFragments := []*incompleteFragment{}
- var perrs []*ParseError
- for kind, pattern := range fragments {
- p := newParser(bytes.NewReader(pattern))
- ast, err := p.parse()
- if err != nil {
- perrs = append(perrs, &ParseError{
- Pattern: pattern,
- Cause: err,
- Details: p.errMsgDetails,
- })
- continue
- }
- if p.incomplete {
- incompleteFragments = append(incompleteFragments, &incompleteFragment{
- kind: kind,
- ast: ast,
- })
- } else {
- fragmentASTs[kind] = ast
- }
- }
- for len(incompleteFragments) > 0 {
- lastIncompCount := len(incompleteFragments)
- remainingFragments := []*incompleteFragment{}
- for _, e := range incompleteFragments {
- remains := applyFragments(e.ast, fragmentASTs)
- if len(remains) > 0 {
- remainingFragments = append(remainingFragments, e)
- } else {
- fragmentASTs[e.kind] = e.ast
- }
- }
- incompleteFragments = remainingFragments
- if len(incompleteFragments) == lastIncompCount {
- for _, e := range incompleteFragments {
- perrs = append(perrs, &ParseError{
- Cause: fmt.Errorf("%v has an undefined fragment or a cycle", e.kind),
- })
- }
- break
- }
- }
- if len(perrs) > 0 {
- return nil, &ParseErrors{
- Errors: perrs,
- }
- }
-
- return fragmentASTs, nil
-}
-
-func parseRegexp(pats []*patternEntry, fragmentASTs map[string]astNode) (astNode, error) {
- symPos := symbolPositionMin
- var root astNode
- var perrs []*ParseError
-
- for _, pat := range pats {
- if pat.id == spec.LexModeKindIDNil {
- continue
- }
-
- p := newParser(bytes.NewReader(pat.pattern))
- ast, err := p.parse()
- if err != nil {
- perrs = append(perrs, &ParseError{
- ID: pat.id,
- Pattern: pat.pattern,
- Cause: err,
- Details: p.errMsgDetails,
- })
- continue
- }
- remains := applyFragments(ast, fragmentASTs)
- if len(remains) > 0 {
- perrs = append(perrs, &ParseError{
- ID: pat.id,
- Pattern: pat.pattern,
- Cause: fmt.Errorf("undefined fragment: %+v", remains),
- })
- continue
- }
- ast = newConcatNode(ast, newEndMarkerNode(pat.id))
- symPos, err = positionSymbols(ast, symPos)
- if err != nil {
- perrs = append(perrs, &ParseError{
- ID: pat.id,
- Pattern: pat.pattern,
- Cause: err,
- Details: p.errMsgDetails,
- })
- continue
- }
- root = genAltNode(root, ast)
- }
- if len(perrs) > 0 {
- return nil, &ParseErrors{
- Errors: perrs,
- }
- }
-
- return root, nil
-}
-
-func applyFragments(ast astNode, fragments map[string]astNode) []string {
- if ast == nil {
- return nil
- }
- n, ok := ast.(*fragmentNode)
- if !ok {
- var remains []string
- left, right := ast.children()
- r := applyFragments(left, fragments)
- if len(r) > 0 {
- remains = append(remains, r...)
- }
- r = applyFragments(right, fragments)
- if len(r) > 0 {
- remains = append(remains, r...)
- }
- return remains
- }
- f, ok := fragments[n.symbol]
- if !ok {
- return []string{n.symbol}
- }
- n.left = copyAST(f)
- return nil
-}
-
-type parser struct {
- lex *lexer
- peekedTok *token
- lastTok *token
- incomplete bool
- errMsgDetails string
-
- // If and only if isContributoryPropertyExposed is true, the parser interprets contributory properties that
- // appear in property expressions.
- //
- // The contributory properties are not exposed, and users cannot use those properties because the parser
- // follows [UAX #44 5.13 Property APIs]. For instance, \p{Other_Alphabetic} is invalid.
- //
- // isContributoryPropertyExposed is set to true when the parser is generated recursively. The parser needs to
- // interpret derived properties internally because the derived properties consist of other properties that
- // may contain the contributory properties.
- //
- // [UAX #44 5.13 Property APIs] says:
- // > The following subtypes of Unicode character properties should generally not be exposed in APIs,
- // > except in limited circumstances. They may not be useful, particularly in public API collections,
- // > and may instead prove misleading to the users of such API collections.
- // > * Contributory properties are not recommended for public APIs.
- // > ...
- // https://unicode.org/reports/tr44/#Property_APIs
- isContributoryPropertyExposed bool
-}
-
-func newParser(src io.Reader) *parser {
- return &parser{
- lex: newLexer(src),
- isContributoryPropertyExposed: false,
- }
-}
-
-func (p *parser) exposeContributoryProperty() {
- p.isContributoryPropertyExposed = true
-}
-
-func (p *parser) parse() (ast astNode, retErr error) {
- defer func() {
- err := recover()
- if err != nil {
- var ok bool
- retErr, ok = err.(error)
- if !ok {
- retErr = fmt.Errorf("%v", err)
- }
- return
- }
- }()
-
- ast, err := p.parseRegexp()
- if err != nil {
- return nil, err
- }
-
- return ast, nil
-}
-
-func (p *parser) parseRegexp() (astNode, error) {
- alt := p.parseAlt()
- if alt == nil {
- if p.consume(tokenKindGroupClose) {
- raiseSyntaxError(synErrGroupNoInitiator)
- }
- raiseSyntaxError(synErrNullPattern)
- }
- if p.consume(tokenKindGroupClose) {
- raiseSyntaxError(synErrGroupNoInitiator)
- }
- p.expect(tokenKindEOF)
- return alt, nil
-}
-
-func (p *parser) parseAlt() astNode {
- left := p.parseConcat()
- if left == nil {
- if p.consume(tokenKindAlt) {
- raiseSyntaxError(synErrAltLackOfOperand)
- }
- return nil
- }
- for {
- if !p.consume(tokenKindAlt) {
- break
- }
- right := p.parseConcat()
- if right == nil {
- raiseSyntaxError(synErrAltLackOfOperand)
- }
- left = newAltNode(left, right)
- }
- return left
-}
-
-func (p *parser) parseConcat() astNode {
- left := p.parseRepeat()
- for {
- right := p.parseRepeat()
- if right == nil {
- break
- }
- left = newConcatNode(left, right)
- }
- return left
-}
-
-func (p *parser) parseRepeat() astNode {
- group := p.parseGroup()
- if group == nil {
- if p.consume(tokenKindRepeat) {
- p.errMsgDetails = "* needs an operand"
- raiseSyntaxError(synErrRepNoTarget)
- }
- if p.consume(tokenKindRepeatOneOrMore) {
- p.errMsgDetails = "+ needs an operand"
- raiseSyntaxError(synErrRepNoTarget)
- }
- if p.consume(tokenKindOption) {
- p.errMsgDetails = "? needs an operand"
- raiseSyntaxError(synErrRepNoTarget)
- }
- return nil
- }
- if p.consume(tokenKindRepeat) {
- return newRepeatNode(group)
- }
- if p.consume(tokenKindRepeatOneOrMore) {
- return newRepeatOneOrMoreNode(group)
- }
- if p.consume(tokenKindOption) {
- return newOptionNode(group)
- }
- return group
-}
-
-func (p *parser) parseGroup() astNode {
- if p.consume(tokenKindGroupOpen) {
- alt := p.parseAlt()
- if alt == nil {
- if p.consume(tokenKindEOF) {
- raiseSyntaxError(synErrGroupUnclosed)
- }
- raiseSyntaxError(synErrGroupNoElem)
- }
- if p.consume(tokenKindEOF) {
- raiseSyntaxError(synErrGroupUnclosed)
- }
- if !p.consume(tokenKindGroupClose) {
- raiseSyntaxError(synErrGroupInvalidForm)
- }
- return alt
- }
- return p.parseSingleChar()
-}
-
-func (p *parser) parseSingleChar() astNode {
- if p.consume(tokenKindAnyChar) {
- return genAnyCharAST()
- }
- if p.consume(tokenKindBExpOpen) {
- left := p.parseBExpElem()
- if left == nil {
- if p.consume(tokenKindEOF) {
- raiseSyntaxError(synErrBExpUnclosed)
- }
- raiseSyntaxError(synErrBExpNoElem)
- }
- for {
- right := p.parseBExpElem()
- if right == nil {
- break
- }
- left = newAltNode(left, right)
- }
- if p.consume(tokenKindEOF) {
- raiseSyntaxError(synErrBExpUnclosed)
- }
- p.expect(tokenKindBExpClose)
- return left
- }
- if p.consume(tokenKindInverseBExpOpen) {
- elem := p.parseBExpElem()
- if elem == nil {
- if p.consume(tokenKindEOF) {
- raiseSyntaxError(synErrBExpUnclosed)
- }
- raiseSyntaxError(synErrBExpNoElem)
- }
- inverse := exclude(elem, genAnyCharAST())
- if inverse == nil {
- panic(fmt.Errorf("a pattern that isn't matching any symbols"))
- }
- for {
- elem := p.parseBExpElem()
- if elem == nil {
- break
- }
- inverse = exclude(elem, inverse)
- if inverse == nil {
- panic(fmt.Errorf("a pattern that isn't matching any symbols"))
- }
- }
- if p.consume(tokenKindEOF) {
- raiseSyntaxError(synErrBExpUnclosed)
- }
- p.expect(tokenKindBExpClose)
- return inverse
- }
- if p.consume(tokenKindCodePointLeader) {
- return p.parseCodePoint()
- }
- if p.consume(tokenKindCharPropLeader) {
- return p.parseCharProp()
- }
- if p.consume(tokenKindFragmentLeader) {
- return p.parseFragment()
- }
- c := p.parseNormalChar()
- if c == nil {
- if p.consume(tokenKindBExpClose) {
- raiseSyntaxError(synErrBExpInvalidForm)
- }
- return nil
- }
- return c
-}
-
-func (p *parser) parseBExpElem() astNode {
- if p.consume(tokenKindCodePointLeader) {
- return p.parseCodePoint()
- }
- if p.consume(tokenKindCharPropLeader) {
- return p.parseCharProp()
- }
- left := p.parseNormalChar()
- if left == nil {
- return nil
- }
- if !p.consume(tokenKindCharRange) {
- return left
- }
- right := p.parseNormalChar()
- if right == nil {
- panic(fmt.Errorf("invalid range expression"))
- }
- from := genByteSeq(left)
- to := genByteSeq(right)
- if !isValidOrder(from, to) {
- p.errMsgDetails = fmt.Sprintf("[%s-%s] ([%v-%v])", string(from), string(to), from, to)
- raiseSyntaxError(synErrRangeInvalidOrder)
- }
- return genRangeAST(left, right)
-}
-
-func (p *parser) parseCodePoint() astNode {
- if !p.consume(tokenKindLBrace) {
- raiseSyntaxError(synErrCPExpInvalidForm)
- }
- if !p.consume(tokenKindCodePoint) {
- raiseSyntaxError(synErrCPExpInvalidForm)
- }
-
- var cp []byte
- {
- // Although hex.DecodeString method can handle only a hex string that has even length,
- // `codePoint` always has even length by the lexical specification.
- b, err := hex.DecodeString(p.lastTok.codePoint)
- if err != nil {
- panic(fmt.Errorf("failed to decode a code point (%v) into a byte slice: %v", p.lastTok.codePoint, err))
- }
- // `b` must be 4 bytes to convert it into a 32-bit integer.
- l := len(b)
- for i := 0; i < 4-l; i++ {
- b = append([]byte{0}, b...)
- }
- n := binary.BigEndian.Uint32(b)
- if n < 0x0000 || n > 0x10FFFF {
- raiseSyntaxError(synErrCPExpOutOfRange)
- }
-
- cp = []byte(string(rune(n)))
- }
-
- var concat astNode
- {
- concat = newSymbolNode(cp[0])
- for _, b := range cp[1:] {
- concat = genConcatNode(
- concat,
- newSymbolNode(b),
- )
- }
- }
-
- if !p.consume(tokenKindRBrace) {
- raiseSyntaxError(synErrCPExpInvalidForm)
- }
-
- return concat
-}
-
-func (p *parser) parseCharProp() astNode {
- if !p.consume(tokenKindLBrace) {
- raiseSyntaxError(synErrCharPropExpInvalidForm)
- }
- var sym1, sym2 string
- if !p.consume(tokenKindCharPropSymbol) {
- raiseSyntaxError(synErrCharPropExpInvalidForm)
- }
- sym1 = p.lastTok.propSymbol
- if p.consume(tokenKindEqual) {
- if !p.consume(tokenKindCharPropSymbol) {
- raiseSyntaxError(synErrCharPropExpInvalidForm)
- }
- sym2 = p.lastTok.propSymbol
- }
-
- var alt astNode
- var propName, propVal string
- if sym2 != "" {
- propName = sym1
- propVal = sym2
- } else {
- propName = ""
- propVal = sym1
- }
- if !p.isContributoryPropertyExposed && ucd.IsContributoryProperty(propName) {
- p.errMsgDetails = propName
- raiseSyntaxError(synErrCharPropUnsupported)
- }
- pat, err := ucd.NormalizeCharacterProperty(propName, propVal)
- if err != nil {
- p.errMsgDetails = fmt.Sprintf("%v", err)
- raiseSyntaxError(synErrCharPropUnsupported)
- }
- if pat != "" {
- p := newParser(bytes.NewReader([]byte(pat)))
- p.exposeContributoryProperty()
- ast, err := p.parse()
- if err != nil {
- panic(err)
- }
- alt = ast
- } else {
- cpRanges, inverse, err := ucd.FindCodePointRanges(propName, propVal)
- if err != nil {
- p.errMsgDetails = fmt.Sprintf("%v", err)
- raiseSyntaxError(synErrCharPropUnsupported)
- }
- if inverse {
- r := cpRanges[0]
- from := genNormalCharAST(r.From)
- to := genNormalCharAST(r.To)
- alt = exclude(genRangeAST(from, to), genAnyCharAST())
- if alt == nil {
- panic(fmt.Errorf("a pattern that isn't matching any symbols"))
- }
- for _, r := range cpRanges[1:] {
- from := genNormalCharAST(r.From)
- to := genNormalCharAST(r.To)
- alt = exclude(genRangeAST(from, to), alt)
- if alt == nil {
- panic(fmt.Errorf("a pattern that isn't matching any symbols"))
- }
- }
- } else {
- for _, r := range cpRanges {
- from := genNormalCharAST(r.From)
- to := genNormalCharAST(r.To)
- alt = genAltNode(
- alt,
- genRangeAST(from, to),
- )
- }
- }
- }
-
- if !p.consume(tokenKindRBrace) {
- raiseSyntaxError(synErrCharPropExpInvalidForm)
- }
-
- return alt
-}
-
-func (p *parser) parseFragment() astNode {
- if !p.consume(tokenKindLBrace) {
- raiseSyntaxError(synErrFragmentExpInvalidForm)
- }
- if !p.consume(tokenKindFragmentSymbol) {
- raiseSyntaxError(synErrFragmentExpInvalidForm)
- }
- sym := p.lastTok.fragmentSymbol
-
- if !p.consume(tokenKindRBrace) {
- raiseSyntaxError(synErrFragmentExpInvalidForm)
- }
-
- p.incomplete = true
-
- return newFragmentNode(sym, nil)
-}
-
-func (p *parser) parseNormalChar() astNode {
- if !p.consume(tokenKindChar) {
- return nil
- }
- return genNormalCharAST(p.lastTok.char)
-}
-
-func genNormalCharAST(c rune) astNode {
- b := []byte(string(c))
- switch len(b) {
- case 1:
- return newSymbolNode(b[0])
- case 2:
- return genConcatNode(
- newSymbolNode(b[0]),
- newSymbolNode(b[1]),
- )
- case 3:
- return genConcatNode(
- newSymbolNode(b[0]),
- newSymbolNode(b[1]),
- newSymbolNode(b[2]),
- )
- default: // is equivalent to case 4
- return genConcatNode(
- newSymbolNode(b[0]),
- newSymbolNode(b[1]),
- newSymbolNode(b[2]),
- newSymbolNode(b[3]),
- )
- }
-}
-
-func exclude(symbol, base astNode) astNode {
- if alt, ok := symbol.(*altNode); ok {
- return exclude(alt.right, exclude(alt.left, base))
- }
-
- switch base.(type) {
- case *altNode:
- left, right := base.children()
- return genAltNode(
- exclude(symbol, left),
- exclude(symbol, right),
- )
- case *concatNode:
- baseSeq := genByteRangeSeq(base)
- symSeq := genByteRangeSeq(symbol)
- excluded := excludeByteRangeSequence(symSeq, baseSeq)
- if len(excluded) <= 0 {
- return nil
- }
- return convertByteRangeSeqsToAST(excluded)
- case *symbolNode:
- baseSeq := genByteRangeSeq(base)
- symSeq := genByteRangeSeq(symbol)
- excluded := excludeByteRangeSequence(symSeq, baseSeq)
- if len(excluded) <= 0 {
- return nil
- }
- return convertByteRangeSeqsToAST(excluded)
- }
- return nil
-}
-
-func convertByteRangeSeqsToAST(seqs [][]byteRange) astNode {
- concats := []astNode{}
- for _, seq := range seqs {
- syms := []astNode{}
- for _, elem := range seq {
- syms = append(syms, newRangeSymbolNode(elem.from, elem.to))
- }
- concats = append(concats, genConcatNode(syms...))
- }
- return genAltNode(concats...)
-}
-
-func genAnyCharAST() astNode {
- return convertCharBlocksToAST(utf8.AllCharBlocks())
-}
-
-func genRangeAST(fromNode, toNode astNode) astNode {
- from := genByteSeq(fromNode)
- to := genByteSeq(toNode)
- blks, err := utf8.GenCharBlocks(from, to)
- if err != nil {
- panic(err)
- }
- return convertCharBlocksToAST(blks)
-}
-
-func convertCharBlocksToAST(blks []*utf8.CharBlock) astNode {
- var alt astNode
- for _, blk := range blks {
- r := make([]astNode, len(blk.From))
- for i := 0; i < len(blk.From); i++ {
- r[i] = newRangeSymbolNode(blk.From[i], blk.To[i])
- }
- alt = genAltNode(alt, genConcatNode(r...))
- }
- return alt
-}
-
-func genByteSeq(node astNode) []byte {
- switch n := node.(type) {
- case *symbolNode:
- return []byte{n.from}
- case *concatNode:
- seq := genByteSeq(n.left)
- seq = append(seq, genByteSeq(n.right)...)
- return seq
- }
- panic(fmt.Errorf("genByteSeq() cannot handle %T: %v", node, node))
-}
-
-func genByteRangeSeq(node astNode) []byteRange {
- switch n := node.(type) {
- case *symbolNode:
- return []byteRange{{from: n.from, to: n.to}}
- case *concatNode:
- seq := genByteRangeSeq(n.left)
- seq = append(seq, genByteRangeSeq(n.right)...)
- return seq
- }
- panic(fmt.Errorf("genByteRangeSeq() cannot handle %T: %v", node, node))
-}
-
// isValidOrder reports whether the byte sequence from does not come after to.
//
// A shorter sequence always precedes a longer one. Sequences of equal length
// are compared byte by byte, and equal sequences are considered in order.
func isValidOrder(from, to []byte) bool {
	if len(from) != len(to) {
		return len(from) < len(to)
	}
	return bytes.Compare(from, to) <= 0
}
-
-func genConcatNode(cs ...astNode) astNode {
- if len(cs) <= 0 {
- return nil
- }
- if len(cs) == 1 {
- return cs[0]
- }
- concat := newConcatNode(cs[0], cs[1])
- for _, c := range cs[2:] {
- concat = newConcatNode(concat, c)
- }
- return concat
-}
-
-func genAltNode(cs ...astNode) astNode {
- nonNilNodes := []astNode{}
- for _, c := range cs {
- if c == nil {
- continue
- }
- nonNilNodes = append(nonNilNodes, c)
- }
- if len(nonNilNodes) <= 0 {
- return nil
- }
- if len(nonNilNodes) == 1 {
- return nonNilNodes[0]
- }
- alt := newAltNode(nonNilNodes[0], nonNilNodes[1])
- for _, c := range nonNilNodes[2:] {
- alt = newAltNode(alt, c)
- }
- return alt
-}
-
-func (p *parser) expect(expected tokenKind) {
- if !p.consume(expected) {
- tok := p.peekedTok
- p.errMsgDetails = fmt.Sprintf("unexpected token; expected: %v, actual: %v", expected, tok.kind)
- raiseSyntaxError(synErrUnexpectedToken)
- }
-}
-
-func (p *parser) consume(expected tokenKind) bool {
- var tok *token
- var err error
- if p.peekedTok != nil {
- tok = p.peekedTok
- p.peekedTok = nil
- } else {
- tok, err = p.lex.next()
- if err != nil {
- p.errMsgDetails = p.lex.errMsgDetails
- panic(err)
- }
- }
- p.lastTok = tok
- if tok.kind == expected {
- return true
- }
- p.peekedTok = tok
- p.lastTok = nil
-
- return false
-}
diff --git a/compiler/parser_test.go b/compiler/parser_test.go
deleted file mode 100644
index b0bc67a..0000000
--- a/compiler/parser_test.go
+++ /dev/null
@@ -1,1422 +0,0 @@
-package compiler
-
-import (
- "fmt"
- "reflect"
- "testing"
-
- "github.com/nihei9/maleeni/spec"
- "github.com/nihei9/maleeni/ucd"
-)
-
-func symPos(n uint16) symbolPosition {
- pos, err := newSymbolPosition(n, false)
- if err != nil {
- panic(err)
- }
- return pos
-}
-
-func endPos(n uint16) symbolPosition {
- pos, err := newSymbolPosition(n, true)
- if err != nil {
- panic(err)
- }
- return pos
-}
-
-func TestParse(t *testing.T) {
- tests := []struct {
- pattern string
- fragments map[string]string
- ast astNode
- syntaxError *SyntaxError
-
- // When an AST is large, as patterns containing a character property expression,
- // this test only checks that the pattern is parsable.
- // The check of the validity of such AST is performed by checking that it can be matched correctly using the driver.
- skipTestAST bool
- }{
- {
- pattern: "a",
- ast: genConcatNode(
- newSymbolNodeWithPos(byte('a'), symPos(1)),
- newEndMarkerNodeWithPos(1, endPos(2)),
- ),
- },
- {
- pattern: "abc",
- ast: genConcatNode(
- genConcatNode(
- newSymbolNodeWithPos(byte('a'), symPos(1)),
- newSymbolNodeWithPos(byte('b'), symPos(2)),
- newSymbolNodeWithPos(byte('c'), symPos(3)),
- ),
- newEndMarkerNodeWithPos(1, endPos(4)),
- ),
- },
- {
- pattern: "a?",
- ast: genConcatNode(
- newOptionNode(
- newSymbolNodeWithPos(byte('a'), symPos(1)),
- ),
- newEndMarkerNodeWithPos(1, endPos(2)),
- ),
- },
- {
- pattern: "[abc]?",
- ast: genConcatNode(
- newOptionNode(
- genAltNode(
- newSymbolNodeWithPos(byte('a'), symPos(1)),
- newSymbolNodeWithPos(byte('b'), symPos(2)),
- newSymbolNodeWithPos(byte('c'), symPos(3)),
- ),
- ),
- newEndMarkerNodeWithPos(1, endPos(4)),
- ),
- },
- {
- pattern: "\\u{3042}?",
- ast: genConcatNode(
- newOptionNode(
- genConcatNode(
- newSymbolNodeWithPos(0xE3, symPos(1)),
- newSymbolNodeWithPos(0x81, symPos(2)),
- newSymbolNodeWithPos(0x82, symPos(3)),
- ),
- ),
- newEndMarkerNodeWithPos(1, endPos(4)),
- ),
- },
- {
- pattern: "\\p{Letter}?",
- skipTestAST: true,
- },
- {
- pattern: "\\f{a2c}?",
- fragments: map[string]string{
- "a2c": "abc",
- },
- ast: genConcatNode(
- newOptionNode(
- newFragmentNode("a2c",
- genConcatNode(
- newSymbolNodeWithPos(byte('a'), symPos(1)),
- newSymbolNodeWithPos(byte('b'), symPos(2)),
- newSymbolNodeWithPos(byte('c'), symPos(3)),
- ),
- ),
- ),
- newEndMarkerNodeWithPos(1, endPos(4)),
- ),
- },
- {
- pattern: "(a)?",
- ast: genConcatNode(
- newOptionNode(
- newSymbolNodeWithPos(byte('a'), symPos(1)),
- ),
- newEndMarkerNodeWithPos(1, endPos(2)),
- ),
- },
- {
- pattern: "((a?)?)?",
- ast: genConcatNode(
- newOptionNode(
- newOptionNode(
- newOptionNode(
- newSymbolNodeWithPos(byte('a'), symPos(1)),
- ),
- ),
- ),
- newEndMarkerNodeWithPos(1, endPos(2)),
- ),
- },
- {
- pattern: "(abc)?",
- ast: genConcatNode(
- newOptionNode(
- genConcatNode(
- newSymbolNodeWithPos(byte('a'), symPos(1)),
- newSymbolNodeWithPos(byte('b'), symPos(2)),
- newSymbolNodeWithPos(byte('c'), symPos(3)),
- ),
- ),
- newEndMarkerNodeWithPos(1, endPos(4)),
- ),
- },
- {
- pattern: "(a|b)?",
- ast: genConcatNode(
- newOptionNode(
- genAltNode(
- newSymbolNodeWithPos(byte('a'), symPos(1)),
- newSymbolNodeWithPos(byte('b'), symPos(2)),
- ),
- ),
- newEndMarkerNodeWithPos(1, endPos(3)),
- ),
- },
- {
- pattern: "?",
- syntaxError: synErrRepNoTarget,
- },
- {
- pattern: "(?)",
- syntaxError: synErrRepNoTarget,
- },
- {
- pattern: "a|?",
- syntaxError: synErrRepNoTarget,
- },
- {
- pattern: "?|b",
- syntaxError: synErrRepNoTarget,
- },
- {
- pattern: "a??",
- syntaxError: synErrRepNoTarget,
- },
- {
- pattern: "a*",
- ast: genConcatNode(
- newRepeatNode(
- newSymbolNodeWithPos(byte('a'), 1),
- ),
- newEndMarkerNodeWithPos(1, endPos(2)),
- ),
- },
- {
- pattern: "[abc]*",
- ast: genConcatNode(
- newRepeatNode(
- genAltNode(
- newSymbolNodeWithPos(byte('a'), 1),
- newSymbolNodeWithPos(byte('b'), 2),
- newSymbolNodeWithPos(byte('c'), 3),
- ),
- ),
- newEndMarkerNodeWithPos(1, endPos(4)),
- ),
- },
- {
- pattern: "\\u{3042}*",
- ast: genConcatNode(
- newRepeatNode(
- genConcatNode(
- newSymbolNodeWithPos(0xE3, symPos(1)),
- newSymbolNodeWithPos(0x81, symPos(2)),
- newSymbolNodeWithPos(0x82, symPos(3)),
- ),
- ),
- newEndMarkerNodeWithPos(1, endPos(4)),
- ),
- },
- {
- pattern: "\\p{Letter}*",
- skipTestAST: true,
- },
- {
- pattern: "\\f{a2c}*",
- fragments: map[string]string{
- "a2c": "abc",
- },
- ast: genConcatNode(
- newRepeatNode(
- newFragmentNode("a2c",
- genConcatNode(
- newSymbolNodeWithPos(byte('a'), symPos(1)),
- newSymbolNodeWithPos(byte('b'), symPos(2)),
- newSymbolNodeWithPos(byte('c'), symPos(3)),
- ),
- ),
- ),
- newEndMarkerNodeWithPos(1, endPos(4)),
- ),
- },
- {
- pattern: "((a*)*)*",
- ast: genConcatNode(
- newRepeatNode(
- newRepeatNode(
- newRepeatNode(
- newSymbolNodeWithPos(byte('a'), 1),
- ),
- ),
- ),
- newEndMarkerNodeWithPos(1, endPos(2)),
- ),
- },
- {
- pattern: "(abc)*",
- ast: genConcatNode(
- newRepeatNode(
- genConcatNode(
- newSymbolNodeWithPos(byte('a'), 1),
- newSymbolNodeWithPos(byte('b'), 2),
- newSymbolNodeWithPos(byte('c'), 3),
- ),
- ),
- newEndMarkerNodeWithPos(1, endPos(4)),
- ),
- },
- {
- pattern: "(a|b)*",
- ast: genConcatNode(
- newRepeatNode(
- genAltNode(
- newSymbolNodeWithPos(byte('a'), 1),
- newSymbolNodeWithPos(byte('b'), 2),
- ),
- ),
- newEndMarkerNodeWithPos(1, endPos(3)),
- ),
- },
- {
- pattern: "*",
- syntaxError: synErrRepNoTarget,
- },
- {
- pattern: "(*)",
- syntaxError: synErrRepNoTarget,
- },
- {
- pattern: "a|*",
- syntaxError: synErrRepNoTarget,
- },
- {
- pattern: "*|b",
- syntaxError: synErrRepNoTarget,
- },
- {
- pattern: "a**",
- syntaxError: synErrRepNoTarget,
- },
- {
- pattern: "a+",
- ast: genConcatNode(
- newSymbolNodeWithPos(byte('a'), symPos(1)),
- newRepeatNode(
- newSymbolNodeWithPos(byte('a'), symPos(2)),
- ),
- newEndMarkerNodeWithPos(1, endPos(3)),
- ),
- },
- {
- pattern: "[abc]+",
- ast: genConcatNode(
- genAltNode(
- newSymbolNodeWithPos(byte('a'), symPos(1)),
- newSymbolNodeWithPos(byte('b'), symPos(2)),
- newSymbolNodeWithPos(byte('c'), symPos(3)),
- ),
- newRepeatNode(
- genAltNode(
- newSymbolNodeWithPos(byte('a'), symPos(4)),
- newSymbolNodeWithPos(byte('b'), symPos(5)),
- newSymbolNodeWithPos(byte('c'), symPos(6)),
- ),
- ),
- newEndMarkerNodeWithPos(1, endPos(7)),
- ),
- },
- {
- pattern: "\\u{3042}+",
- ast: genConcatNode(
- genConcatNode(
- newSymbolNodeWithPos(0xE3, symPos(1)),
- newSymbolNodeWithPos(0x81, symPos(2)),
- newSymbolNodeWithPos(0x82, symPos(3)),
- ),
- newRepeatNode(
- genConcatNode(
- newSymbolNodeWithPos(0xE3, symPos(4)),
- newSymbolNodeWithPos(0x81, symPos(5)),
- newSymbolNodeWithPos(0x82, symPos(6)),
- ),
- ),
- newEndMarkerNodeWithPos(1, endPos(7)),
- ),
- },
- {
- pattern: "\\p{Letter}+",
- skipTestAST: true,
- },
- {
- pattern: "\\f{a2c}+",
- fragments: map[string]string{
- "a2c": "abc",
- },
- ast: genConcatNode(
- newFragmentNode("a2c",
- genConcatNode(
- newSymbolNodeWithPos(byte('a'), symPos(1)),
- newSymbolNodeWithPos(byte('b'), symPos(2)),
- newSymbolNodeWithPos(byte('c'), symPos(3)),
- ),
- ),
- newRepeatNode(
- newFragmentNode("a2c",
- genConcatNode(
- newSymbolNodeWithPos(byte('a'), symPos(4)),
- newSymbolNodeWithPos(byte('b'), symPos(5)),
- newSymbolNodeWithPos(byte('c'), symPos(6)),
- ),
- ),
- ),
- newEndMarkerNodeWithPos(1, endPos(7)),
- ),
- },
- {
- pattern: "((a+)+)+",
- ast: genConcatNode(
- genConcatNode(
- genConcatNode(
- genConcatNode(
- newSymbolNodeWithPos(byte('a'), symPos(1)),
- newRepeatNode(
- newSymbolNodeWithPos(byte('a'), symPos(2)),
- ),
- ),
- newRepeatNode(
- genConcatNode(
- newSymbolNodeWithPos(byte('a'), symPos(3)),
- newRepeatNode(
- newSymbolNodeWithPos(byte('a'), symPos(4)),
- ),
- ),
- ),
- ),
- newRepeatNode(
- genConcatNode(
- genConcatNode(
- newSymbolNodeWithPos(byte('a'), symPos(5)),
- newRepeatNode(
- newSymbolNodeWithPos(byte('a'), symPos(6)),
- ),
- ),
- newRepeatNode(
- genConcatNode(
- newSymbolNodeWithPos(byte('a'), symPos(7)),
- newRepeatNode(
- newSymbolNodeWithPos(byte('a'), symPos(8)),
- ),
- ),
- ),
- ),
- ),
- ),
- newEndMarkerNodeWithPos(1, endPos(9)),
- ),
- },
- {
- pattern: "(abc)+",
- ast: genConcatNode(
- genConcatNode(
- newSymbolNodeWithPos(byte('a'), symPos(1)),
- newSymbolNodeWithPos(byte('b'), symPos(2)),
- newSymbolNodeWithPos(byte('c'), symPos(3)),
- ),
- newRepeatNode(
- genConcatNode(
- newSymbolNodeWithPos(byte('a'), symPos(4)),
- newSymbolNodeWithPos(byte('b'), symPos(5)),
- newSymbolNodeWithPos(byte('c'), symPos(6)),
- ),
- ),
- newEndMarkerNodeWithPos(1, endPos(7)),
- ),
- },
- {
- pattern: "(a|b)+",
- ast: genConcatNode(
- genAltNode(
- newSymbolNodeWithPos(byte('a'), symPos(1)),
- newSymbolNodeWithPos(byte('b'), symPos(2)),
- ),
- newRepeatNode(
- genAltNode(
- newSymbolNodeWithPos(byte('a'), symPos(3)),
- newSymbolNodeWithPos(byte('b'), symPos(4)),
- ),
- ),
- newEndMarkerNodeWithPos(1, endPos(5)),
- ),
- },
- {
- pattern: "+",
- syntaxError: synErrRepNoTarget,
- },
- {
- pattern: "(+)",
- syntaxError: synErrRepNoTarget,
- },
- {
- pattern: "a|+",
- syntaxError: synErrRepNoTarget,
- },
- {
- pattern: "+|b",
- syntaxError: synErrRepNoTarget,
- },
- {
- pattern: "a++",
- syntaxError: synErrRepNoTarget,
- },
- {
- pattern: ".",
- ast: newConcatNode(
- genAltNode(
- newRangeSymbolNodeWithPos(0x00, 0x7f, symPos(1)),
- genConcatNode(
- newRangeSymbolNodeWithPos(0xc2, 0xdf, symPos(2)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(3)),
- ),
- genConcatNode(
- newRangeSymbolNodeWithPos(0xe0, 0xe0, symPos(4)),
- newRangeSymbolNodeWithPos(0xa0, 0xbf, symPos(5)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(6)),
- ),
- genConcatNode(
- newRangeSymbolNodeWithPos(0xe1, 0xec, symPos(7)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(8)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(9)),
- ),
- genConcatNode(
- newRangeSymbolNodeWithPos(0xed, 0xed, symPos(10)),
- newRangeSymbolNodeWithPos(0x80, 0x9f, symPos(11)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(12)),
- ),
- genConcatNode(
- newRangeSymbolNodeWithPos(0xee, 0xef, symPos(13)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(14)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(15)),
- ),
- genConcatNode(
- newRangeSymbolNodeWithPos(0xf0, 0xf0, symPos(16)),
- newRangeSymbolNodeWithPos(0x90, 0xbf, symPos(17)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(18)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(19)),
- ),
- genConcatNode(
- newRangeSymbolNodeWithPos(0xf1, 0xf3, symPos(20)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(21)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(22)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(23)),
- ),
- genConcatNode(
- newRangeSymbolNodeWithPos(0xf4, 0xf4, symPos(24)),
- newRangeSymbolNodeWithPos(0x80, 0x8f, symPos(25)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(26)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(27)),
- ),
- ),
- newEndMarkerNodeWithPos(1, endPos(28)),
- ),
- },
- {
- pattern: "[a]",
- ast: newConcatNode(
- newSymbolNodeWithPos(byte('a'), symPos(1)),
- newEndMarkerNodeWithPos(1, endPos(2)),
- ),
- },
- {
- pattern: "[abc]",
- ast: newConcatNode(
- genAltNode(
- newSymbolNodeWithPos(byte('a'), symPos(1)),
- newSymbolNodeWithPos(byte('b'), symPos(2)),
- newSymbolNodeWithPos(byte('c'), symPos(3)),
- ),
- newEndMarkerNodeWithPos(1, endPos(4)),
- ),
- },
- {
- pattern: "[a-z]",
- ast: newConcatNode(
- newRangeSymbolNodeWithPos(byte('a'), byte('z'), symPos(1)),
- newEndMarkerNodeWithPos(1, endPos(2)),
- ),
- },
- {
- pattern: "[A-Za-z]",
- ast: newConcatNode(
- genAltNode(
- newRangeSymbolNodeWithPos(byte('A'), byte('Z'), symPos(1)),
- newRangeSymbolNodeWithPos(byte('a'), byte('z'), symPos(2)),
- ),
- newEndMarkerNodeWithPos(1, endPos(3)),
- ),
- },
- {
- pattern: "[\\u{004E}]",
- ast: newConcatNode(
- newSymbolNodeWithPos(byte('N'), symPos(1)),
- newEndMarkerNodeWithPos(1, endPos(2)),
- ),
- },
- {
- pattern: "[\\p{Lu}]",
- skipTestAST: true,
- },
- {
- pattern: "a[]",
- syntaxError: synErrBExpNoElem,
- },
- {
- pattern: "[]a",
- syntaxError: synErrBExpNoElem,
- },
- {
- pattern: "[]",
- syntaxError: synErrBExpNoElem,
- },
- {
- pattern: "[^]",
- ast: newConcatNode(
- newSymbolNodeWithPos(byte('^'), symPos(1)),
- newEndMarkerNodeWithPos(1, endPos(2)),
- ),
- },
- {
- pattern: "[",
- syntaxError: synErrBExpUnclosed,
- },
- {
- pattern: "([",
- syntaxError: synErrBExpUnclosed,
- },
- {
- pattern: "[a",
- syntaxError: synErrBExpUnclosed,
- },
- {
- pattern: "([a",
- syntaxError: synErrBExpUnclosed,
- },
- {
- pattern: "[a-",
- syntaxError: synErrBExpUnclosed,
- },
- {
- pattern: "([a-",
- syntaxError: synErrBExpUnclosed,
- },
- {
- pattern: "[^",
- syntaxError: synErrBExpUnclosed,
- },
- {
- pattern: "([^",
- syntaxError: synErrBExpUnclosed,
- },
- {
- pattern: "[^a",
- syntaxError: synErrBExpUnclosed,
- },
- {
- pattern: "([^a",
- syntaxError: synErrBExpUnclosed,
- },
- {
- pattern: "[^a-",
- syntaxError: synErrBExpUnclosed,
- },
- {
- pattern: "([^a-",
- syntaxError: synErrBExpUnclosed,
- },
- {
- pattern: "]",
- ast: newConcatNode(
- newSymbolNodeWithPos(byte(']'), symPos(1)),
- newEndMarkerNodeWithPos(1, endPos(2)),
- ),
- },
- {
- pattern: "(]",
- syntaxError: synErrGroupUnclosed,
- },
- {
- pattern: "a]",
- ast: newConcatNode(
- genConcatNode(
- newSymbolNodeWithPos(byte('a'), symPos(1)),
- newSymbolNodeWithPos(byte(']'), symPos(2)),
- ),
- newEndMarkerNodeWithPos(1, endPos(3)),
- ),
- },
- {
- pattern: "(a]",
- syntaxError: synErrGroupUnclosed,
- },
- {
- pattern: "([)",
- syntaxError: synErrBExpUnclosed,
- },
- {
- pattern: "([a)",
- syntaxError: synErrBExpUnclosed,
- },
- {
- pattern: "[a-]",
- ast: newConcatNode(
- genAltNode(
- newSymbolNodeWithPos(byte('a'), symPos(1)),
- newSymbolNodeWithPos(byte('-'), symPos(2)),
- ),
- newEndMarkerNodeWithPos(1, endPos(3)),
- ),
- },
- {
- pattern: "[^a-]",
- ast: newConcatNode(
- genAltNode(
- newRangeSymbolNodeWithPos(0x00, byte(44), symPos(1)),
- newRangeSymbolNodeWithPos(byte(46), byte(96), symPos(2)),
- newRangeSymbolNodeWithPos(byte(98), 0x7f, symPos(3)),
- genConcatNode(
- newRangeSymbolNodeWithPos(0xc2, 0xdf, symPos(4)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(5)),
- ),
- genConcatNode(
- newRangeSymbolNodeWithPos(0xe0, 0xe0, symPos(6)),
- newRangeSymbolNodeWithPos(0xa0, 0xbf, symPos(7)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(8)),
- ),
- genConcatNode(
- newRangeSymbolNodeWithPos(0xe1, 0xec, symPos(9)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(10)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(11)),
- ),
- genConcatNode(
- newRangeSymbolNodeWithPos(0xed, 0xed, symPos(12)),
- newRangeSymbolNodeWithPos(0x80, 0x9f, symPos(13)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(14)),
- ),
- genConcatNode(
- newRangeSymbolNodeWithPos(0xee, 0xef, symPos(15)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(16)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(17)),
- ),
- genConcatNode(
- newRangeSymbolNodeWithPos(0xf0, 0xf0, symPos(18)),
- newRangeSymbolNodeWithPos(0x90, 0xbf, symPos(19)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(20)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(21)),
- ),
- genConcatNode(
- newRangeSymbolNodeWithPos(0xf1, 0xf3, symPos(22)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(23)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(24)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(25)),
- ),
- genConcatNode(
- newRangeSymbolNodeWithPos(0xf4, 0xf4, symPos(26)),
- newRangeSymbolNodeWithPos(0x80, 0x8f, symPos(27)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(28)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(29)),
- ),
- ),
- newEndMarkerNodeWithPos(1, endPos(30)),
- ),
- },
- {
- pattern: "[-z]",
- ast: newConcatNode(
- genAltNode(
- newSymbolNodeWithPos(byte('-'), symPos(1)),
- newSymbolNodeWithPos(byte('z'), symPos(2)),
- ),
- newEndMarkerNodeWithPos(1, endPos(3)),
- ),
- },
- {
- pattern: "[^-z]",
- ast: newConcatNode(
- genAltNode(
- newRangeSymbolNodeWithPos(0x00, byte(44), symPos(1)),
- genAltNode(
- newRangeSymbolNodeWithPos(byte(46), byte(121), symPos(2)),
- newRangeSymbolNodeWithPos(byte(123), 0x7f, symPos(3)),
- ),
- genConcatNode(
- newRangeSymbolNodeWithPos(0xc2, 0xdf, symPos(4)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(5)),
- ),
- genConcatNode(
- newRangeSymbolNodeWithPos(0xe0, 0xe0, symPos(6)),
- newRangeSymbolNodeWithPos(0xa0, 0xbf, symPos(7)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(8)),
- ),
- genConcatNode(
- newRangeSymbolNodeWithPos(0xe1, 0xec, symPos(9)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(10)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(11)),
- ),
- genConcatNode(
- newRangeSymbolNodeWithPos(0xed, 0xed, symPos(12)),
- newRangeSymbolNodeWithPos(0x80, 0x9f, symPos(13)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(14)),
- ),
- genConcatNode(
- newRangeSymbolNodeWithPos(0xee, 0xef, symPos(15)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(16)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(17)),
- ),
- genConcatNode(
- newRangeSymbolNodeWithPos(0xf0, 0xf0, symPos(18)),
- newRangeSymbolNodeWithPos(0x90, 0xbf, symPos(19)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(20)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(21)),
- ),
- genConcatNode(
- newRangeSymbolNodeWithPos(0xf1, 0xf3, symPos(22)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(23)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(24)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(25)),
- ),
- genConcatNode(
- newRangeSymbolNodeWithPos(0xf4, 0xf4, symPos(26)),
- newRangeSymbolNodeWithPos(0x80, 0x8f, symPos(27)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(28)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(29)),
- ),
- ),
- newEndMarkerNodeWithPos(1, endPos(30)),
- ),
- },
- {
- pattern: "[-]",
- ast: newConcatNode(
- newSymbolNodeWithPos(byte('-'), symPos(1)),
- newEndMarkerNodeWithPos(1, endPos(2)),
- ),
- },
- {
- pattern: "[^-]",
- ast: newConcatNode(
- genAltNode(
- newRangeSymbolNodeWithPos(0x00, byte(44), symPos(1)),
- newRangeSymbolNodeWithPos(byte(46), 0x7f, symPos(2)),
- genConcatNode(
- newRangeSymbolNodeWithPos(0xc2, 0xdf, symPos(3)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(4)),
- ),
- genConcatNode(
- newRangeSymbolNodeWithPos(0xe0, 0xe0, symPos(5)),
- newRangeSymbolNodeWithPos(0xa0, 0xbf, symPos(6)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(7)),
- ),
- genConcatNode(
- newRangeSymbolNodeWithPos(0xe1, 0xec, symPos(8)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(9)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(10)),
- ),
- genConcatNode(
- newRangeSymbolNodeWithPos(0xed, 0xed, symPos(11)),
- newRangeSymbolNodeWithPos(0x80, 0x9f, symPos(12)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(13)),
- ),
- genConcatNode(
- newRangeSymbolNodeWithPos(0xee, 0xef, symPos(14)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(15)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(16)),
- ),
- genConcatNode(
- newRangeSymbolNodeWithPos(0xf0, 0xf0, symPos(17)),
- newRangeSymbolNodeWithPos(0x90, 0xbf, symPos(18)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(19)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(20)),
- ),
- genConcatNode(
- newRangeSymbolNodeWithPos(0xf1, 0xf3, symPos(21)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(22)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(23)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(24)),
- ),
- genConcatNode(
- newRangeSymbolNodeWithPos(0xf4, 0xf4, symPos(25)),
- newRangeSymbolNodeWithPos(0x80, 0x8f, symPos(26)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(27)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(28)),
- ),
- ),
- newEndMarkerNodeWithPos(1, endPos(29)),
- ),
- },
- {
- pattern: "\\u{006E}",
- ast: genConcatNode(
- newSymbolNodeWithPos(0x6E, symPos(1)),
- newEndMarkerNodeWithPos(1, endPos(2)),
- ),
- },
- {
- pattern: "\\u{03BD}",
- ast: genConcatNode(
- genConcatNode(
- newSymbolNodeWithPos(0xCE, symPos(1)),
- newSymbolNodeWithPos(0xBD, symPos(2)),
- ),
- newEndMarkerNodeWithPos(1, endPos(3)),
- ),
- },
- {
- pattern: "\\u{306B}",
- ast: genConcatNode(
- genConcatNode(
- newSymbolNodeWithPos(0xE3, symPos(1)),
- newSymbolNodeWithPos(0x81, symPos(2)),
- newSymbolNodeWithPos(0xAB, symPos(3)),
- ),
- newEndMarkerNodeWithPos(1, endPos(4)),
- ),
- },
- {
- pattern: "\\u{01F638}",
- ast: genConcatNode(
- genConcatNode(
- newSymbolNodeWithPos(0xF0, symPos(1)),
- newSymbolNodeWithPos(0x9F, symPos(2)),
- newSymbolNodeWithPos(0x98, symPos(3)),
- newSymbolNodeWithPos(0xB8, symPos(4)),
- ),
- newEndMarkerNodeWithPos(1, endPos(5)),
- ),
- },
- {
- pattern: "\\u{0000}",
- ast: genConcatNode(
- newSymbolNodeWithPos(0x00, symPos(1)),
- newEndMarkerNodeWithPos(1, endPos(2)),
- ),
- },
- {
- pattern: "\\u{10FFFF}",
- ast: genConcatNode(
- genConcatNode(
- newSymbolNodeWithPos(0xF4, symPos(1)),
- newSymbolNodeWithPos(0x8F, symPos(2)),
- newSymbolNodeWithPos(0xBF, symPos(3)),
- newSymbolNodeWithPos(0xBF, symPos(4)),
- ),
- newEndMarkerNodeWithPos(1, endPos(5)),
- ),
- },
- {
- pattern: "\\u{110000}",
- syntaxError: synErrCPExpOutOfRange,
- },
- {
- pattern: "\\u",
- syntaxError: synErrCPExpInvalidForm,
- },
- {
- pattern: "\\u{",
- syntaxError: synErrCPExpInvalidForm,
- },
- {
- pattern: "\\u{03BD",
- syntaxError: synErrCPExpInvalidForm,
- },
- {
- pattern: "\\u{}",
- syntaxError: synErrCPExpInvalidForm,
- },
- {
- pattern: "\\p{Letter}",
- skipTestAST: true,
- },
- {
- pattern: "\\p{General_Category=Letter}",
- skipTestAST: true,
- },
- {
- pattern: "\\p{ Letter }",
- skipTestAST: true,
- },
- {
- pattern: "\\p{ General_Category = Letter }",
- skipTestAST: true,
- },
- {
- pattern: "\\p",
- syntaxError: synErrCharPropExpInvalidForm,
- },
- {
- pattern: "\\p{",
- syntaxError: synErrCharPropExpInvalidForm,
- },
- {
- pattern: "\\p{Letter",
- syntaxError: synErrCharPropExpInvalidForm,
- },
- {
- pattern: "\\p{General_Category=}",
- syntaxError: synErrCharPropExpInvalidForm,
- },
- {
- pattern: "\\p{General_Category= }",
- syntaxError: synErrCharPropInvalidSymbol,
- },
- {
- pattern: "\\p{=Letter}",
- syntaxError: synErrCharPropExpInvalidForm,
- },
- {
- pattern: "\\p{ =Letter}",
- syntaxError: synErrCharPropInvalidSymbol,
- },
- {
- pattern: "\\p{=}",
- syntaxError: synErrCharPropExpInvalidForm,
- },
- {
- pattern: "\\p{}",
- syntaxError: synErrCharPropExpInvalidForm,
- },
- {
- pattern: "\\f{a2c}",
- fragments: map[string]string{
- "a2c": "abc",
- },
- ast: genConcatNode(
- newFragmentNode("a2c",
- genConcatNode(
- newSymbolNodeWithPos(byte('a'), symPos(1)),
- newSymbolNodeWithPos(byte('b'), symPos(2)),
- newSymbolNodeWithPos(byte('c'), symPos(3)),
- ),
- ),
- newEndMarkerNodeWithPos(1, endPos(4)),
- ),
- },
- {
- pattern: "\\f{ a2c }",
- fragments: map[string]string{
- "a2c": "abc",
- },
- ast: genConcatNode(
- newFragmentNode("a2c",
- genConcatNode(
- newSymbolNodeWithPos(byte('a'), symPos(1)),
- newSymbolNodeWithPos(byte('b'), symPos(2)),
- newSymbolNodeWithPos(byte('c'), symPos(3)),
- ),
- ),
- newEndMarkerNodeWithPos(1, endPos(4)),
- ),
- },
- {
- pattern: "\\f",
- syntaxError: synErrFragmentExpInvalidForm,
- },
- {
- pattern: "\\f{",
- syntaxError: synErrFragmentExpInvalidForm,
- },
- {
- pattern: "\\f{a2c",
- fragments: map[string]string{
- "a2c": "abc",
- },
- syntaxError: synErrFragmentExpInvalidForm,
- },
- {
- pattern: "(a)",
- ast: newConcatNode(
- newSymbolNodeWithPos(byte('a'), symPos(1)),
- newEndMarkerNodeWithPos(1, endPos(2)),
- ),
- },
- {
- pattern: "(((a)))",
- ast: newConcatNode(
- newSymbolNodeWithPos(byte('a'), symPos(1)),
- newEndMarkerNodeWithPos(1, endPos(2)),
- ),
- },
- {
- pattern: "a()",
- syntaxError: synErrGroupNoElem,
- },
- {
- pattern: "()a",
- syntaxError: synErrGroupNoElem,
- },
- {
- pattern: "()",
- syntaxError: synErrGroupNoElem,
- },
- {
- pattern: "(",
- syntaxError: synErrGroupUnclosed,
- },
- {
- pattern: "a(",
- syntaxError: synErrGroupUnclosed,
- },
- {
- pattern: "(a",
- syntaxError: synErrGroupUnclosed,
- },
- {
- pattern: "((",
- syntaxError: synErrGroupUnclosed,
- },
- {
- pattern: "((a)",
- syntaxError: synErrGroupUnclosed,
- },
- {
- pattern: ")",
- syntaxError: synErrGroupNoInitiator,
- },
- {
- pattern: "a)",
- syntaxError: synErrGroupNoInitiator,
- },
- {
- pattern: ")a",
- syntaxError: synErrGroupNoInitiator,
- },
- {
- pattern: "))",
- syntaxError: synErrGroupNoInitiator,
- },
- {
- pattern: "(a))",
- syntaxError: synErrGroupNoInitiator,
- },
- {
- pattern: "Mulder|Scully",
- ast: newConcatNode(
- genAltNode(
- genConcatNode(
- newSymbolNodeWithPos(byte('M'), symPos(1)),
- newSymbolNodeWithPos(byte('u'), symPos(2)),
- newSymbolNodeWithPos(byte('l'), symPos(3)),
- newSymbolNodeWithPos(byte('d'), symPos(4)),
- newSymbolNodeWithPos(byte('e'), symPos(5)),
- newSymbolNodeWithPos(byte('r'), symPos(6)),
- ),
- genConcatNode(
- newSymbolNodeWithPos(byte('S'), symPos(7)),
- newSymbolNodeWithPos(byte('c'), symPos(8)),
- newSymbolNodeWithPos(byte('u'), symPos(9)),
- newSymbolNodeWithPos(byte('l'), symPos(10)),
- newSymbolNodeWithPos(byte('l'), symPos(11)),
- newSymbolNodeWithPos(byte('y'), symPos(12)),
- ),
- ),
- newEndMarkerNodeWithPos(1, endPos(13)),
- ),
- },
- {
- pattern: "Langly|Frohike|Byers",
- ast: newConcatNode(
- genAltNode(
- genConcatNode(
- newSymbolNodeWithPos(byte('L'), symPos(1)),
- newSymbolNodeWithPos(byte('a'), symPos(2)),
- newSymbolNodeWithPos(byte('n'), symPos(3)),
- newSymbolNodeWithPos(byte('g'), symPos(4)),
- newSymbolNodeWithPos(byte('l'), symPos(5)),
- newSymbolNodeWithPos(byte('y'), symPos(6)),
- ),
- genConcatNode(
- newSymbolNodeWithPos(byte('F'), symPos(7)),
- newSymbolNodeWithPos(byte('r'), symPos(8)),
- newSymbolNodeWithPos(byte('o'), symPos(9)),
- newSymbolNodeWithPos(byte('h'), symPos(10)),
- newSymbolNodeWithPos(byte('i'), symPos(11)),
- newSymbolNodeWithPos(byte('k'), symPos(12)),
- newSymbolNodeWithPos(byte('e'), symPos(13)),
- ),
- genConcatNode(
- newSymbolNodeWithPos(byte('B'), symPos(14)),
- newSymbolNodeWithPos(byte('y'), symPos(15)),
- newSymbolNodeWithPos(byte('e'), symPos(16)),
- newSymbolNodeWithPos(byte('r'), symPos(17)),
- newSymbolNodeWithPos(byte('s'), symPos(18)),
- ),
- ),
- newEndMarkerNodeWithPos(1, endPos(19)),
- ),
- },
- {
- pattern: "|",
- syntaxError: synErrAltLackOfOperand,
- },
- {
- pattern: "||",
- syntaxError: synErrAltLackOfOperand,
- },
- {
- pattern: "Mulder|",
- syntaxError: synErrAltLackOfOperand,
- },
- {
- pattern: "|Scully",
- syntaxError: synErrAltLackOfOperand,
- },
- {
- pattern: "Langly|Frohike|",
- syntaxError: synErrAltLackOfOperand,
- },
- {
- pattern: "Langly||Byers",
- syntaxError: synErrAltLackOfOperand,
- },
- {
- pattern: "|Frohike|Byers",
- syntaxError: synErrAltLackOfOperand,
- },
- {
- pattern: "|Frohike|",
- syntaxError: synErrAltLackOfOperand,
- },
- {
- pattern: "Fox(|)Mulder",
- syntaxError: synErrAltLackOfOperand,
- },
- {
- pattern: "(Fox|)Mulder",
- syntaxError: synErrAltLackOfOperand,
- },
- {
- pattern: "Fox(|Mulder)",
- syntaxError: synErrAltLackOfOperand,
- },
- }
- for i, tt := range tests {
- t.Run(fmt.Sprintf("#%v %v", i, tt.pattern), func(t *testing.T) {
- fragments := map[string][]byte{}
- for kind, pattern := range tt.fragments {
- fragments[kind] = []byte(pattern)
- }
- ast, _, err := parse([]*patternEntry{
- {
- id: spec.LexModeKindIDMin,
- pattern: []byte(tt.pattern),
- },
- }, fragments)
- if tt.syntaxError != nil {
- // printAST(os.Stdout, ast, "", "", false)
- if err == nil {
- t.Fatalf("expected syntax error; got: nil")
- }
- parseErrs, ok := err.(*ParseErrors)
- if !ok {
- t.Fatalf("expected ParseErrors; got: %v (type: %T)", err, err)
- }
- parseErr := parseErrs.Errors[0].Cause
- synErr, ok := parseErr.(*SyntaxError)
- if !ok {
- t.Fatalf("expected SyntaxError; got: %v (type: %T)", parseErr, parseErr)
- }
- if synErr != tt.syntaxError {
- t.Fatalf("unexpected syntax error; want: %v, got: %v", tt.syntaxError, synErr)
- }
- if ast != nil {
- t.Fatalf("ast is not nil")
- }
- } else {
- if err != nil {
- t.Fatal(err)
- }
- if ast == nil {
- t.Fatal("AST is nil")
- }
- // printAST(os.Stdout, ast, "", "", false)
- if !tt.skipTestAST {
- testAST(t, tt.ast, ast)
- }
- }
- })
- }
-}
-
-// TestParse_ContributoryPropertyIsNotExposed checks, for every property
-// returned by ucd.ContributoryProperties, that parsing the pattern
-// `\p{<property>=yes}` fails with synErrCharPropUnsupported and that no AST
-// is returned alongside the error.
-func TestParse_ContributoryPropertyIsNotExposed(t *testing.T) {
-	for _, cProp := range ucd.ContributoryProperties() {
-		t.Run(fmt.Sprintf("%v", cProp), func(t *testing.T) {
-			entries := []*patternEntry{
-				{
-					id:      spec.LexModeKindIDMin,
-					pattern: []byte(fmt.Sprintf(`\p{%v=yes}`, cProp)),
-				},
-			}
-			root, _, err := parse(entries, nil)
-			if err == nil {
-				t.Fatalf("expected syntax error; got: nil")
-			}
-
-			// The error must be a *ParseErrors whose first entry is caused
-			// by a *SyntaxError.
-			errs, isParseErrs := err.(*ParseErrors)
-			if !isParseErrs {
-				t.Fatalf("expected ParseErrors; got: %v (type: %T)", err, err)
-			}
-			cause := errs.Errors[0].Cause
-			got, isSynErr := cause.(*SyntaxError)
-			if !isSynErr {
-				t.Fatalf("expected SyntaxError; got: %v (type: %T)", cause, cause)
-			}
-
-			if want := synErrCharPropUnsupported; got != want {
-				t.Fatalf("unexpected syntax error; want: %v, got: %v", want, got)
-			}
-			if root != nil {
-				t.Fatalf("ast is not nil")
-			}
-		})
-	}
-}
-
-// TestParse_FollowAndSymbolTable parses the single pattern "(a|b)*abb" and
-// verifies three results in turn: the AST returned by parse, the follow table
-// that genFollowTable builds from that AST, and the symbol table returned by
-// parse.
-func TestParse_FollowAndSymbolTable(t *testing.T) {
-	entries := []*patternEntry{
-		{
-			id:      spec.LexModeKindIDMin,
-			pattern: []byte("(a|b)*abb"),
-		},
-	}
-	root, symTab, err := parse(entries, nil)
-	if err != nil {
-		t.Fatal(err)
-	}
-	if root == nil {
-		t.Fatal("root of AST is nil")
-	}
-	// printAST(os.Stdout, root, "", "", false)
-
-	// Expected AST: symbol positions 1-5 in order of appearance, followed by
-	// the end marker at position 6.
-	wantAST := genConcatNode(
-		newRepeatNode(
-			newAltNode(
-				newSymbolNodeWithPos(byte('a'), symPos(1)),
-				newSymbolNodeWithPos(byte('b'), symPos(2)),
-			),
-		),
-		newSymbolNodeWithPos(byte('a'), symPos(3)),
-		newSymbolNodeWithPos(byte('b'), symPos(4)),
-		newSymbolNodeWithPos(byte('b'), symPos(5)),
-		newEndMarkerNodeWithPos(1, endPos(6)),
-	)
-	testAST(t, wantAST, root)
-
-	// Expected follow table for the AST above.
-	gotFollow := genFollowTable(root)
-	if gotFollow == nil {
-		t.Fatal("follow table is nil")
-	}
-	wantFollow := followTable{
-		1: newSymbolPositionSet().add(symPos(1)).add(symPos(2)).add(symPos(3)),
-		2: newSymbolPositionSet().add(symPos(1)).add(symPos(2)).add(symPos(3)),
-		3: newSymbolPositionSet().add(symPos(4)),
-		4: newSymbolPositionSet().add(symPos(5)),
-		5: newSymbolPositionSet().add(endPos(6)),
-	}
-	testFollowTable(t, wantFollow, gotFollow)
-
-	// Expected symbol table: every symbol position maps to a byte range that
-	// covers exactly one byte value.
-	single := func(b byte) byteRange {
-		return byteRange{
-			from: b,
-			to:   b,
-		}
-	}
-	wantSymTab := &symbolTable{
-		symPos2Byte: map[symbolPosition]byteRange{
-			symPos(1): single(byte('a')),
-			symPos(2): single(byte('b')),
-			symPos(3): single(byte('a')),
-			symPos(4): single(byte('b')),
-			symPos(5): single(byte('b')),
-		},
-		endPos2ID: map[symbolPosition]spec.LexModeKindID{
-			endPos(6): 1,
-		},
-	}
-	testSymbolTable(t, wantSymTab, symTab)
-}
-
-func testAST(t *testing.T, expected, actual astNode) {
- t.Helper()
-
- aTy := reflect.TypeOf(actual)
- eTy := reflect.TypeOf(expected)
- if eTy != aTy {
- t.Fatalf("AST node type is mismatched; want: %v, got: %v", eTy, aTy)
- }
-
- if actual == nil {
- return
- }
-
- switch e := expected.(type) {
- case *symbolNode:
- a := actual.(*symbolNode)
- if a.pos != e.pos || a.from != e.from || a.to != e.to {
- t.Fatalf("unexpected node; want: %+v, got: %+v", e, a)
- }
- case *endMarkerNode:
- a := actual.(*endMarkerNode)
- if a.pos != e.pos {
- t.Fatalf("symbol position is mismatched; want: %v, got: %v", e.pos, a.pos)
- }
- }
- eLeft, eRight := expected.children()
- aLeft, aRight := actual.children()
- testAST(t, eLeft, aLeft)
- testAST(t, eRight, aRight)
-}
-
-// testFollowTable compares the follow table actual against expected.
-//
-// A difference in the number of entries is reported with t.Errorf so the
-// per-entry checks still run. For each position in expected, the test is
-// aborted if actual has no entry for it or if the hash() values of the two
-// position sets differ.
-func testFollowTable(t *testing.T, expected, actual followTable) {
-	// Attribute failures to the calling test, as the sibling helpers do.
-	t.Helper()
-
-	if len(actual) != len(expected) {
-		t.Errorf("unexpected number of the follow table entries; want: %v, got: %v", len(expected), len(actual))
-	}
-	for ePos, eSet := range expected {
-		aSet, ok := actual[ePos]
-		if !ok {
-			t.Fatalf("follow entry is not found; position: %v, follow: %v", ePos, eSet)
-		}
-		if aSet.hash() != eSet.hash() {
-			// "want" is the expected set and "got" is the actual one.
-			t.Fatalf("follow entry of position %v is mismatched; want: %v, got: %v", ePos, eSet, aSet)
-		}
-	}
-}
-
-// testSymbolTable compares the symbol table actual against expected and
-// reports every difference with t.Errorf, so one run lists all mismatches.
-//
-// For both symPos2Byte and endPos2ID it checks that the number of entries
-// matches and that each entry in expected is present in actual with an equal
-// value (byte ranges are compared by from and to).
-func testSymbolTable(t *testing.T, expected, actual *symbolTable) {
-	t.Helper()
-
-	if wantLen, gotLen := len(expected.symPos2Byte), len(actual.symPos2Byte); gotLen != wantLen {
-		t.Errorf("unexpected symPos2Byte entries; want: %v entries, got: %v entries", wantLen, gotLen)
-	}
-	for pos, wantRange := range expected.symPos2Byte {
-		if gotRange, found := actual.symPos2Byte[pos]; !found {
-			t.Errorf("a symbol position entry was not found: %v -> %v", pos, wantRange)
-		} else if gotRange.from != wantRange.from || gotRange.to != wantRange.to {
-			t.Errorf("unexpected symbol position entry; want: %v -> %v, got: %v -> %v", pos, wantRange, pos, gotRange)
-		}
-	}
-
-	if wantLen, gotLen := len(expected.endPos2ID), len(actual.endPos2ID); gotLen != wantLen {
-		t.Errorf("unexpected endPos2ID entries; want: %v entries, got: %v entries", wantLen, gotLen)
-	}
-	for pos, wantID := range expected.endPos2ID {
-		if gotID, found := actual.endPos2ID[pos]; !found {
-			t.Errorf("an end position entry was not found: %v -> %v", pos, wantID)
-		} else if gotID != wantID {
-			t.Errorf("unexpected end position entry; want: %v -> %v, got: %v -> %v", pos, wantID, pos, gotID)
-		}
-	}
-}