diff options
Diffstat (limited to '')
-rw-r--r-- | cmd/maleeni/compile.go | 23 | ||||
-rw-r--r-- | compiler/ast.go | 469 | ||||
-rw-r--r-- | compiler/ast_test.go | 198 | ||||
-rw-r--r-- | compiler/byte.go | 281 | ||||
-rw-r--r-- | compiler/byte_test.go | 263 | ||||
-rw-r--r-- | compiler/compiler.go | 183 | ||||
-rw-r--r-- | compiler/compiler_test.go | 4 | ||||
-rw-r--r-- | compiler/dfa.go | 139 | ||||
-rw-r--r-- | compiler/dfa_test.go | 117 | ||||
-rw-r--r-- | compiler/lexer.go | 578 | ||||
-rw-r--r-- | compiler/lexer_test.go | 514 | ||||
-rw-r--r-- | compiler/parser.go | 862 | ||||
-rw-r--r-- | compiler/parser_test.go | 1422 | ||||
-rw-r--r-- | compiler/symbol_position.go | 198 | ||||
-rw-r--r-- | compiler/syntax_error.go | 45 | ||||
-rw-r--r-- | compiler/test_util_test.go | 21 | ||||
-rw-r--r-- | driver/lexer_test.go | 7 | ||||
-rw-r--r-- | utf8/utf8.go | 9 |
18 files changed, 181 insertions, 5152 deletions
diff --git a/cmd/maleeni/compile.go b/cmd/maleeni/compile.go index c7a8b59..21e5058 100644 --- a/cmd/maleeni/compile.go +++ b/cmd/maleeni/compile.go @@ -3,8 +3,10 @@ package main import ( "encoding/json" "fmt" + "io" "io/ioutil" "os" + "strings" "github.com/nihei9/maleeni/compiler" "github.com/nihei9/maleeni/spec" @@ -38,8 +40,17 @@ func runCompile(cmd *cobra.Command, args []string) (retErr error) { return fmt.Errorf("Cannot read a lexical specification: %w", err) } - clspec, err := compiler.Compile(lspec, compiler.CompressionLevel(*compileFlags.compLv)) + clspec, err, cerrs := compiler.Compile(lspec, compiler.CompressionLevel(*compileFlags.compLv)) if err != nil { + if len(cerrs) > 0 { + var b strings.Builder + writeCompileError(&b, cerrs[0]) + for _, cerr := range cerrs[1:] { + fmt.Fprintf(&b, "\n") + writeCompileError(&b, cerr) + } + return fmt.Errorf(b.String()) + } return err } err = writeCompiledLexSpec(clspec, *compileFlags.output) @@ -50,6 +61,16 @@ func runCompile(cmd *cobra.Command, args []string) (retErr error) { return nil } +func writeCompileError(w io.Writer, cerr *compiler.CompileError) { + if cerr.Fragment { + fmt.Fprintf(w, "fragment ") + } + fmt.Fprintf(w, "%v: %v", cerr.Kind, cerr.Cause) + if cerr.Detail != "" { + fmt.Fprintf(w, ": %v", cerr.Detail) + } +} + func readLexSpec(path string) (*spec.LexSpec, error) { r := os.Stdin if path != "" { diff --git a/compiler/ast.go b/compiler/ast.go deleted file mode 100644 index 046662e..0000000 --- a/compiler/ast.go +++ /dev/null @@ -1,469 +0,0 @@ -package compiler - -import ( - "fmt" - "io" - - "github.com/nihei9/maleeni/spec" -) - -type astNode interface { - fmt.Stringer - children() (astNode, astNode) - nullable() bool - first() *symbolPositionSet - last() *symbolPositionSet -} - -var ( - _ astNode = &symbolNode{} - _ astNode = &endMarkerNode{} - _ astNode = &concatNode{} - _ astNode = &altNode{} - _ astNode = &optionNode{} - _ astNode = &fragmentNode{} -) - -type symbolNode struct { - byteRange - pos symbolPosition - firstMemo *symbolPositionSet - lastMemo *symbolPositionSet -} - -func newSymbolNode(value byte) *symbolNode { - return &symbolNode{ - byteRange: byteRange{ - from: value, - to: value, - }, - pos: symbolPositionNil, - } -} - -func newRangeSymbolNode(from, to byte) *symbolNode { - return &symbolNode{ - byteRange: byteRange{ - from: from, - to: to, - }, - pos: symbolPositionNil, - } -} - -func (n *symbolNode) String() string { - return fmt.Sprintf("{type: symbol, value: %v - %v, pos: %v}", n.from, n.to, n.pos) -} - -func (n *symbolNode) children() (astNode, astNode) { - return nil, nil -} - -func (n *symbolNode) nullable() bool { - return false -} - -func (n *symbolNode) first() *symbolPositionSet { - if n.firstMemo == nil { - n.firstMemo = newSymbolPositionSet() - n.firstMemo.add(n.pos) - } - return n.firstMemo -} - -func (n *symbolNode) last() *symbolPositionSet { - if n.lastMemo == nil { - n.lastMemo = newSymbolPositionSet() - n.lastMemo.add(n.pos) - } - return n.lastMemo -} - -type endMarkerNode struct { - id spec.LexModeKindID - pos symbolPosition - firstMemo *symbolPositionSet - lastMemo *symbolPositionSet -} - -func newEndMarkerNode(id spec.LexModeKindID) *endMarkerNode { - return &endMarkerNode{ - id: id, - pos: symbolPositionNil, - } -} - -func (n *endMarkerNode) String() string { - return fmt.Sprintf("{type: end, pos: %v}", n.pos) -} - -func (n *endMarkerNode) children() (astNode, astNode) { - return nil, nil -} - -func (n *endMarkerNode) nullable() bool { - return false -} - -func (n *endMarkerNode) first() *symbolPositionSet { - if n.firstMemo == nil { - n.firstMemo = newSymbolPositionSet() - n.firstMemo.add(n.pos) - } - return n.firstMemo -} - -func (n *endMarkerNode) last() *symbolPositionSet { - if n.lastMemo == nil { - n.lastMemo = newSymbolPositionSet() - n.lastMemo.add(n.pos) - } - return n.lastMemo -} - -type concatNode struct { - left astNode - right astNode - firstMemo *symbolPositionSet - lastMemo *symbolPositionSet -} - -func newConcatNode(left, right astNode) *concatNode { - return &concatNode{ - left: left, - right: right, - } -} - -func (n *concatNode) String() string { - return fmt.Sprintf("{type: concat}") -} - -func (n *concatNode) children() (astNode, astNode) { - return n.left, n.right -} - -func (n *concatNode) nullable() bool { - return n.left.nullable() && n.right.nullable() -} - -func (n *concatNode) first() *symbolPositionSet { - if n.firstMemo == nil { - n.firstMemo = newSymbolPositionSet() - n.firstMemo.merge(n.left.first()) - if n.left.nullable() { - n.firstMemo.merge(n.right.first()) - } - n.firstMemo.sortAndRemoveDuplicates() - } - return n.firstMemo -} - -func (n *concatNode) last() *symbolPositionSet { - if n.lastMemo == nil { - n.lastMemo = newSymbolPositionSet() - n.lastMemo.merge(n.right.last()) - if n.right.nullable() { - n.lastMemo.merge(n.left.last()) - } - n.lastMemo.sortAndRemoveDuplicates() - } - return n.lastMemo -} - -type altNode struct { - left astNode - right astNode - firstMemo *symbolPositionSet - lastMemo *symbolPositionSet -} - -func newAltNode(left, right astNode) *altNode { - return &altNode{ - left: left, - right: right, - } -} - -func (n *altNode) String() string { - return fmt.Sprintf("{type: alt}") -} - -func (n *altNode) children() (astNode, astNode) { - return n.left, n.right -} - -func (n *altNode) nullable() bool { - return n.left.nullable() || n.right.nullable() -} - -func (n *altNode) first() *symbolPositionSet { - if n.firstMemo == nil { - n.firstMemo = newSymbolPositionSet() - n.firstMemo.merge(n.left.first()) - n.firstMemo.merge(n.right.first()) - n.firstMemo.sortAndRemoveDuplicates() - } - return n.firstMemo -} - -func (n *altNode) last() *symbolPositionSet { - if n.lastMemo == nil { - n.lastMemo = newSymbolPositionSet() - n.lastMemo.merge(n.left.last()) - n.lastMemo.merge(n.right.last()) - n.lastMemo.sortAndRemoveDuplicates() - } - return n.lastMemo -} - -type repeatNode struct { - left astNode - firstMemo *symbolPositionSet - lastMemo *symbolPositionSet -} - -func newRepeatNode(left astNode) *repeatNode { - return &repeatNode{ - left: left, - } -} - -func (n *repeatNode) String() string { - return fmt.Sprintf("{type: repeat}") -} - -func (n *repeatNode) children() (astNode, astNode) { - return n.left, nil -} - -func (n *repeatNode) nullable() bool { - return true -} - -func (n *repeatNode) first() *symbolPositionSet { - if n.firstMemo == nil { - n.firstMemo = newSymbolPositionSet() - n.firstMemo.merge(n.left.first()) - n.firstMemo.sortAndRemoveDuplicates() - } - return n.firstMemo -} - -func (n *repeatNode) last() *symbolPositionSet { - if n.lastMemo == nil { - n.lastMemo = newSymbolPositionSet() - n.lastMemo.merge(n.left.last()) - n.lastMemo.sortAndRemoveDuplicates() - } - return n.lastMemo -} - -func newRepeatOneOrMoreNode(left astNode) *concatNode { - return newConcatNode( - left, - &repeatNode{ - left: copyAST(left), - }) -} - -type optionNode struct { - left astNode - firstMemo *symbolPositionSet - lastMemo *symbolPositionSet -} - -func newOptionNode(left astNode) *optionNode { - return &optionNode{ - left: left, - } -} - -func (n *optionNode) String() string { - return fmt.Sprintf("{type: option}") -} - -func (n *optionNode) children() (astNode, astNode) { - return n.left, nil -} - -func (n *optionNode) nullable() bool { - return true -} - -func (n *optionNode) first() *symbolPositionSet { - if n.firstMemo == nil { - n.firstMemo = newSymbolPositionSet() - n.firstMemo.merge(n.left.first()) - n.firstMemo.sortAndRemoveDuplicates() - } - return n.firstMemo -} - -func (n *optionNode) last() *symbolPositionSet { - if n.lastMemo == nil { - n.lastMemo = newSymbolPositionSet() - n.lastMemo.merge(n.left.last()) - n.lastMemo.sortAndRemoveDuplicates() - } - return n.lastMemo -} - -type fragmentNode struct { - symbol string - left astNode -} - -func newFragmentNode(symbol string, ast astNode) *fragmentNode { - return &fragmentNode{ - symbol: symbol, - left: ast, - } -} - -func (n *fragmentNode) String() string { - return fmt.Sprintf("{type: fragment, symbol: %v}", n.symbol) -} - -func (n *fragmentNode) children() (astNode, astNode) { - return n.left, nil -} - -func (n *fragmentNode) nullable() bool { - return n.left.nullable() -} - -func (n *fragmentNode) first() *symbolPositionSet { - return n.left.first() -} - -func (n *fragmentNode) last() *symbolPositionSet { - return n.left.last() -} - -func copyAST(src astNode) astNode { - switch n := src.(type) { - case *symbolNode: - return newRangeSymbolNode(n.from, n.to) - case *endMarkerNode: - return newEndMarkerNode(n.id) - case *concatNode: - return newConcatNode(copyAST(n.left), copyAST(n.right)) - case *altNode: - return newAltNode(copyAST(n.left), copyAST(n.right)) - case *repeatNode: - return newRepeatNode(copyAST(n.left)) - case *optionNode: - return newOptionNode(copyAST(n.left)) - case *fragmentNode: - if n.left == nil { - return newFragmentNode(n.symbol, nil) - } - return newFragmentNode(n.symbol, copyAST(n.left)) - } - panic(fmt.Errorf("copyAST cannot handle %T type; AST: %v", src, src)) -} - -type followTable map[symbolPosition]*symbolPositionSet - -func genFollowTable(root astNode) followTable { - follow := followTable{} - calcFollow(follow, root) - return follow -} - -func calcFollow(follow followTable, ast astNode) { - if ast == nil { - return - } - left, right := ast.children() - calcFollow(follow, left) - calcFollow(follow, right) - switch n := ast.(type) { - case *concatNode: - l, r := n.children() - for _, p := range l.last().set() { - if _, ok := follow[p]; !ok { - follow[p] = newSymbolPositionSet() - } - follow[p].merge(r.first()) - } - case *repeatNode: - for _, p := range n.last().set() { - if _, ok := follow[p]; !ok { - follow[p] = newSymbolPositionSet() - } - follow[p].merge(n.first()) - } - } -} - -func positionSymbols(node astNode, n uint16) (uint16, error) { - if node == nil { - return n, nil - } - - l, r := node.children() - p := n - p, err := positionSymbols(l, p) - if err != nil { - return p, err - } - p, err = positionSymbols(r, p) - if err != nil { - return p, err - } - switch n := node.(type) { - case *symbolNode: - n.pos, err = newSymbolPosition(p, false) - if err != nil { - return p, err - } - p++ - case *endMarkerNode: - n.pos, err = newSymbolPosition(p, true) - if err != nil { - return p, err - } - p++ - } - node.first() - node.last() - return p, nil -} - -func printAST(w io.Writer, ast astNode, ruledLine string, childRuledLinePrefix string, withAttrs bool) { - if ast == nil { - return - } - fmt.Fprintf(w, ruledLine) - fmt.Fprintf(w, "node: %v", ast) - if withAttrs { - fmt.Fprintf(w, ", nullable: %v, first: %v, last: %v", ast.nullable(), ast.first(), ast.last()) - } - fmt.Fprintf(w, "\n") - left, right := ast.children() - children := []astNode{} - if left != nil { - children = append(children, left) - } - if right != nil { - children = append(children, right) - } - num := len(children) - for i, child := range children { - line := "└─ " - if num > 1 { - if i == 0 { - line = "├─ " - } else if i < num-1 { - line = "│ " - } - } - prefix := "│ " - if i >= num-1 { - prefix = " " - } - printAST(w, child, childRuledLinePrefix+line, childRuledLinePrefix+prefix, withAttrs) - } -} diff --git a/compiler/ast_test.go b/compiler/ast_test.go deleted file mode 100644 index 2a77dfc..0000000 --- a/compiler/ast_test.go +++ /dev/null @@ -1,198 +0,0 @@ -package compiler - -import ( - "fmt" - "testing" -) - -func TestNewSymbolPosition(t *testing.T) { - tests := []struct { - n uint16 - endMark bool - err bool - }{ - { - n: 0, - endMark: false, - err: true, - }, - { - n: 0, - endMark: true, - err: true, - }, - { - n: symbolPositionMin - 1, - endMark: false, - err: true, - }, - { - n: symbolPositionMin - 1, - endMark: true, - err: true, - }, - { - n: symbolPositionMin, - endMark: false, - }, - { - n: symbolPositionMin, - endMark: true, - }, - { - n: symbolPositionMax, - endMark: false, - }, - { - n: symbolPositionMax, - endMark: true, - }, - { - n: symbolPositionMax + 1, - endMark: false, - err: true, - }, - { - n: symbolPositionMax + 1, - endMark: true, - err: true, - }, - } - for i, tt := range tests { - t.Run(fmt.Sprintf("#%v n: %v, endMark: %v", i, tt.n, tt.endMark), func(t *testing.T) { - pos, err := newSymbolPosition(tt.n, tt.endMark) - if tt.err { - if err == nil { - t.Fatal("err is nil") - } - return - } - if err != nil { - t.Fatal(err) - } - n, endMark := pos.describe() - if n != tt.n || endMark != tt.endMark { - t.Errorf("unexpected symbol position; want: n: %v, endMark: %v, got: n: %v, endMark: %v", tt.n, tt.endMark, n, endMark) - } - }) - } -} - -func TestASTNode(t *testing.T) { - tests := []struct { - root astNode - nullable bool - first *symbolPositionSet - last *symbolPositionSet - }{ - { - root: newSymbolNodeWithPos(0, 1), - nullable: false, - first: newSymbolPositionSet().add(1), - last: newSymbolPositionSet().add(1), - }, - { - root: newEndMarkerNodeWithPos(1, 1), - nullable: false, - first: newSymbolPositionSet().add(1), - last: newSymbolPositionSet().add(1), - }, - { - root: newConcatNode( - newSymbolNodeWithPos(0, 1), - newSymbolNodeWithPos(0, 2), - ), - nullable: false, - first: newSymbolPositionSet().add(1), - last: newSymbolPositionSet().add(2), - }, - { - root: newConcatNode( - newRepeatNode(newSymbolNodeWithPos(0, 1)), - newSymbolNodeWithPos(0, 2), - ), - nullable: false, - first: newSymbolPositionSet().add(1).add(2), - last: newSymbolPositionSet().add(2), - }, - { - root: newConcatNode( - newSymbolNodeWithPos(0, 1), - newRepeatNode(newSymbolNodeWithPos(0, 2)), - ), - nullable: false, - first: newSymbolPositionSet().add(1), - last: newSymbolPositionSet().add(1).add(2), - }, - { - root: newConcatNode( - newRepeatNode(newSymbolNodeWithPos(0, 1)), - newRepeatNode(newSymbolNodeWithPos(0, 2)), - ), - nullable: true, - first: newSymbolPositionSet().add(1).add(2), - last: newSymbolPositionSet().add(1).add(2), - }, - { - root: newAltNode( - newSymbolNodeWithPos(0, 1), - newSymbolNodeWithPos(0, 2), - ), - nullable: false, - first: newSymbolPositionSet().add(1).add(2), - last: newSymbolPositionSet().add(1).add(2), - }, - { - root: newAltNode( - newRepeatNode(newSymbolNodeWithPos(0, 1)), - newSymbolNodeWithPos(0, 2), - ), - nullable: true, - first: newSymbolPositionSet().add(1).add(2), - last: newSymbolPositionSet().add(1).add(2), - }, - { - root: newAltNode( - newSymbolNodeWithPos(0, 1), - newRepeatNode(newSymbolNodeWithPos(0, 2)), - ), - nullable: true, - first: newSymbolPositionSet().add(1).add(2), - last: newSymbolPositionSet().add(1).add(2), - }, - { - root: newAltNode( - newRepeatNode(newSymbolNodeWithPos(0, 1)), - newRepeatNode(newSymbolNodeWithPos(0, 2)), - ), - nullable: true, - first: newSymbolPositionSet().add(1).add(2), - last: newSymbolPositionSet().add(1).add(2), - }, - { - root: newRepeatNode(newSymbolNodeWithPos(0, 1)), - nullable: true, - first: newSymbolPositionSet().add(1), - last: newSymbolPositionSet().add(1), - }, - { - root: newOptionNode(newSymbolNodeWithPos(0, 1)), - nullable: true, - first: newSymbolPositionSet().add(1), - last: newSymbolPositionSet().add(1), - }, - } - for i, tt := range tests { - t.Run(fmt.Sprintf("#%v", i), func(t *testing.T) { - if tt.root.nullable() != tt.nullable { - t.Errorf("unexpected nullable attribute; want: %v, got: %v", tt.nullable, tt.root.nullable()) - } - if tt.first.hash() != tt.root.first().hash() { - t.Errorf("unexpected first positions attribute; want: %v, got: %v", tt.first, tt.root.first()) - } - if tt.last.hash() != tt.root.last().hash() { - t.Errorf("unexpected last positions attribute; want: %v, got: %v", tt.last, tt.root.last()) - } - }) - } -} diff --git a/compiler/byte.go b/compiler/byte.go deleted file mode 100644 index 464887e..0000000 --- a/compiler/byte.go +++ /dev/null @@ -1,281 +0,0 @@ -package compiler - -import "fmt" - -type byteRange struct { - from byte - to byte -} - -func newByteRange(from, to byte) byteRange { - return byteRange{ - from: from, - to: to, - } -} - -func (r byteRange) String() string { - return fmt.Sprintf("%X-%X (%v-%v)", r.from, r.to, r.from, r.to) -} - -func newByteRangeSequence(from, to []byte) ([]byteRange, error) { - if len(from) != len(to) { - return nil, fmt.Errorf("length of `from` and `to` are mismatched; from: %+v, to: %+v", from, to) - } - seq := []byteRange{} - for i := 0; i < len(from); i++ { - seq = append(seq, newByteRange(from[i], to[i])) - } - return seq, nil -} - -func excludeByteRangeSequence(b1, b2 []byteRange) [][]byteRange { - if len(b1) != len(b2) { - // no overlapping - return [][]byteRange{b2} - } - switch len(b1) { - case 1: - return exclude1Byte(b1, b2) - case 2: - return exclude2Byte(b1, b2) - case 3: - return exclude3Byte(b1, b2) - case 4: - return exclude4Byte(b1, b2) - } - panic(fmt.Errorf("excludeByteRangeSequence can only handle sequences up to 4 bytes in size; b1: %+v, b2: %+v", b1, b2)) -} - -func exclude1Byte(b1, b2 []byteRange) [][]byteRange { - r01, r02, overlapping0 := excludeByteRange(&b1[0], &b2[0]) - if !overlapping0 { - // no overlapping - return [][]byteRange{b2} - } - result := [][]byteRange{} - { - if r01 != nil { - result = append(result, []byteRange{ - newByteRange(r01.from, r01.to), - }) - } - if r02 != nil { - result = append(result, []byteRange{ - newByteRange(r02.from, r02.to), - }) - } - } - return result -} - -func exclude2Byte(b1, b2 []byteRange) [][]byteRange { - r01, r02, overlapping0 := excludeByteRange(&b1[0], &b2[0]) - r11, r12, overlapping1 := excludeByteRange(&b1[1], &b2[1]) - if !overlapping0 || !overlapping1 { - // no overlapping - return [][]byteRange{b2} - } - result := [][]byteRange{} - { - if r11 != nil { - result = append(result, []byteRange{ - newByteRange(b1[0].from, b1[0].to), - newByteRange(r11.from, r11.to), - }) - } - if r12 != nil { - result = append(result, []byteRange{ - newByteRange(b1[0].from, b1[0].to), - newByteRange(r12.from, r12.to), - }) - } - if r01 != nil { - result = append(result, []byteRange{ - newByteRange(r01.from, r01.to), - newByteRange(b2[1].from, b2[1].to), - }) - } - if r02 != nil { - result = append(result, []byteRange{ - newByteRange(r02.from, r02.to), - newByteRange(b2[1].from, b2[1].to), - }) - } - } - return result -} - -func exclude3Byte(b1, b2 []byteRange) [][]byteRange { - r01, r02, overlapping0 := excludeByteRange(&b1[0], &b2[0]) - r11, r12, overlapping1 := excludeByteRange(&b1[1], &b2[1]) - r21, r22, overlapping2 := excludeByteRange(&b1[2], &b2[2]) - if !overlapping0 || !overlapping1 || !overlapping2 { - // no overlapping - return [][]byteRange{b2} - } - result := [][]byteRange{} - { - if r21 != nil { - result = append(result, []byteRange{ - newByteRange(b1[0].from, b1[0].to), - newByteRange(b1[1].from, b1[1].to), - newByteRange(r21.from, r21.to), - }) - } - if r22 != nil { - result = append(result, []byteRange{ - newByteRange(b1[0].from, b1[0].to), - newByteRange(b1[1].from, b1[1].to), - newByteRange(r22.from, r22.to), - }) - } - if r11 != nil { - result = append(result, []byteRange{ - newByteRange(b1[0].from, b1[0].to), - newByteRange(r11.from, r11.to), - newByteRange(b2[2].from, b2[2].to), - }) - } - if r12 != nil { - result = append(result, []byteRange{ - newByteRange(b1[0].from, b1[0].to), - newByteRange(r12.from, r12.to), - newByteRange(b2[2].from, b2[2].to), - }) - } - if r01 != nil { - result = append(result, []byteRange{ - newByteRange(r01.from, r01.to), - newByteRange(b2[1].from, b2[1].to), - newByteRange(b2[2].from, b2[2].to), - }) - } - if r02 != nil { - result = append(result, []byteRange{ - newByteRange(r02.from, r02.to), - newByteRange(b2[1].from, b2[1].to), - newByteRange(b2[2].from, b2[2].to), - }) - } - } - return result -} - -func exclude4Byte(b1, b2 []byteRange) [][]byteRange { - r01, r02, overlapping0 := excludeByteRange(&b1[0], &b2[0]) - r11, r12, overlapping1 := excludeByteRange(&b1[1], &b2[1]) - r21, r22, overlapping2 := excludeByteRange(&b1[2], &b2[2]) - r31, r32, overlapping3 := excludeByteRange(&b1[3], &b2[3]) - if !overlapping0 || !overlapping1 || !overlapping2 || !overlapping3 { - // no overlapping - return [][]byteRange{b2} - } - result := [][]byteRange{} - { - if r31 != nil { - result = append(result, []byteRange{ - newByteRange(b1[0].from, b1[0].to), - newByteRange(b1[1].from, b1[1].to), - newByteRange(b1[2].from, b1[2].to), - newByteRange(r31.from, r31.to), - }) - } - if r32 != nil { - result = append(result, []byteRange{ - newByteRange(b1[0].from, b1[0].to), - newByteRange(b1[1].from, b1[1].to), - newByteRange(b1[2].from, b1[2].to), - newByteRange(r32.from, r32.to), - }) - } - if r21 != nil { - result = append(result, []byteRange{ - newByteRange(b1[0].from, b1[0].to), - newByteRange(b1[1].from, b1[1].to), - newByteRange(r21.from, r21.to), - newByteRange(b2[3].from, b2[3].to), - }) - } - if r22 != nil { - result = append(result, []byteRange{ - newByteRange(b1[0].from, b1[0].to), - newByteRange(b1[1].from, b1[1].to), - newByteRange(r22.from, r22.to), - newByteRange(b2[3].from, b2[3].to), - }) - } - if r11 != nil { - result = append(result, []byteRange{ - newByteRange(b1[0].from, b1[0].to), - newByteRange(r11.from, r11.to), - newByteRange(b2[2].from, b2[2].to), - newByteRange(b2[3].from, b2[3].to), - }) - } - if r12 != nil { - result = append(result, []byteRange{ - newByteRange(b1[0].from, b1[0].to), - newByteRange(r12.from, r12.to), - newByteRange(b2[2].from, b2[2].to), - newByteRange(b2[3].from, b2[3].to), - }) - } - if r01 != nil { - result = append(result, []byteRange{ - newByteRange(r01.from, r01.to), - newByteRange(b2[1].from, b2[1].to), - newByteRange(b2[2].from, b2[2].to), - newByteRange(b2[3].from, b2[3].to), - }) - } - if r02 != nil { - result = append(result, []byteRange{ - newByteRange(r02.from, r02.to), - newByteRange(b2[1].from, b2[1].to), - newByteRange(b2[2].from, b2[2].to), - newByteRange(b2[3].from, b2[3].to), - }) - } - } - return result -} - -// excludeByteRange excludes `r1` from `r2`. -func excludeByteRange(r1, r2 *byteRange) (*byteRange, *byteRange, bool) { - if r1.from <= r2.from { - if r1.to < r2.from { - // no overlapping - return nil, nil, false - } - if r1.to < r2.to { - // The beginning of `r2` overlaps with `r1`. - return &byteRange{ - from: r1.to + 1, - to: r2.to, - }, nil, true - } - // `symbol` overlaps with `base` entirely. - return nil, nil, true - } - if r1.from > r2.to { - // no overlapping - return nil, nil, false - } - if r1.to >= r2.to { - // The end of `r2` overlaps with `r1`. - return &byteRange{ - from: r2.from, - to: r1.from - 1, - }, nil, true - } - // `r2` overlaps with `r1` entirely. - return &byteRange{ - from: r2.from, - to: r1.from - 1, - }, - &byteRange{ - from: r1.to + 1, - to: r2.to, - }, true -} diff --git a/compiler/byte_test.go b/compiler/byte_test.go deleted file mode 100644 index c298501..0000000 --- a/compiler/byte_test.go +++ /dev/null @@ -1,263 +0,0 @@ -package compiler - -import "testing" - -func TestExcludeByteRangeSequence(t *testing.T) { - tests := []struct { - b1 []byteRange - b2 []byteRange - excluded [][]byteRange - }{ - // 1 Byte - { - b1: newByteRangeSeq([]byte{0x00}, []byte{0x00}), - b2: newByteRangeSeq([]byte{0x00}, []byte{0xff}), - excluded: [][]byteRange{ - newByteRangeSeq([]byte{0x01}, []byte{0xff}), - }, - }, - { - b1: newByteRangeSeq([]byte{0xff}, []byte{0xff}), - b2: newByteRangeSeq([]byte{0x00}, []byte{0xff}), - excluded: [][]byteRange{ - newByteRangeSeq([]byte{0x00}, []byte{0xfe}), - }, - }, - { - b1: newByteRangeSeq([]byte{0xc0}, []byte{0xcf}), - b2: newByteRangeSeq([]byte{0x00}, []byte{0xff}), - excluded: [][]byteRange{ - newByteRangeSeq([]byte{0x00}, []byte{0xbf}), - newByteRangeSeq([]byte{0xd0}, []byte{0xff}), - }, - }, - { - b1: newByteRangeSeq([]byte{0x00}, []byte{0xff}), - b2: newByteRangeSeq([]byte{0xc0}, []byte{0xcf}), - excluded: nil, - }, - { - b1: newByteRangeSeq([]byte{0x00}, []byte{0xff}), - b2: newByteRangeSeq([]byte{0x00}, []byte{0xff}), - excluded: nil, - }, - - // 2 Byte - { - b1: newByteRangeSeq([]byte{0x00, 0x00}, []byte{0x00, 0x00}), - b2: newByteRangeSeq([]byte{0x00, 0x00}, []byte{0xff, 0xff}), - excluded: [][]byteRange{ - newByteRangeSeq([]byte{0x00, 0x01}, []byte{0x00, 0xff}), - newByteRangeSeq([]byte{0x01, 0x00}, []byte{0xff, 0xff}), - }, - }, - { - b1: newByteRangeSeq([]byte{0xff, 0xff}, []byte{0xff, 0xff}), - b2: newByteRangeSeq([]byte{0x00, 0x00}, []byte{0xff, 0xff}), - excluded: [][]byteRange{ - newByteRangeSeq([]byte{0x00, 0x00}, []byte{0xfe, 0xff}), - newByteRangeSeq([]byte{0xff, 0x00}, []byte{0xff, 0xfe}), - }, - }, - { - b1: newByteRangeSeq([]byte{0xc0, 0xc0}, []byte{0xc0, 0xc0}), - b2: newByteRangeSeq([]byte{0x00, 0x00}, []byte{0xff, 0xff}), - excluded: [][]byteRange{ - newByteRangeSeq([]byte{0x00, 0x00}, []byte{0xbf, 0xff}), - newByteRangeSeq([]byte{0xc0, 0x00}, []byte{0xc0, 0xbf}), - newByteRangeSeq([]byte{0xc0, 0xc1}, []byte{0xc0, 0xff}), - newByteRangeSeq([]byte{0xc1, 0x00}, []byte{0xff, 0xff}), - }, - }, - { - b1: newByteRangeSeq([]byte{0xc0, 0x00}, []byte{0xc0, 0xff}), - b2: newByteRangeSeq([]byte{0x00, 0x00}, []byte{0xff, 0xff}), - excluded: [][]byteRange{ - newByteRangeSeq([]byte{0x00, 0x00}, []byte{0xbf, 0xff}), - newByteRangeSeq([]byte{0xc1, 0x00}, []byte{0xff, 0xff}), - }, - }, - { - b1: newByteRangeSeq([]byte{0x00, 0x00}, []byte{0xff, 0xff}), - b2: newByteRangeSeq([]byte{0xc0, 0xc0}, []byte{0xc0, 0xc0}), - excluded: nil, - }, - { - b1: newByteRangeSeq([]byte{0x00, 0x00}, []byte{0xff, 0xff}), - b2: newByteRangeSeq([]byte{0x00, 0x00}, []byte{0xff, 0xff}), - excluded: nil, - }, - - // 3 Byte - { - b1: newByteRangeSeq([]byte{0x00, 0x00, 0x00}, []byte{0x00, 0x00, 0x00}), - b2: newByteRangeSeq([]byte{0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff}), - excluded: [][]byteRange{ - newByteRangeSeq([]byte{0x00, 0x00, 0x01}, []byte{0x00, 0x00, 0xff}), - newByteRangeSeq([]byte{0x00, 0x01, 0x00}, []byte{0x00, 0xff, 0xff}), - newByteRangeSeq([]byte{0x01, 0x00, 0x00}, []byte{0xff, 0xff, 0xff}), - }, - }, - { - b1: newByteRangeSeq([]byte{0xff, 0xff, 0xff}, []byte{0xff, 0xff, 0xff}), - b2: newByteRangeSeq([]byte{0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff}), - excluded: [][]byteRange{ - newByteRangeSeq([]byte{0x00, 0x00, 0x00}, []byte{0xfe, 0xff, 0xff}), - newByteRangeSeq([]byte{0xff, 0x00, 0x00}, []byte{0xff, 0xfe, 0xff}), - newByteRangeSeq([]byte{0xff, 0xff, 0x00}, []byte{0xff, 0xff, 0xfe}), - }, - }, - { - b1: newByteRangeSeq([]byte{0xc0, 0xc0, 0xc0}, []byte{0xc0, 0xc0, 0xc0}), - b2: newByteRangeSeq([]byte{0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff}), - excluded: [][]byteRange{ - newByteRangeSeq([]byte{0x00, 0x00, 0x00}, []byte{0xbf, 0xff, 0xff}), - newByteRangeSeq([]byte{0xc0, 0x00, 0x00}, []byte{0xc0, 0xbf, 0xff}), - newByteRangeSeq([]byte{0xc0, 0xc0, 0x00}, []byte{0xc0, 0xc0, 0xbf}), - newByteRangeSeq([]byte{0xc0, 0xc0, 0xc1}, []byte{0xc0, 0xc0, 0xff}), - newByteRangeSeq([]byte{0xc0, 0xc1, 0x00}, []byte{0xc0, 0xff, 0xff}), - newByteRangeSeq([]byte{0xc1, 0x00, 0x00}, []byte{0xff, 0xff, 0xff}), - }, - }, - { - b1: newByteRangeSeq([]byte{0xc0, 0xc0, 0x00}, []byte{0xc0, 0xc0, 0xff}), - b2: newByteRangeSeq([]byte{0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff}), - excluded: [][]byteRange{ - newByteRangeSeq([]byte{0x00, 0x00, 0x00}, []byte{0xbf, 0xff, 0xff}), - newByteRangeSeq([]byte{0xc0, 0x00, 0x00}, []byte{0xc0, 0xbf, 0xff}), - newByteRangeSeq([]byte{0xc0, 0xc1, 0x00}, []byte{0xc0, 0xff, 0xff}), - newByteRangeSeq([]byte{0xc1, 0x00, 0x00}, []byte{0xff, 0xff, 0xff}), - }, - }, - { - b1: newByteRangeSeq([]byte{0xc0, 0x00, 0x00}, []byte{0xc0, 0xff, 0xff}), - b2: newByteRangeSeq([]byte{0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff}), - excluded: [][]byteRange{ - newByteRangeSeq([]byte{0x00, 0x00, 0x00}, []byte{0xbf, 0xff, 0xff}), - newByteRangeSeq([]byte{0xc1, 0x00, 0x00}, []byte{0xff, 0xff, 0xff}), - }, - }, - { - b1: newByteRangeSeq([]byte{0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff}), - b2: newByteRangeSeq([]byte{0xff, 0xff, 0xff}, []byte{0xff, 0xff, 0xff}), - excluded: nil, - }, - { - b1: newByteRangeSeq([]byte{0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff}), - b2: newByteRangeSeq([]byte{0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff}), - excluded: nil, - }, - - // 4 Byte - { - b1: newByteRangeSeq([]byte{0x00, 0x00, 0x00, 0x00}, []byte{0x00, 0x00, 0x00, 0x00}), - b2: newByteRangeSeq([]byte{0x00, 0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff, 0xff}), - excluded: [][]byteRange{ - newByteRangeSeq([]byte{0x00, 0x00, 0x00, 0x01}, []byte{0x00, 0x00, 0x00, 0xff}), - newByteRangeSeq([]byte{0x00, 0x00, 0x01, 0x00}, []byte{0x00, 0x00, 0xff, 0xff}), - newByteRangeSeq([]byte{0x00, 0x01, 0x00, 0x00}, []byte{0x00, 0xff, 0xff, 0xff}), - newByteRangeSeq([]byte{0x01, 0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff, 0xff}), - }, - }, - { - b1: newByteRangeSeq([]byte{0xff, 0xff, 0xff, 0xff}, []byte{0xff, 0xff, 0xff, 0xff}), - b2: newByteRangeSeq([]byte{0x00, 0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff, 0xff}), - excluded: [][]byteRange{ - newByteRangeSeq([]byte{0x00, 0x00, 0x00, 0x00}, []byte{0xfe, 0xff, 0xff, 0xff}), - newByteRangeSeq([]byte{0xff, 0x00, 0x00, 0x00}, []byte{0xff, 0xfe, 0xff, 0xff}), - newByteRangeSeq([]byte{0xff, 0xff, 0x00, 0x00}, []byte{0xff, 0xff, 0xfe, 0xff}), - newByteRangeSeq([]byte{0xff, 0xff, 0xff, 0x00}, []byte{0xff, 0xff, 0xff, 0xfe}), - }, - }, - { - b1: newByteRangeSeq([]byte{0xc0, 0xc0, 0xc0, 0xc0}, []byte{0xc0, 0xc0, 0xc0, 0xc0}), - b2: newByteRangeSeq([]byte{0x00, 0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff, 0xff}), - excluded: [][]byteRange{ - newByteRangeSeq([]byte{0x00, 0x00, 0x00, 0x00}, []byte{0xbf, 0xff, 0xff, 0xff}), - newByteRangeSeq([]byte{0xc0, 0x00, 0x00, 0x00}, []byte{0xc0, 0xbf, 0xff, 0xff}), - newByteRangeSeq([]byte{0xc0, 0xc0, 0x00, 0x00}, []byte{0xc0, 0xc0, 0xbf, 0xff}), - newByteRangeSeq([]byte{0xc0, 0xc0, 0xc0, 0x00}, []byte{0xc0, 0xc0, 0xc0, 0xbf}), - newByteRangeSeq([]byte{0xc0, 0xc0, 0xc0, 0xc1}, []byte{0xc0, 0xc0, 0xc0, 0xff}), - newByteRangeSeq([]byte{0xc0, 0xc0, 0xc1, 0x00}, []byte{0xc0, 0xc0, 0xff, 0xff}), - newByteRangeSeq([]byte{0xc0, 0xc1, 0x00, 0x00}, []byte{0xc0, 0xff, 0xff, 0xff}), - newByteRangeSeq([]byte{0xc1, 0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff, 0xff}), - }, - }, - { - b1: newByteRangeSeq([]byte{0xc0, 0xc0, 0xc0, 0x00}, []byte{0xc0, 0xc0, 0xc0, 0xff}), - b2: newByteRangeSeq([]byte{0x00, 0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff, 0xff}), - excluded: [][]byteRange{ - newByteRangeSeq([]byte{0x00, 0x00, 0x00, 0x00}, []byte{0xbf, 0xff, 0xff, 0xff}), - newByteRangeSeq([]byte{0xc0, 0x00, 0x00, 0x00}, []byte{0xc0, 0xbf, 0xff, 0xff}), - newByteRangeSeq([]byte{0xc0, 0xc0, 0x00, 0x00}, []byte{0xc0, 0xc0, 0xbf, 0xff}), - newByteRangeSeq([]byte{0xc0, 0xc0, 0xc1, 0x00}, []byte{0xc0, 0xc0, 0xff, 0xff}), - newByteRangeSeq([]byte{0xc0, 0xc1, 0x00, 0x00}, []byte{0xc0, 0xff, 0xff, 0xff}), - newByteRangeSeq([]byte{0xc1, 0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff, 0xff}), - }, - }, - { - b1: newByteRangeSeq([]byte{0xc0, 0xc0, 0x00, 0x00}, []byte{0xc0, 0xc0, 0xff, 0xff}), - b2: newByteRangeSeq([]byte{0x00, 0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff, 0xff}), - excluded: [][]byteRange{ - newByteRangeSeq([]byte{0x00, 0x00, 0x00, 0x00}, []byte{0xbf, 0xff, 0xff, 0xff}), - newByteRangeSeq([]byte{0xc0, 0x00, 0x00, 0x00}, []byte{0xc0, 0xbf, 0xff, 0xff}), - newByteRangeSeq([]byte{0xc0, 0xc1, 0x00, 0x00}, []byte{0xc0, 0xff, 0xff, 0xff}), - newByteRangeSeq([]byte{0xc1, 0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff, 0xff}), - }, - }, - { - b1: newByteRangeSeq([]byte{0xc0, 0x00, 0x00, 0x00}, []byte{0xc0, 0xff, 0xff, 0xff}), - b2: newByteRangeSeq([]byte{0x00, 0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff, 0xff}), - excluded: [][]byteRange{ - newByteRangeSeq([]byte{0x00, 0x00, 0x00, 0x00}, []byte{0xbf, 0xff, 0xff, 0xff}), - newByteRangeSeq([]byte{0xc1, 0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff, 0xff}), - }, - }, - { - b1: newByteRangeSeq([]byte{0x00, 0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff, 0xff}), - b2: newByteRangeSeq([]byte{0xc0, 0xc0, 0xc0, 0xc0}, []byte{0xc0, 0xc0, 0xc0, 0xc0}), - excluded: nil, - }, - { - b1: newByteRangeSeq([]byte{0x00, 0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff, 0xff}), - b2: newByteRangeSeq([]byte{0x00, 0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff, 0xff}), - excluded: nil, - }, - } - for _, tt := range tests { - excluded := excludeByteRangeSequence(tt.b1, tt.b2) - t.Logf("b1: %v, b2: %v", tt.b1, tt.b2) - t.Logf("excluded: %+v", excluded) - if len(excluded) != len(tt.excluded) { - t.Errorf("unexpected results; expected: %+v, actual: %+v", tt.excluded, excluded) - } - for _, expectedSeq := range tt.excluded { - found := false - for _, actualSeq := range excluded { - mismatched := false - for i := 0; i < len(expectedSeq); i++ { - if actualSeq[i] != expectedSeq[i] { - mismatched = true - break - } - } - if mismatched { - continue - } - found = true - break - } - if !found { - t.Errorf("an expected byte range sequence was not found: %+v", expectedSeq) - } - } - } -} - -func newByteRangeSeq(from, to []byte) []byteRange { - seq, err := newByteRangeSequence(from, to) - if err != nil { - panic(err) - } - return seq -} diff --git a/compiler/compiler.go b/compiler/compiler.go index 439822c..cfd5faf 100644 --- a/compiler/compiler.go +++ b/compiler/compiler.go @@ -1,8 +1,11 @@ package compiler import ( + "bytes" "fmt" + "github.com/nihei9/maleeni/compiler/dfa" + psr "github.com/nihei9/maleeni/compiler/parser" "github.com/nihei9/maleeni/compressor" "github.com/nihei9/maleeni/spec" ) @@ -23,17 +26,24 @@ type compilerConfig struct { compLv int } -func Compile(lexspec *spec.LexSpec, opts ...CompilerOption) (*spec.CompiledLexSpec, error) { +type CompileError struct { + Kind spec.LexKindName + Fragment bool + Cause error + Detail string +} + +func Compile(lexspec *spec.LexSpec, opts ...CompilerOption) (*spec.CompiledLexSpec, error, []*CompileError) { err := lexspec.Validate() if err != nil { - return nil, fmt.Errorf("invalid lexical specification:\n%w", err) + return nil, fmt.Errorf("invalid lexical specification:\n%w", err), nil } config := &compilerConfig{} for _, opt := range opts { err := opt(config) if err != nil { - return nil, err + return nil, err, nil } } @@ -44,9 +54,9 @@ func Compile(lexspec *spec.LexSpec, opts ...CompilerOption) (*spec.CompiledLexSp } for i, es := range modeEntries[1:] { modeName := modeNames[i+1] - modeSpec, err := compile(es, modeName2ID, fragmetns, config) + modeSpec, err, cerrs := compile(es, modeName2ID, fragmetns, config) if err != nil { - return nil, fmt.Errorf("failed to compile in %v mode: %w", modeName, err) + return nil, fmt.Errorf("failed to compile in %v mode: %w", modeName, err), cerrs } modeSpecs = append(modeSpecs, modeSpec) } @@ -95,10 +105,10 @@ func Compile(lexspec *spec.LexSpec, opts ...CompilerOption) (*spec.CompiledLexSp KindIDs: kindIDs, CompressionLevel: config.compLv, Specs: modeSpecs, - }, nil + }, nil, nil } -func groupEntriesByLexMode(entries []*spec.LexEntry) ([][]*spec.LexEntry, []spec.LexModeName, map[spec.LexModeName]spec.LexModeID, map[string]*spec.LexEntry) { +func groupEntriesByLexMode(entries []*spec.LexEntry) ([][]*spec.LexEntry, []spec.LexModeName, map[spec.LexModeName]spec.LexModeID, map[spec.LexKindName]*spec.LexEntry) { modeNames := []spec.LexModeName{ spec.LexModeNameNil, spec.LexModeNameDefault, @@ -112,10 +122,10 @@ func groupEntriesByLexMode(entries []*spec.LexEntry) ([][]*spec.LexEntry, []spec nil, {}, } - fragments := map[string]*spec.LexEntry{} + fragments := map[spec.LexKindName]*spec.LexEntry{} for _, e := range entries { if e.Fragment { - fragments[e.Kind.String()] = e + fragments[e.Kind] = e continue } ms := e.Modes @@ -139,15 +149,24 @@ func groupEntriesByLexMode(entries []*spec.LexEntry) ([][]*spec.LexEntry, []spec return modeEntries, modeNames, modeName2ID, fragments } -func compile(entries []*spec.LexEntry, modeName2ID map[spec.LexModeName]spec.LexModeID, fragments map[string]*spec.LexEntry, config *compilerConfig) (*spec.CompiledLexModeSpec, error) { +func compile( + entries []*spec.LexEntry, + modeName2ID map[spec.LexModeName]spec.LexModeID, + fragments map[spec.LexKindName]*spec.LexEntry, + config *compilerConfig, +) (*spec.CompiledLexModeSpec, error, []*CompileError) { var kindNames []spec.LexKindName + kindIDToName := map[spec.LexModeKindID]spec.LexKindName{} var patterns map[spec.LexModeKindID][]byte { kindNames = append(kindNames, spec.LexKindNameNil) patterns = map[spec.LexModeKindID][]byte{} for i, e := range entries { + kindID := spec.LexModeKindID(i + 1) + kindNames = append(kindNames, e.Kind) - patterns[spec.LexModeKindID(i+1)] = []byte(e.Pattern) + kindIDToName[kindID] = e.Kind + patterns[kindID] = []byte(e.Pattern) } } @@ -170,39 +189,141 @@ func compile(entries []*spec.LexEntry, modeName2ID map[spec.LexModeName]spec.Lex pop = append(pop, popV) } - fragmentPatterns := map[string][]byte{} + fragmentPatterns := map[spec.LexKindName][]byte{} for k, e := range fragments { fragmentPatterns[k] = []byte(e.Pattern) } - var root astNode - var symTab *symbolTable + fragmentCPTrees := make(map[spec.LexKindName]psr.CPTree, len(fragmentPatterns)) + { + var cerrs []*CompileError + for kind, pat := range fragmentPatterns { + p := psr.NewParser(kind, bytes.NewReader(pat)) + t, err := p.Parse() + if err != nil { + if err == psr.ParseErr { + detail, cause := p.Error() + cerrs = append(cerrs, &CompileError{ + Kind: kind, + Fragment: true, + Cause: cause, + Detail: detail, + }) + } else { + cerrs = append(cerrs, &CompileError{ + Kind: kind, + Fragment: true, + Cause: err, + }) + } + continue + } + fragmentCPTrees[kind] = t + } + if len(cerrs) > 0 { + return nil, fmt.Errorf("compile error"), cerrs + } + + err := psr.CompleteFragments(fragmentCPTrees) + if err != nil { + if err == psr.ParseErr { + for _, frag := range fragmentCPTrees { + kind, frags, err := frag.Describe() + if err != nil { + return nil, err, nil + } + + cerrs = append(cerrs, &CompileError{ + Kind: kind, + Fragment: true, + Cause: fmt.Errorf("fragment contains undefined fragments or cycles"), + Detail: fmt.Sprintf("%v", frags), + }) + } + + return nil, fmt.Errorf("compile error"), cerrs + } + + return nil, err, nil + } + } + + cpTrees := map[spec.LexModeKindID]psr.CPTree{} { - pats := make([]*patternEntry, len(patterns)+1) - pats[spec.LexModeKindIDNil] = &patternEntry{ - id: spec.LexModeKindIDNil, + pats := make([]*psr.PatternEntry, len(patterns)+1) + pats[spec.LexModeKindIDNil] = &psr.PatternEntry{ + ID: spec.LexModeKindIDNil, } for id, pattern := range patterns { - pats[id] = &patternEntry{ - id: id, - pattern: pattern, + pats[id] = &psr.PatternEntry{ + ID: id, + Pattern: pattern, } } - var err error - root, symTab, err = parse(pats, fragmentPatterns) - if err != nil { - return nil, err + var cerrs []*CompileError + for _, pat := range pats { + if pat.ID == spec.LexModeKindIDNil { + continue + } + + p := psr.NewParser(kindIDToName[pat.ID], bytes.NewReader(pat.Pattern)) + t, err := p.Parse() + if err != nil { + if err == psr.ParseErr { + detail, cause := p.Error() + cerrs = append(cerrs, &CompileError{ + Kind: kindIDToName[pat.ID], + Fragment: false, + Cause: cause, + Detail: detail, + }) + } else { + cerrs = append(cerrs, &CompileError{ + Kind: kindIDToName[pat.ID], + Fragment: false, + Cause: err, + }) + } + continue + } + + complete, err := psr.ApplyFragments(t, fragmentCPTrees) + if err != nil { + return nil, err, nil + } + if !complete { + _, frags, err := t.Describe() + if err != nil { + return nil, err, nil + } + + cerrs = append(cerrs, &CompileError{ + Kind: kindIDToName[pat.ID], + Fragment: false, + Cause: fmt.Errorf("pattern contains undefined fragments"), + Detail: fmt.Sprintf("%v", frags), + }) + continue + } + + cpTrees[pat.ID] = t + } + if len(cerrs) > 0 { + return nil, fmt.Errorf("compile error"), cerrs } } var tranTab *spec.TransitionTable { - dfa := genDFA(root, symTab) - var err error - tranTab, err = genTransitionTable(dfa) + root, symTab, err := dfa.ConvertCPTreeToByteTree(cpTrees) if err != nil { - return nil, err + return nil, err, nil + } + d := dfa.GenDFA(root, symTab) + tranTab, err = dfa.GenTransitionTable(d) + if err != nil { + return nil, err, nil } } @@ -211,12 +332,12 @@ func compile(entries []*spec.LexEntry, modeName2ID map[spec.LexModeName]spec.Lex case 2: tranTab, err = compressTransitionTableLv2(tranTab) if err != nil { - return nil, err + return nil, err, nil } case 1: tranTab, err = compressTransitionTableLv1(tranTab) if err != nil { - return nil, err + return nil, err, nil } } @@ -225,7 +346,7 @@ func compile(entries []*spec.LexEntry, modeName2ID map[spec.LexModeName]spec.Lex Push: push, Pop: pop, DFA: tranTab, - }, nil + }, nil, nil } const ( diff --git a/compiler/compiler_test.go b/compiler/compiler_test.go index 456920f..2534d9f 100644 --- a/compiler/compiler_test.go +++ b/compiler/compiler_test.go @@ -85,7 +85,7 @@ func TestCompile(t *testing.T) { if err != nil { t.Fatalf("%v", err) } - clspec, err := Compile(lspec) + clspec, err, _ := Compile(lspec) if tt.Err { if err == nil { t.Fatalf("expected an error") @@ -95,7 +95,7 @@ func TestCompile(t *testing.T) { } } else { if err != nil { - t.Fatalf("unexpected error") + t.Fatalf("unexpected error: %v", err) } if clspec == nil { t.Fatalf("Compile function must return a compiled specification") diff --git a/compiler/dfa.go b/compiler/dfa.go deleted file mode 100644 index 1d0b26a..0000000 --- a/compiler/dfa.go +++ /dev/null @@ -1,139 +0,0 @@ -package compiler - -import ( - "sort" - - "github.com/nihei9/maleeni/spec" -) - -type DFA struct { - States []string - InitialState string - AcceptingStatesTable map[string]spec.LexModeKindID - TransitionTable map[string][256]string -} - -func genDFA(root astNode, symTab *symbolTable) *DFA { - initialState := root.first() - initialStateHash := initialState.hash() - stateMap := map[string]*symbolPositionSet{ - initialStateHash: initialState, - } - tranTab := map[string][256]string{} - { - follow := genFollowTable(root) - unmarkedStates := map[string]*symbolPositionSet{ - initialStateHash: initialState, - } - for len(unmarkedStates) > 0 { - nextUnmarkedStates := map[string]*symbolPositionSet{} - for hash, state := range unmarkedStates { - tranTabOfState := [256]*symbolPositionSet{} - for _, pos := range state.set() { - if pos.isEndMark() { - continue - } - valRange := symTab.symPos2Byte[pos] - for symVal := valRange.from; symVal <= valRange.to; symVal++ { - if tranTabOfState[symVal] == nil { - tranTabOfState[symVal] = newSymbolPositionSet() - } - tranTabOfState[symVal].merge(follow[pos]) - } - } - for _, t := range tranTabOfState { - if t == nil { - continue - } - h := t.hash() - if _, ok := stateMap[h]; ok { - continue - } - stateMap[h] = t - nextUnmarkedStates[h] = t - } - tabOfState := [256]string{} - for v, t := range tranTabOfState { - if t == nil { - continue - } - tabOfState[v] = t.hash() - } - tranTab[hash] = tabOfState - } - unmarkedStates = nextUnmarkedStates - } - } - - accTab := map[string]spec.LexModeKindID{} - { - for h, s := range stateMap { - for _, pos := range s.set() { - if !pos.isEndMark() { - continue - } - priorID, ok := accTab[h] - if !ok { - accTab[h] = symTab.endPos2ID[pos] - } else { - id := symTab.endPos2ID[pos] - if id < priorID { - accTab[h] = id - } - } - } - } - } - - var states []string - { - for s := range stateMap { - states = append(states, s) - } - sort.Slice(states, func(i, j int) bool { - return states[i] < states[j] - }) - } - - return &DFA{ - States: states, - InitialState: initialStateHash, - AcceptingStatesTable: accTab, - TransitionTable: tranTab, - } -} - -func genTransitionTable(dfa *DFA) (*spec.TransitionTable, error) { - stateHash2ID := map[string]spec.StateID{} - for i, s := range dfa.States { - // Since 0 represents an invalid value in a transition table, - // assign a number greater than or equal to 1 to states. - stateHash2ID[s] = spec.StateID(i + spec.StateIDMin.Int()) - } - - acc := make([]spec.LexModeKindID, len(dfa.States)+1) - for _, s := range dfa.States { - id, ok := dfa.AcceptingStatesTable[s] - if !ok { - continue - } - acc[stateHash2ID[s]] = id - } - - rowCount := len(dfa.States) + 1 - colCount := 256 - tran := make([]spec.StateID, rowCount*colCount) - for s, tab := range dfa.TransitionTable { - for v, to := range tab { - tran[stateHash2ID[s].Int()*256+v] = stateHash2ID[to] - } - } - - return &spec.TransitionTable{ - InitialStateID: stateHash2ID[dfa.InitialState], - AcceptingStates: acc, - UncompressedTransition: tran, - RowCount: rowCount, - ColCount: colCount, - }, nil -} diff --git a/compiler/dfa_test.go b/compiler/dfa_test.go deleted file mode 100644 index 74c9ba8..0000000 --- a/compiler/dfa_test.go +++ /dev/null @@ -1,117 +0,0 @@ -package compiler - -import ( - "testing" - - "github.com/nihei9/maleeni/spec" -) - -func TestGenDFA(t *testing.T) { - root, symTab, err := parse([]*patternEntry{ - { - id: spec.LexModeKindIDMin, - pattern: []byte("(a|b)*abb"), - }, - }, nil) - if err != nil { - t.Fatal(err) - } - dfa := genDFA(root, symTab) - if dfa == nil { - t.Fatalf("DFA is nil") - } - - symPos := func(n uint16) symbolPosition { - pos, err := newSymbolPosition(n, false) - if err != nil { - panic(err) - } - return pos - } - - endPos := func(n uint16) symbolPosition { - pos, err := newSymbolPosition(n, true) - if err != nil { - panic(err) - } - return pos - } - - s0 := newSymbolPositionSet().add(symPos(1)).add(symPos(2)).add(symPos(3)) - s1 := newSymbolPositionSet().add(symPos(1)).add(symPos(2)).add(symPos(3)).add(symPos(4)) - s2 := newSymbolPositionSet().add(symPos(1)).add(symPos(2)).add(symPos(3)).add(symPos(5)) - s3 := newSymbolPositionSet().add(symPos(1)).add(symPos(2)).add(symPos(3)).add(endPos(6)) - - rune2Int := func(char rune, index int) uint8 { - return uint8([]byte(string(char))[index]) - } - - tranS0 := [256]string{} - tranS0[rune2Int('a', 0)] = s1.hash() - tranS0[rune2Int('b', 0)] = s0.hash() - - tranS1 := [256]string{} - tranS1[rune2Int('a', 0)] = s1.hash() - tranS1[rune2Int('b', 0)] = s2.hash() - - tranS2 := [256]string{} - tranS2[rune2Int('a', 0)] = s1.hash() - tranS2[rune2Int('b', 0)] = s3.hash() - - tranS3 := [256]string{} - tranS3[rune2Int('a', 0)] = s1.hash() - tranS3[rune2Int('b', 0)] = s0.hash() - - expectedTranTab := map[string][256]string{ - s0.hash(): tranS0, - s1.hash(): tranS1, - s2.hash(): tranS2, - s3.hash(): tranS3, - } - if len(dfa.TransitionTable) != len(expectedTranTab) { - t.Errorf("transition table is mismatched; want: %v entries, got: %v entries", len(expectedTranTab), len(dfa.TransitionTable)) - } - for h, eTranTab := range expectedTranTab { - tranTab, ok := dfa.TransitionTable[h] - if !ok { - t.Errorf("no entry; hash: %v", h) - continue - } - if len(tranTab) != len(eTranTab) { - t.Errorf("transition table is mismatched; hash: %v, want: %v entries, got: %v entries", h, len(eTranTab), len(tranTab)) - } - for c, eNext := range eTranTab { - if eNext == "" { - continue - } - - next := tranTab[c] - if next == "" { - t.Errorf("no enatry; hash: %v, char: %v", h, c) - } - if next != eNext { - t.Errorf("next state is mismatched; want: %v, got: %v", eNext, next) - } - } - } - - if dfa.InitialState != s0.hash() { - t.Errorf("initial state is mismatched; want: %v, got: %v", s0.hash(), dfa.InitialState) - } - - accTab := map[string]spec.LexModeKindID{ - s3.hash(): 1, - } - if len(dfa.AcceptingStatesTable) != len(accTab) { - t.Errorf("accepting states are mismatched; want: %v entries, got: %v entries", len(accTab), len(dfa.AcceptingStatesTable)) - } - for eState, eID := range accTab { - id, ok := dfa.AcceptingStatesTable[eState] - if !ok { - t.Errorf("accepting state is not found; state: %v", eState) - } - if id != eID { - t.Errorf("ID is mismatched; state: %v, want: %v, got: %v", eState, eID, id) - } - } -} diff --git a/compiler/lexer.go b/compiler/lexer.go deleted file mode 100644 index 4f6fdac..0000000 --- a/compiler/lexer.go +++ /dev/null @@ -1,578 +0,0 @@ -package compiler - -import ( - "bufio" - "fmt" - "io" - "strings" -) - -type tokenKind string - -const ( - tokenKindChar = tokenKind("char") - tokenKindAnyChar = tokenKind(".") - tokenKindRepeat = tokenKind("*") - tokenKindRepeatOneOrMore = tokenKind("+") - tokenKindOption = tokenKind("?") - tokenKindAlt = tokenKind("|") - tokenKindGroupOpen = tokenKind("(") - tokenKindGroupClose = tokenKind(")") - tokenKindBExpOpen = tokenKind("[") - tokenKindInverseBExpOpen = tokenKind("[^") - tokenKindBExpClose = tokenKind("]") - tokenKindCharRange = tokenKind("-") - tokenKindCodePointLeader = tokenKind("\\u") - tokenKindCharPropLeader = tokenKind("\\p") - tokenKindFragmentLeader = tokenKind("\\f") - tokenKindLBrace = tokenKind("{") - tokenKindRBrace = tokenKind("}") - tokenKindEqual = tokenKind("=") - tokenKindCodePoint = tokenKind("code point") - tokenKindCharPropSymbol = tokenKind("character property symbol") - tokenKindFragmentSymbol = tokenKind("fragment symbol") - tokenKindEOF = tokenKind("eof") -) - -type token struct { - kind tokenKind - char rune - propSymbol string - codePoint string - fragmentSymbol string -} - -const nullChar = '\u0000' - -func newToken(kind tokenKind, char rune) *token { - return &token{ - kind: kind, - char: char, - } -} - -func newCodePointToken(codePoint string) *token { - return &token{ - kind: tokenKindCodePoint, - codePoint: codePoint, - } -} - -func newCharPropSymbolToken(propSymbol string) *token { - return &token{ - kind: tokenKindCharPropSymbol, - propSymbol: propSymbol, - } -} - -func newFragmentSymbolToken(fragmentSymbol string) *token { - return &token{ - kind: tokenKindFragmentSymbol, - fragmentSymbol: fragmentSymbol, - } -} - -type lexerMode string - -const ( - lexerModeDefault = lexerMode("default") - lexerModeBExp = lexerMode("bracket expression") - lexerModeCPExp = lexerMode("code point expression") - lexerModeCharPropExp = lexerMode("character property expression") - lexerModeFragmentExp = lexerMode("fragment expression") -) - -type lexerModeStack struct { - stack []lexerMode -} - -func newLexerModeStack() *lexerModeStack { - return &lexerModeStack{ - stack: []lexerMode{ - lexerModeDefault, - }, - } -} - -func (s *lexerModeStack) top() lexerMode { - return s.stack[len(s.stack)-1] -} - -func (s *lexerModeStack) push(m lexerMode) { - s.stack = append(s.stack, m) -} - -func (s *lexerModeStack) pop() { - s.stack = s.stack[:len(s.stack)-1] -} - -type rangeState string - -// [a-z] -// ^^^^ -// |||`-- ready -// ||`-- expect range terminator -// |`-- read range initiator -// `-- ready -const ( - rangeStateReady = rangeState("ready") - rangeStateReadRangeInitiator = rangeState("read range initiator") - rangeStateExpectRangeTerminator = rangeState("expect range terminator") -) - -type lexer struct { - src *bufio.Reader - peekChar2 rune - peekEOF2 bool - peekChar1 rune - peekEOF1 bool - lastChar rune - reachedEOF bool - prevChar1 rune - prevEOF1 bool - prevChar2 rune - pervEOF2 bool - modeStack *lexerModeStack - rangeState rangeState - errMsgDetails string -} - -func newLexer(src io.Reader) *lexer { - return &lexer{ - src: bufio.NewReader(src), - peekChar2: nullChar, - peekEOF2: false, - peekChar1: nullChar, - peekEOF1: false, - lastChar: nullChar, - reachedEOF: false, - prevChar1: nullChar, - prevEOF1: false, - prevChar2: nullChar, - pervEOF2: false, - modeStack: newLexerModeStack(), - rangeState: rangeStateReady, - } -} - -func (l *lexer) next() (*token, error) { - c, eof, err := l.read() - if err != nil { - return nil, err - } - if eof { - return newToken(tokenKindEOF, nullChar), nil - } - - switch l.modeStack.top() { - case lexerModeBExp: - tok, err := l.nextInBExp(c) - if err != nil { - return nil, err - } - switch tok.kind { - case tokenKindBExpClose: - l.modeStack.pop() - case tokenKindCharRange: - l.rangeState = rangeStateExpectRangeTerminator - case tokenKindChar: - switch l.rangeState { - case rangeStateReady: - l.rangeState = rangeStateReadRangeInitiator - case rangeStateExpectRangeTerminator: - l.rangeState = rangeStateReady - } - case tokenKindCodePointLeader: - l.modeStack.push(lexerModeCPExp) - case tokenKindCharPropLeader: - l.modeStack.push(lexerModeCharPropExp) - } - return tok, nil - case lexerModeCPExp: - tok, err := l.nextInCodePoint(c) - if err != nil { - return nil, err - } - switch tok.kind { - case tokenKindRBrace: - l.modeStack.pop() - } - return tok, nil - case lexerModeCharPropExp: - tok, err := l.nextInCharProp(c) - if err != nil { - return nil, err - } - switch tok.kind { - case tokenKindRBrace: - l.modeStack.pop() - } - return tok, nil - case lexerModeFragmentExp: - tok, err := l.nextInFragment(c) - if err != nil { - return nil, err - } - switch tok.kind { - case tokenKindRBrace: - l.modeStack.pop() - } - return tok, nil - default: - tok, err := l.nextInDefault(c) - if err != nil { - return nil, err - } - switch tok.kind { - case tokenKindBExpOpen: - l.modeStack.push(lexerModeBExp) - l.rangeState = rangeStateReady - case tokenKindInverseBExpOpen: - l.modeStack.push(lexerModeBExp) - l.rangeState = rangeStateReady - case tokenKindCodePointLeader: - l.modeStack.push(lexerModeCPExp) - case tokenKindCharPropLeader: - l.modeStack.push(lexerModeCharPropExp) - case tokenKindFragmentLeader: - l.modeStack.push(lexerModeFragmentExp) - } - return tok, nil - } -} - -func (l *lexer) nextInDefault(c rune) (*token, error) { - switch c { - case '*': - return newToken(tokenKindRepeat, nullChar), nil - case '+': - return newToken(tokenKindRepeatOneOrMore, nullChar), nil - case '?': - return newToken(tokenKindOption, nullChar), nil - case '.': - return newToken(tokenKindAnyChar, nullChar), nil - case '|': - return newToken(tokenKindAlt, nullChar), nil - case '(': - return newToken(tokenKindGroupOpen, nullChar), nil - case ')': - return newToken(tokenKindGroupClose, nullChar), nil - case '[': - c1, eof, err := l.read() - if err != nil { - return nil, err - } - if eof { - err := l.restore() - if err != nil { - return nil, err - } - return newToken(tokenKindBExpOpen, nullChar), nil - } - if c1 != '^' { - err := l.restore() - if err != nil { - return nil, err - } - return newToken(tokenKindBExpOpen, nullChar), nil - } - c2, eof, err := l.read() - if err != nil { - return nil, err - } - if eof { - err := l.restore() - if err != nil { - return nil, err - } - return newToken(tokenKindInverseBExpOpen, nullChar), nil - } - if c2 != ']' { - err := l.restore() - if err != nil { - return nil, err - } - return newToken(tokenKindInverseBExpOpen, nullChar), nil - } - err = l.restore() - if err != nil { - return nil, err - } - err = l.restore() - if err != nil { - return nil, err - } - return newToken(tokenKindBExpOpen, nullChar), nil - case '\\': - c, eof, err := l.read() - if err != nil { - return nil, err - } - if eof { - return nil, synErrIncompletedEscSeq - } - if c == 'u' { - return newToken(tokenKindCodePointLeader, nullChar), nil - } - if c == 'p' { - return newToken(tokenKindCharPropLeader, nullChar), nil - } - if c == 'f' { - return newToken(tokenKindFragmentLeader, nullChar), nil - } - if c == '\\' || c == '.' || c == '*' || c == '+' || c == '?' || c == '|' || c == '(' || c == ')' || c == '[' || c == ']' { - return newToken(tokenKindChar, c), nil - } - l.errMsgDetails = fmt.Sprintf("\\%v is not supported", string(c)) - return nil, synErrInvalidEscSeq - default: - return newToken(tokenKindChar, c), nil - } -} - -func (l *lexer) nextInBExp(c rune) (*token, error) { - switch c { - case '-': - if l.rangeState != rangeStateReadRangeInitiator { - return newToken(tokenKindChar, c), nil - } - c1, eof, err := l.read() - if err != nil { - return nil, err - } - if eof { - err := l.restore() - if err != nil { - return nil, err - } - return newToken(tokenKindChar, c), nil - } - if c1 != ']' { - err := l.restore() - if err != nil { - return nil, err - } - return newToken(tokenKindCharRange, nullChar), nil - } - err = l.restore() - if err != nil { - return nil, err - } - return newToken(tokenKindChar, c), nil - case ']': - return newToken(tokenKindBExpClose, nullChar), nil - case '\\': - c, eof, err := l.read() - if err != nil { - return nil, err - } - if eof { - return nil, synErrIncompletedEscSeq - } - if c == 'u' { - return newToken(tokenKindCodePointLeader, nullChar), nil - } - if c == 'p' { - return newToken(tokenKindCharPropLeader, nullChar), nil - } - if c == '\\' || c == '^' || c == '-' || c == ']' { - return newToken(tokenKindChar, c), nil - } - l.errMsgDetails = fmt.Sprintf("\\%v is not supported in a bracket expression", string(c)) - return nil, synErrInvalidEscSeq - default: - return newToken(tokenKindChar, c), nil - } -} - -func (l *lexer) nextInCodePoint(c rune) (*token, error) { - switch c { - case '{': - return newToken(tokenKindLBrace, nullChar), nil - case '}': - return newToken(tokenKindRBrace, nullChar), nil - default: - if !isHexDigit(c) { - return nil, synErrInvalidCodePoint - } - var b strings.Builder - fmt.Fprint(&b, string(c)) - n := 1 - for { - c, eof, err := l.read() - if err != nil { - return nil, err - } - if eof { - l.restore() - if err != nil { - return nil, err - } - break - } - if c == '}' { - err := l.restore() - if err != nil { - return nil, err - } - break - } - if !isHexDigit(c) || n >= 6 { - return nil, synErrInvalidCodePoint - } - fmt.Fprint(&b, string(c)) - n++ - } - cp := b.String() - cpLen := len(cp) - if !(cpLen == 4 || cpLen == 6) { - return nil, synErrInvalidCodePoint - } - return newCodePointToken(b.String()), nil - } -} - -func isHexDigit(c rune) bool { - if c >= '0' && c <= '9' || c >= 'A' && c <= 'Z' || c >= 'a' && c <= 'z' { - return true - } - return false -} - -func (l *lexer) nextInCharProp(c rune) (*token, error) { - switch c { - case '{': - return newToken(tokenKindLBrace, nullChar), nil - case '}': - return newToken(tokenKindRBrace, nullChar), nil - case '=': - return newToken(tokenKindEqual, nullChar), nil - default: - var b strings.Builder - fmt.Fprint(&b, string(c)) - n := 1 - for { - c, eof, err := l.read() - if err != nil { - return nil, err - } - if eof { - l.restore() - if err != nil { - return nil, err - } - break - } - if c == '}' || c == '=' { - err := l.restore() - if err != nil { - return nil, err - } - break - } - fmt.Fprint(&b, string(c)) - n++ - } - sym := strings.TrimSpace(b.String()) - if len(sym) == 0 { - raiseSyntaxError(synErrCharPropInvalidSymbol) - } - return newCharPropSymbolToken(sym), nil - } -} - -func (l *lexer) nextInFragment(c rune) (*token, error) { - switch c { - case '{': - return newToken(tokenKindLBrace, nullChar), nil - case '}': - return newToken(tokenKindRBrace, nullChar), nil - default: - var b strings.Builder - fmt.Fprint(&b, string(c)) - n := 1 - for { - c, eof, err := l.read() - if err != nil { - return nil, err - } - if eof { - l.restore() - if err != nil { - return nil, err - } - break - } - if c == '}' { - err := l.restore() - if err != nil { - return nil, err - } - break - } - fmt.Fprint(&b, string(c)) - n++ - } - sym := strings.TrimSpace(b.String()) - if len(sym) == 0 { - raiseSyntaxError(SynErrFragmentInvalidSymbol) - } - return newFragmentSymbolToken(sym), nil - } -} - -func (l *lexer) read() (rune, bool, error) { - if l.reachedEOF { - return l.lastChar, l.reachedEOF, nil - } - if l.peekChar1 != nullChar || l.peekEOF1 { - l.prevChar2 = l.prevChar1 - l.pervEOF2 = l.prevEOF1 - l.prevChar1 = l.lastChar - l.prevEOF1 = l.reachedEOF - l.lastChar = l.peekChar1 - l.reachedEOF = l.peekEOF1 - l.peekChar1 = l.peekChar2 - l.peekEOF1 = l.peekEOF2 - l.peekChar2 = nullChar - l.peekEOF2 = false - return l.lastChar, l.reachedEOF, nil - } - c, _, err := l.src.ReadRune() - if err != nil { - if err == io.EOF { - l.prevChar2 = l.prevChar1 - l.pervEOF2 = l.prevEOF1 - l.prevChar1 = l.lastChar - l.prevEOF1 = l.reachedEOF - l.lastChar = nullChar - l.reachedEOF = true - return l.lastChar, l.reachedEOF, nil - } - return nullChar, false, err - } - l.prevChar2 = l.prevChar1 - l.pervEOF2 = l.prevEOF1 - l.prevChar1 = l.lastChar - l.prevEOF1 = l.reachedEOF - l.lastChar = c - l.reachedEOF = false - return l.lastChar, l.reachedEOF, nil -} - -func (l *lexer) restore() error { - if l.lastChar == nullChar && !l.reachedEOF { - return fmt.Errorf("the lexer failed to call restore() because the last character is null") - } - l.peekChar2 = l.peekChar1 - l.peekEOF2 = l.peekEOF1 - l.peekChar1 = l.lastChar - l.peekEOF1 = l.reachedEOF - l.lastChar = l.prevChar1 - l.reachedEOF = l.prevEOF1 - l.prevChar1 = l.prevChar2 - l.prevEOF1 = l.pervEOF2 - l.prevChar2 = nullChar - l.pervEOF2 = false - return nil -} diff --git a/compiler/lexer_test.go b/compiler/lexer_test.go deleted file mode 100644 index 51eb82e..0000000 --- a/compiler/lexer_test.go +++ /dev/null @@ -1,514 +0,0 @@ -package compiler - -import ( - "strings" - "testing" -) - -func TestLexer(t *testing.T) { - tests := []struct { - caption string - src string - tokens []*token - err error - }{ - { - caption: "lexer can recognize ordinaly characters", - src: "123abcいろは", - tokens: []*token{ - newToken(tokenKindChar, '1'), - newToken(tokenKindChar, '2'), - newToken(tokenKindChar, '3'), - newToken(tokenKindChar, 'a'), - newToken(tokenKindChar, 'b'), - newToken(tokenKindChar, 'c'), - newToken(tokenKindChar, 'い'), - newToken(tokenKindChar, 'ろ'), - newToken(tokenKindChar, 'は'), - newToken(tokenKindEOF, nullChar), - }, - }, - { - caption: "lexer can recognize the special characters in default mode", - src: ".*+?|()[\\u", - tokens: []*token{ - newToken(tokenKindAnyChar, nullChar), - newToken(tokenKindRepeat, nullChar), - newToken(tokenKindRepeatOneOrMore, nullChar), - newToken(tokenKindOption, nullChar), - newToken(tokenKindAlt, nullChar), - newToken(tokenKindGroupOpen, nullChar), - newToken(tokenKindGroupClose, nullChar), - newToken(tokenKindBExpOpen, nullChar), - newToken(tokenKindCodePointLeader, nullChar), - newToken(tokenKindEOF, nullChar), - }, - }, - { - caption: "lexer can recognize the escape sequences in default mode", - src: "\\\\\\.\\*\\+\\?\\|\\(\\)\\[", - tokens: []*token{ - newToken(tokenKindChar, '\\'), - newToken(tokenKindChar, '.'), - newToken(tokenKindChar, '*'), - newToken(tokenKindChar, '+'), - newToken(tokenKindChar, '?'), - newToken(tokenKindChar, '|'), - newToken(tokenKindChar, '('), - newToken(tokenKindChar, ')'), - newToken(tokenKindChar, '['), - newToken(tokenKindEOF, nullChar), - }, - }, - { - caption: "], {, and } are treated as an ordinary character in default mode", - src: "]{}", - tokens: []*token{ - newToken(tokenKindChar, ']'), - newToken(tokenKindChar, '{'), - newToken(tokenKindChar, '}'), - newToken(tokenKindEOF, nullChar), - }, - }, - { - caption: "lexer can recognize the special characters in bracket expression mode", - src: "[a-z\\u{09AF}][^a-z\\u{09abcf}]", - tokens: []*token{ - newToken(tokenKindBExpOpen, nullChar), - newToken(tokenKindChar, 'a'), - newToken(tokenKindCharRange, nullChar), - newToken(tokenKindChar, 'z'), - newToken(tokenKindCodePointLeader, nullChar), - newToken(tokenKindLBrace, nullChar), - newCodePointToken("09AF"), - newToken(tokenKindRBrace, nullChar), - newToken(tokenKindBExpClose, nullChar), - newToken(tokenKindInverseBExpOpen, nullChar), - newToken(tokenKindChar, 'a'), - newToken(tokenKindCharRange, nullChar), - newToken(tokenKindChar, 'z'), - newToken(tokenKindCodePointLeader, nullChar), - newToken(tokenKindLBrace, nullChar), - newCodePointToken("09abcf"), - newToken(tokenKindRBrace, nullChar), - newToken(tokenKindBExpClose, nullChar), - newToken(tokenKindEOF, nullChar), - }, - }, - { - caption: "lexer can recognize the escape sequences in bracket expression mode", - src: "[\\^a\\-z]", - tokens: []*token{ - newToken(tokenKindBExpOpen, nullChar), - newToken(tokenKindChar, '^'), - newToken(tokenKindChar, 'a'), - newToken(tokenKindChar, '-'), - newToken(tokenKindChar, 'z'), - newToken(tokenKindBExpClose, nullChar), - newToken(tokenKindEOF, nullChar), - }, - }, - { - caption: "in a bracket expression, the special characters are also handled as normal characters", - src: "[\\\\.*+?|()[", - tokens: []*token{ - newToken(tokenKindBExpOpen, nullChar), - newToken(tokenKindChar, '\\'), - newToken(tokenKindChar, '.'), - newToken(tokenKindChar, '*'), - newToken(tokenKindChar, '+'), - newToken(tokenKindChar, '?'), - newToken(tokenKindChar, '|'), - newToken(tokenKindChar, '('), - newToken(tokenKindChar, ')'), - newToken(tokenKindChar, '['), - newToken(tokenKindEOF, nullChar), - }, - }, - { - caption: "hyphen symbols that appear in bracket expressions are handled as the character range symbol or ordinary characters", - // [...-...][...-][-...][-] - // ~~~~~~~ ~ ~ ~ - // ^ ^ ^ ^ - // | | | `-- Ordinary Character (b) - // | | `-- Ordinary Character (b) - // | `-- Ordinary Character (b) - // `-- Character Range (a) - // - // a. *-* is handled as a character range expression. - // b. *-, -*, or - are handled as ordinary characters. - src: "[a-z][a-][-z][-][--][---][^a-z][^a-][^-z][^-][^--][^---]", - tokens: []*token{ - newToken(tokenKindBExpOpen, nullChar), - newToken(tokenKindChar, 'a'), - newToken(tokenKindCharRange, nullChar), - newToken(tokenKindChar, 'z'), - newToken(tokenKindBExpClose, nullChar), - newToken(tokenKindBExpOpen, nullChar), - newToken(tokenKindChar, 'a'), - newToken(tokenKindChar, '-'), - newToken(tokenKindBExpClose, nullChar), - newToken(tokenKindBExpOpen, nullChar), - newToken(tokenKindChar, '-'), - newToken(tokenKindChar, 'z'), - newToken(tokenKindBExpClose, nullChar), - newToken(tokenKindBExpOpen, nullChar), - newToken(tokenKindChar, '-'), - newToken(tokenKindBExpClose, nullChar), - newToken(tokenKindBExpOpen, nullChar), - newToken(tokenKindChar, '-'), - newToken(tokenKindChar, '-'), - newToken(tokenKindBExpClose, nullChar), - newToken(tokenKindBExpOpen, nullChar), - newToken(tokenKindChar, '-'), - newToken(tokenKindCharRange, nullChar), - newToken(tokenKindChar, '-'), - newToken(tokenKindBExpClose, nullChar), - - newToken(tokenKindInverseBExpOpen, nullChar), - newToken(tokenKindChar, 'a'), - newToken(tokenKindCharRange, nullChar), - newToken(tokenKindChar, 'z'), - newToken(tokenKindBExpClose, nullChar), - newToken(tokenKindInverseBExpOpen, nullChar), - newToken(tokenKindChar, 'a'), - newToken(tokenKindChar, '-'), - newToken(tokenKindBExpClose, nullChar), - newToken(tokenKindInverseBExpOpen, nullChar), - newToken(tokenKindChar, '-'), - newToken(tokenKindChar, 'z'), - newToken(tokenKindBExpClose, nullChar), - newToken(tokenKindInverseBExpOpen, nullChar), - newToken(tokenKindChar, '-'), - newToken(tokenKindBExpClose, nullChar), - newToken(tokenKindInverseBExpOpen, nullChar), - newToken(tokenKindChar, '-'), - newToken(tokenKindChar, '-'), - newToken(tokenKindBExpClose, nullChar), - newToken(tokenKindInverseBExpOpen, nullChar), - newToken(tokenKindChar, '-'), - newToken(tokenKindCharRange, nullChar), - newToken(tokenKindChar, '-'), - newToken(tokenKindBExpClose, nullChar), - - newToken(tokenKindEOF, nullChar), - }, - }, - { - caption: "caret symbols that appear in bracket expressions are handled as the logical inverse symbol or ordinary characters", - // [^...^...][^] - // ~~ ~ ~~ - // ^ ^ ^^ - // | | |`-- Ordinary Character (c) - // | | `-- Bracket Expression - // | `-- Ordinary Character (b) - // `-- Inverse Bracket Expression (a) - // - // a. Bracket expressions that have a caret symbol at the beginning are handled as logical inverse expressions. - // b. caret symbols that appear as the second and the subsequent symbols are handled as ordinary symbols. - // c. When a bracket expression has just one symbol, a caret symbol at the beginning is handled as an ordinary character. - src: "[^^][^]", - tokens: []*token{ - newToken(tokenKindInverseBExpOpen, nullChar), - newToken(tokenKindChar, '^'), - newToken(tokenKindBExpClose, nullChar), - newToken(tokenKindBExpOpen, nullChar), - newToken(tokenKindChar, '^'), - newToken(tokenKindBExpClose, nullChar), - newToken(tokenKindEOF, nullChar), - }, - }, - { - caption: "lexer raises an error when an invalid escape sequence appears", - src: "\\@", - err: synErrInvalidEscSeq, - }, - { - caption: "lexer raises an error when the incomplete escape sequence (EOF following \\) appears", - src: "\\", - err: synErrIncompletedEscSeq, - }, - { - caption: "lexer raises an error when an invalid escape sequence appears", - src: "[\\@", - tokens: []*token{ - newToken(tokenKindBExpOpen, nullChar), - }, - err: synErrInvalidEscSeq, - }, - { - caption: "lexer raises an error when the incomplete escape sequence (EOF following \\) appears", - src: "[\\", - tokens: []*token{ - newToken(tokenKindBExpOpen, nullChar), - }, - err: synErrIncompletedEscSeq, - }, - { - caption: "lexer can recognize the special characters and code points in code point expression mode", - src: "\\u{0123}\\u{4567}\\u{89abcd}\\u{efAB}\\u{CDEF01}[\\u{0123}\\u{4567}\\u{89abcd}\\u{efAB}\\u{CDEF01}][^\\u{0123}\\u{4567}\\u{89abcd}\\u{efAB}\\u{CDEF01}]", - tokens: []*token{ - newToken(tokenKindCodePointLeader, nullChar), - newToken(tokenKindLBrace, nullChar), - newCodePointToken("0123"), - newToken(tokenKindRBrace, nullChar), - newToken(tokenKindCodePointLeader, nullChar), - newToken(tokenKindLBrace, nullChar), - newCodePointToken("4567"), - newToken(tokenKindRBrace, nullChar), - newToken(tokenKindCodePointLeader, nullChar), - newToken(tokenKindLBrace, nullChar), - newCodePointToken("89abcd"), - newToken(tokenKindRBrace, nullChar), - newToken(tokenKindCodePointLeader, nullChar), - newToken(tokenKindLBrace, nullChar), - newCodePointToken("efAB"), - newToken(tokenKindRBrace, nullChar), - newToken(tokenKindCodePointLeader, nullChar), - newToken(tokenKindLBrace, nullChar), - newCodePointToken("CDEF01"), - newToken(tokenKindRBrace, nullChar), - - newToken(tokenKindBExpOpen, nullChar), - newToken(tokenKindCodePointLeader, nullChar), - newToken(tokenKindLBrace, nullChar), - newCodePointToken("0123"), - newToken(tokenKindRBrace, nullChar), - newToken(tokenKindCodePointLeader, nullChar), - newToken(tokenKindLBrace, nullChar), - newCodePointToken("4567"), - newToken(tokenKindRBrace, nullChar), - newToken(tokenKindCodePointLeader, nullChar), - newToken(tokenKindLBrace, nullChar), - newCodePointToken("89abcd"), - newToken(tokenKindRBrace, nullChar), - newToken(tokenKindCodePointLeader, nullChar), - newToken(tokenKindLBrace, nullChar), - newCodePointToken("efAB"), - newToken(tokenKindRBrace, nullChar), - newToken(tokenKindCodePointLeader, nullChar), - newToken(tokenKindLBrace, nullChar), - newCodePointToken("CDEF01"), - newToken(tokenKindRBrace, nullChar), - newToken(tokenKindBExpClose, nullChar), - - newToken(tokenKindInverseBExpOpen, nullChar), - newToken(tokenKindCodePointLeader, nullChar), - newToken(tokenKindLBrace, nullChar), - newCodePointToken("0123"), - newToken(tokenKindRBrace, nullChar), - newToken(tokenKindCodePointLeader, nullChar), - newToken(tokenKindLBrace, nullChar), - newCodePointToken("4567"), - newToken(tokenKindRBrace, nullChar), - newToken(tokenKindCodePointLeader, nullChar), - newToken(tokenKindLBrace, nullChar), - newCodePointToken("89abcd"), - newToken(tokenKindRBrace, nullChar), - newToken(tokenKindCodePointLeader, nullChar), - newToken(tokenKindLBrace, nullChar), - newCodePointToken("efAB"), - newToken(tokenKindRBrace, nullChar), - newToken(tokenKindCodePointLeader, nullChar), - newToken(tokenKindLBrace, nullChar), - newCodePointToken("CDEF01"), - newToken(tokenKindRBrace, nullChar), - newToken(tokenKindBExpClose, nullChar), - - newToken(tokenKindEOF, nullChar), - }, - }, - { - caption: "a one digit hex string isn't a valid code point", - src: "\\u{0", - tokens: []*token{ - newToken(tokenKindCodePointLeader, nullChar), - newToken(tokenKindLBrace, nullChar), - }, - err: synErrInvalidCodePoint, - }, - { - caption: "a two digits hex string isn't a valid code point", - src: "\\u{01", - tokens: []*token{ - newToken(tokenKindCodePointLeader, nullChar), - newToken(tokenKindLBrace, nullChar), - }, - err: synErrInvalidCodePoint, - }, - { - caption: "a three digits hex string isn't a valid code point", - src: "\\u{012", - tokens: []*token{ - newToken(tokenKindCodePointLeader, nullChar), - newToken(tokenKindLBrace, nullChar), - }, - err: synErrInvalidCodePoint, - }, - { - caption: "a four digits hex string is a valid code point", - src: "\\u{0123}", - tokens: []*token{ - newToken(tokenKindCodePointLeader, nullChar), - newToken(tokenKindLBrace, nullChar), - newCodePointToken("0123"), - newToken(tokenKindRBrace, nullChar), - }, - }, - { - caption: "a five digits hex string isn't a valid code point", - src: "\\u{01234", - tokens: []*token{ - newToken(tokenKindCodePointLeader, nullChar), - newToken(tokenKindLBrace, nullChar), - }, - err: synErrInvalidCodePoint, - }, - { - caption: "a six digits hex string is a valid code point", - src: "\\u{012345}", - tokens: []*token{ - newToken(tokenKindCodePointLeader, nullChar), - newToken(tokenKindLBrace, nullChar), - newCodePointToken("012345"), - newToken(tokenKindRBrace, nullChar), - }, - }, - { - caption: "a seven digits hex string isn't a valid code point", - src: "\\u{0123456", - tokens: []*token{ - newToken(tokenKindCodePointLeader, nullChar), - newToken(tokenKindLBrace, nullChar), - }, - err: synErrInvalidCodePoint, - }, - { - caption: "a code point must be hex digits", - src: "\\u{g", - tokens: []*token{ - newToken(tokenKindCodePointLeader, nullChar), - newToken(tokenKindLBrace, nullChar), - }, - err: synErrInvalidCodePoint, - }, - { - caption: "a code point must be hex digits", - src: "\\u{G", - tokens: []*token{ - newToken(tokenKindCodePointLeader, nullChar), - newToken(tokenKindLBrace, nullChar), - }, - err: synErrInvalidCodePoint, - }, - { - caption: "lexer can recognize the special characters and symbols in character property expression mode", - src: "\\p{Letter}\\p{General_Category=Letter}[\\p{Letter}\\p{General_Category=Letter}][^\\p{Letter}\\p{General_Category=Letter}]", - tokens: []*token{ - newToken(tokenKindCharPropLeader, nullChar), - newToken(tokenKindLBrace, nullChar), - newCharPropSymbolToken("Letter"), - newToken(tokenKindRBrace, nullChar), - newToken(tokenKindCharPropLeader, nullChar), - newToken(tokenKindLBrace, nullChar), - newCharPropSymbolToken("General_Category"), - newToken(tokenKindEqual, nullChar), - newCharPropSymbolToken("Letter"), - newToken(tokenKindRBrace, nullChar), - - newToken(tokenKindBExpOpen, nullChar), - newToken(tokenKindCharPropLeader, nullChar), - newToken(tokenKindLBrace, nullChar), - newCharPropSymbolToken("Letter"), - newToken(tokenKindRBrace, nullChar), - newToken(tokenKindCharPropLeader, nullChar), - newToken(tokenKindLBrace, nullChar), - newCharPropSymbolToken("General_Category"), - newToken(tokenKindEqual, nullChar), - newCharPropSymbolToken("Letter"), - newToken(tokenKindRBrace, nullChar), - newToken(tokenKindBExpClose, nullChar), - - newToken(tokenKindInverseBExpOpen, nullChar), - newToken(tokenKindCharPropLeader, nullChar), - newToken(tokenKindLBrace, nullChar), - newCharPropSymbolToken("Letter"), - newToken(tokenKindRBrace, nullChar), - newToken(tokenKindCharPropLeader, nullChar), - newToken(tokenKindLBrace, nullChar), - newCharPropSymbolToken("General_Category"), - newToken(tokenKindEqual, nullChar), - newCharPropSymbolToken("Letter"), - newToken(tokenKindRBrace, nullChar), - newToken(tokenKindBExpClose, nullChar), - - newToken(tokenKindEOF, nullChar), - }, - }, - { - caption: "lexer can recognize the special characters and symbols in fragment expression mode", - src: "\\f{integer}", - tokens: []*token{ - newToken(tokenKindFragmentLeader, nullChar), - newToken(tokenKindLBrace, nullChar), - newFragmentSymbolToken("integer"), - newToken(tokenKindRBrace, nullChar), - - newToken(tokenKindEOF, nullChar), - }, - }, - { - caption: "a fragment expression is not supported in a bracket expression", - src: "[\\f", - tokens: []*token{ - newToken(tokenKindBExpOpen, nullChar), - }, - err: synErrInvalidEscSeq, - }, - { - caption: "a fragment expression is not supported in an inverse bracket expression", - src: "[^\\f", - tokens: []*token{ - newToken(tokenKindInverseBExpOpen, nullChar), - }, - err: synErrInvalidEscSeq, - }, - } - for _, tt := range tests { - t.Run(tt.caption, func(t *testing.T) { - lex := newLexer(strings.NewReader(tt.src)) - var err error - var tok *token - i := 0 - for { - tok, err = lex.next() - if err != nil { - break - } - if i >= len(tt.tokens) { - break - } - eTok := tt.tokens[i] - i++ - testToken(t, tok, eTok) - - if tok.kind == tokenKindEOF { - break - } - } - if err != tt.err { - t.Fatalf("unexpected error; want: %v, got: %v (%v)", tt.err, err, lex.errMsgDetails) - } - if i < len(tt.tokens) { - t.Fatalf("expecte more tokens") - } - }) - } -} - -func testToken(t *testing.T, a, e *token) { - t.Helper() - if e.kind != a.kind || e.char != a.char || e.codePoint != a.codePoint { - t.Fatalf("unexpected token; want: %+v, got: %+v", e, a) - } -} diff --git a/compiler/parser.go b/compiler/parser.go deleted file mode 100644 index ce481e3..0000000 --- a/compiler/parser.go +++ /dev/null @@ -1,862 +0,0 @@ -package compiler - -import ( - "bytes" - "encoding/binary" - "encoding/hex" - "fmt" - "io" - "strings" - - "github.com/nihei9/maleeni/spec" - "github.com/nihei9/maleeni/ucd" - "github.com/nihei9/maleeni/utf8" -) - -type ParseErrors struct { - Errors []*ParseError -} - -func (e *ParseErrors) Error() string { - var b strings.Builder - fmt.Fprintf(&b, "%v", e.Errors[0]) - for _, err := range e.Errors[1:] { - fmt.Fprintf(&b, "\n%v", err) - } - return b.String() -} - -type ParseError struct { - ID spec.LexModeKindID - Pattern []byte - Cause error - Details string -} - -func (e *ParseError) Error() string { - var b strings.Builder - fmt.Fprintf(&b, "#%v %v: %v", e.ID, string(e.Pattern), e.Cause) - if e.Details != "" { - fmt.Fprintf(&b, ": %v", e.Details) - } - return b.String() -} - -func raiseSyntaxError(synErr *SyntaxError) { - panic(synErr) -} - -type symbolTable struct { - symPos2Byte map[symbolPosition]byteRange - endPos2ID map[symbolPosition]spec.LexModeKindID -} - -func genSymbolTable(root astNode) *symbolTable { - symTab := &symbolTable{ - symPos2Byte: map[symbolPosition]byteRange{}, - endPos2ID: map[symbolPosition]spec.LexModeKindID{}, - } - return genSymTab(symTab, root) -} - -func genSymTab(symTab *symbolTable, node astNode) *symbolTable { - if node == nil { - return symTab - } - - switch n := node.(type) { - case *symbolNode: - symTab.symPos2Byte[n.pos] = byteRange{ - from: n.from, - to: n.to, - } - case *endMarkerNode: - symTab.endPos2ID[n.pos] = n.id - default: - left, right := node.children() - genSymTab(symTab, left) - genSymTab(symTab, right) - } - return symTab -} - -type patternEntry struct { - id spec.LexModeKindID - pattern []byte -} - -func parse(pats []*patternEntry, fragments map[string][]byte) (astNode, *symbolTable, error) { - if len(pats) == 0 { - return nil, nil, fmt.Errorf("parse() needs at least one token entry") - } - - fragmentASTs, err := parseFragments(fragments) - if err != nil { - return nil, nil, err - } - if fragmentASTs == nil { - fragmentASTs = map[string]astNode{} - } - - root, err := parseRegexp(pats, fragmentASTs) - if err != nil { - return nil, nil, err - } - - return root, genSymbolTable(root), nil -} - -type incompleteFragment struct { - kind string - ast astNode -} - -func parseFragments(fragments map[string][]byte) (map[string]astNode, error) { - if len(fragments) == 0 { - return nil, nil - } - fragmentASTs := map[string]astNode{} - incompleteFragments := []*incompleteFragment{} - var perrs []*ParseError - for kind, pattern := range fragments { - p := newParser(bytes.NewReader(pattern)) - ast, err := p.parse() - if err != nil { - perrs = append(perrs, &ParseError{ - Pattern: pattern, - Cause: err, - Details: p.errMsgDetails, - }) - continue - } - if p.incomplete { - incompleteFragments = append(incompleteFragments, &incompleteFragment{ - kind: kind, - ast: ast, - }) - } else { - fragmentASTs[kind] = ast - } - } - for len(incompleteFragments) > 0 { - lastIncompCount := len(incompleteFragments) - remainingFragments := []*incompleteFragment{} - for _, e := range incompleteFragments { - remains := applyFragments(e.ast, fragmentASTs) - if len(remains) > 0 { - remainingFragments = append(remainingFragments, e) - } else { - fragmentASTs[e.kind] = e.ast - } - } - incompleteFragments = remainingFragments - if len(incompleteFragments) == lastIncompCount { - for _, e := range incompleteFragments { - perrs = append(perrs, &ParseError{ - Cause: fmt.Errorf("%v has an undefined fragment or a cycle", e.kind), - }) - } - break - } - } - if len(perrs) > 0 { - return nil, &ParseErrors{ - Errors: perrs, - } - } - - return fragmentASTs, nil -} - -func parseRegexp(pats []*patternEntry, fragmentASTs map[string]astNode) (astNode, error) { - symPos := symbolPositionMin - var root astNode - var perrs []*ParseError - - for _, pat := range pats { - if pat.id == spec.LexModeKindIDNil { - continue - } - - p := newParser(bytes.NewReader(pat.pattern)) - ast, err := p.parse() - if err != nil { - perrs = append(perrs, &ParseError{ - ID: pat.id, - Pattern: pat.pattern, - Cause: err, - Details: p.errMsgDetails, - }) - continue - } - remains := applyFragments(ast, fragmentASTs) - if len(remains) > 0 { - perrs = append(perrs, &ParseError{ - ID: pat.id, - Pattern: pat.pattern, - Cause: fmt.Errorf("undefined fragment: %+v", remains), - }) - continue - } - ast = newConcatNode(ast, newEndMarkerNode(pat.id)) - symPos, err = positionSymbols(ast, symPos) - if err != nil { - perrs = append(perrs, &ParseError{ - ID: pat.id, - Pattern: pat.pattern, - Cause: err, - Details: p.errMsgDetails, - }) - continue - } - root = genAltNode(root, ast) - } - if len(perrs) > 0 { - return nil, &ParseErrors{ - Errors: perrs, - } - } - - return root, nil -} - -func applyFragments(ast astNode, fragments map[string]astNode) []string { - if ast == nil { - return nil - } - n, ok := ast.(*fragmentNode) - if !ok { - var remains []string - left, right := ast.children() - r := applyFragments(left, fragments) - if len(r) > 0 { - remains = append(remains, r...) - } - r = applyFragments(right, fragments) - if len(r) > 0 { - remains = append(remains, r...) - } - return remains - } - f, ok := fragments[n.symbol] - if !ok { - return []string{n.symbol} - } - n.left = copyAST(f) - return nil -} - -type parser struct { - lex *lexer - peekedTok *token - lastTok *token - incomplete bool - errMsgDetails string - - // If and only if isContributoryPropertyExposed is true, the parser interprets contributory properties that - // appear in property expressions. - // - // The contributory properties are not exposed, and users cannot use those properties because the parser - // follows [UAX #44 5.13 Property APIs]. For instance, \p{Other_Alphabetic} is invalid. - // - // isContributoryPropertyExposed is set to true when the parser is generated recursively. The parser needs to - // interpret derived properties internally because the derived properties consist of other properties that - // may contain the contributory properties. - // - // [UAX #44 5.13 Property APIs] says: - // > The following subtypes of Unicode character properties should generally not be exposed in APIs, - // > except in limited circumstances. They may not be useful, particularly in public API collections, - // > and may instead prove misleading to the users of such API collections. - // > * Contributory properties are not recommended for public APIs. - // > ... - // https://unicode.org/reports/tr44/#Property_APIs - isContributoryPropertyExposed bool -} - -func newParser(src io.Reader) *parser { - return &parser{ - lex: newLexer(src), - isContributoryPropertyExposed: false, - } -} - -func (p *parser) exposeContributoryProperty() { - p.isContributoryPropertyExposed = true -} - -func (p *parser) parse() (ast astNode, retErr error) { - defer func() { - err := recover() - if err != nil { - var ok bool - retErr, ok = err.(error) - if !ok { - retErr = fmt.Errorf("%v", err) - } - return - } - }() - - ast, err := p.parseRegexp() - if err != nil { - return nil, err - } - - return ast, nil -} - -func (p *parser) parseRegexp() (astNode, error) { - alt := p.parseAlt() - if alt == nil { - if p.consume(tokenKindGroupClose) { - raiseSyntaxError(synErrGroupNoInitiator) - } - raiseSyntaxError(synErrNullPattern) - } - if p.consume(tokenKindGroupClose) { - raiseSyntaxError(synErrGroupNoInitiator) - } - p.expect(tokenKindEOF) - return alt, nil -} - -func (p *parser) parseAlt() astNode { - left := p.parseConcat() - if left == nil { - if p.consume(tokenKindAlt) { - raiseSyntaxError(synErrAltLackOfOperand) - } - return nil - } - for { - if !p.consume(tokenKindAlt) { - break - } - right := p.parseConcat() - if right == nil { - raiseSyntaxError(synErrAltLackOfOperand) - } - left = newAltNode(left, right) - } - return left -} - -func (p *parser) parseConcat() astNode { - left := p.parseRepeat() - for { - right := p.parseRepeat() - if right == nil { - break - } - left = newConcatNode(left, right) - } - return left -} - -func (p *parser) parseRepeat() astNode { - group := p.parseGroup() - if group == nil { - if p.consume(tokenKindRepeat) { - p.errMsgDetails = "* needs an operand" - raiseSyntaxError(synErrRepNoTarget) - } - if p.consume(tokenKindRepeatOneOrMore) { - p.errMsgDetails = "+ needs an operand" - raiseSyntaxError(synErrRepNoTarget) - } - if p.consume(tokenKindOption) { - p.errMsgDetails = "? needs an operand" - raiseSyntaxError(synErrRepNoTarget) - } - return nil - } - if p.consume(tokenKindRepeat) { - return newRepeatNode(group) - } - if p.consume(tokenKindRepeatOneOrMore) { - return newRepeatOneOrMoreNode(group) - } - if p.consume(tokenKindOption) { - return newOptionNode(group) - } - return group -} - -func (p *parser) parseGroup() astNode { - if p.consume(tokenKindGroupOpen) { - alt := p.parseAlt() - if alt == nil { - if p.consume(tokenKindEOF) { - raiseSyntaxError(synErrGroupUnclosed) - } - raiseSyntaxError(synErrGroupNoElem) - } - if p.consume(tokenKindEOF) { - raiseSyntaxError(synErrGroupUnclosed) - } - if !p.consume(tokenKindGroupClose) { - raiseSyntaxError(synErrGroupInvalidForm) - } - return alt - } - return p.parseSingleChar() -} - -func (p *parser) parseSingleChar() astNode { - if p.consume(tokenKindAnyChar) { - return genAnyCharAST() - } - if p.consume(tokenKindBExpOpen) { - left := p.parseBExpElem() - if left == nil { - if p.consume(tokenKindEOF) { - raiseSyntaxError(synErrBExpUnclosed) - } - raiseSyntaxError(synErrBExpNoElem) - } - for { - right := p.parseBExpElem() - if right == nil { - break - } - left = newAltNode(left, right) - } - if p.consume(tokenKindEOF) { - raiseSyntaxError(synErrBExpUnclosed) - } - p.expect(tokenKindBExpClose) - return left - } - if p.consume(tokenKindInverseBExpOpen) { - elem := p.parseBExpElem() - if elem == nil { - if p.consume(tokenKindEOF) { - raiseSyntaxError(synErrBExpUnclosed) - } - raiseSyntaxError(synErrBExpNoElem) - } - inverse := exclude(elem, genAnyCharAST()) - if inverse == nil { - panic(fmt.Errorf("a pattern that isn't matching any symbols")) - } - for { - elem := p.parseBExpElem() - if elem == nil { - break - } - inverse = exclude(elem, inverse) - if inverse == nil { - panic(fmt.Errorf("a pattern that isn't matching any symbols")) - } - } - if p.consume(tokenKindEOF) { - raiseSyntaxError(synErrBExpUnclosed) - } - p.expect(tokenKindBExpClose) - return inverse - } - if p.consume(tokenKindCodePointLeader) { - return p.parseCodePoint() - } - if p.consume(tokenKindCharPropLeader) { - return p.parseCharProp() - } - if p.consume(tokenKindFragmentLeader) { - return p.parseFragment() - } - c := p.parseNormalChar() - if c == nil { - if p.consume(tokenKindBExpClose) { - raiseSyntaxError(synErrBExpInvalidForm) - } - return nil - } - return c -} - -func (p *parser) parseBExpElem() astNode { - if p.consume(tokenKindCodePointLeader) { - return p.parseCodePoint() - } - if p.consume(tokenKindCharPropLeader) { - return p.parseCharProp() - } - left := p.parseNormalChar() - if left == nil { - return nil - } - if !p.consume(tokenKindCharRange) { - return left - } - right := p.parseNormalChar() - if right == nil { - panic(fmt.Errorf("invalid range expression")) - } - from := genByteSeq(left) - to := genByteSeq(right) - if !isValidOrder(from, to) { - p.errMsgDetails = fmt.Sprintf("[%s-%s] ([%v-%v])", string(from), string(to), from, to) - raiseSyntaxError(synErrRangeInvalidOrder) - } - return genRangeAST(left, right) -} - -func (p *parser) parseCodePoint() astNode { - if !p.consume(tokenKindLBrace) { - raiseSyntaxError(synErrCPExpInvalidForm) - } - if !p.consume(tokenKindCodePoint) { - raiseSyntaxError(synErrCPExpInvalidForm) - } - - var cp []byte - { - // Although hex.DecodeString method can handle only a hex string that has even length, - // `codePoint` always has even length by the lexical specification. - b, err := hex.DecodeString(p.lastTok.codePoint) - if err != nil { - panic(fmt.Errorf("failed to decode a code point (%v) into a byte slice: %v", p.lastTok.codePoint, err)) - } - // `b` must be 4 bytes to convert it into a 32-bit integer. - l := len(b) - for i := 0; i < 4-l; i++ { - b = append([]byte{0}, b...) - } - n := binary.BigEndian.Uint32(b) - if n < 0x0000 || n > 0x10FFFF { - raiseSyntaxError(synErrCPExpOutOfRange) - } - - cp = []byte(string(rune(n))) - } - - var concat astNode - { - concat = newSymbolNode(cp[0]) - for _, b := range cp[1:] { - concat = genConcatNode( - concat, - newSymbolNode(b), - ) - } - } - - if !p.consume(tokenKindRBrace) { - raiseSyntaxError(synErrCPExpInvalidForm) - } - - return concat -} - -func (p *parser) parseCharProp() astNode { - if !p.consume(tokenKindLBrace) { - raiseSyntaxError(synErrCharPropExpInvalidForm) - } - var sym1, sym2 string - if !p.consume(tokenKindCharPropSymbol) { - raiseSyntaxError(synErrCharPropExpInvalidForm) - } - sym1 = p.lastTok.propSymbol - if p.consume(tokenKindEqual) { - if !p.consume(tokenKindCharPropSymbol) { - raiseSyntaxError(synErrCharPropExpInvalidForm) - } - sym2 = p.lastTok.propSymbol - } - - var alt astNode - var propName, propVal string - if sym2 != "" { - propName = sym1 - propVal = sym2 - } else { - propName = "" - propVal = sym1 - } - if !p.isContributoryPropertyExposed && ucd.IsContributoryProperty(propName) { - p.errMsgDetails = propName - raiseSyntaxError(synErrCharPropUnsupported) - } - pat, err := ucd.NormalizeCharacterProperty(propName, propVal) - if err != nil { - p.errMsgDetails = fmt.Sprintf("%v", err) - raiseSyntaxError(synErrCharPropUnsupported) - } - if pat != "" { - p := newParser(bytes.NewReader([]byte(pat))) - p.exposeContributoryProperty() - ast, err := p.parse() - if err != nil { - panic(err) - } - alt = ast - } else { - cpRanges, inverse, err := ucd.FindCodePointRanges(propName, propVal) - if err != nil { - p.errMsgDetails = fmt.Sprintf("%v", err) - raiseSyntaxError(synErrCharPropUnsupported) - } - if inverse { - r := cpRanges[0] - from := genNormalCharAST(r.From) - to := genNormalCharAST(r.To) - alt = exclude(genRangeAST(from, to), genAnyCharAST()) - if alt == nil { - panic(fmt.Errorf("a pattern that isn't matching any symbols")) - } - for _, r := range cpRanges[1:] { - from := genNormalCharAST(r.From) - to := genNormalCharAST(r.To) - alt = exclude(genRangeAST(from, to), alt) - if alt == nil { - panic(fmt.Errorf("a pattern that isn't matching any symbols")) - } - } - } else { - for _, r := range cpRanges { - from := genNormalCharAST(r.From) - to := genNormalCharAST(r.To) - alt = genAltNode( - alt, - genRangeAST(from, to), - ) - } - } - } - - if !p.consume(tokenKindRBrace) { - raiseSyntaxError(synErrCharPropExpInvalidForm) - } - - return alt -} - -func (p *parser) parseFragment() astNode { - if !p.consume(tokenKindLBrace) { - raiseSyntaxError(synErrFragmentExpInvalidForm) - } - if !p.consume(tokenKindFragmentSymbol) { - raiseSyntaxError(synErrFragmentExpInvalidForm) - } - sym := p.lastTok.fragmentSymbol - - if !p.consume(tokenKindRBrace) { - raiseSyntaxError(synErrFragmentExpInvalidForm) - } - - p.incomplete = true - - return newFragmentNode(sym, nil) -} - -func (p *parser) parseNormalChar() astNode { - if !p.consume(tokenKindChar) { - return nil - } - return genNormalCharAST(p.lastTok.char) -} - -func genNormalCharAST(c rune) astNode { - b := []byte(string(c)) - switch len(b) { - case 1: - return newSymbolNode(b[0]) - case 2: - return genConcatNode( - newSymbolNode(b[0]), - newSymbolNode(b[1]), - ) - case 3: - return genConcatNode( - newSymbolNode(b[0]), - newSymbolNode(b[1]), - newSymbolNode(b[2]), - ) - default: // is equivalent to case 4 - return genConcatNode( - newSymbolNode(b[0]), - newSymbolNode(b[1]), - newSymbolNode(b[2]), - newSymbolNode(b[3]), - ) - } -} - -func exclude(symbol, base astNode) astNode { - if alt, ok := symbol.(*altNode); ok { - return exclude(alt.right, exclude(alt.left, base)) - } - - switch base.(type) { - case *altNode: - left, right := base.children() - return genAltNode( - exclude(symbol, left), - exclude(symbol, right), - ) - case *concatNode: - baseSeq := genByteRangeSeq(base) - symSeq := genByteRangeSeq(symbol) - excluded := excludeByteRangeSequence(symSeq, baseSeq) - if len(excluded) <= 0 { - return nil - } - return convertByteRangeSeqsToAST(excluded) - case *symbolNode: - baseSeq := genByteRangeSeq(base) - symSeq := genByteRangeSeq(symbol) - excluded := excludeByteRangeSequence(symSeq, baseSeq) - if len(excluded) <= 0 { - return nil - } - return convertByteRangeSeqsToAST(excluded) - } - return nil -} - -func convertByteRangeSeqsToAST(seqs [][]byteRange) astNode { - concats := []astNode{} - for _, seq := range seqs { - syms := []astNode{} - for _, elem := range seq { - syms = append(syms, newRangeSymbolNode(elem.from, elem.to)) - } - concats = append(concats, genConcatNode(syms...)) - } - return genAltNode(concats...) -} - -func genAnyCharAST() astNode { - return convertCharBlocksToAST(utf8.AllCharBlocks()) -} - -func genRangeAST(fromNode, toNode astNode) astNode { - from := genByteSeq(fromNode) - to := genByteSeq(toNode) - blks, err := utf8.GenCharBlocks(from, to) - if err != nil { - panic(err) - } - return convertCharBlocksToAST(blks) -} - -func convertCharBlocksToAST(blks []*utf8.CharBlock) astNode { - var alt astNode - for _, blk := range blks { - r := make([]astNode, len(blk.From)) - for i := 0; i < len(blk.From); i++ { - r[i] = newRangeSymbolNode(blk.From[i], blk.To[i]) - } - alt = genAltNode(alt, genConcatNode(r...)) - } - return alt -} - -func genByteSeq(node astNode) []byte { - switch n := node.(type) { - case *symbolNode: - return []byte{n.from} - case *concatNode: - seq := genByteSeq(n.left) - seq = append(seq, genByteSeq(n.right)...) - return seq - } - panic(fmt.Errorf("genByteSeq() cannot handle %T: %v", node, node)) -} - -func genByteRangeSeq(node astNode) []byteRange { - switch n := node.(type) { - case *symbolNode: - return []byteRange{{from: n.from, to: n.to}} - case *concatNode: - seq := genByteRangeSeq(n.left) - seq = append(seq, genByteRangeSeq(n.right)...) - return seq - } - panic(fmt.Errorf("genByteRangeSeq() cannot handle %T: %v", node, node)) -} - -func isValidOrder(from, to []byte) bool { - if len(from) > len(to) { - return false - } - if len(from) < len(to) { - return true - } - for i, f := range from { - t := to[i] - if f > t { - return false - } - if f < t { - return true - } - } - return true -} - -func genConcatNode(cs ...astNode) astNode { - if len(cs) <= 0 { - return nil - } - if len(cs) == 1 { - return cs[0] - } - concat := newConcatNode(cs[0], cs[1]) - for _, c := range cs[2:] { - concat = newConcatNode(concat, c) - } - return concat -} - -func genAltNode(cs ...astNode) astNode { - nonNilNodes := []astNode{} - for _, c := range cs { - if c == nil { - continue - } - nonNilNodes = append(nonNilNodes, c) - } - if len(nonNilNodes) <= 0 { - return nil - } - if len(nonNilNodes) == 1 { - return nonNilNodes[0] - } - alt := newAltNode(nonNilNodes[0], nonNilNodes[1]) - for _, c := range nonNilNodes[2:] { - alt = newAltNode(alt, c) - } - return alt -} - -func (p *parser) expect(expected tokenKind) { - if !p.consume(expected) { - tok := p.peekedTok - p.errMsgDetails = fmt.Sprintf("unexpected token; expected: %v, actual: %v", expected, tok.kind) - raiseSyntaxError(synErrUnexpectedToken) - } -} - -func (p *parser) consume(expected tokenKind) bool { - var tok *token - var err error - if p.peekedTok != nil { - tok = p.peekedTok - p.peekedTok = nil - } else { - tok, err = p.lex.next() - if err != nil { - p.errMsgDetails = p.lex.errMsgDetails - panic(err) - } - } - p.lastTok = tok - if tok.kind == expected { - return true - } - p.peekedTok = tok - p.lastTok = nil - - return false -} diff --git a/compiler/parser_test.go b/compiler/parser_test.go deleted file mode 100644 index b0bc67a..0000000 --- a/compiler/parser_test.go +++ /dev/null @@ -1,1422 +0,0 @@ -package compiler - -import ( - "fmt" - "reflect" - "testing" - - "github.com/nihei9/maleeni/spec" - "github.com/nihei9/maleeni/ucd" -) - -func symPos(n uint16) symbolPosition { - pos, err := newSymbolPosition(n, false) - if err != nil { - panic(err) - } - return pos -} - -func endPos(n uint16) symbolPosition { - pos, err := newSymbolPosition(n, true) - if err != nil { - panic(err) - } - return pos -} - -func TestParse(t *testing.T) { - tests := []struct { - pattern string - fragments map[string]string - ast astNode - syntaxError *SyntaxError - - // When an AST is large, as patterns containing a character property expression, - // this test only checks that the pattern is parsable. - // The check of the validity of such AST is performed by checking that it can be matched correctly using the driver. - skipTestAST bool - }{ - { - pattern: "a", - ast: genConcatNode( - newSymbolNodeWithPos(byte('a'), symPos(1)), - newEndMarkerNodeWithPos(1, endPos(2)), - ), - }, - { - pattern: "abc", - ast: genConcatNode( - genConcatNode( - newSymbolNodeWithPos(byte('a'), symPos(1)), - newSymbolNodeWithPos(byte('b'), symPos(2)), - newSymbolNodeWithPos(byte('c'), symPos(3)), - ), - newEndMarkerNodeWithPos(1, endPos(4)), - ), - }, - { - pattern: "a?", - ast: genConcatNode( - newOptionNode( - newSymbolNodeWithPos(byte('a'), symPos(1)), - ), - newEndMarkerNodeWithPos(1, endPos(2)), - ), - }, - { - pattern: "[abc]?", - ast: genConcatNode( - newOptionNode( - genAltNode( - newSymbolNodeWithPos(byte('a'), symPos(1)), - newSymbolNodeWithPos(byte('b'), symPos(2)), - newSymbolNodeWithPos(byte('c'), symPos(3)), - ), - ), - newEndMarkerNodeWithPos(1, endPos(4)), - ), - }, - { - pattern: "\\u{3042}?", - ast: genConcatNode( - newOptionNode( - genConcatNode( - newSymbolNodeWithPos(0xE3, symPos(1)), - newSymbolNodeWithPos(0x81, symPos(2)), - newSymbolNodeWithPos(0x82, symPos(3)), - ), - ), - newEndMarkerNodeWithPos(1, endPos(4)), - ), - }, - { - pattern: "\\p{Letter}?", - skipTestAST: true, - }, - { - pattern: "\\f{a2c}?", - fragments: map[string]string{ - "a2c": "abc", - }, - ast: genConcatNode( - newOptionNode( - newFragmentNode("a2c", - genConcatNode( - newSymbolNodeWithPos(byte('a'), symPos(1)), - newSymbolNodeWithPos(byte('b'), symPos(2)), - newSymbolNodeWithPos(byte('c'), symPos(3)), - ), - ), - ), - newEndMarkerNodeWithPos(1, endPos(4)), - ), - }, - { - pattern: "(a)?", - ast: genConcatNode( - newOptionNode( - newSymbolNodeWithPos(byte('a'), symPos(1)), - ), - newEndMarkerNodeWithPos(1, endPos(2)), - ), - }, - { - pattern: "((a?)?)?", - ast: genConcatNode( - newOptionNode( - newOptionNode( - newOptionNode( - newSymbolNodeWithPos(byte('a'), symPos(1)), - ), - ), - ), - newEndMarkerNodeWithPos(1, endPos(2)), - ), - }, - { - pattern: "(abc)?", - ast: genConcatNode( - newOptionNode( - genConcatNode( - newSymbolNodeWithPos(byte('a'), symPos(1)), - newSymbolNodeWithPos(byte('b'), symPos(2)), - newSymbolNodeWithPos(byte('c'), symPos(3)), - ), - ), - newEndMarkerNodeWithPos(1, endPos(4)), - ), - }, - { - pattern: "(a|b)?", - ast: genConcatNode( - newOptionNode( - genAltNode( - newSymbolNodeWithPos(byte('a'), symPos(1)), - newSymbolNodeWithPos(byte('b'), symPos(2)), - ), - ), - newEndMarkerNodeWithPos(1, endPos(3)), - ), - }, - { - pattern: "?", - syntaxError: synErrRepNoTarget, - }, - { - pattern: "(?)", - syntaxError: synErrRepNoTarget, - }, - { - pattern: "a|?", - syntaxError: synErrRepNoTarget, - }, - { - pattern: "?|b", - syntaxError: synErrRepNoTarget, - }, - { - pattern: "a??", - syntaxError: synErrRepNoTarget, - }, - { - pattern: "a*", - ast: genConcatNode( - newRepeatNode( - newSymbolNodeWithPos(byte('a'), 1), - ), - newEndMarkerNodeWithPos(1, endPos(2)), - ), - }, - { - pattern: "[abc]*", - ast: genConcatNode( - newRepeatNode( - genAltNode( - newSymbolNodeWithPos(byte('a'), 1), - newSymbolNodeWithPos(byte('b'), 2), - newSymbolNodeWithPos(byte('c'), 3), - ), - ), - newEndMarkerNodeWithPos(1, endPos(4)), - ), - }, - { - pattern: "\\u{3042}*", - ast: genConcatNode( - newRepeatNode( - genConcatNode( - newSymbolNodeWithPos(0xE3, symPos(1)), - newSymbolNodeWithPos(0x81, symPos(2)), - newSymbolNodeWithPos(0x82, symPos(3)), - ), - ), - newEndMarkerNodeWithPos(1, endPos(4)), - ), - }, - { - pattern: "\\p{Letter}*", - skipTestAST: true, - }, - { - pattern: "\\f{a2c}*", - fragments: map[string]string{ - "a2c": "abc", - }, - ast: genConcatNode( - newRepeatNode( - newFragmentNode("a2c", - genConcatNode( - newSymbolNodeWithPos(byte('a'), symPos(1)), - newSymbolNodeWithPos(byte('b'), symPos(2)), - newSymbolNodeWithPos(byte('c'), symPos(3)), - ), - ), - ), - newEndMarkerNodeWithPos(1, endPos(4)), - ), - }, - { - pattern: "((a*)*)*", - ast: genConcatNode( - newRepeatNode( - newRepeatNode( - newRepeatNode( - newSymbolNodeWithPos(byte('a'), 1), - ), - ), - ), - newEndMarkerNodeWithPos(1, endPos(2)), - ), - }, - { - pattern: "(abc)*", - ast: genConcatNode( - newRepeatNode( - genConcatNode( - newSymbolNodeWithPos(byte('a'), 1), - newSymbolNodeWithPos(byte('b'), 2), - newSymbolNodeWithPos(byte('c'), 3), - ), - ), - newEndMarkerNodeWithPos(1, endPos(4)), - ), - }, - { - pattern: "(a|b)*", - ast: genConcatNode( - newRepeatNode( - genAltNode( - newSymbolNodeWithPos(byte('a'), 1), - newSymbolNodeWithPos(byte('b'), 2), - ), - ), - newEndMarkerNodeWithPos(1, endPos(3)), - ), - }, - { - pattern: "*", - syntaxError: synErrRepNoTarget, - }, - { - pattern: "(*)", - syntaxError: synErrRepNoTarget, - }, - { - pattern: "a|*", - syntaxError: synErrRepNoTarget, - }, - { - pattern: "*|b", - syntaxError: synErrRepNoTarget, - }, - { - pattern: "a**", - syntaxError: synErrRepNoTarget, - }, - { - pattern: "a+", - ast: genConcatNode( - newSymbolNodeWithPos(byte('a'), symPos(1)), - newRepeatNode( - newSymbolNodeWithPos(byte('a'), symPos(2)), - ), - newEndMarkerNodeWithPos(1, endPos(3)), - ), - }, - { - pattern: "[abc]+", - ast: genConcatNode( - genAltNode( - newSymbolNodeWithPos(byte('a'), symPos(1)), - newSymbolNodeWithPos(byte('b'), symPos(2)), - newSymbolNodeWithPos(byte('c'), symPos(3)), - ), - newRepeatNode( - genAltNode( - newSymbolNodeWithPos(byte('a'), symPos(4)), - newSymbolNodeWithPos(byte('b'), symPos(5)), - newSymbolNodeWithPos(byte('c'), symPos(6)), - ), - ), - newEndMarkerNodeWithPos(1, endPos(7)), - ), - }, - { - pattern: "\\u{3042}+", - ast: genConcatNode( - genConcatNode( - newSymbolNodeWithPos(0xE3, symPos(1)), - newSymbolNodeWithPos(0x81, symPos(2)), - newSymbolNodeWithPos(0x82, symPos(3)), - ), - newRepeatNode( - genConcatNode( - newSymbolNodeWithPos(0xE3, symPos(4)), - newSymbolNodeWithPos(0x81, symPos(5)), - newSymbolNodeWithPos(0x82, symPos(6)), - ), - ), - newEndMarkerNodeWithPos(1, endPos(7)), - ), - }, - { - pattern: "\\p{Letter}+", - skipTestAST: true, - }, - { - pattern: "\\f{a2c}+", - fragments: map[string]string{ - "a2c": "abc", - }, - ast: genConcatNode( - newFragmentNode("a2c", - genConcatNode( - newSymbolNodeWithPos(byte('a'), symPos(1)), - newSymbolNodeWithPos(byte('b'), symPos(2)), - newSymbolNodeWithPos(byte('c'), symPos(3)), - ), - ), - newRepeatNode( - newFragmentNode("a2c", - genConcatNode( - newSymbolNodeWithPos(byte('a'), symPos(4)), - newSymbolNodeWithPos(byte('b'), symPos(5)), - newSymbolNodeWithPos(byte('c'), symPos(6)), - ), - ), - ), - newEndMarkerNodeWithPos(1, endPos(7)), - ), - }, - { - pattern: "((a+)+)+", - ast: genConcatNode( - genConcatNode( - genConcatNode( - genConcatNode( - newSymbolNodeWithPos(byte('a'), symPos(1)), - newRepeatNode( - newSymbolNodeWithPos(byte('a'), symPos(2)), - ), - ), - newRepeatNode( - genConcatNode( - newSymbolNodeWithPos(byte('a'), symPos(3)), - newRepeatNode( - newSymbolNodeWithPos(byte('a'), symPos(4)), - ), - ), - ), - ), - newRepeatNode( - genConcatNode( - genConcatNode( - newSymbolNodeWithPos(byte('a'), symPos(5)), - newRepeatNode( - newSymbolNodeWithPos(byte('a'), symPos(6)), - ), - ), - newRepeatNode( - genConcatNode( - newSymbolNodeWithPos(byte('a'), symPos(7)), - newRepeatNode( - newSymbolNodeWithPos(byte('a'), symPos(8)), - ), - ), - ), - ), - ), - ), - newEndMarkerNodeWithPos(1, endPos(9)), - ), - }, - { - pattern: "(abc)+", - ast: genConcatNode( - genConcatNode( - newSymbolNodeWithPos(byte('a'), symPos(1)), - newSymbolNodeWithPos(byte('b'), symPos(2)), - newSymbolNodeWithPos(byte('c'), symPos(3)), - ), - newRepeatNode( - genConcatNode( - newSymbolNodeWithPos(byte('a'), symPos(4)), - newSymbolNodeWithPos(byte('b'), symPos(5)), - newSymbolNodeWithPos(byte('c'), symPos(6)), - ), - ), - newEndMarkerNodeWithPos(1, endPos(7)), - ), - }, - { - pattern: "(a|b)+", - ast: genConcatNode( - genAltNode( - newSymbolNodeWithPos(byte('a'), symPos(1)), - newSymbolNodeWithPos(byte('b'), symPos(2)), - ), - newRepeatNode( - genAltNode( - newSymbolNodeWithPos(byte('a'), symPos(3)), - newSymbolNodeWithPos(byte('b'), symPos(4)), - ), - ), - newEndMarkerNodeWithPos(1, endPos(5)), - ), - }, - { - pattern: "+", - syntaxError: synErrRepNoTarget, - }, - { - pattern: "(+)", - syntaxError: synErrRepNoTarget, - }, - { - pattern: "a|+", - syntaxError: synErrRepNoTarget, - }, - { - pattern: "+|b", - syntaxError: synErrRepNoTarget, - }, - { - pattern: "a++", - syntaxError: synErrRepNoTarget, - }, - { - pattern: ".", - ast: newConcatNode( - genAltNode( - newRangeSymbolNodeWithPos(0x00, 0x7f, symPos(1)), - genConcatNode( - newRangeSymbolNodeWithPos(0xc2, 0xdf, symPos(2)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(3)), - ), - genConcatNode( - newRangeSymbolNodeWithPos(0xe0, 0xe0, symPos(4)), - newRangeSymbolNodeWithPos(0xa0, 0xbf, symPos(5)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(6)), - ), - genConcatNode( - newRangeSymbolNodeWithPos(0xe1, 0xec, symPos(7)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(8)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(9)), - ), - genConcatNode( - newRangeSymbolNodeWithPos(0xed, 0xed, symPos(10)), - newRangeSymbolNodeWithPos(0x80, 0x9f, symPos(11)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(12)), - ), - genConcatNode( - newRangeSymbolNodeWithPos(0xee, 0xef, symPos(13)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(14)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(15)), - ), - genConcatNode( - newRangeSymbolNodeWithPos(0xf0, 0xf0, symPos(16)), - newRangeSymbolNodeWithPos(0x90, 0xbf, symPos(17)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(18)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(19)), - ), - genConcatNode( - newRangeSymbolNodeWithPos(0xf1, 0xf3, symPos(20)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(21)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(22)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(23)), - ), - genConcatNode( - newRangeSymbolNodeWithPos(0xf4, 0xf4, symPos(24)), - newRangeSymbolNodeWithPos(0x80, 0x8f, symPos(25)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(26)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(27)), - ), - ), - newEndMarkerNodeWithPos(1, endPos(28)), - ), - }, - { - pattern: "[a]", - ast: newConcatNode( - newSymbolNodeWithPos(byte('a'), symPos(1)), - newEndMarkerNodeWithPos(1, endPos(2)), - ), - }, - { - pattern: "[abc]", - ast: newConcatNode( - genAltNode( - newSymbolNodeWithPos(byte('a'), symPos(1)), - newSymbolNodeWithPos(byte('b'), symPos(2)), - newSymbolNodeWithPos(byte('c'), symPos(3)), - ), - newEndMarkerNodeWithPos(1, endPos(4)), - ), - }, - { - pattern: "[a-z]", - ast: newConcatNode( - newRangeSymbolNodeWithPos(byte('a'), byte('z'), symPos(1)), - newEndMarkerNodeWithPos(1, endPos(2)), - ), - }, - { - pattern: "[A-Za-z]", - ast: newConcatNode( - genAltNode( - newRangeSymbolNodeWithPos(byte('A'), byte('Z'), symPos(1)), - newRangeSymbolNodeWithPos(byte('a'), byte('z'), symPos(2)), - ), - newEndMarkerNodeWithPos(1, endPos(3)), - ), - }, - { - pattern: "[\\u{004E}]", - ast: newConcatNode( - newSymbolNodeWithPos(byte('N'), symPos(1)), - newEndMarkerNodeWithPos(1, endPos(2)), - ), - }, - { - pattern: "[\\p{Lu}]", - skipTestAST: true, - }, - { - pattern: "a[]", - syntaxError: synErrBExpNoElem, - }, - { - pattern: "[]a", - syntaxError: synErrBExpNoElem, - }, - { - pattern: "[]", - syntaxError: synErrBExpNoElem, - }, - { - pattern: "[^]", - ast: newConcatNode( - newSymbolNodeWithPos(byte('^'), symPos(1)), - newEndMarkerNodeWithPos(1, endPos(2)), - ), - }, - { - pattern: "[", - syntaxError: synErrBExpUnclosed, - }, - { - pattern: "([", - syntaxError: synErrBExpUnclosed, - }, - { - pattern: "[a", - syntaxError: synErrBExpUnclosed, - }, - { - pattern: "([a", - syntaxError: synErrBExpUnclosed, - }, - { - pattern: "[a-", - syntaxError: synErrBExpUnclosed, - }, - { - pattern: "([a-", - syntaxError: synErrBExpUnclosed, - }, - { - pattern: "[^", - syntaxError: synErrBExpUnclosed, - }, - { - pattern: "([^", - syntaxError: synErrBExpUnclosed, - }, - { - pattern: "[^a", - syntaxError: synErrBExpUnclosed, - }, - { - pattern: "([^a", - syntaxError: synErrBExpUnclosed, - }, - { - pattern: "[^a-", - syntaxError: synErrBExpUnclosed, - }, - { - pattern: "([^a-", - syntaxError: synErrBExpUnclosed, - }, - { - pattern: "]", - ast: newConcatNode( - newSymbolNodeWithPos(byte(']'), symPos(1)), - newEndMarkerNodeWithPos(1, endPos(2)), - ), - }, - { - pattern: "(]", - syntaxError: synErrGroupUnclosed, - }, - { - pattern: "a]", - ast: newConcatNode( - genConcatNode( - newSymbolNodeWithPos(byte('a'), symPos(1)), - newSymbolNodeWithPos(byte(']'), symPos(2)), - ), - newEndMarkerNodeWithPos(1, endPos(3)), - ), - }, - { - pattern: "(a]", - syntaxError: synErrGroupUnclosed, - }, - { - pattern: "([)", - syntaxError: synErrBExpUnclosed, - }, - { - pattern: "([a)", - syntaxError: synErrBExpUnclosed, - }, - { - pattern: "[a-]", - ast: newConcatNode( - genAltNode( - newSymbolNodeWithPos(byte('a'), symPos(1)), - newSymbolNodeWithPos(byte('-'), symPos(2)), - ), - newEndMarkerNodeWithPos(1, endPos(3)), - ), - }, - { - pattern: "[^a-]", - ast: newConcatNode( - genAltNode( - newRangeSymbolNodeWithPos(0x00, byte(44), symPos(1)), - newRangeSymbolNodeWithPos(byte(46), byte(96), symPos(2)), - newRangeSymbolNodeWithPos(byte(98), 0x7f, symPos(3)), - genConcatNode( - newRangeSymbolNodeWithPos(0xc2, 0xdf, symPos(4)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(5)), - ), - genConcatNode( - newRangeSymbolNodeWithPos(0xe0, 0xe0, symPos(6)), - newRangeSymbolNodeWithPos(0xa0, 0xbf, symPos(7)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(8)), - ), - genConcatNode( - newRangeSymbolNodeWithPos(0xe1, 0xec, symPos(9)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(10)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(11)), - ), - genConcatNode( - newRangeSymbolNodeWithPos(0xed, 0xed, symPos(12)), - newRangeSymbolNodeWithPos(0x80, 0x9f, symPos(13)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(14)), - ), - genConcatNode( - newRangeSymbolNodeWithPos(0xee, 0xef, symPos(15)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(16)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(17)), - ), - genConcatNode( - newRangeSymbolNodeWithPos(0xf0, 0xf0, symPos(18)), - newRangeSymbolNodeWithPos(0x90, 0xbf, symPos(19)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(20)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(21)), - ), - genConcatNode( - newRangeSymbolNodeWithPos(0xf1, 0xf3, symPos(22)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(23)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(24)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(25)), - ), - genConcatNode( - newRangeSymbolNodeWithPos(0xf4, 0xf4, symPos(26)), - newRangeSymbolNodeWithPos(0x80, 0x8f, symPos(27)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(28)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(29)), - ), - ), - newEndMarkerNodeWithPos(1, endPos(30)), - ), - }, - { - pattern: "[-z]", - ast: newConcatNode( - genAltNode( - newSymbolNodeWithPos(byte('-'), symPos(1)), - newSymbolNodeWithPos(byte('z'), symPos(2)), - ), - newEndMarkerNodeWithPos(1, endPos(3)), - ), - }, - { - pattern: "[^-z]", - ast: newConcatNode( - genAltNode( - newRangeSymbolNodeWithPos(0x00, byte(44), symPos(1)), - genAltNode( - newRangeSymbolNodeWithPos(byte(46), byte(121), symPos(2)), - newRangeSymbolNodeWithPos(byte(123), 0x7f, symPos(3)), - ), - genConcatNode( - newRangeSymbolNodeWithPos(0xc2, 0xdf, symPos(4)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(5)), - ), - genConcatNode( - newRangeSymbolNodeWithPos(0xe0, 0xe0, symPos(6)), - newRangeSymbolNodeWithPos(0xa0, 0xbf, symPos(7)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(8)), - ), - genConcatNode( - newRangeSymbolNodeWithPos(0xe1, 0xec, symPos(9)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(10)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(11)), - ), - genConcatNode( - newRangeSymbolNodeWithPos(0xed, 0xed, symPos(12)), - newRangeSymbolNodeWithPos(0x80, 0x9f, symPos(13)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(14)), - ), - genConcatNode( - newRangeSymbolNodeWithPos(0xee, 0xef, symPos(15)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(16)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(17)), - ), - genConcatNode( - newRangeSymbolNodeWithPos(0xf0, 0xf0, symPos(18)), - newRangeSymbolNodeWithPos(0x90, 0xbf, symPos(19)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(20)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(21)), - ), - genConcatNode( - newRangeSymbolNodeWithPos(0xf1, 0xf3, symPos(22)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(23)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(24)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(25)), - ), - genConcatNode( - newRangeSymbolNodeWithPos(0xf4, 0xf4, symPos(26)), - newRangeSymbolNodeWithPos(0x80, 0x8f, symPos(27)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(28)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(29)), - ), - ), - newEndMarkerNodeWithPos(1, endPos(30)), - ), - }, - { - pattern: "[-]", - ast: newConcatNode( - newSymbolNodeWithPos(byte('-'), symPos(1)), - newEndMarkerNodeWithPos(1, endPos(2)), - ), - }, - { - pattern: "[^-]", - ast: newConcatNode( - genAltNode( - newRangeSymbolNodeWithPos(0x00, byte(44), symPos(1)), - newRangeSymbolNodeWithPos(byte(46), 0x7f, symPos(2)), - genConcatNode( - newRangeSymbolNodeWithPos(0xc2, 0xdf, symPos(3)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(4)), - ), - genConcatNode( - newRangeSymbolNodeWithPos(0xe0, 0xe0, symPos(5)), - newRangeSymbolNodeWithPos(0xa0, 0xbf, symPos(6)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(7)), - ), - genConcatNode( - newRangeSymbolNodeWithPos(0xe1, 0xec, symPos(8)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(9)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(10)), - ), - genConcatNode( - newRangeSymbolNodeWithPos(0xed, 0xed, symPos(11)), - newRangeSymbolNodeWithPos(0x80, 0x9f, symPos(12)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(13)), - ), - genConcatNode( - newRangeSymbolNodeWithPos(0xee, 0xef, symPos(14)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(15)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(16)), - ), - genConcatNode( - newRangeSymbolNodeWithPos(0xf0, 0xf0, symPos(17)), - newRangeSymbolNodeWithPos(0x90, 0xbf, symPos(18)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(19)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(20)), - ), - genConcatNode( - newRangeSymbolNodeWithPos(0xf1, 0xf3, symPos(21)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(22)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(23)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(24)), - ), - genConcatNode( - newRangeSymbolNodeWithPos(0xf4, 0xf4, symPos(25)), - newRangeSymbolNodeWithPos(0x80, 0x8f, symPos(26)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(27)), - newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(28)), - ), - ), - newEndMarkerNodeWithPos(1, endPos(29)), - ), - }, - { - pattern: "\\u{006E}", - ast: genConcatNode( - newSymbolNodeWithPos(0x6E, symPos(1)), - newEndMarkerNodeWithPos(1, endPos(2)), - ), - }, - { - pattern: "\\u{03BD}", - ast: genConcatNode( - genConcatNode( - newSymbolNodeWithPos(0xCE, symPos(1)), - newSymbolNodeWithPos(0xBD, symPos(2)), - ), - newEndMarkerNodeWithPos(1, endPos(3)), - ), - }, - { - pattern: "\\u{306B}", - ast: genConcatNode( - genConcatNode( - newSymbolNodeWithPos(0xE3, symPos(1)), - newSymbolNodeWithPos(0x81, symPos(2)), - newSymbolNodeWithPos(0xAB, symPos(3)), - ), - newEndMarkerNodeWithPos(1, endPos(4)), - ), - }, - { - pattern: "\\u{01F638}", - ast: genConcatNode( - genConcatNode( - newSymbolNodeWithPos(0xF0, symPos(1)), - newSymbolNodeWithPos(0x9F, symPos(2)), - newSymbolNodeWithPos(0x98, symPos(3)), - newSymbolNodeWithPos(0xB8, symPos(4)), - ), - newEndMarkerNodeWithPos(1, endPos(5)), - ), - }, - { - pattern: "\\u{0000}", - ast: genConcatNode( - newSymbolNodeWithPos(0x00, symPos(1)), - newEndMarkerNodeWithPos(1, endPos(2)), - ), - }, - { - pattern: "\\u{10FFFF}", - ast: genConcatNode( - genConcatNode( - newSymbolNodeWithPos(0xF4, symPos(1)), - newSymbolNodeWithPos(0x8F, symPos(2)), - newSymbolNodeWithPos(0xBF, symPos(3)), - newSymbolNodeWithPos(0xBF, symPos(4)), - ), - newEndMarkerNodeWithPos(1, endPos(5)), - ), - }, - { - pattern: "\\u{110000}", - syntaxError: synErrCPExpOutOfRange, - }, - { - pattern: "\\u", - syntaxError: synErrCPExpInvalidForm, - }, - { - pattern: "\\u{", - syntaxError: synErrCPExpInvalidForm, - }, - { - pattern: "\\u{03BD", - syntaxError: synErrCPExpInvalidForm, - }, - { - pattern: "\\u{}", - syntaxError: synErrCPExpInvalidForm, - }, - { - pattern: "\\p{Letter}", - skipTestAST: true, - }, - { - pattern: "\\p{General_Category=Letter}", - skipTestAST: true, - }, - { - pattern: "\\p{ Letter }", - skipTestAST: true, - }, - { - pattern: "\\p{ General_Category = Letter }", - skipTestAST: true, - }, - { - pattern: "\\p", - syntaxError: synErrCharPropExpInvalidForm, - }, - { - pattern: "\\p{", - syntaxError: synErrCharPropExpInvalidForm, - }, - { - pattern: "\\p{Letter", - syntaxError: synErrCharPropExpInvalidForm, - }, - { - pattern: "\\p{General_Category=}", - syntaxError: synErrCharPropExpInvalidForm, - }, - { - pattern: "\\p{General_Category= }", - syntaxError: synErrCharPropInvalidSymbol, - }, - { - pattern: "\\p{=Letter}", - syntaxError: synErrCharPropExpInvalidForm, - }, - { - pattern: "\\p{ =Letter}", - syntaxError: synErrCharPropInvalidSymbol, - }, - { - pattern: "\\p{=}", - syntaxError: synErrCharPropExpInvalidForm, - }, - { - pattern: "\\p{}", - syntaxError: synErrCharPropExpInvalidForm, - }, - { - pattern: "\\f{a2c}", - fragments: map[string]string{ - "a2c": "abc", - }, - ast: genConcatNode( - newFragmentNode("a2c", - genConcatNode( - newSymbolNodeWithPos(byte('a'), symPos(1)), - newSymbolNodeWithPos(byte('b'), symPos(2)), - newSymbolNodeWithPos(byte('c'), symPos(3)), - ), - ), - newEndMarkerNodeWithPos(1, endPos(4)), - ), - }, - { - pattern: "\\f{ a2c }", - fragments: map[string]string{ - "a2c": "abc", - }, - ast: genConcatNode( - newFragmentNode("a2c", - genConcatNode( - newSymbolNodeWithPos(byte('a'), symPos(1)), - newSymbolNodeWithPos(byte('b'), symPos(2)), - newSymbolNodeWithPos(byte('c'), symPos(3)), - ), - ), - newEndMarkerNodeWithPos(1, endPos(4)), - ), - }, - { - pattern: "\\f", - syntaxError: synErrFragmentExpInvalidForm, - }, - { - pattern: "\\f{", - syntaxError: synErrFragmentExpInvalidForm, - }, - { - pattern: "\\f{a2c", - fragments: map[string]string{ - "a2c": "abc", - }, - syntaxError: synErrFragmentExpInvalidForm, - }, - { - pattern: "(a)", - ast: newConcatNode( - newSymbolNodeWithPos(byte('a'), symPos(1)), - newEndMarkerNodeWithPos(1, endPos(2)), - ), - }, - { - pattern: "(((a)))", - ast: newConcatNode( - newSymbolNodeWithPos(byte('a'), symPos(1)), - newEndMarkerNodeWithPos(1, endPos(2)), - ), - }, - { - pattern: "a()", - syntaxError: synErrGroupNoElem, - }, - { - pattern: "()a", - syntaxError: synErrGroupNoElem, - }, - { - pattern: "()", - syntaxError: synErrGroupNoElem, - }, - { - pattern: "(", - syntaxError: synErrGroupUnclosed, - }, - { - pattern: "a(", - syntaxError: synErrGroupUnclosed, - }, - { - pattern: "(a", - syntaxError: synErrGroupUnclosed, - }, - { - pattern: "((", - syntaxError: synErrGroupUnclosed, - }, - { - pattern: "((a)", - syntaxError: synErrGroupUnclosed, - }, - { - pattern: ")", - syntaxError: synErrGroupNoInitiator, - }, - { - pattern: "a)", - syntaxError: synErrGroupNoInitiator, - }, - { - pattern: ")a", - syntaxError: synErrGroupNoInitiator, - }, - { - pattern: "))", - syntaxError: synErrGroupNoInitiator, - }, - { - pattern: "(a))", - syntaxError: synErrGroupNoInitiator, - }, - { - pattern: "Mulder|Scully", - ast: newConcatNode( - genAltNode( - genConcatNode( - newSymbolNodeWithPos(byte('M'), symPos(1)), - newSymbolNodeWithPos(byte('u'), symPos(2)), - newSymbolNodeWithPos(byte('l'), symPos(3)), - newSymbolNodeWithPos(byte('d'), symPos(4)), - newSymbolNodeWithPos(byte('e'), symPos(5)), - newSymbolNodeWithPos(byte('r'), symPos(6)), - ), - genConcatNode( - newSymbolNodeWithPos(byte('S'), symPos(7)), - newSymbolNodeWithPos(byte('c'), symPos(8)), - newSymbolNodeWithPos(byte('u'), symPos(9)), - newSymbolNodeWithPos(byte('l'), symPos(10)), - newSymbolNodeWithPos(byte('l'), symPos(11)), - newSymbolNodeWithPos(byte('y'), symPos(12)), - ), - ), - newEndMarkerNodeWithPos(1, endPos(13)), - ), - }, - { - pattern: "Langly|Frohike|Byers", - ast: newConcatNode( - genAltNode( - genConcatNode( - newSymbolNodeWithPos(byte('L'), symPos(1)), - newSymbolNodeWithPos(byte('a'), symPos(2)), - newSymbolNodeWithPos(byte('n'), symPos(3)), - newSymbolNodeWithPos(byte('g'), symPos(4)), - newSymbolNodeWithPos(byte('l'), symPos(5)), - newSymbolNodeWithPos(byte('y'), symPos(6)), - ), - genConcatNode( - newSymbolNodeWithPos(byte('F'), symPos(7)), - newSymbolNodeWithPos(byte('r'), symPos(8)), - newSymbolNodeWithPos(byte('o'), symPos(9)), - newSymbolNodeWithPos(byte('h'), symPos(10)), - newSymbolNodeWithPos(byte('i'), symPos(11)), - newSymbolNodeWithPos(byte('k'), symPos(12)), - newSymbolNodeWithPos(byte('e'), symPos(13)), - ), - genConcatNode( - newSymbolNodeWithPos(byte('B'), symPos(14)), - newSymbolNodeWithPos(byte('y'), symPos(15)), - newSymbolNodeWithPos(byte('e'), symPos(16)), - newSymbolNodeWithPos(byte('r'), symPos(17)), - newSymbolNodeWithPos(byte('s'), symPos(18)), - ), - ), - newEndMarkerNodeWithPos(1, endPos(19)), - ), - }, - { - pattern: "|", - syntaxError: synErrAltLackOfOperand, - }, - { - pattern: "||", - syntaxError: synErrAltLackOfOperand, - }, - { - pattern: "Mulder|", - syntaxError: synErrAltLackOfOperand, - }, - { - pattern: "|Scully", - syntaxError: synErrAltLackOfOperand, - }, - { - pattern: "Langly|Frohike|", - syntaxError: synErrAltLackOfOperand, - }, - { - pattern: "Langly||Byers", - syntaxError: synErrAltLackOfOperand, - }, - { - pattern: "|Frohike|Byers", - syntaxError: synErrAltLackOfOperand, - }, - { - pattern: "|Frohike|", - syntaxError: synErrAltLackOfOperand, - }, - { - pattern: "Fox(|)Mulder", - syntaxError: synErrAltLackOfOperand, - }, - { - pattern: "(Fox|)Mulder", - syntaxError: synErrAltLackOfOperand, - }, - { - pattern: "Fox(|Mulder)", - syntaxError: synErrAltLackOfOperand, - }, - } - for i, tt := range tests { - t.Run(fmt.Sprintf("#%v %v", i, tt.pattern), func(t *testing.T) { - fragments := map[string][]byte{} - for kind, pattern := range tt.fragments { - fragments[kind] = []byte(pattern) - } - ast, _, err := parse([]*patternEntry{ - { - id: spec.LexModeKindIDMin, - pattern: []byte(tt.pattern), - }, - }, fragments) - if tt.syntaxError != nil { - // printAST(os.Stdout, ast, "", "", false) - if err == nil { - t.Fatalf("expected syntax error; got: nil") - } - parseErrs, ok := err.(*ParseErrors) - if !ok { - t.Fatalf("expected ParseErrors; got: %v (type: %T)", err, err) - } - parseErr := parseErrs.Errors[0].Cause - synErr, ok := parseErr.(*SyntaxError) - if !ok { - t.Fatalf("expected SyntaxError; got: %v (type: %T)", parseErr, parseErr) - } - if synErr != tt.syntaxError { - t.Fatalf("unexpected syntax error; want: %v, got: %v", tt.syntaxError, synErr) - } - if ast != nil { - t.Fatalf("ast is not nil") - } - } else { - if err != nil { - t.Fatal(err) - } - if ast == nil { - t.Fatal("AST is nil") - } - // printAST(os.Stdout, ast, "", "", false) - if !tt.skipTestAST { - testAST(t, tt.ast, ast) - } - } - }) - } -} - -func TestParse_ContributoryPropertyIsNotExposed(t *testing.T) { - for _, cProp := range ucd.ContributoryProperties() { - t.Run(fmt.Sprintf("%v", cProp), func(t *testing.T) { - ast, _, err := parse([]*patternEntry{ - { - id: spec.LexModeKindIDMin, - pattern: []byte(fmt.Sprintf(`\p{%v=yes}`, cProp)), - }, - }, nil) - if err == nil { - t.Fatalf("expected syntax error; got: nil") - } - parseErrs, ok := err.(*ParseErrors) - if !ok { - t.Fatalf("expected ParseErrors; got: %v (type: %T)", err, err) - } - parseErr := parseErrs.Errors[0].Cause - synErr, ok := parseErr.(*SyntaxError) - if !ok { - t.Fatalf("expected SyntaxError; got: %v (type: %T)", parseErr, parseErr) - } - if synErr != synErrCharPropUnsupported { - t.Fatalf("unexpected syntax error; want: %v, got: %v", synErrCharPropUnsupported, synErr) - } - if ast != nil { - t.Fatalf("ast is not nil") - } - }) - } -} - -func TestParse_FollowAndSymbolTable(t *testing.T) { - root, symTab, err := parse([]*patternEntry{ - { - id: spec.LexModeKindIDMin, - pattern: []byte("(a|b)*abb"), - }, - }, nil) - if err != nil { - t.Fatal(err) - } - if root == nil { - t.Fatal("root of AST is nil") - } - // printAST(os.Stdout, root, "", "", false) - - { - expectedAST := genConcatNode( - newRepeatNode( - newAltNode( - newSymbolNodeWithPos(byte('a'), symPos(1)), - newSymbolNodeWithPos(byte('b'), symPos(2)), - ), - ), - newSymbolNodeWithPos(byte('a'), symPos(3)), - newSymbolNodeWithPos(byte('b'), symPos(4)), - newSymbolNodeWithPos(byte('b'), symPos(5)), - newEndMarkerNodeWithPos(1, endPos(6)), - ) - testAST(t, expectedAST, root) - } - - { - followTab := genFollowTable(root) - if followTab == nil { - t.Fatal("follow table is nil") - } - expectedFollowTab := followTable{ - 1: newSymbolPositionSet().add(symPos(1)).add(symPos(2)).add(symPos(3)), - 2: newSymbolPositionSet().add(symPos(1)).add(symPos(2)).add(symPos(3)), - 3: newSymbolPositionSet().add(symPos(4)), - 4: newSymbolPositionSet().add(symPos(5)), - 5: newSymbolPositionSet().add(endPos(6)), - } - testFollowTable(t, expectedFollowTab, followTab) - } - - { - entry := func(v byte) byteRange { - return byteRange{ - from: v, - to: v, - } - } - - expectedSymTab := &symbolTable{ - symPos2Byte: map[symbolPosition]byteRange{ - symPos(1): entry(byte('a')), - symPos(2): entry(byte('b')), - symPos(3): entry(byte('a')), - symPos(4): entry(byte('b')), - symPos(5): entry(byte('b')), - }, - endPos2ID: map[symbolPosition]spec.LexModeKindID{ - endPos(6): 1, - }, - } - testSymbolTable(t, expectedSymTab, symTab) - } -} - -func testAST(t *testing.T, expected, actual astNode) { - t.Helper() - - aTy := reflect.TypeOf(actual) - eTy := reflect.TypeOf(expected) - if eTy != aTy { - t.Fatalf("AST node type is mismatched; want: %v, got: %v", eTy, aTy) - } - - if actual == nil { - return - } - - switch e := expected.(type) { - case *symbolNode: - a := actual.(*symbolNode) - if a.pos != e.pos || a.from != e.from || a.to != e.to { - t.Fatalf("unexpected node; want: %+v, got: %+v", e, a) - } - case *endMarkerNode: - a := actual.(*endMarkerNode) - if a.pos != e.pos { - t.Fatalf("symbol position is mismatched; want: %v, got: %v", e.pos, a.pos) - } - } - eLeft, eRight := expected.children() - aLeft, aRight := actual.children() - testAST(t, eLeft, aLeft) - testAST(t, eRight, aRight) -} - -func testFollowTable(t *testing.T, expected, actual followTable) { - if len(actual) != len(expected) { - t.Errorf("unexpected number of the follow table entries; want: %v, got: %v", len(expected), len(actual)) - } - for ePos, eSet := range expected { - aSet, ok := actual[ePos] - if !ok { - t.Fatalf("follow entry is not found; position: %v, follow: %v", ePos, eSet) - } - if aSet.hash() != eSet.hash() { - t.Fatalf("follow entry of position %v is mismatched; want: %v, got: %v", ePos, aSet, eSet) - } - } -} - -func testSymbolTable(t *testing.T, expected, actual *symbolTable) { - t.Helper() - - if len(actual.symPos2Byte) != len(expected.symPos2Byte) { - t.Errorf("unexpected symPos2Byte entries; want: %v entries, got: %v entries", len(expected.symPos2Byte), len(actual.symPos2Byte)) - } - for ePos, eByte := range expected.symPos2Byte { - byte, ok := actual.symPos2Byte[ePos] - if !ok { - t.Errorf("a symbol position entry was not found: %v -> %v", ePos, eByte) - continue - } - if byte.from != eByte.from || byte.to != eByte.to { - t.Errorf("unexpected symbol position entry; want: %v -> %v, got: %v -> %v", ePos, eByte, ePos, byte) - } - } - - if len(actual.endPos2ID) != len(expected.endPos2ID) { - t.Errorf("unexpected endPos2ID entries; want: %v entries, got: %v entries", len(expected.endPos2ID), len(actual.endPos2ID)) - } - for ePos, eID := range expected.endPos2ID { - id, ok := actual.endPos2ID[ePos] - if !ok { - t.Errorf("an end position entry was not found: %v -> %v", ePos, eID) - continue - } - if id != eID { - t.Errorf("unexpected end position entry; want: %v -> %v, got: %v -> %v", ePos, eID, ePos, id) - } - } -} diff --git a/compiler/symbol_position.go b/compiler/symbol_position.go deleted file mode 100644 index d35dfde..0000000 --- a/compiler/symbol_position.go +++ /dev/null @@ -1,198 +0,0 @@ -package compiler - -import ( - "encoding/binary" - "fmt" - "strings" -) - -type symbolPosition uint16 - -const ( - symbolPositionNil = symbolPosition(0x0000) // 0000 0000 0000 0000 - - symbolPositionMin = uint16(0x0001) // 0000 0000 0000 0001 - symbolPositionMax = uint16(0x7fff) // 0111 1111 1111 1111 - - symbolPositionMaskSymbol = uint16(0x0000) // 0000 0000 0000 0000 - symbolPositionMaskEndMark = uint16(0x8000) // 1000 0000 0000 0000 - - symbolPositionMaskValue = uint16(0x7fff) // 0111 1111 1111 1111 -) - -func newSymbolPosition(n uint16, endMark bool) (symbolPosition, error) { - if n < symbolPositionMin || n > symbolPositionMax { - return symbolPositionNil, fmt.Errorf("symbol position must be within %v to %v; n: %v, endMark: %v", symbolPositionMin, symbolPositionMax, n, endMark) - } - if endMark { - return symbolPosition(n | symbolPositionMaskEndMark), nil - } - return symbolPosition(n | symbolPositionMaskSymbol), nil -} - -func (p symbolPosition) String() string { - if p.isEndMark() { - return fmt.Sprintf("end#%v", uint16(p)&symbolPositionMaskValue) - } - return fmt.Sprintf("sym#%v", uint16(p)&symbolPositionMaskValue) -} - -func (p symbolPosition) isEndMark() bool { - if uint16(p)&symbolPositionMaskEndMark > 1 { - return true - } - return false -} - -func (p symbolPosition) describe() (uint16, bool) { - v := uint16(p) & symbolPositionMaskValue - if p.isEndMark() { - return v, true - } - return v, false -} - -type symbolPositionSet struct { - // `s` represents a set of symbol positions. - // However, immediately after adding a symbol position, the elements may be duplicated. - // When you need an aligned set with no duplicates, you can get such value via the set function. - s []symbolPosition - sorted bool -} - -func newSymbolPositionSet() *symbolPositionSet { - return &symbolPositionSet{ - s: []symbolPosition{}, - sorted: false, - } -} - -func (s *symbolPositionSet) String() string { - if len(s.s) <= 0 { - return "{}" - } - ps := s.sortAndRemoveDuplicates() - var b strings.Builder - fmt.Fprintf(&b, "{") - for i, p := range ps { - if i <= 0 { - fmt.Fprintf(&b, "%v", p) - continue - } - fmt.Fprintf(&b, ", %v", p) - } - fmt.Fprintf(&b, "}") - return b.String() -} - -func (s *symbolPositionSet) set() []symbolPosition { - s.sortAndRemoveDuplicates() - return s.s -} - -func (s *symbolPositionSet) add(pos symbolPosition) *symbolPositionSet { - s.s = append(s.s, pos) - s.sorted = false - return s -} - -func (s *symbolPositionSet) merge(t *symbolPositionSet) *symbolPositionSet { - s.s = append(s.s, t.s...) - s.sorted = false - return s -} - -func (s *symbolPositionSet) intersect(set *symbolPositionSet) *symbolPositionSet { - in := newSymbolPositionSet() - for _, p1 := range s.s { - for _, p2 := range set.s { - if p1 != p2 { - continue - } - in.add(p1) - } - } - return in -} - -func (s *symbolPositionSet) hash() string { - if len(s.s) <= 0 { - return "" - } - sorted := s.sortAndRemoveDuplicates() - var buf []byte - for _, p := range sorted { - b := make([]byte, 8) - binary.PutUvarint(b, uint64(p)) - buf = append(buf, b...) - } - // Convert to a string to be able to use it as a key of a map. - // But note this byte sequence is made from values of symbol positions, - // so this is not a well-formed UTF-8 sequence. - return string(buf) -} - -func (s *symbolPositionSet) sortAndRemoveDuplicates() []symbolPosition { - if s.sorted { - return s.s - } - - sortSymbolPositions(s.s, 0, len(s.s)-1) - - // Remove duplicates. - lastV := s.s[0] - nextIdx := 1 - for _, v := range s.s[1:] { - if v == lastV { - continue - } - s.s[nextIdx] = v - nextIdx++ - lastV = v - } - s.s = s.s[:nextIdx] - s.sorted = true - - return s.s -} - -// sortSymbolPositions sorts a slice of symbol positions as it uses quick sort. -func sortSymbolPositions(ps []symbolPosition, left, right int) { - if left >= right { - return - } - var pivot symbolPosition - { - // Use a median as a pivot. - p1 := ps[left] - p2 := ps[(left+right)/2] - p3 := ps[right] - if p1 > p2 { - p1, p2 = p2, p1 - } - if p2 > p3 { - p2, p3 = p3, p2 - if p1 > p2 { - p1, p2 = p2, p1 - } - } - pivot = p2 - } - i := left - j := right - for i <= j { - for ps[i] < pivot { - i++ - } - for ps[j] > pivot { - j-- - } - if i <= j { - ps[i], ps[j] = ps[j], ps[i] - i++ - j-- - } - } - sortSymbolPositions(ps, left, j) - sortSymbolPositions(ps, i, right) -} diff --git a/compiler/syntax_error.go b/compiler/syntax_error.go deleted file mode 100644 index 3925259..0000000 --- a/compiler/syntax_error.go +++ /dev/null @@ -1,45 +0,0 @@ -package compiler - -import "fmt" - -type SyntaxError struct { - Message string -} - -func newSyntaxError(msg string) *SyntaxError { - return &SyntaxError{ - Message: msg, - } -} - -func (e *SyntaxError) Error() string { - return fmt.Sprintf("syntax error: %s", e.Message) -} - -var ( - // lexical errors - synErrIncompletedEscSeq = newSyntaxError("incompleted escape sequence; unexpected EOF following \\") - synErrInvalidEscSeq = newSyntaxError("invalid escape sequence") - synErrInvalidCodePoint = newSyntaxError("code points must consist of just 4 or 6 hex digits") - synErrCharPropInvalidSymbol = newSyntaxError("invalid character property symbol") - SynErrFragmentInvalidSymbol = newSyntaxError("invalid fragment symbol") - - // syntax errors - synErrUnexpectedToken = newSyntaxError("unexpected token") - synErrNullPattern = newSyntaxError("a pattern must be a non-empty byte sequence") - synErrAltLackOfOperand = newSyntaxError("an alternation expression must have operands") - synErrRepNoTarget = newSyntaxError("a repeat expression must have an operand") - synErrGroupNoElem = newSyntaxError("a grouping expression must include at least one character") - synErrGroupUnclosed = newSyntaxError("unclosed grouping expression") - synErrGroupNoInitiator = newSyntaxError(") needs preceding (") - synErrGroupInvalidForm = newSyntaxError("invalid grouping expression") - synErrBExpNoElem = newSyntaxError("a bracket expression must include at least one character") - synErrBExpUnclosed = newSyntaxError("unclosed bracket expression") - synErrBExpInvalidForm = newSyntaxError("invalid bracket expression") - synErrRangeInvalidOrder = newSyntaxError("a range expression with invalid order") - synErrCPExpInvalidForm = newSyntaxError("invalid code point expression") - synErrCPExpOutOfRange = newSyntaxError("a code point must be between U+0000 to U+10FFFF") - synErrCharPropExpInvalidForm = newSyntaxError("invalid character property expression") - synErrCharPropUnsupported = newSyntaxError("unsupported character property") - synErrFragmentExpInvalidForm = newSyntaxError("invalid fragment expression") -) diff --git a/compiler/test_util_test.go b/compiler/test_util_test.go deleted file mode 100644 index 72e150b..0000000 --- a/compiler/test_util_test.go +++ /dev/null @@ -1,21 +0,0 @@ -package compiler - -import "github.com/nihei9/maleeni/spec" - -func newRangeSymbolNodeWithPos(from, to byte, pos symbolPosition) *symbolNode { - n := newRangeSymbolNode(from, to) - n.pos = pos - return n -} - -func newSymbolNodeWithPos(v byte, pos symbolPosition) *symbolNode { - n := newSymbolNode(v) - n.pos = pos - return n -} - -func newEndMarkerNodeWithPos(id int, pos symbolPosition) *endMarkerNode { - n := newEndMarkerNode(spec.LexModeKindID(id)) - n.pos = pos - return n -} diff --git a/driver/lexer_test.go b/driver/lexer_test.go index 275e992..e0b82b1 100644 --- a/driver/lexer_test.go +++ b/driver/lexer_test.go @@ -776,8 +776,11 @@ func TestLexer_Next(t *testing.T) { for i, tt := range test { for compLv := compiler.CompressionLevelMin; compLv <= compiler.CompressionLevelMax; compLv++ { t.Run(fmt.Sprintf("#%v-%v", i, compLv), func(t *testing.T) { - clspec, err := compiler.Compile(tt.lspec, compiler.CompressionLevel(compLv)) + clspec, err, cerrs := compiler.Compile(tt.lspec, compiler.CompressionLevel(compLv)) if err != nil { + for _, cerr := range cerrs { + t.Logf("%#v", cerr) + } t.Fatalf("unexpected error: %v", err) } opts := []LexerOption{} @@ -821,7 +824,7 @@ func TestLexer_Next_WithPosition(t *testing.T) { }, } - clspec, err := compiler.Compile(lspec, compiler.CompressionLevel(compiler.CompressionLevelMax)) + clspec, err, _ := compiler.Compile(lspec, compiler.CompressionLevel(compiler.CompressionLevelMax)) if err != nil { t.Fatalf("unexpected error: %v", err) } diff --git a/utf8/utf8.go b/utf8/utf8.go index 79ca1de..16b3746 100644 --- a/utf8/utf8.go +++ b/utf8/utf8.go @@ -83,15 +83,6 @@ var ( cBlk4Last = cBlks4[len(cBlks4)-1] ) -func AllCharBlocks() []*CharBlock { - var blks []*CharBlock - blks = append(blks, cBlks1...) - blks = append(blks, cBlks2...) - blks = append(blks, cBlks3...) - blks = append(blks, cBlks4...) - return blks -} - func GenCharBlocks(from, to []byte) ([]*CharBlock, error) { switch len(from) { case 1: |