aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--cmd/maleeni/compile.go23
-rw-r--r--compiler/ast.go469
-rw-r--r--compiler/ast_test.go198
-rw-r--r--compiler/byte.go281
-rw-r--r--compiler/byte_test.go263
-rw-r--r--compiler/compiler.go183
-rw-r--r--compiler/compiler_test.go4
-rw-r--r--compiler/dfa.go139
-rw-r--r--compiler/dfa_test.go117
-rw-r--r--compiler/lexer.go578
-rw-r--r--compiler/lexer_test.go514
-rw-r--r--compiler/parser.go862
-rw-r--r--compiler/parser_test.go1422
-rw-r--r--compiler/symbol_position.go198
-rw-r--r--compiler/syntax_error.go45
-rw-r--r--compiler/test_util_test.go21
-rw-r--r--driver/lexer_test.go7
-rw-r--r--utf8/utf8.go9
18 files changed, 181 insertions, 5152 deletions
diff --git a/cmd/maleeni/compile.go b/cmd/maleeni/compile.go
index c7a8b59..21e5058 100644
--- a/cmd/maleeni/compile.go
+++ b/cmd/maleeni/compile.go
@@ -3,8 +3,10 @@ package main
import (
"encoding/json"
"fmt"
+ "io"
"io/ioutil"
"os"
+ "strings"
"github.com/nihei9/maleeni/compiler"
"github.com/nihei9/maleeni/spec"
@@ -38,8 +40,17 @@ func runCompile(cmd *cobra.Command, args []string) (retErr error) {
return fmt.Errorf("Cannot read a lexical specification: %w", err)
}
- clspec, err := compiler.Compile(lspec, compiler.CompressionLevel(*compileFlags.compLv))
+ clspec, err, cerrs := compiler.Compile(lspec, compiler.CompressionLevel(*compileFlags.compLv))
if err != nil {
+ if len(cerrs) > 0 {
+ var b strings.Builder
+ writeCompileError(&b, cerrs[0])
+ for _, cerr := range cerrs[1:] {
+ fmt.Fprintf(&b, "\n")
+ writeCompileError(&b, cerr)
+ }
+ return fmt.Errorf(b.String())
+ }
return err
}
err = writeCompiledLexSpec(clspec, *compileFlags.output)
@@ -50,6 +61,16 @@ func runCompile(cmd *cobra.Command, args []string) (retErr error) {
return nil
}
+func writeCompileError(w io.Writer, cerr *compiler.CompileError) {
+ if cerr.Fragment {
+ fmt.Fprintf(w, "fragment ")
+ }
+ fmt.Fprintf(w, "%v: %v", cerr.Kind, cerr.Cause)
+ if cerr.Detail != "" {
+ fmt.Fprintf(w, ": %v", cerr.Detail)
+ }
+}
+
func readLexSpec(path string) (*spec.LexSpec, error) {
r := os.Stdin
if path != "" {
diff --git a/compiler/ast.go b/compiler/ast.go
deleted file mode 100644
index 046662e..0000000
--- a/compiler/ast.go
+++ /dev/null
@@ -1,469 +0,0 @@
-package compiler
-
-import (
- "fmt"
- "io"
-
- "github.com/nihei9/maleeni/spec"
-)
-
-type astNode interface {
- fmt.Stringer
- children() (astNode, astNode)
- nullable() bool
- first() *symbolPositionSet
- last() *symbolPositionSet
-}
-
-var (
- _ astNode = &symbolNode{}
- _ astNode = &endMarkerNode{}
- _ astNode = &concatNode{}
- _ astNode = &altNode{}
- _ astNode = &optionNode{}
- _ astNode = &fragmentNode{}
-)
-
-type symbolNode struct {
- byteRange
- pos symbolPosition
- firstMemo *symbolPositionSet
- lastMemo *symbolPositionSet
-}
-
-func newSymbolNode(value byte) *symbolNode {
- return &symbolNode{
- byteRange: byteRange{
- from: value,
- to: value,
- },
- pos: symbolPositionNil,
- }
-}
-
-func newRangeSymbolNode(from, to byte) *symbolNode {
- return &symbolNode{
- byteRange: byteRange{
- from: from,
- to: to,
- },
- pos: symbolPositionNil,
- }
-}
-
-func (n *symbolNode) String() string {
- return fmt.Sprintf("{type: symbol, value: %v - %v, pos: %v}", n.from, n.to, n.pos)
-}
-
-func (n *symbolNode) children() (astNode, astNode) {
- return nil, nil
-}
-
-func (n *symbolNode) nullable() bool {
- return false
-}
-
-func (n *symbolNode) first() *symbolPositionSet {
- if n.firstMemo == nil {
- n.firstMemo = newSymbolPositionSet()
- n.firstMemo.add(n.pos)
- }
- return n.firstMemo
-}
-
-func (n *symbolNode) last() *symbolPositionSet {
- if n.lastMemo == nil {
- n.lastMemo = newSymbolPositionSet()
- n.lastMemo.add(n.pos)
- }
- return n.lastMemo
-}
-
-type endMarkerNode struct {
- id spec.LexModeKindID
- pos symbolPosition
- firstMemo *symbolPositionSet
- lastMemo *symbolPositionSet
-}
-
-func newEndMarkerNode(id spec.LexModeKindID) *endMarkerNode {
- return &endMarkerNode{
- id: id,
- pos: symbolPositionNil,
- }
-}
-
-func (n *endMarkerNode) String() string {
- return fmt.Sprintf("{type: end, pos: %v}", n.pos)
-}
-
-func (n *endMarkerNode) children() (astNode, astNode) {
- return nil, nil
-}
-
-func (n *endMarkerNode) nullable() bool {
- return false
-}
-
-func (n *endMarkerNode) first() *symbolPositionSet {
- if n.firstMemo == nil {
- n.firstMemo = newSymbolPositionSet()
- n.firstMemo.add(n.pos)
- }
- return n.firstMemo
-}
-
-func (n *endMarkerNode) last() *symbolPositionSet {
- if n.lastMemo == nil {
- n.lastMemo = newSymbolPositionSet()
- n.lastMemo.add(n.pos)
- }
- return n.lastMemo
-}
-
-type concatNode struct {
- left astNode
- right astNode
- firstMemo *symbolPositionSet
- lastMemo *symbolPositionSet
-}
-
-func newConcatNode(left, right astNode) *concatNode {
- return &concatNode{
- left: left,
- right: right,
- }
-}
-
-func (n *concatNode) String() string {
- return fmt.Sprintf("{type: concat}")
-}
-
-func (n *concatNode) children() (astNode, astNode) {
- return n.left, n.right
-}
-
-func (n *concatNode) nullable() bool {
- return n.left.nullable() && n.right.nullable()
-}
-
-func (n *concatNode) first() *symbolPositionSet {
- if n.firstMemo == nil {
- n.firstMemo = newSymbolPositionSet()
- n.firstMemo.merge(n.left.first())
- if n.left.nullable() {
- n.firstMemo.merge(n.right.first())
- }
- n.firstMemo.sortAndRemoveDuplicates()
- }
- return n.firstMemo
-}
-
-func (n *concatNode) last() *symbolPositionSet {
- if n.lastMemo == nil {
- n.lastMemo = newSymbolPositionSet()
- n.lastMemo.merge(n.right.last())
- if n.right.nullable() {
- n.lastMemo.merge(n.left.last())
- }
- n.lastMemo.sortAndRemoveDuplicates()
- }
- return n.lastMemo
-}
-
-type altNode struct {
- left astNode
- right astNode
- firstMemo *symbolPositionSet
- lastMemo *symbolPositionSet
-}
-
-func newAltNode(left, right astNode) *altNode {
- return &altNode{
- left: left,
- right: right,
- }
-}
-
-func (n *altNode) String() string {
- return fmt.Sprintf("{type: alt}")
-}
-
-func (n *altNode) children() (astNode, astNode) {
- return n.left, n.right
-}
-
-func (n *altNode) nullable() bool {
- return n.left.nullable() || n.right.nullable()
-}
-
-func (n *altNode) first() *symbolPositionSet {
- if n.firstMemo == nil {
- n.firstMemo = newSymbolPositionSet()
- n.firstMemo.merge(n.left.first())
- n.firstMemo.merge(n.right.first())
- n.firstMemo.sortAndRemoveDuplicates()
- }
- return n.firstMemo
-}
-
-func (n *altNode) last() *symbolPositionSet {
- if n.lastMemo == nil {
- n.lastMemo = newSymbolPositionSet()
- n.lastMemo.merge(n.left.last())
- n.lastMemo.merge(n.right.last())
- n.lastMemo.sortAndRemoveDuplicates()
- }
- return n.lastMemo
-}
-
-type repeatNode struct {
- left astNode
- firstMemo *symbolPositionSet
- lastMemo *symbolPositionSet
-}
-
-func newRepeatNode(left astNode) *repeatNode {
- return &repeatNode{
- left: left,
- }
-}
-
-func (n *repeatNode) String() string {
- return fmt.Sprintf("{type: repeat}")
-}
-
-func (n *repeatNode) children() (astNode, astNode) {
- return n.left, nil
-}
-
-func (n *repeatNode) nullable() bool {
- return true
-}
-
-func (n *repeatNode) first() *symbolPositionSet {
- if n.firstMemo == nil {
- n.firstMemo = newSymbolPositionSet()
- n.firstMemo.merge(n.left.first())
- n.firstMemo.sortAndRemoveDuplicates()
- }
- return n.firstMemo
-}
-
-func (n *repeatNode) last() *symbolPositionSet {
- if n.lastMemo == nil {
- n.lastMemo = newSymbolPositionSet()
- n.lastMemo.merge(n.left.last())
- n.lastMemo.sortAndRemoveDuplicates()
- }
- return n.lastMemo
-}
-
-func newRepeatOneOrMoreNode(left astNode) *concatNode {
- return newConcatNode(
- left,
- &repeatNode{
- left: copyAST(left),
- })
-}
-
-type optionNode struct {
- left astNode
- firstMemo *symbolPositionSet
- lastMemo *symbolPositionSet
-}
-
-func newOptionNode(left astNode) *optionNode {
- return &optionNode{
- left: left,
- }
-}
-
-func (n *optionNode) String() string {
- return fmt.Sprintf("{type: option}")
-}
-
-func (n *optionNode) children() (astNode, astNode) {
- return n.left, nil
-}
-
-func (n *optionNode) nullable() bool {
- return true
-}
-
-func (n *optionNode) first() *symbolPositionSet {
- if n.firstMemo == nil {
- n.firstMemo = newSymbolPositionSet()
- n.firstMemo.merge(n.left.first())
- n.firstMemo.sortAndRemoveDuplicates()
- }
- return n.firstMemo
-}
-
-func (n *optionNode) last() *symbolPositionSet {
- if n.lastMemo == nil {
- n.lastMemo = newSymbolPositionSet()
- n.lastMemo.merge(n.left.last())
- n.lastMemo.sortAndRemoveDuplicates()
- }
- return n.lastMemo
-}
-
-type fragmentNode struct {
- symbol string
- left astNode
-}
-
-func newFragmentNode(symbol string, ast astNode) *fragmentNode {
- return &fragmentNode{
- symbol: symbol,
- left: ast,
- }
-}
-
-func (n *fragmentNode) String() string {
- return fmt.Sprintf("{type: fragment, symbol: %v}", n.symbol)
-}
-
-func (n *fragmentNode) children() (astNode, astNode) {
- return n.left, nil
-}
-
-func (n *fragmentNode) nullable() bool {
- return n.left.nullable()
-}
-
-func (n *fragmentNode) first() *symbolPositionSet {
- return n.left.first()
-}
-
-func (n *fragmentNode) last() *symbolPositionSet {
- return n.left.last()
-}
-
-func copyAST(src astNode) astNode {
- switch n := src.(type) {
- case *symbolNode:
- return newRangeSymbolNode(n.from, n.to)
- case *endMarkerNode:
- return newEndMarkerNode(n.id)
- case *concatNode:
- return newConcatNode(copyAST(n.left), copyAST(n.right))
- case *altNode:
- return newAltNode(copyAST(n.left), copyAST(n.right))
- case *repeatNode:
- return newRepeatNode(copyAST(n.left))
- case *optionNode:
- return newOptionNode(copyAST(n.left))
- case *fragmentNode:
- if n.left == nil {
- return newFragmentNode(n.symbol, nil)
- }
- return newFragmentNode(n.symbol, copyAST(n.left))
- }
- panic(fmt.Errorf("copyAST cannot handle %T type; AST: %v", src, src))
-}
-
-type followTable map[symbolPosition]*symbolPositionSet
-
-func genFollowTable(root astNode) followTable {
- follow := followTable{}
- calcFollow(follow, root)
- return follow
-}
-
-func calcFollow(follow followTable, ast astNode) {
- if ast == nil {
- return
- }
- left, right := ast.children()
- calcFollow(follow, left)
- calcFollow(follow, right)
- switch n := ast.(type) {
- case *concatNode:
- l, r := n.children()
- for _, p := range l.last().set() {
- if _, ok := follow[p]; !ok {
- follow[p] = newSymbolPositionSet()
- }
- follow[p].merge(r.first())
- }
- case *repeatNode:
- for _, p := range n.last().set() {
- if _, ok := follow[p]; !ok {
- follow[p] = newSymbolPositionSet()
- }
- follow[p].merge(n.first())
- }
- }
-}
-
-func positionSymbols(node astNode, n uint16) (uint16, error) {
- if node == nil {
- return n, nil
- }
-
- l, r := node.children()
- p := n
- p, err := positionSymbols(l, p)
- if err != nil {
- return p, err
- }
- p, err = positionSymbols(r, p)
- if err != nil {
- return p, err
- }
- switch n := node.(type) {
- case *symbolNode:
- n.pos, err = newSymbolPosition(p, false)
- if err != nil {
- return p, err
- }
- p++
- case *endMarkerNode:
- n.pos, err = newSymbolPosition(p, true)
- if err != nil {
- return p, err
- }
- p++
- }
- node.first()
- node.last()
- return p, nil
-}
-
-func printAST(w io.Writer, ast astNode, ruledLine string, childRuledLinePrefix string, withAttrs bool) {
- if ast == nil {
- return
- }
- fmt.Fprintf(w, ruledLine)
- fmt.Fprintf(w, "node: %v", ast)
- if withAttrs {
- fmt.Fprintf(w, ", nullable: %v, first: %v, last: %v", ast.nullable(), ast.first(), ast.last())
- }
- fmt.Fprintf(w, "\n")
- left, right := ast.children()
- children := []astNode{}
- if left != nil {
- children = append(children, left)
- }
- if right != nil {
- children = append(children, right)
- }
- num := len(children)
- for i, child := range children {
- line := "└─ "
- if num > 1 {
- if i == 0 {
- line = "├─ "
- } else if i < num-1 {
- line = "│ "
- }
- }
- prefix := "│ "
- if i >= num-1 {
- prefix = " "
- }
- printAST(w, child, childRuledLinePrefix+line, childRuledLinePrefix+prefix, withAttrs)
- }
-}
diff --git a/compiler/ast_test.go b/compiler/ast_test.go
deleted file mode 100644
index 2a77dfc..0000000
--- a/compiler/ast_test.go
+++ /dev/null
@@ -1,198 +0,0 @@
-package compiler
-
-import (
- "fmt"
- "testing"
-)
-
-func TestNewSymbolPosition(t *testing.T) {
- tests := []struct {
- n uint16
- endMark bool
- err bool
- }{
- {
- n: 0,
- endMark: false,
- err: true,
- },
- {
- n: 0,
- endMark: true,
- err: true,
- },
- {
- n: symbolPositionMin - 1,
- endMark: false,
- err: true,
- },
- {
- n: symbolPositionMin - 1,
- endMark: true,
- err: true,
- },
- {
- n: symbolPositionMin,
- endMark: false,
- },
- {
- n: symbolPositionMin,
- endMark: true,
- },
- {
- n: symbolPositionMax,
- endMark: false,
- },
- {
- n: symbolPositionMax,
- endMark: true,
- },
- {
- n: symbolPositionMax + 1,
- endMark: false,
- err: true,
- },
- {
- n: symbolPositionMax + 1,
- endMark: true,
- err: true,
- },
- }
- for i, tt := range tests {
- t.Run(fmt.Sprintf("#%v n: %v, endMark: %v", i, tt.n, tt.endMark), func(t *testing.T) {
- pos, err := newSymbolPosition(tt.n, tt.endMark)
- if tt.err {
- if err == nil {
- t.Fatal("err is nil")
- }
- return
- }
- if err != nil {
- t.Fatal(err)
- }
- n, endMark := pos.describe()
- if n != tt.n || endMark != tt.endMark {
- t.Errorf("unexpected symbol position; want: n: %v, endMark: %v, got: n: %v, endMark: %v", tt.n, tt.endMark, n, endMark)
- }
- })
- }
-}
-
-func TestASTNode(t *testing.T) {
- tests := []struct {
- root astNode
- nullable bool
- first *symbolPositionSet
- last *symbolPositionSet
- }{
- {
- root: newSymbolNodeWithPos(0, 1),
- nullable: false,
- first: newSymbolPositionSet().add(1),
- last: newSymbolPositionSet().add(1),
- },
- {
- root: newEndMarkerNodeWithPos(1, 1),
- nullable: false,
- first: newSymbolPositionSet().add(1),
- last: newSymbolPositionSet().add(1),
- },
- {
- root: newConcatNode(
- newSymbolNodeWithPos(0, 1),
- newSymbolNodeWithPos(0, 2),
- ),
- nullable: false,
- first: newSymbolPositionSet().add(1),
- last: newSymbolPositionSet().add(2),
- },
- {
- root: newConcatNode(
- newRepeatNode(newSymbolNodeWithPos(0, 1)),
- newSymbolNodeWithPos(0, 2),
- ),
- nullable: false,
- first: newSymbolPositionSet().add(1).add(2),
- last: newSymbolPositionSet().add(2),
- },
- {
- root: newConcatNode(
- newSymbolNodeWithPos(0, 1),
- newRepeatNode(newSymbolNodeWithPos(0, 2)),
- ),
- nullable: false,
- first: newSymbolPositionSet().add(1),
- last: newSymbolPositionSet().add(1).add(2),
- },
- {
- root: newConcatNode(
- newRepeatNode(newSymbolNodeWithPos(0, 1)),
- newRepeatNode(newSymbolNodeWithPos(0, 2)),
- ),
- nullable: true,
- first: newSymbolPositionSet().add(1).add(2),
- last: newSymbolPositionSet().add(1).add(2),
- },
- {
- root: newAltNode(
- newSymbolNodeWithPos(0, 1),
- newSymbolNodeWithPos(0, 2),
- ),
- nullable: false,
- first: newSymbolPositionSet().add(1).add(2),
- last: newSymbolPositionSet().add(1).add(2),
- },
- {
- root: newAltNode(
- newRepeatNode(newSymbolNodeWithPos(0, 1)),
- newSymbolNodeWithPos(0, 2),
- ),
- nullable: true,
- first: newSymbolPositionSet().add(1).add(2),
- last: newSymbolPositionSet().add(1).add(2),
- },
- {
- root: newAltNode(
- newSymbolNodeWithPos(0, 1),
- newRepeatNode(newSymbolNodeWithPos(0, 2)),
- ),
- nullable: true,
- first: newSymbolPositionSet().add(1).add(2),
- last: newSymbolPositionSet().add(1).add(2),
- },
- {
- root: newAltNode(
- newRepeatNode(newSymbolNodeWithPos(0, 1)),
- newRepeatNode(newSymbolNodeWithPos(0, 2)),
- ),
- nullable: true,
- first: newSymbolPositionSet().add(1).add(2),
- last: newSymbolPositionSet().add(1).add(2),
- },
- {
- root: newRepeatNode(newSymbolNodeWithPos(0, 1)),
- nullable: true,
- first: newSymbolPositionSet().add(1),
- last: newSymbolPositionSet().add(1),
- },
- {
- root: newOptionNode(newSymbolNodeWithPos(0, 1)),
- nullable: true,
- first: newSymbolPositionSet().add(1),
- last: newSymbolPositionSet().add(1),
- },
- }
- for i, tt := range tests {
- t.Run(fmt.Sprintf("#%v", i), func(t *testing.T) {
- if tt.root.nullable() != tt.nullable {
- t.Errorf("unexpected nullable attribute; want: %v, got: %v", tt.nullable, tt.root.nullable())
- }
- if tt.first.hash() != tt.root.first().hash() {
- t.Errorf("unexpected first positions attribute; want: %v, got: %v", tt.first, tt.root.first())
- }
- if tt.last.hash() != tt.root.last().hash() {
- t.Errorf("unexpected last positions attribute; want: %v, got: %v", tt.last, tt.root.last())
- }
- })
- }
-}
diff --git a/compiler/byte.go b/compiler/byte.go
deleted file mode 100644
index 464887e..0000000
--- a/compiler/byte.go
+++ /dev/null
@@ -1,281 +0,0 @@
-package compiler
-
-import "fmt"
-
-type byteRange struct {
- from byte
- to byte
-}
-
-func newByteRange(from, to byte) byteRange {
- return byteRange{
- from: from,
- to: to,
- }
-}
-
-func (r byteRange) String() string {
- return fmt.Sprintf("%X-%X (%v-%v)", r.from, r.to, r.from, r.to)
-}
-
-func newByteRangeSequence(from, to []byte) ([]byteRange, error) {
- if len(from) != len(to) {
- return nil, fmt.Errorf("length of `from` and `to` are mismatched; from: %+v, to: %+v", from, to)
- }
- seq := []byteRange{}
- for i := 0; i < len(from); i++ {
- seq = append(seq, newByteRange(from[i], to[i]))
- }
- return seq, nil
-}
-
-func excludeByteRangeSequence(b1, b2 []byteRange) [][]byteRange {
- if len(b1) != len(b2) {
- // no overlapping
- return [][]byteRange{b2}
- }
- switch len(b1) {
- case 1:
- return exclude1Byte(b1, b2)
- case 2:
- return exclude2Byte(b1, b2)
- case 3:
- return exclude3Byte(b1, b2)
- case 4:
- return exclude4Byte(b1, b2)
- }
- panic(fmt.Errorf("excludeByteRangeSequence can only handle sequences up to 4 bytes in size; b1: %+v, b2: %+v", b1, b2))
-}
-
-func exclude1Byte(b1, b2 []byteRange) [][]byteRange {
- r01, r02, overlapping0 := excludeByteRange(&b1[0], &b2[0])
- if !overlapping0 {
- // no overlapping
- return [][]byteRange{b2}
- }
- result := [][]byteRange{}
- {
- if r01 != nil {
- result = append(result, []byteRange{
- newByteRange(r01.from, r01.to),
- })
- }
- if r02 != nil {
- result = append(result, []byteRange{
- newByteRange(r02.from, r02.to),
- })
- }
- }
- return result
-}
-
-func exclude2Byte(b1, b2 []byteRange) [][]byteRange {
- r01, r02, overlapping0 := excludeByteRange(&b1[0], &b2[0])
- r11, r12, overlapping1 := excludeByteRange(&b1[1], &b2[1])
- if !overlapping0 || !overlapping1 {
- // no overlapping
- return [][]byteRange{b2}
- }
- result := [][]byteRange{}
- {
- if r11 != nil {
- result = append(result, []byteRange{
- newByteRange(b1[0].from, b1[0].to),
- newByteRange(r11.from, r11.to),
- })
- }
- if r12 != nil {
- result = append(result, []byteRange{
- newByteRange(b1[0].from, b1[0].to),
- newByteRange(r12.from, r12.to),
- })
- }
- if r01 != nil {
- result = append(result, []byteRange{
- newByteRange(r01.from, r01.to),
- newByteRange(b2[1].from, b2[1].to),
- })
- }
- if r02 != nil {
- result = append(result, []byteRange{
- newByteRange(r02.from, r02.to),
- newByteRange(b2[1].from, b2[1].to),
- })
- }
- }
- return result
-}
-
-func exclude3Byte(b1, b2 []byteRange) [][]byteRange {
- r01, r02, overlapping0 := excludeByteRange(&b1[0], &b2[0])
- r11, r12, overlapping1 := excludeByteRange(&b1[1], &b2[1])
- r21, r22, overlapping2 := excludeByteRange(&b1[2], &b2[2])
- if !overlapping0 || !overlapping1 || !overlapping2 {
- // no overlapping
- return [][]byteRange{b2}
- }
- result := [][]byteRange{}
- {
- if r21 != nil {
- result = append(result, []byteRange{
- newByteRange(b1[0].from, b1[0].to),
- newByteRange(b1[1].from, b1[1].to),
- newByteRange(r21.from, r21.to),
- })
- }
- if r22 != nil {
- result = append(result, []byteRange{
- newByteRange(b1[0].from, b1[0].to),
- newByteRange(b1[1].from, b1[1].to),
- newByteRange(r22.from, r22.to),
- })
- }
- if r11 != nil {
- result = append(result, []byteRange{
- newByteRange(b1[0].from, b1[0].to),
- newByteRange(r11.from, r11.to),
- newByteRange(b2[2].from, b2[2].to),
- })
- }
- if r12 != nil {
- result = append(result, []byteRange{
- newByteRange(b1[0].from, b1[0].to),
- newByteRange(r12.from, r12.to),
- newByteRange(b2[2].from, b2[2].to),
- })
- }
- if r01 != nil {
- result = append(result, []byteRange{
- newByteRange(r01.from, r01.to),
- newByteRange(b2[1].from, b2[1].to),
- newByteRange(b2[2].from, b2[2].to),
- })
- }
- if r02 != nil {
- result = append(result, []byteRange{
- newByteRange(r02.from, r02.to),
- newByteRange(b2[1].from, b2[1].to),
- newByteRange(b2[2].from, b2[2].to),
- })
- }
- }
- return result
-}
-
-func exclude4Byte(b1, b2 []byteRange) [][]byteRange {
- r01, r02, overlapping0 := excludeByteRange(&b1[0], &b2[0])
- r11, r12, overlapping1 := excludeByteRange(&b1[1], &b2[1])
- r21, r22, overlapping2 := excludeByteRange(&b1[2], &b2[2])
- r31, r32, overlapping3 := excludeByteRange(&b1[3], &b2[3])
- if !overlapping0 || !overlapping1 || !overlapping2 || !overlapping3 {
- // no overlapping
- return [][]byteRange{b2}
- }
- result := [][]byteRange{}
- {
- if r31 != nil {
- result = append(result, []byteRange{
- newByteRange(b1[0].from, b1[0].to),
- newByteRange(b1[1].from, b1[1].to),
- newByteRange(b1[2].from, b1[2].to),
- newByteRange(r31.from, r31.to),
- })
- }
- if r32 != nil {
- result = append(result, []byteRange{
- newByteRange(b1[0].from, b1[0].to),
- newByteRange(b1[1].from, b1[1].to),
- newByteRange(b1[2].from, b1[2].to),
- newByteRange(r32.from, r32.to),
- })
- }
- if r21 != nil {
- result = append(result, []byteRange{
- newByteRange(b1[0].from, b1[0].to),
- newByteRange(b1[1].from, b1[1].to),
- newByteRange(r21.from, r21.to),
- newByteRange(b2[3].from, b2[3].to),
- })
- }
- if r22 != nil {
- result = append(result, []byteRange{
- newByteRange(b1[0].from, b1[0].to),
- newByteRange(b1[1].from, b1[1].to),
- newByteRange(r22.from, r22.to),
- newByteRange(b2[3].from, b2[3].to),
- })
- }
- if r11 != nil {
- result = append(result, []byteRange{
- newByteRange(b1[0].from, b1[0].to),
- newByteRange(r11.from, r11.to),
- newByteRange(b2[2].from, b2[2].to),
- newByteRange(b2[3].from, b2[3].to),
- })
- }
- if r12 != nil {
- result = append(result, []byteRange{
- newByteRange(b1[0].from, b1[0].to),
- newByteRange(r12.from, r12.to),
- newByteRange(b2[2].from, b2[2].to),
- newByteRange(b2[3].from, b2[3].to),
- })
- }
- if r01 != nil {
- result = append(result, []byteRange{
- newByteRange(r01.from, r01.to),
- newByteRange(b2[1].from, b2[1].to),
- newByteRange(b2[2].from, b2[2].to),
- newByteRange(b2[3].from, b2[3].to),
- })
- }
- if r02 != nil {
- result = append(result, []byteRange{
- newByteRange(r02.from, r02.to),
- newByteRange(b2[1].from, b2[1].to),
- newByteRange(b2[2].from, b2[2].to),
- newByteRange(b2[3].from, b2[3].to),
- })
- }
- }
- return result
-}
-
-// excludeByteRange excludes `r1` from `r2`.
-func excludeByteRange(r1, r2 *byteRange) (*byteRange, *byteRange, bool) {
- if r1.from <= r2.from {
- if r1.to < r2.from {
- // no overlapping
- return nil, nil, false
- }
- if r1.to < r2.to {
- // The beginning of `r2` overlaps with `r1`.
- return &byteRange{
- from: r1.to + 1,
- to: r2.to,
- }, nil, true
- }
- // `symbol` overlaps with `base` entirely.
- return nil, nil, true
- }
- if r1.from > r2.to {
- // no overlapping
- return nil, nil, false
- }
- if r1.to >= r2.to {
- // The end of `r2` overlaps with `r1`.
- return &byteRange{
- from: r2.from,
- to: r1.from - 1,
- }, nil, true
- }
- // `r2` overlaps with `r1` entirely.
- return &byteRange{
- from: r2.from,
- to: r1.from - 1,
- },
- &byteRange{
- from: r1.to + 1,
- to: r2.to,
- }, true
-}
diff --git a/compiler/byte_test.go b/compiler/byte_test.go
deleted file mode 100644
index c298501..0000000
--- a/compiler/byte_test.go
+++ /dev/null
@@ -1,263 +0,0 @@
-package compiler
-
-import "testing"
-
-func TestExcludeByteRangeSequence(t *testing.T) {
- tests := []struct {
- b1 []byteRange
- b2 []byteRange
- excluded [][]byteRange
- }{
- // 1 Byte
- {
- b1: newByteRangeSeq([]byte{0x00}, []byte{0x00}),
- b2: newByteRangeSeq([]byte{0x00}, []byte{0xff}),
- excluded: [][]byteRange{
- newByteRangeSeq([]byte{0x01}, []byte{0xff}),
- },
- },
- {
- b1: newByteRangeSeq([]byte{0xff}, []byte{0xff}),
- b2: newByteRangeSeq([]byte{0x00}, []byte{0xff}),
- excluded: [][]byteRange{
- newByteRangeSeq([]byte{0x00}, []byte{0xfe}),
- },
- },
- {
- b1: newByteRangeSeq([]byte{0xc0}, []byte{0xcf}),
- b2: newByteRangeSeq([]byte{0x00}, []byte{0xff}),
- excluded: [][]byteRange{
- newByteRangeSeq([]byte{0x00}, []byte{0xbf}),
- newByteRangeSeq([]byte{0xd0}, []byte{0xff}),
- },
- },
- {
- b1: newByteRangeSeq([]byte{0x00}, []byte{0xff}),
- b2: newByteRangeSeq([]byte{0xc0}, []byte{0xcf}),
- excluded: nil,
- },
- {
- b1: newByteRangeSeq([]byte{0x00}, []byte{0xff}),
- b2: newByteRangeSeq([]byte{0x00}, []byte{0xff}),
- excluded: nil,
- },
-
- // 2 Byte
- {
- b1: newByteRangeSeq([]byte{0x00, 0x00}, []byte{0x00, 0x00}),
- b2: newByteRangeSeq([]byte{0x00, 0x00}, []byte{0xff, 0xff}),
- excluded: [][]byteRange{
- newByteRangeSeq([]byte{0x00, 0x01}, []byte{0x00, 0xff}),
- newByteRangeSeq([]byte{0x01, 0x00}, []byte{0xff, 0xff}),
- },
- },
- {
- b1: newByteRangeSeq([]byte{0xff, 0xff}, []byte{0xff, 0xff}),
- b2: newByteRangeSeq([]byte{0x00, 0x00}, []byte{0xff, 0xff}),
- excluded: [][]byteRange{
- newByteRangeSeq([]byte{0x00, 0x00}, []byte{0xfe, 0xff}),
- newByteRangeSeq([]byte{0xff, 0x00}, []byte{0xff, 0xfe}),
- },
- },
- {
- b1: newByteRangeSeq([]byte{0xc0, 0xc0}, []byte{0xc0, 0xc0}),
- b2: newByteRangeSeq([]byte{0x00, 0x00}, []byte{0xff, 0xff}),
- excluded: [][]byteRange{
- newByteRangeSeq([]byte{0x00, 0x00}, []byte{0xbf, 0xff}),
- newByteRangeSeq([]byte{0xc0, 0x00}, []byte{0xc0, 0xbf}),
- newByteRangeSeq([]byte{0xc0, 0xc1}, []byte{0xc0, 0xff}),
- newByteRangeSeq([]byte{0xc1, 0x00}, []byte{0xff, 0xff}),
- },
- },
- {
- b1: newByteRangeSeq([]byte{0xc0, 0x00}, []byte{0xc0, 0xff}),
- b2: newByteRangeSeq([]byte{0x00, 0x00}, []byte{0xff, 0xff}),
- excluded: [][]byteRange{
- newByteRangeSeq([]byte{0x00, 0x00}, []byte{0xbf, 0xff}),
- newByteRangeSeq([]byte{0xc1, 0x00}, []byte{0xff, 0xff}),
- },
- },
- {
- b1: newByteRangeSeq([]byte{0x00, 0x00}, []byte{0xff, 0xff}),
- b2: newByteRangeSeq([]byte{0xc0, 0xc0}, []byte{0xc0, 0xc0}),
- excluded: nil,
- },
- {
- b1: newByteRangeSeq([]byte{0x00, 0x00}, []byte{0xff, 0xff}),
- b2: newByteRangeSeq([]byte{0x00, 0x00}, []byte{0xff, 0xff}),
- excluded: nil,
- },
-
- // 3 Byte
- {
- b1: newByteRangeSeq([]byte{0x00, 0x00, 0x00}, []byte{0x00, 0x00, 0x00}),
- b2: newByteRangeSeq([]byte{0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff}),
- excluded: [][]byteRange{
- newByteRangeSeq([]byte{0x00, 0x00, 0x01}, []byte{0x00, 0x00, 0xff}),
- newByteRangeSeq([]byte{0x00, 0x01, 0x00}, []byte{0x00, 0xff, 0xff}),
- newByteRangeSeq([]byte{0x01, 0x00, 0x00}, []byte{0xff, 0xff, 0xff}),
- },
- },
- {
- b1: newByteRangeSeq([]byte{0xff, 0xff, 0xff}, []byte{0xff, 0xff, 0xff}),
- b2: newByteRangeSeq([]byte{0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff}),
- excluded: [][]byteRange{
- newByteRangeSeq([]byte{0x00, 0x00, 0x00}, []byte{0xfe, 0xff, 0xff}),
- newByteRangeSeq([]byte{0xff, 0x00, 0x00}, []byte{0xff, 0xfe, 0xff}),
- newByteRangeSeq([]byte{0xff, 0xff, 0x00}, []byte{0xff, 0xff, 0xfe}),
- },
- },
- {
- b1: newByteRangeSeq([]byte{0xc0, 0xc0, 0xc0}, []byte{0xc0, 0xc0, 0xc0}),
- b2: newByteRangeSeq([]byte{0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff}),
- excluded: [][]byteRange{
- newByteRangeSeq([]byte{0x00, 0x00, 0x00}, []byte{0xbf, 0xff, 0xff}),
- newByteRangeSeq([]byte{0xc0, 0x00, 0x00}, []byte{0xc0, 0xbf, 0xff}),
- newByteRangeSeq([]byte{0xc0, 0xc0, 0x00}, []byte{0xc0, 0xc0, 0xbf}),
- newByteRangeSeq([]byte{0xc0, 0xc0, 0xc1}, []byte{0xc0, 0xc0, 0xff}),
- newByteRangeSeq([]byte{0xc0, 0xc1, 0x00}, []byte{0xc0, 0xff, 0xff}),
- newByteRangeSeq([]byte{0xc1, 0x00, 0x00}, []byte{0xff, 0xff, 0xff}),
- },
- },
- {
- b1: newByteRangeSeq([]byte{0xc0, 0xc0, 0x00}, []byte{0xc0, 0xc0, 0xff}),
- b2: newByteRangeSeq([]byte{0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff}),
- excluded: [][]byteRange{
- newByteRangeSeq([]byte{0x00, 0x00, 0x00}, []byte{0xbf, 0xff, 0xff}),
- newByteRangeSeq([]byte{0xc0, 0x00, 0x00}, []byte{0xc0, 0xbf, 0xff}),
- newByteRangeSeq([]byte{0xc0, 0xc1, 0x00}, []byte{0xc0, 0xff, 0xff}),
- newByteRangeSeq([]byte{0xc1, 0x00, 0x00}, []byte{0xff, 0xff, 0xff}),
- },
- },
- {
- b1: newByteRangeSeq([]byte{0xc0, 0x00, 0x00}, []byte{0xc0, 0xff, 0xff}),
- b2: newByteRangeSeq([]byte{0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff}),
- excluded: [][]byteRange{
- newByteRangeSeq([]byte{0x00, 0x00, 0x00}, []byte{0xbf, 0xff, 0xff}),
- newByteRangeSeq([]byte{0xc1, 0x00, 0x00}, []byte{0xff, 0xff, 0xff}),
- },
- },
- {
- b1: newByteRangeSeq([]byte{0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff}),
- b2: newByteRangeSeq([]byte{0xff, 0xff, 0xff}, []byte{0xff, 0xff, 0xff}),
- excluded: nil,
- },
- {
- b1: newByteRangeSeq([]byte{0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff}),
- b2: newByteRangeSeq([]byte{0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff}),
- excluded: nil,
- },
-
- // 4 Byte
- {
- b1: newByteRangeSeq([]byte{0x00, 0x00, 0x00, 0x00}, []byte{0x00, 0x00, 0x00, 0x00}),
- b2: newByteRangeSeq([]byte{0x00, 0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff, 0xff}),
- excluded: [][]byteRange{
- newByteRangeSeq([]byte{0x00, 0x00, 0x00, 0x01}, []byte{0x00, 0x00, 0x00, 0xff}),
- newByteRangeSeq([]byte{0x00, 0x00, 0x01, 0x00}, []byte{0x00, 0x00, 0xff, 0xff}),
- newByteRangeSeq([]byte{0x00, 0x01, 0x00, 0x00}, []byte{0x00, 0xff, 0xff, 0xff}),
- newByteRangeSeq([]byte{0x01, 0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff, 0xff}),
- },
- },
- {
- b1: newByteRangeSeq([]byte{0xff, 0xff, 0xff, 0xff}, []byte{0xff, 0xff, 0xff, 0xff}),
- b2: newByteRangeSeq([]byte{0x00, 0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff, 0xff}),
- excluded: [][]byteRange{
- newByteRangeSeq([]byte{0x00, 0x00, 0x00, 0x00}, []byte{0xfe, 0xff, 0xff, 0xff}),
- newByteRangeSeq([]byte{0xff, 0x00, 0x00, 0x00}, []byte{0xff, 0xfe, 0xff, 0xff}),
- newByteRangeSeq([]byte{0xff, 0xff, 0x00, 0x00}, []byte{0xff, 0xff, 0xfe, 0xff}),
- newByteRangeSeq([]byte{0xff, 0xff, 0xff, 0x00}, []byte{0xff, 0xff, 0xff, 0xfe}),
- },
- },
- {
- b1: newByteRangeSeq([]byte{0xc0, 0xc0, 0xc0, 0xc0}, []byte{0xc0, 0xc0, 0xc0, 0xc0}),
- b2: newByteRangeSeq([]byte{0x00, 0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff, 0xff}),
- excluded: [][]byteRange{
- newByteRangeSeq([]byte{0x00, 0x00, 0x00, 0x00}, []byte{0xbf, 0xff, 0xff, 0xff}),
- newByteRangeSeq([]byte{0xc0, 0x00, 0x00, 0x00}, []byte{0xc0, 0xbf, 0xff, 0xff}),
- newByteRangeSeq([]byte{0xc0, 0xc0, 0x00, 0x00}, []byte{0xc0, 0xc0, 0xbf, 0xff}),
- newByteRangeSeq([]byte{0xc0, 0xc0, 0xc0, 0x00}, []byte{0xc0, 0xc0, 0xc0, 0xbf}),
- newByteRangeSeq([]byte{0xc0, 0xc0, 0xc0, 0xc1}, []byte{0xc0, 0xc0, 0xc0, 0xff}),
- newByteRangeSeq([]byte{0xc0, 0xc0, 0xc1, 0x00}, []byte{0xc0, 0xc0, 0xff, 0xff}),
- newByteRangeSeq([]byte{0xc0, 0xc1, 0x00, 0x00}, []byte{0xc0, 0xff, 0xff, 0xff}),
- newByteRangeSeq([]byte{0xc1, 0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff, 0xff}),
- },
- },
- {
- b1: newByteRangeSeq([]byte{0xc0, 0xc0, 0xc0, 0x00}, []byte{0xc0, 0xc0, 0xc0, 0xff}),
- b2: newByteRangeSeq([]byte{0x00, 0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff, 0xff}),
- excluded: [][]byteRange{
- newByteRangeSeq([]byte{0x00, 0x00, 0x00, 0x00}, []byte{0xbf, 0xff, 0xff, 0xff}),
- newByteRangeSeq([]byte{0xc0, 0x00, 0x00, 0x00}, []byte{0xc0, 0xbf, 0xff, 0xff}),
- newByteRangeSeq([]byte{0xc0, 0xc0, 0x00, 0x00}, []byte{0xc0, 0xc0, 0xbf, 0xff}),
- newByteRangeSeq([]byte{0xc0, 0xc0, 0xc1, 0x00}, []byte{0xc0, 0xc0, 0xff, 0xff}),
- newByteRangeSeq([]byte{0xc0, 0xc1, 0x00, 0x00}, []byte{0xc0, 0xff, 0xff, 0xff}),
- newByteRangeSeq([]byte{0xc1, 0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff, 0xff}),
- },
- },
- {
- b1: newByteRangeSeq([]byte{0xc0, 0xc0, 0x00, 0x00}, []byte{0xc0, 0xc0, 0xff, 0xff}),
- b2: newByteRangeSeq([]byte{0x00, 0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff, 0xff}),
- excluded: [][]byteRange{
- newByteRangeSeq([]byte{0x00, 0x00, 0x00, 0x00}, []byte{0xbf, 0xff, 0xff, 0xff}),
- newByteRangeSeq([]byte{0xc0, 0x00, 0x00, 0x00}, []byte{0xc0, 0xbf, 0xff, 0xff}),
- newByteRangeSeq([]byte{0xc0, 0xc1, 0x00, 0x00}, []byte{0xc0, 0xff, 0xff, 0xff}),
- newByteRangeSeq([]byte{0xc1, 0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff, 0xff}),
- },
- },
- {
- b1: newByteRangeSeq([]byte{0xc0, 0x00, 0x00, 0x00}, []byte{0xc0, 0xff, 0xff, 0xff}),
- b2: newByteRangeSeq([]byte{0x00, 0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff, 0xff}),
- excluded: [][]byteRange{
- newByteRangeSeq([]byte{0x00, 0x00, 0x00, 0x00}, []byte{0xbf, 0xff, 0xff, 0xff}),
- newByteRangeSeq([]byte{0xc1, 0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff, 0xff}),
- },
- },
- {
- b1: newByteRangeSeq([]byte{0x00, 0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff, 0xff}),
- b2: newByteRangeSeq([]byte{0xc0, 0xc0, 0xc0, 0xc0}, []byte{0xc0, 0xc0, 0xc0, 0xc0}),
- excluded: nil,
- },
- {
- b1: newByteRangeSeq([]byte{0x00, 0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff, 0xff}),
- b2: newByteRangeSeq([]byte{0x00, 0x00, 0x00, 0x00}, []byte{0xff, 0xff, 0xff, 0xff}),
- excluded: nil,
- },
- }
- for _, tt := range tests {
- excluded := excludeByteRangeSequence(tt.b1, tt.b2)
- t.Logf("b1: %v, b2: %v", tt.b1, tt.b2)
- t.Logf("excluded: %+v", excluded)
- if len(excluded) != len(tt.excluded) {
- t.Errorf("unexpected results; expected: %+v, actual: %+v", tt.excluded, excluded)
- }
- for _, expectedSeq := range tt.excluded {
- found := false
- for _, actualSeq := range excluded {
- mismatched := false
- for i := 0; i < len(expectedSeq); i++ {
- if actualSeq[i] != expectedSeq[i] {
- mismatched = true
- break
- }
- }
- if mismatched {
- continue
- }
- found = true
- break
- }
- if !found {
- t.Errorf("an expected byte range sequence was not found: %+v", expectedSeq)
- }
- }
- }
-}
-
-func newByteRangeSeq(from, to []byte) []byteRange {
- seq, err := newByteRangeSequence(from, to)
- if err != nil {
- panic(err)
- }
- return seq
-}
diff --git a/compiler/compiler.go b/compiler/compiler.go
index 439822c..cfd5faf 100644
--- a/compiler/compiler.go
+++ b/compiler/compiler.go
@@ -1,8 +1,11 @@
package compiler
import (
+ "bytes"
"fmt"
+ "github.com/nihei9/maleeni/compiler/dfa"
+ psr "github.com/nihei9/maleeni/compiler/parser"
"github.com/nihei9/maleeni/compressor"
"github.com/nihei9/maleeni/spec"
)
@@ -23,17 +26,24 @@ type compilerConfig struct {
compLv int
}
-func Compile(lexspec *spec.LexSpec, opts ...CompilerOption) (*spec.CompiledLexSpec, error) {
+type CompileError struct {
+ Kind spec.LexKindName
+ Fragment bool
+ Cause error
+ Detail string
+}
+
+func Compile(lexspec *spec.LexSpec, opts ...CompilerOption) (*spec.CompiledLexSpec, error, []*CompileError) {
err := lexspec.Validate()
if err != nil {
- return nil, fmt.Errorf("invalid lexical specification:\n%w", err)
+ return nil, fmt.Errorf("invalid lexical specification:\n%w", err), nil
}
config := &compilerConfig{}
for _, opt := range opts {
err := opt(config)
if err != nil {
- return nil, err
+ return nil, err, nil
}
}
@@ -44,9 +54,9 @@ func Compile(lexspec *spec.LexSpec, opts ...CompilerOption) (*spec.CompiledLexSp
}
for i, es := range modeEntries[1:] {
modeName := modeNames[i+1]
- modeSpec, err := compile(es, modeName2ID, fragmetns, config)
+ modeSpec, err, cerrs := compile(es, modeName2ID, fragmetns, config)
if err != nil {
- return nil, fmt.Errorf("failed to compile in %v mode: %w", modeName, err)
+ return nil, fmt.Errorf("failed to compile in %v mode: %w", modeName, err), cerrs
}
modeSpecs = append(modeSpecs, modeSpec)
}
@@ -95,10 +105,10 @@ func Compile(lexspec *spec.LexSpec, opts ...CompilerOption) (*spec.CompiledLexSp
KindIDs: kindIDs,
CompressionLevel: config.compLv,
Specs: modeSpecs,
- }, nil
+ }, nil, nil
}
-func groupEntriesByLexMode(entries []*spec.LexEntry) ([][]*spec.LexEntry, []spec.LexModeName, map[spec.LexModeName]spec.LexModeID, map[string]*spec.LexEntry) {
+func groupEntriesByLexMode(entries []*spec.LexEntry) ([][]*spec.LexEntry, []spec.LexModeName, map[spec.LexModeName]spec.LexModeID, map[spec.LexKindName]*spec.LexEntry) {
modeNames := []spec.LexModeName{
spec.LexModeNameNil,
spec.LexModeNameDefault,
@@ -112,10 +122,10 @@ func groupEntriesByLexMode(entries []*spec.LexEntry) ([][]*spec.LexEntry, []spec
nil,
{},
}
- fragments := map[string]*spec.LexEntry{}
+ fragments := map[spec.LexKindName]*spec.LexEntry{}
for _, e := range entries {
if e.Fragment {
- fragments[e.Kind.String()] = e
+ fragments[e.Kind] = e
continue
}
ms := e.Modes
@@ -139,15 +149,24 @@ func groupEntriesByLexMode(entries []*spec.LexEntry) ([][]*spec.LexEntry, []spec
return modeEntries, modeNames, modeName2ID, fragments
}
-func compile(entries []*spec.LexEntry, modeName2ID map[spec.LexModeName]spec.LexModeID, fragments map[string]*spec.LexEntry, config *compilerConfig) (*spec.CompiledLexModeSpec, error) {
+func compile(
+ entries []*spec.LexEntry,
+ modeName2ID map[spec.LexModeName]spec.LexModeID,
+ fragments map[spec.LexKindName]*spec.LexEntry,
+ config *compilerConfig,
+) (*spec.CompiledLexModeSpec, error, []*CompileError) {
var kindNames []spec.LexKindName
+ kindIDToName := map[spec.LexModeKindID]spec.LexKindName{}
var patterns map[spec.LexModeKindID][]byte
{
kindNames = append(kindNames, spec.LexKindNameNil)
patterns = map[spec.LexModeKindID][]byte{}
for i, e := range entries {
+ kindID := spec.LexModeKindID(i + 1)
+
kindNames = append(kindNames, e.Kind)
- patterns[spec.LexModeKindID(i+1)] = []byte(e.Pattern)
+ kindIDToName[kindID] = e.Kind
+ patterns[kindID] = []byte(e.Pattern)
}
}
@@ -170,39 +189,141 @@ func compile(entries []*spec.LexEntry, modeName2ID map[spec.LexModeName]spec.Lex
pop = append(pop, popV)
}
- fragmentPatterns := map[string][]byte{}
+ fragmentPatterns := map[spec.LexKindName][]byte{}
for k, e := range fragments {
fragmentPatterns[k] = []byte(e.Pattern)
}
- var root astNode
- var symTab *symbolTable
+ fragmentCPTrees := make(map[spec.LexKindName]psr.CPTree, len(fragmentPatterns))
+ {
+ var cerrs []*CompileError
+ for kind, pat := range fragmentPatterns {
+ p := psr.NewParser(kind, bytes.NewReader(pat))
+ t, err := p.Parse()
+ if err != nil {
+ if err == psr.ParseErr {
+ detail, cause := p.Error()
+ cerrs = append(cerrs, &CompileError{
+ Kind: kind,
+ Fragment: true,
+ Cause: cause,
+ Detail: detail,
+ })
+ } else {
+ cerrs = append(cerrs, &CompileError{
+ Kind: kind,
+ Fragment: true,
+ Cause: err,
+ })
+ }
+ continue
+ }
+ fragmentCPTrees[kind] = t
+ }
+ if len(cerrs) > 0 {
+ return nil, fmt.Errorf("compile error"), cerrs
+ }
+
+ err := psr.CompleteFragments(fragmentCPTrees)
+ if err != nil {
+ if err == psr.ParseErr {
+ for _, frag := range fragmentCPTrees {
+ kind, frags, err := frag.Describe()
+ if err != nil {
+ return nil, err, nil
+ }
+
+ cerrs = append(cerrs, &CompileError{
+ Kind: kind,
+ Fragment: true,
+ Cause: fmt.Errorf("fragment contains undefined fragments or cycles"),
+ Detail: fmt.Sprintf("%v", frags),
+ })
+ }
+
+ return nil, fmt.Errorf("compile error"), cerrs
+ }
+
+ return nil, err, nil
+ }
+ }
+
+ cpTrees := map[spec.LexModeKindID]psr.CPTree{}
{
- pats := make([]*patternEntry, len(patterns)+1)
- pats[spec.LexModeKindIDNil] = &patternEntry{
- id: spec.LexModeKindIDNil,
+ pats := make([]*psr.PatternEntry, len(patterns)+1)
+ pats[spec.LexModeKindIDNil] = &psr.PatternEntry{
+ ID: spec.LexModeKindIDNil,
}
for id, pattern := range patterns {
- pats[id] = &patternEntry{
- id: id,
- pattern: pattern,
+ pats[id] = &psr.PatternEntry{
+ ID: id,
+ Pattern: pattern,
}
}
- var err error
- root, symTab, err = parse(pats, fragmentPatterns)
- if err != nil {
- return nil, err
+ var cerrs []*CompileError
+ for _, pat := range pats {
+ if pat.ID == spec.LexModeKindIDNil {
+ continue
+ }
+
+ p := psr.NewParser(kindIDToName[pat.ID], bytes.NewReader(pat.Pattern))
+ t, err := p.Parse()
+ if err != nil {
+ if err == psr.ParseErr {
+ detail, cause := p.Error()
+ cerrs = append(cerrs, &CompileError{
+ Kind: kindIDToName[pat.ID],
+ Fragment: false,
+ Cause: cause,
+ Detail: detail,
+ })
+ } else {
+ cerrs = append(cerrs, &CompileError{
+ Kind: kindIDToName[pat.ID],
+ Fragment: false,
+ Cause: err,
+ })
+ }
+ continue
+ }
+
+ complete, err := psr.ApplyFragments(t, fragmentCPTrees)
+ if err != nil {
+ return nil, err, nil
+ }
+ if !complete {
+ _, frags, err := t.Describe()
+ if err != nil {
+ return nil, err, nil
+ }
+
+ cerrs = append(cerrs, &CompileError{
+ Kind: kindIDToName[pat.ID],
+ Fragment: false,
+ Cause: fmt.Errorf("pattern contains undefined fragments"),
+ Detail: fmt.Sprintf("%v", frags),
+ })
+ continue
+ }
+
+ cpTrees[pat.ID] = t
+ }
+ if len(cerrs) > 0 {
+ return nil, fmt.Errorf("compile error"), cerrs
}
}
var tranTab *spec.TransitionTable
{
- dfa := genDFA(root, symTab)
- var err error
- tranTab, err = genTransitionTable(dfa)
+ root, symTab, err := dfa.ConvertCPTreeToByteTree(cpTrees)
if err != nil {
- return nil, err
+ return nil, err, nil
+ }
+ d := dfa.GenDFA(root, symTab)
+ tranTab, err = dfa.GenTransitionTable(d)
+ if err != nil {
+ return nil, err, nil
}
}
@@ -211,12 +332,12 @@ func compile(entries []*spec.LexEntry, modeName2ID map[spec.LexModeName]spec.Lex
case 2:
tranTab, err = compressTransitionTableLv2(tranTab)
if err != nil {
- return nil, err
+ return nil, err, nil
}
case 1:
tranTab, err = compressTransitionTableLv1(tranTab)
if err != nil {
- return nil, err
+ return nil, err, nil
}
}
@@ -225,7 +346,7 @@ func compile(entries []*spec.LexEntry, modeName2ID map[spec.LexModeName]spec.Lex
Push: push,
Pop: pop,
DFA: tranTab,
- }, nil
+ }, nil, nil
}
const (
diff --git a/compiler/compiler_test.go b/compiler/compiler_test.go
index 456920f..2534d9f 100644
--- a/compiler/compiler_test.go
+++ b/compiler/compiler_test.go
@@ -85,7 +85,7 @@ func TestCompile(t *testing.T) {
if err != nil {
t.Fatalf("%v", err)
}
- clspec, err := Compile(lspec)
+ clspec, err, _ := Compile(lspec)
if tt.Err {
if err == nil {
t.Fatalf("expected an error")
@@ -95,7 +95,7 @@ func TestCompile(t *testing.T) {
}
} else {
if err != nil {
- t.Fatalf("unexpected error")
+ t.Fatalf("unexpected error: %v", err)
}
if clspec == nil {
t.Fatalf("Compile function must return a compiled specification")
diff --git a/compiler/dfa.go b/compiler/dfa.go
deleted file mode 100644
index 1d0b26a..0000000
--- a/compiler/dfa.go
+++ /dev/null
@@ -1,139 +0,0 @@
-package compiler
-
-import (
- "sort"
-
- "github.com/nihei9/maleeni/spec"
-)
-
-type DFA struct {
- States []string
- InitialState string
- AcceptingStatesTable map[string]spec.LexModeKindID
- TransitionTable map[string][256]string
-}
-
-func genDFA(root astNode, symTab *symbolTable) *DFA {
- initialState := root.first()
- initialStateHash := initialState.hash()
- stateMap := map[string]*symbolPositionSet{
- initialStateHash: initialState,
- }
- tranTab := map[string][256]string{}
- {
- follow := genFollowTable(root)
- unmarkedStates := map[string]*symbolPositionSet{
- initialStateHash: initialState,
- }
- for len(unmarkedStates) > 0 {
- nextUnmarkedStates := map[string]*symbolPositionSet{}
- for hash, state := range unmarkedStates {
- tranTabOfState := [256]*symbolPositionSet{}
- for _, pos := range state.set() {
- if pos.isEndMark() {
- continue
- }
- valRange := symTab.symPos2Byte[pos]
- for symVal := valRange.from; symVal <= valRange.to; symVal++ {
- if tranTabOfState[symVal] == nil {
- tranTabOfState[symVal] = newSymbolPositionSet()
- }
- tranTabOfState[symVal].merge(follow[pos])
- }
- }
- for _, t := range tranTabOfState {
- if t == nil {
- continue
- }
- h := t.hash()
- if _, ok := stateMap[h]; ok {
- continue
- }
- stateMap[h] = t
- nextUnmarkedStates[h] = t
- }
- tabOfState := [256]string{}
- for v, t := range tranTabOfState {
- if t == nil {
- continue
- }
- tabOfState[v] = t.hash()
- }
- tranTab[hash] = tabOfState
- }
- unmarkedStates = nextUnmarkedStates
- }
- }
-
- accTab := map[string]spec.LexModeKindID{}
- {
- for h, s := range stateMap {
- for _, pos := range s.set() {
- if !pos.isEndMark() {
- continue
- }
- priorID, ok := accTab[h]
- if !ok {
- accTab[h] = symTab.endPos2ID[pos]
- } else {
- id := symTab.endPos2ID[pos]
- if id < priorID {
- accTab[h] = id
- }
- }
- }
- }
- }
-
- var states []string
- {
- for s := range stateMap {
- states = append(states, s)
- }
- sort.Slice(states, func(i, j int) bool {
- return states[i] < states[j]
- })
- }
-
- return &DFA{
- States: states,
- InitialState: initialStateHash,
- AcceptingStatesTable: accTab,
- TransitionTable: tranTab,
- }
-}
-
-func genTransitionTable(dfa *DFA) (*spec.TransitionTable, error) {
- stateHash2ID := map[string]spec.StateID{}
- for i, s := range dfa.States {
- // Since 0 represents an invalid value in a transition table,
- // assign a number greater than or equal to 1 to states.
- stateHash2ID[s] = spec.StateID(i + spec.StateIDMin.Int())
- }
-
- acc := make([]spec.LexModeKindID, len(dfa.States)+1)
- for _, s := range dfa.States {
- id, ok := dfa.AcceptingStatesTable[s]
- if !ok {
- continue
- }
- acc[stateHash2ID[s]] = id
- }
-
- rowCount := len(dfa.States) + 1
- colCount := 256
- tran := make([]spec.StateID, rowCount*colCount)
- for s, tab := range dfa.TransitionTable {
- for v, to := range tab {
- tran[stateHash2ID[s].Int()*256+v] = stateHash2ID[to]
- }
- }
-
- return &spec.TransitionTable{
- InitialStateID: stateHash2ID[dfa.InitialState],
- AcceptingStates: acc,
- UncompressedTransition: tran,
- RowCount: rowCount,
- ColCount: colCount,
- }, nil
-}
diff --git a/compiler/dfa_test.go b/compiler/dfa_test.go
deleted file mode 100644
index 74c9ba8..0000000
--- a/compiler/dfa_test.go
+++ /dev/null
@@ -1,117 +0,0 @@
-package compiler
-
-import (
- "testing"
-
- "github.com/nihei9/maleeni/spec"
-)
-
-func TestGenDFA(t *testing.T) {
- root, symTab, err := parse([]*patternEntry{
- {
- id: spec.LexModeKindIDMin,
- pattern: []byte("(a|b)*abb"),
- },
- }, nil)
- if err != nil {
- t.Fatal(err)
- }
- dfa := genDFA(root, symTab)
- if dfa == nil {
- t.Fatalf("DFA is nil")
- }
-
- symPos := func(n uint16) symbolPosition {
- pos, err := newSymbolPosition(n, false)
- if err != nil {
- panic(err)
- }
- return pos
- }
-
- endPos := func(n uint16) symbolPosition {
- pos, err := newSymbolPosition(n, true)
- if err != nil {
- panic(err)
- }
- return pos
- }
-
- s0 := newSymbolPositionSet().add(symPos(1)).add(symPos(2)).add(symPos(3))
- s1 := newSymbolPositionSet().add(symPos(1)).add(symPos(2)).add(symPos(3)).add(symPos(4))
- s2 := newSymbolPositionSet().add(symPos(1)).add(symPos(2)).add(symPos(3)).add(symPos(5))
- s3 := newSymbolPositionSet().add(symPos(1)).add(symPos(2)).add(symPos(3)).add(endPos(6))
-
- rune2Int := func(char rune, index int) uint8 {
- return uint8([]byte(string(char))[index])
- }
-
- tranS0 := [256]string{}
- tranS0[rune2Int('a', 0)] = s1.hash()
- tranS0[rune2Int('b', 0)] = s0.hash()
-
- tranS1 := [256]string{}
- tranS1[rune2Int('a', 0)] = s1.hash()
- tranS1[rune2Int('b', 0)] = s2.hash()
-
- tranS2 := [256]string{}
- tranS2[rune2Int('a', 0)] = s1.hash()
- tranS2[rune2Int('b', 0)] = s3.hash()
-
- tranS3 := [256]string{}
- tranS3[rune2Int('a', 0)] = s1.hash()
- tranS3[rune2Int('b', 0)] = s0.hash()
-
- expectedTranTab := map[string][256]string{
- s0.hash(): tranS0,
- s1.hash(): tranS1,
- s2.hash(): tranS2,
- s3.hash(): tranS3,
- }
- if len(dfa.TransitionTable) != len(expectedTranTab) {
- t.Errorf("transition table is mismatched; want: %v entries, got: %v entries", len(expectedTranTab), len(dfa.TransitionTable))
- }
- for h, eTranTab := range expectedTranTab {
- tranTab, ok := dfa.TransitionTable[h]
- if !ok {
- t.Errorf("no entry; hash: %v", h)
- continue
- }
- if len(tranTab) != len(eTranTab) {
- t.Errorf("transition table is mismatched; hash: %v, want: %v entries, got: %v entries", h, len(eTranTab), len(tranTab))
- }
- for c, eNext := range eTranTab {
- if eNext == "" {
- continue
- }
-
- next := tranTab[c]
- if next == "" {
- t.Errorf("no enatry; hash: %v, char: %v", h, c)
- }
- if next != eNext {
- t.Errorf("next state is mismatched; want: %v, got: %v", eNext, next)
- }
- }
- }
-
- if dfa.InitialState != s0.hash() {
- t.Errorf("initial state is mismatched; want: %v, got: %v", s0.hash(), dfa.InitialState)
- }
-
- accTab := map[string]spec.LexModeKindID{
- s3.hash(): 1,
- }
- if len(dfa.AcceptingStatesTable) != len(accTab) {
- t.Errorf("accepting states are mismatched; want: %v entries, got: %v entries", len(accTab), len(dfa.AcceptingStatesTable))
- }
- for eState, eID := range accTab {
- id, ok := dfa.AcceptingStatesTable[eState]
- if !ok {
- t.Errorf("accepting state is not found; state: %v", eState)
- }
- if id != eID {
- t.Errorf("ID is mismatched; state: %v, want: %v, got: %v", eState, eID, id)
- }
- }
-}
diff --git a/compiler/lexer.go b/compiler/lexer.go
deleted file mode 100644
index 4f6fdac..0000000
--- a/compiler/lexer.go
+++ /dev/null
@@ -1,578 +0,0 @@
-package compiler
-
-import (
- "bufio"
- "fmt"
- "io"
- "strings"
-)
-
-type tokenKind string
-
-const (
- tokenKindChar = tokenKind("char")
- tokenKindAnyChar = tokenKind(".")
- tokenKindRepeat = tokenKind("*")
- tokenKindRepeatOneOrMore = tokenKind("+")
- tokenKindOption = tokenKind("?")
- tokenKindAlt = tokenKind("|")
- tokenKindGroupOpen = tokenKind("(")
- tokenKindGroupClose = tokenKind(")")
- tokenKindBExpOpen = tokenKind("[")
- tokenKindInverseBExpOpen = tokenKind("[^")
- tokenKindBExpClose = tokenKind("]")
- tokenKindCharRange = tokenKind("-")
- tokenKindCodePointLeader = tokenKind("\\u")
- tokenKindCharPropLeader = tokenKind("\\p")
- tokenKindFragmentLeader = tokenKind("\\f")
- tokenKindLBrace = tokenKind("{")
- tokenKindRBrace = tokenKind("}")
- tokenKindEqual = tokenKind("=")
- tokenKindCodePoint = tokenKind("code point")
- tokenKindCharPropSymbol = tokenKind("character property symbol")
- tokenKindFragmentSymbol = tokenKind("fragment symbol")
- tokenKindEOF = tokenKind("eof")
-)
-
-type token struct {
- kind tokenKind
- char rune
- propSymbol string
- codePoint string
- fragmentSymbol string
-}
-
-const nullChar = '\u0000'
-
-func newToken(kind tokenKind, char rune) *token {
- return &token{
- kind: kind,
- char: char,
- }
-}
-
-func newCodePointToken(codePoint string) *token {
- return &token{
- kind: tokenKindCodePoint,
- codePoint: codePoint,
- }
-}
-
-func newCharPropSymbolToken(propSymbol string) *token {
- return &token{
- kind: tokenKindCharPropSymbol,
- propSymbol: propSymbol,
- }
-}
-
-func newFragmentSymbolToken(fragmentSymbol string) *token {
- return &token{
- kind: tokenKindFragmentSymbol,
- fragmentSymbol: fragmentSymbol,
- }
-}
-
-type lexerMode string
-
-const (
- lexerModeDefault = lexerMode("default")
- lexerModeBExp = lexerMode("bracket expression")
- lexerModeCPExp = lexerMode("code point expression")
- lexerModeCharPropExp = lexerMode("character property expression")
- lexerModeFragmentExp = lexerMode("fragment expression")
-)
-
-type lexerModeStack struct {
- stack []lexerMode
-}
-
-func newLexerModeStack() *lexerModeStack {
- return &lexerModeStack{
- stack: []lexerMode{
- lexerModeDefault,
- },
- }
-}
-
-func (s *lexerModeStack) top() lexerMode {
- return s.stack[len(s.stack)-1]
-}
-
-func (s *lexerModeStack) push(m lexerMode) {
- s.stack = append(s.stack, m)
-}
-
-func (s *lexerModeStack) pop() {
- s.stack = s.stack[:len(s.stack)-1]
-}
-
-type rangeState string
-
-// [a-z]
-// ^^^^
-// |||`-- ready
-// ||`-- expect range terminator
-// |`-- read range initiator
-// `-- ready
-const (
- rangeStateReady = rangeState("ready")
- rangeStateReadRangeInitiator = rangeState("read range initiator")
- rangeStateExpectRangeTerminator = rangeState("expect range terminator")
-)
-
-type lexer struct {
- src *bufio.Reader
- peekChar2 rune
- peekEOF2 bool
- peekChar1 rune
- peekEOF1 bool
- lastChar rune
- reachedEOF bool
- prevChar1 rune
- prevEOF1 bool
- prevChar2 rune
- pervEOF2 bool
- modeStack *lexerModeStack
- rangeState rangeState
- errMsgDetails string
-}
-
-func newLexer(src io.Reader) *lexer {
- return &lexer{
- src: bufio.NewReader(src),
- peekChar2: nullChar,
- peekEOF2: false,
- peekChar1: nullChar,
- peekEOF1: false,
- lastChar: nullChar,
- reachedEOF: false,
- prevChar1: nullChar,
- prevEOF1: false,
- prevChar2: nullChar,
- pervEOF2: false,
- modeStack: newLexerModeStack(),
- rangeState: rangeStateReady,
- }
-}
-
-func (l *lexer) next() (*token, error) {
- c, eof, err := l.read()
- if err != nil {
- return nil, err
- }
- if eof {
- return newToken(tokenKindEOF, nullChar), nil
- }
-
- switch l.modeStack.top() {
- case lexerModeBExp:
- tok, err := l.nextInBExp(c)
- if err != nil {
- return nil, err
- }
- switch tok.kind {
- case tokenKindBExpClose:
- l.modeStack.pop()
- case tokenKindCharRange:
- l.rangeState = rangeStateExpectRangeTerminator
- case tokenKindChar:
- switch l.rangeState {
- case rangeStateReady:
- l.rangeState = rangeStateReadRangeInitiator
- case rangeStateExpectRangeTerminator:
- l.rangeState = rangeStateReady
- }
- case tokenKindCodePointLeader:
- l.modeStack.push(lexerModeCPExp)
- case tokenKindCharPropLeader:
- l.modeStack.push(lexerModeCharPropExp)
- }
- return tok, nil
- case lexerModeCPExp:
- tok, err := l.nextInCodePoint(c)
- if err != nil {
- return nil, err
- }
- switch tok.kind {
- case tokenKindRBrace:
- l.modeStack.pop()
- }
- return tok, nil
- case lexerModeCharPropExp:
- tok, err := l.nextInCharProp(c)
- if err != nil {
- return nil, err
- }
- switch tok.kind {
- case tokenKindRBrace:
- l.modeStack.pop()
- }
- return tok, nil
- case lexerModeFragmentExp:
- tok, err := l.nextInFragment(c)
- if err != nil {
- return nil, err
- }
- switch tok.kind {
- case tokenKindRBrace:
- l.modeStack.pop()
- }
- return tok, nil
- default:
- tok, err := l.nextInDefault(c)
- if err != nil {
- return nil, err
- }
- switch tok.kind {
- case tokenKindBExpOpen:
- l.modeStack.push(lexerModeBExp)
- l.rangeState = rangeStateReady
- case tokenKindInverseBExpOpen:
- l.modeStack.push(lexerModeBExp)
- l.rangeState = rangeStateReady
- case tokenKindCodePointLeader:
- l.modeStack.push(lexerModeCPExp)
- case tokenKindCharPropLeader:
- l.modeStack.push(lexerModeCharPropExp)
- case tokenKindFragmentLeader:
- l.modeStack.push(lexerModeFragmentExp)
- }
- return tok, nil
- }
-}
-
-func (l *lexer) nextInDefault(c rune) (*token, error) {
- switch c {
- case '*':
- return newToken(tokenKindRepeat, nullChar), nil
- case '+':
- return newToken(tokenKindRepeatOneOrMore, nullChar), nil
- case '?':
- return newToken(tokenKindOption, nullChar), nil
- case '.':
- return newToken(tokenKindAnyChar, nullChar), nil
- case '|':
- return newToken(tokenKindAlt, nullChar), nil
- case '(':
- return newToken(tokenKindGroupOpen, nullChar), nil
- case ')':
- return newToken(tokenKindGroupClose, nullChar), nil
- case '[':
- c1, eof, err := l.read()
- if err != nil {
- return nil, err
- }
- if eof {
- err := l.restore()
- if err != nil {
- return nil, err
- }
- return newToken(tokenKindBExpOpen, nullChar), nil
- }
- if c1 != '^' {
- err := l.restore()
- if err != nil {
- return nil, err
- }
- return newToken(tokenKindBExpOpen, nullChar), nil
- }
- c2, eof, err := l.read()
- if err != nil {
- return nil, err
- }
- if eof {
- err := l.restore()
- if err != nil {
- return nil, err
- }
- return newToken(tokenKindInverseBExpOpen, nullChar), nil
- }
- if c2 != ']' {
- err := l.restore()
- if err != nil {
- return nil, err
- }
- return newToken(tokenKindInverseBExpOpen, nullChar), nil
- }
- err = l.restore()
- if err != nil {
- return nil, err
- }
- err = l.restore()
- if err != nil {
- return nil, err
- }
- return newToken(tokenKindBExpOpen, nullChar), nil
- case '\\':
- c, eof, err := l.read()
- if err != nil {
- return nil, err
- }
- if eof {
- return nil, synErrIncompletedEscSeq
- }
- if c == 'u' {
- return newToken(tokenKindCodePointLeader, nullChar), nil
- }
- if c == 'p' {
- return newToken(tokenKindCharPropLeader, nullChar), nil
- }
- if c == 'f' {
- return newToken(tokenKindFragmentLeader, nullChar), nil
- }
- if c == '\\' || c == '.' || c == '*' || c == '+' || c == '?' || c == '|' || c == '(' || c == ')' || c == '[' || c == ']' {
- return newToken(tokenKindChar, c), nil
- }
- l.errMsgDetails = fmt.Sprintf("\\%v is not supported", string(c))
- return nil, synErrInvalidEscSeq
- default:
- return newToken(tokenKindChar, c), nil
- }
-}
-
-func (l *lexer) nextInBExp(c rune) (*token, error) {
- switch c {
- case '-':
- if l.rangeState != rangeStateReadRangeInitiator {
- return newToken(tokenKindChar, c), nil
- }
- c1, eof, err := l.read()
- if err != nil {
- return nil, err
- }
- if eof {
- err := l.restore()
- if err != nil {
- return nil, err
- }
- return newToken(tokenKindChar, c), nil
- }
- if c1 != ']' {
- err := l.restore()
- if err != nil {
- return nil, err
- }
- return newToken(tokenKindCharRange, nullChar), nil
- }
- err = l.restore()
- if err != nil {
- return nil, err
- }
- return newToken(tokenKindChar, c), nil
- case ']':
- return newToken(tokenKindBExpClose, nullChar), nil
- case '\\':
- c, eof, err := l.read()
- if err != nil {
- return nil, err
- }
- if eof {
- return nil, synErrIncompletedEscSeq
- }
- if c == 'u' {
- return newToken(tokenKindCodePointLeader, nullChar), nil
- }
- if c == 'p' {
- return newToken(tokenKindCharPropLeader, nullChar), nil
- }
- if c == '\\' || c == '^' || c == '-' || c == ']' {
- return newToken(tokenKindChar, c), nil
- }
- l.errMsgDetails = fmt.Sprintf("\\%v is not supported in a bracket expression", string(c))
- return nil, synErrInvalidEscSeq
- default:
- return newToken(tokenKindChar, c), nil
- }
-}
-
-func (l *lexer) nextInCodePoint(c rune) (*token, error) {
- switch c {
- case '{':
- return newToken(tokenKindLBrace, nullChar), nil
- case '}':
- return newToken(tokenKindRBrace, nullChar), nil
- default:
- if !isHexDigit(c) {
- return nil, synErrInvalidCodePoint
- }
- var b strings.Builder
- fmt.Fprint(&b, string(c))
- n := 1
- for {
- c, eof, err := l.read()
- if err != nil {
- return nil, err
- }
- if eof {
- l.restore()
- if err != nil {
- return nil, err
- }
- break
- }
- if c == '}' {
- err := l.restore()
- if err != nil {
- return nil, err
- }
- break
- }
- if !isHexDigit(c) || n >= 6 {
- return nil, synErrInvalidCodePoint
- }
- fmt.Fprint(&b, string(c))
- n++
- }
- cp := b.String()
- cpLen := len(cp)
- if !(cpLen == 4 || cpLen == 6) {
- return nil, synErrInvalidCodePoint
- }
- return newCodePointToken(b.String()), nil
- }
-}
-
-func isHexDigit(c rune) bool {
- if c >= '0' && c <= '9' || c >= 'A' && c <= 'Z' || c >= 'a' && c <= 'z' {
- return true
- }
- return false
-}
-
-func (l *lexer) nextInCharProp(c rune) (*token, error) {
- switch c {
- case '{':
- return newToken(tokenKindLBrace, nullChar), nil
- case '}':
- return newToken(tokenKindRBrace, nullChar), nil
- case '=':
- return newToken(tokenKindEqual, nullChar), nil
- default:
- var b strings.Builder
- fmt.Fprint(&b, string(c))
- n := 1
- for {
- c, eof, err := l.read()
- if err != nil {
- return nil, err
- }
- if eof {
- l.restore()
- if err != nil {
- return nil, err
- }
- break
- }
- if c == '}' || c == '=' {
- err := l.restore()
- if err != nil {
- return nil, err
- }
- break
- }
- fmt.Fprint(&b, string(c))
- n++
- }
- sym := strings.TrimSpace(b.String())
- if len(sym) == 0 {
- raiseSyntaxError(synErrCharPropInvalidSymbol)
- }
- return newCharPropSymbolToken(sym), nil
- }
-}
-
-func (l *lexer) nextInFragment(c rune) (*token, error) {
- switch c {
- case '{':
- return newToken(tokenKindLBrace, nullChar), nil
- case '}':
- return newToken(tokenKindRBrace, nullChar), nil
- default:
- var b strings.Builder
- fmt.Fprint(&b, string(c))
- n := 1
- for {
- c, eof, err := l.read()
- if err != nil {
- return nil, err
- }
- if eof {
- l.restore()
- if err != nil {
- return nil, err
- }
- break
- }
- if c == '}' {
- err := l.restore()
- if err != nil {
- return nil, err
- }
- break
- }
- fmt.Fprint(&b, string(c))
- n++
- }
- sym := strings.TrimSpace(b.String())
- if len(sym) == 0 {
- raiseSyntaxError(SynErrFragmentInvalidSymbol)
- }
- return newFragmentSymbolToken(sym), nil
- }
-}
-
-func (l *lexer) read() (rune, bool, error) {
- if l.reachedEOF {
- return l.lastChar, l.reachedEOF, nil
- }
- if l.peekChar1 != nullChar || l.peekEOF1 {
- l.prevChar2 = l.prevChar1
- l.pervEOF2 = l.prevEOF1
- l.prevChar1 = l.lastChar
- l.prevEOF1 = l.reachedEOF
- l.lastChar = l.peekChar1
- l.reachedEOF = l.peekEOF1
- l.peekChar1 = l.peekChar2
- l.peekEOF1 = l.peekEOF2
- l.peekChar2 = nullChar
- l.peekEOF2 = false
- return l.lastChar, l.reachedEOF, nil
- }
- c, _, err := l.src.ReadRune()
- if err != nil {
- if err == io.EOF {
- l.prevChar2 = l.prevChar1
- l.pervEOF2 = l.prevEOF1
- l.prevChar1 = l.lastChar
- l.prevEOF1 = l.reachedEOF
- l.lastChar = nullChar
- l.reachedEOF = true
- return l.lastChar, l.reachedEOF, nil
- }
- return nullChar, false, err
- }
- l.prevChar2 = l.prevChar1
- l.pervEOF2 = l.prevEOF1
- l.prevChar1 = l.lastChar
- l.prevEOF1 = l.reachedEOF
- l.lastChar = c
- l.reachedEOF = false
- return l.lastChar, l.reachedEOF, nil
-}
-
-func (l *lexer) restore() error {
- if l.lastChar == nullChar && !l.reachedEOF {
- return fmt.Errorf("the lexer failed to call restore() because the last character is null")
- }
- l.peekChar2 = l.peekChar1
- l.peekEOF2 = l.peekEOF1
- l.peekChar1 = l.lastChar
- l.peekEOF1 = l.reachedEOF
- l.lastChar = l.prevChar1
- l.reachedEOF = l.prevEOF1
- l.prevChar1 = l.prevChar2
- l.prevEOF1 = l.pervEOF2
- l.prevChar2 = nullChar
- l.pervEOF2 = false
- return nil
-}
diff --git a/compiler/lexer_test.go b/compiler/lexer_test.go
deleted file mode 100644
index 51eb82e..0000000
--- a/compiler/lexer_test.go
+++ /dev/null
@@ -1,514 +0,0 @@
-package compiler
-
-import (
- "strings"
- "testing"
-)
-
-func TestLexer(t *testing.T) {
- tests := []struct {
- caption string
- src string
- tokens []*token
- err error
- }{
- {
- caption: "lexer can recognize ordinaly characters",
- src: "123abcいろは",
- tokens: []*token{
- newToken(tokenKindChar, '1'),
- newToken(tokenKindChar, '2'),
- newToken(tokenKindChar, '3'),
- newToken(tokenKindChar, 'a'),
- newToken(tokenKindChar, 'b'),
- newToken(tokenKindChar, 'c'),
- newToken(tokenKindChar, 'い'),
- newToken(tokenKindChar, 'ろ'),
- newToken(tokenKindChar, 'は'),
- newToken(tokenKindEOF, nullChar),
- },
- },
- {
- caption: "lexer can recognize the special characters in default mode",
- src: ".*+?|()[\\u",
- tokens: []*token{
- newToken(tokenKindAnyChar, nullChar),
- newToken(tokenKindRepeat, nullChar),
- newToken(tokenKindRepeatOneOrMore, nullChar),
- newToken(tokenKindOption, nullChar),
- newToken(tokenKindAlt, nullChar),
- newToken(tokenKindGroupOpen, nullChar),
- newToken(tokenKindGroupClose, nullChar),
- newToken(tokenKindBExpOpen, nullChar),
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindEOF, nullChar),
- },
- },
- {
- caption: "lexer can recognize the escape sequences in default mode",
- src: "\\\\\\.\\*\\+\\?\\|\\(\\)\\[",
- tokens: []*token{
- newToken(tokenKindChar, '\\'),
- newToken(tokenKindChar, '.'),
- newToken(tokenKindChar, '*'),
- newToken(tokenKindChar, '+'),
- newToken(tokenKindChar, '?'),
- newToken(tokenKindChar, '|'),
- newToken(tokenKindChar, '('),
- newToken(tokenKindChar, ')'),
- newToken(tokenKindChar, '['),
- newToken(tokenKindEOF, nullChar),
- },
- },
- {
- caption: "], {, and } are treated as an ordinary character in default mode",
- src: "]{}",
- tokens: []*token{
- newToken(tokenKindChar, ']'),
- newToken(tokenKindChar, '{'),
- newToken(tokenKindChar, '}'),
- newToken(tokenKindEOF, nullChar),
- },
- },
- {
- caption: "lexer can recognize the special characters in bracket expression mode",
- src: "[a-z\\u{09AF}][^a-z\\u{09abcf}]",
- tokens: []*token{
- newToken(tokenKindBExpOpen, nullChar),
- newToken(tokenKindChar, 'a'),
- newToken(tokenKindCharRange, nullChar),
- newToken(tokenKindChar, 'z'),
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCodePointToken("09AF"),
- newToken(tokenKindRBrace, nullChar),
- newToken(tokenKindBExpClose, nullChar),
- newToken(tokenKindInverseBExpOpen, nullChar),
- newToken(tokenKindChar, 'a'),
- newToken(tokenKindCharRange, nullChar),
- newToken(tokenKindChar, 'z'),
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCodePointToken("09abcf"),
- newToken(tokenKindRBrace, nullChar),
- newToken(tokenKindBExpClose, nullChar),
- newToken(tokenKindEOF, nullChar),
- },
- },
- {
- caption: "lexer can recognize the escape sequences in bracket expression mode",
- src: "[\\^a\\-z]",
- tokens: []*token{
- newToken(tokenKindBExpOpen, nullChar),
- newToken(tokenKindChar, '^'),
- newToken(tokenKindChar, 'a'),
- newToken(tokenKindChar, '-'),
- newToken(tokenKindChar, 'z'),
- newToken(tokenKindBExpClose, nullChar),
- newToken(tokenKindEOF, nullChar),
- },
- },
- {
- caption: "in a bracket expression, the special characters are also handled as normal characters",
- src: "[\\\\.*+?|()[",
- tokens: []*token{
- newToken(tokenKindBExpOpen, nullChar),
- newToken(tokenKindChar, '\\'),
- newToken(tokenKindChar, '.'),
- newToken(tokenKindChar, '*'),
- newToken(tokenKindChar, '+'),
- newToken(tokenKindChar, '?'),
- newToken(tokenKindChar, '|'),
- newToken(tokenKindChar, '('),
- newToken(tokenKindChar, ')'),
- newToken(tokenKindChar, '['),
- newToken(tokenKindEOF, nullChar),
- },
- },
- {
- caption: "hyphen symbols that appear in bracket expressions are handled as the character range symbol or ordinary characters",
- // [...-...][...-][-...][-]
- // ~~~~~~~ ~ ~ ~
- // ^ ^ ^ ^
- // | | | `-- Ordinary Character (b)
- // | | `-- Ordinary Character (b)
- // | `-- Ordinary Character (b)
- // `-- Character Range (a)
- //
- // a. *-* is handled as a character range expression.
- // b. *-, -*, or - are handled as ordinary characters.
- src: "[a-z][a-][-z][-][--][---][^a-z][^a-][^-z][^-][^--][^---]",
- tokens: []*token{
- newToken(tokenKindBExpOpen, nullChar),
- newToken(tokenKindChar, 'a'),
- newToken(tokenKindCharRange, nullChar),
- newToken(tokenKindChar, 'z'),
- newToken(tokenKindBExpClose, nullChar),
- newToken(tokenKindBExpOpen, nullChar),
- newToken(tokenKindChar, 'a'),
- newToken(tokenKindChar, '-'),
- newToken(tokenKindBExpClose, nullChar),
- newToken(tokenKindBExpOpen, nullChar),
- newToken(tokenKindChar, '-'),
- newToken(tokenKindChar, 'z'),
- newToken(tokenKindBExpClose, nullChar),
- newToken(tokenKindBExpOpen, nullChar),
- newToken(tokenKindChar, '-'),
- newToken(tokenKindBExpClose, nullChar),
- newToken(tokenKindBExpOpen, nullChar),
- newToken(tokenKindChar, '-'),
- newToken(tokenKindChar, '-'),
- newToken(tokenKindBExpClose, nullChar),
- newToken(tokenKindBExpOpen, nullChar),
- newToken(tokenKindChar, '-'),
- newToken(tokenKindCharRange, nullChar),
- newToken(tokenKindChar, '-'),
- newToken(tokenKindBExpClose, nullChar),
-
- newToken(tokenKindInverseBExpOpen, nullChar),
- newToken(tokenKindChar, 'a'),
- newToken(tokenKindCharRange, nullChar),
- newToken(tokenKindChar, 'z'),
- newToken(tokenKindBExpClose, nullChar),
- newToken(tokenKindInverseBExpOpen, nullChar),
- newToken(tokenKindChar, 'a'),
- newToken(tokenKindChar, '-'),
- newToken(tokenKindBExpClose, nullChar),
- newToken(tokenKindInverseBExpOpen, nullChar),
- newToken(tokenKindChar, '-'),
- newToken(tokenKindChar, 'z'),
- newToken(tokenKindBExpClose, nullChar),
- newToken(tokenKindInverseBExpOpen, nullChar),
- newToken(tokenKindChar, '-'),
- newToken(tokenKindBExpClose, nullChar),
- newToken(tokenKindInverseBExpOpen, nullChar),
- newToken(tokenKindChar, '-'),
- newToken(tokenKindChar, '-'),
- newToken(tokenKindBExpClose, nullChar),
- newToken(tokenKindInverseBExpOpen, nullChar),
- newToken(tokenKindChar, '-'),
- newToken(tokenKindCharRange, nullChar),
- newToken(tokenKindChar, '-'),
- newToken(tokenKindBExpClose, nullChar),
-
- newToken(tokenKindEOF, nullChar),
- },
- },
- {
- caption: "caret symbols that appear in bracket expressions are handled as the logical inverse symbol or ordinary characters",
- // [^...^...][^]
- // ~~ ~ ~~
- // ^ ^ ^^
- // | | |`-- Ordinary Character (c)
- // | | `-- Bracket Expression
- // | `-- Ordinary Character (b)
- // `-- Inverse Bracket Expression (a)
- //
- // a. Bracket expressions that have a caret symbol at the beginning are handled as logical inverse expressions.
- // b. caret symbols that appear as the second and the subsequent symbols are handled as ordinary symbols.
- // c. When a bracket expression has just one symbol, a caret symbol at the beginning is handled as an ordinary character.
- src: "[^^][^]",
- tokens: []*token{
- newToken(tokenKindInverseBExpOpen, nullChar),
- newToken(tokenKindChar, '^'),
- newToken(tokenKindBExpClose, nullChar),
- newToken(tokenKindBExpOpen, nullChar),
- newToken(tokenKindChar, '^'),
- newToken(tokenKindBExpClose, nullChar),
- newToken(tokenKindEOF, nullChar),
- },
- },
- {
- caption: "lexer raises an error when an invalid escape sequence appears",
- src: "\\@",
- err: synErrInvalidEscSeq,
- },
- {
- caption: "lexer raises an error when the incomplete escape sequence (EOF following \\) appears",
- src: "\\",
- err: synErrIncompletedEscSeq,
- },
- {
- caption: "lexer raises an error when an invalid escape sequence appears",
- src: "[\\@",
- tokens: []*token{
- newToken(tokenKindBExpOpen, nullChar),
- },
- err: synErrInvalidEscSeq,
- },
- {
- caption: "lexer raises an error when the incomplete escape sequence (EOF following \\) appears",
- src: "[\\",
- tokens: []*token{
- newToken(tokenKindBExpOpen, nullChar),
- },
- err: synErrIncompletedEscSeq,
- },
- {
- caption: "lexer can recognize the special characters and code points in code point expression mode",
- src: "\\u{0123}\\u{4567}\\u{89abcd}\\u{efAB}\\u{CDEF01}[\\u{0123}\\u{4567}\\u{89abcd}\\u{efAB}\\u{CDEF01}][^\\u{0123}\\u{4567}\\u{89abcd}\\u{efAB}\\u{CDEF01}]",
- tokens: []*token{
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCodePointToken("0123"),
- newToken(tokenKindRBrace, nullChar),
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCodePointToken("4567"),
- newToken(tokenKindRBrace, nullChar),
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCodePointToken("89abcd"),
- newToken(tokenKindRBrace, nullChar),
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCodePointToken("efAB"),
- newToken(tokenKindRBrace, nullChar),
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCodePointToken("CDEF01"),
- newToken(tokenKindRBrace, nullChar),
-
- newToken(tokenKindBExpOpen, nullChar),
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCodePointToken("0123"),
- newToken(tokenKindRBrace, nullChar),
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCodePointToken("4567"),
- newToken(tokenKindRBrace, nullChar),
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCodePointToken("89abcd"),
- newToken(tokenKindRBrace, nullChar),
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCodePointToken("efAB"),
- newToken(tokenKindRBrace, nullChar),
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCodePointToken("CDEF01"),
- newToken(tokenKindRBrace, nullChar),
- newToken(tokenKindBExpClose, nullChar),
-
- newToken(tokenKindInverseBExpOpen, nullChar),
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCodePointToken("0123"),
- newToken(tokenKindRBrace, nullChar),
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCodePointToken("4567"),
- newToken(tokenKindRBrace, nullChar),
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCodePointToken("89abcd"),
- newToken(tokenKindRBrace, nullChar),
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCodePointToken("efAB"),
- newToken(tokenKindRBrace, nullChar),
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCodePointToken("CDEF01"),
- newToken(tokenKindRBrace, nullChar),
- newToken(tokenKindBExpClose, nullChar),
-
- newToken(tokenKindEOF, nullChar),
- },
- },
- {
- caption: "a one digit hex string isn't a valid code point",
- src: "\\u{0",
- tokens: []*token{
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- },
- err: synErrInvalidCodePoint,
- },
- {
- caption: "a two digits hex string isn't a valid code point",
- src: "\\u{01",
- tokens: []*token{
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- },
- err: synErrInvalidCodePoint,
- },
- {
- caption: "a three digits hex string isn't a valid code point",
- src: "\\u{012",
- tokens: []*token{
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- },
- err: synErrInvalidCodePoint,
- },
- {
- caption: "a four digits hex string is a valid code point",
- src: "\\u{0123}",
- tokens: []*token{
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCodePointToken("0123"),
- newToken(tokenKindRBrace, nullChar),
- },
- },
- {
- caption: "a five digits hex string isn't a valid code point",
- src: "\\u{01234",
- tokens: []*token{
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- },
- err: synErrInvalidCodePoint,
- },
- {
- caption: "a six digits hex string is a valid code point",
- src: "\\u{012345}",
- tokens: []*token{
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCodePointToken("012345"),
- newToken(tokenKindRBrace, nullChar),
- },
- },
- {
- caption: "a seven digits hex string isn't a valid code point",
- src: "\\u{0123456",
- tokens: []*token{
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- },
- err: synErrInvalidCodePoint,
- },
- {
- caption: "a code point must be hex digits",
- src: "\\u{g",
- tokens: []*token{
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- },
- err: synErrInvalidCodePoint,
- },
- {
- caption: "a code point must be hex digits",
- src: "\\u{G",
- tokens: []*token{
- newToken(tokenKindCodePointLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- },
- err: synErrInvalidCodePoint,
- },
- {
- caption: "lexer can recognize the special characters and symbols in character property expression mode",
- src: "\\p{Letter}\\p{General_Category=Letter}[\\p{Letter}\\p{General_Category=Letter}][^\\p{Letter}\\p{General_Category=Letter}]",
- tokens: []*token{
- newToken(tokenKindCharPropLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCharPropSymbolToken("Letter"),
- newToken(tokenKindRBrace, nullChar),
- newToken(tokenKindCharPropLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCharPropSymbolToken("General_Category"),
- newToken(tokenKindEqual, nullChar),
- newCharPropSymbolToken("Letter"),
- newToken(tokenKindRBrace, nullChar),
-
- newToken(tokenKindBExpOpen, nullChar),
- newToken(tokenKindCharPropLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCharPropSymbolToken("Letter"),
- newToken(tokenKindRBrace, nullChar),
- newToken(tokenKindCharPropLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCharPropSymbolToken("General_Category"),
- newToken(tokenKindEqual, nullChar),
- newCharPropSymbolToken("Letter"),
- newToken(tokenKindRBrace, nullChar),
- newToken(tokenKindBExpClose, nullChar),
-
- newToken(tokenKindInverseBExpOpen, nullChar),
- newToken(tokenKindCharPropLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCharPropSymbolToken("Letter"),
- newToken(tokenKindRBrace, nullChar),
- newToken(tokenKindCharPropLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newCharPropSymbolToken("General_Category"),
- newToken(tokenKindEqual, nullChar),
- newCharPropSymbolToken("Letter"),
- newToken(tokenKindRBrace, nullChar),
- newToken(tokenKindBExpClose, nullChar),
-
- newToken(tokenKindEOF, nullChar),
- },
- },
- {
- caption: "lexer can recognize the special characters and symbols in fragment expression mode",
- src: "\\f{integer}",
- tokens: []*token{
- newToken(tokenKindFragmentLeader, nullChar),
- newToken(tokenKindLBrace, nullChar),
- newFragmentSymbolToken("integer"),
- newToken(tokenKindRBrace, nullChar),
-
- newToken(tokenKindEOF, nullChar),
- },
- },
- {
- caption: "a fragment expression is not supported in a bracket expression",
- src: "[\\f",
- tokens: []*token{
- newToken(tokenKindBExpOpen, nullChar),
- },
- err: synErrInvalidEscSeq,
- },
- {
- caption: "a fragment expression is not supported in an inverse bracket expression",
- src: "[^\\f",
- tokens: []*token{
- newToken(tokenKindInverseBExpOpen, nullChar),
- },
- err: synErrInvalidEscSeq,
- },
- }
- for _, tt := range tests {
- t.Run(tt.caption, func(t *testing.T) {
- lex := newLexer(strings.NewReader(tt.src))
- var err error
- var tok *token
- i := 0
- for {
- tok, err = lex.next()
- if err != nil {
- break
- }
- if i >= len(tt.tokens) {
- break
- }
- eTok := tt.tokens[i]
- i++
- testToken(t, tok, eTok)
-
- if tok.kind == tokenKindEOF {
- break
- }
- }
- if err != tt.err {
- t.Fatalf("unexpected error; want: %v, got: %v (%v)", tt.err, err, lex.errMsgDetails)
- }
- if i < len(tt.tokens) {
- t.Fatalf("expecte more tokens")
- }
- })
- }
-}
-
-func testToken(t *testing.T, a, e *token) {
- t.Helper()
- if e.kind != a.kind || e.char != a.char || e.codePoint != a.codePoint {
- t.Fatalf("unexpected token; want: %+v, got: %+v", e, a)
- }
-}
diff --git a/compiler/parser.go b/compiler/parser.go
deleted file mode 100644
index ce481e3..0000000
--- a/compiler/parser.go
+++ /dev/null
@@ -1,862 +0,0 @@
-package compiler
-
-import (
- "bytes"
- "encoding/binary"
- "encoding/hex"
- "fmt"
- "io"
- "strings"
-
- "github.com/nihei9/maleeni/spec"
- "github.com/nihei9/maleeni/ucd"
- "github.com/nihei9/maleeni/utf8"
-)
-
-type ParseErrors struct {
- Errors []*ParseError
-}
-
-func (e *ParseErrors) Error() string {
- var b strings.Builder
- fmt.Fprintf(&b, "%v", e.Errors[0])
- for _, err := range e.Errors[1:] {
- fmt.Fprintf(&b, "\n%v", err)
- }
- return b.String()
-}
-
-type ParseError struct {
- ID spec.LexModeKindID
- Pattern []byte
- Cause error
- Details string
-}
-
-func (e *ParseError) Error() string {
- var b strings.Builder
- fmt.Fprintf(&b, "#%v %v: %v", e.ID, string(e.Pattern), e.Cause)
- if e.Details != "" {
- fmt.Fprintf(&b, ": %v", e.Details)
- }
- return b.String()
-}
-
-func raiseSyntaxError(synErr *SyntaxError) {
- panic(synErr)
-}
-
-type symbolTable struct {
- symPos2Byte map[symbolPosition]byteRange
- endPos2ID map[symbolPosition]spec.LexModeKindID
-}
-
-func genSymbolTable(root astNode) *symbolTable {
- symTab := &symbolTable{
- symPos2Byte: map[symbolPosition]byteRange{},
- endPos2ID: map[symbolPosition]spec.LexModeKindID{},
- }
- return genSymTab(symTab, root)
-}
-
-func genSymTab(symTab *symbolTable, node astNode) *symbolTable {
- if node == nil {
- return symTab
- }
-
- switch n := node.(type) {
- case *symbolNode:
- symTab.symPos2Byte[n.pos] = byteRange{
- from: n.from,
- to: n.to,
- }
- case *endMarkerNode:
- symTab.endPos2ID[n.pos] = n.id
- default:
- left, right := node.children()
- genSymTab(symTab, left)
- genSymTab(symTab, right)
- }
- return symTab
-}
-
-type patternEntry struct {
- id spec.LexModeKindID
- pattern []byte
-}
-
-func parse(pats []*patternEntry, fragments map[string][]byte) (astNode, *symbolTable, error) {
- if len(pats) == 0 {
- return nil, nil, fmt.Errorf("parse() needs at least one token entry")
- }
-
- fragmentASTs, err := parseFragments(fragments)
- if err != nil {
- return nil, nil, err
- }
- if fragmentASTs == nil {
- fragmentASTs = map[string]astNode{}
- }
-
- root, err := parseRegexp(pats, fragmentASTs)
- if err != nil {
- return nil, nil, err
- }
-
- return root, genSymbolTable(root), nil
-}
-
-type incompleteFragment struct {
- kind string
- ast astNode
-}
-
-func parseFragments(fragments map[string][]byte) (map[string]astNode, error) {
- if len(fragments) == 0 {
- return nil, nil
- }
- fragmentASTs := map[string]astNode{}
- incompleteFragments := []*incompleteFragment{}
- var perrs []*ParseError
- for kind, pattern := range fragments {
- p := newParser(bytes.NewReader(pattern))
- ast, err := p.parse()
- if err != nil {
- perrs = append(perrs, &ParseError{
- Pattern: pattern,
- Cause: err,
- Details: p.errMsgDetails,
- })
- continue
- }
- if p.incomplete {
- incompleteFragments = append(incompleteFragments, &incompleteFragment{
- kind: kind,
- ast: ast,
- })
- } else {
- fragmentASTs[kind] = ast
- }
- }
- for len(incompleteFragments) > 0 {
- lastIncompCount := len(incompleteFragments)
- remainingFragments := []*incompleteFragment{}
- for _, e := range incompleteFragments {
- remains := applyFragments(e.ast, fragmentASTs)
- if len(remains) > 0 {
- remainingFragments = append(remainingFragments, e)
- } else {
- fragmentASTs[e.kind] = e.ast
- }
- }
- incompleteFragments = remainingFragments
- if len(incompleteFragments) == lastIncompCount {
- for _, e := range incompleteFragments {
- perrs = append(perrs, &ParseError{
- Cause: fmt.Errorf("%v has an undefined fragment or a cycle", e.kind),
- })
- }
- break
- }
- }
- if len(perrs) > 0 {
- return nil, &ParseErrors{
- Errors: perrs,
- }
- }
-
- return fragmentASTs, nil
-}
-
-func parseRegexp(pats []*patternEntry, fragmentASTs map[string]astNode) (astNode, error) {
- symPos := symbolPositionMin
- var root astNode
- var perrs []*ParseError
-
- for _, pat := range pats {
- if pat.id == spec.LexModeKindIDNil {
- continue
- }
-
- p := newParser(bytes.NewReader(pat.pattern))
- ast, err := p.parse()
- if err != nil {
- perrs = append(perrs, &ParseError{
- ID: pat.id,
- Pattern: pat.pattern,
- Cause: err,
- Details: p.errMsgDetails,
- })
- continue
- }
- remains := applyFragments(ast, fragmentASTs)
- if len(remains) > 0 {
- perrs = append(perrs, &ParseError{
- ID: pat.id,
- Pattern: pat.pattern,
- Cause: fmt.Errorf("undefined fragment: %+v", remains),
- })
- continue
- }
- ast = newConcatNode(ast, newEndMarkerNode(pat.id))
- symPos, err = positionSymbols(ast, symPos)
- if err != nil {
- perrs = append(perrs, &ParseError{
- ID: pat.id,
- Pattern: pat.pattern,
- Cause: err,
- Details: p.errMsgDetails,
- })
- continue
- }
- root = genAltNode(root, ast)
- }
- if len(perrs) > 0 {
- return nil, &ParseErrors{
- Errors: perrs,
- }
- }
-
- return root, nil
-}
-
-func applyFragments(ast astNode, fragments map[string]astNode) []string {
- if ast == nil {
- return nil
- }
- n, ok := ast.(*fragmentNode)
- if !ok {
- var remains []string
- left, right := ast.children()
- r := applyFragments(left, fragments)
- if len(r) > 0 {
- remains = append(remains, r...)
- }
- r = applyFragments(right, fragments)
- if len(r) > 0 {
- remains = append(remains, r...)
- }
- return remains
- }
- f, ok := fragments[n.symbol]
- if !ok {
- return []string{n.symbol}
- }
- n.left = copyAST(f)
- return nil
-}
-
-type parser struct {
- lex *lexer
- peekedTok *token
- lastTok *token
- incomplete bool
- errMsgDetails string
-
- // If and only if isContributoryPropertyExposed is true, the parser interprets contributory properties that
- // appear in property expressions.
- //
- // The contributory properties are not exposed, and users cannot use those properties because the parser
- // follows [UAX #44 5.13 Property APIs]. For instance, \p{Other_Alphabetic} is invalid.
- //
- // isContributoryPropertyExposed is set to true when the parser is generated recursively. The parser needs to
- // interpret derived properties internally because the derived properties consist of other properties that
- // may contain the contributory properties.
- //
- // [UAX #44 5.13 Property APIs] says:
- // > The following subtypes of Unicode character properties should generally not be exposed in APIs,
- // > except in limited circumstances. They may not be useful, particularly in public API collections,
- // > and may instead prove misleading to the users of such API collections.
- // > * Contributory properties are not recommended for public APIs.
- // > ...
- // https://unicode.org/reports/tr44/#Property_APIs
- isContributoryPropertyExposed bool
-}
-
-func newParser(src io.Reader) *parser {
- return &parser{
- lex: newLexer(src),
- isContributoryPropertyExposed: false,
- }
-}
-
-func (p *parser) exposeContributoryProperty() {
- p.isContributoryPropertyExposed = true
-}
-
-func (p *parser) parse() (ast astNode, retErr error) {
- defer func() {
- err := recover()
- if err != nil {
- var ok bool
- retErr, ok = err.(error)
- if !ok {
- retErr = fmt.Errorf("%v", err)
- }
- return
- }
- }()
-
- ast, err := p.parseRegexp()
- if err != nil {
- return nil, err
- }
-
- return ast, nil
-}
-
-func (p *parser) parseRegexp() (astNode, error) {
- alt := p.parseAlt()
- if alt == nil {
- if p.consume(tokenKindGroupClose) {
- raiseSyntaxError(synErrGroupNoInitiator)
- }
- raiseSyntaxError(synErrNullPattern)
- }
- if p.consume(tokenKindGroupClose) {
- raiseSyntaxError(synErrGroupNoInitiator)
- }
- p.expect(tokenKindEOF)
- return alt, nil
-}
-
-func (p *parser) parseAlt() astNode {
- left := p.parseConcat()
- if left == nil {
- if p.consume(tokenKindAlt) {
- raiseSyntaxError(synErrAltLackOfOperand)
- }
- return nil
- }
- for {
- if !p.consume(tokenKindAlt) {
- break
- }
- right := p.parseConcat()
- if right == nil {
- raiseSyntaxError(synErrAltLackOfOperand)
- }
- left = newAltNode(left, right)
- }
- return left
-}
-
-func (p *parser) parseConcat() astNode {
- left := p.parseRepeat()
- for {
- right := p.parseRepeat()
- if right == nil {
- break
- }
- left = newConcatNode(left, right)
- }
- return left
-}
-
-func (p *parser) parseRepeat() astNode {
- group := p.parseGroup()
- if group == nil {
- if p.consume(tokenKindRepeat) {
- p.errMsgDetails = "* needs an operand"
- raiseSyntaxError(synErrRepNoTarget)
- }
- if p.consume(tokenKindRepeatOneOrMore) {
- p.errMsgDetails = "+ needs an operand"
- raiseSyntaxError(synErrRepNoTarget)
- }
- if p.consume(tokenKindOption) {
- p.errMsgDetails = "? needs an operand"
- raiseSyntaxError(synErrRepNoTarget)
- }
- return nil
- }
- if p.consume(tokenKindRepeat) {
- return newRepeatNode(group)
- }
- if p.consume(tokenKindRepeatOneOrMore) {
- return newRepeatOneOrMoreNode(group)
- }
- if p.consume(tokenKindOption) {
- return newOptionNode(group)
- }
- return group
-}
-
-func (p *parser) parseGroup() astNode {
- if p.consume(tokenKindGroupOpen) {
- alt := p.parseAlt()
- if alt == nil {
- if p.consume(tokenKindEOF) {
- raiseSyntaxError(synErrGroupUnclosed)
- }
- raiseSyntaxError(synErrGroupNoElem)
- }
- if p.consume(tokenKindEOF) {
- raiseSyntaxError(synErrGroupUnclosed)
- }
- if !p.consume(tokenKindGroupClose) {
- raiseSyntaxError(synErrGroupInvalidForm)
- }
- return alt
- }
- return p.parseSingleChar()
-}
-
-func (p *parser) parseSingleChar() astNode {
- if p.consume(tokenKindAnyChar) {
- return genAnyCharAST()
- }
- if p.consume(tokenKindBExpOpen) {
- left := p.parseBExpElem()
- if left == nil {
- if p.consume(tokenKindEOF) {
- raiseSyntaxError(synErrBExpUnclosed)
- }
- raiseSyntaxError(synErrBExpNoElem)
- }
- for {
- right := p.parseBExpElem()
- if right == nil {
- break
- }
- left = newAltNode(left, right)
- }
- if p.consume(tokenKindEOF) {
- raiseSyntaxError(synErrBExpUnclosed)
- }
- p.expect(tokenKindBExpClose)
- return left
- }
- if p.consume(tokenKindInverseBExpOpen) {
- elem := p.parseBExpElem()
- if elem == nil {
- if p.consume(tokenKindEOF) {
- raiseSyntaxError(synErrBExpUnclosed)
- }
- raiseSyntaxError(synErrBExpNoElem)
- }
- inverse := exclude(elem, genAnyCharAST())
- if inverse == nil {
- panic(fmt.Errorf("a pattern that isn't matching any symbols"))
- }
- for {
- elem := p.parseBExpElem()
- if elem == nil {
- break
- }
- inverse = exclude(elem, inverse)
- if inverse == nil {
- panic(fmt.Errorf("a pattern that isn't matching any symbols"))
- }
- }
- if p.consume(tokenKindEOF) {
- raiseSyntaxError(synErrBExpUnclosed)
- }
- p.expect(tokenKindBExpClose)
- return inverse
- }
- if p.consume(tokenKindCodePointLeader) {
- return p.parseCodePoint()
- }
- if p.consume(tokenKindCharPropLeader) {
- return p.parseCharProp()
- }
- if p.consume(tokenKindFragmentLeader) {
- return p.parseFragment()
- }
- c := p.parseNormalChar()
- if c == nil {
- if p.consume(tokenKindBExpClose) {
- raiseSyntaxError(synErrBExpInvalidForm)
- }
- return nil
- }
- return c
-}
-
-func (p *parser) parseBExpElem() astNode {
- if p.consume(tokenKindCodePointLeader) {
- return p.parseCodePoint()
- }
- if p.consume(tokenKindCharPropLeader) {
- return p.parseCharProp()
- }
- left := p.parseNormalChar()
- if left == nil {
- return nil
- }
- if !p.consume(tokenKindCharRange) {
- return left
- }
- right := p.parseNormalChar()
- if right == nil {
- panic(fmt.Errorf("invalid range expression"))
- }
- from := genByteSeq(left)
- to := genByteSeq(right)
- if !isValidOrder(from, to) {
- p.errMsgDetails = fmt.Sprintf("[%s-%s] ([%v-%v])", string(from), string(to), from, to)
- raiseSyntaxError(synErrRangeInvalidOrder)
- }
- return genRangeAST(left, right)
-}
-
-func (p *parser) parseCodePoint() astNode {
- if !p.consume(tokenKindLBrace) {
- raiseSyntaxError(synErrCPExpInvalidForm)
- }
- if !p.consume(tokenKindCodePoint) {
- raiseSyntaxError(synErrCPExpInvalidForm)
- }
-
- var cp []byte
- {
- // Although hex.DecodeString method can handle only a hex string that has even length,
- // `codePoint` always has even length by the lexical specification.
- b, err := hex.DecodeString(p.lastTok.codePoint)
- if err != nil {
- panic(fmt.Errorf("failed to decode a code point (%v) into a byte slice: %v", p.lastTok.codePoint, err))
- }
- // `b` must be 4 bytes to convert it into a 32-bit integer.
- l := len(b)
- for i := 0; i < 4-l; i++ {
- b = append([]byte{0}, b...)
- }
- n := binary.BigEndian.Uint32(b)
- if n < 0x0000 || n > 0x10FFFF {
- raiseSyntaxError(synErrCPExpOutOfRange)
- }
-
- cp = []byte(string(rune(n)))
- }
-
- var concat astNode
- {
- concat = newSymbolNode(cp[0])
- for _, b := range cp[1:] {
- concat = genConcatNode(
- concat,
- newSymbolNode(b),
- )
- }
- }
-
- if !p.consume(tokenKindRBrace) {
- raiseSyntaxError(synErrCPExpInvalidForm)
- }
-
- return concat
-}
-
-func (p *parser) parseCharProp() astNode {
- if !p.consume(tokenKindLBrace) {
- raiseSyntaxError(synErrCharPropExpInvalidForm)
- }
- var sym1, sym2 string
- if !p.consume(tokenKindCharPropSymbol) {
- raiseSyntaxError(synErrCharPropExpInvalidForm)
- }
- sym1 = p.lastTok.propSymbol
- if p.consume(tokenKindEqual) {
- if !p.consume(tokenKindCharPropSymbol) {
- raiseSyntaxError(synErrCharPropExpInvalidForm)
- }
- sym2 = p.lastTok.propSymbol
- }
-
- var alt astNode
- var propName, propVal string
- if sym2 != "" {
- propName = sym1
- propVal = sym2
- } else {
- propName = ""
- propVal = sym1
- }
- if !p.isContributoryPropertyExposed && ucd.IsContributoryProperty(propName) {
- p.errMsgDetails = propName
- raiseSyntaxError(synErrCharPropUnsupported)
- }
- pat, err := ucd.NormalizeCharacterProperty(propName, propVal)
- if err != nil {
- p.errMsgDetails = fmt.Sprintf("%v", err)
- raiseSyntaxError(synErrCharPropUnsupported)
- }
- if pat != "" {
- p := newParser(bytes.NewReader([]byte(pat)))
- p.exposeContributoryProperty()
- ast, err := p.parse()
- if err != nil {
- panic(err)
- }
- alt = ast
- } else {
- cpRanges, inverse, err := ucd.FindCodePointRanges(propName, propVal)
- if err != nil {
- p.errMsgDetails = fmt.Sprintf("%v", err)
- raiseSyntaxError(synErrCharPropUnsupported)
- }
- if inverse {
- r := cpRanges[0]
- from := genNormalCharAST(r.From)
- to := genNormalCharAST(r.To)
- alt = exclude(genRangeAST(from, to), genAnyCharAST())
- if alt == nil {
- panic(fmt.Errorf("a pattern that isn't matching any symbols"))
- }
- for _, r := range cpRanges[1:] {
- from := genNormalCharAST(r.From)
- to := genNormalCharAST(r.To)
- alt = exclude(genRangeAST(from, to), alt)
- if alt == nil {
- panic(fmt.Errorf("a pattern that isn't matching any symbols"))
- }
- }
- } else {
- for _, r := range cpRanges {
- from := genNormalCharAST(r.From)
- to := genNormalCharAST(r.To)
- alt = genAltNode(
- alt,
- genRangeAST(from, to),
- )
- }
- }
- }
-
- if !p.consume(tokenKindRBrace) {
- raiseSyntaxError(synErrCharPropExpInvalidForm)
- }
-
- return alt
-}
-
-func (p *parser) parseFragment() astNode {
- if !p.consume(tokenKindLBrace) {
- raiseSyntaxError(synErrFragmentExpInvalidForm)
- }
- if !p.consume(tokenKindFragmentSymbol) {
- raiseSyntaxError(synErrFragmentExpInvalidForm)
- }
- sym := p.lastTok.fragmentSymbol
-
- if !p.consume(tokenKindRBrace) {
- raiseSyntaxError(synErrFragmentExpInvalidForm)
- }
-
- p.incomplete = true
-
- return newFragmentNode(sym, nil)
-}
-
-func (p *parser) parseNormalChar() astNode {
- if !p.consume(tokenKindChar) {
- return nil
- }
- return genNormalCharAST(p.lastTok.char)
-}
-
-func genNormalCharAST(c rune) astNode {
- b := []byte(string(c))
- switch len(b) {
- case 1:
- return newSymbolNode(b[0])
- case 2:
- return genConcatNode(
- newSymbolNode(b[0]),
- newSymbolNode(b[1]),
- )
- case 3:
- return genConcatNode(
- newSymbolNode(b[0]),
- newSymbolNode(b[1]),
- newSymbolNode(b[2]),
- )
- default: // is equivalent to case 4
- return genConcatNode(
- newSymbolNode(b[0]),
- newSymbolNode(b[1]),
- newSymbolNode(b[2]),
- newSymbolNode(b[3]),
- )
- }
-}
-
-func exclude(symbol, base astNode) astNode {
- if alt, ok := symbol.(*altNode); ok {
- return exclude(alt.right, exclude(alt.left, base))
- }
-
- switch base.(type) {
- case *altNode:
- left, right := base.children()
- return genAltNode(
- exclude(symbol, left),
- exclude(symbol, right),
- )
- case *concatNode:
- baseSeq := genByteRangeSeq(base)
- symSeq := genByteRangeSeq(symbol)
- excluded := excludeByteRangeSequence(symSeq, baseSeq)
- if len(excluded) <= 0 {
- return nil
- }
- return convertByteRangeSeqsToAST(excluded)
- case *symbolNode:
- baseSeq := genByteRangeSeq(base)
- symSeq := genByteRangeSeq(symbol)
- excluded := excludeByteRangeSequence(symSeq, baseSeq)
- if len(excluded) <= 0 {
- return nil
- }
- return convertByteRangeSeqsToAST(excluded)
- }
- return nil
-}
-
-func convertByteRangeSeqsToAST(seqs [][]byteRange) astNode {
- concats := []astNode{}
- for _, seq := range seqs {
- syms := []astNode{}
- for _, elem := range seq {
- syms = append(syms, newRangeSymbolNode(elem.from, elem.to))
- }
- concats = append(concats, genConcatNode(syms...))
- }
- return genAltNode(concats...)
-}
-
-func genAnyCharAST() astNode {
- return convertCharBlocksToAST(utf8.AllCharBlocks())
-}
-
-func genRangeAST(fromNode, toNode astNode) astNode {
- from := genByteSeq(fromNode)
- to := genByteSeq(toNode)
- blks, err := utf8.GenCharBlocks(from, to)
- if err != nil {
- panic(err)
- }
- return convertCharBlocksToAST(blks)
-}
-
-func convertCharBlocksToAST(blks []*utf8.CharBlock) astNode {
- var alt astNode
- for _, blk := range blks {
- r := make([]astNode, len(blk.From))
- for i := 0; i < len(blk.From); i++ {
- r[i] = newRangeSymbolNode(blk.From[i], blk.To[i])
- }
- alt = genAltNode(alt, genConcatNode(r...))
- }
- return alt
-}
-
-func genByteSeq(node astNode) []byte {
- switch n := node.(type) {
- case *symbolNode:
- return []byte{n.from}
- case *concatNode:
- seq := genByteSeq(n.left)
- seq = append(seq, genByteSeq(n.right)...)
- return seq
- }
- panic(fmt.Errorf("genByteSeq() cannot handle %T: %v", node, node))
-}
-
-func genByteRangeSeq(node astNode) []byteRange {
- switch n := node.(type) {
- case *symbolNode:
- return []byteRange{{from: n.from, to: n.to}}
- case *concatNode:
- seq := genByteRangeSeq(n.left)
- seq = append(seq, genByteRangeSeq(n.right)...)
- return seq
- }
- panic(fmt.Errorf("genByteRangeSeq() cannot handle %T: %v", node, node))
-}
-
-func isValidOrder(from, to []byte) bool {
- if len(from) > len(to) {
- return false
- }
- if len(from) < len(to) {
- return true
- }
- for i, f := range from {
- t := to[i]
- if f > t {
- return false
- }
- if f < t {
- return true
- }
- }
- return true
-}
-
-func genConcatNode(cs ...astNode) astNode {
- if len(cs) <= 0 {
- return nil
- }
- if len(cs) == 1 {
- return cs[0]
- }
- concat := newConcatNode(cs[0], cs[1])
- for _, c := range cs[2:] {
- concat = newConcatNode(concat, c)
- }
- return concat
-}
-
-func genAltNode(cs ...astNode) astNode {
- nonNilNodes := []astNode{}
- for _, c := range cs {
- if c == nil {
- continue
- }
- nonNilNodes = append(nonNilNodes, c)
- }
- if len(nonNilNodes) <= 0 {
- return nil
- }
- if len(nonNilNodes) == 1 {
- return nonNilNodes[0]
- }
- alt := newAltNode(nonNilNodes[0], nonNilNodes[1])
- for _, c := range nonNilNodes[2:] {
- alt = newAltNode(alt, c)
- }
- return alt
-}
-
-func (p *parser) expect(expected tokenKind) {
- if !p.consume(expected) {
- tok := p.peekedTok
- p.errMsgDetails = fmt.Sprintf("unexpected token; expected: %v, actual: %v", expected, tok.kind)
- raiseSyntaxError(synErrUnexpectedToken)
- }
-}
-
-func (p *parser) consume(expected tokenKind) bool {
- var tok *token
- var err error
- if p.peekedTok != nil {
- tok = p.peekedTok
- p.peekedTok = nil
- } else {
- tok, err = p.lex.next()
- if err != nil {
- p.errMsgDetails = p.lex.errMsgDetails
- panic(err)
- }
- }
- p.lastTok = tok
- if tok.kind == expected {
- return true
- }
- p.peekedTok = tok
- p.lastTok = nil
-
- return false
-}
diff --git a/compiler/parser_test.go b/compiler/parser_test.go
deleted file mode 100644
index b0bc67a..0000000
--- a/compiler/parser_test.go
+++ /dev/null
@@ -1,1422 +0,0 @@
-package compiler
-
-import (
- "fmt"
- "reflect"
- "testing"
-
- "github.com/nihei9/maleeni/spec"
- "github.com/nihei9/maleeni/ucd"
-)
-
-func symPos(n uint16) symbolPosition {
- pos, err := newSymbolPosition(n, false)
- if err != nil {
- panic(err)
- }
- return pos
-}
-
-func endPos(n uint16) symbolPosition {
- pos, err := newSymbolPosition(n, true)
- if err != nil {
- panic(err)
- }
- return pos
-}
-
-func TestParse(t *testing.T) {
- tests := []struct {
- pattern string
- fragments map[string]string
- ast astNode
- syntaxError *SyntaxError
-
- // When an AST is large, as patterns containing a character property expression,
- // this test only checks that the pattern is parsable.
- // The check of the validity of such AST is performed by checking that it can be matched correctly using the driver.
- skipTestAST bool
- }{
- {
- pattern: "a",
- ast: genConcatNode(
- newSymbolNodeWithPos(byte('a'), symPos(1)),
- newEndMarkerNodeWithPos(1, endPos(2)),
- ),
- },
- {
- pattern: "abc",
- ast: genConcatNode(
- genConcatNode(
- newSymbolNodeWithPos(byte('a'), symPos(1)),
- newSymbolNodeWithPos(byte('b'), symPos(2)),
- newSymbolNodeWithPos(byte('c'), symPos(3)),
- ),
- newEndMarkerNodeWithPos(1, endPos(4)),
- ),
- },
- {
- pattern: "a?",
- ast: genConcatNode(
- newOptionNode(
- newSymbolNodeWithPos(byte('a'), symPos(1)),
- ),
- newEndMarkerNodeWithPos(1, endPos(2)),
- ),
- },
- {
- pattern: "[abc]?",
- ast: genConcatNode(
- newOptionNode(
- genAltNode(
- newSymbolNodeWithPos(byte('a'), symPos(1)),
- newSymbolNodeWithPos(byte('b'), symPos(2)),
- newSymbolNodeWithPos(byte('c'), symPos(3)),
- ),
- ),
- newEndMarkerNodeWithPos(1, endPos(4)),
- ),
- },
- {
- pattern: "\\u{3042}?",
- ast: genConcatNode(
- newOptionNode(
- genConcatNode(
- newSymbolNodeWithPos(0xE3, symPos(1)),
- newSymbolNodeWithPos(0x81, symPos(2)),
- newSymbolNodeWithPos(0x82, symPos(3)),
- ),
- ),
- newEndMarkerNodeWithPos(1, endPos(4)),
- ),
- },
- {
- pattern: "\\p{Letter}?",
- skipTestAST: true,
- },
- {
- pattern: "\\f{a2c}?",
- fragments: map[string]string{
- "a2c": "abc",
- },
- ast: genConcatNode(
- newOptionNode(
- newFragmentNode("a2c",
- genConcatNode(
- newSymbolNodeWithPos(byte('a'), symPos(1)),
- newSymbolNodeWithPos(byte('b'), symPos(2)),
- newSymbolNodeWithPos(byte('c'), symPos(3)),
- ),
- ),
- ),
- newEndMarkerNodeWithPos(1, endPos(4)),
- ),
- },
- {
- pattern: "(a)?",
- ast: genConcatNode(
- newOptionNode(
- newSymbolNodeWithPos(byte('a'), symPos(1)),
- ),
- newEndMarkerNodeWithPos(1, endPos(2)),
- ),
- },
- {
- pattern: "((a?)?)?",
- ast: genConcatNode(
- newOptionNode(
- newOptionNode(
- newOptionNode(
- newSymbolNodeWithPos(byte('a'), symPos(1)),
- ),
- ),
- ),
- newEndMarkerNodeWithPos(1, endPos(2)),
- ),
- },
- {
- pattern: "(abc)?",
- ast: genConcatNode(
- newOptionNode(
- genConcatNode(
- newSymbolNodeWithPos(byte('a'), symPos(1)),
- newSymbolNodeWithPos(byte('b'), symPos(2)),
- newSymbolNodeWithPos(byte('c'), symPos(3)),
- ),
- ),
- newEndMarkerNodeWithPos(1, endPos(4)),
- ),
- },
- {
- pattern: "(a|b)?",
- ast: genConcatNode(
- newOptionNode(
- genAltNode(
- newSymbolNodeWithPos(byte('a'), symPos(1)),
- newSymbolNodeWithPos(byte('b'), symPos(2)),
- ),
- ),
- newEndMarkerNodeWithPos(1, endPos(3)),
- ),
- },
- {
- pattern: "?",
- syntaxError: synErrRepNoTarget,
- },
- {
- pattern: "(?)",
- syntaxError: synErrRepNoTarget,
- },
- {
- pattern: "a|?",
- syntaxError: synErrRepNoTarget,
- },
- {
- pattern: "?|b",
- syntaxError: synErrRepNoTarget,
- },
- {
- pattern: "a??",
- syntaxError: synErrRepNoTarget,
- },
- {
- pattern: "a*",
- ast: genConcatNode(
- newRepeatNode(
- newSymbolNodeWithPos(byte('a'), 1),
- ),
- newEndMarkerNodeWithPos(1, endPos(2)),
- ),
- },
- {
- pattern: "[abc]*",
- ast: genConcatNode(
- newRepeatNode(
- genAltNode(
- newSymbolNodeWithPos(byte('a'), 1),
- newSymbolNodeWithPos(byte('b'), 2),
- newSymbolNodeWithPos(byte('c'), 3),
- ),
- ),
- newEndMarkerNodeWithPos(1, endPos(4)),
- ),
- },
- {
- pattern: "\\u{3042}*",
- ast: genConcatNode(
- newRepeatNode(
- genConcatNode(
- newSymbolNodeWithPos(0xE3, symPos(1)),
- newSymbolNodeWithPos(0x81, symPos(2)),
- newSymbolNodeWithPos(0x82, symPos(3)),
- ),
- ),
- newEndMarkerNodeWithPos(1, endPos(4)),
- ),
- },
- {
- pattern: "\\p{Letter}*",
- skipTestAST: true,
- },
- {
- pattern: "\\f{a2c}*",
- fragments: map[string]string{
- "a2c": "abc",
- },
- ast: genConcatNode(
- newRepeatNode(
- newFragmentNode("a2c",
- genConcatNode(
- newSymbolNodeWithPos(byte('a'), symPos(1)),
- newSymbolNodeWithPos(byte('b'), symPos(2)),
- newSymbolNodeWithPos(byte('c'), symPos(3)),
- ),
- ),
- ),
- newEndMarkerNodeWithPos(1, endPos(4)),
- ),
- },
- {
- pattern: "((a*)*)*",
- ast: genConcatNode(
- newRepeatNode(
- newRepeatNode(
- newRepeatNode(
- newSymbolNodeWithPos(byte('a'), 1),
- ),
- ),
- ),
- newEndMarkerNodeWithPos(1, endPos(2)),
- ),
- },
- {
- pattern: "(abc)*",
- ast: genConcatNode(
- newRepeatNode(
- genConcatNode(
- newSymbolNodeWithPos(byte('a'), 1),
- newSymbolNodeWithPos(byte('b'), 2),
- newSymbolNodeWithPos(byte('c'), 3),
- ),
- ),
- newEndMarkerNodeWithPos(1, endPos(4)),
- ),
- },
- {
- pattern: "(a|b)*",
- ast: genConcatNode(
- newRepeatNode(
- genAltNode(
- newSymbolNodeWithPos(byte('a'), 1),
- newSymbolNodeWithPos(byte('b'), 2),
- ),
- ),
- newEndMarkerNodeWithPos(1, endPos(3)),
- ),
- },
- {
- pattern: "*",
- syntaxError: synErrRepNoTarget,
- },
- {
- pattern: "(*)",
- syntaxError: synErrRepNoTarget,
- },
- {
- pattern: "a|*",
- syntaxError: synErrRepNoTarget,
- },
- {
- pattern: "*|b",
- syntaxError: synErrRepNoTarget,
- },
- {
- pattern: "a**",
- syntaxError: synErrRepNoTarget,
- },
- {
- pattern: "a+",
- ast: genConcatNode(
- newSymbolNodeWithPos(byte('a'), symPos(1)),
- newRepeatNode(
- newSymbolNodeWithPos(byte('a'), symPos(2)),
- ),
- newEndMarkerNodeWithPos(1, endPos(3)),
- ),
- },
- {
- pattern: "[abc]+",
- ast: genConcatNode(
- genAltNode(
- newSymbolNodeWithPos(byte('a'), symPos(1)),
- newSymbolNodeWithPos(byte('b'), symPos(2)),
- newSymbolNodeWithPos(byte('c'), symPos(3)),
- ),
- newRepeatNode(
- genAltNode(
- newSymbolNodeWithPos(byte('a'), symPos(4)),
- newSymbolNodeWithPos(byte('b'), symPos(5)),
- newSymbolNodeWithPos(byte('c'), symPos(6)),
- ),
- ),
- newEndMarkerNodeWithPos(1, endPos(7)),
- ),
- },
- {
- pattern: "\\u{3042}+",
- ast: genConcatNode(
- genConcatNode(
- newSymbolNodeWithPos(0xE3, symPos(1)),
- newSymbolNodeWithPos(0x81, symPos(2)),
- newSymbolNodeWithPos(0x82, symPos(3)),
- ),
- newRepeatNode(
- genConcatNode(
- newSymbolNodeWithPos(0xE3, symPos(4)),
- newSymbolNodeWithPos(0x81, symPos(5)),
- newSymbolNodeWithPos(0x82, symPos(6)),
- ),
- ),
- newEndMarkerNodeWithPos(1, endPos(7)),
- ),
- },
- {
- pattern: "\\p{Letter}+",
- skipTestAST: true,
- },
- {
- pattern: "\\f{a2c}+",
- fragments: map[string]string{
- "a2c": "abc",
- },
- ast: genConcatNode(
- newFragmentNode("a2c",
- genConcatNode(
- newSymbolNodeWithPos(byte('a'), symPos(1)),
- newSymbolNodeWithPos(byte('b'), symPos(2)),
- newSymbolNodeWithPos(byte('c'), symPos(3)),
- ),
- ),
- newRepeatNode(
- newFragmentNode("a2c",
- genConcatNode(
- newSymbolNodeWithPos(byte('a'), symPos(4)),
- newSymbolNodeWithPos(byte('b'), symPos(5)),
- newSymbolNodeWithPos(byte('c'), symPos(6)),
- ),
- ),
- ),
- newEndMarkerNodeWithPos(1, endPos(7)),
- ),
- },
- {
- pattern: "((a+)+)+",
- ast: genConcatNode(
- genConcatNode(
- genConcatNode(
- genConcatNode(
- newSymbolNodeWithPos(byte('a'), symPos(1)),
- newRepeatNode(
- newSymbolNodeWithPos(byte('a'), symPos(2)),
- ),
- ),
- newRepeatNode(
- genConcatNode(
- newSymbolNodeWithPos(byte('a'), symPos(3)),
- newRepeatNode(
- newSymbolNodeWithPos(byte('a'), symPos(4)),
- ),
- ),
- ),
- ),
- newRepeatNode(
- genConcatNode(
- genConcatNode(
- newSymbolNodeWithPos(byte('a'), symPos(5)),
- newRepeatNode(
- newSymbolNodeWithPos(byte('a'), symPos(6)),
- ),
- ),
- newRepeatNode(
- genConcatNode(
- newSymbolNodeWithPos(byte('a'), symPos(7)),
- newRepeatNode(
- newSymbolNodeWithPos(byte('a'), symPos(8)),
- ),
- ),
- ),
- ),
- ),
- ),
- newEndMarkerNodeWithPos(1, endPos(9)),
- ),
- },
- {
- pattern: "(abc)+",
- ast: genConcatNode(
- genConcatNode(
- newSymbolNodeWithPos(byte('a'), symPos(1)),
- newSymbolNodeWithPos(byte('b'), symPos(2)),
- newSymbolNodeWithPos(byte('c'), symPos(3)),
- ),
- newRepeatNode(
- genConcatNode(
- newSymbolNodeWithPos(byte('a'), symPos(4)),
- newSymbolNodeWithPos(byte('b'), symPos(5)),
- newSymbolNodeWithPos(byte('c'), symPos(6)),
- ),
- ),
- newEndMarkerNodeWithPos(1, endPos(7)),
- ),
- },
- {
- pattern: "(a|b)+",
- ast: genConcatNode(
- genAltNode(
- newSymbolNodeWithPos(byte('a'), symPos(1)),
- newSymbolNodeWithPos(byte('b'), symPos(2)),
- ),
- newRepeatNode(
- genAltNode(
- newSymbolNodeWithPos(byte('a'), symPos(3)),
- newSymbolNodeWithPos(byte('b'), symPos(4)),
- ),
- ),
- newEndMarkerNodeWithPos(1, endPos(5)),
- ),
- },
- {
- pattern: "+",
- syntaxError: synErrRepNoTarget,
- },
- {
- pattern: "(+)",
- syntaxError: synErrRepNoTarget,
- },
- {
- pattern: "a|+",
- syntaxError: synErrRepNoTarget,
- },
- {
- pattern: "+|b",
- syntaxError: synErrRepNoTarget,
- },
- {
- pattern: "a++",
- syntaxError: synErrRepNoTarget,
- },
- {
- pattern: ".",
- ast: newConcatNode(
- genAltNode(
- newRangeSymbolNodeWithPos(0x00, 0x7f, symPos(1)),
- genConcatNode(
- newRangeSymbolNodeWithPos(0xc2, 0xdf, symPos(2)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(3)),
- ),
- genConcatNode(
- newRangeSymbolNodeWithPos(0xe0, 0xe0, symPos(4)),
- newRangeSymbolNodeWithPos(0xa0, 0xbf, symPos(5)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(6)),
- ),
- genConcatNode(
- newRangeSymbolNodeWithPos(0xe1, 0xec, symPos(7)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(8)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(9)),
- ),
- genConcatNode(
- newRangeSymbolNodeWithPos(0xed, 0xed, symPos(10)),
- newRangeSymbolNodeWithPos(0x80, 0x9f, symPos(11)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(12)),
- ),
- genConcatNode(
- newRangeSymbolNodeWithPos(0xee, 0xef, symPos(13)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(14)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(15)),
- ),
- genConcatNode(
- newRangeSymbolNodeWithPos(0xf0, 0xf0, symPos(16)),
- newRangeSymbolNodeWithPos(0x90, 0xbf, symPos(17)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(18)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(19)),
- ),
- genConcatNode(
- newRangeSymbolNodeWithPos(0xf1, 0xf3, symPos(20)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(21)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(22)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(23)),
- ),
- genConcatNode(
- newRangeSymbolNodeWithPos(0xf4, 0xf4, symPos(24)),
- newRangeSymbolNodeWithPos(0x80, 0x8f, symPos(25)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(26)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(27)),
- ),
- ),
- newEndMarkerNodeWithPos(1, endPos(28)),
- ),
- },
- {
- pattern: "[a]",
- ast: newConcatNode(
- newSymbolNodeWithPos(byte('a'), symPos(1)),
- newEndMarkerNodeWithPos(1, endPos(2)),
- ),
- },
- {
- pattern: "[abc]",
- ast: newConcatNode(
- genAltNode(
- newSymbolNodeWithPos(byte('a'), symPos(1)),
- newSymbolNodeWithPos(byte('b'), symPos(2)),
- newSymbolNodeWithPos(byte('c'), symPos(3)),
- ),
- newEndMarkerNodeWithPos(1, endPos(4)),
- ),
- },
- {
- pattern: "[a-z]",
- ast: newConcatNode(
- newRangeSymbolNodeWithPos(byte('a'), byte('z'), symPos(1)),
- newEndMarkerNodeWithPos(1, endPos(2)),
- ),
- },
- {
- pattern: "[A-Za-z]",
- ast: newConcatNode(
- genAltNode(
- newRangeSymbolNodeWithPos(byte('A'), byte('Z'), symPos(1)),
- newRangeSymbolNodeWithPos(byte('a'), byte('z'), symPos(2)),
- ),
- newEndMarkerNodeWithPos(1, endPos(3)),
- ),
- },
- {
- pattern: "[\\u{004E}]",
- ast: newConcatNode(
- newSymbolNodeWithPos(byte('N'), symPos(1)),
- newEndMarkerNodeWithPos(1, endPos(2)),
- ),
- },
- {
- pattern: "[\\p{Lu}]",
- skipTestAST: true,
- },
- {
- pattern: "a[]",
- syntaxError: synErrBExpNoElem,
- },
- {
- pattern: "[]a",
- syntaxError: synErrBExpNoElem,
- },
- {
- pattern: "[]",
- syntaxError: synErrBExpNoElem,
- },
- {
- pattern: "[^]",
- ast: newConcatNode(
- newSymbolNodeWithPos(byte('^'), symPos(1)),
- newEndMarkerNodeWithPos(1, endPos(2)),
- ),
- },
- {
- pattern: "[",
- syntaxError: synErrBExpUnclosed,
- },
- {
- pattern: "([",
- syntaxError: synErrBExpUnclosed,
- },
- {
- pattern: "[a",
- syntaxError: synErrBExpUnclosed,
- },
- {
- pattern: "([a",
- syntaxError: synErrBExpUnclosed,
- },
- {
- pattern: "[a-",
- syntaxError: synErrBExpUnclosed,
- },
- {
- pattern: "([a-",
- syntaxError: synErrBExpUnclosed,
- },
- {
- pattern: "[^",
- syntaxError: synErrBExpUnclosed,
- },
- {
- pattern: "([^",
- syntaxError: synErrBExpUnclosed,
- },
- {
- pattern: "[^a",
- syntaxError: synErrBExpUnclosed,
- },
- {
- pattern: "([^a",
- syntaxError: synErrBExpUnclosed,
- },
- {
- pattern: "[^a-",
- syntaxError: synErrBExpUnclosed,
- },
- {
- pattern: "([^a-",
- syntaxError: synErrBExpUnclosed,
- },
- {
- pattern: "]",
- ast: newConcatNode(
- newSymbolNodeWithPos(byte(']'), symPos(1)),
- newEndMarkerNodeWithPos(1, endPos(2)),
- ),
- },
- {
- pattern: "(]",
- syntaxError: synErrGroupUnclosed,
- },
- {
- pattern: "a]",
- ast: newConcatNode(
- genConcatNode(
- newSymbolNodeWithPos(byte('a'), symPos(1)),
- newSymbolNodeWithPos(byte(']'), symPos(2)),
- ),
- newEndMarkerNodeWithPos(1, endPos(3)),
- ),
- },
- {
- pattern: "(a]",
- syntaxError: synErrGroupUnclosed,
- },
- {
- pattern: "([)",
- syntaxError: synErrBExpUnclosed,
- },
- {
- pattern: "([a)",
- syntaxError: synErrBExpUnclosed,
- },
- {
- pattern: "[a-]",
- ast: newConcatNode(
- genAltNode(
- newSymbolNodeWithPos(byte('a'), symPos(1)),
- newSymbolNodeWithPos(byte('-'), symPos(2)),
- ),
- newEndMarkerNodeWithPos(1, endPos(3)),
- ),
- },
- {
- pattern: "[^a-]",
- ast: newConcatNode(
- genAltNode(
- newRangeSymbolNodeWithPos(0x00, byte(44), symPos(1)),
- newRangeSymbolNodeWithPos(byte(46), byte(96), symPos(2)),
- newRangeSymbolNodeWithPos(byte(98), 0x7f, symPos(3)),
- genConcatNode(
- newRangeSymbolNodeWithPos(0xc2, 0xdf, symPos(4)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(5)),
- ),
- genConcatNode(
- newRangeSymbolNodeWithPos(0xe0, 0xe0, symPos(6)),
- newRangeSymbolNodeWithPos(0xa0, 0xbf, symPos(7)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(8)),
- ),
- genConcatNode(
- newRangeSymbolNodeWithPos(0xe1, 0xec, symPos(9)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(10)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(11)),
- ),
- genConcatNode(
- newRangeSymbolNodeWithPos(0xed, 0xed, symPos(12)),
- newRangeSymbolNodeWithPos(0x80, 0x9f, symPos(13)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(14)),
- ),
- genConcatNode(
- newRangeSymbolNodeWithPos(0xee, 0xef, symPos(15)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(16)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(17)),
- ),
- genConcatNode(
- newRangeSymbolNodeWithPos(0xf0, 0xf0, symPos(18)),
- newRangeSymbolNodeWithPos(0x90, 0xbf, symPos(19)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(20)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(21)),
- ),
- genConcatNode(
- newRangeSymbolNodeWithPos(0xf1, 0xf3, symPos(22)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(23)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(24)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(25)),
- ),
- genConcatNode(
- newRangeSymbolNodeWithPos(0xf4, 0xf4, symPos(26)),
- newRangeSymbolNodeWithPos(0x80, 0x8f, symPos(27)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(28)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(29)),
- ),
- ),
- newEndMarkerNodeWithPos(1, endPos(30)),
- ),
- },
- {
- pattern: "[-z]",
- ast: newConcatNode(
- genAltNode(
- newSymbolNodeWithPos(byte('-'), symPos(1)),
- newSymbolNodeWithPos(byte('z'), symPos(2)),
- ),
- newEndMarkerNodeWithPos(1, endPos(3)),
- ),
- },
- {
- pattern: "[^-z]",
- ast: newConcatNode(
- genAltNode(
- newRangeSymbolNodeWithPos(0x00, byte(44), symPos(1)),
- genAltNode(
- newRangeSymbolNodeWithPos(byte(46), byte(121), symPos(2)),
- newRangeSymbolNodeWithPos(byte(123), 0x7f, symPos(3)),
- ),
- genConcatNode(
- newRangeSymbolNodeWithPos(0xc2, 0xdf, symPos(4)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(5)),
- ),
- genConcatNode(
- newRangeSymbolNodeWithPos(0xe0, 0xe0, symPos(6)),
- newRangeSymbolNodeWithPos(0xa0, 0xbf, symPos(7)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(8)),
- ),
- genConcatNode(
- newRangeSymbolNodeWithPos(0xe1, 0xec, symPos(9)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(10)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(11)),
- ),
- genConcatNode(
- newRangeSymbolNodeWithPos(0xed, 0xed, symPos(12)),
- newRangeSymbolNodeWithPos(0x80, 0x9f, symPos(13)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(14)),
- ),
- genConcatNode(
- newRangeSymbolNodeWithPos(0xee, 0xef, symPos(15)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(16)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(17)),
- ),
- genConcatNode(
- newRangeSymbolNodeWithPos(0xf0, 0xf0, symPos(18)),
- newRangeSymbolNodeWithPos(0x90, 0xbf, symPos(19)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(20)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(21)),
- ),
- genConcatNode(
- newRangeSymbolNodeWithPos(0xf1, 0xf3, symPos(22)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(23)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(24)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(25)),
- ),
- genConcatNode(
- newRangeSymbolNodeWithPos(0xf4, 0xf4, symPos(26)),
- newRangeSymbolNodeWithPos(0x80, 0x8f, symPos(27)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(28)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(29)),
- ),
- ),
- newEndMarkerNodeWithPos(1, endPos(30)),
- ),
- },
- {
- pattern: "[-]",
- ast: newConcatNode(
- newSymbolNodeWithPos(byte('-'), symPos(1)),
- newEndMarkerNodeWithPos(1, endPos(2)),
- ),
- },
- {
- pattern: "[^-]",
- ast: newConcatNode(
- genAltNode(
- newRangeSymbolNodeWithPos(0x00, byte(44), symPos(1)),
- newRangeSymbolNodeWithPos(byte(46), 0x7f, symPos(2)),
- genConcatNode(
- newRangeSymbolNodeWithPos(0xc2, 0xdf, symPos(3)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(4)),
- ),
- genConcatNode(
- newRangeSymbolNodeWithPos(0xe0, 0xe0, symPos(5)),
- newRangeSymbolNodeWithPos(0xa0, 0xbf, symPos(6)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(7)),
- ),
- genConcatNode(
- newRangeSymbolNodeWithPos(0xe1, 0xec, symPos(8)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(9)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(10)),
- ),
- genConcatNode(
- newRangeSymbolNodeWithPos(0xed, 0xed, symPos(11)),
- newRangeSymbolNodeWithPos(0x80, 0x9f, symPos(12)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(13)),
- ),
- genConcatNode(
- newRangeSymbolNodeWithPos(0xee, 0xef, symPos(14)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(15)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(16)),
- ),
- genConcatNode(
- newRangeSymbolNodeWithPos(0xf0, 0xf0, symPos(17)),
- newRangeSymbolNodeWithPos(0x90, 0xbf, symPos(18)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(19)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(20)),
- ),
- genConcatNode(
- newRangeSymbolNodeWithPos(0xf1, 0xf3, symPos(21)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(22)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(23)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(24)),
- ),
- genConcatNode(
- newRangeSymbolNodeWithPos(0xf4, 0xf4, symPos(25)),
- newRangeSymbolNodeWithPos(0x80, 0x8f, symPos(26)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(27)),
- newRangeSymbolNodeWithPos(0x80, 0xbf, symPos(28)),
- ),
- ),
- newEndMarkerNodeWithPos(1, endPos(29)),
- ),
- },
- {
- pattern: "\\u{006E}",
- ast: genConcatNode(
- newSymbolNodeWithPos(0x6E, symPos(1)),
- newEndMarkerNodeWithPos(1, endPos(2)),
- ),
- },
- {
- pattern: "\\u{03BD}",
- ast: genConcatNode(
- genConcatNode(
- newSymbolNodeWithPos(0xCE, symPos(1)),
- newSymbolNodeWithPos(0xBD, symPos(2)),
- ),
- newEndMarkerNodeWithPos(1, endPos(3)),
- ),
- },
- {
- pattern: "\\u{306B}",
- ast: genConcatNode(
- genConcatNode(
- newSymbolNodeWithPos(0xE3, symPos(1)),
- newSymbolNodeWithPos(0x81, symPos(2)),
- newSymbolNodeWithPos(0xAB, symPos(3)),
- ),
- newEndMarkerNodeWithPos(1, endPos(4)),
- ),
- },
- {
- pattern: "\\u{01F638}",
- ast: genConcatNode(
- genConcatNode(
- newSymbolNodeWithPos(0xF0, symPos(1)),
- newSymbolNodeWithPos(0x9F, symPos(2)),
- newSymbolNodeWithPos(0x98, symPos(3)),
- newSymbolNodeWithPos(0xB8, symPos(4)),
- ),
- newEndMarkerNodeWithPos(1, endPos(5)),
- ),
- },
- {
- pattern: "\\u{0000}",
- ast: genConcatNode(
- newSymbolNodeWithPos(0x00, symPos(1)),
- newEndMarkerNodeWithPos(1, endPos(2)),
- ),
- },
- {
- pattern: "\\u{10FFFF}",
- ast: genConcatNode(
- genConcatNode(
- newSymbolNodeWithPos(0xF4, symPos(1)),
- newSymbolNodeWithPos(0x8F, symPos(2)),
- newSymbolNodeWithPos(0xBF, symPos(3)),
- newSymbolNodeWithPos(0xBF, symPos(4)),
- ),
- newEndMarkerNodeWithPos(1, endPos(5)),
- ),
- },
- {
- pattern: "\\u{110000}",
- syntaxError: synErrCPExpOutOfRange,
- },
- {
- pattern: "\\u",
- syntaxError: synErrCPExpInvalidForm,
- },
- {
- pattern: "\\u{",
- syntaxError: synErrCPExpInvalidForm,
- },
- {
- pattern: "\\u{03BD",
- syntaxError: synErrCPExpInvalidForm,
- },
- {
- pattern: "\\u{}",
- syntaxError: synErrCPExpInvalidForm,
- },
- {
- pattern: "\\p{Letter}",
- skipTestAST: true,
- },
- {
- pattern: "\\p{General_Category=Letter}",
- skipTestAST: true,
- },
- {
- pattern: "\\p{ Letter }",
- skipTestAST: true,
- },
- {
- pattern: "\\p{ General_Category = Letter }",
- skipTestAST: true,
- },
- {
- pattern: "\\p",
- syntaxError: synErrCharPropExpInvalidForm,
- },
- {
- pattern: "\\p{",
- syntaxError: synErrCharPropExpInvalidForm,
- },
- {
- pattern: "\\p{Letter",
- syntaxError: synErrCharPropExpInvalidForm,
- },
- {
- pattern: "\\p{General_Category=}",
- syntaxError: synErrCharPropExpInvalidForm,
- },
- {
- pattern: "\\p{General_Category= }",
- syntaxError: synErrCharPropInvalidSymbol,
- },
- {
- pattern: "\\p{=Letter}",
- syntaxError: synErrCharPropExpInvalidForm,
- },
- {
- pattern: "\\p{ =Letter}",
- syntaxError: synErrCharPropInvalidSymbol,
- },
- {
- pattern: "\\p{=}",
- syntaxError: synErrCharPropExpInvalidForm,
- },
- {
- pattern: "\\p{}",
- syntaxError: synErrCharPropExpInvalidForm,
- },
- {
- pattern: "\\f{a2c}",
- fragments: map[string]string{
- "a2c": "abc",
- },
- ast: genConcatNode(
- newFragmentNode("a2c",
- genConcatNode(
- newSymbolNodeWithPos(byte('a'), symPos(1)),
- newSymbolNodeWithPos(byte('b'), symPos(2)),
- newSymbolNodeWithPos(byte('c'), symPos(3)),
- ),
- ),
- newEndMarkerNodeWithPos(1, endPos(4)),
- ),
- },
- {
- pattern: "\\f{ a2c }",
- fragments: map[string]string{
- "a2c": "abc",
- },
- ast: genConcatNode(
- newFragmentNode("a2c",
- genConcatNode(
- newSymbolNodeWithPos(byte('a'), symPos(1)),
- newSymbolNodeWithPos(byte('b'), symPos(2)),
- newSymbolNodeWithPos(byte('c'), symPos(3)),
- ),
- ),
- newEndMarkerNodeWithPos(1, endPos(4)),
- ),
- },
- {
- pattern: "\\f",
- syntaxError: synErrFragmentExpInvalidForm,
- },
- {
- pattern: "\\f{",
- syntaxError: synErrFragmentExpInvalidForm,
- },
- {
- pattern: "\\f{a2c",
- fragments: map[string]string{
- "a2c": "abc",
- },
- syntaxError: synErrFragmentExpInvalidForm,
- },
- {
- pattern: "(a)",
- ast: newConcatNode(
- newSymbolNodeWithPos(byte('a'), symPos(1)),
- newEndMarkerNodeWithPos(1, endPos(2)),
- ),
- },
- {
- pattern: "(((a)))",
- ast: newConcatNode(
- newSymbolNodeWithPos(byte('a'), symPos(1)),
- newEndMarkerNodeWithPos(1, endPos(2)),
- ),
- },
- {
- pattern: "a()",
- syntaxError: synErrGroupNoElem,
- },
- {
- pattern: "()a",
- syntaxError: synErrGroupNoElem,
- },
- {
- pattern: "()",
- syntaxError: synErrGroupNoElem,
- },
- {
- pattern: "(",
- syntaxError: synErrGroupUnclosed,
- },
- {
- pattern: "a(",
- syntaxError: synErrGroupUnclosed,
- },
- {
- pattern: "(a",
- syntaxError: synErrGroupUnclosed,
- },
- {
- pattern: "((",
- syntaxError: synErrGroupUnclosed,
- },
- {
- pattern: "((a)",
- syntaxError: synErrGroupUnclosed,
- },
- {
- pattern: ")",
- syntaxError: synErrGroupNoInitiator,
- },
- {
- pattern: "a)",
- syntaxError: synErrGroupNoInitiator,
- },
- {
- pattern: ")a",
- syntaxError: synErrGroupNoInitiator,
- },
- {
- pattern: "))",
- syntaxError: synErrGroupNoInitiator,
- },
- {
- pattern: "(a))",
- syntaxError: synErrGroupNoInitiator,
- },
- {
- pattern: "Mulder|Scully",
- ast: newConcatNode(
- genAltNode(
- genConcatNode(
- newSymbolNodeWithPos(byte('M'), symPos(1)),
- newSymbolNodeWithPos(byte('u'), symPos(2)),
- newSymbolNodeWithPos(byte('l'), symPos(3)),
- newSymbolNodeWithPos(byte('d'), symPos(4)),
- newSymbolNodeWithPos(byte('e'), symPos(5)),
- newSymbolNodeWithPos(byte('r'), symPos(6)),
- ),
- genConcatNode(
- newSymbolNodeWithPos(byte('S'), symPos(7)),
- newSymbolNodeWithPos(byte('c'), symPos(8)),
- newSymbolNodeWithPos(byte('u'), symPos(9)),
- newSymbolNodeWithPos(byte('l'), symPos(10)),
- newSymbolNodeWithPos(byte('l'), symPos(11)),
- newSymbolNodeWithPos(byte('y'), symPos(12)),
- ),
- ),
- newEndMarkerNodeWithPos(1, endPos(13)),
- ),
- },
- {
- pattern: "Langly|Frohike|Byers",
- ast: newConcatNode(
- genAltNode(
- genConcatNode(
- newSymbolNodeWithPos(byte('L'), symPos(1)),
- newSymbolNodeWithPos(byte('a'), symPos(2)),
- newSymbolNodeWithPos(byte('n'), symPos(3)),
- newSymbolNodeWithPos(byte('g'), symPos(4)),
- newSymbolNodeWithPos(byte('l'), symPos(5)),
- newSymbolNodeWithPos(byte('y'), symPos(6)),
- ),
- genConcatNode(
- newSymbolNodeWithPos(byte('F'), symPos(7)),
- newSymbolNodeWithPos(byte('r'), symPos(8)),
- newSymbolNodeWithPos(byte('o'), symPos(9)),
- newSymbolNodeWithPos(byte('h'), symPos(10)),
- newSymbolNodeWithPos(byte('i'), symPos(11)),
- newSymbolNodeWithPos(byte('k'), symPos(12)),
- newSymbolNodeWithPos(byte('e'), symPos(13)),
- ),
- genConcatNode(
- newSymbolNodeWithPos(byte('B'), symPos(14)),
- newSymbolNodeWithPos(byte('y'), symPos(15)),
- newSymbolNodeWithPos(byte('e'), symPos(16)),
- newSymbolNodeWithPos(byte('r'), symPos(17)),
- newSymbolNodeWithPos(byte('s'), symPos(18)),
- ),
- ),
- newEndMarkerNodeWithPos(1, endPos(19)),
- ),
- },
- {
- pattern: "|",
- syntaxError: synErrAltLackOfOperand,
- },
- {
- pattern: "||",
- syntaxError: synErrAltLackOfOperand,
- },
- {
- pattern: "Mulder|",
- syntaxError: synErrAltLackOfOperand,
- },
- {
- pattern: "|Scully",
- syntaxError: synErrAltLackOfOperand,
- },
- {
- pattern: "Langly|Frohike|",
- syntaxError: synErrAltLackOfOperand,
- },
- {
- pattern: "Langly||Byers",
- syntaxError: synErrAltLackOfOperand,
- },
- {
- pattern: "|Frohike|Byers",
- syntaxError: synErrAltLackOfOperand,
- },
- {
- pattern: "|Frohike|",
- syntaxError: synErrAltLackOfOperand,
- },
- {
- pattern: "Fox(|)Mulder",
- syntaxError: synErrAltLackOfOperand,
- },
- {
- pattern: "(Fox|)Mulder",
- syntaxError: synErrAltLackOfOperand,
- },
- {
- pattern: "Fox(|Mulder)",
- syntaxError: synErrAltLackOfOperand,
- },
- }
- for i, tt := range tests {
- t.Run(fmt.Sprintf("#%v %v", i, tt.pattern), func(t *testing.T) {
- fragments := map[string][]byte{}
- for kind, pattern := range tt.fragments {
- fragments[kind] = []byte(pattern)
- }
- ast, _, err := parse([]*patternEntry{
- {
- id: spec.LexModeKindIDMin,
- pattern: []byte(tt.pattern),
- },
- }, fragments)
- if tt.syntaxError != nil {
- // printAST(os.Stdout, ast, "", "", false)
- if err == nil {
- t.Fatalf("expected syntax error; got: nil")
- }
- parseErrs, ok := err.(*ParseErrors)
- if !ok {
- t.Fatalf("expected ParseErrors; got: %v (type: %T)", err, err)
- }
- parseErr := parseErrs.Errors[0].Cause
- synErr, ok := parseErr.(*SyntaxError)
- if !ok {
- t.Fatalf("expected SyntaxError; got: %v (type: %T)", parseErr, parseErr)
- }
- if synErr != tt.syntaxError {
- t.Fatalf("unexpected syntax error; want: %v, got: %v", tt.syntaxError, synErr)
- }
- if ast != nil {
- t.Fatalf("ast is not nil")
- }
- } else {
- if err != nil {
- t.Fatal(err)
- }
- if ast == nil {
- t.Fatal("AST is nil")
- }
- // printAST(os.Stdout, ast, "", "", false)
- if !tt.skipTestAST {
- testAST(t, tt.ast, ast)
- }
- }
- })
- }
-}
-
-func TestParse_ContributoryPropertyIsNotExposed(t *testing.T) {
- for _, cProp := range ucd.ContributoryProperties() {
- t.Run(fmt.Sprintf("%v", cProp), func(t *testing.T) {
- ast, _, err := parse([]*patternEntry{
- {
- id: spec.LexModeKindIDMin,
- pattern: []byte(fmt.Sprintf(`\p{%v=yes}`, cProp)),
- },
- }, nil)
- if err == nil {
- t.Fatalf("expected syntax error; got: nil")
- }
- parseErrs, ok := err.(*ParseErrors)
- if !ok {
- t.Fatalf("expected ParseErrors; got: %v (type: %T)", err, err)
- }
- parseErr := parseErrs.Errors[0].Cause
- synErr, ok := parseErr.(*SyntaxError)
- if !ok {
- t.Fatalf("expected SyntaxError; got: %v (type: %T)", parseErr, parseErr)
- }
- if synErr != synErrCharPropUnsupported {
- t.Fatalf("unexpected syntax error; want: %v, got: %v", synErrCharPropUnsupported, synErr)
- }
- if ast != nil {
- t.Fatalf("ast is not nil")
- }
- })
- }
-}
-
-func TestParse_FollowAndSymbolTable(t *testing.T) {
- root, symTab, err := parse([]*patternEntry{
- {
- id: spec.LexModeKindIDMin,
- pattern: []byte("(a|b)*abb"),
- },
- }, nil)
- if err != nil {
- t.Fatal(err)
- }
- if root == nil {
- t.Fatal("root of AST is nil")
- }
- // printAST(os.Stdout, root, "", "", false)
-
- {
- expectedAST := genConcatNode(
- newRepeatNode(
- newAltNode(
- newSymbolNodeWithPos(byte('a'), symPos(1)),
- newSymbolNodeWithPos(byte('b'), symPos(2)),
- ),
- ),
- newSymbolNodeWithPos(byte('a'), symPos(3)),
- newSymbolNodeWithPos(byte('b'), symPos(4)),
- newSymbolNodeWithPos(byte('b'), symPos(5)),
- newEndMarkerNodeWithPos(1, endPos(6)),
- )
- testAST(t, expectedAST, root)
- }
-
- {
- followTab := genFollowTable(root)
- if followTab == nil {
- t.Fatal("follow table is nil")
- }
- expectedFollowTab := followTable{
- 1: newSymbolPositionSet().add(symPos(1)).add(symPos(2)).add(symPos(3)),
- 2: newSymbolPositionSet().add(symPos(1)).add(symPos(2)).add(symPos(3)),
- 3: newSymbolPositionSet().add(symPos(4)),
- 4: newSymbolPositionSet().add(symPos(5)),
- 5: newSymbolPositionSet().add(endPos(6)),
- }
- testFollowTable(t, expectedFollowTab, followTab)
- }
-
- {
- entry := func(v byte) byteRange {
- return byteRange{
- from: v,
- to: v,
- }
- }
-
- expectedSymTab := &symbolTable{
- symPos2Byte: map[symbolPosition]byteRange{
- symPos(1): entry(byte('a')),
- symPos(2): entry(byte('b')),
- symPos(3): entry(byte('a')),
- symPos(4): entry(byte('b')),
- symPos(5): entry(byte('b')),
- },
- endPos2ID: map[symbolPosition]spec.LexModeKindID{
- endPos(6): 1,
- },
- }
- testSymbolTable(t, expectedSymTab, symTab)
- }
-}
-
-func testAST(t *testing.T, expected, actual astNode) {
- t.Helper()
-
- aTy := reflect.TypeOf(actual)
- eTy := reflect.TypeOf(expected)
- if eTy != aTy {
- t.Fatalf("AST node type is mismatched; want: %v, got: %v", eTy, aTy)
- }
-
- if actual == nil {
- return
- }
-
- switch e := expected.(type) {
- case *symbolNode:
- a := actual.(*symbolNode)
- if a.pos != e.pos || a.from != e.from || a.to != e.to {
- t.Fatalf("unexpected node; want: %+v, got: %+v", e, a)
- }
- case *endMarkerNode:
- a := actual.(*endMarkerNode)
- if a.pos != e.pos {
- t.Fatalf("symbol position is mismatched; want: %v, got: %v", e.pos, a.pos)
- }
- }
- eLeft, eRight := expected.children()
- aLeft, aRight := actual.children()
- testAST(t, eLeft, aLeft)
- testAST(t, eRight, aRight)
-}
-
-func testFollowTable(t *testing.T, expected, actual followTable) {
- if len(actual) != len(expected) {
- t.Errorf("unexpected number of the follow table entries; want: %v, got: %v", len(expected), len(actual))
- }
- for ePos, eSet := range expected {
- aSet, ok := actual[ePos]
- if !ok {
- t.Fatalf("follow entry is not found; position: %v, follow: %v", ePos, eSet)
- }
- if aSet.hash() != eSet.hash() {
- t.Fatalf("follow entry of position %v is mismatched; want: %v, got: %v", ePos, aSet, eSet)
- }
- }
-}
-
-func testSymbolTable(t *testing.T, expected, actual *symbolTable) {
- t.Helper()
-
- if len(actual.symPos2Byte) != len(expected.symPos2Byte) {
- t.Errorf("unexpected symPos2Byte entries; want: %v entries, got: %v entries", len(expected.symPos2Byte), len(actual.symPos2Byte))
- }
- for ePos, eByte := range expected.symPos2Byte {
- byte, ok := actual.symPos2Byte[ePos]
- if !ok {
- t.Errorf("a symbol position entry was not found: %v -> %v", ePos, eByte)
- continue
- }
- if byte.from != eByte.from || byte.to != eByte.to {
- t.Errorf("unexpected symbol position entry; want: %v -> %v, got: %v -> %v", ePos, eByte, ePos, byte)
- }
- }
-
- if len(actual.endPos2ID) != len(expected.endPos2ID) {
- t.Errorf("unexpected endPos2ID entries; want: %v entries, got: %v entries", len(expected.endPos2ID), len(actual.endPos2ID))
- }
- for ePos, eID := range expected.endPos2ID {
- id, ok := actual.endPos2ID[ePos]
- if !ok {
- t.Errorf("an end position entry was not found: %v -> %v", ePos, eID)
- continue
- }
- if id != eID {
- t.Errorf("unexpected end position entry; want: %v -> %v, got: %v -> %v", ePos, eID, ePos, id)
- }
- }
-}
diff --git a/compiler/symbol_position.go b/compiler/symbol_position.go
deleted file mode 100644
index d35dfde..0000000
--- a/compiler/symbol_position.go
+++ /dev/null
@@ -1,198 +0,0 @@
-package compiler
-
-import (
- "encoding/binary"
- "fmt"
- "strings"
-)
-
-type symbolPosition uint16
-
-const (
- symbolPositionNil = symbolPosition(0x0000) // 0000 0000 0000 0000
-
- symbolPositionMin = uint16(0x0001) // 0000 0000 0000 0001
- symbolPositionMax = uint16(0x7fff) // 0111 1111 1111 1111
-
- symbolPositionMaskSymbol = uint16(0x0000) // 0000 0000 0000 0000
- symbolPositionMaskEndMark = uint16(0x8000) // 1000 0000 0000 0000
-
- symbolPositionMaskValue = uint16(0x7fff) // 0111 1111 1111 1111
-)
-
-func newSymbolPosition(n uint16, endMark bool) (symbolPosition, error) {
- if n < symbolPositionMin || n > symbolPositionMax {
- return symbolPositionNil, fmt.Errorf("symbol position must be within %v to %v; n: %v, endMark: %v", symbolPositionMin, symbolPositionMax, n, endMark)
- }
- if endMark {
- return symbolPosition(n | symbolPositionMaskEndMark), nil
- }
- return symbolPosition(n | symbolPositionMaskSymbol), nil
-}
-
-func (p symbolPosition) String() string {
- if p.isEndMark() {
- return fmt.Sprintf("end#%v", uint16(p)&symbolPositionMaskValue)
- }
- return fmt.Sprintf("sym#%v", uint16(p)&symbolPositionMaskValue)
-}
-
-func (p symbolPosition) isEndMark() bool {
- if uint16(p)&symbolPositionMaskEndMark > 1 {
- return true
- }
- return false
-}
-
-func (p symbolPosition) describe() (uint16, bool) {
- v := uint16(p) & symbolPositionMaskValue
- if p.isEndMark() {
- return v, true
- }
- return v, false
-}
-
-type symbolPositionSet struct {
- // `s` represents a set of symbol positions.
- // However, immediately after adding a symbol position, the elements may be duplicated.
- // When you need an aligned set with no duplicates, you can get such value via the set function.
- s []symbolPosition
- sorted bool
-}
-
-func newSymbolPositionSet() *symbolPositionSet {
- return &symbolPositionSet{
- s: []symbolPosition{},
- sorted: false,
- }
-}
-
-func (s *symbolPositionSet) String() string {
- if len(s.s) <= 0 {
- return "{}"
- }
- ps := s.sortAndRemoveDuplicates()
- var b strings.Builder
- fmt.Fprintf(&b, "{")
- for i, p := range ps {
- if i <= 0 {
- fmt.Fprintf(&b, "%v", p)
- continue
- }
- fmt.Fprintf(&b, ", %v", p)
- }
- fmt.Fprintf(&b, "}")
- return b.String()
-}
-
-func (s *symbolPositionSet) set() []symbolPosition {
- s.sortAndRemoveDuplicates()
- return s.s
-}
-
-func (s *symbolPositionSet) add(pos symbolPosition) *symbolPositionSet {
- s.s = append(s.s, pos)
- s.sorted = false
- return s
-}
-
-func (s *symbolPositionSet) merge(t *symbolPositionSet) *symbolPositionSet {
- s.s = append(s.s, t.s...)
- s.sorted = false
- return s
-}
-
-func (s *symbolPositionSet) intersect(set *symbolPositionSet) *symbolPositionSet {
- in := newSymbolPositionSet()
- for _, p1 := range s.s {
- for _, p2 := range set.s {
- if p1 != p2 {
- continue
- }
- in.add(p1)
- }
- }
- return in
-}
-
-func (s *symbolPositionSet) hash() string {
- if len(s.s) <= 0 {
- return ""
- }
- sorted := s.sortAndRemoveDuplicates()
- var buf []byte
- for _, p := range sorted {
- b := make([]byte, 8)
- binary.PutUvarint(b, uint64(p))
- buf = append(buf, b...)
- }
- // Convert to a string to be able to use it as a key of a map.
- // But note this byte sequence is made from values of symbol positions,
- // so this is not a well-formed UTF-8 sequence.
- return string(buf)
-}
-
-func (s *symbolPositionSet) sortAndRemoveDuplicates() []symbolPosition {
- if s.sorted {
- return s.s
- }
-
- sortSymbolPositions(s.s, 0, len(s.s)-1)
-
- // Remove duplicates.
- lastV := s.s[0]
- nextIdx := 1
- for _, v := range s.s[1:] {
- if v == lastV {
- continue
- }
- s.s[nextIdx] = v
- nextIdx++
- lastV = v
- }
- s.s = s.s[:nextIdx]
- s.sorted = true
-
- return s.s
-}
-
-// sortSymbolPositions sorts a slice of symbol positions as it uses quick sort.
-func sortSymbolPositions(ps []symbolPosition, left, right int) {
- if left >= right {
- return
- }
- var pivot symbolPosition
- {
- // Use a median as a pivot.
- p1 := ps[left]
- p2 := ps[(left+right)/2]
- p3 := ps[right]
- if p1 > p2 {
- p1, p2 = p2, p1
- }
- if p2 > p3 {
- p2, p3 = p3, p2
- if p1 > p2 {
- p1, p2 = p2, p1
- }
- }
- pivot = p2
- }
- i := left
- j := right
- for i <= j {
- for ps[i] < pivot {
- i++
- }
- for ps[j] > pivot {
- j--
- }
- if i <= j {
- ps[i], ps[j] = ps[j], ps[i]
- i++
- j--
- }
- }
- sortSymbolPositions(ps, left, j)
- sortSymbolPositions(ps, i, right)
-}
diff --git a/compiler/syntax_error.go b/compiler/syntax_error.go
deleted file mode 100644
index 3925259..0000000
--- a/compiler/syntax_error.go
+++ /dev/null
@@ -1,45 +0,0 @@
-package compiler
-
-import "fmt"
-
-type SyntaxError struct {
- Message string
-}
-
-func newSyntaxError(msg string) *SyntaxError {
- return &SyntaxError{
- Message: msg,
- }
-}
-
-func (e *SyntaxError) Error() string {
- return fmt.Sprintf("syntax error: %s", e.Message)
-}
-
-var (
- // lexical errors
- synErrIncompletedEscSeq = newSyntaxError("incompleted escape sequence; unexpected EOF following \\")
- synErrInvalidEscSeq = newSyntaxError("invalid escape sequence")
- synErrInvalidCodePoint = newSyntaxError("code points must consist of just 4 or 6 hex digits")
- synErrCharPropInvalidSymbol = newSyntaxError("invalid character property symbol")
- SynErrFragmentInvalidSymbol = newSyntaxError("invalid fragment symbol")
-
- // syntax errors
- synErrUnexpectedToken = newSyntaxError("unexpected token")
- synErrNullPattern = newSyntaxError("a pattern must be a non-empty byte sequence")
- synErrAltLackOfOperand = newSyntaxError("an alternation expression must have operands")
- synErrRepNoTarget = newSyntaxError("a repeat expression must have an operand")
- synErrGroupNoElem = newSyntaxError("a grouping expression must include at least one character")
- synErrGroupUnclosed = newSyntaxError("unclosed grouping expression")
- synErrGroupNoInitiator = newSyntaxError(") needs preceding (")
- synErrGroupInvalidForm = newSyntaxError("invalid grouping expression")
- synErrBExpNoElem = newSyntaxError("a bracket expression must include at least one character")
- synErrBExpUnclosed = newSyntaxError("unclosed bracket expression")
- synErrBExpInvalidForm = newSyntaxError("invalid bracket expression")
- synErrRangeInvalidOrder = newSyntaxError("a range expression with invalid order")
- synErrCPExpInvalidForm = newSyntaxError("invalid code point expression")
- synErrCPExpOutOfRange = newSyntaxError("a code point must be between U+0000 to U+10FFFF")
- synErrCharPropExpInvalidForm = newSyntaxError("invalid character property expression")
- synErrCharPropUnsupported = newSyntaxError("unsupported character property")
- synErrFragmentExpInvalidForm = newSyntaxError("invalid fragment expression")
-)
diff --git a/compiler/test_util_test.go b/compiler/test_util_test.go
deleted file mode 100644
index 72e150b..0000000
--- a/compiler/test_util_test.go
+++ /dev/null
@@ -1,21 +0,0 @@
-package compiler
-
-import "github.com/nihei9/maleeni/spec"
-
-func newRangeSymbolNodeWithPos(from, to byte, pos symbolPosition) *symbolNode {
- n := newRangeSymbolNode(from, to)
- n.pos = pos
- return n
-}
-
-func newSymbolNodeWithPos(v byte, pos symbolPosition) *symbolNode {
- n := newSymbolNode(v)
- n.pos = pos
- return n
-}
-
-func newEndMarkerNodeWithPos(id int, pos symbolPosition) *endMarkerNode {
- n := newEndMarkerNode(spec.LexModeKindID(id))
- n.pos = pos
- return n
-}
diff --git a/driver/lexer_test.go b/driver/lexer_test.go
index 275e992..e0b82b1 100644
--- a/driver/lexer_test.go
+++ b/driver/lexer_test.go
@@ -776,8 +776,11 @@ func TestLexer_Next(t *testing.T) {
for i, tt := range test {
for compLv := compiler.CompressionLevelMin; compLv <= compiler.CompressionLevelMax; compLv++ {
t.Run(fmt.Sprintf("#%v-%v", i, compLv), func(t *testing.T) {
- clspec, err := compiler.Compile(tt.lspec, compiler.CompressionLevel(compLv))
+ clspec, err, cerrs := compiler.Compile(tt.lspec, compiler.CompressionLevel(compLv))
if err != nil {
+ for _, cerr := range cerrs {
+ t.Logf("%#v", cerr)
+ }
t.Fatalf("unexpected error: %v", err)
}
opts := []LexerOption{}
@@ -821,7 +824,7 @@ func TestLexer_Next_WithPosition(t *testing.T) {
},
}
- clspec, err := compiler.Compile(lspec, compiler.CompressionLevel(compiler.CompressionLevelMax))
+ clspec, err, _ := compiler.Compile(lspec, compiler.CompressionLevel(compiler.CompressionLevelMax))
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
diff --git a/utf8/utf8.go b/utf8/utf8.go
index 79ca1de..16b3746 100644
--- a/utf8/utf8.go
+++ b/utf8/utf8.go
@@ -83,15 +83,6 @@ var (
cBlk4Last = cBlks4[len(cBlks4)-1]
)
-func AllCharBlocks() []*CharBlock {
- var blks []*CharBlock
- blks = append(blks, cBlks1...)
- blks = append(blks, cBlks2...)
- blks = append(blks, cBlks3...)
- blks = append(blks, cBlks4...)
- return blks
-}
-
func GenCharBlocks(from, to []byte) ([]*CharBlock, error) {
switch len(from) {
case 1: