aboutsummaryrefslogtreecommitdiff
path: root/tests/unit/urubu/grammar
diff options
context:
space:
mode:
Diffstat (limited to 'tests/unit/urubu/grammar')
-rw-r--r--tests/unit/urubu/grammar/lexical.go338
-rw-r--r--tests/unit/urubu/grammar/lexical/dfa.go442
-rw-r--r--tests/unit/urubu/grammar/lexical/parser.go1907
-rw-r--r--tests/unit/urubu/grammar/symbol.go159
4 files changed, 2846 insertions, 0 deletions
diff --git a/tests/unit/urubu/grammar/lexical.go b/tests/unit/urubu/grammar/lexical.go
new file mode 100644
index 0000000..b621cd2
--- /dev/null
+++ b/tests/unit/urubu/grammar/lexical.go
@@ -0,0 +1,338 @@
+package lexical
+
+import (
+ "encoding/json"
+ "fmt"
+ "testing"
+
+ spec "urubu/spec/grammar"
+)
+
+// TestLexSpec_Validate checks that Validate reports an error for a
+// specification whose mode name is a spelling variant of another mode name.
+func TestLexSpec_Validate(t *testing.T) {
+	// We expect that the spelling inconsistency error will occur.
+	//
+	// The variable is named lspec rather than spec so that it doesn't shadow
+	// the imported spec package for the rest of the function.
+	lspec := &LexSpec{
+		Entries: []*LexEntry{
+			{
+				Modes: []spec.LexModeName{
+					// 'Default' is the spelling inconsistency because 'default' is predefined.
+					"Default",
+				},
+				Kind:    "foo",
+				Pattern: "foo",
+			},
+		},
+	}
+	if err := lspec.Validate(); err == nil {
+		t.Fatalf("expected error didn't occur")
+	}
+}
+
+func TestSnakeCaseToUpperCamelCase(t *testing.T) {
+ tests := []struct {
+ snake string
+ camel string
+ }{
+ {
+ snake: "foo",
+ camel: "Foo",
+ },
+ {
+ snake: "foo_bar",
+ camel: "FooBar",
+ },
+ {
+ snake: "foo_bar_baz",
+ camel: "FooBarBaz",
+ },
+ {
+ snake: "Foo",
+ camel: "Foo",
+ },
+ {
+ snake: "fooBar",
+ camel: "FooBar",
+ },
+ {
+ snake: "FOO",
+ camel: "FOO",
+ },
+ {
+ snake: "FOO_BAR",
+ camel: "FOOBAR",
+ },
+ {
+ snake: "_foo_bar_",
+ camel: "FooBar",
+ },
+ {
+ snake: "___foo___bar___",
+ camel: "FooBar",
+ },
+ }
+ for _, tt := range tests {
+ c := SnakeCaseToUpperCamelCase(tt.snake)
+ if c != tt.camel {
+ t.Errorf("unexpected string; want: %v, got: %v", tt.camel, c)
+ }
+ }
+}
+
+// TestFindSpellingInconsistencies compares the groups returned by
+// FindSpellingInconsistencies with the expected groups, element by element
+// and in order.
+func TestFindSpellingInconsistencies(t *testing.T) {
+	cases := []struct {
+		ids        []string
+		duplicated [][]string
+	}{
+		{
+			ids:        []string{"foo", "foo"},
+			duplicated: nil,
+		},
+		{
+			ids:        []string{"foo", "Foo"},
+			duplicated: [][]string{{"Foo", "foo"}},
+		},
+		{
+			ids:        []string{"foo", "foo", "Foo"},
+			duplicated: [][]string{{"Foo", "foo"}},
+		},
+		{
+			ids:        []string{"foo_bar_baz", "FooBarBaz"},
+			duplicated: [][]string{{"FooBarBaz", "foo_bar_baz"}},
+		},
+		{
+			ids:        []string{"foo", "Foo", "bar", "Bar"},
+			duplicated: [][]string{{"Bar", "bar"}, {"Foo", "foo"}},
+		},
+		{
+			ids:        []string{"foo", "Foo", "bar", "Bar", "baz", "bra"},
+			duplicated: [][]string{{"Bar", "bar"}, {"Foo", "foo"}},
+		},
+	}
+	for n, tc := range cases {
+		t.Run(fmt.Sprintf("#%v", n), func(t *testing.T) {
+			got := FindSpellingInconsistencies(tc.ids)
+			if len(got) != len(tc.duplicated) {
+				t.Fatalf("unexpected IDs; want: %#v, got: %#v", tc.duplicated, got)
+			}
+			for g, gotGroup := range got {
+				wantGroup := tc.duplicated[g]
+				if len(gotGroup) != len(wantGroup) {
+					t.Fatalf("unexpected IDs; want: %#v, got: %#v", wantGroup, gotGroup)
+				}
+				for k := range gotGroup {
+					if gotGroup[k] != wantGroup[k] {
+						t.Fatalf("unexpected IDs; want: %#v, got: %#v", wantGroup, gotGroup)
+					}
+				}
+			}
+		})
+	}
+}
+
+// TestCompile decodes each JSON lexical specification and passes it to
+// Compile. Cases with Err set must yield an error and no compiled
+// specification; all other cases must yield a compiled specification and no
+// error.
+func TestCompile(t *testing.T) {
+	tests := []struct {
+		Caption string
+		Spec    string
+		Err     bool
+	}{
+		{
+			Caption: "allow duplicates names between fragments and non-fragments",
+			Spec: `
+{
+ "name": "test",
+ "entries": [
+ {
+ "kind": "a2z",
+ "pattern": "\\f{a2z}"
+ },
+ {
+ "fragment": true,
+ "kind": "a2z",
+ "pattern": "[a-z]"
+ }
+ ]
+}
+`,
+		},
+		{
+			Caption: "don't allow duplicates names in non-fragments",
+			Spec: `
+{
+ "name": "test",
+ "entries": [
+ {
+ "kind": "a2z",
+ "pattern": "a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z"
+ },
+ {
+ "kind": "a2z",
+ "pattern": "[a-z]"
+ }
+ ]
+}
+`,
+			Err: true,
+		},
+		{
+			Caption: "don't allow duplicates names in fragments",
+			// The key must be "fragment", the key the other cases use.
+			// encoding/json silently ignores unknown keys, so a misspelled
+			// key would turn both entries into non-fragments and this case
+			// would only repeat the non-fragment duplicate check above.
+			Spec: `
+{
+ "name": "test",
+ "entries": [
+ {
+ "kind": "a2z",
+ "pattern": "\\f{a2z}"
+ },
+ {
+ "fragment": true,
+ "kind": "a2z",
+ "pattern": "a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z"
+ },
+ {
+ "fragment": true,
+ "kind": "a2z",
+ "pattern": "[a-z]"
+ }
+ ]
+}
+`,
+			Err: true,
+		},
+		{
+			Caption: "don't allow kind names in the same mode to contain spelling inconsistencies",
+			Spec: `
+{
+ "name": "test",
+ "entries": [
+ {
+ "kind": "foo_1",
+ "pattern": "foo_1"
+ },
+ {
+ "kind": "foo1",
+ "pattern": "foo1"
+ }
+ ]
+}
+`,
+			Err: true,
+		},
+		{
+			Caption: "don't allow kind names across modes to contain spelling inconsistencies",
+			Spec: `
+{
+ "name": "test",
+ "entries": [
+ {
+ "modes": ["default"],
+ "kind": "foo_1",
+ "pattern": "foo_1"
+ },
+ {
+ "modes": ["other_mode"],
+ "kind": "foo1",
+ "pattern": "foo1"
+ }
+ ]
+}
+`,
+			Err: true,
+		},
+		{
+			Caption: "don't allow mode names to contain spelling inconsistencies",
+			Spec: `
+{
+ "name": "test",
+ "entries": [
+ {
+ "modes": ["foo_1"],
+ "kind": "a",
+ "pattern": "a"
+ },
+ {
+ "modes": ["foo1"],
+ "kind": "b",
+ "pattern": "b"
+ }
+ ]
+}
+`,
+			Err: true,
+		},
+		{
+			Caption: "allow fragment names in the same mode to contain spelling inconsistencies because fragments will not appear in output files",
+			Spec: `
+{
+ "name": "test",
+ "entries": [
+ {
+ "kind": "a",
+ "pattern": "a"
+ },
+ {
+ "fragment": true,
+ "kind": "foo_1",
+ "pattern": "foo_1"
+ },
+ {
+ "fragment": true,
+ "kind": "foo1",
+ "pattern": "foo1"
+ }
+ ]
+}
+`,
+		},
+		{
+			Caption: "allow fragment names across modes to contain spelling inconsistencies because fragments will not appear in output files",
+			Spec: `
+{
+ "name": "test",
+ "entries": [
+ {
+ "modes": ["default"],
+ "kind": "a",
+ "pattern": "a"
+ },
+ {
+ "modes": ["default"],
+ "fragment": true,
+ "kind": "foo_1",
+ "pattern": "foo_1"
+ },
+ {
+ "modes": ["other_mode"],
+ "fragment": true,
+ "kind": "foo1",
+ "pattern": "foo1"
+ }
+ ]
+}
+`,
+		},
+	}
+	for i, tt := range tests {
+		t.Run(fmt.Sprintf("#%v %s", i, tt.Caption), func(t *testing.T) {
+			lspec := &LexSpec{}
+			err := json.Unmarshal([]byte(tt.Spec), lspec)
+			if err != nil {
+				t.Fatalf("%v", err)
+			}
+			// The third return value of Compile is not inspected by this test.
+			clspec, err, _ := Compile(lspec, CompressionLevelMin)
+			if tt.Err {
+				if err == nil {
+					t.Fatalf("expected an error")
+				}
+				if clspec != nil {
+					t.Fatalf("Compile function mustn't return a compiled specification")
+				}
+				return
+			}
+			if err != nil {
+				t.Fatalf("unexpected error: %v", err)
+			}
+			if clspec == nil {
+				t.Fatalf("Compile function must return a compiled specification")
+			}
+		})
+	}
+}
diff --git a/tests/unit/urubu/grammar/lexical/dfa.go b/tests/unit/urubu/grammar/lexical/dfa.go
new file mode 100644
index 0000000..1a3e16a
--- /dev/null
+++ b/tests/unit/urubu/grammar/lexical/dfa.go
@@ -0,0 +1,442 @@
+package dfa
+
+import (
+ "fmt"
+ "strings"
+ "testing"
+
+ "urubu/grammar/lexical/parser"
+ spec "urubu/spec/grammar"
+)
+
+// TestGenDFA builds a DFA from the pattern "(a|b)*abb" and compares its
+// transition table, initial state, and accepting states with hand-written
+// expectations. States are identified by the hash of the set of symbol
+// positions they contain.
+func TestGenDFA(t *testing.T) {
+	p := parser.NewParser(spec.LexKindName("test"), strings.NewReader("(a|b)*abb"))
+	cpt, err := p.Parse()
+	if err != nil {
+		t.Fatal(err)
+	}
+	bt, symTab, err := ConvertCPTreeToByteTree(map[spec.LexModeKindID]parser.CPTree{
+		spec.LexModeKindIDMin: cpt,
+	})
+	if err != nil {
+		t.Fatal(err)
+	}
+	dfa := GenDFA(bt, symTab)
+	if dfa == nil {
+		t.Fatalf("DFA is nil")
+	}
+
+	// symPos returns the non-end-mark position n and panics if n is rejected.
+	symPos := func(n uint16) symbolPosition {
+		pos, err := newSymbolPosition(n, false)
+		if err != nil {
+			panic(err)
+		}
+		return pos
+	}
+
+	// endPos returns the end-mark position n and panics if n is rejected.
+	endPos := func(n uint16) symbolPosition {
+		pos, err := newSymbolPosition(n, true)
+		if err != nil {
+			panic(err)
+		}
+		return pos
+	}
+
+	// The four expected states. Only s3 contains the end-mark position.
+	s0 := newSymbolPositionSet().add(symPos(1)).add(symPos(2)).add(symPos(3))
+	s1 := newSymbolPositionSet().add(symPos(1)).add(symPos(2)).add(symPos(3)).add(symPos(4))
+	s2 := newSymbolPositionSet().add(symPos(1)).add(symPos(2)).add(symPos(3)).add(symPos(5))
+	s3 := newSymbolPositionSet().add(symPos(1)).add(symPos(2)).add(symPos(3)).add(endPos(6))
+
+	// rune2Int returns the index-th byte of the UTF-8 encoding of char.
+	rune2Int := func(char rune, index int) uint8 {
+		return uint8([]byte(string(char))[index])
+	}
+
+	tranS0 := [256]string{}
+	tranS0[rune2Int('a', 0)] = s1.hash()
+	tranS0[rune2Int('b', 0)] = s0.hash()
+
+	tranS1 := [256]string{}
+	tranS1[rune2Int('a', 0)] = s1.hash()
+	tranS1[rune2Int('b', 0)] = s2.hash()
+
+	tranS2 := [256]string{}
+	tranS2[rune2Int('a', 0)] = s1.hash()
+	tranS2[rune2Int('b', 0)] = s3.hash()
+
+	tranS3 := [256]string{}
+	tranS3[rune2Int('a', 0)] = s1.hash()
+	tranS3[rune2Int('b', 0)] = s0.hash()
+
+	expectedTranTab := map[string][256]string{
+		s0.hash(): tranS0,
+		s1.hash(): tranS1,
+		s2.hash(): tranS2,
+		s3.hash(): tranS3,
+	}
+	if len(dfa.TransitionTable) != len(expectedTranTab) {
+		t.Errorf("transition table is mismatched: want: %v entries, got: %v entries", len(expectedTranTab), len(dfa.TransitionTable))
+	}
+	for h, eTranTab := range expectedTranTab {
+		tranTab, ok := dfa.TransitionTable[h]
+		if !ok {
+			t.Errorf("no entry; hash: %v", h)
+			continue
+		}
+		if len(tranTab) != len(eTranTab) {
+			t.Errorf("transition table is mismatched: hash: %v, want: %v entries, got: %v entries", h, len(eTranTab), len(tranTab))
+		}
+		for c, eNext := range eTranTab {
+			// Only the transitions listed above are checked; bytes without
+			// an expected transition are skipped.
+			if eNext == "" {
+				continue
+			}
+
+			next := tranTab[c]
+			if next == "" {
+				t.Errorf("no entry: hash: %v, char: %v", h, c)
+				// A missing transition is reported once, not again as a mismatch.
+				continue
+			}
+			if next != eNext {
+				t.Errorf("next state is mismatched: want: %v, got: %v", eNext, next)
+			}
+		}
+	}
+
+	if dfa.InitialState != s0.hash() {
+		t.Errorf("initial state is mismatched: want: %v, got: %v", s0.hash(), dfa.InitialState)
+	}
+
+	accTab := map[string]spec.LexModeKindID{
+		s3.hash(): 1,
+	}
+	if len(dfa.AcceptingStatesTable) != len(accTab) {
+		t.Errorf("accepting states are mismatched: want: %v entries, got: %v entries", len(accTab), len(dfa.AcceptingStatesTable))
+	}
+	for eState, eID := range accTab {
+		id, ok := dfa.AcceptingStatesTable[eState]
+		if !ok {
+			t.Errorf("accepting state is not found: state: %v", eState)
+			// Comparing the zero-value ID would only add a second, misleading report.
+			continue
+		}
+		if id != eID {
+			t.Errorf("ID is mismatched: state: %v, want: %v, got: %v", eState, eID, id)
+		}
+	}
+}
+
+// TestNewSymbolPosition calls newSymbolPosition with values at and just
+// outside symbolPositionMin and symbolPositionMax, with and without the end
+// mark. Accepted positions must report the same n and endMark from describe.
+func TestNewSymbolPosition(t *testing.T) {
+	cases := []struct {
+		n       uint16
+		endMark bool
+		err     bool
+	}{
+		{n: 0, endMark: false, err: true},
+		{n: 0, endMark: true, err: true},
+		{n: symbolPositionMin - 1, endMark: false, err: true},
+		{n: symbolPositionMin - 1, endMark: true, err: true},
+		{n: symbolPositionMin, endMark: false},
+		{n: symbolPositionMin, endMark: true},
+		{n: symbolPositionMax, endMark: false},
+		{n: symbolPositionMax, endMark: true},
+		{n: symbolPositionMax + 1, endMark: false, err: true},
+		{n: symbolPositionMax + 1, endMark: true, err: true},
+	}
+	for idx, tc := range cases {
+		name := fmt.Sprintf("#%v n: %v, endMark: %v", idx, tc.n, tc.endMark)
+		t.Run(name, func(t *testing.T) {
+			pos, err := newSymbolPosition(tc.n, tc.endMark)
+			switch {
+			case tc.err && err == nil:
+				t.Fatal("err is nil")
+			case tc.err:
+				// The expected error occurred; there is no position to inspect.
+				return
+			case err != nil:
+				t.Fatal(err)
+			}
+			gotN, gotEndMark := pos.describe()
+			if gotN == tc.n && gotEndMark == tc.endMark {
+				return
+			}
+			t.Errorf("unexpected symbol position: want: n: %v, endMark: %v, got: n: %v, endMark: %v", tc.n, tc.endMark, gotN, gotEndMark)
+		})
+	}
+}
+
+// TestByteTree builds small byte trees by hand and checks the nullable,
+// first, and last attributes reported by each root node. Position sets are
+// compared through their hashes.
+func TestByteTree(t *testing.T) {
+	// sym creates a symbol node for byte 0 at the given position.
+	sym := func(pos symbolPosition) *symbolNode {
+		return newSymbolNodeWithPos(0, pos)
+	}
+
+	cases := []struct {
+		root     byteTree
+		nullable bool
+		first    *symbolPositionSet
+		last     *symbolPositionSet
+	}{
+		// Leaves.
+		{
+			root:     sym(1),
+			nullable: false,
+			first:    newSymbolPositionSet().add(1),
+			last:     newSymbolPositionSet().add(1),
+		},
+		{
+			root:     newEndMarkerNodeWithPos(1, 1),
+			nullable: false,
+			first:    newSymbolPositionSet().add(1),
+			last:     newSymbolPositionSet().add(1),
+		},
+		// Concatenation.
+		{
+			root:     newConcatNode(sym(1), sym(2)),
+			nullable: false,
+			first:    newSymbolPositionSet().add(1),
+			last:     newSymbolPositionSet().add(2),
+		},
+		{
+			root:     newConcatNode(newRepeatNode(sym(1)), sym(2)),
+			nullable: false,
+			first:    newSymbolPositionSet().add(1).add(2),
+			last:     newSymbolPositionSet().add(2),
+		},
+		{
+			root:     newConcatNode(sym(1), newRepeatNode(sym(2))),
+			nullable: false,
+			first:    newSymbolPositionSet().add(1),
+			last:     newSymbolPositionSet().add(1).add(2),
+		},
+		{
+			root:     newConcatNode(newRepeatNode(sym(1)), newRepeatNode(sym(2))),
+			nullable: true,
+			first:    newSymbolPositionSet().add(1).add(2),
+			last:     newSymbolPositionSet().add(1).add(2),
+		},
+		// Alternation.
+		{
+			root:     newAltNode(sym(1), sym(2)),
+			nullable: false,
+			first:    newSymbolPositionSet().add(1).add(2),
+			last:     newSymbolPositionSet().add(1).add(2),
+		},
+		{
+			root:     newAltNode(newRepeatNode(sym(1)), sym(2)),
+			nullable: true,
+			first:    newSymbolPositionSet().add(1).add(2),
+			last:     newSymbolPositionSet().add(1).add(2),
+		},
+		{
+			root:     newAltNode(sym(1), newRepeatNode(sym(2))),
+			nullable: true,
+			first:    newSymbolPositionSet().add(1).add(2),
+			last:     newSymbolPositionSet().add(1).add(2),
+		},
+		{
+			root:     newAltNode(newRepeatNode(sym(1)), newRepeatNode(sym(2))),
+			nullable: true,
+			first:    newSymbolPositionSet().add(1).add(2),
+			last:     newSymbolPositionSet().add(1).add(2),
+		},
+		// Repetition and option.
+		{
+			root:     newRepeatNode(sym(1)),
+			nullable: true,
+			first:    newSymbolPositionSet().add(1),
+			last:     newSymbolPositionSet().add(1),
+		},
+		{
+			root:     newOptionNode(sym(1)),
+			nullable: true,
+			first:    newSymbolPositionSet().add(1),
+			last:     newSymbolPositionSet().add(1),
+		},
+	}
+	for idx, tc := range cases {
+		t.Run(fmt.Sprintf("#%v", idx), func(t *testing.T) {
+			root := tc.root
+			if root.nullable() != tc.nullable {
+				t.Errorf("unexpected nullable attribute; want: %v, got: %v", tc.nullable, root.nullable())
+			}
+			if tc.first.hash() != root.first().hash() {
+				t.Errorf("unexpected first positions attribute; want: %v, got: %v", tc.first, root.first())
+			}
+			if tc.last.hash() != root.last().hash() {
+				t.Errorf("unexpected last positions attribute; want: %v, got: %v", tc.last, root.last())
+			}
+		})
+	}
+}
+
+// newSymbolNodeWithPos creates a symbol node for the byte v and assigns pos
+// to its position field.
+func newSymbolNodeWithPos(v byte, pos symbolPosition) *symbolNode {
+	node := newSymbolNode(v)
+	node.pos = pos
+	return node
+}
+
+// newEndMarkerNodeWithPos creates an end marker node for the kind ID id and
+// assigns pos to its position field.
+func newEndMarkerNodeWithPos(id int, pos symbolPosition) *endMarkerNode {
+	kindID := spec.LexModeKindID(id)
+	node := newEndMarkerNode(kindID)
+	node.pos = pos
+	return node
+}
+
+// TestFollowAndSymbolTable converts the pattern "(a|b)*abb" into a byte tree
+// and checks both the follow table generated from that tree and the symbol
+// table returned by the conversion.
+func TestFollowAndSymbolTable(t *testing.T) {
+	// mustPos builds a symbol position and panics when n is rejected.
+	mustPos := func(n uint16, endMark bool) symbolPosition {
+		created, err := newSymbolPosition(n, endMark)
+		if err != nil {
+			panic(err)
+		}
+		return created
+	}
+	symPos := func(n uint16) symbolPosition { return mustPos(n, false) }
+	endPos := func(n uint16) symbolPosition { return mustPos(n, true) }
+
+	psr := parser.NewParser(spec.LexKindName("test"), strings.NewReader("(a|b)*abb"))
+	cpt, err := psr.Parse()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	trees := map[spec.LexModeKindID]parser.CPTree{
+		spec.LexModeKindIDMin: cpt,
+	}
+	bt, symTab, err := ConvertCPTreeToByteTree(trees)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Follow table.
+	gotFollow := genFollowTable(bt)
+	if gotFollow == nil {
+		t.Fatal("follow table is nil")
+	}
+	wantFollow := followTable{
+		1: newSymbolPositionSet().add(symPos(1)).add(symPos(2)).add(symPos(3)),
+		2: newSymbolPositionSet().add(symPos(1)).add(symPos(2)).add(symPos(3)),
+		3: newSymbolPositionSet().add(symPos(4)),
+		4: newSymbolPositionSet().add(symPos(5)),
+		5: newSymbolPositionSet().add(endPos(6)),
+	}
+	testFollowTable(t, wantFollow, gotFollow)
+
+	// Symbol table.
+	// single returns the byte range that contains only v.
+	single := func(v byte) byteRange {
+		return byteRange{from: v, to: v}
+	}
+	wantSymTab := &symbolTable{
+		symPos2Byte: map[symbolPosition]byteRange{
+			symPos(1): single('a'),
+			symPos(2): single('b'),
+			symPos(3): single('a'),
+			symPos(4): single('b'),
+			symPos(5): single('b'),
+		},
+		endPos2ID: map[symbolPosition]spec.LexModeKindID{
+			endPos(6): 1,
+		},
+	}
+	testSymbolTable(t, wantSymTab, symTab)
+}
+
+// testFollowTable reports an error when actual and expected have a different
+// number of entries, and fails the test immediately when an expected position
+// is missing from actual or its follow set has a different hash.
+func testFollowTable(t *testing.T, expected, actual followTable) {
+	// Attribute failures to the calling test, as testSymbolTable does.
+	t.Helper()
+
+	if len(actual) != len(expected) {
+		t.Errorf("unexpected number of the follow table entries; want: %v, got: %v", len(expected), len(actual))
+	}
+	for ePos, eSet := range expected {
+		aSet, ok := actual[ePos]
+		if !ok {
+			t.Fatalf("follow entry is not found: position: %v, follow: %v", ePos, eSet)
+		}
+		if aSet.hash() != eSet.hash() {
+			// "want" is the expected set and "got" is the actual one.
+			t.Fatalf("follow entry of position %v is mismatched: want: %v, got: %v", ePos, eSet, aSet)
+		}
+	}
+}
+
+// testSymbolTable compares the symPos2Byte and endPos2ID maps of actual with
+// those of expected. It reports a size mismatch, every expected entry that is
+// missing, and every entry whose value differs.
+func testSymbolTable(t *testing.T, expected, actual *symbolTable) {
+	t.Helper()
+
+	if want, got := len(expected.symPos2Byte), len(actual.symPos2Byte); got != want {
+		t.Errorf("unexpected symPos2Byte entries: want: %v entries, got: %v entries", want, got)
+	}
+	for pos, wantRange := range expected.symPos2Byte {
+		gotRange, found := actual.symPos2Byte[pos]
+		switch {
+		case !found:
+			t.Errorf("a symbol position entry is not found: %v -> %v", pos, wantRange)
+		case gotRange.from != wantRange.from || gotRange.to != wantRange.to:
+			t.Errorf("unexpected symbol position entry: want: %v -> %v, got: %v -> %v", pos, wantRange, pos, gotRange)
+		}
+	}
+
+	if want, got := len(expected.endPos2ID), len(actual.endPos2ID); got != want {
+		t.Errorf("unexpected endPos2ID entries: want: %v entries, got: %v entries", want, got)
+	}
+	for pos, wantID := range expected.endPos2ID {
+		gotID, found := actual.endPos2ID[pos]
+		switch {
+		case !found:
+			t.Errorf("an end position entry is not found: %v -> %v", pos, wantID)
+		case gotID != wantID:
+			t.Errorf("unexpected end position entry: want: %v -> %v, got: %v -> %v", pos, wantID, pos, gotID)
+		}
+	}
+}
diff --git a/tests/unit/urubu/grammar/lexical/parser.go b/tests/unit/urubu/grammar/lexical/parser.go
new file mode 100644
index 0000000..d5d7039
--- /dev/null
+++ b/tests/unit/urubu/grammar/lexical/parser.go
@@ -0,0 +1,1907 @@
+package parser
+
+import (
+ "fmt"
+ "reflect"
+ "strings"
+ "testing"
+
+ spec "urubu/spec/grammar"
+ "urubu/ucd"
+)
+
+// TestLexer feeds each source string to the lexer and compares the produced
+// tokens with the expected ones in order. When a case sets err, the lexer
+// must fail with ParseErr and report err as the cause, after producing the
+// listed tokens.
+func TestLexer(t *testing.T) {
+	tests := []struct {
+		caption string
+		src     string
+		tokens  []*token
+		err     error
+	}{
+		{
+			caption: "lexer can recognize ordinary characters",
+			src:     "123abcいろは",
+			tokens: []*token{
+				newToken(tokenKindChar, '1'),
+				newToken(tokenKindChar, '2'),
+				newToken(tokenKindChar, '3'),
+				newToken(tokenKindChar, 'a'),
+				newToken(tokenKindChar, 'b'),
+				newToken(tokenKindChar, 'c'),
+				newToken(tokenKindChar, 'い'),
+				newToken(tokenKindChar, 'ろ'),
+				newToken(tokenKindChar, 'は'),
+				newToken(tokenKindEOF, nullChar),
+			},
+		},
+		{
+			caption: "lexer can recognize the special characters in default mode",
+			src:     ".*+?|()[\\u",
+			tokens: []*token{
+				newToken(tokenKindAnyChar, nullChar),
+				newToken(tokenKindRepeat, nullChar),
+				newToken(tokenKindRepeatOneOrMore, nullChar),
+				newToken(tokenKindOption, nullChar),
+				newToken(tokenKindAlt, nullChar),
+				newToken(tokenKindGroupOpen, nullChar),
+				newToken(tokenKindGroupClose, nullChar),
+				newToken(tokenKindBExpOpen, nullChar),
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindEOF, nullChar),
+			},
+		},
+		{
+			caption: "lexer can recognize the escape sequences in default mode",
+			src:     "\\\\\\.\\*\\+\\?\\|\\(\\)\\[",
+			tokens: []*token{
+				newToken(tokenKindChar, '\\'),
+				newToken(tokenKindChar, '.'),
+				newToken(tokenKindChar, '*'),
+				newToken(tokenKindChar, '+'),
+				newToken(tokenKindChar, '?'),
+				newToken(tokenKindChar, '|'),
+				newToken(tokenKindChar, '('),
+				newToken(tokenKindChar, ')'),
+				newToken(tokenKindChar, '['),
+				newToken(tokenKindEOF, nullChar),
+			},
+		},
+		{
+			caption: "], {, and } are treated as an ordinary character in default mode",
+			src:     "]{}",
+			tokens: []*token{
+				newToken(tokenKindChar, ']'),
+				newToken(tokenKindChar, '{'),
+				newToken(tokenKindChar, '}'),
+				newToken(tokenKindEOF, nullChar),
+			},
+		},
+		{
+			caption: "lexer can recognize the special characters in bracket expression mode",
+			src:     "[a-z\\u{09AF}][^a-z\\u{09abcf}]",
+			tokens: []*token{
+				newToken(tokenKindBExpOpen, nullChar),
+				newToken(tokenKindChar, 'a'),
+				newToken(tokenKindCharRange, nullChar),
+				newToken(tokenKindChar, 'z'),
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+				newCodePointToken("09AF"),
+				newToken(tokenKindRBrace, nullChar),
+				newToken(tokenKindBExpClose, nullChar),
+				newToken(tokenKindInverseBExpOpen, nullChar),
+				newToken(tokenKindChar, 'a'),
+				newToken(tokenKindCharRange, nullChar),
+				newToken(tokenKindChar, 'z'),
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+				newCodePointToken("09abcf"),
+				newToken(tokenKindRBrace, nullChar),
+				newToken(tokenKindBExpClose, nullChar),
+				newToken(tokenKindEOF, nullChar),
+			},
+		},
+		{
+			caption: "lexer can recognize the escape sequences in bracket expression mode",
+			src:     "[\\^a\\-z]",
+			tokens: []*token{
+				newToken(tokenKindBExpOpen, nullChar),
+				newToken(tokenKindChar, '^'),
+				newToken(tokenKindChar, 'a'),
+				newToken(tokenKindChar, '-'),
+				newToken(tokenKindChar, 'z'),
+				newToken(tokenKindBExpClose, nullChar),
+				newToken(tokenKindEOF, nullChar),
+			},
+		},
+		{
+			caption: "in a bracket expression, the special characters are also handled as normal characters",
+			src:     "[\\\\.*+?|()[",
+			tokens: []*token{
+				newToken(tokenKindBExpOpen, nullChar),
+				newToken(tokenKindChar, '\\'),
+				newToken(tokenKindChar, '.'),
+				newToken(tokenKindChar, '*'),
+				newToken(tokenKindChar, '+'),
+				newToken(tokenKindChar, '?'),
+				newToken(tokenKindChar, '|'),
+				newToken(tokenKindChar, '('),
+				newToken(tokenKindChar, ')'),
+				newToken(tokenKindChar, '['),
+				newToken(tokenKindEOF, nullChar),
+			},
+		},
+		{
+			caption: "hyphen symbols that appear in bracket expressions are handled as the character range symbol or ordinary characters",
+			// [...-...][...-][-...][-]
+			//  ~~~~~~~     ~  ~     ~
+			//     ^        ^  ^     ^
+			//     |        |  |     `-- Ordinary Character (b)
+			//     |        |  `-- Ordinary Character (b)
+			//     |        `-- Ordinary Character (b)
+			//     `-- Character Range (a)
+			//
+			// a. *-* is handled as a character-range expression.
+			// b. *-, -*, or - are handled as ordinary characters.
+			src: "[a-z][a-][-z][-][--][---][^a-z][^a-][^-z][^-][^--][^---]",
+			tokens: []*token{
+				newToken(tokenKindBExpOpen, nullChar),
+				newToken(tokenKindChar, 'a'),
+				newToken(tokenKindCharRange, nullChar),
+				newToken(tokenKindChar, 'z'),
+				newToken(tokenKindBExpClose, nullChar),
+				newToken(tokenKindBExpOpen, nullChar),
+				newToken(tokenKindChar, 'a'),
+				newToken(tokenKindChar, '-'),
+				newToken(tokenKindBExpClose, nullChar),
+				newToken(tokenKindBExpOpen, nullChar),
+				newToken(tokenKindChar, '-'),
+				newToken(tokenKindChar, 'z'),
+				newToken(tokenKindBExpClose, nullChar),
+				newToken(tokenKindBExpOpen, nullChar),
+				newToken(tokenKindChar, '-'),
+				newToken(tokenKindBExpClose, nullChar),
+				newToken(tokenKindBExpOpen, nullChar),
+				newToken(tokenKindChar, '-'),
+				newToken(tokenKindChar, '-'),
+				newToken(tokenKindBExpClose, nullChar),
+				newToken(tokenKindBExpOpen, nullChar),
+				newToken(tokenKindChar, '-'),
+				newToken(tokenKindCharRange, nullChar),
+				newToken(tokenKindChar, '-'),
+				newToken(tokenKindBExpClose, nullChar),
+
+				newToken(tokenKindInverseBExpOpen, nullChar),
+				newToken(tokenKindChar, 'a'),
+				newToken(tokenKindCharRange, nullChar),
+				newToken(tokenKindChar, 'z'),
+				newToken(tokenKindBExpClose, nullChar),
+				newToken(tokenKindInverseBExpOpen, nullChar),
+				newToken(tokenKindChar, 'a'),
+				newToken(tokenKindChar, '-'),
+				newToken(tokenKindBExpClose, nullChar),
+				newToken(tokenKindInverseBExpOpen, nullChar),
+				newToken(tokenKindChar, '-'),
+				newToken(tokenKindChar, 'z'),
+				newToken(tokenKindBExpClose, nullChar),
+				newToken(tokenKindInverseBExpOpen, nullChar),
+				newToken(tokenKindChar, '-'),
+				newToken(tokenKindBExpClose, nullChar),
+				newToken(tokenKindInverseBExpOpen, nullChar),
+				newToken(tokenKindChar, '-'),
+				newToken(tokenKindChar, '-'),
+				newToken(tokenKindBExpClose, nullChar),
+				newToken(tokenKindInverseBExpOpen, nullChar),
+				newToken(tokenKindChar, '-'),
+				newToken(tokenKindCharRange, nullChar),
+				newToken(tokenKindChar, '-'),
+				newToken(tokenKindBExpClose, nullChar),
+
+				newToken(tokenKindEOF, nullChar),
+			},
+		},
+		{
+			caption: "caret symbols that appear in bracket expressions are handled as the logical inverse symbol or ordinary characters",
+			// [^...^...][^]
+			// ~~   ~    ~~
+			// ^    ^    ^^
+			// |    |    |`-- Ordinary Character (c)
+			// |    |    `-- Bracket Expression
+			// |    `-- Ordinary Character (b)
+			// `-- Inverse Bracket Expression (a)
+			//
+			// a. Bracket expressions that have a caret symbol at the beginning are handled as logical inverse expressions.
+			// b. caret symbols that appear as the second and the subsequent symbols are handled as ordinary symbols.
+			// c. When a bracket expression has just one symbol, a caret symbol at the beginning is handled as an ordinary character.
+			src: "[^^][^]",
+			tokens: []*token{
+				newToken(tokenKindInverseBExpOpen, nullChar),
+				newToken(tokenKindChar, '^'),
+				newToken(tokenKindBExpClose, nullChar),
+				newToken(tokenKindBExpOpen, nullChar),
+				newToken(tokenKindChar, '^'),
+				newToken(tokenKindBExpClose, nullChar),
+				newToken(tokenKindEOF, nullChar),
+			},
+		},
+		{
+			caption: "lexer raises an error when an invalid escape sequence appears",
+			src:     "\\@",
+			err:     synErrInvalidEscSeq,
+		},
+		{
+			caption: "lexer raises an error when the incomplete escape sequence (EOF following \\) appears",
+			src:     "\\",
+			err:     synErrIncompletedEscSeq,
+		},
+		{
+			caption: "lexer raises an error when an invalid escape sequence appears in a bracket expression",
+			src:     "[\\@",
+			tokens: []*token{
+				newToken(tokenKindBExpOpen, nullChar),
+			},
+			err: synErrInvalidEscSeq,
+		},
+		{
+			caption: "lexer raises an error when the incomplete escape sequence (EOF following \\) appears in a bracket expression",
+			src:     "[\\",
+			tokens: []*token{
+				newToken(tokenKindBExpOpen, nullChar),
+			},
+			err: synErrIncompletedEscSeq,
+		},
+		{
+			caption: "lexer can recognize the special characters and code points in code point expression mode",
+			src:     "\\u{0123}\\u{4567}\\u{89abcd}\\u{efAB}\\u{CDEF01}[\\u{0123}\\u{4567}\\u{89abcd}\\u{efAB}\\u{CDEF01}][^\\u{0123}\\u{4567}\\u{89abcd}\\u{efAB}\\u{CDEF01}]",
+			tokens: []*token{
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+				newCodePointToken("0123"),
+				newToken(tokenKindRBrace, nullChar),
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+				newCodePointToken("4567"),
+				newToken(tokenKindRBrace, nullChar),
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+				newCodePointToken("89abcd"),
+				newToken(tokenKindRBrace, nullChar),
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+				newCodePointToken("efAB"),
+				newToken(tokenKindRBrace, nullChar),
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+				newCodePointToken("CDEF01"),
+				newToken(tokenKindRBrace, nullChar),
+
+				newToken(tokenKindBExpOpen, nullChar),
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+				newCodePointToken("0123"),
+				newToken(tokenKindRBrace, nullChar),
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+				newCodePointToken("4567"),
+				newToken(tokenKindRBrace, nullChar),
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+				newCodePointToken("89abcd"),
+				newToken(tokenKindRBrace, nullChar),
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+				newCodePointToken("efAB"),
+				newToken(tokenKindRBrace, nullChar),
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+				newCodePointToken("CDEF01"),
+				newToken(tokenKindRBrace, nullChar),
+				newToken(tokenKindBExpClose, nullChar),
+
+				newToken(tokenKindInverseBExpOpen, nullChar),
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+				newCodePointToken("0123"),
+				newToken(tokenKindRBrace, nullChar),
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+				newCodePointToken("4567"),
+				newToken(tokenKindRBrace, nullChar),
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+				newCodePointToken("89abcd"),
+				newToken(tokenKindRBrace, nullChar),
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+				newCodePointToken("efAB"),
+				newToken(tokenKindRBrace, nullChar),
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+				newCodePointToken("CDEF01"),
+				newToken(tokenKindRBrace, nullChar),
+				newToken(tokenKindBExpClose, nullChar),
+
+				newToken(tokenKindEOF, nullChar),
+			},
+		},
+		{
+			caption: "a one digit hex string isn't a valid code point",
+			src:     "\\u{0",
+			tokens: []*token{
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+			},
+			err: synErrInvalidCodePoint,
+		},
+		{
+			caption: "a two digits hex string isn't a valid code point",
+			src:     "\\u{01",
+			tokens: []*token{
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+			},
+			err: synErrInvalidCodePoint,
+		},
+		{
+			caption: "a three digits hex string isn't a valid code point",
+			src:     "\\u{012",
+			tokens: []*token{
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+			},
+			err: synErrInvalidCodePoint,
+		},
+		{
+			caption: "a four digits hex string is a valid code point",
+			src:     "\\u{0123}",
+			tokens: []*token{
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+				newCodePointToken("0123"),
+				newToken(tokenKindRBrace, nullChar),
+			},
+		},
+		{
+			caption: "a five digits hex string isn't a valid code point",
+			src:     "\\u{01234",
+			tokens: []*token{
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+			},
+			err: synErrInvalidCodePoint,
+		},
+		{
+			caption: "a six digits hex string is a valid code point",
+			src:     "\\u{012345}",
+			tokens: []*token{
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+				newCodePointToken("012345"),
+				newToken(tokenKindRBrace, nullChar),
+			},
+		},
+		{
+			caption: "a seven digits hex string isn't a valid code point",
+			src:     "\\u{0123456",
+			tokens: []*token{
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+			},
+			err: synErrInvalidCodePoint,
+		},
+		{
+			caption: "a code point must be hex digits (lowercase g)",
+			src:     "\\u{g",
+			tokens: []*token{
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+			},
+			err: synErrInvalidCodePoint,
+		},
+		{
+			caption: "a code point must be hex digits (uppercase G)",
+			src:     "\\u{G",
+			tokens: []*token{
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+			},
+			err: synErrInvalidCodePoint,
+		},
+		{
+			caption: "lexer can recognize the special characters and symbols in character property expression mode",
+			src:     "\\p{Letter}\\p{General_Category=Letter}[\\p{Letter}\\p{General_Category=Letter}][^\\p{Letter}\\p{General_Category=Letter}]",
+			tokens: []*token{
+				newToken(tokenKindCharPropLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+				newCharPropSymbolToken("Letter"),
+				newToken(tokenKindRBrace, nullChar),
+				newToken(tokenKindCharPropLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+				newCharPropSymbolToken("General_Category"),
+				newToken(tokenKindEqual, nullChar),
+				newCharPropSymbolToken("Letter"),
+				newToken(tokenKindRBrace, nullChar),
+
+				newToken(tokenKindBExpOpen, nullChar),
+				newToken(tokenKindCharPropLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+				newCharPropSymbolToken("Letter"),
+				newToken(tokenKindRBrace, nullChar),
+				newToken(tokenKindCharPropLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+				newCharPropSymbolToken("General_Category"),
+				newToken(tokenKindEqual, nullChar),
+				newCharPropSymbolToken("Letter"),
+				newToken(tokenKindRBrace, nullChar),
+				newToken(tokenKindBExpClose, nullChar),
+
+				newToken(tokenKindInverseBExpOpen, nullChar),
+				newToken(tokenKindCharPropLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+				newCharPropSymbolToken("Letter"),
+				newToken(tokenKindRBrace, nullChar),
+				newToken(tokenKindCharPropLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+				newCharPropSymbolToken("General_Category"),
+				newToken(tokenKindEqual, nullChar),
+				newCharPropSymbolToken("Letter"),
+				newToken(tokenKindRBrace, nullChar),
+				newToken(tokenKindBExpClose, nullChar),
+
+				newToken(tokenKindEOF, nullChar),
+			},
+		},
+		{
+			caption: "lexer can recognize the special characters and symbols in fragment expression mode",
+			src:     "\\f{integer}",
+			tokens: []*token{
+				newToken(tokenKindFragmentLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+				newFragmentSymbolToken("integer"),
+				newToken(tokenKindRBrace, nullChar),
+
+				newToken(tokenKindEOF, nullChar),
+			},
+		},
+		{
+			caption: "a fragment expression is not supported in a bracket expression",
+			src:     "[\\f",
+			tokens: []*token{
+				newToken(tokenKindBExpOpen, nullChar),
+			},
+			err: synErrInvalidEscSeq,
+		},
+		{
+			caption: "a fragment expression is not supported in an inverse bracket expression",
+			src:     "[^\\f",
+			tokens: []*token{
+				newToken(tokenKindInverseBExpOpen, nullChar),
+			},
+			err: synErrInvalidEscSeq,
+		},
+	}
+	for _, tt := range tests {
+		t.Run(tt.caption, func(t *testing.T) {
+			lex := newLexer(strings.NewReader(tt.src))
+			var err error
+			var tok *token
+			// i counts the expected tokens that have been compared so far.
+			i := 0
+			for {
+				tok, err = lex.next()
+				if err != nil {
+					break
+				}
+				// Stop once every expected token has been compared; tokens
+				// the lexer produces after that point are not checked.
+				if i >= len(tt.tokens) {
+					break
+				}
+				eTok := tt.tokens[i]
+				i++
+				testToken(t, tok, eTok)
+
+				if tok.kind == tokenKindEOF {
+					break
+				}
+			}
+			if tt.err != nil {
+				if err != ParseErr {
+					t.Fatalf("unexpected error: want: %v, got: %v", ParseErr, err)
+				}
+				detail, cause := lex.error()
+				if cause != tt.err {
+					t.Fatalf("unexpected error: want: %v, got: %v (%v)", tt.err, cause, detail)
+				}
+			} else {
+				if err != nil {
+					t.Fatalf("unexpected error: %v", err)
+				}
+			}
+			if i < len(tt.tokens) {
+				t.Fatalf("expected more tokens")
+			}
+		})
+	}
+}
+
+// testToken fails the calling test immediately when the actual token a
+// differs from the expected token e in kind, char, or codePoint.
+func testToken(t *testing.T, a, e *token) {
+	t.Helper()
+	if e.kind == a.kind && e.char == a.char && e.codePoint == a.codePoint {
+		return
+	}
+	t.Fatalf("unexpected token: want: %+v, got: %+v", e, a)
+}
+
+func TestParse(t *testing.T) { // Parses each pattern (with optional fragments) and checks the resulting AST or the expected syntax error.
+ tests := []struct {
+ pattern string
+ fragments map[spec.LexKindName]string
+ ast CPTree
+ syntaxError error
+
+ // When the expected AST would be large (e.g., for a pattern containing a character property expression),
+ // this test only checks that the pattern is parsable. The validity of such an AST is instead verified by
+ // checking that it matches correctly when run through the driver.
+ skipTestAST bool
+ }{
+ {
+ pattern: "a",
+ ast: newSymbolNode('a'),
+ },
+ {
+ pattern: "abc",
+ ast: genConcatNode(
+ newSymbolNode('a'),
+ newSymbolNode('b'),
+ newSymbolNode('c'),
+ ),
+ },
+ {
+ pattern: "a?",
+ ast: newOptionNode(
+ newSymbolNode('a'),
+ ),
+ },
+ {
+ pattern: "[abc]?",
+ ast: newOptionNode(
+ genAltNode(
+ newSymbolNode('a'),
+ newSymbolNode('b'),
+ newSymbolNode('c'),
+ ),
+ ),
+ },
+ {
+ pattern: "\\u{3042}?",
+ ast: newOptionNode(
+ newSymbolNode('\u3042'),
+ ),
+ },
+ {
+ pattern: "\\p{Letter}?",
+ skipTestAST: true,
+ },
+ {
+ pattern: "\\f{a2c}?",
+ fragments: map[spec.LexKindName]string{
+ "a2c": "abc",
+ },
+ ast: newOptionNode(
+ newFragmentNode("a2c",
+ genConcatNode(
+ newSymbolNode('a'),
+ newSymbolNode('b'),
+ newSymbolNode('c'),
+ ),
+ ),
+ ),
+ },
+ {
+ pattern: "(a)?",
+ ast: newOptionNode(
+ newSymbolNode('a'),
+ ),
+ },
+ {
+ pattern: "((a?)?)?",
+ ast: newOptionNode(
+ newOptionNode(
+ newOptionNode(
+ newSymbolNode('a'),
+ ),
+ ),
+ ),
+ },
+ {
+ pattern: "(abc)?",
+ ast: newOptionNode(
+ genConcatNode(
+ newSymbolNode('a'),
+ newSymbolNode('b'),
+ newSymbolNode('c'),
+ ),
+ ),
+ },
+ {
+ pattern: "(a|b)?",
+ ast: newOptionNode(
+ genAltNode(
+ newSymbolNode('a'),
+ newSymbolNode('b'),
+ ),
+ ),
+ },
+ {
+ pattern: "?",
+ syntaxError: synErrRepNoTarget,
+ },
+ {
+ pattern: "(?)",
+ syntaxError: synErrRepNoTarget,
+ },
+ {
+ pattern: "a|?",
+ syntaxError: synErrRepNoTarget,
+ },
+ {
+ pattern: "?|b",
+ syntaxError: synErrRepNoTarget,
+ },
+ {
+ pattern: "a??",
+ syntaxError: synErrRepNoTarget,
+ },
+ {
+ pattern: "a*",
+ ast: newRepeatNode(
+ newSymbolNode('a'),
+ ),
+ },
+ {
+ pattern: "[abc]*",
+ ast: newRepeatNode(
+ genAltNode(
+ newSymbolNode('a'),
+ newSymbolNode('b'),
+ newSymbolNode('c'),
+ ),
+ ),
+ },
+ {
+ pattern: "\\u{3042}*",
+ ast: newRepeatNode(
+ newSymbolNode('\u3042'),
+ ),
+ },
+ {
+ pattern: "\\p{Letter}*",
+ skipTestAST: true,
+ },
+ {
+ pattern: "\\f{a2c}*",
+ fragments: map[spec.LexKindName]string{
+ "a2c": "abc",
+ },
+ ast: newRepeatNode(
+ newFragmentNode("a2c",
+ genConcatNode(
+ newSymbolNode('a'),
+ newSymbolNode('b'),
+ newSymbolNode('c'),
+ ),
+ ),
+ ),
+ },
+ {
+ pattern: "((a*)*)*",
+ ast: newRepeatNode(
+ newRepeatNode(
+ newRepeatNode(
+ newSymbolNode('a'),
+ ),
+ ),
+ ),
+ },
+ {
+ pattern: "(abc)*",
+ ast: newRepeatNode(
+ genConcatNode(
+ newSymbolNode('a'),
+ newSymbolNode('b'),
+ newSymbolNode('c'),
+ ),
+ ),
+ },
+ {
+ pattern: "(a|b)*",
+ ast: newRepeatNode(
+ genAltNode(
+ newSymbolNode('a'),
+ newSymbolNode('b'),
+ ),
+ ),
+ },
+ {
+ pattern: "*",
+ syntaxError: synErrRepNoTarget,
+ },
+ {
+ pattern: "(*)",
+ syntaxError: synErrRepNoTarget,
+ },
+ {
+ pattern: "a|*",
+ syntaxError: synErrRepNoTarget,
+ },
+ {
+ pattern: "*|b",
+ syntaxError: synErrRepNoTarget,
+ },
+ {
+ pattern: "a**",
+ syntaxError: synErrRepNoTarget,
+ },
+ {
+ pattern: "a+",
+ ast: genConcatNode(
+ newSymbolNode('a'),
+ newRepeatNode(
+ newSymbolNode('a'),
+ ),
+ ),
+ },
+ {
+ pattern: "[abc]+",
+ ast: genConcatNode(
+ genAltNode(
+ newSymbolNode('a'),
+ newSymbolNode('b'),
+ newSymbolNode('c'),
+ ),
+ newRepeatNode(
+ genAltNode(
+ newSymbolNode('a'),
+ newSymbolNode('b'),
+ newSymbolNode('c'),
+ ),
+ ),
+ ),
+ },
+ {
+ pattern: "\\u{3042}+",
+ ast: genConcatNode(
+ newSymbolNode('\u3042'),
+ newRepeatNode(
+ newSymbolNode('\u3042'),
+ ),
+ ),
+ },
+ {
+ pattern: "\\p{Letter}+",
+ skipTestAST: true,
+ },
+ {
+ pattern: "\\f{a2c}+",
+ fragments: map[spec.LexKindName]string{
+ "a2c": "abc",
+ },
+ ast: genConcatNode(
+ newFragmentNode("a2c",
+ genConcatNode(
+ newSymbolNode('a'),
+ newSymbolNode('b'),
+ newSymbolNode('c'),
+ ),
+ ),
+ newRepeatNode(
+ newFragmentNode("a2c",
+ genConcatNode(
+ newSymbolNode('a'),
+ newSymbolNode('b'),
+ newSymbolNode('c'),
+ ),
+ ),
+ ),
+ ),
+ },
+ {
+ pattern: "((a+)+)+",
+ ast: genConcatNode(
+ genConcatNode(
+ genConcatNode(
+ genConcatNode(
+ newSymbolNode('a'),
+ newRepeatNode(
+ newSymbolNode('a'),
+ ),
+ ),
+ newRepeatNode(
+ genConcatNode(
+ newSymbolNode('a'),
+ newRepeatNode(
+ newSymbolNode('a'),
+ ),
+ ),
+ ),
+ ),
+ newRepeatNode(
+ genConcatNode(
+ genConcatNode(
+ newSymbolNode('a'),
+ newRepeatNode(
+ newSymbolNode('a'),
+ ),
+ ),
+ newRepeatNode(
+ genConcatNode(
+ newSymbolNode('a'),
+ newRepeatNode(
+ newSymbolNode('a'),
+ ),
+ ),
+ ),
+ ),
+ ),
+ ),
+ ),
+ },
+ {
+ pattern: "(abc)+",
+ ast: genConcatNode(
+ genConcatNode(
+ newSymbolNode('a'),
+ newSymbolNode('b'),
+ newSymbolNode('c'),
+ ),
+ newRepeatNode(
+ genConcatNode(
+ newSymbolNode('a'),
+ newSymbolNode('b'),
+ newSymbolNode('c'),
+ ),
+ ),
+ ),
+ },
+ {
+ pattern: "(a|b)+",
+ ast: genConcatNode(
+ genAltNode(
+ newSymbolNode('a'),
+ newSymbolNode('b'),
+ ),
+ newRepeatNode(
+ genAltNode(
+ newSymbolNode('a'),
+ newSymbolNode('b'),
+ ),
+ ),
+ ),
+ },
+ {
+ pattern: "+",
+ syntaxError: synErrRepNoTarget,
+ },
+ {
+ pattern: "(+)",
+ syntaxError: synErrRepNoTarget,
+ },
+ {
+ pattern: "a|+",
+ syntaxError: synErrRepNoTarget,
+ },
+ {
+ pattern: "+|b",
+ syntaxError: synErrRepNoTarget,
+ },
+ {
+ pattern: "a++",
+ syntaxError: synErrRepNoTarget,
+ },
+ {
+ pattern: ".",
+ ast: newRangeSymbolNode(0x00, 0x10FFFF),
+ },
+ {
+ pattern: "[a]",
+ ast: newSymbolNode('a'),
+ },
+ {
+ pattern: "[abc]",
+ ast: genAltNode(
+ newSymbolNode('a'),
+ newSymbolNode('b'),
+ newSymbolNode('c'),
+ ),
+ },
+ {
+ pattern: "[a-z]",
+ ast: newRangeSymbolNode('a', 'z'),
+ },
+ {
+ pattern: "[A-Za-z]",
+ ast: genAltNode(
+ newRangeSymbolNode('A', 'Z'),
+ newRangeSymbolNode('a', 'z'),
+ ),
+ },
+ {
+ pattern: "[\\u{004E}]",
+ ast: newSymbolNode('N'),
+ },
+ {
+ pattern: "[\\u{0061}-\\u{007A}]",
+ ast: newRangeSymbolNode('a', 'z'),
+ },
+ {
+ pattern: "[\\p{Lu}]",
+ skipTestAST: true,
+ },
+ {
+ pattern: "[a-\\p{Lu}]",
+ syntaxError: synErrRangePropIsUnavailable,
+ },
+ {
+ pattern: "[\\p{Lu}-z]",
+ syntaxError: synErrRangePropIsUnavailable,
+ },
+ {
+ pattern: "[\\p{Lu}-\\p{Ll}]",
+ syntaxError: synErrRangePropIsUnavailable,
+ },
+ {
+ pattern: "[z-a]",
+ syntaxError: synErrRangeInvalidOrder,
+ },
+ {
+ pattern: "a[]",
+ syntaxError: synErrBExpNoElem,
+ },
+ {
+ pattern: "[]a",
+ syntaxError: synErrBExpNoElem,
+ },
+ {
+ pattern: "[]",
+ syntaxError: synErrBExpNoElem,
+ },
+ {
+ pattern: "[^\\u{004E}]",
+ ast: genAltNode(
+ newRangeSymbolNode(0x00, '\u004E'-1),
+ newRangeSymbolNode('\u004E'+1, 0x10FFFF),
+ ),
+ },
+ {
+ pattern: "[^\\u{0061}-\\u{007A}]",
+ ast: genAltNode(
+ newRangeSymbolNode(0x00, '\u0061'-1),
+ newRangeSymbolNode('\u007A'+1, 0x10FFFF),
+ ),
+ },
+ {
+ pattern: "[^\\p{Lu}]",
+ skipTestAST: true,
+ },
+ {
+ pattern: "[^a-\\p{Lu}]",
+ syntaxError: synErrRangePropIsUnavailable,
+ },
+ {
+ pattern: "[^\\p{Lu}-z]",
+ syntaxError: synErrRangePropIsUnavailable,
+ },
+ {
+ pattern: "[^\\p{Lu}-\\p{Ll}]",
+ syntaxError: synErrRangePropIsUnavailable,
+ },
+ {
+ pattern: "[^\\u{0000}-\\u{10FFFF}]",
+ syntaxError: synErrUnmatchablePattern,
+ },
+ {
+ pattern: "[^\\u{0000}-\\u{FFFF}\\u{010000}-\\u{10FFFF}]",
+ syntaxError: synErrUnmatchablePattern,
+ },
+ {
+ pattern: "[^]",
+ ast: newSymbolNode('^'),
+ },
+ {
+ pattern: "[",
+ syntaxError: synErrBExpUnclosed,
+ },
+ {
+ pattern: "([",
+ syntaxError: synErrBExpUnclosed,
+ },
+ {
+ pattern: "[a",
+ syntaxError: synErrBExpUnclosed,
+ },
+ {
+ pattern: "([a",
+ syntaxError: synErrBExpUnclosed,
+ },
+ {
+ pattern: "[a-",
+ syntaxError: synErrBExpUnclosed,
+ },
+ {
+ pattern: "([a-",
+ syntaxError: synErrBExpUnclosed,
+ },
+ {
+ pattern: "[^",
+ syntaxError: synErrBExpUnclosed,
+ },
+ {
+ pattern: "([^",
+ syntaxError: synErrBExpUnclosed,
+ },
+ {
+ pattern: "[^a",
+ syntaxError: synErrBExpUnclosed,
+ },
+ {
+ pattern: "([^a",
+ syntaxError: synErrBExpUnclosed,
+ },
+ {
+ pattern: "[^a-",
+ syntaxError: synErrBExpUnclosed,
+ },
+ {
+ pattern: "([^a-",
+ syntaxError: synErrBExpUnclosed,
+ },
+ {
+ pattern: "]",
+ ast: newSymbolNode(']'),
+ },
+ {
+ pattern: "(]",
+ syntaxError: synErrGroupUnclosed,
+ },
+ {
+ pattern: "a]",
+ ast: genConcatNode(
+ newSymbolNode('a'),
+ newSymbolNode(']'),
+ ),
+ },
+ {
+ pattern: "(a]",
+ syntaxError: synErrGroupUnclosed,
+ },
+ {
+ pattern: "([)",
+ syntaxError: synErrBExpUnclosed,
+ },
+ {
+ pattern: "([a)",
+ syntaxError: synErrBExpUnclosed,
+ },
+ {
+ pattern: "[a-]",
+ ast: genAltNode(
+ newSymbolNode('a'),
+ newSymbolNode('-'),
+ ),
+ },
+ {
+ pattern: "[^a-]",
+ ast: genAltNode(
+ newRangeSymbolNode(0x00, 0x2C),
+ newRangeSymbolNode(0x2E, 0x60),
+ newRangeSymbolNode(0x62, 0x10FFFF),
+ ),
+ },
+ {
+ pattern: "[-z]",
+ ast: genAltNode(
+ newSymbolNode('-'),
+ newSymbolNode('z'),
+ ),
+ },
+ {
+ pattern: "[^-z]",
+ ast: newAltNode(
+ newRangeSymbolNode(0x00, 0x2C),
+ newAltNode(
+ newRangeSymbolNode(0x2E, 0x79),
+ newRangeSymbolNode(0x7B, 0x10FFFF),
+ ),
+ ),
+ },
+ {
+ pattern: "[-]",
+ ast: newSymbolNode('-'),
+ },
+ {
+ pattern: "[^-]",
+ ast: genAltNode(
+ newRangeSymbolNode(0x00, 0x2C),
+ newRangeSymbolNode(0x2E, 0x10FFFF),
+ ),
+ },
+ {
+ pattern: "[^01]",
+ ast: genAltNode(
+ newRangeSymbolNode(0x00, '0'-1),
+ newRangeSymbolNode('1'+1, 0x10FFFF),
+ ),
+ },
+ {
+ pattern: "[^10]",
+ ast: genAltNode(
+ newRangeSymbolNode(0x00, '0'-1),
+ newRangeSymbolNode('1'+1, 0x10FFFF),
+ ),
+ },
+ {
+ pattern: "[^a-z]",
+ ast: genAltNode(
+ newRangeSymbolNode(0x00, 'a'-1),
+ newRangeSymbolNode('z'+1, 0x10FFFF),
+ ),
+ },
+ {
+ pattern: "[^az]",
+ ast: genAltNode(
+ newRangeSymbolNode(0x00, 'a'-1),
+ genAltNode(
+ newRangeSymbolNode('a'+1, 'z'-1),
+ newRangeSymbolNode('z'+1, 0x10FFFF),
+ ),
+ ),
+ },
+ {
+ pattern: "\\u{006E}",
+ ast: newSymbolNode('\u006E'),
+ },
+ {
+ pattern: "\\u{03BD}",
+ ast: newSymbolNode('\u03BD'),
+ },
+ {
+ pattern: "\\u{306B}",
+ ast: newSymbolNode('\u306B'),
+ },
+ {
+ pattern: "\\u{01F638}",
+ ast: newSymbolNode('\U0001F638'),
+ },
+ {
+ pattern: "\\u{0000}",
+ ast: newSymbolNode('\u0000'),
+ },
+ {
+ pattern: "\\u{10FFFF}",
+ ast: newSymbolNode('\U0010FFFF'),
+ },
+ {
+ pattern: "\\u{110000}",
+ syntaxError: synErrCPExpOutOfRange,
+ },
+ {
+ pattern: "\\u",
+ syntaxError: synErrCPExpInvalidForm,
+ },
+ {
+ pattern: "\\u{",
+ syntaxError: synErrCPExpInvalidForm,
+ },
+ {
+ pattern: "\\u{03BD",
+ syntaxError: synErrCPExpInvalidForm,
+ },
+ {
+ pattern: "\\u{}",
+ syntaxError: synErrCPExpInvalidForm,
+ },
+ {
+ pattern: "\\p{Letter}",
+ skipTestAST: true,
+ },
+ {
+ pattern: "\\p{General_Category=Letter}",
+ skipTestAST: true,
+ },
+ {
+ pattern: "\\p{ Letter }",
+ skipTestAST: true,
+ },
+ {
+ pattern: "\\p{ General_Category = Letter }",
+ skipTestAST: true,
+ },
+ {
+ pattern: "\\p",
+ syntaxError: synErrCharPropExpInvalidForm,
+ },
+ {
+ pattern: "\\p{",
+ syntaxError: synErrCharPropExpInvalidForm,
+ },
+ {
+ pattern: "\\p{Letter",
+ syntaxError: synErrCharPropExpInvalidForm,
+ },
+ {
+ pattern: "\\p{General_Category=}",
+ syntaxError: synErrCharPropExpInvalidForm,
+ },
+ {
+ pattern: "\\p{General_Category= }",
+ syntaxError: synErrCharPropInvalidSymbol,
+ },
+ {
+ pattern: "\\p{=Letter}",
+ syntaxError: synErrCharPropExpInvalidForm,
+ },
+ {
+ pattern: "\\p{ =Letter}",
+ syntaxError: synErrCharPropInvalidSymbol,
+ },
+ {
+ pattern: "\\p{=}",
+ syntaxError: synErrCharPropExpInvalidForm,
+ },
+ {
+ pattern: "\\p{}",
+ syntaxError: synErrCharPropExpInvalidForm,
+ },
+ {
+ pattern: "\\f{a2c}",
+ fragments: map[spec.LexKindName]string{
+ "a2c": "abc",
+ },
+ ast: newFragmentNode("a2c",
+ genConcatNode(
+ newSymbolNode('a'),
+ newSymbolNode('b'),
+ newSymbolNode('c'),
+ ),
+ ),
+ },
+ {
+ pattern: "\\f{ a2c }",
+ fragments: map[spec.LexKindName]string{
+ "a2c": "abc",
+ },
+ ast: newFragmentNode("a2c",
+ genConcatNode(
+ newSymbolNode('a'),
+ newSymbolNode('b'),
+ newSymbolNode('c'),
+ ),
+ ),
+ },
+ {
+ pattern: "\\f",
+ syntaxError: synErrFragmentExpInvalidForm,
+ },
+ {
+ pattern: "\\f{",
+ syntaxError: synErrFragmentExpInvalidForm,
+ },
+ {
+ pattern: "\\f{a2c",
+ fragments: map[spec.LexKindName]string{
+ "a2c": "abc",
+ },
+ syntaxError: synErrFragmentExpInvalidForm,
+ },
+ {
+ pattern: "(a)",
+ ast: newSymbolNode('a'),
+ },
+ {
+ pattern: "(((a)))",
+ ast: newSymbolNode('a'),
+ },
+ {
+ pattern: "a()",
+ syntaxError: synErrGroupNoElem,
+ },
+ {
+ pattern: "()a",
+ syntaxError: synErrGroupNoElem,
+ },
+ {
+ pattern: "()",
+ syntaxError: synErrGroupNoElem,
+ },
+ {
+ pattern: "(",
+ syntaxError: synErrGroupUnclosed,
+ },
+ {
+ pattern: "a(",
+ syntaxError: synErrGroupUnclosed,
+ },
+ {
+ pattern: "(a",
+ syntaxError: synErrGroupUnclosed,
+ },
+ {
+ pattern: "((",
+ syntaxError: synErrGroupUnclosed,
+ },
+ {
+ pattern: "((a)",
+ syntaxError: synErrGroupUnclosed,
+ },
+ {
+ pattern: ")",
+ syntaxError: synErrGroupNoInitiator,
+ },
+ {
+ pattern: "a)",
+ syntaxError: synErrGroupNoInitiator,
+ },
+ {
+ pattern: ")a",
+ syntaxError: synErrGroupNoInitiator,
+ },
+ {
+ pattern: "))",
+ syntaxError: synErrGroupNoInitiator,
+ },
+ {
+ pattern: "(a))",
+ syntaxError: synErrGroupNoInitiator,
+ },
+ {
+ pattern: "Mulder|Scully",
+ ast: genAltNode(
+ genConcatNode(
+ newSymbolNode('M'),
+ newSymbolNode('u'),
+ newSymbolNode('l'),
+ newSymbolNode('d'),
+ newSymbolNode('e'),
+ newSymbolNode('r'),
+ ),
+ genConcatNode(
+ newSymbolNode('S'),
+ newSymbolNode('c'),
+ newSymbolNode('u'),
+ newSymbolNode('l'),
+ newSymbolNode('l'),
+ newSymbolNode('y'),
+ ),
+ ),
+ },
+ {
+ pattern: "Langly|Frohike|Byers",
+ ast: genAltNode(
+ genConcatNode(
+ newSymbolNode('L'),
+ newSymbolNode('a'),
+ newSymbolNode('n'),
+ newSymbolNode('g'),
+ newSymbolNode('l'),
+ newSymbolNode('y'),
+ ),
+ genConcatNode(
+ newSymbolNode('F'),
+ newSymbolNode('r'),
+ newSymbolNode('o'),
+ newSymbolNode('h'),
+ newSymbolNode('i'),
+ newSymbolNode('k'),
+ newSymbolNode('e'),
+ ),
+ genConcatNode(
+ newSymbolNode('B'),
+ newSymbolNode('y'),
+ newSymbolNode('e'),
+ newSymbolNode('r'),
+ newSymbolNode('s'),
+ ),
+ ),
+ },
+ {
+ pattern: "|",
+ syntaxError: synErrAltLackOfOperand,
+ },
+ {
+ pattern: "||",
+ syntaxError: synErrAltLackOfOperand,
+ },
+ {
+ pattern: "Mulder|",
+ syntaxError: synErrAltLackOfOperand,
+ },
+ {
+ pattern: "|Scully",
+ syntaxError: synErrAltLackOfOperand,
+ },
+ {
+ pattern: "Langly|Frohike|",
+ syntaxError: synErrAltLackOfOperand,
+ },
+ {
+ pattern: "Langly||Byers",
+ syntaxError: synErrAltLackOfOperand,
+ },
+ {
+ pattern: "|Frohike|Byers",
+ syntaxError: synErrAltLackOfOperand,
+ },
+ {
+ pattern: "|Frohike|",
+ syntaxError: synErrAltLackOfOperand,
+ },
+ {
+ pattern: "Fox(|)Mulder",
+ syntaxError: synErrAltLackOfOperand,
+ },
+ {
+ pattern: "(Fox|)Mulder",
+ syntaxError: synErrAltLackOfOperand,
+ },
+ {
+ pattern: "Fox(|Mulder)",
+ syntaxError: synErrAltLackOfOperand,
+ },
+ }
+ for i, tt := range tests {
+ t.Run(fmt.Sprintf("#%v %v", i, tt.pattern), func(t *testing.T) {
+ fragmentTrees := map[spec.LexKindName]CPTree{}
+ for kind, pattern := range tt.fragments { // Parse each fragment pattern into its own tree, keyed by fragment kind.
+ p := NewParser(kind, strings.NewReader(pattern))
+ root, err := p.Parse()
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ fragmentTrees[kind] = root
+ }
+ err := CompleteFragments(fragmentTrees) // NOTE(review): presumably resolves references between fragments — confirm.
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ p := NewParser(spec.LexKindName("test"), strings.NewReader(tt.pattern))
+ root, err := p.Parse()
+ if tt.syntaxError != nil {
+ // For debugging: printCPTree(os.Stdout, root, "", "")
+ if err != ParseErr {
+ t.Fatalf("unexpected error: want: %v, got: %v", ParseErr, err)
+ }
+ _, synErr := p.Error()
+ if synErr != tt.syntaxError {
+ t.Fatalf("unexpected syntax error: want: %v, got: %v", tt.syntaxError, synErr)
+ }
+ if root != nil {
+ t.Fatalf("tree must be nil")
+ }
+ } else {
+ if err != nil {
+ detail, cause := p.Error()
+ t.Fatalf("%v: %v: %v", err, cause, detail)
+ }
+ if root == nil {
+ t.Fatal("tree must be non-nil")
+ }
+
+ complete, err := ApplyFragments(root, fragmentTrees)
+ if err != nil {
+ t.Fatal(err)
+ }
+ if !complete {
+ t.Fatalf("incomplete fragments")
+ }
+
+ // For debugging: printCPTree(os.Stdout, root, "", "")
+ if !tt.skipTestAST {
+ r := root.(*rootNode) // The expected AST is compared against the tree held by the root node.
+ testAST(t, tt.ast, r.tree)
+ }
+ }
+ })
+ }
+}
+
+// TestParse_ContributoryPropertyIsNotExposed checks that every property
+// returned by ucd.ContributoryProperties is rejected by the parser with
+// synErrCharPropUnsupported when written as `\p{<property>=yes}`, and that
+// no tree is produced.
+func TestParse_ContributoryPropertyIsNotExposed(t *testing.T) {
+	for _, cProp := range ucd.ContributoryProperties() {
+		name := fmt.Sprintf("%v", cProp)
+		t.Run(name, func(t *testing.T) {
+			pattern := fmt.Sprintf(`\p{%v=yes}`, cProp)
+			parser := NewParser(spec.LexKindName("test"), strings.NewReader(pattern))
+			tree, parseErr := parser.Parse()
+			if parseErr == nil {
+				t.Fatalf("expected syntax error: got: nil")
+			}
+			if _, cause := parser.Error(); cause != synErrCharPropUnsupported {
+				t.Fatalf("unexpected syntax error: want: %v, got: %v", synErrCharPropUnsupported, cause)
+			}
+			if tree != nil {
+				t.Fatalf("tree is not nil")
+			}
+		})
+	}
+}
+
+func TestExclude(t *testing.T) { // Checks exclude(target, base) for each relative position of the target (t) and base (b) ranges.
+ for _, test := range []struct {
+ caption string
+ target CPTree
+ base CPTree
+ result CPTree
+ }{
+ // t.From > b.From && t.To < b.To: the target lies strictly inside the base and splits it in two.
+
+ // |t.From - b.From| = 1
+ // |b.To - t.To| = 1
+ //
+ // Target (t):        +--+
+ // Base (b):       +--+--+--+
+ // Result (b - t): +--+  +--+
+ {
+ caption: "|t.From - b.From| = 1 && |b.To - t.To| = 1",
+ target: newSymbolNode('1'),
+ base: newRangeSymbolNode('0', '2'),
+ result: newAltNode(
+ newSymbolNode('0'),
+ newSymbolNode('2'),
+ ),
+ },
+ // |t.From - b.From| > 1
+ // |b.To - t.To| > 1
+ //
+ // Target (t):           +--+
+ // Base (b):       +--+--+--+--+--+
+ // Result (b - t): +--+--+  +--+--+
+ {
+ caption: "|t.From - b.From| > 1 && |b.To - t.To| > 1",
+ target: newSymbolNode('2'),
+ base: newRangeSymbolNode('0', '4'),
+ result: newAltNode(
+ newRangeSymbolNode('0', '1'),
+ newRangeSymbolNode('3', '4'),
+ ),
+ },
+
+ // t.From <= b.From && t.To >= b.From && t.To < b.To: the target covers the start of the base; the tail of the base remains.
+
+ // |b.From - t.From| = 0
+ // |t.To - b.From| = 0
+ // |b.To - t.To| = 1
+ //
+ // Target (t):     +--+
+ // Base (b):       +--+--+
+ // Result (b - t):    +--+
+ {
+ caption: "|b.From - t.From| = 0 && |t.To - b.From| = 0 && |b.To - t.To| = 1",
+ target: newSymbolNode('0'),
+ base: newRangeSymbolNode('0', '1'),
+ result: newSymbolNode('1'),
+ },
+ // |b.From - t.From| = 0
+ // |t.To - b.From| = 0
+ // |b.To - t.To| > 1
+ //
+ // Target (t):     +--+
+ // Base (b):       +--+--+--+
+ // Result (b - t):    +--+--+
+ {
+ caption: "|b.From - t.From| = 0 && |t.To - b.From| = 0 && |b.To - t.To| > 1",
+ target: newSymbolNode('0'),
+ base: newRangeSymbolNode('0', '2'),
+ result: newRangeSymbolNode('1', '2'),
+ },
+ // |b.From - t.From| = 0
+ // |t.To - b.From| > 0
+ // |b.To - t.To| = 1
+ //
+ // Target (t):     +--+--+
+ // Base (b):       +--+--+--+
+ // Result (b - t):       +--+
+ {
+ caption: "|b.From - t.From| = 0 && |t.To - b.From| > 0 && |b.To - t.To| = 1",
+ target: newRangeSymbolNode('0', '1'),
+ base: newRangeSymbolNode('0', '2'),
+ result: newSymbolNode('2'),
+ },
+ // |b.From - t.From| = 0
+ // |t.To - b.From| > 0
+ // |b.To - t.To| > 1
+ //
+ // Target (t):     +--+--+
+ // Base (b):       +--+--+--+--+
+ // Result (b - t):       +--+--+
+ {
+ caption: "|b.From - t.From| = 0 && |t.To - b.From| > 0 && |b.To - t.To| > 1",
+ target: newRangeSymbolNode('0', '1'),
+ base: newRangeSymbolNode('0', '3'),
+ result: newRangeSymbolNode('2', '3'),
+ },
+ // |b.From - t.From| > 0
+ // |t.To - b.From| = 0
+ // |b.To - t.To| = 1
+ //
+ // Target (t):     +--+--+
+ // Base (b):          +--+--+
+ // Result (b - t):       +--+
+ {
+ caption: "|b.From - t.From| > 0 && |t.To - b.From| = 0 && |b.To - t.To| = 1",
+ target: newRangeSymbolNode('0', '1'),
+ base: newRangeSymbolNode('1', '2'),
+ result: newSymbolNode('2'),
+ },
+ // |b.From - t.From| > 0
+ // |t.To - b.From| = 0
+ // |b.To - t.To| > 1
+ //
+ // Target (t):     +--+--+
+ // Base (b):          +--+--+--+
+ // Result (b - t):       +--+--+
+ {
+ caption: "|b.From - t.From| > 0 && |t.To - b.From| = 0 && |b.To - t.To| > 1",
+ target: newRangeSymbolNode('0', '1'),
+ base: newRangeSymbolNode('1', '3'),
+ result: newRangeSymbolNode('2', '3'),
+ },
+ // |b.From - t.From| > 0
+ // |t.To - b.From| > 0
+ // |b.To - t.To| = 1
+ //
+ // Target (t):     +--+--+--+
+ // Base (b):          +--+--+--+
+ // Result (b - t):          +--+
+ {
+ caption: "|b.From - t.From| > 0 && |t.To - b.From| > 0 && |b.To - t.To| = 1",
+ target: newRangeSymbolNode('0', '2'),
+ base: newRangeSymbolNode('1', '3'),
+ result: newSymbolNode('3'),
+ },
+ // |b.From - t.From| > 0
+ // |t.To - b.From| > 0
+ // |b.To - t.To| > 1
+ //
+ // Target (t):     +--+--+--+
+ // Base (b):          +--+--+--+--+
+ // Result (b - t):          +--+--+
+ {
+ caption: "|b.From - t.From| > 0 && |t.To - b.From| > 0 && |b.To - t.To| > 1",
+ target: newRangeSymbolNode('0', '2'),
+ base: newRangeSymbolNode('1', '4'),
+ result: newRangeSymbolNode('3', '4'),
+ },
+
+ // t.From > b.From && t.From <= b.To && t.To >= b.To: the target covers the end of the base; the head of the base remains.
+
+ // |t.From - b.From| = 1
+ // |b.To - t.From| = 0
+ // |t.To - b.To| = 0
+ //
+ // Target (t):        +--+
+ // Base (b):       +--+--+
+ // Result (b - t): +--+
+ {
+ caption: "|t.From - b.From| = 1 && |b.To - t.From| = 0 && |t.To - b.To| = 0",
+ target: newSymbolNode('1'),
+ base: newRangeSymbolNode('0', '1'),
+ result: newSymbolNode('0'),
+ },
+ // |t.From - b.From| = 1
+ // |b.To - t.From| = 0
+ // |t.To - b.To| > 0
+ //
+ // Target (t):        +--+--+
+ // Base (b):       +--+--+
+ // Result (b - t): +--+
+ {
+ caption: "|t.From - b.From| = 1 && |b.To - t.From| = 0 && |t.To - b.To| > 0",
+ target: newRangeSymbolNode('1', '2'),
+ base: newRangeSymbolNode('0', '1'),
+ result: newSymbolNode('0'),
+ },
+ // |t.From - b.From| = 1
+ // |b.To - t.From| > 0
+ // |t.To - b.To| = 0
+ //
+ // Target (t):        +--+--+
+ // Base (b):       +--+--+--+
+ // Result (b - t): +--+
+ {
+ caption: "|t.From - b.From| = 1 && |b.To - t.From| > 0 && |t.To - b.To| = 0",
+ target: newRangeSymbolNode('1', '2'),
+ base: newRangeSymbolNode('0', '2'),
+ result: newSymbolNode('0'),
+ },
+ // |t.From - b.From| = 1
+ // |b.To - t.From| > 0
+ // |t.To - b.To| > 0
+ //
+ // Target (t):        +--+--+--+
+ // Base (b):       +--+--+--+
+ // Result (b - t): +--+
+ {
+ caption: "|t.From - b.From| = 1 && |b.To - t.From| > 0 && |t.To - b.To| > 0",
+ target: newRangeSymbolNode('1', '3'),
+ base: newRangeSymbolNode('0', '2'),
+ result: newSymbolNode('0'),
+ },
+ // |t.From - b.From| > 1
+ // |b.To - t.From| = 0
+ // |t.To - b.To| = 0
+ //
+ // Target (t):           +--+
+ // Base (b):       +--+--+--+
+ // Result (b - t): +--+--+
+ {
+ caption: "|t.From - b.From| > 1 && |b.To - t.From| = 0 && |t.To - b.To| = 0",
+ target: newSymbolNode('2'),
+ base: newRangeSymbolNode('0', '2'),
+ result: newRangeSymbolNode('0', '1'),
+ },
+ // |t.From - b.From| > 1
+ // |b.To - t.From| = 0
+ // |t.To - b.To| > 0
+ //
+ // Target (t):           +--+--+
+ // Base (b):       +--+--+--+
+ // Result (b - t): +--+--+
+ {
+ caption: "|t.From - b.From| > 1 && |b.To - t.From| = 0 && |t.To - b.To| > 0",
+ target: newRangeSymbolNode('2', '3'),
+ base: newRangeSymbolNode('0', '2'),
+ result: newRangeSymbolNode('0', '1'),
+ },
+ // |t.From - b.From| > 1
+ // |b.To - t.From| > 0
+ // |t.To - b.To| = 0
+ //
+ // Target (t):           +--+--+
+ // Base (b):       +--+--+--+--+
+ // Result (b - t): +--+--+
+ {
+ caption: "|t.From - b.From| > 1 && |b.To - t.From| > 0 && |t.To - b.To| = 0",
+ target: newRangeSymbolNode('2', '3'),
+ base: newRangeSymbolNode('0', '3'),
+ result: newRangeSymbolNode('0', '1'),
+ },
+ // |t.From - b.From| > 1
+ // |b.To - t.From| > 0
+ // |t.To - b.To| > 0
+ //
+ // Target (t):           +--+--+--+
+ // Base (b):       +--+--+--+--+
+ // Result (b - t): +--+--+
+ {
+ caption: "|t.From - b.From| > 1 && |b.To - t.From| > 0 && |t.To - b.To| > 0",
+ target: newRangeSymbolNode('2', '4'),
+ base: newRangeSymbolNode('0', '3'),
+ result: newRangeSymbolNode('0', '1'),
+ },
+
+ // t.From <= b.From && t.To >= b.To: the target covers the whole base, so nothing remains (nil).
+
+ // |b.From - t.From| = 0
+ // |t.To - b.To| = 0
+ //
+ // Target (t):     +--+
+ // Base (b):       +--+
+ // Result (b - t): N/A
+ {
+ caption: "|b.From - t.From| = 0 && |t.To - b.To| = 0",
+ target: newSymbolNode('0'),
+ base: newSymbolNode('0'),
+ result: nil,
+ },
+ // |b.From - t.From| = 0
+ // |t.To - b.To| > 0
+ //
+ // Target (t):     +--+--+
+ // Base (b):       +--+
+ // Result (b - t): N/A
+ {
+ caption: "|b.From - t.From| = 0 && |t.To - b.To| > 0",
+ target: newRangeSymbolNode('0', '1'),
+ base: newSymbolNode('0'),
+ result: nil,
+ },
+ // |b.From - t.From| > 0
+ // |t.To - b.To| = 0
+ //
+ // Target (t):     +--+--+
+ // Base (b):          +--+
+ // Result (b - t): N/A
+ {
+ caption: "|b.From - t.From| > 0 && |t.To - b.To| = 0",
+ target: newRangeSymbolNode('0', '1'),
+ base: newSymbolNode('1'),
+ result: nil,
+ },
+ // |b.From - t.From| > 0
+ // |t.To - b.To| > 0
+ //
+ // Target (t):     +--+--+--+
+ // Base (b):          +--+
+ // Result (b - t): N/A
+ {
+ caption: "|b.From - t.From| > 0 && |t.To - b.To| > 0",
+ target: newRangeSymbolNode('0', '2'),
+ base: newSymbolNode('1'),
+ result: nil,
+ },
+
+ // Others: the target and the base do not overlap, so the result equals the base.
+
+ // |b.From - t.From| = 1
+ //
+ // Target (t):     +--+
+ // Base (b):          +--+
+ // Result (b - t):    +--+
+ {
+ caption: "|b.From - t.From| = 1",
+ target: newSymbolNode('0'),
+ base: newSymbolNode('1'),
+ result: newSymbolNode('1'),
+ },
+ // |b.From - t.From| > 1
+ //
+ // Target (t):     +--+
+ // Base (b):             +--+
+ // Result (b - t):       +--+
+ {
+ caption: "|b.From - t.From| > 1",
+ target: newSymbolNode('0'),
+ base: newSymbolNode('2'),
+ result: newSymbolNode('2'),
+ },
+ // |t.To - b.To| = 1
+ //
+ // Target (t):        +--+
+ // Base (b):       +--+
+ // Result (b - t): +--+
+ {
+ caption: "|t.To - b.To| = 1",
+ target: newSymbolNode('1'),
+ base: newSymbolNode('0'),
+ result: newSymbolNode('0'),
+ },
+ // |t.To - b.To| > 1
+ //
+ // Target (t):           +--+
+ // Base (b):       +--+
+ // Result (b - t): +--+
+ {
+ caption: "|t.To - b.To| > 1",
+ target: newSymbolNode('2'),
+ base: newSymbolNode('0'),
+ result: newSymbolNode('0'),
+ },
+ } {
+ t.Run(test.caption, func(t *testing.T) {
+ r := exclude(test.target, test.base) // Per the cases above: the base with the target's range removed (nil when nothing remains).
+ testAST(t, test.result, r)
+ })
+ }
+}
+
+func testAST(t *testing.T, expected, actual CPTree) {
+ t.Helper()
+
+ aTy := reflect.TypeOf(actual)
+ eTy := reflect.TypeOf(expected)
+ if eTy != aTy {
+ t.Fatalf("unexpected node: want: %+v, got: %+v", eTy, aTy)
+ }
+
+ if actual == nil {
+ return
+ }
+
+ switch e := expected.(type) {
+ case *symbolNode:
+ a := actual.(*symbolNode)
+ if a.From != e.From || a.To != e.To {
+ t.Fatalf("unexpected node: want: %+v, got: %+v", e, a)
+ }
+ }
+ eLeft, eRight := expected.children()
+ aLeft, aRight := actual.children()
+ testAST(t, eLeft, aLeft)
+ testAST(t, eRight, aRight)
+}
diff --git a/tests/unit/urubu/grammar/symbol.go b/tests/unit/urubu/grammar/symbol.go
new file mode 100644
index 0000000..31c3edd
--- /dev/null
+++ b/tests/unit/urubu/grammar/symbol.go
@@ -0,0 +1,159 @@
+package symbol
+
+import "testing"
+
+// TestSymbol fills a fresh symbol table with one start symbol, three
+// non-terminals and five terminals, then checks through the table's reader
+// that:
+//   - every registered text resolves to a symbol with the expected properties
+//     and that symbol maps back to the same text;
+//   - SymbolEOF and SymbolNil report their expected properties;
+//   - NonTerminalTexts and TerminalTexts return the expected texts in order.
+func TestSymbol(t *testing.T) {
+	tab := NewSymbolTable()
+	w := tab.Writer()
+
+	// Registration results are discarded here; the subtests below observe the
+	// table only through its reader.
+	_, _ = w.RegisterStartSymbol("expr'")
+	for _, name := range []string{"expr", "term", "factor"} {
+		_, _ = w.RegisterNonTerminalSymbol(name)
+	}
+	for _, name := range []string{"id", "add", "mul", "l_paren", "r_paren"} {
+		_, _ = w.RegisterTerminalSymbol(name)
+	}
+
+	// Expected text lists. The leading "" is the nil symbol's entry; the
+	// terminal list additionally expects the EOF name in second position.
+	nonTermTexts := []string{"", "expr'", "expr", "term", "factor"}
+	termTexts := []string{"", symbolNameEOF, "id", "add", "mul", "l_paren", "r_paren"}
+
+	cases := []struct {
+		text          string
+		isNil         bool
+		isStart       bool
+		isEOF         bool
+		isNonTerminal bool
+		isTerminal    bool
+	}{
+		{text: "expr'", isStart: true, isNonTerminal: true},
+		{text: "expr", isNonTerminal: true},
+		{text: "term", isNonTerminal: true},
+		{text: "factor", isNonTerminal: true},
+		{text: "id", isTerminal: true},
+		{text: "add", isTerminal: true},
+		{text: "mul", isTerminal: true},
+		{text: "l_paren", isTerminal: true},
+		{text: "r_paren", isTerminal: true},
+	}
+	for _, tc := range cases {
+		t.Run(tc.text, func(t *testing.T) {
+			reader := tab.Reader()
+
+			// Text -> symbol.
+			sym, found := reader.ToSymbol(tc.text)
+			if !found {
+				t.Fatalf("symbol was not found")
+			}
+			testSymbolProperty(t, sym, tc.isNil, tc.isStart, tc.isEOF, tc.isNonTerminal, tc.isTerminal)
+
+			// Symbol -> text must round-trip to the registered text.
+			got, found := reader.ToText(sym)
+			if !found {
+				t.Fatalf("text was not found")
+			}
+			if got != tc.text {
+				t.Fatalf("unexpected text representation; want: %v, got: %v", tc.text, got)
+			}
+		})
+	}
+
+	t.Run("EOF", func(t *testing.T) {
+		testSymbolProperty(t, SymbolEOF, false, false, true, false, true)
+	})
+
+	t.Run("Nil", func(t *testing.T) {
+		testSymbolProperty(t, SymbolNil, true, false, false, false, false)
+	})
+
+	t.Run("texts of non-terminals", func(t *testing.T) {
+		reader := tab.Reader()
+		got, err := reader.NonTerminalTexts()
+		if err != nil {
+			t.Fatal(err)
+		}
+		// The length check guards the indexed comparison that follows.
+		if len(got) != len(nonTermTexts) {
+			t.Fatalf("unexpected non-terminal count; want: %v (%#v), got: %v (%#v)", len(nonTermTexts), nonTermTexts, len(got), got)
+		}
+		for i := range got {
+			if want := nonTermTexts[i]; got[i] != want {
+				t.Fatalf("unexpected non-terminal; want: %v, got: %v", want, got[i])
+			}
+		}
+	})
+
+	t.Run("texts of terminals", func(t *testing.T) {
+		reader := tab.Reader()
+		got, err := reader.TerminalTexts()
+		if err != nil {
+			t.Fatal(err)
+		}
+		// The length check guards the indexed comparison that follows.
+		if len(got) != len(termTexts) {
+			t.Fatalf("unexpected terminal count; want: %v (%#v), got: %v (%#v)", len(termTexts), termTexts, len(got), got)
+		}
+		for i := range got {
+			if want := termTexts[i]; got[i] != want {
+				t.Fatalf("unexpected terminal; want: %v, got: %v", want, got[i])
+			}
+		}
+	})
+}
+
+// testSymbolProperty checks the five classification predicates of sym against
+// the expected flags, in the order nil, start, EOF, non-terminal, terminal,
+// and aborts the calling test at the first one that differs.
+func testSymbolProperty(t *testing.T, sym Symbol, isNil, isStart, isEOF, isNonTerminal, isTerminal bool) {
+	t.Helper()
+
+	// Each predicate is stored as a method value and only invoked inside the
+	// loop, so a failing check stops before the later predicates are called.
+	checks := []struct {
+		format string
+		want   bool
+		got    func() bool
+	}{
+		{"isNil property is mismatched; want: %v, got: %v", isNil, sym.IsNil},
+		{"isStart property is mismatched; want: %v, got: %v", isStart, sym.IsStart},
+		{"isEOF property is mismatched; want: %v, got: %v", isEOF, sym.isEOF},
+		{"isNonTerminal property is mismatched; want: %v, got: %v", isNonTerminal, sym.isNonTerminal},
+		{"isTerminal property is mismatched; want: %v, got: %v", isTerminal, sym.IsTerminal},
+	}
+	for _, c := range checks {
+		if v := c.got(); v != c.want {
+			t.Fatalf(c.format, c.want, v)
+		}
+	}
+}