author     Ryo Nihei <nihei.dev@gmail.com>  2021-08-01 15:29:18 +0900
committer  Ryo Nihei <nihei.dev@gmail.com>  2021-08-01 15:38:54 +0900
commit     03e3688e3928c88c12107ea734c35281c814e0c0 (patch)
tree       7f57554aec423098c8325238aef72cffdae7651e
parent     Fix CHANGELOG (diff)
download   tre-03e3688e3928c88c12107ea734c35281c814e0c0.tar.gz
           tre-03e3688e3928c88c12107ea734c35281c814e0c0.tar.xz
Add unique kind IDs to tokens
 README.md            |   1
 compiler/compiler.go |  38
 driver/lexer.go      |  17
 driver/lexer_test.go | 347
 spec/spec.go         |  14
 5 files changed, 239 insertions(+), 178 deletions(-)
diff --git a/README.md b/README.md
index 15a35a5..8f9f01c 100644
--- a/README.md
+++ b/README.md
@@ -67,6 +67,7 @@ The JSON format of tokens that `maleeni lex` command prints is as follows:
|-----------|-------------------|----------------------------------------------------------------------------------------|
| mode | integer | `mode` represents a number that corresponds to a `mode_name`. |
| mode_name | string | `mode_name` is a mode name that represents in which mode the lexer detected the token. |
+| kind_id   | integer           | `kind_id` represents an ID of a kind and is unique across all modes.                    |
| kind | integer | `kind` represents a number that corresponds to a `KindName`. |
| kind_name | string | `kind_name` is a kind name that represents what kind the token has. |
| match | array of integers | `match` is a byte sequence matched a pattern of a lexical specification. |
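
For illustration, with this change an `escape_sequence` token detected in the `string` mode (as in the lexer tests below) would print roughly as follows; the values are hypothetical and fields other than those in the table are omitted:

    {
        "mode": 2,
        "mode_name": "string",
        "kind_id": 3,
        "kind": 1,
        "kind_name": "escape_sequence",
        "match": [92, 110]
    }

Here `kind` (1) is the kind's position within the `string` mode, while `kind_id` (3) identifies the same kind regardless of mode.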
diff --git a/compiler/compiler.go b/compiler/compiler.go
index f382d16..5d3e52f 100644
--- a/compiler/compiler.go
+++ b/compiler/compiler.go
@@ -69,9 +69,47 @@ func Compile(lexspec *spec.LexSpec, opts ...CompilerOption) (*spec.CompiledLexSp
modeSpecs = append(modeSpecs, modeSpec)
}
+ var kindNames []spec.LexKind
+ var name2ID map[spec.LexKind]spec.LexKindID
+ {
+ name2ID = map[spec.LexKind]spec.LexKindID{}
+ id := spec.LexKindIDMin
+ for _, modeSpec := range modeSpecs[1:] {
+ for _, name := range modeSpec.Kinds[1:] {
+ if _, ok := name2ID[name]; ok {
+ continue
+ }
+ name2ID[name] = id
+ id++
+ }
+ }
+
+ kindNames = make([]spec.LexKind, len(name2ID)+1)
+ for name, id := range name2ID {
+ kindNames[id] = name
+ }
+ }
+
+ var kindIDs [][]spec.LexKindID
+ {
+ kindIDs = make([][]spec.LexKindID, len(modeSpecs))
+ for i, modeSpec := range modeSpecs[1:] {
+ ids := make([]spec.LexKindID, len(modeSpec.Kinds))
+ for modeID, name := range modeSpec.Kinds {
+ if modeID == 0 {
+ continue
+ }
+ ids[modeID] = name2ID[name]
+ }
+ kindIDs[i+1] = ids
+ }
+ }
+
return &spec.CompiledLexSpec{
InitialMode: spec.LexModeNumDefault,
Modes: modes,
+ Kinds: kindNames,
+ KindIDs: kindIDs,
CompressionLevel: config.compLv,
Specs: modeSpecs,
}, nil
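
The mapping above assigns each distinct kind name exactly one global ID, in mode order, so a kind that appears in several modes (such as `white_space` in the multi-mode tests below) shares a single ID. A minimal, runnable sketch of the numbering scheme, using hypothetical kind names that mirror the string-lexing test case:

    package main

    import "fmt"

    func main() {
        // Kinds per mode; index 0 of each slice and mode 0 itself are
        // reserved for the nil entries, as in the compiler.
        modes := [][]string{
            nil, // nil mode
            {"", "white_space", "string_open"},                       // default
            {"", "escape_sequence", "char_sequence", "string_close"}, // string
        }
        name2ID := map[string]int{}
        id := 1 // corresponds to spec.LexKindIDMin
        for _, kinds := range modes[1:] {
            for _, name := range kinds[1:] {
                if _, ok := name2ID[name]; ok {
                    continue // a name seen in an earlier mode keeps its first ID
                }
                name2ID[name] = id
                id++
            }
        }
        fmt.Println(name2ID)
        // map[char_sequence:4 escape_sequence:3 string_close:5 string_open:2 white_space:1]
    }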
diff --git a/driver/lexer.go b/driver/lexer.go
index 1e54fa6..7ad2dd0 100644
--- a/driver/lexer.go
+++ b/driver/lexer.go
@@ -62,6 +62,9 @@ type Token struct {
// `ModeName` is a mode name that represents in which mode the lexer detected the token.
ModeName spec.LexModeName
+ // `KindID` is a kind ID that is unique across all modes.
+ KindID int
+
// `Kind` represents a number that corresponds to a `KindName`.
Kind int
@@ -78,11 +81,12 @@ type Token struct {
match byteSequence
}
-func newToken(mode spec.LexModeNum, modeName spec.LexModeName, kind int, kindName string, match byteSequence) *Token {
+func newToken(mode spec.LexModeNum, modeName spec.LexModeName, kindID int, modeKindID int, kindName string, match byteSequence) *Token {
return &Token{
Mode: mode,
ModeName: modeName,
- Kind: kind,
+ KindID: kindID,
+ Kind: modeKindID,
KindName: kindName,
match: match,
}
@@ -131,6 +135,7 @@ func (t *Token) MarshalJSON() ([]byte, error) {
return json.Marshal(struct {
Mode int `json:"mode"`
ModeName string `json:"mode_name"`
+ KindID int `json:"kind_id"`
Kind int `json:"kind"`
KindName string `json:"kind_name"`
Match byteSequence `json:"match"`
@@ -140,6 +145,7 @@ func (t *Token) MarshalJSON() ([]byte, error) {
}{
Mode: t.Mode.Int(),
ModeName: t.ModeName.String(),
+ KindID: t.KindID,
Kind: t.Kind,
KindName: t.KindName,
Match: t.match,
@@ -321,9 +327,10 @@ func (l *Lexer) next() (*Token, error) {
return newInvalidToken(mode, modeName, newByteSequence(buf)), nil
}
state = nextState
- id := spec.DFA.AcceptingStates[state]
- if id != 0 {
- tok = newToken(mode, modeName, id, spec.Kinds[id].String(), newByteSequence(buf))
+ modeKindID := spec.DFA.AcceptingStates[state]
+ if modeKindID != 0 {
+ kindID := l.clspec.KindIDs[mode][modeKindID]
+ tok = newToken(mode, modeName, kindID.Int(), modeKindID, spec.Kinds[modeKindID].String(), newByteSequence(buf))
unfixedBufLen = 0
}
}
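
At match time the DFA still yields only the mode-local kind ID (`modeKindID`); the global ID is a plain two-level table lookup, which is what `l.clspec.KindIDs[mode][modeKindID]` computes in `next` above. Continuing the hypothetical two-mode layout from the compiler sketch:

    package main

    import "fmt"

    func main() {
        // kindIDs[mode][modeKindID] -> global kind ID (hypothetical values,
        // matching the two-mode layout in the compiler sketch above).
        kindIDs := [][]int{
            nil,          // nil mode
            {0, 1, 2},    // default: white_space, string_open
            {0, 3, 4, 5}, // string: escape_sequence, char_sequence, string_close
        }
        mode, modeKindID := 2, 1               // "string" mode, escape_sequence
        fmt.Println(kindIDs[mode][modeKindID]) // 3
    }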
diff --git a/driver/lexer_test.go b/driver/lexer_test.go
index 4dfed99..79ee12e 100644
--- a/driver/lexer_test.go
+++ b/driver/lexer_test.go
@@ -42,8 +42,8 @@ func newLexEntryFragment(kind string, pattern string) *spec.LexEntry {
}
}
-func newTokenDefault(id int, kind string, match byteSequence) *Token {
- return newToken(spec.LexModeNumDefault, spec.LexModeNameDefault, id, kind, match)
+func newTokenDefault(kindID int, modeKindID int, kindName string, match byteSequence) *Token {
+ return newToken(spec.LexModeNumDefault, spec.LexModeNameDefault, kindID, modeKindID, kindName, match)
}
func newEOFTokenDefault() *Token {
@@ -67,17 +67,17 @@ func TestLexer_Next(t *testing.T) {
},
src: "abb aabb aaabb babb bbabb abbbabb",
tokens: []*Token{
- newTokenDefault(1, "t1", newByteSequence([]byte("abb"))),
- newTokenDefault(2, "t2", newByteSequence([]byte(" "))),
- newTokenDefault(1, "t1", newByteSequence([]byte("aabb"))),
- newTokenDefault(2, "t2", newByteSequence([]byte(" "))),
- newTokenDefault(1, "t1", newByteSequence([]byte("aaabb"))),
- newTokenDefault(2, "t2", newByteSequence([]byte(" "))),
- newTokenDefault(1, "t1", newByteSequence([]byte("babb"))),
- newTokenDefault(2, "t2", newByteSequence([]byte(" "))),
- newTokenDefault(1, "t1", newByteSequence([]byte("bbabb"))),
- newTokenDefault(2, "t2", newByteSequence([]byte(" "))),
- newTokenDefault(1, "t1", newByteSequence([]byte("abbbabb"))),
+ newTokenDefault(1, 1, "t1", newByteSequence([]byte("abb"))),
+ newTokenDefault(2, 2, "t2", newByteSequence([]byte(" "))),
+ newTokenDefault(1, 1, "t1", newByteSequence([]byte("aabb"))),
+ newTokenDefault(2, 2, "t2", newByteSequence([]byte(" "))),
+ newTokenDefault(1, 1, "t1", newByteSequence([]byte("aaabb"))),
+ newTokenDefault(2, 2, "t2", newByteSequence([]byte(" "))),
+ newTokenDefault(1, 1, "t1", newByteSequence([]byte("babb"))),
+ newTokenDefault(2, 2, "t2", newByteSequence([]byte(" "))),
+ newTokenDefault(1, 1, "t1", newByteSequence([]byte("bbabb"))),
+ newTokenDefault(2, 2, "t2", newByteSequence([]byte(" "))),
+ newTokenDefault(1, 1, "t1", newByteSequence([]byte("abbbabb"))),
newEOFTokenDefault(),
},
},
@@ -91,21 +91,21 @@ func TestLexer_Next(t *testing.T) {
},
src: "ba baaa a aaa abcd abcdcdcd cd cdcdcd",
tokens: []*Token{
- newTokenDefault(1, "t1", newByteSequence([]byte("ba"))),
- newTokenDefault(3, "t3", newByteSequence([]byte(" "))),
- newTokenDefault(1, "t1", newByteSequence([]byte("baaa"))),
- newTokenDefault(3, "t3", newByteSequence([]byte(" "))),
- newTokenDefault(1, "t1", newByteSequence([]byte("a"))),
- newTokenDefault(3, "t3", newByteSequence([]byte(" "))),
- newTokenDefault(1, "t1", newByteSequence([]byte("aaa"))),
- newTokenDefault(3, "t3", newByteSequence([]byte(" "))),
- newTokenDefault(2, "t2", newByteSequence([]byte("abcd"))),
- newTokenDefault(3, "t3", newByteSequence([]byte(" "))),
- newTokenDefault(2, "t2", newByteSequence([]byte("abcdcdcd"))),
- newTokenDefault(3, "t3", newByteSequence([]byte(" "))),
- newTokenDefault(2, "t2", newByteSequence([]byte("cd"))),
- newTokenDefault(3, "t3", newByteSequence([]byte(" "))),
- newTokenDefault(2, "t2", newByteSequence([]byte("cdcdcd"))),
+ newTokenDefault(1, 1, "t1", newByteSequence([]byte("ba"))),
+ newTokenDefault(3, 3, "t3", newByteSequence([]byte(" "))),
+ newTokenDefault(1, 1, "t1", newByteSequence([]byte("baaa"))),
+ newTokenDefault(3, 3, "t3", newByteSequence([]byte(" "))),
+ newTokenDefault(1, 1, "t1", newByteSequence([]byte("a"))),
+ newTokenDefault(3, 3, "t3", newByteSequence([]byte(" "))),
+ newTokenDefault(1, 1, "t1", newByteSequence([]byte("aaa"))),
+ newTokenDefault(3, 3, "t3", newByteSequence([]byte(" "))),
+ newTokenDefault(2, 2, "t2", newByteSequence([]byte("abcd"))),
+ newTokenDefault(3, 3, "t3", newByteSequence([]byte(" "))),
+ newTokenDefault(2, 2, "t2", newByteSequence([]byte("abcdcdcd"))),
+ newTokenDefault(3, 3, "t3", newByteSequence([]byte(" "))),
+ newTokenDefault(2, 2, "t2", newByteSequence([]byte("cd"))),
+ newTokenDefault(3, 3, "t3", newByteSequence([]byte(" "))),
+ newTokenDefault(2, 2, "t2", newByteSequence([]byte("cdcdcd"))),
newEOFTokenDefault(),
},
},
@@ -134,22 +134,22 @@ func TestLexer_Next(t *testing.T) {
0xf4, 0x8f, 0xbf, 0xbf,
}),
tokens: []*Token{
- newTokenDefault(1, "t1", newByteSequence([]byte{0x00})),
- newTokenDefault(1, "t1", newByteSequence([]byte{0x7f})),
- newTokenDefault(1, "t1", newByteSequence([]byte{0xc2, 0x80})),
- newTokenDefault(1, "t1", newByteSequence([]byte{0xdf, 0xbf})),
- newTokenDefault(1, "t1", newByteSequence([]byte{0xe1, 0x80, 0x80})),
- newTokenDefault(1, "t1", newByteSequence([]byte{0xec, 0xbf, 0xbf})),
- newTokenDefault(1, "t1", newByteSequence([]byte{0xed, 0x80, 0x80})),
- newTokenDefault(1, "t1", newByteSequence([]byte{0xed, 0x9f, 0xbf})),
- newTokenDefault(1, "t1", newByteSequence([]byte{0xee, 0x80, 0x80})),
- newTokenDefault(1, "t1", newByteSequence([]byte{0xef, 0xbf, 0xbf})),
- newTokenDefault(1, "t1", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})),
- newTokenDefault(1, "t1", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbf})),
- newTokenDefault(1, "t1", newByteSequence([]byte{0xf1, 0x80, 0x80, 0x80})),
- newTokenDefault(1, "t1", newByteSequence([]byte{0xf3, 0xbf, 0xbf, 0xbf})),
- newTokenDefault(1, "t1", newByteSequence([]byte{0xf4, 0x80, 0x80, 0x80})),
- newTokenDefault(1, "t1", newByteSequence([]byte{0xf4, 0x8f, 0xbf, 0xbf})),
+ newTokenDefault(1, 1, "t1", newByteSequence([]byte{0x00})),
+ newTokenDefault(1, 1, "t1", newByteSequence([]byte{0x7f})),
+ newTokenDefault(1, 1, "t1", newByteSequence([]byte{0xc2, 0x80})),
+ newTokenDefault(1, 1, "t1", newByteSequence([]byte{0xdf, 0xbf})),
+ newTokenDefault(1, 1, "t1", newByteSequence([]byte{0xe1, 0x80, 0x80})),
+ newTokenDefault(1, 1, "t1", newByteSequence([]byte{0xec, 0xbf, 0xbf})),
+ newTokenDefault(1, 1, "t1", newByteSequence([]byte{0xed, 0x80, 0x80})),
+ newTokenDefault(1, 1, "t1", newByteSequence([]byte{0xed, 0x9f, 0xbf})),
+ newTokenDefault(1, 1, "t1", newByteSequence([]byte{0xee, 0x80, 0x80})),
+ newTokenDefault(1, 1, "t1", newByteSequence([]byte{0xef, 0xbf, 0xbf})),
+ newTokenDefault(1, 1, "t1", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})),
+ newTokenDefault(1, 1, "t1", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbf})),
+ newTokenDefault(1, 1, "t1", newByteSequence([]byte{0xf1, 0x80, 0x80, 0x80})),
+ newTokenDefault(1, 1, "t1", newByteSequence([]byte{0xf3, 0xbf, 0xbf, 0xbf})),
+ newTokenDefault(1, 1, "t1", newByteSequence([]byte{0xf4, 0x80, 0x80, 0x80})),
+ newTokenDefault(1, 1, "t1", newByteSequence([]byte{0xf4, 0x8f, 0xbf, 0xbf})),
newEOFTokenDefault(),
},
},
@@ -161,17 +161,17 @@ func TestLexer_Next(t *testing.T) {
},
src: "ab.*+?|()[]",
tokens: []*Token{
- newTokenDefault(1, "t1", newByteSequence([]byte("a"))),
- newTokenDefault(1, "t1", newByteSequence([]byte("b"))),
- newTokenDefault(1, "t1", newByteSequence([]byte("."))),
- newTokenDefault(1, "t1", newByteSequence([]byte("*"))),
- newTokenDefault(1, "t1", newByteSequence([]byte("+"))),
- newTokenDefault(1, "t1", newByteSequence([]byte("?"))),
- newTokenDefault(1, "t1", newByteSequence([]byte("|"))),
- newTokenDefault(1, "t1", newByteSequence([]byte("("))),
- newTokenDefault(1, "t1", newByteSequence([]byte(")"))),
- newTokenDefault(1, "t1", newByteSequence([]byte("["))),
- newTokenDefault(1, "t1", newByteSequence([]byte("]"))),
+ newTokenDefault(1, 1, "t1", newByteSequence([]byte("a"))),
+ newTokenDefault(1, 1, "t1", newByteSequence([]byte("b"))),
+ newTokenDefault(1, 1, "t1", newByteSequence([]byte("."))),
+ newTokenDefault(1, 1, "t1", newByteSequence([]byte("*"))),
+ newTokenDefault(1, 1, "t1", newByteSequence([]byte("+"))),
+ newTokenDefault(1, 1, "t1", newByteSequence([]byte("?"))),
+ newTokenDefault(1, 1, "t1", newByteSequence([]byte("|"))),
+ newTokenDefault(1, 1, "t1", newByteSequence([]byte("("))),
+ newTokenDefault(1, 1, "t1", newByteSequence([]byte(")"))),
+ newTokenDefault(1, 1, "t1", newByteSequence([]byte("["))),
+ newTokenDefault(1, 1, "t1", newByteSequence([]byte("]"))),
newEOFTokenDefault(),
},
},
@@ -194,10 +194,10 @@ func TestLexer_Next(t *testing.T) {
0x7f,
}),
tokens: []*Token{
- newTokenDefault(1, "1ByteChar", newByteSequence([]byte{0x01})),
- newTokenDefault(1, "1ByteChar", newByteSequence([]byte{0x02})),
- newTokenDefault(1, "1ByteChar", newByteSequence([]byte{0x7e})),
- newTokenDefault(1, "1ByteChar", newByteSequence([]byte{0x7f})),
+ newTokenDefault(1, 1, "1ByteChar", newByteSequence([]byte{0x01})),
+ newTokenDefault(1, 1, "1ByteChar", newByteSequence([]byte{0x02})),
+ newTokenDefault(1, 1, "1ByteChar", newByteSequence([]byte{0x7e})),
+ newTokenDefault(1, 1, "1ByteChar", newByteSequence([]byte{0x7f})),
newEOFTokenDefault(),
},
},
@@ -215,10 +215,10 @@ func TestLexer_Next(t *testing.T) {
0xdf, 0xbf,
}),
tokens: []*Token{
- newTokenDefault(1, "2ByteChar", newByteSequence([]byte{0xc2, 0x80})),
- newTokenDefault(1, "2ByteChar", newByteSequence([]byte{0xc2, 0x81})),
- newTokenDefault(1, "2ByteChar", newByteSequence([]byte{0xdf, 0xbe})),
- newTokenDefault(1, "2ByteChar", newByteSequence([]byte{0xdf, 0xbf})),
+ newTokenDefault(1, 1, "2ByteChar", newByteSequence([]byte{0xc2, 0x80})),
+ newTokenDefault(1, 1, "2ByteChar", newByteSequence([]byte{0xc2, 0x81})),
+ newTokenDefault(1, 1, "2ByteChar", newByteSequence([]byte{0xdf, 0xbe})),
+ newTokenDefault(1, 1, "2ByteChar", newByteSequence([]byte{0xdf, 0xbf})),
newEOFTokenDefault(),
},
},
@@ -233,7 +233,7 @@ func TestLexer_Next(t *testing.T) {
0xe0, 0xa0, 0x80,
}),
tokens: []*Token{
- newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x80})),
+ newTokenDefault(1, 1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x80})),
newEOFTokenDefault(),
},
},
@@ -251,10 +251,10 @@ func TestLexer_Next(t *testing.T) {
0xe0, 0xa0, 0xbf,
}),
tokens: []*Token{
- newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x80})),
- newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x81})),
- newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0xbe})),
- newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0xbf})),
+ newTokenDefault(1, 1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x80})),
+ newTokenDefault(1, 1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x81})),
+ newTokenDefault(1, 1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0xbe})),
+ newTokenDefault(1, 1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0xbf})),
newEOFTokenDefault(),
},
},
@@ -272,10 +272,10 @@ func TestLexer_Next(t *testing.T) {
0xe0, 0xbf, 0xbf,
}),
tokens: []*Token{
- newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x80})),
- newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x81})),
- newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xbf, 0xbe})),
- newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xbf, 0xbf})),
+ newTokenDefault(1, 1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x80})),
+ newTokenDefault(1, 1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x81})),
+ newTokenDefault(1, 1, "3ByteChar", newByteSequence([]byte{0xe0, 0xbf, 0xbe})),
+ newTokenDefault(1, 1, "3ByteChar", newByteSequence([]byte{0xe0, 0xbf, 0xbf})),
newEOFTokenDefault(),
},
},
@@ -305,22 +305,22 @@ func TestLexer_Next(t *testing.T) {
0xef, 0xbf, 0xbf,
}),
tokens: []*Token{
- newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x80})),
- newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x81})),
- newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xbf, 0xbe})),
- newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xbf, 0xbf})),
- newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe1, 0x80, 0x80})),
- newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe1, 0x80, 0x81})),
- newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xec, 0xbf, 0xbe})),
- newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xec, 0xbf, 0xbf})),
- newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xed, 0x80, 0x80})),
- newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xed, 0x80, 0x81})),
- newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xed, 0x9f, 0xbe})),
- newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xed, 0x9f, 0xbf})),
- newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xee, 0x80, 0x80})),
- newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xee, 0x80, 0x81})),
- newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xef, 0xbf, 0xbe})),
- newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xef, 0xbf, 0xbf})),
+ newTokenDefault(1, 1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x80})),
+ newTokenDefault(1, 1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x81})),
+ newTokenDefault(1, 1, "3ByteChar", newByteSequence([]byte{0xe0, 0xbf, 0xbe})),
+ newTokenDefault(1, 1, "3ByteChar", newByteSequence([]byte{0xe0, 0xbf, 0xbf})),
+ newTokenDefault(1, 1, "3ByteChar", newByteSequence([]byte{0xe1, 0x80, 0x80})),
+ newTokenDefault(1, 1, "3ByteChar", newByteSequence([]byte{0xe1, 0x80, 0x81})),
+ newTokenDefault(1, 1, "3ByteChar", newByteSequence([]byte{0xec, 0xbf, 0xbe})),
+ newTokenDefault(1, 1, "3ByteChar", newByteSequence([]byte{0xec, 0xbf, 0xbf})),
+ newTokenDefault(1, 1, "3ByteChar", newByteSequence([]byte{0xed, 0x80, 0x80})),
+ newTokenDefault(1, 1, "3ByteChar", newByteSequence([]byte{0xed, 0x80, 0x81})),
+ newTokenDefault(1, 1, "3ByteChar", newByteSequence([]byte{0xed, 0x9f, 0xbe})),
+ newTokenDefault(1, 1, "3ByteChar", newByteSequence([]byte{0xed, 0x9f, 0xbf})),
+ newTokenDefault(1, 1, "3ByteChar", newByteSequence([]byte{0xee, 0x80, 0x80})),
+ newTokenDefault(1, 1, "3ByteChar", newByteSequence([]byte{0xee, 0x80, 0x81})),
+ newTokenDefault(1, 1, "3ByteChar", newByteSequence([]byte{0xef, 0xbf, 0xbe})),
+ newTokenDefault(1, 1, "3ByteChar", newByteSequence([]byte{0xef, 0xbf, 0xbf})),
newEOFTokenDefault(),
},
},
@@ -335,7 +335,7 @@ func TestLexer_Next(t *testing.T) {
0xf0, 0x90, 0x80, 0x80,
}),
tokens: []*Token{
- newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})),
+ newTokenDefault(1, 1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})),
newEOFTokenDefault(),
},
},
@@ -353,10 +353,10 @@ func TestLexer_Next(t *testing.T) {
0xf0, 0x90, 0x80, 0xbf,
}),
tokens: []*Token{
- newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})),
- newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x81})),
- newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0xbe})),
- newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0xbf})),
+ newTokenDefault(1, 1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})),
+ newTokenDefault(1, 1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x81})),
+ newTokenDefault(1, 1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0xbe})),
+ newTokenDefault(1, 1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0xbf})),
newEOFTokenDefault(),
},
},
@@ -374,10 +374,10 @@ func TestLexer_Next(t *testing.T) {
0xf0, 0x90, 0xbf, 0xbf,
}),
tokens: []*Token{
- newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})),
- newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x81})),
- newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0xbf, 0xbe})),
- newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0xbf, 0xbf})),
+ newTokenDefault(1, 1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})),
+ newTokenDefault(1, 1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x81})),
+ newTokenDefault(1, 1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0xbf, 0xbe})),
+ newTokenDefault(1, 1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0xbf, 0xbf})),
newEOFTokenDefault(),
},
},
@@ -395,10 +395,10 @@ func TestLexer_Next(t *testing.T) {
0xf0, 0xbf, 0xbf, 0xbf,
}),
tokens: []*Token{
- newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})),
- newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x81})),
- newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbe})),
- newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbf})),
+ newTokenDefault(1, 1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})),
+ newTokenDefault(1, 1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x81})),
+ newTokenDefault(1, 1, "4ByteChar", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbe})),
+ newTokenDefault(1, 1, "4ByteChar", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbf})),
newEOFTokenDefault(),
},
},
@@ -424,18 +424,18 @@ func TestLexer_Next(t *testing.T) {
0xf4, 0x8f, 0xbf, 0xbf,
}),
tokens: []*Token{
- newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})),
- newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x81})),
- newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbe})),
- newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbf})),
- newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf1, 0x80, 0x80, 0x80})),
- newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf1, 0x80, 0x80, 0x81})),
- newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf3, 0xbf, 0xbf, 0xbe})),
- newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf3, 0xbf, 0xbf, 0xbf})),
- newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf4, 0x80, 0x80, 0x80})),
- newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf4, 0x80, 0x80, 0x81})),
- newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf4, 0x8f, 0xbf, 0xbe})),
- newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf4, 0x8f, 0xbf, 0xbf})),
+ newTokenDefault(1, 1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})),
+ newTokenDefault(1, 1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x81})),
+ newTokenDefault(1, 1, "4ByteChar", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbe})),
+ newTokenDefault(1, 1, "4ByteChar", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbf})),
+ newTokenDefault(1, 1, "4ByteChar", newByteSequence([]byte{0xf1, 0x80, 0x80, 0x80})),
+ newTokenDefault(1, 1, "4ByteChar", newByteSequence([]byte{0xf1, 0x80, 0x80, 0x81})),
+ newTokenDefault(1, 1, "4ByteChar", newByteSequence([]byte{0xf3, 0xbf, 0xbf, 0xbe})),
+ newTokenDefault(1, 1, "4ByteChar", newByteSequence([]byte{0xf3, 0xbf, 0xbf, 0xbf})),
+ newTokenDefault(1, 1, "4ByteChar", newByteSequence([]byte{0xf4, 0x80, 0x80, 0x80})),
+ newTokenDefault(1, 1, "4ByteChar", newByteSequence([]byte{0xf4, 0x80, 0x80, 0x81})),
+ newTokenDefault(1, 1, "4ByteChar", newByteSequence([]byte{0xf4, 0x8f, 0xbf, 0xbe})),
+ newTokenDefault(1, 1, "4ByteChar", newByteSequence([]byte{0xf4, 0x8f, 0xbf, 0xbf})),
newEOFTokenDefault(),
},
},
@@ -447,7 +447,7 @@ func TestLexer_Next(t *testing.T) {
},
src: "foo9",
tokens: []*Token{
- newTokenDefault(1, "NonNumber", newByteSequence([]byte("foo9"))),
+ newTokenDefault(1, 1, "NonNumber", newByteSequence([]byte("foo9"))),
newEOFTokenDefault(),
},
},
@@ -462,10 +462,10 @@ func TestLexer_Next(t *testing.T) {
},
src: "nνに😸",
tokens: []*Token{
- newTokenDefault(1, "1ByteChar", newByteSequence([]byte{0x6E})),
- newTokenDefault(2, "2ByteChar", newByteSequence([]byte{0xCE, 0xBD})),
- newTokenDefault(3, "3ByteChar", newByteSequence([]byte{0xE3, 0x81, 0xAB})),
- newTokenDefault(4, "4ByteChar", newByteSequence([]byte{0xF0, 0x9F, 0x98, 0xB8})),
+ newTokenDefault(1, 1, "1ByteChar", newByteSequence([]byte{0x6E})),
+ newTokenDefault(2, 2, "2ByteChar", newByteSequence([]byte{0xCE, 0xBD})),
+ newTokenDefault(3, 3, "3ByteChar", newByteSequence([]byte{0xE3, 0x81, 0xAB})),
+ newTokenDefault(4, 4, "4ByteChar", newByteSequence([]byte{0xF0, 0x9F, 0x98, 0xB8})),
newEOFTokenDefault(),
},
},
@@ -477,10 +477,10 @@ func TestLexer_Next(t *testing.T) {
},
src: "nνに😸",
tokens: []*Token{
- newTokenDefault(1, "codePointsAlt", newByteSequence([]byte{0x6E})),
- newTokenDefault(1, "codePointsAlt", newByteSequence([]byte{0xCE, 0xBD})),
- newTokenDefault(1, "codePointsAlt", newByteSequence([]byte{0xE3, 0x81, 0xAB})),
- newTokenDefault(1, "codePointsAlt", newByteSequence([]byte{0xF0, 0x9F, 0x98, 0xB8})),
+ newTokenDefault(1, 1, "codePointsAlt", newByteSequence([]byte{0x6E})),
+ newTokenDefault(1, 1, "codePointsAlt", newByteSequence([]byte{0xCE, 0xBD})),
+ newTokenDefault(1, 1, "codePointsAlt", newByteSequence([]byte{0xE3, 0x81, 0xAB})),
+ newTokenDefault(1, 1, "codePointsAlt", newByteSequence([]byte{0xF0, 0x9F, 0x98, 0xB8})),
newEOFTokenDefault(),
},
},
@@ -494,8 +494,8 @@ func TestLexer_Next(t *testing.T) {
},
src: "abcdefdefabcdef",
tokens: []*Token{
- newTokenDefault(1, "t1", newByteSequence([]byte("abcdefdef"))),
- newTokenDefault(1, "t1", newByteSequence([]byte("abcdef"))),
+ newTokenDefault(1, 1, "t1", newByteSequence([]byte("abcdefdef"))),
+ newTokenDefault(1, 1, "t1", newByteSequence([]byte("abcdef"))),
newEOFTokenDefault(),
},
},
@@ -509,7 +509,7 @@ func TestLexer_Next(t *testing.T) {
},
src: "abcdefdefabc",
tokens: []*Token{
- newTokenDefault(1, "t1", newByteSequence([]byte("abcdefdefabc"))),
+ newTokenDefault(1, 1, "t1", newByteSequence([]byte("abcdefdefabc"))),
newEOFTokenDefault(),
},
},
@@ -524,7 +524,7 @@ func TestLexer_Next(t *testing.T) {
},
src: "abcdefdefabc",
tokens: []*Token{
- newTokenDefault(1, "t1", newByteSequence([]byte("abcdefdefabc"))),
+ newTokenDefault(1, 1, "t1", newByteSequence([]byte("abcdefdefabc"))),
newEOFTokenDefault(),
},
},
@@ -540,16 +540,16 @@ func TestLexer_Next(t *testing.T) {
},
src: `"" "Hello world.\n\"Hello world.\""`,
tokens: []*Token{
- newToken(1, "default", 2, "string_open", newByteSequence([]byte(`"`))),
- newToken(2, "string", 3, "string_close", newByteSequence([]byte(`"`))),
- newToken(1, "default", 1, "white_space", newByteSequence([]byte(` `))),
- newToken(1, "default", 2, "string_open", newByteSequence([]byte(`"`))),
- newToken(2, "string", 2, "char_sequence", newByteSequence([]byte(`Hello world.`))),
- newToken(2, "string", 1, "escape_sequence", newByteSequence([]byte(`\n`))),
- newToken(2, "string", 1, "escape_sequence", newByteSequence([]byte(`\"`))),
- newToken(2, "string", 2, "char_sequence", newByteSequence([]byte(`Hello world.`))),
- newToken(2, "string", 1, "escape_sequence", newByteSequence([]byte(`\"`))),
- newToken(2, "string", 3, "string_close", newByteSequence([]byte(`"`))),
+ newToken(1, "default", 2, 2, "string_open", newByteSequence([]byte(`"`))),
+ newToken(2, "string", 5, 3, "string_close", newByteSequence([]byte(`"`))),
+ newToken(1, "default", 1, 1, "white_space", newByteSequence([]byte(` `))),
+ newToken(1, "default", 2, 2, "string_open", newByteSequence([]byte(`"`))),
+ newToken(2, "string", 4, 2, "char_sequence", newByteSequence([]byte(`Hello world.`))),
+ newToken(2, "string", 3, 1, "escape_sequence", newByteSequence([]byte(`\n`))),
+ newToken(2, "string", 3, 1, "escape_sequence", newByteSequence([]byte(`\"`))),
+ newToken(2, "string", 4, 2, "char_sequence", newByteSequence([]byte(`Hello world.`))),
+ newToken(2, "string", 3, 1, "escape_sequence", newByteSequence([]byte(`\"`))),
+ newToken(2, "string", 5, 3, "string_close", newByteSequence([]byte(`"`))),
newEOFTokenDefault(),
},
},
@@ -566,15 +566,15 @@ func TestLexer_Next(t *testing.T) {
},
src: ` a b < < `,
tokens: []*Token{
- newToken(1, "default", 1, "white_space", newByteSequence([]byte(` `))),
- newToken(1, "default", 2, "char_a", newByteSequence([]byte(`a`))),
- newToken(2, "state_a", 1, "white_space", newByteSequence([]byte(` `))),
- newToken(2, "state_a", 2, "char_b", newByteSequence([]byte(`b`))),
- newToken(3, "state_b", 1, "white_space", newByteSequence([]byte(` `))),
- newToken(3, "state_b", 2, "back_from_b", newByteSequence([]byte(`<`))),
- newToken(2, "state_a", 1, "white_space", newByteSequence([]byte(` `))),
- newToken(2, "state_a", 3, "back_from_a", newByteSequence([]byte(`<`))),
- newToken(1, "default", 1, "white_space", newByteSequence([]byte(` `))),
+ newToken(1, "default", 1, 1, "white_space", newByteSequence([]byte(` `))),
+ newToken(1, "default", 2, 2, "char_a", newByteSequence([]byte(`a`))),
+ newToken(2, "state_a", 1, 1, "white_space", newByteSequence([]byte(` `))),
+ newToken(2, "state_a", 3, 2, "char_b", newByteSequence([]byte(`b`))),
+ newToken(3, "state_b", 1, 1, "white_space", newByteSequence([]byte(` `))),
+ newToken(3, "state_b", 5, 2, "back_from_b", newByteSequence([]byte(`<`))),
+ newToken(2, "state_a", 1, 1, "white_space", newByteSequence([]byte(` `))),
+ newToken(2, "state_a", 4, 3, "back_from_a", newByteSequence([]byte(`<`))),
+ newToken(1, "default", 1, 1, "white_space", newByteSequence([]byte(` `))),
newEOFTokenDefault(),
},
},
@@ -591,15 +591,15 @@ func TestLexer_Next(t *testing.T) {
},
src: `-> 1 -> 2 <- <- a`,
tokens: []*Token{
- newToken(1, "default", 3, "push_1", newByteSequence([]byte(`-> 1`))),
- newToken(2, "mode_1", 1, "white_space", newByteSequence([]byte(` `))),
- newToken(2, "mode_1", 2, "push_2", newByteSequence([]byte(`-> 2`))),
- newToken(3, "mode_2", 1, "white_space", newByteSequence([]byte(` `))),
- newToken(3, "mode_2", 2, "pop_2", newByteSequence([]byte(`<-`))),
- newToken(2, "mode_1", 1, "white_space", newByteSequence([]byte(` `))),
- newToken(2, "mode_1", 3, "pop_1", newByteSequence([]byte(`<-`))),
- newToken(1, "default", 1, "white_space", newByteSequence([]byte(` `))),
- newToken(1, "default", 2, "char", newByteSequence([]byte(`a`))),
+ newToken(1, "default", 3, 3, "push_1", newByteSequence([]byte(`-> 1`))),
+ newToken(2, "mode_1", 1, 1, "white_space", newByteSequence([]byte(` `))),
+ newToken(2, "mode_1", 4, 2, "push_2", newByteSequence([]byte(`-> 2`))),
+ newToken(3, "mode_2", 1, 1, "white_space", newByteSequence([]byte(` `))),
+ newToken(3, "mode_2", 6, 2, "pop_2", newByteSequence([]byte(`<-`))),
+ newToken(2, "mode_1", 1, 1, "white_space", newByteSequence([]byte(` `))),
+ newToken(2, "mode_1", 5, 3, "pop_1", newByteSequence([]byte(`<-`))),
+ newToken(1, "default", 1, 1, "white_space", newByteSequence([]byte(` `))),
+ newToken(1, "default", 2, 2, "char", newByteSequence([]byte(`a`))),
newEOFTokenDefault(),
},
passiveModeTran: true,
@@ -639,15 +639,15 @@ func TestLexer_Next(t *testing.T) {
},
src: `-> 1 -> 2 <- <- a`,
tokens: []*Token{
- newToken(1, "default", 3, "push_1", newByteSequence([]byte(`-> 1`))),
- newToken(2, "mode_1", 1, "white_space", newByteSequence([]byte(` `))),
- newToken(2, "mode_1", 2, "push_2", newByteSequence([]byte(`-> 2`))),
- newToken(3, "mode_2", 1, "white_space", newByteSequence([]byte(` `))),
- newToken(3, "mode_2", 2, "pop_2", newByteSequence([]byte(`<-`))),
- newToken(2, "mode_1", 1, "white_space", newByteSequence([]byte(` `))),
- newToken(2, "mode_1", 3, "pop_1", newByteSequence([]byte(`<-`))),
- newToken(1, "default", 1, "white_space", newByteSequence([]byte(` `))),
- newToken(1, "default", 2, "char", newByteSequence([]byte(`a`))),
+ newToken(1, "default", 3, 3, "push_1", newByteSequence([]byte(`-> 1`))),
+ newToken(2, "mode_1", 1, 1, "white_space", newByteSequence([]byte(` `))),
+ newToken(2, "mode_1", 4, 2, "push_2", newByteSequence([]byte(`-> 2`))),
+ newToken(3, "mode_2", 1, 1, "white_space", newByteSequence([]byte(` `))),
+ newToken(3, "mode_2", 6, 2, "pop_2", newByteSequence([]byte(`<-`))),
+ newToken(2, "mode_1", 1, 1, "white_space", newByteSequence([]byte(` `))),
+ newToken(2, "mode_1", 5, 3, "pop_1", newByteSequence([]byte(`<-`))),
+ newToken(1, "default", 1, 1, "white_space", newByteSequence([]byte(` `))),
+ newToken(1, "default", 2, 2, "char", newByteSequence([]byte(`a`))),
newEOFTokenDefault(),
},
// Active mode transition and an external transition function can be used together.
@@ -681,15 +681,15 @@ func TestLexer_Next(t *testing.T) {
},
src: `.*+?|()[\`,
tokens: []*Token{
- newTokenDefault(1, "dot", newByteSequence([]byte(`.`))),
- newTokenDefault(2, "star", newByteSequence([]byte(`*`))),
- newTokenDefault(3, "plus", newByteSequence([]byte(`+`))),
- newTokenDefault(4, "question", newByteSequence([]byte(`?`))),
- newTokenDefault(5, "vbar", newByteSequence([]byte(`|`))),
- newTokenDefault(6, "lparen", newByteSequence([]byte(`(`))),
- newTokenDefault(7, "rparen", newByteSequence([]byte(`)`))),
- newTokenDefault(8, "lbrace", newByteSequence([]byte(`[`))),
- newTokenDefault(9, "backslash", newByteSequence([]byte(`\`))),
+ newTokenDefault(1, 1, "dot", newByteSequence([]byte(`.`))),
+ newTokenDefault(2, 2, "star", newByteSequence([]byte(`*`))),
+ newTokenDefault(3, 3, "plus", newByteSequence([]byte(`+`))),
+ newTokenDefault(4, 4, "question", newByteSequence([]byte(`?`))),
+ newTokenDefault(5, 5, "vbar", newByteSequence([]byte(`|`))),
+ newTokenDefault(6, 6, "lparen", newByteSequence([]byte(`(`))),
+ newTokenDefault(7, 7, "rparen", newByteSequence([]byte(`)`))),
+ newTokenDefault(8, 8, "lbrace", newByteSequence([]byte(`[`))),
+ newTokenDefault(9, 9, "backslash", newByteSequence([]byte(`\`))),
newEOFTokenDefault(),
},
},
@@ -737,7 +737,8 @@ func testToken(t *testing.T, expected, actual *Token) {
t.Helper()
if actual.Mode != expected.Mode ||
- actual.ModeName != actual.ModeName ||
+ actual.ModeName != expected.ModeName ||
+ actual.KindID != expected.KindID ||
actual.Kind != expected.Kind ||
actual.KindName != expected.KindName ||
!bytes.Equal(actual.Match(), expected.Match()) ||
diff --git a/spec/spec.go b/spec/spec.go
index 829008a..b8aae33 100644
--- a/spec/spec.go
+++ b/spec/spec.go
@@ -29,6 +29,18 @@ func (k LexKind) validate() error {
return nil
}
+// LexKindID is an ID of a lexical kind that is unique across all modes.
+type LexKindID int
+
+func (id LexKindID) Int() int {
+ return int(id)
+}
+
+const (
+ LexKindIDNil = LexKindID(0)
+ LexKindIDMin = LexKindID(1)
+)
+
type LexPattern string
func (p LexPattern) validate() error {
@@ -199,6 +211,8 @@ type CompiledLexModeSpec struct {
type CompiledLexSpec struct {
InitialMode LexModeNum `json:"initial_mode"`
Modes []LexModeName `json:"modes"`
+ Kinds []LexKind `json:"kinds"`
+ KindIDs [][]LexKindID `json:"kind_ids"`
CompressionLevel int `json:"compression_level"`
Specs []*CompiledLexModeSpec `json:"specs"`
}
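
Under the same hypothetical two-mode spec used in the sketches above, the two new fields would serialize into the compiled spec roughly as:

    "kinds": ["", "white_space", "string_open",
              "escape_sequence", "char_sequence", "string_close"],
    "kind_ids": [
        null,
        [0, 1, 2],
        [0, 3, 4, 5]
    ]

The index into `kinds` is the global `LexKindID`, and `kinds[0]` together with each leading `0` in `kind_ids` holds the nil entry (`LexKindIDNil`); valid IDs start at `LexKindIDMin`.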