-rw-r--r--  README.md                    46
-rw-r--r--  compiler/ast.go               6
-rw-r--r--  compiler/compiler.go        112
-rw-r--r--  compiler/dfa.go              18
-rw-r--r--  compiler/dfa_test.go          6
-rw-r--r--  compiler/parser.go           12
-rw-r--r--  compiler/parser_test.go       8
-rw-r--r--  compiler/test_util_test.go    4
-rw-r--r--  driver/lexer.go             137
-rw-r--r--  driver/lexer_test.go         18
-rw-r--r--  spec/spec.go                153
11 files changed, 289 insertions, 231 deletions
diff --git a/README.md b/README.md
index 8f9f01c..ae3e8fa 100644
--- a/README.md
+++ b/README.md
@@ -47,33 +47,33 @@ If you want to make sure that the lexical specification behaves as expected, you
⚠️ The only encoding that `maleeni lex` and the driver can handle is UTF-8.
```sh
-$ echo -n 'The truth is out there.' | maleeni lex clexspec.json | jq -r '[.kind_name, .text, .eof] | @csv'
-"word","The",false
-"whitespace"," ",false
-"word","truth",false
-"whitespace"," ",false
-"word","is",false
-"whitespace"," ",false
-"word","out",false
-"whitespace"," ",false
-"word","there",false
-"punctuation",".",false
-"","",true
+$ echo -n 'The truth is out there.' | maleeni lex clexspec.json | jq -r '[.kind_id, .kind_name, .text, .eof] | @csv'
+2,"word","The",false
+1,"whitespace"," ",false
+2,"word","truth",false
+1,"whitespace"," ",false
+2,"word","is",false
+1,"whitespace"," ",false
+2,"word","out",false
+1,"whitespace"," ",false
+2,"word","there",false
+3,"punctuation",".",false
+0,"","",true
```
The JSON format of tokens that the `maleeni lex` command prints is as follows:
-| Field | Type | Description |
-|-----------|-------------------|----------------------------------------------------------------------------------------|
-| mode | integer | `mode` represents a number that corresponds to a `mode_name`. |
-| mode_name | string | `mode_name` is a mode name that represents in which mode the lexer detected the token. |
-| kind_id | integer | `kind_id` represents an ID of a kind and is unique among modes. |
-| kind | integer | `kind` represents a number that corresponds to a `KindName`. |
-| kind_name | string | `kind_name` is a kind name that represents what kind the token has. |
-| match | array of integers | `match` is a byte sequence matched a pattern of a lexical specification. |
-| text | string | `text` is a string representation of `match`. |
-| eof | bool | If `eof` is true, it means the token is the EOF token. |
-| invalid | bool | If `invalid` is true, it means the token is an error token. |
+| Field | Type | Description |
+|--------------|-------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------|
+| mode_id | integer | An ID of a lex mode. |
+| mode_name | string | A name of a lex mode. |
+| kind_id | integer | An ID of a kind. This is unique among all modes. |
+| mode_kind_id | integer           | An ID of a lexical kind. This is unique only within a mode. Note that you need to use the `kind_id` field if you want to identify a kind across all modes. |
+| kind_name | string | A name of a lexical kind. |
+| match        | array of integers | A byte sequence of a lexeme.                                                                                                                            |
+| text | string | A string representation of a lexeme. |
+| eof | bool | When this field is `true`, it means the token is the EOF token. |
+| invalid | bool | When this field is `true`, it means the token is an error token. |
When using the driver, please import the `github.com/nihei9/maleeni/driver` and `github.com/nihei9/maleeni/spec` packages.
You can use the driver easily in the following way:
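As an aside, a minimal sketch of driver code written against the renamed `Token` fields (`ModeID`, `KindID`, `ModeKindID`, `KindName`) might look like the following. This is an illustration, not the README's own example; it assumes `NewLexer` returns `(*Lexer, error)` and that a compiled spec can be unmarshaled with `encoding/json`.

```go
package main

import (
	"encoding/json"
	"fmt"
	"log"
	"os"
	"strings"

	"github.com/nihei9/maleeni/driver"
	"github.com/nihei9/maleeni/spec"
)

func main() {
	// Load a compiled lexical specification produced by `maleeni compile`.
	raw, err := os.ReadFile("clexspec.json")
	if err != nil {
		log.Fatal(err)
	}
	clspec := &spec.CompiledLexSpec{}
	if err := json.Unmarshal(raw, clspec); err != nil {
		log.Fatal(err)
	}

	lex, err := driver.NewLexer(clspec, strings.NewReader("The truth is out there."))
	if err != nil {
		log.Fatal(err)
	}
	for {
		tok, err := lex.Next()
		if err != nil {
			log.Fatal(err)
		}
		if tok.EOF {
			break
		}
		// KindID is unique across all modes; ModeKindID only within tok.ModeID.
		fmt.Printf("%v %v %q\n", tok.KindID, tok.KindName, tok.Text())
	}
}
```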
diff --git a/compiler/ast.go b/compiler/ast.go
index 7d3965a..046662e 100644
--- a/compiler/ast.go
+++ b/compiler/ast.go
@@ -3,6 +3,8 @@ package compiler
import (
"fmt"
"io"
+
+ "github.com/nihei9/maleeni/spec"
)
type astNode interface {
@@ -78,13 +80,13 @@ func (n *symbolNode) last() *symbolPositionSet {
}
type endMarkerNode struct {
- id int
+ id spec.LexModeKindID
pos symbolPosition
firstMemo *symbolPositionSet
lastMemo *symbolPositionSet
}
-func newEndMarkerNode(id int) *endMarkerNode {
+func newEndMarkerNode(id spec.LexModeKindID) *endMarkerNode {
return &endMarkerNode{
id: id,
pos: symbolPositionNil,
diff --git a/compiler/compiler.go b/compiler/compiler.go
index 5d3e52f..5d1a1d5 100644
--- a/compiler/compiler.go
+++ b/compiler/compiler.go
@@ -54,28 +54,28 @@ func Compile(lexspec *spec.LexSpec, opts ...CompilerOption) (*spec.CompiledLexSp
}
}
- modeEntries, modes, modeNums, fragmetns := groupEntriesByLexMode(lexspec.Entries)
+ modeEntries, modeNames, modeName2ID, fragmetns := groupEntriesByLexMode(lexspec.Entries)
modeSpecs := []*spec.CompiledLexModeSpec{
nil,
}
for i, es := range modeEntries[1:] {
- modeName := modes[i+1]
+ modeName := modeNames[i+1]
config.logger.Log("Compile %v mode:", modeName)
- modeSpec, err := compile(es, modeNums, fragmetns, config)
+ modeSpec, err := compile(es, modeName2ID, fragmetns, config)
if err != nil {
return nil, fmt.Errorf("failed to compile in %v mode: %w", modeName, err)
}
modeSpecs = append(modeSpecs, modeSpec)
}
- var kindNames []spec.LexKind
- var name2ID map[spec.LexKind]spec.LexKindID
+ var kindNames []spec.LexKindName
+ var name2ID map[spec.LexKindName]spec.LexKindID
{
- name2ID = map[spec.LexKind]spec.LexKindID{}
+ name2ID = map[spec.LexKindName]spec.LexKindID{}
id := spec.LexKindIDMin
for _, modeSpec := range modeSpecs[1:] {
- for _, name := range modeSpec.Kinds[1:] {
+ for _, name := range modeSpec.KindNames[1:] {
if _, ok := name2ID[name]; ok {
continue
}
@@ -84,7 +84,7 @@ func Compile(lexspec *spec.LexSpec, opts ...CompilerOption) (*spec.CompiledLexSp
}
}
- kindNames = make([]spec.LexKind, len(name2ID)+1)
+ kindNames = make([]spec.LexKindName, len(name2ID)+1)
for name, id := range name2ID {
kindNames[id] = name
}
@@ -94,8 +94,8 @@ func Compile(lexspec *spec.LexSpec, opts ...CompilerOption) (*spec.CompiledLexSp
{
kindIDs = make([][]spec.LexKindID, len(modeSpecs))
for i, modeSpec := range modeSpecs[1:] {
- ids := make([]spec.LexKindID, len(modeSpec.Kinds))
- for modeID, name := range modeSpec.Kinds {
+ ids := make([]spec.LexKindID, len(modeSpec.KindNames))
+ for modeID, name := range modeSpec.KindNames {
if modeID == 0 {
continue
}
@@ -106,25 +106,25 @@ func Compile(lexspec *spec.LexSpec, opts ...CompilerOption) (*spec.CompiledLexSp
}
return &spec.CompiledLexSpec{
- InitialMode: spec.LexModeNumDefault,
- Modes: modes,
- Kinds: kindNames,
+ InitialModeID: spec.LexModeIDDefault,
+ ModeNames: modeNames,
+ KindNames: kindNames,
KindIDs: kindIDs,
CompressionLevel: config.compLv,
Specs: modeSpecs,
}, nil
}
-func groupEntriesByLexMode(entries []*spec.LexEntry) ([][]*spec.LexEntry, []spec.LexModeName, map[spec.LexModeName]spec.LexModeNum, map[string]*spec.LexEntry) {
- modes := []spec.LexModeName{
+func groupEntriesByLexMode(entries []*spec.LexEntry) ([][]*spec.LexEntry, []spec.LexModeName, map[spec.LexModeName]spec.LexModeID, map[string]*spec.LexEntry) {
+ modeNames := []spec.LexModeName{
spec.LexModeNameNil,
spec.LexModeNameDefault,
}
- modeNums := map[spec.LexModeName]spec.LexModeNum{
- spec.LexModeNameNil: spec.LexModeNumNil,
- spec.LexModeNameDefault: spec.LexModeNumDefault,
+ modeName2ID := map[spec.LexModeName]spec.LexModeID{
+ spec.LexModeNameNil: spec.LexModeIDNil,
+ spec.LexModeNameDefault: spec.LexModeIDDefault,
}
- lastModeNum := spec.LexModeNumDefault
+ lastModeID := spec.LexModeIDDefault
modeEntries := [][]*spec.LexEntry{
nil,
{},
@@ -141,30 +141,30 @@ func groupEntriesByLexMode(entries []*spec.LexEntry) ([][]*spec.LexEntry, []spec
spec.LexModeNameDefault,
}
}
- for _, mode := range ms {
- num, ok := modeNums[mode]
+ for _, modeName := range ms {
+ modeID, ok := modeName2ID[modeName]
if !ok {
- num = lastModeNum.Succ()
- lastModeNum = num
- modeNums[mode] = num
- modes = append(modes, mode)
+ modeID = lastModeID + 1
+ lastModeID = modeID
+ modeName2ID[modeName] = modeID
+ modeNames = append(modeNames, modeName)
modeEntries = append(modeEntries, []*spec.LexEntry{})
}
- modeEntries[num] = append(modeEntries[num], e)
+ modeEntries[modeID] = append(modeEntries[modeID], e)
}
}
- return modeEntries, modes, modeNums, fragments
+ return modeEntries, modeNames, modeName2ID, fragments
}
-func compile(entries []*spec.LexEntry, modeNums map[spec.LexModeName]spec.LexModeNum, fragments map[string]*spec.LexEntry, config *compilerConfig) (*spec.CompiledLexModeSpec, error) {
- var kinds []spec.LexKind
- var patterns map[int][]byte
+func compile(entries []*spec.LexEntry, modeName2ID map[spec.LexModeName]spec.LexModeID, fragments map[string]*spec.LexEntry, config *compilerConfig) (*spec.CompiledLexModeSpec, error) {
+ var kindNames []spec.LexKindName
+ var patterns map[spec.LexModeKindID][]byte
{
- kinds = append(kinds, spec.LexKindNil)
- patterns = map[int][]byte{}
+ kindNames = append(kindNames, spec.LexKindNameNil)
+ patterns = map[spec.LexModeKindID][]byte{}
for i, e := range entries {
- kinds = append(kinds, e.Kind)
- patterns[i+1] = []byte(e.Pattern)
+ kindNames = append(kindNames, e.Kind)
+ patterns[spec.LexModeKindID(i+1)] = []byte(e.Pattern)
}
config.logger.Log("Patterns:")
@@ -173,16 +173,16 @@ func compile(entries []*spec.LexEntry, modeNums map[spec.LexModeName]spec.LexMod
}
}
- push := []spec.LexModeNum{
- spec.LexModeNumNil,
+ push := []spec.LexModeID{
+ spec.LexModeIDNil,
}
pop := []int{
0,
}
for _, e := range entries {
- pushV := spec.LexModeNumNil
+ pushV := spec.LexModeIDNil
if e.Push != "" {
- pushV = modeNums[e.Push]
+ pushV = modeName2ID[e.Push]
}
push = append(push, pushV)
popV := 0
@@ -222,7 +222,7 @@ func compile(entries []*spec.LexEntry, modeNums map[spec.LexModeName]spec.LexMod
config.logger.Log(`DFA:
States: %v states (%v entries)
- Initial State: %v`, tranTab.RowCount, tranTab.RowCount*tranTab.ColCount, tranTab.InitialState)
+ Initial State ID: %v`, tranTab.RowCount, tranTab.RowCount*tranTab.ColCount, tranTab.InitialStateID)
config.logger.Log(" Accepting States:")
for state, symbol := range tranTab.AcceptingStates {
config.logger.Log(" %v: %v", state, symbol)
@@ -244,10 +244,10 @@ func compile(entries []*spec.LexEntry, modeNums map[spec.LexModeName]spec.LexMod
}
return &spec.CompiledLexModeSpec{
- Kinds: kinds,
- Push: push,
- Pop: pop,
- DFA: tranTab,
+ KindNames: kindNames,
+ Push: push,
+ Pop: pop,
+ DFA: tranTab,
}, nil
}
@@ -259,7 +259,7 @@ const (
func compressTransitionTableLv2(tranTab *spec.TransitionTable) (*spec.TransitionTable, error) {
ueTab := compressor.NewUniqueEntriesTable()
{
- orig, err := compressor.NewOriginalTable(tranTab.UncompressedTransition, tranTab.ColCount)
+ orig, err := compressor.NewOriginalTable(convertStateIDSliceToIntSlice(tranTab.UncompressedTransition), tranTab.ColCount)
if err != nil {
return nil, err
}
@@ -285,8 +285,8 @@ func compressTransitionTableLv2(tranTab *spec.TransitionTable) (*spec.Transition
UniqueEntries: &spec.RowDisplacementTable{
OriginalRowCount: rdTab.OriginalRowCount,
OriginalColCount: rdTab.OriginalColCount,
- EmptyValue: rdTab.EmptyValue,
- Entries: rdTab.Entries,
+ EmptyValue: spec.StateIDNil,
+ Entries: convertIntSliceToStateIDSlice(rdTab.Entries),
Bounds: rdTab.Bounds,
RowDisplacement: rdTab.RowDisplacement,
},
@@ -302,7 +302,7 @@ func compressTransitionTableLv2(tranTab *spec.TransitionTable) (*spec.Transition
func compressTransitionTableLv1(tranTab *spec.TransitionTable) (*spec.TransitionTable, error) {
ueTab := compressor.NewUniqueEntriesTable()
{
- orig, err := compressor.NewOriginalTable(tranTab.UncompressedTransition, tranTab.ColCount)
+ orig, err := compressor.NewOriginalTable(convertStateIDSliceToIntSlice(tranTab.UncompressedTransition), tranTab.ColCount)
if err != nil {
return nil, err
}
@@ -313,7 +313,7 @@ func compressTransitionTableLv1(tranTab *spec.TransitionTable) (*spec.Transition
}
tranTab.Transition = &spec.UniqueEntriesTable{
- UncompressedUniqueEntries: ueTab.UniqueEntries,
+ UncompressedUniqueEntries: convertIntSliceToStateIDSlice(ueTab.UniqueEntries),
RowNums: ueTab.RowNums,
OriginalRowCount: ueTab.OriginalRowCount,
OriginalColCount: ueTab.OriginalColCount,
@@ -322,3 +322,19 @@ func compressTransitionTableLv1(tranTab *spec.TransitionTable) (*spec.Transition
return tranTab, nil
}
+
+func convertStateIDSliceToIntSlice(s []spec.StateID) []int {
+ is := make([]int, len(s))
+ for i, v := range s {
+ is[i] = v.Int()
+ }
+ return is
+}
+
+func convertIntSliceToStateIDSlice(s []int) []spec.StateID {
+ ss := make([]spec.StateID, len(s))
+ for i, v := range s {
+ ss[i] = spec.StateID(v)
+ }
+ return ss
+}
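The two-level kind numbering that `Compile` assembles above is easier to see with concrete data. The following is a small, hypothetical sketch (names and values invented for illustration): kind names are deduplicated across modes into one global list indexed by `LexKindID`, and `KindIDs[mode][modeKindID]` translates a mode-local `LexModeKindID` into that global ID, which is exactly the lookup the driver performs when it builds a token.

```go
package main

import (
	"fmt"

	"github.com/nihei9/maleeni/spec"
)

func main() {
	// Hypothetical tables mirroring what Compile builds. Index 0 is always
	// the nil entry in both dimensions.
	kindNames := []spec.LexKindName{"", "whitespace", "word", "punctuation"} // indexed by LexKindID
	kindIDs := [][]spec.LexKindID{
		nil,          // mode 0 (the nil mode) has no kinds
		{0, 1, 2, 3}, // default mode: mode-local ID n -> global LexKindID
	}

	mode := spec.LexModeIDDefault
	modeKindID := spec.LexModeKindID(2)
	fmt.Println(kindNames[kindIDs[mode][modeKindID]]) // word
}
```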
diff --git a/compiler/dfa.go b/compiler/dfa.go
index b94fca8..1d0b26a 100644
--- a/compiler/dfa.go
+++ b/compiler/dfa.go
@@ -9,7 +9,7 @@ import (
type DFA struct {
States []string
InitialState string
- AcceptingStatesTable map[string]int
+ AcceptingStatesTable map[string]spec.LexModeKindID
TransitionTable map[string][256]string
}
@@ -65,7 +65,7 @@ func genDFA(root astNode, symTab *symbolTable) *DFA {
}
}
- accTab := map[string]int{}
+ accTab := map[string]spec.LexModeKindID{}
{
for h, s := range stateMap {
for _, pos := range s.set() {
@@ -104,33 +104,33 @@ func genDFA(root astNode, symTab *symbolTable) *DFA {
}
func genTransitionTable(dfa *DFA) (*spec.TransitionTable, error) {
- state2Num := map[string]int{}
+ stateHash2ID := map[string]spec.StateID{}
for i, s := range dfa.States {
// Since 0 represents an invalid value in a transition table,
// assign a number greater than or equal to 1 to states.
- state2Num[s] = i + 1
+ stateHash2ID[s] = spec.StateID(i + spec.StateIDMin.Int())
}
- acc := make([]int, len(dfa.States)+1)
+ acc := make([]spec.LexModeKindID, len(dfa.States)+1)
for _, s := range dfa.States {
id, ok := dfa.AcceptingStatesTable[s]
if !ok {
continue
}
- acc[state2Num[s]] = id
+ acc[stateHash2ID[s]] = id
}
rowCount := len(dfa.States) + 1
colCount := 256
- tran := make([]int, rowCount*colCount)
+ tran := make([]spec.StateID, rowCount*colCount)
for s, tab := range dfa.TransitionTable {
for v, to := range tab {
- tran[state2Num[s]*256+v] = state2Num[to]
+ tran[stateHash2ID[s].Int()*256+v] = stateHash2ID[to]
}
}
return &spec.TransitionTable{
- InitialState: state2Num[dfa.InitialState],
+ InitialStateID: stateHash2ID[dfa.InitialState],
AcceptingStates: acc,
UncompressedTransition: tran,
RowCount: rowCount,
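The transition table that `genTransitionTable` emits is a flat, row-major array with 256 columns, one per input byte, and `StateIDNil` (0) marking the absence of a transition. A minimal sketch of the uncompressed lookup, using a made-up two-state table, looks like this:

```go
package main

import (
	"fmt"

	"github.com/nihei9/maleeni/spec"
)

// nextState sketches the uncompressed lookup: row = current state, column = input byte.
func nextState(tran []spec.StateID, state spec.StateID, v byte) (spec.StateID, bool) {
	next := tran[state.Int()*256+int(v)]
	if next == spec.StateIDNil {
		return spec.StateIDNil, false
	}
	return next, true
}

func main() {
	// Hypothetical table: states 1 and 2 plus the nil state 0;
	// state 1 moves to state 2 on 'a', everything else is a dead end.
	tran := make([]spec.StateID, 3*256)
	tran[1*256+int('a')] = spec.StateID(2)

	fmt.Println(nextState(tran, spec.StateIDMin, 'a')) // 2 true
	fmt.Println(nextState(tran, spec.StateIDMin, 'b')) // 0 false
}
```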
diff --git a/compiler/dfa_test.go b/compiler/dfa_test.go
index a797313..2683815 100644
--- a/compiler/dfa_test.go
+++ b/compiler/dfa_test.go
@@ -2,10 +2,12 @@ package compiler
import (
"testing"
+
+ "github.com/nihei9/maleeni/spec"
)
func TestGenDFA(t *testing.T) {
- root, symTab, err := parse(map[int][]byte{
+ root, symTab, err := parse(map[spec.LexModeKindID][]byte{
1: []byte("(a|b)*abb"),
}, nil)
if err != nil {
@@ -94,7 +96,7 @@ func TestGenDFA(t *testing.T) {
t.Errorf("initial state is mismatched; want: %v, got: %v", s0.hash(), dfa.InitialState)
}
- accTab := map[string]int{
+ accTab := map[string]spec.LexModeKindID{
s3.hash(): 1,
}
if len(dfa.AcceptingStatesTable) != len(accTab) {
diff --git a/compiler/parser.go b/compiler/parser.go
index 06a762d..1c9126a 100644
--- a/compiler/parser.go
+++ b/compiler/parser.go
@@ -7,6 +7,8 @@ import (
"fmt"
"io"
"strings"
+
+ "github.com/nihei9/maleeni/spec"
)
type ParseErrors struct {
@@ -23,7 +25,7 @@ func (e *ParseErrors) Error() string {
}
type ParseError struct {
- ID int
+ ID spec.LexModeKindID
Pattern []byte
Cause error
Details string
@@ -44,13 +46,13 @@ func raiseSyntaxError(synErr *SyntaxError) {
type symbolTable struct {
symPos2Byte map[symbolPosition]byteRange
- endPos2ID map[symbolPosition]int
+ endPos2ID map[symbolPosition]spec.LexModeKindID
}
func genSymbolTable(root astNode) *symbolTable {
symTab := &symbolTable{
symPos2Byte: map[symbolPosition]byteRange{},
- endPos2ID: map[symbolPosition]int{},
+ endPos2ID: map[symbolPosition]spec.LexModeKindID{},
}
return genSymTab(symTab, root)
}
@@ -76,7 +78,7 @@ func genSymTab(symTab *symbolTable, node astNode) *symbolTable {
return symTab
}
-func parse(regexps map[int][]byte, fragments map[string][]byte) (astNode, *symbolTable, error) {
+func parse(regexps map[spec.LexModeKindID][]byte, fragments map[string][]byte) (astNode, *symbolTable, error) {
if len(regexps) == 0 {
return nil, nil, fmt.Errorf("parse() needs at least one token entry")
}
@@ -159,7 +161,7 @@ func parseFragments(fragments map[string][]byte) (map[string]astNode, error) {
return fragmentASTs, nil
}
-func parseRegexp(regexps map[int][]byte, fragmentASTs map[string]astNode) (astNode, error) {
+func parseRegexp(regexps map[spec.LexModeKindID][]byte, fragmentASTs map[string]astNode) (astNode, error) {
symPos := symbolPositionMin
var root astNode
var perrs []*ParseError
diff --git a/compiler/parser_test.go b/compiler/parser_test.go
index 3394845..30e6130 100644
--- a/compiler/parser_test.go
+++ b/compiler/parser_test.go
@@ -4,6 +4,8 @@ import (
"fmt"
"reflect"
"testing"
+
+ "github.com/nihei9/maleeni/spec"
)
func symPos(n uint16) symbolPosition {
@@ -1197,7 +1199,7 @@ func TestParse(t *testing.T) {
for kind, pattern := range tt.fragments {
fragments[kind] = []byte(pattern)
}
- ast, _, err := parse(map[int][]byte{
+ ast, _, err := parse(map[spec.LexModeKindID][]byte{
1: []byte(tt.pattern),
}, fragments)
if tt.syntaxError != nil {
@@ -1237,7 +1239,7 @@ func TestParse(t *testing.T) {
}
func TestParse_FollowAndSymbolTable(t *testing.T) {
- root, symTab, err := parse(map[int][]byte{
+ root, symTab, err := parse(map[spec.LexModeKindID][]byte{
1: []byte("(a|b)*abb"),
}, nil)
if err != nil {
@@ -1295,7 +1297,7 @@ func TestParse_FollowAndSymbolTable(t *testing.T) {
symPos(4): entry(byte('b')),
symPos(5): entry(byte('b')),
},
- endPos2ID: map[symbolPosition]int{
+ endPos2ID: map[symbolPosition]spec.LexModeKindID{
endPos(6): 1,
},
}
diff --git a/compiler/test_util_test.go b/compiler/test_util_test.go
index 2ead2c9..72e150b 100644
--- a/compiler/test_util_test.go
+++ b/compiler/test_util_test.go
@@ -1,5 +1,7 @@
package compiler
+import "github.com/nihei9/maleeni/spec"
+
func newRangeSymbolNodeWithPos(from, to byte, pos symbolPosition) *symbolNode {
n := newRangeSymbolNode(from, to)
n.pos = pos
@@ -13,7 +15,7 @@ func newSymbolNodeWithPos(v byte, pos symbolPosition) *symbolNode {
}
func newEndMarkerNodeWithPos(id int, pos symbolPosition) *endMarkerNode {
- n := newEndMarkerNode(id)
+ n := newEndMarkerNode(spec.LexModeKindID(id))
n.pos = pos
return n
}
diff --git a/driver/lexer.go b/driver/lexer.go
index 7ad2dd0..bad7dbd 100644
--- a/driver/lexer.go
+++ b/driver/lexer.go
@@ -56,69 +56,70 @@ func (s byteSequence) merge(a byteSequence) byteSequence {
// Token representes a token.
type Token struct {
- // `Mode` represents a number that corresponds to a `ModeName`.
- Mode spec.LexModeNum
+ // ModeID is an ID of a lex mode.
+ ModeID spec.LexModeID
- // `ModeName` is a mode name that represents in which mode the lexer detected the token.
+ // ModeName is a name of a lex mode.
ModeName spec.LexModeName
- // `KindID` is a unique ID among modes.
- KindID int
+ // KindID is an ID of a kind. This is unique among all modes.
+ KindID spec.LexKindID
- // `Kind` represents a number that corresponds to a `KindName`.
- Kind int
+ // ModeKindID is an ID of a lexical kind. This is unique only within a mode.
+	// Note that you need to use the KindID field if you want to identify a kind across all modes.
+ ModeKindID spec.LexModeKindID
- // `KindName` is a kind name that represents what kind the token has.
- KindName string
+ // KindName is a name of a lexical kind.
+ KindName spec.LexKindName
- // If `EOF` is true, it means the token is the EOF token.
+ // When this field is true, it means the token is the EOF token.
EOF bool
- // If `Invalid` is true, it means the token is an error token.
+ // When this field is true, it means the token is an error token.
Invalid bool
- // `match` is a byte sequence matched a pattern of a lexical specification.
+ // match is a byte sequence matched a pattern of a lexical specification.
match byteSequence
}
-func newToken(mode spec.LexModeNum, modeName spec.LexModeName, kindID int, modeKindID int, kindName string, match byteSequence) *Token {
+func newToken(modeID spec.LexModeID, modeName spec.LexModeName, kindID spec.LexKindID, modeKindID spec.LexModeKindID, kindName spec.LexKindName, match byteSequence) *Token {
return &Token{
- Mode: mode,
- ModeName: modeName,
- KindID: kindID,
- Kind: modeKindID,
- KindName: kindName,
- match: match,
+ ModeID: modeID,
+ ModeName: modeName,
+ KindID: kindID,
+ ModeKindID: modeKindID,
+ KindName: kindName,
+ match: match,
}
}
-func newEOFToken(mode spec.LexModeNum, modeName spec.LexModeName) *Token {
+func newEOFToken(modeID spec.LexModeID, modeName spec.LexModeName) *Token {
return &Token{
- Mode: mode,
- ModeName: modeName,
- Kind: 0,
- EOF: true,
+ ModeID: modeID,
+ ModeName: modeName,
+ ModeKindID: 0,
+ EOF: true,
}
}
-func newInvalidToken(mode spec.LexModeNum, modeName spec.LexModeName, match byteSequence) *Token {
+func newInvalidToken(modeID spec.LexModeID, modeName spec.LexModeName, match byteSequence) *Token {
return &Token{
- Mode: mode,
- ModeName: modeName,
- Kind: 0,
- match: match,
- Invalid: true,
+ ModeID: modeID,
+ ModeName: modeName,
+ ModeKindID: 0,
+ match: match,
+ Invalid: true,
}
}
func (t *Token) String() string {
if t.Invalid {
- return fmt.Sprintf("!{mode: %v, mode name: %v, text: %v, byte: %v}", t.Mode, t.ModeName, t.Text(), t.Match())
+ return fmt.Sprintf("!{mode id: %v, mode name: %v, text: %v, byte: %v}", t.ModeID, t.ModeName, t.Text(), t.Match())
}
if t.EOF {
return "{eof}"
}
- return fmt.Sprintf("{mode: %v, mode name: %v, kind: %v, kind name: %v, text: %v, byte: %v}", t.Mode, t.ModeName, t.Kind, t.KindName, t.Text(), t.Match())
+ return fmt.Sprintf("{mode id: %v, mode name: %v, kind id: %v, mode kind id: %v, kind name: %v, text: %v, byte: %v}", t.ModeID, t.ModeName, t.KindID, t.ModeKindID, t.KindName, t.Text(), t.Match())
}
// Match returns a byte slice matched a pattern of a lexical specification.
@@ -133,25 +134,25 @@ func (t *Token) Text() string {
func (t *Token) MarshalJSON() ([]byte, error) {
return json.Marshal(struct {
- Mode int `json:"mode"`
- ModeName string `json:"mode_name"`
- KindID int `json:"kind_id"`
- Kind int `json:"kind"`
- KindName string `json:"kind_name"`
- Match byteSequence `json:"match"`
- Text string `json:"text"`
- EOF bool `json:"eof"`
- Invalid bool `json:"invalid"`
+ ModeID int `json:"mode_id"`
+ ModeName string `json:"mode_name"`
+ KindID int `json:"kind_id"`
+ ModeKindID int `json:"mode_kind_id"`
+ KindName string `json:"kind_name"`
+ Match byteSequence `json:"match"`
+ Text string `json:"text"`
+ EOF bool `json:"eof"`
+ Invalid bool `json:"invalid"`
}{
- Mode: t.Mode.Int(),
- ModeName: t.ModeName.String(),
- KindID: t.KindID,
- Kind: t.Kind,
- KindName: t.KindName,
- Match: t.match,
- Text: t.Text(),
- EOF: t.EOF,
- Invalid: t.Invalid,
+ ModeID: t.ModeID.Int(),
+ ModeName: t.ModeName.String(),
+ KindID: t.KindID.Int(),
+ ModeKindID: t.ModeKindID.Int(),
+ KindName: t.KindName.String(),
+ Match: t.match,
+ Text: t.Text(),
+ EOF: t.EOF,
+ Invalid: t.Invalid,
})
}
@@ -180,7 +181,7 @@ type Lexer struct {
src []byte
srcPtr int
tokBuf []*Token
- modeStack []spec.LexModeNum
+ modeStack []spec.LexModeID
passiveModeTran bool
logger log.Logger
}
@@ -194,8 +195,8 @@ func NewLexer(clspec *spec.CompiledLexSpec, src io.Reader, opts ...LexerOption)
clspec: clspec,
src: b,
srcPtr: 0,
- modeStack: []spec.LexModeNum{
- clspec.InitialMode,
+ modeStack: []spec.LexModeID{
+ clspec.InitialModeID,
},
passiveModeTran: false,
logger: log.NewNopLogger(),
@@ -216,7 +217,7 @@ func (l *Lexer) Next() (*Token, error) {
State:
mode: #%v %v
pointer: %v
- token buffer: %v`, l.Mode(), l.clspec.Modes[l.Mode()], l.srcPtr, l.tokBuf)
+ token buffer: %v`, l.Mode(), l.clspec.ModeNames[l.Mode()], l.srcPtr, l.tokBuf)
if len(l.tokBuf) > 0 {
tok := l.tokBuf[0]
@@ -273,13 +274,13 @@ func (l *Lexer) nextAndTransition() (*Token, error) {
return tok, nil
}
spec := l.clspec.Specs[l.Mode()]
- if spec.Pop[tok.Kind] == 1 {
+ if spec.Pop[tok.ModeKindID] == 1 {
err := l.PopMode()
if err != nil {
return nil, err
}
}
- mode := spec.Push[tok.Kind]
+ mode := spec.Push[tok.ModeKindID]
if !mode.IsNil() {
l.PushMode(mode)
}
@@ -296,9 +297,9 @@ func (l *Lexer) nextAndTransition() (*Token, error) {
func (l *Lexer) next() (*Token, error) {
mode := l.Mode()
- modeName := l.clspec.Modes[mode]
+ modeName := l.clspec.ModeNames[mode]
spec := l.clspec.Specs[mode]
- state := spec.DFA.InitialState
+ state := spec.DFA.InitialStateID
buf := []byte{}
unfixedBufLen := 0
var tok *Token
@@ -330,13 +331,13 @@ func (l *Lexer) next() (*Token, error) {
modeKindID := spec.DFA.AcceptingStates[state]
if modeKindID != 0 {
kindID := l.clspec.KindIDs[mode][modeKindID]
- tok = newToken(mode, modeName, kindID.Int(), modeKindID, spec.Kinds[modeKindID].String(), newByteSequence(buf))
+ tok = newToken(mode, modeName, kindID, modeKindID, spec.KindNames[modeKindID], newByteSequence(buf))
unfixedBufLen = 0
}
}
}
-func (l *Lexer) lookupNextState(mode spec.LexModeNum, state int, v int) (int, bool) {
+func (l *Lexer) lookupNextState(mode spec.LexModeID, state spec.StateID, v int) (spec.StateID, bool) {
switch l.clspec.CompressionLevel {
case 2:
tab := l.clspec.Specs[mode].DFA.Transition
@@ -349,24 +350,24 @@ func (l *Lexer) lookupNextState(mode spec.LexModeNum, state int, v int) (int, bo
case 1:
tab := l.clspec.Specs[mode].DFA.Transition
next := tab.UncompressedUniqueEntries[tab.RowNums[state]*tab.OriginalColCount+v]
- if next == 0 {
- return 0, false
+ if next == spec.StateIDNil {
+ return spec.StateIDNil, false
}
return next, true
}
- spec := l.clspec.Specs[mode]
- next := spec.DFA.UncompressedTransition[state*spec.DFA.ColCount+v]
- if next == 0 {
- return 0, false
+ modeSpec := l.clspec.Specs[mode]
+ next := modeSpec.DFA.UncompressedTransition[state.Int()*modeSpec.DFA.ColCount+v]
+ if next == spec.StateIDNil {
+ return spec.StateIDNil, false
}
return next, true
}
-func (l *Lexer) Mode() spec.LexModeNum {
+func (l *Lexer) Mode() spec.LexModeID {
return l.modeStack[len(l.modeStack)-1]
}
-func (l *Lexer) PushMode(mode spec.LexModeNum) {
+func (l *Lexer) PushMode(mode spec.LexModeID) {
l.modeStack = append(l.modeStack, mode)
}
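To make the mode-transition path in `nextAndTransition` above concrete, here is a small, hypothetical sketch of the data it consults: `Pop` and `Push` are per-mode tables indexed by the token's `ModeKindID`, and the lexer keeps a stack of `LexModeID`s. The names and values below are invented for illustration.

```go
package main

import (
	"fmt"

	"github.com/nihei9/maleeni/spec"
)

func main() {
	// Hypothetical tables for the default mode: kind 1 pushes mode 2
	// (say, a string-literal mode), kind 2 neither pushes nor pops.
	push := []spec.LexModeID{spec.LexModeIDNil, spec.LexModeID(2), spec.LexModeIDNil}
	pop := []int{0, 0, 0}

	modeStack := []spec.LexModeID{spec.LexModeIDDefault}
	tokModeKindID := spec.LexModeKindID(1) // the token just recognized

	// Mirrors nextAndTransition: pop first if requested, then push.
	if pop[tokModeKindID] == 1 {
		modeStack = modeStack[:len(modeStack)-1]
	}
	if next := push[tokModeKindID]; !next.IsNil() {
		modeStack = append(modeStack, next)
	}
	fmt.Println(modeStack) // [1 2]
}
```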
diff --git a/driver/lexer_test.go b/driver/lexer_test.go
index 79ee12e..5abe83c 100644
--- a/driver/lexer_test.go
+++ b/driver/lexer_test.go
@@ -16,7 +16,7 @@ func newLexEntry(modes []string, kind string, pattern string, push string, pop b
ms = append(ms, spec.LexModeName(m))
}
return &spec.LexEntry{
- Kind: spec.LexKind(kind),
+ Kind: spec.LexKindName(kind),
Pattern: spec.LexPattern(pattern),
Modes: ms,
Push: spec.LexModeName(push),
@@ -26,7 +26,7 @@ func newLexEntry(modes []string, kind string, pattern string, push string, pop b
func newLexEntryDefaultNOP(kind string, pattern string) *spec.LexEntry {
return &spec.LexEntry{
- Kind: spec.LexKind(kind),
+ Kind: spec.LexKindName(kind),
Pattern: spec.LexPattern(pattern),
Modes: []spec.LexModeName{
spec.LexModeNameDefault,
@@ -36,18 +36,18 @@ func newLexEntryDefaultNOP(kind string, pattern string) *spec.LexEntry {
func newLexEntryFragment(kind string, pattern string) *spec.LexEntry {
return &spec.LexEntry{
- Kind: spec.LexKind(kind),
+ Kind: spec.LexKindName(kind),
Pattern: spec.LexPattern(pattern),
Fragment: true,
}
}
func newTokenDefault(kindID int, modeKindID int, kindName string, match byteSequence) *Token {
- return newToken(spec.LexModeNumDefault, spec.LexModeNameDefault, kindID, modeKindID, kindName, match)
+ return newToken(spec.LexModeIDDefault, spec.LexModeNameDefault, spec.LexKindID(kindID), spec.LexModeKindID(modeKindID), spec.LexKindName(kindName), match)
}
func newEOFTokenDefault() *Token {
- return newEOFToken(spec.LexModeNumDefault, spec.LexModeNameDefault)
+ return newEOFToken(spec.LexModeIDDefault, spec.LexModeNameDefault)
}
func TestLexer_Next(t *testing.T) {
@@ -604,7 +604,7 @@ func TestLexer_Next(t *testing.T) {
},
passiveModeTran: true,
tran: func(l *Lexer, tok *Token) error {
- switch l.clspec.Modes[l.Mode().Int()] {
+ switch l.clspec.ModeNames[l.Mode()] {
case "default":
switch tok.KindName {
case "push_1":
@@ -653,7 +653,7 @@ func TestLexer_Next(t *testing.T) {
// Active mode transition and an external transition function can be used together.
passiveModeTran: false,
tran: func(l *Lexer, tok *Token) error {
- switch l.clspec.Modes[l.Mode().Int()] {
+ switch l.clspec.ModeNames[l.Mode()] {
case "mode_1":
switch tok.KindName {
case "push_2":
@@ -736,10 +736,10 @@ func TestLexer_Next(t *testing.T) {
func testToken(t *testing.T, expected, actual *Token) {
t.Helper()
- if actual.Mode != expected.Mode ||
+ if actual.ModeID != expected.ModeID ||
actual.ModeName != expected.ModeName ||
actual.KindID != expected.KindID ||
- actual.Kind != expected.Kind ||
+ actual.ModeKindID != expected.ModeKindID ||
actual.KindName != expected.KindName ||
!bytes.Equal(actual.Match(), expected.Match()) ||
actual.EOF != expected.EOF ||
diff --git a/spec/spec.go b/spec/spec.go
index b8aae33..d4f6346 100644
--- a/spec/spec.go
+++ b/spec/spec.go
@@ -7,40 +7,56 @@ import (
"strings"
)
-const lexKindPattern = "[A-Za-z_][0-9A-Za-z_]*"
+// LexKindID represents an ID of a lexical kind and is unique across all modes.
+type LexKindID int
+
+const (
+ LexKindIDNil = LexKindID(0)
+ LexKindIDMin = LexKindID(1)
+)
+
+func (id LexKindID) Int() int {
+ return int(id)
+}
+
+// LexModeKindID represents an ID of a lexical kind and is unique within a mode.
+// Use LexKindID to identify a kind across all modes uniquely.
+type LexModeKindID int
+
+const (
+	LexModeKindIDNil = LexModeKindID(0)
+	LexModeKindIDMin = LexModeKindID(1)
+)
-var lexKindRE = regexp.MustCompile(lexKindPattern)
+func (id LexModeKindID) Int() int {
+ return int(id)
+}
-type LexKind string
+// LexKindName represents a name of a lexical kind.
+type LexKindName string
-const LexKindNil = LexKind("")
+const LexKindNameNil = LexKindName("")
-func (k LexKind) String() string {
+func (k LexKindName) String() string {
return string(k)
}
-func (k LexKind) validate() error {
+func (k LexKindName) validate() error {
if k == "" {
return fmt.Errorf("kind doesn't allow to be the empty string")
}
- if !lexKindRE.Match([]byte(k)) {
- return fmt.Errorf("kind must be %v", lexKindPattern)
+ if !lexKindNameRE.Match([]byte(k)) {
+ return fmt.Errorf("kind must be %v", lexKindNamePattern)
}
return nil
}
-// LexKindID is a unique ID among modes.
-type LexKindID int
-
-func (id LexKindID) Int() int {
- return int(id)
-}
+const lexKindNamePattern = "[A-Za-z_][0-9A-Za-z_]*"
-const (
- LexKindIDNil = LexKindID(0)
- LexKindIDMin = LexKindID(1)
-)
+var lexKindNameRE = regexp.MustCompile(lexKindNamePattern)
+// LexPattern represents a pattern of a lexeme.
+// The pattern is written in regular expression.
type LexPattern string
func (p LexPattern) validate() error {
@@ -50,10 +66,27 @@ func (p LexPattern) validate() error {
return nil
}
-const lexModePattern = "[A-Za-z_][0-9A-Za-z_]*"
+// LexModeID represents an ID of a lex mode.
+type LexModeID int
-var lexModeRE = regexp.MustCompile(lexKindPattern)
+const (
+ LexModeIDNil = LexModeID(0)
+ LexModeIDDefault = LexModeID(1)
+)
+func (n LexModeID) String() string {
+ return strconv.Itoa(int(n))
+}
+
+func (n LexModeID) Int() int {
+ return int(n)
+}
+
+func (n LexModeID) IsNil() bool {
+ return n == LexModeIDNil
+}
+
+// LexModeName represents a name of a lex mode.
type LexModeName string
const (
@@ -66,8 +99,8 @@ func (m LexModeName) String() string {
}
func (m LexModeName) validate() error {
- if m.isNil() || !lexModeRE.Match([]byte(m)) {
- return fmt.Errorf("mode must be %v", lexModePattern)
+ if m.isNil() || !lexModeNameRE.Match([]byte(m)) {
+ return fmt.Errorf("mode must be %v", lexModeNamePattern)
}
return nil
}
@@ -76,31 +109,12 @@ func (m LexModeName) isNil() bool {
return m == LexModeNameNil
}
-type LexModeNum int
+const lexModeNamePattern = "[A-Za-z_][0-9A-Za-z_]*"
-const (
- LexModeNumNil = LexModeNum(0)
- LexModeNumDefault = LexModeNum(1)
-)
-
-func (n LexModeNum) String() string {
- return strconv.Itoa(int(n))
-}
-
-func (n LexModeNum) Int() int {
- return int(n)
-}
-
-func (n LexModeNum) Succ() LexModeNum {
- return n + 1
-}
-
-func (n LexModeNum) IsNil() bool {
- return n == LexModeNumNil
-}
+var lexModeNameRE = regexp.MustCompile(lexModeNamePattern)
type LexEntry struct {
- Kind LexKind `json:"kind"`
+ Kind LexKindName `json:"kind"`
Pattern LexPattern `json:"pattern"`
Modes []LexModeName `json:"modes"`
Push LexModeName `json:"push"`
@@ -174,18 +188,35 @@ func (s *LexSpec) Validate() error {
return nil
}
+// StateID represents an ID of a state of a transition table.
+type StateID int
+
+const (
+ // StateIDNil represents an empty entry of a transition table.
+ // When the driver reads this value, it raises an error meaning lexical analysis failed.
+ StateIDNil = StateID(0)
+
+ // StateIDMin is the minimum value of the state ID. All valid state IDs are represented as
+ // sequential numbers starting from this value.
+ StateIDMin = StateID(1)
+)
+
+func (id StateID) Int() int {
+ return int(id)
+}
+
type RowDisplacementTable struct {
- OriginalRowCount int `json:"original_row_count"`
- OriginalColCount int `json:"original_col_count"`
- EmptyValue int `json:"empty_value"`
- Entries []int `json:"entries"`
- Bounds []int `json:"bounds"`
- RowDisplacement []int `json:"row_displacement"`
+ OriginalRowCount int `json:"original_row_count"`
+ OriginalColCount int `json:"original_col_count"`
+ EmptyValue StateID `json:"empty_value"`
+ Entries []StateID `json:"entries"`
+ Bounds []int `json:"bounds"`
+ RowDisplacement []int `json:"row_displacement"`
}
type UniqueEntriesTable struct {
UniqueEntries *RowDisplacementTable `json:"unique_entries,omitempty"`
- UncompressedUniqueEntries []int `json:"uncompressed_unique_entries,omitempty"`
+ UncompressedUniqueEntries []StateID `json:"uncompressed_unique_entries,omitempty"`
RowNums []int `json:"row_nums"`
OriginalRowCount int `json:"original_row_count"`
OriginalColCount int `json:"original_col_count"`
@@ -193,25 +224,25 @@ type UniqueEntriesTable struct {
}
type TransitionTable struct {
- InitialState int `json:"initial_state"`
- AcceptingStates []int `json:"accepting_states"`
+ InitialStateID StateID `json:"initial_state_id"`
+ AcceptingStates []LexModeKindID `json:"accepting_states"`
RowCount int `json:"row_count"`
ColCount int `json:"col_count"`
Transition *UniqueEntriesTable `json:"transition,omitempty"`
- UncompressedTransition []int `json:"uncompressed_transition,omitempty"`
+ UncompressedTransition []StateID `json:"uncompressed_transition,omitempty"`
}
type CompiledLexModeSpec struct {
- Kinds []LexKind `json:"kinds"`
- Push []LexModeNum `json:"push"`
- Pop []int `json:"pop"`
- DFA *TransitionTable `json:"dfa"`
+ KindNames []LexKindName `json:"kind_names"`
+ Push []LexModeID `json:"push"`
+ Pop []int `json:"pop"`
+ DFA *TransitionTable `json:"dfa"`
}
type CompiledLexSpec struct {
- InitialMode LexModeNum `json:"initial_mode"`
- Modes []LexModeName `json:"modes"`
- Kinds []LexKind `json:"kinds"`
+ InitialModeID LexModeID `json:"initial_mode_id"`
+ ModeNames []LexModeName `json:"mode_names"`
+ KindNames []LexKindName `json:"kind_names"`
KindIDs [][]LexKindID `json:"kind_ids"`
CompressionLevel int `json:"compression_level"`
Specs []*CompiledLexModeSpec `json:"specs"`
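At compression level 2, the driver has to read transitions back out of the `RowDisplacementTable`. The branch of `lookupNextState` that does this is outside the context shown in this diff, so the following is only a sketch of a conventional row-displacement lookup; the `Bounds` check and all concrete values are assumptions made for illustration.

```go
package main

import (
	"fmt"

	"github.com/nihei9/maleeni/spec"
)

// lookupRD sketches a conventional row-displacement lookup: each original row
// is shifted by RowDisplacement[row], Bounds records which row owns each slot,
// and misses fall back to EmptyValue (StateIDNil for transition tables).
func lookupRD(t *spec.RowDisplacementTable, row, col int) spec.StateID {
	pos := t.RowDisplacement[row] + col
	if t.Bounds[pos] != row {
		return t.EmptyValue
	}
	return t.Entries[pos]
}

func main() {
	// A tiny hypothetical table: two rows of four columns, where only
	// row 1, column 3 holds a real transition (to state 5).
	t := &spec.RowDisplacementTable{
		OriginalRowCount: 2,
		OriginalColCount: 4,
		EmptyValue:       spec.StateIDNil,
		Entries:          []spec.StateID{0, 0, 0, 5},
		Bounds:           []int{-1, -1, -1, 1},
		RowDisplacement:  []int{0, 0},
	}
	fmt.Println(lookupRD(t, 1, 3)) // 5
	fmt.Println(lookupRD(t, 0, 3)) // 0 (empty)
}
```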