author | Ryo Nihei <nihei.dev@gmail.com> | 2021-08-01 17:17:53 +0900 |
---|---|---|
committer | Ryo Nihei <nihei.dev@gmail.com> | 2021-08-01 21:18:34 +0900 |
commit | 2433c27f26bc1be2d9b33f6550482abc48fa31ef (patch) | |
tree | c7cbc22929045e2d5dccdc37cc978138b59c1bdb | |
parent | Add unique kind IDs to tokens (diff) | |
download | tre-2433c27f26bc1be2d9b33f6550482abc48fa31ef.tar.gz tre-2433c27f26bc1be2d9b33f6550482abc48fa31ef.tar.xz |
Change APIs
Change the fields of tokens (the results of lexical analysis) as follows:
- Rename: mode -> mode_id
- Rename: kind_id -> mode_kind_id
- Add: kind_id
The kind ID is unique across all modes, but the mode kind ID is unique only within a mode.
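To illustrate how the two IDs relate: the compiled spec carries a per-mode table (`KindIDs`) that maps a mode-local kind ID to the global kind ID, and the driver consults it when it builds a token. Below is a minimal sketch of that lookup; the concrete ID table is hypothetical, only the field names come from this commit.

```go
// Sketch only: resolving a mode-local kind ID (mode_kind_id) to the global
// kind ID (kind_id), mirroring the lookup the driver performs when it builds
// a token. The ID table below is made up for illustration.
package main

import (
	"fmt"

	"github.com/nihei9/maleeni/spec"
)

func globalKindID(clspec *spec.CompiledLexSpec, mode spec.LexModeID, modeKindID spec.LexModeKindID) spec.LexKindID {
	// KindIDs is indexed by mode first, then by the mode-local kind ID.
	return clspec.KindIDs[mode][modeKindID]
}

func main() {
	clspec := &spec.CompiledLexSpec{
		KindIDs: [][]spec.LexKindID{
			nil,       // index 0 is the nil mode
			{0, 1, 2}, // default mode: mode kind IDs 1, 2 -> kind IDs 1, 2
			{0, 3},    // a second mode: mode kind ID 1    -> kind ID 3
		},
	}
	fmt.Println(globalKindID(clspec, 2, 1)) // 3
}
```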
Change the fields of a transition table as follows (see the JSON sketch after this list):
- Rename: initial_mode -> initial_mode_id
- Rename: modes -> mode_names
- Rename: kinds -> kind_names
- Rename: specs[].kinds -> specs[].kind_names
- Rename: specs[].dfa.initial_state -> specs[].dfa.initial_state_id
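To make the renames concrete, here is a minimal, hypothetical sketch that marshals a toy compiled spec and prints the new JSON keys (`initial_mode_id`, `mode_names`, `kind_names`, `specs[].kind_names`, `specs[].dfa.initial_state_id`). The values are made up; only the field and key names come from the spec package as changed here.

```go
// Sketch only: a toy compiled spec marshalled to JSON to show the renamed keys.
package main

import (
	"encoding/json"
	"fmt"

	"github.com/nihei9/maleeni/spec"
)

func main() {
	clspec := &spec.CompiledLexSpec{
		InitialModeID: spec.LexModeIDDefault,
		ModeNames:     []spec.LexModeName{spec.LexModeNameNil, spec.LexModeNameDefault},
		KindNames:     []spec.LexKindName{spec.LexKindNameNil, "word"},
		Specs: []*spec.CompiledLexModeSpec{
			nil, // index 0 is the nil mode
			{
				KindNames: []spec.LexKindName{spec.LexKindNameNil, "word"},
				DFA: &spec.TransitionTable{
					InitialStateID: spec.StateIDMin,
				},
			},
		},
	}
	b, err := json.MarshalIndent(clspec, "", "  ")
	if err != nil {
		panic(err)
	}
	// Prints keys such as "initial_mode_id", "mode_names", "kind_names",
	// and, under "specs", "kind_names" and "dfa"."initial_state_id".
	fmt.Println(string(b))
}
```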
Change the public types defined in the spec package as follows (see the sketch after this list):
- Rename: LexModeNum -> LexModeID
- Rename: LexKind -> LexKindName
- Add: LexKindID
- Add: StateID
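The renamed and added types are thin defined types over int and string with small helper methods, so the different ID spaces cannot be mixed up silently. A minimal usage sketch, assuming the spec package as it stands after this commit:

```go
// Sketch only: exercising the renamed/added identifier types in the spec
// package (LexModeID, LexKindName, LexKindID, StateID).
package main

import (
	"fmt"

	"github.com/nihei9/maleeni/spec"
)

func main() {
	var modeID spec.LexModeID = spec.LexModeIDDefault // was LexModeNum
	var kindName spec.LexKindName = "word"            // was LexKind
	kindID := spec.LexKindIDMin                       // added: unique across all modes
	stateID := spec.StateIDMin                        // added: a state of a transition table

	fmt.Println(modeID.Int(), modeID.IsNil()) // 1 false
	fmt.Println(kindName.String())            // word
	fmt.Println(kindID.Int(), stateID.Int())  // 1 1
}
```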
-rw-r--r-- | README.md | 46
-rw-r--r-- | compiler/ast.go | 6
-rw-r--r-- | compiler/compiler.go | 112
-rw-r--r-- | compiler/dfa.go | 18
-rw-r--r-- | compiler/dfa_test.go | 6
-rw-r--r-- | compiler/parser.go | 12
-rw-r--r-- | compiler/parser_test.go | 8
-rw-r--r-- | compiler/test_util_test.go | 4
-rw-r--r-- | driver/lexer.go | 137
-rw-r--r-- | driver/lexer_test.go | 18
-rw-r--r-- | spec/spec.go | 153
11 files changed, 289 insertions, 231 deletions
@@ -47,33 +47,33 @@ If you want to make sure that the lexical specification behaves as expected, you
 ⚠️ An encoding that `maleeni lex` and the driver can handle is only UTF-8.
 ```sh
-$ echo -n 'The truth is out there.' | maleeni lex clexspec.json | jq -r '[.kind_name, .text, .eof] | @csv'
-"word","The",false
-"whitespace"," ",false
-"word","truth",false
-"whitespace"," ",false
-"word","is",false
-"whitespace"," ",false
-"word","out",false
-"whitespace"," ",false
-"word","there",false
-"punctuation",".",false
-"","",true
+$ echo -n 'The truth is out there.' | maleeni lex clexspec.json | jq -r '[.kind_id, .kind_name, .text, .eof] | @csv'
+2,"word","The",false
+1,"whitespace"," ",false
+2,"word","truth",false
+1,"whitespace"," ",false
+2,"word","is",false
+1,"whitespace"," ",false
+2,"word","out",false
+1,"whitespace"," ",false
+2,"word","there",false
+3,"punctuation",".",false
+0,"","",true
 ```
 The JSON format of tokens that `maleeni lex` command prints is as follows:
-| Field | Type | Description |
-|-----------|-------------------|-----------------------------------------------------------------------------------------|
-| mode | integer | `mode` represents a number that corresponds to a `mode_name`. |
-| mode_name | string | `mode_name` is a mode name that represents in which mode the lexer detected the token. |
-| kind_id | integer | `kind_id` represents an ID of a kind and is unique among modes. |
-| kind | integer | `kind` represents a number that corresponds to a `KindName`. |
-| kind_name | string | `kind_name` is a kind name that represents what kind the token has. |
-| match | array of integers | `match` is a byte sequence matched a pattern of a lexical specification. |
-| text | string | `text` is a string representation of `match`. |
-| eof | bool | If `eof` is true, it means the token is the EOF token. |
-| invalid | bool | If `invalid` is true, it means the token is an error token. |
+| Field | Type | Description |
+|--------------|-------------------|-----------------------------------------------------------------------------------------------------------------------------------------|
+| mode_id | integer | An ID of a lex mode. |
+| mode_name | string | A name of a lex mode. |
+| kind_id | integer | An ID of a kind. This is unique among all modes. |
+| mode_kind_id | integer | An ID of a lexical kind. This is unique only within a mode. Note that you need to use the `kind_id` field if you want to identify a kind across all modes. |
+| kind_name | string | A name of a lexical kind. |
+| match | array of integers | A byte sequence of a lexeme. |
+| text | string | A string representation of a lexeme. |
+| eof | bool | When this field is `true`, it means the token is the EOF token. |
+| invalid | bool | When this field is `true`, it means the token is an error token. |
 When using the driver, please import `github.com/nihei9/maleeni/driver` and `github.com/nihei9/maleeni/spec` package.
You can use the driver easily in the following way: diff --git a/compiler/ast.go b/compiler/ast.go index 7d3965a..046662e 100644 --- a/compiler/ast.go +++ b/compiler/ast.go @@ -3,6 +3,8 @@ package compiler import ( "fmt" "io" + + "github.com/nihei9/maleeni/spec" ) type astNode interface { @@ -78,13 +80,13 @@ func (n *symbolNode) last() *symbolPositionSet { } type endMarkerNode struct { - id int + id spec.LexModeKindID pos symbolPosition firstMemo *symbolPositionSet lastMemo *symbolPositionSet } -func newEndMarkerNode(id int) *endMarkerNode { +func newEndMarkerNode(id spec.LexModeKindID) *endMarkerNode { return &endMarkerNode{ id: id, pos: symbolPositionNil, diff --git a/compiler/compiler.go b/compiler/compiler.go index 5d3e52f..5d1a1d5 100644 --- a/compiler/compiler.go +++ b/compiler/compiler.go @@ -54,28 +54,28 @@ func Compile(lexspec *spec.LexSpec, opts ...CompilerOption) (*spec.CompiledLexSp } } - modeEntries, modes, modeNums, fragmetns := groupEntriesByLexMode(lexspec.Entries) + modeEntries, modeNames, modeName2ID, fragmetns := groupEntriesByLexMode(lexspec.Entries) modeSpecs := []*spec.CompiledLexModeSpec{ nil, } for i, es := range modeEntries[1:] { - modeName := modes[i+1] + modeName := modeNames[i+1] config.logger.Log("Compile %v mode:", modeName) - modeSpec, err := compile(es, modeNums, fragmetns, config) + modeSpec, err := compile(es, modeName2ID, fragmetns, config) if err != nil { return nil, fmt.Errorf("failed to compile in %v mode: %w", modeName, err) } modeSpecs = append(modeSpecs, modeSpec) } - var kindNames []spec.LexKind - var name2ID map[spec.LexKind]spec.LexKindID + var kindNames []spec.LexKindName + var name2ID map[spec.LexKindName]spec.LexKindID { - name2ID = map[spec.LexKind]spec.LexKindID{} + name2ID = map[spec.LexKindName]spec.LexKindID{} id := spec.LexKindIDMin for _, modeSpec := range modeSpecs[1:] { - for _, name := range modeSpec.Kinds[1:] { + for _, name := range modeSpec.KindNames[1:] { if _, ok := name2ID[name]; ok { continue } @@ -84,7 +84,7 @@ func Compile(lexspec *spec.LexSpec, opts ...CompilerOption) (*spec.CompiledLexSp } } - kindNames = make([]spec.LexKind, len(name2ID)+1) + kindNames = make([]spec.LexKindName, len(name2ID)+1) for name, id := range name2ID { kindNames[id] = name } @@ -94,8 +94,8 @@ func Compile(lexspec *spec.LexSpec, opts ...CompilerOption) (*spec.CompiledLexSp { kindIDs = make([][]spec.LexKindID, len(modeSpecs)) for i, modeSpec := range modeSpecs[1:] { - ids := make([]spec.LexKindID, len(modeSpec.Kinds)) - for modeID, name := range modeSpec.Kinds { + ids := make([]spec.LexKindID, len(modeSpec.KindNames)) + for modeID, name := range modeSpec.KindNames { if modeID == 0 { continue } @@ -106,25 +106,25 @@ func Compile(lexspec *spec.LexSpec, opts ...CompilerOption) (*spec.CompiledLexSp } return &spec.CompiledLexSpec{ - InitialMode: spec.LexModeNumDefault, - Modes: modes, - Kinds: kindNames, + InitialModeID: spec.LexModeIDDefault, + ModeNames: modeNames, + KindNames: kindNames, KindIDs: kindIDs, CompressionLevel: config.compLv, Specs: modeSpecs, }, nil } -func groupEntriesByLexMode(entries []*spec.LexEntry) ([][]*spec.LexEntry, []spec.LexModeName, map[spec.LexModeName]spec.LexModeNum, map[string]*spec.LexEntry) { - modes := []spec.LexModeName{ +func groupEntriesByLexMode(entries []*spec.LexEntry) ([][]*spec.LexEntry, []spec.LexModeName, map[spec.LexModeName]spec.LexModeID, map[string]*spec.LexEntry) { + modeNames := []spec.LexModeName{ spec.LexModeNameNil, spec.LexModeNameDefault, } - modeNums := map[spec.LexModeName]spec.LexModeNum{ - 
spec.LexModeNameNil: spec.LexModeNumNil, - spec.LexModeNameDefault: spec.LexModeNumDefault, + modeName2ID := map[spec.LexModeName]spec.LexModeID{ + spec.LexModeNameNil: spec.LexModeIDNil, + spec.LexModeNameDefault: spec.LexModeIDDefault, } - lastModeNum := spec.LexModeNumDefault + lastModeID := spec.LexModeIDDefault modeEntries := [][]*spec.LexEntry{ nil, {}, @@ -141,30 +141,30 @@ func groupEntriesByLexMode(entries []*spec.LexEntry) ([][]*spec.LexEntry, []spec spec.LexModeNameDefault, } } - for _, mode := range ms { - num, ok := modeNums[mode] + for _, modeName := range ms { + modeID, ok := modeName2ID[modeName] if !ok { - num = lastModeNum.Succ() - lastModeNum = num - modeNums[mode] = num - modes = append(modes, mode) + modeID = lastModeID + 1 + lastModeID = modeID + modeName2ID[modeName] = modeID + modeNames = append(modeNames, modeName) modeEntries = append(modeEntries, []*spec.LexEntry{}) } - modeEntries[num] = append(modeEntries[num], e) + modeEntries[modeID] = append(modeEntries[modeID], e) } } - return modeEntries, modes, modeNums, fragments + return modeEntries, modeNames, modeName2ID, fragments } -func compile(entries []*spec.LexEntry, modeNums map[spec.LexModeName]spec.LexModeNum, fragments map[string]*spec.LexEntry, config *compilerConfig) (*spec.CompiledLexModeSpec, error) { - var kinds []spec.LexKind - var patterns map[int][]byte +func compile(entries []*spec.LexEntry, modeName2ID map[spec.LexModeName]spec.LexModeID, fragments map[string]*spec.LexEntry, config *compilerConfig) (*spec.CompiledLexModeSpec, error) { + var kindNames []spec.LexKindName + var patterns map[spec.LexModeKindID][]byte { - kinds = append(kinds, spec.LexKindNil) - patterns = map[int][]byte{} + kindNames = append(kindNames, spec.LexKindNameNil) + patterns = map[spec.LexModeKindID][]byte{} for i, e := range entries { - kinds = append(kinds, e.Kind) - patterns[i+1] = []byte(e.Pattern) + kindNames = append(kindNames, e.Kind) + patterns[spec.LexModeKindID(i+1)] = []byte(e.Pattern) } config.logger.Log("Patterns:") @@ -173,16 +173,16 @@ func compile(entries []*spec.LexEntry, modeNums map[spec.LexModeName]spec.LexMod } } - push := []spec.LexModeNum{ - spec.LexModeNumNil, + push := []spec.LexModeID{ + spec.LexModeIDNil, } pop := []int{ 0, } for _, e := range entries { - pushV := spec.LexModeNumNil + pushV := spec.LexModeIDNil if e.Push != "" { - pushV = modeNums[e.Push] + pushV = modeName2ID[e.Push] } push = append(push, pushV) popV := 0 @@ -222,7 +222,7 @@ func compile(entries []*spec.LexEntry, modeNums map[spec.LexModeName]spec.LexMod config.logger.Log(`DFA: States: %v states (%v entries) - Initial State: %v`, tranTab.RowCount, tranTab.RowCount*tranTab.ColCount, tranTab.InitialState) + Initial State ID: %v`, tranTab.RowCount, tranTab.RowCount*tranTab.ColCount, tranTab.InitialStateID) config.logger.Log(" Accepting States:") for state, symbol := range tranTab.AcceptingStates { config.logger.Log(" %v: %v", state, symbol) @@ -244,10 +244,10 @@ func compile(entries []*spec.LexEntry, modeNums map[spec.LexModeName]spec.LexMod } return &spec.CompiledLexModeSpec{ - Kinds: kinds, - Push: push, - Pop: pop, - DFA: tranTab, + KindNames: kindNames, + Push: push, + Pop: pop, + DFA: tranTab, }, nil } @@ -259,7 +259,7 @@ const ( func compressTransitionTableLv2(tranTab *spec.TransitionTable) (*spec.TransitionTable, error) { ueTab := compressor.NewUniqueEntriesTable() { - orig, err := compressor.NewOriginalTable(tranTab.UncompressedTransition, tranTab.ColCount) + orig, err := 
compressor.NewOriginalTable(convertStateIDSliceToIntSlice(tranTab.UncompressedTransition), tranTab.ColCount) if err != nil { return nil, err } @@ -285,8 +285,8 @@ func compressTransitionTableLv2(tranTab *spec.TransitionTable) (*spec.Transition UniqueEntries: &spec.RowDisplacementTable{ OriginalRowCount: rdTab.OriginalRowCount, OriginalColCount: rdTab.OriginalColCount, - EmptyValue: rdTab.EmptyValue, - Entries: rdTab.Entries, + EmptyValue: spec.StateIDNil, + Entries: convertIntSliceToStateIDSlice(rdTab.Entries), Bounds: rdTab.Bounds, RowDisplacement: rdTab.RowDisplacement, }, @@ -302,7 +302,7 @@ func compressTransitionTableLv2(tranTab *spec.TransitionTable) (*spec.Transition func compressTransitionTableLv1(tranTab *spec.TransitionTable) (*spec.TransitionTable, error) { ueTab := compressor.NewUniqueEntriesTable() { - orig, err := compressor.NewOriginalTable(tranTab.UncompressedTransition, tranTab.ColCount) + orig, err := compressor.NewOriginalTable(convertStateIDSliceToIntSlice(tranTab.UncompressedTransition), tranTab.ColCount) if err != nil { return nil, err } @@ -313,7 +313,7 @@ func compressTransitionTableLv1(tranTab *spec.TransitionTable) (*spec.Transition } tranTab.Transition = &spec.UniqueEntriesTable{ - UncompressedUniqueEntries: ueTab.UniqueEntries, + UncompressedUniqueEntries: convertIntSliceToStateIDSlice(ueTab.UniqueEntries), RowNums: ueTab.RowNums, OriginalRowCount: ueTab.OriginalRowCount, OriginalColCount: ueTab.OriginalColCount, @@ -322,3 +322,19 @@ func compressTransitionTableLv1(tranTab *spec.TransitionTable) (*spec.Transition return tranTab, nil } + +func convertStateIDSliceToIntSlice(s []spec.StateID) []int { + is := make([]int, len(s)) + for i, v := range s { + is[i] = v.Int() + } + return is +} + +func convertIntSliceToStateIDSlice(s []int) []spec.StateID { + ss := make([]spec.StateID, len(s)) + for i, v := range s { + ss[i] = spec.StateID(v) + } + return ss +} diff --git a/compiler/dfa.go b/compiler/dfa.go index b94fca8..1d0b26a 100644 --- a/compiler/dfa.go +++ b/compiler/dfa.go @@ -9,7 +9,7 @@ import ( type DFA struct { States []string InitialState string - AcceptingStatesTable map[string]int + AcceptingStatesTable map[string]spec.LexModeKindID TransitionTable map[string][256]string } @@ -65,7 +65,7 @@ func genDFA(root astNode, symTab *symbolTable) *DFA { } } - accTab := map[string]int{} + accTab := map[string]spec.LexModeKindID{} { for h, s := range stateMap { for _, pos := range s.set() { @@ -104,33 +104,33 @@ func genDFA(root astNode, symTab *symbolTable) *DFA { } func genTransitionTable(dfa *DFA) (*spec.TransitionTable, error) { - state2Num := map[string]int{} + stateHash2ID := map[string]spec.StateID{} for i, s := range dfa.States { // Since 0 represents an invalid value in a transition table, // assign a number greater than or equal to 1 to states. 
- state2Num[s] = i + 1 + stateHash2ID[s] = spec.StateID(i + spec.StateIDMin.Int()) } - acc := make([]int, len(dfa.States)+1) + acc := make([]spec.LexModeKindID, len(dfa.States)+1) for _, s := range dfa.States { id, ok := dfa.AcceptingStatesTable[s] if !ok { continue } - acc[state2Num[s]] = id + acc[stateHash2ID[s]] = id } rowCount := len(dfa.States) + 1 colCount := 256 - tran := make([]int, rowCount*colCount) + tran := make([]spec.StateID, rowCount*colCount) for s, tab := range dfa.TransitionTable { for v, to := range tab { - tran[state2Num[s]*256+v] = state2Num[to] + tran[stateHash2ID[s].Int()*256+v] = stateHash2ID[to] } } return &spec.TransitionTable{ - InitialState: state2Num[dfa.InitialState], + InitialStateID: stateHash2ID[dfa.InitialState], AcceptingStates: acc, UncompressedTransition: tran, RowCount: rowCount, diff --git a/compiler/dfa_test.go b/compiler/dfa_test.go index a797313..2683815 100644 --- a/compiler/dfa_test.go +++ b/compiler/dfa_test.go @@ -2,10 +2,12 @@ package compiler import ( "testing" + + "github.com/nihei9/maleeni/spec" ) func TestGenDFA(t *testing.T) { - root, symTab, err := parse(map[int][]byte{ + root, symTab, err := parse(map[spec.LexModeKindID][]byte{ 1: []byte("(a|b)*abb"), }, nil) if err != nil { @@ -94,7 +96,7 @@ func TestGenDFA(t *testing.T) { t.Errorf("initial state is mismatched; want: %v, got: %v", s0.hash(), dfa.InitialState) } - accTab := map[string]int{ + accTab := map[string]spec.LexModeKindID{ s3.hash(): 1, } if len(dfa.AcceptingStatesTable) != len(accTab) { diff --git a/compiler/parser.go b/compiler/parser.go index 06a762d..1c9126a 100644 --- a/compiler/parser.go +++ b/compiler/parser.go @@ -7,6 +7,8 @@ import ( "fmt" "io" "strings" + + "github.com/nihei9/maleeni/spec" ) type ParseErrors struct { @@ -23,7 +25,7 @@ func (e *ParseErrors) Error() string { } type ParseError struct { - ID int + ID spec.LexModeKindID Pattern []byte Cause error Details string @@ -44,13 +46,13 @@ func raiseSyntaxError(synErr *SyntaxError) { type symbolTable struct { symPos2Byte map[symbolPosition]byteRange - endPos2ID map[symbolPosition]int + endPos2ID map[symbolPosition]spec.LexModeKindID } func genSymbolTable(root astNode) *symbolTable { symTab := &symbolTable{ symPos2Byte: map[symbolPosition]byteRange{}, - endPos2ID: map[symbolPosition]int{}, + endPos2ID: map[symbolPosition]spec.LexModeKindID{}, } return genSymTab(symTab, root) } @@ -76,7 +78,7 @@ func genSymTab(symTab *symbolTable, node astNode) *symbolTable { return symTab } -func parse(regexps map[int][]byte, fragments map[string][]byte) (astNode, *symbolTable, error) { +func parse(regexps map[spec.LexModeKindID][]byte, fragments map[string][]byte) (astNode, *symbolTable, error) { if len(regexps) == 0 { return nil, nil, fmt.Errorf("parse() needs at least one token entry") } @@ -159,7 +161,7 @@ func parseFragments(fragments map[string][]byte) (map[string]astNode, error) { return fragmentASTs, nil } -func parseRegexp(regexps map[int][]byte, fragmentASTs map[string]astNode) (astNode, error) { +func parseRegexp(regexps map[spec.LexModeKindID][]byte, fragmentASTs map[string]astNode) (astNode, error) { symPos := symbolPositionMin var root astNode var perrs []*ParseError diff --git a/compiler/parser_test.go b/compiler/parser_test.go index 3394845..30e6130 100644 --- a/compiler/parser_test.go +++ b/compiler/parser_test.go @@ -4,6 +4,8 @@ import ( "fmt" "reflect" "testing" + + "github.com/nihei9/maleeni/spec" ) func symPos(n uint16) symbolPosition { @@ -1197,7 +1199,7 @@ func TestParse(t *testing.T) { for kind, pattern := 
range tt.fragments { fragments[kind] = []byte(pattern) } - ast, _, err := parse(map[int][]byte{ + ast, _, err := parse(map[spec.LexModeKindID][]byte{ 1: []byte(tt.pattern), }, fragments) if tt.syntaxError != nil { @@ -1237,7 +1239,7 @@ func TestParse(t *testing.T) { } func TestParse_FollowAndSymbolTable(t *testing.T) { - root, symTab, err := parse(map[int][]byte{ + root, symTab, err := parse(map[spec.LexModeKindID][]byte{ 1: []byte("(a|b)*abb"), }, nil) if err != nil { @@ -1295,7 +1297,7 @@ func TestParse_FollowAndSymbolTable(t *testing.T) { symPos(4): entry(byte('b')), symPos(5): entry(byte('b')), }, - endPos2ID: map[symbolPosition]int{ + endPos2ID: map[symbolPosition]spec.LexModeKindID{ endPos(6): 1, }, } diff --git a/compiler/test_util_test.go b/compiler/test_util_test.go index 2ead2c9..72e150b 100644 --- a/compiler/test_util_test.go +++ b/compiler/test_util_test.go @@ -1,5 +1,7 @@ package compiler +import "github.com/nihei9/maleeni/spec" + func newRangeSymbolNodeWithPos(from, to byte, pos symbolPosition) *symbolNode { n := newRangeSymbolNode(from, to) n.pos = pos @@ -13,7 +15,7 @@ func newSymbolNodeWithPos(v byte, pos symbolPosition) *symbolNode { } func newEndMarkerNodeWithPos(id int, pos symbolPosition) *endMarkerNode { - n := newEndMarkerNode(id) + n := newEndMarkerNode(spec.LexModeKindID(id)) n.pos = pos return n } diff --git a/driver/lexer.go b/driver/lexer.go index 7ad2dd0..bad7dbd 100644 --- a/driver/lexer.go +++ b/driver/lexer.go @@ -56,69 +56,70 @@ func (s byteSequence) merge(a byteSequence) byteSequence { // Token representes a token. type Token struct { - // `Mode` represents a number that corresponds to a `ModeName`. - Mode spec.LexModeNum + // ModeID is an ID of a lex mode. + ModeID spec.LexModeID - // `ModeName` is a mode name that represents in which mode the lexer detected the token. + // ModeName is a name of a lex mode. ModeName spec.LexModeName - // `KindID` is a unique ID among modes. - KindID int + // KindID is an ID of a kind. This is unique among all modes. + KindID spec.LexKindID - // `Kind` represents a number that corresponds to a `KindName`. - Kind int + // ModeKindID is an ID of a lexical kind. This is unique only within a mode. + // Note that you need to use KindID field if you want to identify a kind across all modes. + ModeKindID spec.LexModeKindID - // `KindName` is a kind name that represents what kind the token has. - KindName string + // KindName is a name of a lexical kind. + KindName spec.LexKindName - // If `EOF` is true, it means the token is the EOF token. + // When this field is true, it means the token is the EOF token. EOF bool - // If `Invalid` is true, it means the token is an error token. + // When this field is true, it means the token is an error token. Invalid bool - // `match` is a byte sequence matched a pattern of a lexical specification. + // match is a byte sequence matched a pattern of a lexical specification. 
match byteSequence } -func newToken(mode spec.LexModeNum, modeName spec.LexModeName, kindID int, modeKindID int, kindName string, match byteSequence) *Token { +func newToken(modeID spec.LexModeID, modeName spec.LexModeName, kindID spec.LexKindID, modeKindID spec.LexModeKindID, kindName spec.LexKindName, match byteSequence) *Token { return &Token{ - Mode: mode, - ModeName: modeName, - KindID: kindID, - Kind: modeKindID, - KindName: kindName, - match: match, + ModeID: modeID, + ModeName: modeName, + KindID: kindID, + ModeKindID: modeKindID, + KindName: kindName, + match: match, } } -func newEOFToken(mode spec.LexModeNum, modeName spec.LexModeName) *Token { +func newEOFToken(modeID spec.LexModeID, modeName spec.LexModeName) *Token { return &Token{ - Mode: mode, - ModeName: modeName, - Kind: 0, - EOF: true, + ModeID: modeID, + ModeName: modeName, + ModeKindID: 0, + EOF: true, } } -func newInvalidToken(mode spec.LexModeNum, modeName spec.LexModeName, match byteSequence) *Token { +func newInvalidToken(modeID spec.LexModeID, modeName spec.LexModeName, match byteSequence) *Token { return &Token{ - Mode: mode, - ModeName: modeName, - Kind: 0, - match: match, - Invalid: true, + ModeID: modeID, + ModeName: modeName, + ModeKindID: 0, + match: match, + Invalid: true, } } func (t *Token) String() string { if t.Invalid { - return fmt.Sprintf("!{mode: %v, mode name: %v, text: %v, byte: %v}", t.Mode, t.ModeName, t.Text(), t.Match()) + return fmt.Sprintf("!{mode id: %v, mode name: %v, text: %v, byte: %v}", t.ModeID, t.ModeName, t.Text(), t.Match()) } if t.EOF { return "{eof}" } - return fmt.Sprintf("{mode: %v, mode name: %v, kind: %v, kind name: %v, text: %v, byte: %v}", t.Mode, t.ModeName, t.Kind, t.KindName, t.Text(), t.Match()) + return fmt.Sprintf("{mode id: %v, mode name: %v, kind id: %v, mode kind id: %v, kind name: %v, text: %v, byte: %v}", t.ModeID, t.ModeName, t.KindID, t.ModeKindID, t.KindName, t.Text(), t.Match()) } // Match returns a byte slice matched a pattern of a lexical specification. 
@@ -133,25 +134,25 @@ func (t *Token) Text() string { func (t *Token) MarshalJSON() ([]byte, error) { return json.Marshal(struct { - Mode int `json:"mode"` - ModeName string `json:"mode_name"` - KindID int `json:"kind_id"` - Kind int `json:"kind"` - KindName string `json:"kind_name"` - Match byteSequence `json:"match"` - Text string `json:"text"` - EOF bool `json:"eof"` - Invalid bool `json:"invalid"` + ModeID int `json:"mode_id"` + ModeName string `json:"mode_name"` + KindID int `json:"kind_id"` + ModeKindID int `json:"mode_kind_id"` + KindName string `json:"kind_name"` + Match byteSequence `json:"match"` + Text string `json:"text"` + EOF bool `json:"eof"` + Invalid bool `json:"invalid"` }{ - Mode: t.Mode.Int(), - ModeName: t.ModeName.String(), - KindID: t.KindID, - Kind: t.Kind, - KindName: t.KindName, - Match: t.match, - Text: t.Text(), - EOF: t.EOF, - Invalid: t.Invalid, + ModeID: t.ModeID.Int(), + ModeName: t.ModeName.String(), + KindID: t.KindID.Int(), + ModeKindID: t.ModeKindID.Int(), + KindName: t.KindName.String(), + Match: t.match, + Text: t.Text(), + EOF: t.EOF, + Invalid: t.Invalid, }) } @@ -180,7 +181,7 @@ type Lexer struct { src []byte srcPtr int tokBuf []*Token - modeStack []spec.LexModeNum + modeStack []spec.LexModeID passiveModeTran bool logger log.Logger } @@ -194,8 +195,8 @@ func NewLexer(clspec *spec.CompiledLexSpec, src io.Reader, opts ...LexerOption) clspec: clspec, src: b, srcPtr: 0, - modeStack: []spec.LexModeNum{ - clspec.InitialMode, + modeStack: []spec.LexModeID{ + clspec.InitialModeID, }, passiveModeTran: false, logger: log.NewNopLogger(), @@ -216,7 +217,7 @@ func (l *Lexer) Next() (*Token, error) { State: mode: #%v %v pointer: %v - token buffer: %v`, l.Mode(), l.clspec.Modes[l.Mode()], l.srcPtr, l.tokBuf) + token buffer: %v`, l.Mode(), l.clspec.ModeNames[l.Mode()], l.srcPtr, l.tokBuf) if len(l.tokBuf) > 0 { tok := l.tokBuf[0] @@ -273,13 +274,13 @@ func (l *Lexer) nextAndTransition() (*Token, error) { return tok, nil } spec := l.clspec.Specs[l.Mode()] - if spec.Pop[tok.Kind] == 1 { + if spec.Pop[tok.ModeKindID] == 1 { err := l.PopMode() if err != nil { return nil, err } } - mode := spec.Push[tok.Kind] + mode := spec.Push[tok.ModeKindID] if !mode.IsNil() { l.PushMode(mode) } @@ -296,9 +297,9 @@ func (l *Lexer) nextAndTransition() (*Token, error) { func (l *Lexer) next() (*Token, error) { mode := l.Mode() - modeName := l.clspec.Modes[mode] + modeName := l.clspec.ModeNames[mode] spec := l.clspec.Specs[mode] - state := spec.DFA.InitialState + state := spec.DFA.InitialStateID buf := []byte{} unfixedBufLen := 0 var tok *Token @@ -330,13 +331,13 @@ func (l *Lexer) next() (*Token, error) { modeKindID := spec.DFA.AcceptingStates[state] if modeKindID != 0 { kindID := l.clspec.KindIDs[mode][modeKindID] - tok = newToken(mode, modeName, kindID.Int(), modeKindID, spec.Kinds[modeKindID].String(), newByteSequence(buf)) + tok = newToken(mode, modeName, kindID, modeKindID, spec.KindNames[modeKindID], newByteSequence(buf)) unfixedBufLen = 0 } } } -func (l *Lexer) lookupNextState(mode spec.LexModeNum, state int, v int) (int, bool) { +func (l *Lexer) lookupNextState(mode spec.LexModeID, state spec.StateID, v int) (spec.StateID, bool) { switch l.clspec.CompressionLevel { case 2: tab := l.clspec.Specs[mode].DFA.Transition @@ -349,24 +350,24 @@ func (l *Lexer) lookupNextState(mode spec.LexModeNum, state int, v int) (int, bo case 1: tab := l.clspec.Specs[mode].DFA.Transition next := tab.UncompressedUniqueEntries[tab.RowNums[state]*tab.OriginalColCount+v] - if next == 0 { - return 0, 
false + if next == spec.StateIDNil { + return spec.StateIDNil, false } return next, true } - spec := l.clspec.Specs[mode] - next := spec.DFA.UncompressedTransition[state*spec.DFA.ColCount+v] - if next == 0 { - return 0, false + modeSpec := l.clspec.Specs[mode] + next := modeSpec.DFA.UncompressedTransition[state.Int()*modeSpec.DFA.ColCount+v] + if next == spec.StateIDNil { + return spec.StateIDNil, false } return next, true } -func (l *Lexer) Mode() spec.LexModeNum { +func (l *Lexer) Mode() spec.LexModeID { return l.modeStack[len(l.modeStack)-1] } -func (l *Lexer) PushMode(mode spec.LexModeNum) { +func (l *Lexer) PushMode(mode spec.LexModeID) { l.modeStack = append(l.modeStack, mode) } diff --git a/driver/lexer_test.go b/driver/lexer_test.go index 79ee12e..5abe83c 100644 --- a/driver/lexer_test.go +++ b/driver/lexer_test.go @@ -16,7 +16,7 @@ func newLexEntry(modes []string, kind string, pattern string, push string, pop b ms = append(ms, spec.LexModeName(m)) } return &spec.LexEntry{ - Kind: spec.LexKind(kind), + Kind: spec.LexKindName(kind), Pattern: spec.LexPattern(pattern), Modes: ms, Push: spec.LexModeName(push), @@ -26,7 +26,7 @@ func newLexEntry(modes []string, kind string, pattern string, push string, pop b func newLexEntryDefaultNOP(kind string, pattern string) *spec.LexEntry { return &spec.LexEntry{ - Kind: spec.LexKind(kind), + Kind: spec.LexKindName(kind), Pattern: spec.LexPattern(pattern), Modes: []spec.LexModeName{ spec.LexModeNameDefault, @@ -36,18 +36,18 @@ func newLexEntryDefaultNOP(kind string, pattern string) *spec.LexEntry { func newLexEntryFragment(kind string, pattern string) *spec.LexEntry { return &spec.LexEntry{ - Kind: spec.LexKind(kind), + Kind: spec.LexKindName(kind), Pattern: spec.LexPattern(pattern), Fragment: true, } } func newTokenDefault(kindID int, modeKindID int, kindName string, match byteSequence) *Token { - return newToken(spec.LexModeNumDefault, spec.LexModeNameDefault, kindID, modeKindID, kindName, match) + return newToken(spec.LexModeIDDefault, spec.LexModeNameDefault, spec.LexKindID(kindID), spec.LexModeKindID(modeKindID), spec.LexKindName(kindName), match) } func newEOFTokenDefault() *Token { - return newEOFToken(spec.LexModeNumDefault, spec.LexModeNameDefault) + return newEOFToken(spec.LexModeIDDefault, spec.LexModeNameDefault) } func TestLexer_Next(t *testing.T) { @@ -604,7 +604,7 @@ func TestLexer_Next(t *testing.T) { }, passiveModeTran: true, tran: func(l *Lexer, tok *Token) error { - switch l.clspec.Modes[l.Mode().Int()] { + switch l.clspec.ModeNames[l.Mode()] { case "default": switch tok.KindName { case "push_1": @@ -653,7 +653,7 @@ func TestLexer_Next(t *testing.T) { // Active mode transition and an external transition function can be used together. 
passiveModeTran: false, tran: func(l *Lexer, tok *Token) error { - switch l.clspec.Modes[l.Mode().Int()] { + switch l.clspec.ModeNames[l.Mode()] { case "mode_1": switch tok.KindName { case "push_2": @@ -736,10 +736,10 @@ func TestLexer_Next(t *testing.T) { func testToken(t *testing.T, expected, actual *Token) { t.Helper() - if actual.Mode != expected.Mode || + if actual.ModeID != expected.ModeID || actual.ModeName != expected.ModeName || actual.KindID != expected.KindID || - actual.Kind != expected.Kind || + actual.ModeKindID != expected.ModeKindID || actual.KindName != expected.KindName || !bytes.Equal(actual.Match(), expected.Match()) || actual.EOF != expected.EOF || diff --git a/spec/spec.go b/spec/spec.go index b8aae33..d4f6346 100644 --- a/spec/spec.go +++ b/spec/spec.go @@ -7,40 +7,56 @@ import ( "strings" ) -const lexKindPattern = "[A-Za-z_][0-9A-Za-z_]*" +// LexKindID represents an ID of a lexical kind and is unique across all modes. +type LexKindID int + +const ( + LexKindIDNil = LexKindID(0) + LexKindIDMin = LexKindID(1) +) + +func (id LexKindID) Int() int { + return int(id) +} + +// LexModeKindID represents an ID of a lexical kind and is unique within a mode. +// Use LexKindID to identify a kind across all modes uniquely. +type LexModeKindID int + +const ( + LexModeKindIDNil = LexKindID(0) + LexModeKindIDMin = LexKindID(1) +) -var lexKindRE = regexp.MustCompile(lexKindPattern) +func (id LexModeKindID) Int() int { + return int(id) +} -type LexKind string +// LexKindName represents a name of a lexical kind. +type LexKindName string -const LexKindNil = LexKind("") +const LexKindNameNil = LexKindName("") -func (k LexKind) String() string { +func (k LexKindName) String() string { return string(k) } -func (k LexKind) validate() error { +func (k LexKindName) validate() error { if k == "" { return fmt.Errorf("kind doesn't allow to be the empty string") } - if !lexKindRE.Match([]byte(k)) { - return fmt.Errorf("kind must be %v", lexKindPattern) + if !lexKindNameRE.Match([]byte(k)) { + return fmt.Errorf("kind must be %v", lexKindNamePattern) } return nil } -// LexKindID is a unique ID among modes. -type LexKindID int - -func (id LexKindID) Int() int { - return int(id) -} +const lexKindNamePattern = "[A-Za-z_][0-9A-Za-z_]*" -const ( - LexKindIDNil = LexKindID(0) - LexKindIDMin = LexKindID(1) -) +var lexKindNameRE = regexp.MustCompile(lexKindNamePattern) +// LexPattern represents a pattern of a lexeme. +// The pattern is written in regular expression. type LexPattern string func (p LexPattern) validate() error { @@ -50,10 +66,27 @@ func (p LexPattern) validate() error { return nil } -const lexModePattern = "[A-Za-z_][0-9A-Za-z_]*" +// LexModeID represents an ID of a lex mode. +type LexModeID int -var lexModeRE = regexp.MustCompile(lexKindPattern) +const ( + LexModeIDNil = LexModeID(0) + LexModeIDDefault = LexModeID(1) +) +func (n LexModeID) String() string { + return strconv.Itoa(int(n)) +} + +func (n LexModeID) Int() int { + return int(n) +} + +func (n LexModeID) IsNil() bool { + return n == LexModeIDNil +} + +// LexModeName represents a name of a lex mode. 
type LexModeName string const ( @@ -66,8 +99,8 @@ func (m LexModeName) String() string { } func (m LexModeName) validate() error { - if m.isNil() || !lexModeRE.Match([]byte(m)) { - return fmt.Errorf("mode must be %v", lexModePattern) + if m.isNil() || !lexModeNameRE.Match([]byte(m)) { + return fmt.Errorf("mode must be %v", lexModeNamePattern) } return nil } @@ -76,31 +109,12 @@ func (m LexModeName) isNil() bool { return m == LexModeNameNil } -type LexModeNum int +const lexModeNamePattern = "[A-Za-z_][0-9A-Za-z_]*" -const ( - LexModeNumNil = LexModeNum(0) - LexModeNumDefault = LexModeNum(1) -) - -func (n LexModeNum) String() string { - return strconv.Itoa(int(n)) -} - -func (n LexModeNum) Int() int { - return int(n) -} - -func (n LexModeNum) Succ() LexModeNum { - return n + 1 -} - -func (n LexModeNum) IsNil() bool { - return n == LexModeNumNil -} +var lexModeNameRE = regexp.MustCompile(lexModeNamePattern) type LexEntry struct { - Kind LexKind `json:"kind"` + Kind LexKindName `json:"kind"` Pattern LexPattern `json:"pattern"` Modes []LexModeName `json:"modes"` Push LexModeName `json:"push"` @@ -174,18 +188,35 @@ func (s *LexSpec) Validate() error { return nil } +// StateID represents an ID of a state of a transition table. +type StateID int + +const ( + // StateIDNil represents an empty entry of a transition table. + // When the driver reads this value, it raises an error meaning lexical analysis failed. + StateIDNil = StateID(0) + + // StateIDMin is the minimum value of the state ID. All valid state IDs are represented as + // sequential numbers starting from this value. + StateIDMin = StateID(1) +) + +func (id StateID) Int() int { + return int(id) +} + type RowDisplacementTable struct { - OriginalRowCount int `json:"original_row_count"` - OriginalColCount int `json:"original_col_count"` - EmptyValue int `json:"empty_value"` - Entries []int `json:"entries"` - Bounds []int `json:"bounds"` - RowDisplacement []int `json:"row_displacement"` + OriginalRowCount int `json:"original_row_count"` + OriginalColCount int `json:"original_col_count"` + EmptyValue StateID `json:"empty_value"` + Entries []StateID `json:"entries"` + Bounds []int `json:"bounds"` + RowDisplacement []int `json:"row_displacement"` } type UniqueEntriesTable struct { UniqueEntries *RowDisplacementTable `json:"unique_entries,omitempty"` - UncompressedUniqueEntries []int `json:"uncompressed_unique_entries,omitempty"` + UncompressedUniqueEntries []StateID `json:"uncompressed_unique_entries,omitempty"` RowNums []int `json:"row_nums"` OriginalRowCount int `json:"original_row_count"` OriginalColCount int `json:"original_col_count"` @@ -193,25 +224,25 @@ type UniqueEntriesTable struct { } type TransitionTable struct { - InitialState int `json:"initial_state"` - AcceptingStates []int `json:"accepting_states"` + InitialStateID StateID `json:"initial_state_id"` + AcceptingStates []LexModeKindID `json:"accepting_states"` RowCount int `json:"row_count"` ColCount int `json:"col_count"` Transition *UniqueEntriesTable `json:"transition,omitempty"` - UncompressedTransition []int `json:"uncompressed_transition,omitempty"` + UncompressedTransition []StateID `json:"uncompressed_transition,omitempty"` } type CompiledLexModeSpec struct { - Kinds []LexKind `json:"kinds"` - Push []LexModeNum `json:"push"` - Pop []int `json:"pop"` - DFA *TransitionTable `json:"dfa"` + KindNames []LexKindName `json:"kind_names"` + Push []LexModeID `json:"push"` + Pop []int `json:"pop"` + DFA *TransitionTable `json:"dfa"` } type CompiledLexSpec struct { - InitialMode 
LexModeNum `json:"initial_mode"` - Modes []LexModeName `json:"modes"` - Kinds []LexKind `json:"kinds"` + InitialModeID LexModeID `json:"initial_mode_id"` + ModeNames []LexModeName `json:"mode_names"` + KindNames []LexKindName `json:"kind_names"` KindIDs [][]LexKindID `json:"kind_ids"` CompressionLevel int `json:"compression_level"` Specs []*CompiledLexModeSpec `json:"specs"` |