-rw-r--r-- | compiler/compiler.go | 84
-rw-r--r-- | driver/lexer.go | 135
-rw-r--r-- | driver/lexer_test.go | 421
-rw-r--r-- | spec/spec.go | 75
4 files changed, 504 insertions, 211 deletions
diff --git a/compiler/compiler.go b/compiler/compiler.go index 15f42f3..6f878c5 100644 --- a/compiler/compiler.go +++ b/compiler/compiler.go @@ -42,12 +42,71 @@ func Compile(lexspec *spec.LexSpec, opts ...compilerOption) (*spec.CompiledLexSp } } + modeEntries, modes, modeNums := groupEntriesByLexMode(lexspec.Entries) + + modeSpecs := []*spec.CompiledLexModeSpec{ + nil, + } + for i, es := range modeEntries[1:] { + modeName := modes[i+1] + config.logger.Log("Compile %v mode:", modeName) + modeSpec, err := compile(es, modeNums, config) + if err != nil { + return nil, fmt.Errorf("failed to compile in %v mode: %w", modeName, err) + } + modeSpecs = append(modeSpecs, modeSpec) + } + + return &spec.CompiledLexSpec{ + InitialMode: spec.LexModeNumDefault, + Modes: modes, + Specs: modeSpecs, + }, nil +} + +func groupEntriesByLexMode(entries []*spec.LexEntry) ([][]*spec.LexEntry, []spec.LexModeName, map[spec.LexModeName]spec.LexModeNum) { + modes := []spec.LexModeName{ + spec.LexModeNameNil, + spec.LexModeNameDefault, + } + modeNums := map[spec.LexModeName]spec.LexModeNum{ + spec.LexModeNameNil: spec.LexModeNumNil, + spec.LexModeNameDefault: spec.LexModeNumDefault, + } + lastModeNum := spec.LexModeNumDefault + modeEntries := [][]*spec.LexEntry{ + nil, + []*spec.LexEntry{}, + } + for _, e := range entries { + ms := e.Modes + if len(ms) == 0 { + ms = []spec.LexModeName{ + spec.LexModeNameDefault, + } + } + for _, mode := range ms { + num, ok := modeNums[mode] + if !ok { + num = lastModeNum.Succ() + lastModeNum = num + modeNums[mode] = num + modes = append(modes, mode) + modeEntries = append(modeEntries, []*spec.LexEntry{}) + } + modeEntries[num] = append(modeEntries[num], e) + } + } + return modeEntries, modes, modeNums +} + +func compile(entries []*spec.LexEntry, modeNums map[spec.LexModeName]spec.LexModeNum, config *compilerConfig) (*spec.CompiledLexModeSpec, error) { var kinds []spec.LexKind var patterns map[int][]byte { kinds = append(kinds, spec.LexKindNil) patterns = map[int][]byte{} - for i, e := range lexspec.Entries { + for i, e := range entries { kinds = append(kinds, e.Kind) patterns[i+1] = []byte(e.Pattern) } @@ -58,6 +117,25 @@ func Compile(lexspec *spec.LexSpec, opts ...compilerOption) (*spec.CompiledLexSp } } + push := []spec.LexModeNum{ + spec.LexModeNumNil, + } + pop := []int{ + 0, + } + for _, e := range entries { + pushV := spec.LexModeNumNil + if e.Push != "" { + pushV = modeNums[e.Push] + } + push = append(push, pushV) + popV := 0 + if e.Pop { + popV = 1 + } + pop = append(pop, popV) + } + var root astNode var symTab *symbolTable { @@ -90,8 +168,10 @@ func Compile(lexspec *spec.LexSpec, opts ...compilerOption) (*spec.CompiledLexSp } } - return &spec.CompiledLexSpec{ + return &spec.CompiledLexModeSpec{ Kinds: kinds, + Push: push, + Pop: pop, DFA: tranTab, }, nil }
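The grouping step above is the core of the compiler change: every entry is copied into each mode it names, and modes are numbered in order of first appearance, with slot 0 reserved for the nil mode and slot 1 for `default`. Below is a standalone sketch of that algorithm using plain strings instead of the `spec` types; the `entry` type and all names in it are illustrative, not part of maleeni:

```go
package main

import "fmt"

// entry is a stand-in for spec.LexEntry: a kind plus the modes it is active in.
type entry struct {
	kind  string
	modes []string
}

// groupByMode numbers modes in order of first appearance (0 is reserved for
// the nil mode, 1 for "default") and buckets each entry into every mode it names.
func groupByMode(entries []entry) ([][]entry, []string, map[string]int) {
	modes := []string{"", "default"}
	nums := map[string]int{"": 0, "default": 1}
	buckets := [][]entry{nil, {}}
	for _, e := range entries {
		ms := e.modes
		if len(ms) == 0 {
			// Entries that name no mode belong to the default mode.
			ms = []string{"default"}
		}
		for _, m := range ms {
			n, ok := nums[m]
			if !ok {
				n = len(modes)
				nums[m] = n
				modes = append(modes, m)
				buckets = append(buckets, []entry{})
			}
			buckets[n] = append(buckets[n], e)
		}
	}
	return buckets, modes, nums
}

func main() {
	buckets, modes, _ := groupByMode([]entry{
		{kind: "white_space", modes: []string{"default", "string"}},
		{kind: "string_open"}, // implicitly "default"
		{kind: "char_sequence", modes: []string{"string"}},
	})
	for n := 1; n < len(modes); n++ {
		fmt.Printf("mode #%d %s:", n, modes[n])
		for _, e := range buckets[n] {
			fmt.Printf(" %s", e.kind)
		}
		fmt.Println()
	}
	// Output:
	// mode #1 default: white_space string_open
	// mode #2 string: white_space char_sequence
}
```

Note that an entry listed in several modes is compiled into each of those modes independently, which is why `Compile` runs one `compile` pass per mode.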
diff --git a/driver/lexer.go b/driver/lexer.go index 8a2afd8..72a6f69 100644 --- a/driver/lexer.go +++ b/driver/lexer.go @@ -54,46 +54,54 @@ func (s byteSequence) merge(a byteSequence) byteSequence { } type Token struct { - ID int `json:"id"` - Kind string `json:"kind"` - Match byteSequence `json:"match"` - Text string `json:"text"` - EOF bool `json:"eof"` - Invalid bool `json:"invalid"` + Mode spec.LexModeNum `json:"mode"` + ModeName spec.LexModeName `json:"mode_name"` + ID int `json:"id"` + Kind string `json:"kind"` + Match byteSequence `json:"match"` + Text string `json:"text"` + EOF bool `json:"eof"` + Invalid bool `json:"invalid"` } -func newToken(id int, kind string, match byteSequence) *Token { +func newToken(mode spec.LexModeNum, modeName spec.LexModeName, id int, kind string, match byteSequence) *Token { return &Token{ - ID: id, - Kind: kind, - Match: match, - Text: string(match.ByteSlice()), + Mode: mode, + ModeName: modeName, + ID: id, + Kind: kind, + Match: match, + Text: string(match.ByteSlice()), } } -func newEOFToken() *Token { +func newEOFToken(mode spec.LexModeNum, modeName spec.LexModeName) *Token { return &Token{ - ID: 0, - EOF: true, + Mode: mode, + ModeName: modeName, + ID: 0, + EOF: true, } } -func newInvalidToken(match byteSequence) *Token { +func newInvalidToken(mode spec.LexModeNum, modeName spec.LexModeName, match byteSequence) *Token { return &Token{ - ID: 0, - Match: match, - Invalid: true, + Mode: mode, + ModeName: modeName, + ID: 0, + Match: match, + Invalid: true, } } func (t *Token) String() string { if t.Invalid { - return fmt.Sprintf("!{text: %v, byte: %v}", t.Text, t.Match) + return fmt.Sprintf("!{mode: %v, mode name: %v, text: %v, byte: %v}", t.Mode, t.ModeName, t.Text, t.Match) } if t.EOF { return "{eof}" } - return fmt.Sprintf("{id: %v, kind: %v, text: %v, byte: %v}", t.ID, t.Kind, t.Text, t.Match) + return fmt.Sprintf("{mode: %v, mode name: %v, id: %v, kind: %v, text: %v, byte: %v}", t.Mode, t.ModeName, t.ID, t.Kind, t.Text, t.Match) } type lexerOption func(l *lexer) error @@ -110,11 +118,12 @@ func EnableLogging(w io.Writer) lexerOption { } type lexer struct { - clspec *spec.CompiledLexSpec - src []byte - srcPtr int - tokBuf []*Token - logger log.Logger + clspec *spec.CompiledLexSpec + src []byte + srcPtr int + tokBuf []*Token + modeStack []spec.LexModeNum + logger log.Logger } func NewLexer(clspec *spec.CompiledLexSpec, src io.Reader, opts ...lexerOption) (*lexer, error) { @@ -126,6 +135,9 @@ func NewLexer(clspec *spec.CompiledLexSpec, src io.Reader, opts ...lexerOption) clspec: clspec, src: b, srcPtr: 0, + modeStack: []spec.LexModeNum{ + clspec.InitialMode, + }, logger: log.NewNopLogger(), } for _, opt := range opts { @@ -142,8 +154,9 @@ func NewLexer(clspec *spec.CompiledLexSpec, src io.Reader, opts ...lexerOption) func (l *lexer) Next() (*Token, error) { l.logger.Log(`lexer#Next(): State: + mode: #%v %v pointer: %v - token buffer: %v`, l.srcPtr, l.tokBuf) + token buffer: %v`, l.mode(), l.clspec.Modes[l.mode()], l.srcPtr, l.tokBuf) if len(l.tokBuf) > 0 { tok := l.tokBuf[0] @@ -154,7 +167,7 @@ func (l *lexer) Next() (*Token, error) { return tok, nil } - tok, err := l.next() + tok, err := l.nextAndTranMode() if err != nil { l.logger.Log(" Detects an error: %v", err) return nil, err } @@ -168,7 +181,7 @@ func (l *lexer) Next() (*Token, error) { } errTok := tok for { - tok, err = l.next() + tok, err = l.nextAndTranMode() if err != nil { l.logger.Log(" Detects an error: %v", err) return nil, err } @@ -205,7 +218,7 @@ func (l *lexer) peekN(n int) (*Token, error) { return nil, fmt.Errorf("peekN() can handle only [0..2]") } for len(l.tokBuf) < n+1 { - tok, err := l.next() + tok, err := l.nextAndTranMode() if err != nil { return nil, err } @@ -214,8 +227,41 @@ return l.tokBuf[n], nil } +func (l *lexer) nextAndTranMode() (*Token, error) { + tok, err := l.next() + if err != nil { + return nil, err + } + if tok.EOF || tok.Invalid { + return tok, nil + } + spec := l.clspec.Specs[l.mode()] + if spec.Pop[tok.ID] == 1 { + err := l.popMode() + if err != nil { + return nil, err + } + } + mode := spec.Push[tok.ID] + if !mode.IsNil() { + l.pushMode(mode) + } + // The length of the mode stack must be checked after both the pop and push operations + // because a single token can perform both of them at the same time. + // When the mode stack holds just one element and it is popped, the stack becomes temporarily empty. + // However, since a push operation may be performed immediately afterward, + // the lexer allows the stack to be empty at this intermediate point. + if len(l.modeStack) == 0 { + return nil, fmt.Errorf("a mode stack must have at least one element") + } + return tok, nil +} + func (l *lexer) next() (*Token, error) { - state := l.clspec.DFA.InitialState + mode := l.mode() + modeName := l.clspec.Modes[mode] + spec := l.clspec.Specs[mode] + state := spec.DFA.InitialState buf := []byte{} unfixedBufLen := 0 var tok *Token @@ -229,13 +275,13 @@ func (l *lexer) next() (*Token, error) { // When `buf` has unaccepted data and reads the EOF, // the lexer treats the buffered data as an invalid token. if len(buf) > 0 { - return newInvalidToken(newByteSequence(buf)), nil + return newInvalidToken(mode, modeName, newByteSequence(buf)), nil } - return newEOFToken(), nil + return newEOFToken(mode, modeName), nil } buf = append(buf, v) unfixedBufLen++ - entry := l.clspec.DFA.Transition[state] + entry := spec.DFA.Transition[state] if len(entry) == 0 { return nil, fmt.Errorf("no transition entry; state: %v", state) } @@ -245,17 +291,34 @@ l.unread(unfixedBufLen) return tok, nil } - return newInvalidToken(newByteSequence(buf)), nil + return newInvalidToken(mode, modeName, newByteSequence(buf)), nil } state = nextState - id, ok := l.clspec.DFA.AcceptingStates[state] + id, ok := spec.DFA.AcceptingStates[state] if ok { - tok = newToken(id, l.clspec.Kinds[id].String(), newByteSequence(buf)) + tok = newToken(mode, modeName, id, spec.Kinds[id].String(), newByteSequence(buf)) unfixedBufLen = 0 } } } +func (l *lexer) mode() spec.LexModeNum { + return l.modeStack[len(l.modeStack)-1] +} + +func (l *lexer) pushMode(mode spec.LexModeNum) { + l.modeStack = append(l.modeStack, mode) +} + +func (l *lexer) popMode() error { + sLen := len(l.modeStack) + if sLen == 0 { + return fmt.Errorf("cannot pop a lex mode from an empty mode stack") + } + l.modeStack = l.modeStack[:sLen-1] + return nil +} + func (l *lexer) read() (byte, bool) { if l.srcPtr >= len(l.src) { return 0, true
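The pop-then-push order in `nextAndTranMode` is what lets a single token leave its current mode and enter another one in a single step, which is why the emptiness check has to come last. Here is a minimal, self-contained sketch of that stack discipline, detached from the DFA machinery; the `modeStack` type is invented for illustration and is not part of the driver:

```go
package main

import (
	"errors"
	"fmt"
)

type modeStack []int

func (s modeStack) top() int { return s[len(s)-1] }

// apply performs the pop-then-push transition for one token. Both actions may
// be set at once: the stack can be empty between the two operations, but it
// must be non-empty once both have run.
func (s modeStack) apply(pop bool, push int) (modeStack, error) {
	if pop {
		s = s[:len(s)-1]
	}
	if push != 0 { // 0 plays the role of spec.LexModeNumNil
		s = append(s, push)
	}
	if len(s) == 0 {
		return nil, errors.New("a mode stack must have at least one element")
	}
	return s, nil
}

func main() {
	s := modeStack{1}        // start in the initial mode, like clspec.InitialMode
	s, _ = s.apply(false, 2) // e.g. string_open pushes mode #2
	fmt.Println("current mode:", s.top()) // current mode: 2
	s, _ = s.apply(true, 0) // e.g. string_close pops back
	fmt.Println("current mode:", s.top()) // current mode: 1
	if _, err := s.apply(true, 0); err != nil {
		fmt.Println("error:", err) // popping the last mode is rejected
	}
}
```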
diff --git a/driver/lexer_test.go b/driver/lexer_test.go index 26b5d49..d3edb3c 100644 --- a/driver/lexer_test.go +++ b/driver/lexer_test.go @@ -10,13 +10,38 @@ import ( "github.com/nihei9/maleeni/spec" ) -func newLexEntry(kind string, pattern string) *spec.LexEntry { +func newLexEntry(modes []string, kind string, pattern string, push string, pop bool) *spec.LexEntry { + ms := []spec.LexModeName{} + for _, m := range modes { + ms = append(ms, spec.LexModeName(m)) + } + return &spec.LexEntry{ + Kind: spec.LexKind(kind), + Pattern: spec.LexPattern(pattern), + Modes: ms, + Push: spec.LexModeName(push), + Pop: pop, + } +} + +func newLexEntryDefaultNOP(kind string, pattern string) *spec.LexEntry { return &spec.LexEntry{ Kind: spec.LexKind(kind), Pattern: spec.LexPattern(pattern), + Modes: []spec.LexModeName{ + spec.LexModeNameDefault, + }, } } +func newTokenDefault(id int, kind string, match byteSequence) *Token { + return newToken(spec.LexModeNumDefault, spec.LexModeNameDefault, id, kind, match) +} + +func newEOFTokenDefault() *Token { + return newEOFToken(spec.LexModeNumDefault, spec.LexModeNameDefault) +} + func TestLexer_Next(t *testing.T) { test := []struct { lspec *spec.LexSpec @@ -26,58 +51,58 @@ func TestLexer_Next(t *testing.T) { { lspec: &spec.LexSpec{ Entries: []*spec.LexEntry{ - newLexEntry("t1", "(a|b)*abb"), - newLexEntry("t2", " +"), }, }, src: "abb aabb aaabb babb bbabb abbbabb", tokens: []*Token{ - newToken(1, "t1", newByteSequence([]byte("abb"))), - newToken(2, "t2", newByteSequence([]byte(" "))), - newToken(1, "t1", newByteSequence([]byte("aabb"))), - newToken(2, "t2", newByteSequence([]byte(" "))), - newToken(1, "t1", newByteSequence([]byte("aaabb"))), - newToken(2, "t2", newByteSequence([]byte(" "))), - newToken(1, "t1", newByteSequence([]byte("babb"))), - newToken(2, "t2", newByteSequence([]byte(" "))), - newToken(1, "t1", newByteSequence([]byte("bbabb"))), - newToken(2, "t2", newByteSequence([]byte(" "))), - newToken(1, "t1", newByteSequence([]byte("abbbabb"))), - newEOFToken(), + newLexEntryDefaultNOP("t1", "(a|b)*abb"), + newLexEntryDefaultNOP("t2", " +"), }, }, src: "abb aabb aaabb babb bbabb abbbabb", tokens: []*Token{ + newTokenDefault(1, "t1", newByteSequence([]byte("abb"))), + newTokenDefault(2, "t2", newByteSequence([]byte(" "))), + newTokenDefault(1, "t1", newByteSequence([]byte("aabb"))), + newTokenDefault(2, "t2", newByteSequence([]byte(" "))), + newTokenDefault(1, "t1", newByteSequence([]byte("aaabb"))), + newTokenDefault(2, "t2", newByteSequence([]byte(" "))), + newTokenDefault(1, "t1", newByteSequence([]byte("babb"))), + newTokenDefault(2, "t2", newByteSequence([]byte(" "))), + newTokenDefault(1, "t1", newByteSequence([]byte("bbabb"))), + newTokenDefault(2, "t2", newByteSequence([]byte(" "))), + newTokenDefault(1, "t1", newByteSequence([]byte("abbbabb"))), + newEOFTokenDefault(), }, }, { lspec: &spec.LexSpec{ Entries: []*spec.LexEntry{ - newLexEntry("t1", "b?a+"), - newLexEntry("t2", "(ab)?(cd)+"), - newLexEntry("t3", " +"), }, }, src: "ba baaa a aaa abcd abcdcdcd cd cdcdcd", tokens: []*Token{ - newToken(1, "t1", newByteSequence([]byte("ba"))), - newToken(3, "t3", newByteSequence([]byte(" "))), - newToken(1, "t1", newByteSequence([]byte("baaa"))), - newToken(3, "t3", newByteSequence([]byte(" "))), - newToken(1, "t1", newByteSequence([]byte("a"))), - newToken(3, "t3", newByteSequence([]byte(" "))), - newToken(1, "t1", newByteSequence([]byte("aaa"))), - newToken(3, "t3", newByteSequence([]byte(" "))), - newToken(2, "t2", newByteSequence([]byte("abcd"))), - newToken(3, "t3", newByteSequence([]byte(" "))), - newToken(2, "t2", newByteSequence([]byte("abcdcdcd"))), - newToken(3, "t3", newByteSequence([]byte(" "))), - newToken(2, "t2", newByteSequence([]byte("cd"))), - newToken(3, "t3", newByteSequence([]byte(" "))), - newToken(2, "t2", newByteSequence([]byte("cdcdcd"))), - newEOFToken(), + newLexEntryDefaultNOP("t1", "b?a+"), + newLexEntryDefaultNOP("t2", "(ab)?(cd)+"), + newLexEntryDefaultNOP("t3", " +"), }, }, src: "ba baaa a aaa abcd abcdcdcd cd cdcdcd", tokens: []*Token{ + newTokenDefault(1, "t1", newByteSequence([]byte("ba"))), + newTokenDefault(3, "t3", newByteSequence([]byte(" "))), + newTokenDefault(1, "t1", newByteSequence([]byte("baaa"))), + newTokenDefault(3, "t3", newByteSequence([]byte(" "))), + newTokenDefault(1, "t1", newByteSequence([]byte("a"))), + newTokenDefault(3, "t3", newByteSequence([]byte(" "))), + newTokenDefault(1, "t1", newByteSequence([]byte("aaa"))), + newTokenDefault(3, "t3", newByteSequence([]byte(" "))), + newTokenDefault(2, "t2", newByteSequence([]byte("abcd"))), + newTokenDefault(3, "t3", newByteSequence([]byte(" "))), + newTokenDefault(2, "t2", newByteSequence([]byte("abcdcdcd"))), + newTokenDefault(3, "t3", newByteSequence([]byte(" "))), + newTokenDefault(2, "t2", newByteSequence([]byte("cd"))), + newTokenDefault(3, "t3", newByteSequence([]byte(" "))), + newTokenDefault(2, "t2",
newByteSequence([]byte("cdcdcd"))), + newEOFTokenDefault(), }, }, { lspec: &spec.LexSpec{ Entries: []*spec.LexEntry{ - newLexEntry("t1", "."), + newLexEntryDefaultNOP("t1", "."), }, }, src: string([]byte{ @@ -99,45 +124,45 @@ func TestLexer_Next(t *testing.T) { 0xf4, 0x8f, 0xbf, 0xbf, }), tokens: []*Token{ - newToken(1, "t1", newByteSequence([]byte{0x00})), - newToken(1, "t1", newByteSequence([]byte{0x7f})), - newToken(1, "t1", newByteSequence([]byte{0xc2, 0x80})), - newToken(1, "t1", newByteSequence([]byte{0xdf, 0xbf})), - newToken(1, "t1", newByteSequence([]byte{0xe1, 0x80, 0x80})), - newToken(1, "t1", newByteSequence([]byte{0xec, 0xbf, 0xbf})), - newToken(1, "t1", newByteSequence([]byte{0xed, 0x80, 0x80})), - newToken(1, "t1", newByteSequence([]byte{0xed, 0x9f, 0xbf})), - newToken(1, "t1", newByteSequence([]byte{0xee, 0x80, 0x80})), - newToken(1, "t1", newByteSequence([]byte{0xef, 0xbf, 0xbf})), - newToken(1, "t1", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})), - newToken(1, "t1", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbf})), - newToken(1, "t1", newByteSequence([]byte{0xf1, 0x80, 0x80, 0x80})), - newToken(1, "t1", newByteSequence([]byte{0xf3, 0xbf, 0xbf, 0xbf})), - newToken(1, "t1", newByteSequence([]byte{0xf4, 0x80, 0x80, 0x80})), - newToken(1, "t1", newByteSequence([]byte{0xf4, 0x8f, 0xbf, 0xbf})), - newEOFToken(), + newTokenDefault(1, "t1", newByteSequence([]byte{0x00})), + newTokenDefault(1, "t1", newByteSequence([]byte{0x7f})), + newTokenDefault(1, "t1", newByteSequence([]byte{0xc2, 0x80})), + newTokenDefault(1, "t1", newByteSequence([]byte{0xdf, 0xbf})), + newTokenDefault(1, "t1", newByteSequence([]byte{0xe1, 0x80, 0x80})), + newTokenDefault(1, "t1", newByteSequence([]byte{0xec, 0xbf, 0xbf})), + newTokenDefault(1, "t1", newByteSequence([]byte{0xed, 0x80, 0x80})), + newTokenDefault(1, "t1", newByteSequence([]byte{0xed, 0x9f, 0xbf})), + newTokenDefault(1, "t1", newByteSequence([]byte{0xee, 0x80, 0x80})), + newTokenDefault(1, "t1", newByteSequence([]byte{0xef, 0xbf, 0xbf})), + newTokenDefault(1, "t1", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})), + newTokenDefault(1, "t1", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbf})), + newTokenDefault(1, "t1", newByteSequence([]byte{0xf1, 0x80, 0x80, 0x80})), + newTokenDefault(1, "t1", newByteSequence([]byte{0xf3, 0xbf, 0xbf, 0xbf})), + newTokenDefault(1, "t1", newByteSequence([]byte{0xf4, 0x80, 0x80, 0x80})), + newTokenDefault(1, "t1", newByteSequence([]byte{0xf4, 0x8f, 0xbf, 0xbf})), + newEOFTokenDefault(), }, }, { lspec: &spec.LexSpec{ Entries: []*spec.LexEntry{ - newLexEntry("t1", "[ab.*+?|()[\\]]"), + newLexEntryDefaultNOP("t1", "[ab.*+?|()[\\]]"), }, }, src: "ab.*+?|()[]", tokens: []*Token{ - newToken(1, "t1", newByteSequence([]byte("a"))), - newToken(1, "t1", newByteSequence([]byte("b"))), - newToken(1, "t1", newByteSequence([]byte("."))), - newToken(1, "t1", newByteSequence([]byte("*"))), - newToken(1, "t1", newByteSequence([]byte("+"))), - newToken(1, "t1", newByteSequence([]byte("?"))), - newToken(1, "t1", newByteSequence([]byte("|"))), - newToken(1, "t1", newByteSequence([]byte("("))), - newToken(1, "t1", newByteSequence([]byte(")"))), - newToken(1, "t1", newByteSequence([]byte("["))), - newToken(1, "t1", newByteSequence([]byte("]"))), - newEOFToken(), + newTokenDefault(1, "t1", newByteSequence([]byte("a"))), + newTokenDefault(1, "t1", newByteSequence([]byte("b"))), + newTokenDefault(1, "t1", newByteSequence([]byte("."))), + newTokenDefault(1, "t1", newByteSequence([]byte("*"))), + newTokenDefault(1, "t1", 
newByteSequence([]byte("+"))), + newTokenDefault(1, "t1", newByteSequence([]byte("?"))), + newTokenDefault(1, "t1", newByteSequence([]byte("|"))), + newTokenDefault(1, "t1", newByteSequence([]byte("("))), + newTokenDefault(1, "t1", newByteSequence([]byte(")"))), + newTokenDefault(1, "t1", newByteSequence([]byte("["))), + newTokenDefault(1, "t1", newByteSequence([]byte("]"))), + newEOFTokenDefault(), }, }, { @@ -149,7 +174,7 @@ func TestLexer_Next(t *testing.T) { // maleeni cannot handle the null character in patterns because compiler.lexer, // specifically read() and restore(), recognizes the null characters as that a symbol doesn't exist. // If a pattern needs a null character, use code point expression \u{0000}. - newLexEntry("1ByteChar", "[\x01-\x7f]"), + newLexEntryDefaultNOP("1ByteChar", "[\x01-\x7f]"), }, }, src: string([]byte{ @@ -159,18 +184,18 @@ func TestLexer_Next(t *testing.T) { 0x7f, }), tokens: []*Token{ - newToken(1, "1ByteChar", newByteSequence([]byte{0x01})), - newToken(1, "1ByteChar", newByteSequence([]byte{0x02})), - newToken(1, "1ByteChar", newByteSequence([]byte{0x7e})), - newToken(1, "1ByteChar", newByteSequence([]byte{0x7f})), - newEOFToken(), + newTokenDefault(1, "1ByteChar", newByteSequence([]byte{0x01})), + newTokenDefault(1, "1ByteChar", newByteSequence([]byte{0x02})), + newTokenDefault(1, "1ByteChar", newByteSequence([]byte{0x7e})), + newTokenDefault(1, "1ByteChar", newByteSequence([]byte{0x7f})), + newEOFTokenDefault(), }, }, { lspec: &spec.LexSpec{ Entries: []*spec.LexEntry{ // all 2 byte characters - newLexEntry("2ByteChar", "[\xc2\x80-\xdf\xbf]"), + newLexEntryDefaultNOP("2ByteChar", "[\xc2\x80-\xdf\xbf]"), }, }, src: string([]byte{ @@ -180,33 +205,33 @@ func TestLexer_Next(t *testing.T) { 0xdf, 0xbf, }), tokens: []*Token{ - newToken(1, "2ByteChar", newByteSequence([]byte{0xc2, 0x80})), - newToken(1, "2ByteChar", newByteSequence([]byte{0xc2, 0x81})), - newToken(1, "2ByteChar", newByteSequence([]byte{0xdf, 0xbe})), - newToken(1, "2ByteChar", newByteSequence([]byte{0xdf, 0xbf})), - newEOFToken(), + newTokenDefault(1, "2ByteChar", newByteSequence([]byte{0xc2, 0x80})), + newTokenDefault(1, "2ByteChar", newByteSequence([]byte{0xc2, 0x81})), + newTokenDefault(1, "2ByteChar", newByteSequence([]byte{0xdf, 0xbe})), + newTokenDefault(1, "2ByteChar", newByteSequence([]byte{0xdf, 0xbf})), + newEOFTokenDefault(), }, }, { lspec: &spec.LexSpec{ Entries: []*spec.LexEntry{ // All bytes are the same. - newLexEntry("3ByteChar", "[\xe0\xa0\x80-\xe0\xa0\x80]"), + newLexEntryDefaultNOP("3ByteChar", "[\xe0\xa0\x80-\xe0\xa0\x80]"), }, }, src: string([]byte{ 0xe0, 0xa0, 0x80, }), tokens: []*Token{ - newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x80})), - newEOFToken(), + newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x80})), + newEOFTokenDefault(), }, }, { lspec: &spec.LexSpec{ Entries: []*spec.LexEntry{ // The first two bytes are the same. 
- newLexEntry("3ByteChar", "[\xe0\xa0\x80-\xe0\xa0\xbf]"), + newLexEntryDefaultNOP("3ByteChar", "[\xe0\xa0\x80-\xe0\xa0\xbf]"), }, }, src: string([]byte{ @@ -216,18 +241,18 @@ func TestLexer_Next(t *testing.T) { 0xe0, 0xa0, 0xbf, }), tokens: []*Token{ - newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x80})), - newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x81})), - newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0xbe})), - newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0xbf})), - newEOFToken(), + newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x80})), + newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x81})), + newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0xbe})), + newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0xbf})), + newEOFTokenDefault(), }, }, { lspec: &spec.LexSpec{ Entries: []*spec.LexEntry{ // The first byte are the same. - newLexEntry("3ByteChar", "[\xe0\xa0\x80-\xe0\xbf\xbf]"), + newLexEntryDefaultNOP("3ByteChar", "[\xe0\xa0\x80-\xe0\xbf\xbf]"), }, }, src: string([]byte{ @@ -237,18 +262,18 @@ func TestLexer_Next(t *testing.T) { 0xe0, 0xbf, 0xbf, }), tokens: []*Token{ - newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x80})), - newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x81})), - newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xbf, 0xbe})), - newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xbf, 0xbf})), - newEOFToken(), + newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x80})), + newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x81})), + newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xbf, 0xbe})), + newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xbf, 0xbf})), + newEOFTokenDefault(), }, }, { lspec: &spec.LexSpec{ Entries: []*spec.LexEntry{ // all 3 byte characters - newLexEntry("3ByteChar", "[\xe0\xa0\x80-\xef\xbf\xbf]"), + newLexEntryDefaultNOP("3ByteChar", "[\xe0\xa0\x80-\xef\xbf\xbf]"), }, }, src: string([]byte{ @@ -270,45 +295,45 @@ func TestLexer_Next(t *testing.T) { 0xef, 0xbf, 0xbf, }), tokens: []*Token{ - newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x80})), - newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x81})), - newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xbf, 0xbe})), - newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xbf, 0xbf})), - newToken(1, "3ByteChar", newByteSequence([]byte{0xe1, 0x80, 0x80})), - newToken(1, "3ByteChar", newByteSequence([]byte{0xe1, 0x80, 0x81})), - newToken(1, "3ByteChar", newByteSequence([]byte{0xec, 0xbf, 0xbe})), - newToken(1, "3ByteChar", newByteSequence([]byte{0xec, 0xbf, 0xbf})), - newToken(1, "3ByteChar", newByteSequence([]byte{0xed, 0x80, 0x80})), - newToken(1, "3ByteChar", newByteSequence([]byte{0xed, 0x80, 0x81})), - newToken(1, "3ByteChar", newByteSequence([]byte{0xed, 0x9f, 0xbe})), - newToken(1, "3ByteChar", newByteSequence([]byte{0xed, 0x9f, 0xbf})), - newToken(1, "3ByteChar", newByteSequence([]byte{0xee, 0x80, 0x80})), - newToken(1, "3ByteChar", newByteSequence([]byte{0xee, 0x80, 0x81})), - newToken(1, "3ByteChar", newByteSequence([]byte{0xef, 0xbf, 0xbe})), - newToken(1, "3ByteChar", newByteSequence([]byte{0xef, 0xbf, 0xbf})), - newEOFToken(), + newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x80})), + newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x81})), + newTokenDefault(1, "3ByteChar", 
newByteSequence([]byte{0xe0, 0xbf, 0xbe})), + newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xbf, 0xbf})), + newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe1, 0x80, 0x80})), + newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe1, 0x80, 0x81})), + newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xec, 0xbf, 0xbe})), + newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xec, 0xbf, 0xbf})), + newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xed, 0x80, 0x80})), + newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xed, 0x80, 0x81})), + newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xed, 0x9f, 0xbe})), + newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xed, 0x9f, 0xbf})), + newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xee, 0x80, 0x80})), + newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xee, 0x80, 0x81})), + newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xef, 0xbf, 0xbe})), + newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xef, 0xbf, 0xbf})), + newEOFTokenDefault(), }, }, { lspec: &spec.LexSpec{ Entries: []*spec.LexEntry{ // All bytes are the same. - newLexEntry("4ByteChar", "[\xf0\x90\x80\x80-\xf0\x90\x80\x80]"), + newLexEntryDefaultNOP("4ByteChar", "[\xf0\x90\x80\x80-\xf0\x90\x80\x80]"), }, }, src: string([]byte{ 0xf0, 0x90, 0x80, 0x80, }), tokens: []*Token{ - newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})), - newEOFToken(), + newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})), + newEOFTokenDefault(), }, }, { lspec: &spec.LexSpec{ Entries: []*spec.LexEntry{ // The first 3 bytes are the same. - newLexEntry("4ByteChar", "[\xf0\x90\x80\x80-\xf0\x90\x80\xbf]"), + newLexEntryDefaultNOP("4ByteChar", "[\xf0\x90\x80\x80-\xf0\x90\x80\xbf]"), }, }, src: string([]byte{ @@ -318,18 +343,18 @@ func TestLexer_Next(t *testing.T) { 0xf0, 0x90, 0x80, 0xbf, }), tokens: []*Token{ - newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})), - newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x81})), - newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0xbe})), - newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0xbf})), - newEOFToken(), + newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})), + newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x81})), + newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0xbe})), + newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0xbf})), + newEOFTokenDefault(), }, }, { lspec: &spec.LexSpec{ Entries: []*spec.LexEntry{ // The first 2 bytes are the same. 
- newLexEntry("4ByteChar", "[\xf0\x90\x80\x80-\xf0\x90\xbf\xbf]"), + newLexEntryDefaultNOP("4ByteChar", "[\xf0\x90\x80\x80-\xf0\x90\xbf\xbf]"), }, }, src: string([]byte{ @@ -339,18 +364,18 @@ func TestLexer_Next(t *testing.T) { 0xf0, 0x90, 0xbf, 0xbf, }), tokens: []*Token{ - newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})), - newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x81})), - newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0xbf, 0xbe})), - newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0xbf, 0xbf})), - newEOFToken(), + newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})), + newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x81})), + newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0xbf, 0xbe})), + newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0xbf, 0xbf})), + newEOFTokenDefault(), }, }, { lspec: &spec.LexSpec{ Entries: []*spec.LexEntry{ // The first byte are the same. - newLexEntry("4ByteChar", "[\xf0\x90\x80\x80-\xf0\xbf\xbf\xbf]"), + newLexEntryDefaultNOP("4ByteChar", "[\xf0\x90\x80\x80-\xf0\xbf\xbf\xbf]"), }, }, src: string([]byte{ @@ -360,18 +385,18 @@ func TestLexer_Next(t *testing.T) { 0xf0, 0xbf, 0xbf, 0xbf, }), tokens: []*Token{ - newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})), - newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x81})), - newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbe})), - newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbf})), - newEOFToken(), + newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})), + newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x81})), + newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbe})), + newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbf})), + newEOFTokenDefault(), }, }, { lspec: &spec.LexSpec{ Entries: []*spec.LexEntry{ // all 4 byte characters - newLexEntry("4ByteChar", "[\xf0\x90\x80\x80-\xf4\x8f\xbf\xbf]"), + newLexEntryDefaultNOP("4ByteChar", "[\xf0\x90\x80\x80-\xf4\x8f\xbf\xbf]"), }, }, src: string([]byte{ @@ -389,64 +414,114 @@ func TestLexer_Next(t *testing.T) { 0xf4, 0x8f, 0xbf, 0xbf, }), tokens: []*Token{ - newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})), - newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x81})), - newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbe})), - newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbf})), - newToken(1, "4ByteChar", newByteSequence([]byte{0xf1, 0x80, 0x80, 0x80})), - newToken(1, "4ByteChar", newByteSequence([]byte{0xf1, 0x80, 0x80, 0x81})), - newToken(1, "4ByteChar", newByteSequence([]byte{0xf3, 0xbf, 0xbf, 0xbe})), - newToken(1, "4ByteChar", newByteSequence([]byte{0xf3, 0xbf, 0xbf, 0xbf})), - newToken(1, "4ByteChar", newByteSequence([]byte{0xf4, 0x80, 0x80, 0x80})), - newToken(1, "4ByteChar", newByteSequence([]byte{0xf4, 0x80, 0x80, 0x81})), - newToken(1, "4ByteChar", newByteSequence([]byte{0xf4, 0x8f, 0xbf, 0xbe})), - newToken(1, "4ByteChar", newByteSequence([]byte{0xf4, 0x8f, 0xbf, 0xbf})), - newEOFToken(), + newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})), + newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x81})), + newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 
0xbe})), + newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbf})), + newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf1, 0x80, 0x80, 0x80})), + newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf1, 0x80, 0x80, 0x81})), + newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf3, 0xbf, 0xbf, 0xbe})), + newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf3, 0xbf, 0xbf, 0xbf})), + newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf4, 0x80, 0x80, 0x80})), + newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf4, 0x80, 0x80, 0x81})), + newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf4, 0x8f, 0xbf, 0xbe})), + newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf4, 0x8f, 0xbf, 0xbf})), + newEOFTokenDefault(), }, }, { lspec: &spec.LexSpec{ Entries: []*spec.LexEntry{ - newLexEntry("NonNumber", "[^0-9]+[0-9]"), + newLexEntryDefaultNOP("NonNumber", "[^0-9]+[0-9]"), }, }, src: "foo9", tokens: []*Token{ - newToken(1, "NonNumber", newByteSequence([]byte("foo9"))), - newEOFToken(), + newTokenDefault(1, "NonNumber", newByteSequence([]byte("foo9"))), + newEOFTokenDefault(), }, }, { lspec: &spec.LexSpec{ Entries: []*spec.LexEntry{ - newLexEntry("1ByteChar", "\\u{006E}"), - newLexEntry("2ByteChar", "\\u{03BD}"), - newLexEntry("3ByteChar", "\\u{306B}"), - newLexEntry("4ByteChar", "\\u{01F638}"), + newLexEntryDefaultNOP("1ByteChar", "\\u{006E}"), + newLexEntryDefaultNOP("2ByteChar", "\\u{03BD}"), + newLexEntryDefaultNOP("3ByteChar", "\\u{306B}"), + newLexEntryDefaultNOP("4ByteChar", "\\u{01F638}"), }, }, src: "nνに😸", tokens: []*Token{ - newToken(1, "1ByteChar", newByteSequence([]byte{0x6E})), - newToken(2, "2ByteChar", newByteSequence([]byte{0xCE, 0xBD})), - newToken(3, "3ByteChar", newByteSequence([]byte{0xE3, 0x81, 0xAB})), - newToken(4, "4ByteChar", newByteSequence([]byte{0xF0, 0x9F, 0x98, 0xB8})), - newEOFToken(), + newTokenDefault(1, "1ByteChar", newByteSequence([]byte{0x6E})), + newTokenDefault(2, "2ByteChar", newByteSequence([]byte{0xCE, 0xBD})), + newTokenDefault(3, "3ByteChar", newByteSequence([]byte{0xE3, 0x81, 0xAB})), + newTokenDefault(4, "4ByteChar", newByteSequence([]byte{0xF0, 0x9F, 0x98, 0xB8})), + newEOFTokenDefault(), }, }, { lspec: &spec.LexSpec{ Entries: []*spec.LexEntry{ - newLexEntry("codePointsAlt", "[\\u{006E}\\u{03BD}\\u{306B}\\u{01F638}]"), + newLexEntryDefaultNOP("codePointsAlt", "[\\u{006E}\\u{03BD}\\u{306B}\\u{01F638}]"), }, }, src: "nνに😸", tokens: []*Token{ - newToken(1, "codePointsAlt", newByteSequence([]byte{0x6E})), - newToken(1, "codePointsAlt", newByteSequence([]byte{0xCE, 0xBD})), - newToken(1, "codePointsAlt", newByteSequence([]byte{0xE3, 0x81, 0xAB})), - newToken(1, "codePointsAlt", newByteSequence([]byte{0xF0, 0x9F, 0x98, 0xB8})), - newEOFToken(), + newTokenDefault(1, "codePointsAlt", newByteSequence([]byte{0x6E})), + newTokenDefault(1, "codePointsAlt", newByteSequence([]byte{0xCE, 0xBD})), + newTokenDefault(1, "codePointsAlt", newByteSequence([]byte{0xE3, 0x81, 0xAB})), + newTokenDefault(1, "codePointsAlt", newByteSequence([]byte{0xF0, 0x9F, 0x98, 0xB8})), + newEOFTokenDefault(), + }, + }, + { + lspec: &spec.LexSpec{ + Entries: []*spec.LexEntry{ + newLexEntryDefaultNOP("white_space", ` *`), + newLexEntry([]string{"default"}, "string_open", `"`, "string", false), + newLexEntry([]string{"string"}, "escape_sequence", `\\[n"\\]`, "", false), + newLexEntry([]string{"string"}, "char_sequence", `[^"\\]*`, "", false), + newLexEntry([]string{"string"}, "string_close", `"`, "", true), + }, + }, + src: `"" 
"Hello world.\n\"Hello world.\""`, + tokens: []*Token{ + newToken(1, "default", 2, "string_open", newByteSequence([]byte(`"`))), + newToken(2, "string", 3, "string_close", newByteSequence([]byte(`"`))), + newToken(1, "default", 1, "white_space", newByteSequence([]byte(` `))), + newToken(1, "default", 2, "string_open", newByteSequence([]byte(`"`))), + newToken(2, "string", 2, "char_sequence", newByteSequence([]byte(`Hello world.`))), + newToken(2, "string", 1, "escape_sequence", newByteSequence([]byte(`\n`))), + newToken(2, "string", 1, "escape_sequence", newByteSequence([]byte(`\"`))), + newToken(2, "string", 2, "char_sequence", newByteSequence([]byte(`Hello world.`))), + newToken(2, "string", 1, "escape_sequence", newByteSequence([]byte(`\"`))), + newToken(2, "string", 3, "string_close", newByteSequence([]byte(`"`))), + newEOFTokenDefault(), + }, + }, + { + lspec: &spec.LexSpec{ + Entries: []*spec.LexEntry{ + // `white_space` is enabled in multiple modes. + newLexEntry([]string{"default", "state_a", "state_b"}, "white_space", ` *`, "", false), + newLexEntry([]string{"default"}, "char_a", `a`, "state_a", false), + newLexEntry([]string{"state_a"}, "char_b", `b`, "state_b", false), + newLexEntry([]string{"state_a"}, "back_from_a", `<`, "", true), + newLexEntry([]string{"state_b"}, "back_from_b", `<`, "", true), + }, + }, + src: ` a b < < `, + tokens: []*Token{ + newToken(1, "default", 1, "white_space", newByteSequence([]byte(` `))), + newToken(1, "default", 2, "char_a", newByteSequence([]byte(`a`))), + newToken(2, "state_a", 1, "white_space", newByteSequence([]byte(` `))), + newToken(2, "state_a", 2, "char_b", newByteSequence([]byte(`b`))), + newToken(3, "state_b", 1, "white_space", newByteSequence([]byte(` `))), + newToken(3, "state_b", 2, "back_from_b", newByteSequence([]byte(`<`))), + newToken(2, "state_a", 1, "white_space", newByteSequence([]byte(` `))), + newToken(2, "state_a", 3, "back_from_a", newByteSequence([]byte(`<`))), + newToken(1, "default", 1, "white_space", newByteSequence([]byte(` `))), + newEOFTokenDefault(), }, }, } @@ -479,8 +554,8 @@ func TestLexer_Next(t *testing.T) { func TestLexer_PeekN(t *testing.T) { clspec, err := compiler.Compile(&spec.LexSpec{ Entries: []*spec.LexEntry{ - newLexEntry("t1", "foo"), - newLexEntry("t2", "bar"), + newLexEntryDefaultNOP("t1", "foo"), + newLexEntryDefaultNOP("t2", "bar"), }, }) if err != nil { @@ -492,9 +567,9 @@ func TestLexer_PeekN(t *testing.T) { } expectedTokens := []*Token{ - newToken(1, "t1", []byte("foo")), - newToken(2, "t2", []byte("bar")), - newEOFToken(), + newTokenDefault(1, "t1", []byte("foo")), + newTokenDefault(2, "t2", []byte("bar")), + newEOFTokenDefault(), } tok, err := lex.Peek1() @@ -539,7 +614,13 @@ func TestLexer_PeekN(t *testing.T) { func testToken(t *testing.T, expected, actual *Token) { t.Helper() - if actual.ID != expected.ID || actual.Kind != expected.Kind || !bytes.Equal(actual.Match, expected.Match) || actual.EOF != expected.EOF || actual.Invalid != expected.Invalid { - t.Errorf("unexpected token; want: %v (\"%v\"), got: %v (\"%v\")", expected, string(expected.Match), actual, string(actual.Match)) + if actual.Mode != expected.Mode || + actual.ModeName != actual.ModeName || + actual.ID != expected.ID || + actual.Kind != expected.Kind || + !bytes.Equal(actual.Match, expected.Match) || + actual.EOF != expected.EOF || + actual.Invalid != expected.Invalid { + t.Fatalf(`unexpected token; want: %v ("%v"), got: %v ("%v")`, expected, string(expected.Match), actual, string(actual.Match)) } } diff --git a/spec/spec.go 
diff --git a/spec/spec.go b/spec/spec.go index 0f9b484..e2291e9 100644 --- a/spec/spec.go +++ b/spec/spec.go @@ -3,6 +3,7 @@ package spec import ( "fmt" "regexp" + "strconv" "strings" ) @@ -37,9 +38,61 @@ func (p LexPattern) validate() error { return nil } +const lexModePattern = "[A-Za-z_][0-9A-Za-z_]*" + +var lexModeRE = regexp.MustCompile(lexModePattern) + +type LexModeName string + +const ( + LexModeNameNil = LexModeName("") + LexModeNameDefault = LexModeName("default") +) + +func (m LexModeName) String() string { + return string(m) +} + +func (m LexModeName) validate() error { + if m.isNil() || !lexModeRE.Match([]byte(m)) { + return fmt.Errorf("mode must match %v", lexModePattern) + } + return nil +} + +func (m LexModeName) isNil() bool { + return m == LexModeNameNil +} + +type LexModeNum int + +const ( + LexModeNumNil = LexModeNum(0) + LexModeNumDefault = LexModeNum(1) +) + +func (n LexModeNum) String() string { + return strconv.Itoa(int(n)) +} + +func (n LexModeNum) Int() int { + return int(n) +} + +func (n LexModeNum) Succ() LexModeNum { + return n + 1 +} + +func (n LexModeNum) IsNil() bool { + return n == LexModeNumNil +} + type LexEntry struct { - Kind LexKind `json:"kind"` - Pattern LexPattern `json:"pattern"` + Kind LexKind `json:"kind"` + Pattern LexPattern `json:"pattern"` + Modes []LexModeName `json:"modes"` + Push LexModeName `json:"push"` + Pop bool `json:"pop"` } func (e *LexEntry) validate() error { @@ -51,6 +104,14 @@ func (e *LexEntry) validate() error { if err != nil { return err } + if len(e.Modes) > 0 { + for _, mode := range e.Modes { + err = mode.validate() + if err != nil { + return err + } + } + } return nil } @@ -97,7 +158,15 @@ type TransitionTable struct { Transition [][]int `json:"transition"` } -type CompiledLexSpec struct { +type CompiledLexModeSpec struct { Kinds []LexKind `json:"kinds"` + Push []LexModeNum `json:"push"` + Pop []int `json:"pop"` DFA *TransitionTable `json:"dfa"` } + +type CompiledLexSpec struct { + InitialMode LexModeNum `json:"initial_mode"` + Modes []LexModeName `json:"modes"` + Specs []*CompiledLexModeSpec `json:"specs"` +}
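In the compiled form, the mode actions are plain arrays parallel to `Kinds` and indexed by token ID, with index 0 reserved for the nil kind. A small illustration of that layout with hand-built tables; the values are invented for the example, and a real `CompiledLexModeSpec` would also carry its `DFA`:

```go
package main

import (
	"fmt"

	"github.com/nihei9/maleeni/spec"
)

func main() {
	// Hand-built tables for one mode with three kinds. Kinds, Push, and Pop
	// all have len(entries)+1 elements and are indexed by token ID.
	mode := &spec.CompiledLexModeSpec{
		Kinds: []spec.LexKind{spec.LexKindNil, "string_open", "char_sequence", "string_close"},
		Push:  []spec.LexModeNum{spec.LexModeNumNil, 2, spec.LexModeNumNil, spec.LexModeNumNil},
		Pop:   []int{0, 0, 0, 1},
	}
	for id := 1; id < len(mode.Kinds); id++ {
		fmt.Printf("%v: push=%v pop=%v\n", mode.Kinds[id], mode.Push[id], mode.Pop[id])
	}
	// string_open: push=2 pop=0
	// char_sequence: push=0 pop=0
	// string_close: push=0 pop=1
}
```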