Diffstat (limited to 'driver')
-rw-r--r--  driver/lexer.go       135
-rw-r--r--  driver/lexer_test.go  421
2 files changed, 350 insertions(+), 206 deletions(-)
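
This change threads lexical modes through the driver: Token gains Mode and ModeName fields, the lexer keeps a mode stack seeded with clspec.InitialMode, next() runs against the DFA of the current mode, and nextAndTranMode() pops and/or pushes a mode after each accepted token according to the compiled spec. As a usage sketch (not part of the diff itself), the program below mirrors the new string-mode case in driver/lexer_test.go; the import path github.com/nihei9/maleeni/driver and the main() wrapper are assumptions based on the repository layout, and the LexEntry literals stand in for the tests' newLexEntry helpers.

// Sketch only (not part of this diff): adapted from the new string-mode case
// in TestLexer_Next. The driver import path and the main() wrapper are
// assumptions based on the repository layout.
package main

import (
	"fmt"
	"strings"

	"github.com/nihei9/maleeni/compiler"
	"github.com/nihei9/maleeni/driver"
	"github.com/nihei9/maleeni/spec"
)

func main() {
	clspec, err := compiler.Compile(&spec.LexSpec{
		Entries: []*spec.LexEntry{
			// Active only in the default mode.
			{Kind: "white_space", Pattern: ` *`, Modes: []spec.LexModeName{"default"}},
			// A double quote in the default mode pushes the "string" mode.
			{Kind: "string_open", Pattern: `"`, Modes: []spec.LexModeName{"default"}, Push: "string"},
			// These entries apply only while "string" is on top of the mode stack.
			{Kind: "escape_sequence", Pattern: `\\[n"\\]`, Modes: []spec.LexModeName{"string"}},
			{Kind: "char_sequence", Pattern: `[^"\\]*`, Modes: []spec.LexModeName{"string"}},
			// A closing double quote pops back to the default mode.
			{Kind: "string_close", Pattern: `"`, Modes: []spec.LexModeName{"string"}, Pop: true},
		},
	})
	if err != nil {
		panic(err)
	}
	lex, err := driver.NewLexer(clspec, strings.NewReader(`"Hello world.\n"`))
	if err != nil {
		panic(err)
	}
	for {
		tok, err := lex.Next()
		if err != nil {
			panic(err)
		}
		if tok.EOF {
			break
		}
		// Every token now reports the mode it was recognized in.
		fmt.Printf("mode=%v(%v) kind=%v text=%q\n", tok.Mode, tok.ModeName, tok.Kind, tok.Text)
	}
}
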
diff --git a/driver/lexer.go b/driver/lexer.go index 8a2afd8..72a6f69 100644 --- a/driver/lexer.go +++ b/driver/lexer.go @@ -54,46 +54,54 @@ func (s byteSequence) merge(a byteSequence) byteSequence { } type Token struct { - ID int `json:"id"` - Kind string `json:"kind"` - Match byteSequence `json:"match"` - Text string `json:"text"` - EOF bool `json:"eof"` - Invalid bool `json:"invalid"` + Mode spec.LexModeNum `json:"mode"` + ModeName spec.LexModeName `json:"mode_name"` + ID int `json:"id"` + Kind string `json:"kind"` + Match byteSequence `json:"match"` + Text string `json:"text"` + EOF bool `json:"eof"` + Invalid bool `json:"invalid"` } -func newToken(id int, kind string, match byteSequence) *Token { +func newToken(mode spec.LexModeNum, modeName spec.LexModeName, id int, kind string, match byteSequence) *Token { return &Token{ - ID: id, - Kind: kind, - Match: match, - Text: string(match.ByteSlice()), + Mode: mode, + ModeName: modeName, + ID: id, + Kind: kind, + Match: match, + Text: string(match.ByteSlice()), } } -func newEOFToken() *Token { +func newEOFToken(mode spec.LexModeNum, modeName spec.LexModeName) *Token { return &Token{ - ID: 0, - EOF: true, + Mode: mode, + ModeName: modeName, + ID: 0, + EOF: true, } } -func newInvalidToken(match byteSequence) *Token { +func newInvalidToken(mode spec.LexModeNum, modeName spec.LexModeName, match byteSequence) *Token { return &Token{ - ID: 0, - Match: match, - Invalid: true, + Mode: mode, + ModeName: modeName, + ID: 0, + Match: match, + Invalid: true, } } func (t *Token) String() string { if t.Invalid { - return fmt.Sprintf("!{text: %v, byte: %v}", t.Text, t.Match) + return fmt.Sprintf("!{mode: %v, mode name: %v, text: %v, byte: %v}", t.Mode, t.ModeName, t.Text, t.Match) } if t.EOF { return "{eof}" } - return fmt.Sprintf("{id: %v, kind: %v, text: %v, byte: %v}", t.ID, t.Kind, t.Text, t.Match) + return fmt.Sprintf("{mode: %v, mode name: %v, id: %v, kind: %v, text: %v, byte: %v}", t.Mode, t.ModeName, t.ID, t.Kind, t.Text, t.Match) } type lexerOption func(l *lexer) error @@ -110,11 +118,12 @@ func EnableLogging(w io.Writer) lexerOption { } type lexer struct { - clspec *spec.CompiledLexSpec - src []byte - srcPtr int - tokBuf []*Token - logger log.Logger + clspec *spec.CompiledLexSpec + src []byte + srcPtr int + tokBuf []*Token + modeStack []spec.LexModeNum + logger log.Logger } func NewLexer(clspec *spec.CompiledLexSpec, src io.Reader, opts ...lexerOption) (*lexer, error) { @@ -126,6 +135,9 @@ func NewLexer(clspec *spec.CompiledLexSpec, src io.Reader, opts ...lexerOption) clspec: clspec, src: b, srcPtr: 0, + modeStack: []spec.LexModeNum{ + clspec.InitialMode, + }, logger: log.NewNopLogger(), } for _, opt := range opts { @@ -142,8 +154,9 @@ func NewLexer(clspec *spec.CompiledLexSpec, src io.Reader, opts ...lexerOption) func (l *lexer) Next() (*Token, error) { l.logger.Log(`lexer#Next(): State: + mode: #%v %v pointer: %v - token buffer: %v`, l.srcPtr, l.tokBuf) + token buffer: %v`, l.mode(), l.clspec.Modes[l.mode()], l.srcPtr, l.tokBuf) if len(l.tokBuf) > 0 { tok := l.tokBuf[0] @@ -154,7 +167,7 @@ func (l *lexer) Next() (*Token, error) { return tok, nil } - tok, err := l.next() + tok, err := l.nextAndTranMode() if err != nil { l.logger.Log(" Detectes an error: %v", err) return nil, err @@ -168,7 +181,7 @@ func (l *lexer) Next() (*Token, error) { } errTok := tok for { - tok, err = l.next() + tok, err = l.nextAndTranMode() if err != nil { l.logger.Log(" Detectes an error: %v", err) return nil, err @@ -205,7 +218,7 @@ func (l *lexer) peekN(n int) (*Token, 
error) { return nil, fmt.Errorf("peekN() can handle only [0..2]") } for len(l.tokBuf) < n+1 { - tok, err := l.next() + tok, err := l.nextAndTranMode() if err != nil { return nil, err } @@ -214,8 +227,41 @@ func (l *lexer) peekN(n int) (*Token, error) { return l.tokBuf[n], nil } +func (l *lexer) nextAndTranMode() (*Token, error) { + tok, err := l.next() + if err != nil { + return nil, err + } + if tok.EOF || tok.Invalid { + return tok, nil + } + spec := l.clspec.Specs[l.mode()] + if spec.Pop[tok.ID] == 1 { + err := l.popMode() + if err != nil { + return nil, err + } + } + mode := spec.Push[tok.ID] + if !mode.IsNil() { + l.pushMode(mode) + } + // The checking length of the mode stack must be at after pop and push operations + // because those operations can be performed at the same time. + // When the mode stack has just one element and popped it, the mode stack will be temporarily emptied. + // However, since a push operation may be performed immediately after it, + // the lexer allows the stack to be temporarily empty. + if len(l.modeStack) == 0 { + return nil, fmt.Errorf("a mode stack must have at least one element") + } + return tok, nil +} + func (l *lexer) next() (*Token, error) { - state := l.clspec.DFA.InitialState + mode := l.mode() + modeName := l.clspec.Modes[mode] + spec := l.clspec.Specs[mode] + state := spec.DFA.InitialState buf := []byte{} unfixedBufLen := 0 var tok *Token @@ -229,13 +275,13 @@ func (l *lexer) next() (*Token, error) { // When `buf` has unaccepted data and reads the EOF, // the lexer treats the buffered data as an invalid token. if len(buf) > 0 { - return newInvalidToken(newByteSequence(buf)), nil + return newInvalidToken(mode, modeName, newByteSequence(buf)), nil } - return newEOFToken(), nil + return newEOFToken(mode, modeName), nil } buf = append(buf, v) unfixedBufLen++ - entry := l.clspec.DFA.Transition[state] + entry := spec.DFA.Transition[state] if len(entry) == 0 { return nil, fmt.Errorf("no transition entry; state: %v", state) } @@ -245,17 +291,34 @@ func (l *lexer) next() (*Token, error) { l.unread(unfixedBufLen) return tok, nil } - return newInvalidToken(newByteSequence(buf)), nil + return newInvalidToken(mode, modeName, newByteSequence(buf)), nil } state = nextState - id, ok := l.clspec.DFA.AcceptingStates[state] + id, ok := spec.DFA.AcceptingStates[state] if ok { - tok = newToken(id, l.clspec.Kinds[id].String(), newByteSequence(buf)) + tok = newToken(mode, modeName, id, spec.Kinds[id].String(), newByteSequence(buf)) unfixedBufLen = 0 } } } +func (l *lexer) mode() spec.LexModeNum { + return l.modeStack[len(l.modeStack)-1] +} + +func (l *lexer) pushMode(mode spec.LexModeNum) { + l.modeStack = append(l.modeStack, mode) +} + +func (l *lexer) popMode() error { + sLen := len(l.modeStack) + if sLen == 0 { + return fmt.Errorf("cannot pop a lex mode from a lex mode stack any more") + } + l.modeStack = l.modeStack[:sLen-1] + return nil +} + func (l *lexer) read() (byte, bool) { if l.srcPtr >= len(l.src) { return 0, true diff --git a/driver/lexer_test.go b/driver/lexer_test.go index 26b5d49..d3edb3c 100644 --- a/driver/lexer_test.go +++ b/driver/lexer_test.go @@ -10,13 +10,38 @@ import ( "github.com/nihei9/maleeni/spec" ) -func newLexEntry(kind string, pattern string) *spec.LexEntry { +func newLexEntry(modes []string, kind string, pattern string, push string, pop bool) *spec.LexEntry { + ms := []spec.LexModeName{} + for _, m := range modes { + ms = append(ms, spec.LexModeName(m)) + } + return &spec.LexEntry{ + Kind: spec.LexKind(kind), + Pattern: 
spec.LexPattern(pattern), + Modes: ms, + Push: spec.LexModeName(push), + Pop: pop, + } +} + +func newLexEntryDefaultNOP(kind string, pattern string) *spec.LexEntry { return &spec.LexEntry{ Kind: spec.LexKind(kind), Pattern: spec.LexPattern(pattern), + Modes: []spec.LexModeName{ + spec.LexModeNameDefault, + }, } } +func newTokenDefault(id int, kind string, match byteSequence) *Token { + return newToken(spec.LexModeNumDefault, spec.LexModeNameDefault, id, kind, match) +} + +func newEOFTokenDefault() *Token { + return newEOFToken(spec.LexModeNumDefault, spec.LexModeNameDefault) +} + func TestLexer_Next(t *testing.T) { test := []struct { lspec *spec.LexSpec @@ -26,58 +51,58 @@ func TestLexer_Next(t *testing.T) { { lspec: &spec.LexSpec{ Entries: []*spec.LexEntry{ - newLexEntry("t1", "(a|b)*abb"), - newLexEntry("t2", " +"), + newLexEntryDefaultNOP("t1", "(a|b)*abb"), + newLexEntryDefaultNOP("t2", " +"), }, }, src: "abb aabb aaabb babb bbabb abbbabb", tokens: []*Token{ - newToken(1, "t1", newByteSequence([]byte("abb"))), - newToken(2, "t2", newByteSequence([]byte(" "))), - newToken(1, "t1", newByteSequence([]byte("aabb"))), - newToken(2, "t2", newByteSequence([]byte(" "))), - newToken(1, "t1", newByteSequence([]byte("aaabb"))), - newToken(2, "t2", newByteSequence([]byte(" "))), - newToken(1, "t1", newByteSequence([]byte("babb"))), - newToken(2, "t2", newByteSequence([]byte(" "))), - newToken(1, "t1", newByteSequence([]byte("bbabb"))), - newToken(2, "t2", newByteSequence([]byte(" "))), - newToken(1, "t1", newByteSequence([]byte("abbbabb"))), - newEOFToken(), + newTokenDefault(1, "t1", newByteSequence([]byte("abb"))), + newTokenDefault(2, "t2", newByteSequence([]byte(" "))), + newTokenDefault(1, "t1", newByteSequence([]byte("aabb"))), + newTokenDefault(2, "t2", newByteSequence([]byte(" "))), + newTokenDefault(1, "t1", newByteSequence([]byte("aaabb"))), + newTokenDefault(2, "t2", newByteSequence([]byte(" "))), + newTokenDefault(1, "t1", newByteSequence([]byte("babb"))), + newTokenDefault(2, "t2", newByteSequence([]byte(" "))), + newTokenDefault(1, "t1", newByteSequence([]byte("bbabb"))), + newTokenDefault(2, "t2", newByteSequence([]byte(" "))), + newTokenDefault(1, "t1", newByteSequence([]byte("abbbabb"))), + newEOFTokenDefault(), }, }, { lspec: &spec.LexSpec{ Entries: []*spec.LexEntry{ - newLexEntry("t1", "b?a+"), - newLexEntry("t2", "(ab)?(cd)+"), - newLexEntry("t3", " +"), + newLexEntryDefaultNOP("t1", "b?a+"), + newLexEntryDefaultNOP("t2", "(ab)?(cd)+"), + newLexEntryDefaultNOP("t3", " +"), }, }, src: "ba baaa a aaa abcd abcdcdcd cd cdcdcd", tokens: []*Token{ - newToken(1, "t1", newByteSequence([]byte("ba"))), - newToken(3, "t3", newByteSequence([]byte(" "))), - newToken(1, "t1", newByteSequence([]byte("baaa"))), - newToken(3, "t3", newByteSequence([]byte(" "))), - newToken(1, "t1", newByteSequence([]byte("a"))), - newToken(3, "t3", newByteSequence([]byte(" "))), - newToken(1, "t1", newByteSequence([]byte("aaa"))), - newToken(3, "t3", newByteSequence([]byte(" "))), - newToken(2, "t2", newByteSequence([]byte("abcd"))), - newToken(3, "t3", newByteSequence([]byte(" "))), - newToken(2, "t2", newByteSequence([]byte("abcdcdcd"))), - newToken(3, "t3", newByteSequence([]byte(" "))), - newToken(2, "t2", newByteSequence([]byte("cd"))), - newToken(3, "t3", newByteSequence([]byte(" "))), - newToken(2, "t2", newByteSequence([]byte("cdcdcd"))), - newEOFToken(), + newTokenDefault(1, "t1", newByteSequence([]byte("ba"))), + newTokenDefault(3, "t3", newByteSequence([]byte(" "))), + newTokenDefault(1, "t1", 
newByteSequence([]byte("baaa"))), + newTokenDefault(3, "t3", newByteSequence([]byte(" "))), + newTokenDefault(1, "t1", newByteSequence([]byte("a"))), + newTokenDefault(3, "t3", newByteSequence([]byte(" "))), + newTokenDefault(1, "t1", newByteSequence([]byte("aaa"))), + newTokenDefault(3, "t3", newByteSequence([]byte(" "))), + newTokenDefault(2, "t2", newByteSequence([]byte("abcd"))), + newTokenDefault(3, "t3", newByteSequence([]byte(" "))), + newTokenDefault(2, "t2", newByteSequence([]byte("abcdcdcd"))), + newTokenDefault(3, "t3", newByteSequence([]byte(" "))), + newTokenDefault(2, "t2", newByteSequence([]byte("cd"))), + newTokenDefault(3, "t3", newByteSequence([]byte(" "))), + newTokenDefault(2, "t2", newByteSequence([]byte("cdcdcd"))), + newEOFTokenDefault(), }, }, { lspec: &spec.LexSpec{ Entries: []*spec.LexEntry{ - newLexEntry("t1", "."), + newLexEntryDefaultNOP("t1", "."), }, }, src: string([]byte{ @@ -99,45 +124,45 @@ func TestLexer_Next(t *testing.T) { 0xf4, 0x8f, 0xbf, 0xbf, }), tokens: []*Token{ - newToken(1, "t1", newByteSequence([]byte{0x00})), - newToken(1, "t1", newByteSequence([]byte{0x7f})), - newToken(1, "t1", newByteSequence([]byte{0xc2, 0x80})), - newToken(1, "t1", newByteSequence([]byte{0xdf, 0xbf})), - newToken(1, "t1", newByteSequence([]byte{0xe1, 0x80, 0x80})), - newToken(1, "t1", newByteSequence([]byte{0xec, 0xbf, 0xbf})), - newToken(1, "t1", newByteSequence([]byte{0xed, 0x80, 0x80})), - newToken(1, "t1", newByteSequence([]byte{0xed, 0x9f, 0xbf})), - newToken(1, "t1", newByteSequence([]byte{0xee, 0x80, 0x80})), - newToken(1, "t1", newByteSequence([]byte{0xef, 0xbf, 0xbf})), - newToken(1, "t1", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})), - newToken(1, "t1", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbf})), - newToken(1, "t1", newByteSequence([]byte{0xf1, 0x80, 0x80, 0x80})), - newToken(1, "t1", newByteSequence([]byte{0xf3, 0xbf, 0xbf, 0xbf})), - newToken(1, "t1", newByteSequence([]byte{0xf4, 0x80, 0x80, 0x80})), - newToken(1, "t1", newByteSequence([]byte{0xf4, 0x8f, 0xbf, 0xbf})), - newEOFToken(), + newTokenDefault(1, "t1", newByteSequence([]byte{0x00})), + newTokenDefault(1, "t1", newByteSequence([]byte{0x7f})), + newTokenDefault(1, "t1", newByteSequence([]byte{0xc2, 0x80})), + newTokenDefault(1, "t1", newByteSequence([]byte{0xdf, 0xbf})), + newTokenDefault(1, "t1", newByteSequence([]byte{0xe1, 0x80, 0x80})), + newTokenDefault(1, "t1", newByteSequence([]byte{0xec, 0xbf, 0xbf})), + newTokenDefault(1, "t1", newByteSequence([]byte{0xed, 0x80, 0x80})), + newTokenDefault(1, "t1", newByteSequence([]byte{0xed, 0x9f, 0xbf})), + newTokenDefault(1, "t1", newByteSequence([]byte{0xee, 0x80, 0x80})), + newTokenDefault(1, "t1", newByteSequence([]byte{0xef, 0xbf, 0xbf})), + newTokenDefault(1, "t1", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})), + newTokenDefault(1, "t1", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbf})), + newTokenDefault(1, "t1", newByteSequence([]byte{0xf1, 0x80, 0x80, 0x80})), + newTokenDefault(1, "t1", newByteSequence([]byte{0xf3, 0xbf, 0xbf, 0xbf})), + newTokenDefault(1, "t1", newByteSequence([]byte{0xf4, 0x80, 0x80, 0x80})), + newTokenDefault(1, "t1", newByteSequence([]byte{0xf4, 0x8f, 0xbf, 0xbf})), + newEOFTokenDefault(), }, }, { lspec: &spec.LexSpec{ Entries: []*spec.LexEntry{ - newLexEntry("t1", "[ab.*+?|()[\\]]"), + newLexEntryDefaultNOP("t1", "[ab.*+?|()[\\]]"), }, }, src: "ab.*+?|()[]", tokens: []*Token{ - newToken(1, "t1", newByteSequence([]byte("a"))), - newToken(1, "t1", newByteSequence([]byte("b"))), - newToken(1, "t1", 
newByteSequence([]byte("."))), - newToken(1, "t1", newByteSequence([]byte("*"))), - newToken(1, "t1", newByteSequence([]byte("+"))), - newToken(1, "t1", newByteSequence([]byte("?"))), - newToken(1, "t1", newByteSequence([]byte("|"))), - newToken(1, "t1", newByteSequence([]byte("("))), - newToken(1, "t1", newByteSequence([]byte(")"))), - newToken(1, "t1", newByteSequence([]byte("["))), - newToken(1, "t1", newByteSequence([]byte("]"))), - newEOFToken(), + newTokenDefault(1, "t1", newByteSequence([]byte("a"))), + newTokenDefault(1, "t1", newByteSequence([]byte("b"))), + newTokenDefault(1, "t1", newByteSequence([]byte("."))), + newTokenDefault(1, "t1", newByteSequence([]byte("*"))), + newTokenDefault(1, "t1", newByteSequence([]byte("+"))), + newTokenDefault(1, "t1", newByteSequence([]byte("?"))), + newTokenDefault(1, "t1", newByteSequence([]byte("|"))), + newTokenDefault(1, "t1", newByteSequence([]byte("("))), + newTokenDefault(1, "t1", newByteSequence([]byte(")"))), + newTokenDefault(1, "t1", newByteSequence([]byte("["))), + newTokenDefault(1, "t1", newByteSequence([]byte("]"))), + newEOFTokenDefault(), }, }, { @@ -149,7 +174,7 @@ func TestLexer_Next(t *testing.T) { // maleeni cannot handle the null character in patterns because compiler.lexer, // specifically read() and restore(), recognizes the null characters as that a symbol doesn't exist. // If a pattern needs a null character, use code point expression \u{0000}. - newLexEntry("1ByteChar", "[\x01-\x7f]"), + newLexEntryDefaultNOP("1ByteChar", "[\x01-\x7f]"), }, }, src: string([]byte{ @@ -159,18 +184,18 @@ func TestLexer_Next(t *testing.T) { 0x7f, }), tokens: []*Token{ - newToken(1, "1ByteChar", newByteSequence([]byte{0x01})), - newToken(1, "1ByteChar", newByteSequence([]byte{0x02})), - newToken(1, "1ByteChar", newByteSequence([]byte{0x7e})), - newToken(1, "1ByteChar", newByteSequence([]byte{0x7f})), - newEOFToken(), + newTokenDefault(1, "1ByteChar", newByteSequence([]byte{0x01})), + newTokenDefault(1, "1ByteChar", newByteSequence([]byte{0x02})), + newTokenDefault(1, "1ByteChar", newByteSequence([]byte{0x7e})), + newTokenDefault(1, "1ByteChar", newByteSequence([]byte{0x7f})), + newEOFTokenDefault(), }, }, { lspec: &spec.LexSpec{ Entries: []*spec.LexEntry{ // all 2 byte characters - newLexEntry("2ByteChar", "[\xc2\x80-\xdf\xbf]"), + newLexEntryDefaultNOP("2ByteChar", "[\xc2\x80-\xdf\xbf]"), }, }, src: string([]byte{ @@ -180,33 +205,33 @@ func TestLexer_Next(t *testing.T) { 0xdf, 0xbf, }), tokens: []*Token{ - newToken(1, "2ByteChar", newByteSequence([]byte{0xc2, 0x80})), - newToken(1, "2ByteChar", newByteSequence([]byte{0xc2, 0x81})), - newToken(1, "2ByteChar", newByteSequence([]byte{0xdf, 0xbe})), - newToken(1, "2ByteChar", newByteSequence([]byte{0xdf, 0xbf})), - newEOFToken(), + newTokenDefault(1, "2ByteChar", newByteSequence([]byte{0xc2, 0x80})), + newTokenDefault(1, "2ByteChar", newByteSequence([]byte{0xc2, 0x81})), + newTokenDefault(1, "2ByteChar", newByteSequence([]byte{0xdf, 0xbe})), + newTokenDefault(1, "2ByteChar", newByteSequence([]byte{0xdf, 0xbf})), + newEOFTokenDefault(), }, }, { lspec: &spec.LexSpec{ Entries: []*spec.LexEntry{ // All bytes are the same. 
- newLexEntry("3ByteChar", "[\xe0\xa0\x80-\xe0\xa0\x80]"), + newLexEntryDefaultNOP("3ByteChar", "[\xe0\xa0\x80-\xe0\xa0\x80]"), }, }, src: string([]byte{ 0xe0, 0xa0, 0x80, }), tokens: []*Token{ - newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x80})), - newEOFToken(), + newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x80})), + newEOFTokenDefault(), }, }, { lspec: &spec.LexSpec{ Entries: []*spec.LexEntry{ // The first two bytes are the same. - newLexEntry("3ByteChar", "[\xe0\xa0\x80-\xe0\xa0\xbf]"), + newLexEntryDefaultNOP("3ByteChar", "[\xe0\xa0\x80-\xe0\xa0\xbf]"), }, }, src: string([]byte{ @@ -216,18 +241,18 @@ func TestLexer_Next(t *testing.T) { 0xe0, 0xa0, 0xbf, }), tokens: []*Token{ - newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x80})), - newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x81})), - newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0xbe})), - newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0xbf})), - newEOFToken(), + newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x80})), + newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x81})), + newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0xbe})), + newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0xbf})), + newEOFTokenDefault(), }, }, { lspec: &spec.LexSpec{ Entries: []*spec.LexEntry{ // The first byte are the same. - newLexEntry("3ByteChar", "[\xe0\xa0\x80-\xe0\xbf\xbf]"), + newLexEntryDefaultNOP("3ByteChar", "[\xe0\xa0\x80-\xe0\xbf\xbf]"), }, }, src: string([]byte{ @@ -237,18 +262,18 @@ func TestLexer_Next(t *testing.T) { 0xe0, 0xbf, 0xbf, }), tokens: []*Token{ - newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x80})), - newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x81})), - newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xbf, 0xbe})), - newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xbf, 0xbf})), - newEOFToken(), + newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x80})), + newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x81})), + newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xbf, 0xbe})), + newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xbf, 0xbf})), + newEOFTokenDefault(), }, }, { lspec: &spec.LexSpec{ Entries: []*spec.LexEntry{ // all 3 byte characters - newLexEntry("3ByteChar", "[\xe0\xa0\x80-\xef\xbf\xbf]"), + newLexEntryDefaultNOP("3ByteChar", "[\xe0\xa0\x80-\xef\xbf\xbf]"), }, }, src: string([]byte{ @@ -270,45 +295,45 @@ func TestLexer_Next(t *testing.T) { 0xef, 0xbf, 0xbf, }), tokens: []*Token{ - newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x80})), - newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x81})), - newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xbf, 0xbe})), - newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xbf, 0xbf})), - newToken(1, "3ByteChar", newByteSequence([]byte{0xe1, 0x80, 0x80})), - newToken(1, "3ByteChar", newByteSequence([]byte{0xe1, 0x80, 0x81})), - newToken(1, "3ByteChar", newByteSequence([]byte{0xec, 0xbf, 0xbe})), - newToken(1, "3ByteChar", newByteSequence([]byte{0xec, 0xbf, 0xbf})), - newToken(1, "3ByteChar", newByteSequence([]byte{0xed, 0x80, 0x80})), - newToken(1, "3ByteChar", newByteSequence([]byte{0xed, 0x80, 0x81})), - newToken(1, "3ByteChar", newByteSequence([]byte{0xed, 0x9f, 0xbe})), - newToken(1, "3ByteChar", newByteSequence([]byte{0xed, 0x9f, 0xbf})), - newToken(1, 
"3ByteChar", newByteSequence([]byte{0xee, 0x80, 0x80})), - newToken(1, "3ByteChar", newByteSequence([]byte{0xee, 0x80, 0x81})), - newToken(1, "3ByteChar", newByteSequence([]byte{0xef, 0xbf, 0xbe})), - newToken(1, "3ByteChar", newByteSequence([]byte{0xef, 0xbf, 0xbf})), - newEOFToken(), + newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x80})), + newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x81})), + newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xbf, 0xbe})), + newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xbf, 0xbf})), + newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe1, 0x80, 0x80})), + newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe1, 0x80, 0x81})), + newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xec, 0xbf, 0xbe})), + newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xec, 0xbf, 0xbf})), + newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xed, 0x80, 0x80})), + newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xed, 0x80, 0x81})), + newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xed, 0x9f, 0xbe})), + newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xed, 0x9f, 0xbf})), + newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xee, 0x80, 0x80})), + newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xee, 0x80, 0x81})), + newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xef, 0xbf, 0xbe})), + newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xef, 0xbf, 0xbf})), + newEOFTokenDefault(), }, }, { lspec: &spec.LexSpec{ Entries: []*spec.LexEntry{ // All bytes are the same. - newLexEntry("4ByteChar", "[\xf0\x90\x80\x80-\xf0\x90\x80\x80]"), + newLexEntryDefaultNOP("4ByteChar", "[\xf0\x90\x80\x80-\xf0\x90\x80\x80]"), }, }, src: string([]byte{ 0xf0, 0x90, 0x80, 0x80, }), tokens: []*Token{ - newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})), - newEOFToken(), + newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})), + newEOFTokenDefault(), }, }, { lspec: &spec.LexSpec{ Entries: []*spec.LexEntry{ // The first 3 bytes are the same. - newLexEntry("4ByteChar", "[\xf0\x90\x80\x80-\xf0\x90\x80\xbf]"), + newLexEntryDefaultNOP("4ByteChar", "[\xf0\x90\x80\x80-\xf0\x90\x80\xbf]"), }, }, src: string([]byte{ @@ -318,18 +343,18 @@ func TestLexer_Next(t *testing.T) { 0xf0, 0x90, 0x80, 0xbf, }), tokens: []*Token{ - newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})), - newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x81})), - newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0xbe})), - newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0xbf})), - newEOFToken(), + newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})), + newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x81})), + newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0xbe})), + newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0xbf})), + newEOFTokenDefault(), }, }, { lspec: &spec.LexSpec{ Entries: []*spec.LexEntry{ // The first 2 bytes are the same. 
- newLexEntry("4ByteChar", "[\xf0\x90\x80\x80-\xf0\x90\xbf\xbf]"), + newLexEntryDefaultNOP("4ByteChar", "[\xf0\x90\x80\x80-\xf0\x90\xbf\xbf]"), }, }, src: string([]byte{ @@ -339,18 +364,18 @@ func TestLexer_Next(t *testing.T) { 0xf0, 0x90, 0xbf, 0xbf, }), tokens: []*Token{ - newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})), - newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x81})), - newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0xbf, 0xbe})), - newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0xbf, 0xbf})), - newEOFToken(), + newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})), + newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x81})), + newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0xbf, 0xbe})), + newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0xbf, 0xbf})), + newEOFTokenDefault(), }, }, { lspec: &spec.LexSpec{ Entries: []*spec.LexEntry{ // The first byte are the same. - newLexEntry("4ByteChar", "[\xf0\x90\x80\x80-\xf0\xbf\xbf\xbf]"), + newLexEntryDefaultNOP("4ByteChar", "[\xf0\x90\x80\x80-\xf0\xbf\xbf\xbf]"), }, }, src: string([]byte{ @@ -360,18 +385,18 @@ func TestLexer_Next(t *testing.T) { 0xf0, 0xbf, 0xbf, 0xbf, }), tokens: []*Token{ - newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})), - newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x81})), - newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbe})), - newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbf})), - newEOFToken(), + newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})), + newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x81})), + newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbe})), + newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbf})), + newEOFTokenDefault(), }, }, { lspec: &spec.LexSpec{ Entries: []*spec.LexEntry{ // all 4 byte characters - newLexEntry("4ByteChar", "[\xf0\x90\x80\x80-\xf4\x8f\xbf\xbf]"), + newLexEntryDefaultNOP("4ByteChar", "[\xf0\x90\x80\x80-\xf4\x8f\xbf\xbf]"), }, }, src: string([]byte{ @@ -389,64 +414,114 @@ func TestLexer_Next(t *testing.T) { 0xf4, 0x8f, 0xbf, 0xbf, }), tokens: []*Token{ - newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})), - newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x81})), - newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbe})), - newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbf})), - newToken(1, "4ByteChar", newByteSequence([]byte{0xf1, 0x80, 0x80, 0x80})), - newToken(1, "4ByteChar", newByteSequence([]byte{0xf1, 0x80, 0x80, 0x81})), - newToken(1, "4ByteChar", newByteSequence([]byte{0xf3, 0xbf, 0xbf, 0xbe})), - newToken(1, "4ByteChar", newByteSequence([]byte{0xf3, 0xbf, 0xbf, 0xbf})), - newToken(1, "4ByteChar", newByteSequence([]byte{0xf4, 0x80, 0x80, 0x80})), - newToken(1, "4ByteChar", newByteSequence([]byte{0xf4, 0x80, 0x80, 0x81})), - newToken(1, "4ByteChar", newByteSequence([]byte{0xf4, 0x8f, 0xbf, 0xbe})), - newToken(1, "4ByteChar", newByteSequence([]byte{0xf4, 0x8f, 0xbf, 0xbf})), - newEOFToken(), + newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})), + newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x81})), + newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 
0xbe})), + newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbf})), + newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf1, 0x80, 0x80, 0x80})), + newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf1, 0x80, 0x80, 0x81})), + newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf3, 0xbf, 0xbf, 0xbe})), + newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf3, 0xbf, 0xbf, 0xbf})), + newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf4, 0x80, 0x80, 0x80})), + newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf4, 0x80, 0x80, 0x81})), + newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf4, 0x8f, 0xbf, 0xbe})), + newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf4, 0x8f, 0xbf, 0xbf})), + newEOFTokenDefault(), }, }, { lspec: &spec.LexSpec{ Entries: []*spec.LexEntry{ - newLexEntry("NonNumber", "[^0-9]+[0-9]"), + newLexEntryDefaultNOP("NonNumber", "[^0-9]+[0-9]"), }, }, src: "foo9", tokens: []*Token{ - newToken(1, "NonNumber", newByteSequence([]byte("foo9"))), - newEOFToken(), + newTokenDefault(1, "NonNumber", newByteSequence([]byte("foo9"))), + newEOFTokenDefault(), }, }, { lspec: &spec.LexSpec{ Entries: []*spec.LexEntry{ - newLexEntry("1ByteChar", "\\u{006E}"), - newLexEntry("2ByteChar", "\\u{03BD}"), - newLexEntry("3ByteChar", "\\u{306B}"), - newLexEntry("4ByteChar", "\\u{01F638}"), + newLexEntryDefaultNOP("1ByteChar", "\\u{006E}"), + newLexEntryDefaultNOP("2ByteChar", "\\u{03BD}"), + newLexEntryDefaultNOP("3ByteChar", "\\u{306B}"), + newLexEntryDefaultNOP("4ByteChar", "\\u{01F638}"), }, }, src: "nνに😸", tokens: []*Token{ - newToken(1, "1ByteChar", newByteSequence([]byte{0x6E})), - newToken(2, "2ByteChar", newByteSequence([]byte{0xCE, 0xBD})), - newToken(3, "3ByteChar", newByteSequence([]byte{0xE3, 0x81, 0xAB})), - newToken(4, "4ByteChar", newByteSequence([]byte{0xF0, 0x9F, 0x98, 0xB8})), - newEOFToken(), + newTokenDefault(1, "1ByteChar", newByteSequence([]byte{0x6E})), + newTokenDefault(2, "2ByteChar", newByteSequence([]byte{0xCE, 0xBD})), + newTokenDefault(3, "3ByteChar", newByteSequence([]byte{0xE3, 0x81, 0xAB})), + newTokenDefault(4, "4ByteChar", newByteSequence([]byte{0xF0, 0x9F, 0x98, 0xB8})), + newEOFTokenDefault(), }, }, { lspec: &spec.LexSpec{ Entries: []*spec.LexEntry{ - newLexEntry("codePointsAlt", "[\\u{006E}\\u{03BD}\\u{306B}\\u{01F638}]"), + newLexEntryDefaultNOP("codePointsAlt", "[\\u{006E}\\u{03BD}\\u{306B}\\u{01F638}]"), }, }, src: "nνに😸", tokens: []*Token{ - newToken(1, "codePointsAlt", newByteSequence([]byte{0x6E})), - newToken(1, "codePointsAlt", newByteSequence([]byte{0xCE, 0xBD})), - newToken(1, "codePointsAlt", newByteSequence([]byte{0xE3, 0x81, 0xAB})), - newToken(1, "codePointsAlt", newByteSequence([]byte{0xF0, 0x9F, 0x98, 0xB8})), - newEOFToken(), + newTokenDefault(1, "codePointsAlt", newByteSequence([]byte{0x6E})), + newTokenDefault(1, "codePointsAlt", newByteSequence([]byte{0xCE, 0xBD})), + newTokenDefault(1, "codePointsAlt", newByteSequence([]byte{0xE3, 0x81, 0xAB})), + newTokenDefault(1, "codePointsAlt", newByteSequence([]byte{0xF0, 0x9F, 0x98, 0xB8})), + newEOFTokenDefault(), + }, + }, + { + lspec: &spec.LexSpec{ + Entries: []*spec.LexEntry{ + newLexEntryDefaultNOP("white_space", ` *`), + newLexEntry([]string{"default"}, "string_open", `"`, "string", false), + newLexEntry([]string{"string"}, "escape_sequence", `\\[n"\\]`, "", false), + newLexEntry([]string{"string"}, "char_sequence", `[^"\\]*`, "", false), + newLexEntry([]string{"string"}, "string_close", `"`, "", true), + }, + }, + src: `"" 
"Hello world.\n\"Hello world.\""`, + tokens: []*Token{ + newToken(1, "default", 2, "string_open", newByteSequence([]byte(`"`))), + newToken(2, "string", 3, "string_close", newByteSequence([]byte(`"`))), + newToken(1, "default", 1, "white_space", newByteSequence([]byte(` `))), + newToken(1, "default", 2, "string_open", newByteSequence([]byte(`"`))), + newToken(2, "string", 2, "char_sequence", newByteSequence([]byte(`Hello world.`))), + newToken(2, "string", 1, "escape_sequence", newByteSequence([]byte(`\n`))), + newToken(2, "string", 1, "escape_sequence", newByteSequence([]byte(`\"`))), + newToken(2, "string", 2, "char_sequence", newByteSequence([]byte(`Hello world.`))), + newToken(2, "string", 1, "escape_sequence", newByteSequence([]byte(`\"`))), + newToken(2, "string", 3, "string_close", newByteSequence([]byte(`"`))), + newEOFTokenDefault(), + }, + }, + { + lspec: &spec.LexSpec{ + Entries: []*spec.LexEntry{ + // `white_space` is enabled in multiple modes. + newLexEntry([]string{"default", "state_a", "state_b"}, "white_space", ` *`, "", false), + newLexEntry([]string{"default"}, "char_a", `a`, "state_a", false), + newLexEntry([]string{"state_a"}, "char_b", `b`, "state_b", false), + newLexEntry([]string{"state_a"}, "back_from_a", `<`, "", true), + newLexEntry([]string{"state_b"}, "back_from_b", `<`, "", true), + }, + }, + src: ` a b < < `, + tokens: []*Token{ + newToken(1, "default", 1, "white_space", newByteSequence([]byte(` `))), + newToken(1, "default", 2, "char_a", newByteSequence([]byte(`a`))), + newToken(2, "state_a", 1, "white_space", newByteSequence([]byte(` `))), + newToken(2, "state_a", 2, "char_b", newByteSequence([]byte(`b`))), + newToken(3, "state_b", 1, "white_space", newByteSequence([]byte(` `))), + newToken(3, "state_b", 2, "back_from_b", newByteSequence([]byte(`<`))), + newToken(2, "state_a", 1, "white_space", newByteSequence([]byte(` `))), + newToken(2, "state_a", 3, "back_from_a", newByteSequence([]byte(`<`))), + newToken(1, "default", 1, "white_space", newByteSequence([]byte(` `))), + newEOFTokenDefault(), }, }, } @@ -479,8 +554,8 @@ func TestLexer_Next(t *testing.T) { func TestLexer_PeekN(t *testing.T) { clspec, err := compiler.Compile(&spec.LexSpec{ Entries: []*spec.LexEntry{ - newLexEntry("t1", "foo"), - newLexEntry("t2", "bar"), + newLexEntryDefaultNOP("t1", "foo"), + newLexEntryDefaultNOP("t2", "bar"), }, }) if err != nil { @@ -492,9 +567,9 @@ func TestLexer_PeekN(t *testing.T) { } expectedTokens := []*Token{ - newToken(1, "t1", []byte("foo")), - newToken(2, "t2", []byte("bar")), - newEOFToken(), + newTokenDefault(1, "t1", []byte("foo")), + newTokenDefault(2, "t2", []byte("bar")), + newEOFTokenDefault(), } tok, err := lex.Peek1() @@ -539,7 +614,13 @@ func TestLexer_PeekN(t *testing.T) { func testToken(t *testing.T, expected, actual *Token) { t.Helper() - if actual.ID != expected.ID || actual.Kind != expected.Kind || !bytes.Equal(actual.Match, expected.Match) || actual.EOF != expected.EOF || actual.Invalid != expected.Invalid { - t.Errorf("unexpected token; want: %v (\"%v\"), got: %v (\"%v\")", expected, string(expected.Match), actual, string(actual.Match)) + if actual.Mode != expected.Mode || + actual.ModeName != actual.ModeName || + actual.ID != expected.ID || + actual.Kind != expected.Kind || + !bytes.Equal(actual.Match, expected.Match) || + actual.EOF != expected.EOF || + actual.Invalid != expected.Invalid { + t.Fatalf(`unexpected token; want: %v ("%v"), got: %v ("%v")`, expected, string(expected.Match), actual, string(actual.Match)) } } |