Diffstat (limited to 'driver')
-rw-r--r--  driver/lexer.go       135
-rw-r--r--  driver/lexer_test.go  421
2 files changed, 350 insertions, 206 deletions
diff --git a/driver/lexer.go b/driver/lexer.go
index 8a2afd8..72a6f69 100644
--- a/driver/lexer.go
+++ b/driver/lexer.go
@@ -54,46 +54,54 @@ func (s byteSequence) merge(a byteSequence) byteSequence {
}
type Token struct {
- ID int `json:"id"`
- Kind string `json:"kind"`
- Match byteSequence `json:"match"`
- Text string `json:"text"`
- EOF bool `json:"eof"`
- Invalid bool `json:"invalid"`
+ Mode spec.LexModeNum `json:"mode"`
+ ModeName spec.LexModeName `json:"mode_name"`
+ ID int `json:"id"`
+ Kind string `json:"kind"`
+ Match byteSequence `json:"match"`
+ Text string `json:"text"`
+ EOF bool `json:"eof"`
+ Invalid bool `json:"invalid"`
}
-func newToken(id int, kind string, match byteSequence) *Token {
+func newToken(mode spec.LexModeNum, modeName spec.LexModeName, id int, kind string, match byteSequence) *Token {
return &Token{
- ID: id,
- Kind: kind,
- Match: match,
- Text: string(match.ByteSlice()),
+ Mode: mode,
+ ModeName: modeName,
+ ID: id,
+ Kind: kind,
+ Match: match,
+ Text: string(match.ByteSlice()),
}
}
-func newEOFToken() *Token {
+func newEOFToken(mode spec.LexModeNum, modeName spec.LexModeName) *Token {
return &Token{
- ID: 0,
- EOF: true,
+ Mode: mode,
+ ModeName: modeName,
+ ID: 0,
+ EOF: true,
}
}
-func newInvalidToken(match byteSequence) *Token {
+func newInvalidToken(mode spec.LexModeNum, modeName spec.LexModeName, match byteSequence) *Token {
return &Token{
- ID: 0,
- Match: match,
- Invalid: true,
+ Mode: mode,
+ ModeName: modeName,
+ ID: 0,
+ Match: match,
+ Invalid: true,
}
}
func (t *Token) String() string {
if t.Invalid {
- return fmt.Sprintf("!{text: %v, byte: %v}", t.Text, t.Match)
+ return fmt.Sprintf("!{mode: %v, mode name: %v, text: %v, byte: %v}", t.Mode, t.ModeName, t.Text, t.Match)
}
if t.EOF {
return "{eof}"
}
- return fmt.Sprintf("{id: %v, kind: %v, text: %v, byte: %v}", t.ID, t.Kind, t.Text, t.Match)
+ return fmt.Sprintf("{mode: %v, mode name: %v, id: %v, kind: %v, text: %v, byte: %v}", t.Mode, t.ModeName, t.ID, t.Kind, t.Text, t.Match)
}
type lexerOption func(l *lexer) error
@@ -110,11 +118,12 @@ func EnableLogging(w io.Writer) lexerOption {
}
type lexer struct {
- clspec *spec.CompiledLexSpec
- src []byte
- srcPtr int
- tokBuf []*Token
- logger log.Logger
+ clspec *spec.CompiledLexSpec
+ src []byte
+ srcPtr int
+ tokBuf []*Token
+ modeStack []spec.LexModeNum
+ logger log.Logger
}
func NewLexer(clspec *spec.CompiledLexSpec, src io.Reader, opts ...lexerOption) (*lexer, error) {
@@ -126,6 +135,9 @@ func NewLexer(clspec *spec.CompiledLexSpec, src io.Reader, opts ...lexerOption)
clspec: clspec,
src: b,
srcPtr: 0,
+ modeStack: []spec.LexModeNum{
+ clspec.InitialMode,
+ },
logger: log.NewNopLogger(),
}
for _, opt := range opts {
@@ -142,8 +154,9 @@ func NewLexer(clspec *spec.CompiledLexSpec, src io.Reader, opts ...lexerOption)
func (l *lexer) Next() (*Token, error) {
l.logger.Log(`lexer#Next():
State:
+ mode: #%v %v
pointer: %v
- token buffer: %v`, l.srcPtr, l.tokBuf)
+ token buffer: %v`, l.mode(), l.clspec.Modes[l.mode()], l.srcPtr, l.tokBuf)
if len(l.tokBuf) > 0 {
tok := l.tokBuf[0]
@@ -154,7 +167,7 @@ func (l *lexer) Next() (*Token, error) {
return tok, nil
}
- tok, err := l.next()
+ tok, err := l.nextAndTranMode()
if err != nil {
l.logger.Log(" Detectes an error: %v", err)
return nil, err
@@ -168,7 +181,7 @@ func (l *lexer) Next() (*Token, error) {
}
errTok := tok
for {
- tok, err = l.next()
+ tok, err = l.nextAndTranMode()
if err != nil {
l.logger.Log(" Detectes an error: %v", err)
return nil, err
@@ -205,7 +218,7 @@ func (l *lexer) peekN(n int) (*Token, error) {
return nil, fmt.Errorf("peekN() can handle only [0..2]")
}
for len(l.tokBuf) < n+1 {
- tok, err := l.next()
+ tok, err := l.nextAndTranMode()
if err != nil {
return nil, err
}
@@ -214,8 +227,41 @@ func (l *lexer) peekN(n int) (*Token, error) {
return l.tokBuf[n], nil
}
+func (l *lexer) nextAndTranMode() (*Token, error) {
+ tok, err := l.next()
+ if err != nil {
+ return nil, err
+ }
+ if tok.EOF || tok.Invalid {
+ return tok, nil
+ }
+ spec := l.clspec.Specs[l.mode()]
+ if spec.Pop[tok.ID] == 1 {
+ err := l.popMode()
+ if err != nil {
+ return nil, err
+ }
+ }
+ mode := spec.Push[tok.ID]
+ if !mode.IsNil() {
+ l.pushMode(mode)
+ }
+ // The length of the mode stack must be checked after both the pop and push operations
+ // because a single token can perform both at the same time.
+ // When the mode stack has just one element and the lexer pops it, the stack becomes
+ // temporarily empty. However, since a push may be performed immediately afterward,
+ // the lexer allows the stack to be empty at this point.
+ if len(l.modeStack) == 0 {
+ return nil, fmt.Errorf("a mode stack must have at least one element")
+ }
+ return tok, nil
+}
+
func (l *lexer) next() (*Token, error) {
- state := l.clspec.DFA.InitialState
+ mode := l.mode()
+ modeName := l.clspec.Modes[mode]
+ spec := l.clspec.Specs[mode]
+ state := spec.DFA.InitialState
buf := []byte{}
unfixedBufLen := 0
var tok *Token
@@ -229,13 +275,13 @@ func (l *lexer) next() (*Token, error) {
// When `buf` still holds unaccepted data and the lexer reads EOF,
// the lexer treats the buffered data as an invalid token.
if len(buf) > 0 {
- return newInvalidToken(newByteSequence(buf)), nil
+ return newInvalidToken(mode, modeName, newByteSequence(buf)), nil
}
- return newEOFToken(), nil
+ return newEOFToken(mode, modeName), nil
}
buf = append(buf, v)
unfixedBufLen++
- entry := l.clspec.DFA.Transition[state]
+ entry := spec.DFA.Transition[state]
if len(entry) == 0 {
return nil, fmt.Errorf("no transition entry; state: %v", state)
}
@@ -245,17 +291,34 @@ func (l *lexer) next() (*Token, error) {
l.unread(unfixedBufLen)
return tok, nil
}
- return newInvalidToken(newByteSequence(buf)), nil
+ return newInvalidToken(mode, modeName, newByteSequence(buf)), nil
}
state = nextState
- id, ok := l.clspec.DFA.AcceptingStates[state]
+ id, ok := spec.DFA.AcceptingStates[state]
if ok {
- tok = newToken(id, l.clspec.Kinds[id].String(), newByteSequence(buf))
+ tok = newToken(mode, modeName, id, spec.Kinds[id].String(), newByteSequence(buf))
unfixedBufLen = 0
}
}
}
+func (l *lexer) mode() spec.LexModeNum {
+ return l.modeStack[len(l.modeStack)-1]
+}
+
+func (l *lexer) pushMode(mode spec.LexModeNum) {
+ l.modeStack = append(l.modeStack, mode)
+}
+
+func (l *lexer) popMode() error {
+ sLen := len(l.modeStack)
+ if sLen == 0 {
+ return fmt.Errorf("cannot pop a lex mode from a lex mode stack any more")
+ }
+ l.modeStack = l.modeStack[:sLen-1]
+ return nil
+}
+
func (l *lexer) read() (byte, bool) {
if l.srcPtr >= len(l.src) {
return 0, true
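
The subtle point in nextAndTranMode above is the ordering: a pop requested by the recognized token runs before a push, so the mode stack may be transiently empty between the two operations, which is why the emptiness check comes only after both. The following is a minimal standalone sketch of that discipline, not part of this commit; plain ints stand in for spec.LexModeNum, and the mode names are hypothetical.

package main

import "fmt"

type modeStack []int

func (s *modeStack) top() int   { return (*s)[len(*s)-1] }
func (s *modeStack) push(m int) { *s = append(*s, m) }
func (s *modeStack) pop()       { *s = (*s)[:len(*s)-1] }

func main() {
	const (
		modeDefault = 1 // stand-in for spec.LexModeNumDefault
		modeString  = 2 // hypothetical mode a `string_open` token would push
	)
	stack := modeStack{modeDefault}

	// A token whose entry sets both Pop and Push leaves one mode and
	// enters another in a single step: the pop runs first, then the push.
	stack.pop()            // the stack is transiently empty here...
	stack.push(modeString) // ...but the push immediately refills it.

	// Only after both operations does the lexer validate the stack.
	if len(stack) == 0 {
		panic("a mode stack must have at least one element")
	}
	fmt.Println(stack.top()) // prints 2
}
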
diff --git a/driver/lexer_test.go b/driver/lexer_test.go
index 26b5d49..d3edb3c 100644
--- a/driver/lexer_test.go
+++ b/driver/lexer_test.go
@@ -10,13 +10,38 @@ import (
"github.com/nihei9/maleeni/spec"
)
-func newLexEntry(kind string, pattern string) *spec.LexEntry {
+func newLexEntry(modes []string, kind string, pattern string, push string, pop bool) *spec.LexEntry {
+ ms := []spec.LexModeName{}
+ for _, m := range modes {
+ ms = append(ms, spec.LexModeName(m))
+ }
+ return &spec.LexEntry{
+ Kind: spec.LexKind(kind),
+ Pattern: spec.LexPattern(pattern),
+ Modes: ms,
+ Push: spec.LexModeName(push),
+ Pop: pop,
+ }
+}
+
+func newLexEntryDefaultNOP(kind string, pattern string) *spec.LexEntry {
return &spec.LexEntry{
Kind: spec.LexKind(kind),
Pattern: spec.LexPattern(pattern),
+ Modes: []spec.LexModeName{
+ spec.LexModeNameDefault,
+ },
}
}
+func newTokenDefault(id int, kind string, match byteSequence) *Token {
+ return newToken(spec.LexModeNumDefault, spec.LexModeNameDefault, id, kind, match)
+}
+
+func newEOFTokenDefault() *Token {
+ return newEOFToken(spec.LexModeNumDefault, spec.LexModeNameDefault)
+}
+
func TestLexer_Next(t *testing.T) {
test := []struct {
lspec *spec.LexSpec
@@ -26,58 +51,58 @@ func TestLexer_Next(t *testing.T) {
{
lspec: &spec.LexSpec{
Entries: []*spec.LexEntry{
- newLexEntry("t1", "(a|b)*abb"),
- newLexEntry("t2", " +"),
+ newLexEntryDefaultNOP("t1", "(a|b)*abb"),
+ newLexEntryDefaultNOP("t2", " +"),
},
},
src: "abb aabb aaabb babb bbabb abbbabb",
tokens: []*Token{
- newToken(1, "t1", newByteSequence([]byte("abb"))),
- newToken(2, "t2", newByteSequence([]byte(" "))),
- newToken(1, "t1", newByteSequence([]byte("aabb"))),
- newToken(2, "t2", newByteSequence([]byte(" "))),
- newToken(1, "t1", newByteSequence([]byte("aaabb"))),
- newToken(2, "t2", newByteSequence([]byte(" "))),
- newToken(1, "t1", newByteSequence([]byte("babb"))),
- newToken(2, "t2", newByteSequence([]byte(" "))),
- newToken(1, "t1", newByteSequence([]byte("bbabb"))),
- newToken(2, "t2", newByteSequence([]byte(" "))),
- newToken(1, "t1", newByteSequence([]byte("abbbabb"))),
- newEOFToken(),
+ newTokenDefault(1, "t1", newByteSequence([]byte("abb"))),
+ newTokenDefault(2, "t2", newByteSequence([]byte(" "))),
+ newTokenDefault(1, "t1", newByteSequence([]byte("aabb"))),
+ newTokenDefault(2, "t2", newByteSequence([]byte(" "))),
+ newTokenDefault(1, "t1", newByteSequence([]byte("aaabb"))),
+ newTokenDefault(2, "t2", newByteSequence([]byte(" "))),
+ newTokenDefault(1, "t1", newByteSequence([]byte("babb"))),
+ newTokenDefault(2, "t2", newByteSequence([]byte(" "))),
+ newTokenDefault(1, "t1", newByteSequence([]byte("bbabb"))),
+ newTokenDefault(2, "t2", newByteSequence([]byte(" "))),
+ newTokenDefault(1, "t1", newByteSequence([]byte("abbbabb"))),
+ newEOFTokenDefault(),
},
},
{
lspec: &spec.LexSpec{
Entries: []*spec.LexEntry{
- newLexEntry("t1", "b?a+"),
- newLexEntry("t2", "(ab)?(cd)+"),
- newLexEntry("t3", " +"),
+ newLexEntryDefaultNOP("t1", "b?a+"),
+ newLexEntryDefaultNOP("t2", "(ab)?(cd)+"),
+ newLexEntryDefaultNOP("t3", " +"),
},
},
src: "ba baaa a aaa abcd abcdcdcd cd cdcdcd",
tokens: []*Token{
- newToken(1, "t1", newByteSequence([]byte("ba"))),
- newToken(3, "t3", newByteSequence([]byte(" "))),
- newToken(1, "t1", newByteSequence([]byte("baaa"))),
- newToken(3, "t3", newByteSequence([]byte(" "))),
- newToken(1, "t1", newByteSequence([]byte("a"))),
- newToken(3, "t3", newByteSequence([]byte(" "))),
- newToken(1, "t1", newByteSequence([]byte("aaa"))),
- newToken(3, "t3", newByteSequence([]byte(" "))),
- newToken(2, "t2", newByteSequence([]byte("abcd"))),
- newToken(3, "t3", newByteSequence([]byte(" "))),
- newToken(2, "t2", newByteSequence([]byte("abcdcdcd"))),
- newToken(3, "t3", newByteSequence([]byte(" "))),
- newToken(2, "t2", newByteSequence([]byte("cd"))),
- newToken(3, "t3", newByteSequence([]byte(" "))),
- newToken(2, "t2", newByteSequence([]byte("cdcdcd"))),
- newEOFToken(),
+ newTokenDefault(1, "t1", newByteSequence([]byte("ba"))),
+ newTokenDefault(3, "t3", newByteSequence([]byte(" "))),
+ newTokenDefault(1, "t1", newByteSequence([]byte("baaa"))),
+ newTokenDefault(3, "t3", newByteSequence([]byte(" "))),
+ newTokenDefault(1, "t1", newByteSequence([]byte("a"))),
+ newTokenDefault(3, "t3", newByteSequence([]byte(" "))),
+ newTokenDefault(1, "t1", newByteSequence([]byte("aaa"))),
+ newTokenDefault(3, "t3", newByteSequence([]byte(" "))),
+ newTokenDefault(2, "t2", newByteSequence([]byte("abcd"))),
+ newTokenDefault(3, "t3", newByteSequence([]byte(" "))),
+ newTokenDefault(2, "t2", newByteSequence([]byte("abcdcdcd"))),
+ newTokenDefault(3, "t3", newByteSequence([]byte(" "))),
+ newTokenDefault(2, "t2", newByteSequence([]byte("cd"))),
+ newTokenDefault(3, "t3", newByteSequence([]byte(" "))),
+ newTokenDefault(2, "t2", newByteSequence([]byte("cdcdcd"))),
+ newEOFTokenDefault(),
},
},
{
lspec: &spec.LexSpec{
Entries: []*spec.LexEntry{
- newLexEntry("t1", "."),
+ newLexEntryDefaultNOP("t1", "."),
},
},
src: string([]byte{
@@ -99,45 +124,45 @@ func TestLexer_Next(t *testing.T) {
0xf4, 0x8f, 0xbf, 0xbf,
}),
tokens: []*Token{
- newToken(1, "t1", newByteSequence([]byte{0x00})),
- newToken(1, "t1", newByteSequence([]byte{0x7f})),
- newToken(1, "t1", newByteSequence([]byte{0xc2, 0x80})),
- newToken(1, "t1", newByteSequence([]byte{0xdf, 0xbf})),
- newToken(1, "t1", newByteSequence([]byte{0xe1, 0x80, 0x80})),
- newToken(1, "t1", newByteSequence([]byte{0xec, 0xbf, 0xbf})),
- newToken(1, "t1", newByteSequence([]byte{0xed, 0x80, 0x80})),
- newToken(1, "t1", newByteSequence([]byte{0xed, 0x9f, 0xbf})),
- newToken(1, "t1", newByteSequence([]byte{0xee, 0x80, 0x80})),
- newToken(1, "t1", newByteSequence([]byte{0xef, 0xbf, 0xbf})),
- newToken(1, "t1", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})),
- newToken(1, "t1", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbf})),
- newToken(1, "t1", newByteSequence([]byte{0xf1, 0x80, 0x80, 0x80})),
- newToken(1, "t1", newByteSequence([]byte{0xf3, 0xbf, 0xbf, 0xbf})),
- newToken(1, "t1", newByteSequence([]byte{0xf4, 0x80, 0x80, 0x80})),
- newToken(1, "t1", newByteSequence([]byte{0xf4, 0x8f, 0xbf, 0xbf})),
- newEOFToken(),
+ newTokenDefault(1, "t1", newByteSequence([]byte{0x00})),
+ newTokenDefault(1, "t1", newByteSequence([]byte{0x7f})),
+ newTokenDefault(1, "t1", newByteSequence([]byte{0xc2, 0x80})),
+ newTokenDefault(1, "t1", newByteSequence([]byte{0xdf, 0xbf})),
+ newTokenDefault(1, "t1", newByteSequence([]byte{0xe1, 0x80, 0x80})),
+ newTokenDefault(1, "t1", newByteSequence([]byte{0xec, 0xbf, 0xbf})),
+ newTokenDefault(1, "t1", newByteSequence([]byte{0xed, 0x80, 0x80})),
+ newTokenDefault(1, "t1", newByteSequence([]byte{0xed, 0x9f, 0xbf})),
+ newTokenDefault(1, "t1", newByteSequence([]byte{0xee, 0x80, 0x80})),
+ newTokenDefault(1, "t1", newByteSequence([]byte{0xef, 0xbf, 0xbf})),
+ newTokenDefault(1, "t1", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})),
+ newTokenDefault(1, "t1", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbf})),
+ newTokenDefault(1, "t1", newByteSequence([]byte{0xf1, 0x80, 0x80, 0x80})),
+ newTokenDefault(1, "t1", newByteSequence([]byte{0xf3, 0xbf, 0xbf, 0xbf})),
+ newTokenDefault(1, "t1", newByteSequence([]byte{0xf4, 0x80, 0x80, 0x80})),
+ newTokenDefault(1, "t1", newByteSequence([]byte{0xf4, 0x8f, 0xbf, 0xbf})),
+ newEOFTokenDefault(),
},
},
{
lspec: &spec.LexSpec{
Entries: []*spec.LexEntry{
- newLexEntry("t1", "[ab.*+?|()[\\]]"),
+ newLexEntryDefaultNOP("t1", "[ab.*+?|()[\\]]"),
},
},
src: "ab.*+?|()[]",
tokens: []*Token{
- newToken(1, "t1", newByteSequence([]byte("a"))),
- newToken(1, "t1", newByteSequence([]byte("b"))),
- newToken(1, "t1", newByteSequence([]byte("."))),
- newToken(1, "t1", newByteSequence([]byte("*"))),
- newToken(1, "t1", newByteSequence([]byte("+"))),
- newToken(1, "t1", newByteSequence([]byte("?"))),
- newToken(1, "t1", newByteSequence([]byte("|"))),
- newToken(1, "t1", newByteSequence([]byte("("))),
- newToken(1, "t1", newByteSequence([]byte(")"))),
- newToken(1, "t1", newByteSequence([]byte("["))),
- newToken(1, "t1", newByteSequence([]byte("]"))),
- newEOFToken(),
+ newTokenDefault(1, "t1", newByteSequence([]byte("a"))),
+ newTokenDefault(1, "t1", newByteSequence([]byte("b"))),
+ newTokenDefault(1, "t1", newByteSequence([]byte("."))),
+ newTokenDefault(1, "t1", newByteSequence([]byte("*"))),
+ newTokenDefault(1, "t1", newByteSequence([]byte("+"))),
+ newTokenDefault(1, "t1", newByteSequence([]byte("?"))),
+ newTokenDefault(1, "t1", newByteSequence([]byte("|"))),
+ newTokenDefault(1, "t1", newByteSequence([]byte("("))),
+ newTokenDefault(1, "t1", newByteSequence([]byte(")"))),
+ newTokenDefault(1, "t1", newByteSequence([]byte("["))),
+ newTokenDefault(1, "t1", newByteSequence([]byte("]"))),
+ newEOFTokenDefault(),
},
},
{
@@ -149,7 +174,7 @@ func TestLexer_Next(t *testing.T) {
// maleeni cannot handle the null character in patterns because compiler.lexer,
// specifically read() and restore(), treats the null character as the absence of a symbol.
// If a pattern needs a null character, use code point expression \u{0000}.
- newLexEntry("1ByteChar", "[\x01-\x7f]"),
+ newLexEntryDefaultNOP("1ByteChar", "[\x01-\x7f]"),
},
},
src: string([]byte{
@@ -159,18 +184,18 @@ func TestLexer_Next(t *testing.T) {
0x7f,
}),
tokens: []*Token{
- newToken(1, "1ByteChar", newByteSequence([]byte{0x01})),
- newToken(1, "1ByteChar", newByteSequence([]byte{0x02})),
- newToken(1, "1ByteChar", newByteSequence([]byte{0x7e})),
- newToken(1, "1ByteChar", newByteSequence([]byte{0x7f})),
- newEOFToken(),
+ newTokenDefault(1, "1ByteChar", newByteSequence([]byte{0x01})),
+ newTokenDefault(1, "1ByteChar", newByteSequence([]byte{0x02})),
+ newTokenDefault(1, "1ByteChar", newByteSequence([]byte{0x7e})),
+ newTokenDefault(1, "1ByteChar", newByteSequence([]byte{0x7f})),
+ newEOFTokenDefault(),
},
},
{
lspec: &spec.LexSpec{
Entries: []*spec.LexEntry{
// all 2 byte characters
- newLexEntry("2ByteChar", "[\xc2\x80-\xdf\xbf]"),
+ newLexEntryDefaultNOP("2ByteChar", "[\xc2\x80-\xdf\xbf]"),
},
},
src: string([]byte{
@@ -180,33 +205,33 @@ func TestLexer_Next(t *testing.T) {
0xdf, 0xbf,
}),
tokens: []*Token{
- newToken(1, "2ByteChar", newByteSequence([]byte{0xc2, 0x80})),
- newToken(1, "2ByteChar", newByteSequence([]byte{0xc2, 0x81})),
- newToken(1, "2ByteChar", newByteSequence([]byte{0xdf, 0xbe})),
- newToken(1, "2ByteChar", newByteSequence([]byte{0xdf, 0xbf})),
- newEOFToken(),
+ newTokenDefault(1, "2ByteChar", newByteSequence([]byte{0xc2, 0x80})),
+ newTokenDefault(1, "2ByteChar", newByteSequence([]byte{0xc2, 0x81})),
+ newTokenDefault(1, "2ByteChar", newByteSequence([]byte{0xdf, 0xbe})),
+ newTokenDefault(1, "2ByteChar", newByteSequence([]byte{0xdf, 0xbf})),
+ newEOFTokenDefault(),
},
},
{
lspec: &spec.LexSpec{
Entries: []*spec.LexEntry{
// All bytes are the same.
- newLexEntry("3ByteChar", "[\xe0\xa0\x80-\xe0\xa0\x80]"),
+ newLexEntryDefaultNOP("3ByteChar", "[\xe0\xa0\x80-\xe0\xa0\x80]"),
},
},
src: string([]byte{
0xe0, 0xa0, 0x80,
}),
tokens: []*Token{
- newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x80})),
- newEOFToken(),
+ newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x80})),
+ newEOFTokenDefault(),
},
},
{
lspec: &spec.LexSpec{
Entries: []*spec.LexEntry{
// The first two bytes are the same.
- newLexEntry("3ByteChar", "[\xe0\xa0\x80-\xe0\xa0\xbf]"),
+ newLexEntryDefaultNOP("3ByteChar", "[\xe0\xa0\x80-\xe0\xa0\xbf]"),
},
},
src: string([]byte{
@@ -216,18 +241,18 @@ func TestLexer_Next(t *testing.T) {
0xe0, 0xa0, 0xbf,
}),
tokens: []*Token{
- newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x80})),
- newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x81})),
- newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0xbe})),
- newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0xbf})),
- newEOFToken(),
+ newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x80})),
+ newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x81})),
+ newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0xbe})),
+ newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0xbf})),
+ newEOFTokenDefault(),
},
},
{
lspec: &spec.LexSpec{
Entries: []*spec.LexEntry{
// The first byte is the same.
- newLexEntry("3ByteChar", "[\xe0\xa0\x80-\xe0\xbf\xbf]"),
+ newLexEntryDefaultNOP("3ByteChar", "[\xe0\xa0\x80-\xe0\xbf\xbf]"),
},
},
src: string([]byte{
@@ -237,18 +262,18 @@ func TestLexer_Next(t *testing.T) {
0xe0, 0xbf, 0xbf,
}),
tokens: []*Token{
- newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x80})),
- newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x81})),
- newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xbf, 0xbe})),
- newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xbf, 0xbf})),
- newEOFToken(),
+ newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x80})),
+ newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x81})),
+ newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xbf, 0xbe})),
+ newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xbf, 0xbf})),
+ newEOFTokenDefault(),
},
},
{
lspec: &spec.LexSpec{
Entries: []*spec.LexEntry{
// all 3 byte characters
- newLexEntry("3ByteChar", "[\xe0\xa0\x80-\xef\xbf\xbf]"),
+ newLexEntryDefaultNOP("3ByteChar", "[\xe0\xa0\x80-\xef\xbf\xbf]"),
},
},
src: string([]byte{
@@ -270,45 +295,45 @@ func TestLexer_Next(t *testing.T) {
0xef, 0xbf, 0xbf,
}),
tokens: []*Token{
- newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x80})),
- newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x81})),
- newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xbf, 0xbe})),
- newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xbf, 0xbf})),
- newToken(1, "3ByteChar", newByteSequence([]byte{0xe1, 0x80, 0x80})),
- newToken(1, "3ByteChar", newByteSequence([]byte{0xe1, 0x80, 0x81})),
- newToken(1, "3ByteChar", newByteSequence([]byte{0xec, 0xbf, 0xbe})),
- newToken(1, "3ByteChar", newByteSequence([]byte{0xec, 0xbf, 0xbf})),
- newToken(1, "3ByteChar", newByteSequence([]byte{0xed, 0x80, 0x80})),
- newToken(1, "3ByteChar", newByteSequence([]byte{0xed, 0x80, 0x81})),
- newToken(1, "3ByteChar", newByteSequence([]byte{0xed, 0x9f, 0xbe})),
- newToken(1, "3ByteChar", newByteSequence([]byte{0xed, 0x9f, 0xbf})),
- newToken(1, "3ByteChar", newByteSequence([]byte{0xee, 0x80, 0x80})),
- newToken(1, "3ByteChar", newByteSequence([]byte{0xee, 0x80, 0x81})),
- newToken(1, "3ByteChar", newByteSequence([]byte{0xef, 0xbf, 0xbe})),
- newToken(1, "3ByteChar", newByteSequence([]byte{0xef, 0xbf, 0xbf})),
- newEOFToken(),
+ newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x80})),
+ newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x81})),
+ newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xbf, 0xbe})),
+ newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xbf, 0xbf})),
+ newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe1, 0x80, 0x80})),
+ newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe1, 0x80, 0x81})),
+ newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xec, 0xbf, 0xbe})),
+ newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xec, 0xbf, 0xbf})),
+ newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xed, 0x80, 0x80})),
+ newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xed, 0x80, 0x81})),
+ newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xed, 0x9f, 0xbe})),
+ newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xed, 0x9f, 0xbf})),
+ newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xee, 0x80, 0x80})),
+ newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xee, 0x80, 0x81})),
+ newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xef, 0xbf, 0xbe})),
+ newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xef, 0xbf, 0xbf})),
+ newEOFTokenDefault(),
},
},
{
lspec: &spec.LexSpec{
Entries: []*spec.LexEntry{
// All bytes are the same.
- newLexEntry("4ByteChar", "[\xf0\x90\x80\x80-\xf0\x90\x80\x80]"),
+ newLexEntryDefaultNOP("4ByteChar", "[\xf0\x90\x80\x80-\xf0\x90\x80\x80]"),
},
},
src: string([]byte{
0xf0, 0x90, 0x80, 0x80,
}),
tokens: []*Token{
- newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})),
- newEOFToken(),
+ newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})),
+ newEOFTokenDefault(),
},
},
{
lspec: &spec.LexSpec{
Entries: []*spec.LexEntry{
// The first 3 bytes are the same.
- newLexEntry("4ByteChar", "[\xf0\x90\x80\x80-\xf0\x90\x80\xbf]"),
+ newLexEntryDefaultNOP("4ByteChar", "[\xf0\x90\x80\x80-\xf0\x90\x80\xbf]"),
},
},
src: string([]byte{
@@ -318,18 +343,18 @@ func TestLexer_Next(t *testing.T) {
0xf0, 0x90, 0x80, 0xbf,
}),
tokens: []*Token{
- newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})),
- newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x81})),
- newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0xbe})),
- newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0xbf})),
- newEOFToken(),
+ newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})),
+ newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x81})),
+ newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0xbe})),
+ newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0xbf})),
+ newEOFTokenDefault(),
},
},
{
lspec: &spec.LexSpec{
Entries: []*spec.LexEntry{
// The first 2 bytes are the same.
- newLexEntry("4ByteChar", "[\xf0\x90\x80\x80-\xf0\x90\xbf\xbf]"),
+ newLexEntryDefaultNOP("4ByteChar", "[\xf0\x90\x80\x80-\xf0\x90\xbf\xbf]"),
},
},
src: string([]byte{
@@ -339,18 +364,18 @@ func TestLexer_Next(t *testing.T) {
0xf0, 0x90, 0xbf, 0xbf,
}),
tokens: []*Token{
- newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})),
- newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x81})),
- newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0xbf, 0xbe})),
- newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0xbf, 0xbf})),
- newEOFToken(),
+ newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})),
+ newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x81})),
+ newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0xbf, 0xbe})),
+ newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0xbf, 0xbf})),
+ newEOFTokenDefault(),
},
},
{
lspec: &spec.LexSpec{
Entries: []*spec.LexEntry{
// The first byte is the same.
- newLexEntry("4ByteChar", "[\xf0\x90\x80\x80-\xf0\xbf\xbf\xbf]"),
+ newLexEntryDefaultNOP("4ByteChar", "[\xf0\x90\x80\x80-\xf0\xbf\xbf\xbf]"),
},
},
src: string([]byte{
@@ -360,18 +385,18 @@ func TestLexer_Next(t *testing.T) {
0xf0, 0xbf, 0xbf, 0xbf,
}),
tokens: []*Token{
- newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})),
- newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x81})),
- newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbe})),
- newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbf})),
- newEOFToken(),
+ newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})),
+ newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x81})),
+ newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbe})),
+ newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbf})),
+ newEOFTokenDefault(),
},
},
{
lspec: &spec.LexSpec{
Entries: []*spec.LexEntry{
// all 4 byte characters
- newLexEntry("4ByteChar", "[\xf0\x90\x80\x80-\xf4\x8f\xbf\xbf]"),
+ newLexEntryDefaultNOP("4ByteChar", "[\xf0\x90\x80\x80-\xf4\x8f\xbf\xbf]"),
},
},
src: string([]byte{
@@ -389,64 +414,114 @@ func TestLexer_Next(t *testing.T) {
0xf4, 0x8f, 0xbf, 0xbf,
}),
tokens: []*Token{
- newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})),
- newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x81})),
- newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbe})),
- newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbf})),
- newToken(1, "4ByteChar", newByteSequence([]byte{0xf1, 0x80, 0x80, 0x80})),
- newToken(1, "4ByteChar", newByteSequence([]byte{0xf1, 0x80, 0x80, 0x81})),
- newToken(1, "4ByteChar", newByteSequence([]byte{0xf3, 0xbf, 0xbf, 0xbe})),
- newToken(1, "4ByteChar", newByteSequence([]byte{0xf3, 0xbf, 0xbf, 0xbf})),
- newToken(1, "4ByteChar", newByteSequence([]byte{0xf4, 0x80, 0x80, 0x80})),
- newToken(1, "4ByteChar", newByteSequence([]byte{0xf4, 0x80, 0x80, 0x81})),
- newToken(1, "4ByteChar", newByteSequence([]byte{0xf4, 0x8f, 0xbf, 0xbe})),
- newToken(1, "4ByteChar", newByteSequence([]byte{0xf4, 0x8f, 0xbf, 0xbf})),
- newEOFToken(),
+ newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})),
+ newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x81})),
+ newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbe})),
+ newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbf})),
+ newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf1, 0x80, 0x80, 0x80})),
+ newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf1, 0x80, 0x80, 0x81})),
+ newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf3, 0xbf, 0xbf, 0xbe})),
+ newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf3, 0xbf, 0xbf, 0xbf})),
+ newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf4, 0x80, 0x80, 0x80})),
+ newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf4, 0x80, 0x80, 0x81})),
+ newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf4, 0x8f, 0xbf, 0xbe})),
+ newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf4, 0x8f, 0xbf, 0xbf})),
+ newEOFTokenDefault(),
},
},
{
lspec: &spec.LexSpec{
Entries: []*spec.LexEntry{
- newLexEntry("NonNumber", "[^0-9]+[0-9]"),
+ newLexEntryDefaultNOP("NonNumber", "[^0-9]+[0-9]"),
},
},
src: "foo9",
tokens: []*Token{
- newToken(1, "NonNumber", newByteSequence([]byte("foo9"))),
- newEOFToken(),
+ newTokenDefault(1, "NonNumber", newByteSequence([]byte("foo9"))),
+ newEOFTokenDefault(),
},
},
{
lspec: &spec.LexSpec{
Entries: []*spec.LexEntry{
- newLexEntry("1ByteChar", "\\u{006E}"),
- newLexEntry("2ByteChar", "\\u{03BD}"),
- newLexEntry("3ByteChar", "\\u{306B}"),
- newLexEntry("4ByteChar", "\\u{01F638}"),
+ newLexEntryDefaultNOP("1ByteChar", "\\u{006E}"),
+ newLexEntryDefaultNOP("2ByteChar", "\\u{03BD}"),
+ newLexEntryDefaultNOP("3ByteChar", "\\u{306B}"),
+ newLexEntryDefaultNOP("4ByteChar", "\\u{01F638}"),
},
},
src: "nνに😸",
tokens: []*Token{
- newToken(1, "1ByteChar", newByteSequence([]byte{0x6E})),
- newToken(2, "2ByteChar", newByteSequence([]byte{0xCE, 0xBD})),
- newToken(3, "3ByteChar", newByteSequence([]byte{0xE3, 0x81, 0xAB})),
- newToken(4, "4ByteChar", newByteSequence([]byte{0xF0, 0x9F, 0x98, 0xB8})),
- newEOFToken(),
+ newTokenDefault(1, "1ByteChar", newByteSequence([]byte{0x6E})),
+ newTokenDefault(2, "2ByteChar", newByteSequence([]byte{0xCE, 0xBD})),
+ newTokenDefault(3, "3ByteChar", newByteSequence([]byte{0xE3, 0x81, 0xAB})),
+ newTokenDefault(4, "4ByteChar", newByteSequence([]byte{0xF0, 0x9F, 0x98, 0xB8})),
+ newEOFTokenDefault(),
},
},
{
lspec: &spec.LexSpec{
Entries: []*spec.LexEntry{
- newLexEntry("codePointsAlt", "[\\u{006E}\\u{03BD}\\u{306B}\\u{01F638}]"),
+ newLexEntryDefaultNOP("codePointsAlt", "[\\u{006E}\\u{03BD}\\u{306B}\\u{01F638}]"),
},
},
src: "nνに😸",
tokens: []*Token{
- newToken(1, "codePointsAlt", newByteSequence([]byte{0x6E})),
- newToken(1, "codePointsAlt", newByteSequence([]byte{0xCE, 0xBD})),
- newToken(1, "codePointsAlt", newByteSequence([]byte{0xE3, 0x81, 0xAB})),
- newToken(1, "codePointsAlt", newByteSequence([]byte{0xF0, 0x9F, 0x98, 0xB8})),
- newEOFToken(),
+ newTokenDefault(1, "codePointsAlt", newByteSequence([]byte{0x6E})),
+ newTokenDefault(1, "codePointsAlt", newByteSequence([]byte{0xCE, 0xBD})),
+ newTokenDefault(1, "codePointsAlt", newByteSequence([]byte{0xE3, 0x81, 0xAB})),
+ newTokenDefault(1, "codePointsAlt", newByteSequence([]byte{0xF0, 0x9F, 0x98, 0xB8})),
+ newEOFTokenDefault(),
+ },
+ },
+ {
+ lspec: &spec.LexSpec{
+ Entries: []*spec.LexEntry{
+ newLexEntryDefaultNOP("white_space", ` *`),
+ newLexEntry([]string{"default"}, "string_open", `"`, "string", false),
+ newLexEntry([]string{"string"}, "escape_sequence", `\\[n"\\]`, "", false),
+ newLexEntry([]string{"string"}, "char_sequence", `[^"\\]*`, "", false),
+ newLexEntry([]string{"string"}, "string_close", `"`, "", true),
+ },
+ },
+ src: `"" "Hello world.\n\"Hello world.\""`,
+ tokens: []*Token{
+ newToken(1, "default", 2, "string_open", newByteSequence([]byte(`"`))),
+ newToken(2, "string", 3, "string_close", newByteSequence([]byte(`"`))),
+ newToken(1, "default", 1, "white_space", newByteSequence([]byte(` `))),
+ newToken(1, "default", 2, "string_open", newByteSequence([]byte(`"`))),
+ newToken(2, "string", 2, "char_sequence", newByteSequence([]byte(`Hello world.`))),
+ newToken(2, "string", 1, "escape_sequence", newByteSequence([]byte(`\n`))),
+ newToken(2, "string", 1, "escape_sequence", newByteSequence([]byte(`\"`))),
+ newToken(2, "string", 2, "char_sequence", newByteSequence([]byte(`Hello world.`))),
+ newToken(2, "string", 1, "escape_sequence", newByteSequence([]byte(`\"`))),
+ newToken(2, "string", 3, "string_close", newByteSequence([]byte(`"`))),
+ newEOFTokenDefault(),
+ },
+ },
+ {
+ lspec: &spec.LexSpec{
+ Entries: []*spec.LexEntry{
+ // `white_space` is enabled in multiple modes.
+ newLexEntry([]string{"default", "state_a", "state_b"}, "white_space", ` *`, "", false),
+ newLexEntry([]string{"default"}, "char_a", `a`, "state_a", false),
+ newLexEntry([]string{"state_a"}, "char_b", `b`, "state_b", false),
+ newLexEntry([]string{"state_a"}, "back_from_a", `<`, "", true),
+ newLexEntry([]string{"state_b"}, "back_from_b", `<`, "", true),
+ },
+ },
+ src: ` a b < < `,
+ tokens: []*Token{
+ newToken(1, "default", 1, "white_space", newByteSequence([]byte(` `))),
+ newToken(1, "default", 2, "char_a", newByteSequence([]byte(`a`))),
+ newToken(2, "state_a", 1, "white_space", newByteSequence([]byte(` `))),
+ newToken(2, "state_a", 2, "char_b", newByteSequence([]byte(`b`))),
+ newToken(3, "state_b", 1, "white_space", newByteSequence([]byte(` `))),
+ newToken(3, "state_b", 2, "back_from_b", newByteSequence([]byte(`<`))),
+ newToken(2, "state_a", 1, "white_space", newByteSequence([]byte(` `))),
+ newToken(2, "state_a", 3, "back_from_a", newByteSequence([]byte(`<`))),
+ newToken(1, "default", 1, "white_space", newByteSequence([]byte(` `))),
+ newEOFTokenDefault(),
},
},
}
@@ -479,8 +554,8 @@ func TestLexer_Next(t *testing.T) {
func TestLexer_PeekN(t *testing.T) {
clspec, err := compiler.Compile(&spec.LexSpec{
Entries: []*spec.LexEntry{
- newLexEntry("t1", "foo"),
- newLexEntry("t2", "bar"),
+ newLexEntryDefaultNOP("t1", "foo"),
+ newLexEntryDefaultNOP("t2", "bar"),
},
})
if err != nil {
@@ -492,9 +567,9 @@ func TestLexer_PeekN(t *testing.T) {
}
expectedTokens := []*Token{
- newToken(1, "t1", []byte("foo")),
- newToken(2, "t2", []byte("bar")),
- newEOFToken(),
+ newTokenDefault(1, "t1", []byte("foo")),
+ newTokenDefault(2, "t2", []byte("bar")),
+ newEOFTokenDefault(),
}
tok, err := lex.Peek1()
@@ -539,7 +614,13 @@ func TestLexer_PeekN(t *testing.T) {
func testToken(t *testing.T, expected, actual *Token) {
t.Helper()
- if actual.ID != expected.ID || actual.Kind != expected.Kind || !bytes.Equal(actual.Match, expected.Match) || actual.EOF != expected.EOF || actual.Invalid != expected.Invalid {
- t.Errorf("unexpected token; want: %v (\"%v\"), got: %v (\"%v\")", expected, string(expected.Match), actual, string(actual.Match))
+ if actual.Mode != expected.Mode ||
+ actual.ModeName != expected.ModeName ||
+ actual.ID != expected.ID ||
+ actual.Kind != expected.Kind ||
+ !bytes.Equal(actual.Match, expected.Match) ||
+ actual.EOF != expected.EOF ||
+ actual.Invalid != expected.Invalid {
+ t.Fatalf(`unexpected token; want: %v ("%v"), got: %v ("%v")`, expected, string(expected.Match), actual, string(actual.Match))
}
}
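
Finally, to see the new per-token mode information end to end, a caller could drive the lexer over a mode-switching spec along these lines. This is a sketch based only on the APIs visible in this diff (compiler.Compile, NewLexer, Next, and the exported Token fields); the driver import path is an assumption, the spec is a trimmed-down variant of the string-lexing test case above, and error handling is abbreviated.

package main

import (
	"fmt"
	"strings"

	"github.com/nihei9/maleeni/compiler"
	"github.com/nihei9/maleeni/driver"
	"github.com/nihei9/maleeni/spec"
)

func main() {
	// A two-mode spec: `string_open` pushes the `string` mode and
	// `string_close` pops it, as in the test case above.
	clspec, err := compiler.Compile(&spec.LexSpec{
		Entries: []*spec.LexEntry{
			{Kind: "white_space", Pattern: ` *`, Modes: []spec.LexModeName{"default"}},
			{Kind: "string_open", Pattern: `"`, Modes: []spec.LexModeName{"default"}, Push: "string"},
			{Kind: "char_sequence", Pattern: `[^"\\]*`, Modes: []spec.LexModeName{"string"}},
			{Kind: "string_close", Pattern: `"`, Modes: []spec.LexModeName{"string"}, Pop: true},
		},
	})
	if err != nil {
		panic(err)
	}
	lex, err := driver.NewLexer(clspec, strings.NewReader(`"Hello world."`))
	if err != nil {
		panic(err)
	}
	for {
		tok, err := lex.Next()
		if err != nil {
			panic(err)
		}
		if tok.EOF {
			break
		}
		// Each token now carries the mode it was recognized in.
		fmt.Printf("mode=%v kind=%v text=%q\n", tok.ModeName, tok.Kind, tok.Text)
	}
}
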