Diffstat (limited to 'driver/lexer_test.go')
-rw-r--r-- | driver/lexer_test.go | 421
1 file changed, 251 insertions, 170 deletions
diff --git a/driver/lexer_test.go b/driver/lexer_test.go
index 26b5d49..d3edb3c 100644
--- a/driver/lexer_test.go
+++ b/driver/lexer_test.go
@@ -10,13 +10,38 @@ import (
 	"github.com/nihei9/maleeni/spec"
 )
 
-func newLexEntry(kind string, pattern string) *spec.LexEntry {
+func newLexEntry(modes []string, kind string, pattern string, push string, pop bool) *spec.LexEntry {
+	ms := []spec.LexModeName{}
+	for _, m := range modes {
+		ms = append(ms, spec.LexModeName(m))
+	}
+	return &spec.LexEntry{
+		Kind:    spec.LexKind(kind),
+		Pattern: spec.LexPattern(pattern),
+		Modes:   ms,
+		Push:    spec.LexModeName(push),
+		Pop:     pop,
+	}
+}
+
+func newLexEntryDefaultNOP(kind string, pattern string) *spec.LexEntry {
 	return &spec.LexEntry{
 		Kind:    spec.LexKind(kind),
 		Pattern: spec.LexPattern(pattern),
+		Modes: []spec.LexModeName{
+			spec.LexModeNameDefault,
+		},
 	}
 }
 
+func newTokenDefault(id int, kind string, match byteSequence) *Token {
+	return newToken(spec.LexModeNumDefault, spec.LexModeNameDefault, id, kind, match)
+}
+
+func newEOFTokenDefault() *Token {
+	return newEOFToken(spec.LexModeNumDefault, spec.LexModeNameDefault)
+}
+
 func TestLexer_Next(t *testing.T) {
 	test := []struct {
 		lspec  *spec.LexSpec
@@ -26,58 +51,58 @@ func TestLexer_Next(t *testing.T) {
 		{
 			lspec: &spec.LexSpec{
 				Entries: []*spec.LexEntry{
-					newLexEntry("t1", "(a|b)*abb"),
-					newLexEntry("t2", " +"),
+					newLexEntryDefaultNOP("t1", "(a|b)*abb"),
+					newLexEntryDefaultNOP("t2", " +"),
 				},
 			},
 			src: "abb aabb aaabb babb bbabb abbbabb",
 			tokens: []*Token{
-				newToken(1, "t1", newByteSequence([]byte("abb"))),
-				newToken(2, "t2", newByteSequence([]byte(" "))),
-				newToken(1, "t1", newByteSequence([]byte("aabb"))),
-				newToken(2, "t2", newByteSequence([]byte(" "))),
-				newToken(1, "t1", newByteSequence([]byte("aaabb"))),
-				newToken(2, "t2", newByteSequence([]byte(" "))),
-				newToken(1, "t1", newByteSequence([]byte("babb"))),
-				newToken(2, "t2", newByteSequence([]byte(" "))),
-				newToken(1, "t1", newByteSequence([]byte("bbabb"))),
-				newToken(2, "t2", newByteSequence([]byte(" "))),
-				newToken(1, "t1", newByteSequence([]byte("abbbabb"))),
-				newEOFToken(),
+				newTokenDefault(1, "t1", newByteSequence([]byte("abb"))),
+				newTokenDefault(2, "t2", newByteSequence([]byte(" "))),
+				newTokenDefault(1, "t1", newByteSequence([]byte("aabb"))),
+				newTokenDefault(2, "t2", newByteSequence([]byte(" "))),
+				newTokenDefault(1, "t1", newByteSequence([]byte("aaabb"))),
+				newTokenDefault(2, "t2", newByteSequence([]byte(" "))),
+				newTokenDefault(1, "t1", newByteSequence([]byte("babb"))),
+				newTokenDefault(2, "t2", newByteSequence([]byte(" "))),
+				newTokenDefault(1, "t1", newByteSequence([]byte("bbabb"))),
+				newTokenDefault(2, "t2", newByteSequence([]byte(" "))),
+				newTokenDefault(1, "t1", newByteSequence([]byte("abbbabb"))),
+				newEOFTokenDefault(),
 			},
 		},
 		{
 			lspec: &spec.LexSpec{
 				Entries: []*spec.LexEntry{
-					newLexEntry("t1", "b?a+"),
-					newLexEntry("t2", "(ab)?(cd)+"),
-					newLexEntry("t3", " +"),
+					newLexEntryDefaultNOP("t1", "b?a+"),
+					newLexEntryDefaultNOP("t2", "(ab)?(cd)+"),
+					newLexEntryDefaultNOP("t3", " +"),
 				},
 			},
 			src: "ba baaa a aaa abcd abcdcdcd cd cdcdcd",
 			tokens: []*Token{
-				newToken(1, "t1", newByteSequence([]byte("ba"))),
-				newToken(3, "t3", newByteSequence([]byte(" "))),
-				newToken(1, "t1", newByteSequence([]byte("baaa"))),
-				newToken(3, "t3", newByteSequence([]byte(" "))),
-				newToken(1, "t1", newByteSequence([]byte("a"))),
-				newToken(3, "t3", newByteSequence([]byte(" "))),
-				newToken(1, "t1", newByteSequence([]byte("aaa"))),
-				newToken(3, "t3", newByteSequence([]byte(" "))),
-				newToken(2, "t2", newByteSequence([]byte("abcd"))),
-				newToken(3, "t3", newByteSequence([]byte(" "))),
-				newToken(2, "t2", newByteSequence([]byte("abcdcdcd"))),
-				newToken(3, "t3", newByteSequence([]byte(" "))),
-				newToken(2, "t2", newByteSequence([]byte("cd"))),
-				newToken(3, "t3", newByteSequence([]byte(" "))),
-				newToken(2, "t2", newByteSequence([]byte("cdcdcd"))),
-				newEOFToken(),
+				newTokenDefault(1, "t1", newByteSequence([]byte("ba"))),
+				newTokenDefault(3, "t3", newByteSequence([]byte(" "))),
+				newTokenDefault(1, "t1", newByteSequence([]byte("baaa"))),
+				newTokenDefault(3, "t3", newByteSequence([]byte(" "))),
+				newTokenDefault(1, "t1", newByteSequence([]byte("a"))),
+				newTokenDefault(3, "t3", newByteSequence([]byte(" "))),
+				newTokenDefault(1, "t1", newByteSequence([]byte("aaa"))),
+				newTokenDefault(3, "t3", newByteSequence([]byte(" "))),
+				newTokenDefault(2, "t2", newByteSequence([]byte("abcd"))),
+				newTokenDefault(3, "t3", newByteSequence([]byte(" "))),
+				newTokenDefault(2, "t2", newByteSequence([]byte("abcdcdcd"))),
+				newTokenDefault(3, "t3", newByteSequence([]byte(" "))),
+				newTokenDefault(2, "t2", newByteSequence([]byte("cd"))),
+				newTokenDefault(3, "t3", newByteSequence([]byte(" "))),
+				newTokenDefault(2, "t2", newByteSequence([]byte("cdcdcd"))),
+				newEOFTokenDefault(),
 			},
 		},
 		{
 			lspec: &spec.LexSpec{
 				Entries: []*spec.LexEntry{
-					newLexEntry("t1", "."),
+					newLexEntryDefaultNOP("t1", "."),
 				},
 			},
 			src: string([]byte{
@@ -99,45 +124,45 @@ func TestLexer_Next(t *testing.T) {
 				0xf4, 0x8f, 0xbf, 0xbf,
 			}),
 			tokens: []*Token{
-				newToken(1, "t1", newByteSequence([]byte{0x00})),
-				newToken(1, "t1", newByteSequence([]byte{0x7f})),
-				newToken(1, "t1", newByteSequence([]byte{0xc2, 0x80})),
-				newToken(1, "t1", newByteSequence([]byte{0xdf, 0xbf})),
-				newToken(1, "t1", newByteSequence([]byte{0xe1, 0x80, 0x80})),
-				newToken(1, "t1", newByteSequence([]byte{0xec, 0xbf, 0xbf})),
-				newToken(1, "t1", newByteSequence([]byte{0xed, 0x80, 0x80})),
-				newToken(1, "t1", newByteSequence([]byte{0xed, 0x9f, 0xbf})),
-				newToken(1, "t1", newByteSequence([]byte{0xee, 0x80, 0x80})),
-				newToken(1, "t1", newByteSequence([]byte{0xef, 0xbf, 0xbf})),
-				newToken(1, "t1", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})),
-				newToken(1, "t1", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbf})),
-				newToken(1, "t1", newByteSequence([]byte{0xf1, 0x80, 0x80, 0x80})),
-				newToken(1, "t1", newByteSequence([]byte{0xf3, 0xbf, 0xbf, 0xbf})),
-				newToken(1, "t1", newByteSequence([]byte{0xf4, 0x80, 0x80, 0x80})),
-				newToken(1, "t1", newByteSequence([]byte{0xf4, 0x8f, 0xbf, 0xbf})),
-				newEOFToken(),
+				newTokenDefault(1, "t1", newByteSequence([]byte{0x00})),
+				newTokenDefault(1, "t1", newByteSequence([]byte{0x7f})),
+				newTokenDefault(1, "t1", newByteSequence([]byte{0xc2, 0x80})),
+				newTokenDefault(1, "t1", newByteSequence([]byte{0xdf, 0xbf})),
+				newTokenDefault(1, "t1", newByteSequence([]byte{0xe1, 0x80, 0x80})),
+				newTokenDefault(1, "t1", newByteSequence([]byte{0xec, 0xbf, 0xbf})),
+				newTokenDefault(1, "t1", newByteSequence([]byte{0xed, 0x80, 0x80})),
+				newTokenDefault(1, "t1", newByteSequence([]byte{0xed, 0x9f, 0xbf})),
+				newTokenDefault(1, "t1", newByteSequence([]byte{0xee, 0x80, 0x80})),
+				newTokenDefault(1, "t1", newByteSequence([]byte{0xef, 0xbf, 0xbf})),
+				newTokenDefault(1, "t1", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})),
+				newTokenDefault(1, "t1", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbf})),
+				newTokenDefault(1, "t1", newByteSequence([]byte{0xf1, 0x80, 0x80, 0x80})),
+				newTokenDefault(1, "t1", newByteSequence([]byte{0xf3, 0xbf, 0xbf, 0xbf})),
+				newTokenDefault(1, "t1", newByteSequence([]byte{0xf4, 0x80, 0x80, 0x80})),
+				newTokenDefault(1, "t1", newByteSequence([]byte{0xf4, 0x8f, 0xbf, 0xbf})),
+				newEOFTokenDefault(),
 			},
 		},
 		{
 			lspec: &spec.LexSpec{
 				Entries: []*spec.LexEntry{
-					newLexEntry("t1", "[ab.*+?|()[\\]]"),
+					newLexEntryDefaultNOP("t1", "[ab.*+?|()[\\]]"),
 				},
 			},
 			src: "ab.*+?|()[]",
 			tokens: []*Token{
-				newToken(1, "t1", newByteSequence([]byte("a"))),
-				newToken(1, "t1", newByteSequence([]byte("b"))),
-				newToken(1, "t1", newByteSequence([]byte("."))),
-				newToken(1, "t1", newByteSequence([]byte("*"))),
-				newToken(1, "t1", newByteSequence([]byte("+"))),
-				newToken(1, "t1", newByteSequence([]byte("?"))),
-				newToken(1, "t1", newByteSequence([]byte("|"))),
-				newToken(1, "t1", newByteSequence([]byte("("))),
-				newToken(1, "t1", newByteSequence([]byte(")"))),
-				newToken(1, "t1", newByteSequence([]byte("["))),
-				newToken(1, "t1", newByteSequence([]byte("]"))),
-				newEOFToken(),
+				newTokenDefault(1, "t1", newByteSequence([]byte("a"))),
+				newTokenDefault(1, "t1", newByteSequence([]byte("b"))),
+				newTokenDefault(1, "t1", newByteSequence([]byte("."))),
+				newTokenDefault(1, "t1", newByteSequence([]byte("*"))),
+				newTokenDefault(1, "t1", newByteSequence([]byte("+"))),
+				newTokenDefault(1, "t1", newByteSequence([]byte("?"))),
+				newTokenDefault(1, "t1", newByteSequence([]byte("|"))),
+				newTokenDefault(1, "t1", newByteSequence([]byte("("))),
+				newTokenDefault(1, "t1", newByteSequence([]byte(")"))),
+				newTokenDefault(1, "t1", newByteSequence([]byte("["))),
+				newTokenDefault(1, "t1", newByteSequence([]byte("]"))),
+				newEOFTokenDefault(),
 			},
 		},
 		{
@@ -149,7 +174,7 @@ func TestLexer_Next(t *testing.T) {
 					// maleeni cannot handle the null character in patterns because compiler.lexer,
 					// specifically read() and restore(), treats the null character as the absence of a symbol.
 					// If a pattern needs a null character, use the code point expression \u{0000}.
-					newLexEntry("1ByteChar", "[\x01-\x7f]"),
+					newLexEntryDefaultNOP("1ByteChar", "[\x01-\x7f]"),
 				},
 			},
 			src: string([]byte{
@@ -159,18 +184,18 @@ func TestLexer_Next(t *testing.T) {
 				0x7f,
 			}),
 			tokens: []*Token{
-				newToken(1, "1ByteChar", newByteSequence([]byte{0x01})),
-				newToken(1, "1ByteChar", newByteSequence([]byte{0x02})),
-				newToken(1, "1ByteChar", newByteSequence([]byte{0x7e})),
-				newToken(1, "1ByteChar", newByteSequence([]byte{0x7f})),
-				newEOFToken(),
+				newTokenDefault(1, "1ByteChar", newByteSequence([]byte{0x01})),
+				newTokenDefault(1, "1ByteChar", newByteSequence([]byte{0x02})),
+				newTokenDefault(1, "1ByteChar", newByteSequence([]byte{0x7e})),
+				newTokenDefault(1, "1ByteChar", newByteSequence([]byte{0x7f})),
+				newEOFTokenDefault(),
 			},
 		},
 		{
 			lspec: &spec.LexSpec{
 				Entries: []*spec.LexEntry{
 					// all 2 byte characters
-					newLexEntry("2ByteChar", "[\xc2\x80-\xdf\xbf]"),
+					newLexEntryDefaultNOP("2ByteChar", "[\xc2\x80-\xdf\xbf]"),
 				},
 			},
 			src: string([]byte{
@@ -180,33 +205,33 @@ func TestLexer_Next(t *testing.T) {
 				0xdf, 0xbf,
 			}),
 			tokens: []*Token{
-				newToken(1, "2ByteChar", newByteSequence([]byte{0xc2, 0x80})),
-				newToken(1, "2ByteChar", newByteSequence([]byte{0xc2, 0x81})),
-				newToken(1, "2ByteChar", newByteSequence([]byte{0xdf, 0xbe})),
-				newToken(1, "2ByteChar", newByteSequence([]byte{0xdf, 0xbf})),
-				newEOFToken(),
+				newTokenDefault(1, "2ByteChar", newByteSequence([]byte{0xc2, 0x80})),
+				newTokenDefault(1, "2ByteChar", newByteSequence([]byte{0xc2, 0x81})),
+				newTokenDefault(1, "2ByteChar", newByteSequence([]byte{0xdf, 0xbe})),
+				newTokenDefault(1, "2ByteChar", newByteSequence([]byte{0xdf, 0xbf})),
+				newEOFTokenDefault(),
 			},
 		},
 		{
 			lspec: &spec.LexSpec{
 				Entries: []*spec.LexEntry{
 					// All bytes are the same.
-					newLexEntry("3ByteChar", "[\xe0\xa0\x80-\xe0\xa0\x80]"),
+					newLexEntryDefaultNOP("3ByteChar", "[\xe0\xa0\x80-\xe0\xa0\x80]"),
 				},
 			},
 			src: string([]byte{
 				0xe0, 0xa0, 0x80,
 			}),
 			tokens: []*Token{
-				newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x80})),
-				newEOFToken(),
+				newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x80})),
+				newEOFTokenDefault(),
 			},
 		},
 		{
 			lspec: &spec.LexSpec{
 				Entries: []*spec.LexEntry{
 					// The first two bytes are the same.
-					newLexEntry("3ByteChar", "[\xe0\xa0\x80-\xe0\xa0\xbf]"),
+					newLexEntryDefaultNOP("3ByteChar", "[\xe0\xa0\x80-\xe0\xa0\xbf]"),
 				},
 			},
 			src: string([]byte{
@@ -216,18 +241,18 @@ func TestLexer_Next(t *testing.T) {
 				0xe0, 0xa0, 0xbf,
 			}),
 			tokens: []*Token{
-				newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x80})),
-				newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x81})),
-				newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0xbe})),
-				newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0xbf})),
-				newEOFToken(),
+				newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x80})),
+				newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x81})),
+				newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0xbe})),
+				newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0xbf})),
+				newEOFTokenDefault(),
 			},
 		},
 		{
 			lspec: &spec.LexSpec{
 				Entries: []*spec.LexEntry{
 					// The first byte is the same.
-					newLexEntry("3ByteChar", "[\xe0\xa0\x80-\xe0\xbf\xbf]"),
+					newLexEntryDefaultNOP("3ByteChar", "[\xe0\xa0\x80-\xe0\xbf\xbf]"),
 				},
 			},
 			src: string([]byte{
@@ -237,18 +262,18 @@ func TestLexer_Next(t *testing.T) {
 				0xe0, 0xbf, 0xbf,
 			}),
 			tokens: []*Token{
-				newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x80})),
-				newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x81})),
-				newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xbf, 0xbe})),
-				newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xbf, 0xbf})),
-				newEOFToken(),
+				newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x80})),
+				newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x81})),
+				newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xbf, 0xbe})),
+				newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xbf, 0xbf})),
+				newEOFTokenDefault(),
 			},
 		},
 		{
 			lspec: &spec.LexSpec{
 				Entries: []*spec.LexEntry{
 					// all 3 byte characters
-					newLexEntry("3ByteChar", "[\xe0\xa0\x80-\xef\xbf\xbf]"),
+					newLexEntryDefaultNOP("3ByteChar", "[\xe0\xa0\x80-\xef\xbf\xbf]"),
 				},
 			},
 			src: string([]byte{
@@ -270,45 +295,45 @@ func TestLexer_Next(t *testing.T) {
 				0xef, 0xbf, 0xbf,
 			}),
 			tokens: []*Token{
-				newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x80})),
-				newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x81})),
-				newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xbf, 0xbe})),
-				newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xbf, 0xbf})),
-				newToken(1, "3ByteChar", newByteSequence([]byte{0xe1, 0x80, 0x80})),
-				newToken(1, "3ByteChar", newByteSequence([]byte{0xe1, 0x80, 0x81})),
-				newToken(1, "3ByteChar", newByteSequence([]byte{0xec, 0xbf, 0xbe})),
-				newToken(1, "3ByteChar", newByteSequence([]byte{0xec, 0xbf, 0xbf})),
-				newToken(1, "3ByteChar", newByteSequence([]byte{0xed, 0x80, 0x80})),
-				newToken(1, "3ByteChar", newByteSequence([]byte{0xed, 0x80, 0x81})),
-				newToken(1, "3ByteChar", newByteSequence([]byte{0xed, 0x9f, 0xbe})),
-				newToken(1, "3ByteChar", newByteSequence([]byte{0xed, 0x9f, 0xbf})),
-				newToken(1, "3ByteChar", newByteSequence([]byte{0xee, 0x80, 0x80})),
-				newToken(1, "3ByteChar", newByteSequence([]byte{0xee, 0x80, 0x81})),
-				newToken(1, "3ByteChar", newByteSequence([]byte{0xef, 0xbf, 0xbe})),
-				newToken(1, "3ByteChar", newByteSequence([]byte{0xef, 0xbf, 0xbf})),
-				newEOFToken(),
+				newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x80})),
+				newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x81})),
+				newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xbf, 0xbe})),
+				newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xbf, 0xbf})),
+				newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe1, 0x80, 0x80})),
+				newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe1, 0x80, 0x81})),
+				newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xec, 0xbf, 0xbe})),
+				newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xec, 0xbf, 0xbf})),
+				newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xed, 0x80, 0x80})),
+				newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xed, 0x80, 0x81})),
+				newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xed, 0x9f, 0xbe})),
+				newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xed, 0x9f, 0xbf})),
+				newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xee, 0x80, 0x80})),
+				newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xee, 0x80, 0x81})),
+				newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xef, 0xbf, 0xbe})),
+				newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xef, 0xbf, 0xbf})),
+				newEOFTokenDefault(),
 			},
 		},
 		{
 			lspec: &spec.LexSpec{
 				Entries: []*spec.LexEntry{
 					// All bytes are the same.
-					newLexEntry("4ByteChar", "[\xf0\x90\x80\x80-\xf0\x90\x80\x80]"),
+					newLexEntryDefaultNOP("4ByteChar", "[\xf0\x90\x80\x80-\xf0\x90\x80\x80]"),
 				},
 			},
 			src: string([]byte{
 				0xf0, 0x90, 0x80, 0x80,
 			}),
 			tokens: []*Token{
-				newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})),
-				newEOFToken(),
+				newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})),
+				newEOFTokenDefault(),
 			},
 		},
 		{
 			lspec: &spec.LexSpec{
 				Entries: []*spec.LexEntry{
 					// The first 3 bytes are the same.
-					newLexEntry("4ByteChar", "[\xf0\x90\x80\x80-\xf0\x90\x80\xbf]"),
+					newLexEntryDefaultNOP("4ByteChar", "[\xf0\x90\x80\x80-\xf0\x90\x80\xbf]"),
 				},
 			},
 			src: string([]byte{
@@ -318,18 +343,18 @@ func TestLexer_Next(t *testing.T) {
 				0xf0, 0x90, 0x80, 0xbf,
 			}),
 			tokens: []*Token{
-				newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})),
-				newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x81})),
-				newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0xbe})),
-				newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0xbf})),
-				newEOFToken(),
+				newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})),
+				newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x81})),
+				newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0xbe})),
+				newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0xbf})),
+				newEOFTokenDefault(),
 			},
 		},
 		{
 			lspec: &spec.LexSpec{
 				Entries: []*spec.LexEntry{
 					// The first 2 bytes are the same.
-					newLexEntry("4ByteChar", "[\xf0\x90\x80\x80-\xf0\x90\xbf\xbf]"),
+					newLexEntryDefaultNOP("4ByteChar", "[\xf0\x90\x80\x80-\xf0\x90\xbf\xbf]"),
 				},
 			},
 			src: string([]byte{
@@ -339,18 +364,18 @@ func TestLexer_Next(t *testing.T) {
 				0xf0, 0x90, 0xbf, 0xbf,
 			}),
 			tokens: []*Token{
-				newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})),
-				newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x81})),
-				newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0xbf, 0xbe})),
-				newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0xbf, 0xbf})),
-				newEOFToken(),
+				newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})),
+				newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x81})),
+				newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0xbf, 0xbe})),
+				newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0xbf, 0xbf})),
+				newEOFTokenDefault(),
 			},
 		},
 		{
 			lspec: &spec.LexSpec{
 				Entries: []*spec.LexEntry{
 					// The first byte is the same.
-					newLexEntry("4ByteChar", "[\xf0\x90\x80\x80-\xf0\xbf\xbf\xbf]"),
+					newLexEntryDefaultNOP("4ByteChar", "[\xf0\x90\x80\x80-\xf0\xbf\xbf\xbf]"),
 				},
 			},
 			src: string([]byte{
@@ -360,18 +385,18 @@ func TestLexer_Next(t *testing.T) {
 				0xf0, 0xbf, 0xbf, 0xbf,
 			}),
 			tokens: []*Token{
-				newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})),
-				newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x81})),
-				newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbe})),
-				newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbf})),
-				newEOFToken(),
+				newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})),
+				newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x81})),
+				newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbe})),
+				newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbf})),
+				newEOFTokenDefault(),
 			},
 		},
 		{
 			lspec: &spec.LexSpec{
 				Entries: []*spec.LexEntry{
 					// all 4 byte characters
-					newLexEntry("4ByteChar", "[\xf0\x90\x80\x80-\xf4\x8f\xbf\xbf]"),
+					newLexEntryDefaultNOP("4ByteChar", "[\xf0\x90\x80\x80-\xf4\x8f\xbf\xbf]"),
 				},
 			},
 			src: string([]byte{
@@ -389,64 +414,114 @@ func TestLexer_Next(t *testing.T) {
 				0xf4, 0x8f, 0xbf, 0xbf,
 			}),
 			tokens: []*Token{
-				newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})),
-				newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x81})),
-				newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbe})),
-				newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbf})),
-				newToken(1, "4ByteChar", newByteSequence([]byte{0xf1, 0x80, 0x80, 0x80})),
-				newToken(1, "4ByteChar", newByteSequence([]byte{0xf1, 0x80, 0x80, 0x81})),
-				newToken(1, "4ByteChar", newByteSequence([]byte{0xf3, 0xbf, 0xbf, 0xbe})),
-				newToken(1, "4ByteChar", newByteSequence([]byte{0xf3, 0xbf, 0xbf, 0xbf})),
-				newToken(1, "4ByteChar", newByteSequence([]byte{0xf4, 0x80, 0x80, 0x80})),
-				newToken(1, "4ByteChar", newByteSequence([]byte{0xf4, 0x80, 0x80, 0x81})),
-				newToken(1, "4ByteChar", newByteSequence([]byte{0xf4, 0x8f, 0xbf, 0xbe})),
-				newToken(1, "4ByteChar", newByteSequence([]byte{0xf4, 0x8f, 0xbf, 0xbf})),
-				newEOFToken(),
+				newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})),
+				newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x81})),
+				newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbe})),
+				newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbf})),
+				newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf1, 0x80, 0x80, 0x80})),
+				newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf1, 0x80, 0x80, 0x81})),
+				newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf3, 0xbf, 0xbf, 0xbe})),
+				newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf3, 0xbf, 0xbf, 0xbf})),
+				newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf4, 0x80, 0x80, 0x80})),
+				newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf4, 0x80, 0x80, 0x81})),
+				newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf4, 0x8f, 0xbf, 0xbe})),
+				newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf4, 0x8f, 0xbf, 0xbf})),
+				newEOFTokenDefault(),
 			},
 		},
 		{
 			lspec: &spec.LexSpec{
 				Entries: []*spec.LexEntry{
-					newLexEntry("NonNumber", "[^0-9]+[0-9]"),
+					newLexEntryDefaultNOP("NonNumber", "[^0-9]+[0-9]"),
 				},
 			},
 			src: "foo9",
 			tokens: []*Token{
-				newToken(1, "NonNumber", newByteSequence([]byte("foo9"))),
-				newEOFToken(),
+				newTokenDefault(1, "NonNumber", newByteSequence([]byte("foo9"))),
+				newEOFTokenDefault(),
 			},
 		},
 		{
 			lspec: &spec.LexSpec{
 				Entries: []*spec.LexEntry{
-					newLexEntry("1ByteChar", "\\u{006E}"),
-					newLexEntry("2ByteChar", "\\u{03BD}"),
-					newLexEntry("3ByteChar", "\\u{306B}"),
-					newLexEntry("4ByteChar", "\\u{01F638}"),
+					newLexEntryDefaultNOP("1ByteChar", "\\u{006E}"),
+					newLexEntryDefaultNOP("2ByteChar", "\\u{03BD}"),
+					newLexEntryDefaultNOP("3ByteChar", "\\u{306B}"),
+					newLexEntryDefaultNOP("4ByteChar", "\\u{01F638}"),
 				},
 			},
 			src: "nνに😸",
 			tokens: []*Token{
-				newToken(1, "1ByteChar", newByteSequence([]byte{0x6E})),
-				newToken(2, "2ByteChar", newByteSequence([]byte{0xCE, 0xBD})),
-				newToken(3, "3ByteChar", newByteSequence([]byte{0xE3, 0x81, 0xAB})),
-				newToken(4, "4ByteChar", newByteSequence([]byte{0xF0, 0x9F, 0x98, 0xB8})),
-				newEOFToken(),
+				newTokenDefault(1, "1ByteChar", newByteSequence([]byte{0x6E})),
+				newTokenDefault(2, "2ByteChar", newByteSequence([]byte{0xCE, 0xBD})),
+				newTokenDefault(3, "3ByteChar", newByteSequence([]byte{0xE3, 0x81, 0xAB})),
+				newTokenDefault(4, "4ByteChar", newByteSequence([]byte{0xF0, 0x9F, 0x98, 0xB8})),
+				newEOFTokenDefault(),
 			},
 		},
 		{
 			lspec: &spec.LexSpec{
 				Entries: []*spec.LexEntry{
-					newLexEntry("codePointsAlt", "[\\u{006E}\\u{03BD}\\u{306B}\\u{01F638}]"),
+					newLexEntryDefaultNOP("codePointsAlt", "[\\u{006E}\\u{03BD}\\u{306B}\\u{01F638}]"),
 				},
 			},
 			src: "nνに😸",
 			tokens: []*Token{
-				newToken(1, "codePointsAlt", newByteSequence([]byte{0x6E})),
-				newToken(1, "codePointsAlt", newByteSequence([]byte{0xCE, 0xBD})),
-				newToken(1, "codePointsAlt", newByteSequence([]byte{0xE3, 0x81, 0xAB})),
-				newToken(1, "codePointsAlt", newByteSequence([]byte{0xF0, 0x9F, 0x98, 0xB8})),
-				newEOFToken(),
+				newTokenDefault(1, "codePointsAlt", newByteSequence([]byte{0x6E})),
+				newTokenDefault(1, "codePointsAlt", newByteSequence([]byte{0xCE, 0xBD})),
+				newTokenDefault(1, "codePointsAlt", newByteSequence([]byte{0xE3, 0x81, 0xAB})),
+				newTokenDefault(1, "codePointsAlt", newByteSequence([]byte{0xF0, 0x9F, 0x98, 0xB8})),
+				newEOFTokenDefault(),
+			},
+		},
+		{
+			lspec: &spec.LexSpec{
+				Entries: []*spec.LexEntry{
+					newLexEntryDefaultNOP("white_space", ` *`),
+					newLexEntry([]string{"default"}, "string_open", `"`, "string", false),
+					newLexEntry([]string{"string"}, "escape_sequence", `\\[n"\\]`, "", false),
+					newLexEntry([]string{"string"}, "char_sequence", `[^"\\]*`, "", false),
+					newLexEntry([]string{"string"}, "string_close", `"`, "", true),
+				},
+			},
+			src: `"" "Hello world.\n\"Hello world.\""`,
+			tokens: []*Token{
+				newToken(1, "default", 2, "string_open", newByteSequence([]byte(`"`))),
+				newToken(2, "string", 3, "string_close", newByteSequence([]byte(`"`))),
+				newToken(1, "default", 1, "white_space", newByteSequence([]byte(` `))),
+				newToken(1, "default", 2, "string_open", newByteSequence([]byte(`"`))),
+				newToken(2, "string", 2, "char_sequence", newByteSequence([]byte(`Hello world.`))),
+				newToken(2, "string", 1, "escape_sequence", newByteSequence([]byte(`\n`))),
+				newToken(2, "string", 1, "escape_sequence", newByteSequence([]byte(`\"`))),
+				newToken(2, "string", 2, "char_sequence", newByteSequence([]byte(`Hello world.`))),
+				newToken(2, "string", 1, "escape_sequence", newByteSequence([]byte(`\"`))),
+				newToken(2, "string", 3, "string_close", newByteSequence([]byte(`"`))),
+				newEOFTokenDefault(),
+			},
+		},
+		{
+			lspec: &spec.LexSpec{
+				Entries: []*spec.LexEntry{
+					// `white_space` is enabled in multiple modes.
+					newLexEntry([]string{"default", "state_a", "state_b"}, "white_space", ` *`, "", false),
+					newLexEntry([]string{"default"}, "char_a", `a`, "state_a", false),
+					newLexEntry([]string{"state_a"}, "char_b", `b`, "state_b", false),
+					newLexEntry([]string{"state_a"}, "back_from_a", `<`, "", true),
+					newLexEntry([]string{"state_b"}, "back_from_b", `<`, "", true),
+				},
+			},
+			src: ` a b < < `,
+			tokens: []*Token{
+				newToken(1, "default", 1, "white_space", newByteSequence([]byte(` `))),
+				newToken(1, "default", 2, "char_a", newByteSequence([]byte(`a`))),
+				newToken(2, "state_a", 1, "white_space", newByteSequence([]byte(` `))),
+				newToken(2, "state_a", 2, "char_b", newByteSequence([]byte(`b`))),
+				newToken(3, "state_b", 1, "white_space", newByteSequence([]byte(` `))),
+				newToken(3, "state_b", 2, "back_from_b", newByteSequence([]byte(`<`))),
+				newToken(2, "state_a", 1, "white_space", newByteSequence([]byte(` `))),
+				newToken(2, "state_a", 3, "back_from_a", newByteSequence([]byte(`<`))),
+				newToken(1, "default", 1, "white_space", newByteSequence([]byte(` `))),
+				newEOFTokenDefault(),
 			},
 		},
 	}
@@ -479,8 +554,8 @@ func TestLexer_Next(t *testing.T) {
 func TestLexer_PeekN(t *testing.T) {
 	clspec, err := compiler.Compile(&spec.LexSpec{
 		Entries: []*spec.LexEntry{
-			newLexEntry("t1", "foo"),
-			newLexEntry("t2", "bar"),
+			newLexEntryDefaultNOP("t1", "foo"),
+			newLexEntryDefaultNOP("t2", "bar"),
 		},
 	})
 	if err != nil {
@@ -492,9 +567,9 @@ func TestLexer_PeekN(t *testing.T) {
 	}
 
 	expectedTokens := []*Token{
-		newToken(1, "t1", []byte("foo")),
-		newToken(2, "t2", []byte("bar")),
-		newEOFToken(),
+		newTokenDefault(1, "t1", []byte("foo")),
+		newTokenDefault(2, "t2", []byte("bar")),
+		newEOFTokenDefault(),
 	}
 
 	tok, err := lex.Peek1()
@@ -539,7 +614,13 @@ func TestLexer_PeekN(t *testing.T) {
 func testToken(t *testing.T, expected, actual *Token) {
 	t.Helper()
 
-	if actual.ID != expected.ID || actual.Kind != expected.Kind || !bytes.Equal(actual.Match, expected.Match) || actual.EOF != expected.EOF || actual.Invalid != expected.Invalid {
-		t.Errorf("unexpected token; want: %v (\"%v\"), got: %v (\"%v\")", expected, string(expected.Match), actual, string(actual.Match))
+	if actual.Mode != expected.Mode ||
+		actual.ModeName != expected.ModeName ||
+		actual.ID != expected.ID ||
+		actual.Kind != expected.Kind ||
+		!bytes.Equal(actual.Match, expected.Match) ||
+		actual.EOF != expected.EOF ||
+		actual.Invalid != expected.Invalid {
+		t.Fatalf(`unexpected token; want: %v ("%v"), got: %v ("%v")`, expected, string(expected.Match), actual, string(actual.Match))
 	}
 }