package driver

import (
	"bytes"
	"fmt"
	"strings"
	"testing"

	"github.com/nihei9/maleeni/compiler"
	"github.com/nihei9/maleeni/spec"
)

func newLexEntry(modes []string, kind string, pattern string, push string, pop bool) *spec.LexEntry {
	ms := []spec.LexModeName{}
	for _, m := range modes {
		ms = append(ms, spec.LexModeName(m))
	}
	return &spec.LexEntry{
		Kind:    spec.LexKind(kind),
		Pattern: spec.LexPattern(pattern),
		Modes:   ms,
		Push:    spec.LexModeName(push),
		Pop:     pop,
	}
}

func newLexEntryDefaultNOP(kind string, pattern string) *spec.LexEntry {
	return &spec.LexEntry{
		Kind:    spec.LexKind(kind),
		Pattern: spec.LexPattern(pattern),
		Modes: []spec.LexModeName{
			spec.LexModeNameDefault,
		},
	}
}

func newLexEntryFragment(kind string, pattern string) *spec.LexEntry {
	return &spec.LexEntry{
		Kind:     spec.LexKind(kind),
		Pattern:  spec.LexPattern(pattern),
		Fragment: true,
	}
}

func newTokenDefault(id int, kind string, match byteSequence) *Token {
	return newToken(spec.LexModeNumDefault, spec.LexModeNameDefault, id, kind, match)
}

func newEOFTokenDefault() *Token {
	return newEOFToken(spec.LexModeNumDefault, spec.LexModeNameDefault)
}

func TestLexer_Next(t *testing.T) {
	test := []struct {
		lspec           *spec.LexSpec
		src             string
		tokens          []*Token
		passiveModeTran bool
		tran            func(l *Lexer, tok *Token) error
	}{
		{
			lspec: &spec.LexSpec{
				Entries: []*spec.LexEntry{
					newLexEntryDefaultNOP("t1", "(a|b)*abb"),
					newLexEntryDefaultNOP("t2", " +"),
				},
			},
			src: "abb aabb aaabb babb bbabb abbbabb",
			tokens: []*Token{
				newTokenDefault(1, "t1", newByteSequence([]byte("abb"))),
				newTokenDefault(2, "t2", newByteSequence([]byte(" "))),
				newTokenDefault(1, "t1", newByteSequence([]byte("aabb"))),
				newTokenDefault(2, "t2", newByteSequence([]byte(" "))),
				newTokenDefault(1, "t1", newByteSequence([]byte("aaabb"))),
				newTokenDefault(2, "t2", newByteSequence([]byte(" "))),
				newTokenDefault(1, "t1", newByteSequence([]byte("babb"))),
				newTokenDefault(2, "t2", newByteSequence([]byte(" "))),
				newTokenDefault(1, "t1", newByteSequence([]byte("bbabb"))),
				newTokenDefault(2, "t2", newByteSequence([]byte(" "))),
				newTokenDefault(1, "t1", newByteSequence([]byte("abbbabb"))),
				newEOFTokenDefault(),
			},
		},
		{
			lspec: &spec.LexSpec{
				Entries: []*spec.LexEntry{
					newLexEntryDefaultNOP("t1", "b?a+"),
					newLexEntryDefaultNOP("t2", "(ab)?(cd)+"),
					newLexEntryDefaultNOP("t3", " +"),
				},
			},
			src: "ba baaa a aaa abcd abcdcdcd cd cdcdcd",
			tokens: []*Token{
				newTokenDefault(1, "t1", newByteSequence([]byte("ba"))),
				newTokenDefault(3, "t3", newByteSequence([]byte(" "))),
				newTokenDefault(1, "t1", newByteSequence([]byte("baaa"))),
				newTokenDefault(3, "t3", newByteSequence([]byte(" "))),
				newTokenDefault(1, "t1", newByteSequence([]byte("a"))),
				newTokenDefault(3, "t3", newByteSequence([]byte(" "))),
				newTokenDefault(1, "t1", newByteSequence([]byte("aaa"))),
				newTokenDefault(3, "t3", newByteSequence([]byte(" "))),
				newTokenDefault(2, "t2", newByteSequence([]byte("abcd"))),
				newTokenDefault(3, "t3", newByteSequence([]byte(" "))),
				newTokenDefault(2, "t2", newByteSequence([]byte("abcdcdcd"))),
				newTokenDefault(3, "t3", newByteSequence([]byte(" "))),
				newTokenDefault(2, "t2", newByteSequence([]byte("cd"))),
				newTokenDefault(3, "t3", newByteSequence([]byte(" "))),
				newTokenDefault(2, "t2", newByteSequence([]byte("cdcdcd"))),
				newEOFTokenDefault(),
			},
		},
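		// The next case checks that '.' matches exactly one UTF-8 encoded
		// character. The source enumerates boundary byte sequences of the
		// well-formed UTF-8 table (RFC 3629): U+0000/U+007F for the 1-byte
		// form, U+0080/U+07FF for the 2-byte form, the row boundaries of the
		// 3- and 4-byte forms (skipping the surrogate range), up to U+10FFFF.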
		{
			lspec: &spec.LexSpec{
				Entries: []*spec.LexEntry{
					newLexEntryDefaultNOP("t1", "."),
				},
			},
			src: string([]byte{
				0x00,
				0x7f,
				0xc2, 0x80,
				0xdf, 0xbf,
				0xe1, 0x80, 0x80,
				0xec, 0xbf, 0xbf,
				0xed, 0x80, 0x80,
				0xed, 0x9f, 0xbf,
				0xee, 0x80, 0x80,
				0xef, 0xbf, 0xbf,
				0xf0, 0x90, 0x80, 0x80,
				0xf0, 0xbf, 0xbf, 0xbf,
				0xf1, 0x80, 0x80, 0x80,
				0xf3, 0xbf, 0xbf, 0xbf,
				0xf4, 0x80, 0x80, 0x80,
				0xf4, 0x8f, 0xbf, 0xbf,
			}),
			tokens: []*Token{
				newTokenDefault(1, "t1", newByteSequence([]byte{0x00})),
				newTokenDefault(1, "t1", newByteSequence([]byte{0x7f})),
				newTokenDefault(1, "t1", newByteSequence([]byte{0xc2, 0x80})),
				newTokenDefault(1, "t1", newByteSequence([]byte{0xdf, 0xbf})),
				newTokenDefault(1, "t1", newByteSequence([]byte{0xe1, 0x80, 0x80})),
				newTokenDefault(1, "t1", newByteSequence([]byte{0xec, 0xbf, 0xbf})),
				newTokenDefault(1, "t1", newByteSequence([]byte{0xed, 0x80, 0x80})),
				newTokenDefault(1, "t1", newByteSequence([]byte{0xed, 0x9f, 0xbf})),
				newTokenDefault(1, "t1", newByteSequence([]byte{0xee, 0x80, 0x80})),
				newTokenDefault(1, "t1", newByteSequence([]byte{0xef, 0xbf, 0xbf})),
				newTokenDefault(1, "t1", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})),
				newTokenDefault(1, "t1", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbf})),
				newTokenDefault(1, "t1", newByteSequence([]byte{0xf1, 0x80, 0x80, 0x80})),
				newTokenDefault(1, "t1", newByteSequence([]byte{0xf3, 0xbf, 0xbf, 0xbf})),
				newTokenDefault(1, "t1", newByteSequence([]byte{0xf4, 0x80, 0x80, 0x80})),
				newTokenDefault(1, "t1", newByteSequence([]byte{0xf4, 0x8f, 0xbf, 0xbf})),
				newEOFTokenDefault(),
			},
		},
		{
			lspec: &spec.LexSpec{
				Entries: []*spec.LexEntry{
					newLexEntryDefaultNOP("t1", "[ab.*+?|()[\\]]"),
				},
			},
			src: "ab.*+?|()[]",
			tokens: []*Token{
				newTokenDefault(1, "t1", newByteSequence([]byte("a"))),
				newTokenDefault(1, "t1", newByteSequence([]byte("b"))),
				newTokenDefault(1, "t1", newByteSequence([]byte("."))),
				newTokenDefault(1, "t1", newByteSequence([]byte("*"))),
				newTokenDefault(1, "t1", newByteSequence([]byte("+"))),
				newTokenDefault(1, "t1", newByteSequence([]byte("?"))),
				newTokenDefault(1, "t1", newByteSequence([]byte("|"))),
				newTokenDefault(1, "t1", newByteSequence([]byte("("))),
				newTokenDefault(1, "t1", newByteSequence([]byte(")"))),
				newTokenDefault(1, "t1", newByteSequence([]byte("["))),
				newTokenDefault(1, "t1", newByteSequence([]byte("]"))),
				newEOFTokenDefault(),
			},
		},
		{
			lspec: &spec.LexSpec{
				Entries: []*spec.LexEntry{
					// all 1-byte characters except the null character (U+0000)
					//
					// NOTE:
					// maleeni cannot handle the null character in patterns because compiler.lexer,
					// specifically read() and restore(), treats the null character as the absence
					// of a symbol. If a pattern needs a null character, use the code point
					// expression \u{0000}.
					newLexEntryDefaultNOP("1ByteChar", "[\x01-\x7f]"),
				},
			},
			src: string([]byte{
				0x01,
				0x02,
				0x7e,
				0x7f,
			}),
			tokens: []*Token{
				newTokenDefault(1, "1ByteChar", newByteSequence([]byte{0x01})),
				newTokenDefault(1, "1ByteChar", newByteSequence([]byte{0x02})),
				newTokenDefault(1, "1ByteChar", newByteSequence([]byte{0x7e})),
				newTokenDefault(1, "1ByteChar", newByteSequence([]byte{0x7f})),
				newEOFTokenDefault(),
			},
		},
		{
			lspec: &spec.LexSpec{
				Entries: []*spec.LexEntry{
					// all 2-byte characters
					newLexEntryDefaultNOP("2ByteChar", "[\xc2\x80-\xdf\xbf]"),
				},
			},
			src: string([]byte{
				0xc2, 0x80,
				0xc2, 0x81,
				0xdf, 0xbe,
				0xdf, 0xbf,
			}),
			tokens: []*Token{
				newTokenDefault(1, "2ByteChar", newByteSequence([]byte{0xc2, 0x80})),
				newTokenDefault(1, "2ByteChar", newByteSequence([]byte{0xc2, 0x81})),
				newTokenDefault(1, "2ByteChar", newByteSequence([]byte{0xdf, 0xbe})),
				newTokenDefault(1, "2ByteChar", newByteSequence([]byte{0xdf, 0xbf})),
				newEOFTokenDefault(),
			},
		},
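		// The next four cases test 3-byte characters. The endpoints of each
		// character range share 3, 2, 1, and 0 leading bytes respectively,
		// which should exercise the different ways the compiler splits a code
		// point range into byte-level transitions.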
		{
			lspec: &spec.LexSpec{
				Entries: []*spec.LexEntry{
					// All bytes are the same.
					newLexEntryDefaultNOP("3ByteChar", "[\xe0\xa0\x80-\xe0\xa0\x80]"),
				},
			},
			src: string([]byte{
				0xe0, 0xa0, 0x80,
			}),
			tokens: []*Token{
				newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x80})),
				newEOFTokenDefault(),
			},
		},
		{
			lspec: &spec.LexSpec{
				Entries: []*spec.LexEntry{
					// The first two bytes are the same.
					newLexEntryDefaultNOP("3ByteChar", "[\xe0\xa0\x80-\xe0\xa0\xbf]"),
				},
			},
			src: string([]byte{
				0xe0, 0xa0, 0x80,
				0xe0, 0xa0, 0x81,
				0xe0, 0xa0, 0xbe,
				0xe0, 0xa0, 0xbf,
			}),
			tokens: []*Token{
				newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x80})),
				newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x81})),
				newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0xbe})),
				newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0xbf})),
				newEOFTokenDefault(),
			},
		},
		{
			lspec: &spec.LexSpec{
				Entries: []*spec.LexEntry{
					// The first byte is the same.
					newLexEntryDefaultNOP("3ByteChar", "[\xe0\xa0\x80-\xe0\xbf\xbf]"),
				},
			},
			src: string([]byte{
				0xe0, 0xa0, 0x80,
				0xe0, 0xa0, 0x81,
				0xe0, 0xbf, 0xbe,
				0xe0, 0xbf, 0xbf,
			}),
			tokens: []*Token{
				newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x80})),
				newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x81})),
				newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xbf, 0xbe})),
				newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xbf, 0xbf})),
				newEOFTokenDefault(),
			},
		},
		{
			lspec: &spec.LexSpec{
				Entries: []*spec.LexEntry{
					// all 3-byte characters
					newLexEntryDefaultNOP("3ByteChar", "[\xe0\xa0\x80-\xef\xbf\xbf]"),
				},
			},
			src: string([]byte{
				0xe0, 0xa0, 0x80,
				0xe0, 0xa0, 0x81,
				0xe0, 0xbf, 0xbe,
				0xe0, 0xbf, 0xbf,
				0xe1, 0x80, 0x80,
				0xe1, 0x80, 0x81,
				0xec, 0xbf, 0xbe,
				0xec, 0xbf, 0xbf,
				0xed, 0x80, 0x80,
				0xed, 0x80, 0x81,
				0xed, 0x9f, 0xbe,
				0xed, 0x9f, 0xbf,
				0xee, 0x80, 0x80,
				0xee, 0x80, 0x81,
				0xef, 0xbf, 0xbe,
				0xef, 0xbf, 0xbf,
			}),
			tokens: []*Token{
				newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x80})),
				newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x81})),
				newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xbf, 0xbe})),
				newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xbf, 0xbf})),
				newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe1, 0x80, 0x80})),
				newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xe1, 0x80, 0x81})),
				newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xec, 0xbf, 0xbe})),
				newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xec, 0xbf, 0xbf})),
				newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xed, 0x80, 0x80})),
				newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xed, 0x80, 0x81})),
				newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xed, 0x9f, 0xbe})),
				newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xed, 0x9f, 0xbf})),
				newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xee, 0x80, 0x80})),
				newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xee, 0x80, 0x81})),
				newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xef, 0xbf, 0xbe})),
				newTokenDefault(1, "3ByteChar", newByteSequence([]byte{0xef, 0xbf, 0xbf})),
				newEOFTokenDefault(),
			},
		},
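		// The next five cases repeat the same prefix pattern for 4-byte
		// characters: the range endpoints share 4, 3, 2, 1, and 0 leading
		// bytes respectively.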
		{
			lspec: &spec.LexSpec{
				Entries: []*spec.LexEntry{
					// All bytes are the same.
					newLexEntryDefaultNOP("4ByteChar", "[\xf0\x90\x80\x80-\xf0\x90\x80\x80]"),
				},
			},
			src: string([]byte{
				0xf0, 0x90, 0x80, 0x80,
			}),
			tokens: []*Token{
				newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})),
				newEOFTokenDefault(),
			},
		},
		{
			lspec: &spec.LexSpec{
				Entries: []*spec.LexEntry{
					// The first 3 bytes are the same.
					newLexEntryDefaultNOP("4ByteChar", "[\xf0\x90\x80\x80-\xf0\x90\x80\xbf]"),
				},
			},
			src: string([]byte{
				0xf0, 0x90, 0x80, 0x80,
				0xf0, 0x90, 0x80, 0x81,
				0xf0, 0x90, 0x80, 0xbe,
				0xf0, 0x90, 0x80, 0xbf,
			}),
			tokens: []*Token{
				newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})),
				newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x81})),
				newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0xbe})),
				newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0xbf})),
				newEOFTokenDefault(),
			},
		},
		{
			lspec: &spec.LexSpec{
				Entries: []*spec.LexEntry{
					// The first 2 bytes are the same.
					newLexEntryDefaultNOP("4ByteChar", "[\xf0\x90\x80\x80-\xf0\x90\xbf\xbf]"),
				},
			},
			src: string([]byte{
				0xf0, 0x90, 0x80, 0x80,
				0xf0, 0x90, 0x80, 0x81,
				0xf0, 0x90, 0xbf, 0xbe,
				0xf0, 0x90, 0xbf, 0xbf,
			}),
			tokens: []*Token{
				newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})),
				newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x81})),
				newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0xbf, 0xbe})),
				newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0xbf, 0xbf})),
				newEOFTokenDefault(),
			},
		},
		{
			lspec: &spec.LexSpec{
				Entries: []*spec.LexEntry{
					// The first byte is the same.
					newLexEntryDefaultNOP("4ByteChar", "[\xf0\x90\x80\x80-\xf0\xbf\xbf\xbf]"),
				},
			},
			src: string([]byte{
				0xf0, 0x90, 0x80, 0x80,
				0xf0, 0x90, 0x80, 0x81,
				0xf0, 0xbf, 0xbf, 0xbe,
				0xf0, 0xbf, 0xbf, 0xbf,
			}),
			tokens: []*Token{
				newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})),
				newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x81})),
				newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbe})),
				newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbf})),
				newEOFTokenDefault(),
			},
		},
		{
			lspec: &spec.LexSpec{
				Entries: []*spec.LexEntry{
					// all 4-byte characters
					newLexEntryDefaultNOP("4ByteChar", "[\xf0\x90\x80\x80-\xf4\x8f\xbf\xbf]"),
				},
			},
			src: string([]byte{
				0xf0, 0x90, 0x80, 0x80,
				0xf0, 0x90, 0x80, 0x81,
				0xf0, 0xbf, 0xbf, 0xbe,
				0xf0, 0xbf, 0xbf, 0xbf,
				0xf1, 0x80, 0x80, 0x80,
				0xf1, 0x80, 0x80, 0x81,
				0xf3, 0xbf, 0xbf, 0xbe,
				0xf3, 0xbf, 0xbf, 0xbf,
				0xf4, 0x80, 0x80, 0x80,
				0xf4, 0x80, 0x80, 0x81,
				0xf4, 0x8f, 0xbf, 0xbe,
				0xf4, 0x8f, 0xbf, 0xbf,
			}),
			tokens: []*Token{
				newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})),
				newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x81})),
				newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbe})),
				newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbf})),
				newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf1, 0x80, 0x80, 0x80})),
				newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf1, 0x80, 0x80, 0x81})),
				newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf3, 0xbf, 0xbf, 0xbe})),
				newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf3, 0xbf, 0xbf, 0xbf})),
				newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf4, 0x80, 0x80, 0x80})),
				newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf4, 0x80, 0x80, 0x81})),
				newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf4, 0x8f, 0xbf, 0xbe})),
				newTokenDefault(1, "4ByteChar", newByteSequence([]byte{0xf4, 0x8f, 0xbf, 0xbf})),
				newEOFTokenDefault(),
			},
		},
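		// A negated character class: [^0-9]+ consumes the non-digit prefix
		// and [0-9] the terminating digit, so "foo9" lexes as a single token.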
		{
			lspec: &spec.LexSpec{
				Entries: []*spec.LexEntry{
					newLexEntryDefaultNOP("NonNumber", "[^0-9]+[0-9]"),
				},
			},
			src: "foo9",
			tokens: []*Token{
				newTokenDefault(1, "NonNumber", newByteSequence([]byte("foo9"))),
				newEOFTokenDefault(),
			},
		},
		{
			lspec: &spec.LexSpec{
				Entries: []*spec.LexEntry{
					newLexEntryDefaultNOP("1ByteChar", "\\u{006E}"),
					newLexEntryDefaultNOP("2ByteChar", "\\u{03BD}"),
					newLexEntryDefaultNOP("3ByteChar", "\\u{306B}"),
					newLexEntryDefaultNOP("4ByteChar", "\\u{01F638}"),
				},
			},
			src: "nνに😸",
			tokens: []*Token{
				newTokenDefault(1, "1ByteChar", newByteSequence([]byte{0x6E})),
				newTokenDefault(2, "2ByteChar", newByteSequence([]byte{0xCE, 0xBD})),
				newTokenDefault(3, "3ByteChar", newByteSequence([]byte{0xE3, 0x81, 0xAB})),
				newTokenDefault(4, "4ByteChar", newByteSequence([]byte{0xF0, 0x9F, 0x98, 0xB8})),
				newEOFTokenDefault(),
			},
		},
		{
			lspec: &spec.LexSpec{
				Entries: []*spec.LexEntry{
					newLexEntryDefaultNOP("codePointsAlt", "[\\u{006E}\\u{03BD}\\u{306B}\\u{01F638}]"),
				},
			},
			src: "nνに😸",
			tokens: []*Token{
				newTokenDefault(1, "codePointsAlt", newByteSequence([]byte{0x6E})),
				newTokenDefault(1, "codePointsAlt", newByteSequence([]byte{0xCE, 0xBD})),
				newTokenDefault(1, "codePointsAlt", newByteSequence([]byte{0xE3, 0x81, 0xAB})),
				newTokenDefault(1, "codePointsAlt", newByteSequence([]byte{0xF0, 0x9F, 0x98, 0xB8})),
				newEOFTokenDefault(),
			},
		},
		{
			lspec: &spec.LexSpec{
				Entries: []*spec.LexEntry{
					newLexEntryDefaultNOP("t1", "\\f{a2c}\\f{d2f}+"),
					newLexEntryFragment("a2c", "abc"),
					newLexEntryFragment("d2f", "def"),
				},
			},
			src: "abcdefdefabcdef",
			tokens: []*Token{
				newTokenDefault(1, "t1", newByteSequence([]byte("abcdefdef"))),
				newTokenDefault(1, "t1", newByteSequence([]byte("abcdef"))),
				newEOFTokenDefault(),
			},
		},
		{
			lspec: &spec.LexSpec{
				Entries: []*spec.LexEntry{
					newLexEntryDefaultNOP("t1", "(\\f{a2c}|\\f{d2f})+"),
					newLexEntryFragment("a2c", "abc"),
					newLexEntryFragment("d2f", "def"),
				},
			},
			src: "abcdefdefabc",
			tokens: []*Token{
				newTokenDefault(1, "t1", newByteSequence([]byte("abcdefdefabc"))),
				newEOFTokenDefault(),
			},
		},
		{
			lspec: &spec.LexSpec{
				Entries: []*spec.LexEntry{
					newLexEntryDefaultNOP("t1", "\\f{a2c_or_d2f}+"),
					newLexEntryFragment("a2c_or_d2f", "\\f{a2c}|\\f{d2f}"),
					newLexEntryFragment("a2c", "abc"),
					newLexEntryFragment("d2f", "def"),
				},
			},
			src: "abcdefdefabc",
			tokens: []*Token{
				newTokenDefault(1, "t1", newByteSequence([]byte("abcdefdefabc"))),
				newEOFTokenDefault(),
			},
		},
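		// The remaining cases exercise lexical modes. An entry is active only
		// in the modes listed by newLexEntry's first argument; a non-empty
		// push argument makes a match enter that mode, and pop=true returns
		// to the previous mode. The case below tokenizes string literals by
		// switching between the default mode and a string mode.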
		{
			lspec: &spec.LexSpec{
				Entries: []*spec.LexEntry{
					newLexEntryDefaultNOP("white_space", ` *`),
					newLexEntry([]string{"default"}, "string_open", `"`, "string", false),
					newLexEntry([]string{"string"}, "escape_sequence", `\\[n"\\]`, "", false),
					newLexEntry([]string{"string"}, "char_sequence", `[^"\\]*`, "", false),
					newLexEntry([]string{"string"}, "string_close", `"`, "", true),
				},
			},
			src: `"" "Hello world.\n\"Hello world.\""`,
			tokens: []*Token{
				newToken(1, "default", 2, "string_open", newByteSequence([]byte(`"`))),
				newToken(2, "string", 3, "string_close", newByteSequence([]byte(`"`))),
				newToken(1, "default", 1, "white_space", newByteSequence([]byte(` `))),
				newToken(1, "default", 2, "string_open", newByteSequence([]byte(`"`))),
				newToken(2, "string", 2, "char_sequence", newByteSequence([]byte(`Hello world.`))),
				newToken(2, "string", 1, "escape_sequence", newByteSequence([]byte(`\n`))),
				newToken(2, "string", 1, "escape_sequence", newByteSequence([]byte(`\"`))),
				newToken(2, "string", 2, "char_sequence", newByteSequence([]byte(`Hello world.`))),
				newToken(2, "string", 1, "escape_sequence", newByteSequence([]byte(`\"`))),
				newToken(2, "string", 3, "string_close", newByteSequence([]byte(`"`))),
				newEOFTokenDefault(),
			},
		},
		{
			lspec: &spec.LexSpec{
				Entries: []*spec.LexEntry{
					// `white_space` is enabled in multiple modes.
					newLexEntry([]string{"default", "state_a", "state_b"}, "white_space", ` *`, "", false),
					newLexEntry([]string{"default"}, "char_a", `a`, "state_a", false),
					newLexEntry([]string{"state_a"}, "char_b", `b`, "state_b", false),
					newLexEntry([]string{"state_a"}, "back_from_a", `<`, "", true),
					newLexEntry([]string{"state_b"}, "back_from_b", `<`, "", true),
				},
			},
			src: ` a b < < `,
			tokens: []*Token{
				newToken(1, "default", 1, "white_space", newByteSequence([]byte(` `))),
				newToken(1, "default", 2, "char_a", newByteSequence([]byte(`a`))),
				newToken(2, "state_a", 1, "white_space", newByteSequence([]byte(` `))),
				newToken(2, "state_a", 2, "char_b", newByteSequence([]byte(`b`))),
				newToken(3, "state_b", 1, "white_space", newByteSequence([]byte(` `))),
				newToken(3, "state_b", 2, "back_from_b", newByteSequence([]byte(`<`))),
				newToken(2, "state_a", 1, "white_space", newByteSequence([]byte(` `))),
				newToken(2, "state_a", 3, "back_from_a", newByteSequence([]byte(`<`))),
				newToken(1, "default", 1, "white_space", newByteSequence([]byte(` `))),
				newEOFTokenDefault(),
			},
		},
		{
			lspec: &spec.LexSpec{
				Entries: []*spec.LexEntry{
					newLexEntry([]string{"default", "mode_1", "mode_2"}, "white_space", ` *`, "", false),
					newLexEntry([]string{"default"}, "char", `.`, "", false),
					newLexEntry([]string{"default"}, "push_1", `-> 1`, "", false),
					newLexEntry([]string{"mode_1"}, "push_2", `-> 2`, "", false),
					newLexEntry([]string{"mode_1"}, "pop_1", `<-`, "", false),
					newLexEntry([]string{"mode_2"}, "pop_2", `<-`, "", false),
				},
			},
			src: `-> 1 -> 2 <- <- a`,
			tokens: []*Token{
				newToken(1, "default", 3, "push_1", newByteSequence([]byte(`-> 1`))),
				newToken(2, "mode_1", 1, "white_space", newByteSequence([]byte(` `))),
				newToken(2, "mode_1", 2, "push_2", newByteSequence([]byte(`-> 2`))),
				newToken(3, "mode_2", 1, "white_space", newByteSequence([]byte(` `))),
				newToken(3, "mode_2", 2, "pop_2", newByteSequence([]byte(`<-`))),
				newToken(2, "mode_1", 1, "white_space", newByteSequence([]byte(` `))),
				newToken(2, "mode_1", 3, "pop_1", newByteSequence([]byte(`<-`))),
				newToken(1, "default", 1, "white_space", newByteSequence([]byte(` `))),
				newToken(1, "default", 2, "char", newByteSequence([]byte(`a`))),
				newEOFTokenDefault(),
			},
			passiveModeTran: true,
			tran: func(l *Lexer, tok *Token) error {
				switch l.clspec.Modes[l.Mode().Int()] {
				case "default":
					switch tok.KindName {
					case "push_1":
						l.PushMode(2)
					}
				case "mode_1":
					switch tok.KindName {
					case "push_2":
						l.PushMode(3)
					case "pop_1":
						return l.PopMode()
					}
				case "mode_2":
					switch tok.KindName {
					case "pop_2":
						return l.PopMode()
					}
				}
				return nil
			},
		},
		{
			lspec: &spec.LexSpec{
				Entries: []*spec.LexEntry{
					newLexEntry([]string{"default", "mode_1", "mode_2"}, "white_space", ` *`, "", false),
					newLexEntry([]string{"default"}, "char", `.`, "", false),
					newLexEntry([]string{"default"}, "push_1", `-> 1`, "mode_1", false),
					newLexEntry([]string{"mode_1"}, "push_2", `-> 2`, "", false),
					newLexEntry([]string{"mode_1"}, "pop_1", `<-`, "", false),
					newLexEntry([]string{"mode_2"}, "pop_2", `<-`, "", true),
				},
			},
			src: `-> 1 -> 2 <- <- a`,
			tokens: []*Token{
				newToken(1, "default", 3, "push_1", newByteSequence([]byte(`-> 1`))),
				newToken(2, "mode_1", 1, "white_space", newByteSequence([]byte(` `))),
				newToken(2, "mode_1", 2, "push_2", newByteSequence([]byte(`-> 2`))),
				newToken(3, "mode_2", 1, "white_space", newByteSequence([]byte(` `))),
				newToken(3, "mode_2", 2, "pop_2", newByteSequence([]byte(`<-`))),
				newToken(2, "mode_1", 1, "white_space", newByteSequence([]byte(` `))),
				newToken(2, "mode_1", 3, "pop_1", newByteSequence([]byte(`<-`))),
				newToken(1, "default", 1, "white_space", newByteSequence([]byte(` `))),
				newToken(1, "default", 2, "char", newByteSequence([]byte(`a`))),
				newEOFTokenDefault(),
			},
			// Active mode transition and an external transition function can be used together.
			passiveModeTran: false,
			tran: func(l *Lexer, tok *Token) error {
				switch l.clspec.Modes[l.Mode().Int()] {
				case "mode_1":
					switch tok.KindName {
					case "push_2":
						l.PushMode(3)
					case "pop_1":
						return l.PopMode()
					}
				}
				return nil
			},
		},
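		// spec.EscapePattern should escape every meta character in its
		// argument, so each of the following single-character patterns
		// matches itself literally.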
		{
			lspec: &spec.LexSpec{
				Entries: []*spec.LexEntry{
					newLexEntryDefaultNOP("dot", spec.EscapePattern(`.`)),
					newLexEntryDefaultNOP("star", spec.EscapePattern(`*`)),
					newLexEntryDefaultNOP("plus", spec.EscapePattern(`+`)),
					newLexEntryDefaultNOP("question", spec.EscapePattern(`?`)),
					newLexEntryDefaultNOP("vbar", spec.EscapePattern(`|`)),
					newLexEntryDefaultNOP("lparen", spec.EscapePattern(`(`)),
					newLexEntryDefaultNOP("rparen", spec.EscapePattern(`)`)),
					newLexEntryDefaultNOP("lbrace", spec.EscapePattern(`[`)),
					newLexEntryDefaultNOP("backslash", spec.EscapePattern(`\`)),
				},
			},
			src: `.*+?|()[\`,
			tokens: []*Token{
				newTokenDefault(1, "dot", newByteSequence([]byte(`.`))),
				newTokenDefault(2, "star", newByteSequence([]byte(`*`))),
				newTokenDefault(3, "plus", newByteSequence([]byte(`+`))),
				newTokenDefault(4, "question", newByteSequence([]byte(`?`))),
				newTokenDefault(5, "vbar", newByteSequence([]byte(`|`))),
				newTokenDefault(6, "lparen", newByteSequence([]byte(`(`))),
				newTokenDefault(7, "rparen", newByteSequence([]byte(`)`))),
				newTokenDefault(8, "lbrace", newByteSequence([]byte(`[`))),
				newTokenDefault(9, "backslash", newByteSequence([]byte(`\`))),
				newEOFTokenDefault(),
			},
		},
	}
	for i, tt := range test {
		for compLv := compiler.CompressionLevelMin; compLv <= compiler.CompressionLevelMax; compLv++ {
			t.Run(fmt.Sprintf("#%v-%v", i, compLv), func(t *testing.T) {
				clspec, err := compiler.Compile(tt.lspec, compiler.CompressionLevel(compLv))
				if err != nil {
					t.Fatalf("unexpected error: %v", err)
				}
				opts := []LexerOption{}
				if tt.passiveModeTran {
					opts = append(opts, DisableModeTransition())
				}
				lexer, err := NewLexer(clspec, strings.NewReader(tt.src), opts...)
				if err != nil {
					t.Fatalf("unexpected error: %v", err)
				}
				for _, eTok := range tt.tokens {
					tok, err := lexer.Next()
					if err != nil {
						t.Log(err)
						break
					}
					testToken(t, eTok, tok)
					// t.Logf("token: ID: %v, Match: %+v Text: \"%v\", EOF: %v, Invalid: %v", tok.ID, tok.Match(), tok.Text(), tok.EOF, tok.Invalid)
					if tok.EOF {
						break
					}
					if tt.tran != nil {
						err := tt.tran(lexer, tok)
						if err != nil {
							t.Fatalf("unexpected error: %v", err)
						}
					}
				}
			})
		}
	}
}

func testToken(t *testing.T, expected, actual *Token) {
	t.Helper()
	// NOTE: The ModeName comparison previously compared actual.ModeName with
	// itself, which is always false; it now compares against expected.ModeName.
	if actual.Mode != expected.Mode ||
		actual.ModeName != expected.ModeName ||
		actual.Kind != expected.Kind ||
		actual.KindName != expected.KindName ||
		!bytes.Equal(actual.Match(), expected.Match()) ||
		actual.EOF != expected.EOF ||
		actual.Invalid != expected.Invalid {
		t.Fatalf(`unexpected token; want: %v ("%v"), got: %v ("%v")`, expected, expected.Text(), actual, actual.Text())
	}
}
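
// ExampleLexer_Next is a minimal usage sketch rather than an assertion-backed
// test. It relies only on APIs already exercised above (compiler.Compile,
// NewLexer, (*Lexer).Next, and the Token fields KindName and EOF); the kind
// names "word" and "white_space" are illustrative, not part of the package.
// The expected-output comment is omitted on purpose, so the example is
// compiled but not executed by `go test`.
func ExampleLexer_Next() {
	lspec := &spec.LexSpec{
		Entries: []*spec.LexEntry{
			newLexEntryDefaultNOP("word", `[a-z]+`),
			newLexEntryDefaultNOP("white_space", ` +`),
		},
	}
	clspec, err := compiler.Compile(lspec, compiler.CompressionLevel(compiler.CompressionLevelMin))
	if err != nil {
		fmt.Println(err)
		return
	}
	lexer, err := NewLexer(clspec, strings.NewReader("hello world"))
	if err != nil {
		fmt.Println(err)
		return
	}
	// Read tokens until the EOF token appears.
	for {
		tok, err := lexer.Next()
		if err != nil {
			fmt.Println(err)
			return
		}
		if tok.EOF {
			break
		}
		fmt.Printf("%v: %v\n", tok.KindName, tok.Text())
	}
}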