package driver

import (
	"bytes"
	"fmt"
	"strings"
	"testing"

	"github.com/nihei9/maleeni/compiler"
	"github.com/nihei9/maleeni/spec"
)

func newLexEntry(kind string, pattern string) *spec.LexEntry {
	return &spec.LexEntry{
		Kind:    spec.LexKind(kind),
		Pattern: spec.LexPattern(pattern),
	}
}

func TestLexer_Next(t *testing.T) {
	tests := []struct {
		lspec  *spec.LexSpec
		src    string
		tokens []*Token
	}{
		{
			lspec: &spec.LexSpec{
				Entries: []*spec.LexEntry{
					newLexEntry("t1", "(a|b)*abb"),
					newLexEntry("t2", " +"),
				},
			},
			src: "abb aabb aaabb babb bbabb abbbabb",
			tokens: []*Token{
				newToken(1, "t1", newByteSequence([]byte("abb"))),
				newToken(2, "t2", newByteSequence([]byte(" "))),
				newToken(1, "t1", newByteSequence([]byte("aabb"))),
				newToken(2, "t2", newByteSequence([]byte(" "))),
				newToken(1, "t1", newByteSequence([]byte("aaabb"))),
				newToken(2, "t2", newByteSequence([]byte(" "))),
				newToken(1, "t1", newByteSequence([]byte("babb"))),
				newToken(2, "t2", newByteSequence([]byte(" "))),
				newToken(1, "t1", newByteSequence([]byte("bbabb"))),
				newToken(2, "t2", newByteSequence([]byte(" "))),
				newToken(1, "t1", newByteSequence([]byte("abbbabb"))),
				newEOFToken(),
			},
		},
		{
			lspec: &spec.LexSpec{
				Entries: []*spec.LexEntry{
					newLexEntry("t1", "b?a+"),
					newLexEntry("t2", "(ab)?(cd)+"),
					newLexEntry("t3", " +"),
				},
			},
			src: "ba baaa a aaa abcd abcdcdcd cd cdcdcd",
			tokens: []*Token{
				newToken(1, "t1", newByteSequence([]byte("ba"))),
				newToken(3, "t3", newByteSequence([]byte(" "))),
				newToken(1, "t1", newByteSequence([]byte("baaa"))),
				newToken(3, "t3", newByteSequence([]byte(" "))),
				newToken(1, "t1", newByteSequence([]byte("a"))),
				newToken(3, "t3", newByteSequence([]byte(" "))),
				newToken(1, "t1", newByteSequence([]byte("aaa"))),
				newToken(3, "t3", newByteSequence([]byte(" "))),
				newToken(2, "t2", newByteSequence([]byte("abcd"))),
				newToken(3, "t3", newByteSequence([]byte(" "))),
				newToken(2, "t2", newByteSequence([]byte("abcdcdcd"))),
				newToken(3, "t3", newByteSequence([]byte(" "))),
				newToken(2, "t2", newByteSequence([]byte("cd"))),
				newToken(3, "t3", newByteSequence([]byte(" "))),
				newToken(2, "t2", newByteSequence([]byte("cdcdcd"))),
				newEOFToken(),
			},
		},
		{
			lspec: &spec.LexSpec{
				Entries: []*spec.LexEntry{
					newLexEntry("t1", "."),
				},
			},
			src: string([]byte{
				0x00,
				0x7f,
				0xc2, 0x80,
				0xdf, 0xbf,
				0xe1, 0x80, 0x80,
				0xec, 0xbf, 0xbf,
				0xed, 0x80, 0x80,
				0xed, 0x9f, 0xbf,
				0xee, 0x80, 0x80,
				0xef, 0xbf, 0xbf,
				0xf0, 0x90, 0x80, 0x80,
				0xf0, 0xbf, 0xbf, 0xbf,
				0xf1, 0x80, 0x80, 0x80,
				0xf3, 0xbf, 0xbf, 0xbf,
				0xf4, 0x80, 0x80, 0x80,
				0xf4, 0x8f, 0xbf, 0xbf,
			}),
			tokens: []*Token{
				newToken(1, "t1", newByteSequence([]byte{0x00})),
				newToken(1, "t1", newByteSequence([]byte{0x7f})),
				newToken(1, "t1", newByteSequence([]byte{0xc2, 0x80})),
				newToken(1, "t1", newByteSequence([]byte{0xdf, 0xbf})),
				newToken(1, "t1", newByteSequence([]byte{0xe1, 0x80, 0x80})),
				newToken(1, "t1", newByteSequence([]byte{0xec, 0xbf, 0xbf})),
				newToken(1, "t1", newByteSequence([]byte{0xed, 0x80, 0x80})),
				newToken(1, "t1", newByteSequence([]byte{0xed, 0x9f, 0xbf})),
				newToken(1, "t1", newByteSequence([]byte{0xee, 0x80, 0x80})),
				newToken(1, "t1", newByteSequence([]byte{0xef, 0xbf, 0xbf})),
				newToken(1, "t1", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})),
				newToken(1, "t1", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbf})),
				newToken(1, "t1", newByteSequence([]byte{0xf1, 0x80, 0x80, 0x80})),
				newToken(1, "t1", newByteSequence([]byte{0xf3, 0xbf, 0xbf, 0xbf})),
				newToken(1, "t1", newByteSequence([]byte{0xf4, 0x80, 0x80, 0x80})),
				newToken(1, "t1", newByteSequence([]byte{0xf4, 0x8f, 0xbf, 0xbf})),
				newEOFToken(),
			},
		},
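		// NOTE: The "." case above walks the edges of the well-formed UTF-8
		// byte sequence ranges for each encoding length (1 to 4 bytes). In
		// particular, the jump from 0xed 0x9f 0xbf (U+D7FF) to 0xee 0x80 0x80
		// (U+E000) skips the surrogate range U+D800-U+DFFF, which is not valid
		// UTF-8, and 0xf4 0x8f 0xbf 0xbf is U+10FFFF, the last code point.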
		{
			lspec: &spec.LexSpec{
				Entries: []*spec.LexEntry{
					newLexEntry("t1", "[ab.*+?|()[\\]]"),
				},
			},
			src: "ab.*+?|()[]",
			tokens: []*Token{
				newToken(1, "t1", newByteSequence([]byte("a"))),
				newToken(1, "t1", newByteSequence([]byte("b"))),
				newToken(1, "t1", newByteSequence([]byte("."))),
				newToken(1, "t1", newByteSequence([]byte("*"))),
				newToken(1, "t1", newByteSequence([]byte("+"))),
				newToken(1, "t1", newByteSequence([]byte("?"))),
				newToken(1, "t1", newByteSequence([]byte("|"))),
				newToken(1, "t1", newByteSequence([]byte("("))),
				newToken(1, "t1", newByteSequence([]byte(")"))),
				newToken(1, "t1", newByteSequence([]byte("["))),
				newToken(1, "t1", newByteSequence([]byte("]"))),
				newEOFToken(),
			},
		},
		{
			lspec: &spec.LexSpec{
				Entries: []*spec.LexEntry{
					// all 1-byte characters except the null character (U+0000)
					//
					// NOTE:
					// maleeni cannot handle the null character in patterns because
					// compiler.lexer, specifically read() and restore(), treats the
					// null character as meaning that no symbol exists. If a pattern
					// needs a null character, use the code point expression \u{0000}.
					newLexEntry("1ByteChar", "[\x01-\x7f]"),
				},
			},
			src: string([]byte{
				0x01,
				0x02,
				0x7e,
				0x7f,
			}),
			tokens: []*Token{
				newToken(1, "1ByteChar", newByteSequence([]byte{0x01})),
				newToken(1, "1ByteChar", newByteSequence([]byte{0x02})),
				newToken(1, "1ByteChar", newByteSequence([]byte{0x7e})),
				newToken(1, "1ByteChar", newByteSequence([]byte{0x7f})),
				newEOFToken(),
			},
		},
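		// The following is a hypothetical sketch of the \u{0000} workaround
		// described in the NOTE above; it is not part of the original suite
		// and is left commented out because it assumes, rather than verifies,
		// that \u{0000} compiles to a match on the single NUL byte.
		// {
		// 	lspec: &spec.LexSpec{
		// 		Entries: []*spec.LexEntry{
		// 			newLexEntry("null", "\\u{0000}"),
		// 		},
		// 	},
		// 	src: string([]byte{0x00}),
		// 	tokens: []*Token{
		// 		newToken(1, "null", newByteSequence([]byte{0x00})),
		// 		newEOFToken(),
		// 	},
		// },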
		{
			lspec: &spec.LexSpec{
				Entries: []*spec.LexEntry{
					// all 2-byte characters
					newLexEntry("2ByteChar", "[\xc2\x80-\xdf\xbf]"),
				},
			},
			src: string([]byte{
				0xc2, 0x80,
				0xc2, 0x81,
				0xdf, 0xbe,
				0xdf, 0xbf,
			}),
			tokens: []*Token{
				newToken(1, "2ByteChar", newByteSequence([]byte{0xc2, 0x80})),
				newToken(1, "2ByteChar", newByteSequence([]byte{0xc2, 0x81})),
				newToken(1, "2ByteChar", newByteSequence([]byte{0xdf, 0xbe})),
				newToken(1, "2ByteChar", newByteSequence([]byte{0xdf, 0xbf})),
				newEOFToken(),
			},
		},
		{
			lspec: &spec.LexSpec{
				Entries: []*spec.LexEntry{
					// All bytes are the same.
					newLexEntry("3ByteChar", "[\xe0\xa0\x80-\xe0\xa0\x80]"),
				},
			},
			src: string([]byte{
				0xe0, 0xa0, 0x80,
			}),
			tokens: []*Token{
				newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x80})),
				newEOFToken(),
			},
		},
		{
			lspec: &spec.LexSpec{
				Entries: []*spec.LexEntry{
					// The first two bytes are the same.
					newLexEntry("3ByteChar", "[\xe0\xa0\x80-\xe0\xa0\xbf]"),
				},
			},
			src: string([]byte{
				0xe0, 0xa0, 0x80,
				0xe0, 0xa0, 0x81,
				0xe0, 0xa0, 0xbe,
				0xe0, 0xa0, 0xbf,
			}),
			tokens: []*Token{
				newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x80})),
				newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x81})),
				newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0xbe})),
				newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0xbf})),
				newEOFToken(),
			},
		},
		{
			lspec: &spec.LexSpec{
				Entries: []*spec.LexEntry{
					// The first byte is the same.
					newLexEntry("3ByteChar", "[\xe0\xa0\x80-\xe0\xbf\xbf]"),
				},
			},
			src: string([]byte{
				0xe0, 0xa0, 0x80,
				0xe0, 0xa0, 0x81,
				0xe0, 0xbf, 0xbe,
				0xe0, 0xbf, 0xbf,
			}),
			tokens: []*Token{
				newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x80})),
				newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x81})),
				newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xbf, 0xbe})),
				newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xbf, 0xbf})),
				newEOFToken(),
			},
		},
		{
			lspec: &spec.LexSpec{
				Entries: []*spec.LexEntry{
					// all 3-byte characters
					newLexEntry("3ByteChar", "[\xe0\xa0\x80-\xef\xbf\xbf]"),
				},
			},
			src: string([]byte{
				0xe0, 0xa0, 0x80,
				0xe0, 0xa0, 0x81,
				0xe0, 0xbf, 0xbe,
				0xe0, 0xbf, 0xbf,
				0xe1, 0x80, 0x80,
				0xe1, 0x80, 0x81,
				0xec, 0xbf, 0xbe,
				0xec, 0xbf, 0xbf,
				0xed, 0x80, 0x80,
				0xed, 0x80, 0x81,
				0xed, 0x9f, 0xbe,
				0xed, 0x9f, 0xbf,
				0xee, 0x80, 0x80,
				0xee, 0x80, 0x81,
				0xef, 0xbf, 0xbe,
				0xef, 0xbf, 0xbf,
			}),
			tokens: []*Token{
				newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x80})),
				newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x81})),
				newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xbf, 0xbe})),
				newToken(1, "3ByteChar", newByteSequence([]byte{0xe0, 0xbf, 0xbf})),
				newToken(1, "3ByteChar", newByteSequence([]byte{0xe1, 0x80, 0x80})),
				newToken(1, "3ByteChar", newByteSequence([]byte{0xe1, 0x80, 0x81})),
				newToken(1, "3ByteChar", newByteSequence([]byte{0xec, 0xbf, 0xbe})),
				newToken(1, "3ByteChar", newByteSequence([]byte{0xec, 0xbf, 0xbf})),
				newToken(1, "3ByteChar", newByteSequence([]byte{0xed, 0x80, 0x80})),
				newToken(1, "3ByteChar", newByteSequence([]byte{0xed, 0x80, 0x81})),
				newToken(1, "3ByteChar", newByteSequence([]byte{0xed, 0x9f, 0xbe})),
				newToken(1, "3ByteChar", newByteSequence([]byte{0xed, 0x9f, 0xbf})),
				newToken(1, "3ByteChar", newByteSequence([]byte{0xee, 0x80, 0x80})),
				newToken(1, "3ByteChar", newByteSequence([]byte{0xee, 0x80, 0x81})),
				newToken(1, "3ByteChar", newByteSequence([]byte{0xef, 0xbf, 0xbe})),
				newToken(1, "3ByteChar", newByteSequence([]byte{0xef, 0xbf, 0xbf})),
				newEOFToken(),
			},
		},
		{
			lspec: &spec.LexSpec{
				Entries: []*spec.LexEntry{
					// All bytes are the same.
					newLexEntry("4ByteChar", "[\xf0\x90\x80\x80-\xf0\x90\x80\x80]"),
				},
			},
			src: string([]byte{
				0xf0, 0x90, 0x80, 0x80,
			}),
			tokens: []*Token{
				newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})),
				newEOFToken(),
			},
		},
		{
			lspec: &spec.LexSpec{
				Entries: []*spec.LexEntry{
					// The first 3 bytes are the same.
					newLexEntry("4ByteChar", "[\xf0\x90\x80\x80-\xf0\x90\x80\xbf]"),
				},
			},
			src: string([]byte{
				0xf0, 0x90, 0x80, 0x80,
				0xf0, 0x90, 0x80, 0x81,
				0xf0, 0x90, 0x80, 0xbe,
				0xf0, 0x90, 0x80, 0xbf,
			}),
			tokens: []*Token{
				newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})),
				newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x81})),
				newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0xbe})),
				newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0xbf})),
				newEOFToken(),
			},
		},
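		// The 3ByteChar and 4ByteChar cases above and below widen the range
		// one byte position at a time (all bytes equal, then a shared prefix
		// of N-1 bytes, N-2 bytes, and so on), which appears intended to give
		// each branching shape of a multi-byte range its own case.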
		{
			lspec: &spec.LexSpec{
				Entries: []*spec.LexEntry{
					// The first 2 bytes are the same.
					newLexEntry("4ByteChar", "[\xf0\x90\x80\x80-\xf0\x90\xbf\xbf]"),
				},
			},
			src: string([]byte{
				0xf0, 0x90, 0x80, 0x80,
				0xf0, 0x90, 0x80, 0x81,
				0xf0, 0x90, 0xbf, 0xbe,
				0xf0, 0x90, 0xbf, 0xbf,
			}),
			tokens: []*Token{
				newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})),
				newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x81})),
				newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0xbf, 0xbe})),
				newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0xbf, 0xbf})),
				newEOFToken(),
			},
		},
		{
			lspec: &spec.LexSpec{
				Entries: []*spec.LexEntry{
					// The first byte is the same.
					newLexEntry("4ByteChar", "[\xf0\x90\x80\x80-\xf0\xbf\xbf\xbf]"),
				},
			},
			src: string([]byte{
				0xf0, 0x90, 0x80, 0x80,
				0xf0, 0x90, 0x80, 0x81,
				0xf0, 0xbf, 0xbf, 0xbe,
				0xf0, 0xbf, 0xbf, 0xbf,
			}),
			tokens: []*Token{
				newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})),
				newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x81})),
				newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbe})),
				newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbf})),
				newEOFToken(),
			},
		},
		{
			lspec: &spec.LexSpec{
				Entries: []*spec.LexEntry{
					// all 4-byte characters
					newLexEntry("4ByteChar", "[\xf0\x90\x80\x80-\xf4\x8f\xbf\xbf]"),
				},
			},
			src: string([]byte{
				0xf0, 0x90, 0x80, 0x80,
				0xf0, 0x90, 0x80, 0x81,
				0xf0, 0xbf, 0xbf, 0xbe,
				0xf0, 0xbf, 0xbf, 0xbf,
				0xf1, 0x80, 0x80, 0x80,
				0xf1, 0x80, 0x80, 0x81,
				0xf3, 0xbf, 0xbf, 0xbe,
				0xf3, 0xbf, 0xbf, 0xbf,
				0xf4, 0x80, 0x80, 0x80,
				0xf4, 0x80, 0x80, 0x81,
				0xf4, 0x8f, 0xbf, 0xbe,
				0xf4, 0x8f, 0xbf, 0xbf,
			}),
			tokens: []*Token{
				newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})),
				newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x81})),
				newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbe})),
				newToken(1, "4ByteChar", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbf})),
				newToken(1, "4ByteChar", newByteSequence([]byte{0xf1, 0x80, 0x80, 0x80})),
				newToken(1, "4ByteChar", newByteSequence([]byte{0xf1, 0x80, 0x80, 0x81})),
				newToken(1, "4ByteChar", newByteSequence([]byte{0xf3, 0xbf, 0xbf, 0xbe})),
				newToken(1, "4ByteChar", newByteSequence([]byte{0xf3, 0xbf, 0xbf, 0xbf})),
				newToken(1, "4ByteChar", newByteSequence([]byte{0xf4, 0x80, 0x80, 0x80})),
				newToken(1, "4ByteChar", newByteSequence([]byte{0xf4, 0x80, 0x80, 0x81})),
				newToken(1, "4ByteChar", newByteSequence([]byte{0xf4, 0x8f, 0xbf, 0xbe})),
				newToken(1, "4ByteChar", newByteSequence([]byte{0xf4, 0x8f, 0xbf, 0xbf})),
				newEOFToken(),
			},
		},
		{
			lspec: &spec.LexSpec{
				Entries: []*spec.LexEntry{
					newLexEntry("NonNumber", "[^0-9]+[0-9]"),
				},
			},
			src: "foo9",
			tokens: []*Token{
				newToken(1, "NonNumber", newByteSequence([]byte("foo9"))),
				newEOFToken(),
			},
		},
		{
			lspec: &spec.LexSpec{
				Entries: []*spec.LexEntry{
					newLexEntry("1ByteChar", "\\u{006E}"),
					newLexEntry("2ByteChar", "\\u{03BD}"),
					newLexEntry("3ByteChar", "\\u{306B}"),
					newLexEntry("4ByteChar", "\\u{01F638}"),
				},
			},
			src: "nνに😸",
			tokens: []*Token{
				newToken(1, "1ByteChar", newByteSequence([]byte{0x6E})),
				newToken(2, "2ByteChar", newByteSequence([]byte{0xCE, 0xBD})),
				newToken(3, "3ByteChar", newByteSequence([]byte{0xE3, 0x81, 0xAB})),
				newToken(4, "4ByteChar", newByteSequence([]byte{0xF0, 0x9F, 0x98, 0xB8})),
				newEOFToken(),
			},
		},
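		// The expected Match values in the code point cases above and below
		// are simply the UTF-8 encodings of the characters in src:
		// U+006E -> 0x6E, U+03BD -> 0xCE 0xBD, U+306B -> 0xE3 0x81 0xAB, and
		// U+1F638 -> 0xF0 0x9F 0x98 0xB8.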
		{
			lspec: &spec.LexSpec{
				Entries: []*spec.LexEntry{
					newLexEntry("codePointsAlt", "[\\u{006E}\\u{03BD}\\u{306B}\\u{01F638}]"),
				},
			},
			src: "nνに😸",
			tokens: []*Token{
				newToken(1, "codePointsAlt", newByteSequence([]byte{0x6E})),
				newToken(1, "codePointsAlt", newByteSequence([]byte{0xCE, 0xBD})),
				newToken(1, "codePointsAlt", newByteSequence([]byte{0xE3, 0x81, 0xAB})),
				newToken(1, "codePointsAlt", newByteSequence([]byte{0xF0, 0x9F, 0x98, 0xB8})),
				newEOFToken(),
			},
		},
	}
	for i, tt := range tests {
		t.Run(fmt.Sprintf("#%v", i), func(t *testing.T) {
			clspec, err := compiler.Compile(tt.lspec)
			if err != nil {
				t.Fatalf("unexpected error occurred: %v", err)
			}
			lexer, err := NewLexer(clspec, strings.NewReader(tt.src))
			if err != nil {
				t.Fatalf("unexpected error occurred: %v", err)
			}
			for _, eTok := range tt.tokens {
				tok, err := lexer.Next()
				if err != nil {
					t.Log(err)
					break
				}
				testToken(t, eTok, tok)
				// t.Logf("token: ID: %v, Match: %+v Text: \"%v\", EOF: %v, Invalid: %v", tok.ID, tok.Match, string(tok.Match), tok.EOF, tok.Invalid)
				if tok.EOF {
					break
				}
			}
		})
	}
}

func TestLexer_PeekN(t *testing.T) {
	clspec, err := compiler.Compile(&spec.LexSpec{
		Entries: []*spec.LexEntry{
			newLexEntry("t1", "foo"),
			newLexEntry("t2", "bar"),
		},
	})
	if err != nil {
		t.Fatalf("unexpected error occurred: %v", err)
	}
	lex, err := NewLexer(clspec, strings.NewReader("foobar"))
	if err != nil {
		t.Fatalf("unexpected error occurred: %v", err)
	}

	expectedTokens := []*Token{
		newToken(1, "t1", newByteSequence([]byte("foo"))),
		newToken(2, "t2", newByteSequence([]byte("bar"))),
		newEOFToken(),
	}

	tok, err := lex.Peek1()
	if err != nil {
		t.Fatalf("unexpected error occurred: %v", err)
	}
	if tok == nil {
		t.Fatalf("token is nil")
	}
	testToken(t, expectedTokens[0], tok)

	tok, err = lex.Peek2()
	if err != nil {
		t.Fatalf("unexpected error occurred: %v", err)
	}
	if tok == nil {
		t.Fatalf("token is nil")
	}
	testToken(t, expectedTokens[1], tok)

	tok, err = lex.Peek3()
	if err != nil {
		t.Fatalf("unexpected error occurred: %v", err)
	}
	if tok == nil {
		t.Fatalf("token is nil")
	}
	testToken(t, expectedTokens[2], tok)

	for _, eTok := range expectedTokens {
		tok, err = lex.Next()
		if err != nil {
			t.Fatalf("unexpected error occurred: %v", err)
		}
		if tok == nil {
			t.Fatalf("token is nil")
		}
		testToken(t, eTok, tok)
	}
}

func testToken(t *testing.T, expected, actual *Token) {
	t.Helper()
	if actual.ID != expected.ID ||
		actual.Kind != expected.Kind ||
		!bytes.Equal(actual.Match, expected.Match) ||
		actual.EOF != expected.EOF ||
		actual.Invalid != expected.Invalid {
		t.Errorf("unexpected token; want: %v (\"%v\"), got: %v (\"%v\")", expected, string(expected.Match), actual, string(actual.Match))
	}
}
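// ExampleLexer_Next is a minimal usage sketch, not part of the original test
// suite: it shows the compile-then-scan flow that TestLexer_Next exercises,
// using only the API already relied on above (compiler.Compile, NewLexer,
// Lexer.Next, and the Token fields Kind, Match, and EOF). The "word" and "ws"
// entries are illustrative. Because there is no "Output:" comment, `go test`
// compiles this example but does not execute and compare its output.
func ExampleLexer_Next() {
	clspec, err := compiler.Compile(&spec.LexSpec{
		Entries: []*spec.LexEntry{
			newLexEntry("word", "[a-z]+"),
			newLexEntry("ws", " +"),
		},
	})
	if err != nil {
		fmt.Println(err)
		return
	}
	lexer, err := NewLexer(clspec, strings.NewReader("hello world"))
	if err != nil {
		fmt.Println(err)
		return
	}
	// Scan tokens until EOF, printing each token's kind and matched text.
	for {
		tok, err := lexer.Next()
		if err != nil {
			fmt.Println(err)
			return
		}
		if tok.EOF {
			break
		}
		fmt.Printf("%v %q\n", tok.Kind, string(tok.Match))
	}
}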