author | Ryo Nihei <nihei.dev@gmail.com> | 2021-02-24 01:30:08 +0900
---|---|---
committer | Ryo Nihei <nihei.dev@gmail.com> | 2021-02-24 01:30:08 +0900
commit | e5fb2fe4f4dfc7dff550b934933b88e9392a6e11 (patch) |
tree | 7ffa58f2106d8b3bbbe931b84f73a9fb5c2b51a1 /driver |
parent | Add + and ? operators (diff) |
download | tre-e5fb2fe4f4dfc7dff550b934933b88e9392a6e11.tar.gz tre-e5fb2fe4f4dfc7dff550b934933b88e9392a6e11.tar.xz |
Add range expression
A range expression [a-z] matches any one character from a to z. The ordering of the characters follows their Unicode code points.
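For illustration, a range expression is declared in a lexical specification like any other pattern. The sketch below mirrors the structure of the test cases in the diff that follows; the import paths are hypothetical, since the diff only shows the package names spec and compiler:

package main

import (
	"fmt"

	// Hypothetical import paths: the diff below only shows the package
	// names "spec" and "compiler", not their locations in this repository.
	"tre/compiler"
	"tre/spec"
)

func main() {
	// [a-z] matches any one character from a to z, ordered by code point.
	lspec := &spec.LexSpec{
		Entries: []*spec.LexEntry{
			spec.NewLexEntry("lowercase", "[a-z]"),
		},
	}
	clspec, err := compiler.Compile(lspec)
	if err != nil {
		fmt.Println(err)
		return
	}
	fmt.Printf("compiled spec: %v\n", clspec != nil)
}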
Diffstat (limited to 'driver')
-rw-r--r-- | driver/lexer_test.go | 261
1 file changed, 260 insertions(+), 1 deletion(-)
diff --git a/driver/lexer_test.go b/driver/lexer_test.go
index 283d5fe..bdf5f03 100644
--- a/driver/lexer_test.go
+++ b/driver/lexer_test.go
@@ -132,6 +132,265 @@ func TestLexer_Next(t *testing.T) {
 				newEOFToken(),
 			},
 		},
+		{
+			lspec: &spec.LexSpec{
+				Entries: []*spec.LexEntry{
+					// all 1 byte characters
+					spec.NewLexEntry("1ByteChar", "[\x00-\x7f]"),
+				},
+			},
+			src: string([]byte{
+				0x00,
+				0x01,
+				0x7e,
+				0x7f,
+			}),
+			tokens: []*Token{
+				newToken(1, "1ByteChar", []byte{0x00}),
+				newToken(1, "1ByteChar", []byte{0x01}),
+				newToken(1, "1ByteChar", []byte{0x7e}),
+				newToken(1, "1ByteChar", []byte{0x7f}),
+				newEOFToken(),
+			},
+		},
+		{
+			lspec: &spec.LexSpec{
+				Entries: []*spec.LexEntry{
+					// all 2 byte characters
+					spec.NewLexEntry("2ByteChar", "[\xc2\x80-\xdf\xbf]"),
+				},
+			},
+			src: string([]byte{
+				0xc2, 0x80,
+				0xc2, 0x81,
+				0xdf, 0xbe,
+				0xdf, 0xbf,
+			}),
+			tokens: []*Token{
+				newToken(1, "2ByteChar", []byte{0xc2, 0x80}),
+				newToken(1, "2ByteChar", []byte{0xc2, 0x81}),
+				newToken(1, "2ByteChar", []byte{0xdf, 0xbe}),
+				newToken(1, "2ByteChar", []byte{0xdf, 0xbf}),
+				newEOFToken(),
+			},
+		},
+		{
+			lspec: &spec.LexSpec{
+				Entries: []*spec.LexEntry{
+					// All bytes are the same.
+					spec.NewLexEntry("3ByteChar", "[\xe0\xa0\x80-\xe0\xa0\x80]"),
+				},
+			},
+			src: string([]byte{
+				0xe0, 0xa0, 0x80,
+			}),
+			tokens: []*Token{
+				newToken(1, "3ByteChar", []byte{0xe0, 0xa0, 0x80}),
+				newEOFToken(),
+			},
+		},
+		{
+			lspec: &spec.LexSpec{
+				Entries: []*spec.LexEntry{
+					// The first two bytes are the same.
+					spec.NewLexEntry("3ByteChar", "[\xe0\xa0\x80-\xe0\xa0\xbf]"),
+				},
+			},
+			src: string([]byte{
+				0xe0, 0xa0, 0x80,
+				0xe0, 0xa0, 0x81,
+				0xe0, 0xa0, 0xbe,
+				0xe0, 0xa0, 0xbf,
+			}),
+			tokens: []*Token{
+				newToken(1, "3ByteChar", []byte{0xe0, 0xa0, 0x80}),
+				newToken(1, "3ByteChar", []byte{0xe0, 0xa0, 0x81}),
+				newToken(1, "3ByteChar", []byte{0xe0, 0xa0, 0xbe}),
+				newToken(1, "3ByteChar", []byte{0xe0, 0xa0, 0xbf}),
+				newEOFToken(),
+			},
+		},
+		{
+			lspec: &spec.LexSpec{
+				Entries: []*spec.LexEntry{
+					// The first byte is the same.
+					spec.NewLexEntry("3ByteChar", "[\xe0\xa0\x80-\xe0\xbf\xbf]"),
+				},
+			},
+			src: string([]byte{
+				0xe0, 0xa0, 0x80,
+				0xe0, 0xa0, 0x81,
+				0xe0, 0xbf, 0xbe,
+				0xe0, 0xbf, 0xbf,
+			}),
+			tokens: []*Token{
+				newToken(1, "3ByteChar", []byte{0xe0, 0xa0, 0x80}),
+				newToken(1, "3ByteChar", []byte{0xe0, 0xa0, 0x81}),
+				newToken(1, "3ByteChar", []byte{0xe0, 0xbf, 0xbe}),
+				newToken(1, "3ByteChar", []byte{0xe0, 0xbf, 0xbf}),
+				newEOFToken(),
+			},
+		},
+		{
+			lspec: &spec.LexSpec{
+				Entries: []*spec.LexEntry{
+					// all 3 byte characters
+					spec.NewLexEntry("3ByteChar", "[\xe0\xa0\x80-\xef\xbf\xbf]"),
+				},
+			},
+			src: string([]byte{
+				0xe0, 0xa0, 0x80,
+				0xe0, 0xa0, 0x81,
+				0xe0, 0xbf, 0xbe,
+				0xe0, 0xbf, 0xbf,
+				0xe1, 0x80, 0x80,
+				0xe1, 0x80, 0x81,
+				0xec, 0xbf, 0xbe,
+				0xec, 0xbf, 0xbf,
+				0xed, 0x80, 0x80,
+				0xed, 0x80, 0x81,
+				0xed, 0x9f, 0xbe,
+				0xed, 0x9f, 0xbf,
+				0xee, 0x80, 0x80,
+				0xee, 0x80, 0x81,
+				0xef, 0xbf, 0xbe,
+				0xef, 0xbf, 0xbf,
+			}),
+			tokens: []*Token{
+				newToken(1, "3ByteChar", []byte{0xe0, 0xa0, 0x80}),
+				newToken(1, "3ByteChar", []byte{0xe0, 0xa0, 0x81}),
+				newToken(1, "3ByteChar", []byte{0xe0, 0xbf, 0xbe}),
+				newToken(1, "3ByteChar", []byte{0xe0, 0xbf, 0xbf}),
+				newToken(1, "3ByteChar", []byte{0xe1, 0x80, 0x80}),
+				newToken(1, "3ByteChar", []byte{0xe1, 0x80, 0x81}),
+				newToken(1, "3ByteChar", []byte{0xec, 0xbf, 0xbe}),
+				newToken(1, "3ByteChar", []byte{0xec, 0xbf, 0xbf}),
+				newToken(1, "3ByteChar", []byte{0xed, 0x80, 0x80}),
+				newToken(1, "3ByteChar", []byte{0xed, 0x80, 0x81}),
+				newToken(1, "3ByteChar", []byte{0xed, 0x9f, 0xbe}),
+				newToken(1, "3ByteChar", []byte{0xed, 0x9f, 0xbf}),
+				newToken(1, "3ByteChar", []byte{0xee, 0x80, 0x80}),
+				newToken(1, "3ByteChar", []byte{0xee, 0x80, 0x81}),
+				newToken(1, "3ByteChar", []byte{0xef, 0xbf, 0xbe}),
+				newToken(1, "3ByteChar", []byte{0xef, 0xbf, 0xbf}),
+				newEOFToken(),
+			},
+		},
+		{
+			lspec: &spec.LexSpec{
+				Entries: []*spec.LexEntry{
+					// All bytes are the same.
+					spec.NewLexEntry("4ByteChar", "[\xf0\x90\x80\x80-\xf0\x90\x80\x80]"),
+				},
+			},
+			src: string([]byte{
+				0xf0, 0x90, 0x80, 0x80,
+			}),
+			tokens: []*Token{
+				newToken(1, "4ByteChar", []byte{0xf0, 0x90, 0x80, 0x80}),
+				newEOFToken(),
+			},
+		},
+		{
+			lspec: &spec.LexSpec{
+				Entries: []*spec.LexEntry{
+					// The first 3 bytes are the same.
+					spec.NewLexEntry("4ByteChar", "[\xf0\x90\x80\x80-\xf0\x90\x80\xbf]"),
+				},
+			},
+			src: string([]byte{
+				0xf0, 0x90, 0x80, 0x80,
+				0xf0, 0x90, 0x80, 0x81,
+				0xf0, 0x90, 0x80, 0xbe,
+				0xf0, 0x90, 0x80, 0xbf,
+			}),
+			tokens: []*Token{
+				newToken(1, "4ByteChar", []byte{0xf0, 0x90, 0x80, 0x80}),
+				newToken(1, "4ByteChar", []byte{0xf0, 0x90, 0x80, 0x81}),
+				newToken(1, "4ByteChar", []byte{0xf0, 0x90, 0x80, 0xbe}),
+				newToken(1, "4ByteChar", []byte{0xf0, 0x90, 0x80, 0xbf}),
+				newEOFToken(),
+			},
+		},
+		{
+			lspec: &spec.LexSpec{
+				Entries: []*spec.LexEntry{
+					// The first 2 bytes are the same.
+					spec.NewLexEntry("4ByteChar", "[\xf0\x90\x80\x80-\xf0\x90\xbf\xbf]"),
+				},
+			},
+			src: string([]byte{
+				0xf0, 0x90, 0x80, 0x80,
+				0xf0, 0x90, 0x80, 0x81,
+				0xf0, 0x90, 0xbf, 0xbe,
+				0xf0, 0x90, 0xbf, 0xbf,
+			}),
+			tokens: []*Token{
+				newToken(1, "4ByteChar", []byte{0xf0, 0x90, 0x80, 0x80}),
+				newToken(1, "4ByteChar", []byte{0xf0, 0x90, 0x80, 0x81}),
+				newToken(1, "4ByteChar", []byte{0xf0, 0x90, 0xbf, 0xbe}),
+				newToken(1, "4ByteChar", []byte{0xf0, 0x90, 0xbf, 0xbf}),
+				newEOFToken(),
+			},
+		},
+		{
+			lspec: &spec.LexSpec{
+				Entries: []*spec.LexEntry{
+					// The first byte is the same.
+					spec.NewLexEntry("4ByteChar", "[\xf0\x90\x80\x80-\xf0\xbf\xbf\xbf]"),
+				},
+			},
+			src: string([]byte{
+				0xf0, 0x90, 0x80, 0x80,
+				0xf0, 0x90, 0x80, 0x81,
+				0xf0, 0xbf, 0xbf, 0xbe,
+				0xf0, 0xbf, 0xbf, 0xbf,
+			}),
+			tokens: []*Token{
+				newToken(1, "4ByteChar", []byte{0xf0, 0x90, 0x80, 0x80}),
+				newToken(1, "4ByteChar", []byte{0xf0, 0x90, 0x80, 0x81}),
+				newToken(1, "4ByteChar", []byte{0xf0, 0xbf, 0xbf, 0xbe}),
+				newToken(1, "4ByteChar", []byte{0xf0, 0xbf, 0xbf, 0xbf}),
+				newEOFToken(),
+			},
+		},
+		{
+			lspec: &spec.LexSpec{
+				Entries: []*spec.LexEntry{
+					// all 4 byte characters
+					spec.NewLexEntry("4ByteChar", "[\xf0\x90\x80\x80-\xf4\x8f\xbf\xbf]"),
+				},
+			},
+			src: string([]byte{
+				0xf0, 0x90, 0x80, 0x80,
+				0xf0, 0x90, 0x80, 0x81,
+				0xf0, 0xbf, 0xbf, 0xbe,
+				0xf0, 0xbf, 0xbf, 0xbf,
+				0xf1, 0x80, 0x80, 0x80,
+				0xf1, 0x80, 0x80, 0x81,
+				0xf3, 0xbf, 0xbf, 0xbe,
+				0xf3, 0xbf, 0xbf, 0xbf,
+				0xf4, 0x80, 0x80, 0x80,
+				0xf4, 0x80, 0x80, 0x81,
+				0xf4, 0x8f, 0xbf, 0xbe,
+				0xf4, 0x8f, 0xbf, 0xbf,
+			}),
+			tokens: []*Token{
+				newToken(1, "4ByteChar", []byte{0xf0, 0x90, 0x80, 0x80}),
+				newToken(1, "4ByteChar", []byte{0xf0, 0x90, 0x80, 0x81}),
+				newToken(1, "4ByteChar", []byte{0xf0, 0xbf, 0xbf, 0xbe}),
+				newToken(1, "4ByteChar", []byte{0xf0, 0xbf, 0xbf, 0xbf}),
+				newToken(1, "4ByteChar", []byte{0xf1, 0x80, 0x80, 0x80}),
+				newToken(1, "4ByteChar", []byte{0xf1, 0x80, 0x80, 0x81}),
+				newToken(1, "4ByteChar", []byte{0xf3, 0xbf, 0xbf, 0xbe}),
+				newToken(1, "4ByteChar", []byte{0xf3, 0xbf, 0xbf, 0xbf}),
+				newToken(1, "4ByteChar", []byte{0xf4, 0x80, 0x80, 0x80}),
+				newToken(1, "4ByteChar", []byte{0xf4, 0x80, 0x80, 0x81}),
+				newToken(1, "4ByteChar", []byte{0xf4, 0x8f, 0xbf, 0xbe}),
+				newToken(1, "4ByteChar", []byte{0xf4, 0x8f, 0xbf, 0xbf}),
+				newEOFToken(),
+			},
+		},
 	}
 	for _, tt := range test {
 		clspec, err := compiler.Compile(tt.lspec)
@@ -149,7 +408,7 @@ func TestLexer_Next(t *testing.T) {
 				break
 			}
 			testToken(t, eTok, tok)
-			t.Logf("token: ID: %v, Match: %+v Text: \"%v\", EOF: %v, Invalid: %v", tok.ID, tok.Match, string(tok.Match), tok.EOF, tok.Invalid)
+			// t.Logf("token: ID: %v, Match: %+v Text: \"%v\", EOF: %v, Invalid: %v", tok.ID, tok.Match, string(tok.Match), tok.EOF, tok.Invalid)
 			if tok.EOF {
 				break
 			}