author | Ryo Nihei <nihei.dev@gmail.com> | 2021-02-24 01:30:08 +0900
---|---|---
committer | Ryo Nihei <nihei.dev@gmail.com> | 2021-02-24 01:30:08 +0900
commit | e5fb2fe4f4dfc7dff550b934933b88e9392a6e11 (patch) |
tree | 7ffa58f2106d8b3bbbe931b84f73a9fb5c2b51a1 /driver |
parent | Add + and ? operators (diff) |
download | tre-e5fb2fe4f4dfc7dff550b934933b88e9392a6e11.tar.gz tre-e5fb2fe4f4dfc7dff550b934933b88e9392a6e11.tar.xz |
Add range expression
A range expression [a-z] matches any one character from a to z. The ordering of the characters follows their Unicode code points.
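For illustration, a range expression is declared in a lexical specification like any other pattern. The sketch below mirrors the structure of the test cases in the diff that follows; the import paths are hypothetical, since the diff only shows the package names spec and compiler:

package main

import (
	"fmt"

	// Hypothetical import paths: the diff below only shows the package
	// names "spec" and "compiler", not their locations in this repository.
	"tre/compiler"
	"tre/spec"
)

func main() {
	// [a-z] matches any one character from a to z, ordered by code point.
	lspec := &spec.LexSpec{
		Entries: []*spec.LexEntry{
			spec.NewLexEntry("lowercase", "[a-z]"),
		},
	}
	clspec, err := compiler.Compile(lspec)
	if err != nil {
		fmt.Println(err)
		return
	}
	fmt.Printf("compiled spec: %v\n", clspec != nil)
}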
Diffstat (limited to 'driver')
-rw-r--r-- | driver/lexer_test.go | 261
1 file changed, 260 insertions(+), 1 deletion(-)
diff --git a/driver/lexer_test.go b/driver/lexer_test.go
index 283d5fe..bdf5f03 100644
--- a/driver/lexer_test.go
+++ b/driver/lexer_test.go
@@ -132,6 +132,265 @@ func TestLexer_Next(t *testing.T) {
 				newEOFToken(),
 			},
 		},
+		{
+			lspec: &spec.LexSpec{
+				Entries: []*spec.LexEntry{
+					// all 1 byte characters
+					spec.NewLexEntry("1ByteChar", "[\x00-\x7f]"),
+				},
+			},
+			src: string([]byte{
+				0x00,
+				0x01,
+				0x7e,
+				0x7f,
+			}),
+			tokens: []*Token{
+				newToken(1, "1ByteChar", []byte{0x00}),
+				newToken(1, "1ByteChar", []byte{0x01}),
+				newToken(1, "1ByteChar", []byte{0x7e}),
+				newToken(1, "1ByteChar", []byte{0x7f}),
+				newEOFToken(),
+			},
+		},
+		{
+			lspec: &spec.LexSpec{
+				Entries: []*spec.LexEntry{
+					// all 2 byte characters
+					spec.NewLexEntry("2ByteChar", "[\xc2\x80-\xdf\xbf]"),
+				},
+			},
+			src: string([]byte{
+				0xc2, 0x80,
+				0xc2, 0x81,
+				0xdf, 0xbe,
+				0xdf, 0xbf,
+			}),
+			tokens: []*Token{
+				newToken(1, "2ByteChar", []byte{0xc2, 0x80}),
+				newToken(1, "2ByteChar", []byte{0xc2, 0x81}),
+				newToken(1, "2ByteChar", []byte{0xdf, 0xbe}),
+				newToken(1, "2ByteChar", []byte{0xdf, 0xbf}),
+				newEOFToken(),
+			},
+		},
+		{
+			lspec: &spec.LexSpec{
+				Entries: []*spec.LexEntry{
+					// All bytes are the same.
+					spec.NewLexEntry("3ByteChar", "[\xe0\xa0\x80-\xe0\xa0\x80]"),
+				},
+			},
+			src: string([]byte{
+				0xe0, 0xa0, 0x80,
+			}),
+			tokens: []*Token{
+				newToken(1, "3ByteChar", []byte{0xe0, 0xa0, 0x80}),
+				newEOFToken(),
+			},
+		},
+		{
+			lspec: &spec.LexSpec{
+				Entries: []*spec.LexEntry{
+					// The first two bytes are the same.
+					spec.NewLexEntry("3ByteChar", "[\xe0\xa0\x80-\xe0\xa0\xbf]"),
+				},
+			},
+			src: string([]byte{
+				0xe0, 0xa0, 0x80,
+				0xe0, 0xa0, 0x81,
+				0xe0, 0xa0, 0xbe,
+				0xe0, 0xa0, 0xbf,
+			}),
+			tokens: []*Token{
+				newToken(1, "3ByteChar", []byte{0xe0, 0xa0, 0x80}),
+				newToken(1, "3ByteChar", []byte{0xe0, 0xa0, 0x81}),
+				newToken(1, "3ByteChar", []byte{0xe0, 0xa0, 0xbe}),
+				newToken(1, "3ByteChar", []byte{0xe0, 0xa0, 0xbf}),
+				newEOFToken(),
+			},
+		},
+		{
+			lspec: &spec.LexSpec{
+				Entries: []*spec.LexEntry{
+					// The first byte is the same.
+					spec.NewLexEntry("3ByteChar", "[\xe0\xa0\x80-\xe0\xbf\xbf]"),
+				},
+			},
+			src: string([]byte{
+				0xe0, 0xa0, 0x80,
+				0xe0, 0xa0, 0x81,
+				0xe0, 0xbf, 0xbe,
+				0xe0, 0xbf, 0xbf,
+			}),
+			tokens: []*Token{
+				newToken(1, "3ByteChar", []byte{0xe0, 0xa0, 0x80}),
+				newToken(1, "3ByteChar", []byte{0xe0, 0xa0, 0x81}),
+				newToken(1, "3ByteChar", []byte{0xe0, 0xbf, 0xbe}),
+				newToken(1, "3ByteChar", []byte{0xe0, 0xbf, 0xbf}),
+				newEOFToken(),
+			},
+		},
+		{
+			lspec: &spec.LexSpec{
+				Entries: []*spec.LexEntry{
+					// all 3 byte characters
+					spec.NewLexEntry("3ByteChar", "[\xe0\xa0\x80-\xef\xbf\xbf]"),
+				},
+			},
+			src: string([]byte{
+				0xe0, 0xa0, 0x80,
+				0xe0, 0xa0, 0x81,
+				0xe0, 0xbf, 0xbe,
+				0xe0, 0xbf, 0xbf,
+				0xe1, 0x80, 0x80,
+				0xe1, 0x80, 0x81,
+				0xec, 0xbf, 0xbe,
+				0xec, 0xbf, 0xbf,
+				0xed, 0x80, 0x80,
+				0xed, 0x80, 0x81,
+				0xed, 0x9f, 0xbe,
+				0xed, 0x9f, 0xbf,
+				0xee, 0x80, 0x80,
+				0xee, 0x80, 0x81,
+				0xef, 0xbf, 0xbe,
+				0xef, 0xbf, 0xbf,
+			}),
+			tokens: []*Token{
+				newToken(1, "3ByteChar", []byte{0xe0, 0xa0, 0x80}),
+				newToken(1, "3ByteChar", []byte{0xe0, 0xa0, 0x81}),
+				newToken(1, "3ByteChar", []byte{0xe0, 0xbf, 0xbe}),
+				newToken(1, "3ByteChar", []byte{0xe0, 0xbf, 0xbf}),
+				newToken(1, "3ByteChar", []byte{0xe1, 0x80, 0x80}),
+				newToken(1, "3ByteChar", []byte{0xe1, 0x80, 0x81}),
+				newToken(1, "3ByteChar", []byte{0xec, 0xbf, 0xbe}),
+				newToken(1, "3ByteChar", []byte{0xec, 0xbf, 0xbf}),
+				newToken(1, "3ByteChar", []byte{0xed, 0x80, 0x80}),
+				newToken(1, "3ByteChar", []byte{0xed, 0x80, 0x81}),
+				newToken(1, "3ByteChar", []byte{0xed, 0x9f, 0xbe}),
+				newToken(1, "3ByteChar", []byte{0xed, 0x9f, 0xbf}),
+				newToken(1, "3ByteChar", []byte{0xee, 0x80, 0x80}),
+				newToken(1, "3ByteChar", []byte{0xee, 0x80, 0x81}),
+				newToken(1, "3ByteChar", []byte{0xef, 0xbf, 0xbe}),
+				newToken(1, "3ByteChar", []byte{0xef, 0xbf, 0xbf}),
+				newEOFToken(),
+			},
+		},
+		{
+			lspec: &spec.LexSpec{
+				Entries: []*spec.LexEntry{
+					// All bytes are the same.
+					spec.NewLexEntry("4ByteChar", "[\xf0\x90\x80\x80-\xf0\x90\x80\x80]"),
+				},
+			},
+			src: string([]byte{
+				0xf0, 0x90, 0x80, 0x80,
+			}),
+			tokens: []*Token{
+				newToken(1, "4ByteChar", []byte{0xf0, 0x90, 0x80, 0x80}),
+				newEOFToken(),
+			},
+		},
+		{
+			lspec: &spec.LexSpec{
+				Entries: []*spec.LexEntry{
+					// The first 3 bytes are the same.
+					spec.NewLexEntry("4ByteChar", "[\xf0\x90\x80\x80-\xf0\x90\x80\xbf]"),
+				},
+			},
+			src: string([]byte{
+				0xf0, 0x90, 0x80, 0x80,
+				0xf0, 0x90, 0x80, 0x81,
+				0xf0, 0x90, 0x80, 0xbe,
+				0xf0, 0x90, 0x80, 0xbf,
+			}),
+			tokens: []*Token{
+				newToken(1, "4ByteChar", []byte{0xf0, 0x90, 0x80, 0x80}),
+				newToken(1, "4ByteChar", []byte{0xf0, 0x90, 0x80, 0x81}),
+				newToken(1, "4ByteChar", []byte{0xf0, 0x90, 0x80, 0xbe}),
+				newToken(1, "4ByteChar", []byte{0xf0, 0x90, 0x80, 0xbf}),
+				newEOFToken(),
+			},
+		},
+		{
+			lspec: &spec.LexSpec{
+				Entries: []*spec.LexEntry{
+					// The first 2 bytes are the same.
+					spec.NewLexEntry("4ByteChar", "[\xf0\x90\x80\x80-\xf0\x90\xbf\xbf]"),
+				},
+			},
+			src: string([]byte{
+				0xf0, 0x90, 0x80, 0x80,
+				0xf0, 0x90, 0x80, 0x81,
+				0xf0, 0x90, 0xbf, 0xbe,
+				0xf0, 0x90, 0xbf, 0xbf,
+			}),
+			tokens: []*Token{
+				newToken(1, "4ByteChar", []byte{0xf0, 0x90, 0x80, 0x80}),
+				newToken(1, "4ByteChar", []byte{0xf0, 0x90, 0x80, 0x81}),
+				newToken(1, "4ByteChar", []byte{0xf0, 0x90, 0xbf, 0xbe}),
+				newToken(1, "4ByteChar", []byte{0xf0, 0x90, 0xbf, 0xbf}),
+				newEOFToken(),
+			},
+		},
+		{
+			lspec: &spec.LexSpec{
+				Entries: []*spec.LexEntry{
+					// The first byte is the same.
+					spec.NewLexEntry("4ByteChar", "[\xf0\x90\x80\x80-\xf0\xbf\xbf\xbf]"),
+				},
+			},
+			src: string([]byte{
+				0xf0, 0x90, 0x80, 0x80,
+				0xf0, 0x90, 0x80, 0x81,
+				0xf0, 0xbf, 0xbf, 0xbe,
+				0xf0, 0xbf, 0xbf, 0xbf,
+			}),
+			tokens: []*Token{
+				newToken(1, "4ByteChar", []byte{0xf0, 0x90, 0x80, 0x80}),
+				newToken(1, "4ByteChar", []byte{0xf0, 0x90, 0x80, 0x81}),
+				newToken(1, "4ByteChar", []byte{0xf0, 0xbf, 0xbf, 0xbe}),
+				newToken(1, "4ByteChar", []byte{0xf0, 0xbf, 0xbf, 0xbf}),
+				newEOFToken(),
+			},
+		},
+		{
+			lspec: &spec.LexSpec{
+				Entries: []*spec.LexEntry{
+					// all 4 byte characters
+					spec.NewLexEntry("4ByteChar", "[\xf0\x90\x80\x80-\xf4\x8f\xbf\xbf]"),
+				},
+			},
+			src: string([]byte{
+				0xf0, 0x90, 0x80, 0x80,
+				0xf0, 0x90, 0x80, 0x81,
+				0xf0, 0xbf, 0xbf, 0xbe,
+				0xf0, 0xbf, 0xbf, 0xbf,
+				0xf1, 0x80, 0x80, 0x80,
+				0xf1, 0x80, 0x80, 0x81,
+				0xf3, 0xbf, 0xbf, 0xbe,
+				0xf3, 0xbf, 0xbf, 0xbf,
+				0xf4, 0x80, 0x80, 0x80,
+				0xf4, 0x80, 0x80, 0x81,
+				0xf4, 0x8f, 0xbf, 0xbe,
+				0xf4, 0x8f, 0xbf, 0xbf,
+			}),
+			tokens: []*Token{
+				newToken(1, "4ByteChar", []byte{0xf0, 0x90, 0x80, 0x80}),
+				newToken(1, "4ByteChar", []byte{0xf0, 0x90, 0x80, 0x81}),
+				newToken(1, "4ByteChar", []byte{0xf0, 0xbf, 0xbf, 0xbe}),
+				newToken(1, "4ByteChar", []byte{0xf0, 0xbf, 0xbf, 0xbf}),
+				newToken(1, "4ByteChar", []byte{0xf1, 0x80, 0x80, 0x80}),
+				newToken(1, "4ByteChar", []byte{0xf1, 0x80, 0x80, 0x81}),
+				newToken(1, "4ByteChar", []byte{0xf3, 0xbf, 0xbf, 0xbe}),
+				newToken(1, "4ByteChar", []byte{0xf3, 0xbf, 0xbf, 0xbf}),
+				newToken(1, "4ByteChar", []byte{0xf4, 0x80, 0x80, 0x80}),
+				newToken(1, "4ByteChar", []byte{0xf4, 0x80, 0x80, 0x81}),
+				newToken(1, "4ByteChar", []byte{0xf4, 0x8f, 0xbf, 0xbe}),
+				newToken(1, "4ByteChar", []byte{0xf4, 0x8f, 0xbf, 0xbf}),
+				newEOFToken(),
+			},
+		},
 	}
 	for _, tt := range test {
 		clspec, err := compiler.Compile(tt.lspec)
@@ -149,7 +408,7 @@ func TestLexer_Next(t *testing.T) {
 				break
 			}
 			testToken(t, eTok, tok)
-			t.Logf("token: ID: %v, Match: %+v Text: \"%v\", EOF: %v, Invalid: %v", tok.ID, tok.Match, string(tok.Match), tok.EOF, tok.Invalid)
+			// t.Logf("token: ID: %v, Match: %+v Text: \"%v\", EOF: %v, Invalid: %v", tok.ID, tok.Match, string(tok.Match), tok.EOF, tok.Invalid)
 			if tok.EOF {
 				break
 			}