Add code point expression (Meet RL1.1 of UTS #18)

\u{hex string} matches a character that has the code point represented by the hex string. For instance, \u{3042} matches hiragana あ (U+3042). The hex string must have 4 or 6 digits. This feature meets RL1.1 of UTS #18. RL1.1 Hex Notation: https://unicode.org/reports/tr18/#RL1.1
author: Ryo Nihei <nihei.dev@gmail.com> 2021-04-24 17:52:10 +0900
committer: Ryo Nihei <nihei.dev@gmail.com> 2021-04-24 23:32:55 +0900
commit: 5708644933b364ffbcc625c2010e051ca031e867 (patch)
tree: 53c458938cd685b7c77a3d638920a083effc7dee /compiler/lexer_test.go
parent: Add validation of lexical specs and improve error messages (diff)
download: tre-5708644933b364ffbcc625c2010e051ca031e867.tar.gz
tre-5708644933b364ffbcc625c2010e051ca031e867.tar.xz
1 files changed, 174 insertions, 6 deletions
diff --git a/compiler/lexer_test.go b/compiler/lexer_test.go
index c77d7c7..87e3a81 100644
--- a/compiler/lexer_test.go
+++ b/compiler/lexer_test.go
@@ -30,7 +30,7 @@ func TestLexer(t *testing.T) {
 		},
 		{
 			caption: "lexer can recognize the special characters in default mode",
-			src:     ".*+?|()[",
+			src:     ".*+?|()[\\u",
 			tokens: []*token{
 				newToken(tokenKindAnyChar, nullChar),
 				newToken(tokenKindRepeat, nullChar),
@@ -40,6 +40,7 @@ func TestLexer(t *testing.T) {
 				newToken(tokenKindGroupOpen, nullChar),
 				newToken(tokenKindGroupClose, nullChar),
 				newToken(tokenKindBExpOpen, nullChar),
+				newToken(tokenKindCodePointLeader, nullChar),
 				newToken(tokenKindEOF, nullChar),
 			},
 		},
@@ -60,26 +61,36 @@ func TestLexer(t *testing.T) {
 			},
 		},
 		{
-			caption: "] is treated as an ordinary character in default mode",
-			src:     "]",
+			caption: "], {, and } are treated as an ordinary character in default mode",
+			src:     "]{}",
 			tokens: []*token{
 				newToken(tokenKindChar, ']'),
+				newToken(tokenKindChar, '{'),
+				newToken(tokenKindChar, '}'),
 				newToken(tokenKindEOF, nullChar),
 			},
 		},
 		{
 			caption: "lexer can recognize the special characters in bracket expression mode",
-			src:     "[a-z][^a-z]",
+			src:     "[a-z\\u{09AF}][^a-z\\u{09abcf}]",
 			tokens: []*token{
 				newToken(tokenKindBExpOpen, nullChar),
 				newToken(tokenKindChar, 'a'),
 				newToken(tokenKindCharRange, nullChar),
 				newToken(tokenKindChar, 'z'),
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+				newCodePointToken("09AF"),
+				newToken(tokenKindRBrace, nullChar),
 				newToken(tokenKindBExpClose, nullChar),
 				newToken(tokenKindInverseBExpOpen, nullChar),
 				newToken(tokenKindChar, 'a'),
 				newToken(tokenKindCharRange, nullChar),
 				newToken(tokenKindChar, 'z'),
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+				newCodePointToken("09abcf"),
+				newToken(tokenKindRBrace, nullChar),
 				newToken(tokenKindBExpClose, nullChar),
 				newToken(tokenKindEOF, nullChar),
 			},
@@ -233,6 +244,163 @@ func TestLexer(t *testing.T) {
 			},
 			err: synErrIncompletedEscSeq,
 		},
+		{
+			caption: "lexer can recognize the special characters and code points in code point expression mode",
+			src:     "\\u{0123}\\u{4567}\\u{89abcd}\\u{efAB}\\u{CDEF01}[\\u{0123}\\u{4567}\\u{89abcd}\\u{efAB}\\u{CDEF01}][^\\u{0123}\\u{4567}\\u{89abcd}\\u{efAB}\\u{CDEF01}]",
+			tokens: []*token{
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+				newCodePointToken("0123"),
+				newToken(tokenKindRBrace, nullChar),
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+				newCodePointToken("4567"),
+				newToken(tokenKindRBrace, nullChar),
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+				newCodePointToken("89abcd"),
+				newToken(tokenKindRBrace, nullChar),
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+				newCodePointToken("efAB"),
+				newToken(tokenKindRBrace, nullChar),
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+				newCodePointToken("CDEF01"),
+				newToken(tokenKindRBrace, nullChar),
+
+				newToken(tokenKindBExpOpen, nullChar),
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+				newCodePointToken("0123"),
+				newToken(tokenKindRBrace, nullChar),
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+				newCodePointToken("4567"),
+				newToken(tokenKindRBrace, nullChar),
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+				newCodePointToken("89abcd"),
+				newToken(tokenKindRBrace, nullChar),
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+				newCodePointToken("efAB"),
+				newToken(tokenKindRBrace, nullChar),
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+				newCodePointToken("CDEF01"),
+				newToken(tokenKindRBrace, nullChar),
+				newToken(tokenKindBExpClose, nullChar),
+
+				newToken(tokenKindInverseBExpOpen, nullChar),
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+				newCodePointToken("0123"),
+				newToken(tokenKindRBrace, nullChar),
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+				newCodePointToken("4567"),
+				newToken(tokenKindRBrace, nullChar),
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+				newCodePointToken("89abcd"),
+				newToken(tokenKindRBrace, nullChar),
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+				newCodePointToken("efAB"),
+				newToken(tokenKindRBrace, nullChar),
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+				newCodePointToken("CDEF01"),
+				newToken(tokenKindRBrace, nullChar),
+				newToken(tokenKindBExpClose, nullChar),
+
+				newToken(tokenKindEOF, nullChar),
+			},
+		},
+		{
+			caption: "a one digit hex string isn't a valid code point",
+			src:     "\\u{0",
+			tokens: []*token{
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+			},
+			err: synErrInvalidCodePoint,
+		},
+		{
+			caption: "a two digits hex string isn't a valid code point",
+			src:     "\\u{01",
+			tokens: []*token{
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+			},
+			err: synErrInvalidCodePoint,
+		},
+		{
+			caption: "a three digits hex string isn't a valid code point",
+			src:     "\\u{012",
+			tokens: []*token{
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+			},
+			err: synErrInvalidCodePoint,
+		},
+		{
+			caption: "a four digits hex string is a valid code point",
+			src:     "\\u{0123}",
+			tokens: []*token{
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+				newCodePointToken("0123"),
+				newToken(tokenKindRBrace, nullChar),
+			},
+		},
+		{
+			caption: "a five digits hex string isn't a valid code point",
+			src:     "\\u{01234",
+			tokens: []*token{
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+			},
+			err: synErrInvalidCodePoint,
+		},
+		{
+			caption: "a six digits hex string is a valid code point",
+			src:     "\\u{012345}",
+			tokens: []*token{
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+				newCodePointToken("012345"),
+				newToken(tokenKindRBrace, nullChar),
+			},
+		},
+		{
+			caption: "a seven digits hex string isn't a valid code point",
+			src:     "\\u{0123456",
+			tokens: []*token{
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+			},
+			err: synErrInvalidCodePoint,
+		},
+		{
+			caption: "a code point must be hex digits",
+			src:     "\\u{g",
+			tokens: []*token{
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+			},
+			err: synErrInvalidCodePoint,
+		},
+		{
+			caption: "a code point must be hex digits",
+			src:     "\\u{G",
+			tokens: []*token{
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+			},
+			err: synErrInvalidCodePoint,
+		},
 	}
 	for _, tt := range tests {
 		t.Run(tt.caption, func(t *testing.T) {
@@ -268,7 +436,7 @@ func TestLexer(t *testing.T) {
 
 func testToken(t *testing.T, a, e *token) {
 	t.Helper()
-	if e.kind != a.kind || e.char != a.char {
-		t.Fatalf("unexpected token; want: %v, got: %v", e, a)
+	if e.kind != a.kind || e.char != a.char || e.codePoint != a.codePoint {
+		t.Fatalf("unexpected token; want: %+v, got: %+v", e, a)
 	}
 }
author	Ryo Nihei <nihei.dev@gmail.com>	2021-04-24 17:52:10 +0900
committer	Ryo Nihei <nihei.dev@gmail.com>	2021-04-24 23:32:55 +0900
commit	5708644933b364ffbcc625c2010e051ca031e867 (patch)
tree	53c458938cd685b7c77a3d638920a083effc7dee /compiler/lexer_test.go
parent	Add validation of lexical specs and improve error messages (diff)
download	tre-5708644933b364ffbcc625c2010e051ca031e867.tar.gz tre-5708644933b364ffbcc625c2010e051ca031e867.tar.xz