From 5708644933b364ffbcc625c2010e051ca031e867 Mon Sep 17 00:00:00 2001
From: Ryo Nihei <nihei.dev@gmail.com>
Date: Sat, 24 Apr 2021 17:52:10 +0900
Subject: Add code point expression (Meet RL1.1 of UTS #18)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

\u{hex string} matches a character that has the code point represented by the hex string.
For instance, \u{3042} matches hiragana あ (U+3042). The hex string must have 4 or 6 digits.
This feature meets RL1.1 of UTS #18.

RL1.1 Hex Notation: https://unicode.org/reports/tr18/#RL1.1
---
 driver/lexer_test.go | 37 +++++++++++++++++++++++++++++++++++--
 1 file changed, 35 insertions(+), 2 deletions(-)

(limited to 'driver')

diff --git a/driver/lexer_test.go b/driver/lexer_test.go
index 68830a5..26b5d49 100644
--- a/driver/lexer_test.go
+++ b/driver/lexer_test.go
@@ -143,12 +143,12 @@ func TestLexer_Next(t *testing.T) {
 		{
 			lspec: &spec.LexSpec{
 				Entries: []*spec.LexEntry{
-					// all 1 byte characters
+					// all 1 byte characters except null character (U+0000)
 					//
 					// NOTE:
 					// maleeni cannot handle the null character in patterns because compiler.lexer,
 					// specifically read() and restore(), recognizes the null characters as that a symbol doesn't exist.
-					// There is room for improvement in this behavior of the lexer.
+					// If a pattern needs a null character, use the code point expression \u{0000}.
 					newLexEntry("1ByteChar", "[\x01-\x7f]"),
 				},
 			},
@@ -416,6 +416,39 @@ func TestLexer_Next(t *testing.T) {
 				newEOFToken(),
 			},
 		},
+		{
+			lspec: &spec.LexSpec{
+				Entries: []*spec.LexEntry{
+					newLexEntry("1ByteChar", "\\u{006E}"),
+					newLexEntry("2ByteChar", "\\u{03BD}"),
+					newLexEntry("3ByteChar", "\\u{306B}"),
+					newLexEntry("4ByteChar", "\\u{01F638}"),
+				},
+			},
+			src: "nνに😸",
+			tokens: []*Token{
+				newToken(1, "1ByteChar", newByteSequence([]byte{0x6E})),
+				newToken(2, "2ByteChar", newByteSequence([]byte{0xCE, 0xBD})),
+				newToken(3, "3ByteChar", newByteSequence([]byte{0xE3, 0x81, 0xAB})),
+				newToken(4, "4ByteChar", newByteSequence([]byte{0xF0, 0x9F, 0x98, 0xB8})),
+				newEOFToken(),
+			},
+		},
+		{
+			lspec: &spec.LexSpec{
+				Entries: []*spec.LexEntry{
+					newLexEntry("codePointsAlt", "[\\u{006E}\\u{03BD}\\u{306B}\\u{01F638}]"),
+				},
+			},
+			src: "nνに😸",
+			tokens: []*Token{
+				newToken(1, "codePointsAlt", newByteSequence([]byte{0x6E})),
+				newToken(1, "codePointsAlt", newByteSequence([]byte{0xCE, 0xBD})),
+				newToken(1, "codePointsAlt", newByteSequence([]byte{0xE3, 0x81, 0xAB})),
+				newToken(1, "codePointsAlt", newByteSequence([]byte{0xF0, 0x9F, 0x98, 0xB8})),
+				newEOFToken(),
+			},
+		},
 	}
 	for i, tt := range test {
 		t.Run(fmt.Sprintf("#%v", i), func(t *testing.T) {
-- 
cgit v1.2.3