From 5708644933b364ffbcc625c2010e051ca031e867 Mon Sep 17 00:00:00 2001 From: Ryo Nihei Date: Sat, 24 Apr 2021 17:52:10 +0900 Subject: Add code point expression (Meet RL1.1 of UTS #18) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit \u{hex string} matches a character that has the code point represented by the hex string. For instance, \u{3042} matches hiragana あ (U+3042). The hex string must have 4 or 6 digits. This feature meets RL1.1 of UTS #18. RL1.1 Hex Notation: https://unicode.org/reports/tr18/#RL1.1 --- driver/lexer_test.go | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) (limited to 'driver') diff --git a/driver/lexer_test.go b/driver/lexer_test.go index 68830a5..26b5d49 100644 --- a/driver/lexer_test.go +++ b/driver/lexer_test.go @@ -143,12 +143,12 @@ func TestLexer_Next(t *testing.T) { { lspec: &spec.LexSpec{ Entries: []*spec.LexEntry{ - // all 1 byte characters + // all 1 byte characters except null character (U+0000) // // NOTE: // maleeni cannot handle the null character in patterns because compiler.lexer, // specifically read() and restore(), recognizes the null characters as that a symbol doesn't exist. - // There is room for improvement in this behavior of the lexer. + // If a pattern needs a null character, use code point expression \u{0000}. 
newLexEntry("1ByteChar", "[\x01-\x7f]"), }, }, @@ -416,6 +416,39 @@ func TestLexer_Next(t *testing.T) { newEOFToken(), }, }, + { + lspec: &spec.LexSpec{ + Entries: []*spec.LexEntry{ + newLexEntry("1ByteChar", "\\u{006E}"), + newLexEntry("2ByteChar", "\\u{03BD}"), + newLexEntry("3ByteChar", "\\u{306B}"), + newLexEntry("4ByteChar", "\\u{01F638}"), + }, + }, + src: "nνに😸", + tokens: []*Token{ + newToken(1, "1ByteChar", newByteSequence([]byte{0x6E})), + newToken(2, "2ByteChar", newByteSequence([]byte{0xCE, 0xBD})), + newToken(3, "3ByteChar", newByteSequence([]byte{0xE3, 0x81, 0xAB})), + newToken(4, "4ByteChar", newByteSequence([]byte{0xF0, 0x9F, 0x98, 0xB8})), + newEOFToken(), + }, + }, + { + lspec: &spec.LexSpec{ + Entries: []*spec.LexEntry{ + newLexEntry("codePointsAlt", "[\\u{006E}\\u{03BD}\\u{306B}\\u{01F638}]"), + }, + }, + src: "nνに😸", + tokens: []*Token{ + newToken(1, "codePointsAlt", newByteSequence([]byte{0x6E})), + newToken(1, "codePointsAlt", newByteSequence([]byte{0xCE, 0xBD})), + newToken(1, "codePointsAlt", newByteSequence([]byte{0xE3, 0x81, 0xAB})), + newToken(1, "codePointsAlt", newByteSequence([]byte{0xF0, 0x9F, 0x98, 0xB8})), + newEOFToken(), + }, + }, } for i, tt := range test { t.Run(fmt.Sprintf("#%v", i), func(t *testing.T) { -- cgit v1.2.3