diff options
author | Ryo Nihei <nihei.dev@gmail.com> | 2021-04-24 17:52:10 +0900 |
---|---|---|
committer | Ryo Nihei <nihei.dev@gmail.com> | 2021-04-24 23:32:55 +0900 |
commit | 5708644933b364ffbcc625c2010e051ca031e867 (patch) | |
tree | 53c458938cd685b7c77a3d638920a083effc7dee /compiler/lexer_test.go | |
parent | Add validation of lexical specs and improve error messages (diff) | |
download | tre-5708644933b364ffbcc625c2010e051ca031e867.tar.gz tre-5708644933b364ffbcc625c2010e051ca031e867.tar.xz |
Add code point expression (Meet RL1.1 of UTS #18)
\u{hex string} matches a character has the code point represented by the hex string.
For instance, \u{3042} matches hiragana あ (U+3042). The hex string must have 4 or 6 digits.
This feature meets RL1.1 of UTS #18.
RL1.1 Hex Notation: https://unicode.org/reports/tr18/#RL1.1
Diffstat (limited to 'compiler/lexer_test.go')
-rw-r--r-- | compiler/lexer_test.go | 180 |
1 files changed, 174 insertions, 6 deletions
diff --git a/compiler/lexer_test.go b/compiler/lexer_test.go index c77d7c7..87e3a81 100644 --- a/compiler/lexer_test.go +++ b/compiler/lexer_test.go @@ -30,7 +30,7 @@ func TestLexer(t *testing.T) { }, { caption: "lexer can recognize the special characters in default mode", - src: ".*+?|()[", + src: ".*+?|()[\\u", tokens: []*token{ newToken(tokenKindAnyChar, nullChar), newToken(tokenKindRepeat, nullChar), @@ -40,6 +40,7 @@ func TestLexer(t *testing.T) { newToken(tokenKindGroupOpen, nullChar), newToken(tokenKindGroupClose, nullChar), newToken(tokenKindBExpOpen, nullChar), + newToken(tokenKindCodePointLeader, nullChar), newToken(tokenKindEOF, nullChar), }, }, @@ -60,26 +61,36 @@ func TestLexer(t *testing.T) { }, }, { - caption: "] is treated as an ordinary character in default mode", - src: "]", + caption: "], {, and } are treated as an ordinary character in default mode", + src: "]{}", tokens: []*token{ newToken(tokenKindChar, ']'), + newToken(tokenKindChar, '{'), + newToken(tokenKindChar, '}'), newToken(tokenKindEOF, nullChar), }, }, { caption: "lexer can recognize the special characters in bracket expression mode", - src: "[a-z][^a-z]", + src: "[a-z\\u{09AF}][^a-z\\u{09abcf}]", tokens: []*token{ newToken(tokenKindBExpOpen, nullChar), newToken(tokenKindChar, 'a'), newToken(tokenKindCharRange, nullChar), newToken(tokenKindChar, 'z'), + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("09AF"), + newToken(tokenKindRBrace, nullChar), newToken(tokenKindBExpClose, nullChar), newToken(tokenKindInverseBExpOpen, nullChar), newToken(tokenKindChar, 'a'), newToken(tokenKindCharRange, nullChar), newToken(tokenKindChar, 'z'), + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("09abcf"), + newToken(tokenKindRBrace, nullChar), newToken(tokenKindBExpClose, nullChar), newToken(tokenKindEOF, nullChar), }, @@ -233,6 +244,163 @@ func TestLexer(t *testing.T) { }, err: synErrIncompletedEscSeq, }, + { + caption: "lexer can recognize the special characters and code points in code point expression mode", + src: "\\u{0123}\\u{4567}\\u{89abcd}\\u{efAB}\\u{CDEF01}[\\u{0123}\\u{4567}\\u{89abcd}\\u{efAB}\\u{CDEF01}][^\\u{0123}\\u{4567}\\u{89abcd}\\u{efAB}\\u{CDEF01}]", + tokens: []*token{ + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("0123"), + newToken(tokenKindRBrace, nullChar), + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("4567"), + newToken(tokenKindRBrace, nullChar), + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("89abcd"), + newToken(tokenKindRBrace, nullChar), + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("efAB"), + newToken(tokenKindRBrace, nullChar), + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("CDEF01"), + newToken(tokenKindRBrace, nullChar), + + newToken(tokenKindBExpOpen, nullChar), + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("0123"), + newToken(tokenKindRBrace, nullChar), + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("4567"), + newToken(tokenKindRBrace, nullChar), + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("89abcd"), + newToken(tokenKindRBrace, nullChar), + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("efAB"), + newToken(tokenKindRBrace, nullChar), + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("CDEF01"), + newToken(tokenKindRBrace, nullChar), + newToken(tokenKindBExpClose, nullChar), + + newToken(tokenKindInverseBExpOpen, nullChar), + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("0123"), + newToken(tokenKindRBrace, nullChar), + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("4567"), + newToken(tokenKindRBrace, nullChar), + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("89abcd"), + newToken(tokenKindRBrace, nullChar), + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("efAB"), + newToken(tokenKindRBrace, nullChar), + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("CDEF01"), + newToken(tokenKindRBrace, nullChar), + newToken(tokenKindBExpClose, nullChar), + + newToken(tokenKindEOF, nullChar), + }, + }, + { + caption: "a one digit hex string isn't a valid code point", + src: "\\u{0", + tokens: []*token{ + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + }, + err: synErrInvalidCodePoint, + }, + { + caption: "a two digits hex string isn't a valid code point", + src: "\\u{01", + tokens: []*token{ + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + }, + err: synErrInvalidCodePoint, + }, + { + caption: "a three digits hex string isn't a valid code point", + src: "\\u{012", + tokens: []*token{ + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + }, + err: synErrInvalidCodePoint, + }, + { + caption: "a four digits hex string is a valid code point", + src: "\\u{0123}", + tokens: []*token{ + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("0123"), + newToken(tokenKindRBrace, nullChar), + }, + }, + { + caption: "a five digits hex string isn't a valid code point", + src: "\\u{01234", + tokens: []*token{ + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + }, + err: synErrInvalidCodePoint, + }, + { + caption: "a six digits hex string is a valid code point", + src: "\\u{012345}", + tokens: []*token{ + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("012345"), + newToken(tokenKindRBrace, nullChar), + }, + }, + { + caption: "a seven digits hex string isn't a valid code point", + src: "\\u{0123456", + tokens: []*token{ + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + }, + err: synErrInvalidCodePoint, + }, + { + caption: "a code point must be hex digits", + src: "\\u{g", + tokens: []*token{ + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + }, + err: synErrInvalidCodePoint, + }, + { + caption: "a code point must be hex digits", + src: "\\u{G", + tokens: []*token{ + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + }, + err: synErrInvalidCodePoint, + }, } for _, tt := range tests { t.Run(tt.caption, func(t *testing.T) { @@ -268,7 +436,7 @@ func TestLexer(t *testing.T) { func testToken(t *testing.T, a, e *token) { t.Helper() - if e.kind != a.kind || e.char != a.char { - t.Fatalf("unexpected token; want: %v, got: %v", e, a) + if e.kind != a.kind || e.char != a.char || e.codePoint != a.codePoint { + t.Fatalf("unexpected token; want: %+v, got: %+v", e, a) } } |