From a1d1cfe08ae809d454ac6f1ce80a19395e7940e5 Mon Sep 17 00:00:00 2001 From: Ryo Nihei Date: Sun, 14 Feb 2021 17:38:46 +0900 Subject: Add dot symbol matching any single character The dot symbol matches any single character. When the dot symbol appears, the parser generates an AST matching all of the well-formed UTF-8 byte sequences. References: * https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf#G7404 * Table 3-6. UTF-8 Bit Distribution * Table 3-7. Well-Formed UTF-8 Byte Sequences --- compiler/lexer.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'compiler/lexer.go') diff --git a/compiler/lexer.go b/compiler/lexer.go index f78b920..1c09260 100644 --- a/compiler/lexer.go +++ b/compiler/lexer.go @@ -10,6 +10,7 @@ type tokenKind string const ( tokenKindChar = tokenKind("char") + tokenKindAnyChar = tokenKind(".") tokenKindRepeat = tokenKind("*") tokenKindAlt = tokenKind("|") tokenKindGroupOpen = tokenKind("(") @@ -59,6 +60,8 @@ func (l *lexer) next() (*token, error) { switch c { case '*': return newToken(tokenKindRepeat, nullChar), nil + case '.': + return newToken(tokenKindAnyChar, nullChar), nil case '|': return newToken(tokenKindAlt, nullChar), nil case '(': @@ -76,7 +79,7 @@ func (l *lexer) next() (*token, error) { } } switch { - case c == '\\' || c == '*' || c == '|' || c == '(' || c == ')': + case c == '\\' || c == '.' || c == '*' || c == '|' || c == '(' || c == ')': return newToken(tokenKindChar, c), nil default: return nil, &SyntaxError{ -- cgit v1.2.3