From 5708644933b364ffbcc625c2010e051ca031e867 Mon Sep 17 00:00:00 2001 From: Ryo Nihei Date: Sat, 24 Apr 2021 17:52:10 +0900 Subject: Add code point expression (Meet RL1.1 of UTS #18) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit \u{hex string} matches a character that has the code point represented by the hex string. For instance, \u{3042} matches hiragana あ (U+3042). The hex string must have 4 or 6 digits. This feature meets RL1.1 of UTS #18. RL1.1 Hex Notation: https://unicode.org/reports/tr18/#RL1.1 --- compiler/lexer.go | 128 ++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 120 insertions(+), 8 deletions(-) (limited to 'compiler/lexer.go') diff --git a/compiler/lexer.go b/compiler/lexer.go index c1aa67e..5987e44 100644 --- a/compiler/lexer.go +++ b/compiler/lexer.go @@ -4,6 +4,7 @@ import ( "bufio" "fmt" "io" + "strings" ) type tokenKind string @@ -21,12 +22,17 @@ const ( tokenKindInverseBExpOpen = tokenKind("[^") tokenKindBExpClose = tokenKind("]") tokenKindCharRange = tokenKind("-") + tokenKindCodePointLeader = tokenKind("\\u") + tokenKindLBrace = tokenKind("{") + tokenKindRBrace = tokenKind("}") + tokenKindCodePoint = tokenKind("code point") tokenKindEOF = tokenKind("eof") ) type token struct { - kind tokenKind - char rune + kind tokenKind + char rune + codePoint string } const nullChar = '\u0000' @@ -38,13 +44,45 @@ func newToken(kind tokenKind, char rune) *token { } } +func newCodePointToken(codePoint string) *token { + return &token{ + kind: tokenKindCodePoint, + codePoint: codePoint, + } +} + type lexerMode string const ( lexerModeDefault = lexerMode("default") lexerModeBExp = lexerMode("bracket expression") + lexerModeCPExp = lexerMode("code point expression") ) +type lexerModeStack struct { + stack []lexerMode +} + +func newLexerModeStack() *lexerModeStack { + return &lexerModeStack{ + stack: []lexerMode{ + lexerModeDefault, + }, + } +} + +func (s *lexerModeStack) top() lexerMode { + return 
s.stack[len(s.stack)-1] +} + +func (s *lexerModeStack) push(m lexerMode) { + s.stack = append(s.stack, m) +} + +func (s *lexerModeStack) pop() { + s.stack = s.stack[:len(s.stack)-1] +} + type rangeState string // [a-z] @@ -71,7 +109,7 @@ type lexer struct { prevEOF1 bool prevChar2 rune pervEOF2 bool - mode lexerMode + modeStack *lexerModeStack rangeState rangeState errMsgDetails string } @@ -89,7 +127,7 @@ func newLexer(src io.Reader) *lexer { prevEOF1: false, prevChar2: nullChar, pervEOF2: false, - mode: lexerModeDefault, + modeStack: newLexerModeStack(), rangeState: rangeStateReady, } } @@ -103,7 +141,7 @@ func (l *lexer) next() (*token, error) { return newToken(tokenKindEOF, nullChar), nil } - switch l.mode { + switch l.modeStack.top() { case lexerModeBExp: tok, err := l.nextInBExp(c) if err != nil { @@ -111,7 +149,7 @@ func (l *lexer) next() (*token, error) { } switch tok.kind { case tokenKindBExpClose: - l.mode = lexerModeDefault + l.modeStack.pop() case tokenKindCharRange: l.rangeState = rangeStateExpectRangeTerminator case tokenKindChar: @@ -121,6 +159,18 @@ func (l *lexer) next() (*token, error) { case rangeStateExpectRangeTerminator: l.rangeState = rangeStateReady } + case tokenKindCodePointLeader: + l.modeStack.push(lexerModeCPExp) + } + return tok, nil + case lexerModeCPExp: + tok, err := l.nextInCodePoint(c) + if err != nil { + return nil, err + } + switch tok.kind { + case tokenKindRBrace: + l.modeStack.pop() } return tok, nil default: @@ -130,11 +180,13 @@ func (l *lexer) next() (*token, error) { } switch tok.kind { case tokenKindBExpOpen: - l.mode = lexerModeBExp + l.modeStack.push(lexerModeBExp) l.rangeState = rangeStateReady case tokenKindInverseBExpOpen: - l.mode = lexerModeBExp + l.modeStack.push(lexerModeBExp) l.rangeState = rangeStateReady + case tokenKindCodePointLeader: + l.modeStack.push(lexerModeCPExp) } return tok, nil } @@ -210,6 +262,9 @@ func (l *lexer) nextInDefault(c rune) (*token, error) { if eof { return nil, synErrIncompletedEscSeq 
} + if c == 'u' { + return newToken(tokenKindCodePointLeader, nullChar), nil + } if c == '\\' || c == '.' || c == '*' || c == '+' || c == '?' || c == '|' || c == '(' || c == ')' || c == '[' || c == ']' { return newToken(tokenKindChar, c), nil } @@ -259,6 +314,9 @@ func (l *lexer) nextInBExp(c rune) (*token, error) { if eof { return nil, synErrIncompletedEscSeq } + if c == 'u' { + return newToken(tokenKindCodePointLeader, nullChar), nil + } if c == '\\' || c == '^' || c == '-' || c == ']' { return newToken(tokenKindChar, c), nil } @@ -269,6 +327,60 @@ func (l *lexer) nextInBExp(c rune) (*token, error) { } } +func (l *lexer) nextInCodePoint(c rune) (*token, error) { + switch c { + case '{': + return newToken(tokenKindLBrace, nullChar), nil + case '}': + return newToken(tokenKindRBrace, nullChar), nil + default: + if !isHexDigit(c) { + return nil, synErrInvalidCodePoint + } + var b strings.Builder + fmt.Fprint(&b, string(c)) + n := 1 + for { + c, eof, err := l.read() + if err != nil { + return nil, err + } + if eof { + l.restore() + if err != nil { + return nil, err + } + break + } + if c == '}' { + err := l.restore() + if err != nil { + return nil, err + } + break + } + if !isHexDigit(c) || n >= 6 { + return nil, synErrInvalidCodePoint + } + fmt.Fprint(&b, string(c)) + n++ + } + cp := b.String() + cpLen := len(cp) + if !(cpLen == 4 || cpLen == 6) { + return nil, synErrInvalidCodePoint + } + return newCodePointToken(b.String()), nil + } +} + +func isHexDigit(c rune) bool { + if c >= '0' && c <= '9' || c >= 'A' && c <= 'Z' || c >= 'a' && c <= 'z' { + return true + } + return false +} + func (l *lexer) read() (rune, bool, error) { if l.reachedEOF { return l.lastChar, l.reachedEOF, nil -- cgit v1.2.3