diff options
author | Ryo Nihei <nihei.dev@gmail.com> | 2021-04-30 01:54:02 +0900 |
---|---|---|
committer | Ryo Nihei <nihei.dev@gmail.com> | 2021-04-30 01:54:02 +0900 |
commit | 43fdbf94ad87ea91a173c72688cad70a0a5f1ab4 (patch) | |
tree | 655f651e39f13b5e415445d1ef24f4ecb7511041 /compiler/lexer.go | |
parent | Add code point expression (Meet RL1.1 of UTS #18) (diff) | |
download | tre-43fdbf94ad87ea91a173c72688cad70a0a5f1ab4.tar.gz tre-43fdbf94ad87ea91a173c72688cad70a0a5f1ab4.tar.xz |
Add character property expression (Meet RL1.2 of UTS #18 partially)
\p{property name=property value} matches a character that has the specified property.
When the property name is General_Category, it can be omitted.
That is, \p{Letter} equals \p{General_Category=Letter}.
Currently, only General_Category is supported.
This feature partially meets RL1.2 of UTS #18.
RL1.2 Properties: https://unicode.org/reports/tr18/#RL1.2
Diffstat (limited to 'compiler/lexer.go')
-rw-r--r-- | compiler/lexer.go | 88 |
1 files changed, 81 insertions, 7 deletions
diff --git a/compiler/lexer.go b/compiler/lexer.go index 5987e44..1aadf50 100644 --- a/compiler/lexer.go +++ b/compiler/lexer.go @@ -23,16 +23,20 @@ const ( tokenKindBExpClose = tokenKind("]") tokenKindCharRange = tokenKind("-") tokenKindCodePointLeader = tokenKind("\\u") + tokenKindCharPropLeader = tokenKind("\\p") tokenKindLBrace = tokenKind("{") tokenKindRBrace = tokenKind("}") + tokenKindEqual = tokenKind("=") tokenKindCodePoint = tokenKind("code point") + tokenKindCharPropSymbol = tokenKind("character property symbol") tokenKindEOF = tokenKind("eof") ) type token struct { - kind tokenKind - char rune - codePoint string + kind tokenKind + char rune + propSymbol string + codePoint string } const nullChar = '\u0000' @@ -51,12 +55,20 @@ func newCodePointToken(codePoint string) *token { } } +func newCharPropSymbolToken(propSymbol string) *token { + return &token{ + kind: tokenKindCharPropSymbol, + propSymbol: propSymbol, + } +} + type lexerMode string const ( - lexerModeDefault = lexerMode("default") - lexerModeBExp = lexerMode("bracket expression") - lexerModeCPExp = lexerMode("code point expression") + lexerModeDefault = lexerMode("default") + lexerModeBExp = lexerMode("bracket expression") + lexerModeCPExp = lexerMode("code point expression") + lexerModeCharPropExp = lexerMode("character property expression") ) type lexerModeStack struct { @@ -161,6 +173,8 @@ func (l *lexer) next() (*token, error) { } case tokenKindCodePointLeader: l.modeStack.push(lexerModeCPExp) + case tokenKindCharPropLeader: + l.modeStack.push(lexerModeCharPropExp) } return tok, nil case lexerModeCPExp: @@ -173,6 +187,16 @@ func (l *lexer) next() (*token, error) { l.modeStack.pop() } return tok, nil + case lexerModeCharPropExp: + tok, err := l.nextInCharProp(c) + if err != nil { + return nil, err + } + switch tok.kind { + case tokenKindRBrace: + l.modeStack.pop() + } + return tok, nil default: tok, err := l.nextInDefault(c) if err != nil { @@ -187,6 +211,8 @@ func (l *lexer) next() (*token, 
error) { l.rangeState = rangeStateReady case tokenKindCodePointLeader: l.modeStack.push(lexerModeCPExp) + case tokenKindCharPropLeader: + l.modeStack.push(lexerModeCharPropExp) } return tok, nil } @@ -265,6 +291,9 @@ func (l *lexer) nextInDefault(c rune) (*token, error) { if c == 'u' { return newToken(tokenKindCodePointLeader, nullChar), nil } + if c == 'p' { + return newToken(tokenKindCharPropLeader, nullChar), nil + } if c == '\\' || c == '.' || c == '*' || c == '+' || c == '?' || c == '|' || c == '(' || c == ')' || c == '[' || c == ']' { return newToken(tokenKindChar, c), nil } @@ -317,10 +346,13 @@ func (l *lexer) nextInBExp(c rune) (*token, error) { if c == 'u' { return newToken(tokenKindCodePointLeader, nullChar), nil } + if c == 'p' { + return newToken(tokenKindCharPropLeader, nullChar), nil + } if c == '\\' || c == '^' || c == '-' || c == ']' { return newToken(tokenKindChar, c), nil } - l.errMsgDetails = fmt.Sprintf("\\%v is not supported", string(c)) + l.errMsgDetails = fmt.Sprintf("\\%v is not supported in a bracket expression", string(c)) return nil, synErrInvalidEscSeq default: return newToken(tokenKindChar, c), nil @@ -381,6 +413,48 @@ func isHexDigit(c rune) bool { return false } +func (l *lexer) nextInCharProp(c rune) (*token, error) { + switch c { + case '{': + return newToken(tokenKindLBrace, nullChar), nil + case '}': + return newToken(tokenKindRBrace, nullChar), nil + case '=': + return newToken(tokenKindEqual, nullChar), nil + default: + var b strings.Builder + fmt.Fprint(&b, string(c)) + n := 1 + for { + c, eof, err := l.read() + if err != nil { + return nil, err + } + if eof { + l.restore() + if err != nil { + return nil, err + } + break + } + if c == '}' || c == '=' { + err := l.restore() + if err != nil { + return nil, err + } + break + } + fmt.Fprint(&b, string(c)) + n++ + } + sym := strings.TrimSpace(b.String()) + if len(sym) == 0 { + raiseSyntaxError(synErrCharPropInvalidSymbol) + } + return newCharPropSymbolToken(sym), nil + } +} + func 
(l *lexer) read() (rune, bool, error) { if l.reachedEOF { return l.lastChar, l.reachedEOF, nil |