aboutsummaryrefslogtreecommitdiff
path: root/compiler/lexer.go
diff options
context:
space:
mode:
authorRyo Nihei <nihei.dev@gmail.com>2021-04-30 01:54:02 +0900
committerRyo Nihei <nihei.dev@gmail.com>2021-04-30 01:54:02 +0900
commit43fdbf94ad87ea91a173c72688cad70a0a5f1ab4 (patch)
tree655f651e39f13b5e415445d1ef24f4ecb7511041 /compiler/lexer.go
parentAdd code point expression (Meet RL1.1 of UTS #18) (diff)
downloadtre-43fdbf94ad87ea91a173c72688cad70a0a5f1ab4.tar.gz
tre-43fdbf94ad87ea91a173c72688cad70a0a5f1ab4.tar.xz
Add character property expression (Meet RL1.2 of UTS #18 partially)
\p{property name=property value} matches a character that has the given property. When the property name is General_Category, it can be omitted; that is, \p{Letter} is equivalent to \p{General_Category=Letter}. Currently, only General_Category is supported. This feature partially meets RL1.2 of UTS #18. RL1.2 Properties: https://unicode.org/reports/tr18/#RL1.2
Diffstat (limited to 'compiler/lexer.go')
-rw-r--r--compiler/lexer.go88
1 files changed, 81 insertions, 7 deletions
diff --git a/compiler/lexer.go b/compiler/lexer.go
index 5987e44..1aadf50 100644
--- a/compiler/lexer.go
+++ b/compiler/lexer.go
@@ -23,16 +23,20 @@ const (
tokenKindBExpClose = tokenKind("]")
tokenKindCharRange = tokenKind("-")
tokenKindCodePointLeader = tokenKind("\\u")
+ tokenKindCharPropLeader = tokenKind("\\p")
tokenKindLBrace = tokenKind("{")
tokenKindRBrace = tokenKind("}")
+ tokenKindEqual = tokenKind("=")
tokenKindCodePoint = tokenKind("code point")
+ tokenKindCharPropSymbol = tokenKind("character property symbol")
tokenKindEOF = tokenKind("eof")
)
type token struct {
- kind tokenKind
- char rune
- codePoint string
+ kind tokenKind
+ char rune
+ propSymbol string
+ codePoint string
}
const nullChar = '\u0000'
@@ -51,12 +55,20 @@ func newCodePointToken(codePoint string) *token {
}
}
+// newCharPropSymbolToken builds a token of kind tokenKindCharPropSymbol that
+// carries propSymbol as its property symbol text. All other fields keep their
+// zero values.
+func newCharPropSymbolToken(propSymbol string) *token {
+	tok := new(token)
+	tok.kind = tokenKindCharPropSymbol
+	tok.propSymbol = propSymbol
+	return tok
+}
+
type lexerMode string
const (
- lexerModeDefault = lexerMode("default")
- lexerModeBExp = lexerMode("bracket expression")
- lexerModeCPExp = lexerMode("code point expression")
+ lexerModeDefault = lexerMode("default")
+ lexerModeBExp = lexerMode("bracket expression")
+ lexerModeCPExp = lexerMode("code point expression")
+ lexerModeCharPropExp = lexerMode("character property expression")
)
type lexerModeStack struct {
@@ -161,6 +173,8 @@ func (l *lexer) next() (*token, error) {
}
case tokenKindCodePointLeader:
l.modeStack.push(lexerModeCPExp)
+ case tokenKindCharPropLeader:
+ l.modeStack.push(lexerModeCharPropExp)
}
return tok, nil
case lexerModeCPExp:
@@ -173,6 +187,16 @@ func (l *lexer) next() (*token, error) {
l.modeStack.pop()
}
return tok, nil
+ case lexerModeCharPropExp:
+ tok, err := l.nextInCharProp(c)
+ if err != nil {
+ return nil, err
+ }
+ switch tok.kind {
+ case tokenKindRBrace:
+ l.modeStack.pop()
+ }
+ return tok, nil
default:
tok, err := l.nextInDefault(c)
if err != nil {
@@ -187,6 +211,8 @@ func (l *lexer) next() (*token, error) {
l.rangeState = rangeStateReady
case tokenKindCodePointLeader:
l.modeStack.push(lexerModeCPExp)
+ case tokenKindCharPropLeader:
+ l.modeStack.push(lexerModeCharPropExp)
}
return tok, nil
}
@@ -265,6 +291,9 @@ func (l *lexer) nextInDefault(c rune) (*token, error) {
if c == 'u' {
return newToken(tokenKindCodePointLeader, nullChar), nil
}
+ if c == 'p' {
+ return newToken(tokenKindCharPropLeader, nullChar), nil
+ }
if c == '\\' || c == '.' || c == '*' || c == '+' || c == '?' || c == '|' || c == '(' || c == ')' || c == '[' || c == ']' {
return newToken(tokenKindChar, c), nil
}
@@ -317,10 +346,13 @@ func (l *lexer) nextInBExp(c rune) (*token, error) {
if c == 'u' {
return newToken(tokenKindCodePointLeader, nullChar), nil
}
+ if c == 'p' {
+ return newToken(tokenKindCharPropLeader, nullChar), nil
+ }
if c == '\\' || c == '^' || c == '-' || c == ']' {
return newToken(tokenKindChar, c), nil
}
- l.errMsgDetails = fmt.Sprintf("\\%v is not supported", string(c))
+ l.errMsgDetails = fmt.Sprintf("\\%v is not supported in a bracket expression", string(c))
return nil, synErrInvalidEscSeq
default:
return newToken(tokenKindChar, c), nil
@@ -381,6 +413,48 @@ func isHexDigit(c rune) bool {
return false
}
+// nextInCharProp lexes one token while the lexer is in the character property
+// expression mode (the mode pushed after a \p leader).
+//
+// c is the character that has already been read by the caller. '{', '}' and
+// '=' each yield their own punctuation token. Any other character starts a
+// property symbol: characters are consumed until '}', '=' or end of input is
+// seen, the surrounding whitespace is trimmed, and the result is returned as a
+// character property symbol token.
+//
+// An error is returned when reading from the input fails or when the
+// terminating character cannot be handed back to the input via l.restore.
+func (l *lexer) nextInCharProp(c rune) (*token, error) {
+	switch c {
+	case '{':
+		return newToken(tokenKindLBrace, nullChar), nil
+	case '}':
+		return newToken(tokenKindRBrace, nullChar), nil
+	case '=':
+		return newToken(tokenKindEqual, nullChar), nil
+	default:
+		var b strings.Builder
+		b.WriteRune(c)
+		for {
+			c, eof, err := l.read()
+			if err != nil {
+				return nil, err
+			}
+			if eof || c == '}' || c == '=' {
+				// The terminator is not part of the symbol; give it back so
+				// the next call sees it. The error from restore must be
+				// checked on the EOF path as well as the delimiter path.
+				if err := l.restore(); err != nil {
+					return nil, err
+				}
+				break
+			}
+			b.WriteRune(c)
+		}
+		sym := strings.TrimSpace(b.String())
+		if len(sym) == 0 {
+			// NOTE(review): raiseSyntaxError is defined outside this view;
+			// presumably it aborts lexing (e.g. by panicking). If it merely
+			// returns, an empty symbol token is produced below - confirm.
+			raiseSyntaxError(synErrCharPropInvalidSymbol)
+		}
+		return newCharPropSymbolToken(sym), nil
+	}
+}
+
func (l *lexer) read() (rune, bool, error) {
if l.reachedEOF {
return l.lastChar, l.reachedEOF, nil