Add character property expression (Meet RL1.2 of UTS #18 partially)

\p{property name=property value} matches a character that has the given property value. When the property name is General_Category, it can be omitted; that is, \p{Letter} is equivalent to \p{General_Category=Letter}. Currently, only General_Category is supported. This feature partially meets RL1.2 of UTS #18. RL1.2 Properties: https://unicode.org/reports/tr18/#RL1.2
author: Ryo Nihei <nihei.dev@gmail.com> 2021-04-30 01:54:02 +0900
committer: Ryo Nihei <nihei.dev@gmail.com> 2021-04-30 01:54:02 +0900
commit: 43fdbf94ad87ea91a173c72688cad70a0a5f1ab4 (patch)
tree: 655f651e39f13b5e415445d1ef24f4ecb7511041 /compiler/lexer.go
parent: Add code point expression (Meet RL1.1 of UTS #18) (diff)
download: tre-43fdbf94ad87ea91a173c72688cad70a0a5f1ab4.tar.gz
tre-43fdbf94ad87ea91a173c72688cad70a0a5f1ab4.tar.xz
1 files changed, 81 insertions, 7 deletions
diff --git a/compiler/lexer.go b/compiler/lexer.go
index 5987e44..1aadf50 100644
--- a/compiler/lexer.go
+++ b/compiler/lexer.go
@@ -23,16 +23,20 @@ const (
 	tokenKindBExpClose       = tokenKind("]")
 	tokenKindCharRange       = tokenKind("-")
 	tokenKindCodePointLeader = tokenKind("\\u")
+	tokenKindCharPropLeader  = tokenKind("\\p")
 	tokenKindLBrace          = tokenKind("{")
 	tokenKindRBrace          = tokenKind("}")
+	tokenKindEqual           = tokenKind("=")
 	tokenKindCodePoint       = tokenKind("code point")
+	tokenKindCharPropSymbol  = tokenKind("character property symbol")
 	tokenKindEOF             = tokenKind("eof")
 )
 
 type token struct {
-	kind      tokenKind
-	char      rune
-	codePoint string
+	kind       tokenKind
+	char       rune
+	propSymbol string
+	codePoint  string
 }
 
 const nullChar = '\u0000'
@@ -51,12 +55,20 @@ func newCodePointToken(codePoint string) *token {
 	}
 }
 
+// newCharPropSymbolToken returns a tokenKindCharPropSymbol token carrying propSymbol.
+func newCharPropSymbolToken(propSymbol string) *token {
+	tok := &token{kind: tokenKindCharPropSymbol}
+	tok.propSymbol = propSymbol
+	return tok
+}
+
 type lexerMode string
 
 const (
-	lexerModeDefault = lexerMode("default")
-	lexerModeBExp    = lexerMode("bracket expression")
-	lexerModeCPExp   = lexerMode("code point expression")
+	lexerModeDefault     = lexerMode("default")
+	lexerModeBExp        = lexerMode("bracket expression")
+	lexerModeCPExp       = lexerMode("code point expression")
+	lexerModeCharPropExp = lexerMode("character property expression")
 )
 
 type lexerModeStack struct {
@@ -161,6 +173,8 @@ func (l *lexer) next() (*token, error) {
 			}
 		case tokenKindCodePointLeader:
 			l.modeStack.push(lexerModeCPExp)
+		case tokenKindCharPropLeader:
+			l.modeStack.push(lexerModeCharPropExp)
 		}
 		return tok, nil
 	case lexerModeCPExp:
@@ -173,6 +187,16 @@ func (l *lexer) next() (*token, error) {
 			l.modeStack.pop()
 		}
 		return tok, nil
+	case lexerModeCharPropExp:
+		tok, err := l.nextInCharProp(c)
+		if err != nil {
+			return nil, err
+		}
+		switch tok.kind {
+		case tokenKindRBrace:
+			l.modeStack.pop()
+		}
+		return tok, nil
 	default:
 		tok, err := l.nextInDefault(c)
 		if err != nil {
@@ -187,6 +211,8 @@ func (l *lexer) next() (*token, error) {
 			l.rangeState = rangeStateReady
 		case tokenKindCodePointLeader:
 			l.modeStack.push(lexerModeCPExp)
+		case tokenKindCharPropLeader:
+			l.modeStack.push(lexerModeCharPropExp)
 		}
 		return tok, nil
 	}
@@ -265,6 +291,9 @@ func (l *lexer) nextInDefault(c rune) (*token, error) {
 		if c == 'u' {
 			return newToken(tokenKindCodePointLeader, nullChar), nil
 		}
+		if c == 'p' {
+			return newToken(tokenKindCharPropLeader, nullChar), nil
+		}
 		if c == '\\' || c == '.' || c == '*' || c == '+' || c == '?' || c == '|' || c == '(' || c == ')' || c == '[' || c == ']' {
 			return newToken(tokenKindChar, c), nil
 		}
@@ -317,10 +346,13 @@ func (l *lexer) nextInBExp(c rune) (*token, error) {
 		if c == 'u' {
 			return newToken(tokenKindCodePointLeader, nullChar), nil
 		}
+		if c == 'p' {
+			return newToken(tokenKindCharPropLeader, nullChar), nil
+		}
 		if c == '\\' || c == '^' || c == '-' || c == ']' {
 			return newToken(tokenKindChar, c), nil
 		}
-		l.errMsgDetails = fmt.Sprintf("\\%v is not supported", string(c))
+		l.errMsgDetails = fmt.Sprintf("\\%v is not supported in a bracket expression", string(c))
 		return nil, synErrInvalidEscSeq
 	default:
 		return newToken(tokenKindChar, c), nil
@@ -381,6 +413,48 @@ func isHexDigit(c rune) bool {
 	return false
 }
 
+// nextInCharProp lexes one token inside a character property expression.
+// c is the first character of the token, as passed in by next.
+// '{', '}' and '=' become their own tokens; any other character starts a symbol
+// that runs up to the next '}', '=' or EOF and is trimmed of surrounding white
+// space. A symbol that is empty after trimming is a syntax error.
+func (l *lexer) nextInCharProp(c rune) (*token, error) {
+	switch c {
+	case '{':
+		return newToken(tokenKindLBrace, nullChar), nil
+	case '}':
+		return newToken(tokenKindRBrace, nullChar), nil
+	case '=':
+		return newToken(tokenKindEqual, nullChar), nil
+	default:
+		var b strings.Builder
+		b.WriteRune(c)
+		for {
+			c, eof, err := l.read()
+			if err != nil {
+				return nil, err
+			}
+			if eof || c == '}' || c == '=' {
+				// The terminator is not part of the symbol. restore is defined
+				// elsewhere and presumably un-reads it so the next call lexes it;
+				// its error is checked on the EOF path as well as for '}' and '='.
+				if err := l.restore(); err != nil {
+					return nil, err
+				}
+				break
+			}
+			b.WriteRune(c)
+		}
+		sym := strings.TrimSpace(b.String())
+		if len(sym) == 0 {
+			// Returned, like synErrInvalidEscSeq in nextInBExp, so the caller
+			// always receives the failure through the error result.
+			return nil, synErrCharPropInvalidSymbol
+		}
+		return newCharPropSymbolToken(sym), nil
+	}
+}
+
 func (l *lexer) read() (rune, bool, error) {
 	if l.reachedEOF {
 		return l.lastChar, l.reachedEOF, nil
author	Ryo Nihei <nihei.dev@gmail.com>	2021-04-30 01:54:02 +0900
committer	Ryo Nihei <nihei.dev@gmail.com>	2021-04-30 01:54:02 +0900
commit	43fdbf94ad87ea91a173c72688cad70a0a5f1ab4 (patch)
tree	655f651e39f13b5e415445d1ef24f4ecb7511041 /compiler/lexer.go
parent	Add code point expression (Meet RL1.1 of UTS #18) (diff)
download	tre-43fdbf94ad87ea91a173c72688cad70a0a5f1ab4.tar.gz tre-43fdbf94ad87ea91a173c72688cad70a0a5f1ab4.tar.xz