From 43fdbf94ad87ea91a173c72688cad70a0a5f1ab4 Mon Sep 17 00:00:00 2001 From: Ryo Nihei Date: Fri, 30 Apr 2021 01:54:02 +0900 Subject: Add character property expression (Meet RL1.2 of UTS #18 partially) \p{property name=property value} matches a character that has the property. When the property name is General_Category, it can be omitted. That is, \p{Letter} equals \p{General_Category=Letter}. Currently, only General_Category is supported. This feature meets RL1.2 of UTS #18 partially. RL1.2 Properties: https://unicode.org/reports/tr18/#RL1.2 --- compiler/parser.go | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 54 insertions(+), 1 deletion(-) (limited to 'compiler/parser.go') diff --git a/compiler/parser.go b/compiler/parser.go index ba15cd0..7b50459 100644 --- a/compiler/parser.go +++ b/compiler/parser.go @@ -312,6 +312,9 @@ func (p *parser) parseSingleChar() astNode { if p.consume(tokenKindCodePointLeader) { return p.parseCodePoint() } + if p.consume(tokenKindCharPropLeader) { + return p.parseCharProp() + } c := p.parseNormalChar() if c == nil { if p.consume(tokenKindBExpClose) { @@ -393,12 +396,62 @@ func (p *parser) parseCodePoint() astNode { return concat } +func (p *parser) parseCharProp() astNode { + if !p.consume(tokenKindLBrace) { + raiseSyntaxError(synErrCharPropExpInvalidForm) + } + var sym1, sym2 string + if !p.consume(tokenKindCharPropSymbol) { + raiseSyntaxError(synErrCharPropExpInvalidForm) + } + sym1 = p.lastTok.propSymbol + if p.consume(tokenKindEqual) { + if !p.consume(tokenKindCharPropSymbol) { + raiseSyntaxError(synErrCharPropExpInvalidForm) + } + sym2 = p.lastTok.propSymbol + } + + var propName, propVal string + if sym2 != "" { + propName = sym1 + propVal = sym2 + } else { + propName = "gc" + propVal = sym1 + } + cpRanges, err := findCodePointRanges(propName, propVal) + if err != nil { + p.errMsgDetails = fmt.Sprintf("%v", err) + raiseSyntaxError(synErrCharPropUnsupported) + } + + var alt astNode + for _, r := range 
cpRanges { + from := genNormalCharAST(r.From) + to := genNormalCharAST(r.To) + alt = genAltNode( + alt, + genRangeAST(from, to), + ) + } + + if !p.consume(tokenKindRBrace) { + raiseSyntaxError(synErrCharPropExpInvalidForm) + } + + return alt +} + func (p *parser) parseNormalChar() astNode { if !p.consume(tokenKindChar) { return nil } + return genNormalCharAST(p.lastTok.char) +} - b := []byte(string(p.lastTok.char)) +func genNormalCharAST(c rune) astNode { + b := []byte(string(c)) switch len(b) { case 1: return newSymbolNode(b[0]) -- cgit v1.2.3