diff options
author | Ryo Nihei <nihei.dev@gmail.com> | 2021-04-30 01:54:02 +0900 |
---|---|---|
committer | Ryo Nihei <nihei.dev@gmail.com> | 2021-04-30 01:54:02 +0900 |
commit | 43fdbf94ad87ea91a173c72688cad70a0a5f1ab4 (patch) | |
tree | 655f651e39f13b5e415445d1ef24f4ecb7511041 /compiler/parser.go | |
parent | Add code point expression (Meet RL1.1 of UTS #18) (diff) | |
download | tre-43fdbf94ad87ea91a173c72688cad70a0a5f1ab4.tar.gz tre-43fdbf94ad87ea91a173c72688cad70a0a5f1ab4.tar.xz |
Add character property expression (Meet RL1.2 of UTS #18 partially)
\p{property name=property value} matches a character that has the property.
When the property name is General_Category, it can be omitted.
That is, \p{Letter} equals \p{General_Category=Letter}.
Currently, only General_Category is supported.
This feature partially meets RL1.2 of UTS #18.
RL1.2 Properties: https://unicode.org/reports/tr18/#RL1.2
Diffstat (limited to 'compiler/parser.go')
-rw-r--r-- | compiler/parser.go | 55 |
1 files changed, 54 insertions, 1 deletions
diff --git a/compiler/parser.go b/compiler/parser.go index ba15cd0..7b50459 100644 --- a/compiler/parser.go +++ b/compiler/parser.go @@ -312,6 +312,9 @@ func (p *parser) parseSingleChar() astNode { if p.consume(tokenKindCodePointLeader) { return p.parseCodePoint() } + if p.consume(tokenKindCharPropLeader) { + return p.parseCharProp() + } c := p.parseNormalChar() if c == nil { if p.consume(tokenKindBExpClose) { @@ -393,12 +396,62 @@ func (p *parser) parseCodePoint() astNode { return concat } +func (p *parser) parseCharProp() astNode { + if !p.consume(tokenKindLBrace) { + raiseSyntaxError(synErrCharPropExpInvalidForm) + } + var sym1, sym2 string + if !p.consume(tokenKindCharPropSymbol) { + raiseSyntaxError(synErrCharPropExpInvalidForm) + } + sym1 = p.lastTok.propSymbol + if p.consume(tokenKindEqual) { + if !p.consume(tokenKindCharPropSymbol) { + raiseSyntaxError(synErrCharPropExpInvalidForm) + } + sym2 = p.lastTok.propSymbol + } + + var propName, propVal string + if sym2 != "" { + propName = sym1 + propVal = sym2 + } else { + propName = "gc" + propVal = sym1 + } + cpRanges, err := findCodePointRanges(propName, propVal) + if err != nil { + p.errMsgDetails = fmt.Sprintf("%v", err) + raiseSyntaxError(synErrCharPropUnsupported) + } + + var alt astNode + for _, r := range cpRanges { + from := genNormalCharAST(r.From) + to := genNormalCharAST(r.To) + alt = genAltNode( + alt, + genRangeAST(from, to), + ) + } + + if !p.consume(tokenKindRBrace) { + raiseSyntaxError(synErrCharPropExpInvalidForm) + } + + return alt +} + func (p *parser) parseNormalChar() astNode { if !p.consume(tokenKindChar) { return nil } + return genNormalCharAST(p.lastTok.char) +} - b := []byte(string(p.lastTok.char)) +func genNormalCharAST(c rune) astNode { + b := []byte(string(c)) switch len(b) { case 1: return newSymbolNode(b[0]) |