diff options
author | Ryo Nihei <nihei.dev@gmail.com> | 2021-04-24 17:52:10 +0900 |
---|---|---|
committer | Ryo Nihei <nihei.dev@gmail.com> | 2021-04-24 23:32:55 +0900 |
commit | 5708644933b364ffbcc625c2010e051ca031e867 (patch) | |
tree | 53c458938cd685b7c77a3d638920a083effc7dee /compiler/parser.go | |
parent | Add validation of lexical specs and improve error messages (diff) | |
download | tre-5708644933b364ffbcc625c2010e051ca031e867.tar.gz tre-5708644933b364ffbcc625c2010e051ca031e867.tar.xz |
Add code point expression (Meet RL1.1 of UTS #18)
\u{hex string} matches a character that has the code point represented by the hex string.
For instance, \u{3042} matches hiragana あ (U+3042). The hex string must have 4 or 6 digits.
This feature meets RL1.1 of UTS #18.
RL1.1 Hex Notation: https://unicode.org/reports/tr18/#RL1.1
Diffstat (limited to 'compiler/parser.go')
-rw-r--r-- | compiler/parser.go | 55 |
1 files changed, 55 insertions, 0 deletions
diff --git a/compiler/parser.go b/compiler/parser.go index 63e5549..ba15cd0 100644 --- a/compiler/parser.go +++ b/compiler/parser.go @@ -2,6 +2,8 @@ package compiler import ( "bytes" + "encoding/binary" + "encoding/hex" "fmt" "io" "strings" @@ -307,6 +309,9 @@ func (p *parser) parseSingleChar() astNode { p.expect(tokenKindBExpClose) return inverse } + if p.consume(tokenKindCodePointLeader) { + return p.parseCodePoint() + } c := p.parseNormalChar() if c == nil { if p.consume(tokenKindBExpClose) { @@ -318,6 +323,9 @@ func (p *parser) parseSingleChar() astNode { } func (p *parser) parseBExpElem() astNode { + if p.consume(tokenKindCodePointLeader) { + return p.parseCodePoint() + } left := p.parseNormalChar() if left == nil { return nil @@ -338,6 +346,53 @@ func (p *parser) parseBExpElem() astNode { return genRangeAST(left, right) } +func (p *parser) parseCodePoint() astNode { + if !p.consume(tokenKindLBrace) { + raiseSyntaxError(synErrCPExpInvalidForm) + } + if !p.consume(tokenKindCodePoint) { + raiseSyntaxError(synErrCPExpInvalidForm) + } + + var cp []byte + { + // Although hex.DecodeString method can handle only a hex string that has even length, + // `codePoint` always has even length by the lexical specification. + b, err := hex.DecodeString(p.lastTok.codePoint) + if err != nil { + panic(fmt.Errorf("failed to decode a code point (%v) into a byte slice: %v", p.lastTok.codePoint, err)) + } + // `b` must be 4 bytes to convert it into a 32-bit integer. + l := len(b) + for i := 0; i < 4-l; i++ { + b = append([]byte{0}, b...) 
+ } + n := binary.BigEndian.Uint32(b) + if n < 0x0000 || n > 0x10FFFF { + raiseSyntaxError(synErrCPExpOutOfRange) + } + + cp = []byte(string(rune(n))) + } + + var concat astNode + { + concat = newSymbolNode(cp[0]) + for _, b := range cp[1:] { + concat = genConcatNode( + concat, + newSymbolNode(b), + ) + } + } + + if !p.consume(tokenKindRBrace) { + raiseSyntaxError(synErrCPExpInvalidForm) + } + + return concat +} + func (p *parser) parseNormalChar() astNode { if !p.consume(tokenKindChar) { return nil |