Add code point expression (Meet RL1.1 of UTS #18)

\u{hex string} matches the character whose code point is represented by the hex string. For instance, \u{3042} matches hiragana あ (U+3042). The hex string must have 4 or 6 digits. This feature meets RL1.1 of UTS #18. RL1.1 Hex Notation: https://unicode.org/reports/tr18/#RL1.1
author: Ryo Nihei <nihei.dev@gmail.com> 2021-04-24 17:52:10 +0900
committer: Ryo Nihei <nihei.dev@gmail.com> 2021-04-24 23:32:55 +0900
commit: 5708644933b364ffbcc625c2010e051ca031e867 (patch)
tree: 53c458938cd685b7c77a3d638920a083effc7dee /compiler/parser.go
parent: Add validation of lexical specs and improve error messages (diff)
download: tre-5708644933b364ffbcc625c2010e051ca031e867.tar.gz
tre-5708644933b364ffbcc625c2010e051ca031e867.tar.xz
1 files changed, 55 insertions, 0 deletions
diff --git a/compiler/parser.go b/compiler/parser.go
index 63e5549..ba15cd0 100644
--- a/compiler/parser.go
+++ b/compiler/parser.go
@@ -2,6 +2,8 @@ package compiler
 
 import (
 	"bytes"
+	"encoding/binary"
+	"encoding/hex"
 	"fmt"
 	"io"
 	"strings"
@@ -307,6 +309,9 @@ func (p *parser) parseSingleChar() astNode {
 		p.expect(tokenKindBExpClose)
 		return inverse
 	}
+	if p.consume(tokenKindCodePointLeader) {
+		return p.parseCodePoint()
+	}
 	c := p.parseNormalChar()
 	if c == nil {
 		if p.consume(tokenKindBExpClose) {
@@ -318,6 +323,9 @@ func (p *parser) parseSingleChar() astNode {
 }
 
 func (p *parser) parseBExpElem() astNode {
+	if p.consume(tokenKindCodePointLeader) {
+		return p.parseCodePoint()
+	}
 	left := p.parseNormalChar()
 	if left == nil {
 		return nil
@@ -338,6 +346,53 @@ func (p *parser) parseBExpElem() astNode {
 	return genRangeAST(left, right)
 }
 
+func (p *parser) parseCodePoint() astNode {
+	if !p.consume(tokenKindLBrace) {
+		raiseSyntaxError(synErrCPExpInvalidForm)
+	}
+	if !p.consume(tokenKindCodePoint) {
+		raiseSyntaxError(synErrCPExpInvalidForm)
+	}
+
+	var cp []byte
+	{
+		// Although hex.DecodeString method can handle only a hex string that has even length,
+		// `codePoint` always has even length by the lexical specification.
+		b, err := hex.DecodeString(p.lastTok.codePoint)
+		if err != nil {
+			panic(fmt.Errorf("failed to decode a code point (%v) into a byte slice: %v", p.lastTok.codePoint, err))
+		}
+		// `b` must be 4 bytes to convert it into a 32-bit integer.
+		l := len(b)
+		for i := 0; i < 4-l; i++ {
+			b = append([]byte{0}, b...)
+		}
+		n := binary.BigEndian.Uint32(b)
+		if n < 0x0000 || n > 0x10FFFF {
+			raiseSyntaxError(synErrCPExpOutOfRange)
+		}
+
+		cp = []byte(string(rune(n)))
+	}
+
+	var concat astNode
+	{
+		concat = newSymbolNode(cp[0])
+		for _, b := range cp[1:] {
+			concat = genConcatNode(
+				concat,
+				newSymbolNode(b),
+			)
+		}
+	}
+
+	if !p.consume(tokenKindRBrace) {
+		raiseSyntaxError(synErrCPExpInvalidForm)
+	}
+
+	return concat
+}
+
 func (p *parser) parseNormalChar() astNode {
 	if !p.consume(tokenKindChar) {
 		return nil
author	Ryo Nihei <nihei.dev@gmail.com>	2021-04-24 17:52:10 +0900
committer	Ryo Nihei <nihei.dev@gmail.com>	2021-04-24 23:32:55 +0900
commit	5708644933b364ffbcc625c2010e051ca031e867 (patch)
tree	53c458938cd685b7c77a3d638920a083effc7dee /compiler/parser.go
parent	Add validation of lexical specs and improve error messages (diff)
download	tre-5708644933b364ffbcc625c2010e051ca031e867.tar.gz tre-5708644933b364ffbcc625c2010e051ca031e867.tar.xz