6 files changed, 512 insertions, 18 deletions
diff --git a/compiler/lexer.go b/compiler/lexer.go
index c1aa67e..5987e44 100644
--- a/compiler/lexer.go
+++ b/compiler/lexer.go
@@ -4,6 +4,7 @@ import (
 	"bufio"
 	"fmt"
 	"io"
+	"strings"
 )
 
 type tokenKind string
@@ -21,12 +22,17 @@ const (
 	tokenKindInverseBExpOpen = tokenKind("[^")
 	tokenKindBExpClose       = tokenKind("]")
 	tokenKindCharRange       = tokenKind("-")
+	tokenKindCodePointLeader = tokenKind("\\u")
+	tokenKindLBrace          = tokenKind("{")
+	tokenKindRBrace          = tokenKind("}")
+	tokenKindCodePoint       = tokenKind("code point")
 	tokenKindEOF             = tokenKind("eof")
 )
 
 type token struct {
-	kind tokenKind
-	char rune
+	kind      tokenKind
+	char      rune
+	codePoint string
 }
 
 const nullChar = '\u0000'
@@ -38,13 +44,45 @@ func newToken(kind tokenKind, char rune) *token {
 	}
 }
 
+// newCodePointToken builds a tokenKindCodePoint token carrying the given hex digits.
+func newCodePointToken(codePoint string) *token {
+	tok := new(token)
+	tok.kind, tok.codePoint = tokenKindCodePoint, codePoint
+	return tok
+}
+
 type lexerMode string
 
 const (
 	lexerModeDefault = lexerMode("default")
 	lexerModeBExp    = lexerMode("bracket expression")
+	lexerModeCPExp   = lexerMode("code point expression")
 )
 
+// lexerModeStack tracks nested lexer modes; the last element is the active one.
+type lexerModeStack struct {
+	stack []lexerMode
+}
+
+// newLexerModeStack returns a stack whose only entry is lexerModeDefault.
+func newLexerModeStack() *lexerModeStack {
+	return &lexerModeStack{stack: []lexerMode{lexerModeDefault}}
+}
+
+// top returns the active mode without removing it.
+func (s *lexerModeStack) top() lexerMode {
+	last := len(s.stack) - 1
+	return s.stack[last]
+}
+
+// push makes m the active mode.
+func (s *lexerModeStack) push(m lexerMode) { s.stack = append(s.stack, m) }
+
+// pop discards the active mode; it panics when the stack is empty.
+func (s *lexerModeStack) pop() {
+	s.stack = s.stack[:len(s.stack)-1]
+}
+
 type rangeState string
 
 // [a-z]
@@ -71,7 +109,7 @@ type lexer struct {
 	prevEOF1      bool
 	prevChar2     rune
 	pervEOF2      bool
-	mode          lexerMode
+	modeStack     *lexerModeStack
 	rangeState    rangeState
 	errMsgDetails string
 }
@@ -89,7 +127,7 @@ func newLexer(src io.Reader) *lexer {
 		prevEOF1:   false,
 		prevChar2:  nullChar,
 		pervEOF2:   false,
-		mode:       lexerModeDefault,
+		modeStack:  newLexerModeStack(),
 		rangeState: rangeStateReady,
 	}
 }
@@ -103,7 +141,7 @@ func (l *lexer) next() (*token, error) {
 		return newToken(tokenKindEOF, nullChar), nil
 	}
 
-	switch l.mode {
+	switch l.modeStack.top() {
 	case lexerModeBExp:
 		tok, err := l.nextInBExp(c)
 		if err != nil {
@@ -111,7 +149,7 @@ func (l *lexer) next() (*token, error) {
 		}
 		switch tok.kind {
 		case tokenKindBExpClose:
-			l.mode = lexerModeDefault
+			l.modeStack.pop()
 		case tokenKindCharRange:
 			l.rangeState = rangeStateExpectRangeTerminator
 		case tokenKindChar:
@@ -121,6 +159,18 @@ func (l *lexer) next() (*token, error) {
 			case rangeStateExpectRangeTerminator:
 				l.rangeState = rangeStateReady
 			}
+		case tokenKindCodePointLeader:
+			l.modeStack.push(lexerModeCPExp)
+		}
+		return tok, nil
+	case lexerModeCPExp:
+		tok, err := l.nextInCodePoint(c)
+		if err != nil {
+			return nil, err
+		}
+		switch tok.kind {
+		case tokenKindRBrace:
+			l.modeStack.pop()
 		}
 		return tok, nil
 	default:
@@ -130,11 +180,13 @@ func (l *lexer) next() (*token, error) {
 		}
 		switch tok.kind {
 		case tokenKindBExpOpen:
-			l.mode = lexerModeBExp
+			l.modeStack.push(lexerModeBExp)
 			l.rangeState = rangeStateReady
 		case tokenKindInverseBExpOpen:
-			l.mode = lexerModeBExp
+			l.modeStack.push(lexerModeBExp)
 			l.rangeState = rangeStateReady
+		case tokenKindCodePointLeader:
+			l.modeStack.push(lexerModeCPExp)
 		}
 		return tok, nil
 	}
@@ -210,6 +262,9 @@ func (l *lexer) nextInDefault(c rune) (*token, error) {
 		if eof {
 			return nil, synErrIncompletedEscSeq
 		}
+		if c == 'u' {
+			return newToken(tokenKindCodePointLeader, nullChar), nil
+		}
 		if c == '\\' || c == '.' || c == '*' || c == '+' || c == '?' || c == '|' || c == '(' || c == ')' || c == '[' || c == ']' {
 			return newToken(tokenKindChar, c), nil
 		}
@@ -259,6 +314,9 @@ func (l *lexer) nextInBExp(c rune) (*token, error) {
 		if eof {
 			return nil, synErrIncompletedEscSeq
 		}
+		if c == 'u' {
+			return newToken(tokenKindCodePointLeader, nullChar), nil
+		}
 		if c == '\\' || c == '^' || c == '-' || c == ']' {
 			return newToken(tokenKindChar, c), nil
 		}
@@ -269,6 +327,60 @@ func (l *lexer) nextInBExp(c rune) (*token, error) {
 	}
 }
 
+// nextInCodePoint lexes one token inside a code point expression such as \u{3042}.
+//
+// c is the character already read by the caller. '{' and '}' yield the brace
+// tokens; any other character must start a run of hex digits, which is read up
+// to (but not including) the closing '}' or EOF and returned as a single
+// tokenKindCodePoint token. The run must be exactly 4 or 6 digits long,
+// otherwise synErrInvalidCodePoint is returned.
+func (l *lexer) nextInCodePoint(c rune) (*token, error) {
+	switch c {
+	case '{':
+		return newToken(tokenKindLBrace, nullChar), nil
+	case '}':
+		return newToken(tokenKindRBrace, nullChar), nil
+	}
+	if !isHexDigit(c) {
+		return nil, synErrInvalidCodePoint
+	}
+
+	var digits strings.Builder
+	digits.WriteRune(c)
+	for n := 1; ; n++ {
+		c, eof, err := l.read()
+		if err != nil {
+			return nil, err
+		}
+		if eof || c == '}' {
+			// Un-read the terminator via restore() so a later call lexes it.
+			// The error from restore() must be checked on both paths.
+			if err := l.restore(); err != nil {
+				return nil, err
+			}
+			break
+		}
+		// A 7th digit can never be part of a valid code point.
+		if !isHexDigit(c) || n >= 6 {
+			return nil, synErrInvalidCodePoint
+		}
+		digits.WriteRune(c)
+	}
+
+	cp := digits.String()
+	if len(cp) != 4 && len(cp) != 6 {
+		return nil, synErrInvalidCodePoint
+	}
+	return newCodePointToken(cp), nil
+}
+
+// isHexDigit reports whether c is an ASCII hexadecimal digit: 0-9, A-F, or a-f.
+// Letters beyond F must be rejected here because the parser later feeds the
+// digits to hex.DecodeString and panics if decoding fails.
+func isHexDigit(c rune) bool {
+	return '0' <= c && c <= '9' || 'A' <= c && c <= 'F' || 'a' <= c && c <= 'f'
+}
+
 func (l *lexer) read() (rune, bool, error) {
 	if l.reachedEOF {
 		return l.lastChar, l.reachedEOF, nil
diff --git a/compiler/lexer_test.go b/compiler/lexer_test.go
index c77d7c7..87e3a81 100644
--- a/compiler/lexer_test.go
+++ b/compiler/lexer_test.go
@@ -30,7 +30,7 @@ func TestLexer(t *testing.T) {
 		},
 		{
 			caption: "lexer can recognize the special characters in default mode",
-			src:     ".*+?|()[",
+			src:     ".*+?|()[\\u",
 			tokens: []*token{
 				newToken(tokenKindAnyChar, nullChar),
 				newToken(tokenKindRepeat, nullChar),
@@ -40,6 +40,7 @@ func TestLexer(t *testing.T) {
 				newToken(tokenKindGroupOpen, nullChar),
 				newToken(tokenKindGroupClose, nullChar),
 				newToken(tokenKindBExpOpen, nullChar),
+				newToken(tokenKindCodePointLeader, nullChar),
 				newToken(tokenKindEOF, nullChar),
 			},
 		},
@@ -60,26 +61,36 @@ func TestLexer(t *testing.T) {
 			},
 		},
 		{
-			caption: "] is treated as an ordinary character in default mode",
-			src:     "]",
+			caption: "], {, and } are treated as an ordinary character in default mode",
+			src:     "]{}",
 			tokens: []*token{
 				newToken(tokenKindChar, ']'),
+				newToken(tokenKindChar, '{'),
+				newToken(tokenKindChar, '}'),
 				newToken(tokenKindEOF, nullChar),
 			},
 		},
 		{
 			caption: "lexer can recognize the special characters in bracket expression mode",
-			src:     "[a-z][^a-z]",
+			src:     "[a-z\\u{09AF}][^a-z\\u{09abcf}]",
 			tokens: []*token{
 				newToken(tokenKindBExpOpen, nullChar),
 				newToken(tokenKindChar, 'a'),
 				newToken(tokenKindCharRange, nullChar),
 				newToken(tokenKindChar, 'z'),
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+				newCodePointToken("09AF"),
+				newToken(tokenKindRBrace, nullChar),
 				newToken(tokenKindBExpClose, nullChar),
 				newToken(tokenKindInverseBExpOpen, nullChar),
 				newToken(tokenKindChar, 'a'),
 				newToken(tokenKindCharRange, nullChar),
 				newToken(tokenKindChar, 'z'),
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+				newCodePointToken("09abcf"),
+				newToken(tokenKindRBrace, nullChar),
 				newToken(tokenKindBExpClose, nullChar),
 				newToken(tokenKindEOF, nullChar),
 			},
@@ -233,6 +244,163 @@ func TestLexer(t *testing.T) {
 			},
 			err: synErrIncompletedEscSeq,
 		},
+		{
+			caption: "lexer can recognize the special characters and code points in code point expression mode",
+			src:     "\\u{0123}\\u{4567}\\u{89abcd}\\u{efAB}\\u{CDEF01}[\\u{0123}\\u{4567}\\u{89abcd}\\u{efAB}\\u{CDEF01}][^\\u{0123}\\u{4567}\\u{89abcd}\\u{efAB}\\u{CDEF01}]",
+			tokens: []*token{
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+				newCodePointToken("0123"),
+				newToken(tokenKindRBrace, nullChar),
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+				newCodePointToken("4567"),
+				newToken(tokenKindRBrace, nullChar),
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+				newCodePointToken("89abcd"),
+				newToken(tokenKindRBrace, nullChar),
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+				newCodePointToken("efAB"),
+				newToken(tokenKindRBrace, nullChar),
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+				newCodePointToken("CDEF01"),
+				newToken(tokenKindRBrace, nullChar),
+
+				newToken(tokenKindBExpOpen, nullChar),
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+				newCodePointToken("0123"),
+				newToken(tokenKindRBrace, nullChar),
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+				newCodePointToken("4567"),
+				newToken(tokenKindRBrace, nullChar),
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+				newCodePointToken("89abcd"),
+				newToken(tokenKindRBrace, nullChar),
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+				newCodePointToken("efAB"),
+				newToken(tokenKindRBrace, nullChar),
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+				newCodePointToken("CDEF01"),
+				newToken(tokenKindRBrace, nullChar),
+				newToken(tokenKindBExpClose, nullChar),
+
+				newToken(tokenKindInverseBExpOpen, nullChar),
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+				newCodePointToken("0123"),
+				newToken(tokenKindRBrace, nullChar),
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+				newCodePointToken("4567"),
+				newToken(tokenKindRBrace, nullChar),
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+				newCodePointToken("89abcd"),
+				newToken(tokenKindRBrace, nullChar),
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+				newCodePointToken("efAB"),
+				newToken(tokenKindRBrace, nullChar),
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+				newCodePointToken("CDEF01"),
+				newToken(tokenKindRBrace, nullChar),
+				newToken(tokenKindBExpClose, nullChar),
+
+				newToken(tokenKindEOF, nullChar),
+			},
+		},
+		{
+			caption: "a one digit hex string isn't a valid code point",
+			src:     "\\u{0",
+			tokens: []*token{
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+			},
+			err: synErrInvalidCodePoint,
+		},
+		{
+			caption: "a two digits hex string isn't a valid code point",
+			src:     "\\u{01",
+			tokens: []*token{
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+			},
+			err: synErrInvalidCodePoint,
+		},
+		{
+			caption: "a three digits hex string isn't a valid code point",
+			src:     "\\u{012",
+			tokens: []*token{
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+			},
+			err: synErrInvalidCodePoint,
+		},
+		{
+			caption: "a four digits hex string is a valid code point",
+			src:     "\\u{0123}",
+			tokens: []*token{
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+				newCodePointToken("0123"),
+				newToken(tokenKindRBrace, nullChar),
+			},
+		},
+		{
+			caption: "a five digits hex string isn't a valid code point",
+			src:     "\\u{01234",
+			tokens: []*token{
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+			},
+			err: synErrInvalidCodePoint,
+		},
+		{
+			caption: "a six digits hex string is a valid code point",
+			src:     "\\u{012345}",
+			tokens: []*token{
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+				newCodePointToken("012345"),
+				newToken(tokenKindRBrace, nullChar),
+			},
+		},
+		{
+			caption: "a seven digits hex string isn't a valid code point",
+			src:     "\\u{0123456",
+			tokens: []*token{
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+			},
+			err: synErrInvalidCodePoint,
+		},
+		{
+			caption: "a code point must be hex digits",
+			src:     "\\u{g",
+			tokens: []*token{
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+			},
+			err: synErrInvalidCodePoint,
+		},
+		{
+			caption: "a code point must be hex digits",
+			src:     "\\u{G",
+			tokens: []*token{
+				newToken(tokenKindCodePointLeader, nullChar),
+				newToken(tokenKindLBrace, nullChar),
+			},
+			err: synErrInvalidCodePoint,
+		},
 	}
 	for _, tt := range tests {
 		t.Run(tt.caption, func(t *testing.T) {
@@ -268,7 +436,7 @@ func TestLexer(t *testing.T) {
 
 func testToken(t *testing.T, a, e *token) {
 	t.Helper()
-	if e.kind != a.kind || e.char != a.char {
-		t.Fatalf("unexpected token; want: %v, got: %v", e, a)
+	if e.kind != a.kind || e.char != a.char || e.codePoint != a.codePoint {
+		t.Fatalf("unexpected token; want: %+v, got: %+v", e, a)
 	}
 }
diff --git a/compiler/parser.go b/compiler/parser.go
index 63e5549..ba15cd0 100644
--- a/compiler/parser.go
+++ b/compiler/parser.go
@@ -2,6 +2,8 @@ package compiler
 
 import (
 	"bytes"
+	"encoding/binary"
+	"encoding/hex"
 	"fmt"
 	"io"
 	"strings"
@@ -307,6 +309,9 @@ func (p *parser) parseSingleChar() astNode {
 		p.expect(tokenKindBExpClose)
 		return inverse
 	}
+	if p.consume(tokenKindCodePointLeader) {
+		return p.parseCodePoint()
+	}
 	c := p.parseNormalChar()
 	if c == nil {
 		if p.consume(tokenKindBExpClose) {
@@ -318,6 +323,9 @@ func (p *parser) parseSingleChar() astNode {
 }
 
 func (p *parser) parseBExpElem() astNode {
+	if p.consume(tokenKindCodePointLeader) {
+		return p.parseCodePoint()
+	}
 	left := p.parseNormalChar()
 	if left == nil {
 		return nil
@@ -338,6 +346,53 @@ func (p *parser) parseBExpElem() astNode {
 	return genRangeAST(left, right)
 }
 
+// parseCodePoint parses the braces and digits of a code point expression
+// (the leading \u has already been consumed by the caller) and returns an
+// AST matching the UTF-8 encoding of that code point, one symbol per byte.
+//
+// It raises synErrCPExpInvalidForm when a brace or the digits are missing and
+// synErrCPExpOutOfRange when the value is above U+10FFFF.
+func (p *parser) parseCodePoint() astNode {
+	if !p.consume(tokenKindLBrace) {
+		raiseSyntaxError(synErrCPExpInvalidForm)
+	}
+	if !p.consume(tokenKindCodePoint) {
+		raiseSyntaxError(synErrCPExpInvalidForm)
+	}
+
+	// hex.DecodeString needs an even number of digits; the lexer only emits
+	// 4- or 6-digit code points, so a failure here is an internal error.
+	digits := p.lastTok.codePoint
+	raw, err := hex.DecodeString(digits)
+	if err != nil {
+		panic(fmt.Errorf("failed to decode a code point (%v) into a byte slice: %v", digits, err))
+	}
+	if len(raw) > 4 {
+		raiseSyntaxError(synErrCPExpOutOfRange)
+	}
+	// Right-align the decoded bytes in a 4-byte big-endian buffer.
+	var buf [4]byte
+	copy(buf[4-len(raw):], raw)
+	// n is unsigned, so only the upper bound needs checking.
+	n := binary.BigEndian.Uint32(buf[:])
+	if n > 0x10FFFF {
+		raiseSyntaxError(synErrCPExpOutOfRange)
+	}
+
+	// NOTE(review): surrogates (U+D800-U+DFFF) are encoded as U+FFFD by the
+	// string conversion below; confirm whether they should be rejected instead.
+	cp := []byte(string(rune(n)))
+	var concat astNode = newSymbolNode(cp[0])
+	for _, b := range cp[1:] {
+		concat = genConcatNode(concat, newSymbolNode(b))
+	}
+
+	if !p.consume(tokenKindRBrace) {
+		raiseSyntaxError(synErrCPExpInvalidForm)
+	}
+	return concat
+}
+
 func (p *parser) parseNormalChar() astNode {
 	if !p.consume(tokenKindChar) {
 		return nil
diff --git a/compiler/parser_test.go b/compiler/parser_test.go
index dcbe924..c636d8b 100644
--- a/compiler/parser_test.go
+++ b/compiler/parser_test.go
@@ -71,6 +71,19 @@ func TestParser_parse(t *testing.T) {
 			),
 		},
 		{
+			pattern: "\\u{3042}?",
+			ast: genConcatNode(
+				newOptionNode(
+					genConcatNode(
+						newSymbolNodeWithPos(0xE3, symPos(1)),
+						newSymbolNodeWithPos(0x81, symPos(2)),
+						newSymbolNodeWithPos(0x82, symPos(3)),
+					),
+				),
+				newEndMarkerNodeWithPos(1, endPos(4)),
+			),
+		},
+		{
 			pattern: "(a)?",
 			ast: genConcatNode(
 				newOptionNode(
@@ -160,6 +173,19 @@ func TestParser_parse(t *testing.T) {
 			),
 		},
 		{
+			pattern: "\\u{3042}*",
+			ast: genConcatNode(
+				newRepeatNode(
+					genConcatNode(
+						newSymbolNodeWithPos(0xE3, symPos(1)),
+						newSymbolNodeWithPos(0x81, symPos(2)),
+						newSymbolNodeWithPos(0x82, symPos(3)),
+					),
+				),
+				newEndMarkerNodeWithPos(1, endPos(4)),
+			),
+		},
+		{
 			pattern: "((a*)*)*",
 			ast: genConcatNode(
 				newRepeatNode(
@@ -246,6 +272,24 @@ func TestParser_parse(t *testing.T) {
 			),
 		},
 		{
+			pattern: "\\u{3042}+",
+			ast: genConcatNode(
+				genConcatNode(
+					newSymbolNodeWithPos(0xE3, symPos(1)),
+					newSymbolNodeWithPos(0x81, symPos(2)),
+					newSymbolNodeWithPos(0x82, symPos(3)),
+				),
+				newRepeatNode(
+					genConcatNode(
+						newSymbolNodeWithPos(0xE3, symPos(4)),
+						newSymbolNodeWithPos(0x81, symPos(5)),
+						newSymbolNodeWithPos(0x82, symPos(6)),
+					),
+				),
+				newEndMarkerNodeWithPos(1, endPos(7)),
+			),
+		},
+		{
 			pattern: "((a+)+)+",
 			ast: genConcatNode(
 				genConcatNode(
@@ -715,6 +759,85 @@ func TestParser_parse(t *testing.T) {
 			),
 		},
 		{
+			pattern: "\\u{006E}",
+			ast: genConcatNode(
+				newSymbolNodeWithPos(0x6E, symPos(1)),
+				newEndMarkerNodeWithPos(1, endPos(2)),
+			),
+		},
+		{
+			pattern: "\\u{03BD}",
+			ast: genConcatNode(
+				genConcatNode(
+					newSymbolNodeWithPos(0xCE, symPos(1)),
+					newSymbolNodeWithPos(0xBD, symPos(2)),
+				),
+				newEndMarkerNodeWithPos(1, endPos(3)),
+			),
+		},
+		{
+			pattern: "\\u{306B}",
+			ast: genConcatNode(
+				genConcatNode(
+					newSymbolNodeWithPos(0xE3, symPos(1)),
+					newSymbolNodeWithPos(0x81, symPos(2)),
+					newSymbolNodeWithPos(0xAB, symPos(3)),
+				),
+				newEndMarkerNodeWithPos(1, endPos(4)),
+			),
+		},
+		{
+			pattern: "\\u{01F638}",
+			ast: genConcatNode(
+				genConcatNode(
+					newSymbolNodeWithPos(0xF0, symPos(1)),
+					newSymbolNodeWithPos(0x9F, symPos(2)),
+					newSymbolNodeWithPos(0x98, symPos(3)),
+					newSymbolNodeWithPos(0xB8, symPos(4)),
+				),
+				newEndMarkerNodeWithPos(1, endPos(5)),
+			),
+		},
+		{
+			pattern: "\\u{0000}",
+			ast: genConcatNode(
+				newSymbolNodeWithPos(0x00, symPos(1)),
+				newEndMarkerNodeWithPos(1, endPos(2)),
+			),
+		},
+		{
+			pattern: "\\u{10FFFF}",
+			ast: genConcatNode(
+				genConcatNode(
+					newSymbolNodeWithPos(0xF4, symPos(1)),
+					newSymbolNodeWithPos(0x8F, symPos(2)),
+					newSymbolNodeWithPos(0xBF, symPos(3)),
+					newSymbolNodeWithPos(0xBF, symPos(4)),
+				),
+				newEndMarkerNodeWithPos(1, endPos(5)),
+			),
+		},
+		{
+			pattern:     "\\u{110000}",
+			syntaxError: synErrCPExpOutOfRange,
+		},
+		{
+			pattern:     "\\u",
+			syntaxError: synErrCPExpInvalidForm,
+		},
+		{
+			pattern:     "\\u{",
+			syntaxError: synErrCPExpInvalidForm,
+		},
+		{
+			pattern:     "\\u{03BD",
+			syntaxError: synErrCPExpInvalidForm,
+		},
+		{
+			pattern:     "\\u{}",
+			syntaxError: synErrCPExpInvalidForm,
+		},
+		{
 			pattern: "(a)",
 			ast: newConcatNode(
 				newSymbolNodeWithPos(byte('a'), symPos(1)),
@@ -997,8 +1120,8 @@ func testAST(t *testing.T, expected, actual astNode) {
 	switch e := expected.(type) {
 	case *symbolNode:
 		a := actual.(*symbolNode)
-		if a.pos != e.pos {
-			t.Fatalf("symbol position is mismatched; want: %v, got: %v", e.pos, a.pos)
+		if a.pos != e.pos || a.from != e.from || a.to != e.to {
+			t.Fatalf("unexpected node; want: %+v, got: %+v", e, a)
 		}
 	case *endMarkerNode:
 		a := actual.(*endMarkerNode)
diff --git a/compiler/syntax_error.go b/compiler/syntax_error.go
index be2dc38..df8977d 100644
--- a/compiler/syntax_error.go
+++ b/compiler/syntax_error.go
@@ -20,6 +20,7 @@ var (
 	// lexical errors
 	synErrIncompletedEscSeq = newSyntaxError("incompleted escape sequence; unexpected EOF following \\")
 	synErrInvalidEscSeq     = newSyntaxError("invalid escape sequence")
+	synErrInvalidCodePoint  = newSyntaxError("code points must consist of just 4 or 6 hex digits")
 
 	// syntax errors
 	synErrUnexpectedToken   = newSyntaxError("unexpected token")
@@ -34,4 +35,6 @@ var (
 	synErrBExpUnclosed      = newSyntaxError("unclosed bracket expression")
 	synErrBExpInvalidForm   = newSyntaxError("invalid bracket expression")
 	synErrRangeInvalidOrder = newSyntaxError("a range expression with invalid order")
+	synErrCPExpInvalidForm  = newSyntaxError("invalid code point expression")
+	synErrCPExpOutOfRange   = newSyntaxError("a code point must be between U+0000 to U+10FFFF")
 )
diff --git a/driver/lexer_test.go b/driver/lexer_test.go
index 68830a5..26b5d49 100644
--- a/driver/lexer_test.go
+++ b/driver/lexer_test.go
@@ -143,12 +143,12 @@ func TestLexer_Next(t *testing.T) {
 		{
 			lspec: &spec.LexSpec{
 				Entries: []*spec.LexEntry{
-					// all 1 byte characters
+					// all 1 byte characters except null character (U+0000)
 					//
 					// NOTE:
 					// maleeni cannot handle the null character in patterns because compiler.lexer,
 					// specifically read() and restore(), recognizes the null characters as that a symbol doesn't exist.
-					// There is room for improvement in this behavior of the lexer.
+					// If a pattern needs a null character, use code point expression \u{0000}.
 					newLexEntry("1ByteChar", "[\x01-\x7f]"),
 				},
 			},
@@ -416,6 +416,39 @@ func TestLexer_Next(t *testing.T) {
 				newEOFToken(),
 			},
 		},
+		{
+			lspec: &spec.LexSpec{
+				Entries: []*spec.LexEntry{
+					newLexEntry("1ByteChar", "\\u{006E}"),
+					newLexEntry("2ByteChar", "\\u{03BD}"),
+					newLexEntry("3ByteChar", "\\u{306B}"),
+					newLexEntry("4ByteChar", "\\u{01F638}"),
+				},
+			},
+			src: "nνに😸",
+			tokens: []*Token{
+				newToken(1, "1ByteChar", newByteSequence([]byte{0x6E})),
+				newToken(2, "2ByteChar", newByteSequence([]byte{0xCE, 0xBD})),
+				newToken(3, "3ByteChar", newByteSequence([]byte{0xE3, 0x81, 0xAB})),
+				newToken(4, "4ByteChar", newByteSequence([]byte{0xF0, 0x9F, 0x98, 0xB8})),
+				newEOFToken(),
+			},
+		},
+		{
+			lspec: &spec.LexSpec{
+				Entries: []*spec.LexEntry{
+					newLexEntry("codePointsAlt", "[\\u{006E}\\u{03BD}\\u{306B}\\u{01F638}]"),
+				},
+			},
+			src: "nνに😸",
+			tokens: []*Token{
+				newToken(1, "codePointsAlt", newByteSequence([]byte{0x6E})),
+				newToken(1, "codePointsAlt", newByteSequence([]byte{0xCE, 0xBD})),
+				newToken(1, "codePointsAlt", newByteSequence([]byte{0xE3, 0x81, 0xAB})),
+				newToken(1, "codePointsAlt", newByteSequence([]byte{0xF0, 0x9F, 0x98, 0xB8})),
+				newEOFToken(),
+			},
+		},
 	}
 	for i, tt := range test {
 		t.Run(fmt.Sprintf("#%v", i), func(t *testing.T) {