author | Ryo Nihei <nihei.dev@gmail.com> | 2021-02-14 17:38:46 +0900
committer | Ryo Nihei <nihei.dev@gmail.com> | 2021-02-14 17:54:18 +0900
commit | a1d1cfe08ae809d454ac6f1ce80a19395e7940e5 (patch)
tree | 9fb55c6b8bbf25e493588442936e65c1cb7755db
parent | Add driver (diff)
download | tre-a1d1cfe08ae809d454ac6f1ce80a19395e7940e5.tar.gz, tre-a1d1cfe08ae809d454ac6f1ce80a19395e7940e5.tar.xz
Add dot symbol matching any single character
The dot symbol matches any single character. When the dot symbol appears, the parser generates an AST that matches any well-formed UTF-8 byte sequence; the byte ranges it encodes are sketched after the references below.
References:
* https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf#G7404
* Table 3-6. UTF-8 Bit Distribution
* Table 3-7. Well-Formed UTF-8 Byte Sequences
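The following standalone Go sketch is not part of this commit; it only spells out the byte ranges from Table 3-7 that genAnyCharAST encodes, one row per sequence length, and the matchesDot helper and its names are hypothetical. A byte sequence is a single character exactly when each byte falls into the corresponding range of one row.

package main

import "fmt"

// byteRange mirrors the {from, to} pair the compiler attaches to each symbol.
type byteRange struct{ from, to byte }

// wellFormed lists the rows of Table 3-7 (Well-Formed UTF-8 Byte Sequences);
// genAnyCharAST builds the alternation of exactly these byte-range chains.
var wellFormed = [][]byteRange{
	{{0x00, 0x7f}},                                           // 1-byte characters
	{{0xc2, 0xdf}, {0x80, 0xbf}},                             // 2-byte characters
	{{0xe0, 0xe0}, {0xa0, 0xbf}, {0x80, 0xbf}},               // 3-byte characters
	{{0xe1, 0xec}, {0x80, 0xbf}, {0x80, 0xbf}},               // 3-byte characters
	{{0xed, 0xed}, {0x80, 0x9f}, {0x80, 0xbf}},               // 3-byte characters (excludes surrogates)
	{{0xee, 0xef}, {0x80, 0xbf}, {0x80, 0xbf}},               // 3-byte characters
	{{0xf0, 0xf0}, {0x90, 0xbf}, {0x80, 0xbf}, {0x80, 0xbf}}, // 4-byte characters
	{{0xf1, 0xf3}, {0x80, 0xbf}, {0x80, 0xbf}, {0x80, 0xbf}}, // 4-byte characters
	{{0xf4, 0xf4}, {0x80, 0x8f}, {0x80, 0xbf}, {0x80, 0xbf}}, // 4-byte characters (up to U+10FFFF)
}

// matchesDot reports whether seq is exactly one well-formed UTF-8 character,
// i.e. whether the dot symbol accepts it.
func matchesDot(seq []byte) bool {
	for _, row := range wellFormed {
		if len(seq) != len(row) {
			continue
		}
		ok := true
		for i, r := range row {
			if seq[i] < r.from || seq[i] > r.to {
				ok = false
				break
			}
		}
		if ok {
			return true
		}
	}
	return false
}

func main() {
	fmt.Println(matchesDot([]byte{0x61}))             // true: "a"
	fmt.Println(matchesDot([]byte{0xe3, 0x81, 0x82})) // true: "あ"
	fmt.Println(matchesDot([]byte{0xed, 0xa0, 0x80})) // false: surrogate half U+D800
	fmt.Println(matchesDot([]byte{0xc0, 0xaf}))       // false: overlong encoding of "/"
}

Note how the rows exclude surrogate halves (second byte of an ED sequence stops at 9F) and overlong or out-of-range encodings, which is why the dot symbol cannot be approximated with a plain 00..FF range per byte position.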
-rw-r--r-- | compiler/ast.go | 29
-rw-r--r-- | compiler/dfa.go | 10
-rw-r--r-- | compiler/lexer.go | 5
-rw-r--r-- | compiler/lexer_test.go | 6
-rw-r--r-- | compiler/parser.go | 107
-rw-r--r-- | compiler/parser_test.go | 21
-rw-r--r-- | driver/lexer_test.go | 44
7 files changed, 201 insertions, 21 deletions
diff --git a/compiler/ast.go b/compiler/ast.go
index d31c92b..d4b8956 100644
--- a/compiler/ast.go
+++ b/compiler/ast.go
@@ -119,6 +119,15 @@ func (s symbolPositionSet) sort() []symbolPosition {
 	return sorted
 }
 
+type byteRange struct {
+	from byte
+	to   byte
+}
+
+func (r byteRange) String() string {
+	return fmt.Sprintf("%v - %v", r.from, r.to)
+}
+
 type astNode interface {
 	fmt.Stringer
 	children() (astNode, astNode)
@@ -128,21 +137,35 @@ type astNode interface {
 }
 
 type symbolNode struct {
+	byteRange
 	token *token
-	value byte
 	pos   symbolPosition
 }
 
 func newSymbolNode(tok *token, value byte, pos symbolPosition) *symbolNode {
 	return &symbolNode{
+		byteRange: byteRange{
+			from: value,
+			to:   value,
+		},
+		token: tok,
+		pos:   pos,
+	}
+}
+
+func newRangeSymbolNode(tok *token, from, to byte, pos symbolPosition) *symbolNode {
+	return &symbolNode{
+		byteRange: byteRange{
+			from: from,
+			to:   to,
+		},
 		token: tok,
-		value: value,
 		pos:   pos,
 	}
 }
 
 func (n *symbolNode) String() string {
-	return fmt.Sprintf("{type: char, char: %v, int: %v, pos: %v}", string(n.token.char), n.token.char, n.pos)
+	return fmt.Sprintf("{type: symbol, value: %v - %v, token char: %v, pos: %v}", n.from, n.to, string(n.token.char), n.pos)
 }
 
 func (n *symbolNode) children() (astNode, astNode) {
diff --git a/compiler/dfa.go b/compiler/dfa.go
index 84692a4..fec93ce 100644
--- a/compiler/dfa.go
+++ b/compiler/dfa.go
@@ -29,11 +29,13 @@ func genDFA(root astNode, symTab *symbolTable) *DFA {
 			if pos.isEndMark() {
 				continue
 			}
-			symVal := int(symTab.symPos2Byte[pos])
-			if tranTabOfState[symVal] == nil {
-				tranTabOfState[symVal] = newSymbolPositionSet()
+			valRange := symTab.symPos2Byte[pos]
+			for symVal := valRange.from; symVal <= valRange.to; symVal++ {
+				if tranTabOfState[symVal] == nil {
+					tranTabOfState[symVal] = newSymbolPositionSet()
+				}
+				tranTabOfState[symVal].merge(follow[pos])
 			}
-			tranTabOfState[symVal].merge(follow[pos])
 		}
 		for _, t := range tranTabOfState {
 			if t == nil {
diff --git a/compiler/lexer.go b/compiler/lexer.go
index f78b920..1c09260 100644
--- a/compiler/lexer.go
+++ b/compiler/lexer.go
@@ -10,6 +10,7 @@ type tokenKind string
 
 const (
 	tokenKindChar       = tokenKind("char")
+	tokenKindAnyChar    = tokenKind(".")
 	tokenKindRepeat     = tokenKind("*")
 	tokenKindAlt        = tokenKind("|")
 	tokenKindGroupOpen  = tokenKind("(")
@@ -59,6 +60,8 @@ func (l *lexer) next() (*token, error) {
 	switch c {
 	case '*':
 		return newToken(tokenKindRepeat, nullChar), nil
+	case '.':
+		return newToken(tokenKindAnyChar, nullChar), nil
 	case '|':
 		return newToken(tokenKindAlt, nullChar), nil
 	case '(':
@@ -76,7 +79,7 @@ func (l *lexer) next() (*token, error) {
 		}
 	}
 	switch {
-	case c == '\\' || c == '*' || c == '|' || c == '(' || c == ')':
+	case c == '\\' || c == '.' || c == '*' || c == '|' || c == '(' || c == ')':
 		return newToken(tokenKindChar, c), nil
 	default:
 		return nil, &SyntaxError{
diff --git a/compiler/lexer_test.go b/compiler/lexer_test.go
index b172ae9..75770b0 100644
--- a/compiler/lexer_test.go
+++ b/compiler/lexer_test.go
@@ -31,8 +31,9 @@ func TestLexer(t *testing.T) {
 		},
 		{
 			caption: "lexer can recognize the special characters",
-			src:     "*|()",
+			src:     ".*|()",
 			tokens: []*token{
+				newToken(tokenKindAnyChar, nullChar),
 				newToken(tokenKindRepeat, nullChar),
 				newToken(tokenKindAlt, nullChar),
 				newToken(tokenKindGroupOpen, nullChar),
@@ -42,9 +43,10 @@ func TestLexer(t *testing.T) {
 		},
 		{
 			caption: "lexer can recognize the escape sequences",
-			src:     "\\\\\\*\\|\\(\\)",
+			src:     "\\\\\\.\\*\\|\\(\\)",
 			tokens: []*token{
 				newToken(tokenKindChar, '\\'),
+				newToken(tokenKindChar, '.'),
 				newToken(tokenKindChar, '*'),
 				newToken(tokenKindChar, '|'),
 				newToken(tokenKindChar, '('),
diff --git a/compiler/parser.go b/compiler/parser.go
index 0039404..03dc198 100644
--- a/compiler/parser.go
+++ b/compiler/parser.go
@@ -22,13 +22,13 @@ func raiseSyntaxError(message string) {
 }
 
 type symbolTable struct {
-	symPos2Byte map[symbolPosition]byte
+	symPos2Byte map[symbolPosition]byteRange
 	endPos2ID   map[symbolPosition]int
 }
 
 func genSymbolTable(root astNode) *symbolTable {
 	symTab := &symbolTable{
-		symPos2Byte: map[symbolPosition]byte{},
+		symPos2Byte: map[symbolPosition]byteRange{},
 		endPos2ID:   map[symbolPosition]int{},
 	}
 	return genSymTab(symTab, root)
@@ -41,7 +41,10 @@ func genSymTab(symTab *symbolTable, node astNode) *symbolTable {
 
 	switch n := node.(type) {
 	case *symbolNode:
-		symTab.symPos2Byte[n.pos] = n.value
+		symTab.symPos2Byte[n.pos] = byteRange{
+			from: n.from,
+			to:   n.to,
+		}
 	case *endMarkerNode:
 		symTab.endPos2ID[n.pos] = n.id
 	default:
@@ -152,6 +155,9 @@ func (p *parser) parseGroup() astNode {
 		defer p.expect(tokenKindGroupClose)
 		return p.parseAlt()
 	}
+	if p.consume(tokenKindAnyChar) {
+		return genAnyCharAST(p.lastTok)
+	}
 	if !p.consume(tokenKindChar) {
 		return nil
 	}
@@ -187,6 +193,101 @@ func (p *parser) parseGroup() astNode {
 	}
 }
 
+// Refelences:
+// * https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf#G7404
+//   * Table 3-6. UTF-8 Bit Distribution
+//   * Table 3-7. Well-Formed UTF-8 Byte Sequences
+func genAnyCharAST(tok *token) astNode {
+	return newAltNode(
+		newAltNode(
+			newAltNode(
+				newAltNode(
+					newAltNode(
+						newAltNode(
+							newAltNode(
+								newAltNode(
+									// 1 byte character <00..7F>
+									newRangeSymbolNode(tok, 0x00, 0x7f, symbolPositionNil),
+									// 2 bytes character <C2..DF 80..BF>
+									newConcatNode(
+										newRangeSymbolNode(tok, 0xc2, 0xdf, symbolPositionNil),
+										newRangeSymbolNode(tok, 0x80, 0xbf, symbolPositionNil),
+									),
+								),
+								// 3 bytes character <E0 A0..BF 80..BF>
+								newConcatNode(
+									newConcatNode(
+										newSymbolNode(tok, 0xe0, symbolPositionNil),
+										newRangeSymbolNode(tok, 0xa0, 0xbf, symbolPositionNil),
+									),
+									newRangeSymbolNode(tok, 0x80, 0xbf, symbolPositionNil),
+								),
+							),
+							// 3 bytes character <E1..EC 80..BF 80..BF>
+							newConcatNode(
+								newConcatNode(
+									newRangeSymbolNode(tok, 0xe1, 0xec, symbolPositionNil),
+									newRangeSymbolNode(tok, 0x80, 0xbf, symbolPositionNil),
+								),
+								newRangeSymbolNode(tok, 0x80, 0xbf, symbolPositionNil),
+							),
+						),
+						// 3 bytes character <ED 80..9F 80..BF>
+						newConcatNode(
+							newConcatNode(
+								newSymbolNode(tok, 0xed, symbolPositionNil),
+								newRangeSymbolNode(tok, 0x80, 0x9f, symbolPositionNil),
+							),
+							newRangeSymbolNode(tok, 0x80, 0xbf, symbolPositionNil),
+						),
+					),
+					// 3 bytes character <EE..EF 80..BF 80..BF>
+					newConcatNode(
+						newConcatNode(
+							newRangeSymbolNode(tok, 0xee, 0xef, symbolPositionNil),
+							newRangeSymbolNode(tok, 0x80, 0xbf, symbolPositionNil),
+						),
+						newRangeSymbolNode(tok, 0x80, 0xbf, symbolPositionNil),
+					),
+				),
+				// 4 bytes character <F0 90..BF 80..BF 80..BF>
+				newConcatNode(
+					newConcatNode(
+						newConcatNode(
+							newSymbolNode(tok, 0xf0, symbolPositionNil),
+							newRangeSymbolNode(tok, 0x90, 0xbf, symbolPositionNil),
+						),
+						newRangeSymbolNode(tok, 0x80, 0xbf, symbolPositionNil),
+					),
+					newRangeSymbolNode(tok, 0x80, 0xbf, symbolPositionNil),
+				),
+			),
+			// 4 bytes character <F1..F3 80..BF 80..BF 80..BF>
+			newConcatNode(
+				newConcatNode(
+					newConcatNode(
+						newRangeSymbolNode(tok, 0xf1, 0xf3, symbolPositionNil),
+						newRangeSymbolNode(tok, 0x80, 0xbf, symbolPositionNil),
+					),
+					newRangeSymbolNode(tok, 0x80, 0xbf, symbolPositionNil),
+				),
+				newRangeSymbolNode(tok, 0x80, 0xbf, symbolPositionNil),
+			),
+		),
+		// 4 bytes character <F4 80..8F 80..BF 80..BF>
+		newConcatNode(
+			newConcatNode(
+				newConcatNode(
+					newSymbolNode(tok, 0xf4, symbolPositionNil),
+					newRangeSymbolNode(tok, 0x80, 0x8f, symbolPositionNil),
+				),
+				newRangeSymbolNode(tok, 0x80, 0xbf, symbolPositionNil),
+			),
+			newRangeSymbolNode(tok, 0x80, 0xbf, symbolPositionNil),
+		),
+	)
+}
+
 func (p *parser) expect(expected tokenKind) {
 	if !p.consume(expected) {
 		tok := p.peekedTok
diff --git a/compiler/parser_test.go b/compiler/parser_test.go
index 09392be..d96cb4d 100644
--- a/compiler/parser_test.go
+++ b/compiler/parser_test.go
@@ -110,13 +110,20 @@ func TestParser(t *testing.T) {
 	}
 
 	{
+		entry := func(v byte) byteRange {
+			return byteRange{
+				from: v,
+				to:   v,
+			}
+		}
+
 		expectedSymTab := &symbolTable{
-			symPos2Byte: map[symbolPosition]byte{
-				symPos(1): byte('a'),
-				symPos(2): byte('b'),
-				symPos(3): byte('a'),
-				symPos(4): byte('b'),
-				symPos(5): byte('b'),
+			symPos2Byte: map[symbolPosition]byteRange{
+				symPos(1): entry(byte('a')),
+				symPos(2): entry(byte('b')),
+				symPos(3): entry(byte('a')),
+				symPos(4): entry(byte('b')),
+				symPos(5): entry(byte('b')),
 			},
 			endPos2ID: map[symbolPosition]int{
 				endPos(6): 1,
@@ -187,7 +194,7 @@ func testSymbolTable(t *testing.T, expected, actual *symbolTable) {
 			t.Errorf("a symbol position entry was not found: %v -> %v", ePos, eByte)
 			continue
 		}
-		if byte != eByte {
+		if byte.from != eByte.from || byte.to != eByte.to {
 			t.Errorf("unexpected symbol position entry; want: %v -> %v, got: %v -> %v", ePos, eByte, ePos, byte)
 		}
 	}
diff --git a/driver/lexer_test.go b/driver/lexer_test.go
index 4fd4bf8..1a46470 100644
--- a/driver/lexer_test.go
+++ b/driver/lexer_test.go
@@ -35,6 +35,48 @@ func TestLexer_Next(t *testing.T) {
 				newEOFToken(),
 			},
 		},
+		{
+			regexps: [][]byte{
+				[]byte("."),
+			},
+			src: string([]byte{
+				0x00,
+				0x7f,
+				0xc2, 0x80,
+				0xdf, 0xbf,
+				0xe1, 0x80, 0x80,
+				0xec, 0xbf, 0xbf,
+				0xed, 0x80, 0x80,
+				0xed, 0x9f, 0xbf,
+				0xee, 0x80, 0x80,
+				0xef, 0xbf, 0xbf,
+				0xf0, 0x90, 0x80, 0x80,
+				0xf0, 0xbf, 0xbf, 0xbf,
+				0xf1, 0x80, 0x80, 0x80,
+				0xf3, 0xbf, 0xbf, 0xbf,
+				0xf4, 0x80, 0x80, 0x80,
+				0xf4, 0x8f, 0xbf, 0xbf,
+			}),
+			tokens: []*Token{
+				newToken(1, []byte{0x00}),
+				newToken(1, []byte{0x7f}),
+				newToken(1, []byte{0xc2, 0x80}),
+				newToken(1, []byte{0xdf, 0xbf}),
+				newToken(1, []byte{0xe1, 0x80, 0x80}),
+				newToken(1, []byte{0xec, 0xbf, 0xbf}),
+				newToken(1, []byte{0xed, 0x80, 0x80}),
+				newToken(1, []byte{0xed, 0x9f, 0xbf}),
+				newToken(1, []byte{0xee, 0x80, 0x80}),
+				newToken(1, []byte{0xef, 0xbf, 0xbf}),
+				newToken(1, []byte{0xf0, 0x90, 0x80, 0x80}),
+				newToken(1, []byte{0xf0, 0xbf, 0xbf, 0xbf}),
+				newToken(1, []byte{0xf1, 0x80, 0x80, 0x80}),
+				newToken(1, []byte{0xf3, 0xbf, 0xbf, 0xbf}),
+				newToken(1, []byte{0xf4, 0x80, 0x80, 0x80}),
+				newToken(1, []byte{0xf4, 0x8f, 0xbf, 0xbf}),
+				newEOFToken(),
+			},
+		},
 	}
 	for _, tt := range test {
 		res := map[int][]byte{}
@@ -142,6 +184,6 @@ func testToken(t *testing.T, expected, actual *Token) {
 	t.Helper()
 
 	if actual.ID != expected.ID || !bytes.Equal(actual.Match, expected.Match) || actual.EOF != expected.EOF || actual.Invalid != expected.Invalid {
-		t.Errorf("unexpected token; want: %v, got: %v", expected, actual)
+		t.Errorf("unexpected token; want: %v (\"%v\"), got: %v (\"%v\")", expected, string(expected.Match), actual, string(actual.Match))
 	}
 }