author | Ryo Nihei <nihei.dev@gmail.com> | 2021-02-14 17:38:46 +0900
committer | Ryo Nihei <nihei.dev@gmail.com> | 2021-02-14 17:54:18 +0900
commit | a1d1cfe08ae809d454ac6f1ce80a19395e7940e5 (patch)
tree | 9fb55c6b8bbf25e493588442936e65c1cb7755db
parent | Add driver (diff)
download | tre-a1d1cfe08ae809d454ac6f1ce80a19395e7940e5.tar.gz, tre-a1d1cfe08ae809d454ac6f1ce80a19395e7940e5.tar.xz
Add dot symbol matching any single character
The dot symbol matches any single character. When the dot symbol appears, the parser generates an AST that matches any well-formed UTF-8 byte sequence; the byte ranges it encodes are sketched after the references below.
References:
* https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf#G7404
* Table 3-6. UTF-8 Bit Distribution
* Table 3-7. Well-Formed UTF-8 Byte Sequences
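The following standalone Go sketch is not part of this commit; it only spells out the byte ranges from Table 3-7 that genAnyCharAST encodes, one row per sequence length, and the matchesDot helper and its names are hypothetical. A byte sequence is a single character exactly when each byte falls into the corresponding range of one row.

package main

import "fmt"

// byteRange mirrors the {from, to} pair the compiler attaches to each symbol.
type byteRange struct{ from, to byte }

// wellFormed lists the rows of Table 3-7 (Well-Formed UTF-8 Byte Sequences);
// genAnyCharAST builds the alternation of exactly these byte-range chains.
var wellFormed = [][]byteRange{
	{{0x00, 0x7f}},                                           // 1-byte characters
	{{0xc2, 0xdf}, {0x80, 0xbf}},                             // 2-byte characters
	{{0xe0, 0xe0}, {0xa0, 0xbf}, {0x80, 0xbf}},               // 3-byte characters
	{{0xe1, 0xec}, {0x80, 0xbf}, {0x80, 0xbf}},               // 3-byte characters
	{{0xed, 0xed}, {0x80, 0x9f}, {0x80, 0xbf}},               // 3-byte characters (excludes surrogates)
	{{0xee, 0xef}, {0x80, 0xbf}, {0x80, 0xbf}},               // 3-byte characters
	{{0xf0, 0xf0}, {0x90, 0xbf}, {0x80, 0xbf}, {0x80, 0xbf}}, // 4-byte characters
	{{0xf1, 0xf3}, {0x80, 0xbf}, {0x80, 0xbf}, {0x80, 0xbf}}, // 4-byte characters
	{{0xf4, 0xf4}, {0x80, 0x8f}, {0x80, 0xbf}, {0x80, 0xbf}}, // 4-byte characters (up to U+10FFFF)
}

// matchesDot reports whether seq is exactly one well-formed UTF-8 character,
// i.e. whether the dot symbol accepts it.
func matchesDot(seq []byte) bool {
	for _, row := range wellFormed {
		if len(seq) != len(row) {
			continue
		}
		ok := true
		for i, r := range row {
			if seq[i] < r.from || seq[i] > r.to {
				ok = false
				break
			}
		}
		if ok {
			return true
		}
	}
	return false
}

func main() {
	fmt.Println(matchesDot([]byte{0x61}))             // true: "a"
	fmt.Println(matchesDot([]byte{0xe3, 0x81, 0x82})) // true: "あ"
	fmt.Println(matchesDot([]byte{0xed, 0xa0, 0x80})) // false: surrogate half U+D800
	fmt.Println(matchesDot([]byte{0xc0, 0xaf}))       // false: overlong encoding of "/"
}

Note how the rows exclude surrogate halves (second byte of an ED sequence stops at 9F) and overlong or out-of-range encodings, which is why the dot symbol cannot be approximated with a plain 00..FF range per byte position.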
-rw-r--r-- | compiler/ast.go | 29
-rw-r--r-- | compiler/dfa.go | 10
-rw-r--r-- | compiler/lexer.go | 5
-rw-r--r-- | compiler/lexer_test.go | 6
-rw-r--r-- | compiler/parser.go | 107
-rw-r--r-- | compiler/parser_test.go | 21
-rw-r--r-- | driver/lexer_test.go | 44
7 files changed, 201 insertions, 21 deletions
diff --git a/compiler/ast.go b/compiler/ast.go
index d31c92b..d4b8956 100644
--- a/compiler/ast.go
+++ b/compiler/ast.go
@@ -119,6 +119,15 @@ func (s symbolPositionSet) sort() []symbolPosition {
 	return sorted
 }
 
+type byteRange struct {
+	from byte
+	to   byte
+}
+
+func (r byteRange) String() string {
+	return fmt.Sprintf("%v - %v", r.from, r.to)
+}
+
 type astNode interface {
 	fmt.Stringer
 	children() (astNode, astNode)
@@ -128,21 +137,35 @@ type astNode interface {
 }
 
 type symbolNode struct {
+	byteRange
 	token *token
-	value byte
 	pos   symbolPosition
 }
 
 func newSymbolNode(tok *token, value byte, pos symbolPosition) *symbolNode {
 	return &symbolNode{
+		byteRange: byteRange{
+			from: value,
+			to:   value,
+		},
+		token: tok,
+		pos:   pos,
+	}
+}
+
+func newRangeSymbolNode(tok *token, from, to byte, pos symbolPosition) *symbolNode {
+	return &symbolNode{
+		byteRange: byteRange{
+			from: from,
+			to:   to,
+		},
 		token: tok,
-		value: value,
 		pos:   pos,
 	}
 }
 
 func (n *symbolNode) String() string {
-	return fmt.Sprintf("{type: char, char: %v, int: %v, pos: %v}", string(n.token.char), n.token.char, n.pos)
+	return fmt.Sprintf("{type: symbol, value: %v - %v, token char: %v, pos: %v}", n.from, n.to, string(n.token.char), n.pos)
 }
 
 func (n *symbolNode) children() (astNode, astNode) {
diff --git a/compiler/dfa.go b/compiler/dfa.go
index 84692a4..fec93ce 100644
--- a/compiler/dfa.go
+++ b/compiler/dfa.go
@@ -29,11 +29,13 @@ func genDFA(root astNode, symTab *symbolTable) *DFA {
 			if pos.isEndMark() {
 				continue
 			}
-			symVal := int(symTab.symPos2Byte[pos])
-			if tranTabOfState[symVal] == nil {
-				tranTabOfState[symVal] = newSymbolPositionSet()
+			valRange := symTab.symPos2Byte[pos]
+			for symVal := valRange.from; symVal <= valRange.to; symVal++ {
+				if tranTabOfState[symVal] == nil {
+					tranTabOfState[symVal] = newSymbolPositionSet()
+				}
+				tranTabOfState[symVal].merge(follow[pos])
 			}
-			tranTabOfState[symVal].merge(follow[pos])
 		}
 		for _, t := range tranTabOfState {
 			if t == nil {
diff --git a/compiler/lexer.go b/compiler/lexer.go
index f78b920..1c09260 100644
--- a/compiler/lexer.go
+++ b/compiler/lexer.go
@@ -10,6 +10,7 @@ type tokenKind string
 
 const (
 	tokenKindChar       = tokenKind("char")
+	tokenKindAnyChar    = tokenKind(".")
 	tokenKindRepeat     = tokenKind("*")
 	tokenKindAlt        = tokenKind("|")
 	tokenKindGroupOpen  = tokenKind("(")
@@ -59,6 +60,8 @@ func (l *lexer) next() (*token, error) {
 	switch c {
 	case '*':
 		return newToken(tokenKindRepeat, nullChar), nil
+	case '.':
+		return newToken(tokenKindAnyChar, nullChar), nil
 	case '|':
 		return newToken(tokenKindAlt, nullChar), nil
 	case '(':
@@ -76,7 +79,7 @@ func (l *lexer) next() (*token, error) {
 		}
 	}
 	switch {
-	case c == '\\' || c == '*' || c == '|' || c == '(' || c == ')':
+	case c == '\\' || c == '.' || c == '*' || c == '|' || c == '(' || c == ')':
 		return newToken(tokenKindChar, c), nil
 	default:
 		return nil, &SyntaxError{
diff --git a/compiler/lexer_test.go b/compiler/lexer_test.go
index b172ae9..75770b0 100644
--- a/compiler/lexer_test.go
+++ b/compiler/lexer_test.go
@@ -31,8 +31,9 @@ func TestLexer(t *testing.T) {
 		},
 		{
 			caption: "lexer can recognize the special characters",
-			src:     "*|()",
+			src:     ".*|()",
 			tokens: []*token{
+				newToken(tokenKindAnyChar, nullChar),
 				newToken(tokenKindRepeat, nullChar),
 				newToken(tokenKindAlt, nullChar),
 				newToken(tokenKindGroupOpen, nullChar),
@@ -42,9 +43,10 @@ func TestLexer(t *testing.T) {
 		},
 		{
 			caption: "lexer can recognize the escape sequences",
-			src:     "\\\\\\*\\|\\(\\)",
+			src:     "\\\\\\.\\*\\|\\(\\)",
 			tokens: []*token{
 				newToken(tokenKindChar, '\\'),
+				newToken(tokenKindChar, '.'),
 				newToken(tokenKindChar, '*'),
 				newToken(tokenKindChar, '|'),
 				newToken(tokenKindChar, '('),
diff --git a/compiler/parser.go b/compiler/parser.go
index 0039404..03dc198 100644
--- a/compiler/parser.go
+++ b/compiler/parser.go
@@ -22,13 +22,13 @@ func raiseSyntaxError(message string) {
 }
 
 type symbolTable struct {
-	symPos2Byte map[symbolPosition]byte
+	symPos2Byte map[symbolPosition]byteRange
 	endPos2ID   map[symbolPosition]int
 }
 
 func genSymbolTable(root astNode) *symbolTable {
 	symTab := &symbolTable{
-		symPos2Byte: map[symbolPosition]byte{},
+		symPos2Byte: map[symbolPosition]byteRange{},
 		endPos2ID:   map[symbolPosition]int{},
 	}
 	return genSymTab(symTab, root)
@@ -41,7 +41,10 @@ func genSymTab(symTab *symbolTable, node astNode) *symbolTable {
 
 	switch n := node.(type) {
 	case *symbolNode:
-		symTab.symPos2Byte[n.pos] = n.value
+		symTab.symPos2Byte[n.pos] = byteRange{
+			from: n.from,
+			to:   n.to,
+		}
 	case *endMarkerNode:
 		symTab.endPos2ID[n.pos] = n.id
 	default:
@@ -152,6 +155,9 @@ func (p *parser) parseGroup() astNode {
 		defer p.expect(tokenKindGroupClose)
 		return p.parseAlt()
 	}
+	if p.consume(tokenKindAnyChar) {
+		return genAnyCharAST(p.lastTok)
+	}
 	if !p.consume(tokenKindChar) {
 		return nil
 	}
@@ -187,6 +193,101 @@ func (p *parser) parseGroup() astNode {
 	}
 }
 
+// Refelences:
+// * https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf#G7404
+//   * Table 3-6. UTF-8 Bit Distribution
+//   * Table 3-7. Well-Formed UTF-8 Byte Sequences
+func genAnyCharAST(tok *token) astNode {
+	return newAltNode(
+		newAltNode(
+			newAltNode(
+				newAltNode(
+					newAltNode(
+						newAltNode(
+							newAltNode(
+								newAltNode(
+									// 1 byte character <00..7F>
+									newRangeSymbolNode(tok, 0x00, 0x7f, symbolPositionNil),
+									// 2 bytes character <C2..DF 80..BF>
+									newConcatNode(
+										newRangeSymbolNode(tok, 0xc2, 0xdf, symbolPositionNil),
+										newRangeSymbolNode(tok, 0x80, 0xbf, symbolPositionNil),
+									),
+								),
+								// 3 bytes character <E0 A0..BF 80..BF>
+								newConcatNode(
+									newConcatNode(
+										newSymbolNode(tok, 0xe0, symbolPositionNil),
+										newRangeSymbolNode(tok, 0xa0, 0xbf, symbolPositionNil),
+									),
+									newRangeSymbolNode(tok, 0x80, 0xbf, symbolPositionNil),
+								),
+							),
+							// 3 bytes character <E1..EC 80..BF 80..BF>
+							newConcatNode(
+								newConcatNode(
+									newRangeSymbolNode(tok, 0xe1, 0xec, symbolPositionNil),
+									newRangeSymbolNode(tok, 0x80, 0xbf, symbolPositionNil),
+								),
+								newRangeSymbolNode(tok, 0x80, 0xbf, symbolPositionNil),
+							),
+						),
+						// 3 bytes character <ED 80..9F 80..BF>
+						newConcatNode(
+							newConcatNode(
+								newSymbolNode(tok, 0xed, symbolPositionNil),
+								newRangeSymbolNode(tok, 0x80, 0x9f, symbolPositionNil),
+							),
+							newRangeSymbolNode(tok, 0x80, 0xbf, symbolPositionNil),
+						),
+					),
+					// 3 bytes character <EE..EF 80..BF 80..BF>
+					newConcatNode(
+						newConcatNode(
+							newRangeSymbolNode(tok, 0xee, 0xef, symbolPositionNil),
+							newRangeSymbolNode(tok, 0x80, 0xbf, symbolPositionNil),
+						),
+						newRangeSymbolNode(tok, 0x80, 0xbf, symbolPositionNil),
+					),
+				),
+				// 4 bytes character <F0 90..BF 80..BF 80..BF>
+				newConcatNode(
+					newConcatNode(
+						newConcatNode(
+							newSymbolNode(tok, 0xf0, symbolPositionNil),
+							newRangeSymbolNode(tok, 0x90, 0xbf, symbolPositionNil),
+						),
+						newRangeSymbolNode(tok, 0x80, 0xbf, symbolPositionNil),
+					),
+					newRangeSymbolNode(tok, 0x80, 0xbf, symbolPositionNil),
+				),
+			),
+			// 4 bytes character <F1..F3 80..BF 80..BF 80..BF>
+			newConcatNode(
+				newConcatNode(
+					newConcatNode(
+						newRangeSymbolNode(tok, 0xf1, 0xf3, symbolPositionNil),
+						newRangeSymbolNode(tok, 0x80, 0xbf, symbolPositionNil),
+					),
+					newRangeSymbolNode(tok, 0x80, 0xbf, symbolPositionNil),
+				),
+				newRangeSymbolNode(tok, 0x80, 0xbf, symbolPositionNil),
+			),
+		),
+		// 4 bytes character <F4 80..8F 80..BF 80..BF>
+		newConcatNode(
+			newConcatNode(
+				newConcatNode(
+					newSymbolNode(tok, 0xf4, symbolPositionNil),
+					newRangeSymbolNode(tok, 0x80, 0x8f, symbolPositionNil),
+				),
+				newRangeSymbolNode(tok, 0x80, 0xbf, symbolPositionNil),
+			),
+			newRangeSymbolNode(tok, 0x80, 0xbf, symbolPositionNil),
+		),
+	)
+}
+
 func (p *parser) expect(expected tokenKind) {
 	if !p.consume(expected) {
 		tok := p.peekedTok
diff --git a/compiler/parser_test.go b/compiler/parser_test.go
index 09392be..d96cb4d 100644
--- a/compiler/parser_test.go
+++ b/compiler/parser_test.go
@@ -110,13 +110,20 @@ func TestParser(t *testing.T) {
 	}
 
 	{
+		entry := func(v byte) byteRange {
+			return byteRange{
+				from: v,
+				to:   v,
+			}
+		}
+
 		expectedSymTab := &symbolTable{
-			symPos2Byte: map[symbolPosition]byte{
-				symPos(1): byte('a'),
-				symPos(2): byte('b'),
-				symPos(3): byte('a'),
-				symPos(4): byte('b'),
-				symPos(5): byte('b'),
+			symPos2Byte: map[symbolPosition]byteRange{
+				symPos(1): entry(byte('a')),
+				symPos(2): entry(byte('b')),
+				symPos(3): entry(byte('a')),
+				symPos(4): entry(byte('b')),
+				symPos(5): entry(byte('b')),
 			},
 			endPos2ID: map[symbolPosition]int{
 				endPos(6): 1,
@@ -187,7 +194,7 @@ func testSymbolTable(t *testing.T, expected, actual *symbolTable) {
 			t.Errorf("a symbol position entry was not found: %v -> %v", ePos, eByte)
 			continue
 		}
-		if byte != eByte {
+		if byte.from != eByte.from || byte.to != eByte.to {
 			t.Errorf("unexpected symbol position entry; want: %v -> %v, got: %v -> %v", ePos, eByte, ePos, byte)
 		}
 	}
diff --git a/driver/lexer_test.go b/driver/lexer_test.go
index 4fd4bf8..1a46470 100644
--- a/driver/lexer_test.go
+++ b/driver/lexer_test.go
@@ -35,6 +35,48 @@ func TestLexer_Next(t *testing.T) {
 				newEOFToken(),
 			},
 		},
+		{
+			regexps: [][]byte{
+				[]byte("."),
+			},
+			src: string([]byte{
+				0x00,
+				0x7f,
+				0xc2, 0x80,
+				0xdf, 0xbf,
+				0xe1, 0x80, 0x80,
+				0xec, 0xbf, 0xbf,
+				0xed, 0x80, 0x80,
+				0xed, 0x9f, 0xbf,
+				0xee, 0x80, 0x80,
+				0xef, 0xbf, 0xbf,
+				0xf0, 0x90, 0x80, 0x80,
+				0xf0, 0xbf, 0xbf, 0xbf,
+				0xf1, 0x80, 0x80, 0x80,
+				0xf3, 0xbf, 0xbf, 0xbf,
+				0xf4, 0x80, 0x80, 0x80,
+				0xf4, 0x8f, 0xbf, 0xbf,
+			}),
+			tokens: []*Token{
+				newToken(1, []byte{0x00}),
+				newToken(1, []byte{0x7f}),
+				newToken(1, []byte{0xc2, 0x80}),
+				newToken(1, []byte{0xdf, 0xbf}),
+				newToken(1, []byte{0xe1, 0x80, 0x80}),
+				newToken(1, []byte{0xec, 0xbf, 0xbf}),
+				newToken(1, []byte{0xed, 0x80, 0x80}),
+				newToken(1, []byte{0xed, 0x9f, 0xbf}),
+				newToken(1, []byte{0xee, 0x80, 0x80}),
+				newToken(1, []byte{0xef, 0xbf, 0xbf}),
+				newToken(1, []byte{0xf0, 0x90, 0x80, 0x80}),
+				newToken(1, []byte{0xf0, 0xbf, 0xbf, 0xbf}),
+				newToken(1, []byte{0xf1, 0x80, 0x80, 0x80}),
+				newToken(1, []byte{0xf3, 0xbf, 0xbf, 0xbf}),
+				newToken(1, []byte{0xf4, 0x80, 0x80, 0x80}),
+				newToken(1, []byte{0xf4, 0x8f, 0xbf, 0xbf}),
+				newEOFToken(),
+			},
+		},
 	}
 	for _, tt := range test {
 		res := map[int][]byte{}
@@ -142,6 +184,6 @@ func testToken(t *testing.T, expected, actual *Token) {
 	t.Helper()
 
 	if actual.ID != expected.ID || !bytes.Equal(actual.Match, expected.Match) || actual.EOF != expected.EOF || actual.Invalid != expected.Invalid {
-		t.Errorf("unexpected token; want: %v, got: %v", expected, actual)
+		t.Errorf("unexpected token; want: %v (\"%v\"), got: %v (\"%v\")", expected, string(expected.Match), actual, string(actual.Match))
 	}
 }