From a1d1cfe08ae809d454ac6f1ce80a19395e7940e5 Mon Sep 17 00:00:00 2001 From: Ryo Nihei Date: Sun, 14 Feb 2021 17:38:46 +0900 Subject: Add dot symbol matching any single character The dot symbol matches any single character. When the dot symbol appears, the parser generates an AST matching all of the well-formed UTF-8 byte sequences. References: * https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf#G7404 * Table 3-6. UTF-8 Bit Distribution * Table 3-7. Well-Formed UTF-8 Byte Sequences --- compiler/parser.go | 107 +++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 104 insertions(+), 3 deletions(-) (limited to 'compiler/parser.go') diff --git a/compiler/parser.go b/compiler/parser.go index 0039404..03dc198 100644 --- a/compiler/parser.go +++ b/compiler/parser.go @@ -22,13 +22,13 @@ func raiseSyntaxError(message string) { } type symbolTable struct { - symPos2Byte map[symbolPosition]byte + symPos2Byte map[symbolPosition]byteRange endPos2ID map[symbolPosition]int } func genSymbolTable(root astNode) *symbolTable { symTab := &symbolTable{ - symPos2Byte: map[symbolPosition]byte{}, + symPos2Byte: map[symbolPosition]byteRange{}, endPos2ID: map[symbolPosition]int{}, } return genSymTab(symTab, root) @@ -41,7 +41,10 @@ func genSymTab(symTab *symbolTable, node astNode) *symbolTable { switch n := node.(type) { case *symbolNode: - symTab.symPos2Byte[n.pos] = n.value + symTab.symPos2Byte[n.pos] = byteRange{ + from: n.from, + to: n.to, + } case *endMarkerNode: symTab.endPos2ID[n.pos] = n.id default: @@ -152,6 +155,9 @@ func (p *parser) parseGroup() astNode { defer p.expect(tokenKindGroupClose) return p.parseAlt() } + if p.consume(tokenKindAnyChar) { + return genAnyCharAST(p.lastTok) + } if !p.consume(tokenKindChar) { return nil } @@ -187,6 +193,101 @@ func (p *parser) parseGroup() astNode { } } +// References: +// * https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf#G7404 +// * Table 3-6. UTF-8 Bit Distribution +// * Table 3-7. 
Well-Formed UTF-8 Byte Sequences +func genAnyCharAST(tok *token) astNode { + return newAltNode( + newAltNode( + newAltNode( + newAltNode( + newAltNode( + newAltNode( + newAltNode( + newAltNode( + // 1 byte character <00..7F> + newRangeSymbolNode(tok, 0x00, 0x7f, symbolPositionNil), + // 2 bytes character <C2..DF 80..BF> + newConcatNode( + newRangeSymbolNode(tok, 0xc2, 0xdf, symbolPositionNil), + newRangeSymbolNode(tok, 0x80, 0xbf, symbolPositionNil), + ), + ), + // 3 bytes character <E0 A0..BF 80..BF> + newConcatNode( + newConcatNode( + newSymbolNode(tok, 0xe0, symbolPositionNil), + newRangeSymbolNode(tok, 0xa0, 0xbf, symbolPositionNil), + ), + newRangeSymbolNode(tok, 0x80, 0xbf, symbolPositionNil), + ), + ), + // 3 bytes character <E1..EC 80..BF 80..BF> + newConcatNode( + newConcatNode( + newRangeSymbolNode(tok, 0xe1, 0xec, symbolPositionNil), + newRangeSymbolNode(tok, 0x80, 0xbf, symbolPositionNil), + ), + newRangeSymbolNode(tok, 0x80, 0xbf, symbolPositionNil), + ), + ), + // 3 bytes character <ED 80..9F 80..BF> + newConcatNode( + newConcatNode( + newSymbolNode(tok, 0xed, symbolPositionNil), + newRangeSymbolNode(tok, 0x80, 0x9f, symbolPositionNil), + ), + newRangeSymbolNode(tok, 0x80, 0xbf, symbolPositionNil), + ), + ), + // 3 bytes character <EE..EF 80..BF 80..BF> + newConcatNode( + newConcatNode( + newRangeSymbolNode(tok, 0xee, 0xef, symbolPositionNil), + newRangeSymbolNode(tok, 0x80, 0xbf, symbolPositionNil), + ), + newRangeSymbolNode(tok, 0x80, 0xbf, symbolPositionNil), + ), + ), + // 4 bytes character <F0 90..BF 80..BF 80..BF> + newConcatNode( + newConcatNode( + newConcatNode( + newSymbolNode(tok, 0xf0, symbolPositionNil), + newRangeSymbolNode(tok, 0x90, 0xbf, symbolPositionNil), + ), + newRangeSymbolNode(tok, 0x80, 0xbf, symbolPositionNil), + ), + newRangeSymbolNode(tok, 0x80, 0xbf, symbolPositionNil), + ), + ), + // 4 bytes character <F1..F3 80..BF 80..BF 80..BF> + newConcatNode( + newConcatNode( + newConcatNode( + newRangeSymbolNode(tok, 0xf1, 0xf3, symbolPositionNil), + newRangeSymbolNode(tok, 0x80, 0xbf, symbolPositionNil), + ), + newRangeSymbolNode(tok, 0x80, 0xbf, symbolPositionNil), + ), + 
newRangeSymbolNode(tok, 0x80, 0xbf, symbolPositionNil), + ), + ), + // 4 bytes character <F4 80..8F 80..BF 80..BF> + newConcatNode( + newConcatNode( + newConcatNode( + newSymbolNode(tok, 0xf4, symbolPositionNil), + newRangeSymbolNode(tok, 0x80, 0x8f, symbolPositionNil), + ), + newRangeSymbolNode(tok, 0x80, 0xbf, symbolPositionNil), + ), + newRangeSymbolNode(tok, 0x80, 0xbf, symbolPositionNil), + ), + ) +} + func (p *parser) expect(expected tokenKind) { if !p.consume(expected) { tok := p.peekedTok -- cgit v1.2.3