aboutsummaryrefslogtreecommitdiff
path: root/compiler
diff options
context:
space:
mode:
Diffstat (limited to 'compiler')
-rw-r--r--compiler/ast.go29
-rw-r--r--compiler/dfa.go10
-rw-r--r--compiler/lexer.go5
-rw-r--r--compiler/lexer_test.go6
-rw-r--r--compiler/parser.go107
-rw-r--r--compiler/parser_test.go21
6 files changed, 158 insertions, 20 deletions
diff --git a/compiler/ast.go b/compiler/ast.go
index d31c92b..d4b8956 100644
--- a/compiler/ast.go
+++ b/compiler/ast.go
@@ -119,6 +119,15 @@ func (s symbolPositionSet) sort() []symbolPosition {
return sorted
}
+type byteRange struct {
+ from byte
+ to byte
+}
+
+func (r byteRange) String() string {
+ return fmt.Sprintf("%v - %v", r.from, r.to)
+}
+
type astNode interface {
fmt.Stringer
children() (astNode, astNode)
@@ -128,21 +137,35 @@ type astNode interface {
}
type symbolNode struct {
+ byteRange
token *token
- value byte
pos symbolPosition
}
func newSymbolNode(tok *token, value byte, pos symbolPosition) *symbolNode {
return &symbolNode{
+ byteRange: byteRange{
+ from: value,
+ to: value,
+ },
+ token: tok,
+ pos: pos,
+ }
+}
+
+func newRangeSymbolNode(tok *token, from, to byte, pos symbolPosition) *symbolNode {
+ return &symbolNode{
+ byteRange: byteRange{
+ from: from,
+ to: to,
+ },
token: tok,
- value: value,
pos: pos,
}
}
func (n *symbolNode) String() string {
- return fmt.Sprintf("{type: char, char: %v, int: %v, pos: %v}", string(n.token.char), n.token.char, n.pos)
+ return fmt.Sprintf("{type: symbol, value: %v - %v, token char: %v, pos: %v}", n.from, n.to, string(n.token.char), n.pos)
}
func (n *symbolNode) children() (astNode, astNode) {
diff --git a/compiler/dfa.go b/compiler/dfa.go
index 84692a4..fec93ce 100644
--- a/compiler/dfa.go
+++ b/compiler/dfa.go
@@ -29,11 +29,13 @@ func genDFA(root astNode, symTab *symbolTable) *DFA {
if pos.isEndMark() {
continue
}
- symVal := int(symTab.symPos2Byte[pos])
- if tranTabOfState[symVal] == nil {
- tranTabOfState[symVal] = newSymbolPositionSet()
+ valRange := symTab.symPos2Byte[pos]
+ for symVal := valRange.from; symVal <= valRange.to; symVal++ {
+ if tranTabOfState[symVal] == nil {
+ tranTabOfState[symVal] = newSymbolPositionSet()
+ }
+ tranTabOfState[symVal].merge(follow[pos])
}
- tranTabOfState[symVal].merge(follow[pos])
}
for _, t := range tranTabOfState {
if t == nil {
diff --git a/compiler/lexer.go b/compiler/lexer.go
index f78b920..1c09260 100644
--- a/compiler/lexer.go
+++ b/compiler/lexer.go
@@ -10,6 +10,7 @@ type tokenKind string
const (
tokenKindChar = tokenKind("char")
+ tokenKindAnyChar = tokenKind(".")
tokenKindRepeat = tokenKind("*")
tokenKindAlt = tokenKind("|")
tokenKindGroupOpen = tokenKind("(")
@@ -59,6 +60,8 @@ func (l *lexer) next() (*token, error) {
switch c {
case '*':
return newToken(tokenKindRepeat, nullChar), nil
+ case '.':
+ return newToken(tokenKindAnyChar, nullChar), nil
case '|':
return newToken(tokenKindAlt, nullChar), nil
case '(':
@@ -76,7 +79,7 @@ func (l *lexer) next() (*token, error) {
}
}
switch {
- case c == '\\' || c == '*' || c == '|' || c == '(' || c == ')':
+ case c == '\\' || c == '.' || c == '*' || c == '|' || c == '(' || c == ')':
return newToken(tokenKindChar, c), nil
default:
return nil, &SyntaxError{
diff --git a/compiler/lexer_test.go b/compiler/lexer_test.go
index b172ae9..75770b0 100644
--- a/compiler/lexer_test.go
+++ b/compiler/lexer_test.go
@@ -31,8 +31,9 @@ func TestLexer(t *testing.T) {
},
{
caption: "lexer can recognize the special characters",
- src: "*|()",
+ src: ".*|()",
tokens: []*token{
+ newToken(tokenKindAnyChar, nullChar),
newToken(tokenKindRepeat, nullChar),
newToken(tokenKindAlt, nullChar),
newToken(tokenKindGroupOpen, nullChar),
@@ -42,9 +43,10 @@ func TestLexer(t *testing.T) {
},
{
caption: "lexer can recognize the escape sequences",
- src: "\\\\\\*\\|\\(\\)",
+ src: "\\\\\\.\\*\\|\\(\\)",
tokens: []*token{
newToken(tokenKindChar, '\\'),
+ newToken(tokenKindChar, '.'),
newToken(tokenKindChar, '*'),
newToken(tokenKindChar, '|'),
newToken(tokenKindChar, '('),
diff --git a/compiler/parser.go b/compiler/parser.go
index 0039404..03dc198 100644
--- a/compiler/parser.go
+++ b/compiler/parser.go
@@ -22,13 +22,13 @@ func raiseSyntaxError(message string) {
}
type symbolTable struct {
- symPos2Byte map[symbolPosition]byte
+ symPos2Byte map[symbolPosition]byteRange
endPos2ID map[symbolPosition]int
}
func genSymbolTable(root astNode) *symbolTable {
symTab := &symbolTable{
- symPos2Byte: map[symbolPosition]byte{},
+ symPos2Byte: map[symbolPosition]byteRange{},
endPos2ID: map[symbolPosition]int{},
}
return genSymTab(symTab, root)
@@ -41,7 +41,10 @@ func genSymTab(symTab *symbolTable, node astNode) *symbolTable {
switch n := node.(type) {
case *symbolNode:
- symTab.symPos2Byte[n.pos] = n.value
+ symTab.symPos2Byte[n.pos] = byteRange{
+ from: n.from,
+ to: n.to,
+ }
case *endMarkerNode:
symTab.endPos2ID[n.pos] = n.id
default:
@@ -152,6 +155,9 @@ func (p *parser) parseGroup() astNode {
defer p.expect(tokenKindGroupClose)
return p.parseAlt()
}
+ if p.consume(tokenKindAnyChar) {
+ return genAnyCharAST(p.lastTok)
+ }
if !p.consume(tokenKindChar) {
return nil
}
@@ -187,6 +193,101 @@ func (p *parser) parseGroup() astNode {
}
}
+// Refelences:
+// * https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf#G7404
+// * Table 3-6. UTF-8 Bit Distribution
+// * Table 3-7. Well-Formed UTF-8 Byte Sequences
+func genAnyCharAST(tok *token) astNode {
+ return newAltNode(
+ newAltNode(
+ newAltNode(
+ newAltNode(
+ newAltNode(
+ newAltNode(
+ newAltNode(
+ newAltNode(
+ // 1 byte character <00..7F>
+ newRangeSymbolNode(tok, 0x00, 0x7f, symbolPositionNil),
+ // 2 bytes character <C2..DF 80..BF>
+ newConcatNode(
+ newRangeSymbolNode(tok, 0xc2, 0xdf, symbolPositionNil),
+ newRangeSymbolNode(tok, 0x80, 0xbf, symbolPositionNil),
+ ),
+ ),
+ // 3 bytes character <E0 A0..BF 80..BF>
+ newConcatNode(
+ newConcatNode(
+ newSymbolNode(tok, 0xe0, symbolPositionNil),
+ newRangeSymbolNode(tok, 0xa0, 0xbf, symbolPositionNil),
+ ),
+ newRangeSymbolNode(tok, 0x80, 0xbf, symbolPositionNil),
+ ),
+ ),
+ // 3 bytes character <E1..EC 80..BF 80..BF>
+ newConcatNode(
+ newConcatNode(
+ newRangeSymbolNode(tok, 0xe1, 0xec, symbolPositionNil),
+ newRangeSymbolNode(tok, 0x80, 0xbf, symbolPositionNil),
+ ),
+ newRangeSymbolNode(tok, 0x80, 0xbf, symbolPositionNil),
+ ),
+ ),
+ // 3 bytes character <ED 80..9F 80..BF>
+ newConcatNode(
+ newConcatNode(
+ newSymbolNode(tok, 0xed, symbolPositionNil),
+ newRangeSymbolNode(tok, 0x80, 0x9f, symbolPositionNil),
+ ),
+ newRangeSymbolNode(tok, 0x80, 0xbf, symbolPositionNil),
+ ),
+ ),
+ // 3 bytes character <EE..EF 80..BF 80..BF>
+ newConcatNode(
+ newConcatNode(
+ newRangeSymbolNode(tok, 0xee, 0xef, symbolPositionNil),
+ newRangeSymbolNode(tok, 0x80, 0xbf, symbolPositionNil),
+ ),
+ newRangeSymbolNode(tok, 0x80, 0xbf, symbolPositionNil),
+ ),
+ ),
+ // 4 bytes character <F0 90..BF 80..BF 80..BF>
+ newConcatNode(
+ newConcatNode(
+ newConcatNode(
+ newSymbolNode(tok, 0xf0, symbolPositionNil),
+ newRangeSymbolNode(tok, 0x90, 0xbf, symbolPositionNil),
+ ),
+ newRangeSymbolNode(tok, 0x80, 0xbf, symbolPositionNil),
+ ),
+ newRangeSymbolNode(tok, 0x80, 0xbf, symbolPositionNil),
+ ),
+ ),
+ // 4 bytes character <F1..F3 80..BF 80..BF 80..BF>
+ newConcatNode(
+ newConcatNode(
+ newConcatNode(
+ newRangeSymbolNode(tok, 0xf1, 0xf3, symbolPositionNil),
+ newRangeSymbolNode(tok, 0x80, 0xbf, symbolPositionNil),
+ ),
+ newRangeSymbolNode(tok, 0x80, 0xbf, symbolPositionNil),
+ ),
+ newRangeSymbolNode(tok, 0x80, 0xbf, symbolPositionNil),
+ ),
+ ),
+ // 4 bytes character <F4 80..8F 80..BF 80..BF>
+ newConcatNode(
+ newConcatNode(
+ newConcatNode(
+ newSymbolNode(tok, 0xf4, symbolPositionNil),
+ newRangeSymbolNode(tok, 0x80, 0x8f, symbolPositionNil),
+ ),
+ newRangeSymbolNode(tok, 0x80, 0xbf, symbolPositionNil),
+ ),
+ newRangeSymbolNode(tok, 0x80, 0xbf, symbolPositionNil),
+ ),
+ )
+}
+
func (p *parser) expect(expected tokenKind) {
if !p.consume(expected) {
tok := p.peekedTok
diff --git a/compiler/parser_test.go b/compiler/parser_test.go
index 09392be..d96cb4d 100644
--- a/compiler/parser_test.go
+++ b/compiler/parser_test.go
@@ -110,13 +110,20 @@ func TestParser(t *testing.T) {
}
{
+ entry := func(v byte) byteRange {
+ return byteRange{
+ from: v,
+ to: v,
+ }
+ }
+
expectedSymTab := &symbolTable{
- symPos2Byte: map[symbolPosition]byte{
- symPos(1): byte('a'),
- symPos(2): byte('b'),
- symPos(3): byte('a'),
- symPos(4): byte('b'),
- symPos(5): byte('b'),
+ symPos2Byte: map[symbolPosition]byteRange{
+ symPos(1): entry(byte('a')),
+ symPos(2): entry(byte('b')),
+ symPos(3): entry(byte('a')),
+ symPos(4): entry(byte('b')),
+ symPos(5): entry(byte('b')),
},
endPos2ID: map[symbolPosition]int{
endPos(6): 1,
@@ -187,7 +194,7 @@ func testSymbolTable(t *testing.T, expected, actual *symbolTable) {
t.Errorf("a symbol position entry was not found: %v -> %v", ePos, eByte)
continue
}
- if byte != eByte {
+ if byte.from != eByte.from || byte.to != eByte.to {
t.Errorf("unexpected symbol position entry; want: %v -> %v, got: %v -> %v", ePos, eByte, ePos, byte)
}
}