-rw-r--r--  compiler/ast.go        | 42
-rw-r--r--  compiler/ast_test.go   |  6
-rw-r--r--  compiler/lexer.go      | 26
-rw-r--r--  compiler/lexer_test.go | 12
-rw-r--r--  compiler/parser.go     | 12
-rw-r--r--  driver/lexer_test.go   | 40
6 files changed, 117 insertions(+), 21 deletions(-)
diff --git a/compiler/ast.go b/compiler/ast.go
index 17054d0..e4609ac 100644
--- a/compiler/ast.go
+++ b/compiler/ast.go
@@ -338,6 +338,48 @@ func (n *repeatNode) last() symbolPositionSet {
 	return s
 }
 
+func newRepeatOneOrMoreNode(left astNode) *concatNode {
+	return newConcatNode(
+		left,
+		&repeatNode{
+			left: left,
+		})
+}
+
+type optionNode struct {
+	left astNode
+}
+
+func newOptionNode(left astNode) *optionNode {
+	return &optionNode{
+		left: left,
+	}
+}
+
+func (n *optionNode) String() string {
+	return fmt.Sprintf("{type: option}")
+}
+
+func (n *optionNode) children() (astNode, astNode) {
+	return n.left, nil
+}
+
+func (n *optionNode) nullable() bool {
+	return true
+}
+
+func (n *optionNode) first() symbolPositionSet {
+	s := newSymbolPositionSet()
+	s.merge(n.left.first())
+	return s
+}
+
+func (n *optionNode) last() symbolPositionSet {
+	s := newSymbolPositionSet()
+	s.merge(n.left.last())
+	return s
+}
+
 type followTable map[symbolPosition]symbolPositionSet
 
 func genFollowTable(root astNode) followTable {
diff --git a/compiler/ast_test.go b/compiler/ast_test.go
index 61d6064..735ccf8 100644
--- a/compiler/ast_test.go
+++ b/compiler/ast_test.go
@@ -102,6 +102,12 @@ func TestASTNode(t *testing.T) {
 			first:    newSymbolPositionSet().add(1),
 			last:     newSymbolPositionSet().add(1),
 		},
+		{
+			root:     newOptionNode(newSymbolNode(nil, 0, 1)),
+			nullable: true,
+			first:    newSymbolPositionSet().add(1),
+			last:     newSymbolPositionSet().add(1),
+		},
 	}
 	for i, tt := range tests {
 		t.Run(fmt.Sprintf("#%v", i), func(t *testing.T) {
diff --git a/compiler/lexer.go b/compiler/lexer.go
index 3e3cf35..91cd209 100644
--- a/compiler/lexer.go
+++ b/compiler/lexer.go
@@ -9,15 +9,17 @@ import (
 type tokenKind string
 
 const (
-	tokenKindChar       = tokenKind("char")
-	tokenKindAnyChar    = tokenKind(".")
-	tokenKindRepeat     = tokenKind("*")
-	tokenKindAlt        = tokenKind("|")
-	tokenKindGroupOpen  = tokenKind("(")
-	tokenKindGroupClose = tokenKind(")")
-	tokenKindBExpOpen   = tokenKind("[")
-	tokenKindBExpClose  = tokenKind("]")
-	tokenKindEOF        = tokenKind("eof")
+	tokenKindChar            = tokenKind("char")
+	tokenKindAnyChar         = tokenKind(".")
+	tokenKindRepeat          = tokenKind("*")
+	tokenKindRepeatOneOrMore = tokenKind("+")
+	tokenKindOption          = tokenKind("?")
+	tokenKindAlt             = tokenKind("|")
+	tokenKindGroupOpen       = tokenKind("(")
+	tokenKindGroupClose      = tokenKind(")")
+	tokenKindBExpOpen        = tokenKind("[")
+	tokenKindBExpClose       = tokenKind("]")
+	tokenKindEOF             = tokenKind("eof")
 )
 
 type token struct {
@@ -80,6 +82,10 @@ func (l *lexer) nextInDefault(c rune) (*token, error) {
 	switch c {
 	case '*':
 		return newToken(tokenKindRepeat, nullChar), nil
+	case '+':
+		return newToken(tokenKindRepeatOneOrMore, nullChar), nil
+	case '?':
+		return newToken(tokenKindOption, nullChar), nil
 	case '.':
 		return newToken(tokenKindAnyChar, nullChar), nil
 	case '|':
@@ -104,7 +110,7 @@
 		}
 	}
 	switch {
-	case c == '\\' || c == '.' || c == '*' || c == '|' || c == '(' || c == ')' || c == '[' || c == ']':
+	case c == '\\' || c == '.' || c == '*' || c == '+' || c == '?' || c == '|' || c == '(' || c == ')' || c == '[' || c == ']':
 		return newToken(tokenKindChar, c), nil
 	default:
 		return nil, &SyntaxError{
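The heart of the change is in compiler/ast.go: `+` never becomes a node of its own, because newRepeatOneOrMoreNode desugars e+ into the concatenation e·e*, while `?` gets a dedicated optionNode that is always nullable and simply forwards its child's first and last sets. The sketch below restates that idea in a self-contained, runnable form; the names (posSet, sym, oneOrMore, ...) are illustrative stand-ins, not this compiler's real API, and last/followpos are omitted for brevity.

    // Minimal sketch of nullable/first for the regex-to-DFA construction,
    // assuming simplified illustrative types rather than the compiler's own.
    package main

    import "fmt"

    type posSet map[int]bool

    type node interface {
    	nullable() bool
    	first() posSet
    }

    type sym struct{ pos int } // a leaf: one input symbol at one position

    func (s sym) nullable() bool { return false }
    func (s sym) first() posSet  { return posSet{s.pos: true} }

    type repeat struct{ left node } // e*

    func (r repeat) nullable() bool { return true }
    func (r repeat) first() posSet  { return r.left.first() }

    type option struct{ left node } // e? matches e or the empty string

    func (o option) nullable() bool { return true }
    func (o option) first() posSet  { return o.left.first() }

    type concat struct{ left, right node } // e1 e2

    func (c concat) nullable() bool { return c.left.nullable() && c.right.nullable() }
    func (c concat) first() posSet {
    	f := posSet{}
    	for p := range c.left.first() {
    		f[p] = true
    	}
    	if c.left.nullable() { // only then can e2 begin the match
    		for p := range c.right.first() {
    			f[p] = true
    		}
    	}
    	return f
    }

    // oneOrMore builds e+ as e·e*, the same desugaring newRepeatOneOrMoreNode uses.
    func oneOrMore(e node) node { return concat{e, repeat{e}} }

    func main() {
    	aPlus := oneOrMore(sym{pos: 1})
    	fmt.Println(aPlus.nullable(), aPlus.first()) // false map[1:true]

    	aOpt := option{left: sym{pos: 1}}
    	fmt.Println(aOpt.nullable(), aOpt.first()) // true map[1:true]
    }

Desugaring `+` this way means the follow-set machinery never has to learn a new operator: only concatenation, alternation, `*`, and the new always-nullable option node exist at that layer.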
diff --git a/compiler/lexer_test.go b/compiler/lexer_test.go
index 11cf043..e2e7d7a 100644
--- a/compiler/lexer_test.go
+++ b/compiler/lexer_test.go
@@ -31,10 +31,12 @@ func TestLexer(t *testing.T) {
 		},
 		{
 			caption: "lexer can recognize the special characters",
-			src:     ".*|()[]",
+			src:     ".*+?|()[]",
 			tokens: []*token{
 				newToken(tokenKindAnyChar, nullChar),
 				newToken(tokenKindRepeat, nullChar),
+				newToken(tokenKindRepeatOneOrMore, nullChar),
+				newToken(tokenKindOption, nullChar),
 				newToken(tokenKindAlt, nullChar),
 				newToken(tokenKindGroupOpen, nullChar),
 				newToken(tokenKindGroupClose, nullChar),
@@ -45,11 +47,13 @@
 		},
 		{
 			caption: "lexer can recognize the escape sequences",
-			src:     "\\\\\\.\\*\\|\\(\\)\\[\\]",
+			src:     "\\\\\\.\\*\\+\\?\\|\\(\\)\\[\\]",
 			tokens: []*token{
 				newToken(tokenKindChar, '\\'),
 				newToken(tokenKindChar, '.'),
 				newToken(tokenKindChar, '*'),
+				newToken(tokenKindChar, '+'),
+				newToken(tokenKindChar, '?'),
 				newToken(tokenKindChar, '|'),
 				newToken(tokenKindChar, '('),
 				newToken(tokenKindChar, ')'),
@@ -60,12 +64,14 @@
 		},
 		{
 			caption: "in a bracket expression, the special characters are also handled as normal characters",
-			src:     "[\\\\.*|()[\\]].*|()][",
+			src:     "[\\\\.*+?|()[\\]].*|()][",
 			tokens: []*token{
 				newToken(tokenKindBExpOpen, nullChar),
 				newToken(tokenKindChar, '\\'),
 				newToken(tokenKindChar, '.'),
 				newToken(tokenKindChar, '*'),
+				newToken(tokenKindChar, '+'),
+				newToken(tokenKindChar, '?'),
 				newToken(tokenKindChar, '|'),
 				newToken(tokenKindChar, '('),
 				newToken(tokenKindChar, ')'),
diff --git a/compiler/parser.go b/compiler/parser.go
index ede601d..04cd8ad 100644
--- a/compiler/parser.go
+++ b/compiler/parser.go
@@ -144,10 +144,16 @@ func (p *parser) parseConcat() astNode {
 
 func (p *parser) parseRepeat() astNode {
 	group := p.parseGroup()
-	if !p.consume(tokenKindRepeat) {
-		return group
+	if p.consume(tokenKindRepeat) {
+		return newRepeatNode(group)
 	}
-	return newRepeatNode(group)
+	if p.consume(tokenKindRepeatOneOrMore) {
+		return newRepeatOneOrMoreNode(group)
+	}
+	if p.consume(tokenKindOption) {
+		return newOptionNode(group)
+	}
+	return group
 }
 
 func (p *parser) parseGroup() astNode {
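With the parseRepeat change, a group takes at most one postfix operator, and all three of `*`, `+`, `?` bind more tightly than concatenation, which in turn binds more tightly than `|`. As a cross-check of that precedence, Go's standard regexp/syntax parser (used here purely for illustration; this project does not depend on it) produces the same shape for the pattern b?a+ that the new tests assume:

    package main

    import (
    	"fmt"
    	"regexp/syntax"
    )

    func main() {
    	// Parse "b?a+" with the standard library and inspect the tree shape:
    	// a concatenation whose children are quest(b) and plus(a), i.e. each
    	// postfix operator attaches to the nearest group, not the whole string.
    	re, err := syntax.Parse("b?a+", syntax.Perl)
    	if err != nil {
    		panic(err)
    	}
    	fmt.Println(re.Op == syntax.OpConcat)       // true
    	fmt.Println(re.Sub[0].Op == syntax.OpQuest) // true: b?
    	fmt.Println(re.Sub[1].Op == syntax.OpPlus)  // true: a+
    }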
newToken(1, "t1", []byte("aaa")), + newToken(3, "t3", []byte(" ")), + newToken(2, "t2", []byte("abcd")), + newToken(3, "t3", []byte(" ")), + newToken(2, "t2", []byte("abcdcdcd")), + newToken(3, "t3", []byte(" ")), + newToken(2, "t2", []byte("cd")), + newToken(3, "t3", []byte(" ")), + newToken(2, "t2", []byte("cdcdcd")), + newEOFToken(), + }, + }, + { + lspec: &spec.LexSpec{ + Entries: []*spec.LexEntry{ spec.NewLexEntry("t1", "."), }, }, @@ -85,15 +113,17 @@ func TestLexer_Next(t *testing.T) { { lspec: &spec.LexSpec{ Entries: []*spec.LexEntry{ - spec.NewLexEntry("t1", "[ab.*|()[\\]]"), + spec.NewLexEntry("t1", "[ab.*+?|()[\\]]"), }, }, - src: "ab.*|()[]", + src: "ab.*+?|()[]", tokens: []*Token{ newToken(1, "t1", []byte("a")), newToken(1, "t1", []byte("b")), newToken(1, "t1", []byte(".")), newToken(1, "t1", []byte("*")), + newToken(1, "t1", []byte("+")), + newToken(1, "t1", []byte("?")), newToken(1, "t1", []byte("|")), newToken(1, "t1", []byte("(")), newToken(1, "t1", []byte(")")), |