diff options
Diffstat (limited to 'compiler')
-rw-r--r-- | compiler/lexer.go | 57 | ||||
-rw-r--r-- | compiler/lexer_test.go | 87 | ||||
-rw-r--r-- | compiler/parser.go | 186 | ||||
-rw-r--r-- | compiler/parser_test.go | 626 | ||||
-rw-r--r-- | compiler/syntax_error.go | 37 |
5 files changed, 568 insertions, 425 deletions
diff --git a/compiler/lexer.go b/compiler/lexer.go index 19c7105..c1aa67e 100644 --- a/compiler/lexer.go +++ b/compiler/lexer.go @@ -60,19 +60,20 @@ const ( ) type lexer struct { - src *bufio.Reader - peekChar2 rune - peekEOF2 bool - peekChar1 rune - peekEOF1 bool - lastChar rune - reachedEOF bool - prevChar1 rune - prevEOF1 bool - prevChar2 rune - pervEOF2 bool - mode lexerMode - rangeState rangeState + src *bufio.Reader + peekChar2 rune + peekEOF2 bool + peekChar1 rune + peekEOF1 bool + lastChar rune + reachedEOF bool + prevChar1 rune + prevEOF1 bool + prevChar2 rune + pervEOF2 bool + mode lexerMode + rangeState rangeState + errMsgDetails string } func newLexer(src io.Reader) *lexer { @@ -201,26 +202,19 @@ func (l *lexer) nextInDefault(c rune) (*token, error) { return nil, err } return newToken(tokenKindBExpOpen, nullChar), nil - case ']': - return newToken(tokenKindBExpClose, nullChar), nil case '\\': c, eof, err := l.read() if err != nil { return nil, err } if eof { - return nil, &SyntaxError{ - message: "incompleted escape sequence; unexpected EOF follows \\ character", - } + return nil, synErrIncompletedEscSeq } - switch { - case c == '\\' || c == '.' || c == '*' || c == '+' || c == '?' || c == '|' || c == '(' || c == ')' || c == '[' || c == ']': + if c == '\\' || c == '.' || c == '*' || c == '+' || c == '?' || c == '|' || c == '(' || c == ')' || c == '[' || c == ']' { return newToken(tokenKindChar, c), nil - default: - return nil, &SyntaxError{ - message: fmt.Sprintf("invalid escape sequence '\\%s'", string(c)), - } } + l.errMsgDetails = fmt.Sprintf("\\%v is not supported", string(c)) + return nil, synErrInvalidEscSeq default: return newToken(tokenKindChar, c), nil } @@ -241,7 +235,7 @@ func (l *lexer) nextInBExp(c rune) (*token, error) { if err != nil { return nil, err } - return newToken(tokenKindCharRange, nullChar), nil + return newToken(tokenKindChar, c), nil } if c1 != ']' { err := l.restore() @@ -263,18 +257,13 @@ func (l *lexer) nextInBExp(c rune) (*token, error) { return nil, err } if eof { - return nil, &SyntaxError{ - message: "incompleted escape sequence; unexpected EOF follows \\ character", - } + return nil, synErrIncompletedEscSeq } - switch { - case c == '\\' || c == '^' || c == '-' || c == ']': + if c == '\\' || c == '^' || c == '-' || c == ']' { return newToken(tokenKindChar, c), nil - default: - return nil, &SyntaxError{ - message: fmt.Sprintf("invalid escape sequence '\\%s'", string(c)), - } } + l.errMsgDetails = fmt.Sprintf("\\%v is not supported", string(c)) + return nil, synErrInvalidEscSeq default: return newToken(tokenKindChar, c), nil } diff --git a/compiler/lexer_test.go b/compiler/lexer_test.go index 451c3c7..c77d7c7 100644 --- a/compiler/lexer_test.go +++ b/compiler/lexer_test.go @@ -1,7 +1,6 @@ package compiler import ( - "reflect" "strings" "testing" ) @@ -30,8 +29,8 @@ func TestLexer(t *testing.T) { }, }, { - caption: "lexer can recognize the special characters", - src: ".*+?|()[a-z][^^]", + caption: "lexer can recognize the special characters in default mode", + src: ".*+?|()[", tokens: []*token{ newToken(tokenKindAnyChar, nullChar), newToken(tokenKindRepeat, nullChar), @@ -41,19 +40,12 @@ func TestLexer(t *testing.T) { newToken(tokenKindGroupOpen, nullChar), newToken(tokenKindGroupClose, nullChar), newToken(tokenKindBExpOpen, nullChar), - newToken(tokenKindChar, 'a'), - newToken(tokenKindCharRange, nullChar), - newToken(tokenKindChar, 'z'), - newToken(tokenKindBExpClose, nullChar), - newToken(tokenKindInverseBExpOpen, nullChar), - newToken(tokenKindChar, '^'), - newToken(tokenKindBExpClose, nullChar), newToken(tokenKindEOF, nullChar), }, }, { - caption: "lexer can recognize the escape sequences", - src: "\\\\\\.\\*\\+\\?\\|\\(\\)\\[\\][\\^\\-]", + caption: "lexer can recognize the escape sequences in default mode", + src: "\\\\\\.\\*\\+\\?\\|\\(\\)\\[", tokens: []*token{ newToken(tokenKindChar, '\\'), newToken(tokenKindChar, '.'), @@ -64,17 +56,50 @@ func TestLexer(t *testing.T) { newToken(tokenKindChar, '('), newToken(tokenKindChar, ')'), newToken(tokenKindChar, '['), + newToken(tokenKindEOF, nullChar), + }, + }, + { + caption: "] is treated as an ordinary character in default mode", + src: "]", + tokens: []*token{ newToken(tokenKindChar, ']'), + newToken(tokenKindEOF, nullChar), + }, + }, + { + caption: "lexer can recognize the special characters in bracket expression mode", + src: "[a-z][^a-z]", + tokens: []*token{ + newToken(tokenKindBExpOpen, nullChar), + newToken(tokenKindChar, 'a'), + newToken(tokenKindCharRange, nullChar), + newToken(tokenKindChar, 'z'), + newToken(tokenKindBExpClose, nullChar), + newToken(tokenKindInverseBExpOpen, nullChar), + newToken(tokenKindChar, 'a'), + newToken(tokenKindCharRange, nullChar), + newToken(tokenKindChar, 'z'), + newToken(tokenKindBExpClose, nullChar), + newToken(tokenKindEOF, nullChar), + }, + }, + { + caption: "lexer can recognize the escape sequences in bracket expression mode", + src: "[\\^a\\-z]", + tokens: []*token{ newToken(tokenKindBExpOpen, nullChar), newToken(tokenKindChar, '^'), + newToken(tokenKindChar, 'a'), newToken(tokenKindChar, '-'), + newToken(tokenKindChar, 'z'), newToken(tokenKindBExpClose, nullChar), newToken(tokenKindEOF, nullChar), }, }, { caption: "in a bracket expression, the special characters are also handled as normal characters", - src: "[\\\\.*+?|()[\\]].*|()-][", + src: "[\\\\.*+?|()[", tokens: []*token{ newToken(tokenKindBExpOpen, nullChar), newToken(tokenKindChar, '\\'), @@ -86,16 +111,6 @@ func TestLexer(t *testing.T) { newToken(tokenKindChar, '('), newToken(tokenKindChar, ')'), newToken(tokenKindChar, '['), - newToken(tokenKindChar, ']'), - newToken(tokenKindBExpClose, nullChar), - newToken(tokenKindAnyChar, nullChar), - newToken(tokenKindRepeat, nullChar), - newToken(tokenKindAlt, nullChar), - newToken(tokenKindGroupOpen, nullChar), - newToken(tokenKindGroupClose, nullChar), - newToken(tokenKindChar, '-'), - newToken(tokenKindBExpClose, nullChar), - newToken(tokenKindBExpOpen, nullChar), newToken(tokenKindEOF, nullChar), }, }, @@ -195,12 +210,28 @@ func TestLexer(t *testing.T) { { caption: "lexer raises an error when an invalid escape sequence appears", src: "\\@", - err: &SyntaxError{}, + err: synErrInvalidEscSeq, }, { caption: "lexer raises an error when the incomplete escape sequence (EOF following \\) appears", src: "\\", - err: &SyntaxError{}, + err: synErrIncompletedEscSeq, + }, + { + caption: "lexer raises an error when an invalid escape sequence appears", + src: "[\\@", + tokens: []*token{ + newToken(tokenKindBExpOpen, nullChar), + }, + err: synErrInvalidEscSeq, + }, + { + caption: "lexer raises an error when the incomplete escape sequence (EOF following \\) appears", + src: "[\\", + tokens: []*token{ + newToken(tokenKindBExpOpen, nullChar), + }, + err: synErrIncompletedEscSeq, }, } for _, tt := range tests { @@ -225,10 +256,8 @@ func TestLexer(t *testing.T) { break } } - ty := reflect.TypeOf(err) - eTy := reflect.TypeOf(tt.err) - if ty != eTy { - t.Fatalf("unexpected error type; want: %v, got: %v", eTy, ty) + if err != tt.err { + t.Fatalf("unexpected error; want: %v, got: %v", tt.err, err) } if i < len(tt.tokens) { t.Fatalf("expecte more tokens") diff --git a/compiler/parser.go b/compiler/parser.go index 8658231..63e5549 100644 --- a/compiler/parser.go +++ b/compiler/parser.go @@ -4,20 +4,40 @@ import ( "bytes" "fmt" "io" + "strings" ) -type SyntaxError struct { - message string +type ParseErrors struct { + Errors []*ParseError } -func (err *SyntaxError) Error() string { - return fmt.Sprintf("Syntax Error: %v", err.message) +func (e *ParseErrors) Error() string { + var b strings.Builder + fmt.Fprintf(&b, "%v", e.Errors[0]) + for _, err := range e.Errors[1:] { + fmt.Fprintf(&b, "\n%v", err) + } + return b.String() +} + +type ParseError struct { + ID int + Pattern []byte + Cause error + Details string +} + +func (e *ParseError) Error() string { + var b strings.Builder + fmt.Fprintf(&b, "#%v %v: %v", e.ID, string(e.Pattern), e.Cause) + if e.Details != "" { + fmt.Fprintf(&b, ": %v", e.Details) + } + return b.String() } -func raiseSyntaxError(format string, a ...interface{}) { - panic(&SyntaxError{ - message: fmt.Sprintf(format, a...), - }) +func raiseSyntaxError(synErr *SyntaxError) { + panic(synErr) } type symbolTable struct { @@ -58,43 +78,58 @@ func parse(regexps map[int][]byte) (astNode, *symbolTable, error) { if len(regexps) == 0 { return nil, nil, fmt.Errorf("parse() needs at least one token entry") } + symPos := symbolPositionMin var root astNode - for id, re := range regexps { - if len(re) == 0 { - return nil, nil, fmt.Errorf("regular expression must be a non-empty byte sequence") - } - p := newParser(id, bytes.NewReader(re)) - n, err := p.parse() + var perrs []*ParseError + for id, pattern := range regexps { + p := newParser(id, symPos, bytes.NewReader(pattern)) + ast, err := p.parse() if err != nil { - return nil, nil, err + perr := &ParseError{ + ID: id, + Pattern: pattern, + Cause: err, + Details: p.errMsgDetails, + } + if p.errMsgDetails != "" { + perr.Details = p.errMsgDetails + } + perrs = append(perrs, perr) + continue } if root == nil { - root = n + root = ast } else { - root = newAltNode(root, n) + root = newAltNode(root, ast) } + symPos = p.symPos } - _, err := positionSymbols(root, 1) - if err != nil { - return nil, nil, err + if len(perrs) > 0 { + return nil, nil, &ParseErrors{ + Errors: perrs, + } } return root, genSymbolTable(root), nil } type parser struct { - id int - lex *lexer - peekedTok *token - lastTok *token + id int + lex *lexer + peekedTok *token + lastTok *token + errMsgDetails string + symPos uint16 } -func newParser(id int, src io.Reader) *parser { +func newParser(id int, initialSymPos uint16, src io.Reader) *parser { return &parser{ - id: id, - lex: newLexer(src), - peekedTok: nil, - lastTok: nil, + id: id, + lex: newLexer(src), + peekedTok: nil, + lastTok: nil, + errMsgDetails: "", + symPos: initialSymPos, } } @@ -105,17 +140,34 @@ func (p *parser) parse() (ast astNode, retErr error) { var ok bool retErr, ok = err.(error) if !ok { - retErr = fmt.Errorf("Error: %v", err) + retErr = fmt.Errorf("%v", err) } return } }() - return p.parseRegexp() + ast, err := p.parseRegexp() + if err != nil { + return nil, err + } + p.symPos, err = positionSymbols(ast, p.symPos) + if err != nil { + return nil, err + } + return ast, nil } func (p *parser) parseRegexp() (astNode, error) { alt := p.parseAlt() + if alt == nil { + if p.consume(tokenKindGroupClose) { + raiseSyntaxError(synErrGroupNoInitiator) + } + raiseSyntaxError(synErrNullPattern) + } + if p.consume(tokenKindGroupClose) { + raiseSyntaxError(synErrGroupNoInitiator) + } p.expect(tokenKindEOF) return newConcatNode(alt, newEndMarkerNode(p.id)), nil } @@ -123,7 +175,10 @@ func (p *parser) parseRegexp() (astNode, error) { func (p *parser) parseAlt() astNode { left := p.parseConcat() if left == nil { - raiseSyntaxError("alternation expression must have operands") + if p.consume(tokenKindAlt) { + raiseSyntaxError(synErrAltLackOfOperand) + } + return nil } for { if !p.consume(tokenKindAlt) { @@ -131,7 +186,7 @@ func (p *parser) parseAlt() astNode { } right := p.parseConcat() if right == nil { - raiseSyntaxError("alternation expression must have operands") + raiseSyntaxError(synErrAltLackOfOperand) } left = newAltNode(left, right) } @@ -153,6 +208,18 @@ func (p *parser) parseConcat() astNode { func (p *parser) parseRepeat() astNode { group := p.parseGroup() if group == nil { + if p.consume(tokenKindRepeat) { + p.errMsgDetails = "* needs an operand" + raiseSyntaxError(synErrRepNoTarget) + } + if p.consume(tokenKindRepeatOneOrMore) { + p.errMsgDetails = "+ needs an operand" + raiseSyntaxError(synErrRepNoTarget) + } + if p.consume(tokenKindOption) { + p.errMsgDetails = "? needs an operand" + raiseSyntaxError(synErrRepNoTarget) + } return nil } if p.consume(tokenKindRepeat) { @@ -169,10 +236,18 @@ func (p *parser) parseRepeat() astNode { func (p *parser) parseGroup() astNode { if p.consume(tokenKindGroupOpen) { - defer p.expect(tokenKindGroupClose) alt := p.parseAlt() if alt == nil { - raiseSyntaxError("grouping expression must include at least one character") + if p.consume(tokenKindEOF) { + raiseSyntaxError(synErrGroupUnclosed) + } + raiseSyntaxError(synErrGroupNoElem) + } + if p.consume(tokenKindEOF) { + raiseSyntaxError(synErrGroupUnclosed) + } + if !p.consume(tokenKindGroupClose) { + raiseSyntaxError(synErrGroupInvalidForm) } return alt } @@ -186,7 +261,10 @@ func (p *parser) parseSingleChar() astNode { if p.consume(tokenKindBExpOpen) { left := p.parseBExpElem() if left == nil { - raiseSyntaxError("bracket expression must include at least one character") + if p.consume(tokenKindEOF) { + raiseSyntaxError(synErrBExpUnclosed) + } + raiseSyntaxError(synErrBExpNoElem) } for { right := p.parseBExpElem() @@ -195,13 +273,19 @@ func (p *parser) parseSingleChar() astNode { } left = newAltNode(left, right) } + if p.consume(tokenKindEOF) { + raiseSyntaxError(synErrBExpUnclosed) + } p.expect(tokenKindBExpClose) return left } if p.consume(tokenKindInverseBExpOpen) { elem := p.parseBExpElem() if elem == nil { - raiseSyntaxError("bracket expression must include at least one character") + if p.consume(tokenKindEOF) { + raiseSyntaxError(synErrBExpUnclosed) + } + raiseSyntaxError(synErrBExpNoElem) } inverse := exclude(elem, genAnyCharAST()) if inverse == nil { @@ -217,10 +301,20 @@ func (p *parser) parseSingleChar() astNode { panic(fmt.Errorf("a pattern that isn't matching any symbols")) } } + if p.consume(tokenKindEOF) { + raiseSyntaxError(synErrBExpUnclosed) + } p.expect(tokenKindBExpClose) return inverse } - return p.parseNormalChar() + c := p.parseNormalChar() + if c == nil { + if p.consume(tokenKindBExpClose) { + raiseSyntaxError(synErrBExpInvalidForm) + } + return nil + } + return c } func (p *parser) parseBExpElem() astNode { @@ -233,7 +327,13 @@ func (p *parser) parseBExpElem() astNode { } right := p.parseNormalChar() if right == nil { - raiseSyntaxError("invalid range expression") + panic(fmt.Errorf("invalid range expression")) + } + from := genByteSeq(left) + to := genByteSeq(right) + if !isValidOrder(from, to) { + p.errMsgDetails = fmt.Sprintf("[%s-%s] ([%v-%v])", string(from), string(to), from, to) + raiseSyntaxError(synErrRangeInvalidOrder) } return genRangeAST(left, right) } @@ -372,9 +472,6 @@ func genAnyCharAST() astNode { func genRangeAST(fromNode, toNode astNode) astNode { from := genByteSeq(fromNode) to := genByteSeq(toNode) - if !isValidOrder(from, to) { - raiseSyntaxError("range expression with invalid order: [%s-%s] ([%v-%v])", string(from), string(to), from, to) - } switch len(from) { case 1: switch len(to) { @@ -993,8 +1090,8 @@ func genAltNode(cs ...astNode) astNode { func (p *parser) expect(expected tokenKind) { if !p.consume(expected) { tok := p.peekedTok - errMsg := fmt.Sprintf("unexpected token; expected: %v, actual: %v", expected, tok.kind) - raiseSyntaxError(errMsg) + p.errMsgDetails = fmt.Sprintf("unexpected token; expected: %v, actual: %v", expected, tok.kind) + raiseSyntaxError(synErrUnexpectedToken) } } @@ -1007,6 +1104,7 @@ func (p *parser) consume(expected tokenKind) bool { } else { tok, err = p.lex.next() if err != nil { + p.errMsgDetails = p.lex.errMsgDetails panic(err) } } diff --git a/compiler/parser_test.go b/compiler/parser_test.go index c56a82a..dcbe924 100644 --- a/compiler/parser_test.go +++ b/compiler/parser_test.go @@ -1,47 +1,44 @@ package compiler import ( + "bytes" "fmt" "os" "reflect" "testing" ) -func TestParse(t *testing.T) { - symPos := func(n uint16) symbolPosition { - pos, err := newSymbolPosition(n, false) - if err != nil { - panic(err) - } - return pos +func symPos(n uint16) symbolPosition { + pos, err := newSymbolPosition(n, false) + if err != nil { + panic(err) } + return pos +} - endPos := func(n uint16) symbolPosition { - pos, err := newSymbolPosition(n, true) - if err != nil { - panic(err) - } - return pos +func endPos(n uint16) symbolPosition { + pos, err := newSymbolPosition(n, true) + if err != nil { + panic(err) } + return pos +} +func TestParser_parse(t *testing.T) { tests := []struct { - patterns []string + pattern string ast astNode - syntaxError bool + syntaxError *SyntaxError }{ { - patterns: []string{ - "a", - }, + pattern: "a", ast: genConcatNode( newSymbolNodeWithPos(byte('a'), symPos(1)), newEndMarkerNodeWithPos(1, endPos(2)), ), }, { - patterns: []string{ - "abc", - }, + pattern: "abc", ast: genConcatNode( genConcatNode( newSymbolNodeWithPos(byte('a'), symPos(1)), @@ -52,9 +49,7 @@ func TestParse(t *testing.T) { ), }, { - patterns: []string{ - "a?", - }, + pattern: "a?", ast: genConcatNode( newOptionNode( newSymbolNodeWithPos(byte('a'), symPos(1)), @@ -63,9 +58,7 @@ func TestParse(t *testing.T) { ), }, { - patterns: []string{ - "[abc]?", - }, + pattern: "[abc]?", ast: genConcatNode( newOptionNode( genAltNode( @@ -78,9 +71,7 @@ func TestParse(t *testing.T) { ), }, { - patterns: []string{ - "(a)?", - }, + pattern: "(a)?", ast: genConcatNode( newOptionNode( newSymbolNodeWithPos(byte('a'), symPos(1)), @@ -89,9 +80,7 @@ func TestParse(t *testing.T) { ), }, { - patterns: []string{ - "((a?)?)?", - }, + pattern: "((a?)?)?", ast: genConcatNode( newOptionNode( newOptionNode( @@ -104,9 +93,7 @@ func TestParse(t *testing.T) { ), }, { - patterns: []string{ - "(abc)?", - }, + pattern: "(abc)?", ast: genConcatNode( newOptionNode( genConcatNode( @@ -119,9 +106,7 @@ func TestParse(t *testing.T) { ), }, { - patterns: []string{ - "(a|b)?", - }, + pattern: "(a|b)?", ast: genConcatNode( newOptionNode( genAltNode( @@ -133,21 +118,27 @@ func TestParse(t *testing.T) { ), }, { - patterns: []string{ - "?", - }, - syntaxError: true, + pattern: "?", + syntaxError: synErrRepNoTarget, }, { - patterns: []string{ - "a??", - }, - syntaxError: true, + pattern: "(?)", + syntaxError: synErrRepNoTarget, }, { - patterns: []string{ - "a*", - }, + pattern: "a|?", + syntaxError: synErrRepNoTarget, + }, + { + pattern: "?|b", + syntaxError: synErrRepNoTarget, + }, + { + pattern: "a??", + syntaxError: synErrRepNoTarget, + }, + { + pattern: "a*", ast: genConcatNode( newRepeatNode( newSymbolNodeWithPos(byte('a'), 1), @@ -156,9 +147,7 @@ func TestParse(t *testing.T) { ), }, { - patterns: []string{ - "[abc]*", - }, + pattern: "[abc]*", ast: genConcatNode( newRepeatNode( genAltNode( @@ -171,9 +160,7 @@ func TestParse(t *testing.T) { ), }, { - patterns: []string{ - "((a*)*)*", - }, + pattern: "((a*)*)*", ast: genConcatNode( newRepeatNode( newRepeatNode( @@ -186,9 +173,7 @@ func TestParse(t *testing.T) { ), }, { - patterns: []string{ - "(abc)*", - }, + pattern: "(abc)*", ast: genConcatNode( newRepeatNode( genConcatNode( @@ -201,9 +186,7 @@ func TestParse(t *testing.T) { ), }, { - patterns: []string{ - "(a|b)*", - }, + pattern: "(a|b)*", ast: genConcatNode( newRepeatNode( genAltNode( @@ -215,21 +198,27 @@ func TestParse(t *testing.T) { ), }, { - patterns: []string{ - "*", - }, - syntaxError: true, + pattern: "*", + syntaxError: synErrRepNoTarget, }, { - patterns: []string{ - "a**", - }, - syntaxError: true, + pattern: "(*)", + syntaxError: synErrRepNoTarget, }, { - patterns: []string{ - "a+", - }, + pattern: "a|*", + syntaxError: synErrRepNoTarget, + }, + { + pattern: "*|b", + syntaxError: synErrRepNoTarget, + }, + { + pattern: "a**", + syntaxError: synErrRepNoTarget, + }, + { + pattern: "a+", ast: genConcatNode( newSymbolNodeWithPos(byte('a'), symPos(1)), newRepeatNode( @@ -239,9 +228,7 @@ func TestParse(t *testing.T) { ), }, { - patterns: []string{ - "[abc]+", - }, + pattern: "[abc]+", ast: genConcatNode( genAltNode( newSymbolNodeWithPos(byte('a'), symPos(1)), @@ -259,9 +246,7 @@ func TestParse(t *testing.T) { ), }, { - patterns: []string{ - "((a+)+)+", - }, + pattern: "((a+)+)+", ast: genConcatNode( genConcatNode( genConcatNode( @@ -303,9 +288,7 @@ func TestParse(t *testing.T) { ), }, { - patterns: []string{ - "(abc)+", - }, + pattern: "(abc)+", ast: genConcatNode( genConcatNode( newSymbolNodeWithPos(byte('a'), symPos(1)), @@ -323,9 +306,7 @@ func TestParse(t *testing.T) { ), }, { - patterns: []string{ - "(a|b)+", - }, + pattern: "(a|b)+", ast: genConcatNode( genAltNode( newSymbolNodeWithPos(byte('a'), symPos(1)), @@ -341,21 +322,27 @@ func TestParse(t *testing.T) { ), }, { - patterns: []string{ - "+", - }, - syntaxError: true, + pattern: "+", + syntaxError: synErrRepNoTarget, }, { - patterns: []string{ - "a++", - }, - syntaxError: true, + pattern: "(+)", + syntaxError: synErrRepNoTarget, }, { - patterns: []string{ - ".", - }, + pattern: "a|+", + syntaxError: synErrRepNoTarget, + }, + { + pattern: "+|b", + syntaxError: synErrRepNoTarget, + }, + { + pattern: "a++", + syntaxError: synErrRepNoTarget, + }, + { + pattern: ".", ast: newConcatNode( genAltNode( newRangeSymbolNodeWithPos(bounds1[1][0].min, bounds1[1][0].max, symPos(1)), @@ -406,18 +393,14 @@ func TestParse(t *testing.T) { ), }, { - patterns: []string{ - "[a]", - }, + pattern: "[a]", ast: newConcatNode( newSymbolNodeWithPos(byte('a'), symPos(1)), newEndMarkerNodeWithPos(1, endPos(2)), ), }, { - patterns: []string{ - "[abc]", - }, + pattern: "[abc]", ast: newConcatNode( genAltNode( newSymbolNodeWithPos(byte('a'), symPos(1)), @@ -428,18 +411,14 @@ func TestParse(t *testing.T) { ), }, { - patterns: []string{ - "[a-z]", - }, + pattern: "[a-z]", ast: newConcatNode( newRangeSymbolNodeWithPos(byte('a'), byte('z'), symPos(1)), newEndMarkerNodeWithPos(1, endPos(2)), ), }, { - patterns: []string{ - "[A-Za-z]", - }, + pattern: "[A-Za-z]", ast: newConcatNode( genAltNode( newRangeSymbolNodeWithPos(byte('A'), byte('Z'), symPos(1)), @@ -449,78 +428,107 @@ func TestParse(t *testing.T) { ), }, { - patterns: []string{ - "a[]", - }, - syntaxError: true, + pattern: "a[]", + syntaxError: synErrBExpNoElem, }, { - patterns: []string{ - "[]a", - }, - syntaxError: true, + pattern: "[]a", + syntaxError: synErrBExpNoElem, }, { - patterns: []string{ - "[]", - }, - syntaxError: true, + pattern: "[]", + syntaxError: synErrBExpNoElem, }, { - patterns: []string{ - "[^]", - }, + pattern: "[^]", ast: newConcatNode( newSymbolNodeWithPos(byte('^'), symPos(1)), newEndMarkerNodeWithPos(1, endPos(2)), ), }, { - patterns: []string{ - "[", - }, - syntaxError: true, + pattern: "[", + syntaxError: synErrBExpUnclosed, }, { - patterns: []string{ - "[a", - }, - syntaxError: true, + pattern: "([", + syntaxError: synErrBExpUnclosed, }, { - patterns: []string{ - "[a-", - }, - syntaxError: true, + pattern: "[a", + syntaxError: synErrBExpUnclosed, }, { - patterns: []string{ - "[^", - }, - syntaxError: true, + pattern: "([a", + syntaxError: synErrBExpUnclosed, }, { - patterns: []string{ - "[^a", - }, - syntaxError: true, + pattern: "[a-", + syntaxError: synErrBExpUnclosed, }, { - patterns: []string{ - "[^a-", - }, - syntaxError: true, + pattern: "([a-", + syntaxError: synErrBExpUnclosed, }, { - patterns: []string{ - "]", - }, - syntaxError: true, + pattern: "[^", + syntaxError: synErrBExpUnclosed, }, { - patterns: []string{ - "[a-]", - }, + pattern: "([^", + syntaxError: synErrBExpUnclosed, + }, + { + pattern: "[^a", + syntaxError: synErrBExpUnclosed, + }, + { + pattern: "([^a", + syntaxError: synErrBExpUnclosed, + }, + { + pattern: "[^a-", + syntaxError: synErrBExpUnclosed, + }, + { + pattern: "([^a-", + syntaxError: synErrBExpUnclosed, + }, + { + pattern: "]", + ast: newConcatNode( + newSymbolNodeWithPos(byte(']'), symPos(1)), + newEndMarkerNodeWithPos(1, endPos(2)), + ), + }, + { + pattern: "(]", + syntaxError: synErrGroupUnclosed, + }, + { + pattern: "a]", + ast: newConcatNode( + genConcatNode( + newSymbolNodeWithPos(byte('a'), symPos(1)), + newSymbolNodeWithPos(byte(']'), symPos(2)), + ), + newEndMarkerNodeWithPos(1, endPos(3)), + ), + }, + { + pattern: "(a]", + syntaxError: synErrGroupUnclosed, + }, + { + pattern: "([)", + syntaxError: synErrBExpUnclosed, + }, + { + pattern: "([a)", + syntaxError: synErrBExpUnclosed, + }, + { + pattern: "[a-]", ast: newConcatNode( genAltNode( newSymbolNodeWithPos(byte('a'), symPos(1)), @@ -530,9 +538,7 @@ func TestParse(t *testing.T) { ), }, { - patterns: []string{ - "[^a-]", - }, + pattern: "[^a-]", ast: newConcatNode( genAltNode( newRangeSymbolNodeWithPos(bounds1[1][0].min, byte(44), symPos(1)), @@ -585,9 +591,7 @@ func TestParse(t *testing.T) { ), }, { - patterns: []string{ - "[-z]", - }, + pattern: "[-z]", ast: newConcatNode( genAltNode( newSymbolNodeWithPos(byte('-'), symPos(1)), @@ -597,9 +601,7 @@ func TestParse(t *testing.T) { ), }, { - patterns: []string{ - "[^-z]", - }, + pattern: "[^-z]", ast: newConcatNode( genAltNode( newRangeSymbolNodeWithPos(bounds1[1][0].min, byte(44), symPos(1)), @@ -654,18 +656,14 @@ func TestParse(t *testing.T) { ), }, { - patterns: []string{ - "[-]", - }, + pattern: "[-]", ast: newConcatNode( newSymbolNodeWithPos(byte('-'), symPos(1)), newEndMarkerNodeWithPos(1, endPos(2)), ), }, { - patterns: []string{ - "[^-]", - }, + pattern: "[^-]", ast: newConcatNode( genAltNode( newRangeSymbolNodeWithPos(bounds1[1][0].min, byte(44), symPos(1)), @@ -717,57 +715,73 @@ func TestParse(t *testing.T) { ), }, { - patterns: []string{ - "(a)", - }, + pattern: "(a)", ast: newConcatNode( newSymbolNodeWithPos(byte('a'), symPos(1)), newEndMarkerNodeWithPos(1, endPos(2)), ), }, { - patterns: []string{ - "(((a)))", - }, + pattern: "(((a)))", ast: newConcatNode( newSymbolNodeWithPos(byte('a'), symPos(1)), newEndMarkerNodeWithPos(1, endPos(2)), ), }, { - patterns: []string{ - "a()", - }, - syntaxError: true, + pattern: "a()", + syntaxError: synErrGroupNoElem, }, { - patterns: []string{ - "()a", - }, - syntaxError: true, + pattern: "()a", + syntaxError: synErrGroupNoElem, }, { - patterns: []string{ - "()", - }, - syntaxError: true, + pattern: "()", + syntaxError: synErrGroupNoElem, }, { - patterns: []string{ - "(", - }, - syntaxError: true, + pattern: "(", + syntaxError: synErrGroupUnclosed, }, { - patterns: []string{ - ")", - }, - syntaxError: true, + pattern: "a(", + syntaxError: synErrGroupUnclosed, }, { - patterns: []string{ - "Mulder|Scully", - }, + pattern: "(a", + syntaxError: synErrGroupUnclosed, + }, + { + pattern: "((", + syntaxError: synErrGroupUnclosed, + }, + { + pattern: "((a)", + syntaxError: synErrGroupUnclosed, + }, + { + pattern: ")", + syntaxError: synErrGroupNoInitiator, + }, + { + pattern: "a)", + syntaxError: synErrGroupNoInitiator, + }, + { + pattern: ")a", + syntaxError: synErrGroupNoInitiator, + }, + { + pattern: "))", + syntaxError: synErrGroupNoInitiator, + }, + { + pattern: "(a))", + syntaxError: synErrGroupNoInitiator, + }, + { + pattern: "Mulder|Scully", ast: newConcatNode( genAltNode( genConcatNode( @@ -791,9 +805,7 @@ func TestParse(t *testing.T) { ), }, { - patterns: []string{ - "Langly|Frohike|Byers", - }, + pattern: "Langly|Frohike|Byers", ast: newConcatNode( genAltNode( genConcatNode( @@ -825,169 +837,147 @@ func TestParse(t *testing.T) { ), }, { - patterns: []string{ - "|", - }, - syntaxError: true, + pattern: "|", + syntaxError: synErrAltLackOfOperand, }, { - patterns: []string{ - "||", - }, - syntaxError: true, + pattern: "||", + syntaxError: synErrAltLackOfOperand, }, { - patterns: []string{ - "Mulder|", - }, - syntaxError: true, + pattern: "Mulder|", + syntaxError: synErrAltLackOfOperand, }, { - patterns: []string{ - "|Scully", - }, - syntaxError: true, + pattern: "|Scully", + syntaxError: synErrAltLackOfOperand, }, { - patterns: []string{ - "Langly|Frohike|", - }, - syntaxError: true, + pattern: "Langly|Frohike|", + syntaxError: synErrAltLackOfOperand, }, { - patterns: []string{ - "Langly||Byers", - }, - syntaxError: true, + pattern: "Langly||Byers", + syntaxError: synErrAltLackOfOperand, }, { - patterns: []string{ - "|Frohike|Byers", - }, - syntaxError: true, + pattern: "|Frohike|Byers", + syntaxError: synErrAltLackOfOperand, }, { - patterns: []string{ - "|Frohike|", - }, - syntaxError: true, + pattern: "|Frohike|", + syntaxError: synErrAltLackOfOperand, }, { - patterns: []string{ - "Fox(|)Mulder", - }, - syntaxError: true, + pattern: "Fox(|)Mulder", + syntaxError: synErrAltLackOfOperand, }, { - patterns: []string{ - "(Fox|)Mulder", - }, - syntaxError: true, + pattern: "(Fox|)Mulder", + syntaxError: synErrAltLackOfOperand, }, { - patterns: []string{ - "Fox(|Mulder)", - }, - syntaxError: true, + pattern: "Fox(|Mulder)", + syntaxError: synErrAltLackOfOperand, }, } for i, tt := range tests { - t.Run(fmt.Sprintf("#%v", i), func(t *testing.T) { - ps := map[int][]byte{} - for i, p := range tt.patterns { - ps[i+1] = []byte(p) - } - root, _, err := parse(ps) - if tt.syntaxError { - // printAST(os.Stdout, root, "", "", false) + t.Run(fmt.Sprintf("#%v %v", i, tt.pattern), func(t *testing.T) { + p := newParser(1, symbolPositionMin, bytes.NewReader([]byte(tt.pattern))) + ast, err := p.parse() + if tt.syntaxError != nil { + // printAST(os.Stdout, ast, "", "", false) if err == nil { t.Fatalf("expected syntax error; got: nil") } - if _, ok := err.(*SyntaxError); !ok { - t.Fatalf("expected syntax error; got: %v (type: %T)", err, err) + synErr, ok := err.(*SyntaxError) + if !ok { + t.Fatalf("expected SyntaxError; got: %v (type: %T)", err, err) } - if root != nil { - t.Fatalf("root is not nil") + if synErr != tt.syntaxError { + t.Fatalf("unexpected syntax error; want: %v, got: %v", tt.syntaxError, synErr) + } + if ast != nil { + t.Fatalf("ast is not nil") } } else { if err != nil { t.Fatal(err) } - if root == nil { + if ast == nil { t.Fatal("AST is nil") } - // printAST(os.Stdout, root, "", "", false) - testAST(t, tt.ast, root) + // printAST(os.Stdout, ast, "", "", false) + testAST(t, tt.ast, ast) } }) } +} - // Test a FOLLOE table and a symbol table - { - root, symTab, err := parse(map[int][]byte{ - 1: []byte("(a|b)*abb"), - }) - if err != nil { - t.Fatal(err) - } - if root == nil { - t.Fatal("root of AST is nil") - } - printAST(os.Stdout, root, "", "", false) +func TestParse(t *testing.T) { + root, symTab, err := parse(map[int][]byte{ + 1: []byte("(a|b)*abb"), + }) + if err != nil { + t.Fatal(err) + } + if root == nil { + t.Fatal("root of AST is nil") + } + printAST(os.Stdout, root, "", "", false) - { - expectedAST := genConcatNode( - newRepeatNode( - newAltNode( - newSymbolNodeWithPos(byte('a'), symPos(1)), - newSymbolNodeWithPos(byte('b'), symPos(2)), - ), + { + expectedAST := genConcatNode( + newRepeatNode( + newAltNode( + newSymbolNodeWithPos(byte('a'), symPos(1)), + newSymbolNodeWithPos(byte('b'), symPos(2)), ), - newSymbolNodeWithPos(byte('a'), symPos(3)), - newSymbolNodeWithPos(byte('b'), symPos(4)), - newSymbolNodeWithPos(byte('b'), symPos(5)), - newEndMarkerNodeWithPos(1, endPos(6)), - ) - testAST(t, expectedAST, root) - } + ), + newSymbolNodeWithPos(byte('a'), symPos(3)), + newSymbolNodeWithPos(byte('b'), symPos(4)), + newSymbolNodeWithPos(byte('b'), symPos(5)), + newEndMarkerNodeWithPos(1, endPos(6)), + ) + testAST(t, expectedAST, root) + } - { - followTab := genFollowTable(root) - if followTab == nil { - t.Fatal("follow table is nil") - } - expectedFollowTab := followTable{ - 1: newSymbolPositionSet().add(symPos(1)).add(symPos(2)).add(symPos(3)), - 2: newSymbolPositionSet().add(symPos(1)).add(symPos(2)).add(symPos(3)), - 3: newSymbolPositionSet().add(symPos(4)), - 4: newSymbolPositionSet().add(symPos(5)), - 5: newSymbolPositionSet().add(endPos(6)), - } - testFollowTable(t, expectedFollowTab, followTab) + { + followTab := genFollowTable(root) + if followTab == nil { + t.Fatal("follow table is nil") + } + expectedFollowTab := followTable{ + 1: newSymbolPositionSet().add(symPos(1)).add(symPos(2)).add(symPos(3)), + 2: newSymbolPositionSet().add(symPos(1)).add(symPos(2)).add(symPos(3)), + 3: newSymbolPositionSet().add(symPos(4)), + 4: newSymbolPositionSet().add(symPos(5)), + 5: newSymbolPositionSet().add(endPos(6)), } + testFollowTable(t, expectedFollowTab, followTab) + } - { - entry := func(v byte) byteRange { - return byteRange{ - from: v, - to: v, - } + { + entry := func(v byte) byteRange { + return byteRange{ + from: v, + to: v, } + } - expectedSymTab := &symbolTable{ - symPos2Byte: map[symbolPosition]byteRange{ - symPos(1): entry(byte('a')), - symPos(2): entry(byte('b')), - symPos(3): entry(byte('a')), - symPos(4): entry(byte('b')), - symPos(5): entry(byte('b')), - }, - endPos2ID: map[symbolPosition]int{ - endPos(6): 1, - }, - } - testSymbolTable(t, expectedSymTab, symTab) + expectedSymTab := &symbolTable{ + symPos2Byte: map[symbolPosition]byteRange{ + symPos(1): entry(byte('a')), + symPos(2): entry(byte('b')), + symPos(3): entry(byte('a')), + symPos(4): entry(byte('b')), + symPos(5): entry(byte('b')), + }, + endPos2ID: map[symbolPosition]int{ + endPos(6): 1, + }, } + testSymbolTable(t, expectedSymTab, symTab) } } diff --git a/compiler/syntax_error.go b/compiler/syntax_error.go new file mode 100644 index 0000000..be2dc38 --- /dev/null +++ b/compiler/syntax_error.go @@ -0,0 +1,37 @@ +package compiler + +import "fmt" + +type SyntaxError struct { + Message string +} + +func newSyntaxError(msg string) *SyntaxError { + return &SyntaxError{ + Message: msg, + } +} + +func (e *SyntaxError) Error() string { + return fmt.Sprintf("syntax error: %s", e.Message) +} + +var ( + // lexical errors + synErrIncompletedEscSeq = newSyntaxError("incompleted escape sequence; unexpected EOF following \\") + synErrInvalidEscSeq = newSyntaxError("invalid escape sequence") + + // syntax errors + synErrUnexpectedToken = newSyntaxError("unexpected token") + synErrNullPattern = newSyntaxError("a pattern must be a non-empty byte sequence") + synErrAltLackOfOperand = newSyntaxError("an alternation expression must have operands") + synErrRepNoTarget = newSyntaxError("a repeat expression must have an operand") + synErrGroupNoElem = newSyntaxError("a grouping expression must include at least one character") + synErrGroupUnclosed = newSyntaxError("unclosed grouping expression") + synErrGroupNoInitiator = newSyntaxError(") needs preceding (") + synErrGroupInvalidForm = newSyntaxError("invalid grouping expression") + synErrBExpNoElem = newSyntaxError("a bracket expression must include at least one character") + synErrBExpUnclosed = newSyntaxError("unclosed bracket expression") + synErrBExpInvalidForm = newSyntaxError("invalid bracket expression") + synErrRangeInvalidOrder = newSyntaxError("a range expression with invalid order") +) |