diff options
-rw-r--r-- | README.md | 49 | ||||
-rw-r--r-- | compiler/ast.go | 38 | ||||
-rw-r--r-- | compiler/compiler.go | 24 | ||||
-rw-r--r-- | compiler/dfa_test.go | 2 | ||||
-rw-r--r-- | compiler/lexer.go | 74 | ||||
-rw-r--r-- | compiler/lexer_test.go | 28 | ||||
-rw-r--r-- | compiler/parser.go | 187 | ||||
-rw-r--r-- | compiler/parser_test.go | 134 | ||||
-rw-r--r-- | compiler/syntax_error.go | 2 | ||||
-rw-r--r-- | driver/lexer_test.go | 52 | ||||
-rw-r--r-- | spec/spec.go | 11 |
11 files changed, 540 insertions, 61 deletions
@@ -113,13 +113,14 @@ top level object: entry object: -| Field | Type | Nullable | Description | -|---------|------------------|----------|-----------------------------------------------------------------------------------------------| -| kind | string | false | A name of a token kind | -| pattern | string | false | A pattern in a regular expression | -| modes | array of strings | true | Mode names that an entry is enabled in (default: "default") | -| push | string | true | A mode name that the lexer pushes to own mode stack when a token matching the pattern appears | -| pop | bool | true | When `pop` is true, the lexer pops a mode from own mode stack. | +| Field | Type | Nullable | Description | +|----------|------------------|----------|-----------------------------------------------------------------------------------------------| +| kind | string | false | A name of a token kind | +| pattern | string | false | A pattern in a regular expression | +| modes | array of strings | true | Mode names that an entry is enabled in (default: "default") | +| push | string | true | A mode name that the lexer pushes to own mode stack when a token matching the pattern appears | +| pop | bool | true | When `pop` is `true`, the lexer pops a mode from own mode stack. | +| fragment | bool | true | When `fragment` is `true`, its entry is a fragment. | See [Regular Expression Syntax](#regular-expression-syntax) for more details on the regular expression syntax. @@ -198,6 +199,40 @@ The repetitions match a string that repeats the previous single character or gro | a(bc)*d | matches 'ad', 'abcd', 'abcbcd', and so on | | (ab\|cd)+ | matches 'ab', 'cd', 'abcd', 'cdab', abcdab', and so on | +### Fragment + +The fragment is a feature that allows you to define a part of a pattern. This feature is useful for decomposing complex patterns into simple patterns and for defining common parts between patterns. 
+A fragment entry is defined by an entry whose `fragment` field is `true`, and is referenced by a fragment expression (`\f{...}`). +Fragment patterns can be nested, but they are not allowed to contain circular references. + +For instance, you can define [an identifier of golang](https://golang.org/ref/spec#Identifiers) as follows: + +```json +{ + "entries": [ + { + "fragment": true, + "kind": "unicode_letter", + "pattern": "\\p{Letter}" + }, + { + "fragment": true, + "kind": "unicode_digit", + "pattern": "\\p{Number}" + }, + { + "fragment": true, + "kind": "letter", + "pattern": "\\f{unicode_letter}|_" + }, + { + "kind": "identifier", + "pattern": "\\f{letter}(\\f{letter}|\\f{unicode_digit})*" + } + ] +} +``` + ## Lex Mode Lex Mode is a feature that allows you to separate a DFA transition table for each mode. diff --git a/compiler/ast.go b/compiler/ast.go index a419f98..7d3965a 100644 --- a/compiler/ast.go +++ b/compiler/ast.go @@ -19,6 +19,7 @@ var ( _ astNode = &concatNode{} _ astNode = &altNode{} _ astNode = &optionNode{} + _ astNode = &fragmentNode{} ) type symbolNode struct { @@ -306,6 +307,38 @@ func (n *optionNode) last() *symbolPositionSet { return n.lastMemo } +type fragmentNode struct { + symbol string + left astNode +} + +func newFragmentNode(symbol string, ast astNode) *fragmentNode { + return &fragmentNode{ + symbol: symbol, + left: ast, + } +} + +func (n *fragmentNode) String() string { + return fmt.Sprintf("{type: fragment, symbol: %v}", n.symbol) +} + +func (n *fragmentNode) children() (astNode, astNode) { + return n.left, nil +} + +func (n *fragmentNode) nullable() bool { + return n.left.nullable() +} + +func (n *fragmentNode) first() *symbolPositionSet { + return n.left.first() +} + +func (n *fragmentNode) last() *symbolPositionSet { + return n.left.last() +} + func copyAST(src astNode) astNode { switch n := src.(type) { case *symbolNode: @@ -320,6 +353,11 @@ func copyAST(src astNode) astNode { return newRepeatNode(copyAST(n.left)) case 
*optionNode: return newOptionNode(copyAST(n.left)) + case *fragmentNode: + if n.left == nil { + return newFragmentNode(n.symbol, nil) + } + return newFragmentNode(n.symbol, copyAST(n.left)) } panic(fmt.Errorf("copyAST cannot handle %T type; AST: %v", src, src)) } diff --git a/compiler/compiler.go b/compiler/compiler.go index 2fdbce1..f382d16 100644 --- a/compiler/compiler.go +++ b/compiler/compiler.go @@ -54,7 +54,7 @@ func Compile(lexspec *spec.LexSpec, opts ...CompilerOption) (*spec.CompiledLexSp } } - modeEntries, modes, modeNums := groupEntriesByLexMode(lexspec.Entries) + modeEntries, modes, modeNums, fragments := groupEntriesByLexMode(lexspec.Entries) modeSpecs := []*spec.CompiledLexModeSpec{ nil, @@ -62,7 +62,7 @@ for i, es := range modeEntries[1:] { modeName := modes[i+1] config.logger.Log("Compile %v mode:", modeName) - modeSpec, err := compile(es, modeNums, config) + modeSpec, err := compile(es, modeNums, fragments, config) if err != nil { return nil, fmt.Errorf("failed to compile in %v mode: %w", modeName, err) } @@ -77,7 +77,7 @@ func Compile(lexspec *spec.LexSpec, opts ...CompilerOption) (*spec.CompiledLexSp }, nil } -func groupEntriesByLexMode(entries []*spec.LexEntry) ([][]*spec.LexEntry, []spec.LexModeName, map[spec.LexModeName]spec.LexModeNum) { +func groupEntriesByLexMode(entries []*spec.LexEntry) ([][]*spec.LexEntry, []spec.LexModeName, map[spec.LexModeName]spec.LexModeNum, map[string]*spec.LexEntry) { modes := []spec.LexModeName{ spec.LexModeNameNil, spec.LexModeNameDefault, @@ -89,9 +89,14 @@ func groupEntriesByLexMode(entries []*spec.LexEntry) ([][]*spec.LexEntry, []spec lastModeNum := spec.LexModeNumDefault modeEntries := [][]*spec.LexEntry{ nil, - []*spec.LexEntry{}, + {}, } + fragments := map[string]*spec.LexEntry{} for _, e := range entries { + if e.Fragment { + fragments[e.Kind.String()] = e + continue + } ms := e.Modes if len(ms) == 0 { ms = 
[]spec.LexModeName{ @@ -110,10 +115,10 @@ func groupEntriesByLexMode(entries []*spec.LexEntry) ([][]*spec.LexEntry, []spec modeEntries[num] = append(modeEntries[num], e) } } - return modeEntries, modes, modeNums + return modeEntries, modes, modeNums, fragments } -func compile(entries []*spec.LexEntry, modeNums map[spec.LexModeName]spec.LexModeNum, config *compilerConfig) (*spec.CompiledLexModeSpec, error) { +func compile(entries []*spec.LexEntry, modeNums map[spec.LexModeName]spec.LexModeNum, fragments map[string]*spec.LexEntry, config *compilerConfig) (*spec.CompiledLexModeSpec, error) { var kinds []spec.LexKind var patterns map[int][]byte { @@ -149,11 +154,16 @@ func compile(entries []*spec.LexEntry, modeNums map[spec.LexModeName]spec.LexMod pop = append(pop, popV) } + fragmentPatterns := map[string][]byte{} + for k, e := range fragments { + fragmentPatterns[k] = []byte(e.Pattern) + } + var root astNode var symTab *symbolTable { var err error - root, symTab, err = parse(patterns) + root, symTab, err = parse(patterns, fragmentPatterns) if err != nil { return nil, err } diff --git a/compiler/dfa_test.go b/compiler/dfa_test.go index dcf1ae5..a797313 100644 --- a/compiler/dfa_test.go +++ b/compiler/dfa_test.go @@ -7,7 +7,7 @@ import ( func TestGenDFA(t *testing.T) { root, symTab, err := parse(map[int][]byte{ 1: []byte("(a|b)*abb"), - }) + }, nil) if err != nil { t.Fatal(err) } diff --git a/compiler/lexer.go b/compiler/lexer.go index 1aadf50..4f6fdac 100644 --- a/compiler/lexer.go +++ b/compiler/lexer.go @@ -24,19 +24,22 @@ const ( tokenKindCharRange = tokenKind("-") tokenKindCodePointLeader = tokenKind("\\u") tokenKindCharPropLeader = tokenKind("\\p") + tokenKindFragmentLeader = tokenKind("\\f") tokenKindLBrace = tokenKind("{") tokenKindRBrace = tokenKind("}") tokenKindEqual = tokenKind("=") tokenKindCodePoint = tokenKind("code point") tokenKindCharPropSymbol = tokenKind("character property symbol") + tokenKindFragmentSymbol = tokenKind("fragment symbol") 
tokenKindEOF = tokenKind("eof") ) type token struct { - kind tokenKind - char rune - propSymbol string - codePoint string + kind tokenKind + char rune + propSymbol string + codePoint string + fragmentSymbol string } const nullChar = '\u0000' @@ -62,6 +65,13 @@ func newCharPropSymbolToken(propSymbol string) *token { } } +func newFragmentSymbolToken(fragmentSymbol string) *token { + return &token{ + kind: tokenKindFragmentSymbol, + fragmentSymbol: fragmentSymbol, + } +} + type lexerMode string const ( @@ -69,6 +79,7 @@ const ( lexerModeBExp = lexerMode("bracket expression") lexerModeCPExp = lexerMode("code point expression") lexerModeCharPropExp = lexerMode("character property expression") + lexerModeFragmentExp = lexerMode("fragment expression") ) type lexerModeStack struct { @@ -197,6 +208,16 @@ func (l *lexer) next() (*token, error) { l.modeStack.pop() } return tok, nil + case lexerModeFragmentExp: + tok, err := l.nextInFragment(c) + if err != nil { + return nil, err + } + switch tok.kind { + case tokenKindRBrace: + l.modeStack.pop() + } + return tok, nil default: tok, err := l.nextInDefault(c) if err != nil { @@ -213,6 +234,8 @@ func (l *lexer) next() (*token, error) { l.modeStack.push(lexerModeCPExp) case tokenKindCharPropLeader: l.modeStack.push(lexerModeCharPropExp) + case tokenKindFragmentLeader: + l.modeStack.push(lexerModeFragmentExp) } return tok, nil } @@ -294,6 +317,9 @@ func (l *lexer) nextInDefault(c rune) (*token, error) { if c == 'p' { return newToken(tokenKindCharPropLeader, nullChar), nil } + if c == 'f' { + return newToken(tokenKindFragmentLeader, nullChar), nil + } if c == '\\' || c == '.' || c == '*' || c == '+' || c == '?' 
|| c == '|' || c == '(' || c == ')' || c == '[' || c == ']' { return newToken(tokenKindChar, c), nil } @@ -455,6 +481,46 @@ func (l *lexer) nextInCharProp(c rune) (*token, error) { } } +func (l *lexer) nextInFragment(c rune) (*token, error) { + switch c { + case '{': + return newToken(tokenKindLBrace, nullChar), nil + case '}': + return newToken(tokenKindRBrace, nullChar), nil + default: + var b strings.Builder + fmt.Fprint(&b, string(c)) + n := 1 + for { + c, eof, err := l.read() + if err != nil { + return nil, err + } + if eof { + err := l.restore() + if err != nil { + return nil, err + } + break + } + if c == '}' { + err := l.restore() + if err != nil { + return nil, err + } + break + } + fmt.Fprint(&b, string(c)) + n++ + } + sym := strings.TrimSpace(b.String()) + if len(sym) == 0 { + raiseSyntaxError(SynErrFragmentInvalidSymbol) + } + return newFragmentSymbolToken(sym), nil + } +} + func (l *lexer) read() (rune, bool, error) { if l.reachedEOF { return l.lastChar, l.reachedEOF, nil diff --git a/compiler/lexer_test.go b/compiler/lexer_test.go index 089253b..51eb82e 100644 --- a/compiler/lexer_test.go +++ b/compiler/lexer_test.go @@ -445,6 +445,34 @@ func TestLexer(t *testing.T) { newToken(tokenKindEOF, nullChar), }, }, + { + caption: "lexer can recognize the special characters and symbols in fragment expression mode", + src: "\\f{integer}", + tokens: []*token{ + newToken(tokenKindFragmentLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newFragmentSymbolToken("integer"), + newToken(tokenKindRBrace, nullChar), + + newToken(tokenKindEOF, nullChar), + }, + }, + { + caption: "a fragment expression is not supported in a bracket expression", + src: "[\\f", + tokens: []*token{ + newToken(tokenKindBExpOpen, nullChar), + }, + err: synErrInvalidEscSeq, + }, + { + caption: "a fragment expression is not supported in an inverse bracket expression", + src: "[^\\f", + tokens: []*token{ + newToken(tokenKindInverseBExpOpen, nullChar), + }, + err: synErrInvalidEscSeq, + }, } 
for _, tt := range tests { t.Run(tt.caption, func(t *testing.T) { diff --git a/compiler/parser.go b/compiler/parser.go index 5b8c912..06a762d 100644 --- a/compiler/parser.go +++ b/compiler/parser.go @@ -76,62 +76,173 @@ func genSymTab(symTab *symbolTable, node astNode) *symbolTable { return symTab } -func parse(regexps map[int][]byte) (astNode, *symbolTable, error) { +func parse(regexps map[int][]byte, fragments map[string][]byte) (astNode, *symbolTable, error) { if len(regexps) == 0 { return nil, nil, fmt.Errorf("parse() needs at least one token entry") } + + fragmentASTs, err := parseFragments(fragments) + if err != nil { + return nil, nil, err + } + if fragmentASTs == nil { + fragmentASTs = map[string]astNode{} + } + + root, err := parseRegexp(regexps, fragmentASTs) + if err != nil { + return nil, nil, err + } + + return root, genSymbolTable(root), nil +} + +type incompleteFragment struct { + kind string + ast astNode +} + +func parseFragments(fragments map[string][]byte) (map[string]astNode, error) { + if len(fragments) == 0 { + return nil, nil + } + fragmentASTs := map[string]astNode{} + incompleteFragments := []*incompleteFragment{} + var perrs []*ParseError + for kind, pattern := range fragments { + p := newParser(bytes.NewReader(pattern)) + ast, err := p.parse() + if err != nil { + perrs = append(perrs, &ParseError{ + Pattern: pattern, + Cause: err, + Details: p.errMsgDetails, + }) + continue + } + if p.incomplete { + incompleteFragments = append(incompleteFragments, &incompleteFragment{ + kind: kind, + ast: ast, + }) + } else { + fragmentASTs[kind] = ast + } + } + for len(incompleteFragments) > 0 { + lastIncompCount := len(incompleteFragments) + remainingFragments := []*incompleteFragment{} + for _, e := range incompleteFragments { + remains := applyFragments(e.ast, fragmentASTs) + if len(remains) > 0 { + remainingFragments = append(remainingFragments, e) + } else { + fragmentASTs[e.kind] = e.ast + } + } + incompleteFragments = remainingFragments + if 
len(incompleteFragments) == lastIncompCount { + for _, e := range incompleteFragments { + perrs = append(perrs, &ParseError{ + Cause: fmt.Errorf("%v has an undefined fragment or a cycle", e.kind), + }) + } + break + } + } + if len(perrs) > 0 { + return nil, &ParseErrors{ + Errors: perrs, + } + } + + return fragmentASTs, nil +} + +func parseRegexp(regexps map[int][]byte, fragmentASTs map[string]astNode) (astNode, error) { symPos := symbolPositionMin var root astNode var perrs []*ParseError for id, pattern := range regexps { - p := newParser(id, symPos, bytes.NewReader(pattern)) + p := newParser(bytes.NewReader(pattern)) ast, err := p.parse() if err != nil { - perr := &ParseError{ + perrs = append(perrs, &ParseError{ ID: id, Pattern: pattern, Cause: err, Details: p.errMsgDetails, - } - if p.errMsgDetails != "" { - perr.Details = p.errMsgDetails - } - perrs = append(perrs, perr) + }) continue } - if root == nil { - root = ast - } else { - root = newAltNode(root, ast) + remains := applyFragments(ast, fragmentASTs) + if len(remains) > 0 { + perrs = append(perrs, &ParseError{ + ID: id, + Pattern: pattern, + Cause: fmt.Errorf("undefined fragment: %+v", remains), + }) + continue } - symPos = p.symPos + ast = newConcatNode(ast, newEndMarkerNode(id)) + symPos, err = positionSymbols(ast, symPos) + if err != nil { + perrs = append(perrs, &ParseError{ + ID: id, + Pattern: pattern, + Cause: err, + Details: p.errMsgDetails, + }) + continue + } + root = genAltNode(root, ast) } if len(perrs) > 0 { - return nil, nil, &ParseErrors{ + return nil, &ParseErrors{ Errors: perrs, } } - return root, genSymbolTable(root), nil + return root, nil +} + +func applyFragments(ast astNode, fragments map[string]astNode) []string { + if ast == nil { + return nil + } + n, ok := ast.(*fragmentNode) + if !ok { + var remains []string + left, right := ast.children() + r := applyFragments(left, fragments) + if len(r) > 0 { + remains = append(remains, r...) 
+ } + r = applyFragments(right, fragments) + if len(r) > 0 { + remains = append(remains, r...) + } + return remains + } + f, ok := fragments[n.symbol] + if !ok { + return []string{n.symbol} + } + n.left = copyAST(f) + return nil } type parser struct { - id int lex *lexer peekedTok *token lastTok *token + incomplete bool errMsgDetails string - symPos uint16 } -func newParser(id int, initialSymPos uint16, src io.Reader) *parser { +func newParser(src io.Reader) *parser { return &parser{ - id: id, - lex: newLexer(src), - peekedTok: nil, - lastTok: nil, - errMsgDetails: "", - symPos: initialSymPos, + lex: newLexer(src), } } @@ -152,10 +263,7 @@ func (p *parser) parse() (ast astNode, retErr error) { if err != nil { return nil, err } - p.symPos, err = positionSymbols(ast, p.symPos) - if err != nil { - return nil, err - } + return ast, nil } @@ -171,7 +279,7 @@ func (p *parser) parseRegexp() (astNode, error) { raiseSyntaxError(synErrGroupNoInitiator) } p.expect(tokenKindEOF) - return newConcatNode(alt, newEndMarkerNode(p.id)), nil + return alt, nil } func (p *parser) parseAlt() astNode { @@ -315,6 +423,9 @@ func (p *parser) parseSingleChar() astNode { if p.consume(tokenKindCharPropLeader) { return p.parseCharProp() } + if p.consume(tokenKindFragmentLeader) { + return p.parseFragment() + } c := p.parseNormalChar() if c == nil { if p.consume(tokenKindBExpClose) { @@ -446,6 +557,24 @@ func (p *parser) parseCharProp() astNode { return alt } +func (p *parser) parseFragment() astNode { + if !p.consume(tokenKindLBrace) { + raiseSyntaxError(synErrFragmentExpInvalidForm) + } + if !p.consume(tokenKindFragmentSymbol) { + raiseSyntaxError(synErrFragmentExpInvalidForm) + } + sym := p.lastTok.fragmentSymbol + + if !p.consume(tokenKindRBrace) { + raiseSyntaxError(synErrFragmentExpInvalidForm) + } + + p.incomplete = true + + return newFragmentNode(sym, nil) +} + func (p *parser) parseNormalChar() astNode { if !p.consume(tokenKindChar) { return nil diff --git a/compiler/parser_test.go 
b/compiler/parser_test.go index aca08ec..3394845 100644 --- a/compiler/parser_test.go +++ b/compiler/parser_test.go @@ -1,7 +1,6 @@ package compiler import ( - "bytes" "fmt" "reflect" "testing" @@ -23,9 +22,10 @@ func endPos(n uint16) symbolPosition { return pos } -func TestParser_parse(t *testing.T) { +func TestParse(t *testing.T) { tests := []struct { pattern string + fragments map[string]string ast astNode syntaxError *SyntaxError @@ -92,6 +92,24 @@ func TestParser_parse(t *testing.T) { skipTestAST: true, }, { + pattern: "\\f{a2c}?", + fragments: map[string]string{ + "a2c": "abc", + }, + ast: genConcatNode( + newOptionNode( + newFragmentNode("a2c", + genConcatNode( + newSymbolNodeWithPos(byte('a'), symPos(1)), + newSymbolNodeWithPos(byte('b'), symPos(2)), + newSymbolNodeWithPos(byte('c'), symPos(3)), + ), + ), + ), + newEndMarkerNodeWithPos(1, endPos(4)), + ), + }, + { pattern: "(a)?", ast: genConcatNode( newOptionNode( @@ -198,6 +216,24 @@ func TestParser_parse(t *testing.T) { skipTestAST: true, }, { + pattern: "\\f{a2c}*", + fragments: map[string]string{ + "a2c": "abc", + }, + ast: genConcatNode( + newRepeatNode( + newFragmentNode("a2c", + genConcatNode( + newSymbolNodeWithPos(byte('a'), symPos(1)), + newSymbolNodeWithPos(byte('b'), symPos(2)), + newSymbolNodeWithPos(byte('c'), symPos(3)), + ), + ), + ), + newEndMarkerNodeWithPos(1, endPos(4)), + ), + }, + { pattern: "((a*)*)*", ast: genConcatNode( newRepeatNode( @@ -306,6 +342,31 @@ func TestParser_parse(t *testing.T) { skipTestAST: true, }, { + pattern: "\\f{a2c}+", + fragments: map[string]string{ + "a2c": "abc", + }, + ast: genConcatNode( + newFragmentNode("a2c", + genConcatNode( + newSymbolNodeWithPos(byte('a'), symPos(1)), + newSymbolNodeWithPos(byte('b'), symPos(2)), + newSymbolNodeWithPos(byte('c'), symPos(3)), + ), + ), + newRepeatNode( + newFragmentNode("a2c", + genConcatNode( + newSymbolNodeWithPos(byte('a'), symPos(4)), + newSymbolNodeWithPos(byte('b'), symPos(5)), + newSymbolNodeWithPos(byte('c'), 
symPos(6)), + ), + ), + ), + newEndMarkerNodeWithPos(1, endPos(7)), + ), + }, + { pattern: "((a+)+)+", ast: genConcatNode( genConcatNode( @@ -917,6 +978,53 @@ func TestParser_parse(t *testing.T) { syntaxError: synErrCharPropExpInvalidForm, }, { + pattern: "\\f{a2c}", + fragments: map[string]string{ + "a2c": "abc", + }, + ast: genConcatNode( + newFragmentNode("a2c", + genConcatNode( + newSymbolNodeWithPos(byte('a'), symPos(1)), + newSymbolNodeWithPos(byte('b'), symPos(2)), + newSymbolNodeWithPos(byte('c'), symPos(3)), + ), + ), + newEndMarkerNodeWithPos(1, endPos(4)), + ), + }, + { + pattern: "\\f{ a2c }", + fragments: map[string]string{ + "a2c": "abc", + }, + ast: genConcatNode( + newFragmentNode("a2c", + genConcatNode( + newSymbolNodeWithPos(byte('a'), symPos(1)), + newSymbolNodeWithPos(byte('b'), symPos(2)), + newSymbolNodeWithPos(byte('c'), symPos(3)), + ), + ), + newEndMarkerNodeWithPos(1, endPos(4)), + ), + }, + { + pattern: "\\f", + syntaxError: synErrFragmentExpInvalidForm, + }, + { + pattern: "\\f{", + syntaxError: synErrFragmentExpInvalidForm, + }, + { + pattern: "\\f{a2c", + fragments: map[string]string{ + "a2c": "abc", + }, + syntaxError: synErrFragmentExpInvalidForm, + }, + { pattern: "(a)", ast: newConcatNode( newSymbolNodeWithPos(byte('a'), symPos(1)), @@ -1085,16 +1193,26 @@ func TestParser_parse(t *testing.T) { } for i, tt := range tests { t.Run(fmt.Sprintf("#%v %v", i, tt.pattern), func(t *testing.T) { - p := newParser(1, symbolPositionMin, bytes.NewReader([]byte(tt.pattern))) - ast, err := p.parse() + fragments := map[string][]byte{} + for kind, pattern := range tt.fragments { + fragments[kind] = []byte(pattern) + } + ast, _, err := parse(map[int][]byte{ + 1: []byte(tt.pattern), + }, fragments) if tt.syntaxError != nil { // printAST(os.Stdout, ast, "", "", false) if err == nil { t.Fatalf("expected syntax error; got: nil") } - synErr, ok := err.(*SyntaxError) + parseErrs, ok := err.(*ParseErrors) + if !ok { + t.Fatalf("expected ParseErrors; got: %v 
(type: %T)", err, err) + } + parseErr := parseErrs.Errors[0].Cause + synErr, ok := parseErr.(*SyntaxError) if !ok { - t.Fatalf("expected SyntaxError; got: %v (type: %T)", err, err) + t.Fatalf("expected SyntaxError; got: %v (type: %T)", parseErr, parseErr) } if synErr != tt.syntaxError { t.Fatalf("unexpected syntax error; want: %v, got: %v", tt.syntaxError, synErr) @@ -1118,10 +1236,10 @@ func TestParser_parse(t *testing.T) { } } -func TestParse(t *testing.T) { +func TestParse_FollowAndSymbolTable(t *testing.T) { root, symTab, err := parse(map[int][]byte{ 1: []byte("(a|b)*abb"), - }) + }, nil) if err != nil { t.Fatal(err) } diff --git a/compiler/syntax_error.go b/compiler/syntax_error.go index d784995..3925259 100644 --- a/compiler/syntax_error.go +++ b/compiler/syntax_error.go @@ -22,6 +22,7 @@ var ( synErrInvalidEscSeq = newSyntaxError("invalid escape sequence") synErrInvalidCodePoint = newSyntaxError("code points must consist of just 4 or 6 hex digits") synErrCharPropInvalidSymbol = newSyntaxError("invalid character property symbol") + SynErrFragmentInvalidSymbol = newSyntaxError("invalid fragment symbol") // syntax errors synErrUnexpectedToken = newSyntaxError("unexpected token") @@ -40,4 +41,5 @@ var ( synErrCPExpOutOfRange = newSyntaxError("a code point must be between U+0000 to U+10FFFF") synErrCharPropExpInvalidForm = newSyntaxError("invalid character property expression") synErrCharPropUnsupported = newSyntaxError("unsupported character property") + synErrFragmentExpInvalidForm = newSyntaxError("invalid fragment expression") ) diff --git a/driver/lexer_test.go b/driver/lexer_test.go index 1d0e887..87a381c 100644 --- a/driver/lexer_test.go +++ b/driver/lexer_test.go @@ -34,6 +34,14 @@ func newLexEntryDefaultNOP(kind string, pattern string) *spec.LexEntry { } } +func newLexEntryFragment(kind string, pattern string) *spec.LexEntry { + return &spec.LexEntry{ + Kind: spec.LexKind(kind), + Pattern: spec.LexPattern(pattern), + Fragment: true, + } +} + func 
newTokenDefault(id int, kind string, match byteSequence) *Token { return newToken(spec.LexModeNumDefault, spec.LexModeNameDefault, id, kind, match) } @@ -477,6 +485,50 @@ func TestLexer_Next(t *testing.T) { { lspec: &spec.LexSpec{ Entries: []*spec.LexEntry{ + newLexEntryDefaultNOP("t1", "\\f{a2c}\\f{d2f}+"), + newLexEntryFragment("a2c", "abc"), + newLexEntryFragment("d2f", "def"), + }, + }, + src: "abcdefdefabcdef", + tokens: []*Token{ + newTokenDefault(1, "t1", newByteSequence([]byte("abcdefdef"))), + newTokenDefault(1, "t1", newByteSequence([]byte("abcdef"))), + newEOFTokenDefault(), + }, + }, + { + lspec: &spec.LexSpec{ + Entries: []*spec.LexEntry{ + newLexEntryDefaultNOP("t1", "(\\f{a2c}|\\f{d2f})+"), + newLexEntryFragment("a2c", "abc"), + newLexEntryFragment("d2f", "def"), + }, + }, + src: "abcdefdefabc", + tokens: []*Token{ + newTokenDefault(1, "t1", newByteSequence([]byte("abcdefdefabc"))), + newEOFTokenDefault(), + }, + }, + { + lspec: &spec.LexSpec{ + Entries: []*spec.LexEntry{ + newLexEntryDefaultNOP("t1", "\\f{a2c_or_d2f}+"), + newLexEntryFragment("a2c_or_d2f", "\\f{a2c}|\\f{d2f}"), + newLexEntryFragment("a2c", "abc"), + newLexEntryFragment("d2f", "def"), + }, + }, + src: "abcdefdefabc", + tokens: []*Token{ + newTokenDefault(1, "t1", newByteSequence([]byte("abcdefdefabc"))), + newEOFTokenDefault(), + }, + }, + { + lspec: &spec.LexSpec{ + Entries: []*spec.LexEntry{ newLexEntryDefaultNOP("white_space", ` *`), newLexEntry([]string{"default"}, "string_open", `"`, "string", false), newLexEntry([]string{"string"}, "escape_sequence", `\\[n"\\]`, "", false), diff --git a/spec/spec.go b/spec/spec.go index e3c318e..61955ac 100644 --- a/spec/spec.go +++ b/spec/spec.go @@ -88,11 +88,12 @@ func (n LexModeNum) IsNil() bool { } type LexEntry struct { - Kind LexKind `json:"kind"` - Pattern LexPattern `json:"pattern"` - Modes []LexModeName `json:"modes"` - Push LexModeName `json:"push"` - Pop bool `json:"pop"` + Kind LexKind `json:"kind"` + Pattern LexPattern 
`json:"pattern"` + Modes []LexModeName `json:"modes"` + Push LexModeName `json:"push"` + Pop bool `json:"pop"` + Fragment bool `json:"fragment"` } func (e *LexEntry) validate() error { |