package parser import ( "fmt" "sort" "strings" "testing" "urubu/grammar" spec "urubu/spec/grammar" "urubu/spec/grammar/parser" ) func TestParserWithConflicts(t *testing.T) { tests := []struct { caption string specSrc string src string cst *Node }{ { caption: "when a shift/reduce conflict occurred, we prioritize the shift action", specSrc: ` #name test; expr : expr assign expr | id ; id: "[A-Za-z0-9_]+"; assign: '='; `, src: `foo=bar=baz`, cst: nonTermNode("expr", nonTermNode("expr", termNode("id", "foo"), ), termNode("assign", "="), nonTermNode("expr", nonTermNode("expr", termNode("id", "bar"), ), termNode("assign", "="), nonTermNode("expr", termNode("id", "baz"), ), ), ), }, { caption: "when a reduce/reduce conflict occurred, we prioritize the production defined earlier in the grammar", specSrc: ` #name test; s : a | b ; a : id ; b : id ; id: "[A-Za-z0-9_]+"; `, src: `foo`, cst: nonTermNode("s", nonTermNode("a", termNode("id", "foo"), ), ), }, { caption: "left associativities defined earlier in the grammar have higher precedence", specSrc: ` #name test; #prec ( #left mul #left add ); expr : expr add expr | expr mul expr | id ; id: "[A-Za-z0-9_]+"; add: '+'; mul: '*'; `, src: `a+b*c*d+e`, cst: nonTermNode("expr", nonTermNode("expr", nonTermNode("expr", termNode("id", "a"), ), termNode("add", "+"), nonTermNode("expr", nonTermNode("expr", nonTermNode("expr", termNode("id", "b"), ), termNode("mul", "*"), nonTermNode("expr", termNode("id", "c"), ), ), termNode("mul", "*"), nonTermNode("expr", termNode("id", "d"), ), ), ), termNode("add", "+"), nonTermNode("expr", termNode("id", "e"), ), ), }, { caption: "left associativities defined in the same line have the same precedence", specSrc: ` #name test; #prec ( #left add sub ); expr : expr add expr | expr sub expr | id ; id: "[A-Za-z0-9_]+"; add: '+'; sub: '-'; `, src: `a-b+c+d-e`, cst: nonTermNode("expr", nonTermNode("expr", nonTermNode("expr", nonTermNode("expr", nonTermNode("expr", termNode("id", "a"), ), termNode("sub", "-"), nonTermNode("expr", termNode("id", "b"), ), ), termNode("add", "+"), nonTermNode("expr", termNode("id", "c"), ), ), termNode("add", "+"), nonTermNode("expr", termNode("id", "d"), ), ), termNode("sub", "-"), nonTermNode("expr", termNode("id", "e"), ), ), }, { caption: "right associativities defined earlier in the grammar have higher precedence", specSrc: ` #name test; #prec ( #right r1 #right r2 ); expr : expr r2 expr | expr r1 expr | id ; whitespaces #skip : "[\u{0009}\u{0020}]+"; r1 : 'r1'; r2 : 'r2'; id : "[A-Za-z0-9_]+"; `, src: `a r2 b r1 c r1 d r2 e`, cst: nonTermNode("expr", nonTermNode("expr", termNode("id", "a"), ), termNode("r2", "r2"), nonTermNode("expr", nonTermNode("expr", nonTermNode("expr", termNode("id", "b"), ), termNode("r1", "r1"), nonTermNode("expr", nonTermNode("expr", termNode("id", "c"), ), termNode("r1", "r1"), nonTermNode("expr", termNode("id", "d"), ), ), ), termNode("r2", "r2"), nonTermNode("expr", termNode("id", "e"), ), ), ), }, { caption: "right associativities defined in the same line have the same precedence", specSrc: ` #name test; #prec ( #right r1 r2 ); expr : expr r2 expr | expr r1 expr | id ; whitespaces #skip : "[\u{0009}\u{0020}]+"; r1 : 'r1'; r2 : 'r2'; id : "[A-Za-z0-9_]+"; `, src: `a r2 b r1 c r1 d r2 e`, cst: nonTermNode("expr", nonTermNode("expr", termNode("id", "a"), ), termNode("r2", "r2"), nonTermNode("expr", nonTermNode("expr", termNode("id", "b"), ), termNode("r1", "r1"), nonTermNode("expr", nonTermNode("expr", termNode("id", "c"), ), termNode("r1", "r1"), nonTermNode("expr", nonTermNode("expr", termNode("id", "d"), ), termNode("r2", "r2"), nonTermNode("expr", termNode("id", "e"), ), ), ), ), ), }, { caption: "terminal symbols with an #assign directive defined earlier in the grammar have higher precedence", specSrc: ` #name test; #prec ( #assign a1 #assign a2 ); expr : expr a2 expr | expr a1 expr | id ; whitespaces #skip : "[\u{0009}\u{0020}]+"; a1 : 'a1'; a2 : 'a2'; id : "[A-Za-z0-9_]+"; `, src: `a a2 b a1 c a1 d a2 e`, cst: nonTermNode("expr", nonTermNode("expr", termNode("id", "a"), ), termNode("a2", "a2"), nonTermNode("expr", nonTermNode("expr", nonTermNode("expr", termNode("id", "b"), ), termNode("a1", "a1"), nonTermNode("expr", nonTermNode("expr", termNode("id", "c"), ), termNode("a1", "a1"), nonTermNode("expr", termNode("id", "d"), ), ), ), termNode("a2", "a2"), nonTermNode("expr", termNode("id", "e"), ), ), ), }, { caption: "terminal symbols with an #assign directive defined in the same line have the same precedence", specSrc: ` #name test; #prec ( #assign a1 a2 ); expr : expr a2 expr | expr a1 expr | id ; whitespaces #skip : "[\u{0009}\u{0020}]+"; a1 : 'a1'; a2 : 'a2'; id : "[A-Za-z0-9_]+"; `, src: `a a2 b a1 c a1 d a2 e`, cst: nonTermNode("expr", nonTermNode("expr", termNode("id", "a"), ), termNode("a2", "a2"), nonTermNode("expr", nonTermNode("expr", termNode("id", "b"), ), termNode("a1", "a1"), nonTermNode("expr", nonTermNode("expr", termNode("id", "c"), ), termNode("a1", "a1"), nonTermNode("expr", nonTermNode("expr", termNode("id", "d"), ), termNode("a2", "a2"), nonTermNode("expr", termNode("id", "e"), ), ), ), ), ), }, { caption: "#left, #right, and #assign can be mixed", specSrc: ` #name test; #prec ( #left mul div #left add sub #assign else #assign then #right assign ); expr : expr add expr | expr sub expr | expr mul expr | expr div expr | expr assign expr | if expr then expr | if expr then expr else expr | id ; ws #skip: "[\u{0009}\u{0020}]+"; if: 'if'; then: 'then'; else: 'else'; id: "[A-Za-z0-9_]+"; add: '+'; sub: '-'; mul: '*'; div: '/'; assign: '='; `, src: `x = y = a + b * c - d / e + if f then if g then h else i`, cst: nonTermNode( "expr", nonTermNode("expr", termNode("id", "x"), ), termNode("assign", "="), nonTermNode("expr", nonTermNode("expr", termNode("id", "y"), ), termNode("assign", "="), nonTermNode("expr", nonTermNode("expr", nonTermNode("expr", nonTermNode("expr", termNode("id", "a"), ), termNode("add", "+"), nonTermNode("expr", nonTermNode("expr", termNode("id", "b"), ), termNode("mul", "*"), nonTermNode("expr", termNode("id", "c"), ), ), ), termNode("sub", "-"), nonTermNode("expr", nonTermNode("expr", termNode("id", "d"), ), termNode("div", "/"), nonTermNode("expr", termNode("id", "e"), ), ), ), termNode("add", "+"), nonTermNode("expr", termNode("if", "if"), nonTermNode("expr", termNode("id", "f"), ), termNode("then", "then"), nonTermNode("expr", termNode("if", "if"), nonTermNode("expr", termNode("id", "g"), ), termNode("then", "then"), nonTermNode("expr", termNode("id", "h"), ), termNode("else", "else"), nonTermNode("expr", termNode("id", "i"), ), ), ), ), ), ), }, } for _, tt := range tests { t.Run(tt.caption, func(t *testing.T) { ast, err := parser.Parse(strings.NewReader(tt.specSrc)) if err != nil { t.Fatal(err) } b := grammar.GrammarBuilder{ AST: ast, } cg, _, err := b.Build() if err != nil { t.Fatal(err) } toks, err := NewTokenStream(cg, strings.NewReader(tt.src)) if err != nil { t.Fatal(err) } gram := NewGrammar(cg) tb := NewDefaultSyntaxTreeBuilder() p, err := NewParser(toks, gram, SemanticAction(NewCSTActionSet(gram, tb))) if err != nil { t.Fatal(err) } err = p.Parse() if err != nil { t.Fatal(err) } if tt.cst != nil { testTree(t, tb.Tree(), tt.cst) } }) } } func TestParserWithLAC(t *testing.T) { specSrc := ` #name test; s : t t ; t : c t | d ; c: 'c'; d: 'd'; ` src := `ccd` actLogWithLAC := []string{ "shift/c", "shift/c", "shift/d", "miss", } actLogWithoutLAC := []string{ "shift/c", "shift/c", "shift/d", "reduce/t", "reduce/t", "reduce/t", "miss", } ast, err := parser.Parse(strings.NewReader(specSrc)) if err != nil { t.Fatal(err) } b := grammar.GrammarBuilder{ AST: ast, } gram, _, err := b.Build() if err != nil { t.Fatal(err) } t.Run("LAC is enabled", func(t *testing.T) { semAct := &testSemAct{ gram: gram, } toks, err := NewTokenStream(gram, strings.NewReader(src)) if err != nil { t.Fatal(err) } p, err := NewParser(toks, NewGrammar(gram), SemanticAction(semAct)) if err != nil { t.Fatal(err) } err = p.Parse() if err != nil { t.Fatal(err) } if len(semAct.actLog) != len(actLogWithLAC) { t.Fatalf("unexpected action log; want: %+v, got: %+v", actLogWithLAC, semAct.actLog) } for i, e := range actLogWithLAC { if semAct.actLog[i] != e { t.Fatalf("unexpected action log; want: %+v, got: %+v", actLogWithLAC, semAct.actLog) } } }) t.Run("LAC is disabled", func(t *testing.T) { semAct := &testSemAct{ gram: gram, } toks, err := NewTokenStream(gram, strings.NewReader(src)) if err != nil { t.Fatal(err) } p, err := NewParser(toks, NewGrammar(gram), SemanticAction(semAct), DisableLAC()) if err != nil { t.Fatal(err) } err = p.Parse() if err != nil { t.Fatal(err) } if len(semAct.actLog) != len(actLogWithoutLAC) { t.Fatalf("unexpected action log; want: %+v, got: %+v", actLogWithoutLAC, semAct.actLog) } for i, e := range actLogWithoutLAC { if semAct.actLog[i] != e { t.Fatalf("unexpected action log; want: %+v, got: %+v", actLogWithoutLAC, semAct.actLog) } } }) } func termNode(kind string, text string, children ...*Node) *Node { return &Node{ Type: NodeTypeTerminal, KindName: kind, Text: text, Children: children, } } func errorNode() *Node { return &Node{ Type: NodeTypeError, KindName: "error", } } func nonTermNode(kind string, children ...*Node) *Node { return &Node{ Type: NodeTypeNonTerminal, KindName: kind, Children: children, } } func TestParser_Parse(t *testing.T) { tests := []struct { specSrc string src string synErr bool cst *Node ast *Node }{ { specSrc: ` #name test; expr : expr add term | term ; term : term mul factor | factor ; factor : l_paren expr r_paren | id ; add : '+'; mul : '*'; l_paren : '('; r_paren : ')'; id : "[A-Za-z_][0-9A-Za-z_]*"; `, src: `(a+(b+c))*d+e`, cst: nonTermNode("expr", nonTermNode("expr", nonTermNode("term", nonTermNode("term", nonTermNode("factor", termNode("l_paren", "("), nonTermNode("expr", nonTermNode("expr", nonTermNode("term", nonTermNode("factor", termNode("id", "a"), ), ), ), termNode("add", "+"), nonTermNode("term", nonTermNode("factor", termNode("l_paren", "("), nonTermNode("expr", nonTermNode("expr", nonTermNode("term", nonTermNode("factor", termNode("id", "b"), ), ), ), termNode("add", "+"), nonTermNode("term", nonTermNode("factor", termNode("id", "c"), ), ), ), termNode("r_paren", ")"), ), ), ), termNode("r_paren", ")"), ), ), termNode("mul", "*"), nonTermNode("factor", termNode("id", "d"), ), ), ), termNode("add", "+"), nonTermNode("term", nonTermNode("factor", termNode("id", "e"), ), ), ), }, // Fragments (\f{}), code point expressions (\u{}), and character property expressions (\p{}) are // not allowed in string literals. { specSrc: ` #name test; s : a b c ; a : '\f{foo}'; b : '\u{0000}'; c : '\p{gc=Letter}'; `, src: `\f{foo}\u{0000}\p{gc=Letter}`, cst: nonTermNode("s", termNode("a", `\f{foo}`), termNode("b", `\u{0000}`), termNode("c", `\p{gc=Letter}`), ), }, // The driver can reduce productions that have the empty alternative and can generate a CST (and AST) node. { specSrc: ` #name test; s : foo bar ; foo : ; bar : bar_text | ; bar_text: "bar"; `, src: ``, cst: nonTermNode("s", nonTermNode("foo"), nonTermNode("bar"), ), }, // The driver can reduce productions that have the empty alternative and can generate a CST (and AST) node. { specSrc: ` #name test; s : foo bar ; foo : ; bar : bar_text | ; bar_text : "bar"; `, src: `bar`, cst: nonTermNode("s", nonTermNode("foo"), nonTermNode("bar", termNode("bar_text", "bar"), ), ), }, // A production can have multiple alternative productions. { specSrc: ` #name test; #prec ( #assign $uminus #left mul div #left add sub ); expr : expr add expr | expr sub expr | expr mul expr | expr div expr | int | sub int #prec $uminus // This 'sub' means the unary minus symbol. ; int : "0|[1-9][0-9]*"; add : '+'; sub : '-'; mul : '*'; div : '/'; `, src: `-1*-2+3-4/5`, ast: nonTermNode("expr", nonTermNode("expr", nonTermNode("expr", nonTermNode("expr", termNode("sub", "-"), termNode("int", "1"), ), termNode("mul", "*"), nonTermNode("expr", termNode("sub", "-"), termNode("int", "2"), ), ), termNode("add", "+"), nonTermNode("expr", termNode("int", "3"), ), ), termNode("sub", "-"), nonTermNode("expr", nonTermNode("expr", termNode("int", "4"), ), termNode("div", "/"), nonTermNode("expr", termNode("int", "5"), ), ), ), }, // A lexical production can have multiple production directives. { specSrc: ` #name test; s : push_a push_b pop pop ; push_a #mode default #push a : '->a'; push_b #mode a #push b : '->b'; pop #mode a b #pop : '<-'; `, src: `->a->b<-<-`, ast: nonTermNode("s", termNode("push_a", "->a"), termNode("push_b", "->b"), termNode("pop", "<-"), termNode("pop", "<-"), ), }, { specSrc: ` #name test; mode_tran_seq : mode_tran_seq mode_tran | mode_tran ; mode_tran : push_m1 | push_m2 | pop_m1 | pop_m2 ; push_m1 #push m1 : "->"; push_m2 #mode m1 #push m2 : "-->"; pop_m1 #mode m1 #pop : "<-"; pop_m2 #mode m2 #pop : "<--"; whitespace #mode default m1 m2 #skip : "\u{0020}+"; `, src: ` -> --> <-- <- `, }, { specSrc: ` #name test; s : foo bar ; foo : "foo"; bar #mode default : "bar"; `, src: `foobar`, }, // When #push and #pop are applied to the same symbol, #pop will run first, then #push. { specSrc: ` #name test; s : foo bar baz ; foo #push m1 : 'foo'; bar #mode m1 #pop #push m2 : 'bar'; baz #mode m2 : 'baz'; `, src: `foobarbaz`, ast: nonTermNode("s", termNode("foo", "foo"), termNode("bar", "bar"), termNode("baz", "baz"), ), }, // When #push and #pop are applied to the same symbol, #pop will run first, then #push, even if #push appears first // in a definition. That is, the order in which #push and #pop appear in grammar has nothing to do with the order in which // they are executed. { specSrc: ` #name test; s : foo bar baz ; foo #push m1 : 'foo'; bar #mode m1 #push m2 #pop : 'bar'; baz #mode m2 : 'baz'; `, src: `foobarbaz`, ast: nonTermNode("s", termNode("foo", "foo"), termNode("bar", "bar"), termNode("baz", "baz"), ), }, // The parser can skips specified tokens. { specSrc: ` #name test; s : foo bar ; foo : "foo"; bar : "bar"; white_space #skip : "[\u{0009}\u{0020}]+"; `, src: `foo bar`, }, // A grammar can contain fragments. { specSrc: ` #name test; s : tagline ; tagline : "\f{words} IS OUT THERE."; fragment words : "[A-Za-z\u{0020}]+"; `, src: `THE TRUTH IS OUT THERE.`, }, // A grammar can contain ast actions. { specSrc: ` #name test; list : l_bracket elems r_bracket #ast elems... ; elems : elems comma id #ast elems... id | id ; whitespace #skip : "\u{0020}+"; l_bracket : '['; r_bracket : ']'; comma : ','; id : "[A-Za-z]+"; `, src: `[Byers, Frohike, Langly]`, cst: nonTermNode("list", termNode("x_1", "["), nonTermNode("elems", nonTermNode("elems", nonTermNode("elems", termNode("id", "Byers"), ), termNode("x_3", ","), termNode("id", "Frohike"), ), termNode("x_3", ","), termNode("id", "Langly"), ), termNode("x_2", "]"), ), ast: nonTermNode("list", termNode("id", "Byers"), termNode("id", "Frohike"), termNode("id", "Langly"), ), }, // The '...' operator can expand child nodes. { specSrc: ` #name test; s : a #ast a... ; a : a comma foo #ast a... foo | foo ; comma : ','; foo : 'foo'; `, src: `foo,foo,foo`, ast: nonTermNode("s", termNode("foo", "foo"), termNode("foo", "foo"), termNode("foo", "foo"), ), }, // The '...' operator also can applied to an element having no children. { specSrc: ` #name test; s : a semi_colon #ast a... ; a : ; semi_colon : ';'; `, src: `;`, ast: nonTermNode("s"), }, // A label can be a parameter of #ast directive. { specSrc: ` #name test; #prec ( #left add sub ); expr : expr@lhs add expr@rhs #ast add lhs rhs | expr@lhs sub expr@rhs #ast sub lhs rhs | num ; add : '+'; sub : '-'; num : "0|[1-9][0-9]*"; `, src: `1+2-3`, ast: nonTermNode("expr", termNode("sub", "-"), nonTermNode("expr", termNode("add", "+"), nonTermNode("expr", termNode("num", "1"), ), nonTermNode("expr", termNode("num", "2"), ), ), nonTermNode("expr", termNode("num", "3"), ), ), }, // An AST can contain a symbol name, even if the symbol has a label. That is, unused labels are allowed. { specSrc: ` #name test; s : foo@x semi_colon #ast foo ; semi_colon : ';'; foo : 'foo'; `, src: `foo;`, ast: nonTermNode("s", termNode("foo", "foo"), ), }, // A production has the same precedence and associativity as the right-most terminal symbol. { specSrc: ` #name test; #prec ( #left add ); expr : expr add expr // This alternative has the same precedence and associativiry as 'add'. | int ; ws #skip : "[\u{0009}\u{0020}]+"; int : "0|[1-9][0-9]*"; add : '+'; `, // This source is recognized as the following structure because the production `expr → expr add expr` has the same // precedence and associativity as the symbol 'add'. // // ((1+2)+3) // // If the symbol doesn't have the precedence and left associativity, the production also doesn't have the precedence // and associativity and this source will be recognized as the following structure. // // (1+(2+3)) src: `1+2+3`, ast: nonTermNode("expr", nonTermNode("expr", nonTermNode("expr", termNode("int", "1"), ), termNode("add", "+"), nonTermNode("expr", termNode("int", "2"), ), ), termNode("add", "+"), nonTermNode("expr", termNode("int", "3"), ), ), }, // The 'prec' directive can set precedence of a production. { specSrc: ` #name test; #prec ( #assign $uminus #left mul div #left add sub ); expr : expr add expr | expr sub expr | expr mul expr | expr div expr | int | sub int #prec $uminus // This 'sub' means a unary minus symbol. ; ws #skip : "[\u{0009}\u{0020}]+"; int : "0|[1-9][0-9]*"; add : '+'; sub : '-'; mul : '*'; div : '/'; `, // This source is recognized as the following structure because the production `expr → sub expr` // has the `#prec mul` directive and has the same precedence of the symbol `mul`. // // (((-1) * 20) / 5) // // If the production doesn't have the `#prec` directive, this source will be recognized as // the following structure. // // (- ((1 * 20) / 5)) src: `-1*20/5`, cst: nonTermNode("expr", nonTermNode("expr", nonTermNode("expr", termNode("sub", "-"), termNode("int", "1"), ), termNode("mul", "*"), nonTermNode("expr", termNode("int", "20"), ), ), termNode("div", "/"), nonTermNode("expr", termNode("int", "5"), ), ), }, // The grammar can contain the 'error' symbol. { specSrc: ` #name test; s : id id id semi_colon | error semi_colon ; ws #skip : "[\u{0009}\u{0020}]+"; semi_colon : ';'; id : "[A-Za-z_]+"; `, src: `foo bar baz ;`, }, // The 'error' symbol can appear in an #ast directive. { specSrc: ` #name test; s : foo semi_colon | error semi_colon #ast error ; semi_colon : ';'; foo : 'foo'; `, src: `bar;`, synErr: true, ast: nonTermNode("s", errorNode(), ), }, // The 'error' symbol can have a label, and an #ast can reference it. { specSrc: ` #name test; s : foo semi_colon | error@e semi_colon #ast e ; semi_colon : ';'; foo : 'foo'; `, src: `bar;`, synErr: true, ast: nonTermNode("s", errorNode(), ), }, // The grammar can contain the 'recover' directive. { specSrc: ` #name test; seq : seq elem | elem ; elem : id id id semi_colon | error semi_colon #recover ; ws #skip : "[\u{0009}\u{0020}]+"; semi_colon : ';'; id : "[A-Za-z_]+"; `, src: `a b c ; d e f ;`, }, // The same label can be used between different alternatives. { specSrc: ` #name test; s : foo@x bar | foo@x ; foo: 'foo'; bar: 'bar'; `, src: `foo`, }, } for i, tt := range tests { t.Run(fmt.Sprintf("#%v", i), func(t *testing.T) { ast, err := parser.Parse(strings.NewReader(tt.specSrc)) if err != nil { t.Fatal(err) } b := grammar.GrammarBuilder{ AST: ast, } cg, _, err := b.Build() if err != nil { t.Fatal(err) } toks, err := NewTokenStream(cg, strings.NewReader(tt.src)) if err != nil { t.Fatal(err) } gram := NewGrammar(cg) tb := NewDefaultSyntaxTreeBuilder() var opt []ParserOption switch { case tt.ast != nil: opt = append(opt, SemanticAction(NewASTActionSet(gram, tb))) case tt.cst != nil: opt = append(opt, SemanticAction(NewCSTActionSet(gram, tb))) } p, err := NewParser(toks, gram, opt...) if err != nil { t.Fatal(err) } err = p.Parse() if err != nil { t.Fatal(err) } if !tt.synErr && len(p.SyntaxErrors()) > 0 { for _, synErr := range p.SyntaxErrors() { t.Fatalf("unexpected syntax errors occurred: %v", synErr) } } switch { case tt.ast != nil: testTree(t, tb.Tree(), tt.ast) case tt.cst != nil: testTree(t, tb.Tree(), tt.cst) } }) } } func testTree(t *testing.T, node, expected *Node) { t.Helper() if node.Type != expected.Type || node.KindName != expected.KindName || node.Text != expected.Text { t.Fatalf("unexpected node; want: %+v, got: %+v", expected, node) } if len(node.Children) != len(expected.Children) { t.Fatalf("unexpected children; want: %v, got: %v", len(expected.Children), len(node.Children)) } for i, c := range node.Children { testTree(t, c, expected.Children[i]) } } type testSemAct struct { gram *spec.CompiledGrammar actLog []string } func (a *testSemAct) Shift(tok VToken, recovered bool) { t := a.gram.Syntactic.Terminals[tok.TerminalID()] if recovered { a.actLog = append(a.actLog, fmt.Sprintf("shift/%v/recovered", t)) } else { a.actLog = append(a.actLog, fmt.Sprintf("shift/%v", t)) } } func (a *testSemAct) Reduce(prodNum int, recovered bool) { lhsSym := a.gram.Syntactic.LHSSymbols[prodNum] lhsText := a.gram.Syntactic.NonTerminals[lhsSym] if recovered { a.actLog = append(a.actLog, fmt.Sprintf("reduce/%v/recovered", lhsText)) } else { a.actLog = append(a.actLog, fmt.Sprintf("reduce/%v", lhsText)) } } func (a *testSemAct) Accept() { a.actLog = append(a.actLog, "accept") } func (a *testSemAct) TrapAndShiftError(cause VToken, popped int) { a.actLog = append(a.actLog, fmt.Sprintf("trap/%v/shift/error", popped)) } func (a *testSemAct) MissError(cause VToken) { a.actLog = append(a.actLog, "miss") } func TestParserWithSemanticAction(t *testing.T) { specSrcWithErrorProd := ` #name test; seq : seq elem semicolon | elem semicolon | error star star semicolon | error semicolon #recover ; elem : char char char ; ws #skip : "[\u{0009}\u{0020}]+"; semicolon : ';'; star : '*'; char : "[a-z]"; ` specSrcWithoutErrorProd := ` #name test; seq : seq elem semicolon | elem semicolon ; elem : char char char ; ws #skip : "[\u{0009}\u{0020}]+"; semicolon : ';'; char : "[a-z]"; ` tests := []struct { caption string specSrc string src string actLog []string }{ { caption: "when an input contains no syntax error, the driver calls `Shift`, `Reduce`, and `Accept`.", specSrc: specSrcWithErrorProd, src: `a b c; d e f;`, actLog: []string{ "shift/char", "shift/char", "shift/char", "reduce/elem", "shift/semicolon", "reduce/seq", "shift/char", "shift/char", "shift/char", "reduce/elem", "shift/semicolon", "reduce/seq", "accept", }, }, { caption: "when a grammar has `error` symbol, the driver calls `TrapAndShiftError`.", specSrc: specSrcWithErrorProd, src: `a; b !; c d !; e ! * *; h i j;`, actLog: []string{ "shift/char", "trap/1/shift/error", "shift/semicolon", "reduce/seq/recovered", "shift/char", "trap/2/shift/error", "shift/semicolon", "reduce/seq/recovered", "shift/char", "shift/char", "trap/3/shift/error", "shift/semicolon", "reduce/seq/recovered", "shift/char", "trap/2/shift/error", "shift/star", "shift/star", // When the driver shifts three times, it recovers from an error. "shift/semicolon/recovered", "reduce/seq", "shift/char", "shift/char", "shift/char", "reduce/elem", "shift/semicolon", "reduce/seq", // Even if the input contains syntax errors, the driver calls `Accept` when the input is accepted // according to the error production. "accept", }, }, { caption: "when the input doesn't meet the error production, the driver calls `MissError`.", specSrc: specSrcWithErrorProd, src: `a !`, actLog: []string{ "shift/char", "trap/1/shift/error", "miss", }, }, { caption: "when a syntax error isn't trapped, the driver calls `MissError`.", specSrc: specSrcWithoutErrorProd, src: `a !`, actLog: []string{ "shift/char", "miss", }, }, } for _, tt := range tests { t.Run(tt.caption, func(t *testing.T) { ast, err := parser.Parse(strings.NewReader(tt.specSrc)) if err != nil { t.Fatal(err) } b := grammar.GrammarBuilder{ AST: ast, } gram, _, err := b.Build() if err != nil { t.Fatal(err) } toks, err := NewTokenStream(gram, strings.NewReader(tt.src)) if err != nil { t.Fatal(err) } semAct := &testSemAct{ gram: gram, } p, err := NewParser(toks, NewGrammar(gram), SemanticAction(semAct)) if err != nil { t.Fatal(err) } err = p.Parse() if err != nil { t.Fatal(err) } if len(semAct.actLog) != len(tt.actLog) { t.Fatalf("unexpected action log; want: %+v, got: %+v", tt.actLog, semAct.actLog) } for i, e := range tt.actLog { if semAct.actLog[i] != e { t.Fatalf("unexpected action log; want: %+v, got: %+v", tt.actLog, semAct.actLog) } } }) } } func TestParserWithSyntaxErrors(t *testing.T) { tests := []struct { caption string specSrc string src string synErrCount int }{ { caption: "the parser can report a syntax error", specSrc: ` #name test; s : foo ; foo : 'foo'; `, src: `bar`, synErrCount: 1, }, { caption: "when the parser reduced a production having the reduce directive, the parser will recover from an error state", specSrc: ` #name test; seq : seq elem semi_colon | elem semi_colon | error semi_colon #recover ; elem : a b c ; ws #skip : "[\u{0009}\u{0020}]+"; semi_colon : ';'; a : 'a'; b : 'b'; c : 'c'; `, src: `!; a!; ab!;`, synErrCount: 3, }, { caption: "After the parser shifts the error symbol, symbols are ignored until a symbol the parser can perform shift appears", specSrc: ` #name test; seq : seq elem semi_colon | elem semi_colon | error semi_colon #recover ; elem : a b c ; ws #skip : "[\u{0009}\u{0020}]+"; semi_colon : ';'; a : 'a'; b : 'b'; c : 'c'; `, // After the parser trasits to the error state reading the first invalid symbol ('!'), // the second and third invalid symbols ('!') are ignored. src: `! ! !; a!; ab!;`, synErrCount: 3, }, { caption: "when the parser performs shift three times, the parser recovers from the error state", specSrc: ` #name test; seq : seq elem semi_colon | elem semi_colon | error star star semi_colon ; elem : a b c ; ws #skip : "[\u{0009}\u{0020}]+"; semi_colon : ';'; star : '*'; a : 'a'; b : 'b'; c : 'c'; `, src: `!**; a!**; ab!**; abc!`, synErrCount: 4, }, } for i, tt := range tests { t.Run(fmt.Sprintf("#%v", i), func(t *testing.T) { ast, err := parser.Parse(strings.NewReader(tt.specSrc)) if err != nil { t.Fatal(err) } b := grammar.GrammarBuilder{ AST: ast, } gram, _, err := b.Build() if err != nil { t.Fatal(err) } toks, err := NewTokenStream(gram, strings.NewReader(tt.src)) if err != nil { t.Fatal(err) } p, err := NewParser(toks, NewGrammar(gram)) if err != nil { t.Fatal(err) } err = p.Parse() if err != nil { t.Fatal(err) } synErrs := p.SyntaxErrors() if len(synErrs) != tt.synErrCount { t.Fatalf("unexpected syntax error; want: %v error(s), got: %v error(s)", tt.synErrCount, len(synErrs)) } }) } } func TestParserWithSyntaxErrorAndExpectedLookahead(t *testing.T) { tests := []struct { caption string specSrc string src string cause string expected []string }{ { caption: "the parser reports an expected lookahead symbol", specSrc: ` #name test; s : foo ; foo : 'foo'; `, src: `bar`, cause: `bar`, expected: []string{ "foo", }, }, { caption: "the parser reports expected lookahead symbols", specSrc: ` #name test; s : foo | bar ; foo : 'foo'; bar : 'bar'; `, src: `baz`, cause: `baz`, expected: []string{ "foo", "bar", }, }, { caption: "the parser may report the EOF as an expected lookahead symbol", specSrc: ` #name test; s : foo ; foo : 'foo'; `, src: `foobar`, cause: `bar`, expected: []string{ "", }, }, { caption: "the parser may report the EOF and others as expected lookahead symbols", specSrc: ` #name test; s : foo | ; foo : 'foo'; `, src: `bar`, cause: `bar`, expected: []string{ "foo", "", }, }, } for i, tt := range tests { t.Run(fmt.Sprintf("#%v", i), func(t *testing.T) { ast, err := parser.Parse(strings.NewReader(tt.specSrc)) if err != nil { t.Fatal(err) } b := grammar.GrammarBuilder{ AST: ast, } gram, _, err := b.Build() if err != nil { t.Fatal(err) } toks, err := NewTokenStream(gram, strings.NewReader(tt.src)) if err != nil { t.Fatal(err) } p, err := NewParser(toks, NewGrammar(gram)) if err != nil { t.Fatal(err) } err = p.Parse() if err != nil { t.Fatal(err) } synErrs := p.SyntaxErrors() if synErrs == nil { t.Fatalf("expected one syntax error, but it didn't occur") } if len(synErrs) != 1 { t.Fatalf("too many syntax errors: %v errors", len(synErrs)) } synErr := synErrs[0] if string(synErr.Token.Lexeme()) != tt.cause { t.Fatalf("unexpected lexeme: want: %v, got: %v", tt.cause, string(synErr.Token.Lexeme())) } if len(synErr.ExpectedTerminals) != len(tt.expected) { t.Fatalf("unexpected lookahead symbols: want: %v, got: %v", tt.expected, synErr.ExpectedTerminals) } sort.Slice(tt.expected, func(i, j int) bool { return tt.expected[i] < tt.expected[j] }) sort.Slice(synErr.ExpectedTerminals, func(i, j int) bool { return synErr.ExpectedTerminals[i] < synErr.ExpectedTerminals[j] }) for i, e := range tt.expected { if synErr.ExpectedTerminals[i] != e { t.Errorf("unexpected lookahead symbol: want: %v, got: %v", e, synErr.ExpectedTerminals[i]) } } }) } } func MainTest() {}