package grammar import ( "crypto/sha256" "encoding/binary" "encoding/hex" "errors" "fmt" "io" "sort" "strconv" "strings" verr "urubu/error" "urubu/grammar/lexical" "urubu/grammar/symbol" spec "urubu/spec/grammar" "urubu/spec/grammar/parser" ) type firstEntry struct { symbols map[symbol.Symbol]struct{} empty bool } func newFirstEntry() *firstEntry { return &firstEntry{ symbols: map[symbol.Symbol]struct{}{}, empty: false, } } func (e *firstEntry) add(sym symbol.Symbol) bool { if _, ok := e.symbols[sym]; ok { return false } e.symbols[sym] = struct{}{} return true } func (e *firstEntry) addEmpty() bool { if !e.empty { e.empty = true return true } return false } func (e *firstEntry) mergeExceptEmpty(target *firstEntry) bool { if target == nil { return false } changed := false for sym := range target.symbols { added := e.add(sym) if added { changed = true } } return changed } type firstSet struct { set map[symbol.Symbol]*firstEntry } func newFirstSet(prods *productionSet) *firstSet { fst := &firstSet{ set: map[symbol.Symbol]*firstEntry{}, } for _, prod := range prods.getAllProductions() { if _, ok := fst.set[prod.lhs]; ok { continue } fst.set[prod.lhs] = newFirstEntry() } return fst } func (fst *firstSet) find(prod *production, head int) (*firstEntry, error) { entry := newFirstEntry() if prod.rhsLen <= head { entry.addEmpty() return entry, nil } for _, sym := range prod.rhs[head:] { if sym.IsTerminal() { entry.add(sym) return entry, nil } e := fst.findBySymbol(sym) if e == nil { return nil, fmt.Errorf("an entry of FIRST was not found; symbol: %s", sym) } for s := range e.symbols { entry.add(s) } if !e.empty { return entry, nil } } entry.addEmpty() return entry, nil } func (fst *firstSet) findBySymbol(sym symbol.Symbol) *firstEntry { return fst.set[sym] } type firstComContext struct { first *firstSet } func newFirstComContext(prods *productionSet) *firstComContext { return &firstComContext{ first: newFirstSet(prods), } } func genFirstSet(prods *productionSet) (*firstSet, error) { cc := newFirstComContext(prods) for { more := false for _, prod := range prods.getAllProductions() { e := cc.first.findBySymbol(prod.lhs) changed, err := genProdFirstEntry(cc, e, prod) if err != nil { return nil, err } if changed { more = true } } if !more { break } } return cc.first, nil } func genProdFirstEntry(cc *firstComContext, acc *firstEntry, prod *production) (bool, error) { if prod.isEmpty() { return acc.addEmpty(), nil } for _, sym := range prod.rhs { if sym.IsTerminal() { return acc.add(sym), nil } e := cc.first.findBySymbol(sym) changed := acc.mergeExceptEmpty(e) if !e.empty { return changed, nil } } return acc.addEmpty(), nil } type astActionEntry struct { position int expansion bool } type assocType string const ( assocTypeNil = assocType("") assocTypeLeft = assocType("left") assocTypeRight = assocType("right") ) const ( precNil = 0 precMin = 1 ) // precAndAssoc represents precedence and associativities of terminal symbols and productions. // We use the priority of the production to resolve shift/reduce conflicts. type precAndAssoc struct { // termPrec and termAssoc represent the precedence of the terminal symbols. termPrec map[symbol.SymbolNum]int termAssoc map[symbol.SymbolNum]assocType // prodPrec and prodAssoc represent the precedence and the associativities of the production. // These values are inherited from the right-most terminal symbols in the RHS of the productions. prodPrec map[productionNum]int prodAssoc map[productionNum]assocType } func (pa *precAndAssoc) terminalPrecedence(sym symbol.SymbolNum) int { prec, ok := pa.termPrec[sym] if !ok { return precNil } return prec } func (pa *precAndAssoc) terminalAssociativity(sym symbol.SymbolNum) assocType { assoc, ok := pa.termAssoc[sym] if !ok { return assocTypeNil } return assoc } func (pa *precAndAssoc) productionPredence(prod productionNum) int { prec, ok := pa.prodPrec[prod] if !ok { return precNil } return prec } func (pa *precAndAssoc) productionAssociativity(prod productionNum) assocType { assoc, ok := pa.prodAssoc[prod] if !ok { return assocTypeNil } return assoc } const reservedSymbolNameError = "error" type Grammar struct { name string lexSpec *lexical.LexSpec skipSymbols []symbol.Symbol productionSet *productionSet augmentedStartSymbol symbol.Symbol errorSymbol symbol.Symbol symbolTable *symbol.SymbolTableReader astActions map[productionID][]*astActionEntry precAndAssoc *precAndAssoc // recoverProductions is a set of productions having the recover directive. recoverProductions map[productionID]struct{} } type buildConfig struct { isReportingEnabled bool } type BuildOption func(config *buildConfig) func EnableReporting() BuildOption { return func(config *buildConfig) { config.isReportingEnabled = true } } type GrammarBuilder struct { AST *parser.RootNode errs verr.SpecErrors } func (b *GrammarBuilder) Build(opts ...BuildOption) (*spec.CompiledGrammar, *spec.Report, error) { gram, err := b.build() if err != nil { return nil, nil, err } return compile(gram, opts...) } func (b *GrammarBuilder) build() (*Grammar, error) { var specName string { errOccurred := false for _, dir := range b.AST.Directives { if dir.Name != "name" { continue } if len(dir.Parameters) != 1 || dir.Parameters[0].ID == "" { b.errs = append(b.errs, &verr.SpecError{ Cause: semErrDirInvalidParam, Detail: "'name' takes just one ID parameter", Row: dir.Pos.Row, Col: dir.Pos.Col, }) errOccurred = true break } specName = dir.Parameters[0].ID break } if specName == "" && !errOccurred { b.errs = append(b.errs, &verr.SpecError{ Cause: semErrNoGrammarName, }) } } b.checkSpellingInconsistenciesOfUserDefinedIDs(b.AST) if len(b.errs) > 0 { return nil, b.errs } symTab, ss, err := b.genSymbolTable(b.AST) if err != nil { return nil, err } lexSpec, skip, err := b.genLexSpecAndSkipSymbols(symTab.Reader(), b.AST) if err != nil { return nil, err } prodsAndActs, err := b.genProductionsAndActions(b.AST, symTab.Reader(), ss.errSym, ss.augStartSym, ss.startSym) if err != nil { return nil, err } if prodsAndActs == nil && len(b.errs) > 0 { return nil, b.errs } pa, err := b.genPrecAndAssoc(symTab.Reader(), ss.errSym, prodsAndActs) if err != nil { return nil, err } if pa == nil && len(b.errs) > 0 { return nil, b.errs } syms := findUsedAndUnusedSymbols(b.AST) if syms == nil && len(b.errs) > 0 { return nil, b.errs } // When a terminal symbol that cannot be reached from the start symbol has the skip directive, // the compiler treats its terminal as a used symbol, not unused. { r := symTab.Reader() for _, sym := range skip { s, _ := r.ToText(sym) if _, ok := syms.unusedTerminals[s]; !ok { prod := syms.usedTerminals[s] b.errs = append(b.errs, &verr.SpecError{ Cause: semErrTermCannotBeSkipped, Detail: s, Row: prod.Pos.Row, Col: prod.Pos.Col, }) continue } delete(syms.unusedTerminals, s) } } for sym, prod := range syms.unusedProductions { b.errs = append(b.errs, &verr.SpecError{ Cause: semErrUnusedProduction, Detail: sym, Row: prod.Pos.Row, Col: prod.Pos.Col, }) } for sym, prod := range syms.unusedTerminals { b.errs = append(b.errs, &verr.SpecError{ Cause: semErrUnusedTerminal, Detail: sym, Row: prod.Pos.Row, Col: prod.Pos.Col, }) } if len(b.errs) > 0 { return nil, b.errs } return &Grammar{ name: specName, lexSpec: lexSpec, skipSymbols: skip, productionSet: prodsAndActs.prods, augmentedStartSymbol: prodsAndActs.augStartSym, errorSymbol: ss.errSym, symbolTable: symTab.Reader(), astActions: prodsAndActs.astActs, recoverProductions: prodsAndActs.recoverProds, precAndAssoc: pa, }, nil } type usedAndUnusedSymbols struct { unusedProductions map[string]*parser.ProductionNode unusedTerminals map[string]*parser.ProductionNode usedTerminals map[string]*parser.ProductionNode } func findUsedAndUnusedSymbols(root *parser.RootNode) *usedAndUnusedSymbols { prods := map[string]*parser.ProductionNode{} lexProds := map[string]*parser.ProductionNode{} mark := map[string]bool{} { for _, p := range root.Productions { prods[p.LHS] = p mark[p.LHS] = false for _, alt := range p.RHS { for _, e := range alt.Elements { if e.ID == "" { continue } mark[e.ID] = false } } } for _, p := range root.LexProductions { lexProds[p.LHS] = p mark[p.LHS] = false } start := root.Productions[0] mark[start.LHS] = true markUsedSymbols(mark, map[string]bool{}, prods, start) // We don't have to check the error symbol because the error symbol doesn't have a production. delete(mark, reservedSymbolNameError) } usedTerms := make(map[string]*parser.ProductionNode, len(lexProds)) unusedProds := map[string]*parser.ProductionNode{} unusedTerms := map[string]*parser.ProductionNode{} for sym, used := range mark { if p, ok := prods[sym]; ok { if used { continue } unusedProds[sym] = p continue } if p, ok := lexProds[sym]; ok { if used { usedTerms[sym] = p } else { unusedTerms[sym] = p } continue } // May be reached here when a fragment name appears on the right-hand side of a production rule. However, an error // to the effect that a production rule cannot contain a fragment will be detected in a subsequent process. So we can // ignore it here. } return &usedAndUnusedSymbols{ usedTerminals: usedTerms, unusedProductions: unusedProds, unusedTerminals: unusedTerms, } } func markUsedSymbols(mark map[string]bool, marked map[string]bool, prods map[string]*parser.ProductionNode, prod *parser.ProductionNode) { if marked[prod.LHS] { return } for _, alt := range prod.RHS { for _, e := range alt.Elements { if e.ID == "" { continue } mark[e.ID] = true p, ok := prods[e.ID] if !ok { continue } // Remove a production to avoid inifinite recursion. marked[prod.LHS] = true markUsedSymbols(mark, marked, prods, p) } } } func (b *GrammarBuilder) checkSpellingInconsistenciesOfUserDefinedIDs(root *parser.RootNode) { var ids []string { for _, prod := range root.Productions { ids = append(ids, prod.LHS) for _, alt := range prod.RHS { for _, elem := range alt.Elements { if elem.Label != nil { ids = append(ids, elem.Label.Name) } } } } for _, prod := range root.LexProductions { ids = append(ids, prod.LHS) } for _, dir := range root.Directives { dirIDs := collectUserDefinedIDsFromDirective(dir) if len(dirIDs) > 0 { ids = append(ids, dirIDs...) } } } duplicated := lexical.FindSpellingInconsistencies(ids) if len(duplicated) == 0 { return } for _, dup := range duplicated { var s string { var b strings.Builder fmt.Fprintf(&b, "%+v", dup[0]) for _, id := range dup[1:] { fmt.Fprintf(&b, ", %+v", id) } s = b.String() } b.errs = append(b.errs, &verr.SpecError{ Cause: semErrSpellingInconsistency, Detail: s, }) } } func collectUserDefinedIDsFromDirective(dir *parser.DirectiveNode) []string { var ids []string for _, param := range dir.Parameters { if param.Group != nil { for _, d := range param.Group { dIDs := collectUserDefinedIDsFromDirective(d) if len(dIDs) > 0 { ids = append(ids, dIDs...) } } } if param.OrderedSymbol != "" { ids = append(ids, param.OrderedSymbol) } } return ids } type symbols struct { errSym symbol.Symbol augStartSym symbol.Symbol startSym symbol.Symbol } func (b *GrammarBuilder) genSymbolTable(root *parser.RootNode) (*symbol.SymbolTable, *symbols, error) { symTab := symbol.NewSymbolTable() w := symTab.Writer() r := symTab.Reader() // We need to register the reserved symbol before registering others. var errSym symbol.Symbol { sym, err := w.RegisterTerminalSymbol(reservedSymbolNameError) if err != nil { return nil, nil, err } errSym = sym } for _, prod := range root.LexProductions { if sym, exist := r.ToSymbol(prod.LHS); exist { if sym == errSym { b.errs = append(b.errs, &verr.SpecError{ Cause: semErrErrSymIsReserved, Row: prod.Pos.Row, Col: prod.Pos.Col, }) } else { b.errs = append(b.errs, &verr.SpecError{ Cause: semErrDuplicateTerminal, Detail: prod.LHS, Row: prod.Pos.Row, Col: prod.Pos.Col, }) } continue } _, err := w.RegisterTerminalSymbol(prod.LHS) if err != nil { return nil, nil, err } } startProd := root.Productions[0] augStartText := fmt.Sprintf("%s'", startProd.LHS) var err error augStartSym, err := w.RegisterStartSymbol(augStartText) if err != nil { return nil, nil, err } if augStartSym == errSym { b.errs = append(b.errs, &verr.SpecError{ Cause: semErrErrSymIsReserved, Row: startProd.Pos.Row, Col: startProd.Pos.Col, }) } startSym, err := w.RegisterNonTerminalSymbol(startProd.LHS) if err != nil { return nil, nil, err } if startSym == errSym { b.errs = append(b.errs, &verr.SpecError{ Cause: semErrErrSymIsReserved, Row: startProd.Pos.Row, Col: startProd.Pos.Col, }) } for _, prod := range root.Productions { sym, err := w.RegisterNonTerminalSymbol(prod.LHS) if err != nil { return nil, nil, err } if sym.IsTerminal() { b.errs = append(b.errs, &verr.SpecError{ Cause: semErrDuplicateName, Detail: prod.LHS, Row: prod.Pos.Row, Col: prod.Pos.Col, }) } if sym == errSym { b.errs = append(b.errs, &verr.SpecError{ Cause: semErrErrSymIsReserved, Row: prod.Pos.Row, Col: prod.Pos.Col, }) } } return symTab, &symbols{ errSym: errSym, augStartSym: augStartSym, startSym: startSym, }, nil } func (b *GrammarBuilder) genLexSpecAndSkipSymbols(symTab *symbol.SymbolTableReader, root *parser.RootNode) (*lexical.LexSpec, []symbol.Symbol, error) { entries := []*lexical.LexEntry{} skipSyms := []symbol.Symbol{} for _, prod := range root.LexProductions { entry, skip, specErr, err := genLexEntry(prod) if err != nil { return nil, nil, err } if specErr != nil { b.errs = append(b.errs, specErr) continue } if skip { sym, _ := symTab.ToSymbol(prod.LHS) skipSyms = append(skipSyms, sym) } entries = append(entries, entry) } checkedFragments := map[string]struct{}{} for _, fragment := range root.Fragments { if _, exist := checkedFragments[fragment.LHS]; exist { b.errs = append(b.errs, &verr.SpecError{ Cause: semErrDuplicateFragment, Detail: fragment.LHS, Row: fragment.Pos.Row, Col: fragment.Pos.Col, }) continue } checkedFragments[fragment.LHS] = struct{}{} entries = append(entries, &lexical.LexEntry{ Fragment: true, Kind: spec.LexKindName(fragment.LHS), Pattern: fragment.RHS, }) } return &lexical.LexSpec{ Entries: entries, }, skipSyms, nil } func genLexEntry(prod *parser.ProductionNode) (*lexical.LexEntry, bool, *verr.SpecError, error) { alt := prod.RHS[0] elem := alt.Elements[0] var pattern string if elem.Literally { pattern = spec.EscapePattern(elem.Pattern) } else { pattern = elem.Pattern } var modes []spec.LexModeName var skip bool var push spec.LexModeName var pop bool dirConsumed := map[string]struct{}{} for _, dir := range prod.Directives { if _, consumed := dirConsumed[dir.Name]; consumed { return nil, false, &verr.SpecError{ Cause: semErrDuplicateDir, Detail: dir.Name, Row: dir.Pos.Row, Col: dir.Pos.Col, }, nil } dirConsumed[dir.Name] = struct{}{} switch dir.Name { case "mode": if len(dir.Parameters) == 0 { return nil, false, &verr.SpecError{ Cause: semErrDirInvalidParam, Detail: "'mode' directive needs an ID parameter", Row: dir.Pos.Row, Col: dir.Pos.Col, }, nil } for _, param := range dir.Parameters { if param.ID == "" { return nil, false, &verr.SpecError{ Cause: semErrDirInvalidParam, Detail: "'mode' directive needs an ID parameter", Row: param.Pos.Row, Col: param.Pos.Col, }, nil } modes = append(modes, spec.LexModeName(param.ID)) } case "skip": if len(dir.Parameters) > 0 { return nil, false, &verr.SpecError{ Cause: semErrDirInvalidParam, Detail: "'skip' directive needs no parameter", Row: dir.Pos.Row, Col: dir.Pos.Col, }, nil } skip = true case "push": if len(dir.Parameters) != 1 || dir.Parameters[0].ID == "" { return nil, false, &verr.SpecError{ Cause: semErrDirInvalidParam, Detail: "'push' directive needs an ID parameter", Row: dir.Pos.Row, Col: dir.Pos.Col, }, nil } push = spec.LexModeName(dir.Parameters[0].ID) case "pop": if len(dir.Parameters) > 0 { return nil, false, &verr.SpecError{ Cause: semErrDirInvalidParam, Detail: "'pop' directive needs no parameter", Row: dir.Pos.Row, Col: dir.Pos.Col, }, nil } pop = true default: return nil, false, &verr.SpecError{ Cause: semErrDirInvalidName, Detail: dir.Name, Row: dir.Pos.Row, Col: dir.Pos.Col, }, nil } } if len(alt.Directives) > 0 { return nil, false, &verr.SpecError{ Cause: semErrInvalidAltDir, Detail: "a lexical production cannot have alternative directives", Row: alt.Directives[0].Pos.Row, Col: alt.Directives[0].Pos.Col, }, nil } return &lexical.LexEntry{ Modes: modes, Kind: spec.LexKindName(prod.LHS), Pattern: pattern, Push: push, Pop: pop, }, skip, nil, nil } type productionsAndActions struct { prods *productionSet augStartSym symbol.Symbol astActs map[productionID][]*astActionEntry prodPrecsTerm map[productionID]symbol.Symbol prodPrecsOrdSym map[productionID]string prodPrecPoss map[productionID]*parser.Position recoverProds map[productionID]struct{} } func (b *GrammarBuilder) genProductionsAndActions(root *parser.RootNode, symTab *symbol.SymbolTableReader, errSym symbol.Symbol, augStartSym symbol.Symbol, startSym symbol.Symbol) (*productionsAndActions, error) { if len(root.Productions) == 0 { b.errs = append(b.errs, &verr.SpecError{ Cause: semErrNoProduction, }) return nil, nil } prods := newProductionSet() astActs := map[productionID][]*astActionEntry{} prodPrecsTerm := map[productionID]symbol.Symbol{} prodPrecsOrdSym := map[productionID]string{} prodPrecPoss := map[productionID]*parser.Position{} recoverProds := map[productionID]struct{}{} p, err := newProduction(augStartSym, []symbol.Symbol{ startSym, }) if err != nil { return nil, err } prods.append(p) for _, prod := range root.Productions { lhsSym, ok := symTab.ToSymbol(prod.LHS) if !ok { // All symbols are assumed to be pre-detected, so it's a bug if we cannot find them here. return nil, fmt.Errorf("symbol '%v' is undefined", prod.LHS) } if len(prod.Directives) > 0 { b.errs = append(b.errs, &verr.SpecError{ Cause: semErrInvalidProdDir, Detail: "a production cannot have production directives", Row: prod.Directives[0].Pos.Row, Col: prod.Directives[0].Pos.Col, }) continue } LOOP_RHS: for _, alt := range prod.RHS { altSyms := make([]symbol.Symbol, len(alt.Elements)) offsets := map[string]int{} ambiguousIDOffsets := map[string]struct{}{} for i, elem := range alt.Elements { sym, ok := symTab.ToSymbol(elem.ID) if !ok { b.errs = append(b.errs, &verr.SpecError{ Cause: semErrUndefinedSym, Detail: elem.ID, Row: elem.Pos.Row, Col: elem.Pos.Col, }) continue LOOP_RHS } altSyms[i] = sym if elem.Label != nil { if _, added := offsets[elem.Label.Name]; added { b.errs = append(b.errs, &verr.SpecError{ Cause: semErrDuplicateLabel, Detail: elem.Label.Name, Row: elem.Label.Pos.Row, Col: elem.Label.Pos.Col, }) continue LOOP_RHS } if _, found := symTab.ToSymbol(elem.Label.Name); found { b.errs = append(b.errs, &verr.SpecError{ Cause: semErrInvalidLabel, Detail: elem.Label.Name, Row: elem.Label.Pos.Row, Col: elem.Label.Pos.Col, }) continue LOOP_RHS } offsets[elem.Label.Name] = i } // A symbol having a label can be specified by both the label and the symbol name. // So record the symbol's position, whether or not it has a label. if elem.ID != "" { if _, exist := offsets[elem.ID]; exist { // When the same symbol appears multiple times in an alternative, the symbol is ambiguous. When we need // to specify the symbol in a directive, we cannot use the name of the ambiguous symbol. Instead, specify // a label to resolve the ambiguity. delete(offsets, elem.ID) ambiguousIDOffsets[elem.ID] = struct{}{} } else { offsets[elem.ID] = i } } } p, err := newProduction(lhsSym, altSyms) if err != nil { return nil, err } if _, exist := prods.findByID(p.id); exist { // Report the line number of a duplicate alternative. // When the alternative is empty, we report the position of its LHS. var row int var col int if len(alt.Elements) > 0 { row = alt.Elements[0].Pos.Row col = alt.Elements[0].Pos.Col } else { row = prod.Pos.Row col = prod.Pos.Col } var detail string { var b strings.Builder fmt.Fprintf(&b, "%v →", prod.LHS) for _, elem := range alt.Elements { switch { case elem.ID != "": fmt.Fprintf(&b, " %v", elem.ID) case elem.Pattern != "": fmt.Fprintf(&b, ` "%v"`, elem.Pattern) } } if len(alt.Elements) == 0 { fmt.Fprintf(&b, " ε") } detail = b.String() } b.errs = append(b.errs, &verr.SpecError{ Cause: semErrDuplicateProduction, Detail: detail, Row: row, Col: col, }) continue LOOP_RHS } prods.append(p) dirConsumed := map[string]struct{}{} for _, dir := range alt.Directives { if _, consumed := dirConsumed[dir.Name]; consumed { b.errs = append(b.errs, &verr.SpecError{ Cause: semErrDuplicateDir, Detail: dir.Name, Row: dir.Pos.Row, Col: dir.Pos.Col, }) } dirConsumed[dir.Name] = struct{}{} switch dir.Name { case "ast": if len(dir.Parameters) == 0 { b.errs = append(b.errs, &verr.SpecError{ Cause: semErrDirInvalidParam, Detail: "'ast' directive needs at least one parameter", Row: dir.Pos.Row, Col: dir.Pos.Col, }) continue LOOP_RHS } astAct := make([]*astActionEntry, len(dir.Parameters)) consumedOffsets := map[int]struct{}{} for i, param := range dir.Parameters { if param.ID == "" { b.errs = append(b.errs, &verr.SpecError{ Cause: semErrDirInvalidParam, Detail: "'ast' directive can take only ID parameters", Row: dir.Pos.Row, Col: dir.Pos.Col, }) continue LOOP_RHS } if _, ambiguous := ambiguousIDOffsets[param.ID]; ambiguous { b.errs = append(b.errs, &verr.SpecError{ Cause: semErrAmbiguousElem, Detail: fmt.Sprintf("'%v' is ambiguous", param.ID), Row: param.Pos.Row, Col: param.Pos.Col, }) continue LOOP_RHS } offset, ok := offsets[param.ID] if !ok { b.errs = append(b.errs, &verr.SpecError{ Cause: semErrDirInvalidParam, Detail: fmt.Sprintf("a symbol was not found in an alternative: %v", param.ID), Row: param.Pos.Row, Col: param.Pos.Col, }) continue LOOP_RHS } if _, consumed := consumedOffsets[offset]; consumed { b.errs = append(b.errs, &verr.SpecError{ Cause: semErrDuplicateElem, Detail: param.ID, Row: param.Pos.Row, Col: param.Pos.Col, }) continue LOOP_RHS } consumedOffsets[offset] = struct{}{} if param.Expansion { elem := alt.Elements[offset] if elem.Pattern != "" { // Currently, it is a bug to reach here because it is // forbidden to have anything other than ID appear in // production rules. b.errs = append(b.errs, &verr.SpecError{ Cause: semErrDirInvalidParam, Detail: fmt.Sprintf("the expansion symbol cannot be applied to a pattern (%v: \"%v\")", param.ID, elem.Pattern), Row: param.Pos.Row, Col: param.Pos.Col, }) continue LOOP_RHS } elemSym, ok := symTab.ToSymbol(elem.ID) if !ok { // If the symbol was not found, it's a bug. return nil, fmt.Errorf("a symbol corresponding to an ID (%v) was not found", elem.ID) } if elemSym.IsTerminal() { b.errs = append(b.errs, &verr.SpecError{ Cause: semErrDirInvalidParam, Detail: fmt.Sprintf("the expansion symbol cannot be applied to a terminal symbol (%v: %v)", param.ID, elem.ID), Row: param.Pos.Row, Col: param.Pos.Col, }) continue LOOP_RHS } } astAct[i] = &astActionEntry{ position: offset + 1, expansion: param.Expansion, } } astActs[p.id] = astAct case "prec": if len(dir.Parameters) != 1 || (dir.Parameters[0].ID == "" && dir.Parameters[0].OrderedSymbol == "") { b.errs = append(b.errs, &verr.SpecError{ Cause: semErrDirInvalidParam, Detail: "'prec' directive needs just one ID parameter or ordered symbol", Row: dir.Pos.Row, Col: dir.Pos.Col, }) continue LOOP_RHS } param := dir.Parameters[0] switch { case param.ID != "": sym, ok := symTab.ToSymbol(param.ID) if !ok { b.errs = append(b.errs, &verr.SpecError{ Cause: semErrDirInvalidParam, Detail: fmt.Sprintf("unknown terminal symbol: %v", param.ID), Row: param.Pos.Row, Col: param.Pos.Col, }) continue LOOP_RHS } if sym == errSym { b.errs = append(b.errs, &verr.SpecError{ Cause: semErrDirInvalidParam, Detail: fmt.Sprintf("'%v' directive cannot be applied to an error symbol", dir.Name), Row: param.Pos.Row, Col: param.Pos.Col, }) } if !sym.IsTerminal() { b.errs = append(b.errs, &verr.SpecError{ Cause: semErrDirInvalidParam, Detail: fmt.Sprintf("the symbol must be a terminal: %v", param.ID), Row: param.Pos.Row, Col: param.Pos.Col, }) continue LOOP_RHS } prodPrecsTerm[p.id] = sym prodPrecPoss[p.id] = ¶m.Pos case param.OrderedSymbol != "": prodPrecsOrdSym[p.id] = param.OrderedSymbol prodPrecPoss[p.id] = ¶m.Pos } case "recover": if len(dir.Parameters) > 0 { b.errs = append(b.errs, &verr.SpecError{ Cause: semErrDirInvalidParam, Detail: "'recover' directive needs no parameter", Row: dir.Pos.Row, Col: dir.Pos.Col, }) continue LOOP_RHS } recoverProds[p.id] = struct{}{} default: b.errs = append(b.errs, &verr.SpecError{ Cause: semErrDirInvalidName, Detail: fmt.Sprintf("invalid directive name '%v'", dir.Name), Row: dir.Pos.Row, Col: dir.Pos.Col, }) continue LOOP_RHS } } } } return &productionsAndActions{ prods: prods, augStartSym: augStartSym, astActs: astActs, prodPrecsTerm: prodPrecsTerm, prodPrecsOrdSym: prodPrecsOrdSym, prodPrecPoss: prodPrecPoss, recoverProds: recoverProds, }, nil } func (b *GrammarBuilder) genPrecAndAssoc(symTab *symbol.SymbolTableReader, errSym symbol.Symbol, prodsAndActs *productionsAndActions) (*precAndAssoc, error) { termPrec := map[symbol.SymbolNum]int{} termAssoc := map[symbol.SymbolNum]assocType{} ordSymPrec := map[string]int{} { var precGroup []*parser.DirectiveNode for _, dir := range b.AST.Directives { if dir.Name == "prec" { if dir.Parameters == nil || len(dir.Parameters) != 1 || dir.Parameters[0].Group == nil { b.errs = append(b.errs, &verr.SpecError{ Cause: semErrDirInvalidParam, Detail: "'prec' needs just one directive group", Row: dir.Pos.Row, Col: dir.Pos.Col, }) continue } precGroup = dir.Parameters[0].Group continue } if dir.Name != "name" && dir.Name != "prec" { b.errs = append(b.errs, &verr.SpecError{ Cause: semErrDirInvalidName, Detail: dir.Name, Row: dir.Pos.Row, Col: dir.Pos.Col, }) continue } } precN := precMin for _, dir := range precGroup { var assocTy assocType switch dir.Name { case "left": assocTy = assocTypeLeft case "right": assocTy = assocTypeRight case "assign": assocTy = assocTypeNil default: b.errs = append(b.errs, &verr.SpecError{ Cause: semErrDirInvalidName, Detail: dir.Name, Row: dir.Pos.Row, Col: dir.Pos.Col, }) return nil, nil } if len(dir.Parameters) == 0 { b.errs = append(b.errs, &verr.SpecError{ Cause: semErrDirInvalidParam, Detail: "associativity needs at least one symbol", Row: dir.Pos.Row, Col: dir.Pos.Col, }) return nil, nil } ASSOC_PARAM_LOOP: for _, p := range dir.Parameters { switch { case p.ID != "": sym, ok := symTab.ToSymbol(p.ID) if !ok { b.errs = append(b.errs, &verr.SpecError{ Cause: semErrDirInvalidParam, Detail: fmt.Sprintf("'%v' is undefined", p.ID), Row: p.Pos.Row, Col: p.Pos.Col, }) return nil, nil } if sym == errSym { b.errs = append(b.errs, &verr.SpecError{ Cause: semErrDirInvalidParam, Detail: fmt.Sprintf("'%v' directive cannot be applied to an error symbol", dir.Name), Row: p.Pos.Row, Col: p.Pos.Col, }) return nil, nil } if !sym.IsTerminal() { b.errs = append(b.errs, &verr.SpecError{ Cause: semErrDirInvalidParam, Detail: fmt.Sprintf("associativity can take only terminal symbol ('%v' is a non-terminal)", p.ID), Row: p.Pos.Row, Col: p.Pos.Col, }) return nil, nil } if prec, alreadySet := termPrec[sym.Num()]; alreadySet { if prec == precN { b.errs = append(b.errs, &verr.SpecError{ Cause: semErrDuplicateAssoc, Detail: fmt.Sprintf("'%v' already has the same associativity and precedence", p.ID), Row: p.Pos.Row, Col: p.Pos.Col, }) } else if assoc := termAssoc[sym.Num()]; assoc == assocTy { b.errs = append(b.errs, &verr.SpecError{ Cause: semErrDuplicateAssoc, Detail: fmt.Sprintf("'%v' already has different precedence", p.ID), Row: p.Pos.Row, Col: p.Pos.Col, }) } else { b.errs = append(b.errs, &verr.SpecError{ Cause: semErrDuplicateAssoc, Detail: fmt.Sprintf("'%v' already has different associativity and precedence", p.ID), Row: p.Pos.Row, Col: p.Pos.Col, }) } break ASSOC_PARAM_LOOP } termPrec[sym.Num()] = precN termAssoc[sym.Num()] = assocTy case p.OrderedSymbol != "": if prec, alreadySet := ordSymPrec[p.OrderedSymbol]; alreadySet { if prec == precN { b.errs = append(b.errs, &verr.SpecError{ Cause: semErrDuplicateAssoc, Detail: fmt.Sprintf("'$%v' already has the same precedence", p.OrderedSymbol), Row: p.Pos.Row, Col: p.Pos.Col, }) } else { b.errs = append(b.errs, &verr.SpecError{ Cause: semErrDuplicateAssoc, Detail: fmt.Sprintf("'$%v' already has different precedence", p.OrderedSymbol), Row: p.Pos.Row, Col: p.Pos.Col, }) } break ASSOC_PARAM_LOOP } ordSymPrec[p.OrderedSymbol] = precN default: b.errs = append(b.errs, &verr.SpecError{ Cause: semErrDirInvalidParam, Detail: "a parameter must be an ID or an ordered symbol", Row: p.Pos.Row, Col: p.Pos.Col, }) return nil, nil } } precN++ } } if len(b.errs) > 0 { return nil, nil } prodPrec := map[productionNum]int{} prodAssoc := map[productionNum]assocType{} for _, prod := range prodsAndActs.prods.getAllProductions() { // A #prec directive changes only precedence, not associativity. if term, ok := prodsAndActs.prodPrecsTerm[prod.id]; ok { if prec, ok := termPrec[term.Num()]; ok { prodPrec[prod.num] = prec prodAssoc[prod.num] = assocTypeNil } else { text, _ := symTab.ToText(term) b.errs = append(b.errs, &verr.SpecError{ Cause: semErrUndefinedPrec, Detail: text, Row: prodsAndActs.prodPrecPoss[prod.id].Row, Col: prodsAndActs.prodPrecPoss[prod.id].Col, }) } } else if ordSym, ok := prodsAndActs.prodPrecsOrdSym[prod.id]; ok { if prec, ok := ordSymPrec[ordSym]; ok { prodPrec[prod.num] = prec prodAssoc[prod.num] = assocTypeNil } else { b.errs = append(b.errs, &verr.SpecError{ Cause: semErrUndefinedOrdSym, Detail: fmt.Sprintf("$%v", ordSym), Row: prodsAndActs.prodPrecPoss[prod.id].Row, Col: prodsAndActs.prodPrecPoss[prod.id].Col, }) } } else { // A production inherits precedence and associativity from the right-most terminal symbol. mostrightTerm := symbol.SymbolNil for _, sym := range prod.rhs { if !sym.IsTerminal() { continue } mostrightTerm = sym } if !mostrightTerm.IsNil() { prodPrec[prod.num] = termPrec[mostrightTerm.Num()] prodAssoc[prod.num] = termAssoc[mostrightTerm.Num()] } } } if len(b.errs) > 0 { return nil, nil } return &precAndAssoc{ termPrec: termPrec, termAssoc: termAssoc, prodPrec: prodPrec, prodAssoc: prodAssoc, }, nil } func compile(gram *Grammar, opts ...BuildOption) (*spec.CompiledGrammar, *spec.Report, error) { config := &buildConfig{} for _, opt := range opts { opt(config) } lexSpec, err, cErrs := lexical.Compile(gram.lexSpec, lexical.CompressionLevelMax) if err != nil { if len(cErrs) > 0 { var b strings.Builder writeCompileError(&b, cErrs[0]) for _, cerr := range cErrs[1:] { fmt.Fprintf(&b, "\n") writeCompileError(&b, cerr) } return nil, nil, fmt.Errorf(b.String()) } return nil, nil, err } kind2Term := make([]int, len(lexSpec.KindNames)) for i, k := range lexSpec.KindNames { if k == spec.LexKindNameNil { kind2Term[spec.LexKindIDNil] = symbol.SymbolNil.Num().Int() continue } sym, ok := gram.symbolTable.ToSymbol(k.String()) if !ok { return nil, nil, fmt.Errorf("terminal symbol '%v' was not found in a symbol table", k) } kind2Term[i] = sym.Num().Int() } termTexts, err := gram.symbolTable.TerminalTexts() if err != nil { return nil, nil, err } var termSkip []int { r := gram.symbolTable.Reader() // I want to use gram.symbolTable.terminalSymbols() here instead of gram.symbolTable.terminalTexts(), // but gram.symbolTable.terminalSymbols() is different in length from terminalTexts // because it does not contain a predefined symbol, like EOF. // Therefore, we use terminalTexts, although it takes more time to lookup for symbols. termSkip = make([]int, len(termTexts)) for _, t := range termTexts { s, _ := r.ToSymbol(t) for _, sk := range gram.skipSymbols { if s != sk { continue } termSkip[s.Num()] = 1 break } } } nonTerms, err := gram.symbolTable.NonTerminalTexts() if err != nil { return nil, nil, err } firstSet, err := genFirstSet(gram.productionSet) if err != nil { return nil, nil, err } lr0, err := genLR0Automaton(gram.productionSet, gram.augmentedStartSymbol, gram.errorSymbol) if err != nil { return nil, nil, err } var tab *ParsingTable var report *spec.Report { lalr1, err := genLALR1Automaton(lr0, gram.productionSet, firstSet) if err != nil { return nil, nil, err } b := &lrTableBuilder{ automaton: lalr1.lr0Automaton, prods: gram.productionSet, termCount: len(termTexts), nonTermCount: len(nonTerms), symTab: gram.symbolTable, precAndAssoc: gram.precAndAssoc, } tab, err = b.build() if err != nil { return nil, nil, err } if config.isReportingEnabled { report, err = b.genReport(tab, gram) if err != nil { return nil, nil, err } } } action := make([]int, len(tab.actionTable)) for i, e := range tab.actionTable { action[i] = int(e) } goTo := make([]int, len(tab.goToTable)) for i, e := range tab.goToTable { goTo[i] = int(e) } lhsSyms := make([]int, len(gram.productionSet.getAllProductions())+1) altSymCounts := make([]int, len(gram.productionSet.getAllProductions())+1) recoverProds := make([]int, len(gram.productionSet.getAllProductions())+1) astActEnties := make([][]int, len(gram.productionSet.getAllProductions())+1) for _, p := range gram.productionSet.getAllProductions() { lhsSyms[p.num] = p.lhs.Num().Int() altSymCounts[p.num] = p.rhsLen if _, ok := gram.recoverProductions[p.id]; ok { recoverProds[p.num] = 1 } astAct, ok := gram.astActions[p.id] if !ok { continue } astActEntry := make([]int, len(astAct)) for i, e := range astAct { if e.expansion { astActEntry[i] = e.position * -1 } else { astActEntry[i] = e.position } } astActEnties[p.num] = astActEntry } return &spec.CompiledGrammar{ Name: gram.name, Lexical: lexSpec, Syntactic: &spec.SyntacticSpec{ Action: action, GoTo: goTo, StateCount: tab.stateCount, InitialState: tab.InitialState.Int(), StartProduction: productionNumStart.Int(), LHSSymbols: lhsSyms, AlternativeSymbolCounts: altSymCounts, Terminals: termTexts, TerminalCount: tab.terminalCount, TerminalSkip: termSkip, KindToTerminal: kind2Term, NonTerminals: nonTerms, NonTerminalCount: tab.nonTerminalCount, EOFSymbol: symbol.SymbolEOF.Num().Int(), ErrorSymbol: gram.errorSymbol.Num().Int(), ErrorTrapperStates: tab.errorTrapperStates, RecoverProductions: recoverProds, }, ASTAction: &spec.ASTAction{ Entries: astActEnties, }, }, report, nil } func writeCompileError(w io.Writer, cErr *lexical.CompileError) { if cErr.Fragment { fmt.Fprintf(w, "fragment ") } fmt.Fprintf(w, "%v: %v", cErr.Kind, cErr.Cause) if cErr.Detail != "" { fmt.Fprintf(w, ": %v", cErr.Detail) } } type lrItemID [32]byte func (id lrItemID) String() string { return fmt.Sprintf("%x", id.num()) } func (id lrItemID) num() uint32 { return binary.LittleEndian.Uint32(id[:]) } type lookAhead struct { symbols map[symbol.Symbol]struct{} // When propagation is true, an item propagates look-ahead symbols to other items. propagation bool } type lrItem struct { id lrItemID prod productionID // E → E + T // // Dot | Dotted Symbol | Item // ----+---------------+------------ // 0 | E | E →・E + T // 1 | + | E → E・+ T // 2 | T | E → E +・T // 3 | Nil | E → E + T・ dot int dottedSymbol symbol.Symbol // When initial is true, the LHS of the production is the augmented start symbol and dot is 0. // It looks like S' →・S. initial bool // When reducible is true, the item looks like E → E + T・. reducible bool // When kernel is true, the item is kernel item. kernel bool // lookAhead stores look-ahead symbols, and they are terminal symbols. // The item is reducible only when the look-ahead symbols appear as the next input symbol. lookAhead lookAhead } func newLR0Item(prod *production, dot int) (*lrItem, error) { if prod == nil { return nil, fmt.Errorf("production must be non-nil") } if dot < 0 || dot > prod.rhsLen { return nil, fmt.Errorf("dot must be between 0 and %v", prod.rhsLen) } var id lrItemID { b := []byte{} b = append(b, prod.id[:]...) bDot := make([]byte, 8) binary.LittleEndian.PutUint64(bDot, uint64(dot)) b = append(b, bDot...) id = sha256.Sum256(b) } dottedSymbol := symbol.SymbolNil if dot < prod.rhsLen { dottedSymbol = prod.rhs[dot] } initial := false if prod.lhs.IsStart() && dot == 0 { initial = true } reducible := false if dot == prod.rhsLen { reducible = true } kernel := false if initial || dot > 0 { kernel = true } item := &lrItem{ id: id, prod: prod.id, dot: dot, dottedSymbol: dottedSymbol, initial: initial, reducible: reducible, kernel: kernel, } return item, nil } type kernelID [32]byte func (id kernelID) String() string { return fmt.Sprintf("%x", binary.LittleEndian.Uint32(id[:])) } type kernel struct { id kernelID items []*lrItem } func newKernel(items []*lrItem) (*kernel, error) { if len(items) == 0 { return nil, fmt.Errorf("a kernel need at least one item") } // Remove duplicates from items. var sortedItems []*lrItem { m := map[lrItemID]*lrItem{} for _, item := range items { if !item.kernel { return nil, fmt.Errorf("not a kernel item: %v", item) } m[item.id] = item } sortedItems = []*lrItem{} for _, item := range m { sortedItems = append(sortedItems, item) } sort.Slice(sortedItems, func(i, j int) bool { return sortedItems[i].id.num() < sortedItems[j].id.num() }) } var id kernelID { b := []byte{} for _, item := range sortedItems { b = append(b, item.id[:]...) } id = sha256.Sum256(b) } return &kernel{ id: id, items: sortedItems, }, nil } type stateNum int const stateNumInitial = stateNum(0) func (n stateNum) Int() int { return int(n) } func (n stateNum) String() string { return strconv.Itoa(int(n)) } func (n stateNum) next() stateNum { return stateNum(n + 1) } type lrState struct { *kernel num stateNum next map[symbol.Symbol]kernelID reducible map[productionID]struct{} // emptyProdItems stores items that have an empty production like `p → ε` and is reducible. // Thus the items emptyProdItems stores are like `p → ・ε`. emptyProdItems is needed to store // look-ahead symbols because the kernel items don't include these items. // // For instance, we have the following productions, and A is a terminal symbol. // // s' → s // s → A | ε // // CLOSURE({s' → ・s}) generates the following closure, but the kernel of this closure doesn't // include `s → ・ε`. // // s' → ・s // s → ・A // s → ・ε emptyProdItems []*lrItem // When isErrorTrapper is `true`, the item can shift the `error` symbol. The item has the following form. // The `α` and `β` can be empty. // // A → α・error β isErrorTrapper bool } type stateAndLRItem struct { kernelID kernelID itemID lrItemID } type propagation struct { src *stateAndLRItem dest []*stateAndLRItem } type lalr1Automaton struct { *lr0Automaton } func genLALR1Automaton(lr0 *lr0Automaton, prods *productionSet, first *firstSet) (*lalr1Automaton, error) { // Set the look-ahead symbol to the initial item: [S' → ・S, $] iniState := lr0.states[lr0.initialState] iniState.items[0].lookAhead.symbols = map[symbol.Symbol]struct{}{ symbol.SymbolEOF: {}, } var props []*propagation for _, state := range lr0.states { for _, kItem := range state.items { items, err := genLALR1Closure(kItem, prods, first) if err != nil { return nil, err } kItem.lookAhead.propagation = true var propDests []*stateAndLRItem for _, item := range items { if item.reducible { p, ok := prods.findByID(item.prod) if !ok { return nil, fmt.Errorf("production not found: %v", item.prod) } if p.isEmpty() { var reducibleItem *lrItem for _, it := range state.emptyProdItems { if it.id != item.id { continue } reducibleItem = it break } if reducibleItem == nil { return nil, fmt.Errorf("reducible item not found: %v", item.id) } if reducibleItem.lookAhead.symbols == nil { reducibleItem.lookAhead.symbols = map[symbol.Symbol]struct{}{} } for a := range item.lookAhead.symbols { reducibleItem.lookAhead.symbols[a] = struct{}{} } propDests = append(propDests, &stateAndLRItem{ kernelID: state.id, itemID: item.id, }) } continue } nextKID := state.next[item.dottedSymbol] var nextItemID lrItemID { p, ok := prods.findByID(item.prod) if !ok { return nil, fmt.Errorf("production not found: %v", item.prod) } it, err := newLR0Item(p, item.dot+1) if err != nil { return nil, fmt.Errorf("failed to generate an item ID: %v", err) } nextItemID = it.id } if item.lookAhead.propagation { propDests = append(propDests, &stateAndLRItem{ kernelID: nextKID, itemID: nextItemID, }) } else { nextState := lr0.states[nextKID] var nextItem *lrItem for _, it := range nextState.items { if it.id != nextItemID { continue } nextItem = it break } if nextItem == nil { return nil, fmt.Errorf("item not found: %v", nextItemID) } if nextItem.lookAhead.symbols == nil { nextItem.lookAhead.symbols = map[symbol.Symbol]struct{}{} } for a := range item.lookAhead.symbols { nextItem.lookAhead.symbols[a] = struct{}{} } } } if len(propDests) == 0 { continue } props = append(props, &propagation{ src: &stateAndLRItem{ kernelID: state.id, itemID: kItem.id, }, dest: propDests, }) } } err := propagateLookAhead(lr0, props) if err != nil { return nil, fmt.Errorf("failed to propagate look-ahead symbols: %v", err) } return &lalr1Automaton{ lr0Automaton: lr0, }, nil } func genLALR1Closure(srcItem *lrItem, prods *productionSet, first *firstSet) ([]*lrItem, error) { items := []*lrItem{} knownItems := map[lrItemID]map[symbol.Symbol]struct{}{} knownItemsProp := map[lrItemID]struct{}{} uncheckedItems := []*lrItem{} items = append(items, srcItem) uncheckedItems = append(uncheckedItems, srcItem) for len(uncheckedItems) > 0 { nextUncheckedItems := []*lrItem{} for _, item := range uncheckedItems { if item.dottedSymbol.IsTerminal() { continue } p, ok := prods.findByID(item.prod) if !ok { return nil, fmt.Errorf("production not found: %v", item.prod) } var fstSyms []symbol.Symbol var isFstNullable bool { fst, err := first.find(p, item.dot+1) if err != nil { return nil, err } fstSyms = make([]symbol.Symbol, len(fst.symbols)) i := 0 for s := range fst.symbols { fstSyms[i] = s i++ } if fst.empty { isFstNullable = true } } ps, _ := prods.findByLHS(item.dottedSymbol) for _, prod := range ps { var lookAhead []symbol.Symbol { var lookAheadCount int if isFstNullable { lookAheadCount = len(fstSyms) + len(item.lookAhead.symbols) } else { lookAheadCount = len(fstSyms) } lookAhead = make([]symbol.Symbol, lookAheadCount) i := 0 for _, s := range fstSyms { lookAhead[i] = s i++ } if isFstNullable { for a := range item.lookAhead.symbols { lookAhead[i] = a i++ } } } for _, a := range lookAhead { newItem, err := newLR0Item(prod, 0) if err != nil { return nil, err } if items, exist := knownItems[newItem.id]; exist { if _, exist := items[a]; exist { continue } } newItem.lookAhead.symbols = map[symbol.Symbol]struct{}{ a: {}, } items = append(items, newItem) if knownItems[newItem.id] == nil { knownItems[newItem.id] = map[symbol.Symbol]struct{}{} } knownItems[newItem.id][a] = struct{}{} nextUncheckedItems = append(nextUncheckedItems, newItem) } if isFstNullable { newItem, err := newLR0Item(prod, 0) if err != nil { return nil, err } if _, exist := knownItemsProp[newItem.id]; exist { continue } newItem.lookAhead.propagation = true items = append(items, newItem) knownItemsProp[newItem.id] = struct{}{} nextUncheckedItems = append(nextUncheckedItems, newItem) } } } uncheckedItems = nextUncheckedItems } return items, nil } func propagateLookAhead(lr0 *lr0Automaton, props []*propagation) error { for { changed := false for _, prop := range props { srcState, ok := lr0.states[prop.src.kernelID] if !ok { return fmt.Errorf("source state not found: %v", prop.src.kernelID) } var srcItem *lrItem for _, item := range srcState.items { if item.id != prop.src.itemID { continue } srcItem = item break } if srcItem == nil { return fmt.Errorf("source item not found: %v", prop.src.itemID) } for _, dest := range prop.dest { destState, ok := lr0.states[dest.kernelID] if !ok { return fmt.Errorf("destination state not found: %v", dest.kernelID) } var destItem *lrItem for _, item := range destState.items { if item.id != dest.itemID { continue } destItem = item break } if destItem == nil { for _, item := range destState.emptyProdItems { if item.id != dest.itemID { continue } destItem = item break } if destItem == nil { return fmt.Errorf("destination item not found: %v", dest.itemID) } } for a := range srcItem.lookAhead.symbols { if _, ok := destItem.lookAhead.symbols[a]; ok { continue } if destItem.lookAhead.symbols == nil { destItem.lookAhead.symbols = map[symbol.Symbol]struct{}{} } destItem.lookAhead.symbols[a] = struct{}{} changed = true } } } if !changed { break } } return nil } type lr0Automaton struct { initialState kernelID states map[kernelID]*lrState } func genLR0Automaton(prods *productionSet, startSym symbol.Symbol, errSym symbol.Symbol) (*lr0Automaton, error) { if !startSym.IsStart() { return nil, fmt.Errorf("passed symbold is not a start symbol") } automaton := &lr0Automaton{ states: map[kernelID]*lrState{}, } currentState := stateNumInitial knownKernels := map[kernelID]struct{}{} uncheckedKernels := []*kernel{} // Generate an initial kernel. { prods, _ := prods.findByLHS(startSym) initialItem, err := newLR0Item(prods[0], 0) if err != nil { return nil, err } k, err := newKernel([]*lrItem{initialItem}) if err != nil { return nil, err } automaton.initialState = k.id knownKernels[k.id] = struct{}{} uncheckedKernels = append(uncheckedKernels, k) } for len(uncheckedKernels) > 0 { nextUncheckedKernels := []*kernel{} for _, k := range uncheckedKernels { state, neighbours, err := genStateAndNeighbourKernels(k, prods, errSym) if err != nil { return nil, err } state.num = currentState currentState = currentState.next() automaton.states[state.id] = state for _, k := range neighbours { if _, known := knownKernels[k.id]; known { continue } knownKernels[k.id] = struct{}{} nextUncheckedKernels = append(nextUncheckedKernels, k) } } uncheckedKernels = nextUncheckedKernels } return automaton, nil } func genStateAndNeighbourKernels(k *kernel, prods *productionSet, errSym symbol.Symbol) (*lrState, []*kernel, error) { items, err := genLR0Closure(k, prods) if err != nil { return nil, nil, err } neighbours, err := genNeighbourKernels(items, prods) if err != nil { return nil, nil, err } next := map[symbol.Symbol]kernelID{} kernels := []*kernel{} for _, n := range neighbours { next[n.symbol] = n.kernel.id kernels = append(kernels, n.kernel) } reducible := map[productionID]struct{}{} var emptyProdItems []*lrItem isErrorTrapper := false for _, item := range items { if item.dottedSymbol == errSym { isErrorTrapper = true } if item.reducible { reducible[item.prod] = struct{}{} prod, ok := prods.findByID(item.prod) if !ok { return nil, nil, fmt.Errorf("reducible production not found: %v", item.prod) } if prod.isEmpty() { emptyProdItems = append(emptyProdItems, item) } } } return &lrState{ kernel: k, next: next, reducible: reducible, emptyProdItems: emptyProdItems, isErrorTrapper: isErrorTrapper, }, kernels, nil } func genLR0Closure(k *kernel, prods *productionSet) ([]*lrItem, error) { items := []*lrItem{} knownItems := map[lrItemID]struct{}{} uncheckedItems := []*lrItem{} for _, item := range k.items { items = append(items, item) uncheckedItems = append(uncheckedItems, item) } for len(uncheckedItems) > 0 { nextUncheckedItems := []*lrItem{} for _, item := range uncheckedItems { if item.dottedSymbol.IsTerminal() { continue } ps, _ := prods.findByLHS(item.dottedSymbol) for _, prod := range ps { item, err := newLR0Item(prod, 0) if err != nil { return nil, err } if _, exist := knownItems[item.id]; exist { continue } items = append(items, item) knownItems[item.id] = struct{}{} nextUncheckedItems = append(nextUncheckedItems, item) } } uncheckedItems = nextUncheckedItems } return items, nil } type neighbourKernel struct { symbol symbol.Symbol kernel *kernel } func genNeighbourKernels(items []*lrItem, prods *productionSet) ([]*neighbourKernel, error) { kItemMap := map[symbol.Symbol][]*lrItem{} for _, item := range items { if item.dottedSymbol.IsNil() { continue } prod, ok := prods.findByID(item.prod) if !ok { return nil, fmt.Errorf("a production was not found: %v", item.prod) } kItem, err := newLR0Item(prod, item.dot+1) if err != nil { return nil, err } kItemMap[item.dottedSymbol] = append(kItemMap[item.dottedSymbol], kItem) } nextSyms := []symbol.Symbol{} for sym := range kItemMap { nextSyms = append(nextSyms, sym) } sort.Slice(nextSyms, func(i, j int) bool { return nextSyms[i] < nextSyms[j] }) kernels := []*neighbourKernel{} for _, sym := range nextSyms { k, err := newKernel(kItemMap[sym]) if err != nil { return nil, err } kernels = append(kernels, &neighbourKernel{ symbol: sym, kernel: k, }) } return kernels, nil } type ActionType string const ( ActionTypeShift = ActionType("shift") ActionTypeReduce = ActionType("reduce") ActionTypeError = ActionType("error") ) type actionEntry int const actionEntryEmpty = actionEntry(0) func newShiftActionEntry(state stateNum) actionEntry { return actionEntry(state * -1) } func newReduceActionEntry(prod productionNum) actionEntry { return actionEntry(prod) } func (e actionEntry) isEmpty() bool { return e == actionEntryEmpty } func (e actionEntry) describe() (ActionType, stateNum, productionNum) { if e == actionEntryEmpty { return ActionTypeError, stateNumInitial, productionNumNil } if e < 0 { return ActionTypeShift, stateNum(e * -1), productionNumNil } return ActionTypeReduce, stateNumInitial, productionNum(e) } type GoToType string const ( GoToTypeRegistered = GoToType("registered") GoToTypeError = GoToType("error") ) type goToEntry uint const goToEntryEmpty = goToEntry(0) func newGoToEntry(state stateNum) goToEntry { return goToEntry(state) } func (e goToEntry) describe() (GoToType, stateNum) { if e == goToEntryEmpty { return GoToTypeError, stateNumInitial } return GoToTypeRegistered, stateNum(e) } type conflictResolutionMethod int func (m conflictResolutionMethod) Int() int { return int(m) } const ( ResolvedByPrec conflictResolutionMethod = 1 ResolvedByAssoc conflictResolutionMethod = 2 ResolvedByShift conflictResolutionMethod = 3 ResolvedByProdOrder conflictResolutionMethod = 4 ) type conflict interface { conflict() } type shiftReduceConflict struct { state stateNum sym symbol.Symbol nextState stateNum prodNum productionNum resolvedBy conflictResolutionMethod } func (c *shiftReduceConflict) conflict() { } type reduceReduceConflict struct { state stateNum sym symbol.Symbol prodNum1 productionNum prodNum2 productionNum resolvedBy conflictResolutionMethod } func (c *reduceReduceConflict) conflict() { } var ( _ conflict = &shiftReduceConflict{} _ conflict = &reduceReduceConflict{} ) type ParsingTable struct { actionTable []actionEntry goToTable []goToEntry stateCount int terminalCount int nonTerminalCount int // errorTrapperStates's index means a state number, and when `errorTrapperStates[stateNum]` is `1`, // the state has an item having the following form. The `α` and `β` can be empty. // // A → α・error β errorTrapperStates []int InitialState stateNum } func (t *ParsingTable) getAction(state stateNum, sym symbol.SymbolNum) (ActionType, stateNum, productionNum) { pos := state.Int()*t.terminalCount + sym.Int() return t.actionTable[pos].describe() } func (t *ParsingTable) getGoTo(state stateNum, sym symbol.SymbolNum) (GoToType, stateNum) { pos := state.Int()*t.nonTerminalCount + sym.Int() return t.goToTable[pos].describe() } func (t *ParsingTable) readAction(row int, col int) actionEntry { return t.actionTable[row*t.terminalCount+col] } func (t *ParsingTable) writeAction(row int, col int, act actionEntry) { t.actionTable[row*t.terminalCount+col] = act } func (t *ParsingTable) writeGoTo(state stateNum, sym symbol.Symbol, nextState stateNum) { pos := state.Int()*t.nonTerminalCount + sym.Num().Int() t.goToTable[pos] = newGoToEntry(nextState) } type lrTableBuilder struct { automaton *lr0Automaton prods *productionSet termCount int nonTermCount int symTab *symbol.SymbolTableReader precAndAssoc *precAndAssoc conflicts []conflict } func (b *lrTableBuilder) build() (*ParsingTable, error) { var ptab *ParsingTable { initialState := b.automaton.states[b.automaton.initialState] ptab = &ParsingTable{ actionTable: make([]actionEntry, len(b.automaton.states)*b.termCount), goToTable: make([]goToEntry, len(b.automaton.states)*b.nonTermCount), stateCount: len(b.automaton.states), terminalCount: b.termCount, nonTerminalCount: b.nonTermCount, errorTrapperStates: make([]int, len(b.automaton.states)), InitialState: initialState.num, } } for _, state := range b.automaton.states { if state.isErrorTrapper { ptab.errorTrapperStates[state.num] = 1 } for sym, kID := range state.next { nextState := b.automaton.states[kID] if sym.IsTerminal() { b.writeShiftAction(ptab, state.num, sym, nextState.num) } else { ptab.writeGoTo(state.num, sym, nextState.num) } } for prodID := range state.reducible { reducibleProd, ok := b.prods.findByID(prodID) if !ok { return nil, fmt.Errorf("reducible production not found: %v", prodID) } var reducibleItem *lrItem for _, item := range state.items { if item.prod != reducibleProd.id { continue } reducibleItem = item break } if reducibleItem == nil { for _, item := range state.emptyProdItems { if item.prod != reducibleProd.id { continue } reducibleItem = item break } if reducibleItem == nil { return nil, fmt.Errorf("reducible item not found; state: %v, production: %v", state.num, reducibleProd.num) } } for a := range reducibleItem.lookAhead.symbols { b.writeReduceAction(ptab, state.num, a, reducibleProd.num) } } } return ptab, nil } // writeShiftAction writes a shift action to the parsing table. When a shift/reduce conflict occurred, // we prioritize the shift action. func (b *lrTableBuilder) writeShiftAction(tab *ParsingTable, state stateNum, sym symbol.Symbol, nextState stateNum) { act := tab.readAction(state.Int(), sym.Num().Int()) if !act.isEmpty() { ty, _, p := act.describe() if ty == ActionTypeReduce { act, method := b.resolveSRConflict(sym.Num(), p) b.conflicts = append(b.conflicts, &shiftReduceConflict{ state: state, sym: sym, nextState: nextState, prodNum: p, resolvedBy: method, }) if act == ActionTypeShift { tab.writeAction(state.Int(), sym.Num().Int(), newShiftActionEntry(nextState)) } return } } tab.writeAction(state.Int(), sym.Num().Int(), newShiftActionEntry(nextState)) } // writeReduceAction writes a reduce action to the parsing table. When a shift/reduce conflict occurred, // we prioritize the shift action, and when a reduce/reduce conflict we prioritize the action that reduces // the production with higher priority. Productions defined earlier in the grammar file have a higher priority. func (b *lrTableBuilder) writeReduceAction(tab *ParsingTable, state stateNum, sym symbol.Symbol, prod productionNum) { act := tab.readAction(state.Int(), sym.Num().Int()) if !act.isEmpty() { ty, s, p := act.describe() switch ty { case ActionTypeReduce: if p == prod { return } b.conflicts = append(b.conflicts, &reduceReduceConflict{ state: state, sym: sym, prodNum1: p, prodNum2: prod, resolvedBy: ResolvedByProdOrder, }) if p < prod { tab.writeAction(state.Int(), sym.Num().Int(), newReduceActionEntry(p)) } else { tab.writeAction(state.Int(), sym.Num().Int(), newReduceActionEntry(prod)) } case ActionTypeShift: act, method := b.resolveSRConflict(sym.Num(), prod) b.conflicts = append(b.conflicts, &shiftReduceConflict{ state: state, sym: sym, nextState: s, prodNum: prod, resolvedBy: method, }) if act == ActionTypeReduce { tab.writeAction(state.Int(), sym.Num().Int(), newReduceActionEntry(prod)) } } return } tab.writeAction(state.Int(), sym.Num().Int(), newReduceActionEntry(prod)) } func (b *lrTableBuilder) resolveSRConflict(sym symbol.SymbolNum, prod productionNum) (ActionType, conflictResolutionMethod) { symPrec := b.precAndAssoc.terminalPrecedence(sym) prodPrec := b.precAndAssoc.productionPredence(prod) if symPrec == 0 || prodPrec == 0 { return ActionTypeShift, ResolvedByShift } if symPrec == prodPrec { assoc := b.precAndAssoc.productionAssociativity(prod) if assoc != assocTypeLeft { return ActionTypeShift, ResolvedByAssoc } return ActionTypeReduce, ResolvedByAssoc } if symPrec < prodPrec { return ActionTypeShift, ResolvedByPrec } return ActionTypeReduce, ResolvedByPrec } func (b *lrTableBuilder) genReport(tab *ParsingTable, gram *Grammar) (*spec.Report, error) { var terms []*spec.Terminal { termSyms := b.symTab.TerminalSymbols() terms = make([]*spec.Terminal, len(termSyms)+1) for _, sym := range termSyms { name, ok := b.symTab.ToText(sym) if !ok { return nil, fmt.Errorf("failed to generate terminals: symbol not found: %v", sym) } term := &spec.Terminal{ Number: sym.Num().Int(), Name: name, } prec := b.precAndAssoc.terminalPrecedence(sym.Num()) if prec != precNil { term.Precedence = prec } assoc := b.precAndAssoc.terminalAssociativity(sym.Num()) switch assoc { case assocTypeLeft: term.Associativity = "l" case assocTypeRight: term.Associativity = "r" } terms[sym.Num()] = term } } var nonTerms []*spec.NonTerminal { nonTermSyms := b.symTab.NonTerminalSymbols() nonTerms = make([]*spec.NonTerminal, len(nonTermSyms)+1) for _, sym := range nonTermSyms { name, ok := b.symTab.ToText(sym) if !ok { return nil, fmt.Errorf("failed to generate non-terminals: symbol not found: %v", sym) } nonTerms[sym.Num()] = &spec.NonTerminal{ Number: sym.Num().Int(), Name: name, } } } var prods []*spec.Production { ps := gram.productionSet.getAllProductions() prods = make([]*spec.Production, len(ps)+1) for _, p := range ps { rhs := make([]int, len(p.rhs)) for i, e := range p.rhs { if e.IsTerminal() { rhs[i] = e.Num().Int() } else { rhs[i] = e.Num().Int() * -1 } } prod := &spec.Production{ Number: p.num.Int(), LHS: p.lhs.Num().Int(), RHS: rhs, } prec := b.precAndAssoc.productionPredence(p.num) if prec != precNil { prod.Precedence = prec } assoc := b.precAndAssoc.productionAssociativity(p.num) switch assoc { case assocTypeLeft: prod.Associativity = "l" case assocTypeRight: prod.Associativity = "r" } prods[p.num.Int()] = prod } } var states []*spec.State { srConflicts := map[stateNum][]*shiftReduceConflict{} rrConflicts := map[stateNum][]*reduceReduceConflict{} for _, con := range b.conflicts { switch c := con.(type) { case *shiftReduceConflict: srConflicts[c.state] = append(srConflicts[c.state], c) case *reduceReduceConflict: rrConflicts[c.state] = append(rrConflicts[c.state], c) } } states = make([]*spec.State, len(b.automaton.states)) for _, s := range b.automaton.states { kernel := make([]*spec.Item, len(s.items)) for i, item := range s.items { p, ok := b.prods.findByID(item.prod) if !ok { return nil, fmt.Errorf("failed to generate states: production of kernel item not found: %v", item.prod) } kernel[i] = &spec.Item{ Production: p.num.Int(), Dot: item.dot, } } sort.Slice(kernel, func(i, j int) bool { if kernel[i].Production < kernel[j].Production { return true } if kernel[i].Production > kernel[j].Production { return false } return kernel[i].Dot < kernel[j].Dot }) var shift []*spec.Transition var reduce []*spec.Reduce var goTo []*spec.Transition { TERMINALS_LOOP: for _, t := range b.symTab.TerminalSymbols() { act, next, prod := tab.getAction(s.num, t.Num()) switch act { case ActionTypeShift: shift = append(shift, &spec.Transition{ Symbol: t.Num().Int(), State: next.Int(), }) case ActionTypeReduce: for _, r := range reduce { if r.Production == prod.Int() { r.LookAhead = append(r.LookAhead, t.Num().Int()) continue TERMINALS_LOOP } } reduce = append(reduce, &spec.Reduce{ LookAhead: []int{t.Num().Int()}, Production: prod.Int(), }) } } for _, n := range b.symTab.NonTerminalSymbols() { ty, next := tab.getGoTo(s.num, n.Num()) if ty == GoToTypeRegistered { goTo = append(goTo, &spec.Transition{ Symbol: n.Num().Int(), State: next.Int(), }) } } sort.Slice(shift, func(i, j int) bool { return shift[i].State < shift[j].State }) sort.Slice(reduce, func(i, j int) bool { return reduce[i].Production < reduce[j].Production }) sort.Slice(goTo, func(i, j int) bool { return goTo[i].State < goTo[j].State }) } sr := []*spec.SRConflict{} rr := []*spec.RRConflict{} { for _, c := range srConflicts[s.num] { conflict := &spec.SRConflict{ Symbol: c.sym.Num().Int(), State: c.nextState.Int(), Production: c.prodNum.Int(), ResolvedBy: c.resolvedBy.Int(), } ty, s, p := tab.getAction(s.num, c.sym.Num()) switch ty { case ActionTypeShift: n := s.Int() conflict.AdoptedState = &n case ActionTypeReduce: n := p.Int() conflict.AdoptedProduction = &n } sr = append(sr, conflict) } sort.Slice(sr, func(i, j int) bool { return sr[i].Symbol < sr[j].Symbol }) for _, c := range rrConflicts[s.num] { conflict := &spec.RRConflict{ Symbol: c.sym.Num().Int(), Production1: c.prodNum1.Int(), Production2: c.prodNum2.Int(), ResolvedBy: c.resolvedBy.Int(), } _, _, p := tab.getAction(s.num, c.sym.Num()) conflict.AdoptedProduction = p.Int() rr = append(rr, conflict) } sort.Slice(rr, func(i, j int) bool { return rr[i].Symbol < rr[j].Symbol }) } states[s.num.Int()] = &spec.State{ Number: s.num.Int(), Kernel: kernel, Shift: shift, Reduce: reduce, GoTo: goTo, SRConflict: sr, RRConflict: rr, } } } return &spec.Report{ Terminals: terms, NonTerminals: nonTerms, Productions: prods, States: states, }, nil } type productionID [32]byte func (id productionID) String() string { return hex.EncodeToString(id[:]) } func genProductionID(lhs symbol.Symbol, rhs []symbol.Symbol) productionID { seq := lhs.Byte() for _, sym := range rhs { seq = append(seq, sym.Byte()...) } return productionID(sha256.Sum256(seq)) } type productionNum uint16 const ( productionNumNil = productionNum(0) productionNumStart = productionNum(1) productionNumMin = productionNum(2) ) func (n productionNum) Int() int { return int(n) } type production struct { id productionID num productionNum lhs symbol.Symbol rhs []symbol.Symbol rhsLen int } func newProduction(lhs symbol.Symbol, rhs []symbol.Symbol) (*production, error) { if lhs.IsNil() { return nil, fmt.Errorf("LHS must be a non-nil symbol; LHS: %v, RHS: %v", lhs, rhs) } for _, sym := range rhs { if sym.IsNil() { return nil, fmt.Errorf("a symbol of RHS must be a non-nil symbol; LHS: %v, RHS: %v", lhs, rhs) } } return &production{ id: genProductionID(lhs, rhs), lhs: lhs, rhs: rhs, rhsLen: len(rhs), }, nil } func (p *production) isEmpty() bool { return p.rhsLen == 0 } type productionSet struct { lhs2Prods map[symbol.Symbol][]*production id2Prod map[productionID]*production num productionNum } func newProductionSet() *productionSet { return &productionSet{ lhs2Prods: map[symbol.Symbol][]*production{}, id2Prod: map[productionID]*production{}, num: productionNumMin, } } func (ps *productionSet) append(prod *production) { if _, ok := ps.id2Prod[prod.id]; ok { return } if prod.lhs.IsStart() { prod.num = productionNumStart } else { prod.num = ps.num ps.num++ } if prods, ok := ps.lhs2Prods[prod.lhs]; ok { ps.lhs2Prods[prod.lhs] = append(prods, prod) } else { ps.lhs2Prods[prod.lhs] = []*production{prod} } ps.id2Prod[prod.id] = prod } func (ps *productionSet) findByID(id productionID) (*production, bool) { prod, ok := ps.id2Prod[id] return prod, ok } func (ps *productionSet) findByLHS(lhs symbol.Symbol) ([]*production, bool) { if lhs.IsNil() { return nil, false } prods, ok := ps.lhs2Prods[lhs] return prods, ok } func (ps *productionSet) getAllProductions() map[productionID]*production { return ps.id2Prod } var ( semErrNoGrammarName = errors.New("name is missing") semErrSpellingInconsistency = errors.New("the identifiers are treated as the same. please use the same spelling") semErrDuplicateAssoc = errors.New("associativity and precedence cannot be specified multiple times for a symbol") semErrUndefinedPrec = errors.New("symbol must has precedence") semErrUndefinedOrdSym = errors.New("undefined ordered symbol") semErrUnusedProduction = errors.New("unused production") semErrUnusedTerminal = errors.New("unused terminal") semErrTermCannotBeSkipped = errors.New("a terminal used in productions cannot be skipped") semErrNoProduction = errors.New("a grammar needs at least one production") semErrUndefinedSym = errors.New("undefined symbol") semErrDuplicateProduction = errors.New("duplicate production") semErrDuplicateTerminal = errors.New("duplicate terminal") semErrDuplicateFragment = errors.New("duplicate fragment") semErrDuplicateName = errors.New("duplicate names are not allowed between terminals and non-terminals") semErrErrSymIsReserved = errors.New("symbol 'error' is reserved as a terminal symbol") semErrDuplicateLabel = errors.New("a label must be unique in an alternative") semErrInvalidLabel = errors.New("a label must differ from terminal symbols or non-terminal symbols") semErrDirInvalidName = errors.New("invalid directive name") semErrDirInvalidParam = errors.New("invalid parameter") semErrDuplicateDir = errors.New("a directive must not be duplicated") semErrDuplicateElem = errors.New("duplicate element") semErrAmbiguousElem = errors.New("ambiguous element") semErrInvalidProdDir = errors.New("invalid production directive") semErrInvalidAltDir = errors.New("invalid alternative directive") )