Diffstat (limited to 'src/urubu/grammar')
-rw-r--r-- | src/urubu/grammar/first.go | 148
-rw-r--r-- | src/urubu/grammar/grammar.go | 1390
-rw-r--r-- | src/urubu/grammar/item.go | 206
-rw-r--r-- | src/urubu/grammar/lalr1.go | 318
-rw-r--r-- | src/urubu/grammar/lexical/compiler.go | 413
-rw-r--r-- | src/urubu/grammar/lexical/dfa/dfa.go | 173
-rw-r--r-- | src/urubu/grammar/lexical/dfa/symbol_position.go | 182
-rw-r--r-- | src/urubu/grammar/lexical/dfa/tree.go | 567
-rw-r--r-- | src/urubu/grammar/lexical/entry.go | 171
-rw-r--r-- | src/urubu/grammar/lexical/parser/error.go | 36
-rw-r--r-- | src/urubu/grammar/lexical/parser/fragment.go | 72
-rw-r--r-- | src/urubu/grammar/lexical/parser/lexer.go | 594
-rw-r--r-- | src/urubu/grammar/lexical/parser/parser.go | 531
-rw-r--r-- | src/urubu/grammar/lexical/parser/tree.go | 459
-rw-r--r-- | src/urubu/grammar/lr0.go | 197
-rw-r--r-- | src/urubu/grammar/parsing_table.go | 553
-rw-r--r-- | src/urubu/grammar/production.go | 117
-rw-r--r-- | src/urubu/grammar/semantic_error.go | 30
-rw-r--r-- | src/urubu/grammar/symbol/symbol.go | 295
19 files changed, 6452 insertions, 0 deletions
diff --git a/src/urubu/grammar/first.go b/src/urubu/grammar/first.go new file mode 100644 index 0000000..6443bcf --- /dev/null +++ b/src/urubu/grammar/first.go @@ -0,0 +1,148 @@ +package grammar + +import ( + "fmt" + + "urubu/grammar/symbol" +) + +type firstEntry struct { + symbols map[symbol.Symbol]struct{} + empty bool +} + +func newFirstEntry() *firstEntry { + return &firstEntry{ + symbols: map[symbol.Symbol]struct{}{}, + empty: false, + } +} + +func (e *firstEntry) add(sym symbol.Symbol) bool { + if _, ok := e.symbols[sym]; ok { + return false + } + e.symbols[sym] = struct{}{} + return true +} + +func (e *firstEntry) addEmpty() bool { + if !e.empty { + e.empty = true + return true + } + return false +} + +func (e *firstEntry) mergeExceptEmpty(target *firstEntry) bool { + if target == nil { + return false + } + changed := false + for sym := range target.symbols { + added := e.add(sym) + if added { + changed = true + } + } + return changed +} + +type firstSet struct { + set map[symbol.Symbol]*firstEntry +} + +func newFirstSet(prods *productionSet) *firstSet { + fst := &firstSet{ + set: map[symbol.Symbol]*firstEntry{}, + } + for _, prod := range prods.getAllProductions() { + if _, ok := fst.set[prod.lhs]; ok { + continue + } + fst.set[prod.lhs] = newFirstEntry() + } + + return fst +} + +func (fst *firstSet) find(prod *production, head int) (*firstEntry, error) { + entry := newFirstEntry() + if prod.rhsLen <= head { + entry.addEmpty() + return entry, nil + } + for _, sym := range prod.rhs[head:] { + if sym.IsTerminal() { + entry.add(sym) + return entry, nil + } + + e := fst.findBySymbol(sym) + if e == nil { + return nil, fmt.Errorf("an entry of FIRST was not found; symbol: %s", sym) + } + for s := range e.symbols { + entry.add(s) + } + if !e.empty { + return entry, nil + } + } + entry.addEmpty() + return entry, nil +} + +func (fst *firstSet) findBySymbol(sym symbol.Symbol) *firstEntry { + return fst.set[sym] +} + +type firstComContext struct { + first *firstSet +} + +func newFirstComContext(prods *productionSet) *firstComContext { + return &firstComContext{ + first: newFirstSet(prods), + } +} + +func genFirstSet(prods *productionSet) (*firstSet, error) { + cc := newFirstComContext(prods) + for { + more := false + for _, prod := range prods.getAllProductions() { + e := cc.first.findBySymbol(prod.lhs) + changed, err := genProdFirstEntry(cc, e, prod) + if err != nil { + return nil, err + } + if changed { + more = true + } + } + if !more { + break + } + } + return cc.first, nil +} + +func genProdFirstEntry(cc *firstComContext, acc *firstEntry, prod *production) (bool, error) { + if prod.isEmpty() { + return acc.addEmpty(), nil + } + + for _, sym := range prod.rhs { + if sym.IsTerminal() { + return acc.add(sym), nil + } + + e := cc.first.findBySymbol(sym) + changed := acc.mergeExceptEmpty(e) + if !e.empty { + return changed, nil + } + } + return acc.addEmpty(), nil +} diff --git a/src/urubu/grammar/grammar.go b/src/urubu/grammar/grammar.go new file mode 100644 index 0000000..bfa53c6 --- /dev/null +++ b/src/urubu/grammar/grammar.go @@ -0,0 +1,1390 @@ +package grammar + +import ( + "fmt" + "io" + "strings" + + verr "urubu/error" + "urubu/grammar/lexical" + "urubu/grammar/symbol" + spec "urubu/spec/grammar" + "urubu/spec/grammar/parser" +) + +type astActionEntry struct { + position int + expansion bool +} + +type assocType string + +const ( + assocTypeNil = assocType("") + assocTypeLeft = assocType("left") + assocTypeRight = assocType("right") +) + +const ( + precNil = 0 + precMin = 1 +) + +// 
precAndAssoc represents precedence and associativities of terminal symbols and productions. +// We use the priority of the production to resolve shift/reduce conflicts. +type precAndAssoc struct { + // termPrec and termAssoc represent the precedence of the terminal symbols. + termPrec map[symbol.SymbolNum]int + termAssoc map[symbol.SymbolNum]assocType + + // prodPrec and prodAssoc represent the precedence and the associativities of the production. + // These values are inherited from the right-most terminal symbols in the RHS of the productions. + prodPrec map[productionNum]int + prodAssoc map[productionNum]assocType +} + +func (pa *precAndAssoc) terminalPrecedence(sym symbol.SymbolNum) int { + prec, ok := pa.termPrec[sym] + if !ok { + return precNil + } + + return prec +} + +func (pa *precAndAssoc) terminalAssociativity(sym symbol.SymbolNum) assocType { + assoc, ok := pa.termAssoc[sym] + if !ok { + return assocTypeNil + } + + return assoc +} + +func (pa *precAndAssoc) productionPredence(prod productionNum) int { + prec, ok := pa.prodPrec[prod] + if !ok { + return precNil + } + + return prec +} + +func (pa *precAndAssoc) productionAssociativity(prod productionNum) assocType { + assoc, ok := pa.prodAssoc[prod] + if !ok { + return assocTypeNil + } + + return assoc +} + +const reservedSymbolNameError = "error" + +type Grammar struct { + name string + lexSpec *lexical.LexSpec + skipSymbols []symbol.Symbol + productionSet *productionSet + augmentedStartSymbol symbol.Symbol + errorSymbol symbol.Symbol + symbolTable *symbol.SymbolTableReader + astActions map[productionID][]*astActionEntry + precAndAssoc *precAndAssoc + + // recoverProductions is a set of productions having the recover directive. + recoverProductions map[productionID]struct{} +} + +type buildConfig struct { + isReportingEnabled bool +} + +type BuildOption func(config *buildConfig) + +func EnableReporting() BuildOption { + return func(config *buildConfig) { + config.isReportingEnabled = true + } +} + +type GrammarBuilder struct { + AST *parser.RootNode + + errs verr.SpecErrors +} + +func (b *GrammarBuilder) Build(opts ...BuildOption) (*spec.CompiledGrammar, *spec.Report, error) { + gram, err := b.build() + if err != nil { + return nil, nil, err + } + + return compile(gram, opts...) 
+} + +func (b *GrammarBuilder) build() (*Grammar, error) { + var specName string + { + errOccurred := false + for _, dir := range b.AST.Directives { + if dir.Name != "name" { + continue + } + + if len(dir.Parameters) != 1 || dir.Parameters[0].ID == "" { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrDirInvalidParam, + Detail: "'name' takes just one ID parameter", + Row: dir.Pos.Row, + Col: dir.Pos.Col, + }) + + errOccurred = true + break + } + + specName = dir.Parameters[0].ID + break + } + + if specName == "" && !errOccurred { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrNoGrammarName, + }) + } + } + + b.checkSpellingInconsistenciesOfUserDefinedIDs(b.AST) + if len(b.errs) > 0 { + return nil, b.errs + } + + symTab, ss, err := b.genSymbolTable(b.AST) + if err != nil { + return nil, err + } + + lexSpec, skip, err := b.genLexSpecAndSkipSymbols(symTab.Reader(), b.AST) + if err != nil { + return nil, err + } + + prodsAndActs, err := b.genProductionsAndActions(b.AST, symTab.Reader(), ss.errSym, ss.augStartSym, ss.startSym) + if err != nil { + return nil, err + } + if prodsAndActs == nil && len(b.errs) > 0 { + return nil, b.errs + } + + pa, err := b.genPrecAndAssoc(symTab.Reader(), ss.errSym, prodsAndActs) + if err != nil { + return nil, err + } + if pa == nil && len(b.errs) > 0 { + return nil, b.errs + } + + syms := findUsedAndUnusedSymbols(b.AST) + if syms == nil && len(b.errs) > 0 { + return nil, b.errs + } + + // When a terminal symbol that cannot be reached from the start symbol has the skip directive, + // the compiler treats its terminal as a used symbol, not unused. + { + r := symTab.Reader() + for _, sym := range skip { + s, _ := r.ToText(sym) + if _, ok := syms.unusedTerminals[s]; !ok { + prod := syms.usedTerminals[s] + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrTermCannotBeSkipped, + Detail: s, + Row: prod.Pos.Row, + Col: prod.Pos.Col, + }) + continue + } + + delete(syms.unusedTerminals, s) + } + } + + for sym, prod := range syms.unusedProductions { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrUnusedProduction, + Detail: sym, + Row: prod.Pos.Row, + Col: prod.Pos.Col, + }) + } + + for sym, prod := range syms.unusedTerminals { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrUnusedTerminal, + Detail: sym, + Row: prod.Pos.Row, + Col: prod.Pos.Col, + }) + } + + if len(b.errs) > 0 { + return nil, b.errs + } + + return &Grammar{ + name: specName, + lexSpec: lexSpec, + skipSymbols: skip, + productionSet: prodsAndActs.prods, + augmentedStartSymbol: prodsAndActs.augStartSym, + errorSymbol: ss.errSym, + symbolTable: symTab.Reader(), + astActions: prodsAndActs.astActs, + recoverProductions: prodsAndActs.recoverProds, + precAndAssoc: pa, + }, nil +} + +type usedAndUnusedSymbols struct { + unusedProductions map[string]*parser.ProductionNode + unusedTerminals map[string]*parser.ProductionNode + usedTerminals map[string]*parser.ProductionNode +} + +func findUsedAndUnusedSymbols(root *parser.RootNode) *usedAndUnusedSymbols { + prods := map[string]*parser.ProductionNode{} + lexProds := map[string]*parser.ProductionNode{} + mark := map[string]bool{} + { + for _, p := range root.Productions { + prods[p.LHS] = p + mark[p.LHS] = false + for _, alt := range p.RHS { + for _, e := range alt.Elements { + if e.ID == "" { + continue + } + mark[e.ID] = false + } + } + } + + for _, p := range root.LexProductions { + lexProds[p.LHS] = p + mark[p.LHS] = false + } + + start := root.Productions[0] + mark[start.LHS] = true + markUsedSymbols(mark, 
map[string]bool{}, prods, start) + + // We don't have to check the error symbol because the error symbol doesn't have a production. + delete(mark, reservedSymbolNameError) + } + + usedTerms := make(map[string]*parser.ProductionNode, len(lexProds)) + unusedProds := map[string]*parser.ProductionNode{} + unusedTerms := map[string]*parser.ProductionNode{} + for sym, used := range mark { + if p, ok := prods[sym]; ok { + if used { + continue + } + unusedProds[sym] = p + continue + } + if p, ok := lexProds[sym]; ok { + if used { + usedTerms[sym] = p + } else { + unusedTerms[sym] = p + } + continue + } + + // May be reached here when a fragment name appears on the right-hand side of a production rule. However, an error + // to the effect that a production rule cannot contain a fragment will be detected in a subsequent process. So we can + // ignore it here. + } + + return &usedAndUnusedSymbols{ + usedTerminals: usedTerms, + unusedProductions: unusedProds, + unusedTerminals: unusedTerms, + } +} + +func markUsedSymbols(mark map[string]bool, marked map[string]bool, prods map[string]*parser.ProductionNode, prod *parser.ProductionNode) { + if marked[prod.LHS] { + return + } + + for _, alt := range prod.RHS { + for _, e := range alt.Elements { + if e.ID == "" { + continue + } + + mark[e.ID] = true + + p, ok := prods[e.ID] + if !ok { + continue + } + + // Remove a production to avoid inifinite recursion. + marked[prod.LHS] = true + + markUsedSymbols(mark, marked, prods, p) + } + } +} + +func (b *GrammarBuilder) checkSpellingInconsistenciesOfUserDefinedIDs(root *parser.RootNode) { + var ids []string + { + for _, prod := range root.Productions { + ids = append(ids, prod.LHS) + for _, alt := range prod.RHS { + for _, elem := range alt.Elements { + if elem.Label != nil { + ids = append(ids, elem.Label.Name) + } + } + } + } + for _, prod := range root.LexProductions { + ids = append(ids, prod.LHS) + } + for _, dir := range root.Directives { + dirIDs := collectUserDefinedIDsFromDirective(dir) + if len(dirIDs) > 0 { + ids = append(ids, dirIDs...) + } + } + } + + duplicated := lexical.FindSpellingInconsistencies(ids) + if len(duplicated) == 0 { + return + } + + for _, dup := range duplicated { + var s string + { + var b strings.Builder + fmt.Fprintf(&b, "%+v", dup[0]) + for _, id := range dup[1:] { + fmt.Fprintf(&b, ", %+v", id) + } + s = b.String() + } + + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrSpellingInconsistency, + Detail: s, + }) + } +} + +func collectUserDefinedIDsFromDirective(dir *parser.DirectiveNode) []string { + var ids []string + for _, param := range dir.Parameters { + if param.Group != nil { + for _, d := range param.Group { + dIDs := collectUserDefinedIDsFromDirective(d) + if len(dIDs) > 0 { + ids = append(ids, dIDs...) + } + } + } + if param.OrderedSymbol != "" { + ids = append(ids, param.OrderedSymbol) + } + } + return ids +} + +type symbols struct { + errSym symbol.Symbol + augStartSym symbol.Symbol + startSym symbol.Symbol +} + +func (b *GrammarBuilder) genSymbolTable(root *parser.RootNode) (*symbol.SymbolTable, *symbols, error) { + symTab := symbol.NewSymbolTable() + w := symTab.Writer() + r := symTab.Reader() + + // We need to register the reserved symbol before registering others. 
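+	// Registering "error" first pins the reserved name to a fixed terminal
+	// symbol, so the comparisons against errSym below can detect user
+	// definitions that collide with it.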
+ var errSym symbol.Symbol + { + sym, err := w.RegisterTerminalSymbol(reservedSymbolNameError) + if err != nil { + return nil, nil, err + } + errSym = sym + } + + for _, prod := range root.LexProductions { + if sym, exist := r.ToSymbol(prod.LHS); exist { + if sym == errSym { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrErrSymIsReserved, + Row: prod.Pos.Row, + Col: prod.Pos.Col, + }) + } else { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrDuplicateTerminal, + Detail: prod.LHS, + Row: prod.Pos.Row, + Col: prod.Pos.Col, + }) + } + + continue + } + + _, err := w.RegisterTerminalSymbol(prod.LHS) + if err != nil { + return nil, nil, err + } + } + + startProd := root.Productions[0] + augStartText := fmt.Sprintf("%s'", startProd.LHS) + var err error + augStartSym, err := w.RegisterStartSymbol(augStartText) + if err != nil { + return nil, nil, err + } + if augStartSym == errSym { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrErrSymIsReserved, + Row: startProd.Pos.Row, + Col: startProd.Pos.Col, + }) + } + + startSym, err := w.RegisterNonTerminalSymbol(startProd.LHS) + if err != nil { + return nil, nil, err + } + if startSym == errSym { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrErrSymIsReserved, + Row: startProd.Pos.Row, + Col: startProd.Pos.Col, + }) + } + + for _, prod := range root.Productions { + sym, err := w.RegisterNonTerminalSymbol(prod.LHS) + if err != nil { + return nil, nil, err + } + if sym.IsTerminal() { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrDuplicateName, + Detail: prod.LHS, + Row: prod.Pos.Row, + Col: prod.Pos.Col, + }) + } + if sym == errSym { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrErrSymIsReserved, + Row: prod.Pos.Row, + Col: prod.Pos.Col, + }) + } + } + + return symTab, &symbols{ + errSym: errSym, + augStartSym: augStartSym, + startSym: startSym, + }, nil +} + +func (b *GrammarBuilder) genLexSpecAndSkipSymbols(symTab *symbol.SymbolTableReader, root *parser.RootNode) (*lexical.LexSpec, []symbol.Symbol, error) { + entries := []*lexical.LexEntry{} + skipSyms := []symbol.Symbol{} + for _, prod := range root.LexProductions { + entry, skip, specErr, err := genLexEntry(prod) + if err != nil { + return nil, nil, err + } + if specErr != nil { + b.errs = append(b.errs, specErr) + continue + } + if skip { + sym, _ := symTab.ToSymbol(prod.LHS) + skipSyms = append(skipSyms, sym) + } + entries = append(entries, entry) + } + + checkedFragments := map[string]struct{}{} + for _, fragment := range root.Fragments { + if _, exist := checkedFragments[fragment.LHS]; exist { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrDuplicateFragment, + Detail: fragment.LHS, + Row: fragment.Pos.Row, + Col: fragment.Pos.Col, + }) + continue + } + checkedFragments[fragment.LHS] = struct{}{} + + entries = append(entries, &lexical.LexEntry{ + Fragment: true, + Kind: spec.LexKindName(fragment.LHS), + Pattern: fragment.RHS, + }) + } + + return &lexical.LexSpec{ + Entries: entries, + }, skipSyms, nil +} + +func genLexEntry(prod *parser.ProductionNode) (*lexical.LexEntry, bool, *verr.SpecError, error) { + alt := prod.RHS[0] + elem := alt.Elements[0] + + var pattern string + if elem.Literally { + pattern = spec.EscapePattern(elem.Pattern) + } else { + pattern = elem.Pattern + } + + var modes []spec.LexModeName + var skip bool + var push spec.LexModeName + var pop bool + dirConsumed := map[string]struct{}{} + for _, dir := range prod.Directives { + if _, consumed := dirConsumed[dir.Name]; consumed { + return nil, false, 
&verr.SpecError{ + Cause: semErrDuplicateDir, + Detail: dir.Name, + Row: dir.Pos.Row, + Col: dir.Pos.Col, + }, nil + } + dirConsumed[dir.Name] = struct{}{} + + switch dir.Name { + case "mode": + if len(dir.Parameters) == 0 { + return nil, false, &verr.SpecError{ + Cause: semErrDirInvalidParam, + Detail: "'mode' directive needs an ID parameter", + Row: dir.Pos.Row, + Col: dir.Pos.Col, + }, nil + } + for _, param := range dir.Parameters { + if param.ID == "" { + return nil, false, &verr.SpecError{ + Cause: semErrDirInvalidParam, + Detail: "'mode' directive needs an ID parameter", + Row: param.Pos.Row, + Col: param.Pos.Col, + }, nil + } + modes = append(modes, spec.LexModeName(param.ID)) + } + case "skip": + if len(dir.Parameters) > 0 { + return nil, false, &verr.SpecError{ + Cause: semErrDirInvalidParam, + Detail: "'skip' directive needs no parameter", + Row: dir.Pos.Row, + Col: dir.Pos.Col, + }, nil + } + skip = true + case "push": + if len(dir.Parameters) != 1 || dir.Parameters[0].ID == "" { + return nil, false, &verr.SpecError{ + Cause: semErrDirInvalidParam, + Detail: "'push' directive needs an ID parameter", + Row: dir.Pos.Row, + Col: dir.Pos.Col, + }, nil + } + push = spec.LexModeName(dir.Parameters[0].ID) + case "pop": + if len(dir.Parameters) > 0 { + return nil, false, &verr.SpecError{ + Cause: semErrDirInvalidParam, + Detail: "'pop' directive needs no parameter", + Row: dir.Pos.Row, + Col: dir.Pos.Col, + }, nil + } + pop = true + default: + return nil, false, &verr.SpecError{ + Cause: semErrDirInvalidName, + Detail: dir.Name, + Row: dir.Pos.Row, + Col: dir.Pos.Col, + }, nil + } + } + + if len(alt.Directives) > 0 { + return nil, false, &verr.SpecError{ + Cause: semErrInvalidAltDir, + Detail: "a lexical production cannot have alternative directives", + Row: alt.Directives[0].Pos.Row, + Col: alt.Directives[0].Pos.Col, + }, nil + } + + return &lexical.LexEntry{ + Modes: modes, + Kind: spec.LexKindName(prod.LHS), + Pattern: pattern, + Push: push, + Pop: pop, + }, skip, nil, nil +} + +type productionsAndActions struct { + prods *productionSet + augStartSym symbol.Symbol + astActs map[productionID][]*astActionEntry + prodPrecsTerm map[productionID]symbol.Symbol + prodPrecsOrdSym map[productionID]string + prodPrecPoss map[productionID]*parser.Position + recoverProds map[productionID]struct{} +} + +func (b *GrammarBuilder) genProductionsAndActions(root *parser.RootNode, symTab *symbol.SymbolTableReader, errSym symbol.Symbol, augStartSym symbol.Symbol, startSym symbol.Symbol) (*productionsAndActions, error) { + if len(root.Productions) == 0 { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrNoProduction, + }) + return nil, nil + } + + prods := newProductionSet() + astActs := map[productionID][]*astActionEntry{} + prodPrecsTerm := map[productionID]symbol.Symbol{} + prodPrecsOrdSym := map[productionID]string{} + prodPrecPoss := map[productionID]*parser.Position{} + recoverProds := map[productionID]struct{}{} + + p, err := newProduction(augStartSym, []symbol.Symbol{ + startSym, + }) + if err != nil { + return nil, err + } + + prods.append(p) + + for _, prod := range root.Productions { + lhsSym, ok := symTab.ToSymbol(prod.LHS) + if !ok { + // All symbols are assumed to be pre-detected, so it's a bug if we cannot find them here. 
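+			// genSymbolTable has already registered every LHS, which is why a miss
+			// here is returned as a plain error rather than appended to b.errs
+			// like the user-facing *verr.SpecError values.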
+ return nil, fmt.Errorf("symbol '%v' is undefined", prod.LHS) + } + + if len(prod.Directives) > 0 { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrInvalidProdDir, + Detail: "a production cannot have production directives", + Row: prod.Directives[0].Pos.Row, + Col: prod.Directives[0].Pos.Col, + }) + continue + } + + LOOP_RHS: + for _, alt := range prod.RHS { + altSyms := make([]symbol.Symbol, len(alt.Elements)) + offsets := map[string]int{} + ambiguousIDOffsets := map[string]struct{}{} + for i, elem := range alt.Elements { + sym, ok := symTab.ToSymbol(elem.ID) + if !ok { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrUndefinedSym, + Detail: elem.ID, + Row: elem.Pos.Row, + Col: elem.Pos.Col, + }) + continue LOOP_RHS + } + altSyms[i] = sym + + if elem.Label != nil { + if _, added := offsets[elem.Label.Name]; added { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrDuplicateLabel, + Detail: elem.Label.Name, + Row: elem.Label.Pos.Row, + Col: elem.Label.Pos.Col, + }) + continue LOOP_RHS + } + if _, found := symTab.ToSymbol(elem.Label.Name); found { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrInvalidLabel, + Detail: elem.Label.Name, + Row: elem.Label.Pos.Row, + Col: elem.Label.Pos.Col, + }) + continue LOOP_RHS + } + offsets[elem.Label.Name] = i + } + // A symbol having a label can be specified by both the label and the symbol name. + // So record the symbol's position, whether or not it has a label. + if elem.ID != "" { + if _, exist := offsets[elem.ID]; exist { + // When the same symbol appears multiple times in an alternative, the symbol is ambiguous. When we need + // to specify the symbol in a directive, we cannot use the name of the ambiguous symbol. Instead, specify + // a label to resolve the ambiguity. + delete(offsets, elem.ID) + ambiguousIDOffsets[elem.ID] = struct{}{} + } else { + offsets[elem.ID] = i + } + } + } + + p, err := newProduction(lhsSym, altSyms) + if err != nil { + return nil, err + } + if _, exist := prods.findByID(p.id); exist { + // Report the line number of a duplicate alternative. + // When the alternative is empty, we report the position of its LHS. 
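+				// For example, in `a: b | b;` the report points at the second `b`,
+				// while a duplicate empty alternative is reported at `a` itself.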
+ var row int + var col int + if len(alt.Elements) > 0 { + row = alt.Elements[0].Pos.Row + col = alt.Elements[0].Pos.Col + } else { + row = prod.Pos.Row + col = prod.Pos.Col + } + + var detail string + { + var b strings.Builder + fmt.Fprintf(&b, "%v →", prod.LHS) + for _, elem := range alt.Elements { + switch { + case elem.ID != "": + fmt.Fprintf(&b, " %v", elem.ID) + case elem.Pattern != "": + fmt.Fprintf(&b, ` "%v"`, elem.Pattern) + } + } + if len(alt.Elements) == 0 { + fmt.Fprintf(&b, " ε") + } + + detail = b.String() + } + + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrDuplicateProduction, + Detail: detail, + Row: row, + Col: col, + }) + continue LOOP_RHS + } + prods.append(p) + + dirConsumed := map[string]struct{}{} + for _, dir := range alt.Directives { + if _, consumed := dirConsumed[dir.Name]; consumed { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrDuplicateDir, + Detail: dir.Name, + Row: dir.Pos.Row, + Col: dir.Pos.Col, + }) + } + dirConsumed[dir.Name] = struct{}{} + + switch dir.Name { + case "ast": + if len(dir.Parameters) == 0 { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrDirInvalidParam, + Detail: "'ast' directive needs at least one parameter", + Row: dir.Pos.Row, + Col: dir.Pos.Col, + }) + continue LOOP_RHS + } + astAct := make([]*astActionEntry, len(dir.Parameters)) + consumedOffsets := map[int]struct{}{} + for i, param := range dir.Parameters { + if param.ID == "" { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrDirInvalidParam, + Detail: "'ast' directive can take only ID parameters", + Row: dir.Pos.Row, + Col: dir.Pos.Col, + }) + continue LOOP_RHS + } + + if _, ambiguous := ambiguousIDOffsets[param.ID]; ambiguous { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrAmbiguousElem, + Detail: fmt.Sprintf("'%v' is ambiguous", param.ID), + Row: param.Pos.Row, + Col: param.Pos.Col, + }) + continue LOOP_RHS + } + + offset, ok := offsets[param.ID] + if !ok { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrDirInvalidParam, + Detail: fmt.Sprintf("a symbol was not found in an alternative: %v", param.ID), + Row: param.Pos.Row, + Col: param.Pos.Col, + }) + continue LOOP_RHS + } + if _, consumed := consumedOffsets[offset]; consumed { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrDuplicateElem, + Detail: param.ID, + Row: param.Pos.Row, + Col: param.Pos.Col, + }) + continue LOOP_RHS + } + consumedOffsets[offset] = struct{}{} + + if param.Expansion { + elem := alt.Elements[offset] + if elem.Pattern != "" { + // Currently, it is a bug to reach here because it is + // forbidden to have anything other than ID appear in + // production rules. + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrDirInvalidParam, + Detail: fmt.Sprintf("the expansion symbol cannot be applied to a pattern (%v: \"%v\")", param.ID, elem.Pattern), + Row: param.Pos.Row, + Col: param.Pos.Col, + }) + continue LOOP_RHS + } + elemSym, ok := symTab.ToSymbol(elem.ID) + if !ok { + // If the symbol was not found, it's a bug. 
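+								// elem.ID was already resolved once while building altSyms,
+								// so a miss here means the symbol table changed underneath us.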
+ return nil, fmt.Errorf("a symbol corresponding to an ID (%v) was not found", elem.ID) + } + if elemSym.IsTerminal() { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrDirInvalidParam, + Detail: fmt.Sprintf("the expansion symbol cannot be applied to a terminal symbol (%v: %v)", param.ID, elem.ID), + Row: param.Pos.Row, + Col: param.Pos.Col, + }) + continue LOOP_RHS + } + } + + astAct[i] = &astActionEntry{ + position: offset + 1, + expansion: param.Expansion, + } + } + astActs[p.id] = astAct + case "prec": + if len(dir.Parameters) != 1 || (dir.Parameters[0].ID == "" && dir.Parameters[0].OrderedSymbol == "") { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrDirInvalidParam, + Detail: "'prec' directive needs just one ID parameter or ordered symbol", + Row: dir.Pos.Row, + Col: dir.Pos.Col, + }) + continue LOOP_RHS + } + param := dir.Parameters[0] + switch { + case param.ID != "": + sym, ok := symTab.ToSymbol(param.ID) + if !ok { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrDirInvalidParam, + Detail: fmt.Sprintf("unknown terminal symbol: %v", param.ID), + Row: param.Pos.Row, + Col: param.Pos.Col, + }) + continue LOOP_RHS + } + if sym == errSym { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrDirInvalidParam, + Detail: fmt.Sprintf("'%v' directive cannot be applied to an error symbol", dir.Name), + Row: param.Pos.Row, + Col: param.Pos.Col, + }) + } + if !sym.IsTerminal() { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrDirInvalidParam, + Detail: fmt.Sprintf("the symbol must be a terminal: %v", param.ID), + Row: param.Pos.Row, + Col: param.Pos.Col, + }) + continue LOOP_RHS + } + prodPrecsTerm[p.id] = sym + prodPrecPoss[p.id] = ¶m.Pos + case param.OrderedSymbol != "": + prodPrecsOrdSym[p.id] = param.OrderedSymbol + prodPrecPoss[p.id] = ¶m.Pos + } + case "recover": + if len(dir.Parameters) > 0 { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrDirInvalidParam, + Detail: "'recover' directive needs no parameter", + Row: dir.Pos.Row, + Col: dir.Pos.Col, + }) + continue LOOP_RHS + } + recoverProds[p.id] = struct{}{} + default: + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrDirInvalidName, + Detail: fmt.Sprintf("invalid directive name '%v'", dir.Name), + Row: dir.Pos.Row, + Col: dir.Pos.Col, + }) + continue LOOP_RHS + } + } + } + } + + return &productionsAndActions{ + prods: prods, + augStartSym: augStartSym, + astActs: astActs, + prodPrecsTerm: prodPrecsTerm, + prodPrecsOrdSym: prodPrecsOrdSym, + prodPrecPoss: prodPrecPoss, + recoverProds: recoverProds, + }, nil +} + +func (b *GrammarBuilder) genPrecAndAssoc(symTab *symbol.SymbolTableReader, errSym symbol.Symbol, prodsAndActs *productionsAndActions) (*precAndAssoc, error) { + termPrec := map[symbol.SymbolNum]int{} + termAssoc := map[symbol.SymbolNum]assocType{} + ordSymPrec := map[string]int{} + { + var precGroup []*parser.DirectiveNode + for _, dir := range b.AST.Directives { + if dir.Name == "prec" { + if dir.Parameters == nil || len(dir.Parameters) != 1 || dir.Parameters[0].Group == nil { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrDirInvalidParam, + Detail: "'prec' needs just one directive group", + Row: dir.Pos.Row, + Col: dir.Pos.Col, + }) + continue + } + precGroup = dir.Parameters[0].Group + continue + } + + if dir.Name != "name" && dir.Name != "prec" { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrDirInvalidName, + Detail: dir.Name, + Row: dir.Pos.Row, + Col: dir.Pos.Col, + }) + continue + } + } + + precN := precMin + for _, dir := range 
precGroup { + var assocTy assocType + switch dir.Name { + case "left": + assocTy = assocTypeLeft + case "right": + assocTy = assocTypeRight + case "assign": + assocTy = assocTypeNil + default: + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrDirInvalidName, + Detail: dir.Name, + Row: dir.Pos.Row, + Col: dir.Pos.Col, + }) + return nil, nil + } + + if len(dir.Parameters) == 0 { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrDirInvalidParam, + Detail: "associativity needs at least one symbol", + Row: dir.Pos.Row, + Col: dir.Pos.Col, + }) + return nil, nil + } + ASSOC_PARAM_LOOP: + for _, p := range dir.Parameters { + switch { + case p.ID != "": + sym, ok := symTab.ToSymbol(p.ID) + if !ok { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrDirInvalidParam, + Detail: fmt.Sprintf("'%v' is undefined", p.ID), + Row: p.Pos.Row, + Col: p.Pos.Col, + }) + return nil, nil + } + if sym == errSym { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrDirInvalidParam, + Detail: fmt.Sprintf("'%v' directive cannot be applied to an error symbol", dir.Name), + Row: p.Pos.Row, + Col: p.Pos.Col, + }) + return nil, nil + } + if !sym.IsTerminal() { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrDirInvalidParam, + Detail: fmt.Sprintf("associativity can take only terminal symbol ('%v' is a non-terminal)", p.ID), + Row: p.Pos.Row, + Col: p.Pos.Col, + }) + return nil, nil + } + if prec, alreadySet := termPrec[sym.Num()]; alreadySet { + if prec == precN { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrDuplicateAssoc, + Detail: fmt.Sprintf("'%v' already has the same associativity and precedence", p.ID), + Row: p.Pos.Row, + Col: p.Pos.Col, + }) + } else if assoc := termAssoc[sym.Num()]; assoc == assocTy { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrDuplicateAssoc, + Detail: fmt.Sprintf("'%v' already has different precedence", p.ID), + Row: p.Pos.Row, + Col: p.Pos.Col, + }) + } else { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrDuplicateAssoc, + Detail: fmt.Sprintf("'%v' already has different associativity and precedence", p.ID), + Row: p.Pos.Row, + Col: p.Pos.Col, + }) + } + break ASSOC_PARAM_LOOP + } + + termPrec[sym.Num()] = precN + termAssoc[sym.Num()] = assocTy + case p.OrderedSymbol != "": + if prec, alreadySet := ordSymPrec[p.OrderedSymbol]; alreadySet { + if prec == precN { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrDuplicateAssoc, + Detail: fmt.Sprintf("'$%v' already has the same precedence", p.OrderedSymbol), + Row: p.Pos.Row, + Col: p.Pos.Col, + }) + } else { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrDuplicateAssoc, + Detail: fmt.Sprintf("'$%v' already has different precedence", p.OrderedSymbol), + Row: p.Pos.Row, + Col: p.Pos.Col, + }) + } + break ASSOC_PARAM_LOOP + } + + ordSymPrec[p.OrderedSymbol] = precN + default: + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrDirInvalidParam, + Detail: "a parameter must be an ID or an ordered symbol", + Row: p.Pos.Row, + Col: p.Pos.Col, + }) + return nil, nil + } + } + + precN++ + } + } + if len(b.errs) > 0 { + return nil, nil + } + + prodPrec := map[productionNum]int{} + prodAssoc := map[productionNum]assocType{} + for _, prod := range prodsAndActs.prods.getAllProductions() { + // A #prec directive changes only precedence, not associativity. 
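+		// That is also why prodAssoc is explicitly set to assocTypeNil in both
+		// branches below, even when the referenced terminal has its own
+		// associativity.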
+ if term, ok := prodsAndActs.prodPrecsTerm[prod.id]; ok { + if prec, ok := termPrec[term.Num()]; ok { + prodPrec[prod.num] = prec + prodAssoc[prod.num] = assocTypeNil + } else { + text, _ := symTab.ToText(term) + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrUndefinedPrec, + Detail: text, + Row: prodsAndActs.prodPrecPoss[prod.id].Row, + Col: prodsAndActs.prodPrecPoss[prod.id].Col, + }) + } + } else if ordSym, ok := prodsAndActs.prodPrecsOrdSym[prod.id]; ok { + if prec, ok := ordSymPrec[ordSym]; ok { + prodPrec[prod.num] = prec + prodAssoc[prod.num] = assocTypeNil + } else { + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrUndefinedOrdSym, + Detail: fmt.Sprintf("$%v", ordSym), + Row: prodsAndActs.prodPrecPoss[prod.id].Row, + Col: prodsAndActs.prodPrecPoss[prod.id].Col, + }) + } + } else { + // A production inherits precedence and associativity from the right-most terminal symbol. + mostrightTerm := symbol.SymbolNil + for _, sym := range prod.rhs { + if !sym.IsTerminal() { + continue + } + mostrightTerm = sym + } + if !mostrightTerm.IsNil() { + prodPrec[prod.num] = termPrec[mostrightTerm.Num()] + prodAssoc[prod.num] = termAssoc[mostrightTerm.Num()] + } + } + } + if len(b.errs) > 0 { + return nil, nil + } + + return &precAndAssoc{ + termPrec: termPrec, + termAssoc: termAssoc, + prodPrec: prodPrec, + prodAssoc: prodAssoc, + }, nil +} + +func compile(gram *Grammar, opts ...BuildOption) (*spec.CompiledGrammar, *spec.Report, error) { + config := &buildConfig{} + for _, opt := range opts { + opt(config) + } + + lexSpec, err, cErrs := lexical.Compile(gram.lexSpec, lexical.CompressionLevelMax) + if err != nil { + if len(cErrs) > 0 { + var b strings.Builder + writeCompileError(&b, cErrs[0]) + for _, cerr := range cErrs[1:] { + fmt.Fprintf(&b, "\n") + writeCompileError(&b, cerr) + } + return nil, nil, fmt.Errorf(b.String()) + } + return nil, nil, err + } + + kind2Term := make([]int, len(lexSpec.KindNames)) + for i, k := range lexSpec.KindNames { + if k == spec.LexKindNameNil { + kind2Term[spec.LexKindIDNil] = symbol.SymbolNil.Num().Int() + continue + } + + sym, ok := gram.symbolTable.ToSymbol(k.String()) + if !ok { + return nil, nil, fmt.Errorf("terminal symbol '%v' was not found in a symbol table", k) + } + kind2Term[i] = sym.Num().Int() + } + + termTexts, err := gram.symbolTable.TerminalTexts() + if err != nil { + return nil, nil, err + } + + var termSkip []int + { + r := gram.symbolTable.Reader() + // I want to use gram.symbolTable.terminalSymbols() here instead of gram.symbolTable.terminalTexts(), + // but gram.symbolTable.terminalSymbols() is different in length from terminalTexts + // because it does not contain a predefined symbol, like EOF. + // Therefore, we use terminalTexts, although it takes more time to lookup for symbols. 
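+		// termSkip[n] ends up as 1 exactly when terminal n carries the skip
+		// directive; all other entries stay 0.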
+ termSkip = make([]int, len(termTexts)) + for _, t := range termTexts { + s, _ := r.ToSymbol(t) + for _, sk := range gram.skipSymbols { + if s != sk { + continue + } + termSkip[s.Num()] = 1 + break + } + } + } + + nonTerms, err := gram.symbolTable.NonTerminalTexts() + if err != nil { + return nil, nil, err + } + + firstSet, err := genFirstSet(gram.productionSet) + if err != nil { + return nil, nil, err + } + + lr0, err := genLR0Automaton(gram.productionSet, gram.augmentedStartSymbol, gram.errorSymbol) + if err != nil { + return nil, nil, err + } + + var tab *ParsingTable + var report *spec.Report + { + lalr1, err := genLALR1Automaton(lr0, gram.productionSet, firstSet) + if err != nil { + return nil, nil, err + } + + b := &lrTableBuilder{ + automaton: lalr1.lr0Automaton, + prods: gram.productionSet, + termCount: len(termTexts), + nonTermCount: len(nonTerms), + symTab: gram.symbolTable, + precAndAssoc: gram.precAndAssoc, + } + tab, err = b.build() + if err != nil { + return nil, nil, err + } + + if config.isReportingEnabled { + report, err = b.genReport(tab, gram) + if err != nil { + return nil, nil, err + } + } + } + + action := make([]int, len(tab.actionTable)) + for i, e := range tab.actionTable { + action[i] = int(e) + } + goTo := make([]int, len(tab.goToTable)) + for i, e := range tab.goToTable { + goTo[i] = int(e) + } + + lhsSyms := make([]int, len(gram.productionSet.getAllProductions())+1) + altSymCounts := make([]int, len(gram.productionSet.getAllProductions())+1) + recoverProds := make([]int, len(gram.productionSet.getAllProductions())+1) + astActEnties := make([][]int, len(gram.productionSet.getAllProductions())+1) + for _, p := range gram.productionSet.getAllProductions() { + lhsSyms[p.num] = p.lhs.Num().Int() + altSymCounts[p.num] = p.rhsLen + + if _, ok := gram.recoverProductions[p.id]; ok { + recoverProds[p.num] = 1 + } + + astAct, ok := gram.astActions[p.id] + if !ok { + continue + } + astActEntry := make([]int, len(astAct)) + for i, e := range astAct { + if e.expansion { + astActEntry[i] = e.position * -1 + } else { + astActEntry[i] = e.position + } + } + astActEnties[p.num] = astActEntry + } + + return &spec.CompiledGrammar{ + Name: gram.name, + Lexical: lexSpec, + Syntactic: &spec.SyntacticSpec{ + Action: action, + GoTo: goTo, + StateCount: tab.stateCount, + InitialState: tab.InitialState.Int(), + StartProduction: productionNumStart.Int(), + LHSSymbols: lhsSyms, + AlternativeSymbolCounts: altSymCounts, + Terminals: termTexts, + TerminalCount: tab.terminalCount, + TerminalSkip: termSkip, + KindToTerminal: kind2Term, + NonTerminals: nonTerms, + NonTerminalCount: tab.nonTerminalCount, + EOFSymbol: symbol.SymbolEOF.Num().Int(), + ErrorSymbol: gram.errorSymbol.Num().Int(), + ErrorTrapperStates: tab.errorTrapperStates, + RecoverProductions: recoverProds, + }, + ASTAction: &spec.ASTAction{ + Entries: astActEnties, + }, + }, report, nil +} + +func writeCompileError(w io.Writer, cErr *lexical.CompileError) { + if cErr.Fragment { + fmt.Fprintf(w, "fragment ") + } + fmt.Fprintf(w, "%v: %v", cErr.Kind, cErr.Cause) + if cErr.Detail != "" { + fmt.Fprintf(w, ": %v", cErr.Detail) + } +} diff --git a/src/urubu/grammar/item.go b/src/urubu/grammar/item.go new file mode 100644 index 0000000..6c5fe42 --- /dev/null +++ b/src/urubu/grammar/item.go @@ -0,0 +1,206 @@ +package grammar + +import ( + "crypto/sha256" + "encoding/binary" + "fmt" + "sort" + "strconv" + + "urubu/grammar/symbol" +) + +type lrItemID [32]byte + +func (id lrItemID) String() string { + return fmt.Sprintf("%x", id.num()) +} + 
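+// num interprets the first four bytes of the item ID as a uint32. It is used
+// only for display and for the stable sorting of kernel items.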
+func (id lrItemID) num() uint32 { + return binary.LittleEndian.Uint32(id[:]) +} + +type lookAhead struct { + symbols map[symbol.Symbol]struct{} + + // When propagation is true, an item propagates look-ahead symbols to other items. + propagation bool +} + +type lrItem struct { + id lrItemID + prod productionID + + // E → E + T + // + // Dot | Dotted Symbol | Item + // ----+---------------+------------ + // 0 | E | E →・E + T + // 1 | + | E → E・+ T + // 2 | T | E → E +・T + // 3 | Nil | E → E + T・ + dot int + dottedSymbol symbol.Symbol + + // When initial is true, the LHS of the production is the augmented start symbol and dot is 0. + // It looks like S' →・S. + initial bool + + // When reducible is true, the item looks like E → E + T・. + reducible bool + + // When kernel is true, the item is kernel item. + kernel bool + + // lookAhead stores look-ahead symbols, and they are terminal symbols. + // The item is reducible only when the look-ahead symbols appear as the next input symbol. + lookAhead lookAhead +} + +func newLR0Item(prod *production, dot int) (*lrItem, error) { + if prod == nil { + return nil, fmt.Errorf("production must be non-nil") + } + + if dot < 0 || dot > prod.rhsLen { + return nil, fmt.Errorf("dot must be between 0 and %v", prod.rhsLen) + } + + var id lrItemID + { + b := []byte{} + b = append(b, prod.id[:]...) + bDot := make([]byte, 8) + binary.LittleEndian.PutUint64(bDot, uint64(dot)) + b = append(b, bDot...) + id = sha256.Sum256(b) + } + + dottedSymbol := symbol.SymbolNil + if dot < prod.rhsLen { + dottedSymbol = prod.rhs[dot] + } + + initial := false + if prod.lhs.IsStart() && dot == 0 { + initial = true + } + + reducible := false + if dot == prod.rhsLen { + reducible = true + } + + kernel := false + if initial || dot > 0 { + kernel = true + } + + item := &lrItem{ + id: id, + prod: prod.id, + dot: dot, + dottedSymbol: dottedSymbol, + initial: initial, + reducible: reducible, + kernel: kernel, + } + + return item, nil +} + +type kernelID [32]byte + +func (id kernelID) String() string { + return fmt.Sprintf("%x", binary.LittleEndian.Uint32(id[:])) +} + +type kernel struct { + id kernelID + items []*lrItem +} + +func newKernel(items []*lrItem) (*kernel, error) { + if len(items) == 0 { + return nil, fmt.Errorf("a kernel need at least one item") + } + + // Remove duplicates from items. + var sortedItems []*lrItem + { + m := map[lrItemID]*lrItem{} + for _, item := range items { + if !item.kernel { + return nil, fmt.Errorf("not a kernel item: %v", item) + } + m[item.id] = item + } + sortedItems = []*lrItem{} + for _, item := range m { + sortedItems = append(sortedItems, item) + } + sort.Slice(sortedItems, func(i, j int) bool { + return sortedItems[i].id.num() < sortedItems[j].id.num() + }) + } + + var id kernelID + { + b := []byte{} + for _, item := range sortedItems { + b = append(b, item.id[:]...) + } + id = sha256.Sum256(b) + } + + return &kernel{ + id: id, + items: sortedItems, + }, nil +} + +type stateNum int + +const stateNumInitial = stateNum(0) + +func (n stateNum) Int() int { + return int(n) +} + +func (n stateNum) String() string { + return strconv.Itoa(int(n)) +} + +func (n stateNum) next() stateNum { + return stateNum(n + 1) +} + +type lrState struct { + *kernel + num stateNum + next map[symbol.Symbol]kernelID + reducible map[productionID]struct{} + + // emptyProdItems stores items that have an empty production like `p → ε` and is reducible. + // Thus the items emptyProdItems stores are like `p → ・ε`. 
emptyProdItems is needed to store + // look-ahead symbols because the kernel items don't include these items. + // + // For instance, we have the following productions, and A is a terminal symbol. + // + // s' → s + // s → A | ε + // + // CLOSURE({s' → ・s}) generates the following closure, but the kernel of this closure doesn't + // include `s → ・ε`. + // + // s' → ・s + // s → ・A + // s → ・ε + emptyProdItems []*lrItem + + // When isErrorTrapper is `true`, the item can shift the `error` symbol. The item has the following form. + // The `α` and `β` can be empty. + // + // A → α・error β + isErrorTrapper bool +} diff --git a/src/urubu/grammar/lalr1.go b/src/urubu/grammar/lalr1.go new file mode 100644 index 0000000..8373568 --- /dev/null +++ b/src/urubu/grammar/lalr1.go @@ -0,0 +1,318 @@ +package grammar + +import ( + "fmt" + + "urubu/grammar/symbol" +) + +type stateAndLRItem struct { + kernelID kernelID + itemID lrItemID +} + +type propagation struct { + src *stateAndLRItem + dest []*stateAndLRItem +} + +type lalr1Automaton struct { + *lr0Automaton +} + +func genLALR1Automaton(lr0 *lr0Automaton, prods *productionSet, first *firstSet) (*lalr1Automaton, error) { + // Set the look-ahead symbol <EOF> to the initial item: [S' → ・S, $] + iniState := lr0.states[lr0.initialState] + iniState.items[0].lookAhead.symbols = map[symbol.Symbol]struct{}{ + symbol.SymbolEOF: {}, + } + + var props []*propagation + for _, state := range lr0.states { + for _, kItem := range state.items { + items, err := genLALR1Closure(kItem, prods, first) + if err != nil { + return nil, err + } + + kItem.lookAhead.propagation = true + + var propDests []*stateAndLRItem + for _, item := range items { + if item.reducible { + p, ok := prods.findByID(item.prod) + if !ok { + return nil, fmt.Errorf("production not found: %v", item.prod) + } + + if p.isEmpty() { + var reducibleItem *lrItem + for _, it := range state.emptyProdItems { + if it.id != item.id { + continue + } + + reducibleItem = it + break + } + if reducibleItem == nil { + return nil, fmt.Errorf("reducible item not found: %v", item.id) + } + if reducibleItem.lookAhead.symbols == nil { + reducibleItem.lookAhead.symbols = map[symbol.Symbol]struct{}{} + } + for a := range item.lookAhead.symbols { + reducibleItem.lookAhead.symbols[a] = struct{}{} + } + + propDests = append(propDests, &stateAndLRItem{ + kernelID: state.id, + itemID: item.id, + }) + } + + continue + } + + nextKID := state.next[item.dottedSymbol] + var nextItemID lrItemID + { + p, ok := prods.findByID(item.prod) + if !ok { + return nil, fmt.Errorf("production not found: %v", item.prod) + } + it, err := newLR0Item(p, item.dot+1) + if err != nil { + return nil, fmt.Errorf("failed to generate an item ID: %v", err) + } + nextItemID = it.id + } + + if item.lookAhead.propagation { + propDests = append(propDests, &stateAndLRItem{ + kernelID: nextKID, + itemID: nextItemID, + }) + } else { + nextState := lr0.states[nextKID] + var nextItem *lrItem + for _, it := range nextState.items { + if it.id != nextItemID { + continue + } + nextItem = it + break + } + if nextItem == nil { + return nil, fmt.Errorf("item not found: %v", nextItemID) + } + + if nextItem.lookAhead.symbols == nil { + nextItem.lookAhead.symbols = map[symbol.Symbol]struct{}{} + } + + for a := range item.lookAhead.symbols { + nextItem.lookAhead.symbols[a] = struct{}{} + } + } + } + if len(propDests) == 0 { + continue + } + + props = append(props, &propagation{ + src: &stateAndLRItem{ + kernelID: state.id, + itemID: kItem.id, + }, + dest: propDests, + }) + } + } 
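+	// At this point all spontaneously generated look-ahead symbols are in
+	// place; propagateLookAhead repeatedly walks the recorded propagation
+	// links until no look-ahead set changes anymore.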
+ + err := propagateLookAhead(lr0, props) + if err != nil { + return nil, fmt.Errorf("failed to propagate look-ahead symbols: %v", err) + } + + return &lalr1Automaton{ + lr0Automaton: lr0, + }, nil +} + +func genLALR1Closure(srcItem *lrItem, prods *productionSet, first *firstSet) ([]*lrItem, error) { + items := []*lrItem{} + knownItems := map[lrItemID]map[symbol.Symbol]struct{}{} + knownItemsProp := map[lrItemID]struct{}{} + uncheckedItems := []*lrItem{} + items = append(items, srcItem) + uncheckedItems = append(uncheckedItems, srcItem) + for len(uncheckedItems) > 0 { + nextUncheckedItems := []*lrItem{} + for _, item := range uncheckedItems { + if item.dottedSymbol.IsTerminal() { + continue + } + + p, ok := prods.findByID(item.prod) + if !ok { + return nil, fmt.Errorf("production not found: %v", item.prod) + } + + var fstSyms []symbol.Symbol + var isFstNullable bool + { + fst, err := first.find(p, item.dot+1) + if err != nil { + return nil, err + } + + fstSyms = make([]symbol.Symbol, len(fst.symbols)) + i := 0 + for s := range fst.symbols { + fstSyms[i] = s + i++ + } + if fst.empty { + isFstNullable = true + } + } + + ps, _ := prods.findByLHS(item.dottedSymbol) + for _, prod := range ps { + var lookAhead []symbol.Symbol + { + var lookAheadCount int + if isFstNullable { + lookAheadCount = len(fstSyms) + len(item.lookAhead.symbols) + } else { + lookAheadCount = len(fstSyms) + } + + lookAhead = make([]symbol.Symbol, lookAheadCount) + i := 0 + for _, s := range fstSyms { + lookAhead[i] = s + i++ + } + if isFstNullable { + for a := range item.lookAhead.symbols { + lookAhead[i] = a + i++ + } + } + } + + for _, a := range lookAhead { + newItem, err := newLR0Item(prod, 0) + if err != nil { + return nil, err + } + if items, exist := knownItems[newItem.id]; exist { + if _, exist := items[a]; exist { + continue + } + } + + newItem.lookAhead.symbols = map[symbol.Symbol]struct{}{ + a: {}, + } + + items = append(items, newItem) + if knownItems[newItem.id] == nil { + knownItems[newItem.id] = map[symbol.Symbol]struct{}{} + } + knownItems[newItem.id][a] = struct{}{} + nextUncheckedItems = append(nextUncheckedItems, newItem) + } + + if isFstNullable { + newItem, err := newLR0Item(prod, 0) + if err != nil { + return nil, err + } + if _, exist := knownItemsProp[newItem.id]; exist { + continue + } + + newItem.lookAhead.propagation = true + + items = append(items, newItem) + knownItemsProp[newItem.id] = struct{}{} + nextUncheckedItems = append(nextUncheckedItems, newItem) + } + } + } + uncheckedItems = nextUncheckedItems + } + + return items, nil +} + +func propagateLookAhead(lr0 *lr0Automaton, props []*propagation) error { + for { + changed := false + for _, prop := range props { + srcState, ok := lr0.states[prop.src.kernelID] + if !ok { + return fmt.Errorf("source state not found: %v", prop.src.kernelID) + } + var srcItem *lrItem + for _, item := range srcState.items { + if item.id != prop.src.itemID { + continue + } + srcItem = item + break + } + if srcItem == nil { + return fmt.Errorf("source item not found: %v", prop.src.itemID) + } + + for _, dest := range prop.dest { + destState, ok := lr0.states[dest.kernelID] + if !ok { + return fmt.Errorf("destination state not found: %v", dest.kernelID) + } + var destItem *lrItem + for _, item := range destState.items { + if item.id != dest.itemID { + continue + } + destItem = item + break + } + if destItem == nil { + for _, item := range destState.emptyProdItems { + if item.id != dest.itemID { + continue + } + destItem = item + break + } + if destItem == nil { + 
return fmt.Errorf("destination item not found: %v", dest.itemID) + } + } + + for a := range srcItem.lookAhead.symbols { + if _, ok := destItem.lookAhead.symbols[a]; ok { + continue + } + + if destItem.lookAhead.symbols == nil { + destItem.lookAhead.symbols = map[symbol.Symbol]struct{}{} + } + + destItem.lookAhead.symbols[a] = struct{}{} + changed = true + } + } + } + if !changed { + break + } + } + + return nil +} diff --git a/src/urubu/grammar/lexical/compiler.go b/src/urubu/grammar/lexical/compiler.go new file mode 100644 index 0000000..637018a --- /dev/null +++ b/src/urubu/grammar/lexical/compiler.go @@ -0,0 +1,413 @@ +package lexical + +import ( + "bytes" + "fmt" + + "urubu/compressor" + "urubu/grammar/lexical/dfa" + psr "urubu/grammar/lexical/parser" + spec "urubu/spec/grammar" +) + +type CompileError struct { + Kind spec.LexKindName + Fragment bool + Cause error + Detail string +} + +func Compile(lexspec *LexSpec, compLv int) (*spec.LexicalSpec, error, []*CompileError) { + err := lexspec.Validate() + if err != nil { + return nil, fmt.Errorf("invalid lexical specification:\n%w", err), nil + } + + modeEntries, modeNames, modeName2ID, fragmetns := groupEntriesByLexMode(lexspec.Entries) + + modeSpecs := []*spec.CompiledLexModeSpec{ + nil, + } + for i, es := range modeEntries[1:] { + modeName := modeNames[i+1] + modeSpec, err, cerrs := compile(es, modeName2ID, fragmetns, compLv) + if err != nil { + return nil, fmt.Errorf("failed to compile in %v mode: %w", modeName, err), cerrs + } + modeSpecs = append(modeSpecs, modeSpec) + } + + var kindNames []spec.LexKindName + var name2ID map[spec.LexKindName]spec.LexKindID + { + name2ID = map[spec.LexKindName]spec.LexKindID{} + id := spec.LexKindIDMin + for _, modeSpec := range modeSpecs[1:] { + for _, name := range modeSpec.KindNames[1:] { + if _, ok := name2ID[name]; ok { + continue + } + name2ID[name] = id + id++ + } + } + + kindNames = make([]spec.LexKindName, len(name2ID)+1) + for name, id := range name2ID { + kindNames[id] = name + } + } + + var kindIDs [][]spec.LexKindID + { + kindIDs = make([][]spec.LexKindID, len(modeSpecs)) + for i, modeSpec := range modeSpecs[1:] { + ids := make([]spec.LexKindID, len(modeSpec.KindNames)) + for modeID, name := range modeSpec.KindNames { + if modeID == 0 { + continue + } + ids[modeID] = name2ID[name] + } + kindIDs[i+1] = ids + } + } + + return &spec.LexicalSpec{ + InitialModeID: spec.LexModeIDDefault, + ModeNames: modeNames, + KindNames: kindNames, + KindIDs: kindIDs, + CompressionLevel: compLv, + Specs: modeSpecs, + }, nil, nil +} + +func groupEntriesByLexMode(entries []*LexEntry) ([][]*LexEntry, []spec.LexModeName, map[spec.LexModeName]spec.LexModeID, map[spec.LexKindName]*LexEntry) { + modeNames := []spec.LexModeName{ + spec.LexModeNameNil, + spec.LexModeNameDefault, + } + modeName2ID := map[spec.LexModeName]spec.LexModeID{ + spec.LexModeNameNil: spec.LexModeIDNil, + spec.LexModeNameDefault: spec.LexModeIDDefault, + } + lastModeID := spec.LexModeIDDefault + modeEntries := [][]*LexEntry{ + nil, + {}, + } + fragments := map[spec.LexKindName]*LexEntry{} + for _, e := range entries { + if e.Fragment { + fragments[e.Kind] = e + continue + } + ms := e.Modes + if len(ms) == 0 { + ms = []spec.LexModeName{ + spec.LexModeNameDefault, + } + } + for _, modeName := range ms { + modeID, ok := modeName2ID[modeName] + if !ok { + modeID = lastModeID + 1 + lastModeID = modeID + modeName2ID[modeName] = modeID + modeNames = append(modeNames, modeName) + modeEntries = append(modeEntries, []*LexEntry{}) + } + 
modeEntries[modeID] = append(modeEntries[modeID], e) + } + } + return modeEntries, modeNames, modeName2ID, fragments +} + +func compile( + entries []*LexEntry, + modeName2ID map[spec.LexModeName]spec.LexModeID, + fragments map[spec.LexKindName]*LexEntry, + compLv int, +) (*spec.CompiledLexModeSpec, error, []*CompileError) { + var kindNames []spec.LexKindName + kindIDToName := map[spec.LexModeKindID]spec.LexKindName{} + var patterns map[spec.LexModeKindID][]byte + { + kindNames = append(kindNames, spec.LexKindNameNil) + patterns = map[spec.LexModeKindID][]byte{} + for i, e := range entries { + kindID := spec.LexModeKindID(i + 1) + + kindNames = append(kindNames, e.Kind) + kindIDToName[kindID] = e.Kind + patterns[kindID] = []byte(e.Pattern) + } + } + + push := []spec.LexModeID{ + spec.LexModeIDNil, + } + pop := []int{ + 0, + } + for _, e := range entries { + pushV := spec.LexModeIDNil + if e.Push != "" { + pushV = modeName2ID[e.Push] + } + push = append(push, pushV) + popV := 0 + if e.Pop { + popV = 1 + } + pop = append(pop, popV) + } + + fragmentPatterns := map[spec.LexKindName][]byte{} + for k, e := range fragments { + fragmentPatterns[k] = []byte(e.Pattern) + } + + fragmentCPTrees := make(map[spec.LexKindName]psr.CPTree, len(fragmentPatterns)) + { + var cerrs []*CompileError + for kind, pat := range fragmentPatterns { + p := psr.NewParser(kind, bytes.NewReader(pat)) + t, err := p.Parse() + if err != nil { + if err == psr.ParseErr { + detail, cause := p.Error() + cerrs = append(cerrs, &CompileError{ + Kind: kind, + Fragment: true, + Cause: cause, + Detail: detail, + }) + } else { + cerrs = append(cerrs, &CompileError{ + Kind: kind, + Fragment: true, + Cause: err, + }) + } + continue + } + fragmentCPTrees[kind] = t + } + if len(cerrs) > 0 { + return nil, fmt.Errorf("compile error"), cerrs + } + + err := psr.CompleteFragments(fragmentCPTrees) + if err != nil { + if err == psr.ParseErr { + for _, frag := range fragmentCPTrees { + kind, frags, err := frag.Describe() + if err != nil { + return nil, err, nil + } + + cerrs = append(cerrs, &CompileError{ + Kind: kind, + Fragment: true, + Cause: fmt.Errorf("fragment contains undefined fragments or cycles"), + Detail: fmt.Sprintf("%v", frags), + }) + } + + return nil, fmt.Errorf("compile error"), cerrs + } + + return nil, err, nil + } + } + + cpTrees := map[spec.LexModeKindID]psr.CPTree{} + { + pats := make([]*psr.PatternEntry, len(patterns)+1) + pats[spec.LexModeKindIDNil] = &psr.PatternEntry{ + ID: spec.LexModeKindIDNil, + } + for id, pattern := range patterns { + pats[id] = &psr.PatternEntry{ + ID: id, + Pattern: pattern, + } + } + + var cerrs []*CompileError + for _, pat := range pats { + if pat.ID == spec.LexModeKindIDNil { + continue + } + + p := psr.NewParser(kindIDToName[pat.ID], bytes.NewReader(pat.Pattern)) + t, err := p.Parse() + if err != nil { + if err == psr.ParseErr { + detail, cause := p.Error() + cerrs = append(cerrs, &CompileError{ + Kind: kindIDToName[pat.ID], + Fragment: false, + Cause: cause, + Detail: detail, + }) + } else { + cerrs = append(cerrs, &CompileError{ + Kind: kindIDToName[pat.ID], + Fragment: false, + Cause: err, + }) + } + continue + } + + complete, err := psr.ApplyFragments(t, fragmentCPTrees) + if err != nil { + return nil, err, nil + } + if !complete { + _, frags, err := t.Describe() + if err != nil { + return nil, err, nil + } + + cerrs = append(cerrs, &CompileError{ + Kind: kindIDToName[pat.ID], + Fragment: false, + Cause: fmt.Errorf("pattern contains undefined fragments"), + Detail: fmt.Sprintf("%v", frags), 
+ }) + continue + } + + cpTrees[pat.ID] = t + } + if len(cerrs) > 0 { + return nil, fmt.Errorf("compile error"), cerrs + } + } + + var tranTab *spec.TransitionTable + { + root, symTab, err := dfa.ConvertCPTreeToByteTree(cpTrees) + if err != nil { + return nil, err, nil + } + d := dfa.GenDFA(root, symTab) + tranTab, err = dfa.GenTransitionTable(d) + if err != nil { + return nil, err, nil + } + } + + var err error + switch compLv { + case 2: + tranTab, err = compressTransitionTableLv2(tranTab) + if err != nil { + return nil, err, nil + } + case 1: + tranTab, err = compressTransitionTableLv1(tranTab) + if err != nil { + return nil, err, nil + } + } + + return &spec.CompiledLexModeSpec{ + KindNames: kindNames, + Push: push, + Pop: pop, + DFA: tranTab, + }, nil, nil +} + +const ( + CompressionLevelMin = 0 + CompressionLevelMax = 2 +) + +func compressTransitionTableLv2(tranTab *spec.TransitionTable) (*spec.TransitionTable, error) { + ueTab := compressor.NewUniqueEntriesTable() + { + orig, err := compressor.NewOriginalTable(convertStateIDSliceToIntSlice(tranTab.UncompressedTransition), tranTab.ColCount) + if err != nil { + return nil, err + } + err = ueTab.Compress(orig) + if err != nil { + return nil, err + } + } + + rdTab := compressor.NewRowDisplacementTable(0) + { + orig, err := compressor.NewOriginalTable(ueTab.UniqueEntries, ueTab.OriginalColCount) + if err != nil { + return nil, err + } + err = rdTab.Compress(orig) + if err != nil { + return nil, err + } + } + + tranTab.Transition = &spec.UniqueEntriesTable{ + UniqueEntries: &spec.RowDisplacementTable{ + OriginalRowCount: rdTab.OriginalRowCount, + OriginalColCount: rdTab.OriginalColCount, + EmptyValue: spec.StateIDNil, + Entries: convertIntSliceToStateIDSlice(rdTab.Entries), + Bounds: rdTab.Bounds, + RowDisplacement: rdTab.RowDisplacement, + }, + RowNums: ueTab.RowNums, + OriginalRowCount: ueTab.OriginalRowCount, + OriginalColCount: ueTab.OriginalColCount, + } + tranTab.UncompressedTransition = nil + + return tranTab, nil +} + +func compressTransitionTableLv1(tranTab *spec.TransitionTable) (*spec.TransitionTable, error) { + ueTab := compressor.NewUniqueEntriesTable() + { + orig, err := compressor.NewOriginalTable(convertStateIDSliceToIntSlice(tranTab.UncompressedTransition), tranTab.ColCount) + if err != nil { + return nil, err + } + err = ueTab.Compress(orig) + if err != nil { + return nil, err + } + } + + tranTab.Transition = &spec.UniqueEntriesTable{ + UncompressedUniqueEntries: convertIntSliceToStateIDSlice(ueTab.UniqueEntries), + RowNums: ueTab.RowNums, + OriginalRowCount: ueTab.OriginalRowCount, + OriginalColCount: ueTab.OriginalColCount, + } + tranTab.UncompressedTransition = nil + + return tranTab, nil +} + +func convertStateIDSliceToIntSlice(s []spec.StateID) []int { + is := make([]int, len(s)) + for i, v := range s { + is[i] = v.Int() + } + return is +} + +func convertIntSliceToStateIDSlice(s []int) []spec.StateID { + ss := make([]spec.StateID, len(s)) + for i, v := range s { + ss[i] = spec.StateID(v) + } + return ss +} diff --git a/src/urubu/grammar/lexical/dfa/dfa.go b/src/urubu/grammar/lexical/dfa/dfa.go new file mode 100644 index 0000000..48bd8b4 --- /dev/null +++ b/src/urubu/grammar/lexical/dfa/dfa.go @@ -0,0 +1,173 @@ +package dfa + +import ( + "sort" + + spec "urubu/spec/grammar" +) + +type symbolTable struct { + symPos2Byte map[symbolPosition]byteRange + endPos2ID map[symbolPosition]spec.LexModeKindID +} + +func genSymbolTable(root byteTree) *symbolTable { + symTab := &symbolTable{ + symPos2Byte: 
map[symbolPosition]byteRange{}, + endPos2ID: map[symbolPosition]spec.LexModeKindID{}, + } + return genSymTab(symTab, root) +} + +func genSymTab(symTab *symbolTable, node byteTree) *symbolTable { + if node == nil { + return symTab + } + + switch n := node.(type) { + case *symbolNode: + symTab.symPos2Byte[n.pos] = byteRange{ + from: n.from, + to: n.to, + } + case *endMarkerNode: + symTab.endPos2ID[n.pos] = n.id + default: + left, right := node.children() + genSymTab(symTab, left) + genSymTab(symTab, right) + } + return symTab +} + +type DFA struct { + States []string + InitialState string + AcceptingStatesTable map[string]spec.LexModeKindID + TransitionTable map[string][256]string +} + +func GenDFA(root byteTree, symTab *symbolTable) *DFA { + initialState := root.first() + initialStateHash := initialState.hash() + stateMap := map[string]*symbolPositionSet{ + initialStateHash: initialState, + } + tranTab := map[string][256]string{} + { + follow := genFollowTable(root) + unmarkedStates := map[string]*symbolPositionSet{ + initialStateHash: initialState, + } + for len(unmarkedStates) > 0 { + nextUnmarkedStates := map[string]*symbolPositionSet{} + for hash, state := range unmarkedStates { + tranTabOfState := [256]*symbolPositionSet{} + for _, pos := range state.set() { + if pos.isEndMark() { + continue + } + valRange := symTab.symPos2Byte[pos] + for symVal := valRange.from; symVal <= valRange.to; symVal++ { + if tranTabOfState[symVal] == nil { + tranTabOfState[symVal] = newSymbolPositionSet() + } + tranTabOfState[symVal].merge(follow[pos]) + } + } + for _, t := range tranTabOfState { + if t == nil { + continue + } + h := t.hash() + if _, ok := stateMap[h]; ok { + continue + } + stateMap[h] = t + nextUnmarkedStates[h] = t + } + tabOfState := [256]string{} + for v, t := range tranTabOfState { + if t == nil { + continue + } + tabOfState[v] = t.hash() + } + tranTab[hash] = tabOfState + } + unmarkedStates = nextUnmarkedStates + } + } + + accTab := map[string]spec.LexModeKindID{} + { + for h, s := range stateMap { + for _, pos := range s.set() { + if !pos.isEndMark() { + continue + } + priorID, ok := accTab[h] + if !ok { + accTab[h] = symTab.endPos2ID[pos] + } else { + id := symTab.endPos2ID[pos] + if id < priorID { + accTab[h] = id + } + } + } + } + } + + var states []string + { + for s := range stateMap { + states = append(states, s) + } + sort.Slice(states, func(i, j int) bool { + return states[i] < states[j] + }) + } + + return &DFA{ + States: states, + InitialState: initialStateHash, + AcceptingStatesTable: accTab, + TransitionTable: tranTab, + } +} + +func GenTransitionTable(dfa *DFA) (*spec.TransitionTable, error) { + stateHash2ID := map[string]spec.StateID{} + for i, s := range dfa.States { + // Since 0 represents an invalid value in a transition table, + // assign a number greater than or equal to 1 to states. 
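+		// (Illustration, assuming spec.StateIDMin is 1: dfa.States[0] is
+		// assigned StateID 1 and dfa.States[i] gets StateID i+1, so StateID 0
+		// remains free to act as the "no transition" marker in the flattened
+		// table built below.)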
+		stateHash2ID[s] = spec.StateID(i + spec.StateIDMin.Int())
+	}
+
+	acc := make([]spec.LexModeKindID, len(dfa.States)+1)
+	for _, s := range dfa.States {
+		id, ok := dfa.AcceptingStatesTable[s]
+		if !ok {
+			continue
+		}
+		acc[stateHash2ID[s]] = id
+	}
+
+	rowCount := len(dfa.States) + 1
+	colCount := 256
+	tran := make([]spec.StateID, rowCount*colCount)
+	for s, tab := range dfa.TransitionTable {
+		for v, to := range tab {
+			tran[stateHash2ID[s].Int()*256+v] = stateHash2ID[to]
+		}
+	}
+
+	return &spec.TransitionTable{
+		InitialStateID:         stateHash2ID[dfa.InitialState],
+		AcceptingStates:        acc,
+		UncompressedTransition: tran,
+		RowCount:               rowCount,
+		ColCount:               colCount,
+	}, nil
+}
diff --git a/src/urubu/grammar/lexical/dfa/symbol_position.go b/src/urubu/grammar/lexical/dfa/symbol_position.go
new file mode 100644
index 0000000..f154251
--- /dev/null
+++ b/src/urubu/grammar/lexical/dfa/symbol_position.go
@@ -0,0 +1,182 @@
+package dfa
+
+import (
+	"encoding/binary"
+	"fmt"
+	"strings"
+)
+
+type symbolPosition uint16
+
+const (
+	symbolPositionNil symbolPosition = 0x0000
+
+	symbolPositionMin uint16 = 0x0001
+	symbolPositionMax uint16 = 0x7fff
+
+	symbolPositionMaskSymbol  uint16 = 0x0000
+	symbolPositionMaskEndMark uint16 = 0x8000
+
+	symbolPositionMaskValue uint16 = 0x7fff
+)
+
+func newSymbolPosition(n uint16, endMark bool) (symbolPosition, error) {
+	if n < symbolPositionMin || n > symbolPositionMax {
+		return symbolPositionNil, fmt.Errorf("symbol position must be within %v to %v: n: %v, endMark: %v", symbolPositionMin, symbolPositionMax, n, endMark)
+	}
+	if endMark {
+		return symbolPosition(n | symbolPositionMaskEndMark), nil
+	}
+	return symbolPosition(n | symbolPositionMaskSymbol), nil
+}
+
+func (p symbolPosition) String() string {
+	if p.isEndMark() {
+		return fmt.Sprintf("end#%v", uint16(p)&symbolPositionMaskValue)
+	}
+	return fmt.Sprintf("sym#%v", uint16(p)&symbolPositionMaskValue)
+}
+
+func (p symbolPosition) isEndMark() bool {
+	return uint16(p)&symbolPositionMaskEndMark != 0
+}
+
+func (p symbolPosition) describe() (uint16, bool) {
+	v := uint16(p) & symbolPositionMaskValue
+	if p.isEndMark() {
+		return v, true
+	}
+	return v, false
+}
+
+type symbolPositionSet struct {
+	// `s` represents a set of symbol positions.
+	// However, immediately after a symbol position is added, the elements may contain duplicates.
+	// When you need a sorted set with no duplicates, obtain it via the set method.
+	s      []symbolPosition
+	sorted bool
+}
+
+func newSymbolPositionSet() *symbolPositionSet {
+	return &symbolPositionSet{
+		s:      []symbolPosition{},
+		sorted: false,
+	}
+}
+
+func (s *symbolPositionSet) String() string {
+	if len(s.s) <= 0 {
+		return "{}"
+	}
+	ps := s.sortAndRemoveDuplicates()
+	var b strings.Builder
+	fmt.Fprintf(&b, "{")
+	for i, p := range ps {
+		if i <= 0 {
+			fmt.Fprintf(&b, "%v", p)
+			continue
+		}
+		fmt.Fprintf(&b, ", %v", p)
+	}
+	fmt.Fprintf(&b, "}")
+	return b.String()
+}
+
+func (s *symbolPositionSet) set() []symbolPosition {
+	s.sortAndRemoveDuplicates()
+	return s.s
+}
+
+func (s *symbolPositionSet) add(pos symbolPosition) *symbolPositionSet {
+	s.s = append(s.s, pos)
+	s.sorted = false
+	return s
+}
+
+func (s *symbolPositionSet) merge(t *symbolPositionSet) *symbolPositionSet {
+	s.s = append(s.s, t.s...)
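+	// NOTE: the elements appended above may duplicate existing ones. Resetting
+	// the sorted flag below defers deduplication until set() or hash() is next
+	// called, which keeps repeated merges cheap during subset construction.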
+	s.sorted = false
+	return s
+}
+
+func (s *symbolPositionSet) hash() string {
+	if len(s.s) <= 0 {
+		return ""
+	}
+	sorted := s.sortAndRemoveDuplicates()
+	var buf []byte
+	for _, p := range sorted {
+		b := make([]byte, 8)
+		binary.PutUvarint(b, uint64(p))
+		buf = append(buf, b...)
+	}
+	// Convert to a string to be able to use it as a key of a map.
+	// Note that this byte sequence is built from the values of symbol positions,
+	// so it is not necessarily a well-formed UTF-8 sequence.
+	return string(buf)
+}
+
+func (s *symbolPositionSet) sortAndRemoveDuplicates() []symbolPosition {
+	if s.sorted {
+		return s.s
+	}
+
+	sortSymbolPositions(s.s, 0, len(s.s)-1)
+
+	// Remove duplicates.
+	lastV := s.s[0]
+	nextIdx := 1
+	for _, v := range s.s[1:] {
+		if v == lastV {
+			continue
+		}
+		s.s[nextIdx] = v
+		nextIdx++
+		lastV = v
+	}
+	s.s = s.s[:nextIdx]
+	s.sorted = true
+
+	return s.s
+}
+
+// sortSymbolPositions sorts a slice of symbol positions in place using quicksort.
+func sortSymbolPositions(ps []symbolPosition, left, right int) {
+	if left >= right {
+		return
+	}
+	var pivot symbolPosition
+	{
+		// Use the median of three elements as a pivot.
+		p1 := ps[left]
+		p2 := ps[(left+right)/2]
+		p3 := ps[right]
+		if p1 > p2 {
+			p1, p2 = p2, p1
+		}
+		if p2 > p3 {
+			p2 = p3
+			if p1 > p2 {
+				p2 = p1
+			}
+		}
+		pivot = p2
+	}
+	i := left
+	j := right
+	for i <= j {
+		for ps[i] < pivot {
+			i++
+		}
+		for ps[j] > pivot {
+			j--
+		}
+		if i <= j {
+			ps[i], ps[j] = ps[j], ps[i]
+			i++
+			j--
+		}
+	}
+	sortSymbolPositions(ps, left, j)
+	sortSymbolPositions(ps, i, right)
+}
diff --git a/src/urubu/grammar/lexical/dfa/tree.go b/src/urubu/grammar/lexical/dfa/tree.go
new file mode 100644
index 0000000..8a11aee
--- /dev/null
+++ b/src/urubu/grammar/lexical/dfa/tree.go
@@ -0,0 +1,567 @@
+package dfa
+
+import (
+	"fmt"
+	"io"
+	"sort"
+
+	"urubu/grammar/lexical/parser"
+	spec "urubu/spec/grammar"
+	"urubu/utf8"
+)
+
+type byteTree interface {
+	fmt.Stringer
+	children() (byteTree, byteTree)
+	nullable() bool
+	first() *symbolPositionSet
+	last() *symbolPositionSet
+	clone() byteTree
+}
+
+var (
+	_ byteTree = &symbolNode{}
+	_ byteTree = &endMarkerNode{}
+	_ byteTree = &concatNode{}
+	_ byteTree = &altNode{}
+	_ byteTree = &repeatNode{}
+	_ byteTree = &optionNode{}
+)
+
+type byteRange struct {
+	from byte
+	to   byte
+}
+
+type symbolNode struct {
+	byteRange
+	pos       symbolPosition
+	firstMemo *symbolPositionSet
+	lastMemo  *symbolPositionSet
+}
+
+func newSymbolNode(value byte) *symbolNode {
+	return &symbolNode{
+		byteRange: byteRange{
+			from: value,
+			to:   value,
+		},
+		pos: symbolPositionNil,
+	}
+}
+
+func newRangeSymbolNode(from, to byte) *symbolNode {
+	return &symbolNode{
+		byteRange: byteRange{
+			from: from,
+			to:   to,
+		},
+		pos: symbolPositionNil,
+	}
+}
+
+func (n *symbolNode) String() string {
+	return fmt.Sprintf("symbol: value: %v-%v, pos: %v", n.from, n.to, n.pos)
+}
+
+func (n *symbolNode) children() (byteTree, byteTree) {
+	return nil, nil
+}
+
+func (n *symbolNode) nullable() bool {
+	return false
+}
+
+func (n *symbolNode) first() *symbolPositionSet {
+	if n.firstMemo == nil {
+		n.firstMemo = newSymbolPositionSet()
+		n.firstMemo.add(n.pos)
+	}
+	return n.firstMemo
+}
+
+func (n *symbolNode) last() *symbolPositionSet {
+	if n.lastMemo == nil {
+		n.lastMemo = newSymbolPositionSet()
+		n.lastMemo.add(n.pos)
+	}
+	return n.lastMemo
+}
+
+func (n *symbolNode) clone() byteTree {
+	return newRangeSymbolNode(n.from, n.to)
+}
+
+type endMarkerNode struct {
+	id        spec.LexModeKindID
+	pos       symbolPosition
+	firstMemo *symbolPositionSet
+	lastMemo
*symbolPositionSet +} + +func newEndMarkerNode(id spec.LexModeKindID) *endMarkerNode { + return &endMarkerNode{ + id: id, + pos: symbolPositionNil, + } +} + +func (n *endMarkerNode) String() string { + return fmt.Sprintf("end: pos: %v", n.pos) +} + +func (n *endMarkerNode) children() (byteTree, byteTree) { + return nil, nil +} + +func (n *endMarkerNode) nullable() bool { + return false +} + +func (n *endMarkerNode) first() *symbolPositionSet { + if n.firstMemo == nil { + n.firstMemo = newSymbolPositionSet() + n.firstMemo.add(n.pos) + } + return n.firstMemo +} + +func (n *endMarkerNode) last() *symbolPositionSet { + if n.lastMemo == nil { + n.lastMemo = newSymbolPositionSet() + n.lastMemo.add(n.pos) + } + return n.lastMemo +} + +func (n *endMarkerNode) clone() byteTree { + return newEndMarkerNode(n.id) +} + +type concatNode struct { + left byteTree + right byteTree + firstMemo *symbolPositionSet + lastMemo *symbolPositionSet +} + +func newConcatNode(left, right byteTree) *concatNode { + return &concatNode{ + left: left, + right: right, + } +} + +func (n *concatNode) String() string { + return "concat" +} + +func (n *concatNode) children() (byteTree, byteTree) { + return n.left, n.right +} + +func (n *concatNode) nullable() bool { + return n.left.nullable() && n.right.nullable() +} + +func (n *concatNode) first() *symbolPositionSet { + if n.firstMemo == nil { + n.firstMemo = newSymbolPositionSet() + n.firstMemo.merge(n.left.first()) + if n.left.nullable() { + n.firstMemo.merge(n.right.first()) + } + n.firstMemo.sortAndRemoveDuplicates() + } + return n.firstMemo +} + +func (n *concatNode) last() *symbolPositionSet { + if n.lastMemo == nil { + n.lastMemo = newSymbolPositionSet() + n.lastMemo.merge(n.right.last()) + if n.right.nullable() { + n.lastMemo.merge(n.left.last()) + } + n.lastMemo.sortAndRemoveDuplicates() + } + return n.lastMemo +} + +func (n *concatNode) clone() byteTree { + return newConcatNode(n.left.clone(), n.right.clone()) +} + +type altNode struct { + left byteTree + right byteTree + firstMemo *symbolPositionSet + lastMemo *symbolPositionSet +} + +func newAltNode(left, right byteTree) *altNode { + return &altNode{ + left: left, + right: right, + } +} + +func (n *altNode) String() string { + return "alt" +} + +func (n *altNode) children() (byteTree, byteTree) { + return n.left, n.right +} + +func (n *altNode) nullable() bool { + return n.left.nullable() || n.right.nullable() +} + +func (n *altNode) first() *symbolPositionSet { + if n.firstMemo == nil { + n.firstMemo = newSymbolPositionSet() + n.firstMemo.merge(n.left.first()) + n.firstMemo.merge(n.right.first()) + n.firstMemo.sortAndRemoveDuplicates() + } + return n.firstMemo +} + +func (n *altNode) last() *symbolPositionSet { + if n.lastMemo == nil { + n.lastMemo = newSymbolPositionSet() + n.lastMemo.merge(n.left.last()) + n.lastMemo.merge(n.right.last()) + n.lastMemo.sortAndRemoveDuplicates() + } + return n.lastMemo +} + +func (n *altNode) clone() byteTree { + return newAltNode(n.left.clone(), n.right.clone()) +} + +type repeatNode struct { + left byteTree + firstMemo *symbolPositionSet + lastMemo *symbolPositionSet +} + +func newRepeatNode(left byteTree) *repeatNode { + return &repeatNode{ + left: left, + } +} + +func (n *repeatNode) String() string { + return "repeat" +} + +func (n *repeatNode) children() (byteTree, byteTree) { + return n.left, nil +} + +func (n *repeatNode) nullable() bool { + return true +} + +func (n *repeatNode) first() *symbolPositionSet { + if n.firstMemo == nil { + n.firstMemo = newSymbolPositionSet() 
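+		// In the position-automaton construction, a repetition r* is nullable
+		// and firstpos(r*) = firstpos(r): the expression can start anywhere
+		// its body can start. last() below is symmetric with lastpos(r*).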
+ n.firstMemo.merge(n.left.first()) + n.firstMemo.sortAndRemoveDuplicates() + } + return n.firstMemo +} + +func (n *repeatNode) last() *symbolPositionSet { + if n.lastMemo == nil { + n.lastMemo = newSymbolPositionSet() + n.lastMemo.merge(n.left.last()) + n.lastMemo.sortAndRemoveDuplicates() + } + return n.lastMemo +} + +func (n *repeatNode) clone() byteTree { + return newRepeatNode(n.left.clone()) +} + +type optionNode struct { + left byteTree + firstMemo *symbolPositionSet + lastMemo *symbolPositionSet +} + +func newOptionNode(left byteTree) *optionNode { + return &optionNode{ + left: left, + } +} + +func (n *optionNode) String() string { + return "option" +} + +func (n *optionNode) children() (byteTree, byteTree) { + return n.left, nil +} + +func (n *optionNode) nullable() bool { + return true +} + +func (n *optionNode) first() *symbolPositionSet { + if n.firstMemo == nil { + n.firstMemo = newSymbolPositionSet() + n.firstMemo.merge(n.left.first()) + n.firstMemo.sortAndRemoveDuplicates() + } + return n.firstMemo +} + +func (n *optionNode) last() *symbolPositionSet { + if n.lastMemo == nil { + n.lastMemo = newSymbolPositionSet() + n.lastMemo.merge(n.left.last()) + n.lastMemo.sortAndRemoveDuplicates() + } + return n.lastMemo +} + +func (n *optionNode) clone() byteTree { + return newOptionNode(n.left.clone()) +} + +type followTable map[symbolPosition]*symbolPositionSet + +func genFollowTable(root byteTree) followTable { + follow := followTable{} + calcFollow(follow, root) + return follow +} + +func calcFollow(follow followTable, ast byteTree) { + if ast == nil { + return + } + left, right := ast.children() + calcFollow(follow, left) + calcFollow(follow, right) + switch n := ast.(type) { + case *concatNode: + l, r := n.children() + for _, p := range l.last().set() { + if _, ok := follow[p]; !ok { + follow[p] = newSymbolPositionSet() + } + follow[p].merge(r.first()) + } + case *repeatNode: + for _, p := range n.last().set() { + if _, ok := follow[p]; !ok { + follow[p] = newSymbolPositionSet() + } + follow[p].merge(n.first()) + } + } +} + +func positionSymbols(node byteTree, n uint16) (uint16, error) { + if node == nil { + return n, nil + } + + l, r := node.children() + p := n + p, err := positionSymbols(l, p) + if err != nil { + return p, err + } + p, err = positionSymbols(r, p) + if err != nil { + return p, err + } + switch n := node.(type) { + case *symbolNode: + n.pos, err = newSymbolPosition(p, false) + if err != nil { + return p, err + } + p++ + case *endMarkerNode: + n.pos, err = newSymbolPosition(p, true) + if err != nil { + return p, err + } + p++ + } + node.first() + node.last() + return p, nil +} + +func concat(ts ...byteTree) byteTree { + nonNilNodes := []byteTree{} + for _, t := range ts { + if t == nil { + continue + } + nonNilNodes = append(nonNilNodes, t) + } + if len(nonNilNodes) <= 0 { + return nil + } + if len(nonNilNodes) == 1 { + return nonNilNodes[0] + } + concat := newConcatNode(nonNilNodes[0], nonNilNodes[1]) + for _, t := range nonNilNodes[2:] { + concat = newConcatNode(concat, t) + } + return concat +} + +func oneOf(ts ...byteTree) byteTree { + nonNilNodes := []byteTree{} + for _, t := range ts { + if t == nil { + continue + } + nonNilNodes = append(nonNilNodes, t) + } + if len(nonNilNodes) <= 0 { + return nil + } + if len(nonNilNodes) == 1 { + return nonNilNodes[0] + } + alt := newAltNode(nonNilNodes[0], nonNilNodes[1]) + for _, t := range nonNilNodes[2:] { + alt = newAltNode(alt, t) + } + return alt +} + +//nolint:unused +func printByteTree(w io.Writer, t byteTree, 
ruledLine string, childRuledLinePrefix string, withAttrs bool) { + if t == nil { + return + } + fmt.Fprintf(w, "%v%v", ruledLine, t) + if withAttrs { + fmt.Fprintf(w, ", nullable: %v, first: %v, last: %v", t.nullable(), t.first(), t.last()) + } + fmt.Fprintf(w, "\n") + left, right := t.children() + children := []byteTree{} + if left != nil { + children = append(children, left) + } + if right != nil { + children = append(children, right) + } + num := len(children) + for i, child := range children { + line := "└─ " + if num > 1 { + if i == 0 { + line = "├─ " + } else if i < num-1 { + line = "│ " + } + } + prefix := "│ " + if i >= num-1 { + prefix = " " + } + printByteTree(w, child, childRuledLinePrefix+line, childRuledLinePrefix+prefix, withAttrs) + } +} + +func ConvertCPTreeToByteTree(cpTrees map[spec.LexModeKindID]parser.CPTree) (byteTree, *symbolTable, error) { + var ids []spec.LexModeKindID + for id := range cpTrees { + ids = append(ids, id) + } + sort.Slice(ids, func(i, j int) bool { + return ids[i] < ids[j] + }) + + var bt byteTree + for _, id := range ids { + cpTree := cpTrees[id] + t, err := convCPTreeToByteTree(cpTree) + if err != nil { + return nil, nil, err + } + bt = oneOf(bt, concat(t, newEndMarkerNode(id))) + } + _, err := positionSymbols(bt, symbolPositionMin) + if err != nil { + return nil, nil, err + } + + return bt, genSymbolTable(bt), nil +} + +func convCPTreeToByteTree(cpTree parser.CPTree) (byteTree, error) { + if from, to, ok := cpTree.Range(); ok { + bs, err := utf8.GenCharBlocks(from, to) + if err != nil { + return nil, err + } + var a byteTree + for _, b := range bs { + var c byteTree + for i := 0; i < len(b.From); i++ { + c = concat(c, newRangeSymbolNode(b.From[i], b.To[i])) + } + a = oneOf(a, c) + } + return a, nil + } + + if tree, ok := cpTree.Repeatable(); ok { + t, err := convCPTreeToByteTree(tree) + if err != nil { + return nil, err + } + return newRepeatNode(t), nil + } + + if tree, ok := cpTree.Optional(); ok { + t, err := convCPTreeToByteTree(tree) + if err != nil { + return nil, err + } + return newOptionNode(t), nil + } + + if left, right, ok := cpTree.Concatenation(); ok { + l, err := convCPTreeToByteTree(left) + if err != nil { + return nil, err + } + r, err := convCPTreeToByteTree(right) + if err != nil { + return nil, err + } + return newConcatNode(l, r), nil + } + + if left, right, ok := cpTree.Alternatives(); ok { + l, err := convCPTreeToByteTree(left) + if err != nil { + return nil, err + } + r, err := convCPTreeToByteTree(right) + if err != nil { + return nil, err + } + return newAltNode(l, r), nil + } + + return nil, fmt.Errorf("invalid tree type: %T", cpTree) +} diff --git a/src/urubu/grammar/lexical/entry.go b/src/urubu/grammar/lexical/entry.go new file mode 100644 index 0000000..44af8ea --- /dev/null +++ b/src/urubu/grammar/lexical/entry.go @@ -0,0 +1,171 @@ +package lexical + +import ( + "fmt" + "sort" + "strings" + + spec "urubu/spec/grammar" +) + +type LexEntry struct { + Kind spec.LexKindName + Pattern string + Modes []spec.LexModeName + Push spec.LexModeName + Pop bool + Fragment bool +} + +type LexSpec struct { + Entries []*LexEntry +} + +func (s *LexSpec) Validate() error { + if len(s.Entries) <= 0 { + return fmt.Errorf("the lexical specification must have at least one entry") + } + { + ks := map[string]struct{}{} + fks := map[string]struct{}{} + for _, e := range s.Entries { + // Allow duplicate names between fragments and non-fragments. 
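+			// (For example, a fragment named `digit` and a non-fragment kind
+			// named `digit` may coexist, but two non-fragment kinds named
+			// `digit` are rejected.)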
+			if e.Fragment {
+				if _, exist := fks[e.Kind.String()]; exist {
+					return fmt.Errorf("the kind `%v` is duplicated", e.Kind)
+				}
+				fks[e.Kind.String()] = struct{}{}
+			} else {
+				if _, exist := ks[e.Kind.String()]; exist {
+					return fmt.Errorf("the kind `%v` is duplicated", e.Kind)
+				}
+				ks[e.Kind.String()] = struct{}{}
+			}
+		}
+	}
+	{
+		kinds := []string{}
+		modes := []string{
+			spec.LexModeNameDefault.String(), // This is a predefined mode.
+		}
+		for _, e := range s.Entries {
+			if e.Fragment {
+				continue
+			}
+
+			kinds = append(kinds, e.Kind.String())
+
+			for _, m := range e.Modes {
+				modes = append(modes, m.String())
+			}
+		}
+
+		kindErrs := findSpellingInconsistenciesErrors(kinds, nil)
+		modeErrs := findSpellingInconsistenciesErrors(modes, func(ids []string) error {
+			if SnakeCaseToUpperCamelCase(ids[0]) == SnakeCaseToUpperCamelCase(spec.LexModeNameDefault.String()) {
+				var b strings.Builder
+				fmt.Fprintf(&b, "%+v", ids[0])
+				for _, id := range ids[1:] {
+					fmt.Fprintf(&b, ", %+v", id)
+				}
+				return fmt.Errorf("these identifiers are treated as the same. please use the same spelling as predefined '%v': %v", spec.LexModeNameDefault, b.String())
+			}
+			return nil
+		})
+		errs := append(kindErrs, modeErrs...)
+		if len(errs) > 0 {
+			var b strings.Builder
+			fmt.Fprintf(&b, "%v", errs[0])
+			for _, err := range errs[1:] {
+				fmt.Fprintf(&b, "\n%v", err)
+			}
+			return fmt.Errorf("%s", b.String())
+		}
+	}
+
+	return nil
+}
+
+func findSpellingInconsistenciesErrors(ids []string, hook func(ids []string) error) []error {
+	duplicated := FindSpellingInconsistencies(ids)
+	if len(duplicated) == 0 {
+		return nil
+	}
+
+	var errs []error
+	for _, dup := range duplicated {
+		if hook != nil {
+			err := hook(dup)
+			if err != nil {
+				errs = append(errs, err)
+				continue
+			}
+		}
+
+		var b strings.Builder
+		fmt.Fprintf(&b, "%+v", dup[0])
+		for _, id := range dup[1:] {
+			fmt.Fprintf(&b, ", %+v", id)
+		}
+		err := fmt.Errorf("these identifiers are treated as the same. please use the same spelling: %v", b.String())
+		errs = append(errs, err)
+	}
+
+	return errs
+}
+
+// FindSpellingInconsistencies finds spelling inconsistencies in identifiers. The identifiers are considered to be the same
+// if they are spelled the same when expressed in UpperCamelCase. For example, `left_paren` and `LeftParen` are spelled the same
+// in UpperCamelCase. Thus they are considered to be a spelling inconsistency.
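+//
+// A sketch of the expected behavior:
+//
+//	FindSpellingInconsistencies([]string{"left_paren", "LeftParen", "rbrace"})
+//	// returns [][]string{{"LeftParen", "left_paren"}}; "rbrace" has no
+//	// conflicting spelling, so it does not appear in the result.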
+func FindSpellingInconsistencies(ids []string) [][]string {
+	m := map[string][]string{}
+	for _, id := range removeDuplicates(ids) {
+		c := SnakeCaseToUpperCamelCase(id)
+		m[c] = append(m[c], id)
+	}
+
+	var duplicated [][]string
+	for _, camels := range m {
+		if len(camels) == 1 {
+			continue
+		}
+		duplicated = append(duplicated, camels)
+	}
+
+	for _, dup := range duplicated {
+		sort.Slice(dup, func(i, j int) bool {
+			return dup[i] < dup[j]
+		})
+	}
+	sort.Slice(duplicated, func(i, j int) bool {
+		return duplicated[i][0] < duplicated[j][0]
+	})
+
+	return duplicated
+}
+
+func removeDuplicates(s []string) []string {
+	m := map[string]struct{}{}
+	for _, v := range s {
+		m[v] = struct{}{}
+	}
+
+	var unique []string
+	for v := range m {
+		unique = append(unique, v)
+	}
+
+	return unique
+}
+
+func SnakeCaseToUpperCamelCase(snake string) string {
+	elems := strings.Split(snake, "_")
+	for i, e := range elems {
+		if len(e) == 0 {
+			continue
+		}
+		elems[i] = strings.ToUpper(string(e[0])) + e[1:]
+	}
+
+	return strings.Join(elems, "")
+}
diff --git a/src/urubu/grammar/lexical/parser/error.go b/src/urubu/grammar/lexical/parser/error.go
new file mode 100644
index 0000000..be81da4
--- /dev/null
+++ b/src/urubu/grammar/lexical/parser/error.go
@@ -0,0 +1,36 @@
+package parser
+
+import "fmt"
+
+var (
+	ParseErr = fmt.Errorf("parse error")
+
+	// lexical errors
+	synErrIncompletedEscSeq     = fmt.Errorf("incomplete escape sequence; unexpected EOF following \\")
+	synErrInvalidEscSeq         = fmt.Errorf("invalid escape sequence")
+	synErrInvalidCodePoint      = fmt.Errorf("code points must consist of just 4 or 6 hex digits")
+	synErrCharPropInvalidSymbol = fmt.Errorf("invalid character property symbol")
+	SynErrFragmentInvalidSymbol = fmt.Errorf("invalid fragment symbol")
+
+	// syntax errors
+	synErrUnexpectedToken        = fmt.Errorf("unexpected token")
+	synErrNullPattern            = fmt.Errorf("a pattern must be a non-empty byte sequence")
+	synErrUnmatchablePattern     = fmt.Errorf("a pattern cannot match any characters")
+	synErrAltLackOfOperand       = fmt.Errorf("an alternation expression must have operands")
+	synErrRepNoTarget            = fmt.Errorf("a repeat expression must have an operand")
+	synErrGroupNoElem            = fmt.Errorf("a grouping expression must include at least one character")
+	synErrGroupUnclosed          = fmt.Errorf("unclosed grouping expression")
+	synErrGroupNoInitiator       = fmt.Errorf(") needs preceding (")
+	synErrGroupInvalidForm       = fmt.Errorf("invalid grouping expression")
+	synErrBExpNoElem             = fmt.Errorf("a bracket expression must include at least one character")
+	synErrBExpUnclosed           = fmt.Errorf("unclosed bracket expression")
+	synErrBExpInvalidForm        = fmt.Errorf("invalid bracket expression")
+	synErrRangeInvalidOrder      = fmt.Errorf("a range expression with an invalid order")
+	synErrRangePropIsUnavailable = fmt.Errorf("a property expression is unavailable in a range expression")
+	synErrRangeInvalidForm       = fmt.Errorf("invalid range expression")
+	synErrCPExpInvalidForm       = fmt.Errorf("invalid code point expression")
+	synErrCPExpOutOfRange        = fmt.Errorf("a code point must be between U+0000 and U+10FFFF")
+	synErrCharPropExpInvalidForm = fmt.Errorf("invalid character property expression")
+	synErrCharPropUnsupported    = fmt.Errorf("unsupported character property")
+	synErrFragmentExpInvalidForm = fmt.Errorf("invalid fragment expression")
+)
diff --git a/src/urubu/grammar/lexical/parser/fragment.go b/src/urubu/grammar/lexical/parser/fragment.go
new file mode 100644
index 0000000..196c00b
--- /dev/null
+++
b/src/urubu/grammar/lexical/parser/fragment.go @@ -0,0 +1,72 @@ +package parser + +import ( + "fmt" + + spec "urubu/spec/grammar" +) + +type incompleteFragment struct { + kind spec.LexKindName + root *rootNode +} + +func CompleteFragments(fragments map[spec.LexKindName]CPTree) error { + if len(fragments) == 0 { + return nil + } + + completeFragments := map[spec.LexKindName]CPTree{} + incompleteFragments := []*incompleteFragment{} + for kind, tree := range fragments { + root, ok := tree.(*rootNode) + if !ok { + return fmt.Errorf("CompleteFragments can take only *rootNode: %T", tree) + } + if root.incomplete() { + incompleteFragments = append(incompleteFragments, &incompleteFragment{ + kind: kind, + root: root, + }) + } else { + completeFragments[kind] = root + } + } + for len(incompleteFragments) > 0 { + lastIncompCount := len(incompleteFragments) + remainingFragments := []*incompleteFragment{} + for _, e := range incompleteFragments { + complete, err := ApplyFragments(e.root, completeFragments) + if err != nil { + return err + } + if !complete { + remainingFragments = append(remainingFragments, e) + } else { + completeFragments[e.kind] = e.root + } + } + incompleteFragments = remainingFragments + if len(incompleteFragments) == lastIncompCount { + return ParseErr + } + } + + return nil +} + +func ApplyFragments(t CPTree, fragments map[spec.LexKindName]CPTree) (bool, error) { + root, ok := t.(*rootNode) + if !ok { + return false, fmt.Errorf("ApplyFragments can take only *rootNode type: %T", t) + } + + for name, frag := range fragments { + err := root.applyFragment(name, frag) + if err != nil { + return false, err + } + } + + return !root.incomplete(), nil +} diff --git a/src/urubu/grammar/lexical/parser/lexer.go b/src/urubu/grammar/lexical/parser/lexer.go new file mode 100644 index 0000000..3861825 --- /dev/null +++ b/src/urubu/grammar/lexical/parser/lexer.go @@ -0,0 +1,594 @@ +package parser + +import ( + "bufio" + "fmt" + "io" + "strings" +) + +type tokenKind string + +const ( + tokenKindChar tokenKind = "char" + tokenKindAnyChar tokenKind = "." + tokenKindRepeat tokenKind = "*" + tokenKindRepeatOneOrMore tokenKind = "+" + tokenKindOption tokenKind = "?" 
+ tokenKindAlt tokenKind = "|" + tokenKindGroupOpen tokenKind = "(" + tokenKindGroupClose tokenKind = ")" + tokenKindBExpOpen tokenKind = "[" + tokenKindInverseBExpOpen tokenKind = "[^" + tokenKindBExpClose tokenKind = "]" + tokenKindCharRange tokenKind = "-" + tokenKindCodePointLeader tokenKind = "\\u" + tokenKindCharPropLeader tokenKind = "\\p" + tokenKindFragmentLeader tokenKind = "\\f" + tokenKindLBrace tokenKind = "{" + tokenKindRBrace tokenKind = "}" + tokenKindEqual tokenKind = "=" + tokenKindCodePoint tokenKind = "code point" + tokenKindCharPropSymbol tokenKind = "character property symbol" + tokenKindFragmentSymbol tokenKind = "fragment symbol" + tokenKindEOF tokenKind = "eof" +) + +type token struct { + kind tokenKind + char rune + propSymbol string + codePoint string + fragmentSymbol string +} + +const nullChar = '\u0000' + +func newToken(kind tokenKind, char rune) *token { + return &token{ + kind: kind, + char: char, + } +} + +func newCodePointToken(codePoint string) *token { + return &token{ + kind: tokenKindCodePoint, + codePoint: codePoint, + } +} + +func newCharPropSymbolToken(propSymbol string) *token { + return &token{ + kind: tokenKindCharPropSymbol, + propSymbol: propSymbol, + } +} + +func newFragmentSymbolToken(fragmentSymbol string) *token { + return &token{ + kind: tokenKindFragmentSymbol, + fragmentSymbol: fragmentSymbol, + } +} + +type lexerMode string + +const ( + lexerModeDefault lexerMode = "default" + lexerModeBExp lexerMode = "bracket expression" + lexerModeCPExp lexerMode = "code point expression" + lexerModeCharPropExp lexerMode = "character property expression" + lexerModeFragmentExp lexerMode = "fragment expression" +) + +type lexerModeStack struct { + stack []lexerMode +} + +func newLexerModeStack() *lexerModeStack { + return &lexerModeStack{ + stack: []lexerMode{ + lexerModeDefault, + }, + } +} + +func (s *lexerModeStack) top() lexerMode { + return s.stack[len(s.stack)-1] +} + +func (s *lexerModeStack) push(m lexerMode) { + s.stack = append(s.stack, m) +} + +func (s *lexerModeStack) pop() { + s.stack = s.stack[:len(s.stack)-1] +} + +type rangeState string + +// [a-z] +// ^^^^ +// |||`-- ready +// ||`-- expect range terminator +// |`-- read range initiator +// `-- ready +const ( + rangeStateReady rangeState = "ready" + rangeStateReadRangeInitiator rangeState = "read range initiator" + rangeStateExpectRangeTerminator rangeState = "expect range terminator" +) + +type lexer struct { + src *bufio.Reader + peekChar2 rune + peekEOF2 bool + peekChar1 rune + peekEOF1 bool + lastChar rune + reachedEOF bool + prevChar1 rune + prevEOF1 bool + prevChar2 rune + pervEOF2 bool + modeStack *lexerModeStack + rangeState rangeState + + errCause error + errDetail string +} + +func newLexer(src io.Reader) *lexer { + return &lexer{ + src: bufio.NewReader(src), + peekChar2: nullChar, + peekEOF2: false, + peekChar1: nullChar, + peekEOF1: false, + lastChar: nullChar, + reachedEOF: false, + prevChar1: nullChar, + prevEOF1: false, + prevChar2: nullChar, + pervEOF2: false, + modeStack: newLexerModeStack(), + rangeState: rangeStateReady, + } +} + +func (l *lexer) error() (string, error) { + return l.errDetail, l.errCause +} + +func (l *lexer) next() (*token, error) { + c, eof, err := l.read() + if err != nil { + return nil, err + } + if eof { + return newToken(tokenKindEOF, nullChar), nil + } + + switch l.modeStack.top() { + case lexerModeBExp: + tok, err := l.nextInBExp(c) + if err != nil { + return nil, err + } + if tok.kind == tokenKindChar || tok.kind == tokenKindCodePointLeader 
|| tok.kind == tokenKindCharPropLeader { + switch l.rangeState { + case rangeStateReady: + l.rangeState = rangeStateReadRangeInitiator + case rangeStateExpectRangeTerminator: + l.rangeState = rangeStateReady + } + } + switch tok.kind { + case tokenKindBExpClose: + l.modeStack.pop() + case tokenKindCharRange: + l.rangeState = rangeStateExpectRangeTerminator + case tokenKindCodePointLeader: + l.modeStack.push(lexerModeCPExp) + case tokenKindCharPropLeader: + l.modeStack.push(lexerModeCharPropExp) + } + return tok, nil + case lexerModeCPExp: + tok, err := l.nextInCodePoint(c) + if err != nil { + return nil, err + } + switch tok.kind { + case tokenKindRBrace: + l.modeStack.pop() + } + return tok, nil + case lexerModeCharPropExp: + tok, err := l.nextInCharProp(c) + if err != nil { + return nil, err + } + switch tok.kind { + case tokenKindRBrace: + l.modeStack.pop() + } + return tok, nil + case lexerModeFragmentExp: + tok, err := l.nextInFragment(c) + if err != nil { + return nil, err + } + switch tok.kind { + case tokenKindRBrace: + l.modeStack.pop() + } + return tok, nil + default: + tok, err := l.nextInDefault(c) + if err != nil { + return nil, err + } + switch tok.kind { + case tokenKindBExpOpen: + l.modeStack.push(lexerModeBExp) + l.rangeState = rangeStateReady + case tokenKindInverseBExpOpen: + l.modeStack.push(lexerModeBExp) + l.rangeState = rangeStateReady + case tokenKindCodePointLeader: + l.modeStack.push(lexerModeCPExp) + case tokenKindCharPropLeader: + l.modeStack.push(lexerModeCharPropExp) + case tokenKindFragmentLeader: + l.modeStack.push(lexerModeFragmentExp) + } + return tok, nil + } +} + +func (l *lexer) nextInDefault(c rune) (*token, error) { + switch c { + case '*': + return newToken(tokenKindRepeat, nullChar), nil + case '+': + return newToken(tokenKindRepeatOneOrMore, nullChar), nil + case '?': + return newToken(tokenKindOption, nullChar), nil + case '.': + return newToken(tokenKindAnyChar, nullChar), nil + case '|': + return newToken(tokenKindAlt, nullChar), nil + case '(': + return newToken(tokenKindGroupOpen, nullChar), nil + case ')': + return newToken(tokenKindGroupClose, nullChar), nil + case '[': + c1, eof, err := l.read() + if err != nil { + return nil, err + } + if eof { + err := l.restore() + if err != nil { + return nil, err + } + return newToken(tokenKindBExpOpen, nullChar), nil + } + if c1 != '^' { + err := l.restore() + if err != nil { + return nil, err + } + return newToken(tokenKindBExpOpen, nullChar), nil + } + c2, eof, err := l.read() + if err != nil { + return nil, err + } + if eof { + err := l.restore() + if err != nil { + return nil, err + } + return newToken(tokenKindInverseBExpOpen, nullChar), nil + } + if c2 != ']' { + err := l.restore() + if err != nil { + return nil, err + } + return newToken(tokenKindInverseBExpOpen, nullChar), nil + } + err = l.restore() + if err != nil { + return nil, err + } + err = l.restore() + if err != nil { + return nil, err + } + return newToken(tokenKindBExpOpen, nullChar), nil + case '\\': + c, eof, err := l.read() + if err != nil { + return nil, err + } + if eof { + l.errCause = synErrIncompletedEscSeq + return nil, ParseErr + } + if c == 'u' { + return newToken(tokenKindCodePointLeader, nullChar), nil + } + if c == 'p' { + return newToken(tokenKindCharPropLeader, nullChar), nil + } + if c == 'f' { + return newToken(tokenKindFragmentLeader, nullChar), nil + } + if c == '\\' || c == '.' || c == '*' || c == '+' || c == '?' 
|| c == '|' || c == '(' || c == ')' || c == '[' || c == ']' { + return newToken(tokenKindChar, c), nil + } + l.errCause = synErrInvalidEscSeq + l.errDetail = fmt.Sprintf("\\%v is not supported", string(c)) + return nil, ParseErr + default: + return newToken(tokenKindChar, c), nil + } +} + +func (l *lexer) nextInBExp(c rune) (*token, error) { + switch c { + case '-': + if l.rangeState != rangeStateReadRangeInitiator { + return newToken(tokenKindChar, c), nil + } + c1, eof, err := l.read() + if err != nil { + return nil, err + } + if eof { + err := l.restore() + if err != nil { + return nil, err + } + return newToken(tokenKindChar, c), nil + } + if c1 != ']' { + err := l.restore() + if err != nil { + return nil, err + } + return newToken(tokenKindCharRange, nullChar), nil + } + err = l.restore() + if err != nil { + return nil, err + } + return newToken(tokenKindChar, c), nil + case ']': + return newToken(tokenKindBExpClose, nullChar), nil + case '\\': + c, eof, err := l.read() + if err != nil { + return nil, err + } + if eof { + l.errCause = synErrIncompletedEscSeq + return nil, ParseErr + } + if c == 'u' { + return newToken(tokenKindCodePointLeader, nullChar), nil + } + if c == 'p' { + return newToken(tokenKindCharPropLeader, nullChar), nil + } + if c == '\\' || c == '^' || c == '-' || c == ']' { + return newToken(tokenKindChar, c), nil + } + l.errCause = synErrInvalidEscSeq + l.errDetail = fmt.Sprintf("\\%v is not supported in a bracket expression", string(c)) + return nil, ParseErr + default: + return newToken(tokenKindChar, c), nil + } +} + +func (l *lexer) nextInCodePoint(c rune) (*token, error) { + switch c { + case '{': + return newToken(tokenKindLBrace, nullChar), nil + case '}': + return newToken(tokenKindRBrace, nullChar), nil + default: + if !isHexDigit(c) { + l.errCause = synErrInvalidCodePoint + return nil, ParseErr + } + var b strings.Builder + fmt.Fprint(&b, string(c)) + n := 1 + for { + c, eof, err := l.read() + if err != nil { + return nil, err + } + if eof { + err := l.restore() + if err != nil { + return nil, err + } + break + } + if c == '}' { + err := l.restore() + if err != nil { + return nil, err + } + break + } + if !isHexDigit(c) || n >= 6 { + l.errCause = synErrInvalidCodePoint + return nil, ParseErr + } + fmt.Fprint(&b, string(c)) + n++ + } + cp := b.String() + cpLen := len(cp) + if !(cpLen == 4 || cpLen == 6) { + l.errCause = synErrInvalidCodePoint + return nil, ParseErr + } + return newCodePointToken(b.String()), nil + } +} + +func isHexDigit(c rune) bool { + if c >= '0' && c <= '9' || c >= 'A' && c <= 'Z' || c >= 'a' && c <= 'z' { + return true + } + return false +} + +func (l *lexer) nextInCharProp(c rune) (*token, error) { + switch c { + case '{': + return newToken(tokenKindLBrace, nullChar), nil + case '}': + return newToken(tokenKindRBrace, nullChar), nil + case '=': + return newToken(tokenKindEqual, nullChar), nil + default: + var b strings.Builder + fmt.Fprint(&b, string(c)) + n := 1 + for { + c, eof, err := l.read() + if err != nil { + return nil, err + } + if eof { + err := l.restore() + if err != nil { + return nil, err + } + break + } + if c == '}' || c == '=' { + err := l.restore() + if err != nil { + return nil, err + } + break + } + fmt.Fprint(&b, string(c)) + n++ + } + sym := strings.TrimSpace(b.String()) + if len(sym) == 0 { + l.errCause = synErrCharPropInvalidSymbol + return nil, ParseErr + } + return newCharPropSymbolToken(sym), nil + } +} + +func (l *lexer) nextInFragment(c rune) (*token, error) { + switch c { + case '{': + return 
newToken(tokenKindLBrace, nullChar), nil + case '}': + return newToken(tokenKindRBrace, nullChar), nil + default: + var b strings.Builder + fmt.Fprint(&b, string(c)) + n := 1 + for { + c, eof, err := l.read() + if err != nil { + return nil, err + } + if eof { + err := l.restore() + if err != nil { + return nil, err + } + break + } + if c == '}' { + err := l.restore() + if err != nil { + return nil, err + } + break + } + fmt.Fprint(&b, string(c)) + n++ + } + sym := strings.TrimSpace(b.String()) + if len(sym) == 0 { + l.errCause = SynErrFragmentInvalidSymbol + return nil, ParseErr + } + return newFragmentSymbolToken(sym), nil + } +} + +func (l *lexer) read() (rune, bool, error) { + if l.reachedEOF { + return l.lastChar, l.reachedEOF, nil + } + if l.peekChar1 != nullChar || l.peekEOF1 { + l.prevChar2 = l.prevChar1 + l.pervEOF2 = l.prevEOF1 + l.prevChar1 = l.lastChar + l.prevEOF1 = l.reachedEOF + l.lastChar = l.peekChar1 + l.reachedEOF = l.peekEOF1 + l.peekChar1 = l.peekChar2 + l.peekEOF1 = l.peekEOF2 + l.peekChar2 = nullChar + l.peekEOF2 = false + return l.lastChar, l.reachedEOF, nil + } + c, _, err := l.src.ReadRune() + if err != nil { + if err == io.EOF { + l.prevChar2 = l.prevChar1 + l.pervEOF2 = l.prevEOF1 + l.prevChar1 = l.lastChar + l.prevEOF1 = l.reachedEOF + l.lastChar = nullChar + l.reachedEOF = true + return l.lastChar, l.reachedEOF, nil + } + return nullChar, false, err + } + l.prevChar2 = l.prevChar1 + l.pervEOF2 = l.prevEOF1 + l.prevChar1 = l.lastChar + l.prevEOF1 = l.reachedEOF + l.lastChar = c + l.reachedEOF = false + return l.lastChar, l.reachedEOF, nil +} + +func (l *lexer) restore() error { + if l.lastChar == nullChar && !l.reachedEOF { + return fmt.Errorf("failed to call restore() because the last character is null") + } + l.peekChar2 = l.peekChar1 + l.peekEOF2 = l.peekEOF1 + l.peekChar1 = l.lastChar + l.peekEOF1 = l.reachedEOF + l.lastChar = l.prevChar1 + l.reachedEOF = l.prevEOF1 + l.prevChar1 = l.prevChar2 + l.prevEOF1 = l.pervEOF2 + l.prevChar2 = nullChar + l.pervEOF2 = false + return nil +} diff --git a/src/urubu/grammar/lexical/parser/parser.go b/src/urubu/grammar/lexical/parser/parser.go new file mode 100644 index 0000000..425b553 --- /dev/null +++ b/src/urubu/grammar/lexical/parser/parser.go @@ -0,0 +1,531 @@ +package parser + +import ( + "bytes" + "fmt" + "io" + "strconv" + + spec "urubu/spec/grammar" + "urubu/ucd" +) + +type PatternEntry struct { + ID spec.LexModeKindID + Pattern []byte +} + +type parser struct { + kind spec.LexKindName + lex *lexer + peekedTok *token + lastTok *token + + // If and only if isContributoryPropertyExposed is true, the parser interprets contributory properties that + // appear in property expressions. + // + // The contributory properties are not exposed, and users cannot use those properties because the parser + // follows [UAX #44 5.13 Property APIs]. For instance, \p{Other_Alphabetic} is invalid. + // + // isContributoryPropertyExposed is set to true when the parser is generated recursively. The parser needs to + // interpret derived properties internally because the derived properties consist of other properties that + // may contain the contributory properties. + // + // [UAX #44 5.13 Property APIs] says: + // > The following subtypes of Unicode character properties should generally not be exposed in APIs, + // > except in limited circumstances. They may not be useful, particularly in public API collections, + // > and may instead prove misleading to the users of such API collections. 
+ // > * Contributory properties are not recommended for public APIs. + // > ... + // https://unicode.org/reports/tr44/#Property_APIs + isContributoryPropertyExposed bool + + errCause error + errDetail string +} + +func NewParser(kind spec.LexKindName, src io.Reader) *parser { + return &parser{ + kind: kind, + lex: newLexer(src), + isContributoryPropertyExposed: false, + } +} + +func (p *parser) exposeContributoryProperty() { + p.isContributoryPropertyExposed = true +} + +func (p *parser) Error() (string, error) { + return p.errDetail, p.errCause +} + +func (p *parser) Parse() (root CPTree, retErr error) { + defer func() { + err := recover() + if err != nil { + var ok bool + retErr, ok = err.(error) + if !ok { + panic(err) + } + return + } + }() + + return newRootNode(p.kind, p.parseRegexp()), nil +} + +func (p *parser) parseRegexp() CPTree { + alt := p.parseAlt() + if alt == nil { + if p.consume(tokenKindGroupClose) { + p.raiseParseError(synErrGroupNoInitiator, "") + } + p.raiseParseError(synErrNullPattern, "") + } + if p.consume(tokenKindGroupClose) { + p.raiseParseError(synErrGroupNoInitiator, "") + } + p.expect(tokenKindEOF) + return alt +} + +func (p *parser) parseAlt() CPTree { + left := p.parseConcat() + if left == nil { + if p.consume(tokenKindAlt) { + p.raiseParseError(synErrAltLackOfOperand, "") + } + return nil + } + for { + if !p.consume(tokenKindAlt) { + break + } + right := p.parseConcat() + if right == nil { + p.raiseParseError(synErrAltLackOfOperand, "") + } + left = newAltNode(left, right) + } + return left +} + +func (p *parser) parseConcat() CPTree { + left := p.parseRepeat() + for { + right := p.parseRepeat() + if right == nil { + break + } + left = newConcatNode(left, right) + } + return left +} + +func (p *parser) parseRepeat() CPTree { + group := p.parseGroup() + if group == nil { + if p.consume(tokenKindRepeat) { + p.raiseParseError(synErrRepNoTarget, "* needs an operand") + } + if p.consume(tokenKindRepeatOneOrMore) { + p.raiseParseError(synErrRepNoTarget, "+ needs an operand") + } + if p.consume(tokenKindOption) { + p.raiseParseError(synErrRepNoTarget, "? 
needs an operand") + } + return nil + } + if p.consume(tokenKindRepeat) { + return newRepeatNode(group) + } + if p.consume(tokenKindRepeatOneOrMore) { + return newRepeatOneOrMoreNode(group) + } + if p.consume(tokenKindOption) { + return newOptionNode(group) + } + return group +} + +func (p *parser) parseGroup() CPTree { + if p.consume(tokenKindGroupOpen) { + alt := p.parseAlt() + if alt == nil { + if p.consume(tokenKindEOF) { + p.raiseParseError(synErrGroupUnclosed, "") + } + p.raiseParseError(synErrGroupNoElem, "") + } + if p.consume(tokenKindEOF) { + p.raiseParseError(synErrGroupUnclosed, "") + } + if !p.consume(tokenKindGroupClose) { + p.raiseParseError(synErrGroupInvalidForm, "") + } + return alt + } + return p.parseSingleChar() +} + +func (p *parser) parseSingleChar() CPTree { + if p.consume(tokenKindAnyChar) { + return genAnyCharAST() + } + if p.consume(tokenKindBExpOpen) { + left := p.parseBExpElem() + if left == nil { + if p.consume(tokenKindEOF) { + p.raiseParseError(synErrBExpUnclosed, "") + } + p.raiseParseError(synErrBExpNoElem, "") + } + for { + right := p.parseBExpElem() + if right == nil { + break + } + left = newAltNode(left, right) + } + if p.consume(tokenKindEOF) { + p.raiseParseError(synErrBExpUnclosed, "") + } + p.expect(tokenKindBExpClose) + return left + } + if p.consume(tokenKindInverseBExpOpen) { + elem := p.parseBExpElem() + if elem == nil { + if p.consume(tokenKindEOF) { + p.raiseParseError(synErrBExpUnclosed, "") + } + p.raiseParseError(synErrBExpNoElem, "") + } + inverse := exclude(elem, genAnyCharAST()) + if inverse == nil { + p.raiseParseError(synErrUnmatchablePattern, "") + } + for { + elem := p.parseBExpElem() + if elem == nil { + break + } + inverse = exclude(elem, inverse) + if inverse == nil { + p.raiseParseError(synErrUnmatchablePattern, "") + } + } + if p.consume(tokenKindEOF) { + p.raiseParseError(synErrBExpUnclosed, "") + } + p.expect(tokenKindBExpClose) + return inverse + } + if p.consume(tokenKindCodePointLeader) { + return p.parseCodePoint() + } + if p.consume(tokenKindCharPropLeader) { + return p.parseCharProp() + } + if p.consume(tokenKindFragmentLeader) { + return p.parseFragment() + } + c := p.parseNormalChar() + if c == nil { + if p.consume(tokenKindBExpClose) { + p.raiseParseError(synErrBExpInvalidForm, "") + } + return nil + } + return c +} + +func (p *parser) parseBExpElem() CPTree { + var left CPTree + switch { + case p.consume(tokenKindCodePointLeader): + left = p.parseCodePoint() + case p.consume(tokenKindCharPropLeader): + left = p.parseCharProp() + if p.consume(tokenKindCharRange) { + p.raiseParseError(synErrRangePropIsUnavailable, "") + } + default: + left = p.parseNormalChar() + } + if left == nil { + return nil + } + if !p.consume(tokenKindCharRange) { + return left + } + var right CPTree + switch { + case p.consume(tokenKindCodePointLeader): + right = p.parseCodePoint() + case p.consume(tokenKindCharPropLeader): + p.raiseParseError(synErrRangePropIsUnavailable, "") + default: + right = p.parseNormalChar() + } + if right == nil { + p.raiseParseError(synErrRangeInvalidForm, "") + } + from, _, _ := left.Range() + _, to, _ := right.Range() + if !isValidOrder(from, to) { + p.raiseParseError(synErrRangeInvalidOrder, fmt.Sprintf("%X..%X", from, to)) + } + return newRangeSymbolNode(from, to) +} + +func (p *parser) parseCodePoint() CPTree { + if !p.consume(tokenKindLBrace) { + p.raiseParseError(synErrCPExpInvalidForm, "") + } + if !p.consume(tokenKindCodePoint) { + p.raiseParseError(synErrCPExpInvalidForm, "") + } + + n, err := 
strconv.ParseInt(p.lastTok.codePoint, 16, 64) + if err != nil { + panic(fmt.Errorf("failed to decode a code point (%v) into a int: %v", p.lastTok.codePoint, err)) + } + if n < 0x0000 || n > 0x10FFFF { + p.raiseParseError(synErrCPExpOutOfRange, "") + } + + sym := newSymbolNode(rune(n)) + + if !p.consume(tokenKindRBrace) { + p.raiseParseError(synErrCPExpInvalidForm, "") + } + + return sym +} + +func (p *parser) parseCharProp() CPTree { + if !p.consume(tokenKindLBrace) { + p.raiseParseError(synErrCharPropExpInvalidForm, "") + } + var sym1, sym2 string + if !p.consume(tokenKindCharPropSymbol) { + p.raiseParseError(synErrCharPropExpInvalidForm, "") + } + sym1 = p.lastTok.propSymbol + if p.consume(tokenKindEqual) { + if !p.consume(tokenKindCharPropSymbol) { + p.raiseParseError(synErrCharPropExpInvalidForm, "") + } + sym2 = p.lastTok.propSymbol + } + + var alt CPTree + var propName, propVal string + if sym2 != "" { + propName = sym1 + propVal = sym2 + } else { + propName = "" + propVal = sym1 + } + if !p.isContributoryPropertyExposed && ucd.IsContributoryProperty(propName) { + p.raiseParseError(synErrCharPropUnsupported, propName) + } + pat, err := ucd.NormalizeCharacterProperty(propName, propVal) + if err != nil { + p.raiseParseError(synErrCharPropUnsupported, err.Error()) + } + if pat != "" { + p := NewParser(p.kind, bytes.NewReader([]byte(pat))) + p.exposeContributoryProperty() + ast, err := p.Parse() + if err != nil { + panic(err) + } + alt = ast + } else { + cpRanges, inverse, err := ucd.FindCodePointRanges(propName, propVal) + if err != nil { + p.raiseParseError(synErrCharPropUnsupported, err.Error()) + } + if inverse { + r := cpRanges[0] + alt = exclude(newRangeSymbolNode(r.From, r.To), genAnyCharAST()) + if alt == nil { + p.raiseParseError(synErrUnmatchablePattern, "") + } + for _, r := range cpRanges[1:] { + alt = exclude(newRangeSymbolNode(r.From, r.To), alt) + if alt == nil { + p.raiseParseError(synErrUnmatchablePattern, "") + } + } + } else { + for _, r := range cpRanges { + alt = genAltNode( + alt, + newRangeSymbolNode(r.From, r.To), + ) + } + } + } + + if !p.consume(tokenKindRBrace) { + p.raiseParseError(synErrCharPropExpInvalidForm, "") + } + + return alt +} + +func (p *parser) parseFragment() CPTree { + if !p.consume(tokenKindLBrace) { + p.raiseParseError(synErrFragmentExpInvalidForm, "") + } + if !p.consume(tokenKindFragmentSymbol) { + p.raiseParseError(synErrFragmentExpInvalidForm, "") + } + sym := p.lastTok.fragmentSymbol + + if !p.consume(tokenKindRBrace) { + p.raiseParseError(synErrFragmentExpInvalidForm, "") + } + + return newFragmentNode(spec.LexKindName(sym), nil) +} + +func (p *parser) parseNormalChar() CPTree { + if !p.consume(tokenKindChar) { + return nil + } + return newSymbolNode(p.lastTok.char) +} + +func exclude(symbol, base CPTree) CPTree { + if left, right, ok := symbol.Alternatives(); ok { + return exclude(right, exclude(left, base)) + } + + if left, right, ok := base.Alternatives(); ok { + return genAltNode( + exclude(symbol, left), + exclude(symbol, right), + ) + } + + if bFrom, bTo, ok := base.Range(); ok { + sFrom, sTo, ok := symbol.Range() + if !ok { + panic(fmt.Errorf("invalid symbol tree: %T", symbol)) + } + + switch { + case sFrom > bFrom && sTo < bTo: + return genAltNode( + newRangeSymbolNode(bFrom, sFrom-1), + newRangeSymbolNode(sTo+1, bTo), + ) + case sFrom <= bFrom && sTo >= bFrom && sTo < bTo: + return newRangeSymbolNode(sTo+1, bTo) + case sFrom > bFrom && sFrom <= bTo && sTo >= bTo: + return newRangeSymbolNode(bFrom, sFrom-1) + case sFrom <= bFrom 
&& sTo >= bTo: + return nil + default: + return base + } + } + + panic(fmt.Errorf("invalid base tree: %T", base)) +} + +func genAnyCharAST() CPTree { + return newRangeSymbolNode(0x0, 0x10FFFF) +} + +func isValidOrder(from, to rune) bool { + return from <= to +} + +func genConcatNode(cs ...CPTree) CPTree { + nonNilNodes := []CPTree{} + for _, c := range cs { + if c == nil { + continue + } + nonNilNodes = append(nonNilNodes, c) + } + if len(nonNilNodes) <= 0 { + return nil + } + if len(nonNilNodes) == 1 { + return nonNilNodes[0] + } + concat := newConcatNode(nonNilNodes[0], nonNilNodes[1]) + for _, c := range nonNilNodes[2:] { + concat = newConcatNode(concat, c) + } + return concat +} + +func genAltNode(cs ...CPTree) CPTree { + nonNilNodes := []CPTree{} + for _, c := range cs { + if c == nil { + continue + } + nonNilNodes = append(nonNilNodes, c) + } + if len(nonNilNodes) <= 0 { + return nil + } + if len(nonNilNodes) == 1 { + return nonNilNodes[0] + } + alt := newAltNode(nonNilNodes[0], nonNilNodes[1]) + for _, c := range nonNilNodes[2:] { + alt = newAltNode(alt, c) + } + return alt +} + +func (p *parser) expect(expected tokenKind) { + if !p.consume(expected) { + tok := p.peekedTok + p.raiseParseError(synErrUnexpectedToken, fmt.Sprintf("expected: %v, actual: %v", expected, tok.kind)) + } +} + +func (p *parser) consume(expected tokenKind) bool { + var tok *token + var err error + if p.peekedTok != nil { + tok = p.peekedTok + p.peekedTok = nil + } else { + tok, err = p.lex.next() + if err != nil { + if err == ParseErr { + detail, cause := p.lex.error() + p.raiseParseError(cause, detail) + } + panic(err) + } + } + p.lastTok = tok + if tok.kind == expected { + return true + } + p.peekedTok = tok + p.lastTok = nil + + return false +} + +func (p *parser) raiseParseError(err error, detail string) { + p.errCause = err + p.errDetail = detail + panic(ParseErr) +} diff --git a/src/urubu/grammar/lexical/parser/tree.go b/src/urubu/grammar/lexical/parser/tree.go new file mode 100644 index 0000000..df03d37 --- /dev/null +++ b/src/urubu/grammar/lexical/parser/tree.go @@ -0,0 +1,459 @@ +package parser + +import ( + "fmt" + "io" + "sort" + + spec "urubu/spec/grammar" +) + +type CPRange struct { + From rune + To rune +} + +type CPTree interface { + fmt.Stringer + Range() (rune, rune, bool) + Optional() (CPTree, bool) + Repeatable() (CPTree, bool) + Concatenation() (CPTree, CPTree, bool) + Alternatives() (CPTree, CPTree, bool) + Describe() (spec.LexKindName, []spec.LexKindName, error) + + children() (CPTree, CPTree) + clone() CPTree +} + +var ( + _ CPTree = &rootNode{} + _ CPTree = &symbolNode{} + _ CPTree = &concatNode{} + _ CPTree = &altNode{} + _ CPTree = &quantifierNode{} + _ CPTree = &fragmentNode{} +) + +type rootNode struct { + kind spec.LexKindName + tree CPTree + fragments map[spec.LexKindName][]*fragmentNode +} + +func newRootNode(kind spec.LexKindName, t CPTree) *rootNode { + fragments := map[spec.LexKindName][]*fragmentNode{} + collectFragments(t, fragments) + + return &rootNode{ + kind: kind, + tree: t, + fragments: fragments, + } +} + +func collectFragments(n CPTree, fragments map[spec.LexKindName][]*fragmentNode) { + if n == nil { + return + } + + if f, ok := n.(*fragmentNode); ok { + fragments[f.kind] = append(fragments[f.kind], f) + return + } + + l, r := n.children() + collectFragments(l, fragments) + collectFragments(r, fragments) +} + +func (n *rootNode) String() string { + return fmt.Sprintf("root: %v: %v fragments", n.kind, len(n.fragments)) +} + +func (n *rootNode) Range() (rune, rune, 
bool) { + return n.tree.Range() +} + +func (n *rootNode) Optional() (CPTree, bool) { + return n.tree.Optional() +} + +func (n *rootNode) Repeatable() (CPTree, bool) { + return n.tree.Repeatable() +} + +func (n *rootNode) Concatenation() (CPTree, CPTree, bool) { + return n.tree.Concatenation() +} + +func (n *rootNode) Alternatives() (CPTree, CPTree, bool) { + return n.tree.Alternatives() +} + +func (n *rootNode) Describe() (spec.LexKindName, []spec.LexKindName, error) { + var frags []spec.LexKindName + for f := range n.fragments { + frags = append(frags, spec.LexKindName(f)) + } + sort.Slice(frags, func(i, j int) bool { + return frags[i] < frags[j] + }) + + return n.kind, frags, nil +} + +func (n *rootNode) children() (CPTree, CPTree) { + return n.tree.children() +} + +func (n *rootNode) clone() CPTree { + return n.tree.clone() +} + +func (n *rootNode) incomplete() bool { + return len(n.fragments) > 0 +} + +func (n *rootNode) applyFragment(kind spec.LexKindName, fragment CPTree) error { + root, ok := fragment.(*rootNode) + if !ok { + return fmt.Errorf("applyFragment can take only *rootNode: %T", fragment) + } + if root.incomplete() { + return fmt.Errorf("fragment is incomplete") + } + + fs, ok := n.fragments[kind] + if !ok { + return nil + } + for _, f := range fs { + f.tree = root.clone() + } + delete(n.fragments, kind) + + return nil +} + +type symbolNode struct { + CPRange +} + +func newSymbolNode(cp rune) *symbolNode { + return &symbolNode{ + CPRange: CPRange{ + From: cp, + To: cp, + }, + } +} + +func newRangeSymbolNode(from, to rune) *symbolNode { + return &symbolNode{ + CPRange: CPRange{ + From: from, + To: to, + }, + } +} + +func (n *symbolNode) String() string { + return fmt.Sprintf("symbol: %X..%X", n.From, n.To) +} + +func (n *symbolNode) Range() (rune, rune, bool) { + return n.From, n.To, true +} + +func (n *symbolNode) Optional() (CPTree, bool) { + return nil, false +} + +func (n *symbolNode) Repeatable() (CPTree, bool) { + return nil, false +} + +func (n *symbolNode) Concatenation() (CPTree, CPTree, bool) { + return nil, nil, false +} + +func (n *symbolNode) Alternatives() (CPTree, CPTree, bool) { + return nil, nil, false +} + +func (n *symbolNode) Describe() (spec.LexKindName, []spec.LexKindName, error) { + return spec.LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n) +} + +func (n *symbolNode) children() (CPTree, CPTree) { + return nil, nil +} + +func (n *symbolNode) clone() CPTree { + return newRangeSymbolNode(n.From, n.To) +} + +type concatNode struct { + left CPTree + right CPTree +} + +func newConcatNode(left, right CPTree) *concatNode { + return &concatNode{ + left: left, + right: right, + } +} + +func (n *concatNode) String() string { + return "concat" +} + +func (n *concatNode) Range() (rune, rune, bool) { + return 0, 0, false +} + +func (n *concatNode) Optional() (CPTree, bool) { + return nil, false +} + +func (n *concatNode) Repeatable() (CPTree, bool) { + return nil, false +} + +func (n *concatNode) Concatenation() (CPTree, CPTree, bool) { + return n.left, n.right, true +} + +func (n *concatNode) Alternatives() (CPTree, CPTree, bool) { + return nil, nil, false +} + +func (n *concatNode) Describe() (spec.LexKindName, []spec.LexKindName, error) { + return spec.LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n) +} + +func (n *concatNode) children() (CPTree, CPTree) { + return n.left, n.right +} + +func (n *concatNode) clone() CPTree { + if n == nil { + return nil + } + return newConcatNode(n.left.clone(), n.right.clone()) +} + +type altNode struct { + left 
CPTree + right CPTree +} + +func newAltNode(left, right CPTree) *altNode { + return &altNode{ + left: left, + right: right, + } +} + +func (n *altNode) String() string { + return "alt" +} + +func (n *altNode) Range() (rune, rune, bool) { + return 0, 0, false +} + +func (n *altNode) Optional() (CPTree, bool) { + return nil, false +} + +func (n *altNode) Repeatable() (CPTree, bool) { + return nil, false +} + +func (n *altNode) Concatenation() (CPTree, CPTree, bool) { + return nil, nil, false +} + +func (n *altNode) Alternatives() (CPTree, CPTree, bool) { + return n.left, n.right, true +} + +func (n *altNode) Describe() (spec.LexKindName, []spec.LexKindName, error) { + return spec.LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n) +} + +func (n *altNode) children() (CPTree, CPTree) { + return n.left, n.right +} + +func (n *altNode) clone() CPTree { + return newAltNode(n.left.clone(), n.right.clone()) +} + +type quantifierNode struct { + optional bool + repeatable bool + tree CPTree +} + +func (n *quantifierNode) String() string { + switch { + case n.repeatable: + return "repeatable (>= 0 times)" + case n.optional: + return "optional (0 or 1 times)" + default: + return "invalid quantifier" + } +} + +func newRepeatNode(t CPTree) *quantifierNode { + return &quantifierNode{ + repeatable: true, + tree: t, + } +} + +func newRepeatOneOrMoreNode(t CPTree) *concatNode { + return newConcatNode( + t, + &quantifierNode{ + repeatable: true, + tree: t.clone(), + }) +} + +func newOptionNode(t CPTree) *quantifierNode { + return &quantifierNode{ + optional: true, + tree: t, + } +} + +func (n *quantifierNode) Range() (rune, rune, bool) { + return 0, 0, false +} + +func (n *quantifierNode) Optional() (CPTree, bool) { + return n.tree, n.optional +} + +func (n *quantifierNode) Repeatable() (CPTree, bool) { + return n.tree, n.repeatable +} + +func (n *quantifierNode) Concatenation() (CPTree, CPTree, bool) { + return nil, nil, false +} + +func (n *quantifierNode) Alternatives() (CPTree, CPTree, bool) { + return nil, nil, false +} + +func (n *quantifierNode) Describe() (spec.LexKindName, []spec.LexKindName, error) { + return spec.LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n) +} + +func (n *quantifierNode) children() (CPTree, CPTree) { + return n.tree, nil +} + +func (n *quantifierNode) clone() CPTree { + if n.repeatable { + return newRepeatNode(n.tree.clone()) + } + return newOptionNode(n.tree.clone()) +} + +type fragmentNode struct { + kind spec.LexKindName + tree CPTree +} + +func newFragmentNode(kind spec.LexKindName, t CPTree) *fragmentNode { + return &fragmentNode{ + kind: kind, + tree: t, + } +} + +func (n *fragmentNode) String() string { + return fmt.Sprintf("fragment: %v", n.kind) +} + +func (n *fragmentNode) Range() (rune, rune, bool) { + return n.tree.Range() +} + +func (n *fragmentNode) Optional() (CPTree, bool) { + return n.tree.Optional() +} + +func (n *fragmentNode) Repeatable() (CPTree, bool) { + return n.tree.Repeatable() +} + +func (n *fragmentNode) Concatenation() (CPTree, CPTree, bool) { + return n.tree.Concatenation() +} + +func (n *fragmentNode) Alternatives() (CPTree, CPTree, bool) { + return n.tree.Alternatives() +} + +func (n *fragmentNode) Describe() (spec.LexKindName, []spec.LexKindName, error) { + return spec.LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n) +} + +func (n *fragmentNode) children() (CPTree, CPTree) { + return n.tree.children() +} + +func (n *fragmentNode) clone() CPTree { + if n.tree == nil { + return newFragmentNode(n.kind, nil) + } + return 
newFragmentNode(n.kind, n.tree.clone())
+}
+
+//nolint:unused
+func printCPTree(w io.Writer, t CPTree, ruledLine string, childRuledLinePrefix string) {
+	if t == nil {
+		return
+	}
+	fmt.Fprintf(w, "%v%v\n", ruledLine, t)
+	children := []CPTree{}
+	switch n := t.(type) {
+	case *rootNode:
+		children = append(children, n.tree)
+	case *fragmentNode:
+		children = append(children, n.tree)
+	default:
+		left, right := t.children()
+		if left != nil {
+			children = append(children, left)
+		}
+		if right != nil {
+			children = append(children, right)
+		}
+	}
+	num := len(children)
+	for i, child := range children {
+		line := "└─ "
+		if num > 1 {
+			if i == 0 {
+				line = "├─ "
+			} else if i < num-1 {
+				line = "│ "
+			}
+		}
+		prefix := "│ "
+		if i >= num-1 {
+			prefix = " "
+		}
+		printCPTree(w, child, childRuledLinePrefix+line, childRuledLinePrefix+prefix)
+	}
+}
diff --git a/src/urubu/grammar/lr0.go b/src/urubu/grammar/lr0.go
new file mode 100644
index 0000000..92a2137
--- /dev/null
+++ b/src/urubu/grammar/lr0.go
@@ -0,0 +1,197 @@
+package grammar
+
+import (
+	"fmt"
+	"sort"
+
+	"urubu/grammar/symbol"
+)
+
+type lr0Automaton struct {
+	initialState kernelID
+	states       map[kernelID]*lrState
+}
+
+func genLR0Automaton(prods *productionSet, startSym symbol.Symbol, errSym symbol.Symbol) (*lr0Automaton, error) {
+	if !startSym.IsStart() {
+		return nil, fmt.Errorf("the passed symbol is not a start symbol")
+	}
+
+	automaton := &lr0Automaton{
+		states: map[kernelID]*lrState{},
+	}
+
+	currentState := stateNumInitial
+	knownKernels := map[kernelID]struct{}{}
+	uncheckedKernels := []*kernel{}
+
+	// Generate an initial kernel.
+	{
+		prods, _ := prods.findByLHS(startSym)
+		initialItem, err := newLR0Item(prods[0], 0)
+		if err != nil {
+			return nil, err
+		}
+
+		k, err := newKernel([]*lrItem{initialItem})
+		if err != nil {
+			return nil, err
+		}
+
+		automaton.initialState = k.id
+		knownKernels[k.id] = struct{}{}
+		uncheckedKernels = append(uncheckedKernels, k)
+	}
+
+	for len(uncheckedKernels) > 0 {
+		nextUncheckedKernels := []*kernel{}
+		for _, k := range uncheckedKernels {
+			state, neighbours, err := genStateAndNeighbourKernels(k, prods, errSym)
+			if err != nil {
+				return nil, err
+			}
+			state.num = currentState
+			currentState = currentState.next()
+
+			automaton.states[state.id] = state
+
+			for _, k := range neighbours {
+				if _, known := knownKernels[k.id]; known {
+					continue
+				}
+				knownKernels[k.id] = struct{}{}
+				nextUncheckedKernels = append(nextUncheckedKernels, k)
+			}
+		}
+		uncheckedKernels = nextUncheckedKernels
+	}
+
+	return automaton, nil
+}
+
+func genStateAndNeighbourKernels(k *kernel, prods *productionSet, errSym symbol.Symbol) (*lrState, []*kernel, error) {
+	items, err := genLR0Closure(k, prods)
+	if err != nil {
+		return nil, nil, err
+	}
+	neighbours, err := genNeighbourKernels(items, prods)
+	if err != nil {
+		return nil, nil, err
+	}
+
+	next := map[symbol.Symbol]kernelID{}
+	kernels := []*kernel{}
+	for _, n := range neighbours {
+		next[n.symbol] = n.kernel.id
+		kernels = append(kernels, n.kernel)
+	}
+
+	reducible := map[productionID]struct{}{}
+	var emptyProdItems []*lrItem
+	isErrorTrapper := false
+	for _, item := range items {
+		if item.dottedSymbol == errSym {
+			isErrorTrapper = true
+		}
+
+		if item.reducible {
+			reducible[item.prod] = struct{}{}
+
+			prod, ok := prods.findByID(item.prod)
+			if !ok {
+				return nil, nil, fmt.Errorf("reducible production not found: %v", item.prod)
+			}
+			if prod.isEmpty() {
+				emptyProdItems = append(emptyProdItems, item)
+			}
+		}
+	}
+
+	return &lrState{
+		kernel: k,
+
next: next, + reducible: reducible, + emptyProdItems: emptyProdItems, + isErrorTrapper: isErrorTrapper, + }, kernels, nil +} + +func genLR0Closure(k *kernel, prods *productionSet) ([]*lrItem, error) { + items := []*lrItem{} + knownItems := map[lrItemID]struct{}{} + uncheckedItems := []*lrItem{} + for _, item := range k.items { + items = append(items, item) + uncheckedItems = append(uncheckedItems, item) + } + for len(uncheckedItems) > 0 { + nextUncheckedItems := []*lrItem{} + for _, item := range uncheckedItems { + if item.dottedSymbol.IsTerminal() { + continue + } + + ps, _ := prods.findByLHS(item.dottedSymbol) + for _, prod := range ps { + item, err := newLR0Item(prod, 0) + if err != nil { + return nil, err + } + if _, exist := knownItems[item.id]; exist { + continue + } + items = append(items, item) + knownItems[item.id] = struct{}{} + nextUncheckedItems = append(nextUncheckedItems, item) + } + } + uncheckedItems = nextUncheckedItems + } + + return items, nil +} + +type neighbourKernel struct { + symbol symbol.Symbol + kernel *kernel +} + +func genNeighbourKernels(items []*lrItem, prods *productionSet) ([]*neighbourKernel, error) { + kItemMap := map[symbol.Symbol][]*lrItem{} + for _, item := range items { + if item.dottedSymbol.IsNil() { + continue + } + prod, ok := prods.findByID(item.prod) + if !ok { + return nil, fmt.Errorf("a production was not found: %v", item.prod) + } + kItem, err := newLR0Item(prod, item.dot+1) + if err != nil { + return nil, err + } + kItemMap[item.dottedSymbol] = append(kItemMap[item.dottedSymbol], kItem) + } + + nextSyms := []symbol.Symbol{} + for sym := range kItemMap { + nextSyms = append(nextSyms, sym) + } + sort.Slice(nextSyms, func(i, j int) bool { + return nextSyms[i] < nextSyms[j] + }) + + kernels := []*neighbourKernel{} + for _, sym := range nextSyms { + k, err := newKernel(kItemMap[sym]) + if err != nil { + return nil, err + } + kernels = append(kernels, &neighbourKernel{ + symbol: sym, + kernel: k, + }) + } + + return kernels, nil +} diff --git a/src/urubu/grammar/parsing_table.go b/src/urubu/grammar/parsing_table.go new file mode 100644 index 0000000..48ea9fe --- /dev/null +++ b/src/urubu/grammar/parsing_table.go @@ -0,0 +1,553 @@ +package grammar + +import ( + "fmt" + "sort" + + "urubu/grammar/symbol" + spec "urubu/spec/grammar" +) + +type ActionType string + +const ( + ActionTypeShift = ActionType("shift") + ActionTypeReduce = ActionType("reduce") + ActionTypeError = ActionType("error") +) + +type actionEntry int + +const actionEntryEmpty = actionEntry(0) + +func newShiftActionEntry(state stateNum) actionEntry { + return actionEntry(state * -1) +} + +func newReduceActionEntry(prod productionNum) actionEntry { + return actionEntry(prod) +} + +func (e actionEntry) isEmpty() bool { + return e == actionEntryEmpty +} + +func (e actionEntry) describe() (ActionType, stateNum, productionNum) { + if e == actionEntryEmpty { + return ActionTypeError, stateNumInitial, productionNumNil + } + if e < 0 { + return ActionTypeShift, stateNum(e * -1), productionNumNil + } + return ActionTypeReduce, stateNumInitial, productionNum(e) +} + +type GoToType string + +const ( + GoToTypeRegistered = GoToType("registered") + GoToTypeError = GoToType("error") +) + +type goToEntry uint + +const goToEntryEmpty = goToEntry(0) + +func newGoToEntry(state stateNum) goToEntry { + return goToEntry(state) +} + +func (e goToEntry) describe() (GoToType, stateNum) { + if e == goToEntryEmpty { + return GoToTypeError, stateNumInitial + } + return GoToTypeRegistered, stateNum(e) +} + +type 
conflictResolutionMethod int
+
+func (m conflictResolutionMethod) Int() int {
+	return int(m)
+}
+
+const (
+	ResolvedByPrec      conflictResolutionMethod = 1
+	ResolvedByAssoc     conflictResolutionMethod = 2
+	ResolvedByShift     conflictResolutionMethod = 3
+	ResolvedByProdOrder conflictResolutionMethod = 4
+)
+
+type conflict interface {
+	conflict()
+}
+
+type shiftReduceConflict struct {
+	state      stateNum
+	sym        symbol.Symbol
+	nextState  stateNum
+	prodNum    productionNum
+	resolvedBy conflictResolutionMethod
+}
+
+func (c *shiftReduceConflict) conflict() {
+}
+
+type reduceReduceConflict struct {
+	state      stateNum
+	sym        symbol.Symbol
+	prodNum1   productionNum
+	prodNum2   productionNum
+	resolvedBy conflictResolutionMethod
+}
+
+func (c *reduceReduceConflict) conflict() {
+}
+
+var (
+	_ conflict = &shiftReduceConflict{}
+	_ conflict = &reduceReduceConflict{}
+)
+
+type ParsingTable struct {
+	actionTable      []actionEntry
+	goToTable        []goToEntry
+	stateCount       int
+	terminalCount    int
+	nonTerminalCount int
+
+	// An index of errorTrapperStates corresponds to a state number, and when
+	// `errorTrapperStates[stateNum]` is `1`, the state has an item of the
+	// following form, where `α` and `β` may be empty.
+	//
+	// A → α・error β
+	errorTrapperStates []int
+
+	InitialState stateNum
+}
+
+func (t *ParsingTable) getAction(state stateNum, sym symbol.SymbolNum) (ActionType, stateNum, productionNum) {
+	pos := state.Int()*t.terminalCount + sym.Int()
+	return t.actionTable[pos].describe()
+}
+
+func (t *ParsingTable) getGoTo(state stateNum, sym symbol.SymbolNum) (GoToType, stateNum) {
+	pos := state.Int()*t.nonTerminalCount + sym.Int()
+	return t.goToTable[pos].describe()
+}
+
+func (t *ParsingTable) readAction(row int, col int) actionEntry {
+	return t.actionTable[row*t.terminalCount+col]
+}
+
+func (t *ParsingTable) writeAction(row int, col int, act actionEntry) {
+	t.actionTable[row*t.terminalCount+col] = act
+}
+
+func (t *ParsingTable) writeGoTo(state stateNum, sym symbol.Symbol, nextState stateNum) {
+	pos := state.Int()*t.nonTerminalCount + sym.Num().Int()
+	t.goToTable[pos] = newGoToEntry(nextState)
+}
+
+type lrTableBuilder struct {
+	automaton    *lr0Automaton
+	prods        *productionSet
+	termCount    int
+	nonTermCount int
+	symTab       *symbol.SymbolTableReader
+	precAndAssoc *precAndAssoc
+
+	conflicts []conflict
+}
+
+func (b *lrTableBuilder) build() (*ParsingTable, error) {
+	var ptab *ParsingTable
+	{
+		initialState := b.automaton.states[b.automaton.initialState]
+		ptab = &ParsingTable{
+			actionTable:        make([]actionEntry, len(b.automaton.states)*b.termCount),
+			goToTable:          make([]goToEntry, len(b.automaton.states)*b.nonTermCount),
+			stateCount:         len(b.automaton.states),
+			terminalCount:      b.termCount,
+			nonTerminalCount:   b.nonTermCount,
+			errorTrapperStates: make([]int, len(b.automaton.states)),
+			InitialState:       initialState.num,
+		}
+	}
+
+	for _, state := range b.automaton.states {
+		if state.isErrorTrapper {
+			ptab.errorTrapperStates[state.num] = 1
+		}
+
+		for sym, kID := range state.next {
+			nextState := b.automaton.states[kID]
+			if sym.IsTerminal() {
+				b.writeShiftAction(ptab, state.num, sym, nextState.num)
+			} else {
+				ptab.writeGoTo(state.num, sym, nextState.num)
+			}
+		}
+
+		for prodID := range state.reducible {
+			reducibleProd, ok := b.prods.findByID(prodID)
+			if !ok {
+				return nil, fmt.Errorf("reducible production not found: %v", prodID)
+			}
+
+			var reducibleItem *lrItem
+			for _, item := range state.items {
+				if item.prod != reducibleProd.id {
+					continue
+				}
+
+				reducibleItem = item
+				break
+			}
+			if reducibleItem == nil {
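+				// An empty production's only item has the dot at position 0, so the goto
+				// construction never yields it as a kernel item. genStateAndNeighbourKernels
+				// caches such items in emptyProdItems, which we fall back to here.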
+				for _, item := range state.emptyProdItems {
+					if item.prod != reducibleProd.id {
+						continue
+					}
+
+					reducibleItem = item
+					break
+				}
+				if reducibleItem == nil {
+					return nil, fmt.Errorf("reducible item not found; state: %v, production: %v", state.num, reducibleProd.num)
+				}
+			}
+
+			for a := range reducibleItem.lookAhead.symbols {
+				b.writeReduceAction(ptab, state.num, a, reducibleProd.num)
+			}
+		}
+	}
+
+	return ptab, nil
+}
+
+// writeShiftAction writes a shift action to the parsing table. When a shift/reduce conflict
+// occurs, we prioritize the shift action.
+func (b *lrTableBuilder) writeShiftAction(tab *ParsingTable, state stateNum, sym symbol.Symbol, nextState stateNum) {
+	act := tab.readAction(state.Int(), sym.Num().Int())
+	if !act.isEmpty() {
+		ty, _, p := act.describe()
+		if ty == ActionTypeReduce {
+			act, method := b.resolveSRConflict(sym.Num(), p)
+			b.conflicts = append(b.conflicts, &shiftReduceConflict{
+				state:      state,
+				sym:        sym,
+				nextState:  nextState,
+				prodNum:    p,
+				resolvedBy: method,
+			})
+			if act == ActionTypeShift {
+				tab.writeAction(state.Int(), sym.Num().Int(), newShiftActionEntry(nextState))
+			}
+			return
+		}
+	}
+	tab.writeAction(state.Int(), sym.Num().Int(), newShiftActionEntry(nextState))
+}
+
+// writeReduceAction writes a reduce action to the parsing table. When a shift/reduce conflict
+// occurs, we prioritize the shift action, and when a reduce/reduce conflict occurs, we prioritize
+// the action that reduces the production with the higher priority. Productions defined earlier in
+// the grammar file have a higher priority.
+func (b *lrTableBuilder) writeReduceAction(tab *ParsingTable, state stateNum, sym symbol.Symbol, prod productionNum) {
+	act := tab.readAction(state.Int(), sym.Num().Int())
+	if !act.isEmpty() {
+		ty, s, p := act.describe()
+		switch ty {
+		case ActionTypeReduce:
+			if p == prod {
+				return
+			}
+
+			b.conflicts = append(b.conflicts, &reduceReduceConflict{
+				state:      state,
+				sym:        sym,
+				prodNum1:   p,
+				prodNum2:   prod,
+				resolvedBy: ResolvedByProdOrder,
+			})
+			if p < prod {
+				tab.writeAction(state.Int(), sym.Num().Int(), newReduceActionEntry(p))
+			} else {
+				tab.writeAction(state.Int(), sym.Num().Int(), newReduceActionEntry(prod))
+			}
+		case ActionTypeShift:
+			act, method := b.resolveSRConflict(sym.Num(), prod)
+			b.conflicts = append(b.conflicts, &shiftReduceConflict{
+				state:      state,
+				sym:        sym,
+				nextState:  s,
+				prodNum:    prod,
+				resolvedBy: method,
+			})
+			if act == ActionTypeReduce {
+				tab.writeAction(state.Int(), sym.Num().Int(), newReduceActionEntry(prod))
+			}
+		}
+		return
+	}
+	tab.writeAction(state.Int(), sym.Num().Int(), newReduceActionEntry(prod))
+}
+
+func (b *lrTableBuilder) resolveSRConflict(sym symbol.SymbolNum, prod productionNum) (ActionType, conflictResolutionMethod) {
+	symPrec := b.precAndAssoc.terminalPrecedence(sym)
+	prodPrec := b.precAndAssoc.productionPredence(prod)
+	if symPrec == 0 || prodPrec == 0 {
+		return ActionTypeShift, ResolvedByShift
+	}
+	if symPrec == prodPrec {
+		assoc := b.precAndAssoc.productionAssociativity(prod)
+		if assoc != assocTypeLeft {
+			return ActionTypeShift, ResolvedByAssoc
+		}
+		return ActionTypeReduce, ResolvedByAssoc
+	}
+	if symPrec < prodPrec {
+		return ActionTypeShift, ResolvedByPrec
+	}
+	return ActionTypeReduce, ResolvedByPrec
+}
+
+func (b *lrTableBuilder) genReport(tab *ParsingTable, gram *Grammar) (*spec.Report, error) {
+	var terms []*spec.Terminal
+	{
+		termSyms := b.symTab.TerminalSymbols()
+		terms = make([]*spec.Terminal, len(termSyms)+1)
+
+		for _, sym := range termSyms {
+			name, ok :=
b.symTab.ToText(sym) + if !ok { + return nil, fmt.Errorf("failed to generate terminals: symbol not found: %v", sym) + } + + term := &spec.Terminal{ + Number: sym.Num().Int(), + Name: name, + } + + prec := b.precAndAssoc.terminalPrecedence(sym.Num()) + if prec != precNil { + term.Precedence = prec + } + + assoc := b.precAndAssoc.terminalAssociativity(sym.Num()) + switch assoc { + case assocTypeLeft: + term.Associativity = "l" + case assocTypeRight: + term.Associativity = "r" + } + + terms[sym.Num()] = term + } + } + + var nonTerms []*spec.NonTerminal + { + nonTermSyms := b.symTab.NonTerminalSymbols() + nonTerms = make([]*spec.NonTerminal, len(nonTermSyms)+1) + for _, sym := range nonTermSyms { + name, ok := b.symTab.ToText(sym) + if !ok { + return nil, fmt.Errorf("failed to generate non-terminals: symbol not found: %v", sym) + } + + nonTerms[sym.Num()] = &spec.NonTerminal{ + Number: sym.Num().Int(), + Name: name, + } + } + } + + var prods []*spec.Production + { + ps := gram.productionSet.getAllProductions() + prods = make([]*spec.Production, len(ps)+1) + for _, p := range ps { + rhs := make([]int, len(p.rhs)) + for i, e := range p.rhs { + if e.IsTerminal() { + rhs[i] = e.Num().Int() + } else { + rhs[i] = e.Num().Int() * -1 + } + } + + prod := &spec.Production{ + Number: p.num.Int(), + LHS: p.lhs.Num().Int(), + RHS: rhs, + } + + prec := b.precAndAssoc.productionPredence(p.num) + if prec != precNil { + prod.Precedence = prec + } + + assoc := b.precAndAssoc.productionAssociativity(p.num) + switch assoc { + case assocTypeLeft: + prod.Associativity = "l" + case assocTypeRight: + prod.Associativity = "r" + } + + prods[p.num.Int()] = prod + } + } + + var states []*spec.State + { + srConflicts := map[stateNum][]*shiftReduceConflict{} + rrConflicts := map[stateNum][]*reduceReduceConflict{} + for _, con := range b.conflicts { + switch c := con.(type) { + case *shiftReduceConflict: + srConflicts[c.state] = append(srConflicts[c.state], c) + case *reduceReduceConflict: + rrConflicts[c.state] = append(rrConflicts[c.state], c) + } + } + + states = make([]*spec.State, len(b.automaton.states)) + for _, s := range b.automaton.states { + kernel := make([]*spec.Item, len(s.items)) + for i, item := range s.items { + p, ok := b.prods.findByID(item.prod) + if !ok { + return nil, fmt.Errorf("failed to generate states: production of kernel item not found: %v", item.prod) + } + + kernel[i] = &spec.Item{ + Production: p.num.Int(), + Dot: item.dot, + } + } + + sort.Slice(kernel, func(i, j int) bool { + if kernel[i].Production < kernel[j].Production { + return true + } + if kernel[i].Production > kernel[j].Production { + return false + } + return kernel[i].Dot < kernel[j].Dot + }) + + var shift []*spec.Transition + var reduce []*spec.Reduce + var goTo []*spec.Transition + { + TERMINALS_LOOP: + for _, t := range b.symTab.TerminalSymbols() { + act, next, prod := tab.getAction(s.num, t.Num()) + switch act { + case ActionTypeShift: + shift = append(shift, &spec.Transition{ + Symbol: t.Num().Int(), + State: next.Int(), + }) + case ActionTypeReduce: + for _, r := range reduce { + if r.Production == prod.Int() { + r.LookAhead = append(r.LookAhead, t.Num().Int()) + continue TERMINALS_LOOP + } + } + reduce = append(reduce, &spec.Reduce{ + LookAhead: []int{t.Num().Int()}, + Production: prod.Int(), + }) + } + } + + for _, n := range b.symTab.NonTerminalSymbols() { + ty, next := tab.getGoTo(s.num, n.Num()) + if ty == GoToTypeRegistered { + goTo = append(goTo, &spec.Transition{ + Symbol: n.Num().Int(), + State: next.Int(), + }) + } 
+ } + + sort.Slice(shift, func(i, j int) bool { + return shift[i].State < shift[j].State + }) + sort.Slice(reduce, func(i, j int) bool { + return reduce[i].Production < reduce[j].Production + }) + sort.Slice(goTo, func(i, j int) bool { + return goTo[i].State < goTo[j].State + }) + } + + sr := []*spec.SRConflict{} + rr := []*spec.RRConflict{} + { + for _, c := range srConflicts[s.num] { + conflict := &spec.SRConflict{ + Symbol: c.sym.Num().Int(), + State: c.nextState.Int(), + Production: c.prodNum.Int(), + ResolvedBy: c.resolvedBy.Int(), + } + + ty, s, p := tab.getAction(s.num, c.sym.Num()) + switch ty { + case ActionTypeShift: + n := s.Int() + conflict.AdoptedState = &n + case ActionTypeReduce: + n := p.Int() + conflict.AdoptedProduction = &n + } + + sr = append(sr, conflict) + } + + sort.Slice(sr, func(i, j int) bool { + return sr[i].Symbol < sr[j].Symbol + }) + + for _, c := range rrConflicts[s.num] { + conflict := &spec.RRConflict{ + Symbol: c.sym.Num().Int(), + Production1: c.prodNum1.Int(), + Production2: c.prodNum2.Int(), + ResolvedBy: c.resolvedBy.Int(), + } + + _, _, p := tab.getAction(s.num, c.sym.Num()) + conflict.AdoptedProduction = p.Int() + + rr = append(rr, conflict) + } + + sort.Slice(rr, func(i, j int) bool { + return rr[i].Symbol < rr[j].Symbol + }) + } + + states[s.num.Int()] = &spec.State{ + Number: s.num.Int(), + Kernel: kernel, + Shift: shift, + Reduce: reduce, + GoTo: goTo, + SRConflict: sr, + RRConflict: rr, + } + } + } + + return &spec.Report{ + Terminals: terms, + NonTerminals: nonTerms, + Productions: prods, + States: states, + }, nil +} diff --git a/src/urubu/grammar/production.go b/src/urubu/grammar/production.go new file mode 100644 index 0000000..8f6c103 --- /dev/null +++ b/src/urubu/grammar/production.go @@ -0,0 +1,117 @@ +package grammar + +import ( + "crypto/sha256" + "encoding/hex" + "fmt" + + "urubu/grammar/symbol" +) + +type productionID [32]byte + +func (id productionID) String() string { + return hex.EncodeToString(id[:]) +} + +func genProductionID(lhs symbol.Symbol, rhs []symbol.Symbol) productionID { + seq := lhs.Byte() + for _, sym := range rhs { + seq = append(seq, sym.Byte()...) 
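+		// Symbol.Byte yields a fixed two-byte big-endian encoding, so the SHA-256
+		// digest below identifies a production purely by its LHS and RHS sequence.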
+ } + return productionID(sha256.Sum256(seq)) +} + +type productionNum uint16 + +const ( + productionNumNil = productionNum(0) + productionNumStart = productionNum(1) + productionNumMin = productionNum(2) +) + +func (n productionNum) Int() int { + return int(n) +} + +type production struct { + id productionID + num productionNum + lhs symbol.Symbol + rhs []symbol.Symbol + rhsLen int +} + +func newProduction(lhs symbol.Symbol, rhs []symbol.Symbol) (*production, error) { + if lhs.IsNil() { + return nil, fmt.Errorf("LHS must be a non-nil symbol; LHS: %v, RHS: %v", lhs, rhs) + } + for _, sym := range rhs { + if sym.IsNil() { + return nil, fmt.Errorf("a symbol of RHS must be a non-nil symbol; LHS: %v, RHS: %v", lhs, rhs) + } + } + + return &production{ + id: genProductionID(lhs, rhs), + lhs: lhs, + rhs: rhs, + rhsLen: len(rhs), + }, nil +} + +func (p *production) isEmpty() bool { + return p.rhsLen == 0 +} + +type productionSet struct { + lhs2Prods map[symbol.Symbol][]*production + id2Prod map[productionID]*production + num productionNum +} + +func newProductionSet() *productionSet { + return &productionSet{ + lhs2Prods: map[symbol.Symbol][]*production{}, + id2Prod: map[productionID]*production{}, + num: productionNumMin, + } +} + +func (ps *productionSet) append(prod *production) { + if _, ok := ps.id2Prod[prod.id]; ok { + return + } + + if prod.lhs.IsStart() { + prod.num = productionNumStart + } else { + prod.num = ps.num + ps.num++ + } + + if prods, ok := ps.lhs2Prods[prod.lhs]; ok { + ps.lhs2Prods[prod.lhs] = append(prods, prod) + } else { + ps.lhs2Prods[prod.lhs] = []*production{prod} + } + ps.id2Prod[prod.id] = prod +} + +func (ps *productionSet) findByID(id productionID) (*production, bool) { + prod, ok := ps.id2Prod[id] + return prod, ok +} + +func (ps *productionSet) findByLHS(lhs symbol.Symbol) ([]*production, bool) { + if lhs.IsNil() { + return nil, false + } + + prods, ok := ps.lhs2Prods[lhs] + return prods, ok +} + +func (ps *productionSet) getAllProductions() map[productionID]*production { + return ps.id2Prod +} diff --git a/src/urubu/grammar/semantic_error.go b/src/urubu/grammar/semantic_error.go new file mode 100644 index 0000000..88a6b17 --- /dev/null +++ b/src/urubu/grammar/semantic_error.go @@ -0,0 +1,30 @@ +package grammar + +import "errors" + +var ( + semErrNoGrammarName = errors.New("name is missing") + semErrSpellingInconsistency = errors.New("the identifiers are treated as the same. 
please use the same spelling")
+	semErrDuplicateAssoc         = errors.New("associativity and precedence cannot be specified multiple times for a symbol")
+	semErrUndefinedPrec          = errors.New("symbol must have precedence")
+	semErrUndefinedOrdSym        = errors.New("undefined ordered symbol")
+	semErrUnusedProduction       = errors.New("unused production")
+	semErrUnusedTerminal         = errors.New("unused terminal")
+	semErrTermCannotBeSkipped    = errors.New("a terminal used in productions cannot be skipped")
+	semErrNoProduction           = errors.New("a grammar needs at least one production")
+	semErrUndefinedSym           = errors.New("undefined symbol")
+	semErrDuplicateProduction    = errors.New("duplicate production")
+	semErrDuplicateTerminal      = errors.New("duplicate terminal")
+	semErrDuplicateFragment      = errors.New("duplicate fragment")
+	semErrDuplicateName          = errors.New("duplicate names are not allowed between terminals and non-terminals")
+	semErrErrSymIsReserved       = errors.New("symbol 'error' is reserved as a terminal symbol")
+	semErrDuplicateLabel         = errors.New("a label must be unique in an alternative")
+	semErrInvalidLabel           = errors.New("a label must differ from terminal symbols or non-terminal symbols")
+	semErrDirInvalidName         = errors.New("invalid directive name")
+	semErrDirInvalidParam        = errors.New("invalid parameter")
+	semErrDuplicateDir           = errors.New("a directive must not be duplicated")
+	semErrDuplicateElem          = errors.New("duplicate element")
+	semErrAmbiguousElem          = errors.New("ambiguous element")
+	semErrInvalidProdDir         = errors.New("invalid production directive")
+	semErrInvalidAltDir          = errors.New("invalid alternative directive")
+)
diff --git a/src/urubu/grammar/symbol/symbol.go b/src/urubu/grammar/symbol/symbol.go
new file mode 100644
index 0000000..f9e6a93
--- /dev/null
+++ b/src/urubu/grammar/symbol/symbol.go
@@ -0,0 +1,295 @@
+package symbol
+
+import (
+	"fmt"
+	"sort"
+)
+
+type symbolKind string
+
+const (
+	symbolKindNonTerminal = symbolKind("non-terminal")
+	symbolKindTerminal    = symbolKind("terminal")
+)
+
+func (t symbolKind) String() string {
+	return string(t)
+}
+
+type SymbolNum uint16
+
+func (n SymbolNum) Int() int {
+	return int(n)
+}
+
+type Symbol uint16
+
+func (s Symbol) String() string {
+	kind, isStart, isEOF, num := s.describe()
+	var prefix string
+	switch {
+	case isStart:
+		prefix = "s"
+	case isEOF:
+		prefix = "e"
+	case kind == symbolKindNonTerminal:
+		prefix = "n"
+	case kind == symbolKindTerminal:
+		prefix = "t"
+	default:
+		prefix = "?"
+	}
+	return fmt.Sprintf("%v%v", prefix, num)
+}
+
+const (
+	maskKindPart    = uint16(0x8000) // 1000 0000 0000 0000
+	maskNonTerminal = uint16(0x0000) // 0000 0000 0000 0000
+	maskTerminal    = uint16(0x8000) // 1000 0000 0000 0000
+
+	maskSubKindpart    = uint16(0x4000) // 0100 0000 0000 0000
+	maskNonStartAndEOF = uint16(0x0000) // 0000 0000 0000 0000
+	maskStartOrEOF     = uint16(0x4000) // 0100 0000 0000 0000
+
+	maskNumberPart = uint16(0x3fff) // 0011 1111 1111 1111
+
+	symbolNumStart = uint16(0x0001) // 0000 0000 0000 0001
+	symbolNumEOF   = uint16(0x0001) // 0000 0000 0000 0001
+
+	SymbolNil   = Symbol(0)                                                 // 0000 0000 0000 0000
+	symbolStart = Symbol(maskNonTerminal | maskStartOrEOF | symbolNumStart) // 0100 0000 0000 0001
+	SymbolEOF   = Symbol(maskTerminal | maskStartOrEOF | symbolNumEOF)      // 1100 0000 0000 0001: The EOF symbol is treated as a terminal symbol.
+
+	// The symbol name contains `<` and `>` to avoid conflicting with user-defined symbols.
+	symbolNameEOF = "<eof>"
+
+	nonTerminalNumMin = SymbolNum(2) // The number 1 is used by a start symbol.
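+	// Combining the masks above: for example, a terminal with number 2 is encoded
+	// as 1000 0000 0000 0010 (0x8002) — the kind bit marks a terminal, the
+	// sub-kind bit is clear, and the low 14 bits carry the number.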
+ terminalNumMin = SymbolNum(2) // The number 1 is used by the EOF symbol. + symbolNumMax = SymbolNum(0xffff) >> 2 // 0011 1111 1111 1111 +) + +func newSymbol(kind symbolKind, isStart bool, num SymbolNum) (Symbol, error) { + if num > symbolNumMax { + return SymbolNil, fmt.Errorf("a symbol number exceeds the limit; limit: %v, passed: %v", symbolNumMax, num) + } + if kind == symbolKindTerminal && isStart { + return SymbolNil, fmt.Errorf("a start symbol must be a non-terminal symbol") + } + + kindMask := maskNonTerminal + if kind == symbolKindTerminal { + kindMask = maskTerminal + } + startMask := maskNonStartAndEOF + if isStart { + startMask = maskStartOrEOF + } + return Symbol(kindMask | startMask | uint16(num)), nil +} + +func (s Symbol) Num() SymbolNum { + _, _, _, num := s.describe() + return num +} + +func (s Symbol) Byte() []byte { + if s.IsNil() { + return []byte{0, 0} + } + return []byte{byte(uint16(s) >> 8), byte(uint16(s) & 0x00ff)} +} + +func (s Symbol) IsNil() bool { + _, _, _, num := s.describe() + return num == 0 +} + +func (s Symbol) IsStart() bool { + if s.IsNil() { + return false + } + _, isStart, _, _ := s.describe() + return isStart +} + +func (s Symbol) isEOF() bool { + if s.IsNil() { + return false + } + _, _, isEOF, _ := s.describe() + return isEOF +} + +func (s Symbol) isNonTerminal() bool { + if s.IsNil() { + return false + } + kind, _, _, _ := s.describe() + return kind == symbolKindNonTerminal +} + +func (s Symbol) IsTerminal() bool { + if s.IsNil() { + return false + } + return !s.isNonTerminal() +} + +func (s Symbol) describe() (symbolKind, bool, bool, SymbolNum) { + kind := symbolKindNonTerminal + if uint16(s)&maskKindPart > 0 { + kind = symbolKindTerminal + } + isStart := false + isEOF := false + if uint16(s)&maskSubKindpart > 0 { + if kind == symbolKindNonTerminal { + isStart = true + } else { + isEOF = true + } + } + num := SymbolNum(uint16(s) & maskNumberPart) + return kind, isStart, isEOF, num +} + +type SymbolTable struct { + text2Sym map[string]Symbol + sym2Text map[Symbol]string + nonTermTexts []string + termTexts []string + nonTermNum SymbolNum + termNum SymbolNum +} + +type SymbolTableWriter struct { + *SymbolTable +} + +type SymbolTableReader struct { + *SymbolTable +} + +func NewSymbolTable() *SymbolTable { + return &SymbolTable{ + text2Sym: map[string]Symbol{ + symbolNameEOF: SymbolEOF, + }, + sym2Text: map[Symbol]string{ + SymbolEOF: symbolNameEOF, + }, + termTexts: []string{ + "", // Nil + symbolNameEOF, // EOF + }, + nonTermTexts: []string{ + "", // Nil + "", // Start Symbol + }, + nonTermNum: nonTerminalNumMin, + termNum: terminalNumMin, + } +} + +func (t *SymbolTable) Writer() *SymbolTableWriter { + return &SymbolTableWriter{ + SymbolTable: t, + } +} + +func (t *SymbolTable) Reader() *SymbolTableReader { + return &SymbolTableReader{ + SymbolTable: t, + } +} + +func (w *SymbolTableWriter) RegisterStartSymbol(text string) (Symbol, error) { + w.text2Sym[text] = symbolStart + w.sym2Text[symbolStart] = text + w.nonTermTexts[symbolStart.Num().Int()] = text + return symbolStart, nil +} + +func (w *SymbolTableWriter) RegisterNonTerminalSymbol(text string) (Symbol, error) { + if sym, ok := w.text2Sym[text]; ok { + return sym, nil + } + sym, err := newSymbol(symbolKindNonTerminal, false, w.nonTermNum) + if err != nil { + return SymbolNil, err + } + w.nonTermNum++ + w.text2Sym[text] = sym + w.sym2Text[sym] = text + w.nonTermTexts = append(w.nonTermTexts, text) + return sym, nil +} + +func (w *SymbolTableWriter) RegisterTerminalSymbol(text string) (Symbol, 
error) {
+	if sym, ok := w.text2Sym[text]; ok {
+		return sym, nil
+	}
+	sym, err := newSymbol(symbolKindTerminal, false, w.termNum)
+	if err != nil {
+		return SymbolNil, err
+	}
+	w.termNum++
+	w.text2Sym[text] = sym
+	w.sym2Text[sym] = text
+	w.termTexts = append(w.termTexts, text)
+	return sym, nil
+}
+
+func (r *SymbolTableReader) ToSymbol(text string) (Symbol, bool) {
+	if sym, ok := r.text2Sym[text]; ok {
+		return sym, true
+	}
+	return SymbolNil, false
+}
+
+func (r *SymbolTableReader) ToText(sym Symbol) (string, bool) {
+	text, ok := r.sym2Text[sym]
+	return text, ok
+}
+
+func (r *SymbolTableReader) TerminalSymbols() []Symbol {
+	syms := make([]Symbol, 0, r.termNum.Int()-terminalNumMin.Int())
+	for sym := range r.sym2Text {
+		if !sym.IsTerminal() || sym.IsNil() {
+			continue
+		}
+		syms = append(syms, sym)
+	}
+	sort.Slice(syms, func(i, j int) bool {
+		return syms[i] < syms[j]
+	})
+	return syms
+}
+
+func (r *SymbolTableReader) TerminalTexts() ([]string, error) {
+	if r.termNum == terminalNumMin {
+		return nil, fmt.Errorf("symbol table has no terminals")
+	}
+	return r.termTexts, nil
+}
+
+func (r *SymbolTableReader) NonTerminalSymbols() []Symbol {
+	syms := make([]Symbol, 0, r.nonTermNum.Int()-nonTerminalNumMin.Int())
+	for sym := range r.sym2Text {
+		if !sym.isNonTerminal() || sym.IsNil() {
+			continue
+		}
+		syms = append(syms, sym)
+	}
+	sort.Slice(syms, func(i, j int) bool {
+		return syms[i] < syms[j]
+	})
+	return syms
+}
+
+func (r *SymbolTableReader) NonTerminalTexts() ([]string, error) {
+	if r.nonTermNum == nonTerminalNumMin || r.nonTermTexts[symbolStart.Num().Int()] == "" {
+		return nil, fmt.Errorf("symbol table has no non-terminals or no start symbol")
+	}
+	return r.nonTermTexts, nil
+}