Diffstat (limited to 'src/urubu/grammar')
-rw-r--r-- src/urubu/grammar/first.go 163
-rw-r--r-- src/urubu/grammar/grammar.go 1404
-rw-r--r-- src/urubu/grammar/item.go 212
-rw-r--r-- src/urubu/grammar/lalr1.go 328
-rw-r--r-- src/urubu/grammar/lexical/compiler.go 418
-rw-r--r-- src/urubu/grammar/lexical/dfa/dfa.go 173
-rw-r--r-- src/urubu/grammar/lexical/dfa/symbol_position.go 182
-rw-r--r-- src/urubu/grammar/lexical/dfa/tree.go 567
-rw-r--r-- src/urubu/grammar/lexical/entry.go 171
-rw-r--r-- src/urubu/grammar/lexical/parser/error.go 36
-rw-r--r-- src/urubu/grammar/lexical/parser/fragment.go 72
-rw-r--r-- src/urubu/grammar/lexical/parser/lexer.go 594
-rw-r--r-- src/urubu/grammar/lexical/parser/parser.go 531
-rw-r--r-- src/urubu/grammar/lexical/parser/tree.go 459
-rw-r--r-- src/urubu/grammar/lr0.go 197
-rw-r--r-- src/urubu/grammar/parsing_table.go 553
-rw-r--r-- src/urubu/grammar/production.go 117
-rw-r--r-- src/urubu/grammar/semantic_error.go 30
-rw-r--r-- src/urubu/grammar/symbol/symbol.go 295
19 files changed, 6502 insertions, 0 deletions
diff --git a/src/urubu/grammar/first.go b/src/urubu/grammar/first.go
new file mode 100644
index 0000000..6443bcf
--- /dev/null
+++ b/src/urubu/grammar/first.go
@@ -0,0 +1,163 @@
+package grammar
+
+import (
+ "fmt"
+
+ "urubu/grammar/symbol"
+)
+
+type firstEntry struct {
+ symbols map[symbol.Symbol]struct{}
+ empty bool
+}
+
+func newFirstEntry() *firstEntry {
+ return &firstEntry{
+ symbols: map[symbol.Symbol]struct{}{},
+ empty: false,
+ }
+}
+
+func (e *firstEntry) add(sym symbol.Symbol) bool {
+ if _, ok := e.symbols[sym]; ok {
+ return false
+ }
+ e.symbols[sym] = struct{}{}
+ return true
+}
+
+func (e *firstEntry) addEmpty() bool {
+ if !e.empty {
+ e.empty = true
+ return true
+ }
+ return false
+}
+
+func (e *firstEntry) mergeExceptEmpty(src *firstEntry) bool {
+ if src == nil {
+ return false
+ }
+ changed := false
+ for sym := range src.symbols {
+ added := e.add(sym)
+ if added {
+ changed = true
+ }
+ }
+ return changed
+}
+
+type firstSet struct {
+ set map[symbol.Symbol]*firstEntry
+}
+
+func newFirstSet(prods *productionSet) *firstSet {
+ fst := &firstSet{
+ set: map[symbol.Symbol]*firstEntry{},
+ }
+ for _, prod := range prods.getAllProductions() {
+ if _, ok := fst.set[prod.lhs]; ok {
+ continue
+ }
+ fst.set[prod.lhs] = newFirstEntry()
+ }
+
+ return fst
+}
+
+func (fst *firstSet) find(prod *production, head int) (*firstEntry, error) {
+ entry := newFirstEntry()
+ if prod.rhsLen <= head {
+ entry.addEmpty()
+ return entry, nil
+ }
+ for _, sym := range prod.rhs[head:] {
+ if sym.IsTerminal() {
+ entry.add(sym)
+ return entry, nil
+ }
+
+ e := fst.findBySymbol(sym)
+ if e == nil {
+ return nil, fmt.Errorf("an entry of FIRST was not found; symbol: %s", sym)
+ }
+ for s := range e.symbols {
+ entry.add(s)
+ }
+ if !e.empty {
+ return entry, nil
+ }
+ }
+ entry.addEmpty()
+ return entry, nil
+}
+
+func (fst *firstSet) findBySymbol(sym symbol.Symbol) *firstEntry {
+ return fst.set[sym]
+}
+
+type firstComContext struct {
+ first *firstSet
+}
+
+func newFirstComContext(prods *productionSet) *firstComContext {
+ return &firstComContext{
+ first: newFirstSet(prods),
+ }
+}
+
+func genFirstSet(prods *productionSet) (*firstSet, error) {
+ cc := newFirstComContext(prods)
+ for {
+ more := false
+ for _, prod := range prods.getAllProductions() {
+ e := cc.first.findBySymbol(prod.lhs)
+ changed, err := genProdFirstEntry(cc, e, prod)
+ if err != nil {
+ return nil, err
+ }
+ if changed {
+ more = true
+ }
+ }
+ if !more {
+ break
+ }
+ }
+ return cc.first, nil
+}
+
+func genProdFirstEntry(cc *firstComContext, acc *firstEntry, prod *production) (bool, error) {
+ if prod.isEmpty() {
+ return acc.addEmpty(), nil
+ }
+
+ for _, sym := range prod.rhs {
+ if sym.IsTerminal() {
+ return acc.add(sym), nil
+ }
+
+ e := cc.first.findBySymbol(sym)
+ changed := acc.mergeExceptEmpty(e)
+ if !e.empty {
+ return changed, nil
+ }
+ }
+ return acc.addEmpty(), nil
+}
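+
+// An illustrative example (not from the original source): for the grammar
+//
+//   e  → t e'
+//   e' → '+' t e' | ε
+//   t  → ID
+//
+// the fixed-point iteration in genFirstSet converges to
+//
+//   FIRST(t)  = {ID}
+//   FIRST(e') = {'+', ε}
+//   FIRST(e)  = {ID}
+//
+// genProdFirstEntry adds ε to an entry only when every symbol of the
+// alternative is nullable, which is why ε does not appear in FIRST(e).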
diff --git a/src/urubu/grammar/grammar.go b/src/urubu/grammar/grammar.go
new file mode 100644
index 0000000..bfa53c6
--- /dev/null
+++ b/src/urubu/grammar/grammar.go
@@ -0,0 +1,1404 @@
+package grammar
+
+import (
+ "errors"
+ "fmt"
+ "io"
+ "strings"
+
+ verr "urubu/error"
+ "urubu/grammar/lexical"
+ "urubu/grammar/symbol"
+ spec "urubu/spec/grammar"
+ "urubu/spec/grammar/parser"
+)
+
+type astActionEntry struct {
+ position int
+ expansion bool
+}
+
+type assocType string
+
+const (
+ assocTypeNil = assocType("")
+ assocTypeLeft = assocType("left")
+ assocTypeRight = assocType("right")
+)
+
+const (
+ precNil = 0
+ precMin = 1
+)
+
+// precAndAssoc represents precedence and associativities of terminal symbols and productions.
+// We use the priority of the production to resolve shift/reduce conflicts.
+type precAndAssoc struct {
+ // termPrec and termAssoc represent the precedence of the terminal symbols.
+ termPrec map[symbol.SymbolNum]int
+ termAssoc map[symbol.SymbolNum]assocType
+
+ // prodPrec and prodAssoc represent the precedence and the associativities of the production.
+ // These values are inherited from the right-most terminal symbols in the RHS of the productions.
+ prodPrec map[productionNum]int
+ prodAssoc map[productionNum]assocType
+}
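+
+// For illustration (an assumed example, not from the original source):
+// with a directive group declaring '#left add' on one row and '#left mul'
+// on a later row, 'add' and 'mul' receive distinct precedence numbers
+// (rows are numbered upward from precMin) and assocTypeLeft. A production
+// such as 'expr → expr add expr' that carries no #prec directive inherits
+// the precedence and associativity of its right-most terminal, 'add', and
+// the parsing-table builder uses those values to resolve shift/reduce
+// conflicts against 'mul'.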
+
+func (pa *precAndAssoc) terminalPrecedence(sym symbol.SymbolNum) int {
+ prec, ok := pa.termPrec[sym]
+ if !ok {
+ return precNil
+ }
+
+ return prec
+}
+
+func (pa *precAndAssoc) terminalAssociativity(sym symbol.SymbolNum) assocType {
+ assoc, ok := pa.termAssoc[sym]
+ if !ok {
+ return assocTypeNil
+ }
+
+ return assoc
+}
+
+func (pa *precAndAssoc) productionPredence(prod productionNum) int {
+ prec, ok := pa.prodPrec[prod]
+ if !ok {
+ return precNil
+ }
+
+ return prec
+}
+
+func (pa *precAndAssoc) productionAssociativity(prod productionNum) assocType {
+ assoc, ok := pa.prodAssoc[prod]
+ if !ok {
+ return assocTypeNil
+ }
+
+ return assoc
+}
+
+const reservedSymbolNameError = "error"
+
+type Grammar struct {
+ name string
+ lexSpec *lexical.LexSpec
+ skipSymbols []symbol.Symbol
+ productionSet *productionSet
+ augmentedStartSymbol symbol.Symbol
+ errorSymbol symbol.Symbol
+ symbolTable *symbol.SymbolTableReader
+ astActions map[productionID][]*astActionEntry
+ precAndAssoc *precAndAssoc
+
+ // recoverProductions is a set of productions having the recover directive.
+ recoverProductions map[productionID]struct{}
+}
+
+type buildConfig struct {
+ isReportingEnabled bool
+}
+
+type BuildOption func(config *buildConfig)
+
+func EnableReporting() BuildOption {
+ return func(config *buildConfig) {
+ config.isReportingEnabled = true
+ }
+}
+
+type GrammarBuilder struct {
+ AST *parser.RootNode
+
+ errs verr.SpecErrors
+}
+
+func (b *GrammarBuilder) Build(opts ...BuildOption) (*spec.CompiledGrammar, *spec.Report, error) {
+ gram, err := b.build()
+ if err != nil {
+ return nil, nil, err
+ }
+
+ return compile(gram, opts...)
+}
+
+func (b *GrammarBuilder) build() (*Grammar, error) {
+ var specName string
+ {
+ errOccurred := false
+ for _, dir := range b.AST.Directives {
+ if dir.Name != "name" {
+ continue
+ }
+
+ if len(dir.Parameters) != 1 || dir.Parameters[0].ID == "" {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrDirInvalidParam,
+ Detail: "'name' takes just one ID parameter",
+ Row: dir.Pos.Row,
+ Col: dir.Pos.Col,
+ })
+
+ errOccurred = true
+ break
+ }
+
+ specName = dir.Parameters[0].ID
+ break
+ }
+
+ if specName == "" && !errOccurred {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrNoGrammarName,
+ })
+ }
+ }
+
+ b.checkSpellingInconsistenciesOfUserDefinedIDs(b.AST)
+ if len(b.errs) > 0 {
+ return nil, b.errs
+ }
+
+ symTab, ss, err := b.genSymbolTable(b.AST)
+ if err != nil {
+ return nil, err
+ }
+
+ lexSpec, skip, err := b.genLexSpecAndSkipSymbols(symTab.Reader(), b.AST)
+ if err != nil {
+ return nil, err
+ }
+
+ prodsAndActs, err := b.genProductionsAndActions(b.AST, symTab.Reader(), ss.errSym, ss.augStartSym, ss.startSym)
+ if err != nil {
+ return nil, err
+ }
+ if prodsAndActs == nil && len(b.errs) > 0 {
+ return nil, b.errs
+ }
+
+ pa, err := b.genPrecAndAssoc(symTab.Reader(), ss.errSym, prodsAndActs)
+ if err != nil {
+ return nil, err
+ }
+ if pa == nil && len(b.errs) > 0 {
+ return nil, b.errs
+ }
+
+ syms := findUsedAndUnusedSymbols(b.AST)
+ if syms == nil && len(b.errs) > 0 {
+ return nil, b.errs
+ }
+
+ // When a terminal symbol that cannot be reached from the start symbol has the skip directive,
+ // the compiler treats that terminal as a used symbol, not an unused one.
+ {
+ r := symTab.Reader()
+ for _, sym := range skip {
+ s, _ := r.ToText(sym)
+ if _, ok := syms.unusedTerminals[s]; !ok {
+ prod := syms.usedTerminals[s]
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrTermCannotBeSkipped,
+ Detail: s,
+ Row: prod.Pos.Row,
+ Col: prod.Pos.Col,
+ })
+ continue
+ }
+
+ delete(syms.unusedTerminals, s)
+ }
+ }
+
+ for sym, prod := range syms.unusedProductions {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrUnusedProduction,
+ Detail: sym,
+ Row: prod.Pos.Row,
+ Col: prod.Pos.Col,
+ })
+ }
+
+ for sym, prod := range syms.unusedTerminals {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrUnusedTerminal,
+ Detail: sym,
+ Row: prod.Pos.Row,
+ Col: prod.Pos.Col,
+ })
+ }
+
+ if len(b.errs) > 0 {
+ return nil, b.errs
+ }
+
+ return &Grammar{
+ name: specName,
+ lexSpec: lexSpec,
+ skipSymbols: skip,
+ productionSet: prodsAndActs.prods,
+ augmentedStartSymbol: prodsAndActs.augStartSym,
+ errorSymbol: ss.errSym,
+ symbolTable: symTab.Reader(),
+ astActions: prodsAndActs.astActs,
+ recoverProductions: prodsAndActs.recoverProds,
+ precAndAssoc: pa,
+ }, nil
+}
+
+type usedAndUnusedSymbols struct {
+ unusedProductions map[string]*parser.ProductionNode
+ unusedTerminals map[string]*parser.ProductionNode
+ usedTerminals map[string]*parser.ProductionNode
+}
+
+func findUsedAndUnusedSymbols(root *parser.RootNode) *usedAndUnusedSymbols {
+ prods := map[string]*parser.ProductionNode{}
+ lexProds := map[string]*parser.ProductionNode{}
+ mark := map[string]bool{}
+ {
+ for _, p := range root.Productions {
+ prods[p.LHS] = p
+ mark[p.LHS] = false
+ for _, alt := range p.RHS {
+ for _, e := range alt.Elements {
+ if e.ID == "" {
+ continue
+ }
+ mark[e.ID] = false
+ }
+ }
+ }
+
+ for _, p := range root.LexProductions {
+ lexProds[p.LHS] = p
+ mark[p.LHS] = false
+ }
+
+ start := root.Productions[0]
+ mark[start.LHS] = true
+ markUsedSymbols(mark, map[string]bool{}, prods, start)
+
+ // We don't have to check the error symbol because the error symbol doesn't have a production.
+ delete(mark, reservedSymbolNameError)
+ }
+
+ usedTerms := make(map[string]*parser.ProductionNode, len(lexProds))
+ unusedProds := map[string]*parser.ProductionNode{}
+ unusedTerms := map[string]*parser.ProductionNode{}
+ for sym, used := range mark {
+ if p, ok := prods[sym]; ok {
+ if used {
+ continue
+ }
+ unusedProds[sym] = p
+ continue
+ }
+ if p, ok := lexProds[sym]; ok {
+ if used {
+ usedTerms[sym] = p
+ } else {
+ unusedTerms[sym] = p
+ }
+ continue
+ }
+
+ // Control may reach here when a fragment name appears on the right-hand side of a production rule. However,
+ // an error stating that a production rule cannot contain a fragment will be detected in a subsequent
+ // pass, so we can ignore it here.
+ }
+
+ return &usedAndUnusedSymbols{
+ usedTerminals: usedTerms,
+ unusedProductions: unusedProds,
+ unusedTerminals: unusedTerms,
+ }
+}
+
+func markUsedSymbols(mark map[string]bool, marked map[string]bool, prods map[string]*parser.ProductionNode, prod *parser.ProductionNode) {
+ if marked[prod.LHS] {
+ return
+ }
+
+ for _, alt := range prod.RHS {
+ for _, e := range alt.Elements {
+ if e.ID == "" {
+ continue
+ }
+
+ mark[e.ID] = true
+
+ p, ok := prods[e.ID]
+ if !ok {
+ continue
+ }
+
+ // Mark the production as visited to avoid infinite recursion.
+ marked[prod.LHS] = true
+
+ markUsedSymbols(mark, marked, prods, p)
+ }
+ }
+}
+
+func (b *GrammarBuilder) checkSpellingInconsistenciesOfUserDefinedIDs(root *parser.RootNode) {
+ var ids []string
+ {
+ for _, prod := range root.Productions {
+ ids = append(ids, prod.LHS)
+ for _, alt := range prod.RHS {
+ for _, elem := range alt.Elements {
+ if elem.Label != nil {
+ ids = append(ids, elem.Label.Name)
+ }
+ }
+ }
+ }
+ for _, prod := range root.LexProductions {
+ ids = append(ids, prod.LHS)
+ }
+ for _, dir := range root.Directives {
+ dirIDs := collectUserDefinedIDsFromDirective(dir)
+ if len(dirIDs) > 0 {
+ ids = append(ids, dirIDs...)
+ }
+ }
+ }
+
+ duplicated := lexical.FindSpellingInconsistencies(ids)
+ if len(duplicated) == 0 {
+ return
+ }
+
+ for _, dup := range duplicated {
+ var s string
+ {
+ var b strings.Builder
+ fmt.Fprintf(&b, "%+v", dup[0])
+ for _, id := range dup[1:] {
+ fmt.Fprintf(&b, ", %+v", id)
+ }
+ s = b.String()
+ }
+
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrSpellingInconsistency,
+ Detail: s,
+ })
+ }
+}
+
+func collectUserDefinedIDsFromDirective(dir *parser.DirectiveNode) []string {
+ var ids []string
+ for _, param := range dir.Parameters {
+ if param.Group != nil {
+ for _, d := range param.Group {
+ dIDs := collectUserDefinedIDsFromDirective(d)
+ if len(dIDs) > 0 {
+ ids = append(ids, dIDs...)
+ }
+ }
+ }
+ if param.OrderedSymbol != "" {
+ ids = append(ids, param.OrderedSymbol)
+ }
+ }
+ return ids
+}
+
+type symbols struct {
+ errSym symbol.Symbol
+ augStartSym symbol.Symbol
+ startSym symbol.Symbol
+}
+
+func (b *GrammarBuilder) genSymbolTable(root *parser.RootNode) (*symbol.SymbolTable, *symbols, error) {
+ symTab := symbol.NewSymbolTable()
+ w := symTab.Writer()
+ r := symTab.Reader()
+
+ // We need to register the reserved symbol before registering others.
+ var errSym symbol.Symbol
+ {
+ sym, err := w.RegisterTerminalSymbol(reservedSymbolNameError)
+ if err != nil {
+ return nil, nil, err
+ }
+ errSym = sym
+ }
+
+ for _, prod := range root.LexProductions {
+ if sym, exist := r.ToSymbol(prod.LHS); exist {
+ if sym == errSym {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrErrSymIsReserved,
+ Row: prod.Pos.Row,
+ Col: prod.Pos.Col,
+ })
+ } else {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrDuplicateTerminal,
+ Detail: prod.LHS,
+ Row: prod.Pos.Row,
+ Col: prod.Pos.Col,
+ })
+ }
+
+ continue
+ }
+
+ _, err := w.RegisterTerminalSymbol(prod.LHS)
+ if err != nil {
+ return nil, nil, err
+ }
+ }
+
+ startProd := root.Productions[0]
+ augStartText := fmt.Sprintf("%s'", startProd.LHS)
+ var err error
+ augStartSym, err := w.RegisterStartSymbol(augStartText)
+ if err != nil {
+ return nil, nil, err
+ }
+ if augStartSym == errSym {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrErrSymIsReserved,
+ Row: startProd.Pos.Row,
+ Col: startProd.Pos.Col,
+ })
+ }
+
+ startSym, err := w.RegisterNonTerminalSymbol(startProd.LHS)
+ if err != nil {
+ return nil, nil, err
+ }
+ if startSym == errSym {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrErrSymIsReserved,
+ Row: startProd.Pos.Row,
+ Col: startProd.Pos.Col,
+ })
+ }
+
+ for _, prod := range root.Productions {
+ sym, err := w.RegisterNonTerminalSymbol(prod.LHS)
+ if err != nil {
+ return nil, nil, err
+ }
+ if sym.IsTerminal() {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrDuplicateName,
+ Detail: prod.LHS,
+ Row: prod.Pos.Row,
+ Col: prod.Pos.Col,
+ })
+ }
+ if sym == errSym {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrErrSymIsReserved,
+ Row: prod.Pos.Row,
+ Col: prod.Pos.Col,
+ })
+ }
+ }
+
+ return symTab, &symbols{
+ errSym: errSym,
+ augStartSym: augStartSym,
+ startSym: startSym,
+ }, nil
+}
+
+func (b *GrammarBuilder) genLexSpecAndSkipSymbols(symTab *symbol.SymbolTableReader, root *parser.RootNode) (*lexical.LexSpec, []symbol.Symbol, error) {
+ entries := []*lexical.LexEntry{}
+ skipSyms := []symbol.Symbol{}
+ for _, prod := range root.LexProductions {
+ entry, skip, specErr, err := genLexEntry(prod)
+ if err != nil {
+ return nil, nil, err
+ }
+ if specErr != nil {
+ b.errs = append(b.errs, specErr)
+ continue
+ }
+ if skip {
+ sym, _ := symTab.ToSymbol(prod.LHS)
+ skipSyms = append(skipSyms, sym)
+ }
+ entries = append(entries, entry)
+ }
+
+ checkedFragments := map[string]struct{}{}
+ for _, fragment := range root.Fragments {
+ if _, exist := checkedFragments[fragment.LHS]; exist {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrDuplicateFragment,
+ Detail: fragment.LHS,
+ Row: fragment.Pos.Row,
+ Col: fragment.Pos.Col,
+ })
+ continue
+ }
+ checkedFragments[fragment.LHS] = struct{}{}
+
+ entries = append(entries, &lexical.LexEntry{
+ Fragment: true,
+ Kind: spec.LexKindName(fragment.LHS),
+ Pattern: fragment.RHS,
+ })
+ }
+
+ return &lexical.LexSpec{
+ Entries: entries,
+ }, skipSyms, nil
+}
+
+func genLexEntry(prod *parser.ProductionNode) (*lexical.LexEntry, bool, *verr.SpecError, error) {
+ alt := prod.RHS[0]
+ elem := alt.Elements[0]
+
+ var pattern string
+ if elem.Literally {
+ pattern = spec.EscapePattern(elem.Pattern)
+ } else {
+ pattern = elem.Pattern
+ }
+
+ var modes []spec.LexModeName
+ var skip bool
+ var push spec.LexModeName
+ var pop bool
+ dirConsumed := map[string]struct{}{}
+ for _, dir := range prod.Directives {
+ if _, consumed := dirConsumed[dir.Name]; consumed {
+ return nil, false, &verr.SpecError{
+ Cause: semErrDuplicateDir,
+ Detail: dir.Name,
+ Row: dir.Pos.Row,
+ Col: dir.Pos.Col,
+ }, nil
+ }
+ dirConsumed[dir.Name] = struct{}{}
+
+ switch dir.Name {
+ case "mode":
+ if len(dir.Parameters) == 0 {
+ return nil, false, &verr.SpecError{
+ Cause: semErrDirInvalidParam,
+ Detail: "'mode' directive needs an ID parameter",
+ Row: dir.Pos.Row,
+ Col: dir.Pos.Col,
+ }, nil
+ }
+ for _, param := range dir.Parameters {
+ if param.ID == "" {
+ return nil, false, &verr.SpecError{
+ Cause: semErrDirInvalidParam,
+ Detail: "'mode' directive needs an ID parameter",
+ Row: param.Pos.Row,
+ Col: param.Pos.Col,
+ }, nil
+ }
+ modes = append(modes, spec.LexModeName(param.ID))
+ }
+ case "skip":
+ if len(dir.Parameters) > 0 {
+ return nil, false, &verr.SpecError{
+ Cause: semErrDirInvalidParam,
+ Detail: "'skip' directive needs no parameter",
+ Row: dir.Pos.Row,
+ Col: dir.Pos.Col,
+ }, nil
+ }
+ skip = true
+ case "push":
+ if len(dir.Parameters) != 1 || dir.Parameters[0].ID == "" {
+ return nil, false, &verr.SpecError{
+ Cause: semErrDirInvalidParam,
+ Detail: "'push' directive needs an ID parameter",
+ Row: dir.Pos.Row,
+ Col: dir.Pos.Col,
+ }, nil
+ }
+ push = spec.LexModeName(dir.Parameters[0].ID)
+ case "pop":
+ if len(dir.Parameters) > 0 {
+ return nil, false, &verr.SpecError{
+ Cause: semErrDirInvalidParam,
+ Detail: "'pop' directive needs no parameter",
+ Row: dir.Pos.Row,
+ Col: dir.Pos.Col,
+ }, nil
+ }
+ pop = true
+ default:
+ return nil, false, &verr.SpecError{
+ Cause: semErrDirInvalidName,
+ Detail: dir.Name,
+ Row: dir.Pos.Row,
+ Col: dir.Pos.Col,
+ }, nil
+ }
+ }
+
+ if len(alt.Directives) > 0 {
+ return nil, false, &verr.SpecError{
+ Cause: semErrInvalidAltDir,
+ Detail: "a lexical production cannot have alternative directives",
+ Row: alt.Directives[0].Pos.Row,
+ Col: alt.Directives[0].Pos.Col,
+ }, nil
+ }
+
+ return &lexical.LexEntry{
+ Modes: modes,
+ Kind: spec.LexKindName(prod.LHS),
+ Pattern: pattern,
+ Push: push,
+ Pop: pop,
+ }, skip, nil, nil
+}
+
+type productionsAndActions struct {
+ prods *productionSet
+ augStartSym symbol.Symbol
+ astActs map[productionID][]*astActionEntry
+ prodPrecsTerm map[productionID]symbol.Symbol
+ prodPrecsOrdSym map[productionID]string
+ prodPrecPoss map[productionID]*parser.Position
+ recoverProds map[productionID]struct{}
+}
+
+func (b *GrammarBuilder) genProductionsAndActions(root *parser.RootNode, symTab *symbol.SymbolTableReader, errSym symbol.Symbol, augStartSym symbol.Symbol, startSym symbol.Symbol) (*productionsAndActions, error) {
+ if len(root.Productions) == 0 {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrNoProduction,
+ })
+ return nil, nil
+ }
+
+ prods := newProductionSet()
+ astActs := map[productionID][]*astActionEntry{}
+ prodPrecsTerm := map[productionID]symbol.Symbol{}
+ prodPrecsOrdSym := map[productionID]string{}
+ prodPrecPoss := map[productionID]*parser.Position{}
+ recoverProds := map[productionID]struct{}{}
+
+ p, err := newProduction(augStartSym, []symbol.Symbol{
+ startSym,
+ })
+ if err != nil {
+ return nil, err
+ }
+
+ prods.append(p)
+
+ for _, prod := range root.Productions {
+ lhsSym, ok := symTab.ToSymbol(prod.LHS)
+ if !ok {
+ // All symbols are assumed to be pre-detected, so it's a bug if we cannot find them here.
+ return nil, fmt.Errorf("symbol '%v' is undefined", prod.LHS)
+ }
+
+ if len(prod.Directives) > 0 {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrInvalidProdDir,
+ Detail: "a production cannot have production directives",
+ Row: prod.Directives[0].Pos.Row,
+ Col: prod.Directives[0].Pos.Col,
+ })
+ continue
+ }
+
+ LOOP_RHS:
+ for _, alt := range prod.RHS {
+ altSyms := make([]symbol.Symbol, len(alt.Elements))
+ offsets := map[string]int{}
+ ambiguousIDOffsets := map[string]struct{}{}
+ for i, elem := range alt.Elements {
+ sym, ok := symTab.ToSymbol(elem.ID)
+ if !ok {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrUndefinedSym,
+ Detail: elem.ID,
+ Row: elem.Pos.Row,
+ Col: elem.Pos.Col,
+ })
+ continue LOOP_RHS
+ }
+ altSyms[i] = sym
+
+ if elem.Label != nil {
+ if _, added := offsets[elem.Label.Name]; added {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrDuplicateLabel,
+ Detail: elem.Label.Name,
+ Row: elem.Label.Pos.Row,
+ Col: elem.Label.Pos.Col,
+ })
+ continue LOOP_RHS
+ }
+ if _, found := symTab.ToSymbol(elem.Label.Name); found {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrInvalidLabel,
+ Detail: elem.Label.Name,
+ Row: elem.Label.Pos.Row,
+ Col: elem.Label.Pos.Col,
+ })
+ continue LOOP_RHS
+ }
+ offsets[elem.Label.Name] = i
+ }
+ // A symbol having a label can be specified by both the label and the symbol name.
+ // So record the symbol's position, whether or not it has a label.
+ if elem.ID != "" {
+ if _, exist := offsets[elem.ID]; exist {
+ // When the same symbol appears multiple times in an alternative, the symbol is ambiguous. When we need
+ // to specify the symbol in a directive, we cannot use the name of the ambiguous symbol. Instead, specify
+ // a label to resolve the ambiguity.
+ delete(offsets, elem.ID)
+ ambiguousIDOffsets[elem.ID] = struct{}{}
+ } else {
+ offsets[elem.ID] = i
+ }
+ }
+ }
+
+ p, err := newProduction(lhsSym, altSyms)
+ if err != nil {
+ return nil, err
+ }
+ if _, exist := prods.findByID(p.id); exist {
+ // Report the line number of a duplicate alternative.
+ // When the alternative is empty, we report the position of its LHS.
+ var row int
+ var col int
+ if len(alt.Elements) > 0 {
+ row = alt.Elements[0].Pos.Row
+ col = alt.Elements[0].Pos.Col
+ } else {
+ row = prod.Pos.Row
+ col = prod.Pos.Col
+ }
+
+ var detail string
+ {
+ var b strings.Builder
+ fmt.Fprintf(&b, "%v →", prod.LHS)
+ for _, elem := range alt.Elements {
+ switch {
+ case elem.ID != "":
+ fmt.Fprintf(&b, " %v", elem.ID)
+ case elem.Pattern != "":
+ fmt.Fprintf(&b, ` "%v"`, elem.Pattern)
+ }
+ }
+ if len(alt.Elements) == 0 {
+ fmt.Fprintf(&b, " ε")
+ }
+
+ detail = b.String()
+ }
+
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrDuplicateProduction,
+ Detail: detail,
+ Row: row,
+ Col: col,
+ })
+ continue LOOP_RHS
+ }
+ prods.append(p)
+
+ dirConsumed := map[string]struct{}{}
+ for _, dir := range alt.Directives {
+ if _, consumed := dirConsumed[dir.Name]; consumed {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrDuplicateDir,
+ Detail: dir.Name,
+ Row: dir.Pos.Row,
+ Col: dir.Pos.Col,
+ })
+ }
+ dirConsumed[dir.Name] = struct{}{}
+
+ switch dir.Name {
+ case "ast":
+ if len(dir.Parameters) == 0 {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrDirInvalidParam,
+ Detail: "'ast' directive needs at least one parameter",
+ Row: dir.Pos.Row,
+ Col: dir.Pos.Col,
+ })
+ continue LOOP_RHS
+ }
+ astAct := make([]*astActionEntry, len(dir.Parameters))
+ consumedOffsets := map[int]struct{}{}
+ for i, param := range dir.Parameters {
+ if param.ID == "" {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrDirInvalidParam,
+ Detail: "'ast' directive can take only ID parameters",
+ Row: dir.Pos.Row,
+ Col: dir.Pos.Col,
+ })
+ continue LOOP_RHS
+ }
+
+ if _, ambiguous := ambiguousIDOffsets[param.ID]; ambiguous {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrAmbiguousElem,
+ Detail: fmt.Sprintf("'%v' is ambiguous", param.ID),
+ Row: param.Pos.Row,
+ Col: param.Pos.Col,
+ })
+ continue LOOP_RHS
+ }
+
+ offset, ok := offsets[param.ID]
+ if !ok {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrDirInvalidParam,
+ Detail: fmt.Sprintf("a symbol was not found in an alternative: %v", param.ID),
+ Row: param.Pos.Row,
+ Col: param.Pos.Col,
+ })
+ continue LOOP_RHS
+ }
+ if _, consumed := consumedOffsets[offset]; consumed {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrDuplicateElem,
+ Detail: param.ID,
+ Row: param.Pos.Row,
+ Col: param.Pos.Col,
+ })
+ continue LOOP_RHS
+ }
+ consumedOffsets[offset] = struct{}{}
+
+ if param.Expansion {
+ elem := alt.Elements[offset]
+ if elem.Pattern != "" {
+ // Currently, it is a bug to reach here because it is
+ // forbidden to have anything other than ID appear in
+ // production rules.
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrDirInvalidParam,
+ Detail: fmt.Sprintf("the expansion symbol cannot be applied to a pattern (%v: \"%v\")", param.ID, elem.Pattern),
+ Row: param.Pos.Row,
+ Col: param.Pos.Col,
+ })
+ continue LOOP_RHS
+ }
+ elemSym, ok := symTab.ToSymbol(elem.ID)
+ if !ok {
+ // If the symbol was not found, it's a bug.
+ return nil, fmt.Errorf("a symbol corresponding to an ID (%v) was not found", elem.ID)
+ }
+ if elemSym.IsTerminal() {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrDirInvalidParam,
+ Detail: fmt.Sprintf("the expansion symbol cannot be applied to a terminal symbol (%v: %v)", param.ID, elem.ID),
+ Row: param.Pos.Row,
+ Col: param.Pos.Col,
+ })
+ continue LOOP_RHS
+ }
+ }
+
+ astAct[i] = &astActionEntry{
+ position: offset + 1,
+ expansion: param.Expansion,
+ }
+ }
+ astActs[p.id] = astAct
+ case "prec":
+ if len(dir.Parameters) != 1 || (dir.Parameters[0].ID == "" && dir.Parameters[0].OrderedSymbol == "") {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrDirInvalidParam,
+ Detail: "'prec' directive needs just one ID parameter or ordered symbol",
+ Row: dir.Pos.Row,
+ Col: dir.Pos.Col,
+ })
+ continue LOOP_RHS
+ }
+ param := dir.Parameters[0]
+ switch {
+ case param.ID != "":
+ sym, ok := symTab.ToSymbol(param.ID)
+ if !ok {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrDirInvalidParam,
+ Detail: fmt.Sprintf("unknown terminal symbol: %v", param.ID),
+ Row: param.Pos.Row,
+ Col: param.Pos.Col,
+ })
+ continue LOOP_RHS
+ }
+ if sym == errSym {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrDirInvalidParam,
+ Detail: fmt.Sprintf("'%v' directive cannot be applied to an error symbol", dir.Name),
+ Row: param.Pos.Row,
+ Col: param.Pos.Col,
+ })
+ }
+ if !sym.IsTerminal() {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrDirInvalidParam,
+ Detail: fmt.Sprintf("the symbol must be a terminal: %v", param.ID),
+ Row: param.Pos.Row,
+ Col: param.Pos.Col,
+ })
+ continue LOOP_RHS
+ }
+ prodPrecsTerm[p.id] = sym
+ prodPrecPoss[p.id] = &param.Pos
+ case param.OrderedSymbol != "":
+ prodPrecsOrdSym[p.id] = param.OrderedSymbol
+ prodPrecPoss[p.id] = &param.Pos
+ }
+ case "recover":
+ if len(dir.Parameters) > 0 {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrDirInvalidParam,
+ Detail: "'recover' directive needs no parameter",
+ Row: dir.Pos.Row,
+ Col: dir.Pos.Col,
+ })
+ continue LOOP_RHS
+ }
+ recoverProds[p.id] = struct{}{}
+ default:
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrDirInvalidName,
+ Detail: fmt.Sprintf("invalid directive name '%v'", dir.Name),
+ Row: dir.Pos.Row,
+ Col: dir.Pos.Col,
+ })
+ continue LOOP_RHS
+ }
+ }
+ }
+ }
+
+ return &productionsAndActions{
+ prods: prods,
+ augStartSym: augStartSym,
+ astActs: astActs,
+ prodPrecsTerm: prodPrecsTerm,
+ prodPrecsOrdSym: prodPrecsOrdSym,
+ prodPrecPoss: prodPrecPoss,
+ recoverProds: recoverProds,
+ }, nil
+}
+
+func (b *GrammarBuilder) genPrecAndAssoc(symTab *symbol.SymbolTableReader, errSym symbol.Symbol, prodsAndActs *productionsAndActions) (*precAndAssoc, error) {
+ termPrec := map[symbol.SymbolNum]int{}
+ termAssoc := map[symbol.SymbolNum]assocType{}
+ ordSymPrec := map[string]int{}
+ {
+ var precGroup []*parser.DirectiveNode
+ for _, dir := range b.AST.Directives {
+ if dir.Name == "prec" {
+ if dir.Parameters == nil || len(dir.Parameters) != 1 || dir.Parameters[0].Group == nil {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrDirInvalidParam,
+ Detail: "'prec' needs just one directive group",
+ Row: dir.Pos.Row,
+ Col: dir.Pos.Col,
+ })
+ continue
+ }
+ precGroup = dir.Parameters[0].Group
+ continue
+ }
+
+ if dir.Name != "name" && dir.Name != "prec" {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrDirInvalidName,
+ Detail: dir.Name,
+ Row: dir.Pos.Row,
+ Col: dir.Pos.Col,
+ })
+ continue
+ }
+ }
+
+ precN := precMin
+ for _, dir := range precGroup {
+ var assocTy assocType
+ switch dir.Name {
+ case "left":
+ assocTy = assocTypeLeft
+ case "right":
+ assocTy = assocTypeRight
+ case "assign":
+ assocTy = assocTypeNil
+ default:
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrDirInvalidName,
+ Detail: dir.Name,
+ Row: dir.Pos.Row,
+ Col: dir.Pos.Col,
+ })
+ return nil, nil
+ }
+
+ if len(dir.Parameters) == 0 {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrDirInvalidParam,
+ Detail: "associativity needs at least one symbol",
+ Row: dir.Pos.Row,
+ Col: dir.Pos.Col,
+ })
+ return nil, nil
+ }
+ ASSOC_PARAM_LOOP:
+ for _, p := range dir.Parameters {
+ switch {
+ case p.ID != "":
+ sym, ok := symTab.ToSymbol(p.ID)
+ if !ok {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrDirInvalidParam,
+ Detail: fmt.Sprintf("'%v' is undefined", p.ID),
+ Row: p.Pos.Row,
+ Col: p.Pos.Col,
+ })
+ return nil, nil
+ }
+ if sym == errSym {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrDirInvalidParam,
+ Detail: fmt.Sprintf("'%v' directive cannot be applied to an error symbol", dir.Name),
+ Row: p.Pos.Row,
+ Col: p.Pos.Col,
+ })
+ return nil, nil
+ }
+ if !sym.IsTerminal() {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrDirInvalidParam,
+ Detail: fmt.Sprintf("associativity can take only terminal symbols ('%v' is a non-terminal)", p.ID),
+ Row: p.Pos.Row,
+ Col: p.Pos.Col,
+ })
+ return nil, nil
+ }
+ if prec, alreadySet := termPrec[sym.Num()]; alreadySet {
+ if prec == precN {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrDuplicateAssoc,
+ Detail: fmt.Sprintf("'%v' already has the same associativity and precedence", p.ID),
+ Row: p.Pos.Row,
+ Col: p.Pos.Col,
+ })
+ } else if assoc := termAssoc[sym.Num()]; assoc == assocTy {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrDuplicateAssoc,
+ Detail: fmt.Sprintf("'%v' already has different precedence", p.ID),
+ Row: p.Pos.Row,
+ Col: p.Pos.Col,
+ })
+ } else {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrDuplicateAssoc,
+ Detail: fmt.Sprintf("'%v' already has different associativity and precedence", p.ID),
+ Row: p.Pos.Row,
+ Col: p.Pos.Col,
+ })
+ }
+ break ASSOC_PARAM_LOOP
+ }
+
+ termPrec[sym.Num()] = precN
+ termAssoc[sym.Num()] = assocTy
+ case p.OrderedSymbol != "":
+ if prec, alreadySet := ordSymPrec[p.OrderedSymbol]; alreadySet {
+ if prec == precN {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrDuplicateAssoc,
+ Detail: fmt.Sprintf("'$%v' already has the same precedence", p.OrderedSymbol),
+ Row: p.Pos.Row,
+ Col: p.Pos.Col,
+ })
+ } else {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrDuplicateAssoc,
+ Detail: fmt.Sprintf("'$%v' already has different precedence", p.OrderedSymbol),
+ Row: p.Pos.Row,
+ Col: p.Pos.Col,
+ })
+ }
+ break ASSOC_PARAM_LOOP
+ }
+
+ ordSymPrec[p.OrderedSymbol] = precN
+ default:
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrDirInvalidParam,
+ Detail: "a parameter must be an ID or an ordered symbol",
+ Row: p.Pos.Row,
+ Col: p.Pos.Col,
+ })
+ return nil, nil
+ }
+ }
+
+ precN++
+ }
+ }
+ if len(b.errs) > 0 {
+ return nil, nil
+ }
+
+ prodPrec := map[productionNum]int{}
+ prodAssoc := map[productionNum]assocType{}
+ for _, prod := range prodsAndActs.prods.getAllProductions() {
+ // A #prec directive changes only precedence, not associativity.
+ if term, ok := prodsAndActs.prodPrecsTerm[prod.id]; ok {
+ if prec, ok := termPrec[term.Num()]; ok {
+ prodPrec[prod.num] = prec
+ prodAssoc[prod.num] = assocTypeNil
+ } else {
+ text, _ := symTab.ToText(term)
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrUndefinedPrec,
+ Detail: text,
+ Row: prodsAndActs.prodPrecPoss[prod.id].Row,
+ Col: prodsAndActs.prodPrecPoss[prod.id].Col,
+ })
+ }
+ } else if ordSym, ok := prodsAndActs.prodPrecsOrdSym[prod.id]; ok {
+ if prec, ok := ordSymPrec[ordSym]; ok {
+ prodPrec[prod.num] = prec
+ prodAssoc[prod.num] = assocTypeNil
+ } else {
+ b.errs = append(b.errs, &verr.SpecError{
+ Cause: semErrUndefinedOrdSym,
+ Detail: fmt.Sprintf("$%v", ordSym),
+ Row: prodsAndActs.prodPrecPoss[prod.id].Row,
+ Col: prodsAndActs.prodPrecPoss[prod.id].Col,
+ })
+ }
+ } else {
+ // A production inherits precedence and associativity from the right-most terminal symbol.
+ rightmostTerm := symbol.SymbolNil
+ for _, sym := range prod.rhs {
+ if !sym.IsTerminal() {
+ continue
+ }
+ rightmostTerm = sym
+ }
+ if !rightmostTerm.IsNil() {
+ prodPrec[prod.num] = termPrec[rightmostTerm.Num()]
+ prodAssoc[prod.num] = termAssoc[rightmostTerm.Num()]
+ }
+ }
+ }
+ if len(b.errs) > 0 {
+ return nil, nil
+ }
+
+ return &precAndAssoc{
+ termPrec: termPrec,
+ termAssoc: termAssoc,
+ prodPrec: prodPrec,
+ prodAssoc: prodAssoc,
+ }, nil
+}
+
+func compile(gram *Grammar, opts ...BuildOption) (*spec.CompiledGrammar, *spec.Report, error) {
+ config := &buildConfig{}
+ for _, opt := range opts {
+ opt(config)
+ }
+
+ lexSpec, err, cErrs := lexical.Compile(gram.lexSpec, lexical.CompressionLevelMax)
+ if err != nil {
+ if len(cErrs) > 0 {
+ var b strings.Builder
+ writeCompileError(&b, cErrs[0])
+ for _, cerr := range cErrs[1:] {
+ fmt.Fprintf(&b, "\n")
+ writeCompileError(&b, cerr)
+ }
+ return nil, nil, errors.New(b.String())
+ }
+ return nil, nil, err
+ }
+
+ kind2Term := make([]int, len(lexSpec.KindNames))
+ for i, k := range lexSpec.KindNames {
+ if k == spec.LexKindNameNil {
+ kind2Term[spec.LexKindIDNil] = symbol.SymbolNil.Num().Int()
+ continue
+ }
+
+ sym, ok := gram.symbolTable.ToSymbol(k.String())
+ if !ok {
+ return nil, nil, fmt.Errorf("terminal symbol '%v' was not found in a symbol table", k)
+ }
+ kind2Term[i] = sym.Num().Int()
+ }
+
+ termTexts, err := gram.symbolTable.TerminalTexts()
+ if err != nil {
+ return nil, nil, err
+ }
+
+ var termSkip []int
+ {
+ r := gram.symbolTable.Reader()
+ // We want to use gram.symbolTable.terminalSymbols() here instead of gram.symbolTable.terminalTexts(),
+ // but terminalSymbols() differs in length from terminalTexts()
+ // because it does not contain predefined symbols like EOF.
+ // Therefore, we use terminalTexts, although looking up symbols takes more time.
+ termSkip = make([]int, len(termTexts))
+ for _, t := range termTexts {
+ s, _ := r.ToSymbol(t)
+ for _, sk := range gram.skipSymbols {
+ if s != sk {
+ continue
+ }
+ termSkip[s.Num()] = 1
+ break
+ }
+ }
+ }
+
+ nonTerms, err := gram.symbolTable.NonTerminalTexts()
+ if err != nil {
+ return nil, nil, err
+ }
+
+ firstSet, err := genFirstSet(gram.productionSet)
+ if err != nil {
+ return nil, nil, err
+ }
+
+ lr0, err := genLR0Automaton(gram.productionSet, gram.augmentedStartSymbol, gram.errorSymbol)
+ if err != nil {
+ return nil, nil, err
+ }
+
+ var tab *ParsingTable
+ var report *spec.Report
+ {
+ lalr1, err := genLALR1Automaton(lr0, gram.productionSet, firstSet)
+ if err != nil {
+ return nil, nil, err
+ }
+
+ b := &lrTableBuilder{
+ automaton: lalr1.lr0Automaton,
+ prods: gram.productionSet,
+ termCount: len(termTexts),
+ nonTermCount: len(nonTerms),
+ symTab: gram.symbolTable,
+ precAndAssoc: gram.precAndAssoc,
+ }
+ tab, err = b.build()
+ if err != nil {
+ return nil, nil, err
+ }
+
+ if config.isReportingEnabled {
+ report, err = b.genReport(tab, gram)
+ if err != nil {
+ return nil, nil, err
+ }
+ }
+ }
+
+ action := make([]int, len(tab.actionTable))
+ for i, e := range tab.actionTable {
+ action[i] = int(e)
+ }
+ goTo := make([]int, len(tab.goToTable))
+ for i, e := range tab.goToTable {
+ goTo[i] = int(e)
+ }
+
+ lhsSyms := make([]int, len(gram.productionSet.getAllProductions())+1)
+ altSymCounts := make([]int, len(gram.productionSet.getAllProductions())+1)
+ recoverProds := make([]int, len(gram.productionSet.getAllProductions())+1)
+ astActEntries := make([][]int, len(gram.productionSet.getAllProductions())+1)
+ for _, p := range gram.productionSet.getAllProductions() {
+ lhsSyms[p.num] = p.lhs.Num().Int()
+ altSymCounts[p.num] = p.rhsLen
+
+ if _, ok := gram.recoverProductions[p.id]; ok {
+ recoverProds[p.num] = 1
+ }
+
+ astAct, ok := gram.astActions[p.id]
+ if !ok {
+ continue
+ }
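+ // A negative value in the encoded AST action marks an expansion entry:
+ // -n corresponds to position n with expansion enabled, and the parser
+ // driver (not part of this diff) is expected to splice that child's
+ // children into the parent node in place.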
+ astActEntry := make([]int, len(astAct))
+ for i, e := range astAct {
+ if e.expansion {
+ astActEntry[i] = e.position * -1
+ } else {
+ astActEntry[i] = e.position
+ }
+ }
+ astActEntries[p.num] = astActEntry
+ }
+
+ return &spec.CompiledGrammar{
+ Name: gram.name,
+ Lexical: lexSpec,
+ Syntactic: &spec.SyntacticSpec{
+ Action: action,
+ GoTo: goTo,
+ StateCount: tab.stateCount,
+ InitialState: tab.InitialState.Int(),
+ StartProduction: productionNumStart.Int(),
+ LHSSymbols: lhsSyms,
+ AlternativeSymbolCounts: altSymCounts,
+ Terminals: termTexts,
+ TerminalCount: tab.terminalCount,
+ TerminalSkip: termSkip,
+ KindToTerminal: kind2Term,
+ NonTerminals: nonTerms,
+ NonTerminalCount: tab.nonTerminalCount,
+ EOFSymbol: symbol.SymbolEOF.Num().Int(),
+ ErrorSymbol: gram.errorSymbol.Num().Int(),
+ ErrorTrapperStates: tab.errorTrapperStates,
+ RecoverProductions: recoverProds,
+ },
+ ASTAction: &spec.ASTAction{
+ Entries: astActEntries,
+ },
+ }, report, nil
+}
+
+func writeCompileError(w io.Writer, cErr *lexical.CompileError) {
+ if cErr.Fragment {
+ fmt.Fprintf(w, "fragment ")
+ }
+ fmt.Fprintf(w, "%v: %v", cErr.Kind, cErr.Cause)
+ if cErr.Detail != "" {
+ fmt.Fprintf(w, ": %v", cErr.Detail)
+ }
+}
diff --git a/src/urubu/grammar/item.go b/src/urubu/grammar/item.go
new file mode 100644
index 0000000..6c5fe42
--- /dev/null
+++ b/src/urubu/grammar/item.go
@@ -0,0 +1,212 @@
+package grammar
+
+import (
+ "crypto/sha256"
+ "encoding/binary"
+ "fmt"
+ "sort"
+ "strconv"
+
+ "urubu/grammar/symbol"
+)
+
+type lrItemID [32]byte
+
+func (id lrItemID) String() string {
+ return fmt.Sprintf("%x", id.num())
+}
+
+func (id lrItemID) num() uint32 {
+ return binary.LittleEndian.Uint32(id[:])
+}
+
+type lookAhead struct {
+ symbols map[symbol.Symbol]struct{}
+
+ // When propagation is true, an item propagates look-ahead symbols to other items.
+ propagation bool
+}
+
+type lrItem struct {
+ id lrItemID
+ prod productionID
+
+ // E → E + T
+ //
+ // Dot | Dotted Symbol | Item
+ // ----+---------------+------------
+ // 0 | E | E →・E + T
+ // 1 | + | E → E・+ T
+ // 2 | T | E → E +・T
+ // 3 | Nil | E → E + T・
+ dot int
+ dottedSymbol symbol.Symbol
+
+ // When initial is true, the LHS of the production is the augmented start symbol and dot is 0.
+ // It looks like S' →・S.
+ initial bool
+
+ // When reducible is true, the item looks like E → E + T・.
+ reducible bool
+
+ // When kernel is true, the item is a kernel item.
+ kernel bool
+
+ // lookAhead stores look-ahead symbols, and they are terminal symbols.
+ // The item is reducible only when the look-ahead symbols appear as the next input symbol.
+ lookAhead lookAhead
+}
+
+func newLR0Item(prod *production, dot int) (*lrItem, error) {
+ if prod == nil {
+ return nil, fmt.Errorf("production must be non-nil")
+ }
+
+ if dot < 0 || dot > prod.rhsLen {
+ return nil, fmt.Errorf("dot must be between 0 and %v", prod.rhsLen)
+ }
+
+ var id lrItemID
+ {
+ b := []byte{}
+ b = append(b, prod.id[:]...)
+ bDot := make([]byte, 8)
+ binary.LittleEndian.PutUint64(bDot, uint64(dot))
+ b = append(b, bDot...)
+ id = sha256.Sum256(b)
+ }
+
+ dottedSymbol := symbol.SymbolNil
+ if dot < prod.rhsLen {
+ dottedSymbol = prod.rhs[dot]
+ }
+
+ initial := false
+ if prod.lhs.IsStart() && dot == 0 {
+ initial = true
+ }
+
+ reducible := false
+ if dot == prod.rhsLen {
+ reducible = true
+ }
+
+ kernel := false
+ if initial || dot > 0 {
+ kernel = true
+ }
+
+ item := &lrItem{
+ id: id,
+ prod: prod.id,
+ dot: dot,
+ dottedSymbol: dottedSymbol,
+ initial: initial,
+ reducible: reducible,
+ kernel: kernel,
+ }
+
+ return item, nil
+}
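+
+// Note that an item's identity is the SHA-256 digest of its production ID
+// and the dot position, so a (production, dot) pair always maps to the
+// same lrItemID regardless of look-ahead symbols. The LALR(1) construction
+// relies on this: items that differ only in look-ahead share an ID and can
+// therefore be located inside the merged LR(0) states.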
+
+type kernelID [32]byte
+
+func (id kernelID) String() string {
+ return fmt.Sprintf("%x", binary.LittleEndian.Uint32(id[:]))
+}
+
+type kernel struct {
+ id kernelID
+ items []*lrItem
+}
+
+func newKernel(items []*lrItem) (*kernel, error) {
+ if len(items) == 0 {
+ return nil, fmt.Errorf("a kernel needs at least one item")
+ }
+
+ // Remove duplicates from items.
+ var sortedItems []*lrItem
+ {
+ m := map[lrItemID]*lrItem{}
+ for _, item := range items {
+ if !item.kernel {
+ return nil, fmt.Errorf("not a kernel item: %v", item)
+ }
+ m[item.id] = item
+ }
+ sortedItems = []*lrItem{}
+ for _, item := range m {
+ sortedItems = append(sortedItems, item)
+ }
+ sort.Slice(sortedItems, func(i, j int) bool {
+ return sortedItems[i].id.num() < sortedItems[j].id.num()
+ })
+ }
+
+ var id kernelID
+ {
+ b := []byte{}
+ for _, item := range sortedItems {
+ b = append(b, item.id[:]...)
+ }
+ id = sha256.Sum256(b)
+ }
+
+ return &kernel{
+ id: id,
+ items: sortedItems,
+ }, nil
+}
+
+type stateNum int
+
+const stateNumInitial = stateNum(0)
+
+func (n stateNum) Int() int {
+ return int(n)
+}
+
+func (n stateNum) String() string {
+ return strconv.Itoa(int(n))
+}
+
+func (n stateNum) next() stateNum {
+ return stateNum(n + 1)
+}
+
+type lrState struct {
+ *kernel
+ num stateNum
+ next map[symbol.Symbol]kernelID
+ reducible map[productionID]struct{}
+
+ // emptyProdItems stores items that have an empty production like `p → ε` and are reducible.
+ // Thus the items emptyProdItems stores look like `p → ・ε`. emptyProdItems is needed to hold
+ // look-ahead symbols because the kernel items don't include these items.
+ //
+ // For instance, suppose we have the following productions, where A is a terminal symbol.
+ //
+ // s' → s
+ // s → A | ε
+ //
+ // CLOSURE({s' → ・s}) generates the following closure, but the kernel of this closure doesn't
+ // include `s → ・ε`.
+ //
+ // s' → ・s
+ // s → ・A
+ // s → ・ε
+ emptyProdItems []*lrItem
+
+ // When isErrorTrapper is `true`, the item can shift the `error` symbol. The item has the following form.
+ // The `α` and `β` can be empty.
+ //
+ // A → α・error β
+ isErrorTrapper bool
+}
diff --git a/src/urubu/grammar/lalr1.go b/src/urubu/grammar/lalr1.go
new file mode 100644
index 0000000..8373568
--- /dev/null
+++ b/src/urubu/grammar/lalr1.go
@@ -0,0 +1,328 @@
+package grammar
+
+import (
+ "fmt"
+
+ "urubu/grammar/symbol"
+)
+
+type stateAndLRItem struct {
+ kernelID kernelID
+ itemID lrItemID
+}
+
+type propagation struct {
+ src *stateAndLRItem
+ dest []*stateAndLRItem
+}
+
+type lalr1Automaton struct {
+ *lr0Automaton
+}
+
+func genLALR1Automaton(lr0 *lr0Automaton, prods *productionSet, first *firstSet) (*lalr1Automaton, error) {
+ // Set the look-ahead symbol <EOF> to the initial item: [S' → ・S, $]
+ iniState := lr0.states[lr0.initialState]
+ iniState.items[0].lookAhead.symbols = map[symbol.Symbol]struct{}{
+ symbol.SymbolEOF: {},
+ }
+
+ var props []*propagation
+ for _, state := range lr0.states {
+ for _, kItem := range state.items {
+ items, err := genLALR1Closure(kItem, prods, first)
+ if err != nil {
+ return nil, err
+ }
+
+ kItem.lookAhead.propagation = true
+
+ var propDests []*stateAndLRItem
+ for _, item := range items {
+ if item.reducible {
+ p, ok := prods.findByID(item.prod)
+ if !ok {
+ return nil, fmt.Errorf("production not found: %v", item.prod)
+ }
+
+ if p.isEmpty() {
+ var reducibleItem *lrItem
+ for _, it := range state.emptyProdItems {
+ if it.id != item.id {
+ continue
+ }
+
+ reducibleItem = it
+ break
+ }
+ if reducibleItem == nil {
+ return nil, fmt.Errorf("reducible item not found: %v", item.id)
+ }
+ if reducibleItem.lookAhead.symbols == nil {
+ reducibleItem.lookAhead.symbols = map[symbol.Symbol]struct{}{}
+ }
+ for a := range item.lookAhead.symbols {
+ reducibleItem.lookAhead.symbols[a] = struct{}{}
+ }
+
+ propDests = append(propDests, &stateAndLRItem{
+ kernelID: state.id,
+ itemID: item.id,
+ })
+ }
+
+ continue
+ }
+
+ nextKID := state.next[item.dottedSymbol]
+ var nextItemID lrItemID
+ {
+ p, ok := prods.findByID(item.prod)
+ if !ok {
+ return nil, fmt.Errorf("production not found: %v", item.prod)
+ }
+ it, err := newLR0Item(p, item.dot+1)
+ if err != nil {
+ return nil, fmt.Errorf("failed to generate an item ID: %v", err)
+ }
+ nextItemID = it.id
+ }
+
+ if item.lookAhead.propagation {
+ propDests = append(propDests, &stateAndLRItem{
+ kernelID: nextKID,
+ itemID: nextItemID,
+ })
+ } else {
+ nextState := lr0.states[nextKID]
+ var nextItem *lrItem
+ for _, it := range nextState.items {
+ if it.id != nextItemID {
+ continue
+ }
+ nextItem = it
+ break
+ }
+ if nextItem == nil {
+ return nil, fmt.Errorf("item not found: %v", nextItemID)
+ }
+
+ if nextItem.lookAhead.symbols == nil {
+ nextItem.lookAhead.symbols = map[symbol.Symbol]struct{}{}
+ }
+
+ for a := range item.lookAhead.symbols {
+ nextItem.lookAhead.symbols[a] = struct{}{}
+ }
+ }
+ }
+ if len(propDests) == 0 {
+ continue
+ }
+
+ props = append(props, &propagation{
+ src: &stateAndLRItem{
+ kernelID: state.id,
+ itemID: kItem.id,
+ },
+ dest: propDests,
+ })
+ }
+ }
+
+ err := propagateLookAhead(lr0, props)
+ if err != nil {
+ return nil, fmt.Errorf("failed to propagate look-ahead symbols: %v", err)
+ }
+
+ return &lalr1Automaton{
+ lr0Automaton: lr0,
+ }, nil
+}
+
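+// genLALR1Closure computes CLOSURE([A → α・B β, a]): for each production
+// B → γ, it generates [B → ・γ, b] for every terminal b in FIRST(β a).
+// Look-ahead symbols drawn from FIRST(β) arise spontaneously; when β is
+// nullable, a generated item is additionally marked for propagation so
+// that it later receives the look-ahead symbols of the source item.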
+func genLALR1Closure(srcItem *lrItem, prods *productionSet, first *firstSet) ([]*lrItem, error) {
+ items := []*lrItem{}
+ knownItems := map[lrItemID]map[symbol.Symbol]struct{}{}
+ knownItemsProp := map[lrItemID]struct{}{}
+ uncheckedItems := []*lrItem{}
+ items = append(items, srcItem)
+ uncheckedItems = append(uncheckedItems, srcItem)
+ for len(uncheckedItems) > 0 {
+ nextUncheckedItems := []*lrItem{}
+ for _, item := range uncheckedItems {
+ if item.dottedSymbol.IsTerminal() {
+ continue
+ }
+
+ p, ok := prods.findByID(item.prod)
+ if !ok {
+ return nil, fmt.Errorf("production not found: %v", item.prod)
+ }
+
+ var fstSyms []symbol.Symbol
+ var isFstNullable bool
+ {
+ fst, err := first.find(p, item.dot+1)
+ if err != nil {
+ return nil, err
+ }
+
+ fstSyms = make([]symbol.Symbol, len(fst.symbols))
+ i := 0
+ for s := range fst.symbols {
+ fstSyms[i] = s
+ i++
+ }
+ if fst.empty {
+ isFstNullable = true
+ }
+ }
+
+ ps, _ := prods.findByLHS(item.dottedSymbol)
+ for _, prod := range ps {
+ var lookAhead []symbol.Symbol
+ {
+ var lookAheadCount int
+ if isFstNullable {
+ lookAheadCount = len(fstSyms) + len(item.lookAhead.symbols)
+ } else {
+ lookAheadCount = len(fstSyms)
+ }
+
+ lookAhead = make([]symbol.Symbol, lookAheadCount)
+ i := 0
+ for _, s := range fstSyms {
+ lookAhead[i] = s
+ i++
+ }
+ if isFstNullable {
+ for a := range item.lookAhead.symbols {
+ lookAhead[i] = a
+ i++
+ }
+ }
+ }
+
+ for _, a := range lookAhead {
+ newItem, err := newLR0Item(prod, 0)
+ if err != nil {
+ return nil, err
+ }
+ if items, exist := knownItems[newItem.id]; exist {
+ if _, exist := items[a]; exist {
+ continue
+ }
+ }
+
+ newItem.lookAhead.symbols = map[symbol.Symbol]struct{}{
+ a: {},
+ }
+
+ items = append(items, newItem)
+ if knownItems[newItem.id] == nil {
+ knownItems[newItem.id] = map[symbol.Symbol]struct{}{}
+ }
+ knownItems[newItem.id][a] = struct{}{}
+ nextUncheckedItems = append(nextUncheckedItems, newItem)
+ }
+
+ if isFstNullable {
+ newItem, err := newLR0Item(prod, 0)
+ if err != nil {
+ return nil, err
+ }
+ if _, exist := knownItemsProp[newItem.id]; exist {
+ continue
+ }
+
+ newItem.lookAhead.propagation = true
+
+ items = append(items, newItem)
+ knownItemsProp[newItem.id] = struct{}{}
+ nextUncheckedItems = append(nextUncheckedItems, newItem)
+ }
+ }
+ }
+ uncheckedItems = nextUncheckedItems
+ }
+
+ return items, nil
+}
+
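+// propagateLookAhead distributes look-ahead symbols along the recorded
+// propagation edges until a fixed point is reached: the outer loop repeats
+// while any destination item still gains a new symbol from its source.
+// This is the propagation phase of the classic "spontaneous generation and
+// propagation" LALR(1) technique.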
+func propagateLookAhead(lr0 *lr0Automaton, props []*propagation) error {
+ for {
+ changed := false
+ for _, prop := range props {
+ srcState, ok := lr0.states[prop.src.kernelID]
+ if !ok {
+ return fmt.Errorf("source state not found: %v", prop.src.kernelID)
+ }
+ var srcItem *lrItem
+ for _, item := range srcState.items {
+ if item.id != prop.src.itemID {
+ continue
+ }
+ srcItem = item
+ break
+ }
+ if srcItem == nil {
+ return fmt.Errorf("source item not found: %v", prop.src.itemID)
+ }
+
+ for _, dest := range prop.dest {
+ destState, ok := lr0.states[dest.kernelID]
+ if !ok {
+ return fmt.Errorf("destination state not found: %v", dest.kernelID)
+ }
+ var destItem *lrItem
+ for _, item := range destState.items {
+ if item.id != dest.itemID {
+ continue
+ }
+ destItem = item
+ break
+ }
+ if destItem == nil {
+ for _, item := range destState.emptyProdItems {
+ if item.id != dest.itemID {
+ continue
+ }
+ destItem = item
+ break
+ }
+ if destItem == nil {
+ return fmt.Errorf("destination item not found: %v", dest.itemID)
+ }
+ }
+
+ for a := range srcItem.lookAhead.symbols {
+ if _, ok := destItem.lookAhead.symbols[a]; ok {
+ continue
+ }
+
+ if destItem.lookAhead.symbols == nil {
+ destItem.lookAhead.symbols = map[symbol.Symbol]struct{}{}
+ }
+
+ destItem.lookAhead.symbols[a] = struct{}{}
+ changed = true
+ }
+ }
+ }
+ if !changed {
+ break
+ }
+ }
+
+ return nil
+}
diff --git a/src/urubu/grammar/lexical/compiler.go b/src/urubu/grammar/lexical/compiler.go
new file mode 100644
index 0000000..637018a
--- /dev/null
+++ b/src/urubu/grammar/lexical/compiler.go
@@ -0,0 +1,418 @@
+package lexical
+
+import (
+ "bytes"
+ "fmt"
+
+ "urubu/compressor"
+ "urubu/grammar/lexical/dfa"
+ psr "urubu/grammar/lexical/parser"
+ spec "urubu/spec/grammar"
+)
+
+type CompileError struct {
+ Kind spec.LexKindName
+ Fragment bool
+ Cause error
+ Detail string
+}
+
+func Compile(lexspec *LexSpec, compLv int) (*spec.LexicalSpec, error, []*CompileError) {
+ err := lexspec.Validate()
+ if err != nil {
+ return nil, fmt.Errorf("invalid lexical specification:\n%w", err), nil
+ }
+
+ modeEntries, modeNames, modeName2ID, fragments := groupEntriesByLexMode(lexspec.Entries)
+
+ modeSpecs := []*spec.CompiledLexModeSpec{
+ nil,
+ }
+ for i, es := range modeEntries[1:] {
+ modeName := modeNames[i+1]
+ modeSpec, err, cerrs := compile(es, modeName2ID, fragments, compLv)
+ if err != nil {
+ return nil, fmt.Errorf("failed to compile in %v mode: %w", modeName, err), cerrs
+ }
+ modeSpecs = append(modeSpecs, modeSpec)
+ }
+
+ var kindNames []spec.LexKindName
+ var name2ID map[spec.LexKindName]spec.LexKindID
+ {
+ name2ID = map[spec.LexKindName]spec.LexKindID{}
+ id := spec.LexKindIDMin
+ for _, modeSpec := range modeSpecs[1:] {
+ for _, name := range modeSpec.KindNames[1:] {
+ if _, ok := name2ID[name]; ok {
+ continue
+ }
+ name2ID[name] = id
+ id++
+ }
+ }
+
+ kindNames = make([]spec.LexKindName, len(name2ID)+1)
+ for name, id := range name2ID {
+ kindNames[id] = name
+ }
+ }
+
+ var kindIDs [][]spec.LexKindID
+ {
+ kindIDs = make([][]spec.LexKindID, len(modeSpecs))
+ for i, modeSpec := range modeSpecs[1:] {
+ ids := make([]spec.LexKindID, len(modeSpec.KindNames))
+ for modeKindID, name := range modeSpec.KindNames {
+ if modeKindID == 0 {
+ continue
+ }
+ ids[modeKindID] = name2ID[name]
+ }
+ kindIDs[i+1] = ids
+ }
+ }
+
+ return &spec.LexicalSpec{
+ InitialModeID: spec.LexModeIDDefault,
+ ModeNames: modeNames,
+ KindNames: kindNames,
+ KindIDs: kindIDs,
+ CompressionLevel: compLv,
+ Specs: modeSpecs,
+ }, nil, nil
+}
+
+func groupEntriesByLexMode(entries []*LexEntry) ([][]*LexEntry, []spec.LexModeName, map[spec.LexModeName]spec.LexModeID, map[spec.LexKindName]*LexEntry) {
+ modeNames := []spec.LexModeName{
+ spec.LexModeNameNil,
+ spec.LexModeNameDefault,
+ }
+ modeName2ID := map[spec.LexModeName]spec.LexModeID{
+ spec.LexModeNameNil: spec.LexModeIDNil,
+ spec.LexModeNameDefault: spec.LexModeIDDefault,
+ }
+ lastModeID := spec.LexModeIDDefault
+ modeEntries := [][]*LexEntry{
+ nil,
+ {},
+ }
+ fragments := map[spec.LexKindName]*LexEntry{}
+ for _, e := range entries {
+ if e.Fragment {
+ fragments[e.Kind] = e
+ continue
+ }
+ ms := e.Modes
+ if len(ms) == 0 {
+ ms = []spec.LexModeName{
+ spec.LexModeNameDefault,
+ }
+ }
+ for _, modeName := range ms {
+ modeID, ok := modeName2ID[modeName]
+ if !ok {
+ modeID = lastModeID + 1
+ lastModeID = modeID
+ modeName2ID[modeName] = modeID
+ modeNames = append(modeNames, modeName)
+ modeEntries = append(modeEntries, []*LexEntry{})
+ }
+ modeEntries[modeID] = append(modeEntries[modeID], e)
+ }
+ }
+ return modeEntries, modeNames, modeName2ID, fragments
+}
+
+func compile(
+ entries []*LexEntry,
+ modeName2ID map[spec.LexModeName]spec.LexModeID,
+ fragments map[spec.LexKindName]*LexEntry,
+ compLv int,
+) (*spec.CompiledLexModeSpec, error, []*CompileError) {
+ var kindNames []spec.LexKindName
+ kindIDToName := map[spec.LexModeKindID]spec.LexKindName{}
+ var patterns map[spec.LexModeKindID][]byte
+ {
+ kindNames = append(kindNames, spec.LexKindNameNil)
+ patterns = map[spec.LexModeKindID][]byte{}
+ for i, e := range entries {
+ kindID := spec.LexModeKindID(i + 1)
+
+ kindNames = append(kindNames, e.Kind)
+ kindIDToName[kindID] = e.Kind
+ patterns[kindID] = []byte(e.Pattern)
+ }
+ }
+
+ push := []spec.LexModeID{
+ spec.LexModeIDNil,
+ }
+ pop := []int{
+ 0,
+ }
+ for _, e := range entries {
+ pushV := spec.LexModeIDNil
+ if e.Push != "" {
+ pushV = modeName2ID[e.Push]
+ }
+ push = append(push, pushV)
+ popV := 0
+ if e.Pop {
+ popV = 1
+ }
+ pop = append(pop, popV)
+ }
+
+ fragmentPatterns := map[spec.LexKindName][]byte{}
+ for k, e := range fragments {
+ fragmentPatterns[k] = []byte(e.Pattern)
+ }
+
+ fragmentCPTrees := make(map[spec.LexKindName]psr.CPTree, len(fragmentPatterns))
+ {
+ var cerrs []*CompileError
+ for kind, pat := range fragmentPatterns {
+ p := psr.NewParser(kind, bytes.NewReader(pat))
+ t, err := p.Parse()
+ if err != nil {
+ if err == psr.ParseErr {
+ detail, cause := p.Error()
+ cerrs = append(cerrs, &CompileError{
+ Kind: kind,
+ Fragment: true,
+ Cause: cause,
+ Detail: detail,
+ })
+ } else {
+ cerrs = append(cerrs, &CompileError{
+ Kind: kind,
+ Fragment: true,
+ Cause: err,
+ })
+ }
+ continue
+ }
+ fragmentCPTrees[kind] = t
+ }
+ if len(cerrs) > 0 {
+ return nil, fmt.Errorf("compile error"), cerrs
+ }
+
+ err := psr.CompleteFragments(fragmentCPTrees)
+ if err != nil {
+ if err == psr.ParseErr {
+ for _, frag := range fragmentCPTrees {
+ kind, frags, err := frag.Describe()
+ if err != nil {
+ return nil, err, nil
+ }
+
+ cerrs = append(cerrs, &CompileError{
+ Kind: kind,
+ Fragment: true,
+ Cause: fmt.Errorf("fragment contains undefined fragments or cycles"),
+ Detail: fmt.Sprintf("%v", frags),
+ })
+ }
+
+ return nil, fmt.Errorf("compile error"), cerrs
+ }
+
+ return nil, err, nil
+ }
+ }
+
+ cpTrees := map[spec.LexModeKindID]psr.CPTree{}
+ {
+ pats := make([]*psr.PatternEntry, len(patterns)+1)
+ pats[spec.LexModeKindIDNil] = &psr.PatternEntry{
+ ID: spec.LexModeKindIDNil,
+ }
+ for id, pattern := range patterns {
+ pats[id] = &psr.PatternEntry{
+ ID: id,
+ Pattern: pattern,
+ }
+ }
+
+ var cerrs []*CompileError
+ for _, pat := range pats {
+ if pat.ID == spec.LexModeKindIDNil {
+ continue
+ }
+
+ p := psr.NewParser(kindIDToName[pat.ID], bytes.NewReader(pat.Pattern))
+ t, err := p.Parse()
+ if err != nil {
+ if err == psr.ParseErr {
+ detail, cause := p.Error()
+ cerrs = append(cerrs, &CompileError{
+ Kind: kindIDToName[pat.ID],
+ Fragment: false,
+ Cause: cause,
+ Detail: detail,
+ })
+ } else {
+ cerrs = append(cerrs, &CompileError{
+ Kind: kindIDToName[pat.ID],
+ Fragment: false,
+ Cause: err,
+ })
+ }
+ continue
+ }
+
+ complete, err := psr.ApplyFragments(t, fragmentCPTrees)
+ if err != nil {
+ return nil, err, nil
+ }
+ if !complete {
+ _, frags, err := t.Describe()
+ if err != nil {
+ return nil, err, nil
+ }
+
+ cerrs = append(cerrs, &CompileError{
+ Kind: kindIDToName[pat.ID],
+ Fragment: false,
+ Cause: fmt.Errorf("pattern contains undefined fragments"),
+ Detail: fmt.Sprintf("%v", frags),
+ })
+ continue
+ }
+
+ cpTrees[pat.ID] = t
+ }
+ if len(cerrs) > 0 {
+ return nil, fmt.Errorf("compile error"), cerrs
+ }
+ }
+
+ var tranTab *spec.TransitionTable
+ {
+ root, symTab, err := dfa.ConvertCPTreeToByteTree(cpTrees)
+ if err != nil {
+ return nil, err, nil
+ }
+ d := dfa.GenDFA(root, symTab)
+ tranTab, err = dfa.GenTransitionTable(d)
+ if err != nil {
+ return nil, err, nil
+ }
+ }
+
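+	// compLv selects how aggressively the transition table is compressed:
+	// level 0 leaves it uncompressed, level 1 deduplicates identical rows,
+	// and level 2 additionally packs the rows using row displacement.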
+ var err error
+ switch compLv {
+ case 2:
+ tranTab, err = compressTransitionTableLv2(tranTab)
+ if err != nil {
+ return nil, err, nil
+ }
+ case 1:
+ tranTab, err = compressTransitionTableLv1(tranTab)
+ if err != nil {
+ return nil, err, nil
+ }
+ }
+
+ return &spec.CompiledLexModeSpec{
+ KindNames: kindNames,
+ Push: push,
+ Pop: pop,
+ DFA: tranTab,
+ }, nil, nil
+}
+
+const (
+ CompressionLevelMin = 0
+ CompressionLevelMax = 2
+)
+
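+// compressTransitionTableLv2 compresses the transition table in two stages:
+// it first deduplicates identical rows into a unique-entries table, then packs
+// the remaining rows with row displacement, which overlays sparse rows into a
+// single array indexed by per-row offsets.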
+func compressTransitionTableLv2(tranTab *spec.TransitionTable) (*spec.TransitionTable, error) {
+ ueTab := compressor.NewUniqueEntriesTable()
+ {
+ orig, err := compressor.NewOriginalTable(convertStateIDSliceToIntSlice(tranTab.UncompressedTransition), tranTab.ColCount)
+ if err != nil {
+ return nil, err
+ }
+ err = ueTab.Compress(orig)
+ if err != nil {
+ return nil, err
+ }
+ }
+
+ rdTab := compressor.NewRowDisplacementTable(0)
+ {
+ orig, err := compressor.NewOriginalTable(ueTab.UniqueEntries, ueTab.OriginalColCount)
+ if err != nil {
+ return nil, err
+ }
+ err = rdTab.Compress(orig)
+ if err != nil {
+ return nil, err
+ }
+ }
+
+ tranTab.Transition = &spec.UniqueEntriesTable{
+ UniqueEntries: &spec.RowDisplacementTable{
+ OriginalRowCount: rdTab.OriginalRowCount,
+ OriginalColCount: rdTab.OriginalColCount,
+ EmptyValue: spec.StateIDNil,
+ Entries: convertIntSliceToStateIDSlice(rdTab.Entries),
+ Bounds: rdTab.Bounds,
+ RowDisplacement: rdTab.RowDisplacement,
+ },
+ RowNums: ueTab.RowNums,
+ OriginalRowCount: ueTab.OriginalRowCount,
+ OriginalColCount: ueTab.OriginalColCount,
+ }
+ tranTab.UncompressedTransition = nil
+
+ return tranTab, nil
+}
+
+func compressTransitionTableLv1(tranTab *spec.TransitionTable) (*spec.TransitionTable, error) {
+ ueTab := compressor.NewUniqueEntriesTable()
+ {
+ orig, err := compressor.NewOriginalTable(convertStateIDSliceToIntSlice(tranTab.UncompressedTransition), tranTab.ColCount)
+ if err != nil {
+ return nil, err
+ }
+ err = ueTab.Compress(orig)
+ if err != nil {
+ return nil, err
+ }
+ }
+
+ tranTab.Transition = &spec.UniqueEntriesTable{
+ UncompressedUniqueEntries: convertIntSliceToStateIDSlice(ueTab.UniqueEntries),
+ RowNums: ueTab.RowNums,
+ OriginalRowCount: ueTab.OriginalRowCount,
+ OriginalColCount: ueTab.OriginalColCount,
+ }
+ tranTab.UncompressedTransition = nil
+
+ return tranTab, nil
+}
+
+func convertStateIDSliceToIntSlice(s []spec.StateID) []int {
+ is := make([]int, len(s))
+ for i, v := range s {
+ is[i] = v.Int()
+ }
+ return is
+}
+
+func convertIntSliceToStateIDSlice(s []int) []spec.StateID {
+ ss := make([]spec.StateID, len(s))
+ for i, v := range s {
+ ss[i] = spec.StateID(v)
+ }
+ return ss
+}
diff --git a/src/urubu/grammar/lexical/dfa/dfa.go b/src/urubu/grammar/lexical/dfa/dfa.go
new file mode 100644
index 0000000..48bd8b4
--- /dev/null
+++ b/src/urubu/grammar/lexical/dfa/dfa.go
@@ -0,0 +1,173 @@
+package dfa
+
+import (
+ "sort"
+
+ spec "urubu/spec/grammar"
+)
+
+type symbolTable struct {
+ symPos2Byte map[symbolPosition]byteRange
+ endPos2ID map[symbolPosition]spec.LexModeKindID
+}
+
+func genSymbolTable(root byteTree) *symbolTable {
+ symTab := &symbolTable{
+ symPos2Byte: map[symbolPosition]byteRange{},
+ endPos2ID: map[symbolPosition]spec.LexModeKindID{},
+ }
+ return genSymTab(symTab, root)
+}
+
+func genSymTab(symTab *symbolTable, node byteTree) *symbolTable {
+ if node == nil {
+ return symTab
+ }
+
+ switch n := node.(type) {
+ case *symbolNode:
+ symTab.symPos2Byte[n.pos] = byteRange{
+ from: n.from,
+ to: n.to,
+ }
+ case *endMarkerNode:
+ symTab.endPos2ID[n.pos] = n.id
+ default:
+ left, right := node.children()
+ genSymTab(symTab, left)
+ genSymTab(symTab, right)
+ }
+ return symTab
+}
+
+type DFA struct {
+ States []string
+ InitialState string
+ AcceptingStatesTable map[string]spec.LexModeKindID
+ TransitionTable map[string][256]string
+}
+
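+// GenDFA builds a DFA directly from the syntax tree using the followpos-based
+// subset construction: each DFA state is a set of symbol positions, the
+// initial state is first(root), and the successor of a state on a byte value
+// is the union of follow(p) for every non-end-mark position p whose byte
+// range contains that value. When several patterns accept in the same state,
+// the smallest kind ID (the pattern declared first) takes priority.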
+func GenDFA(root byteTree, symTab *symbolTable) *DFA {
+ initialState := root.first()
+ initialStateHash := initialState.hash()
+ stateMap := map[string]*symbolPositionSet{
+ initialStateHash: initialState,
+ }
+ tranTab := map[string][256]string{}
+ {
+ follow := genFollowTable(root)
+ unmarkedStates := map[string]*symbolPositionSet{
+ initialStateHash: initialState,
+ }
+ for len(unmarkedStates) > 0 {
+ nextUnmarkedStates := map[string]*symbolPositionSet{}
+ for hash, state := range unmarkedStates {
+ tranTabOfState := [256]*symbolPositionSet{}
+ for _, pos := range state.set() {
+ if pos.isEndMark() {
+ continue
+ }
+ valRange := symTab.symPos2Byte[pos]
+ for symVal := valRange.from; symVal <= valRange.to; symVal++ {
+ if tranTabOfState[symVal] == nil {
+ tranTabOfState[symVal] = newSymbolPositionSet()
+ }
+ tranTabOfState[symVal].merge(follow[pos])
+ }
+ }
+ for _, t := range tranTabOfState {
+ if t == nil {
+ continue
+ }
+ h := t.hash()
+ if _, ok := stateMap[h]; ok {
+ continue
+ }
+ stateMap[h] = t
+ nextUnmarkedStates[h] = t
+ }
+ tabOfState := [256]string{}
+ for v, t := range tranTabOfState {
+ if t == nil {
+ continue
+ }
+ tabOfState[v] = t.hash()
+ }
+ tranTab[hash] = tabOfState
+ }
+ unmarkedStates = nextUnmarkedStates
+ }
+ }
+
+ accTab := map[string]spec.LexModeKindID{}
+ {
+ for h, s := range stateMap {
+ for _, pos := range s.set() {
+ if !pos.isEndMark() {
+ continue
+ }
+ priorID, ok := accTab[h]
+ if !ok {
+ accTab[h] = symTab.endPos2ID[pos]
+ } else {
+ id := symTab.endPos2ID[pos]
+ if id < priorID {
+ accTab[h] = id
+ }
+ }
+ }
+ }
+ }
+
+ var states []string
+ {
+ for s := range stateMap {
+ states = append(states, s)
+ }
+ sort.Slice(states, func(i, j int) bool {
+ return states[i] < states[j]
+ })
+ }
+
+ return &DFA{
+ States: states,
+ InitialState: initialStateHash,
+ AcceptingStatesTable: accTab,
+ TransitionTable: tranTab,
+ }
+}
+
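+// GenTransitionTable flattens the DFA into a row-major table of
+// (state count + 1) * 256 entries; row 0 stays empty because state ID 0
+// denotes an invalid transition.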
+func GenTransitionTable(dfa *DFA) (*spec.TransitionTable, error) {
+ stateHash2ID := map[string]spec.StateID{}
+ for i, s := range dfa.States {
+ // Since 0 represents an invalid value in a transition table,
+ // assign a number greater than or equal to 1 to states.
+ stateHash2ID[s] = spec.StateID(i + spec.StateIDMin.Int())
+ }
+
+ acc := make([]spec.LexModeKindID, len(dfa.States)+1)
+ for _, s := range dfa.States {
+ id, ok := dfa.AcceptingStatesTable[s]
+ if !ok {
+ continue
+ }
+ acc[stateHash2ID[s]] = id
+ }
+
+ rowCount := len(dfa.States) + 1
+ colCount := 256
+ tran := make([]spec.StateID, rowCount*colCount)
+ for s, tab := range dfa.TransitionTable {
+ for v, to := range tab {
+ tran[stateHash2ID[s].Int()*256+v] = stateHash2ID[to]
+ }
+ }
+
+ return &spec.TransitionTable{
+ InitialStateID: stateHash2ID[dfa.InitialState],
+ AcceptingStates: acc,
+ UncompressedTransition: tran,
+ RowCount: rowCount,
+ ColCount: colCount,
+ }, nil
+}
diff --git a/src/urubu/grammar/lexical/dfa/symbol_position.go b/src/urubu/grammar/lexical/dfa/symbol_position.go
new file mode 100644
index 0000000..f154251
--- /dev/null
+++ b/src/urubu/grammar/lexical/dfa/symbol_position.go
@@ -0,0 +1,182 @@
+package dfa
+
+import (
+ "encoding/binary"
+ "fmt"
+ "strings"
+)
+
+type symbolPosition uint16
+
+const (
+ symbolPositionNil symbolPosition = 0x0000
+
+ symbolPositionMin uint16 = 0x0001
+ symbolPositionMax uint16 = 0x7fff
+
+ symbolPositionMaskSymbol uint16 = 0x0000
+ symbolPositionMaskEndMark uint16 = 0x8000
+
+ symbolPositionMaskValue uint16 = 0x7fff
+)
+
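+// newSymbolPosition packs a 15-bit position value and an end-mark flag into a
+// uint16. For example, newSymbolPosition(3, false) yields 0x0003, whereas
+// newSymbolPosition(3, true) sets the high bit and yields 0x8003.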
+func newSymbolPosition(n uint16, endMark bool) (symbolPosition, error) {
+ if n < symbolPositionMin || n > symbolPositionMax {
+		return symbolPositionNil, fmt.Errorf("symbol position must be between %v and %v: n: %v, endMark: %v", symbolPositionMin, symbolPositionMax, n, endMark)
+ }
+ if endMark {
+ return symbolPosition(n | symbolPositionMaskEndMark), nil
+ }
+ return symbolPosition(n | symbolPositionMaskSymbol), nil
+}
+
+func (p symbolPosition) String() string {
+ if p.isEndMark() {
+ return fmt.Sprintf("end#%v", uint16(p)&symbolPositionMaskValue)
+ }
+ return fmt.Sprintf("sym#%v", uint16(p)&symbolPositionMaskValue)
+}
+
+func (p symbolPosition) isEndMark() bool {
+	return uint16(p)&symbolPositionMaskEndMark != 0
+}
+
+func (p symbolPosition) describe() (uint16, bool) {
+ v := uint16(p) & symbolPositionMaskValue
+ if p.isEndMark() {
+ return v, true
+ }
+ return v, false
+}
+
+type symbolPositionSet struct {
+	// `s` represents a set of symbol positions.
+	// However, the elements may contain duplicates immediately after a symbol position is added.
+	// When you need a sorted set without duplicates, obtain it via the set method.
+ s []symbolPosition
+ sorted bool
+}
+
+func newSymbolPositionSet() *symbolPositionSet {
+ return &symbolPositionSet{
+ s: []symbolPosition{},
+ sorted: false,
+ }
+}
+
+func (s *symbolPositionSet) String() string {
+ if len(s.s) <= 0 {
+ return "{}"
+ }
+ ps := s.sortAndRemoveDuplicates()
+ var b strings.Builder
+ fmt.Fprintf(&b, "{")
+ for i, p := range ps {
+ if i <= 0 {
+ fmt.Fprintf(&b, "%v", p)
+ continue
+ }
+ fmt.Fprintf(&b, ", %v", p)
+ }
+ fmt.Fprintf(&b, "}")
+ return b.String()
+}
+
+func (s *symbolPositionSet) set() []symbolPosition {
+ s.sortAndRemoveDuplicates()
+ return s.s
+}
+
+func (s *symbolPositionSet) add(pos symbolPosition) *symbolPositionSet {
+ s.s = append(s.s, pos)
+ s.sorted = false
+ return s
+}
+
+func (s *symbolPositionSet) merge(t *symbolPositionSet) *symbolPositionSet {
+ s.s = append(s.s, t.s...)
+ s.sorted = false
+ return s
+}
+
+func (s *symbolPositionSet) hash() string {
+ if len(s.s) <= 0 {
+ return ""
+ }
+ sorted := s.sortAndRemoveDuplicates()
+ var buf []byte
+ for _, p := range sorted {
+ b := make([]byte, 8)
+ binary.PutUvarint(b, uint64(p))
+ buf = append(buf, b...)
+ }
+	// Convert the byte sequence to a string so it can be used as a map key.
+	// Note that the bytes are raw symbol position values, so the string is
+	// not necessarily a well-formed UTF-8 sequence.
+ return string(buf)
+}
+
+func (s *symbolPositionSet) sortAndRemoveDuplicates() []symbolPosition {
+	if s.sorted {
+		return s.s
+	}
+	if len(s.s) == 0 {
+		// Nothing to sort; avoid indexing into an empty slice below.
+		s.sorted = true
+		return s.s
+	}
+
+ sortSymbolPositions(s.s, 0, len(s.s)-1)
+
+ // Remove duplicates.
+ lastV := s.s[0]
+ nextIdx := 1
+ for _, v := range s.s[1:] {
+ if v == lastV {
+ continue
+ }
+ s.s[nextIdx] = v
+ nextIdx++
+ lastV = v
+ }
+ s.s = s.s[:nextIdx]
+ s.sorted = true
+
+ return s.s
+}
+
+// sortSymbolPositions sorts a slice of symbol positions in place using quicksort.
+func sortSymbolPositions(ps []symbolPosition, left, right int) {
+ if left >= right {
+ return
+ }
+ var pivot symbolPosition
+ {
+ // Use a median as a pivot.
+ p1 := ps[left]
+ p2 := ps[(left+right)/2]
+ p3 := ps[right]
+ if p1 > p2 {
+ p1, p2 = p2, p1
+ }
+ if p2 > p3 {
+ p2 = p3
+ if p1 > p2 {
+ p2 = p1
+ }
+ }
+ pivot = p2
+ }
+ i := left
+ j := right
+ for i <= j {
+ for ps[i] < pivot {
+ i++
+ }
+ for ps[j] > pivot {
+ j--
+ }
+ if i <= j {
+ ps[i], ps[j] = ps[j], ps[i]
+ i++
+ j--
+ }
+ }
+ sortSymbolPositions(ps, left, j)
+ sortSymbolPositions(ps, i, right)
+}
diff --git a/src/urubu/grammar/lexical/dfa/tree.go b/src/urubu/grammar/lexical/dfa/tree.go
new file mode 100644
index 0000000..8a11aee
--- /dev/null
+++ b/src/urubu/grammar/lexical/dfa/tree.go
@@ -0,0 +1,567 @@
+package dfa
+
+import (
+ "fmt"
+ "io"
+ "sort"
+
+ "urubu/grammar/lexical/parser"
+ spec "urubu/spec/grammar"
+ "urubu/utf8"
+)
+
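+// byteTree is a regular-expression syntax tree whose leaves are byte ranges.
+// Each node exposes the nullable, first, and last attributes required by the
+// followpos-based DFA construction in dfa.go; the first and last sets are
+// memoized per node because the construction queries them repeatedly.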
+type byteTree interface {
+ fmt.Stringer
+ children() (byteTree, byteTree)
+ nullable() bool
+ first() *symbolPositionSet
+ last() *symbolPositionSet
+ clone() byteTree
+}
+
+var (
+ _ byteTree = &symbolNode{}
+ _ byteTree = &endMarkerNode{}
+ _ byteTree = &concatNode{}
+ _ byteTree = &altNode{}
+ _ byteTree = &repeatNode{}
+ _ byteTree = &optionNode{}
+)
+
+type byteRange struct {
+ from byte
+ to byte
+}
+
+type symbolNode struct {
+ byteRange
+ pos symbolPosition
+ firstMemo *symbolPositionSet
+ lastMemo *symbolPositionSet
+}
+
+func newSymbolNode(value byte) *symbolNode {
+ return &symbolNode{
+ byteRange: byteRange{
+ from: value,
+ to: value,
+ },
+ pos: symbolPositionNil,
+ }
+}
+
+func newRangeSymbolNode(from, to byte) *symbolNode {
+ return &symbolNode{
+ byteRange: byteRange{
+ from: from,
+ to: to,
+ },
+ pos: symbolPositionNil,
+ }
+}
+
+func (n *symbolNode) String() string {
+ return fmt.Sprintf("symbol: value: %v-%v, pos: %v", n.from, n.to, n.pos)
+}
+
+func (n *symbolNode) children() (byteTree, byteTree) {
+ return nil, nil
+}
+
+func (n *symbolNode) nullable() bool {
+ return false
+}
+
+func (n *symbolNode) first() *symbolPositionSet {
+ if n.firstMemo == nil {
+ n.firstMemo = newSymbolPositionSet()
+ n.firstMemo.add(n.pos)
+ }
+ return n.firstMemo
+}
+
+func (n *symbolNode) last() *symbolPositionSet {
+ if n.lastMemo == nil {
+ n.lastMemo = newSymbolPositionSet()
+ n.lastMemo.add(n.pos)
+ }
+ return n.lastMemo
+}
+
+func (n *symbolNode) clone() byteTree {
+ return newRangeSymbolNode(n.from, n.to)
+}
+
+type endMarkerNode struct {
+ id spec.LexModeKindID
+ pos symbolPosition
+ firstMemo *symbolPositionSet
+ lastMemo *symbolPositionSet
+}
+
+func newEndMarkerNode(id spec.LexModeKindID) *endMarkerNode {
+ return &endMarkerNode{
+ id: id,
+ pos: symbolPositionNil,
+ }
+}
+
+func (n *endMarkerNode) String() string {
+ return fmt.Sprintf("end: pos: %v", n.pos)
+}
+
+func (n *endMarkerNode) children() (byteTree, byteTree) {
+ return nil, nil
+}
+
+func (n *endMarkerNode) nullable() bool {
+ return false
+}
+
+func (n *endMarkerNode) first() *symbolPositionSet {
+ if n.firstMemo == nil {
+ n.firstMemo = newSymbolPositionSet()
+ n.firstMemo.add(n.pos)
+ }
+ return n.firstMemo
+}
+
+func (n *endMarkerNode) last() *symbolPositionSet {
+ if n.lastMemo == nil {
+ n.lastMemo = newSymbolPositionSet()
+ n.lastMemo.add(n.pos)
+ }
+ return n.lastMemo
+}
+
+func (n *endMarkerNode) clone() byteTree {
+ return newEndMarkerNode(n.id)
+}
+
+type concatNode struct {
+ left byteTree
+ right byteTree
+ firstMemo *symbolPositionSet
+ lastMemo *symbolPositionSet
+}
+
+func newConcatNode(left, right byteTree) *concatNode {
+ return &concatNode{
+ left: left,
+ right: right,
+ }
+}
+
+func (n *concatNode) String() string {
+ return "concat"
+}
+
+func (n *concatNode) children() (byteTree, byteTree) {
+ return n.left, n.right
+}
+
+func (n *concatNode) nullable() bool {
+ return n.left.nullable() && n.right.nullable()
+}
+
+func (n *concatNode) first() *symbolPositionSet {
+ if n.firstMemo == nil {
+ n.firstMemo = newSymbolPositionSet()
+ n.firstMemo.merge(n.left.first())
+ if n.left.nullable() {
+ n.firstMemo.merge(n.right.first())
+ }
+ n.firstMemo.sortAndRemoveDuplicates()
+ }
+ return n.firstMemo
+}
+
+func (n *concatNode) last() *symbolPositionSet {
+ if n.lastMemo == nil {
+ n.lastMemo = newSymbolPositionSet()
+ n.lastMemo.merge(n.right.last())
+ if n.right.nullable() {
+ n.lastMemo.merge(n.left.last())
+ }
+ n.lastMemo.sortAndRemoveDuplicates()
+ }
+ return n.lastMemo
+}
+
+func (n *concatNode) clone() byteTree {
+ return newConcatNode(n.left.clone(), n.right.clone())
+}
+
+type altNode struct {
+ left byteTree
+ right byteTree
+ firstMemo *symbolPositionSet
+ lastMemo *symbolPositionSet
+}
+
+func newAltNode(left, right byteTree) *altNode {
+ return &altNode{
+ left: left,
+ right: right,
+ }
+}
+
+func (n *altNode) String() string {
+ return "alt"
+}
+
+func (n *altNode) children() (byteTree, byteTree) {
+ return n.left, n.right
+}
+
+func (n *altNode) nullable() bool {
+ return n.left.nullable() || n.right.nullable()
+}
+
+func (n *altNode) first() *symbolPositionSet {
+ if n.firstMemo == nil {
+ n.firstMemo = newSymbolPositionSet()
+ n.firstMemo.merge(n.left.first())
+ n.firstMemo.merge(n.right.first())
+ n.firstMemo.sortAndRemoveDuplicates()
+ }
+ return n.firstMemo
+}
+
+func (n *altNode) last() *symbolPositionSet {
+ if n.lastMemo == nil {
+ n.lastMemo = newSymbolPositionSet()
+ n.lastMemo.merge(n.left.last())
+ n.lastMemo.merge(n.right.last())
+ n.lastMemo.sortAndRemoveDuplicates()
+ }
+ return n.lastMemo
+}
+
+func (n *altNode) clone() byteTree {
+ return newAltNode(n.left.clone(), n.right.clone())
+}
+
+type repeatNode struct {
+ left byteTree
+ firstMemo *symbolPositionSet
+ lastMemo *symbolPositionSet
+}
+
+func newRepeatNode(left byteTree) *repeatNode {
+ return &repeatNode{
+ left: left,
+ }
+}
+
+func (n *repeatNode) String() string {
+ return "repeat"
+}
+
+func (n *repeatNode) children() (byteTree, byteTree) {
+ return n.left, nil
+}
+
+func (n *repeatNode) nullable() bool {
+ return true
+}
+
+func (n *repeatNode) first() *symbolPositionSet {
+ if n.firstMemo == nil {
+ n.firstMemo = newSymbolPositionSet()
+ n.firstMemo.merge(n.left.first())
+ n.firstMemo.sortAndRemoveDuplicates()
+ }
+ return n.firstMemo
+}
+
+func (n *repeatNode) last() *symbolPositionSet {
+ if n.lastMemo == nil {
+ n.lastMemo = newSymbolPositionSet()
+ n.lastMemo.merge(n.left.last())
+ n.lastMemo.sortAndRemoveDuplicates()
+ }
+ return n.lastMemo
+}
+
+func (n *repeatNode) clone() byteTree {
+ return newRepeatNode(n.left.clone())
+}
+
+type optionNode struct {
+ left byteTree
+ firstMemo *symbolPositionSet
+ lastMemo *symbolPositionSet
+}
+
+func newOptionNode(left byteTree) *optionNode {
+ return &optionNode{
+ left: left,
+ }
+}
+
+func (n *optionNode) String() string {
+ return "option"
+}
+
+func (n *optionNode) children() (byteTree, byteTree) {
+ return n.left, nil
+}
+
+func (n *optionNode) nullable() bool {
+ return true
+}
+
+func (n *optionNode) first() *symbolPositionSet {
+ if n.firstMemo == nil {
+ n.firstMemo = newSymbolPositionSet()
+ n.firstMemo.merge(n.left.first())
+ n.firstMemo.sortAndRemoveDuplicates()
+ }
+ return n.firstMemo
+}
+
+func (n *optionNode) last() *symbolPositionSet {
+ if n.lastMemo == nil {
+ n.lastMemo = newSymbolPositionSet()
+ n.lastMemo.merge(n.left.last())
+ n.lastMemo.sortAndRemoveDuplicates()
+ }
+ return n.lastMemo
+}
+
+func (n *optionNode) clone() byteTree {
+ return newOptionNode(n.left.clone())
+}
+
+type followTable map[symbolPosition]*symbolPositionSet
+
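+// genFollowTable computes the followpos relation of the tree: for a
+// concatenation, every position in last(left) is followed by first(right);
+// for a repetition, every position in last(n) is followed by first(n). For
+// example, in `ab*c` the position of `b` is followed by both itself (via the
+// repeat) and the position of `c` (via the concatenation).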
+func genFollowTable(root byteTree) followTable {
+ follow := followTable{}
+ calcFollow(follow, root)
+ return follow
+}
+
+func calcFollow(follow followTable, ast byteTree) {
+ if ast == nil {
+ return
+ }
+ left, right := ast.children()
+ calcFollow(follow, left)
+ calcFollow(follow, right)
+ switch n := ast.(type) {
+ case *concatNode:
+ l, r := n.children()
+ for _, p := range l.last().set() {
+ if _, ok := follow[p]; !ok {
+ follow[p] = newSymbolPositionSet()
+ }
+ follow[p].merge(r.first())
+ }
+ case *repeatNode:
+ for _, p := range n.last().set() {
+ if _, ok := follow[p]; !ok {
+ follow[p] = newSymbolPositionSet()
+ }
+ follow[p].merge(n.first())
+ }
+ }
+}
+
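+// positionSymbols numbers every symbol and end-mark leaf in evaluation order,
+// starting from n, and then touches first and last on each node so that the
+// memoized sets are populated once all positions have been assigned.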
+func positionSymbols(node byteTree, n uint16) (uint16, error) {
+ if node == nil {
+ return n, nil
+ }
+
+ l, r := node.children()
+ p := n
+ p, err := positionSymbols(l, p)
+ if err != nil {
+ return p, err
+ }
+ p, err = positionSymbols(r, p)
+ if err != nil {
+ return p, err
+ }
+ switch n := node.(type) {
+ case *symbolNode:
+ n.pos, err = newSymbolPosition(p, false)
+ if err != nil {
+ return p, err
+ }
+ p++
+ case *endMarkerNode:
+ n.pos, err = newSymbolPosition(p, true)
+ if err != nil {
+ return p, err
+ }
+ p++
+ }
+ node.first()
+ node.last()
+ return p, nil
+}
+
+func concat(ts ...byteTree) byteTree {
+ nonNilNodes := []byteTree{}
+ for _, t := range ts {
+ if t == nil {
+ continue
+ }
+ nonNilNodes = append(nonNilNodes, t)
+ }
+ if len(nonNilNodes) <= 0 {
+ return nil
+ }
+ if len(nonNilNodes) == 1 {
+ return nonNilNodes[0]
+ }
+ concat := newConcatNode(nonNilNodes[0], nonNilNodes[1])
+ for _, t := range nonNilNodes[2:] {
+ concat = newConcatNode(concat, t)
+ }
+ return concat
+}
+
+func oneOf(ts ...byteTree) byteTree {
+ nonNilNodes := []byteTree{}
+ for _, t := range ts {
+ if t == nil {
+ continue
+ }
+ nonNilNodes = append(nonNilNodes, t)
+ }
+ if len(nonNilNodes) <= 0 {
+ return nil
+ }
+ if len(nonNilNodes) == 1 {
+ return nonNilNodes[0]
+ }
+ alt := newAltNode(nonNilNodes[0], nonNilNodes[1])
+ for _, t := range nonNilNodes[2:] {
+ alt = newAltNode(alt, t)
+ }
+ return alt
+}
+
+//nolint:unused
+func printByteTree(w io.Writer, t byteTree, ruledLine string, childRuledLinePrefix string, withAttrs bool) {
+ if t == nil {
+ return
+ }
+ fmt.Fprintf(w, "%v%v", ruledLine, t)
+ if withAttrs {
+ fmt.Fprintf(w, ", nullable: %v, first: %v, last: %v", t.nullable(), t.first(), t.last())
+ }
+ fmt.Fprintf(w, "\n")
+ left, right := t.children()
+ children := []byteTree{}
+ if left != nil {
+ children = append(children, left)
+ }
+ if right != nil {
+ children = append(children, right)
+ }
+ num := len(children)
+ for i, child := range children {
+ line := "└─ "
+ if num > 1 {
+ if i == 0 {
+ line = "├─ "
+ } else if i < num-1 {
+ line = "│ "
+ }
+ }
+ prefix := "│ "
+ if i >= num-1 {
+ prefix = " "
+ }
+ printByteTree(w, child, childRuledLinePrefix+line, childRuledLinePrefix+prefix, withAttrs)
+ }
+}
+
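+// ConvertCPTreeToByteTree lowers the code-point-based trees produced by the
+// parser into a single byte-based tree: each pattern is terminated by an end
+// marker carrying its kind ID, all patterns are joined with oneOf, and every
+// code point range is expanded into alternations of UTF-8 byte-range
+// sequences via utf8.GenCharBlocks. The result feeds the DFA construction,
+// as lexical/compiler.go does (error handling elided):
+//
+//	root, symTab, _ := dfa.ConvertCPTreeToByteTree(cpTrees)
+//	d := dfa.GenDFA(root, symTab)
+//	tranTab, _ := dfa.GenTransitionTable(d)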
+func ConvertCPTreeToByteTree(cpTrees map[spec.LexModeKindID]parser.CPTree) (byteTree, *symbolTable, error) {
+ var ids []spec.LexModeKindID
+ for id := range cpTrees {
+ ids = append(ids, id)
+ }
+ sort.Slice(ids, func(i, j int) bool {
+ return ids[i] < ids[j]
+ })
+
+ var bt byteTree
+ for _, id := range ids {
+ cpTree := cpTrees[id]
+ t, err := convCPTreeToByteTree(cpTree)
+ if err != nil {
+ return nil, nil, err
+ }
+ bt = oneOf(bt, concat(t, newEndMarkerNode(id)))
+ }
+ _, err := positionSymbols(bt, symbolPositionMin)
+ if err != nil {
+ return nil, nil, err
+ }
+
+ return bt, genSymbolTable(bt), nil
+}
+
+func convCPTreeToByteTree(cpTree parser.CPTree) (byteTree, error) {
+ if from, to, ok := cpTree.Range(); ok {
+ bs, err := utf8.GenCharBlocks(from, to)
+ if err != nil {
+ return nil, err
+ }
+ var a byteTree
+ for _, b := range bs {
+ var c byteTree
+ for i := 0; i < len(b.From); i++ {
+ c = concat(c, newRangeSymbolNode(b.From[i], b.To[i]))
+ }
+ a = oneOf(a, c)
+ }
+ return a, nil
+ }
+
+ if tree, ok := cpTree.Repeatable(); ok {
+ t, err := convCPTreeToByteTree(tree)
+ if err != nil {
+ return nil, err
+ }
+ return newRepeatNode(t), nil
+ }
+
+ if tree, ok := cpTree.Optional(); ok {
+ t, err := convCPTreeToByteTree(tree)
+ if err != nil {
+ return nil, err
+ }
+ return newOptionNode(t), nil
+ }
+
+ if left, right, ok := cpTree.Concatenation(); ok {
+ l, err := convCPTreeToByteTree(left)
+ if err != nil {
+ return nil, err
+ }
+ r, err := convCPTreeToByteTree(right)
+ if err != nil {
+ return nil, err
+ }
+ return newConcatNode(l, r), nil
+ }
+
+ if left, right, ok := cpTree.Alternatives(); ok {
+ l, err := convCPTreeToByteTree(left)
+ if err != nil {
+ return nil, err
+ }
+ r, err := convCPTreeToByteTree(right)
+ if err != nil {
+ return nil, err
+ }
+ return newAltNode(l, r), nil
+ }
+
+ return nil, fmt.Errorf("invalid tree type: %T", cpTree)
+}
diff --git a/src/urubu/grammar/lexical/entry.go b/src/urubu/grammar/lexical/entry.go
new file mode 100644
index 0000000..44af8ea
--- /dev/null
+++ b/src/urubu/grammar/lexical/entry.go
@@ -0,0 +1,171 @@
+package lexical
+
+import (
+ "fmt"
+ "sort"
+ "strings"
+
+ spec "urubu/spec/grammar"
+)
+
+type LexEntry struct {
+ Kind spec.LexKindName
+ Pattern string
+ Modes []spec.LexModeName
+ Push spec.LexModeName
+ Pop bool
+ Fragment bool
+}
+
+type LexSpec struct {
+ Entries []*LexEntry
+}
+
+func (s *LexSpec) Validate() error {
+ if len(s.Entries) <= 0 {
+ return fmt.Errorf("the lexical specification must have at least one entry")
+ }
+ {
+ ks := map[string]struct{}{}
+ fks := map[string]struct{}{}
+ for _, e := range s.Entries {
+ // Allow duplicate names between fragments and non-fragments.
+ if e.Fragment {
+ if _, exist := fks[e.Kind.String()]; exist {
+ return fmt.Errorf("kinds `%v` are duplicates", e.Kind)
+ }
+ fks[e.Kind.String()] = struct{}{}
+ } else {
+ if _, exist := ks[e.Kind.String()]; exist {
+ return fmt.Errorf("kinds `%v` are duplicates", e.Kind)
+ }
+ ks[e.Kind.String()] = struct{}{}
+ }
+ }
+ }
+ {
+ kinds := []string{}
+ modes := []string{
+ spec.LexModeNameDefault.String(), // This is a predefined mode.
+ }
+ for _, e := range s.Entries {
+ if e.Fragment {
+ continue
+ }
+
+ kinds = append(kinds, e.Kind.String())
+
+ for _, m := range e.Modes {
+ modes = append(modes, m.String())
+ }
+ }
+
+ kindErrs := findSpellingInconsistenciesErrors(kinds, nil)
+ modeErrs := findSpellingInconsistenciesErrors(modes, func(ids []string) error {
+ if SnakeCaseToUpperCamelCase(ids[0]) == SnakeCaseToUpperCamelCase(spec.LexModeNameDefault.String()) {
+ var b strings.Builder
+ fmt.Fprintf(&b, "%+v", ids[0])
+ for _, id := range ids[1:] {
+ fmt.Fprintf(&b, ", %+v", id)
+ }
+ return fmt.Errorf("these identifiers are treated as the same. please use the same spelling as predefined '%v': %v", spec.LexModeNameDefault, b.String())
+ }
+ return nil
+ })
+ errs := append(kindErrs, modeErrs...)
+ if len(errs) > 0 {
+ var b strings.Builder
+ fmt.Fprintf(&b, "%v", errs[0])
+ for _, err := range errs[1:] {
+ fmt.Fprintf(&b, "\n%v", err)
+ }
+			return fmt.Errorf("%s", b.String())
+ }
+ }
+
+ return nil
+}
+
+func findSpellingInconsistenciesErrors(ids []string, hook func(ids []string) error) []error {
+ duplicated := FindSpellingInconsistencies(ids)
+ if len(duplicated) == 0 {
+ return nil
+ }
+
+ var errs []error
+ for _, dup := range duplicated {
+ if hook != nil {
+ err := hook(dup)
+ if err != nil {
+ errs = append(errs, err)
+ continue
+ }
+ }
+
+ var b strings.Builder
+ fmt.Fprintf(&b, "%+v", dup[0])
+ for _, id := range dup[1:] {
+ fmt.Fprintf(&b, ", %+v", id)
+ }
+ err := fmt.Errorf("these identifiers are treated as the same. please use the same spelling: %v", b.String())
+ errs = append(errs, err)
+ }
+
+ return errs
+}
+
+// FindSpellingInconsistencies finds spelling inconsistencies among identifiers. Two identifiers are considered
+// the same if they are spelled identically when expressed in UpperCamelCase. For example, `left_paren` and
+// `LeftParen` are spelled the same in UpperCamelCase, so they are considered a spelling inconsistency.
+func FindSpellingInconsistencies(ids []string) [][]string {
+ m := map[string][]string{}
+ for _, id := range removeDuplicates(ids) {
+ c := SnakeCaseToUpperCamelCase(id)
+ m[c] = append(m[c], id)
+ }
+
+ var duplicated [][]string
+ for _, camels := range m {
+ if len(camels) == 1 {
+ continue
+ }
+ duplicated = append(duplicated, camels)
+ }
+
+ for _, dup := range duplicated {
+ sort.Slice(dup, func(i, j int) bool {
+ return dup[i] < dup[j]
+ })
+ }
+ sort.Slice(duplicated, func(i, j int) bool {
+ return duplicated[i][0] < duplicated[j][0]
+ })
+
+ return duplicated
+}
+
+func removeDuplicates(s []string) []string {
+ m := map[string]struct{}{}
+ for _, v := range s {
+ m[v] = struct{}{}
+ }
+
+ var unique []string
+ for v := range m {
+ unique = append(unique, v)
+ }
+
+ return unique
+}
+
+func SnakeCaseToUpperCamelCase(snake string) string {
+ elems := strings.Split(snake, "_")
+ for i, e := range elems {
+ if len(e) == 0 {
+ continue
+ }
+ elems[i] = strings.ToUpper(string(e[0])) + e[1:]
+ }
+
+ return strings.Join(elems, "")
+}
diff --git a/src/urubu/grammar/lexical/parser/error.go b/src/urubu/grammar/lexical/parser/error.go
new file mode 100644
index 0000000..be81da4
--- /dev/null
+++ b/src/urubu/grammar/lexical/parser/error.go
@@ -0,0 +1,36 @@
+package parser
+
+import "fmt"
+
+var (
+ ParseErr = fmt.Errorf("parse error")
+
+ // lexical errors
+	synErrIncompletedEscSeq = fmt.Errorf("incomplete escape sequence; unexpected EOF following \\")
+ synErrInvalidEscSeq = fmt.Errorf("invalid escape sequence")
+ synErrInvalidCodePoint = fmt.Errorf("code points must consist of just 4 or 6 hex digits")
+ synErrCharPropInvalidSymbol = fmt.Errorf("invalid character property symbol")
+ SynErrFragmentInvalidSymbol = fmt.Errorf("invalid fragment symbol")
+
+ // syntax errors
+ synErrUnexpectedToken = fmt.Errorf("unexpected token")
+ synErrNullPattern = fmt.Errorf("a pattern must be a non-empty byte sequence")
+ synErrUnmatchablePattern = fmt.Errorf("a pattern cannot match any characters")
+ synErrAltLackOfOperand = fmt.Errorf("an alternation expression must have operands")
+ synErrRepNoTarget = fmt.Errorf("a repeat expression must have an operand")
+ synErrGroupNoElem = fmt.Errorf("a grouping expression must include at least one character")
+ synErrGroupUnclosed = fmt.Errorf("unclosed grouping expression")
+ synErrGroupNoInitiator = fmt.Errorf(") needs preceding (")
+ synErrGroupInvalidForm = fmt.Errorf("invalid grouping expression")
+ synErrBExpNoElem = fmt.Errorf("a bracket expression must include at least one character")
+ synErrBExpUnclosed = fmt.Errorf("unclosed bracket expression")
+ synErrBExpInvalidForm = fmt.Errorf("invalid bracket expression")
+ synErrRangeInvalidOrder = fmt.Errorf("a range expression with invalid order")
+ synErrRangePropIsUnavailable = fmt.Errorf("a property expression is unavailable in a range expression")
+ synErrRangeInvalidForm = fmt.Errorf("invalid range expression")
+ synErrCPExpInvalidForm = fmt.Errorf("invalid code point expression")
+	synErrCPExpOutOfRange = fmt.Errorf("a code point must be between U+0000 and U+10FFFF")
+ synErrCharPropExpInvalidForm = fmt.Errorf("invalid character property expression")
+ synErrCharPropUnsupported = fmt.Errorf("unsupported character property")
+ synErrFragmentExpInvalidForm = fmt.Errorf("invalid fragment expression")
+)
diff --git a/src/urubu/grammar/lexical/parser/fragment.go b/src/urubu/grammar/lexical/parser/fragment.go
new file mode 100644
index 0000000..196c00b
--- /dev/null
+++ b/src/urubu/grammar/lexical/parser/fragment.go
@@ -0,0 +1,72 @@
+package parser
+
+import (
+ "fmt"
+
+ spec "urubu/spec/grammar"
+)
+
+type incompleteFragment struct {
+ kind spec.LexKindName
+ root *rootNode
+}
+
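+// CompleteFragments resolves references among the fragments themselves by
+// fixed-point iteration: each pass applies every already-complete fragment to
+// the incomplete ones, and the loop stops with ParseErr when a pass makes no
+// progress, which indicates an undefined fragment or a reference cycle.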
+func CompleteFragments(fragments map[spec.LexKindName]CPTree) error {
+ if len(fragments) == 0 {
+ return nil
+ }
+
+ completeFragments := map[spec.LexKindName]CPTree{}
+ incompleteFragments := []*incompleteFragment{}
+ for kind, tree := range fragments {
+ root, ok := tree.(*rootNode)
+ if !ok {
+ return fmt.Errorf("CompleteFragments can take only *rootNode: %T", tree)
+ }
+ if root.incomplete() {
+ incompleteFragments = append(incompleteFragments, &incompleteFragment{
+ kind: kind,
+ root: root,
+ })
+ } else {
+ completeFragments[kind] = root
+ }
+ }
+ for len(incompleteFragments) > 0 {
+ lastIncompCount := len(incompleteFragments)
+ remainingFragments := []*incompleteFragment{}
+ for _, e := range incompleteFragments {
+ complete, err := ApplyFragments(e.root, completeFragments)
+ if err != nil {
+ return err
+ }
+ if !complete {
+ remainingFragments = append(remainingFragments, e)
+ } else {
+ completeFragments[e.kind] = e.root
+ }
+ }
+ incompleteFragments = remainingFragments
+ if len(incompleteFragments) == lastIncompCount {
+ return ParseErr
+ }
+ }
+
+ return nil
+}
+
+func ApplyFragments(t CPTree, fragments map[spec.LexKindName]CPTree) (bool, error) {
+ root, ok := t.(*rootNode)
+ if !ok {
+ return false, fmt.Errorf("ApplyFragments can take only *rootNode type: %T", t)
+ }
+
+ for name, frag := range fragments {
+ err := root.applyFragment(name, frag)
+ if err != nil {
+ return false, err
+ }
+ }
+
+ return !root.incomplete(), nil
+}
diff --git a/src/urubu/grammar/lexical/parser/lexer.go b/src/urubu/grammar/lexical/parser/lexer.go
new file mode 100644
index 0000000..3861825
--- /dev/null
+++ b/src/urubu/grammar/lexical/parser/lexer.go
@@ -0,0 +1,594 @@
+package parser
+
+import (
+ "bufio"
+ "fmt"
+ "io"
+ "strings"
+)
+
+type tokenKind string
+
+const (
+ tokenKindChar tokenKind = "char"
+ tokenKindAnyChar tokenKind = "."
+ tokenKindRepeat tokenKind = "*"
+ tokenKindRepeatOneOrMore tokenKind = "+"
+ tokenKindOption tokenKind = "?"
+ tokenKindAlt tokenKind = "|"
+ tokenKindGroupOpen tokenKind = "("
+ tokenKindGroupClose tokenKind = ")"
+ tokenKindBExpOpen tokenKind = "["
+ tokenKindInverseBExpOpen tokenKind = "[^"
+ tokenKindBExpClose tokenKind = "]"
+ tokenKindCharRange tokenKind = "-"
+ tokenKindCodePointLeader tokenKind = "\\u"
+ tokenKindCharPropLeader tokenKind = "\\p"
+ tokenKindFragmentLeader tokenKind = "\\f"
+ tokenKindLBrace tokenKind = "{"
+ tokenKindRBrace tokenKind = "}"
+ tokenKindEqual tokenKind = "="
+ tokenKindCodePoint tokenKind = "code point"
+ tokenKindCharPropSymbol tokenKind = "character property symbol"
+ tokenKindFragmentSymbol tokenKind = "fragment symbol"
+ tokenKindEOF tokenKind = "eof"
+)
+
+type token struct {
+ kind tokenKind
+ char rune
+ propSymbol string
+ codePoint string
+ fragmentSymbol string
+}
+
+const nullChar = '\u0000'
+
+func newToken(kind tokenKind, char rune) *token {
+ return &token{
+ kind: kind,
+ char: char,
+ }
+}
+
+func newCodePointToken(codePoint string) *token {
+ return &token{
+ kind: tokenKindCodePoint,
+ codePoint: codePoint,
+ }
+}
+
+func newCharPropSymbolToken(propSymbol string) *token {
+ return &token{
+ kind: tokenKindCharPropSymbol,
+ propSymbol: propSymbol,
+ }
+}
+
+func newFragmentSymbolToken(fragmentSymbol string) *token {
+ return &token{
+ kind: tokenKindFragmentSymbol,
+ fragmentSymbol: fragmentSymbol,
+ }
+}
+
+type lexerMode string
+
+const (
+ lexerModeDefault lexerMode = "default"
+ lexerModeBExp lexerMode = "bracket expression"
+ lexerModeCPExp lexerMode = "code point expression"
+ lexerModeCharPropExp lexerMode = "character property expression"
+ lexerModeFragmentExp lexerMode = "fragment expression"
+)
+
+type lexerModeStack struct {
+ stack []lexerMode
+}
+
+func newLexerModeStack() *lexerModeStack {
+ return &lexerModeStack{
+ stack: []lexerMode{
+ lexerModeDefault,
+ },
+ }
+}
+
+func (s *lexerModeStack) top() lexerMode {
+ return s.stack[len(s.stack)-1]
+}
+
+func (s *lexerModeStack) push(m lexerMode) {
+ s.stack = append(s.stack, m)
+}
+
+func (s *lexerModeStack) pop() {
+ s.stack = s.stack[:len(s.stack)-1]
+}
+
+type rangeState string
+
+// [a-z]
+// ^^^^
+// |||`-- ready
+// ||`-- expect range terminator
+// |`-- read range initiator
+// `-- ready
+const (
+ rangeStateReady rangeState = "ready"
+ rangeStateReadRangeInitiator rangeState = "read range initiator"
+ rangeStateExpectRangeTerminator rangeState = "expect range terminator"
+)
+
+type lexer struct {
+ src *bufio.Reader
+ peekChar2 rune
+ peekEOF2 bool
+ peekChar1 rune
+ peekEOF1 bool
+ lastChar rune
+ reachedEOF bool
+ prevChar1 rune
+ prevEOF1 bool
+ prevChar2 rune
+	prevEOF2   bool
+ modeStack *lexerModeStack
+ rangeState rangeState
+
+ errCause error
+ errDetail string
+}
+
+func newLexer(src io.Reader) *lexer {
+ return &lexer{
+ src: bufio.NewReader(src),
+ peekChar2: nullChar,
+ peekEOF2: false,
+ peekChar1: nullChar,
+ peekEOF1: false,
+ lastChar: nullChar,
+ reachedEOF: false,
+ prevChar1: nullChar,
+ prevEOF1: false,
+ prevChar2: nullChar,
+		prevEOF2:   false,
+ modeStack: newLexerModeStack(),
+ rangeState: rangeStateReady,
+ }
+}
+
+func (l *lexer) error() (string, error) {
+ return l.errDetail, l.errCause
+}
+
+func (l *lexer) next() (*token, error) {
+ c, eof, err := l.read()
+ if err != nil {
+ return nil, err
+ }
+ if eof {
+ return newToken(tokenKindEOF, nullChar), nil
+ }
+
+ switch l.modeStack.top() {
+ case lexerModeBExp:
+ tok, err := l.nextInBExp(c)
+ if err != nil {
+ return nil, err
+ }
+ if tok.kind == tokenKindChar || tok.kind == tokenKindCodePointLeader || tok.kind == tokenKindCharPropLeader {
+ switch l.rangeState {
+ case rangeStateReady:
+ l.rangeState = rangeStateReadRangeInitiator
+ case rangeStateExpectRangeTerminator:
+ l.rangeState = rangeStateReady
+ }
+ }
+ switch tok.kind {
+ case tokenKindBExpClose:
+ l.modeStack.pop()
+ case tokenKindCharRange:
+ l.rangeState = rangeStateExpectRangeTerminator
+ case tokenKindCodePointLeader:
+ l.modeStack.push(lexerModeCPExp)
+ case tokenKindCharPropLeader:
+ l.modeStack.push(lexerModeCharPropExp)
+ }
+ return tok, nil
+ case lexerModeCPExp:
+ tok, err := l.nextInCodePoint(c)
+ if err != nil {
+ return nil, err
+ }
+ switch tok.kind {
+ case tokenKindRBrace:
+ l.modeStack.pop()
+ }
+ return tok, nil
+ case lexerModeCharPropExp:
+ tok, err := l.nextInCharProp(c)
+ if err != nil {
+ return nil, err
+ }
+ switch tok.kind {
+ case tokenKindRBrace:
+ l.modeStack.pop()
+ }
+ return tok, nil
+ case lexerModeFragmentExp:
+ tok, err := l.nextInFragment(c)
+ if err != nil {
+ return nil, err
+ }
+ switch tok.kind {
+ case tokenKindRBrace:
+ l.modeStack.pop()
+ }
+ return tok, nil
+ default:
+ tok, err := l.nextInDefault(c)
+ if err != nil {
+ return nil, err
+ }
+ switch tok.kind {
+ case tokenKindBExpOpen:
+ l.modeStack.push(lexerModeBExp)
+ l.rangeState = rangeStateReady
+ case tokenKindInverseBExpOpen:
+ l.modeStack.push(lexerModeBExp)
+ l.rangeState = rangeStateReady
+ case tokenKindCodePointLeader:
+ l.modeStack.push(lexerModeCPExp)
+ case tokenKindCharPropLeader:
+ l.modeStack.push(lexerModeCharPropExp)
+ case tokenKindFragmentLeader:
+ l.modeStack.push(lexerModeFragmentExp)
+ }
+ return tok, nil
+ }
+}
+
+func (l *lexer) nextInDefault(c rune) (*token, error) {
+ switch c {
+ case '*':
+ return newToken(tokenKindRepeat, nullChar), nil
+ case '+':
+ return newToken(tokenKindRepeatOneOrMore, nullChar), nil
+ case '?':
+ return newToken(tokenKindOption, nullChar), nil
+ case '.':
+ return newToken(tokenKindAnyChar, nullChar), nil
+ case '|':
+ return newToken(tokenKindAlt, nullChar), nil
+ case '(':
+ return newToken(tokenKindGroupOpen, nullChar), nil
+ case ')':
+ return newToken(tokenKindGroupClose, nullChar), nil
+ case '[':
+ c1, eof, err := l.read()
+ if err != nil {
+ return nil, err
+ }
+ if eof {
+ err := l.restore()
+ if err != nil {
+ return nil, err
+ }
+ return newToken(tokenKindBExpOpen, nullChar), nil
+ }
+ if c1 != '^' {
+ err := l.restore()
+ if err != nil {
+ return nil, err
+ }
+ return newToken(tokenKindBExpOpen, nullChar), nil
+ }
+ c2, eof, err := l.read()
+ if err != nil {
+ return nil, err
+ }
+ if eof {
+ err := l.restore()
+ if err != nil {
+ return nil, err
+ }
+ return newToken(tokenKindInverseBExpOpen, nullChar), nil
+ }
+ if c2 != ']' {
+ err := l.restore()
+ if err != nil {
+ return nil, err
+ }
+ return newToken(tokenKindInverseBExpOpen, nullChar), nil
+ }
+ err = l.restore()
+ if err != nil {
+ return nil, err
+ }
+ err = l.restore()
+ if err != nil {
+ return nil, err
+ }
+ return newToken(tokenKindBExpOpen, nullChar), nil
+ case '\\':
+ c, eof, err := l.read()
+ if err != nil {
+ return nil, err
+ }
+ if eof {
+ l.errCause = synErrIncompletedEscSeq
+ return nil, ParseErr
+ }
+ if c == 'u' {
+ return newToken(tokenKindCodePointLeader, nullChar), nil
+ }
+ if c == 'p' {
+ return newToken(tokenKindCharPropLeader, nullChar), nil
+ }
+ if c == 'f' {
+ return newToken(tokenKindFragmentLeader, nullChar), nil
+ }
+ if c == '\\' || c == '.' || c == '*' || c == '+' || c == '?' || c == '|' || c == '(' || c == ')' || c == '[' || c == ']' {
+ return newToken(tokenKindChar, c), nil
+ }
+ l.errCause = synErrInvalidEscSeq
+ l.errDetail = fmt.Sprintf("\\%v is not supported", string(c))
+ return nil, ParseErr
+ default:
+ return newToken(tokenKindChar, c), nil
+ }
+}
+
+func (l *lexer) nextInBExp(c rune) (*token, error) {
+ switch c {
+ case '-':
+ if l.rangeState != rangeStateReadRangeInitiator {
+ return newToken(tokenKindChar, c), nil
+ }
+ c1, eof, err := l.read()
+ if err != nil {
+ return nil, err
+ }
+ if eof {
+ err := l.restore()
+ if err != nil {
+ return nil, err
+ }
+ return newToken(tokenKindChar, c), nil
+ }
+ if c1 != ']' {
+ err := l.restore()
+ if err != nil {
+ return nil, err
+ }
+ return newToken(tokenKindCharRange, nullChar), nil
+ }
+ err = l.restore()
+ if err != nil {
+ return nil, err
+ }
+ return newToken(tokenKindChar, c), nil
+ case ']':
+ return newToken(tokenKindBExpClose, nullChar), nil
+ case '\\':
+ c, eof, err := l.read()
+ if err != nil {
+ return nil, err
+ }
+ if eof {
+ l.errCause = synErrIncompletedEscSeq
+ return nil, ParseErr
+ }
+ if c == 'u' {
+ return newToken(tokenKindCodePointLeader, nullChar), nil
+ }
+ if c == 'p' {
+ return newToken(tokenKindCharPropLeader, nullChar), nil
+ }
+ if c == '\\' || c == '^' || c == '-' || c == ']' {
+ return newToken(tokenKindChar, c), nil
+ }
+ l.errCause = synErrInvalidEscSeq
+ l.errDetail = fmt.Sprintf("\\%v is not supported in a bracket expression", string(c))
+ return nil, ParseErr
+ default:
+ return newToken(tokenKindChar, c), nil
+ }
+}
+
+func (l *lexer) nextInCodePoint(c rune) (*token, error) {
+ switch c {
+ case '{':
+ return newToken(tokenKindLBrace, nullChar), nil
+ case '}':
+ return newToken(tokenKindRBrace, nullChar), nil
+ default:
+ if !isHexDigit(c) {
+ l.errCause = synErrInvalidCodePoint
+ return nil, ParseErr
+ }
+ var b strings.Builder
+ fmt.Fprint(&b, string(c))
+ n := 1
+ for {
+ c, eof, err := l.read()
+ if err != nil {
+ return nil, err
+ }
+ if eof {
+ err := l.restore()
+ if err != nil {
+ return nil, err
+ }
+ break
+ }
+ if c == '}' {
+ err := l.restore()
+ if err != nil {
+ return nil, err
+ }
+ break
+ }
+ if !isHexDigit(c) || n >= 6 {
+ l.errCause = synErrInvalidCodePoint
+ return nil, ParseErr
+ }
+ fmt.Fprint(&b, string(c))
+ n++
+ }
+ cp := b.String()
+ cpLen := len(cp)
+ if !(cpLen == 4 || cpLen == 6) {
+ l.errCause = synErrInvalidCodePoint
+ return nil, ParseErr
+ }
+ return newCodePointToken(b.String()), nil
+ }
+}
+
+func isHexDigit(c rune) bool {
+	if c >= '0' && c <= '9' || c >= 'A' && c <= 'F' || c >= 'a' && c <= 'f' {
+ return true
+ }
+ return false
+}
+
+func (l *lexer) nextInCharProp(c rune) (*token, error) {
+ switch c {
+ case '{':
+ return newToken(tokenKindLBrace, nullChar), nil
+ case '}':
+ return newToken(tokenKindRBrace, nullChar), nil
+ case '=':
+ return newToken(tokenKindEqual, nullChar), nil
+ default:
+ var b strings.Builder
+ fmt.Fprint(&b, string(c))
+ n := 1
+ for {
+ c, eof, err := l.read()
+ if err != nil {
+ return nil, err
+ }
+ if eof {
+ err := l.restore()
+ if err != nil {
+ return nil, err
+ }
+ break
+ }
+ if c == '}' || c == '=' {
+ err := l.restore()
+ if err != nil {
+ return nil, err
+ }
+ break
+ }
+ fmt.Fprint(&b, string(c))
+ n++
+ }
+ sym := strings.TrimSpace(b.String())
+ if len(sym) == 0 {
+ l.errCause = synErrCharPropInvalidSymbol
+ return nil, ParseErr
+ }
+ return newCharPropSymbolToken(sym), nil
+ }
+}
+
+func (l *lexer) nextInFragment(c rune) (*token, error) {
+ switch c {
+ case '{':
+ return newToken(tokenKindLBrace, nullChar), nil
+ case '}':
+ return newToken(tokenKindRBrace, nullChar), nil
+ default:
+ var b strings.Builder
+ fmt.Fprint(&b, string(c))
+ n := 1
+ for {
+ c, eof, err := l.read()
+ if err != nil {
+ return nil, err
+ }
+ if eof {
+ err := l.restore()
+ if err != nil {
+ return nil, err
+ }
+ break
+ }
+ if c == '}' {
+ err := l.restore()
+ if err != nil {
+ return nil, err
+ }
+ break
+ }
+ fmt.Fprint(&b, string(c))
+ n++
+ }
+ sym := strings.TrimSpace(b.String())
+ if len(sym) == 0 {
+ l.errCause = SynErrFragmentInvalidSymbol
+ return nil, ParseErr
+ }
+ return newFragmentSymbolToken(sym), nil
+ }
+}
+
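+// read returns the next character along with an EOF flag. Together with
+// restore, it maintains a sliding window over the input: up to two characters
+// of pushback (peekChar1/peekChar2) ahead of the cursor and the last two
+// consumed characters (prevChar1/prevChar2) behind it. This is the lookahead
+// the lexer needs to distinguish tokens such as `[` and `[^`.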
+func (l *lexer) read() (rune, bool, error) {
+ if l.reachedEOF {
+ return l.lastChar, l.reachedEOF, nil
+ }
+ if l.peekChar1 != nullChar || l.peekEOF1 {
+ l.prevChar2 = l.prevChar1
+		l.prevEOF2 = l.prevEOF1
+ l.prevChar1 = l.lastChar
+ l.prevEOF1 = l.reachedEOF
+ l.lastChar = l.peekChar1
+ l.reachedEOF = l.peekEOF1
+ l.peekChar1 = l.peekChar2
+ l.peekEOF1 = l.peekEOF2
+ l.peekChar2 = nullChar
+ l.peekEOF2 = false
+ return l.lastChar, l.reachedEOF, nil
+ }
+ c, _, err := l.src.ReadRune()
+ if err != nil {
+ if err == io.EOF {
+ l.prevChar2 = l.prevChar1
+			l.prevEOF2 = l.prevEOF1
+ l.prevChar1 = l.lastChar
+ l.prevEOF1 = l.reachedEOF
+ l.lastChar = nullChar
+ l.reachedEOF = true
+ return l.lastChar, l.reachedEOF, nil
+ }
+ return nullChar, false, err
+ }
+ l.prevChar2 = l.prevChar1
+	l.prevEOF2 = l.prevEOF1
+ l.prevChar1 = l.lastChar
+ l.prevEOF1 = l.reachedEOF
+ l.lastChar = c
+ l.reachedEOF = false
+ return l.lastChar, l.reachedEOF, nil
+}
+
+func (l *lexer) restore() error {
+ if l.lastChar == nullChar && !l.reachedEOF {
+ return fmt.Errorf("failed to call restore() because the last character is null")
+ }
+ l.peekChar2 = l.peekChar1
+ l.peekEOF2 = l.peekEOF1
+ l.peekChar1 = l.lastChar
+ l.peekEOF1 = l.reachedEOF
+ l.lastChar = l.prevChar1
+ l.reachedEOF = l.prevEOF1
+ l.prevChar1 = l.prevChar2
+	l.prevEOF1 = l.prevEOF2
+ l.prevChar2 = nullChar
+	l.prevEOF2 = false
+ return nil
+}
diff --git a/src/urubu/grammar/lexical/parser/parser.go b/src/urubu/grammar/lexical/parser/parser.go
new file mode 100644
index 0000000..425b553
--- /dev/null
+++ b/src/urubu/grammar/lexical/parser/parser.go
@@ -0,0 +1,531 @@
+package parser
+
+import (
+ "bytes"
+ "fmt"
+ "io"
+ "strconv"
+
+ spec "urubu/spec/grammar"
+ "urubu/ucd"
+)
+
+type PatternEntry struct {
+ ID spec.LexModeKindID
+ Pattern []byte
+}
+
+type parser struct {
+ kind spec.LexKindName
+ lex *lexer
+ peekedTok *token
+ lastTok *token
+
+ // If and only if isContributoryPropertyExposed is true, the parser interprets contributory properties that
+ // appear in property expressions.
+ //
+ // The contributory properties are not exposed, and users cannot use those properties because the parser
+ // follows [UAX #44 5.13 Property APIs]. For instance, \p{Other_Alphabetic} is invalid.
+ //
+ // isContributoryPropertyExposed is set to true when the parser is generated recursively. The parser needs to
+ // interpret derived properties internally because the derived properties consist of other properties that
+ // may contain the contributory properties.
+ //
+ // [UAX #44 5.13 Property APIs] says:
+ // > The following subtypes of Unicode character properties should generally not be exposed in APIs,
+ // > except in limited circumstances. They may not be useful, particularly in public API collections,
+ // > and may instead prove misleading to the users of such API collections.
+ // > * Contributory properties are not recommended for public APIs.
+ // > ...
+ // https://unicode.org/reports/tr44/#Property_APIs
+ isContributoryPropertyExposed bool
+
+ errCause error
+ errDetail string
+}
+
+func NewParser(kind spec.LexKindName, src io.Reader) *parser {
+ return &parser{
+ kind: kind,
+ lex: newLexer(src),
+ isContributoryPropertyExposed: false,
+ }
+}
+
+func (p *parser) exposeContributoryProperty() {
+ p.isContributoryPropertyExposed = true
+}
+
+func (p *parser) Error() (string, error) {
+ return p.errDetail, p.errCause
+}
+
+func (p *parser) Parse() (root CPTree, retErr error) {
+ defer func() {
+ err := recover()
+ if err != nil {
+ var ok bool
+ retErr, ok = err.(error)
+ if !ok {
+ panic(err)
+ }
+ return
+ }
+ }()
+
+ return newRootNode(p.kind, p.parseRegexp()), nil
+}
+
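+// The parser is a hand-written recursive descent parser over the following
+// grammar:
+//
+//	regexp -> alt EOF
+//	alt    -> concat ('|' concat)*
+//	concat -> repeat+
+//	repeat -> group ('*' | '+' | '?')?
+//	group  -> '(' alt ')' | single char
+//
+// where a single char is an ordinary character, `.`, a bracket expression, a
+// code point expression, a character property expression, or a fragment.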
+func (p *parser) parseRegexp() CPTree {
+ alt := p.parseAlt()
+ if alt == nil {
+ if p.consume(tokenKindGroupClose) {
+ p.raiseParseError(synErrGroupNoInitiator, "")
+ }
+ p.raiseParseError(synErrNullPattern, "")
+ }
+ if p.consume(tokenKindGroupClose) {
+ p.raiseParseError(synErrGroupNoInitiator, "")
+ }
+ p.expect(tokenKindEOF)
+ return alt
+}
+
+func (p *parser) parseAlt() CPTree {
+ left := p.parseConcat()
+ if left == nil {
+ if p.consume(tokenKindAlt) {
+ p.raiseParseError(synErrAltLackOfOperand, "")
+ }
+ return nil
+ }
+ for {
+ if !p.consume(tokenKindAlt) {
+ break
+ }
+ right := p.parseConcat()
+ if right == nil {
+ p.raiseParseError(synErrAltLackOfOperand, "")
+ }
+ left = newAltNode(left, right)
+ }
+ return left
+}
+
+func (p *parser) parseConcat() CPTree {
+ left := p.parseRepeat()
+ for {
+ right := p.parseRepeat()
+ if right == nil {
+ break
+ }
+ left = newConcatNode(left, right)
+ }
+ return left
+}
+
+func (p *parser) parseRepeat() CPTree {
+ group := p.parseGroup()
+ if group == nil {
+ if p.consume(tokenKindRepeat) {
+ p.raiseParseError(synErrRepNoTarget, "* needs an operand")
+ }
+ if p.consume(tokenKindRepeatOneOrMore) {
+ p.raiseParseError(synErrRepNoTarget, "+ needs an operand")
+ }
+ if p.consume(tokenKindOption) {
+ p.raiseParseError(synErrRepNoTarget, "? needs an operand")
+ }
+ return nil
+ }
+ if p.consume(tokenKindRepeat) {
+ return newRepeatNode(group)
+ }
+ if p.consume(tokenKindRepeatOneOrMore) {
+ return newRepeatOneOrMoreNode(group)
+ }
+ if p.consume(tokenKindOption) {
+ return newOptionNode(group)
+ }
+ return group
+}
+
+func (p *parser) parseGroup() CPTree {
+ if p.consume(tokenKindGroupOpen) {
+ alt := p.parseAlt()
+ if alt == nil {
+ if p.consume(tokenKindEOF) {
+ p.raiseParseError(synErrGroupUnclosed, "")
+ }
+ p.raiseParseError(synErrGroupNoElem, "")
+ }
+ if p.consume(tokenKindEOF) {
+ p.raiseParseError(synErrGroupUnclosed, "")
+ }
+ if !p.consume(tokenKindGroupClose) {
+ p.raiseParseError(synErrGroupInvalidForm, "")
+ }
+ return alt
+ }
+ return p.parseSingleChar()
+}
+
+func (p *parser) parseSingleChar() CPTree {
+ if p.consume(tokenKindAnyChar) {
+ return genAnyCharAST()
+ }
+ if p.consume(tokenKindBExpOpen) {
+ left := p.parseBExpElem()
+ if left == nil {
+ if p.consume(tokenKindEOF) {
+ p.raiseParseError(synErrBExpUnclosed, "")
+ }
+ p.raiseParseError(synErrBExpNoElem, "")
+ }
+ for {
+ right := p.parseBExpElem()
+ if right == nil {
+ break
+ }
+ left = newAltNode(left, right)
+ }
+ if p.consume(tokenKindEOF) {
+ p.raiseParseError(synErrBExpUnclosed, "")
+ }
+ p.expect(tokenKindBExpClose)
+ return left
+ }
+ if p.consume(tokenKindInverseBExpOpen) {
+ elem := p.parseBExpElem()
+ if elem == nil {
+ if p.consume(tokenKindEOF) {
+ p.raiseParseError(synErrBExpUnclosed, "")
+ }
+ p.raiseParseError(synErrBExpNoElem, "")
+ }
+ inverse := exclude(elem, genAnyCharAST())
+ if inverse == nil {
+ p.raiseParseError(synErrUnmatchablePattern, "")
+ }
+ for {
+ elem := p.parseBExpElem()
+ if elem == nil {
+ break
+ }
+ inverse = exclude(elem, inverse)
+ if inverse == nil {
+ p.raiseParseError(synErrUnmatchablePattern, "")
+ }
+ }
+ if p.consume(tokenKindEOF) {
+ p.raiseParseError(synErrBExpUnclosed, "")
+ }
+ p.expect(tokenKindBExpClose)
+ return inverse
+ }
+ if p.consume(tokenKindCodePointLeader) {
+ return p.parseCodePoint()
+ }
+ if p.consume(tokenKindCharPropLeader) {
+ return p.parseCharProp()
+ }
+ if p.consume(tokenKindFragmentLeader) {
+ return p.parseFragment()
+ }
+ c := p.parseNormalChar()
+ if c == nil {
+ if p.consume(tokenKindBExpClose) {
+ p.raiseParseError(synErrBExpInvalidForm, "")
+ }
+ return nil
+ }
+ return c
+}
+
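+// parseBExpElem parses a single element of a bracket expression: a character,
+// a code point expression, a character property expression, or a range such
+// as `a-z`. A character property expression cannot serve as either endpoint
+// of a range.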
+func (p *parser) parseBExpElem() CPTree {
+ var left CPTree
+ switch {
+ case p.consume(tokenKindCodePointLeader):
+ left = p.parseCodePoint()
+ case p.consume(tokenKindCharPropLeader):
+ left = p.parseCharProp()
+ if p.consume(tokenKindCharRange) {
+ p.raiseParseError(synErrRangePropIsUnavailable, "")
+ }
+ default:
+ left = p.parseNormalChar()
+ }
+ if left == nil {
+ return nil
+ }
+ if !p.consume(tokenKindCharRange) {
+ return left
+ }
+ var right CPTree
+ switch {
+ case p.consume(tokenKindCodePointLeader):
+ right = p.parseCodePoint()
+ case p.consume(tokenKindCharPropLeader):
+ p.raiseParseError(synErrRangePropIsUnavailable, "")
+ default:
+ right = p.parseNormalChar()
+ }
+ if right == nil {
+ p.raiseParseError(synErrRangeInvalidForm, "")
+ }
+ from, _, _ := left.Range()
+ _, to, _ := right.Range()
+ if !isValidOrder(from, to) {
+ p.raiseParseError(synErrRangeInvalidOrder, fmt.Sprintf("%X..%X", from, to))
+ }
+ return newRangeSymbolNode(from, to)
+}
+
+func (p *parser) parseCodePoint() CPTree {
+ if !p.consume(tokenKindLBrace) {
+ p.raiseParseError(synErrCPExpInvalidForm, "")
+ }
+ if !p.consume(tokenKindCodePoint) {
+ p.raiseParseError(synErrCPExpInvalidForm, "")
+ }
+
+ n, err := strconv.ParseInt(p.lastTok.codePoint, 16, 64)
+ if err != nil {
+ panic(fmt.Errorf("failed to decode a code point (%v) into a int: %v", p.lastTok.codePoint, err))
+ }
+ if n < 0x0000 || n > 0x10FFFF {
+ p.raiseParseError(synErrCPExpOutOfRange, "")
+ }
+
+ sym := newSymbolNode(rune(n))
+
+ if !p.consume(tokenKindRBrace) {
+ p.raiseParseError(synErrCPExpInvalidForm, "")
+ }
+
+ return sym
+}
+
+func (p *parser) parseCharProp() CPTree {
+ if !p.consume(tokenKindLBrace) {
+ p.raiseParseError(synErrCharPropExpInvalidForm, "")
+ }
+ var sym1, sym2 string
+ if !p.consume(tokenKindCharPropSymbol) {
+ p.raiseParseError(synErrCharPropExpInvalidForm, "")
+ }
+ sym1 = p.lastTok.propSymbol
+ if p.consume(tokenKindEqual) {
+ if !p.consume(tokenKindCharPropSymbol) {
+ p.raiseParseError(synErrCharPropExpInvalidForm, "")
+ }
+ sym2 = p.lastTok.propSymbol
+ }
+
+ var alt CPTree
+ var propName, propVal string
+ if sym2 != "" {
+ propName = sym1
+ propVal = sym2
+ } else {
+ propName = ""
+ propVal = sym1
+ }
+ if !p.isContributoryPropertyExposed && ucd.IsContributoryProperty(propName) {
+ p.raiseParseError(synErrCharPropUnsupported, propName)
+ }
+ pat, err := ucd.NormalizeCharacterProperty(propName, propVal)
+ if err != nil {
+ p.raiseParseError(synErrCharPropUnsupported, err.Error())
+ }
+ if pat != "" {
+ p := NewParser(p.kind, bytes.NewReader([]byte(pat)))
+ p.exposeContributoryProperty()
+ ast, err := p.Parse()
+ if err != nil {
+ panic(err)
+ }
+ alt = ast
+ } else {
+ cpRanges, inverse, err := ucd.FindCodePointRanges(propName, propVal)
+ if err != nil {
+ p.raiseParseError(synErrCharPropUnsupported, err.Error())
+ }
+ if inverse {
+ r := cpRanges[0]
+ alt = exclude(newRangeSymbolNode(r.From, r.To), genAnyCharAST())
+ if alt == nil {
+ p.raiseParseError(synErrUnmatchablePattern, "")
+ }
+ for _, r := range cpRanges[1:] {
+ alt = exclude(newRangeSymbolNode(r.From, r.To), alt)
+ if alt == nil {
+ p.raiseParseError(synErrUnmatchablePattern, "")
+ }
+ }
+ } else {
+ for _, r := range cpRanges {
+ alt = genAltNode(
+ alt,
+ newRangeSymbolNode(r.From, r.To),
+ )
+ }
+ }
+ }
+
+ if !p.consume(tokenKindRBrace) {
+ p.raiseParseError(synErrCharPropExpInvalidForm, "")
+ }
+
+ return alt
+}
+
+func (p *parser) parseFragment() CPTree {
+ if !p.consume(tokenKindLBrace) {
+ p.raiseParseError(synErrFragmentExpInvalidForm, "")
+ }
+ if !p.consume(tokenKindFragmentSymbol) {
+ p.raiseParseError(synErrFragmentExpInvalidForm, "")
+ }
+ sym := p.lastTok.fragmentSymbol
+
+ if !p.consume(tokenKindRBrace) {
+ p.raiseParseError(synErrFragmentExpInvalidForm, "")
+ }
+
+ return newFragmentNode(spec.LexKindName(sym), nil)
+}
+
+func (p *parser) parseNormalChar() CPTree {
+ if !p.consume(tokenKindChar) {
+ return nil
+ }
+ return newSymbolNode(p.lastTok.char)
+}
+
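+// exclude subtracts the code points covered by symbol from base, recursing
+// through alternations and splitting ranges at the overlap. For example,
+// excluding `b` from the range `a`..`z` yields the alternation of `a`..`a`
+// and `c`..`z`; excluding a superset of base yields nil, which callers report
+// as an unmatchable pattern.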
+func exclude(symbol, base CPTree) CPTree {
+ if left, right, ok := symbol.Alternatives(); ok {
+ return exclude(right, exclude(left, base))
+ }
+
+ if left, right, ok := base.Alternatives(); ok {
+ return genAltNode(
+ exclude(symbol, left),
+ exclude(symbol, right),
+ )
+ }
+
+ if bFrom, bTo, ok := base.Range(); ok {
+ sFrom, sTo, ok := symbol.Range()
+ if !ok {
+ panic(fmt.Errorf("invalid symbol tree: %T", symbol))
+ }
+
+ switch {
+ case sFrom > bFrom && sTo < bTo:
+ return genAltNode(
+ newRangeSymbolNode(bFrom, sFrom-1),
+ newRangeSymbolNode(sTo+1, bTo),
+ )
+ case sFrom <= bFrom && sTo >= bFrom && sTo < bTo:
+ return newRangeSymbolNode(sTo+1, bTo)
+ case sFrom > bFrom && sFrom <= bTo && sTo >= bTo:
+ return newRangeSymbolNode(bFrom, sFrom-1)
+ case sFrom <= bFrom && sTo >= bTo:
+ return nil
+ default:
+ return base
+ }
+ }
+
+ panic(fmt.Errorf("invalid base tree: %T", base))
+}
+
+func genAnyCharAST() CPTree {
+ return newRangeSymbolNode(0x0, 0x10FFFF)
+}
+
+func isValidOrder(from, to rune) bool {
+ return from <= to
+}
+
+func genConcatNode(cs ...CPTree) CPTree {
+ nonNilNodes := []CPTree{}
+ for _, c := range cs {
+ if c == nil {
+ continue
+ }
+ nonNilNodes = append(nonNilNodes, c)
+ }
+ if len(nonNilNodes) <= 0 {
+ return nil
+ }
+ if len(nonNilNodes) == 1 {
+ return nonNilNodes[0]
+ }
+ concat := newConcatNode(nonNilNodes[0], nonNilNodes[1])
+ for _, c := range nonNilNodes[2:] {
+ concat = newConcatNode(concat, c)
+ }
+ return concat
+}
+
+func genAltNode(cs ...CPTree) CPTree {
+ nonNilNodes := []CPTree{}
+ for _, c := range cs {
+ if c == nil {
+ continue
+ }
+ nonNilNodes = append(nonNilNodes, c)
+ }
+ if len(nonNilNodes) <= 0 {
+ return nil
+ }
+ if len(nonNilNodes) == 1 {
+ return nonNilNodes[0]
+ }
+ alt := newAltNode(nonNilNodes[0], nonNilNodes[1])
+ for _, c := range nonNilNodes[2:] {
+ alt = newAltNode(alt, c)
+ }
+ return alt
+}
+
+func (p *parser) expect(expected tokenKind) {
+ if !p.consume(expected) {
+ tok := p.peekedTok
+ p.raiseParseError(synErrUnexpectedToken, fmt.Sprintf("expected: %v, actual: %v", expected, tok.kind))
+ }
+}
+
+func (p *parser) consume(expected tokenKind) bool {
+ var tok *token
+ var err error
+ if p.peekedTok != nil {
+ tok = p.peekedTok
+ p.peekedTok = nil
+ } else {
+ tok, err = p.lex.next()
+ if err != nil {
+ if err == ParseErr {
+ detail, cause := p.lex.error()
+ p.raiseParseError(cause, detail)
+ }
+ panic(err)
+ }
+ }
+ p.lastTok = tok
+ if tok.kind == expected {
+ return true
+ }
+ p.peekedTok = tok
+ p.lastTok = nil
+
+ return false
+}
+
+func (p *parser) raiseParseError(err error, detail string) {
+ p.errCause = err
+ p.errDetail = detail
+ panic(ParseErr)
+}
diff --git a/src/urubu/grammar/lexical/parser/tree.go b/src/urubu/grammar/lexical/parser/tree.go
new file mode 100644
index 0000000..df03d37
--- /dev/null
+++ b/src/urubu/grammar/lexical/parser/tree.go
@@ -0,0 +1,459 @@
+package parser
+
+import (
+ "fmt"
+ "io"
+ "sort"
+
+ spec "urubu/spec/grammar"
+)
+
+type CPRange struct {
+ From rune
+ To rune
+}
+
+type CPTree interface {
+ fmt.Stringer
+ Range() (rune, rune, bool)
+ Optional() (CPTree, bool)
+ Repeatable() (CPTree, bool)
+ Concatenation() (CPTree, CPTree, bool)
+ Alternatives() (CPTree, CPTree, bool)
+ Describe() (spec.LexKindName, []spec.LexKindName, error)
+
+ children() (CPTree, CPTree)
+ clone() CPTree
+}
+
+var (
+ _ CPTree = &rootNode{}
+ _ CPTree = &symbolNode{}
+ _ CPTree = &concatNode{}
+ _ CPTree = &altNode{}
+ _ CPTree = &quantifierNode{}
+ _ CPTree = &fragmentNode{}
+)
+
+type rootNode struct {
+ kind spec.LexKindName
+ tree CPTree
+ fragments map[spec.LexKindName][]*fragmentNode
+}
+
+func newRootNode(kind spec.LexKindName, t CPTree) *rootNode {
+ fragments := map[spec.LexKindName][]*fragmentNode{}
+ collectFragments(t, fragments)
+
+ return &rootNode{
+ kind: kind,
+ tree: t,
+ fragments: fragments,
+ }
+}
+
+func collectFragments(n CPTree, fragments map[spec.LexKindName][]*fragmentNode) {
+ if n == nil {
+ return
+ }
+
+ if f, ok := n.(*fragmentNode); ok {
+ fragments[f.kind] = append(fragments[f.kind], f)
+ return
+ }
+
+ l, r := n.children()
+ collectFragments(l, fragments)
+ collectFragments(r, fragments)
+}
+
+func (n *rootNode) String() string {
+ return fmt.Sprintf("root: %v: %v fragments", n.kind, len(n.fragments))
+}
+
+func (n *rootNode) Range() (rune, rune, bool) {
+ return n.tree.Range()
+}
+
+func (n *rootNode) Optional() (CPTree, bool) {
+ return n.tree.Optional()
+}
+
+func (n *rootNode) Repeatable() (CPTree, bool) {
+ return n.tree.Repeatable()
+}
+
+func (n *rootNode) Concatenation() (CPTree, CPTree, bool) {
+ return n.tree.Concatenation()
+}
+
+func (n *rootNode) Alternatives() (CPTree, CPTree, bool) {
+ return n.tree.Alternatives()
+}
+
+func (n *rootNode) Describe() (spec.LexKindName, []spec.LexKindName, error) {
+ var frags []spec.LexKindName
+ for f := range n.fragments {
+ frags = append(frags, spec.LexKindName(f))
+ }
+ sort.Slice(frags, func(i, j int) bool {
+ return frags[i] < frags[j]
+ })
+
+ return n.kind, frags, nil
+}
+
+func (n *rootNode) children() (CPTree, CPTree) {
+ return n.tree.children()
+}
+
+func (n *rootNode) clone() CPTree {
+ return n.tree.clone()
+}
+
+func (n *rootNode) incomplete() bool {
+ return len(n.fragments) > 0
+}
+
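+// applyFragment substitutes a resolved fragment tree for every placeholder of
+// the given kind. Each placeholder receives its own clone of the fragment's
+// tree, so occurrences stay independent of each other.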
+func (n *rootNode) applyFragment(kind spec.LexKindName, fragment CPTree) error {
+ root, ok := fragment.(*rootNode)
+ if !ok {
+ return fmt.Errorf("applyFragment can take only *rootNode: %T", fragment)
+ }
+ if root.incomplete() {
+ return fmt.Errorf("fragment is incomplete")
+ }
+
+ fs, ok := n.fragments[kind]
+ if !ok {
+ return nil
+ }
+ for _, f := range fs {
+ f.tree = root.clone()
+ }
+ delete(n.fragments, kind)
+
+ return nil
+}
+
+type symbolNode struct {
+ CPRange
+}
+
+func newSymbolNode(cp rune) *symbolNode {
+ return &symbolNode{
+ CPRange: CPRange{
+ From: cp,
+ To: cp,
+ },
+ }
+}
+
+func newRangeSymbolNode(from, to rune) *symbolNode {
+ return &symbolNode{
+ CPRange: CPRange{
+ From: from,
+ To: to,
+ },
+ }
+}
+
+func (n *symbolNode) String() string {
+ return fmt.Sprintf("symbol: %X..%X", n.From, n.To)
+}
+
+func (n *symbolNode) Range() (rune, rune, bool) {
+ return n.From, n.To, true
+}
+
+func (n *symbolNode) Optional() (CPTree, bool) {
+ return nil, false
+}
+
+func (n *symbolNode) Repeatable() (CPTree, bool) {
+ return nil, false
+}
+
+func (n *symbolNode) Concatenation() (CPTree, CPTree, bool) {
+ return nil, nil, false
+}
+
+func (n *symbolNode) Alternatives() (CPTree, CPTree, bool) {
+ return nil, nil, false
+}
+
+func (n *symbolNode) Describe() (spec.LexKindName, []spec.LexKindName, error) {
+ return spec.LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n)
+}
+
+func (n *symbolNode) children() (CPTree, CPTree) {
+ return nil, nil
+}
+
+func (n *symbolNode) clone() CPTree {
+ return newRangeSymbolNode(n.From, n.To)
+}
+
+type concatNode struct {
+ left CPTree
+ right CPTree
+}
+
+func newConcatNode(left, right CPTree) *concatNode {
+ return &concatNode{
+ left: left,
+ right: right,
+ }
+}
+
+func (n *concatNode) String() string {
+ return "concat"
+}
+
+func (n *concatNode) Range() (rune, rune, bool) {
+ return 0, 0, false
+}
+
+func (n *concatNode) Optional() (CPTree, bool) {
+ return nil, false
+}
+
+func (n *concatNode) Repeatable() (CPTree, bool) {
+ return nil, false
+}
+
+func (n *concatNode) Concatenation() (CPTree, CPTree, bool) {
+ return n.left, n.right, true
+}
+
+func (n *concatNode) Alternatives() (CPTree, CPTree, bool) {
+ return nil, nil, false
+}
+
+func (n *concatNode) Describe() (spec.LexKindName, []spec.LexKindName, error) {
+ return spec.LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n)
+}
+
+func (n *concatNode) children() (CPTree, CPTree) {
+ return n.left, n.right
+}
+
+func (n *concatNode) clone() CPTree {
+ if n == nil {
+ return nil
+ }
+ return newConcatNode(n.left.clone(), n.right.clone())
+}
+
+type altNode struct {
+ left CPTree
+ right CPTree
+}
+
+func newAltNode(left, right CPTree) *altNode {
+ return &altNode{
+ left: left,
+ right: right,
+ }
+}
+
+func (n *altNode) String() string {
+ return "alt"
+}
+
+func (n *altNode) Range() (rune, rune, bool) {
+ return 0, 0, false
+}
+
+func (n *altNode) Optional() (CPTree, bool) {
+ return nil, false
+}
+
+func (n *altNode) Repeatable() (CPTree, bool) {
+ return nil, false
+}
+
+func (n *altNode) Concatenation() (CPTree, CPTree, bool) {
+ return nil, nil, false
+}
+
+func (n *altNode) Alternatives() (CPTree, CPTree, bool) {
+ return n.left, n.right, true
+}
+
+func (n *altNode) Describe() (spec.LexKindName, []spec.LexKindName, error) {
+ return spec.LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n)
+}
+
+func (n *altNode) children() (CPTree, CPTree) {
+ return n.left, n.right
+}
+
+func (n *altNode) clone() CPTree {
+ return newAltNode(n.left.clone(), n.right.clone())
+}
+
+type quantifierNode struct {
+ optional bool
+ repeatable bool
+ tree CPTree
+}
+
+func (n *quantifierNode) String() string {
+ switch {
+ case n.repeatable:
+ return "repeatable (>= 0 times)"
+ case n.optional:
+ return "optional (0 or 1 times)"
+ default:
+ return "invalid quantifier"
+ }
+}
+
+func newRepeatNode(t CPTree) *quantifierNode {
+ return &quantifierNode{
+ repeatable: true,
+ tree: t,
+ }
+}
+
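+// newRepeatOneOrMoreNode desugars `t+` into the equivalent `t t*`: the
+// original tree followed by a repeatable clone of it.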
+func newRepeatOneOrMoreNode(t CPTree) *concatNode {
+ return newConcatNode(
+ t,
+ &quantifierNode{
+ repeatable: true,
+ tree: t.clone(),
+ })
+}
+
+func newOptionNode(t CPTree) *quantifierNode {
+ return &quantifierNode{
+ optional: true,
+ tree: t,
+ }
+}
+
+func (n *quantifierNode) Range() (rune, rune, bool) {
+ return 0, 0, false
+}
+
+func (n *quantifierNode) Optional() (CPTree, bool) {
+ return n.tree, n.optional
+}
+
+func (n *quantifierNode) Repeatable() (CPTree, bool) {
+ return n.tree, n.repeatable
+}
+
+func (n *quantifierNode) Concatenation() (CPTree, CPTree, bool) {
+ return nil, nil, false
+}
+
+func (n *quantifierNode) Alternatives() (CPTree, CPTree, bool) {
+ return nil, nil, false
+}
+
+func (n *quantifierNode) Describe() (spec.LexKindName, []spec.LexKindName, error) {
+ return spec.LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n)
+}
+
+func (n *quantifierNode) children() (CPTree, CPTree) {
+ return n.tree, nil
+}
+
+func (n *quantifierNode) clone() CPTree {
+ if n.repeatable {
+ return newRepeatNode(n.tree.clone())
+ }
+ return newOptionNode(n.tree.clone())
+}
+
+type fragmentNode struct {
+ kind spec.LexKindName
+ tree CPTree
+}
+
+func newFragmentNode(kind spec.LexKindName, t CPTree) *fragmentNode {
+ return &fragmentNode{
+ kind: kind,
+ tree: t,
+ }
+}
+
+func (n *fragmentNode) String() string {
+ return fmt.Sprintf("fragment: %v", n.kind)
+}
+
+func (n *fragmentNode) Range() (rune, rune, bool) {
+ return n.tree.Range()
+}
+
+func (n *fragmentNode) Optional() (CPTree, bool) {
+ return n.tree.Optional()
+}
+
+func (n *fragmentNode) Repeatable() (CPTree, bool) {
+ return n.tree.Repeatable()
+}
+
+func (n *fragmentNode) Concatenation() (CPTree, CPTree, bool) {
+ return n.tree.Concatenation()
+}
+
+func (n *fragmentNode) Alternatives() (CPTree, CPTree, bool) {
+ return n.tree.Alternatives()
+}
+
+func (n *fragmentNode) Describe() (spec.LexKindName, []spec.LexKindName, error) {
+ return spec.LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n)
+}
+
+func (n *fragmentNode) children() (CPTree, CPTree) {
+ return n.tree.children()
+}
+
+func (n *fragmentNode) clone() CPTree {
+ if n.tree == nil {
+ return newFragmentNode(n.kind, nil)
+ }
+ return newFragmentNode(n.kind, n.tree.clone())
+}
+
+//nolint:unused
+func printCPTree(w io.Writer, t CPTree, ruledLine string, childRuledLinePrefix string) {
+ if t == nil {
+ return
+ }
+ fmt.Fprintf(w, "%v%v\n", ruledLine, t)
+ children := []CPTree{}
+ switch n := t.(type) {
+ case *rootNode:
+ children = append(children, n.tree)
+ case *fragmentNode:
+ children = append(children, n.tree)
+ default:
+ left, right := t.children()
+ if left != nil {
+ children = append(children, left)
+ }
+ if right != nil {
+ children = append(children, right)
+ }
+ }
+ num := len(children)
+ for i, child := range children {
+ line := "└─ "
+ if num > 1 {
+ if i == 0 {
+ line = "├─ "
+ } else if i < num-1 {
+ line = "│ "
+ }
+ }
+ prefix := "│ "
+ if i >= num-1 {
+ prefix = " "
+ }
+ printCPTree(w, child, childRuledLinePrefix+line, childRuledLinePrefix+prefix)
+ }
+}
diff --git a/src/urubu/grammar/lr0.go b/src/urubu/grammar/lr0.go
new file mode 100644
index 0000000..92a2137
--- /dev/null
+++ b/src/urubu/grammar/lr0.go
@@ -0,0 +1,197 @@
+package grammar
+
+import (
+ "fmt"
+ "sort"
+
+ "urubu/grammar/symbol"
+)
+
+type lr0Automaton struct {
+ initialState kernelID
+ states map[kernelID]*lrState
+}
+
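+// genLR0Automaton builds the canonical LR(0) collection using a worklist:
+// starting from the kernel of the augmented start production, it turns each
+// unchecked kernel into a state and enqueues the newly discovered neighbour
+// kernels until none remain.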
+func genLR0Automaton(prods *productionSet, startSym symbol.Symbol, errSym symbol.Symbol) (*lr0Automaton, error) {
+ if !startSym.IsStart() {
+ return nil, fmt.Errorf("passed symbold is not a start symbol")
+ }
+
+ automaton := &lr0Automaton{
+ states: map[kernelID]*lrState{},
+ }
+
+ currentState := stateNumInitial
+ knownKernels := map[kernelID]struct{}{}
+ uncheckedKernels := []*kernel{}
+
+ // Generate an initial kernel.
+ {
+ prods, _ := prods.findByLHS(startSym)
+ initialItem, err := newLR0Item(prods[0], 0)
+ if err != nil {
+ return nil, err
+ }
+
+ k, err := newKernel([]*lrItem{initialItem})
+ if err != nil {
+ return nil, err
+ }
+
+ automaton.initialState = k.id
+ knownKernels[k.id] = struct{}{}
+ uncheckedKernels = append(uncheckedKernels, k)
+ }
+
+ for len(uncheckedKernels) > 0 {
+ nextUncheckedKernels := []*kernel{}
+ for _, k := range uncheckedKernels {
+ state, neighbours, err := genStateAndNeighbourKernels(k, prods, errSym)
+ if err != nil {
+ return nil, err
+ }
+ state.num = currentState
+ currentState = currentState.next()
+
+ automaton.states[state.id] = state
+
+ for _, k := range neighbours {
+ if _, known := knownKernels[k.id]; known {
+ continue
+ }
+ knownKernels[k.id] = struct{}{}
+ nextUncheckedKernels = append(nextUncheckedKernels, k)
+ }
+ }
+ uncheckedKernels = nextUncheckedKernels
+ }
+
+ return automaton, nil
+}
+
+func genStateAndNeighbourKernels(k *kernel, prods *productionSet, errSym symbol.Symbol) (*lrState, []*kernel, error) {
+ items, err := genLR0Closure(k, prods)
+ if err != nil {
+ return nil, nil, err
+ }
+ neighbours, err := genNeighbourKernels(items, prods)
+ if err != nil {
+ return nil, nil, err
+ }
+
+ next := map[symbol.Symbol]kernelID{}
+ kernels := []*kernel{}
+ for _, n := range neighbours {
+ next[n.symbol] = n.kernel.id
+ kernels = append(kernels, n.kernel)
+ }
+
+ reducible := map[productionID]struct{}{}
+ var emptyProdItems []*lrItem
+ isErrorTrapper := false
+ for _, item := range items {
+ if item.dottedSymbol == errSym {
+ isErrorTrapper = true
+ }
+
+ if item.reducible {
+ reducible[item.prod] = struct{}{}
+
+ prod, ok := prods.findByID(item.prod)
+ if !ok {
+ return nil, nil, fmt.Errorf("reducible production not found: %v", item.prod)
+ }
+ if prod.isEmpty() {
+ emptyProdItems = append(emptyProdItems, item)
+ }
+ }
+ }
+
+ return &lrState{
+ kernel: k,
+ next: next,
+ reducible: reducible,
+ emptyProdItems: emptyProdItems,
+ isErrorTrapper: isErrorTrapper,
+ }, kernels, nil
+}
+
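+// genLR0Closure computes the closure of a kernel: for every item whose dot
+// precedes a non-terminal, an item with the dot at position 0 is added for
+// each production of that non-terminal, repeating until no new item appears.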
+func genLR0Closure(k *kernel, prods *productionSet) ([]*lrItem, error) {
+ items := []*lrItem{}
+ knownItems := map[lrItemID]struct{}{}
+ uncheckedItems := []*lrItem{}
+ for _, item := range k.items {
+ items = append(items, item)
+ uncheckedItems = append(uncheckedItems, item)
+ }
+ for len(uncheckedItems) > 0 {
+ nextUncheckedItems := []*lrItem{}
+ for _, item := range uncheckedItems {
+ if item.dottedSymbol.IsTerminal() {
+ continue
+ }
+
+ ps, _ := prods.findByLHS(item.dottedSymbol)
+ for _, prod := range ps {
+ item, err := newLR0Item(prod, 0)
+ if err != nil {
+ return nil, err
+ }
+ if _, exist := knownItems[item.id]; exist {
+ continue
+ }
+ items = append(items, item)
+ knownItems[item.id] = struct{}{}
+ nextUncheckedItems = append(nextUncheckedItems, item)
+ }
+ }
+ uncheckedItems = nextUncheckedItems
+ }
+
+ return items, nil
+}
+
+type neighbourKernel struct {
+ symbol symbol.Symbol
+ kernel *kernel
+}
+
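+// genNeighbourKernels computes the GOTO targets: items are grouped by their
+// dotted symbol, the dot is advanced past that symbol, and each group becomes
+// the kernel of a neighbouring state. The symbols are sorted to keep state
+// generation deterministic.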
+func genNeighbourKernels(items []*lrItem, prods *productionSet) ([]*neighbourKernel, error) {
+ kItemMap := map[symbol.Symbol][]*lrItem{}
+ for _, item := range items {
+ if item.dottedSymbol.IsNil() {
+ continue
+ }
+ prod, ok := prods.findByID(item.prod)
+ if !ok {
+ return nil, fmt.Errorf("a production was not found: %v", item.prod)
+ }
+ kItem, err := newLR0Item(prod, item.dot+1)
+ if err != nil {
+ return nil, err
+ }
+ kItemMap[item.dottedSymbol] = append(kItemMap[item.dottedSymbol], kItem)
+ }
+
+ nextSyms := []symbol.Symbol{}
+ for sym := range kItemMap {
+ nextSyms = append(nextSyms, sym)
+ }
+ sort.Slice(nextSyms, func(i, j int) bool {
+ return nextSyms[i] < nextSyms[j]
+ })
+
+ kernels := []*neighbourKernel{}
+ for _, sym := range nextSyms {
+ k, err := newKernel(kItemMap[sym])
+ if err != nil {
+ return nil, err
+ }
+ kernels = append(kernels, &neighbourKernel{
+ symbol: sym,
+ kernel: k,
+ })
+ }
+
+ return kernels, nil
+}
diff --git a/src/urubu/grammar/parsing_table.go b/src/urubu/grammar/parsing_table.go
new file mode 100644
index 0000000..48ea9fe
--- /dev/null
+++ b/src/urubu/grammar/parsing_table.go
@@ -0,0 +1,553 @@
+package grammar
+
+import (
+ "fmt"
+ "sort"
+
+ "urubu/grammar/symbol"
+ spec "urubu/spec/grammar"
+)
+
+type ActionType string
+
+const (
+ ActionTypeShift = ActionType("shift")
+ ActionTypeReduce = ActionType("reduce")
+ ActionTypeError = ActionType("error")
+)
+
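+// actionEntry packs an action into a single integer: a shift to state n is
+// stored as -n, a reduce by production p as +p, and 0 is reserved for an
+// empty (error) entry. The initial state is never the target of a shift, so
+// the two encodings cannot collide.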
+type actionEntry int
+
+const actionEntryEmpty = actionEntry(0)
+
+func newShiftActionEntry(state stateNum) actionEntry {
+ return actionEntry(state * -1)
+}
+
+func newReduceActionEntry(prod productionNum) actionEntry {
+ return actionEntry(prod)
+}
+
+func (e actionEntry) isEmpty() bool {
+ return e == actionEntryEmpty
+}
+
+func (e actionEntry) describe() (ActionType, stateNum, productionNum) {
+ if e == actionEntryEmpty {
+ return ActionTypeError, stateNumInitial, productionNumNil
+ }
+ if e < 0 {
+ return ActionTypeShift, stateNum(e * -1), productionNumNil
+ }
+ return ActionTypeReduce, stateNumInitial, productionNum(e)
+}
+
+type GoToType string
+
+const (
+ GoToTypeRegistered = GoToType("registered")
+ GoToTypeError = GoToType("error")
+)
+
+type goToEntry uint
+
+const goToEntryEmpty = goToEntry(0)
+
+func newGoToEntry(state stateNum) goToEntry {
+ return goToEntry(state)
+}
+
+func (e goToEntry) describe() (GoToType, stateNum) {
+ if e == goToEntryEmpty {
+ return GoToTypeError, stateNumInitial
+ }
+ return GoToTypeRegistered, stateNum(e)
+}
+
+type conflictResolutionMethod int
+
+func (m conflictResolutionMethod) Int() int {
+ return int(m)
+}
+
+const (
+ ResolvedByPrec conflictResolutionMethod = 1
+ ResolvedByAssoc conflictResolutionMethod = 2
+ ResolvedByShift conflictResolutionMethod = 3
+ ResolvedByProdOrder conflictResolutionMethod = 4
+)
+
+type conflict interface {
+ conflict()
+}
+
+type shiftReduceConflict struct {
+ state stateNum
+ sym symbol.Symbol
+ nextState stateNum
+ prodNum productionNum
+ resolvedBy conflictResolutionMethod
+}
+
+func (c *shiftReduceConflict) conflict() {
+}
+
+type reduceReduceConflict struct {
+ state stateNum
+ sym symbol.Symbol
+ prodNum1 productionNum
+ prodNum2 productionNum
+ resolvedBy conflictResolutionMethod
+}
+
+func (c *reduceReduceConflict) conflict() {
+}
+
+var (
+ _ conflict = &shiftReduceConflict{}
+ _ conflict = &reduceReduceConflict{}
+)
+
+type ParsingTable struct {
+ actionTable []actionEntry
+ goToTable []goToEntry
+ stateCount int
+ terminalCount int
+ nonTerminalCount int
+
+ // errorTrapperStates is indexed by state number; when `errorTrapperStates[stateNum]` is `1`,
+ // the state has an item of the following form, where `α` and `β` may be empty.
+ //
+ // A → α・error β
+ errorTrapperStates []int
+
+ InitialState stateNum
+}
+
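+// The action and goto tables are flat, row-major arrays indexed by
+// state*columnCount+symbol, where the column count is the number of
+// terminals or non-terminals, respectively.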
+func (t *ParsingTable) getAction(state stateNum, sym symbol.SymbolNum) (ActionType, stateNum, productionNum) {
+ pos := state.Int()*t.terminalCount + sym.Int()
+ return t.actionTable[pos].describe()
+}
+
+func (t *ParsingTable) getGoTo(state stateNum, sym symbol.SymbolNum) (GoToType, stateNum) {
+ pos := state.Int()*t.nonTerminalCount + sym.Int()
+ return t.goToTable[pos].describe()
+}
+
+func (t *ParsingTable) readAction(row int, col int) actionEntry {
+ return t.actionTable[row*t.terminalCount+col]
+}
+
+func (t *ParsingTable) writeAction(row int, col int, act actionEntry) {
+ t.actionTable[row*t.terminalCount+col] = act
+}
+
+func (t *ParsingTable) writeGoTo(state stateNum, sym symbol.Symbol, nextState stateNum) {
+ pos := state.Int()*t.nonTerminalCount + sym.Num().Int()
+ t.goToTable[pos] = newGoToEntry(nextState)
+}
+
+type lrTableBuilder struct {
+ automaton *lr0Automaton
+ prods *productionSet
+ termCount int
+ nonTermCount int
+ symTab *symbol.SymbolTableReader
+ precAndAssoc *precAndAssoc
+
+ conflicts []conflict
+}
+
+func (b *lrTableBuilder) build() (*ParsingTable, error) {
+ var ptab *ParsingTable
+ {
+ initialState := b.automaton.states[b.automaton.initialState]
+ ptab = &ParsingTable{
+ actionTable: make([]actionEntry, len(b.automaton.states)*b.termCount),
+ goToTable: make([]goToEntry, len(b.automaton.states)*b.nonTermCount),
+ stateCount: len(b.automaton.states),
+ terminalCount: b.termCount,
+ nonTerminalCount: b.nonTermCount,
+ errorTrapperStates: make([]int, len(b.automaton.states)),
+ InitialState: initialState.num,
+ }
+ }
+
+ for _, state := range b.automaton.states {
+ if state.isErrorTrapper {
+ ptab.errorTrapperStates[state.num] = 1
+ }
+
+ for sym, kID := range state.next {
+ nextState := b.automaton.states[kID]
+ if sym.IsTerminal() {
+ b.writeShiftAction(ptab, state.num, sym, nextState.num)
+ } else {
+ ptab.writeGoTo(state.num, sym, nextState.num)
+ }
+ }
+
+ for prodID := range state.reducible {
+ reducibleProd, ok := b.prods.findByID(prodID)
+ if !ok {
+ return nil, fmt.Errorf("reducible production not found: %v", prodID)
+ }
+
+ var reducibleItem *lrItem
+ for _, item := range state.items {
+ if item.prod != reducibleProd.id {
+ continue
+ }
+
+ reducibleItem = item
+ break
+ }
+ if reducibleItem == nil {
+ for _, item := range state.emptyProdItems {
+ if item.prod != reducibleProd.id {
+ continue
+ }
+
+ reducibleItem = item
+ break
+ }
+ if reducibleItem == nil {
+ return nil, fmt.Errorf("reducible item not found; state: %v, production: %v", state.num, reducibleProd.num)
+ }
+ }
+
+ for a := range reducibleItem.lookAhead.symbols {
+ b.writeReduceAction(ptab, state.num, a, reducibleProd.num)
+ }
+ }
+ }
+
+ return ptab, nil
+}
+
+// writeShiftAction writes a shift action to the parsing table. A shift/reduce conflict is resolved using
+// precedence and associativity; when neither applies, we prioritize the shift action.
+func (b *lrTableBuilder) writeShiftAction(tab *ParsingTable, state stateNum, sym symbol.Symbol, nextState stateNum) {
+ act := tab.readAction(state.Int(), sym.Num().Int())
+ if !act.isEmpty() {
+ ty, _, p := act.describe()
+ if ty == ActionTypeReduce {
+ act, method := b.resolveSRConflict(sym.Num(), p)
+ b.conflicts = append(b.conflicts, &shiftReduceConflict{
+ state: state,
+ sym: sym,
+ nextState: nextState,
+ prodNum: p,
+ resolvedBy: method,
+ })
+ if act == ActionTypeShift {
+ tab.writeAction(state.Int(), sym.Num().Int(), newShiftActionEntry(nextState))
+ }
+ return
+ }
+ }
+ tab.writeAction(state.Int(), sym.Num().Int(), newShiftActionEntry(nextState))
+}
+
+// writeReduceAction writes a reduce action to the parsing table. A shift/reduce conflict is resolved in
+// the same way as in writeShiftAction. On a reduce/reduce conflict, we prioritize the action that reduces
+// the production with the higher priority; productions defined earlier in the grammar file have a higher
+// priority.
+func (b *lrTableBuilder) writeReduceAction(tab *ParsingTable, state stateNum, sym symbol.Symbol, prod productionNum) {
+ act := tab.readAction(state.Int(), sym.Num().Int())
+ if !act.isEmpty() {
+ ty, s, p := act.describe()
+ switch ty {
+ case ActionTypeReduce:
+ if p == prod {
+ return
+ }
+
+ b.conflicts = append(b.conflicts, &reduceReduceConflict{
+ state: state,
+ sym: sym,
+ prodNum1: p,
+ prodNum2: prod,
+ resolvedBy: ResolvedByProdOrder,
+ })
+ if p < prod {
+ tab.writeAction(state.Int(), sym.Num().Int(), newReduceActionEntry(p))
+ } else {
+ tab.writeAction(state.Int(), sym.Num().Int(), newReduceActionEntry(prod))
+ }
+ case ActionTypeShift:
+ act, method := b.resolveSRConflict(sym.Num(), prod)
+ b.conflicts = append(b.conflicts, &shiftReduceConflict{
+ state: state,
+ sym: sym,
+ nextState: s,
+ prodNum: prod,
+ resolvedBy: method,
+ })
+ if act == ActionTypeReduce {
+ tab.writeAction(state.Int(), sym.Num().Int(), newReduceActionEntry(prod))
+ }
+ }
+ return
+ }
+ tab.writeAction(state.Int(), sym.Num().Int(), newReduceActionEntry(prod))
+}
+
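+// resolveSRConflict resolves a shift/reduce conflict: shift wins outright
+// when either the terminal or the production lacks a precedence; on equal
+// precedence, associativity decides (left-associative picks reduce, anything
+// else picks shift); otherwise the comparison of the two precedence values
+// decides.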
+func (b *lrTableBuilder) resolveSRConflict(sym symbol.SymbolNum, prod productionNum) (ActionType, conflictResolutionMethod) {
+ symPrec := b.precAndAssoc.terminalPrecedence(sym)
+ prodPrec := b.precAndAssoc.productionPredence(prod)
+ if symPrec == 0 || prodPrec == 0 {
+ return ActionTypeShift, ResolvedByShift
+ }
+ if symPrec == prodPrec {
+ assoc := b.precAndAssoc.productionAssociativity(prod)
+ if assoc != assocTypeLeft {
+ return ActionTypeShift, ResolvedByAssoc
+ }
+ return ActionTypeReduce, ResolvedByAssoc
+ }
+ if symPrec < prodPrec {
+ return ActionTypeShift, ResolvedByPrec
+ }
+ return ActionTypeReduce, ResolvedByPrec
+}
+
+func (b *lrTableBuilder) genReport(tab *ParsingTable, gram *Grammar) (*spec.Report, error) {
+ var terms []*spec.Terminal
+ {
+ termSyms := b.symTab.TerminalSymbols()
+ terms = make([]*spec.Terminal, len(termSyms)+1)
+
+ for _, sym := range termSyms {
+ name, ok := b.symTab.ToText(sym)
+ if !ok {
+ return nil, fmt.Errorf("failed to generate terminals: symbol not found: %v", sym)
+ }
+
+ term := &spec.Terminal{
+ Number: sym.Num().Int(),
+ Name: name,
+ }
+
+ prec := b.precAndAssoc.terminalPrecedence(sym.Num())
+ if prec != precNil {
+ term.Precedence = prec
+ }
+
+ assoc := b.precAndAssoc.terminalAssociativity(sym.Num())
+ switch assoc {
+ case assocTypeLeft:
+ term.Associativity = "l"
+ case assocTypeRight:
+ term.Associativity = "r"
+ }
+
+ terms[sym.Num()] = term
+ }
+ }
+
+ var nonTerms []*spec.NonTerminal
+ {
+ nonTermSyms := b.symTab.NonTerminalSymbols()
+ nonTerms = make([]*spec.NonTerminal, len(nonTermSyms)+1)
+ for _, sym := range nonTermSyms {
+ name, ok := b.symTab.ToText(sym)
+ if !ok {
+ return nil, fmt.Errorf("failed to generate non-terminals: symbol not found: %v", sym)
+ }
+
+ nonTerms[sym.Num()] = &spec.NonTerminal{
+ Number: sym.Num().Int(),
+ Name: name,
+ }
+ }
+ }
+
+ var prods []*spec.Production
+ {
+ ps := gram.productionSet.getAllProductions()
+ prods = make([]*spec.Production, len(ps)+1)
+ for _, p := range ps {
+ rhs := make([]int, len(p.rhs))
+ for i, e := range p.rhs {
+ if e.IsTerminal() {
+ rhs[i] = e.Num().Int()
+ } else {
+ rhs[i] = e.Num().Int() * -1
+ }
+ }
+
+ prod := &spec.Production{
+ Number: p.num.Int(),
+ LHS: p.lhs.Num().Int(),
+ RHS: rhs,
+ }
+
+ prec := b.precAndAssoc.productionPredence(p.num)
+ if prec != precNil {
+ prod.Precedence = prec
+ }
+
+ assoc := b.precAndAssoc.productionAssociativity(p.num)
+ switch assoc {
+ case assocTypeLeft:
+ prod.Associativity = "l"
+ case assocTypeRight:
+ prod.Associativity = "r"
+ }
+
+ prods[p.num.Int()] = prod
+ }
+ }
+
+ var states []*spec.State
+ {
+ srConflicts := map[stateNum][]*shiftReduceConflict{}
+ rrConflicts := map[stateNum][]*reduceReduceConflict{}
+ for _, con := range b.conflicts {
+ switch c := con.(type) {
+ case *shiftReduceConflict:
+ srConflicts[c.state] = append(srConflicts[c.state], c)
+ case *reduceReduceConflict:
+ rrConflicts[c.state] = append(rrConflicts[c.state], c)
+ }
+ }
+
+ states = make([]*spec.State, len(b.automaton.states))
+ for _, s := range b.automaton.states {
+ kernel := make([]*spec.Item, len(s.items))
+ for i, item := range s.items {
+ p, ok := b.prods.findByID(item.prod)
+ if !ok {
+ return nil, fmt.Errorf("failed to generate states: production of kernel item not found: %v", item.prod)
+ }
+
+ kernel[i] = &spec.Item{
+ Production: p.num.Int(),
+ Dot: item.dot,
+ }
+ }
+
+ sort.Slice(kernel, func(i, j int) bool {
+ if kernel[i].Production < kernel[j].Production {
+ return true
+ }
+ if kernel[i].Production > kernel[j].Production {
+ return false
+ }
+ return kernel[i].Dot < kernel[j].Dot
+ })
+
+ var shift []*spec.Transition
+ var reduce []*spec.Reduce
+ var goTo []*spec.Transition
+ {
+ TERMINALS_LOOP:
+ for _, t := range b.symTab.TerminalSymbols() {
+ act, next, prod := tab.getAction(s.num, t.Num())
+ switch act {
+ case ActionTypeShift:
+ shift = append(shift, &spec.Transition{
+ Symbol: t.Num().Int(),
+ State: next.Int(),
+ })
+ case ActionTypeReduce:
+ for _, r := range reduce {
+ if r.Production == prod.Int() {
+ r.LookAhead = append(r.LookAhead, t.Num().Int())
+ continue TERMINALS_LOOP
+ }
+ }
+ reduce = append(reduce, &spec.Reduce{
+ LookAhead: []int{t.Num().Int()},
+ Production: prod.Int(),
+ })
+ }
+ }
+
+ for _, n := range b.symTab.NonTerminalSymbols() {
+ ty, next := tab.getGoTo(s.num, n.Num())
+ if ty == GoToTypeRegistered {
+ goTo = append(goTo, &spec.Transition{
+ Symbol: n.Num().Int(),
+ State: next.Int(),
+ })
+ }
+ }
+
+ sort.Slice(shift, func(i, j int) bool {
+ return shift[i].State < shift[j].State
+ })
+ sort.Slice(reduce, func(i, j int) bool {
+ return reduce[i].Production < reduce[j].Production
+ })
+ sort.Slice(goTo, func(i, j int) bool {
+ return goTo[i].State < goTo[j].State
+ })
+ }
+
+ sr := []*spec.SRConflict{}
+ rr := []*spec.RRConflict{}
+ {
+ for _, c := range srConflicts[s.num] {
+ conflict := &spec.SRConflict{
+ Symbol: c.sym.Num().Int(),
+ State: c.nextState.Int(),
+ Production: c.prodNum.Int(),
+ ResolvedBy: c.resolvedBy.Int(),
+ }
+
+ ty, adoptedState, adoptedProd := tab.getAction(s.num, c.sym.Num())
+ switch ty {
+ case ActionTypeShift:
+ n := adoptedState.Int()
+ conflict.AdoptedState = &n
+ case ActionTypeReduce:
+ n := adoptedProd.Int()
+ conflict.AdoptedProduction = &n
+ }
+
+ sr = append(sr, conflict)
+ }
+
+ sort.Slice(sr, func(i, j int) bool {
+ return sr[i].Symbol < sr[j].Symbol
+ })
+
+ for _, c := range rrConflicts[s.num] {
+ conflict := &spec.RRConflict{
+ Symbol: c.sym.Num().Int(),
+ Production1: c.prodNum1.Int(),
+ Production2: c.prodNum2.Int(),
+ ResolvedBy: c.resolvedBy.Int(),
+ }
+
+ _, _, p := tab.getAction(s.num, c.sym.Num())
+ conflict.AdoptedProduction = p.Int()
+
+ rr = append(rr, conflict)
+ }
+
+ sort.Slice(rr, func(i, j int) bool {
+ return rr[i].Symbol < rr[j].Symbol
+ })
+ }
+
+ states[s.num.Int()] = &spec.State{
+ Number: s.num.Int(),
+ Kernel: kernel,
+ Shift: shift,
+ Reduce: reduce,
+ GoTo: goTo,
+ SRConflict: sr,
+ RRConflict: rr,
+ }
+ }
+ }
+
+ return &spec.Report{
+ Terminals: terms,
+ NonTerminals: nonTerms,
+ Productions: prods,
+ States: states,
+ }, nil
+}
diff --git a/src/urubu/grammar/production.go b/src/urubu/grammar/production.go
new file mode 100644
index 0000000..8f6c103
--- /dev/null
+++ b/src/urubu/grammar/production.go
@@ -0,0 +1,117 @@
+package grammar
+
+import (
+ "crypto/sha256"
+ "encoding/hex"
+ "fmt"
+
+ "urubu/grammar/symbol"
+)
+
+type productionID [32]byte
+
+func (id productionID) String() string {
+ return hex.EncodeToString(id[:])
+}
+
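+// genProductionID hashes the LHS followed by the RHS symbols, each in its
+// 2-byte encoding, so structurally identical productions always receive the
+// same ID.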
+func genProductionID(lhs symbol.Symbol, rhs []symbol.Symbol) productionID {
+ seq := lhs.Byte()
+ for _, sym := range rhs {
+ seq = append(seq, sym.Byte()...)
+ }
+ return productionID(sha256.Sum256(seq))
+}
+
+type productionNum uint16
+
+const (
+ productionNumNil = productionNum(0)
+ productionNumStart = productionNum(1)
+ productionNumMin = productionNum(2)
+)
+
+func (n productionNum) Int() int {
+ return int(n)
+}
+
+type production struct {
+ id productionID
+ num productionNum
+ lhs symbol.Symbol
+ rhs []symbol.Symbol
+ rhsLen int
+}
+
+func newProduction(lhs symbol.Symbol, rhs []symbol.Symbol) (*production, error) {
+ if lhs.IsNil() {
+ return nil, fmt.Errorf("LHS must be a non-nil symbol; LHS: %v, RHS: %v", lhs, rhs)
+ }
+ for _, sym := range rhs {
+ if sym.IsNil() {
+ return nil, fmt.Errorf("a symbol of RHS must be a non-nil symbol; LHS: %v, RHS: %v", lhs, rhs)
+ }
+ }
+
+ return &production{
+ id: genProductionID(lhs, rhs),
+ lhs: lhs,
+ rhs: rhs,
+ rhsLen: len(rhs),
+ }, nil
+}
+
+func (p *production) isEmpty() bool {
+ return p.rhsLen == 0
+}
+
+type productionSet struct {
+ lhs2Prods map[symbol.Symbol][]*production
+ id2Prod map[productionID]*production
+ num productionNum
+}
+
+func newProductionSet() *productionSet {
+ return &productionSet{
+ lhs2Prods: map[symbol.Symbol][]*production{},
+ id2Prod: map[productionID]*production{},
+ num: productionNumMin,
+ }
+}
+
+func (ps *productionSet) append(prod *production) {
+ if _, ok := ps.id2Prod[prod.id]; ok {
+ return
+ }
+
+ if prod.lhs.IsStart() {
+ prod.num = productionNumStart
+ } else {
+ prod.num = ps.num
+ ps.num++
+ }
+
+ if prods, ok := ps.lhs2Prods[prod.lhs]; ok {
+ ps.lhs2Prods[prod.lhs] = append(prods, prod)
+ } else {
+ ps.lhs2Prods[prod.lhs] = []*production{prod}
+ }
+ ps.id2Prod[prod.id] = prod
+}
+
+func (ps *productionSet) findByID(id productionID) (*production, bool) {
+ prod, ok := ps.id2Prod[id]
+ return prod, ok
+}
+
+func (ps *productionSet) findByLHS(lhs symbol.Symbol) ([]*production, bool) {
+ if lhs.IsNil() {
+ return nil, false
+ }
+
+ prods, ok := ps.lhs2Prods[lhs]
+ return prods, ok
+}
+
+func (ps *productionSet) getAllProductions() map[productionID]*production {
+ return ps.id2Prod
+}
diff --git a/src/urubu/grammar/semantic_error.go b/src/urubu/grammar/semantic_error.go
new file mode 100644
index 0000000..88a6b17
--- /dev/null
+++ b/src/urubu/grammar/semantic_error.go
@@ -0,0 +1,30 @@
+package grammar
+
+import "errors"
+
+var (
+ semErrNoGrammarName = errors.New("name is missing")
+ semErrSpellingInconsistency = errors.New("the identifiers are treated as the same. please use the same spelling")
+ semErrDuplicateAssoc = errors.New("associativity and precedence cannot be specified multiple times for a symbol")
+ semErrUndefinedPrec = errors.New("symbol must has precedence")
+ semErrUndefinedOrdSym = errors.New("undefined ordered symbol")
+ semErrUnusedProduction = errors.New("unused production")
+ semErrUnusedTerminal = errors.New("unused terminal")
+ semErrTermCannotBeSkipped = errors.New("a terminal used in productions cannot be skipped")
+ semErrNoProduction = errors.New("a grammar needs at least one production")
+ semErrUndefinedSym = errors.New("undefined symbol")
+ semErrDuplicateProduction = errors.New("duplicate production")
+ semErrDuplicateTerminal = errors.New("duplicate terminal")
+ semErrDuplicateFragment = errors.New("duplicate fragment")
+ semErrDuplicateName = errors.New("duplicate names are not allowed between terminals and non-terminals")
+ semErrErrSymIsReserved = errors.New("symbol 'error' is reserved as a terminal symbol")
+ semErrDuplicateLabel = errors.New("a label must be unique in an alternative")
+ semErrInvalidLabel = errors.New("a label must differ from terminal symbols or non-terminal symbols")
+ semErrDirInvalidName = errors.New("invalid directive name")
+ semErrDirInvalidParam = errors.New("invalid parameter")
+ semErrDuplicateDir = errors.New("a directive must not be duplicated")
+ semErrDuplicateElem = errors.New("duplicate element")
+ semErrAmbiguousElem = errors.New("ambiguous element")
+ semErrInvalidProdDir = errors.New("invalid production directive")
+ semErrInvalidAltDir = errors.New("invalid alternative directive")
+)
diff --git a/src/urubu/grammar/symbol/symbol.go b/src/urubu/grammar/symbol/symbol.go
new file mode 100644
index 0000000..f9e6a93
--- /dev/null
+++ b/src/urubu/grammar/symbol/symbol.go
@@ -0,0 +1,295 @@
+package symbol
+
+import (
+ "fmt"
+ "sort"
+)
+
+type symbolKind string
+
+const (
+ symbolKindNonTerminal = symbolKind("non-terminal")
+ symbolKindTerminal = symbolKind("terminal")
+)
+
+func (t symbolKind) String() string {
+ return string(t)
+}
+
+type SymbolNum uint16
+
+func (n SymbolNum) Int() int {
+ return int(n)
+}
+
+type Symbol uint16
+
+func (s Symbol) String() string {
+ kind, isStart, isEOF, num := s.describe()
+ var prefix string
+ switch {
+ case isStart:
+ prefix = "s"
+ case isEOF:
+ prefix = "e"
+ case kind == symbolKindNonTerminal:
+ prefix = "n"
+ case kind == symbolKindTerminal:
+ prefix = "t"
+ default:
+ prefix = "?"
+ }
+ return fmt.Sprintf("%v%v", prefix, num)
+}
+
+const (
+ maskKindPart = uint16(0x8000) // 1000 0000 0000 0000
+ maskNonTerminal = uint16(0x0000) // 0000 0000 0000 0000
+ maskTerminal = uint16(0x8000) // 1000 0000 0000 0000
+
+ maskSubKindPart = uint16(0x4000) // 0100 0000 0000 0000
+ maskNonStartAndEOF = uint16(0x0000) // 0000 0000 0000 0000
+ maskStartOrEOF = uint16(0x4000) // 0100 0000 0000 0000
+
+ maskNumberPart = uint16(0x3fff) // 0011 1111 1111 1111
+
+ symbolNumStart = uint16(0x0001) // 0000 0000 0000 0001
+ symbolNumEOF = uint16(0x0001) // 0000 0000 0000 0001
+
+ SymbolNil = Symbol(0) // 0000 0000 0000 0000
+ symbolStart = Symbol(maskNonTerminal | maskStartOrEOF | symbolNumStart) // 0100 0000 0000 0001
+ SymbolEOF = Symbol(maskTerminal | maskStartOrEOF | symbolNumEOF) // 1100 0000 0000 0001: The EOF symbol is treated as a terminal symbol.
+
+ // The symbol name contains `<` and `>` to avoid conflicting with user-defined symbols.
+ symbolNameEOF = "<eof>"
+
+ nonTerminalNumMin = SymbolNum(2) // The number 1 is used by a start symbol.
+ terminalNumMin = SymbolNum(2) // The number 1 is used by the EOF symbol.
+ symbolNumMax = SymbolNum(0xffff) >> 2 // 0011 1111 1111 1111
+)
+
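+// A symbol is encoded in 16 bits as [kind:1][start/eof:1][number:14]. For
+// example, a plain terminal with number 5 is 0x8005, and SymbolEOF is 0xC001.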
+func newSymbol(kind symbolKind, isStart bool, num SymbolNum) (Symbol, error) {
+ if num > symbolNumMax {
+ return SymbolNil, fmt.Errorf("a symbol number exceeds the limit; limit: %v, passed: %v", symbolNumMax, num)
+ }
+ if kind == symbolKindTerminal && isStart {
+ return SymbolNil, fmt.Errorf("a start symbol must be a non-terminal symbol")
+ }
+
+ kindMask := maskNonTerminal
+ if kind == symbolKindTerminal {
+ kindMask = maskTerminal
+ }
+ startMask := maskNonStartAndEOF
+ if isStart {
+ startMask = maskStartOrEOF
+ }
+ return Symbol(kindMask | startMask | uint16(num)), nil
+}
+
+func (s Symbol) Num() SymbolNum {
+ _, _, _, num := s.describe()
+ return num
+}
+
+func (s Symbol) Byte() []byte {
+ if s.IsNil() {
+ return []byte{0, 0}
+ }
+ return []byte{byte(uint16(s) >> 8), byte(uint16(s) & 0x00ff)}
+}
+
+func (s Symbol) IsNil() bool {
+ _, _, _, num := s.describe()
+ return num == 0
+}
+
+func (s Symbol) IsStart() bool {
+ if s.IsNil() {
+ return false
+ }
+ _, isStart, _, _ := s.describe()
+ return isStart
+}
+
+func (s Symbol) isEOF() bool {
+ if s.IsNil() {
+ return false
+ }
+ _, _, isEOF, _ := s.describe()
+ return isEOF
+}
+
+func (s Symbol) isNonTerminal() bool {
+ if s.IsNil() {
+ return false
+ }
+ kind, _, _, _ := s.describe()
+ return kind == symbolKindNonTerminal
+}
+
+func (s Symbol) IsTerminal() bool {
+ if s.IsNil() {
+ return false
+ }
+ return !s.isNonTerminal()
+}
+
+func (s Symbol) describe() (symbolKind, bool, bool, SymbolNum) {
+ kind := symbolKindNonTerminal
+ if uint16(s)&maskKindPart > 0 {
+ kind = symbolKindTerminal
+ }
+ isStart := false
+ isEOF := false
+ if uint16(s)&maskSubKindpart > 0 {
+ if kind == symbolKindNonTerminal {
+ isStart = true
+ } else {
+ isEOF = true
+ }
+ }
+ num := SymbolNum(uint16(s) & maskNumberPart)
+ return kind, isStart, isEOF, num
+}
+
+type SymbolTable struct {
+ text2Sym map[string]Symbol
+ sym2Text map[Symbol]string
+ nonTermTexts []string
+ termTexts []string
+ nonTermNum SymbolNum
+ termNum SymbolNum
+}
+
+type SymbolTableWriter struct {
+ *SymbolTable
+}
+
+type SymbolTableReader struct {
+ *SymbolTable
+}
+
+func NewSymbolTable() *SymbolTable {
+ return &SymbolTable{
+ text2Sym: map[string]Symbol{
+ symbolNameEOF: SymbolEOF,
+ },
+ sym2Text: map[Symbol]string{
+ SymbolEOF: symbolNameEOF,
+ },
+ termTexts: []string{
+ "", // Nil
+ symbolNameEOF, // EOF
+ },
+ nonTermTexts: []string{
+ "", // Nil
+ "", // Start Symbol
+ },
+ nonTermNum: nonTerminalNumMin,
+ termNum: terminalNumMin,
+ }
+}
+
+func (t *SymbolTable) Writer() *SymbolTableWriter {
+ return &SymbolTableWriter{
+ SymbolTable: t,
+ }
+}
+
+func (t *SymbolTable) Reader() *SymbolTableReader {
+ return &SymbolTableReader{
+ SymbolTable: t,
+ }
+}
+
+func (w *SymbolTableWriter) RegisterStartSymbol(text string) (Symbol, error) {
+ w.text2Sym[text] = symbolStart
+ w.sym2Text[symbolStart] = text
+ w.nonTermTexts[symbolStart.Num().Int()] = text
+ return symbolStart, nil
+}
+
+func (w *SymbolTableWriter) RegisterNonTerminalSymbol(text string) (Symbol, error) {
+ if sym, ok := w.text2Sym[text]; ok {
+ return sym, nil
+ }
+ sym, err := newSymbol(symbolKindNonTerminal, false, w.nonTermNum)
+ if err != nil {
+ return SymbolNil, err
+ }
+ w.nonTermNum++
+ w.text2Sym[text] = sym
+ w.sym2Text[sym] = text
+ w.nonTermTexts = append(w.nonTermTexts, text)
+ return sym, nil
+}
+
+func (w *SymbolTableWriter) RegisterTerminalSymbol(text string) (Symbol, error) {
+ if sym, ok := w.text2Sym[text]; ok {
+ return sym, nil
+ }
+ sym, err := newSymbol(symbolKindTerminal, false, w.termNum)
+ if err != nil {
+ return SymbolNil, err
+ }
+ w.termNum++
+ w.text2Sym[text] = sym
+ w.sym2Text[sym] = text
+ w.termTexts = append(w.termTexts, text)
+ return sym, nil
+}
+
+func (r *SymbolTableReader) ToSymbol(text string) (Symbol, bool) {
+ if sym, ok := r.text2Sym[text]; ok {
+ return sym, true
+ }
+ return SymbolNil, false
+}
+
+func (r *SymbolTableReader) ToText(sym Symbol) (string, bool) {
+ text, ok := r.sym2Text[sym]
+ return text, ok
+}
+
+func (r *SymbolTableReader) TerminalSymbols() []Symbol {
+ syms := make([]Symbol, 0, r.termNum.Int()-terminalNumMin.Int())
+ for sym := range r.sym2Text {
+ if !sym.IsTerminal() || sym.IsNil() {
+ continue
+ }
+ syms = append(syms, sym)
+ }
+ sort.Slice(syms, func(i, j int) bool {
+ return syms[i] < syms[j]
+ })
+ return syms
+}
+
+func (r *SymbolTableReader) TerminalTexts() ([]string, error) {
+ if r.termNum == terminalNumMin {
+ return nil, fmt.Errorf("symbol table has no terminals")
+ }
+ return r.termTexts, nil
+}
+
+func (r *SymbolTableReader) NonTerminalSymbols() []Symbol {
+ syms := make([]Symbol, 0, r.nonTermNum.Int()-nonTerminalNumMin.Int())
+ for sym := range r.sym2Text {
+ if !sym.isNonTerminal() || sym.IsNil() {
+ continue
+ }
+ syms = append(syms, sym)
+ }
+ sort.Slice(syms, func(i, j int) bool {
+ return syms[i] < syms[j]
+ })
+ return syms
+}
+
+func (r *SymbolTableReader) NonTerminalTexts() ([]string, error) {
+ if r.nonTermNum == nonTerminalNumMin || r.nonTermTexts[symbolStart.Num().Int()] == "" {
+ return nil, fmt.Errorf("symbol table has no terminals or no start symbol")
+ }
+ return r.nonTermTexts, nil
+}