diff options
Diffstat (limited to 'src/urubu/grammar')
20 files changed, 2175 insertions, 5177 deletions
diff --git a/src/urubu/grammar/first.go b/src/urubu/grammar/first.go deleted file mode 100644 index 6443bcf..0000000 --- a/src/urubu/grammar/first.go +++ /dev/null @@ -1,148 +0,0 @@ -package grammar - -import ( - "fmt" - - "urubu/grammar/symbol" -) - -type firstEntry struct { - symbols map[symbol.Symbol]struct{} - empty bool -} - -func newFirstEntry() *firstEntry { - return &firstEntry{ - symbols: map[symbol.Symbol]struct{}{}, - empty: false, - } -} - -func (e *firstEntry) add(sym symbol.Symbol) bool { - if _, ok := e.symbols[sym]; ok { - return false - } - e.symbols[sym] = struct{}{} - return true -} - -func (e *firstEntry) addEmpty() bool { - if !e.empty { - e.empty = true - return true - } - return false -} - -func (e *firstEntry) mergeExceptEmpty(target *firstEntry) bool { - if target == nil { - return false - } - changed := false - for sym := range target.symbols { - added := e.add(sym) - if added { - changed = true - } - } - return changed -} - -type firstSet struct { - set map[symbol.Symbol]*firstEntry -} - -func newFirstSet(prods *productionSet) *firstSet { - fst := &firstSet{ - set: map[symbol.Symbol]*firstEntry{}, - } - for _, prod := range prods.getAllProductions() { - if _, ok := fst.set[prod.lhs]; ok { - continue - } - fst.set[prod.lhs] = newFirstEntry() - } - - return fst -} - -func (fst *firstSet) find(prod *production, head int) (*firstEntry, error) { - entry := newFirstEntry() - if prod.rhsLen <= head { - entry.addEmpty() - return entry, nil - } - for _, sym := range prod.rhs[head:] { - if sym.IsTerminal() { - entry.add(sym) - return entry, nil - } - - e := fst.findBySymbol(sym) - if e == nil { - return nil, fmt.Errorf("an entry of FIRST was not found; symbol: %s", sym) - } - for s := range e.symbols { - entry.add(s) - } - if !e.empty { - return entry, nil - } - } - entry.addEmpty() - return entry, nil -} - -func (fst *firstSet) findBySymbol(sym symbol.Symbol) *firstEntry { - return fst.set[sym] -} - -type firstComContext struct { - first *firstSet -} - -func newFirstComContext(prods *productionSet) *firstComContext { - return &firstComContext{ - first: newFirstSet(prods), - } -} - -func genFirstSet(prods *productionSet) (*firstSet, error) { - cc := newFirstComContext(prods) - for { - more := false - for _, prod := range prods.getAllProductions() { - e := cc.first.findBySymbol(prod.lhs) - changed, err := genProdFirstEntry(cc, e, prod) - if err != nil { - return nil, err - } - if changed { - more = true - } - } - if !more { - break - } - } - return cc.first, nil -} - -func genProdFirstEntry(cc *firstComContext, acc *firstEntry, prod *production) (bool, error) { - if prod.isEmpty() { - return acc.addEmpty(), nil - } - - for _, sym := range prod.rhs { - if sym.IsTerminal() { - return acc.add(sym), nil - } - - e := cc.first.findBySymbol(sym) - changed := acc.mergeExceptEmpty(e) - if !e.empty { - return changed, nil - } - } - return acc.addEmpty(), nil -} diff --git a/src/urubu/grammar/grammar.go b/src/urubu/grammar/grammar.go deleted file mode 100644 index bfa53c6..0000000 --- a/src/urubu/grammar/grammar.go +++ /dev/null @@ -1,1390 +0,0 @@ -package grammar - -import ( - "fmt" - "io" - "strings" - - verr "urubu/error" - "urubu/grammar/lexical" - "urubu/grammar/symbol" - spec "urubu/spec/grammar" - "urubu/spec/grammar/parser" -) - -type astActionEntry struct { - position int - expansion bool -} - -type assocType string - -const ( - assocTypeNil = assocType("") - assocTypeLeft = assocType("left") - assocTypeRight = assocType("right") -) - -const ( - precNil = 0 - precMin = 1 -) - -// precAndAssoc represents precedence and associativities of terminal symbols and productions. -// We use the priority of the production to resolve shift/reduce conflicts. -type precAndAssoc struct { - // termPrec and termAssoc represent the precedence of the terminal symbols. - termPrec map[symbol.SymbolNum]int - termAssoc map[symbol.SymbolNum]assocType - - // prodPrec and prodAssoc represent the precedence and the associativities of the production. - // These values are inherited from the right-most terminal symbols in the RHS of the productions. - prodPrec map[productionNum]int - prodAssoc map[productionNum]assocType -} - -func (pa *precAndAssoc) terminalPrecedence(sym symbol.SymbolNum) int { - prec, ok := pa.termPrec[sym] - if !ok { - return precNil - } - - return prec -} - -func (pa *precAndAssoc) terminalAssociativity(sym symbol.SymbolNum) assocType { - assoc, ok := pa.termAssoc[sym] - if !ok { - return assocTypeNil - } - - return assoc -} - -func (pa *precAndAssoc) productionPredence(prod productionNum) int { - prec, ok := pa.prodPrec[prod] - if !ok { - return precNil - } - - return prec -} - -func (pa *precAndAssoc) productionAssociativity(prod productionNum) assocType { - assoc, ok := pa.prodAssoc[prod] - if !ok { - return assocTypeNil - } - - return assoc -} - -const reservedSymbolNameError = "error" - -type Grammar struct { - name string - lexSpec *lexical.LexSpec - skipSymbols []symbol.Symbol - productionSet *productionSet - augmentedStartSymbol symbol.Symbol - errorSymbol symbol.Symbol - symbolTable *symbol.SymbolTableReader - astActions map[productionID][]*astActionEntry - precAndAssoc *precAndAssoc - - // recoverProductions is a set of productions having the recover directive. - recoverProductions map[productionID]struct{} -} - -type buildConfig struct { - isReportingEnabled bool -} - -type BuildOption func(config *buildConfig) - -func EnableReporting() BuildOption { - return func(config *buildConfig) { - config.isReportingEnabled = true - } -} - -type GrammarBuilder struct { - AST *parser.RootNode - - errs verr.SpecErrors -} - -func (b *GrammarBuilder) Build(opts ...BuildOption) (*spec.CompiledGrammar, *spec.Report, error) { - gram, err := b.build() - if err != nil { - return nil, nil, err - } - - return compile(gram, opts...) -} - -func (b *GrammarBuilder) build() (*Grammar, error) { - var specName string - { - errOccurred := false - for _, dir := range b.AST.Directives { - if dir.Name != "name" { - continue - } - - if len(dir.Parameters) != 1 || dir.Parameters[0].ID == "" { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrDirInvalidParam, - Detail: "'name' takes just one ID parameter", - Row: dir.Pos.Row, - Col: dir.Pos.Col, - }) - - errOccurred = true - break - } - - specName = dir.Parameters[0].ID - break - } - - if specName == "" && !errOccurred { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrNoGrammarName, - }) - } - } - - b.checkSpellingInconsistenciesOfUserDefinedIDs(b.AST) - if len(b.errs) > 0 { - return nil, b.errs - } - - symTab, ss, err := b.genSymbolTable(b.AST) - if err != nil { - return nil, err - } - - lexSpec, skip, err := b.genLexSpecAndSkipSymbols(symTab.Reader(), b.AST) - if err != nil { - return nil, err - } - - prodsAndActs, err := b.genProductionsAndActions(b.AST, symTab.Reader(), ss.errSym, ss.augStartSym, ss.startSym) - if err != nil { - return nil, err - } - if prodsAndActs == nil && len(b.errs) > 0 { - return nil, b.errs - } - - pa, err := b.genPrecAndAssoc(symTab.Reader(), ss.errSym, prodsAndActs) - if err != nil { - return nil, err - } - if pa == nil && len(b.errs) > 0 { - return nil, b.errs - } - - syms := findUsedAndUnusedSymbols(b.AST) - if syms == nil && len(b.errs) > 0 { - return nil, b.errs - } - - // When a terminal symbol that cannot be reached from the start symbol has the skip directive, - // the compiler treats its terminal as a used symbol, not unused. - { - r := symTab.Reader() - for _, sym := range skip { - s, _ := r.ToText(sym) - if _, ok := syms.unusedTerminals[s]; !ok { - prod := syms.usedTerminals[s] - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrTermCannotBeSkipped, - Detail: s, - Row: prod.Pos.Row, - Col: prod.Pos.Col, - }) - continue - } - - delete(syms.unusedTerminals, s) - } - } - - for sym, prod := range syms.unusedProductions { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrUnusedProduction, - Detail: sym, - Row: prod.Pos.Row, - Col: prod.Pos.Col, - }) - } - - for sym, prod := range syms.unusedTerminals { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrUnusedTerminal, - Detail: sym, - Row: prod.Pos.Row, - Col: prod.Pos.Col, - }) - } - - if len(b.errs) > 0 { - return nil, b.errs - } - - return &Grammar{ - name: specName, - lexSpec: lexSpec, - skipSymbols: skip, - productionSet: prodsAndActs.prods, - augmentedStartSymbol: prodsAndActs.augStartSym, - errorSymbol: ss.errSym, - symbolTable: symTab.Reader(), - astActions: prodsAndActs.astActs, - recoverProductions: prodsAndActs.recoverProds, - precAndAssoc: pa, - }, nil -} - -type usedAndUnusedSymbols struct { - unusedProductions map[string]*parser.ProductionNode - unusedTerminals map[string]*parser.ProductionNode - usedTerminals map[string]*parser.ProductionNode -} - -func findUsedAndUnusedSymbols(root *parser.RootNode) *usedAndUnusedSymbols { - prods := map[string]*parser.ProductionNode{} - lexProds := map[string]*parser.ProductionNode{} - mark := map[string]bool{} - { - for _, p := range root.Productions { - prods[p.LHS] = p - mark[p.LHS] = false - for _, alt := range p.RHS { - for _, e := range alt.Elements { - if e.ID == "" { - continue - } - mark[e.ID] = false - } - } - } - - for _, p := range root.LexProductions { - lexProds[p.LHS] = p - mark[p.LHS] = false - } - - start := root.Productions[0] - mark[start.LHS] = true - markUsedSymbols(mark, map[string]bool{}, prods, start) - - // We don't have to check the error symbol because the error symbol doesn't have a production. - delete(mark, reservedSymbolNameError) - } - - usedTerms := make(map[string]*parser.ProductionNode, len(lexProds)) - unusedProds := map[string]*parser.ProductionNode{} - unusedTerms := map[string]*parser.ProductionNode{} - for sym, used := range mark { - if p, ok := prods[sym]; ok { - if used { - continue - } - unusedProds[sym] = p - continue - } - if p, ok := lexProds[sym]; ok { - if used { - usedTerms[sym] = p - } else { - unusedTerms[sym] = p - } - continue - } - - // May be reached here when a fragment name appears on the right-hand side of a production rule. However, an error - // to the effect that a production rule cannot contain a fragment will be detected in a subsequent process. So we can - // ignore it here. - } - - return &usedAndUnusedSymbols{ - usedTerminals: usedTerms, - unusedProductions: unusedProds, - unusedTerminals: unusedTerms, - } -} - -func markUsedSymbols(mark map[string]bool, marked map[string]bool, prods map[string]*parser.ProductionNode, prod *parser.ProductionNode) { - if marked[prod.LHS] { - return - } - - for _, alt := range prod.RHS { - for _, e := range alt.Elements { - if e.ID == "" { - continue - } - - mark[e.ID] = true - - p, ok := prods[e.ID] - if !ok { - continue - } - - // Remove a production to avoid inifinite recursion. - marked[prod.LHS] = true - - markUsedSymbols(mark, marked, prods, p) - } - } -} - -func (b *GrammarBuilder) checkSpellingInconsistenciesOfUserDefinedIDs(root *parser.RootNode) { - var ids []string - { - for _, prod := range root.Productions { - ids = append(ids, prod.LHS) - for _, alt := range prod.RHS { - for _, elem := range alt.Elements { - if elem.Label != nil { - ids = append(ids, elem.Label.Name) - } - } - } - } - for _, prod := range root.LexProductions { - ids = append(ids, prod.LHS) - } - for _, dir := range root.Directives { - dirIDs := collectUserDefinedIDsFromDirective(dir) - if len(dirIDs) > 0 { - ids = append(ids, dirIDs...) - } - } - } - - duplicated := lexical.FindSpellingInconsistencies(ids) - if len(duplicated) == 0 { - return - } - - for _, dup := range duplicated { - var s string - { - var b strings.Builder - fmt.Fprintf(&b, "%+v", dup[0]) - for _, id := range dup[1:] { - fmt.Fprintf(&b, ", %+v", id) - } - s = b.String() - } - - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrSpellingInconsistency, - Detail: s, - }) - } -} - -func collectUserDefinedIDsFromDirective(dir *parser.DirectiveNode) []string { - var ids []string - for _, param := range dir.Parameters { - if param.Group != nil { - for _, d := range param.Group { - dIDs := collectUserDefinedIDsFromDirective(d) - if len(dIDs) > 0 { - ids = append(ids, dIDs...) - } - } - } - if param.OrderedSymbol != "" { - ids = append(ids, param.OrderedSymbol) - } - } - return ids -} - -type symbols struct { - errSym symbol.Symbol - augStartSym symbol.Symbol - startSym symbol.Symbol -} - -func (b *GrammarBuilder) genSymbolTable(root *parser.RootNode) (*symbol.SymbolTable, *symbols, error) { - symTab := symbol.NewSymbolTable() - w := symTab.Writer() - r := symTab.Reader() - - // We need to register the reserved symbol before registering others. - var errSym symbol.Symbol - { - sym, err := w.RegisterTerminalSymbol(reservedSymbolNameError) - if err != nil { - return nil, nil, err - } - errSym = sym - } - - for _, prod := range root.LexProductions { - if sym, exist := r.ToSymbol(prod.LHS); exist { - if sym == errSym { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrErrSymIsReserved, - Row: prod.Pos.Row, - Col: prod.Pos.Col, - }) - } else { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrDuplicateTerminal, - Detail: prod.LHS, - Row: prod.Pos.Row, - Col: prod.Pos.Col, - }) - } - - continue - } - - _, err := w.RegisterTerminalSymbol(prod.LHS) - if err != nil { - return nil, nil, err - } - } - - startProd := root.Productions[0] - augStartText := fmt.Sprintf("%s'", startProd.LHS) - var err error - augStartSym, err := w.RegisterStartSymbol(augStartText) - if err != nil { - return nil, nil, err - } - if augStartSym == errSym { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrErrSymIsReserved, - Row: startProd.Pos.Row, - Col: startProd.Pos.Col, - }) - } - - startSym, err := w.RegisterNonTerminalSymbol(startProd.LHS) - if err != nil { - return nil, nil, err - } - if startSym == errSym { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrErrSymIsReserved, - Row: startProd.Pos.Row, - Col: startProd.Pos.Col, - }) - } - - for _, prod := range root.Productions { - sym, err := w.RegisterNonTerminalSymbol(prod.LHS) - if err != nil { - return nil, nil, err - } - if sym.IsTerminal() { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrDuplicateName, - Detail: prod.LHS, - Row: prod.Pos.Row, - Col: prod.Pos.Col, - }) - } - if sym == errSym { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrErrSymIsReserved, - Row: prod.Pos.Row, - Col: prod.Pos.Col, - }) - } - } - - return symTab, &symbols{ - errSym: errSym, - augStartSym: augStartSym, - startSym: startSym, - }, nil -} - -func (b *GrammarBuilder) genLexSpecAndSkipSymbols(symTab *symbol.SymbolTableReader, root *parser.RootNode) (*lexical.LexSpec, []symbol.Symbol, error) { - entries := []*lexical.LexEntry{} - skipSyms := []symbol.Symbol{} - for _, prod := range root.LexProductions { - entry, skip, specErr, err := genLexEntry(prod) - if err != nil { - return nil, nil, err - } - if specErr != nil { - b.errs = append(b.errs, specErr) - continue - } - if skip { - sym, _ := symTab.ToSymbol(prod.LHS) - skipSyms = append(skipSyms, sym) - } - entries = append(entries, entry) - } - - checkedFragments := map[string]struct{}{} - for _, fragment := range root.Fragments { - if _, exist := checkedFragments[fragment.LHS]; exist { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrDuplicateFragment, - Detail: fragment.LHS, - Row: fragment.Pos.Row, - Col: fragment.Pos.Col, - }) - continue - } - checkedFragments[fragment.LHS] = struct{}{} - - entries = append(entries, &lexical.LexEntry{ - Fragment: true, - Kind: spec.LexKindName(fragment.LHS), - Pattern: fragment.RHS, - }) - } - - return &lexical.LexSpec{ - Entries: entries, - }, skipSyms, nil -} - -func genLexEntry(prod *parser.ProductionNode) (*lexical.LexEntry, bool, *verr.SpecError, error) { - alt := prod.RHS[0] - elem := alt.Elements[0] - - var pattern string - if elem.Literally { - pattern = spec.EscapePattern(elem.Pattern) - } else { - pattern = elem.Pattern - } - - var modes []spec.LexModeName - var skip bool - var push spec.LexModeName - var pop bool - dirConsumed := map[string]struct{}{} - for _, dir := range prod.Directives { - if _, consumed := dirConsumed[dir.Name]; consumed { - return nil, false, &verr.SpecError{ - Cause: semErrDuplicateDir, - Detail: dir.Name, - Row: dir.Pos.Row, - Col: dir.Pos.Col, - }, nil - } - dirConsumed[dir.Name] = struct{}{} - - switch dir.Name { - case "mode": - if len(dir.Parameters) == 0 { - return nil, false, &verr.SpecError{ - Cause: semErrDirInvalidParam, - Detail: "'mode' directive needs an ID parameter", - Row: dir.Pos.Row, - Col: dir.Pos.Col, - }, nil - } - for _, param := range dir.Parameters { - if param.ID == "" { - return nil, false, &verr.SpecError{ - Cause: semErrDirInvalidParam, - Detail: "'mode' directive needs an ID parameter", - Row: param.Pos.Row, - Col: param.Pos.Col, - }, nil - } - modes = append(modes, spec.LexModeName(param.ID)) - } - case "skip": - if len(dir.Parameters) > 0 { - return nil, false, &verr.SpecError{ - Cause: semErrDirInvalidParam, - Detail: "'skip' directive needs no parameter", - Row: dir.Pos.Row, - Col: dir.Pos.Col, - }, nil - } - skip = true - case "push": - if len(dir.Parameters) != 1 || dir.Parameters[0].ID == "" { - return nil, false, &verr.SpecError{ - Cause: semErrDirInvalidParam, - Detail: "'push' directive needs an ID parameter", - Row: dir.Pos.Row, - Col: dir.Pos.Col, - }, nil - } - push = spec.LexModeName(dir.Parameters[0].ID) - case "pop": - if len(dir.Parameters) > 0 { - return nil, false, &verr.SpecError{ - Cause: semErrDirInvalidParam, - Detail: "'pop' directive needs no parameter", - Row: dir.Pos.Row, - Col: dir.Pos.Col, - }, nil - } - pop = true - default: - return nil, false, &verr.SpecError{ - Cause: semErrDirInvalidName, - Detail: dir.Name, - Row: dir.Pos.Row, - Col: dir.Pos.Col, - }, nil - } - } - - if len(alt.Directives) > 0 { - return nil, false, &verr.SpecError{ - Cause: semErrInvalidAltDir, - Detail: "a lexical production cannot have alternative directives", - Row: alt.Directives[0].Pos.Row, - Col: alt.Directives[0].Pos.Col, - }, nil - } - - return &lexical.LexEntry{ - Modes: modes, - Kind: spec.LexKindName(prod.LHS), - Pattern: pattern, - Push: push, - Pop: pop, - }, skip, nil, nil -} - -type productionsAndActions struct { - prods *productionSet - augStartSym symbol.Symbol - astActs map[productionID][]*astActionEntry - prodPrecsTerm map[productionID]symbol.Symbol - prodPrecsOrdSym map[productionID]string - prodPrecPoss map[productionID]*parser.Position - recoverProds map[productionID]struct{} -} - -func (b *GrammarBuilder) genProductionsAndActions(root *parser.RootNode, symTab *symbol.SymbolTableReader, errSym symbol.Symbol, augStartSym symbol.Symbol, startSym symbol.Symbol) (*productionsAndActions, error) { - if len(root.Productions) == 0 { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrNoProduction, - }) - return nil, nil - } - - prods := newProductionSet() - astActs := map[productionID][]*astActionEntry{} - prodPrecsTerm := map[productionID]symbol.Symbol{} - prodPrecsOrdSym := map[productionID]string{} - prodPrecPoss := map[productionID]*parser.Position{} - recoverProds := map[productionID]struct{}{} - - p, err := newProduction(augStartSym, []symbol.Symbol{ - startSym, - }) - if err != nil { - return nil, err - } - - prods.append(p) - - for _, prod := range root.Productions { - lhsSym, ok := symTab.ToSymbol(prod.LHS) - if !ok { - // All symbols are assumed to be pre-detected, so it's a bug if we cannot find them here. - return nil, fmt.Errorf("symbol '%v' is undefined", prod.LHS) - } - - if len(prod.Directives) > 0 { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrInvalidProdDir, - Detail: "a production cannot have production directives", - Row: prod.Directives[0].Pos.Row, - Col: prod.Directives[0].Pos.Col, - }) - continue - } - - LOOP_RHS: - for _, alt := range prod.RHS { - altSyms := make([]symbol.Symbol, len(alt.Elements)) - offsets := map[string]int{} - ambiguousIDOffsets := map[string]struct{}{} - for i, elem := range alt.Elements { - sym, ok := symTab.ToSymbol(elem.ID) - if !ok { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrUndefinedSym, - Detail: elem.ID, - Row: elem.Pos.Row, - Col: elem.Pos.Col, - }) - continue LOOP_RHS - } - altSyms[i] = sym - - if elem.Label != nil { - if _, added := offsets[elem.Label.Name]; added { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrDuplicateLabel, - Detail: elem.Label.Name, - Row: elem.Label.Pos.Row, - Col: elem.Label.Pos.Col, - }) - continue LOOP_RHS - } - if _, found := symTab.ToSymbol(elem.Label.Name); found { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrInvalidLabel, - Detail: elem.Label.Name, - Row: elem.Label.Pos.Row, - Col: elem.Label.Pos.Col, - }) - continue LOOP_RHS - } - offsets[elem.Label.Name] = i - } - // A symbol having a label can be specified by both the label and the symbol name. - // So record the symbol's position, whether or not it has a label. - if elem.ID != "" { - if _, exist := offsets[elem.ID]; exist { - // When the same symbol appears multiple times in an alternative, the symbol is ambiguous. When we need - // to specify the symbol in a directive, we cannot use the name of the ambiguous symbol. Instead, specify - // a label to resolve the ambiguity. - delete(offsets, elem.ID) - ambiguousIDOffsets[elem.ID] = struct{}{} - } else { - offsets[elem.ID] = i - } - } - } - - p, err := newProduction(lhsSym, altSyms) - if err != nil { - return nil, err - } - if _, exist := prods.findByID(p.id); exist { - // Report the line number of a duplicate alternative. - // When the alternative is empty, we report the position of its LHS. - var row int - var col int - if len(alt.Elements) > 0 { - row = alt.Elements[0].Pos.Row - col = alt.Elements[0].Pos.Col - } else { - row = prod.Pos.Row - col = prod.Pos.Col - } - - var detail string - { - var b strings.Builder - fmt.Fprintf(&b, "%v →", prod.LHS) - for _, elem := range alt.Elements { - switch { - case elem.ID != "": - fmt.Fprintf(&b, " %v", elem.ID) - case elem.Pattern != "": - fmt.Fprintf(&b, ` "%v"`, elem.Pattern) - } - } - if len(alt.Elements) == 0 { - fmt.Fprintf(&b, " ε") - } - - detail = b.String() - } - - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrDuplicateProduction, - Detail: detail, - Row: row, - Col: col, - }) - continue LOOP_RHS - } - prods.append(p) - - dirConsumed := map[string]struct{}{} - for _, dir := range alt.Directives { - if _, consumed := dirConsumed[dir.Name]; consumed { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrDuplicateDir, - Detail: dir.Name, - Row: dir.Pos.Row, - Col: dir.Pos.Col, - }) - } - dirConsumed[dir.Name] = struct{}{} - - switch dir.Name { - case "ast": - if len(dir.Parameters) == 0 { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrDirInvalidParam, - Detail: "'ast' directive needs at least one parameter", - Row: dir.Pos.Row, - Col: dir.Pos.Col, - }) - continue LOOP_RHS - } - astAct := make([]*astActionEntry, len(dir.Parameters)) - consumedOffsets := map[int]struct{}{} - for i, param := range dir.Parameters { - if param.ID == "" { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrDirInvalidParam, - Detail: "'ast' directive can take only ID parameters", - Row: dir.Pos.Row, - Col: dir.Pos.Col, - }) - continue LOOP_RHS - } - - if _, ambiguous := ambiguousIDOffsets[param.ID]; ambiguous { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrAmbiguousElem, - Detail: fmt.Sprintf("'%v' is ambiguous", param.ID), - Row: param.Pos.Row, - Col: param.Pos.Col, - }) - continue LOOP_RHS - } - - offset, ok := offsets[param.ID] - if !ok { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrDirInvalidParam, - Detail: fmt.Sprintf("a symbol was not found in an alternative: %v", param.ID), - Row: param.Pos.Row, - Col: param.Pos.Col, - }) - continue LOOP_RHS - } - if _, consumed := consumedOffsets[offset]; consumed { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrDuplicateElem, - Detail: param.ID, - Row: param.Pos.Row, - Col: param.Pos.Col, - }) - continue LOOP_RHS - } - consumedOffsets[offset] = struct{}{} - - if param.Expansion { - elem := alt.Elements[offset] - if elem.Pattern != "" { - // Currently, it is a bug to reach here because it is - // forbidden to have anything other than ID appear in - // production rules. - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrDirInvalidParam, - Detail: fmt.Sprintf("the expansion symbol cannot be applied to a pattern (%v: \"%v\")", param.ID, elem.Pattern), - Row: param.Pos.Row, - Col: param.Pos.Col, - }) - continue LOOP_RHS - } - elemSym, ok := symTab.ToSymbol(elem.ID) - if !ok { - // If the symbol was not found, it's a bug. - return nil, fmt.Errorf("a symbol corresponding to an ID (%v) was not found", elem.ID) - } - if elemSym.IsTerminal() { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrDirInvalidParam, - Detail: fmt.Sprintf("the expansion symbol cannot be applied to a terminal symbol (%v: %v)", param.ID, elem.ID), - Row: param.Pos.Row, - Col: param.Pos.Col, - }) - continue LOOP_RHS - } - } - - astAct[i] = &astActionEntry{ - position: offset + 1, - expansion: param.Expansion, - } - } - astActs[p.id] = astAct - case "prec": - if len(dir.Parameters) != 1 || (dir.Parameters[0].ID == "" && dir.Parameters[0].OrderedSymbol == "") { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrDirInvalidParam, - Detail: "'prec' directive needs just one ID parameter or ordered symbol", - Row: dir.Pos.Row, - Col: dir.Pos.Col, - }) - continue LOOP_RHS - } - param := dir.Parameters[0] - switch { - case param.ID != "": - sym, ok := symTab.ToSymbol(param.ID) - if !ok { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrDirInvalidParam, - Detail: fmt.Sprintf("unknown terminal symbol: %v", param.ID), - Row: param.Pos.Row, - Col: param.Pos.Col, - }) - continue LOOP_RHS - } - if sym == errSym { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrDirInvalidParam, - Detail: fmt.Sprintf("'%v' directive cannot be applied to an error symbol", dir.Name), - Row: param.Pos.Row, - Col: param.Pos.Col, - }) - } - if !sym.IsTerminal() { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrDirInvalidParam, - Detail: fmt.Sprintf("the symbol must be a terminal: %v", param.ID), - Row: param.Pos.Row, - Col: param.Pos.Col, - }) - continue LOOP_RHS - } - prodPrecsTerm[p.id] = sym - prodPrecPoss[p.id] = ¶m.Pos - case param.OrderedSymbol != "": - prodPrecsOrdSym[p.id] = param.OrderedSymbol - prodPrecPoss[p.id] = ¶m.Pos - } - case "recover": - if len(dir.Parameters) > 0 { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrDirInvalidParam, - Detail: "'recover' directive needs no parameter", - Row: dir.Pos.Row, - Col: dir.Pos.Col, - }) - continue LOOP_RHS - } - recoverProds[p.id] = struct{}{} - default: - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrDirInvalidName, - Detail: fmt.Sprintf("invalid directive name '%v'", dir.Name), - Row: dir.Pos.Row, - Col: dir.Pos.Col, - }) - continue LOOP_RHS - } - } - } - } - - return &productionsAndActions{ - prods: prods, - augStartSym: augStartSym, - astActs: astActs, - prodPrecsTerm: prodPrecsTerm, - prodPrecsOrdSym: prodPrecsOrdSym, - prodPrecPoss: prodPrecPoss, - recoverProds: recoverProds, - }, nil -} - -func (b *GrammarBuilder) genPrecAndAssoc(symTab *symbol.SymbolTableReader, errSym symbol.Symbol, prodsAndActs *productionsAndActions) (*precAndAssoc, error) { - termPrec := map[symbol.SymbolNum]int{} - termAssoc := map[symbol.SymbolNum]assocType{} - ordSymPrec := map[string]int{} - { - var precGroup []*parser.DirectiveNode - for _, dir := range b.AST.Directives { - if dir.Name == "prec" { - if dir.Parameters == nil || len(dir.Parameters) != 1 || dir.Parameters[0].Group == nil { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrDirInvalidParam, - Detail: "'prec' needs just one directive group", - Row: dir.Pos.Row, - Col: dir.Pos.Col, - }) - continue - } - precGroup = dir.Parameters[0].Group - continue - } - - if dir.Name != "name" && dir.Name != "prec" { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrDirInvalidName, - Detail: dir.Name, - Row: dir.Pos.Row, - Col: dir.Pos.Col, - }) - continue - } - } - - precN := precMin - for _, dir := range precGroup { - var assocTy assocType - switch dir.Name { - case "left": - assocTy = assocTypeLeft - case "right": - assocTy = assocTypeRight - case "assign": - assocTy = assocTypeNil - default: - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrDirInvalidName, - Detail: dir.Name, - Row: dir.Pos.Row, - Col: dir.Pos.Col, - }) - return nil, nil - } - - if len(dir.Parameters) == 0 { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrDirInvalidParam, - Detail: "associativity needs at least one symbol", - Row: dir.Pos.Row, - Col: dir.Pos.Col, - }) - return nil, nil - } - ASSOC_PARAM_LOOP: - for _, p := range dir.Parameters { - switch { - case p.ID != "": - sym, ok := symTab.ToSymbol(p.ID) - if !ok { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrDirInvalidParam, - Detail: fmt.Sprintf("'%v' is undefined", p.ID), - Row: p.Pos.Row, - Col: p.Pos.Col, - }) - return nil, nil - } - if sym == errSym { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrDirInvalidParam, - Detail: fmt.Sprintf("'%v' directive cannot be applied to an error symbol", dir.Name), - Row: p.Pos.Row, - Col: p.Pos.Col, - }) - return nil, nil - } - if !sym.IsTerminal() { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrDirInvalidParam, - Detail: fmt.Sprintf("associativity can take only terminal symbol ('%v' is a non-terminal)", p.ID), - Row: p.Pos.Row, - Col: p.Pos.Col, - }) - return nil, nil - } - if prec, alreadySet := termPrec[sym.Num()]; alreadySet { - if prec == precN { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrDuplicateAssoc, - Detail: fmt.Sprintf("'%v' already has the same associativity and precedence", p.ID), - Row: p.Pos.Row, - Col: p.Pos.Col, - }) - } else if assoc := termAssoc[sym.Num()]; assoc == assocTy { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrDuplicateAssoc, - Detail: fmt.Sprintf("'%v' already has different precedence", p.ID), - Row: p.Pos.Row, - Col: p.Pos.Col, - }) - } else { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrDuplicateAssoc, - Detail: fmt.Sprintf("'%v' already has different associativity and precedence", p.ID), - Row: p.Pos.Row, - Col: p.Pos.Col, - }) - } - break ASSOC_PARAM_LOOP - } - - termPrec[sym.Num()] = precN - termAssoc[sym.Num()] = assocTy - case p.OrderedSymbol != "": - if prec, alreadySet := ordSymPrec[p.OrderedSymbol]; alreadySet { - if prec == precN { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrDuplicateAssoc, - Detail: fmt.Sprintf("'$%v' already has the same precedence", p.OrderedSymbol), - Row: p.Pos.Row, - Col: p.Pos.Col, - }) - } else { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrDuplicateAssoc, - Detail: fmt.Sprintf("'$%v' already has different precedence", p.OrderedSymbol), - Row: p.Pos.Row, - Col: p.Pos.Col, - }) - } - break ASSOC_PARAM_LOOP - } - - ordSymPrec[p.OrderedSymbol] = precN - default: - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrDirInvalidParam, - Detail: "a parameter must be an ID or an ordered symbol", - Row: p.Pos.Row, - Col: p.Pos.Col, - }) - return nil, nil - } - } - - precN++ - } - } - if len(b.errs) > 0 { - return nil, nil - } - - prodPrec := map[productionNum]int{} - prodAssoc := map[productionNum]assocType{} - for _, prod := range prodsAndActs.prods.getAllProductions() { - // A #prec directive changes only precedence, not associativity. - if term, ok := prodsAndActs.prodPrecsTerm[prod.id]; ok { - if prec, ok := termPrec[term.Num()]; ok { - prodPrec[prod.num] = prec - prodAssoc[prod.num] = assocTypeNil - } else { - text, _ := symTab.ToText(term) - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrUndefinedPrec, - Detail: text, - Row: prodsAndActs.prodPrecPoss[prod.id].Row, - Col: prodsAndActs.prodPrecPoss[prod.id].Col, - }) - } - } else if ordSym, ok := prodsAndActs.prodPrecsOrdSym[prod.id]; ok { - if prec, ok := ordSymPrec[ordSym]; ok { - prodPrec[prod.num] = prec - prodAssoc[prod.num] = assocTypeNil - } else { - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrUndefinedOrdSym, - Detail: fmt.Sprintf("$%v", ordSym), - Row: prodsAndActs.prodPrecPoss[prod.id].Row, - Col: prodsAndActs.prodPrecPoss[prod.id].Col, - }) - } - } else { - // A production inherits precedence and associativity from the right-most terminal symbol. - mostrightTerm := symbol.SymbolNil - for _, sym := range prod.rhs { - if !sym.IsTerminal() { - continue - } - mostrightTerm = sym - } - if !mostrightTerm.IsNil() { - prodPrec[prod.num] = termPrec[mostrightTerm.Num()] - prodAssoc[prod.num] = termAssoc[mostrightTerm.Num()] - } - } - } - if len(b.errs) > 0 { - return nil, nil - } - - return &precAndAssoc{ - termPrec: termPrec, - termAssoc: termAssoc, - prodPrec: prodPrec, - prodAssoc: prodAssoc, - }, nil -} - -func compile(gram *Grammar, opts ...BuildOption) (*spec.CompiledGrammar, *spec.Report, error) { - config := &buildConfig{} - for _, opt := range opts { - opt(config) - } - - lexSpec, err, cErrs := lexical.Compile(gram.lexSpec, lexical.CompressionLevelMax) - if err != nil { - if len(cErrs) > 0 { - var b strings.Builder - writeCompileError(&b, cErrs[0]) - for _, cerr := range cErrs[1:] { - fmt.Fprintf(&b, "\n") - writeCompileError(&b, cerr) - } - return nil, nil, fmt.Errorf(b.String()) - } - return nil, nil, err - } - - kind2Term := make([]int, len(lexSpec.KindNames)) - for i, k := range lexSpec.KindNames { - if k == spec.LexKindNameNil { - kind2Term[spec.LexKindIDNil] = symbol.SymbolNil.Num().Int() - continue - } - - sym, ok := gram.symbolTable.ToSymbol(k.String()) - if !ok { - return nil, nil, fmt.Errorf("terminal symbol '%v' was not found in a symbol table", k) - } - kind2Term[i] = sym.Num().Int() - } - - termTexts, err := gram.symbolTable.TerminalTexts() - if err != nil { - return nil, nil, err - } - - var termSkip []int - { - r := gram.symbolTable.Reader() - // I want to use gram.symbolTable.terminalSymbols() here instead of gram.symbolTable.terminalTexts(), - // but gram.symbolTable.terminalSymbols() is different in length from terminalTexts - // because it does not contain a predefined symbol, like EOF. - // Therefore, we use terminalTexts, although it takes more time to lookup for symbols. - termSkip = make([]int, len(termTexts)) - for _, t := range termTexts { - s, _ := r.ToSymbol(t) - for _, sk := range gram.skipSymbols { - if s != sk { - continue - } - termSkip[s.Num()] = 1 - break - } - } - } - - nonTerms, err := gram.symbolTable.NonTerminalTexts() - if err != nil { - return nil, nil, err - } - - firstSet, err := genFirstSet(gram.productionSet) - if err != nil { - return nil, nil, err - } - - lr0, err := genLR0Automaton(gram.productionSet, gram.augmentedStartSymbol, gram.errorSymbol) - if err != nil { - return nil, nil, err - } - - var tab *ParsingTable - var report *spec.Report - { - lalr1, err := genLALR1Automaton(lr0, gram.productionSet, firstSet) - if err != nil { - return nil, nil, err - } - - b := &lrTableBuilder{ - automaton: lalr1.lr0Automaton, - prods: gram.productionSet, - termCount: len(termTexts), - nonTermCount: len(nonTerms), - symTab: gram.symbolTable, - precAndAssoc: gram.precAndAssoc, - } - tab, err = b.build() - if err != nil { - return nil, nil, err - } - - if config.isReportingEnabled { - report, err = b.genReport(tab, gram) - if err != nil { - return nil, nil, err - } - } - } - - action := make([]int, len(tab.actionTable)) - for i, e := range tab.actionTable { - action[i] = int(e) - } - goTo := make([]int, len(tab.goToTable)) - for i, e := range tab.goToTable { - goTo[i] = int(e) - } - - lhsSyms := make([]int, len(gram.productionSet.getAllProductions())+1) - altSymCounts := make([]int, len(gram.productionSet.getAllProductions())+1) - recoverProds := make([]int, len(gram.productionSet.getAllProductions())+1) - astActEnties := make([][]int, len(gram.productionSet.getAllProductions())+1) - for _, p := range gram.productionSet.getAllProductions() { - lhsSyms[p.num] = p.lhs.Num().Int() - altSymCounts[p.num] = p.rhsLen - - if _, ok := gram.recoverProductions[p.id]; ok { - recoverProds[p.num] = 1 - } - - astAct, ok := gram.astActions[p.id] - if !ok { - continue - } - astActEntry := make([]int, len(astAct)) - for i, e := range astAct { - if e.expansion { - astActEntry[i] = e.position * -1 - } else { - astActEntry[i] = e.position - } - } - astActEnties[p.num] = astActEntry - } - - return &spec.CompiledGrammar{ - Name: gram.name, - Lexical: lexSpec, - Syntactic: &spec.SyntacticSpec{ - Action: action, - GoTo: goTo, - StateCount: tab.stateCount, - InitialState: tab.InitialState.Int(), - StartProduction: productionNumStart.Int(), - LHSSymbols: lhsSyms, - AlternativeSymbolCounts: altSymCounts, - Terminals: termTexts, - TerminalCount: tab.terminalCount, - TerminalSkip: termSkip, - KindToTerminal: kind2Term, - NonTerminals: nonTerms, - NonTerminalCount: tab.nonTerminalCount, - EOFSymbol: symbol.SymbolEOF.Num().Int(), - ErrorSymbol: gram.errorSymbol.Num().Int(), - ErrorTrapperStates: tab.errorTrapperStates, - RecoverProductions: recoverProds, - }, - ASTAction: &spec.ASTAction{ - Entries: astActEnties, - }, - }, report, nil -} - -func writeCompileError(w io.Writer, cErr *lexical.CompileError) { - if cErr.Fragment { - fmt.Fprintf(w, "fragment ") - } - fmt.Fprintf(w, "%v: %v", cErr.Kind, cErr.Cause) - if cErr.Detail != "" { - fmt.Fprintf(w, ": %v", cErr.Detail) - } -} diff --git a/src/urubu/grammar/item.go b/src/urubu/grammar/item.go deleted file mode 100644 index 6c5fe42..0000000 --- a/src/urubu/grammar/item.go +++ /dev/null @@ -1,206 +0,0 @@ -package grammar - -import ( - "crypto/sha256" - "encoding/binary" - "fmt" - "sort" - "strconv" - - "urubu/grammar/symbol" -) - -type lrItemID [32]byte - -func (id lrItemID) String() string { - return fmt.Sprintf("%x", id.num()) -} - -func (id lrItemID) num() uint32 { - return binary.LittleEndian.Uint32(id[:]) -} - -type lookAhead struct { - symbols map[symbol.Symbol]struct{} - - // When propagation is true, an item propagates look-ahead symbols to other items. - propagation bool -} - -type lrItem struct { - id lrItemID - prod productionID - - // E → E + T - // - // Dot | Dotted Symbol | Item - // ----+---------------+------------ - // 0 | E | E →・E + T - // 1 | + | E → E・+ T - // 2 | T | E → E +・T - // 3 | Nil | E → E + T・ - dot int - dottedSymbol symbol.Symbol - - // When initial is true, the LHS of the production is the augmented start symbol and dot is 0. - // It looks like S' →・S. - initial bool - - // When reducible is true, the item looks like E → E + T・. - reducible bool - - // When kernel is true, the item is kernel item. - kernel bool - - // lookAhead stores look-ahead symbols, and they are terminal symbols. - // The item is reducible only when the look-ahead symbols appear as the next input symbol. - lookAhead lookAhead -} - -func newLR0Item(prod *production, dot int) (*lrItem, error) { - if prod == nil { - return nil, fmt.Errorf("production must be non-nil") - } - - if dot < 0 || dot > prod.rhsLen { - return nil, fmt.Errorf("dot must be between 0 and %v", prod.rhsLen) - } - - var id lrItemID - { - b := []byte{} - b = append(b, prod.id[:]...) - bDot := make([]byte, 8) - binary.LittleEndian.PutUint64(bDot, uint64(dot)) - b = append(b, bDot...) - id = sha256.Sum256(b) - } - - dottedSymbol := symbol.SymbolNil - if dot < prod.rhsLen { - dottedSymbol = prod.rhs[dot] - } - - initial := false - if prod.lhs.IsStart() && dot == 0 { - initial = true - } - - reducible := false - if dot == prod.rhsLen { - reducible = true - } - - kernel := false - if initial || dot > 0 { - kernel = true - } - - item := &lrItem{ - id: id, - prod: prod.id, - dot: dot, - dottedSymbol: dottedSymbol, - initial: initial, - reducible: reducible, - kernel: kernel, - } - - return item, nil -} - -type kernelID [32]byte - -func (id kernelID) String() string { - return fmt.Sprintf("%x", binary.LittleEndian.Uint32(id[:])) -} - -type kernel struct { - id kernelID - items []*lrItem -} - -func newKernel(items []*lrItem) (*kernel, error) { - if len(items) == 0 { - return nil, fmt.Errorf("a kernel need at least one item") - } - - // Remove duplicates from items. - var sortedItems []*lrItem - { - m := map[lrItemID]*lrItem{} - for _, item := range items { - if !item.kernel { - return nil, fmt.Errorf("not a kernel item: %v", item) - } - m[item.id] = item - } - sortedItems = []*lrItem{} - for _, item := range m { - sortedItems = append(sortedItems, item) - } - sort.Slice(sortedItems, func(i, j int) bool { - return sortedItems[i].id.num() < sortedItems[j].id.num() - }) - } - - var id kernelID - { - b := []byte{} - for _, item := range sortedItems { - b = append(b, item.id[:]...) - } - id = sha256.Sum256(b) - } - - return &kernel{ - id: id, - items: sortedItems, - }, nil -} - -type stateNum int - -const stateNumInitial = stateNum(0) - -func (n stateNum) Int() int { - return int(n) -} - -func (n stateNum) String() string { - return strconv.Itoa(int(n)) -} - -func (n stateNum) next() stateNum { - return stateNum(n + 1) -} - -type lrState struct { - *kernel - num stateNum - next map[symbol.Symbol]kernelID - reducible map[productionID]struct{} - - // emptyProdItems stores items that have an empty production like `p → ε` and is reducible. - // Thus the items emptyProdItems stores are like `p → ・ε`. emptyProdItems is needed to store - // look-ahead symbols because the kernel items don't include these items. - // - // For instance, we have the following productions, and A is a terminal symbol. - // - // s' → s - // s → A | ε - // - // CLOSURE({s' → ・s}) generates the following closure, but the kernel of this closure doesn't - // include `s → ・ε`. - // - // s' → ・s - // s → ・A - // s → ・ε - emptyProdItems []*lrItem - - // When isErrorTrapper is `true`, the item can shift the `error` symbol. The item has the following form. - // The `α` and `β` can be empty. - // - // A → α・error β - isErrorTrapper bool -} diff --git a/src/urubu/grammar/lalr1.go b/src/urubu/grammar/lalr1.go deleted file mode 100644 index 8373568..0000000 --- a/src/urubu/grammar/lalr1.go +++ /dev/null @@ -1,318 +0,0 @@ -package grammar - -import ( - "fmt" - - "urubu/grammar/symbol" -) - -type stateAndLRItem struct { - kernelID kernelID - itemID lrItemID -} - -type propagation struct { - src *stateAndLRItem - dest []*stateAndLRItem -} - -type lalr1Automaton struct { - *lr0Automaton -} - -func genLALR1Automaton(lr0 *lr0Automaton, prods *productionSet, first *firstSet) (*lalr1Automaton, error) { - // Set the look-ahead symbol <EOF> to the initial item: [S' → ・S, $] - iniState := lr0.states[lr0.initialState] - iniState.items[0].lookAhead.symbols = map[symbol.Symbol]struct{}{ - symbol.SymbolEOF: {}, - } - - var props []*propagation - for _, state := range lr0.states { - for _, kItem := range state.items { - items, err := genLALR1Closure(kItem, prods, first) - if err != nil { - return nil, err - } - - kItem.lookAhead.propagation = true - - var propDests []*stateAndLRItem - for _, item := range items { - if item.reducible { - p, ok := prods.findByID(item.prod) - if !ok { - return nil, fmt.Errorf("production not found: %v", item.prod) - } - - if p.isEmpty() { - var reducibleItem *lrItem - for _, it := range state.emptyProdItems { - if it.id != item.id { - continue - } - - reducibleItem = it - break - } - if reducibleItem == nil { - return nil, fmt.Errorf("reducible item not found: %v", item.id) - } - if reducibleItem.lookAhead.symbols == nil { - reducibleItem.lookAhead.symbols = map[symbol.Symbol]struct{}{} - } - for a := range item.lookAhead.symbols { - reducibleItem.lookAhead.symbols[a] = struct{}{} - } - - propDests = append(propDests, &stateAndLRItem{ - kernelID: state.id, - itemID: item.id, - }) - } - - continue - } - - nextKID := state.next[item.dottedSymbol] - var nextItemID lrItemID - { - p, ok := prods.findByID(item.prod) - if !ok { - return nil, fmt.Errorf("production not found: %v", item.prod) - } - it, err := newLR0Item(p, item.dot+1) - if err != nil { - return nil, fmt.Errorf("failed to generate an item ID: %v", err) - } - nextItemID = it.id - } - - if item.lookAhead.propagation { - propDests = append(propDests, &stateAndLRItem{ - kernelID: nextKID, - itemID: nextItemID, - }) - } else { - nextState := lr0.states[nextKID] - var nextItem *lrItem - for _, it := range nextState.items { - if it.id != nextItemID { - continue - } - nextItem = it - break - } - if nextItem == nil { - return nil, fmt.Errorf("item not found: %v", nextItemID) - } - - if nextItem.lookAhead.symbols == nil { - nextItem.lookAhead.symbols = map[symbol.Symbol]struct{}{} - } - - for a := range item.lookAhead.symbols { - nextItem.lookAhead.symbols[a] = struct{}{} - } - } - } - if len(propDests) == 0 { - continue - } - - props = append(props, &propagation{ - src: &stateAndLRItem{ - kernelID: state.id, - itemID: kItem.id, - }, - dest: propDests, - }) - } - } - - err := propagateLookAhead(lr0, props) - if err != nil { - return nil, fmt.Errorf("failed to propagate look-ahead symbols: %v", err) - } - - return &lalr1Automaton{ - lr0Automaton: lr0, - }, nil -} - -func genLALR1Closure(srcItem *lrItem, prods *productionSet, first *firstSet) ([]*lrItem, error) { - items := []*lrItem{} - knownItems := map[lrItemID]map[symbol.Symbol]struct{}{} - knownItemsProp := map[lrItemID]struct{}{} - uncheckedItems := []*lrItem{} - items = append(items, srcItem) - uncheckedItems = append(uncheckedItems, srcItem) - for len(uncheckedItems) > 0 { - nextUncheckedItems := []*lrItem{} - for _, item := range uncheckedItems { - if item.dottedSymbol.IsTerminal() { - continue - } - - p, ok := prods.findByID(item.prod) - if !ok { - return nil, fmt.Errorf("production not found: %v", item.prod) - } - - var fstSyms []symbol.Symbol - var isFstNullable bool - { - fst, err := first.find(p, item.dot+1) - if err != nil { - return nil, err - } - - fstSyms = make([]symbol.Symbol, len(fst.symbols)) - i := 0 - for s := range fst.symbols { - fstSyms[i] = s - i++ - } - if fst.empty { - isFstNullable = true - } - } - - ps, _ := prods.findByLHS(item.dottedSymbol) - for _, prod := range ps { - var lookAhead []symbol.Symbol - { - var lookAheadCount int - if isFstNullable { - lookAheadCount = len(fstSyms) + len(item.lookAhead.symbols) - } else { - lookAheadCount = len(fstSyms) - } - - lookAhead = make([]symbol.Symbol, lookAheadCount) - i := 0 - for _, s := range fstSyms { - lookAhead[i] = s - i++ - } - if isFstNullable { - for a := range item.lookAhead.symbols { - lookAhead[i] = a - i++ - } - } - } - - for _, a := range lookAhead { - newItem, err := newLR0Item(prod, 0) - if err != nil { - return nil, err - } - if items, exist := knownItems[newItem.id]; exist { - if _, exist := items[a]; exist { - continue - } - } - - newItem.lookAhead.symbols = map[symbol.Symbol]struct{}{ - a: {}, - } - - items = append(items, newItem) - if knownItems[newItem.id] == nil { - knownItems[newItem.id] = map[symbol.Symbol]struct{}{} - } - knownItems[newItem.id][a] = struct{}{} - nextUncheckedItems = append(nextUncheckedItems, newItem) - } - - if isFstNullable { - newItem, err := newLR0Item(prod, 0) - if err != nil { - return nil, err - } - if _, exist := knownItemsProp[newItem.id]; exist { - continue - } - - newItem.lookAhead.propagation = true - - items = append(items, newItem) - knownItemsProp[newItem.id] = struct{}{} - nextUncheckedItems = append(nextUncheckedItems, newItem) - } - } - } - uncheckedItems = nextUncheckedItems - } - - return items, nil -} - -func propagateLookAhead(lr0 *lr0Automaton, props []*propagation) error { - for { - changed := false - for _, prop := range props { - srcState, ok := lr0.states[prop.src.kernelID] - if !ok { - return fmt.Errorf("source state not found: %v", prop.src.kernelID) - } - var srcItem *lrItem - for _, item := range srcState.items { - if item.id != prop.src.itemID { - continue - } - srcItem = item - break - } - if srcItem == nil { - return fmt.Errorf("source item not found: %v", prop.src.itemID) - } - - for _, dest := range prop.dest { - destState, ok := lr0.states[dest.kernelID] - if !ok { - return fmt.Errorf("destination state not found: %v", dest.kernelID) - } - var destItem *lrItem - for _, item := range destState.items { - if item.id != dest.itemID { - continue - } - destItem = item - break - } - if destItem == nil { - for _, item := range destState.emptyProdItems { - if item.id != dest.itemID { - continue - } - destItem = item - break - } - if destItem == nil { - return fmt.Errorf("destination item not found: %v", dest.itemID) - } - } - - for a := range srcItem.lookAhead.symbols { - if _, ok := destItem.lookAhead.symbols[a]; ok { - continue - } - - if destItem.lookAhead.symbols == nil { - destItem.lookAhead.symbols = map[symbol.Symbol]struct{}{} - } - - destItem.lookAhead.symbols[a] = struct{}{} - changed = true - } - } - } - if !changed { - break - } - } - - return nil -} diff --git a/src/urubu/grammar/lexical/compiler.go b/src/urubu/grammar/lexical.go index 637018a..515e491 100644 --- a/src/urubu/grammar/lexical/compiler.go +++ b/src/urubu/grammar/lexical.go @@ -3,6 +3,8 @@ package lexical import ( "bytes" "fmt" + "sort" + "strings" "urubu/compressor" "urubu/grammar/lexical/dfa" @@ -411,3 +413,165 @@ func convertIntSliceToStateIDSlice(s []int) []spec.StateID { } return ss } + +type LexEntry struct { + Kind spec.LexKindName + Pattern string + Modes []spec.LexModeName + Push spec.LexModeName + Pop bool + Fragment bool +} + +type LexSpec struct { + Entries []*LexEntry +} + +func (s *LexSpec) Validate() error { + if len(s.Entries) <= 0 { + return fmt.Errorf("the lexical specification must have at least one entry") + } + { + ks := map[string]struct{}{} + fks := map[string]struct{}{} + for _, e := range s.Entries { + // Allow duplicate names between fragments and non-fragments. + if e.Fragment { + if _, exist := fks[e.Kind.String()]; exist { + return fmt.Errorf("kinds `%v` are duplicates", e.Kind) + } + fks[e.Kind.String()] = struct{}{} + } else { + if _, exist := ks[e.Kind.String()]; exist { + return fmt.Errorf("kinds `%v` are duplicates", e.Kind) + } + ks[e.Kind.String()] = struct{}{} + } + } + } + { + kinds := []string{} + modes := []string{ + spec.LexModeNameDefault.String(), // This is a predefined mode. + } + for _, e := range s.Entries { + if e.Fragment { + continue + } + + kinds = append(kinds, e.Kind.String()) + + for _, m := range e.Modes { + modes = append(modes, m.String()) + } + } + + kindErrs := findSpellingInconsistenciesErrors(kinds, nil) + modeErrs := findSpellingInconsistenciesErrors(modes, func(ids []string) error { + if SnakeCaseToUpperCamelCase(ids[0]) == SnakeCaseToUpperCamelCase(spec.LexModeNameDefault.String()) { + var b strings.Builder + fmt.Fprintf(&b, "%+v", ids[0]) + for _, id := range ids[1:] { + fmt.Fprintf(&b, ", %+v", id) + } + return fmt.Errorf("these identifiers are treated as the same. please use the same spelling as predefined '%v': %v", spec.LexModeNameDefault, b.String()) + } + return nil + }) + errs := append(kindErrs, modeErrs...) + if len(errs) > 0 { + var b strings.Builder + fmt.Fprintf(&b, "%v", errs[0]) + for _, err := range errs[1:] { + fmt.Fprintf(&b, "\n%v", err) + } + return fmt.Errorf(b.String()) + } + } + + return nil +} + +func findSpellingInconsistenciesErrors(ids []string, hook func(ids []string) error) []error { + duplicated := FindSpellingInconsistencies(ids) + if len(duplicated) == 0 { + return nil + } + + var errs []error + for _, dup := range duplicated { + if hook != nil { + err := hook(dup) + if err != nil { + errs = append(errs, err) + continue + } + } + + var b strings.Builder + fmt.Fprintf(&b, "%+v", dup[0]) + for _, id := range dup[1:] { + fmt.Fprintf(&b, ", %+v", id) + } + err := fmt.Errorf("these identifiers are treated as the same. please use the same spelling: %v", b.String()) + errs = append(errs, err) + } + + return errs +} + +// FindSpellingInconsistencies finds spelling inconsistencies in identifiers. The identifiers are considered to be the same +// if they are spelled the same when expressed in UpperCamelCase. For example, `left_paren` and `LeftParen` are spelled the same +// in UpperCamelCase. Thus they are considere to be spelling inconsistency. +func FindSpellingInconsistencies(ids []string) [][]string { + m := map[string][]string{} + for _, id := range removeDuplicates(ids) { + c := SnakeCaseToUpperCamelCase(id) + m[c] = append(m[c], id) + } + + var duplicated [][]string + for _, camels := range m { + if len(camels) == 1 { + continue + } + duplicated = append(duplicated, camels) + } + + for _, dup := range duplicated { + sort.Slice(dup, func(i, j int) bool { + return dup[i] < dup[j] + }) + } + sort.Slice(duplicated, func(i, j int) bool { + return duplicated[i][0] < duplicated[j][0] + }) + + return duplicated +} + +func removeDuplicates(s []string) []string { + m := map[string]struct{}{} + for _, v := range s { + m[v] = struct{}{} + } + + var unique []string + for v := range m { + unique = append(unique, v) + } + + return unique +} + +func SnakeCaseToUpperCamelCase(snake string) string { + elems := strings.Split(snake, "_") + for i, e := range elems { + if len(e) == 0 { + continue + } + elems[i] = strings.ToUpper(string(e[0])) + e[1:] + } + + return strings.Join(elems, "") +} diff --git a/src/urubu/grammar/lexical/dfa/tree.go b/src/urubu/grammar/lexical/dfa.go index 8a11aee..982420d 100644 --- a/src/urubu/grammar/lexical/dfa/tree.go +++ b/src/urubu/grammar/lexical/dfa.go @@ -1,15 +1,358 @@ package dfa import ( + "encoding/binary" "fmt" "io" "sort" + "strings" "urubu/grammar/lexical/parser" spec "urubu/spec/grammar" "urubu/utf8" ) +type symbolTable struct { + symPos2Byte map[symbolPosition]byteRange + endPos2ID map[symbolPosition]spec.LexModeKindID +} + +func genSymbolTable(root byteTree) *symbolTable { + symTab := &symbolTable{ + symPos2Byte: map[symbolPosition]byteRange{}, + endPos2ID: map[symbolPosition]spec.LexModeKindID{}, + } + return genSymTab(symTab, root) +} + +func genSymTab(symTab *symbolTable, node byteTree) *symbolTable { + if node == nil { + return symTab + } + + switch n := node.(type) { + case *symbolNode: + symTab.symPos2Byte[n.pos] = byteRange{ + from: n.from, + to: n.to, + } + case *endMarkerNode: + symTab.endPos2ID[n.pos] = n.id + default: + left, right := node.children() + genSymTab(symTab, left) + genSymTab(symTab, right) + } + return symTab +} + +type DFA struct { + States []string + InitialState string + AcceptingStatesTable map[string]spec.LexModeKindID + TransitionTable map[string][256]string +} + +func GenDFA(root byteTree, symTab *symbolTable) *DFA { + initialState := root.first() + initialStateHash := initialState.hash() + stateMap := map[string]*symbolPositionSet{ + initialStateHash: initialState, + } + tranTab := map[string][256]string{} + { + follow := genFollowTable(root) + unmarkedStates := map[string]*symbolPositionSet{ + initialStateHash: initialState, + } + for len(unmarkedStates) > 0 { + nextUnmarkedStates := map[string]*symbolPositionSet{} + for hash, state := range unmarkedStates { + tranTabOfState := [256]*symbolPositionSet{} + for _, pos := range state.set() { + if pos.isEndMark() { + continue + } + valRange := symTab.symPos2Byte[pos] + for symVal := valRange.from; symVal <= valRange.to; symVal++ { + if tranTabOfState[symVal] == nil { + tranTabOfState[symVal] = newSymbolPositionSet() + } + tranTabOfState[symVal].merge(follow[pos]) + } + } + for _, t := range tranTabOfState { + if t == nil { + continue + } + h := t.hash() + if _, ok := stateMap[h]; ok { + continue + } + stateMap[h] = t + nextUnmarkedStates[h] = t + } + tabOfState := [256]string{} + for v, t := range tranTabOfState { + if t == nil { + continue + } + tabOfState[v] = t.hash() + } + tranTab[hash] = tabOfState + } + unmarkedStates = nextUnmarkedStates + } + } + + accTab := map[string]spec.LexModeKindID{} + { + for h, s := range stateMap { + for _, pos := range s.set() { + if !pos.isEndMark() { + continue + } + priorID, ok := accTab[h] + if !ok { + accTab[h] = symTab.endPos2ID[pos] + } else { + id := symTab.endPos2ID[pos] + if id < priorID { + accTab[h] = id + } + } + } + } + } + + var states []string + { + for s := range stateMap { + states = append(states, s) + } + sort.Slice(states, func(i, j int) bool { + return states[i] < states[j] + }) + } + + return &DFA{ + States: states, + InitialState: initialStateHash, + AcceptingStatesTable: accTab, + TransitionTable: tranTab, + } +} + +func GenTransitionTable(dfa *DFA) (*spec.TransitionTable, error) { + stateHash2ID := map[string]spec.StateID{} + for i, s := range dfa.States { + // Since 0 represents an invalid value in a transition table, + // assign a number greater than or equal to 1 to states. + stateHash2ID[s] = spec.StateID(i + spec.StateIDMin.Int()) + } + + acc := make([]spec.LexModeKindID, len(dfa.States)+1) + for _, s := range dfa.States { + id, ok := dfa.AcceptingStatesTable[s] + if !ok { + continue + } + acc[stateHash2ID[s]] = id + } + + rowCount := len(dfa.States) + 1 + colCount := 256 + tran := make([]spec.StateID, rowCount*colCount) + for s, tab := range dfa.TransitionTable { + for v, to := range tab { + tran[stateHash2ID[s].Int()*256+v] = stateHash2ID[to] + } + } + + return &spec.TransitionTable{ + InitialStateID: stateHash2ID[dfa.InitialState], + AcceptingStates: acc, + UncompressedTransition: tran, + RowCount: rowCount, + ColCount: colCount, + }, nil +} + +type symbolPosition uint16 + +const ( + symbolPositionNil symbolPosition = 0x0000 + + symbolPositionMin uint16 = 0x0001 + symbolPositionMax uint16 = 0x7fff + + symbolPositionMaskSymbol uint16 = 0x0000 + symbolPositionMaskEndMark uint16 = 0x8000 + + symbolPositionMaskValue uint16 = 0x7fff +) + +func newSymbolPosition(n uint16, endMark bool) (symbolPosition, error) { + if n < symbolPositionMin || n > symbolPositionMax { + return symbolPositionNil, fmt.Errorf("symbol position must be within %v to %v: n: %v, endMark: %v", symbolPositionMin, symbolPositionMax, n, endMark) + } + if endMark { + return symbolPosition(n | symbolPositionMaskEndMark), nil + } + return symbolPosition(n | symbolPositionMaskSymbol), nil +} + +func (p symbolPosition) String() string { + if p.isEndMark() { + return fmt.Sprintf("end#%v", uint16(p)&symbolPositionMaskValue) + } + return fmt.Sprintf("sym#%v", uint16(p)&symbolPositionMaskValue) +} + +func (p symbolPosition) isEndMark() bool { + return uint16(p)&symbolPositionMaskEndMark > 1 +} + +func (p symbolPosition) describe() (uint16, bool) { + v := uint16(p) & symbolPositionMaskValue + if p.isEndMark() { + return v, true + } + return v, false +} + +type symbolPositionSet struct { + // `s` represents a set of symbol positions. + // However, immediately after adding a symbol position, the elements may be duplicated. + // When you need an aligned set with no duplicates, you can get such value via the set function. + s []symbolPosition + sorted bool +} + +func newSymbolPositionSet() *symbolPositionSet { + return &symbolPositionSet{ + s: []symbolPosition{}, + sorted: false, + } +} + +func (s *symbolPositionSet) String() string { + if len(s.s) <= 0 { + return "{}" + } + ps := s.sortAndRemoveDuplicates() + var b strings.Builder + fmt.Fprintf(&b, "{") + for i, p := range ps { + if i <= 0 { + fmt.Fprintf(&b, "%v", p) + continue + } + fmt.Fprintf(&b, ", %v", p) + } + fmt.Fprintf(&b, "}") + return b.String() +} + +func (s *symbolPositionSet) set() []symbolPosition { + s.sortAndRemoveDuplicates() + return s.s +} + +func (s *symbolPositionSet) add(pos symbolPosition) *symbolPositionSet { + s.s = append(s.s, pos) + s.sorted = false + return s +} + +func (s *symbolPositionSet) merge(t *symbolPositionSet) *symbolPositionSet { + s.s = append(s.s, t.s...) + s.sorted = false + return s +} + +func (s *symbolPositionSet) hash() string { + if len(s.s) <= 0 { + return "" + } + sorted := s.sortAndRemoveDuplicates() + var buf []byte + for _, p := range sorted { + b := make([]byte, 8) + binary.PutUvarint(b, uint64(p)) + buf = append(buf, b...) + } + // Convert to a string to be able to use it as a key of a map. + // But note this byte sequence is made from values of symbol positions, + // so this is not a well-formed UTF-8 sequence. + return string(buf) +} + +func (s *symbolPositionSet) sortAndRemoveDuplicates() []symbolPosition { + if s.sorted { + return s.s + } + + sortSymbolPositions(s.s, 0, len(s.s)-1) + + // Remove duplicates. + lastV := s.s[0] + nextIdx := 1 + for _, v := range s.s[1:] { + if v == lastV { + continue + } + s.s[nextIdx] = v + nextIdx++ + lastV = v + } + s.s = s.s[:nextIdx] + s.sorted = true + + return s.s +} + +// sortSymbolPositions sorts a slice of symbol positions as it uses quick sort. +func sortSymbolPositions(ps []symbolPosition, left, right int) { + if left >= right { + return + } + var pivot symbolPosition + { + // Use a median as a pivot. + p1 := ps[left] + p2 := ps[(left+right)/2] + p3 := ps[right] + if p1 > p2 { + p1, p2 = p2, p1 + } + if p2 > p3 { + p2 = p3 + if p1 > p2 { + p2 = p1 + } + } + pivot = p2 + } + i := left + j := right + for i <= j { + for ps[i] < pivot { + i++ + } + for ps[j] > pivot { + j-- + } + if i <= j { + ps[i], ps[j] = ps[j], ps[i] + i++ + j-- + } + } + sortSymbolPositions(ps, left, j) + sortSymbolPositions(ps, i, right) +} + type byteTree interface { fmt.Stringer children() (byteTree, byteTree) diff --git a/src/urubu/grammar/lexical/dfa/dfa.go b/src/urubu/grammar/lexical/dfa/dfa.go deleted file mode 100644 index 48bd8b4..0000000 --- a/src/urubu/grammar/lexical/dfa/dfa.go +++ /dev/null @@ -1,173 +0,0 @@ -package dfa - -import ( - "sort" - - spec "urubu/spec/grammar" -) - -type symbolTable struct { - symPos2Byte map[symbolPosition]byteRange - endPos2ID map[symbolPosition]spec.LexModeKindID -} - -func genSymbolTable(root byteTree) *symbolTable { - symTab := &symbolTable{ - symPos2Byte: map[symbolPosition]byteRange{}, - endPos2ID: map[symbolPosition]spec.LexModeKindID{}, - } - return genSymTab(symTab, root) -} - -func genSymTab(symTab *symbolTable, node byteTree) *symbolTable { - if node == nil { - return symTab - } - - switch n := node.(type) { - case *symbolNode: - symTab.symPos2Byte[n.pos] = byteRange{ - from: n.from, - to: n.to, - } - case *endMarkerNode: - symTab.endPos2ID[n.pos] = n.id - default: - left, right := node.children() - genSymTab(symTab, left) - genSymTab(symTab, right) - } - return symTab -} - -type DFA struct { - States []string - InitialState string - AcceptingStatesTable map[string]spec.LexModeKindID - TransitionTable map[string][256]string -} - -func GenDFA(root byteTree, symTab *symbolTable) *DFA { - initialState := root.first() - initialStateHash := initialState.hash() - stateMap := map[string]*symbolPositionSet{ - initialStateHash: initialState, - } - tranTab := map[string][256]string{} - { - follow := genFollowTable(root) - unmarkedStates := map[string]*symbolPositionSet{ - initialStateHash: initialState, - } - for len(unmarkedStates) > 0 { - nextUnmarkedStates := map[string]*symbolPositionSet{} - for hash, state := range unmarkedStates { - tranTabOfState := [256]*symbolPositionSet{} - for _, pos := range state.set() { - if pos.isEndMark() { - continue - } - valRange := symTab.symPos2Byte[pos] - for symVal := valRange.from; symVal <= valRange.to; symVal++ { - if tranTabOfState[symVal] == nil { - tranTabOfState[symVal] = newSymbolPositionSet() - } - tranTabOfState[symVal].merge(follow[pos]) - } - } - for _, t := range tranTabOfState { - if t == nil { - continue - } - h := t.hash() - if _, ok := stateMap[h]; ok { - continue - } - stateMap[h] = t - nextUnmarkedStates[h] = t - } - tabOfState := [256]string{} - for v, t := range tranTabOfState { - if t == nil { - continue - } - tabOfState[v] = t.hash() - } - tranTab[hash] = tabOfState - } - unmarkedStates = nextUnmarkedStates - } - } - - accTab := map[string]spec.LexModeKindID{} - { - for h, s := range stateMap { - for _, pos := range s.set() { - if !pos.isEndMark() { - continue - } - priorID, ok := accTab[h] - if !ok { - accTab[h] = symTab.endPos2ID[pos] - } else { - id := symTab.endPos2ID[pos] - if id < priorID { - accTab[h] = id - } - } - } - } - } - - var states []string - { - for s := range stateMap { - states = append(states, s) - } - sort.Slice(states, func(i, j int) bool { - return states[i] < states[j] - }) - } - - return &DFA{ - States: states, - InitialState: initialStateHash, - AcceptingStatesTable: accTab, - TransitionTable: tranTab, - } -} - -func GenTransitionTable(dfa *DFA) (*spec.TransitionTable, error) { - stateHash2ID := map[string]spec.StateID{} - for i, s := range dfa.States { - // Since 0 represents an invalid value in a transition table, - // assign a number greater than or equal to 1 to states. - stateHash2ID[s] = spec.StateID(i + spec.StateIDMin.Int()) - } - - acc := make([]spec.LexModeKindID, len(dfa.States)+1) - for _, s := range dfa.States { - id, ok := dfa.AcceptingStatesTable[s] - if !ok { - continue - } - acc[stateHash2ID[s]] = id - } - - rowCount := len(dfa.States) + 1 - colCount := 256 - tran := make([]spec.StateID, rowCount*colCount) - for s, tab := range dfa.TransitionTable { - for v, to := range tab { - tran[stateHash2ID[s].Int()*256+v] = stateHash2ID[to] - } - } - - return &spec.TransitionTable{ - InitialStateID: stateHash2ID[dfa.InitialState], - AcceptingStates: acc, - UncompressedTransition: tran, - RowCount: rowCount, - ColCount: colCount, - }, nil -} diff --git a/src/urubu/grammar/lexical/dfa/symbol_position.go b/src/urubu/grammar/lexical/dfa/symbol_position.go deleted file mode 100644 index f154251..0000000 --- a/src/urubu/grammar/lexical/dfa/symbol_position.go +++ /dev/null @@ -1,182 +0,0 @@ -package dfa - -import ( - "encoding/binary" - "fmt" - "strings" -) - -type symbolPosition uint16 - -const ( - symbolPositionNil symbolPosition = 0x0000 - - symbolPositionMin uint16 = 0x0001 - symbolPositionMax uint16 = 0x7fff - - symbolPositionMaskSymbol uint16 = 0x0000 - symbolPositionMaskEndMark uint16 = 0x8000 - - symbolPositionMaskValue uint16 = 0x7fff -) - -func newSymbolPosition(n uint16, endMark bool) (symbolPosition, error) { - if n < symbolPositionMin || n > symbolPositionMax { - return symbolPositionNil, fmt.Errorf("symbol position must be within %v to %v: n: %v, endMark: %v", symbolPositionMin, symbolPositionMax, n, endMark) - } - if endMark { - return symbolPosition(n | symbolPositionMaskEndMark), nil - } - return symbolPosition(n | symbolPositionMaskSymbol), nil -} - -func (p symbolPosition) String() string { - if p.isEndMark() { - return fmt.Sprintf("end#%v", uint16(p)&symbolPositionMaskValue) - } - return fmt.Sprintf("sym#%v", uint16(p)&symbolPositionMaskValue) -} - -func (p symbolPosition) isEndMark() bool { - return uint16(p)&symbolPositionMaskEndMark > 1 -} - -func (p symbolPosition) describe() (uint16, bool) { - v := uint16(p) & symbolPositionMaskValue - if p.isEndMark() { - return v, true - } - return v, false -} - -type symbolPositionSet struct { - // `s` represents a set of symbol positions. - // However, immediately after adding a symbol position, the elements may be duplicated. - // When you need an aligned set with no duplicates, you can get such value via the set function. - s []symbolPosition - sorted bool -} - -func newSymbolPositionSet() *symbolPositionSet { - return &symbolPositionSet{ - s: []symbolPosition{}, - sorted: false, - } -} - -func (s *symbolPositionSet) String() string { - if len(s.s) <= 0 { - return "{}" - } - ps := s.sortAndRemoveDuplicates() - var b strings.Builder - fmt.Fprintf(&b, "{") - for i, p := range ps { - if i <= 0 { - fmt.Fprintf(&b, "%v", p) - continue - } - fmt.Fprintf(&b, ", %v", p) - } - fmt.Fprintf(&b, "}") - return b.String() -} - -func (s *symbolPositionSet) set() []symbolPosition { - s.sortAndRemoveDuplicates() - return s.s -} - -func (s *symbolPositionSet) add(pos symbolPosition) *symbolPositionSet { - s.s = append(s.s, pos) - s.sorted = false - return s -} - -func (s *symbolPositionSet) merge(t *symbolPositionSet) *symbolPositionSet { - s.s = append(s.s, t.s...) - s.sorted = false - return s -} - -func (s *symbolPositionSet) hash() string { - if len(s.s) <= 0 { - return "" - } - sorted := s.sortAndRemoveDuplicates() - var buf []byte - for _, p := range sorted { - b := make([]byte, 8) - binary.PutUvarint(b, uint64(p)) - buf = append(buf, b...) - } - // Convert to a string to be able to use it as a key of a map. - // But note this byte sequence is made from values of symbol positions, - // so this is not a well-formed UTF-8 sequence. - return string(buf) -} - -func (s *symbolPositionSet) sortAndRemoveDuplicates() []symbolPosition { - if s.sorted { - return s.s - } - - sortSymbolPositions(s.s, 0, len(s.s)-1) - - // Remove duplicates. - lastV := s.s[0] - nextIdx := 1 - for _, v := range s.s[1:] { - if v == lastV { - continue - } - s.s[nextIdx] = v - nextIdx++ - lastV = v - } - s.s = s.s[:nextIdx] - s.sorted = true - - return s.s -} - -// sortSymbolPositions sorts a slice of symbol positions as it uses quick sort. -func sortSymbolPositions(ps []symbolPosition, left, right int) { - if left >= right { - return - } - var pivot symbolPosition - { - // Use a median as a pivot. - p1 := ps[left] - p2 := ps[(left+right)/2] - p3 := ps[right] - if p1 > p2 { - p1, p2 = p2, p1 - } - if p2 > p3 { - p2 = p3 - if p1 > p2 { - p2 = p1 - } - } - pivot = p2 - } - i := left - j := right - for i <= j { - for ps[i] < pivot { - i++ - } - for ps[j] > pivot { - j-- - } - if i <= j { - ps[i], ps[j] = ps[j], ps[i] - i++ - j-- - } - } - sortSymbolPositions(ps, left, j) - sortSymbolPositions(ps, i, right) -} diff --git a/src/urubu/grammar/lexical/entry.go b/src/urubu/grammar/lexical/entry.go deleted file mode 100644 index 44af8ea..0000000 --- a/src/urubu/grammar/lexical/entry.go +++ /dev/null @@ -1,171 +0,0 @@ -package lexical - -import ( - "fmt" - "sort" - "strings" - - spec "urubu/spec/grammar" -) - -type LexEntry struct { - Kind spec.LexKindName - Pattern string - Modes []spec.LexModeName - Push spec.LexModeName - Pop bool - Fragment bool -} - -type LexSpec struct { - Entries []*LexEntry -} - -func (s *LexSpec) Validate() error { - if len(s.Entries) <= 0 { - return fmt.Errorf("the lexical specification must have at least one entry") - } - { - ks := map[string]struct{}{} - fks := map[string]struct{}{} - for _, e := range s.Entries { - // Allow duplicate names between fragments and non-fragments. - if e.Fragment { - if _, exist := fks[e.Kind.String()]; exist { - return fmt.Errorf("kinds `%v` are duplicates", e.Kind) - } - fks[e.Kind.String()] = struct{}{} - } else { - if _, exist := ks[e.Kind.String()]; exist { - return fmt.Errorf("kinds `%v` are duplicates", e.Kind) - } - ks[e.Kind.String()] = struct{}{} - } - } - } - { - kinds := []string{} - modes := []string{ - spec.LexModeNameDefault.String(), // This is a predefined mode. - } - for _, e := range s.Entries { - if e.Fragment { - continue - } - - kinds = append(kinds, e.Kind.String()) - - for _, m := range e.Modes { - modes = append(modes, m.String()) - } - } - - kindErrs := findSpellingInconsistenciesErrors(kinds, nil) - modeErrs := findSpellingInconsistenciesErrors(modes, func(ids []string) error { - if SnakeCaseToUpperCamelCase(ids[0]) == SnakeCaseToUpperCamelCase(spec.LexModeNameDefault.String()) { - var b strings.Builder - fmt.Fprintf(&b, "%+v", ids[0]) - for _, id := range ids[1:] { - fmt.Fprintf(&b, ", %+v", id) - } - return fmt.Errorf("these identifiers are treated as the same. please use the same spelling as predefined '%v': %v", spec.LexModeNameDefault, b.String()) - } - return nil - }) - errs := append(kindErrs, modeErrs...) - if len(errs) > 0 { - var b strings.Builder - fmt.Fprintf(&b, "%v", errs[0]) - for _, err := range errs[1:] { - fmt.Fprintf(&b, "\n%v", err) - } - return fmt.Errorf(b.String()) - } - } - - return nil -} - -func findSpellingInconsistenciesErrors(ids []string, hook func(ids []string) error) []error { - duplicated := FindSpellingInconsistencies(ids) - if len(duplicated) == 0 { - return nil - } - - var errs []error - for _, dup := range duplicated { - if hook != nil { - err := hook(dup) - if err != nil { - errs = append(errs, err) - continue - } - } - - var b strings.Builder - fmt.Fprintf(&b, "%+v", dup[0]) - for _, id := range dup[1:] { - fmt.Fprintf(&b, ", %+v", id) - } - err := fmt.Errorf("these identifiers are treated as the same. please use the same spelling: %v", b.String()) - errs = append(errs, err) - } - - return errs -} - -// FindSpellingInconsistencies finds spelling inconsistencies in identifiers. The identifiers are considered to be the same -// if they are spelled the same when expressed in UpperCamelCase. For example, `left_paren` and `LeftParen` are spelled the same -// in UpperCamelCase. Thus they are considere to be spelling inconsistency. -func FindSpellingInconsistencies(ids []string) [][]string { - m := map[string][]string{} - for _, id := range removeDuplicates(ids) { - c := SnakeCaseToUpperCamelCase(id) - m[c] = append(m[c], id) - } - - var duplicated [][]string - for _, camels := range m { - if len(camels) == 1 { - continue - } - duplicated = append(duplicated, camels) - } - - for _, dup := range duplicated { - sort.Slice(dup, func(i, j int) bool { - return dup[i] < dup[j] - }) - } - sort.Slice(duplicated, func(i, j int) bool { - return duplicated[i][0] < duplicated[j][0] - }) - - return duplicated -} - -func removeDuplicates(s []string) []string { - m := map[string]struct{}{} - for _, v := range s { - m[v] = struct{}{} - } - - var unique []string - for v := range m { - unique = append(unique, v) - } - - return unique -} - -func SnakeCaseToUpperCamelCase(snake string) string { - elems := strings.Split(snake, "_") - for i, e := range elems { - if len(e) == 0 { - continue - } - elems[i] = strings.ToUpper(string(e[0])) + e[1:] - } - - return strings.Join(elems, "") -} diff --git a/src/urubu/grammar/lexical/parser.go b/src/urubu/grammar/lexical/parser.go new file mode 100644 index 0000000..748e8fe --- /dev/null +++ b/src/urubu/grammar/lexical/parser.go @@ -0,0 +1,1668 @@ +package parser + +import ( + "bufio" + "bytes" + "fmt" + "io" + "sort" + "strconv" + "strings" + + spec "urubu/spec/grammar" + "urubu/ucd" +) + +var ( + ParseErr = fmt.Errorf("parse error") + + // lexical errors + synErrIncompletedEscSeq = fmt.Errorf("incompleted escape sequence; unexpected EOF following \\") + synErrInvalidEscSeq = fmt.Errorf("invalid escape sequence") + synErrInvalidCodePoint = fmt.Errorf("code points must consist of just 4 or 6 hex digits") + synErrCharPropInvalidSymbol = fmt.Errorf("invalid character property symbol") + SynErrFragmentInvalidSymbol = fmt.Errorf("invalid fragment symbol") + + // syntax errors + synErrUnexpectedToken = fmt.Errorf("unexpected token") + synErrNullPattern = fmt.Errorf("a pattern must be a non-empty byte sequence") + synErrUnmatchablePattern = fmt.Errorf("a pattern cannot match any characters") + synErrAltLackOfOperand = fmt.Errorf("an alternation expression must have operands") + synErrRepNoTarget = fmt.Errorf("a repeat expression must have an operand") + synErrGroupNoElem = fmt.Errorf("a grouping expression must include at least one character") + synErrGroupUnclosed = fmt.Errorf("unclosed grouping expression") + synErrGroupNoInitiator = fmt.Errorf(") needs preceding (") + synErrGroupInvalidForm = fmt.Errorf("invalid grouping expression") + synErrBExpNoElem = fmt.Errorf("a bracket expression must include at least one character") + synErrBExpUnclosed = fmt.Errorf("unclosed bracket expression") + synErrBExpInvalidForm = fmt.Errorf("invalid bracket expression") + synErrRangeInvalidOrder = fmt.Errorf("a range expression with invalid order") + synErrRangePropIsUnavailable = fmt.Errorf("a property expression is unavailable in a range expression") + synErrRangeInvalidForm = fmt.Errorf("invalid range expression") + synErrCPExpInvalidForm = fmt.Errorf("invalid code point expression") + synErrCPExpOutOfRange = fmt.Errorf("a code point must be between U+0000 to U+10FFFF") + synErrCharPropExpInvalidForm = fmt.Errorf("invalid character property expression") + synErrCharPropUnsupported = fmt.Errorf("unsupported character property") + synErrFragmentExpInvalidForm = fmt.Errorf("invalid fragment expression") +) + +type incompleteFragment struct { + kind spec.LexKindName + root *rootNode +} + +func CompleteFragments(fragments map[spec.LexKindName]CPTree) error { + if len(fragments) == 0 { + return nil + } + + completeFragments := map[spec.LexKindName]CPTree{} + incompleteFragments := []*incompleteFragment{} + for kind, tree := range fragments { + root, ok := tree.(*rootNode) + if !ok { + return fmt.Errorf("CompleteFragments can take only *rootNode: %T", tree) + } + if root.incomplete() { + incompleteFragments = append(incompleteFragments, &incompleteFragment{ + kind: kind, + root: root, + }) + } else { + completeFragments[kind] = root + } + } + for len(incompleteFragments) > 0 { + lastIncompCount := len(incompleteFragments) + remainingFragments := []*incompleteFragment{} + for _, e := range incompleteFragments { + complete, err := ApplyFragments(e.root, completeFragments) + if err != nil { + return err + } + if !complete { + remainingFragments = append(remainingFragments, e) + } else { + completeFragments[e.kind] = e.root + } + } + incompleteFragments = remainingFragments + if len(incompleteFragments) == lastIncompCount { + return ParseErr + } + } + + return nil +} + +func ApplyFragments(t CPTree, fragments map[spec.LexKindName]CPTree) (bool, error) { + root, ok := t.(*rootNode) + if !ok { + return false, fmt.Errorf("ApplyFragments can take only *rootNode type: %T", t) + } + + for name, frag := range fragments { + err := root.applyFragment(name, frag) + if err != nil { + return false, err + } + } + + return !root.incomplete(), nil +} + +type tokenKind string + +const ( + tokenKindChar tokenKind = "char" + tokenKindAnyChar tokenKind = "." + tokenKindRepeat tokenKind = "*" + tokenKindRepeatOneOrMore tokenKind = "+" + tokenKindOption tokenKind = "?" + tokenKindAlt tokenKind = "|" + tokenKindGroupOpen tokenKind = "(" + tokenKindGroupClose tokenKind = ")" + tokenKindBExpOpen tokenKind = "[" + tokenKindInverseBExpOpen tokenKind = "[^" + tokenKindBExpClose tokenKind = "]" + tokenKindCharRange tokenKind = "-" + tokenKindCodePointLeader tokenKind = "\\u" + tokenKindCharPropLeader tokenKind = "\\p" + tokenKindFragmentLeader tokenKind = "\\f" + tokenKindLBrace tokenKind = "{" + tokenKindRBrace tokenKind = "}" + tokenKindEqual tokenKind = "=" + tokenKindCodePoint tokenKind = "code point" + tokenKindCharPropSymbol tokenKind = "character property symbol" + tokenKindFragmentSymbol tokenKind = "fragment symbol" + tokenKindEOF tokenKind = "eof" +) + +type token struct { + kind tokenKind + char rune + propSymbol string + codePoint string + fragmentSymbol string +} + +const nullChar = '\u0000' + +func newToken(kind tokenKind, char rune) *token { + return &token{ + kind: kind, + char: char, + } +} + +func newCodePointToken(codePoint string) *token { + return &token{ + kind: tokenKindCodePoint, + codePoint: codePoint, + } +} + +func newCharPropSymbolToken(propSymbol string) *token { + return &token{ + kind: tokenKindCharPropSymbol, + propSymbol: propSymbol, + } +} + +func newFragmentSymbolToken(fragmentSymbol string) *token { + return &token{ + kind: tokenKindFragmentSymbol, + fragmentSymbol: fragmentSymbol, + } +} + +type lexerMode string + +const ( + lexerModeDefault lexerMode = "default" + lexerModeBExp lexerMode = "bracket expression" + lexerModeCPExp lexerMode = "code point expression" + lexerModeCharPropExp lexerMode = "character property expression" + lexerModeFragmentExp lexerMode = "fragment expression" +) + +type lexerModeStack struct { + stack []lexerMode +} + +func newLexerModeStack() *lexerModeStack { + return &lexerModeStack{ + stack: []lexerMode{ + lexerModeDefault, + }, + } +} + +func (s *lexerModeStack) top() lexerMode { + return s.stack[len(s.stack)-1] +} + +func (s *lexerModeStack) push(m lexerMode) { + s.stack = append(s.stack, m) +} + +func (s *lexerModeStack) pop() { + s.stack = s.stack[:len(s.stack)-1] +} + +type rangeState string + +// [a-z] +// ^^^^ +// |||`-- ready +// ||`-- expect range terminator +// |`-- read range initiator +// `-- ready +const ( + rangeStateReady rangeState = "ready" + rangeStateReadRangeInitiator rangeState = "read range initiator" + rangeStateExpectRangeTerminator rangeState = "expect range terminator" +) + +type lexer struct { + src *bufio.Reader + peekChar2 rune + peekEOF2 bool + peekChar1 rune + peekEOF1 bool + lastChar rune + reachedEOF bool + prevChar1 rune + prevEOF1 bool + prevChar2 rune + pervEOF2 bool + modeStack *lexerModeStack + rangeState rangeState + + errCause error + errDetail string +} + +func newLexer(src io.Reader) *lexer { + return &lexer{ + src: bufio.NewReader(src), + peekChar2: nullChar, + peekEOF2: false, + peekChar1: nullChar, + peekEOF1: false, + lastChar: nullChar, + reachedEOF: false, + prevChar1: nullChar, + prevEOF1: false, + prevChar2: nullChar, + pervEOF2: false, + modeStack: newLexerModeStack(), + rangeState: rangeStateReady, + } +} + +func (l *lexer) error() (string, error) { + return l.errDetail, l.errCause +} + +func (l *lexer) next() (*token, error) { + c, eof, err := l.read() + if err != nil { + return nil, err + } + if eof { + return newToken(tokenKindEOF, nullChar), nil + } + + switch l.modeStack.top() { + case lexerModeBExp: + tok, err := l.nextInBExp(c) + if err != nil { + return nil, err + } + if tok.kind == tokenKindChar || tok.kind == tokenKindCodePointLeader || tok.kind == tokenKindCharPropLeader { + switch l.rangeState { + case rangeStateReady: + l.rangeState = rangeStateReadRangeInitiator + case rangeStateExpectRangeTerminator: + l.rangeState = rangeStateReady + } + } + switch tok.kind { + case tokenKindBExpClose: + l.modeStack.pop() + case tokenKindCharRange: + l.rangeState = rangeStateExpectRangeTerminator + case tokenKindCodePointLeader: + l.modeStack.push(lexerModeCPExp) + case tokenKindCharPropLeader: + l.modeStack.push(lexerModeCharPropExp) + } + return tok, nil + case lexerModeCPExp: + tok, err := l.nextInCodePoint(c) + if err != nil { + return nil, err + } + switch tok.kind { + case tokenKindRBrace: + l.modeStack.pop() + } + return tok, nil + case lexerModeCharPropExp: + tok, err := l.nextInCharProp(c) + if err != nil { + return nil, err + } + switch tok.kind { + case tokenKindRBrace: + l.modeStack.pop() + } + return tok, nil + case lexerModeFragmentExp: + tok, err := l.nextInFragment(c) + if err != nil { + return nil, err + } + switch tok.kind { + case tokenKindRBrace: + l.modeStack.pop() + } + return tok, nil + default: + tok, err := l.nextInDefault(c) + if err != nil { + return nil, err + } + switch tok.kind { + case tokenKindBExpOpen: + l.modeStack.push(lexerModeBExp) + l.rangeState = rangeStateReady + case tokenKindInverseBExpOpen: + l.modeStack.push(lexerModeBExp) + l.rangeState = rangeStateReady + case tokenKindCodePointLeader: + l.modeStack.push(lexerModeCPExp) + case tokenKindCharPropLeader: + l.modeStack.push(lexerModeCharPropExp) + case tokenKindFragmentLeader: + l.modeStack.push(lexerModeFragmentExp) + } + return tok, nil + } +} + +func (l *lexer) nextInDefault(c rune) (*token, error) { + switch c { + case '*': + return newToken(tokenKindRepeat, nullChar), nil + case '+': + return newToken(tokenKindRepeatOneOrMore, nullChar), nil + case '?': + return newToken(tokenKindOption, nullChar), nil + case '.': + return newToken(tokenKindAnyChar, nullChar), nil + case '|': + return newToken(tokenKindAlt, nullChar), nil + case '(': + return newToken(tokenKindGroupOpen, nullChar), nil + case ')': + return newToken(tokenKindGroupClose, nullChar), nil + case '[': + c1, eof, err := l.read() + if err != nil { + return nil, err + } + if eof { + err := l.restore() + if err != nil { + return nil, err + } + return newToken(tokenKindBExpOpen, nullChar), nil + } + if c1 != '^' { + err := l.restore() + if err != nil { + return nil, err + } + return newToken(tokenKindBExpOpen, nullChar), nil + } + c2, eof, err := l.read() + if err != nil { + return nil, err + } + if eof { + err := l.restore() + if err != nil { + return nil, err + } + return newToken(tokenKindInverseBExpOpen, nullChar), nil + } + if c2 != ']' { + err := l.restore() + if err != nil { + return nil, err + } + return newToken(tokenKindInverseBExpOpen, nullChar), nil + } + err = l.restore() + if err != nil { + return nil, err + } + err = l.restore() + if err != nil { + return nil, err + } + return newToken(tokenKindBExpOpen, nullChar), nil + case '\\': + c, eof, err := l.read() + if err != nil { + return nil, err + } + if eof { + l.errCause = synErrIncompletedEscSeq + return nil, ParseErr + } + if c == 'u' { + return newToken(tokenKindCodePointLeader, nullChar), nil + } + if c == 'p' { + return newToken(tokenKindCharPropLeader, nullChar), nil + } + if c == 'f' { + return newToken(tokenKindFragmentLeader, nullChar), nil + } + if c == '\\' || c == '.' || c == '*' || c == '+' || c == '?' || c == '|' || c == '(' || c == ')' || c == '[' || c == ']' { + return newToken(tokenKindChar, c), nil + } + l.errCause = synErrInvalidEscSeq + l.errDetail = fmt.Sprintf("\\%v is not supported", string(c)) + return nil, ParseErr + default: + return newToken(tokenKindChar, c), nil + } +} + +func (l *lexer) nextInBExp(c rune) (*token, error) { + switch c { + case '-': + if l.rangeState != rangeStateReadRangeInitiator { + return newToken(tokenKindChar, c), nil + } + c1, eof, err := l.read() + if err != nil { + return nil, err + } + if eof { + err := l.restore() + if err != nil { + return nil, err + } + return newToken(tokenKindChar, c), nil + } + if c1 != ']' { + err := l.restore() + if err != nil { + return nil, err + } + return newToken(tokenKindCharRange, nullChar), nil + } + err = l.restore() + if err != nil { + return nil, err + } + return newToken(tokenKindChar, c), nil + case ']': + return newToken(tokenKindBExpClose, nullChar), nil + case '\\': + c, eof, err := l.read() + if err != nil { + return nil, err + } + if eof { + l.errCause = synErrIncompletedEscSeq + return nil, ParseErr + } + if c == 'u' { + return newToken(tokenKindCodePointLeader, nullChar), nil + } + if c == 'p' { + return newToken(tokenKindCharPropLeader, nullChar), nil + } + if c == '\\' || c == '^' || c == '-' || c == ']' { + return newToken(tokenKindChar, c), nil + } + l.errCause = synErrInvalidEscSeq + l.errDetail = fmt.Sprintf("\\%v is not supported in a bracket expression", string(c)) + return nil, ParseErr + default: + return newToken(tokenKindChar, c), nil + } +} + +func (l *lexer) nextInCodePoint(c rune) (*token, error) { + switch c { + case '{': + return newToken(tokenKindLBrace, nullChar), nil + case '}': + return newToken(tokenKindRBrace, nullChar), nil + default: + if !isHexDigit(c) { + l.errCause = synErrInvalidCodePoint + return nil, ParseErr + } + var b strings.Builder + fmt.Fprint(&b, string(c)) + n := 1 + for { + c, eof, err := l.read() + if err != nil { + return nil, err + } + if eof { + err := l.restore() + if err != nil { + return nil, err + } + break + } + if c == '}' { + err := l.restore() + if err != nil { + return nil, err + } + break + } + if !isHexDigit(c) || n >= 6 { + l.errCause = synErrInvalidCodePoint + return nil, ParseErr + } + fmt.Fprint(&b, string(c)) + n++ + } + cp := b.String() + cpLen := len(cp) + if !(cpLen == 4 || cpLen == 6) { + l.errCause = synErrInvalidCodePoint + return nil, ParseErr + } + return newCodePointToken(b.String()), nil + } +} + +func isHexDigit(c rune) bool { + if c >= '0' && c <= '9' || c >= 'A' && c <= 'Z' || c >= 'a' && c <= 'z' { + return true + } + return false +} + +func (l *lexer) nextInCharProp(c rune) (*token, error) { + switch c { + case '{': + return newToken(tokenKindLBrace, nullChar), nil + case '}': + return newToken(tokenKindRBrace, nullChar), nil + case '=': + return newToken(tokenKindEqual, nullChar), nil + default: + var b strings.Builder + fmt.Fprint(&b, string(c)) + n := 1 + for { + c, eof, err := l.read() + if err != nil { + return nil, err + } + if eof { + err := l.restore() + if err != nil { + return nil, err + } + break + } + if c == '}' || c == '=' { + err := l.restore() + if err != nil { + return nil, err + } + break + } + fmt.Fprint(&b, string(c)) + n++ + } + sym := strings.TrimSpace(b.String()) + if len(sym) == 0 { + l.errCause = synErrCharPropInvalidSymbol + return nil, ParseErr + } + return newCharPropSymbolToken(sym), nil + } +} + +func (l *lexer) nextInFragment(c rune) (*token, error) { + switch c { + case '{': + return newToken(tokenKindLBrace, nullChar), nil + case '}': + return newToken(tokenKindRBrace, nullChar), nil + default: + var b strings.Builder + fmt.Fprint(&b, string(c)) + n := 1 + for { + c, eof, err := l.read() + if err != nil { + return nil, err + } + if eof { + err := l.restore() + if err != nil { + return nil, err + } + break + } + if c == '}' { + err := l.restore() + if err != nil { + return nil, err + } + break + } + fmt.Fprint(&b, string(c)) + n++ + } + sym := strings.TrimSpace(b.String()) + if len(sym) == 0 { + l.errCause = SynErrFragmentInvalidSymbol + return nil, ParseErr + } + return newFragmentSymbolToken(sym), nil + } +} + +func (l *lexer) read() (rune, bool, error) { + if l.reachedEOF { + return l.lastChar, l.reachedEOF, nil + } + if l.peekChar1 != nullChar || l.peekEOF1 { + l.prevChar2 = l.prevChar1 + l.pervEOF2 = l.prevEOF1 + l.prevChar1 = l.lastChar + l.prevEOF1 = l.reachedEOF + l.lastChar = l.peekChar1 + l.reachedEOF = l.peekEOF1 + l.peekChar1 = l.peekChar2 + l.peekEOF1 = l.peekEOF2 + l.peekChar2 = nullChar + l.peekEOF2 = false + return l.lastChar, l.reachedEOF, nil + } + c, _, err := l.src.ReadRune() + if err != nil { + if err == io.EOF { + l.prevChar2 = l.prevChar1 + l.pervEOF2 = l.prevEOF1 + l.prevChar1 = l.lastChar + l.prevEOF1 = l.reachedEOF + l.lastChar = nullChar + l.reachedEOF = true + return l.lastChar, l.reachedEOF, nil + } + return nullChar, false, err + } + l.prevChar2 = l.prevChar1 + l.pervEOF2 = l.prevEOF1 + l.prevChar1 = l.lastChar + l.prevEOF1 = l.reachedEOF + l.lastChar = c + l.reachedEOF = false + return l.lastChar, l.reachedEOF, nil +} + +func (l *lexer) restore() error { + if l.lastChar == nullChar && !l.reachedEOF { + return fmt.Errorf("failed to call restore() because the last character is null") + } + l.peekChar2 = l.peekChar1 + l.peekEOF2 = l.peekEOF1 + l.peekChar1 = l.lastChar + l.peekEOF1 = l.reachedEOF + l.lastChar = l.prevChar1 + l.reachedEOF = l.prevEOF1 + l.prevChar1 = l.prevChar2 + l.prevEOF1 = l.pervEOF2 + l.prevChar2 = nullChar + l.pervEOF2 = false + return nil +} + +type PatternEntry struct { + ID spec.LexModeKindID + Pattern []byte +} + +type parser struct { + kind spec.LexKindName + lex *lexer + peekedTok *token + lastTok *token + + // If and only if isContributoryPropertyExposed is true, the parser interprets contributory properties that + // appear in property expressions. + // + // The contributory properties are not exposed, and users cannot use those properties because the parser + // follows [UAX #44 5.13 Property APIs]. For instance, \p{Other_Alphabetic} is invalid. + // + // isContributoryPropertyExposed is set to true when the parser is generated recursively. The parser needs to + // interpret derived properties internally because the derived properties consist of other properties that + // may contain the contributory properties. + // + // [UAX #44 5.13 Property APIs] says: + // > The following subtypes of Unicode character properties should generally not be exposed in APIs, + // > except in limited circumstances. They may not be useful, particularly in public API collections, + // > and may instead prove misleading to the users of such API collections. + // > * Contributory properties are not recommended for public APIs. + // > ... + // https://unicode.org/reports/tr44/#Property_APIs + isContributoryPropertyExposed bool + + errCause error + errDetail string +} + +func NewParser(kind spec.LexKindName, src io.Reader) *parser { + return &parser{ + kind: kind, + lex: newLexer(src), + isContributoryPropertyExposed: false, + } +} + +func (p *parser) exposeContributoryProperty() { + p.isContributoryPropertyExposed = true +} + +func (p *parser) Error() (string, error) { + return p.errDetail, p.errCause +} + +func (p *parser) Parse() (root CPTree, retErr error) { + defer func() { + err := recover() + if err != nil { + var ok bool + retErr, ok = err.(error) + if !ok { + panic(err) + } + return + } + }() + + return newRootNode(p.kind, p.parseRegexp()), nil +} + +func (p *parser) parseRegexp() CPTree { + alt := p.parseAlt() + if alt == nil { + if p.consume(tokenKindGroupClose) { + p.raiseParseError(synErrGroupNoInitiator, "") + } + p.raiseParseError(synErrNullPattern, "") + } + if p.consume(tokenKindGroupClose) { + p.raiseParseError(synErrGroupNoInitiator, "") + } + p.expect(tokenKindEOF) + return alt +} + +func (p *parser) parseAlt() CPTree { + left := p.parseConcat() + if left == nil { + if p.consume(tokenKindAlt) { + p.raiseParseError(synErrAltLackOfOperand, "") + } + return nil + } + for { + if !p.consume(tokenKindAlt) { + break + } + right := p.parseConcat() + if right == nil { + p.raiseParseError(synErrAltLackOfOperand, "") + } + left = newAltNode(left, right) + } + return left +} + +func (p *parser) parseConcat() CPTree { + left := p.parseRepeat() + for { + right := p.parseRepeat() + if right == nil { + break + } + left = newConcatNode(left, right) + } + return left +} + +func (p *parser) parseRepeat() CPTree { + group := p.parseGroup() + if group == nil { + if p.consume(tokenKindRepeat) { + p.raiseParseError(synErrRepNoTarget, "* needs an operand") + } + if p.consume(tokenKindRepeatOneOrMore) { + p.raiseParseError(synErrRepNoTarget, "+ needs an operand") + } + if p.consume(tokenKindOption) { + p.raiseParseError(synErrRepNoTarget, "? needs an operand") + } + return nil + } + if p.consume(tokenKindRepeat) { + return newRepeatNode(group) + } + if p.consume(tokenKindRepeatOneOrMore) { + return newRepeatOneOrMoreNode(group) + } + if p.consume(tokenKindOption) { + return newOptionNode(group) + } + return group +} + +func (p *parser) parseGroup() CPTree { + if p.consume(tokenKindGroupOpen) { + alt := p.parseAlt() + if alt == nil { + if p.consume(tokenKindEOF) { + p.raiseParseError(synErrGroupUnclosed, "") + } + p.raiseParseError(synErrGroupNoElem, "") + } + if p.consume(tokenKindEOF) { + p.raiseParseError(synErrGroupUnclosed, "") + } + if !p.consume(tokenKindGroupClose) { + p.raiseParseError(synErrGroupInvalidForm, "") + } + return alt + } + return p.parseSingleChar() +} + +func (p *parser) parseSingleChar() CPTree { + if p.consume(tokenKindAnyChar) { + return genAnyCharAST() + } + if p.consume(tokenKindBExpOpen) { + left := p.parseBExpElem() + if left == nil { + if p.consume(tokenKindEOF) { + p.raiseParseError(synErrBExpUnclosed, "") + } + p.raiseParseError(synErrBExpNoElem, "") + } + for { + right := p.parseBExpElem() + if right == nil { + break + } + left = newAltNode(left, right) + } + if p.consume(tokenKindEOF) { + p.raiseParseError(synErrBExpUnclosed, "") + } + p.expect(tokenKindBExpClose) + return left + } + if p.consume(tokenKindInverseBExpOpen) { + elem := p.parseBExpElem() + if elem == nil { + if p.consume(tokenKindEOF) { + p.raiseParseError(synErrBExpUnclosed, "") + } + p.raiseParseError(synErrBExpNoElem, "") + } + inverse := exclude(elem, genAnyCharAST()) + if inverse == nil { + p.raiseParseError(synErrUnmatchablePattern, "") + } + for { + elem := p.parseBExpElem() + if elem == nil { + break + } + inverse = exclude(elem, inverse) + if inverse == nil { + p.raiseParseError(synErrUnmatchablePattern, "") + } + } + if p.consume(tokenKindEOF) { + p.raiseParseError(synErrBExpUnclosed, "") + } + p.expect(tokenKindBExpClose) + return inverse + } + if p.consume(tokenKindCodePointLeader) { + return p.parseCodePoint() + } + if p.consume(tokenKindCharPropLeader) { + return p.parseCharProp() + } + if p.consume(tokenKindFragmentLeader) { + return p.parseFragment() + } + c := p.parseNormalChar() + if c == nil { + if p.consume(tokenKindBExpClose) { + p.raiseParseError(synErrBExpInvalidForm, "") + } + return nil + } + return c +} + +func (p *parser) parseBExpElem() CPTree { + var left CPTree + switch { + case p.consume(tokenKindCodePointLeader): + left = p.parseCodePoint() + case p.consume(tokenKindCharPropLeader): + left = p.parseCharProp() + if p.consume(tokenKindCharRange) { + p.raiseParseError(synErrRangePropIsUnavailable, "") + } + default: + left = p.parseNormalChar() + } + if left == nil { + return nil + } + if !p.consume(tokenKindCharRange) { + return left + } + var right CPTree + switch { + case p.consume(tokenKindCodePointLeader): + right = p.parseCodePoint() + case p.consume(tokenKindCharPropLeader): + p.raiseParseError(synErrRangePropIsUnavailable, "") + default: + right = p.parseNormalChar() + } + if right == nil { + p.raiseParseError(synErrRangeInvalidForm, "") + } + from, _, _ := left.Range() + _, to, _ := right.Range() + if !isValidOrder(from, to) { + p.raiseParseError(synErrRangeInvalidOrder, fmt.Sprintf("%X..%X", from, to)) + } + return newRangeSymbolNode(from, to) +} + +func (p *parser) parseCodePoint() CPTree { + if !p.consume(tokenKindLBrace) { + p.raiseParseError(synErrCPExpInvalidForm, "") + } + if !p.consume(tokenKindCodePoint) { + p.raiseParseError(synErrCPExpInvalidForm, "") + } + + n, err := strconv.ParseInt(p.lastTok.codePoint, 16, 64) + if err != nil { + panic(fmt.Errorf("failed to decode a code point (%v) into a int: %v", p.lastTok.codePoint, err)) + } + if n < 0x0000 || n > 0x10FFFF { + p.raiseParseError(synErrCPExpOutOfRange, "") + } + + sym := newSymbolNode(rune(n)) + + if !p.consume(tokenKindRBrace) { + p.raiseParseError(synErrCPExpInvalidForm, "") + } + + return sym +} + +func (p *parser) parseCharProp() CPTree { + if !p.consume(tokenKindLBrace) { + p.raiseParseError(synErrCharPropExpInvalidForm, "") + } + var sym1, sym2 string + if !p.consume(tokenKindCharPropSymbol) { + p.raiseParseError(synErrCharPropExpInvalidForm, "") + } + sym1 = p.lastTok.propSymbol + if p.consume(tokenKindEqual) { + if !p.consume(tokenKindCharPropSymbol) { + p.raiseParseError(synErrCharPropExpInvalidForm, "") + } + sym2 = p.lastTok.propSymbol + } + + var alt CPTree + var propName, propVal string + if sym2 != "" { + propName = sym1 + propVal = sym2 + } else { + propName = "" + propVal = sym1 + } + if !p.isContributoryPropertyExposed && ucd.IsContributoryProperty(propName) { + p.raiseParseError(synErrCharPropUnsupported, propName) + } + pat, err := ucd.NormalizeCharacterProperty(propName, propVal) + if err != nil { + p.raiseParseError(synErrCharPropUnsupported, err.Error()) + } + if pat != "" { + p := NewParser(p.kind, bytes.NewReader([]byte(pat))) + p.exposeContributoryProperty() + ast, err := p.Parse() + if err != nil { + panic(err) + } + alt = ast + } else { + cpRanges, inverse, err := ucd.FindCodePointRanges(propName, propVal) + if err != nil { + p.raiseParseError(synErrCharPropUnsupported, err.Error()) + } + if inverse { + r := cpRanges[0] + alt = exclude(newRangeSymbolNode(r.From, r.To), genAnyCharAST()) + if alt == nil { + p.raiseParseError(synErrUnmatchablePattern, "") + } + for _, r := range cpRanges[1:] { + alt = exclude(newRangeSymbolNode(r.From, r.To), alt) + if alt == nil { + p.raiseParseError(synErrUnmatchablePattern, "") + } + } + } else { + for _, r := range cpRanges { + alt = genAltNode( + alt, + newRangeSymbolNode(r.From, r.To), + ) + } + } + } + + if !p.consume(tokenKindRBrace) { + p.raiseParseError(synErrCharPropExpInvalidForm, "") + } + + return alt +} + +func (p *parser) parseFragment() CPTree { + if !p.consume(tokenKindLBrace) { + p.raiseParseError(synErrFragmentExpInvalidForm, "") + } + if !p.consume(tokenKindFragmentSymbol) { + p.raiseParseError(synErrFragmentExpInvalidForm, "") + } + sym := p.lastTok.fragmentSymbol + + if !p.consume(tokenKindRBrace) { + p.raiseParseError(synErrFragmentExpInvalidForm, "") + } + + return newFragmentNode(spec.LexKindName(sym), nil) +} + +func (p *parser) parseNormalChar() CPTree { + if !p.consume(tokenKindChar) { + return nil + } + return newSymbolNode(p.lastTok.char) +} + +func exclude(symbol, base CPTree) CPTree { + if left, right, ok := symbol.Alternatives(); ok { + return exclude(right, exclude(left, base)) + } + + if left, right, ok := base.Alternatives(); ok { + return genAltNode( + exclude(symbol, left), + exclude(symbol, right), + ) + } + + if bFrom, bTo, ok := base.Range(); ok { + sFrom, sTo, ok := symbol.Range() + if !ok { + panic(fmt.Errorf("invalid symbol tree: %T", symbol)) + } + + switch { + case sFrom > bFrom && sTo < bTo: + return genAltNode( + newRangeSymbolNode(bFrom, sFrom-1), + newRangeSymbolNode(sTo+1, bTo), + ) + case sFrom <= bFrom && sTo >= bFrom && sTo < bTo: + return newRangeSymbolNode(sTo+1, bTo) + case sFrom > bFrom && sFrom <= bTo && sTo >= bTo: + return newRangeSymbolNode(bFrom, sFrom-1) + case sFrom <= bFrom && sTo >= bTo: + return nil + default: + return base + } + } + + panic(fmt.Errorf("invalid base tree: %T", base)) +} + +func genAnyCharAST() CPTree { + return newRangeSymbolNode(0x0, 0x10FFFF) +} + +func isValidOrder(from, to rune) bool { + return from <= to +} + +func genConcatNode(cs ...CPTree) CPTree { + nonNilNodes := []CPTree{} + for _, c := range cs { + if c == nil { + continue + } + nonNilNodes = append(nonNilNodes, c) + } + if len(nonNilNodes) <= 0 { + return nil + } + if len(nonNilNodes) == 1 { + return nonNilNodes[0] + } + concat := newConcatNode(nonNilNodes[0], nonNilNodes[1]) + for _, c := range nonNilNodes[2:] { + concat = newConcatNode(concat, c) + } + return concat +} + +func genAltNode(cs ...CPTree) CPTree { + nonNilNodes := []CPTree{} + for _, c := range cs { + if c == nil { + continue + } + nonNilNodes = append(nonNilNodes, c) + } + if len(nonNilNodes) <= 0 { + return nil + } + if len(nonNilNodes) == 1 { + return nonNilNodes[0] + } + alt := newAltNode(nonNilNodes[0], nonNilNodes[1]) + for _, c := range nonNilNodes[2:] { + alt = newAltNode(alt, c) + } + return alt +} + +func (p *parser) expect(expected tokenKind) { + if !p.consume(expected) { + tok := p.peekedTok + p.raiseParseError(synErrUnexpectedToken, fmt.Sprintf("expected: %v, actual: %v", expected, tok.kind)) + } +} + +func (p *parser) consume(expected tokenKind) bool { + var tok *token + var err error + if p.peekedTok != nil { + tok = p.peekedTok + p.peekedTok = nil + } else { + tok, err = p.lex.next() + if err != nil { + if err == ParseErr { + detail, cause := p.lex.error() + p.raiseParseError(cause, detail) + } + panic(err) + } + } + p.lastTok = tok + if tok.kind == expected { + return true + } + p.peekedTok = tok + p.lastTok = nil + + return false +} + +func (p *parser) raiseParseError(err error, detail string) { + p.errCause = err + p.errDetail = detail + panic(ParseErr) +} + +type CPRange struct { + From rune + To rune +} + +type CPTree interface { + fmt.Stringer + Range() (rune, rune, bool) + Optional() (CPTree, bool) + Repeatable() (CPTree, bool) + Concatenation() (CPTree, CPTree, bool) + Alternatives() (CPTree, CPTree, bool) + Describe() (spec.LexKindName, []spec.LexKindName, error) + + children() (CPTree, CPTree) + clone() CPTree +} + +var ( + _ CPTree = &rootNode{} + _ CPTree = &symbolNode{} + _ CPTree = &concatNode{} + _ CPTree = &altNode{} + _ CPTree = &quantifierNode{} + _ CPTree = &fragmentNode{} +) + +type rootNode struct { + kind spec.LexKindName + tree CPTree + fragments map[spec.LexKindName][]*fragmentNode +} + +func newRootNode(kind spec.LexKindName, t CPTree) *rootNode { + fragments := map[spec.LexKindName][]*fragmentNode{} + collectFragments(t, fragments) + + return &rootNode{ + kind: kind, + tree: t, + fragments: fragments, + } +} + +func collectFragments(n CPTree, fragments map[spec.LexKindName][]*fragmentNode) { + if n == nil { + return + } + + if f, ok := n.(*fragmentNode); ok { + fragments[f.kind] = append(fragments[f.kind], f) + return + } + + l, r := n.children() + collectFragments(l, fragments) + collectFragments(r, fragments) +} + +func (n *rootNode) String() string { + return fmt.Sprintf("root: %v: %v fragments", n.kind, len(n.fragments)) +} + +func (n *rootNode) Range() (rune, rune, bool) { + return n.tree.Range() +} + +func (n *rootNode) Optional() (CPTree, bool) { + return n.tree.Optional() +} + +func (n *rootNode) Repeatable() (CPTree, bool) { + return n.tree.Repeatable() +} + +func (n *rootNode) Concatenation() (CPTree, CPTree, bool) { + return n.tree.Concatenation() +} + +func (n *rootNode) Alternatives() (CPTree, CPTree, bool) { + return n.tree.Alternatives() +} + +func (n *rootNode) Describe() (spec.LexKindName, []spec.LexKindName, error) { + var frags []spec.LexKindName + for f := range n.fragments { + frags = append(frags, spec.LexKindName(f)) + } + sort.Slice(frags, func(i, j int) bool { + return frags[i] < frags[j] + }) + + return n.kind, frags, nil +} + +func (n *rootNode) children() (CPTree, CPTree) { + return n.tree.children() +} + +func (n *rootNode) clone() CPTree { + return n.tree.clone() +} + +func (n *rootNode) incomplete() bool { + return len(n.fragments) > 0 +} + +func (n *rootNode) applyFragment(kind spec.LexKindName, fragment CPTree) error { + root, ok := fragment.(*rootNode) + if !ok { + return fmt.Errorf("applyFragment can take only *rootNode: %T", fragment) + } + if root.incomplete() { + return fmt.Errorf("fragment is incomplete") + } + + fs, ok := n.fragments[kind] + if !ok { + return nil + } + for _, f := range fs { + f.tree = root.clone() + } + delete(n.fragments, kind) + + return nil +} + +type symbolNode struct { + CPRange +} + +func newSymbolNode(cp rune) *symbolNode { + return &symbolNode{ + CPRange: CPRange{ + From: cp, + To: cp, + }, + } +} + +func newRangeSymbolNode(from, to rune) *symbolNode { + return &symbolNode{ + CPRange: CPRange{ + From: from, + To: to, + }, + } +} + +func (n *symbolNode) String() string { + return fmt.Sprintf("symbol: %X..%X", n.From, n.To) +} + +func (n *symbolNode) Range() (rune, rune, bool) { + return n.From, n.To, true +} + +func (n *symbolNode) Optional() (CPTree, bool) { + return nil, false +} + +func (n *symbolNode) Repeatable() (CPTree, bool) { + return nil, false +} + +func (n *symbolNode) Concatenation() (CPTree, CPTree, bool) { + return nil, nil, false +} + +func (n *symbolNode) Alternatives() (CPTree, CPTree, bool) { + return nil, nil, false +} + +func (n *symbolNode) Describe() (spec.LexKindName, []spec.LexKindName, error) { + return spec.LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n) +} + +func (n *symbolNode) children() (CPTree, CPTree) { + return nil, nil +} + +func (n *symbolNode) clone() CPTree { + return newRangeSymbolNode(n.From, n.To) +} + +type concatNode struct { + left CPTree + right CPTree +} + +func newConcatNode(left, right CPTree) *concatNode { + return &concatNode{ + left: left, + right: right, + } +} + +func (n *concatNode) String() string { + return "concat" +} + +func (n *concatNode) Range() (rune, rune, bool) { + return 0, 0, false +} + +func (n *concatNode) Optional() (CPTree, bool) { + return nil, false +} + +func (n *concatNode) Repeatable() (CPTree, bool) { + return nil, false +} + +func (n *concatNode) Concatenation() (CPTree, CPTree, bool) { + return n.left, n.right, true +} + +func (n *concatNode) Alternatives() (CPTree, CPTree, bool) { + return nil, nil, false +} + +func (n *concatNode) Describe() (spec.LexKindName, []spec.LexKindName, error) { + return spec.LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n) +} + +func (n *concatNode) children() (CPTree, CPTree) { + return n.left, n.right +} + +func (n *concatNode) clone() CPTree { + if n == nil { + return nil + } + return newConcatNode(n.left.clone(), n.right.clone()) +} + +type altNode struct { + left CPTree + right CPTree +} + +func newAltNode(left, right CPTree) *altNode { + return &altNode{ + left: left, + right: right, + } +} + +func (n *altNode) String() string { + return "alt" +} + +func (n *altNode) Range() (rune, rune, bool) { + return 0, 0, false +} + +func (n *altNode) Optional() (CPTree, bool) { + return nil, false +} + +func (n *altNode) Repeatable() (CPTree, bool) { + return nil, false +} + +func (n *altNode) Concatenation() (CPTree, CPTree, bool) { + return nil, nil, false +} + +func (n *altNode) Alternatives() (CPTree, CPTree, bool) { + return n.left, n.right, true +} + +func (n *altNode) Describe() (spec.LexKindName, []spec.LexKindName, error) { + return spec.LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n) +} + +func (n *altNode) children() (CPTree, CPTree) { + return n.left, n.right +} + +func (n *altNode) clone() CPTree { + return newAltNode(n.left.clone(), n.right.clone()) +} + +type quantifierNode struct { + optional bool + repeatable bool + tree CPTree +} + +func (n *quantifierNode) String() string { + switch { + case n.repeatable: + return "repeatable (>= 0 times)" + case n.optional: + return "optional (0 or 1 times)" + default: + return "invalid quantifier" + } +} + +func newRepeatNode(t CPTree) *quantifierNode { + return &quantifierNode{ + repeatable: true, + tree: t, + } +} + +func newRepeatOneOrMoreNode(t CPTree) *concatNode { + return newConcatNode( + t, + &quantifierNode{ + repeatable: true, + tree: t.clone(), + }) +} + +func newOptionNode(t CPTree) *quantifierNode { + return &quantifierNode{ + optional: true, + tree: t, + } +} + +func (n *quantifierNode) Range() (rune, rune, bool) { + return 0, 0, false +} + +func (n *quantifierNode) Optional() (CPTree, bool) { + return n.tree, n.optional +} + +func (n *quantifierNode) Repeatable() (CPTree, bool) { + return n.tree, n.repeatable +} + +func (n *quantifierNode) Concatenation() (CPTree, CPTree, bool) { + return nil, nil, false +} + +func (n *quantifierNode) Alternatives() (CPTree, CPTree, bool) { + return nil, nil, false +} + +func (n *quantifierNode) Describe() (spec.LexKindName, []spec.LexKindName, error) { + return spec.LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n) +} + +func (n *quantifierNode) children() (CPTree, CPTree) { + return n.tree, nil +} + +func (n *quantifierNode) clone() CPTree { + if n.repeatable { + return newRepeatNode(n.tree.clone()) + } + return newOptionNode(n.tree.clone()) +} + +type fragmentNode struct { + kind spec.LexKindName + tree CPTree +} + +func newFragmentNode(kind spec.LexKindName, t CPTree) *fragmentNode { + return &fragmentNode{ + kind: kind, + tree: t, + } +} + +func (n *fragmentNode) String() string { + return fmt.Sprintf("fragment: %v", n.kind) +} + +func (n *fragmentNode) Range() (rune, rune, bool) { + return n.tree.Range() +} + +func (n *fragmentNode) Optional() (CPTree, bool) { + return n.tree.Optional() +} + +func (n *fragmentNode) Repeatable() (CPTree, bool) { + return n.tree.Repeatable() +} + +func (n *fragmentNode) Concatenation() (CPTree, CPTree, bool) { + return n.tree.Concatenation() +} + +func (n *fragmentNode) Alternatives() (CPTree, CPTree, bool) { + return n.tree.Alternatives() +} + +func (n *fragmentNode) Describe() (spec.LexKindName, []spec.LexKindName, error) { + return spec.LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n) +} + +func (n *fragmentNode) children() (CPTree, CPTree) { + return n.tree.children() +} + +func (n *fragmentNode) clone() CPTree { + if n.tree == nil { + return newFragmentNode(n.kind, nil) + } + return newFragmentNode(n.kind, n.tree.clone()) +} + +//nolint:unused +func printCPTree(w io.Writer, t CPTree, ruledLine string, childRuledLinePrefix string) { + if t == nil { + return + } + fmt.Fprintf(w, "%v%v\n", ruledLine, t) + children := []CPTree{} + switch n := t.(type) { + case *rootNode: + children = append(children, n.tree) + case *fragmentNode: + children = append(children, n.tree) + default: + left, right := t.children() + if left != nil { + children = append(children, left) + } + if right != nil { + children = append(children, right) + } + } + num := len(children) + for i, child := range children { + line := "└─ " + if num > 1 { + if i == 0 { + line = "├─ " + } else if i < num-1 { + line = "│ " + } + } + prefix := "│ " + if i >= num-1 { + prefix = " " + } + printCPTree(w, child, childRuledLinePrefix+line, childRuledLinePrefix+prefix) + } +} diff --git a/src/urubu/grammar/lexical/parser/error.go b/src/urubu/grammar/lexical/parser/error.go deleted file mode 100644 index be81da4..0000000 --- a/src/urubu/grammar/lexical/parser/error.go +++ /dev/null @@ -1,36 +0,0 @@ -package parser - -import "fmt" - -var ( - ParseErr = fmt.Errorf("parse error") - - // lexical errors - synErrIncompletedEscSeq = fmt.Errorf("incompleted escape sequence; unexpected EOF following \\") - synErrInvalidEscSeq = fmt.Errorf("invalid escape sequence") - synErrInvalidCodePoint = fmt.Errorf("code points must consist of just 4 or 6 hex digits") - synErrCharPropInvalidSymbol = fmt.Errorf("invalid character property symbol") - SynErrFragmentInvalidSymbol = fmt.Errorf("invalid fragment symbol") - - // syntax errors - synErrUnexpectedToken = fmt.Errorf("unexpected token") - synErrNullPattern = fmt.Errorf("a pattern must be a non-empty byte sequence") - synErrUnmatchablePattern = fmt.Errorf("a pattern cannot match any characters") - synErrAltLackOfOperand = fmt.Errorf("an alternation expression must have operands") - synErrRepNoTarget = fmt.Errorf("a repeat expression must have an operand") - synErrGroupNoElem = fmt.Errorf("a grouping expression must include at least one character") - synErrGroupUnclosed = fmt.Errorf("unclosed grouping expression") - synErrGroupNoInitiator = fmt.Errorf(") needs preceding (") - synErrGroupInvalidForm = fmt.Errorf("invalid grouping expression") - synErrBExpNoElem = fmt.Errorf("a bracket expression must include at least one character") - synErrBExpUnclosed = fmt.Errorf("unclosed bracket expression") - synErrBExpInvalidForm = fmt.Errorf("invalid bracket expression") - synErrRangeInvalidOrder = fmt.Errorf("a range expression with invalid order") - synErrRangePropIsUnavailable = fmt.Errorf("a property expression is unavailable in a range expression") - synErrRangeInvalidForm = fmt.Errorf("invalid range expression") - synErrCPExpInvalidForm = fmt.Errorf("invalid code point expression") - synErrCPExpOutOfRange = fmt.Errorf("a code point must be between U+0000 to U+10FFFF") - synErrCharPropExpInvalidForm = fmt.Errorf("invalid character property expression") - synErrCharPropUnsupported = fmt.Errorf("unsupported character property") - synErrFragmentExpInvalidForm = fmt.Errorf("invalid fragment expression") -) diff --git a/src/urubu/grammar/lexical/parser/fragment.go b/src/urubu/grammar/lexical/parser/fragment.go deleted file mode 100644 index 196c00b..0000000 --- a/src/urubu/grammar/lexical/parser/fragment.go +++ /dev/null @@ -1,72 +0,0 @@ -package parser - -import ( - "fmt" - - spec "urubu/spec/grammar" -) - -type incompleteFragment struct { - kind spec.LexKindName - root *rootNode -} - -func CompleteFragments(fragments map[spec.LexKindName]CPTree) error { - if len(fragments) == 0 { - return nil - } - - completeFragments := map[spec.LexKindName]CPTree{} - incompleteFragments := []*incompleteFragment{} - for kind, tree := range fragments { - root, ok := tree.(*rootNode) - if !ok { - return fmt.Errorf("CompleteFragments can take only *rootNode: %T", tree) - } - if root.incomplete() { - incompleteFragments = append(incompleteFragments, &incompleteFragment{ - kind: kind, - root: root, - }) - } else { - completeFragments[kind] = root - } - } - for len(incompleteFragments) > 0 { - lastIncompCount := len(incompleteFragments) - remainingFragments := []*incompleteFragment{} - for _, e := range incompleteFragments { - complete, err := ApplyFragments(e.root, completeFragments) - if err != nil { - return err - } - if !complete { - remainingFragments = append(remainingFragments, e) - } else { - completeFragments[e.kind] = e.root - } - } - incompleteFragments = remainingFragments - if len(incompleteFragments) == lastIncompCount { - return ParseErr - } - } - - return nil -} - -func ApplyFragments(t CPTree, fragments map[spec.LexKindName]CPTree) (bool, error) { - root, ok := t.(*rootNode) - if !ok { - return false, fmt.Errorf("ApplyFragments can take only *rootNode type: %T", t) - } - - for name, frag := range fragments { - err := root.applyFragment(name, frag) - if err != nil { - return false, err - } - } - - return !root.incomplete(), nil -} diff --git a/src/urubu/grammar/lexical/parser/lexer.go b/src/urubu/grammar/lexical/parser/lexer.go deleted file mode 100644 index 3861825..0000000 --- a/src/urubu/grammar/lexical/parser/lexer.go +++ /dev/null @@ -1,594 +0,0 @@ -package parser - -import ( - "bufio" - "fmt" - "io" - "strings" -) - -type tokenKind string - -const ( - tokenKindChar tokenKind = "char" - tokenKindAnyChar tokenKind = "." - tokenKindRepeat tokenKind = "*" - tokenKindRepeatOneOrMore tokenKind = "+" - tokenKindOption tokenKind = "?" - tokenKindAlt tokenKind = "|" - tokenKindGroupOpen tokenKind = "(" - tokenKindGroupClose tokenKind = ")" - tokenKindBExpOpen tokenKind = "[" - tokenKindInverseBExpOpen tokenKind = "[^" - tokenKindBExpClose tokenKind = "]" - tokenKindCharRange tokenKind = "-" - tokenKindCodePointLeader tokenKind = "\\u" - tokenKindCharPropLeader tokenKind = "\\p" - tokenKindFragmentLeader tokenKind = "\\f" - tokenKindLBrace tokenKind = "{" - tokenKindRBrace tokenKind = "}" - tokenKindEqual tokenKind = "=" - tokenKindCodePoint tokenKind = "code point" - tokenKindCharPropSymbol tokenKind = "character property symbol" - tokenKindFragmentSymbol tokenKind = "fragment symbol" - tokenKindEOF tokenKind = "eof" -) - -type token struct { - kind tokenKind - char rune - propSymbol string - codePoint string - fragmentSymbol string -} - -const nullChar = '\u0000' - -func newToken(kind tokenKind, char rune) *token { - return &token{ - kind: kind, - char: char, - } -} - -func newCodePointToken(codePoint string) *token { - return &token{ - kind: tokenKindCodePoint, - codePoint: codePoint, - } -} - -func newCharPropSymbolToken(propSymbol string) *token { - return &token{ - kind: tokenKindCharPropSymbol, - propSymbol: propSymbol, - } -} - -func newFragmentSymbolToken(fragmentSymbol string) *token { - return &token{ - kind: tokenKindFragmentSymbol, - fragmentSymbol: fragmentSymbol, - } -} - -type lexerMode string - -const ( - lexerModeDefault lexerMode = "default" - lexerModeBExp lexerMode = "bracket expression" - lexerModeCPExp lexerMode = "code point expression" - lexerModeCharPropExp lexerMode = "character property expression" - lexerModeFragmentExp lexerMode = "fragment expression" -) - -type lexerModeStack struct { - stack []lexerMode -} - -func newLexerModeStack() *lexerModeStack { - return &lexerModeStack{ - stack: []lexerMode{ - lexerModeDefault, - }, - } -} - -func (s *lexerModeStack) top() lexerMode { - return s.stack[len(s.stack)-1] -} - -func (s *lexerModeStack) push(m lexerMode) { - s.stack = append(s.stack, m) -} - -func (s *lexerModeStack) pop() { - s.stack = s.stack[:len(s.stack)-1] -} - -type rangeState string - -// [a-z] -// ^^^^ -// |||`-- ready -// ||`-- expect range terminator -// |`-- read range initiator -// `-- ready -const ( - rangeStateReady rangeState = "ready" - rangeStateReadRangeInitiator rangeState = "read range initiator" - rangeStateExpectRangeTerminator rangeState = "expect range terminator" -) - -type lexer struct { - src *bufio.Reader - peekChar2 rune - peekEOF2 bool - peekChar1 rune - peekEOF1 bool - lastChar rune - reachedEOF bool - prevChar1 rune - prevEOF1 bool - prevChar2 rune - pervEOF2 bool - modeStack *lexerModeStack - rangeState rangeState - - errCause error - errDetail string -} - -func newLexer(src io.Reader) *lexer { - return &lexer{ - src: bufio.NewReader(src), - peekChar2: nullChar, - peekEOF2: false, - peekChar1: nullChar, - peekEOF1: false, - lastChar: nullChar, - reachedEOF: false, - prevChar1: nullChar, - prevEOF1: false, - prevChar2: nullChar, - pervEOF2: false, - modeStack: newLexerModeStack(), - rangeState: rangeStateReady, - } -} - -func (l *lexer) error() (string, error) { - return l.errDetail, l.errCause -} - -func (l *lexer) next() (*token, error) { - c, eof, err := l.read() - if err != nil { - return nil, err - } - if eof { - return newToken(tokenKindEOF, nullChar), nil - } - - switch l.modeStack.top() { - case lexerModeBExp: - tok, err := l.nextInBExp(c) - if err != nil { - return nil, err - } - if tok.kind == tokenKindChar || tok.kind == tokenKindCodePointLeader || tok.kind == tokenKindCharPropLeader { - switch l.rangeState { - case rangeStateReady: - l.rangeState = rangeStateReadRangeInitiator - case rangeStateExpectRangeTerminator: - l.rangeState = rangeStateReady - } - } - switch tok.kind { - case tokenKindBExpClose: - l.modeStack.pop() - case tokenKindCharRange: - l.rangeState = rangeStateExpectRangeTerminator - case tokenKindCodePointLeader: - l.modeStack.push(lexerModeCPExp) - case tokenKindCharPropLeader: - l.modeStack.push(lexerModeCharPropExp) - } - return tok, nil - case lexerModeCPExp: - tok, err := l.nextInCodePoint(c) - if err != nil { - return nil, err - } - switch tok.kind { - case tokenKindRBrace: - l.modeStack.pop() - } - return tok, nil - case lexerModeCharPropExp: - tok, err := l.nextInCharProp(c) - if err != nil { - return nil, err - } - switch tok.kind { - case tokenKindRBrace: - l.modeStack.pop() - } - return tok, nil - case lexerModeFragmentExp: - tok, err := l.nextInFragment(c) - if err != nil { - return nil, err - } - switch tok.kind { - case tokenKindRBrace: - l.modeStack.pop() - } - return tok, nil - default: - tok, err := l.nextInDefault(c) - if err != nil { - return nil, err - } - switch tok.kind { - case tokenKindBExpOpen: - l.modeStack.push(lexerModeBExp) - l.rangeState = rangeStateReady - case tokenKindInverseBExpOpen: - l.modeStack.push(lexerModeBExp) - l.rangeState = rangeStateReady - case tokenKindCodePointLeader: - l.modeStack.push(lexerModeCPExp) - case tokenKindCharPropLeader: - l.modeStack.push(lexerModeCharPropExp) - case tokenKindFragmentLeader: - l.modeStack.push(lexerModeFragmentExp) - } - return tok, nil - } -} - -func (l *lexer) nextInDefault(c rune) (*token, error) { - switch c { - case '*': - return newToken(tokenKindRepeat, nullChar), nil - case '+': - return newToken(tokenKindRepeatOneOrMore, nullChar), nil - case '?': - return newToken(tokenKindOption, nullChar), nil - case '.': - return newToken(tokenKindAnyChar, nullChar), nil - case '|': - return newToken(tokenKindAlt, nullChar), nil - case '(': - return newToken(tokenKindGroupOpen, nullChar), nil - case ')': - return newToken(tokenKindGroupClose, nullChar), nil - case '[': - c1, eof, err := l.read() - if err != nil { - return nil, err - } - if eof { - err := l.restore() - if err != nil { - return nil, err - } - return newToken(tokenKindBExpOpen, nullChar), nil - } - if c1 != '^' { - err := l.restore() - if err != nil { - return nil, err - } - return newToken(tokenKindBExpOpen, nullChar), nil - } - c2, eof, err := l.read() - if err != nil { - return nil, err - } - if eof { - err := l.restore() - if err != nil { - return nil, err - } - return newToken(tokenKindInverseBExpOpen, nullChar), nil - } - if c2 != ']' { - err := l.restore() - if err != nil { - return nil, err - } - return newToken(tokenKindInverseBExpOpen, nullChar), nil - } - err = l.restore() - if err != nil { - return nil, err - } - err = l.restore() - if err != nil { - return nil, err - } - return newToken(tokenKindBExpOpen, nullChar), nil - case '\\': - c, eof, err := l.read() - if err != nil { - return nil, err - } - if eof { - l.errCause = synErrIncompletedEscSeq - return nil, ParseErr - } - if c == 'u' { - return newToken(tokenKindCodePointLeader, nullChar), nil - } - if c == 'p' { - return newToken(tokenKindCharPropLeader, nullChar), nil - } - if c == 'f' { - return newToken(tokenKindFragmentLeader, nullChar), nil - } - if c == '\\' || c == '.' || c == '*' || c == '+' || c == '?' || c == '|' || c == '(' || c == ')' || c == '[' || c == ']' { - return newToken(tokenKindChar, c), nil - } - l.errCause = synErrInvalidEscSeq - l.errDetail = fmt.Sprintf("\\%v is not supported", string(c)) - return nil, ParseErr - default: - return newToken(tokenKindChar, c), nil - } -} - -func (l *lexer) nextInBExp(c rune) (*token, error) { - switch c { - case '-': - if l.rangeState != rangeStateReadRangeInitiator { - return newToken(tokenKindChar, c), nil - } - c1, eof, err := l.read() - if err != nil { - return nil, err - } - if eof { - err := l.restore() - if err != nil { - return nil, err - } - return newToken(tokenKindChar, c), nil - } - if c1 != ']' { - err := l.restore() - if err != nil { - return nil, err - } - return newToken(tokenKindCharRange, nullChar), nil - } - err = l.restore() - if err != nil { - return nil, err - } - return newToken(tokenKindChar, c), nil - case ']': - return newToken(tokenKindBExpClose, nullChar), nil - case '\\': - c, eof, err := l.read() - if err != nil { - return nil, err - } - if eof { - l.errCause = synErrIncompletedEscSeq - return nil, ParseErr - } - if c == 'u' { - return newToken(tokenKindCodePointLeader, nullChar), nil - } - if c == 'p' { - return newToken(tokenKindCharPropLeader, nullChar), nil - } - if c == '\\' || c == '^' || c == '-' || c == ']' { - return newToken(tokenKindChar, c), nil - } - l.errCause = synErrInvalidEscSeq - l.errDetail = fmt.Sprintf("\\%v is not supported in a bracket expression", string(c)) - return nil, ParseErr - default: - return newToken(tokenKindChar, c), nil - } -} - -func (l *lexer) nextInCodePoint(c rune) (*token, error) { - switch c { - case '{': - return newToken(tokenKindLBrace, nullChar), nil - case '}': - return newToken(tokenKindRBrace, nullChar), nil - default: - if !isHexDigit(c) { - l.errCause = synErrInvalidCodePoint - return nil, ParseErr - } - var b strings.Builder - fmt.Fprint(&b, string(c)) - n := 1 - for { - c, eof, err := l.read() - if err != nil { - return nil, err - } - if eof { - err := l.restore() - if err != nil { - return nil, err - } - break - } - if c == '}' { - err := l.restore() - if err != nil { - return nil, err - } - break - } - if !isHexDigit(c) || n >= 6 { - l.errCause = synErrInvalidCodePoint - return nil, ParseErr - } - fmt.Fprint(&b, string(c)) - n++ - } - cp := b.String() - cpLen := len(cp) - if !(cpLen == 4 || cpLen == 6) { - l.errCause = synErrInvalidCodePoint - return nil, ParseErr - } - return newCodePointToken(b.String()), nil - } -} - -func isHexDigit(c rune) bool { - if c >= '0' && c <= '9' || c >= 'A' && c <= 'Z' || c >= 'a' && c <= 'z' { - return true - } - return false -} - -func (l *lexer) nextInCharProp(c rune) (*token, error) { - switch c { - case '{': - return newToken(tokenKindLBrace, nullChar), nil - case '}': - return newToken(tokenKindRBrace, nullChar), nil - case '=': - return newToken(tokenKindEqual, nullChar), nil - default: - var b strings.Builder - fmt.Fprint(&b, string(c)) - n := 1 - for { - c, eof, err := l.read() - if err != nil { - return nil, err - } - if eof { - err := l.restore() - if err != nil { - return nil, err - } - break - } - if c == '}' || c == '=' { - err := l.restore() - if err != nil { - return nil, err - } - break - } - fmt.Fprint(&b, string(c)) - n++ - } - sym := strings.TrimSpace(b.String()) - if len(sym) == 0 { - l.errCause = synErrCharPropInvalidSymbol - return nil, ParseErr - } - return newCharPropSymbolToken(sym), nil - } -} - -func (l *lexer) nextInFragment(c rune) (*token, error) { - switch c { - case '{': - return newToken(tokenKindLBrace, nullChar), nil - case '}': - return newToken(tokenKindRBrace, nullChar), nil - default: - var b strings.Builder - fmt.Fprint(&b, string(c)) - n := 1 - for { - c, eof, err := l.read() - if err != nil { - return nil, err - } - if eof { - err := l.restore() - if err != nil { - return nil, err - } - break - } - if c == '}' { - err := l.restore() - if err != nil { - return nil, err - } - break - } - fmt.Fprint(&b, string(c)) - n++ - } - sym := strings.TrimSpace(b.String()) - if len(sym) == 0 { - l.errCause = SynErrFragmentInvalidSymbol - return nil, ParseErr - } - return newFragmentSymbolToken(sym), nil - } -} - -func (l *lexer) read() (rune, bool, error) { - if l.reachedEOF { - return l.lastChar, l.reachedEOF, nil - } - if l.peekChar1 != nullChar || l.peekEOF1 { - l.prevChar2 = l.prevChar1 - l.pervEOF2 = l.prevEOF1 - l.prevChar1 = l.lastChar - l.prevEOF1 = l.reachedEOF - l.lastChar = l.peekChar1 - l.reachedEOF = l.peekEOF1 - l.peekChar1 = l.peekChar2 - l.peekEOF1 = l.peekEOF2 - l.peekChar2 = nullChar - l.peekEOF2 = false - return l.lastChar, l.reachedEOF, nil - } - c, _, err := l.src.ReadRune() - if err != nil { - if err == io.EOF { - l.prevChar2 = l.prevChar1 - l.pervEOF2 = l.prevEOF1 - l.prevChar1 = l.lastChar - l.prevEOF1 = l.reachedEOF - l.lastChar = nullChar - l.reachedEOF = true - return l.lastChar, l.reachedEOF, nil - } - return nullChar, false, err - } - l.prevChar2 = l.prevChar1 - l.pervEOF2 = l.prevEOF1 - l.prevChar1 = l.lastChar - l.prevEOF1 = l.reachedEOF - l.lastChar = c - l.reachedEOF = false - return l.lastChar, l.reachedEOF, nil -} - -func (l *lexer) restore() error { - if l.lastChar == nullChar && !l.reachedEOF { - return fmt.Errorf("failed to call restore() because the last character is null") - } - l.peekChar2 = l.peekChar1 - l.peekEOF2 = l.peekEOF1 - l.peekChar1 = l.lastChar - l.peekEOF1 = l.reachedEOF - l.lastChar = l.prevChar1 - l.reachedEOF = l.prevEOF1 - l.prevChar1 = l.prevChar2 - l.prevEOF1 = l.pervEOF2 - l.prevChar2 = nullChar - l.pervEOF2 = false - return nil -} diff --git a/src/urubu/grammar/lexical/parser/parser.go b/src/urubu/grammar/lexical/parser/parser.go deleted file mode 100644 index 425b553..0000000 --- a/src/urubu/grammar/lexical/parser/parser.go +++ /dev/null @@ -1,531 +0,0 @@ -package parser - -import ( - "bytes" - "fmt" - "io" - "strconv" - - spec "urubu/spec/grammar" - "urubu/ucd" -) - -type PatternEntry struct { - ID spec.LexModeKindID - Pattern []byte -} - -type parser struct { - kind spec.LexKindName - lex *lexer - peekedTok *token - lastTok *token - - // If and only if isContributoryPropertyExposed is true, the parser interprets contributory properties that - // appear in property expressions. - // - // The contributory properties are not exposed, and users cannot use those properties because the parser - // follows [UAX #44 5.13 Property APIs]. For instance, \p{Other_Alphabetic} is invalid. - // - // isContributoryPropertyExposed is set to true when the parser is generated recursively. The parser needs to - // interpret derived properties internally because the derived properties consist of other properties that - // may contain the contributory properties. - // - // [UAX #44 5.13 Property APIs] says: - // > The following subtypes of Unicode character properties should generally not be exposed in APIs, - // > except in limited circumstances. They may not be useful, particularly in public API collections, - // > and may instead prove misleading to the users of such API collections. - // > * Contributory properties are not recommended for public APIs. - // > ... - // https://unicode.org/reports/tr44/#Property_APIs - isContributoryPropertyExposed bool - - errCause error - errDetail string -} - -func NewParser(kind spec.LexKindName, src io.Reader) *parser { - return &parser{ - kind: kind, - lex: newLexer(src), - isContributoryPropertyExposed: false, - } -} - -func (p *parser) exposeContributoryProperty() { - p.isContributoryPropertyExposed = true -} - -func (p *parser) Error() (string, error) { - return p.errDetail, p.errCause -} - -func (p *parser) Parse() (root CPTree, retErr error) { - defer func() { - err := recover() - if err != nil { - var ok bool - retErr, ok = err.(error) - if !ok { - panic(err) - } - return - } - }() - - return newRootNode(p.kind, p.parseRegexp()), nil -} - -func (p *parser) parseRegexp() CPTree { - alt := p.parseAlt() - if alt == nil { - if p.consume(tokenKindGroupClose) { - p.raiseParseError(synErrGroupNoInitiator, "") - } - p.raiseParseError(synErrNullPattern, "") - } - if p.consume(tokenKindGroupClose) { - p.raiseParseError(synErrGroupNoInitiator, "") - } - p.expect(tokenKindEOF) - return alt -} - -func (p *parser) parseAlt() CPTree { - left := p.parseConcat() - if left == nil { - if p.consume(tokenKindAlt) { - p.raiseParseError(synErrAltLackOfOperand, "") - } - return nil - } - for { - if !p.consume(tokenKindAlt) { - break - } - right := p.parseConcat() - if right == nil { - p.raiseParseError(synErrAltLackOfOperand, "") - } - left = newAltNode(left, right) - } - return left -} - -func (p *parser) parseConcat() CPTree { - left := p.parseRepeat() - for { - right := p.parseRepeat() - if right == nil { - break - } - left = newConcatNode(left, right) - } - return left -} - -func (p *parser) parseRepeat() CPTree { - group := p.parseGroup() - if group == nil { - if p.consume(tokenKindRepeat) { - p.raiseParseError(synErrRepNoTarget, "* needs an operand") - } - if p.consume(tokenKindRepeatOneOrMore) { - p.raiseParseError(synErrRepNoTarget, "+ needs an operand") - } - if p.consume(tokenKindOption) { - p.raiseParseError(synErrRepNoTarget, "? needs an operand") - } - return nil - } - if p.consume(tokenKindRepeat) { - return newRepeatNode(group) - } - if p.consume(tokenKindRepeatOneOrMore) { - return newRepeatOneOrMoreNode(group) - } - if p.consume(tokenKindOption) { - return newOptionNode(group) - } - return group -} - -func (p *parser) parseGroup() CPTree { - if p.consume(tokenKindGroupOpen) { - alt := p.parseAlt() - if alt == nil { - if p.consume(tokenKindEOF) { - p.raiseParseError(synErrGroupUnclosed, "") - } - p.raiseParseError(synErrGroupNoElem, "") - } - if p.consume(tokenKindEOF) { - p.raiseParseError(synErrGroupUnclosed, "") - } - if !p.consume(tokenKindGroupClose) { - p.raiseParseError(synErrGroupInvalidForm, "") - } - return alt - } - return p.parseSingleChar() -} - -func (p *parser) parseSingleChar() CPTree { - if p.consume(tokenKindAnyChar) { - return genAnyCharAST() - } - if p.consume(tokenKindBExpOpen) { - left := p.parseBExpElem() - if left == nil { - if p.consume(tokenKindEOF) { - p.raiseParseError(synErrBExpUnclosed, "") - } - p.raiseParseError(synErrBExpNoElem, "") - } - for { - right := p.parseBExpElem() - if right == nil { - break - } - left = newAltNode(left, right) - } - if p.consume(tokenKindEOF) { - p.raiseParseError(synErrBExpUnclosed, "") - } - p.expect(tokenKindBExpClose) - return left - } - if p.consume(tokenKindInverseBExpOpen) { - elem := p.parseBExpElem() - if elem == nil { - if p.consume(tokenKindEOF) { - p.raiseParseError(synErrBExpUnclosed, "") - } - p.raiseParseError(synErrBExpNoElem, "") - } - inverse := exclude(elem, genAnyCharAST()) - if inverse == nil { - p.raiseParseError(synErrUnmatchablePattern, "") - } - for { - elem := p.parseBExpElem() - if elem == nil { - break - } - inverse = exclude(elem, inverse) - if inverse == nil { - p.raiseParseError(synErrUnmatchablePattern, "") - } - } - if p.consume(tokenKindEOF) { - p.raiseParseError(synErrBExpUnclosed, "") - } - p.expect(tokenKindBExpClose) - return inverse - } - if p.consume(tokenKindCodePointLeader) { - return p.parseCodePoint() - } - if p.consume(tokenKindCharPropLeader) { - return p.parseCharProp() - } - if p.consume(tokenKindFragmentLeader) { - return p.parseFragment() - } - c := p.parseNormalChar() - if c == nil { - if p.consume(tokenKindBExpClose) { - p.raiseParseError(synErrBExpInvalidForm, "") - } - return nil - } - return c -} - -func (p *parser) parseBExpElem() CPTree { - var left CPTree - switch { - case p.consume(tokenKindCodePointLeader): - left = p.parseCodePoint() - case p.consume(tokenKindCharPropLeader): - left = p.parseCharProp() - if p.consume(tokenKindCharRange) { - p.raiseParseError(synErrRangePropIsUnavailable, "") - } - default: - left = p.parseNormalChar() - } - if left == nil { - return nil - } - if !p.consume(tokenKindCharRange) { - return left - } - var right CPTree - switch { - case p.consume(tokenKindCodePointLeader): - right = p.parseCodePoint() - case p.consume(tokenKindCharPropLeader): - p.raiseParseError(synErrRangePropIsUnavailable, "") - default: - right = p.parseNormalChar() - } - if right == nil { - p.raiseParseError(synErrRangeInvalidForm, "") - } - from, _, _ := left.Range() - _, to, _ := right.Range() - if !isValidOrder(from, to) { - p.raiseParseError(synErrRangeInvalidOrder, fmt.Sprintf("%X..%X", from, to)) - } - return newRangeSymbolNode(from, to) -} - -func (p *parser) parseCodePoint() CPTree { - if !p.consume(tokenKindLBrace) { - p.raiseParseError(synErrCPExpInvalidForm, "") - } - if !p.consume(tokenKindCodePoint) { - p.raiseParseError(synErrCPExpInvalidForm, "") - } - - n, err := strconv.ParseInt(p.lastTok.codePoint, 16, 64) - if err != nil { - panic(fmt.Errorf("failed to decode a code point (%v) into a int: %v", p.lastTok.codePoint, err)) - } - if n < 0x0000 || n > 0x10FFFF { - p.raiseParseError(synErrCPExpOutOfRange, "") - } - - sym := newSymbolNode(rune(n)) - - if !p.consume(tokenKindRBrace) { - p.raiseParseError(synErrCPExpInvalidForm, "") - } - - return sym -} - -func (p *parser) parseCharProp() CPTree { - if !p.consume(tokenKindLBrace) { - p.raiseParseError(synErrCharPropExpInvalidForm, "") - } - var sym1, sym2 string - if !p.consume(tokenKindCharPropSymbol) { - p.raiseParseError(synErrCharPropExpInvalidForm, "") - } - sym1 = p.lastTok.propSymbol - if p.consume(tokenKindEqual) { - if !p.consume(tokenKindCharPropSymbol) { - p.raiseParseError(synErrCharPropExpInvalidForm, "") - } - sym2 = p.lastTok.propSymbol - } - - var alt CPTree - var propName, propVal string - if sym2 != "" { - propName = sym1 - propVal = sym2 - } else { - propName = "" - propVal = sym1 - } - if !p.isContributoryPropertyExposed && ucd.IsContributoryProperty(propName) { - p.raiseParseError(synErrCharPropUnsupported, propName) - } - pat, err := ucd.NormalizeCharacterProperty(propName, propVal) - if err != nil { - p.raiseParseError(synErrCharPropUnsupported, err.Error()) - } - if pat != "" { - p := NewParser(p.kind, bytes.NewReader([]byte(pat))) - p.exposeContributoryProperty() - ast, err := p.Parse() - if err != nil { - panic(err) - } - alt = ast - } else { - cpRanges, inverse, err := ucd.FindCodePointRanges(propName, propVal) - if err != nil { - p.raiseParseError(synErrCharPropUnsupported, err.Error()) - } - if inverse { - r := cpRanges[0] - alt = exclude(newRangeSymbolNode(r.From, r.To), genAnyCharAST()) - if alt == nil { - p.raiseParseError(synErrUnmatchablePattern, "") - } - for _, r := range cpRanges[1:] { - alt = exclude(newRangeSymbolNode(r.From, r.To), alt) - if alt == nil { - p.raiseParseError(synErrUnmatchablePattern, "") - } - } - } else { - for _, r := range cpRanges { - alt = genAltNode( - alt, - newRangeSymbolNode(r.From, r.To), - ) - } - } - } - - if !p.consume(tokenKindRBrace) { - p.raiseParseError(synErrCharPropExpInvalidForm, "") - } - - return alt -} - -func (p *parser) parseFragment() CPTree { - if !p.consume(tokenKindLBrace) { - p.raiseParseError(synErrFragmentExpInvalidForm, "") - } - if !p.consume(tokenKindFragmentSymbol) { - p.raiseParseError(synErrFragmentExpInvalidForm, "") - } - sym := p.lastTok.fragmentSymbol - - if !p.consume(tokenKindRBrace) { - p.raiseParseError(synErrFragmentExpInvalidForm, "") - } - - return newFragmentNode(spec.LexKindName(sym), nil) -} - -func (p *parser) parseNormalChar() CPTree { - if !p.consume(tokenKindChar) { - return nil - } - return newSymbolNode(p.lastTok.char) -} - -func exclude(symbol, base CPTree) CPTree { - if left, right, ok := symbol.Alternatives(); ok { - return exclude(right, exclude(left, base)) - } - - if left, right, ok := base.Alternatives(); ok { - return genAltNode( - exclude(symbol, left), - exclude(symbol, right), - ) - } - - if bFrom, bTo, ok := base.Range(); ok { - sFrom, sTo, ok := symbol.Range() - if !ok { - panic(fmt.Errorf("invalid symbol tree: %T", symbol)) - } - - switch { - case sFrom > bFrom && sTo < bTo: - return genAltNode( - newRangeSymbolNode(bFrom, sFrom-1), - newRangeSymbolNode(sTo+1, bTo), - ) - case sFrom <= bFrom && sTo >= bFrom && sTo < bTo: - return newRangeSymbolNode(sTo+1, bTo) - case sFrom > bFrom && sFrom <= bTo && sTo >= bTo: - return newRangeSymbolNode(bFrom, sFrom-1) - case sFrom <= bFrom && sTo >= bTo: - return nil - default: - return base - } - } - - panic(fmt.Errorf("invalid base tree: %T", base)) -} - -func genAnyCharAST() CPTree { - return newRangeSymbolNode(0x0, 0x10FFFF) -} - -func isValidOrder(from, to rune) bool { - return from <= to -} - -func genConcatNode(cs ...CPTree) CPTree { - nonNilNodes := []CPTree{} - for _, c := range cs { - if c == nil { - continue - } - nonNilNodes = append(nonNilNodes, c) - } - if len(nonNilNodes) <= 0 { - return nil - } - if len(nonNilNodes) == 1 { - return nonNilNodes[0] - } - concat := newConcatNode(nonNilNodes[0], nonNilNodes[1]) - for _, c := range nonNilNodes[2:] { - concat = newConcatNode(concat, c) - } - return concat -} - -func genAltNode(cs ...CPTree) CPTree { - nonNilNodes := []CPTree{} - for _, c := range cs { - if c == nil { - continue - } - nonNilNodes = append(nonNilNodes, c) - } - if len(nonNilNodes) <= 0 { - return nil - } - if len(nonNilNodes) == 1 { - return nonNilNodes[0] - } - alt := newAltNode(nonNilNodes[0], nonNilNodes[1]) - for _, c := range nonNilNodes[2:] { - alt = newAltNode(alt, c) - } - return alt -} - -func (p *parser) expect(expected tokenKind) { - if !p.consume(expected) { - tok := p.peekedTok - p.raiseParseError(synErrUnexpectedToken, fmt.Sprintf("expected: %v, actual: %v", expected, tok.kind)) - } -} - -func (p *parser) consume(expected tokenKind) bool { - var tok *token - var err error - if p.peekedTok != nil { - tok = p.peekedTok - p.peekedTok = nil - } else { - tok, err = p.lex.next() - if err != nil { - if err == ParseErr { - detail, cause := p.lex.error() - p.raiseParseError(cause, detail) - } - panic(err) - } - } - p.lastTok = tok - if tok.kind == expected { - return true - } - p.peekedTok = tok - p.lastTok = nil - - return false -} - -func (p *parser) raiseParseError(err error, detail string) { - p.errCause = err - p.errDetail = detail - panic(ParseErr) -} diff --git a/src/urubu/grammar/lexical/parser/tree.go b/src/urubu/grammar/lexical/parser/tree.go deleted file mode 100644 index df03d37..0000000 --- a/src/urubu/grammar/lexical/parser/tree.go +++ /dev/null @@ -1,459 +0,0 @@ -package parser - -import ( - "fmt" - "io" - "sort" - - spec "urubu/spec/grammar" -) - -type CPRange struct { - From rune - To rune -} - -type CPTree interface { - fmt.Stringer - Range() (rune, rune, bool) - Optional() (CPTree, bool) - Repeatable() (CPTree, bool) - Concatenation() (CPTree, CPTree, bool) - Alternatives() (CPTree, CPTree, bool) - Describe() (spec.LexKindName, []spec.LexKindName, error) - - children() (CPTree, CPTree) - clone() CPTree -} - -var ( - _ CPTree = &rootNode{} - _ CPTree = &symbolNode{} - _ CPTree = &concatNode{} - _ CPTree = &altNode{} - _ CPTree = &quantifierNode{} - _ CPTree = &fragmentNode{} -) - -type rootNode struct { - kind spec.LexKindName - tree CPTree - fragments map[spec.LexKindName][]*fragmentNode -} - -func newRootNode(kind spec.LexKindName, t CPTree) *rootNode { - fragments := map[spec.LexKindName][]*fragmentNode{} - collectFragments(t, fragments) - - return &rootNode{ - kind: kind, - tree: t, - fragments: fragments, - } -} - -func collectFragments(n CPTree, fragments map[spec.LexKindName][]*fragmentNode) { - if n == nil { - return - } - - if f, ok := n.(*fragmentNode); ok { - fragments[f.kind] = append(fragments[f.kind], f) - return - } - - l, r := n.children() - collectFragments(l, fragments) - collectFragments(r, fragments) -} - -func (n *rootNode) String() string { - return fmt.Sprintf("root: %v: %v fragments", n.kind, len(n.fragments)) -} - -func (n *rootNode) Range() (rune, rune, bool) { - return n.tree.Range() -} - -func (n *rootNode) Optional() (CPTree, bool) { - return n.tree.Optional() -} - -func (n *rootNode) Repeatable() (CPTree, bool) { - return n.tree.Repeatable() -} - -func (n *rootNode) Concatenation() (CPTree, CPTree, bool) { - return n.tree.Concatenation() -} - -func (n *rootNode) Alternatives() (CPTree, CPTree, bool) { - return n.tree.Alternatives() -} - -func (n *rootNode) Describe() (spec.LexKindName, []spec.LexKindName, error) { - var frags []spec.LexKindName - for f := range n.fragments { - frags = append(frags, spec.LexKindName(f)) - } - sort.Slice(frags, func(i, j int) bool { - return frags[i] < frags[j] - }) - - return n.kind, frags, nil -} - -func (n *rootNode) children() (CPTree, CPTree) { - return n.tree.children() -} - -func (n *rootNode) clone() CPTree { - return n.tree.clone() -} - -func (n *rootNode) incomplete() bool { - return len(n.fragments) > 0 -} - -func (n *rootNode) applyFragment(kind spec.LexKindName, fragment CPTree) error { - root, ok := fragment.(*rootNode) - if !ok { - return fmt.Errorf("applyFragment can take only *rootNode: %T", fragment) - } - if root.incomplete() { - return fmt.Errorf("fragment is incomplete") - } - - fs, ok := n.fragments[kind] - if !ok { - return nil - } - for _, f := range fs { - f.tree = root.clone() - } - delete(n.fragments, kind) - - return nil -} - -type symbolNode struct { - CPRange -} - -func newSymbolNode(cp rune) *symbolNode { - return &symbolNode{ - CPRange: CPRange{ - From: cp, - To: cp, - }, - } -} - -func newRangeSymbolNode(from, to rune) *symbolNode { - return &symbolNode{ - CPRange: CPRange{ - From: from, - To: to, - }, - } -} - -func (n *symbolNode) String() string { - return fmt.Sprintf("symbol: %X..%X", n.From, n.To) -} - -func (n *symbolNode) Range() (rune, rune, bool) { - return n.From, n.To, true -} - -func (n *symbolNode) Optional() (CPTree, bool) { - return nil, false -} - -func (n *symbolNode) Repeatable() (CPTree, bool) { - return nil, false -} - -func (n *symbolNode) Concatenation() (CPTree, CPTree, bool) { - return nil, nil, false -} - -func (n *symbolNode) Alternatives() (CPTree, CPTree, bool) { - return nil, nil, false -} - -func (n *symbolNode) Describe() (spec.LexKindName, []spec.LexKindName, error) { - return spec.LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n) -} - -func (n *symbolNode) children() (CPTree, CPTree) { - return nil, nil -} - -func (n *symbolNode) clone() CPTree { - return newRangeSymbolNode(n.From, n.To) -} - -type concatNode struct { - left CPTree - right CPTree -} - -func newConcatNode(left, right CPTree) *concatNode { - return &concatNode{ - left: left, - right: right, - } -} - -func (n *concatNode) String() string { - return "concat" -} - -func (n *concatNode) Range() (rune, rune, bool) { - return 0, 0, false -} - -func (n *concatNode) Optional() (CPTree, bool) { - return nil, false -} - -func (n *concatNode) Repeatable() (CPTree, bool) { - return nil, false -} - -func (n *concatNode) Concatenation() (CPTree, CPTree, bool) { - return n.left, n.right, true -} - -func (n *concatNode) Alternatives() (CPTree, CPTree, bool) { - return nil, nil, false -} - -func (n *concatNode) Describe() (spec.LexKindName, []spec.LexKindName, error) { - return spec.LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n) -} - -func (n *concatNode) children() (CPTree, CPTree) { - return n.left, n.right -} - -func (n *concatNode) clone() CPTree { - if n == nil { - return nil - } - return newConcatNode(n.left.clone(), n.right.clone()) -} - -type altNode struct { - left CPTree - right CPTree -} - -func newAltNode(left, right CPTree) *altNode { - return &altNode{ - left: left, - right: right, - } -} - -func (n *altNode) String() string { - return "alt" -} - -func (n *altNode) Range() (rune, rune, bool) { - return 0, 0, false -} - -func (n *altNode) Optional() (CPTree, bool) { - return nil, false -} - -func (n *altNode) Repeatable() (CPTree, bool) { - return nil, false -} - -func (n *altNode) Concatenation() (CPTree, CPTree, bool) { - return nil, nil, false -} - -func (n *altNode) Alternatives() (CPTree, CPTree, bool) { - return n.left, n.right, true -} - -func (n *altNode) Describe() (spec.LexKindName, []spec.LexKindName, error) { - return spec.LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n) -} - -func (n *altNode) children() (CPTree, CPTree) { - return n.left, n.right -} - -func (n *altNode) clone() CPTree { - return newAltNode(n.left.clone(), n.right.clone()) -} - -type quantifierNode struct { - optional bool - repeatable bool - tree CPTree -} - -func (n *quantifierNode) String() string { - switch { - case n.repeatable: - return "repeatable (>= 0 times)" - case n.optional: - return "optional (0 or 1 times)" - default: - return "invalid quantifier" - } -} - -func newRepeatNode(t CPTree) *quantifierNode { - return &quantifierNode{ - repeatable: true, - tree: t, - } -} - -func newRepeatOneOrMoreNode(t CPTree) *concatNode { - return newConcatNode( - t, - &quantifierNode{ - repeatable: true, - tree: t.clone(), - }) -} - -func newOptionNode(t CPTree) *quantifierNode { - return &quantifierNode{ - optional: true, - tree: t, - } -} - -func (n *quantifierNode) Range() (rune, rune, bool) { - return 0, 0, false -} - -func (n *quantifierNode) Optional() (CPTree, bool) { - return n.tree, n.optional -} - -func (n *quantifierNode) Repeatable() (CPTree, bool) { - return n.tree, n.repeatable -} - -func (n *quantifierNode) Concatenation() (CPTree, CPTree, bool) { - return nil, nil, false -} - -func (n *quantifierNode) Alternatives() (CPTree, CPTree, bool) { - return nil, nil, false -} - -func (n *quantifierNode) Describe() (spec.LexKindName, []spec.LexKindName, error) { - return spec.LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n) -} - -func (n *quantifierNode) children() (CPTree, CPTree) { - return n.tree, nil -} - -func (n *quantifierNode) clone() CPTree { - if n.repeatable { - return newRepeatNode(n.tree.clone()) - } - return newOptionNode(n.tree.clone()) -} - -type fragmentNode struct { - kind spec.LexKindName - tree CPTree -} - -func newFragmentNode(kind spec.LexKindName, t CPTree) *fragmentNode { - return &fragmentNode{ - kind: kind, - tree: t, - } -} - -func (n *fragmentNode) String() string { - return fmt.Sprintf("fragment: %v", n.kind) -} - -func (n *fragmentNode) Range() (rune, rune, bool) { - return n.tree.Range() -} - -func (n *fragmentNode) Optional() (CPTree, bool) { - return n.tree.Optional() -} - -func (n *fragmentNode) Repeatable() (CPTree, bool) { - return n.tree.Repeatable() -} - -func (n *fragmentNode) Concatenation() (CPTree, CPTree, bool) { - return n.tree.Concatenation() -} - -func (n *fragmentNode) Alternatives() (CPTree, CPTree, bool) { - return n.tree.Alternatives() -} - -func (n *fragmentNode) Describe() (spec.LexKindName, []spec.LexKindName, error) { - return spec.LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n) -} - -func (n *fragmentNode) children() (CPTree, CPTree) { - return n.tree.children() -} - -func (n *fragmentNode) clone() CPTree { - if n.tree == nil { - return newFragmentNode(n.kind, nil) - } - return newFragmentNode(n.kind, n.tree.clone()) -} - -//nolint:unused -func printCPTree(w io.Writer, t CPTree, ruledLine string, childRuledLinePrefix string) { - if t == nil { - return - } - fmt.Fprintf(w, "%v%v\n", ruledLine, t) - children := []CPTree{} - switch n := t.(type) { - case *rootNode: - children = append(children, n.tree) - case *fragmentNode: - children = append(children, n.tree) - default: - left, right := t.children() - if left != nil { - children = append(children, left) - } - if right != nil { - children = append(children, right) - } - } - num := len(children) - for i, child := range children { - line := "└─ " - if num > 1 { - if i == 0 { - line = "├─ " - } else if i < num-1 { - line = "│ " - } - } - prefix := "│ " - if i >= num-1 { - prefix = " " - } - printCPTree(w, child, childRuledLinePrefix+line, childRuledLinePrefix+prefix) - } -} diff --git a/src/urubu/grammar/lr0.go b/src/urubu/grammar/lr0.go deleted file mode 100644 index 92a2137..0000000 --- a/src/urubu/grammar/lr0.go +++ /dev/null @@ -1,197 +0,0 @@ -package grammar - -import ( - "fmt" - "sort" - - "urubu/grammar/symbol" -) - -type lr0Automaton struct { - initialState kernelID - states map[kernelID]*lrState -} - -func genLR0Automaton(prods *productionSet, startSym symbol.Symbol, errSym symbol.Symbol) (*lr0Automaton, error) { - if !startSym.IsStart() { - return nil, fmt.Errorf("passed symbold is not a start symbol") - } - - automaton := &lr0Automaton{ - states: map[kernelID]*lrState{}, - } - - currentState := stateNumInitial - knownKernels := map[kernelID]struct{}{} - uncheckedKernels := []*kernel{} - - // Generate an initial kernel. - { - prods, _ := prods.findByLHS(startSym) - initialItem, err := newLR0Item(prods[0], 0) - if err != nil { - return nil, err - } - - k, err := newKernel([]*lrItem{initialItem}) - if err != nil { - return nil, err - } - - automaton.initialState = k.id - knownKernels[k.id] = struct{}{} - uncheckedKernels = append(uncheckedKernels, k) - } - - for len(uncheckedKernels) > 0 { - nextUncheckedKernels := []*kernel{} - for _, k := range uncheckedKernels { - state, neighbours, err := genStateAndNeighbourKernels(k, prods, errSym) - if err != nil { - return nil, err - } - state.num = currentState - currentState = currentState.next() - - automaton.states[state.id] = state - - for _, k := range neighbours { - if _, known := knownKernels[k.id]; known { - continue - } - knownKernels[k.id] = struct{}{} - nextUncheckedKernels = append(nextUncheckedKernels, k) - } - } - uncheckedKernels = nextUncheckedKernels - } - - return automaton, nil -} - -func genStateAndNeighbourKernels(k *kernel, prods *productionSet, errSym symbol.Symbol) (*lrState, []*kernel, error) { - items, err := genLR0Closure(k, prods) - if err != nil { - return nil, nil, err - } - neighbours, err := genNeighbourKernels(items, prods) - if err != nil { - return nil, nil, err - } - - next := map[symbol.Symbol]kernelID{} - kernels := []*kernel{} - for _, n := range neighbours { - next[n.symbol] = n.kernel.id - kernels = append(kernels, n.kernel) - } - - reducible := map[productionID]struct{}{} - var emptyProdItems []*lrItem - isErrorTrapper := false - for _, item := range items { - if item.dottedSymbol == errSym { - isErrorTrapper = true - } - - if item.reducible { - reducible[item.prod] = struct{}{} - - prod, ok := prods.findByID(item.prod) - if !ok { - return nil, nil, fmt.Errorf("reducible production not found: %v", item.prod) - } - if prod.isEmpty() { - emptyProdItems = append(emptyProdItems, item) - } - } - } - - return &lrState{ - kernel: k, - next: next, - reducible: reducible, - emptyProdItems: emptyProdItems, - isErrorTrapper: isErrorTrapper, - }, kernels, nil -} - -func genLR0Closure(k *kernel, prods *productionSet) ([]*lrItem, error) { - items := []*lrItem{} - knownItems := map[lrItemID]struct{}{} - uncheckedItems := []*lrItem{} - for _, item := range k.items { - items = append(items, item) - uncheckedItems = append(uncheckedItems, item) - } - for len(uncheckedItems) > 0 { - nextUncheckedItems := []*lrItem{} - for _, item := range uncheckedItems { - if item.dottedSymbol.IsTerminal() { - continue - } - - ps, _ := prods.findByLHS(item.dottedSymbol) - for _, prod := range ps { - item, err := newLR0Item(prod, 0) - if err != nil { - return nil, err - } - if _, exist := knownItems[item.id]; exist { - continue - } - items = append(items, item) - knownItems[item.id] = struct{}{} - nextUncheckedItems = append(nextUncheckedItems, item) - } - } - uncheckedItems = nextUncheckedItems - } - - return items, nil -} - -type neighbourKernel struct { - symbol symbol.Symbol - kernel *kernel -} - -func genNeighbourKernels(items []*lrItem, prods *productionSet) ([]*neighbourKernel, error) { - kItemMap := map[symbol.Symbol][]*lrItem{} - for _, item := range items { - if item.dottedSymbol.IsNil() { - continue - } - prod, ok := prods.findByID(item.prod) - if !ok { - return nil, fmt.Errorf("a production was not found: %v", item.prod) - } - kItem, err := newLR0Item(prod, item.dot+1) - if err != nil { - return nil, err - } - kItemMap[item.dottedSymbol] = append(kItemMap[item.dottedSymbol], kItem) - } - - nextSyms := []symbol.Symbol{} - for sym := range kItemMap { - nextSyms = append(nextSyms, sym) - } - sort.Slice(nextSyms, func(i, j int) bool { - return nextSyms[i] < nextSyms[j] - }) - - kernels := []*neighbourKernel{} - for _, sym := range nextSyms { - k, err := newKernel(kItemMap[sym]) - if err != nil { - return nil, err - } - kernels = append(kernels, &neighbourKernel{ - symbol: sym, - kernel: k, - }) - } - - return kernels, nil -} diff --git a/src/urubu/grammar/parsing_table.go b/src/urubu/grammar/parsing_table.go deleted file mode 100644 index 48ea9fe..0000000 --- a/src/urubu/grammar/parsing_table.go +++ /dev/null @@ -1,553 +0,0 @@ -package grammar - -import ( - "fmt" - "sort" - - "urubu/grammar/symbol" - spec "urubu/spec/grammar" -) - -type ActionType string - -const ( - ActionTypeShift = ActionType("shift") - ActionTypeReduce = ActionType("reduce") - ActionTypeError = ActionType("error") -) - -type actionEntry int - -const actionEntryEmpty = actionEntry(0) - -func newShiftActionEntry(state stateNum) actionEntry { - return actionEntry(state * -1) -} - -func newReduceActionEntry(prod productionNum) actionEntry { - return actionEntry(prod) -} - -func (e actionEntry) isEmpty() bool { - return e == actionEntryEmpty -} - -func (e actionEntry) describe() (ActionType, stateNum, productionNum) { - if e == actionEntryEmpty { - return ActionTypeError, stateNumInitial, productionNumNil - } - if e < 0 { - return ActionTypeShift, stateNum(e * -1), productionNumNil - } - return ActionTypeReduce, stateNumInitial, productionNum(e) -} - -type GoToType string - -const ( - GoToTypeRegistered = GoToType("registered") - GoToTypeError = GoToType("error") -) - -type goToEntry uint - -const goToEntryEmpty = goToEntry(0) - -func newGoToEntry(state stateNum) goToEntry { - return goToEntry(state) -} - -func (e goToEntry) describe() (GoToType, stateNum) { - if e == goToEntryEmpty { - return GoToTypeError, stateNumInitial - } - return GoToTypeRegistered, stateNum(e) -} - -type conflictResolutionMethod int - -func (m conflictResolutionMethod) Int() int { - return int(m) -} - -const ( - ResolvedByPrec conflictResolutionMethod = 1 - ResolvedByAssoc conflictResolutionMethod = 2 - ResolvedByShift conflictResolutionMethod = 3 - ResolvedByProdOrder conflictResolutionMethod = 4 -) - -type conflict interface { - conflict() -} - -type shiftReduceConflict struct { - state stateNum - sym symbol.Symbol - nextState stateNum - prodNum productionNum - resolvedBy conflictResolutionMethod -} - -func (c *shiftReduceConflict) conflict() { -} - -type reduceReduceConflict struct { - state stateNum - sym symbol.Symbol - prodNum1 productionNum - prodNum2 productionNum - resolvedBy conflictResolutionMethod -} - -func (c *reduceReduceConflict) conflict() { -} - -var ( - _ conflict = &shiftReduceConflict{} - _ conflict = &reduceReduceConflict{} -) - -type ParsingTable struct { - actionTable []actionEntry - goToTable []goToEntry - stateCount int - terminalCount int - nonTerminalCount int - - // errorTrapperStates's index means a state number, and when `errorTrapperStates[stateNum]` is `1`, - // the state has an item having the following form. The `α` and `β` can be empty. - // - // A → α・error β - errorTrapperStates []int - - InitialState stateNum -} - -func (t *ParsingTable) getAction(state stateNum, sym symbol.SymbolNum) (ActionType, stateNum, productionNum) { - pos := state.Int()*t.terminalCount + sym.Int() - return t.actionTable[pos].describe() -} - -func (t *ParsingTable) getGoTo(state stateNum, sym symbol.SymbolNum) (GoToType, stateNum) { - pos := state.Int()*t.nonTerminalCount + sym.Int() - return t.goToTable[pos].describe() -} - -func (t *ParsingTable) readAction(row int, col int) actionEntry { - return t.actionTable[row*t.terminalCount+col] -} - -func (t *ParsingTable) writeAction(row int, col int, act actionEntry) { - t.actionTable[row*t.terminalCount+col] = act -} - -func (t *ParsingTable) writeGoTo(state stateNum, sym symbol.Symbol, nextState stateNum) { - pos := state.Int()*t.nonTerminalCount + sym.Num().Int() - t.goToTable[pos] = newGoToEntry(nextState) -} - -type lrTableBuilder struct { - automaton *lr0Automaton - prods *productionSet - termCount int - nonTermCount int - symTab *symbol.SymbolTableReader - precAndAssoc *precAndAssoc - - conflicts []conflict -} - -func (b *lrTableBuilder) build() (*ParsingTable, error) { - var ptab *ParsingTable - { - initialState := b.automaton.states[b.automaton.initialState] - ptab = &ParsingTable{ - actionTable: make([]actionEntry, len(b.automaton.states)*b.termCount), - goToTable: make([]goToEntry, len(b.automaton.states)*b.nonTermCount), - stateCount: len(b.automaton.states), - terminalCount: b.termCount, - nonTerminalCount: b.nonTermCount, - errorTrapperStates: make([]int, len(b.automaton.states)), - InitialState: initialState.num, - } - } - - for _, state := range b.automaton.states { - if state.isErrorTrapper { - ptab.errorTrapperStates[state.num] = 1 - } - - for sym, kID := range state.next { - nextState := b.automaton.states[kID] - if sym.IsTerminal() { - b.writeShiftAction(ptab, state.num, sym, nextState.num) - } else { - ptab.writeGoTo(state.num, sym, nextState.num) - } - } - - for prodID := range state.reducible { - reducibleProd, ok := b.prods.findByID(prodID) - if !ok { - return nil, fmt.Errorf("reducible production not found: %v", prodID) - } - - var reducibleItem *lrItem - for _, item := range state.items { - if item.prod != reducibleProd.id { - continue - } - - reducibleItem = item - break - } - if reducibleItem == nil { - for _, item := range state.emptyProdItems { - if item.prod != reducibleProd.id { - continue - } - - reducibleItem = item - break - } - if reducibleItem == nil { - return nil, fmt.Errorf("reducible item not found; state: %v, production: %v", state.num, reducibleProd.num) - } - } - - for a := range reducibleItem.lookAhead.symbols { - b.writeReduceAction(ptab, state.num, a, reducibleProd.num) - } - } - } - - return ptab, nil -} - -// writeShiftAction writes a shift action to the parsing table. When a shift/reduce conflict occurred, -// we prioritize the shift action. -func (b *lrTableBuilder) writeShiftAction(tab *ParsingTable, state stateNum, sym symbol.Symbol, nextState stateNum) { - act := tab.readAction(state.Int(), sym.Num().Int()) - if !act.isEmpty() { - ty, _, p := act.describe() - if ty == ActionTypeReduce { - act, method := b.resolveSRConflict(sym.Num(), p) - b.conflicts = append(b.conflicts, &shiftReduceConflict{ - state: state, - sym: sym, - nextState: nextState, - prodNum: p, - resolvedBy: method, - }) - if act == ActionTypeShift { - tab.writeAction(state.Int(), sym.Num().Int(), newShiftActionEntry(nextState)) - } - return - } - } - tab.writeAction(state.Int(), sym.Num().Int(), newShiftActionEntry(nextState)) -} - -// writeReduceAction writes a reduce action to the parsing table. When a shift/reduce conflict occurred, -// we prioritize the shift action, and when a reduce/reduce conflict we prioritize the action that reduces -// the production with higher priority. Productions defined earlier in the grammar file have a higher priority. -func (b *lrTableBuilder) writeReduceAction(tab *ParsingTable, state stateNum, sym symbol.Symbol, prod productionNum) { - act := tab.readAction(state.Int(), sym.Num().Int()) - if !act.isEmpty() { - ty, s, p := act.describe() - switch ty { - case ActionTypeReduce: - if p == prod { - return - } - - b.conflicts = append(b.conflicts, &reduceReduceConflict{ - state: state, - sym: sym, - prodNum1: p, - prodNum2: prod, - resolvedBy: ResolvedByProdOrder, - }) - if p < prod { - tab.writeAction(state.Int(), sym.Num().Int(), newReduceActionEntry(p)) - } else { - tab.writeAction(state.Int(), sym.Num().Int(), newReduceActionEntry(prod)) - } - case ActionTypeShift: - act, method := b.resolveSRConflict(sym.Num(), prod) - b.conflicts = append(b.conflicts, &shiftReduceConflict{ - state: state, - sym: sym, - nextState: s, - prodNum: prod, - resolvedBy: method, - }) - if act == ActionTypeReduce { - tab.writeAction(state.Int(), sym.Num().Int(), newReduceActionEntry(prod)) - } - } - return - } - tab.writeAction(state.Int(), sym.Num().Int(), newReduceActionEntry(prod)) -} - -func (b *lrTableBuilder) resolveSRConflict(sym symbol.SymbolNum, prod productionNum) (ActionType, conflictResolutionMethod) { - symPrec := b.precAndAssoc.terminalPrecedence(sym) - prodPrec := b.precAndAssoc.productionPredence(prod) - if symPrec == 0 || prodPrec == 0 { - return ActionTypeShift, ResolvedByShift - } - if symPrec == prodPrec { - assoc := b.precAndAssoc.productionAssociativity(prod) - if assoc != assocTypeLeft { - return ActionTypeShift, ResolvedByAssoc - } - return ActionTypeReduce, ResolvedByAssoc - } - if symPrec < prodPrec { - return ActionTypeShift, ResolvedByPrec - } - return ActionTypeReduce, ResolvedByPrec -} - -func (b *lrTableBuilder) genReport(tab *ParsingTable, gram *Grammar) (*spec.Report, error) { - var terms []*spec.Terminal - { - termSyms := b.symTab.TerminalSymbols() - terms = make([]*spec.Terminal, len(termSyms)+1) - - for _, sym := range termSyms { - name, ok := b.symTab.ToText(sym) - if !ok { - return nil, fmt.Errorf("failed to generate terminals: symbol not found: %v", sym) - } - - term := &spec.Terminal{ - Number: sym.Num().Int(), - Name: name, - } - - prec := b.precAndAssoc.terminalPrecedence(sym.Num()) - if prec != precNil { - term.Precedence = prec - } - - assoc := b.precAndAssoc.terminalAssociativity(sym.Num()) - switch assoc { - case assocTypeLeft: - term.Associativity = "l" - case assocTypeRight: - term.Associativity = "r" - } - - terms[sym.Num()] = term - } - } - - var nonTerms []*spec.NonTerminal - { - nonTermSyms := b.symTab.NonTerminalSymbols() - nonTerms = make([]*spec.NonTerminal, len(nonTermSyms)+1) - for _, sym := range nonTermSyms { - name, ok := b.symTab.ToText(sym) - if !ok { - return nil, fmt.Errorf("failed to generate non-terminals: symbol not found: %v", sym) - } - - nonTerms[sym.Num()] = &spec.NonTerminal{ - Number: sym.Num().Int(), - Name: name, - } - } - } - - var prods []*spec.Production - { - ps := gram.productionSet.getAllProductions() - prods = make([]*spec.Production, len(ps)+1) - for _, p := range ps { - rhs := make([]int, len(p.rhs)) - for i, e := range p.rhs { - if e.IsTerminal() { - rhs[i] = e.Num().Int() - } else { - rhs[i] = e.Num().Int() * -1 - } - } - - prod := &spec.Production{ - Number: p.num.Int(), - LHS: p.lhs.Num().Int(), - RHS: rhs, - } - - prec := b.precAndAssoc.productionPredence(p.num) - if prec != precNil { - prod.Precedence = prec - } - - assoc := b.precAndAssoc.productionAssociativity(p.num) - switch assoc { - case assocTypeLeft: - prod.Associativity = "l" - case assocTypeRight: - prod.Associativity = "r" - } - - prods[p.num.Int()] = prod - } - } - - var states []*spec.State - { - srConflicts := map[stateNum][]*shiftReduceConflict{} - rrConflicts := map[stateNum][]*reduceReduceConflict{} - for _, con := range b.conflicts { - switch c := con.(type) { - case *shiftReduceConflict: - srConflicts[c.state] = append(srConflicts[c.state], c) - case *reduceReduceConflict: - rrConflicts[c.state] = append(rrConflicts[c.state], c) - } - } - - states = make([]*spec.State, len(b.automaton.states)) - for _, s := range b.automaton.states { - kernel := make([]*spec.Item, len(s.items)) - for i, item := range s.items { - p, ok := b.prods.findByID(item.prod) - if !ok { - return nil, fmt.Errorf("failed to generate states: production of kernel item not found: %v", item.prod) - } - - kernel[i] = &spec.Item{ - Production: p.num.Int(), - Dot: item.dot, - } - } - - sort.Slice(kernel, func(i, j int) bool { - if kernel[i].Production < kernel[j].Production { - return true - } - if kernel[i].Production > kernel[j].Production { - return false - } - return kernel[i].Dot < kernel[j].Dot - }) - - var shift []*spec.Transition - var reduce []*spec.Reduce - var goTo []*spec.Transition - { - TERMINALS_LOOP: - for _, t := range b.symTab.TerminalSymbols() { - act, next, prod := tab.getAction(s.num, t.Num()) - switch act { - case ActionTypeShift: - shift = append(shift, &spec.Transition{ - Symbol: t.Num().Int(), - State: next.Int(), - }) - case ActionTypeReduce: - for _, r := range reduce { - if r.Production == prod.Int() { - r.LookAhead = append(r.LookAhead, t.Num().Int()) - continue TERMINALS_LOOP - } - } - reduce = append(reduce, &spec.Reduce{ - LookAhead: []int{t.Num().Int()}, - Production: prod.Int(), - }) - } - } - - for _, n := range b.symTab.NonTerminalSymbols() { - ty, next := tab.getGoTo(s.num, n.Num()) - if ty == GoToTypeRegistered { - goTo = append(goTo, &spec.Transition{ - Symbol: n.Num().Int(), - State: next.Int(), - }) - } - } - - sort.Slice(shift, func(i, j int) bool { - return shift[i].State < shift[j].State - }) - sort.Slice(reduce, func(i, j int) bool { - return reduce[i].Production < reduce[j].Production - }) - sort.Slice(goTo, func(i, j int) bool { - return goTo[i].State < goTo[j].State - }) - } - - sr := []*spec.SRConflict{} - rr := []*spec.RRConflict{} - { - for _, c := range srConflicts[s.num] { - conflict := &spec.SRConflict{ - Symbol: c.sym.Num().Int(), - State: c.nextState.Int(), - Production: c.prodNum.Int(), - ResolvedBy: c.resolvedBy.Int(), - } - - ty, s, p := tab.getAction(s.num, c.sym.Num()) - switch ty { - case ActionTypeShift: - n := s.Int() - conflict.AdoptedState = &n - case ActionTypeReduce: - n := p.Int() - conflict.AdoptedProduction = &n - } - - sr = append(sr, conflict) - } - - sort.Slice(sr, func(i, j int) bool { - return sr[i].Symbol < sr[j].Symbol - }) - - for _, c := range rrConflicts[s.num] { - conflict := &spec.RRConflict{ - Symbol: c.sym.Num().Int(), - Production1: c.prodNum1.Int(), - Production2: c.prodNum2.Int(), - ResolvedBy: c.resolvedBy.Int(), - } - - _, _, p := tab.getAction(s.num, c.sym.Num()) - conflict.AdoptedProduction = p.Int() - - rr = append(rr, conflict) - } - - sort.Slice(rr, func(i, j int) bool { - return rr[i].Symbol < rr[j].Symbol - }) - } - - states[s.num.Int()] = &spec.State{ - Number: s.num.Int(), - Kernel: kernel, - Shift: shift, - Reduce: reduce, - GoTo: goTo, - SRConflict: sr, - RRConflict: rr, - } - } - } - - return &spec.Report{ - Terminals: terms, - NonTerminals: nonTerms, - Productions: prods, - States: states, - }, nil -} diff --git a/src/urubu/grammar/production.go b/src/urubu/grammar/production.go deleted file mode 100644 index 8f6c103..0000000 --- a/src/urubu/grammar/production.go +++ /dev/null @@ -1,117 +0,0 @@ -package grammar - -import ( - "crypto/sha256" - "encoding/hex" - "fmt" - - "urubu/grammar/symbol" -) - -type productionID [32]byte - -func (id productionID) String() string { - return hex.EncodeToString(id[:]) -} - -func genProductionID(lhs symbol.Symbol, rhs []symbol.Symbol) productionID { - seq := lhs.Byte() - for _, sym := range rhs { - seq = append(seq, sym.Byte()...) - } - return productionID(sha256.Sum256(seq)) -} - -type productionNum uint16 - -const ( - productionNumNil = productionNum(0) - productionNumStart = productionNum(1) - productionNumMin = productionNum(2) -) - -func (n productionNum) Int() int { - return int(n) -} - -type production struct { - id productionID - num productionNum - lhs symbol.Symbol - rhs []symbol.Symbol - rhsLen int -} - -func newProduction(lhs symbol.Symbol, rhs []symbol.Symbol) (*production, error) { - if lhs.IsNil() { - return nil, fmt.Errorf("LHS must be a non-nil symbol; LHS: %v, RHS: %v", lhs, rhs) - } - for _, sym := range rhs { - if sym.IsNil() { - return nil, fmt.Errorf("a symbol of RHS must be a non-nil symbol; LHS: %v, RHS: %v", lhs, rhs) - } - } - - return &production{ - id: genProductionID(lhs, rhs), - lhs: lhs, - rhs: rhs, - rhsLen: len(rhs), - }, nil -} - -func (p *production) isEmpty() bool { - return p.rhsLen == 0 -} - -type productionSet struct { - lhs2Prods map[symbol.Symbol][]*production - id2Prod map[productionID]*production - num productionNum -} - -func newProductionSet() *productionSet { - return &productionSet{ - lhs2Prods: map[symbol.Symbol][]*production{}, - id2Prod: map[productionID]*production{}, - num: productionNumMin, - } -} - -func (ps *productionSet) append(prod *production) { - if _, ok := ps.id2Prod[prod.id]; ok { - return - } - - if prod.lhs.IsStart() { - prod.num = productionNumStart - } else { - prod.num = ps.num - ps.num++ - } - - if prods, ok := ps.lhs2Prods[prod.lhs]; ok { - ps.lhs2Prods[prod.lhs] = append(prods, prod) - } else { - ps.lhs2Prods[prod.lhs] = []*production{prod} - } - ps.id2Prod[prod.id] = prod -} - -func (ps *productionSet) findByID(id productionID) (*production, bool) { - prod, ok := ps.id2Prod[id] - return prod, ok -} - -func (ps *productionSet) findByLHS(lhs symbol.Symbol) ([]*production, bool) { - if lhs.IsNil() { - return nil, false - } - - prods, ok := ps.lhs2Prods[lhs] - return prods, ok -} - -func (ps *productionSet) getAllProductions() map[productionID]*production { - return ps.id2Prod -} diff --git a/src/urubu/grammar/semantic_error.go b/src/urubu/grammar/semantic_error.go deleted file mode 100644 index 88a6b17..0000000 --- a/src/urubu/grammar/semantic_error.go +++ /dev/null @@ -1,30 +0,0 @@ -package grammar - -import "errors" - -var ( - semErrNoGrammarName = errors.New("name is missing") - semErrSpellingInconsistency = errors.New("the identifiers are treated as the same. please use the same spelling") - semErrDuplicateAssoc = errors.New("associativity and precedence cannot be specified multiple times for a symbol") - semErrUndefinedPrec = errors.New("symbol must has precedence") - semErrUndefinedOrdSym = errors.New("undefined ordered symbol") - semErrUnusedProduction = errors.New("unused production") - semErrUnusedTerminal = errors.New("unused terminal") - semErrTermCannotBeSkipped = errors.New("a terminal used in productions cannot be skipped") - semErrNoProduction = errors.New("a grammar needs at least one production") - semErrUndefinedSym = errors.New("undefined symbol") - semErrDuplicateProduction = errors.New("duplicate production") - semErrDuplicateTerminal = errors.New("duplicate terminal") - semErrDuplicateFragment = errors.New("duplicate fragment") - semErrDuplicateName = errors.New("duplicate names are not allowed between terminals and non-terminals") - semErrErrSymIsReserved = errors.New("symbol 'error' is reserved as a terminal symbol") - semErrDuplicateLabel = errors.New("a label must be unique in an alternative") - semErrInvalidLabel = errors.New("a label must differ from terminal symbols or non-terminal symbols") - semErrDirInvalidName = errors.New("invalid directive name") - semErrDirInvalidParam = errors.New("invalid parameter") - semErrDuplicateDir = errors.New("a directive must not be duplicated") - semErrDuplicateElem = errors.New("duplicate element") - semErrAmbiguousElem = errors.New("ambiguous element") - semErrInvalidProdDir = errors.New("invalid production directive") - semErrInvalidAltDir = errors.New("invalid alternative directive") -) diff --git a/src/urubu/grammar/symbol/symbol.go b/src/urubu/grammar/symbol.go index f9e6a93..f9e6a93 100644 --- a/src/urubu/grammar/symbol/symbol.go +++ b/src/urubu/grammar/symbol.go |