diff options
Diffstat (limited to 'grammar')
32 files changed, 6652 insertions, 729 deletions
diff --git a/grammar/first.go b/grammar/first.go index 72de282..923f8ed 100644 --- a/grammar/first.go +++ b/grammar/first.go @@ -1,20 +1,24 @@ package grammar -import "fmt" +import ( + "fmt" + + "github.com/nihei9/vartan/grammar/symbol" +) type firstEntry struct { - symbols map[symbol]struct{} + symbols map[symbol.Symbol]struct{} empty bool } func newFirstEntry() *firstEntry { return &firstEntry{ - symbols: map[symbol]struct{}{}, + symbols: map[symbol.Symbol]struct{}{}, empty: false, } } -func (e *firstEntry) add(sym symbol) bool { +func (e *firstEntry) add(sym symbol.Symbol) bool { if _, ok := e.symbols[sym]; ok { return false } @@ -45,12 +49,12 @@ func (e *firstEntry) mergeExceptEmpty(target *firstEntry) bool { } type firstSet struct { - set map[symbol]*firstEntry + set map[symbol.Symbol]*firstEntry } func newFirstSet(prods *productionSet) *firstSet { fst := &firstSet{ - set: map[symbol]*firstEntry{}, + set: map[symbol.Symbol]*firstEntry{}, } for _, prod := range prods.getAllProductions() { if _, ok := fst.set[prod.lhs]; ok { @@ -69,7 +73,7 @@ func (fst *firstSet) find(prod *production, head int) (*firstEntry, error) { return entry, nil } for _, sym := range prod.rhs[head:] { - if sym.isTerminal() { + if sym.IsTerminal() { entry.add(sym) return entry, nil } @@ -89,7 +93,7 @@ func (fst *firstSet) find(prod *production, head int) (*firstEntry, error) { return entry, nil } -func (fst *firstSet) findBySymbol(sym symbol) *firstEntry { +func (fst *firstSet) findBySymbol(sym symbol.Symbol) *firstEntry { return fst.set[sym] } @@ -130,7 +134,7 @@ func genProdFirstEntry(cc *firstComContext, acc *firstEntry, prod *production) ( } for _, sym := range prod.rhs { - if sym.isTerminal() { + if sym.IsTerminal() { return acc.add(sym), nil } diff --git a/grammar/first_test.go b/grammar/first_test.go index 21ee4df..1eff309 100644 --- a/grammar/first_test.go +++ b/grammar/first_test.go @@ -4,7 +4,8 @@ import ( "strings" "testing" - spec "github.com/nihei9/vartan/spec/grammar" + 
"github.com/nihei9/vartan/grammar/symbol" + "github.com/nihei9/vartan/spec/grammar/parser" ) type first struct { @@ -137,7 +138,7 @@ bar: "bar"; fst, gram := genActualFirst(t, tt.src) for _, ttFirst := range tt.first { - lhsSym, ok := gram.symbolTable.toSymbol(ttFirst.lhs) + lhsSym, ok := gram.symbolTable.ToSymbol(ttFirst.lhs) if !ok { t.Fatalf("a symbol was not found; symbol: %v", ttFirst.lhs) } @@ -161,14 +162,14 @@ bar: "bar"; } func genActualFirst(t *testing.T, src string) (*firstSet, *Grammar) { - ast, err := spec.Parse(strings.NewReader(src)) + ast, err := parser.Parse(strings.NewReader(src)) if err != nil { t.Fatal(err) } b := GrammarBuilder{ AST: ast, } - gram, err := b.Build() + gram, err := b.build() if err != nil { t.Fatal(err) } @@ -183,7 +184,7 @@ func genActualFirst(t *testing.T, src string) (*firstSet, *Grammar) { return fst, gram } -func genExpectedFirstEntry(t *testing.T, symbols []string, empty bool, symTab *symbolTableReader) *firstEntry { +func genExpectedFirstEntry(t *testing.T, symbols []string, empty bool, symTab *symbol.SymbolTableReader) *firstEntry { t.Helper() entry := newFirstEntry() @@ -191,7 +192,7 @@ func genExpectedFirstEntry(t *testing.T, symbols []string, empty bool, symTab *s entry.addEmpty() } for _, sym := range symbols { - symSym, ok := symTab.toSymbol(sym) + symSym, ok := symTab.ToSymbol(sym) if !ok { t.Fatalf("a symbol was not found; symbol: %v", sym) } diff --git a/grammar/grammar.go b/grammar/grammar.go index 50272e0..1e05289 100644 --- a/grammar/grammar.go +++ b/grammar/grammar.go @@ -5,10 +5,11 @@ import ( "io" "strings" - mlcompiler "github.com/nihei9/maleeni/compiler" - mlspec "github.com/nihei9/maleeni/spec" verr "github.com/nihei9/vartan/error" + "github.com/nihei9/vartan/grammar/lexical" + "github.com/nihei9/vartan/grammar/symbol" spec "github.com/nihei9/vartan/spec/grammar" + "github.com/nihei9/vartan/spec/grammar/parser" ) type astActionEntry struct { @@ -33,8 +34,8 @@ const ( // We use the priority of the 
production to resolve shift/reduce conflicts. type precAndAssoc struct { // termPrec and termAssoc represent the precedence of the terminal symbols. - termPrec map[symbolNum]int - termAssoc map[symbolNum]assocType + termPrec map[symbol.SymbolNum]int + termAssoc map[symbol.SymbolNum]assocType // prodPrec and prodAssoc represent the precedence and the associativities of the production. // These values are inherited from the right-most terminal symbols in the RHS of the productions. @@ -42,7 +43,7 @@ type precAndAssoc struct { prodAssoc map[productionNum]assocType } -func (pa *precAndAssoc) terminalPrecedence(sym symbolNum) int { +func (pa *precAndAssoc) terminalPrecedence(sym symbol.SymbolNum) int { prec, ok := pa.termPrec[sym] if !ok { return precNil @@ -51,7 +52,7 @@ func (pa *precAndAssoc) terminalPrecedence(sym symbolNum) int { return prec } -func (pa *precAndAssoc) terminalAssociativity(sym symbolNum) assocType { +func (pa *precAndAssoc) terminalAssociativity(sym symbol.SymbolNum) assocType { assoc, ok := pa.termAssoc[sym] if !ok { return assocTypeNil @@ -82,12 +83,12 @@ const reservedSymbolNameError = "error" type Grammar struct { name string - lexSpec *mlspec.LexSpec - skipLexKinds []mlspec.LexKindName + lexSpec *lexical.LexSpec + skipSymbols []symbol.Symbol productionSet *productionSet - augmentedStartSymbol symbol - errorSymbol symbol - symbolTable *symbolTableReader + augmentedStartSymbol symbol.Symbol + errorSymbol symbol.Symbol + symbolTable *symbol.SymbolTableReader astActions map[productionID][]*astActionEntry precAndAssoc *precAndAssoc @@ -95,13 +96,34 @@ type Grammar struct { recoverProductions map[productionID]struct{} } +type buildConfig struct { + isReportingEnabled bool +} + +type BuildOption func(config *buildConfig) + +func EnableReporting() BuildOption { + return func(config *buildConfig) { + config.isReportingEnabled = true + } +} + type GrammarBuilder struct { - AST *spec.RootNode + AST *parser.RootNode errs verr.SpecErrors } -func (b 
*GrammarBuilder) Build() (*Grammar, error) { +func (b *GrammarBuilder) Build(opts ...BuildOption) (*spec.CompiledGrammar, *spec.Report, error) { + gram, err := b.build() + if err != nil { + return nil, nil, err + } + + return compile(gram, opts...) +} + +func (b *GrammarBuilder) build() (*Grammar, error) { var specName string { errOccurred := false @@ -143,12 +165,12 @@ func (b *GrammarBuilder) Build() (*Grammar, error) { return nil, err } - lexSpec, err := b.genLexSpec(b.AST) + lexSpec, skip, err := b.genLexSpecAndSkipSymbols(symTab.Reader(), b.AST) if err != nil { return nil, err } - prodsAndActs, err := b.genProductionsAndActions(b.AST, symTab.reader(), ss.errSym, ss.augStartSym, ss.startSym) + prodsAndActs, err := b.genProductionsAndActions(b.AST, symTab.Reader(), ss.errSym, ss.augStartSym, ss.startSym) if err != nil { return nil, err } @@ -156,7 +178,7 @@ func (b *GrammarBuilder) Build() (*Grammar, error) { return nil, b.errs } - pa, err := b.genPrecAndAssoc(symTab.reader(), ss.errSym, prodsAndActs) + pa, err := b.genPrecAndAssoc(symTab.Reader(), ss.errSym, prodsAndActs) if err != nil { return nil, err } @@ -171,20 +193,23 @@ func (b *GrammarBuilder) Build() (*Grammar, error) { // When a terminal symbol that cannot be reached from the start symbol has the skip directive, // the compiler treats its terminal as a used symbol, not unused. 
- for _, sym := range lexSpec.skip { - s := sym.String() - if _, ok := syms.unusedTerminals[s]; !ok { - prod := syms.usedTerminals[s] - b.errs = append(b.errs, &verr.SpecError{ - Cause: semErrTermCannotBeSkipped, - Detail: s, - Row: prod.Pos.Row, - Col: prod.Pos.Col, - }) - continue - } + { + r := symTab.Reader() + for _, sym := range skip { + s, _ := r.ToText(sym) + if _, ok := syms.unusedTerminals[s]; !ok { + prod := syms.usedTerminals[s] + b.errs = append(b.errs, &verr.SpecError{ + Cause: semErrTermCannotBeSkipped, + Detail: s, + Row: prod.Pos.Row, + Col: prod.Pos.Col, + }) + continue + } - delete(syms.unusedTerminals, s) + delete(syms.unusedTerminals, s) + } } for sym, prod := range syms.unusedProductions { @@ -209,16 +234,14 @@ func (b *GrammarBuilder) Build() (*Grammar, error) { return nil, b.errs } - lexSpec.lexSpec.Name = specName - return &Grammar{ name: specName, - lexSpec: lexSpec.lexSpec, - skipLexKinds: lexSpec.skip, + lexSpec: lexSpec, + skipSymbols: skip, productionSet: prodsAndActs.prods, augmentedStartSymbol: prodsAndActs.augStartSym, errorSymbol: ss.errSym, - symbolTable: symTab.reader(), + symbolTable: symTab.Reader(), astActions: prodsAndActs.astActs, recoverProductions: prodsAndActs.recoverProds, precAndAssoc: pa, @@ -226,14 +249,14 @@ func (b *GrammarBuilder) Build() (*Grammar, error) { } type usedAndUnusedSymbols struct { - unusedProductions map[string]*spec.ProductionNode - unusedTerminals map[string]*spec.ProductionNode - usedTerminals map[string]*spec.ProductionNode + unusedProductions map[string]*parser.ProductionNode + unusedTerminals map[string]*parser.ProductionNode + usedTerminals map[string]*parser.ProductionNode } -func findUsedAndUnusedSymbols(root *spec.RootNode) *usedAndUnusedSymbols { - prods := map[string]*spec.ProductionNode{} - lexProds := map[string]*spec.ProductionNode{} +func findUsedAndUnusedSymbols(root *parser.RootNode) *usedAndUnusedSymbols { + prods := map[string]*parser.ProductionNode{} + lexProds := 
map[string]*parser.ProductionNode{} mark := map[string]bool{} { for _, p := range root.Productions { @@ -262,9 +285,9 @@ func findUsedAndUnusedSymbols(root *spec.RootNode) *usedAndUnusedSymbols { delete(mark, reservedSymbolNameError) } - usedTerms := make(map[string]*spec.ProductionNode, len(lexProds)) - unusedProds := map[string]*spec.ProductionNode{} - unusedTerms := map[string]*spec.ProductionNode{} + usedTerms := make(map[string]*parser.ProductionNode, len(lexProds)) + unusedProds := map[string]*parser.ProductionNode{} + unusedTerms := map[string]*parser.ProductionNode{} for sym, used := range mark { if p, ok := prods[sym]; ok { if used { @@ -294,7 +317,7 @@ func findUsedAndUnusedSymbols(root *spec.RootNode) *usedAndUnusedSymbols { } } -func markUsedSymbols(mark map[string]bool, marked map[string]bool, prods map[string]*spec.ProductionNode, prod *spec.ProductionNode) { +func markUsedSymbols(mark map[string]bool, marked map[string]bool, prods map[string]*parser.ProductionNode, prod *parser.ProductionNode) { if marked[prod.LHS] { return } @@ -320,7 +343,7 @@ func markUsedSymbols(mark map[string]bool, marked map[string]bool, prods map[str } } -func (b *GrammarBuilder) checkSpellingInconsistenciesOfUserDefinedIDs(root *spec.RootNode) { +func (b *GrammarBuilder) checkSpellingInconsistenciesOfUserDefinedIDs(root *parser.RootNode) { var ids []string { for _, prod := range root.Productions { @@ -344,7 +367,7 @@ func (b *GrammarBuilder) checkSpellingInconsistenciesOfUserDefinedIDs(root *spec } } - duplicated := mlspec.FindSpellingInconsistencies(ids) + duplicated := lexical.FindSpellingInconsistencies(ids) if len(duplicated) == 0 { return } @@ -367,7 +390,7 @@ func (b *GrammarBuilder) checkSpellingInconsistenciesOfUserDefinedIDs(root *spec } } -func collectUserDefinedIDsFromDirective(dir *spec.DirectiveNode) []string { +func collectUserDefinedIDsFromDirective(dir *parser.DirectiveNode) []string { var ids []string for _, param := range dir.Parameters { if param.Group != 
nil { @@ -386,20 +409,20 @@ func collectUserDefinedIDsFromDirective(dir *spec.DirectiveNode) []string { } type symbols struct { - errSym symbol - augStartSym symbol - startSym symbol + errSym symbol.Symbol + augStartSym symbol.Symbol + startSym symbol.Symbol } -func (b *GrammarBuilder) genSymbolTable(root *spec.RootNode) (*symbolTable, *symbols, error) { - symTab := newSymbolTable() - w := symTab.writer() - r := symTab.reader() +func (b *GrammarBuilder) genSymbolTable(root *parser.RootNode) (*symbol.SymbolTable, *symbols, error) { + symTab := symbol.NewSymbolTable() + w := symTab.Writer() + r := symTab.Reader() // We need to register the reserved symbol before registering others. - var errSym symbol + var errSym symbol.Symbol { - sym, err := w.registerTerminalSymbol(reservedSymbolNameError) + sym, err := w.RegisterTerminalSymbol(reservedSymbolNameError) if err != nil { return nil, nil, err } @@ -407,7 +430,7 @@ func (b *GrammarBuilder) genSymbolTable(root *spec.RootNode) (*symbolTable, *sym } for _, prod := range root.LexProductions { - if sym, exist := r.toSymbol(prod.LHS); exist { + if sym, exist := r.ToSymbol(prod.LHS); exist { if sym == errSym { b.errs = append(b.errs, &verr.SpecError{ Cause: semErrErrSymIsReserved, @@ -426,7 +449,7 @@ func (b *GrammarBuilder) genSymbolTable(root *spec.RootNode) (*symbolTable, *sym continue } - _, err := w.registerTerminalSymbol(prod.LHS) + _, err := w.RegisterTerminalSymbol(prod.LHS) if err != nil { return nil, nil, err } @@ -435,7 +458,7 @@ func (b *GrammarBuilder) genSymbolTable(root *spec.RootNode) (*symbolTable, *sym startProd := root.Productions[0] augStartText := fmt.Sprintf("%s'", startProd.LHS) var err error - augStartSym, err := w.registerStartSymbol(augStartText) + augStartSym, err := w.RegisterStartSymbol(augStartText) if err != nil { return nil, nil, err } @@ -447,7 +470,7 @@ func (b *GrammarBuilder) genSymbolTable(root *spec.RootNode) (*symbolTable, *sym }) } - startSym, err := 
w.registerNonTerminalSymbol(startProd.LHS) + startSym, err := w.RegisterNonTerminalSymbol(startProd.LHS) if err != nil { return nil, nil, err } @@ -460,11 +483,11 @@ func (b *GrammarBuilder) genSymbolTable(root *spec.RootNode) (*symbolTable, *sym } for _, prod := range root.Productions { - sym, err := w.registerNonTerminalSymbol(prod.LHS) + sym, err := w.RegisterNonTerminalSymbol(prod.LHS) if err != nil { return nil, nil, err } - if sym.isTerminal() { + if sym.IsTerminal() { b.errs = append(b.errs, &verr.SpecError{ Cause: semErrDuplicateName, Detail: prod.LHS, @@ -488,25 +511,21 @@ func (b *GrammarBuilder) genSymbolTable(root *spec.RootNode) (*symbolTable, *sym }, nil } -type lexSpec struct { - lexSpec *mlspec.LexSpec - skip []mlspec.LexKindName -} - -func (b *GrammarBuilder) genLexSpec(root *spec.RootNode) (*lexSpec, error) { - entries := []*mlspec.LexEntry{} - skipKinds := []mlspec.LexKindName{} +func (b *GrammarBuilder) genLexSpecAndSkipSymbols(symTab *symbol.SymbolTableReader, root *parser.RootNode) (*lexical.LexSpec, []symbol.Symbol, error) { + entries := []*lexical.LexEntry{} + skipSyms := []symbol.Symbol{} for _, prod := range root.LexProductions { entry, skip, specErr, err := genLexEntry(prod) if err != nil { - return nil, err + return nil, nil, err } if specErr != nil { b.errs = append(b.errs, specErr) continue } if skip { - skipKinds = append(skipKinds, mlspec.LexKindName(prod.LHS)) + sym, _ := symTab.ToSymbol(prod.LHS) + skipSyms = append(skipSyms, sym) } entries = append(entries, entry) } @@ -524,35 +543,32 @@ func (b *GrammarBuilder) genLexSpec(root *spec.RootNode) (*lexSpec, error) { } checkedFragments[fragment.LHS] = struct{}{} - entries = append(entries, &mlspec.LexEntry{ + entries = append(entries, &lexical.LexEntry{ Fragment: true, - Kind: mlspec.LexKindName(fragment.LHS), - Pattern: mlspec.LexPattern(fragment.RHS), + Kind: spec.LexKindName(fragment.LHS), + Pattern: fragment.RHS, }) } - return &lexSpec{ - lexSpec: &mlspec.LexSpec{ - Entries: 
entries, - }, - skip: skipKinds, - }, nil + return &lexical.LexSpec{ + Entries: entries, + }, skipSyms, nil } -func genLexEntry(prod *spec.ProductionNode) (*mlspec.LexEntry, bool, *verr.SpecError, error) { +func genLexEntry(prod *parser.ProductionNode) (*lexical.LexEntry, bool, *verr.SpecError, error) { alt := prod.RHS[0] elem := alt.Elements[0] var pattern string if elem.Literally { - pattern = mlspec.EscapePattern(elem.Pattern) + pattern = spec.EscapePattern(elem.Pattern) } else { pattern = elem.Pattern } - var modes []mlspec.LexModeName + var modes []spec.LexModeName var skip bool - var push mlspec.LexModeName + var push spec.LexModeName var pop bool dirConsumed := map[string]struct{}{} for _, dir := range prod.Directives { @@ -585,7 +601,7 @@ func genLexEntry(prod *spec.ProductionNode) (*mlspec.LexEntry, bool, *verr.SpecE Col: param.Pos.Col, }, nil } - modes = append(modes, mlspec.LexModeName(param.ID)) + modes = append(modes, spec.LexModeName(param.ID)) } case "skip": if len(dir.Parameters) > 0 { @@ -606,7 +622,7 @@ func genLexEntry(prod *spec.ProductionNode) (*mlspec.LexEntry, bool, *verr.SpecE Col: dir.Pos.Col, }, nil } - push = mlspec.LexModeName(dir.Parameters[0].ID) + push = spec.LexModeName(dir.Parameters[0].ID) case "pop": if len(dir.Parameters) > 0 { return nil, false, &verr.SpecError{ @@ -636,10 +652,10 @@ func genLexEntry(prod *spec.ProductionNode) (*mlspec.LexEntry, bool, *verr.SpecE }, nil } - return &mlspec.LexEntry{ + return &lexical.LexEntry{ Modes: modes, - Kind: mlspec.LexKindName(prod.LHS), - Pattern: mlspec.LexPattern(pattern), + Kind: spec.LexKindName(prod.LHS), + Pattern: pattern, Push: push, Pop: pop, }, skip, nil, nil @@ -647,15 +663,15 @@ func genLexEntry(prod *spec.ProductionNode) (*mlspec.LexEntry, bool, *verr.SpecE type productionsAndActions struct { prods *productionSet - augStartSym symbol + augStartSym symbol.Symbol astActs map[productionID][]*astActionEntry - prodPrecsTerm map[productionID]symbol + prodPrecsTerm 
map[productionID]symbol.Symbol prodPrecsOrdSym map[productionID]string - prodPrecPoss map[productionID]*spec.Position + prodPrecPoss map[productionID]*parser.Position recoverProds map[productionID]struct{} } -func (b *GrammarBuilder) genProductionsAndActions(root *spec.RootNode, symTab *symbolTableReader, errSym symbol, augStartSym symbol, startSym symbol) (*productionsAndActions, error) { +func (b *GrammarBuilder) genProductionsAndActions(root *parser.RootNode, symTab *symbol.SymbolTableReader, errSym symbol.Symbol, augStartSym symbol.Symbol, startSym symbol.Symbol) (*productionsAndActions, error) { if len(root.Productions) == 0 { b.errs = append(b.errs, &verr.SpecError{ Cause: semErrNoProduction, @@ -665,12 +681,12 @@ func (b *GrammarBuilder) genProductionsAndActions(root *spec.RootNode, symTab *s prods := newProductionSet() astActs := map[productionID][]*astActionEntry{} - prodPrecsTerm := map[productionID]symbol{} + prodPrecsTerm := map[productionID]symbol.Symbol{} prodPrecsOrdSym := map[productionID]string{} - prodPrecPoss := map[productionID]*spec.Position{} + prodPrecPoss := map[productionID]*parser.Position{} recoverProds := map[productionID]struct{}{} - p, err := newProduction(augStartSym, []symbol{ + p, err := newProduction(augStartSym, []symbol.Symbol{ startSym, }) if err != nil { @@ -680,7 +696,7 @@ func (b *GrammarBuilder) genProductionsAndActions(root *spec.RootNode, symTab *s prods.append(p) for _, prod := range root.Productions { - lhsSym, ok := symTab.toSymbol(prod.LHS) + lhsSym, ok := symTab.ToSymbol(prod.LHS) if !ok { // All symbols are assumed to be pre-detected, so it's a bug if we cannot find them here. 
return nil, fmt.Errorf("symbol '%v' is undefined", prod.LHS) @@ -698,11 +714,11 @@ func (b *GrammarBuilder) genProductionsAndActions(root *spec.RootNode, symTab *s LOOP_RHS: for _, alt := range prod.RHS { - altSyms := make([]symbol, len(alt.Elements)) + altSyms := make([]symbol.Symbol, len(alt.Elements)) offsets := map[string]int{} ambiguousIDOffsets := map[string]struct{}{} for i, elem := range alt.Elements { - sym, ok := symTab.toSymbol(elem.ID) + sym, ok := symTab.ToSymbol(elem.ID) if !ok { b.errs = append(b.errs, &verr.SpecError{ Cause: semErrUndefinedSym, @@ -724,7 +740,7 @@ func (b *GrammarBuilder) genProductionsAndActions(root *spec.RootNode, symTab *s }) continue LOOP_RHS } - if _, found := symTab.toSymbol(elem.Label.Name); found { + if _, found := symTab.ToSymbol(elem.Label.Name); found { b.errs = append(b.errs, &verr.SpecError{ Cause: semErrInvalidLabel, Detail: elem.Label.Name, @@ -877,12 +893,12 @@ func (b *GrammarBuilder) genProductionsAndActions(root *spec.RootNode, symTab *s }) continue LOOP_RHS } - elemSym, ok := symTab.toSymbol(elem.ID) + elemSym, ok := symTab.ToSymbol(elem.ID) if !ok { // If the symbol was not found, it's a bug. 
return nil, fmt.Errorf("a symbol corresponding to an ID (%v) was not found", elem.ID) } - if elemSym.isTerminal() { + if elemSym.IsTerminal() { b.errs = append(b.errs, &verr.SpecError{ Cause: semErrDirInvalidParam, Detail: fmt.Sprintf("the expansion symbol cannot be applied to a terminal symbol (%v: %v)", param.ID, elem.ID), @@ -912,7 +928,7 @@ func (b *GrammarBuilder) genProductionsAndActions(root *spec.RootNode, symTab *s param := dir.Parameters[0] switch { case param.ID != "": - sym, ok := symTab.toSymbol(param.ID) + sym, ok := symTab.ToSymbol(param.ID) if !ok { b.errs = append(b.errs, &verr.SpecError{ Cause: semErrDirInvalidParam, @@ -930,7 +946,7 @@ func (b *GrammarBuilder) genProductionsAndActions(root *spec.RootNode, symTab *s Col: param.Pos.Col, }) } - if !sym.isTerminal() { + if !sym.IsTerminal() { b.errs = append(b.errs, &verr.SpecError{ Cause: semErrDirInvalidParam, Detail: fmt.Sprintf("the symbol must be a terminal: %v", param.ID), @@ -980,12 +996,12 @@ func (b *GrammarBuilder) genProductionsAndActions(root *spec.RootNode, symTab *s }, nil } -func (b *GrammarBuilder) genPrecAndAssoc(symTab *symbolTableReader, errSym symbol, prodsAndActs *productionsAndActions) (*precAndAssoc, error) { - termPrec := map[symbolNum]int{} - termAssoc := map[symbolNum]assocType{} +func (b *GrammarBuilder) genPrecAndAssoc(symTab *symbol.SymbolTableReader, errSym symbol.Symbol, prodsAndActs *productionsAndActions) (*precAndAssoc, error) { + termPrec := map[symbol.SymbolNum]int{} + termAssoc := map[symbol.SymbolNum]assocType{} ordSymPrec := map[string]int{} { - var precGroup []*spec.DirectiveNode + var precGroup []*parser.DirectiveNode for _, dir := range b.AST.Directives { if dir.Name == "prec" { if dir.Parameters == nil || len(dir.Parameters) != 1 || dir.Parameters[0].Group == nil { @@ -1045,7 +1061,7 @@ func (b *GrammarBuilder) genPrecAndAssoc(symTab *symbolTableReader, errSym symbo for _, p := range dir.Parameters { switch { case p.ID != "": - sym, ok := 
symTab.toSymbol(p.ID) + sym, ok := symTab.ToSymbol(p.ID) if !ok { b.errs = append(b.errs, &verr.SpecError{ Cause: semErrDirInvalidParam, @@ -1064,7 +1080,7 @@ func (b *GrammarBuilder) genPrecAndAssoc(symTab *symbolTableReader, errSym symbo }) return nil, nil } - if !sym.isTerminal() { + if !sym.IsTerminal() { b.errs = append(b.errs, &verr.SpecError{ Cause: semErrDirInvalidParam, Detail: fmt.Sprintf("associativity can take only terminal symbol ('%v' is a non-terminal)", p.ID), @@ -1073,7 +1089,7 @@ func (b *GrammarBuilder) genPrecAndAssoc(symTab *symbolTableReader, errSym symbo }) return nil, nil } - if prec, alreadySet := termPrec[sym.num()]; alreadySet { + if prec, alreadySet := termPrec[sym.Num()]; alreadySet { if prec == precN { b.errs = append(b.errs, &verr.SpecError{ Cause: semErrDuplicateAssoc, @@ -1081,7 +1097,7 @@ func (b *GrammarBuilder) genPrecAndAssoc(symTab *symbolTableReader, errSym symbo Row: p.Pos.Row, Col: p.Pos.Col, }) - } else if assoc := termAssoc[sym.num()]; assoc == assocTy { + } else if assoc := termAssoc[sym.Num()]; assoc == assocTy { b.errs = append(b.errs, &verr.SpecError{ Cause: semErrDuplicateAssoc, Detail: fmt.Sprintf("'%v' already has different precedence", p.ID), @@ -1099,8 +1115,8 @@ func (b *GrammarBuilder) genPrecAndAssoc(symTab *symbolTableReader, errSym symbo break ASSOC_PARAM_LOOP } - termPrec[sym.num()] = precN - termAssoc[sym.num()] = assocTy + termPrec[sym.Num()] = precN + termAssoc[sym.Num()] = assocTy case p.OrderedSymbol != "": if prec, alreadySet := ordSymPrec[p.OrderedSymbol]; alreadySet { if prec == precN { @@ -1145,11 +1161,11 @@ func (b *GrammarBuilder) genPrecAndAssoc(symTab *symbolTableReader, errSym symbo for _, prod := range prodsAndActs.prods.getAllProductions() { // A #prec directive changes only precedence, not associativity. 
if term, ok := prodsAndActs.prodPrecsTerm[prod.id]; ok { - if prec, ok := termPrec[term.num()]; ok { + if prec, ok := termPrec[term.Num()]; ok { prodPrec[prod.num] = prec prodAssoc[prod.num] = assocTypeNil } else { - text, _ := symTab.toText(term) + text, _ := symTab.ToText(term) b.errs = append(b.errs, &verr.SpecError{ Cause: semErrUndefinedPrec, Detail: text, @@ -1171,16 +1187,16 @@ func (b *GrammarBuilder) genPrecAndAssoc(symTab *symbolTableReader, errSym symbo } } else { // A production inherits precedence and associativity from the right-most terminal symbol. - mostrightTerm := symbolNil + mostrightTerm := symbol.SymbolNil for _, sym := range prod.rhs { - if !sym.isTerminal() { + if !sym.IsTerminal() { continue } mostrightTerm = sym } - if !mostrightTerm.isNil() { - prodPrec[prod.num] = termPrec[mostrightTerm.num()] - prodAssoc[prod.num] = termAssoc[mostrightTerm.num()] + if !mostrightTerm.IsNil() { + prodPrec[prod.num] = termPrec[mostrightTerm.Num()] + prodAssoc[prod.num] = termAssoc[mostrightTerm.Num()] } } } @@ -1196,25 +1212,13 @@ func (b *GrammarBuilder) genPrecAndAssoc(symTab *symbolTableReader, errSym symbo }, nil } -type compileConfig struct { - isReportingEnabled bool -} - -type CompileOption func(config *compileConfig) - -func EnableReporting() CompileOption { - return func(config *compileConfig) { - config.isReportingEnabled = true - } -} - -func Compile(gram *Grammar, opts ...CompileOption) (*spec.CompiledGrammar, *spec.Report, error) { - config := &compileConfig{} +func compile(gram *Grammar, opts ...BuildOption) (*spec.CompiledGrammar, *spec.Report, error) { + config := &buildConfig{} for _, opt := range opts { opt(config) } - lexSpec, err, cErrs := mlcompiler.Compile(gram.lexSpec, mlcompiler.CompressionLevel(mlcompiler.CompressionLevelMax)) + lexSpec, err, cErrs := lexical.Compile(gram.lexSpec, lexical.CompressionLevelMax) if err != nil { if len(cErrs) > 0 { var b strings.Builder @@ -1230,35 +1234,44 @@ func Compile(gram *Grammar, opts 
...CompileOption) (*spec.CompiledGrammar, *spec kind2Term := make([]int, len(lexSpec.KindNames)) for i, k := range lexSpec.KindNames { - if k == mlspec.LexKindNameNil { - kind2Term[mlspec.LexKindIDNil] = symbolNil.num().Int() + if k == spec.LexKindNameNil { + kind2Term[spec.LexKindIDNil] = symbol.SymbolNil.Num().Int() continue } - sym, ok := gram.symbolTable.toSymbol(k.String()) + sym, ok := gram.symbolTable.ToSymbol(k.String()) if !ok { return nil, nil, fmt.Errorf("terminal symbol '%v' was not found in a symbol table", k) } - kind2Term[i] = sym.num().Int() + kind2Term[i] = sym.Num().Int() } - termTexts, err := gram.symbolTable.terminalTexts() + termTexts, err := gram.symbolTable.TerminalTexts() if err != nil { return nil, nil, err } - termSkip := make([]int, len(termTexts)) - for i, k := range lexSpec.KindNames { - for _, sk := range gram.skipLexKinds { - if k != sk { - continue + var termSkip []int + { + r := gram.symbolTable.Reader() + // I want to use gram.symbolTable.terminalSymbols() here instead of gram.symbolTable.terminalTexts(), + // but gram.symbolTable.terminalSymbols() is different in length from terminalTexts + // because it does not contain a predefined symbol, like EOF. + // Therefore, we use terminalTexts, although it takes more time to lookup for symbols. 
+ termSkip = make([]int, len(termTexts)) + for _, t := range termTexts { + s, _ := r.ToSymbol(t) + for _, sk := range gram.skipSymbols { + if s != sk { + continue + } + termSkip[s.Num()] = 1 + break } - termSkip[kind2Term[i]] = 1 - break } } - nonTerms, err := gram.symbolTable.nonTerminalTexts() + nonTerms, err := gram.symbolTable.NonTerminalTexts() if err != nil { return nil, nil, err } @@ -1316,7 +1329,7 @@ func Compile(gram *Grammar, opts ...CompileOption) (*spec.CompiledGrammar, *spec recoverProds := make([]int, len(gram.productionSet.getAllProductions())+1) astActEnties := make([][]int, len(gram.productionSet.getAllProductions())+1) for _, p := range gram.productionSet.getAllProductions() { - lhsSyms[p.num] = p.lhs.num().Int() + lhsSyms[p.num] = p.lhs.Num().Int() altSymCounts[p.num] = p.rhsLen if _, ok := gram.recoverProductions[p.id]; ok { @@ -1339,15 +1352,9 @@ func Compile(gram *Grammar, opts ...CompileOption) (*spec.CompiledGrammar, *spec } return &spec.CompiledGrammar{ - Name: gram.name, - LexicalSpecification: &spec.LexicalSpecification{ - Lexer: "maleeni", - Maleeni: &spec.Maleeni{ - Spec: lexSpec, - KindToTerminal: kind2Term, - }, - }, - ParsingTable: &spec.ParsingTable{ + Name: gram.name, + Lexical: lexSpec, + Syntactic: &spec.SyntacticSpec{ Action: action, GoTo: goTo, StateCount: tab.stateCount, @@ -1358,10 +1365,11 @@ func Compile(gram *Grammar, opts ...CompileOption) (*spec.CompiledGrammar, *spec Terminals: termTexts, TerminalCount: tab.terminalCount, TerminalSkip: termSkip, + KindToTerminal: kind2Term, NonTerminals: nonTerms, NonTerminalCount: tab.nonTerminalCount, - EOFSymbol: symbolEOF.num().Int(), - ErrorSymbol: gram.errorSymbol.num().Int(), + EOFSymbol: symbol.SymbolEOF.Num().Int(), + ErrorSymbol: gram.errorSymbol.Num().Int(), ErrorTrapperStates: tab.errorTrapperStates, RecoverProductions: recoverProds, }, @@ -1371,7 +1379,7 @@ func Compile(gram *Grammar, opts ...CompileOption) (*spec.CompiledGrammar, *spec }, report, nil } -func 
writeCompileError(w io.Writer, cErr *mlcompiler.CompileError) { +func writeCompileError(w io.Writer, cErr *lexical.CompileError) { if cErr.Fragment { fmt.Fprintf(w, "fragment ") } diff --git a/grammar/grammar_test.go b/grammar/grammar_test.go index f6cb681..e3cf668 100644 --- a/grammar/grammar_test.go +++ b/grammar/grammar_test.go @@ -5,7 +5,7 @@ import ( "testing" verr "github.com/nihei9/vartan/error" - spec "github.com/nihei9/vartan/spec/grammar" + "github.com/nihei9/vartan/spec/grammar/parser" ) func TestGrammarBuilderOK(t *testing.T) { @@ -243,9 +243,9 @@ baz var fooPrec int var fooAssoc assocType { - s, _ := g.symbolTable.toSymbol("foo") - fooPrec = g.precAndAssoc.terminalPrecedence(s.num()) - fooAssoc = g.precAndAssoc.terminalAssociativity(s.num()) + s, _ := g.symbolTable.ToSymbol("foo") + fooPrec = g.precAndAssoc.terminalPrecedence(s.Num()) + fooAssoc = g.precAndAssoc.terminalAssociativity(s.Num()) } if fooPrec != 1 || fooAssoc != assocTypeLeft { t.Fatalf("unexpected terminal precedence and associativity: want: (prec: %v, assoc: %v), got: (prec: %v, assoc: %v)", 1, assocTypeLeft, fooPrec, fooAssoc) @@ -253,9 +253,9 @@ baz var barPrec int var barAssoc assocType { - s, _ := g.symbolTable.toSymbol("bar") - barPrec = g.precAndAssoc.terminalPrecedence(s.num()) - barAssoc = g.precAndAssoc.terminalAssociativity(s.num()) + s, _ := g.symbolTable.ToSymbol("bar") + barPrec = g.precAndAssoc.terminalPrecedence(s.Num()) + barAssoc = g.precAndAssoc.terminalAssociativity(s.Num()) } if barPrec != 1 || barAssoc != assocTypeLeft { t.Fatalf("unexpected terminal precedence and associativity: want: (prec: %v, assoc: %v), got: (prec: %v, assoc: %v)", 1, assocTypeLeft, barPrec, barAssoc) @@ -263,9 +263,9 @@ baz var bazPrec int var bazAssoc assocType { - s, _ := g.symbolTable.toSymbol("baz") - bazPrec = g.precAndAssoc.terminalPrecedence(s.num()) - bazAssoc = g.precAndAssoc.terminalAssociativity(s.num()) + s, _ := g.symbolTable.ToSymbol("baz") + bazPrec = 
g.precAndAssoc.terminalPrecedence(s.Num()) + bazAssoc = g.precAndAssoc.terminalAssociativity(s.Num()) } if bazPrec != precNil || bazAssoc != assocTypeNil { t.Fatalf("unexpected terminal precedence and associativity: want: (prec: %v, assoc: %v), got: (prec: %v, assoc: %v)", precNil, assocTypeNil, bazPrec, bazAssoc) @@ -296,9 +296,9 @@ baz var fooPrec int var fooAssoc assocType { - s, _ := g.symbolTable.toSymbol("foo") - fooPrec = g.precAndAssoc.terminalPrecedence(s.num()) - fooAssoc = g.precAndAssoc.terminalAssociativity(s.num()) + s, _ := g.symbolTable.ToSymbol("foo") + fooPrec = g.precAndAssoc.terminalPrecedence(s.Num()) + fooAssoc = g.precAndAssoc.terminalAssociativity(s.Num()) } if fooPrec != 1 || fooAssoc != assocTypeRight { t.Fatalf("unexpected terminal precedence and associativity: want: (prec: %v, assoc: %v), got: (prec: %v, assoc: %v)", 1, assocTypeRight, fooPrec, fooAssoc) @@ -306,9 +306,9 @@ baz var barPrec int var barAssoc assocType { - s, _ := g.symbolTable.toSymbol("bar") - barPrec = g.precAndAssoc.terminalPrecedence(s.num()) - barAssoc = g.precAndAssoc.terminalAssociativity(s.num()) + s, _ := g.symbolTable.ToSymbol("bar") + barPrec = g.precAndAssoc.terminalPrecedence(s.Num()) + barAssoc = g.precAndAssoc.terminalAssociativity(s.Num()) } if barPrec != 1 || barAssoc != assocTypeRight { t.Fatalf("unexpected terminal precedence and associativity: want: (prec: %v, assoc: %v), got: (prec: %v, assoc: %v)", 1, assocTypeRight, barPrec, barAssoc) @@ -316,9 +316,9 @@ baz var bazPrec int var bazAssoc assocType { - s, _ := g.symbolTable.toSymbol("baz") - bazPrec = g.precAndAssoc.terminalPrecedence(s.num()) - bazAssoc = g.precAndAssoc.terminalAssociativity(s.num()) + s, _ := g.symbolTable.ToSymbol("baz") + bazPrec = g.precAndAssoc.terminalPrecedence(s.Num()) + bazAssoc = g.precAndAssoc.terminalAssociativity(s.Num()) } if bazPrec != precNil || bazAssoc != assocTypeNil { t.Fatalf("unexpected terminal precedence and associativity: want: (prec: %v, assoc: %v), got: 
(prec: %v, assoc: %v)", precNil, assocTypeNil, bazPrec, bazAssoc) @@ -349,9 +349,9 @@ baz var fooPrec int var fooAssoc assocType { - s, _ := g.symbolTable.toSymbol("foo") - fooPrec = g.precAndAssoc.terminalPrecedence(s.num()) - fooAssoc = g.precAndAssoc.terminalAssociativity(s.num()) + s, _ := g.symbolTable.ToSymbol("foo") + fooPrec = g.precAndAssoc.terminalPrecedence(s.Num()) + fooAssoc = g.precAndAssoc.terminalAssociativity(s.Num()) } if fooPrec != 1 || fooAssoc != assocTypeNil { t.Fatalf("unexpected terminal precedence and associativity: want: (prec: %v, assoc: %v), got: (prec: %v, assoc: %v)", 1, assocTypeNil, fooPrec, fooAssoc) @@ -359,9 +359,9 @@ baz var barPrec int var barAssoc assocType { - s, _ := g.symbolTable.toSymbol("bar") - barPrec = g.precAndAssoc.terminalPrecedence(s.num()) - barAssoc = g.precAndAssoc.terminalAssociativity(s.num()) + s, _ := g.symbolTable.ToSymbol("bar") + barPrec = g.precAndAssoc.terminalPrecedence(s.Num()) + barAssoc = g.precAndAssoc.terminalAssociativity(s.Num()) } if barPrec != 1 || barAssoc != assocTypeNil { t.Fatalf("unexpected terminal precedence and associativity: want: (prec: %v, assoc: %v), got: (prec: %v, assoc: %v)", 1, assocTypeNil, barPrec, barAssoc) @@ -369,9 +369,9 @@ baz var bazPrec int var bazAssoc assocType { - s, _ := g.symbolTable.toSymbol("baz") - bazPrec = g.precAndAssoc.terminalPrecedence(s.num()) - bazAssoc = g.precAndAssoc.terminalAssociativity(s.num()) + s, _ := g.symbolTable.ToSymbol("baz") + bazPrec = g.precAndAssoc.terminalPrecedence(s.Num()) + bazAssoc = g.precAndAssoc.terminalAssociativity(s.Num()) } if bazPrec != precNil || bazAssoc != assocTypeNil { t.Fatalf("unexpected terminal precedence and associativity: want: (prec: %v, assoc: %v), got: (prec: %v, assoc: %v)", precNil, assocTypeNil, bazPrec, bazAssoc) @@ -400,14 +400,14 @@ bar var barPrec int var barAssoc assocType { - s, _ := g.symbolTable.toSymbol("bar") - barPrec = g.precAndAssoc.terminalPrecedence(s.num()) - barAssoc = 
g.precAndAssoc.terminalAssociativity(s.num()) + s, _ := g.symbolTable.ToSymbol("bar") + barPrec = g.precAndAssoc.terminalPrecedence(s.Num()) + barAssoc = g.precAndAssoc.terminalAssociativity(s.Num()) } var sPrec int var sAssoc assocType { - s, _ := g.symbolTable.toSymbol("s") + s, _ := g.symbolTable.ToSymbol("s") ps, _ := g.productionSet.findByLHS(s) sPrec = g.precAndAssoc.productionPredence(ps[0].num) sAssoc = g.precAndAssoc.productionAssociativity(ps[0].num) @@ -443,14 +443,14 @@ bar var barPrec int var barAssoc assocType { - s, _ := g.symbolTable.toSymbol("bar") - barPrec = g.precAndAssoc.terminalPrecedence(s.num()) - barAssoc = g.precAndAssoc.terminalAssociativity(s.num()) + s, _ := g.symbolTable.ToSymbol("bar") + barPrec = g.precAndAssoc.terminalPrecedence(s.Num()) + barAssoc = g.precAndAssoc.terminalAssociativity(s.Num()) } var sPrec int var sAssoc assocType { - s, _ := g.symbolTable.toSymbol("s") + s, _ := g.symbolTable.ToSymbol("s") ps, _ := g.productionSet.findByLHS(s) sPrec = g.precAndAssoc.productionPredence(ps[0].num) sAssoc = g.precAndAssoc.productionAssociativity(ps[0].num) @@ -489,21 +489,21 @@ bar var fooPrec int var fooAssoc assocType { - s, _ := g.symbolTable.toSymbol("foo") - fooPrec = g.precAndAssoc.terminalPrecedence(s.num()) - fooAssoc = g.precAndAssoc.terminalAssociativity(s.num()) + s, _ := g.symbolTable.ToSymbol("foo") + fooPrec = g.precAndAssoc.terminalPrecedence(s.Num()) + fooAssoc = g.precAndAssoc.terminalAssociativity(s.Num()) } var barPrec int var barAssoc assocType { - s, _ := g.symbolTable.toSymbol("bar") - barPrec = g.precAndAssoc.terminalPrecedence(s.num()) - barAssoc = g.precAndAssoc.terminalAssociativity(s.num()) + s, _ := g.symbolTable.ToSymbol("bar") + barPrec = g.precAndAssoc.terminalPrecedence(s.Num()) + barAssoc = g.precAndAssoc.terminalAssociativity(s.Num()) } var aPrec int var aAssoc assocType { - s, _ := g.symbolTable.toSymbol("a") + s, _ := g.symbolTable.ToSymbol("a") ps, _ := g.productionSet.findByLHS(s) aPrec = 
g.precAndAssoc.productionPredence(ps[0].num) aAssoc = g.precAndAssoc.productionAssociativity(ps[0].num) @@ -511,7 +511,7 @@ bar var sPrec int var sAssoc assocType { - s, _ := g.symbolTable.toSymbol("s") + s, _ := g.symbolTable.ToSymbol("s") ps, _ := g.productionSet.findByLHS(s) sPrec = g.precAndAssoc.productionPredence(ps[0].num) sAssoc = g.precAndAssoc.productionAssociativity(ps[0].num) @@ -567,7 +567,7 @@ bra var alt4Prec int var alt4Assoc assocType { - s, _ := g.symbolTable.toSymbol("s") + s, _ := g.symbolTable.ToSymbol("s") ps, _ := g.productionSet.findByLHS(s) alt1Prec = g.precAndAssoc.productionPredence(ps[0].num) alt1Assoc = g.precAndAssoc.productionAssociativity(ps[0].num) @@ -615,14 +615,14 @@ foo var fooPrec int var fooAssoc assocType { - s, _ := g.symbolTable.toSymbol("foo") - fooPrec = g.precAndAssoc.terminalPrecedence(s.num()) - fooAssoc = g.precAndAssoc.terminalAssociativity(s.num()) + s, _ := g.symbolTable.ToSymbol("foo") + fooPrec = g.precAndAssoc.terminalPrecedence(s.Num()) + fooAssoc = g.precAndAssoc.terminalAssociativity(s.Num()) } var aPrec int var aAssoc assocType { - s, _ := g.symbolTable.toSymbol("a") + s, _ := g.symbolTable.ToSymbol("a") ps, _ := g.productionSet.findByLHS(s) aPrec = g.precAndAssoc.productionPredence(ps[0].num) aAssoc = g.precAndAssoc.productionAssociativity(ps[0].num) @@ -630,7 +630,7 @@ foo var sPrec int var sAssoc assocType { - s, _ := g.symbolTable.toSymbol("s") + s, _ := g.symbolTable.ToSymbol("s") ps, _ := g.productionSet.findByLHS(s) sPrec = g.precAndAssoc.productionPredence(ps[0].num) sAssoc = g.precAndAssoc.productionAssociativity(ps[0].num) @@ -668,14 +668,14 @@ bar var fooPrec int var fooAssoc assocType { - s, _ := g.symbolTable.toSymbol("foo") - fooPrec = g.precAndAssoc.terminalPrecedence(s.num()) - fooAssoc = g.precAndAssoc.terminalAssociativity(s.num()) + s, _ := g.symbolTable.ToSymbol("foo") + fooPrec = g.precAndAssoc.terminalPrecedence(s.Num()) + fooAssoc = g.precAndAssoc.terminalAssociativity(s.Num()) } var 
sPrec int var sAssoc assocType { - s, _ := g.symbolTable.toSymbol("s") + s, _ := g.symbolTable.ToSymbol("s") ps, _ := g.productionSet.findByLHS(s) sPrec = g.precAndAssoc.productionPredence(ps[0].num) sAssoc = g.precAndAssoc.productionAssociativity(ps[0].num) @@ -711,21 +711,21 @@ bar var fooPrec int var fooAssoc assocType { - s, _ := g.symbolTable.toSymbol("foo") - fooPrec = g.precAndAssoc.terminalPrecedence(s.num()) - fooAssoc = g.precAndAssoc.terminalAssociativity(s.num()) + s, _ := g.symbolTable.ToSymbol("foo") + fooPrec = g.precAndAssoc.terminalPrecedence(s.Num()) + fooAssoc = g.precAndAssoc.terminalAssociativity(s.Num()) } var barPrec int var barAssoc assocType { - s, _ := g.symbolTable.toSymbol("bar") - barPrec = g.precAndAssoc.terminalPrecedence(s.num()) - barAssoc = g.precAndAssoc.terminalAssociativity(s.num()) + s, _ := g.symbolTable.ToSymbol("bar") + barPrec = g.precAndAssoc.terminalPrecedence(s.Num()) + barAssoc = g.precAndAssoc.terminalAssociativity(s.Num()) } var sPrec int var sAssoc assocType { - s, _ := g.symbolTable.toSymbol("s") + s, _ := g.symbolTable.ToSymbol("s") ps, _ := g.productionSet.findByLHS(s) sPrec = g.precAndAssoc.productionPredence(ps[0].num) sAssoc = g.precAndAssoc.productionAssociativity(ps[0].num) @@ -766,9 +766,9 @@ bar var fooPrec int var fooAssoc assocType { - s, _ := g.symbolTable.toSymbol("foo") - fooPrec = g.precAndAssoc.terminalPrecedence(s.num()) - fooAssoc = g.precAndAssoc.terminalAssociativity(s.num()) + s, _ := g.symbolTable.ToSymbol("foo") + fooPrec = g.precAndAssoc.terminalPrecedence(s.Num()) + fooAssoc = g.precAndAssoc.terminalAssociativity(s.Num()) } if fooPrec != 2 || fooAssoc != assocTypeRight { t.Fatalf("unexpected terminal precedence and associativity: want: (prec: %v, assoc: %v), got: (prec: %v, assoc: %v)", 2, assocTypeRight, fooPrec, fooAssoc) @@ -776,9 +776,9 @@ bar var barPrec int var barAssoc assocType { - s, _ := g.symbolTable.toSymbol("bar") - barPrec = g.precAndAssoc.terminalPrecedence(s.num()) - barAssoc 
= g.precAndAssoc.terminalAssociativity(s.num()) + s, _ := g.symbolTable.ToSymbol("bar") + barPrec = g.precAndAssoc.terminalPrecedence(s.Num()) + barAssoc = g.precAndAssoc.terminalAssociativity(s.Num()) } if barPrec != 2 || barAssoc != assocTypeRight { t.Fatalf("unexpected terminal precedence and associativity: want: (prec: %v, assoc: %v), got: (prec: %v, assoc: %v)", 2, assocTypeRight, barPrec, barAssoc) @@ -788,7 +788,7 @@ bar var alt2Prec int var alt2Assoc assocType { - s, _ := g.symbolTable.toSymbol("s") + s, _ := g.symbolTable.ToSymbol("s") ps, _ := g.productionSet.findByLHS(s) alt1Prec = g.precAndAssoc.productionPredence(ps[0].num) alt1Assoc = g.precAndAssoc.productionAssociativity(ps[0].num) @@ -828,9 +828,9 @@ bar var fooPrec int var fooAssoc assocType { - s, _ := g.symbolTable.toSymbol("foo") - fooPrec = g.precAndAssoc.terminalPrecedence(s.num()) - fooAssoc = g.precAndAssoc.terminalAssociativity(s.num()) + s, _ := g.symbolTable.ToSymbol("foo") + fooPrec = g.precAndAssoc.terminalPrecedence(s.Num()) + fooAssoc = g.precAndAssoc.terminalAssociativity(s.Num()) } if fooPrec != 2 || fooAssoc != assocTypeLeft { t.Fatalf("unexpected terminal precedence and associativity: want: (prec: %v, assoc: %v), got: (prec: %v, assoc: %v)", 2, assocTypeLeft, fooPrec, fooAssoc) @@ -838,9 +838,9 @@ bar var barPrec int var barAssoc assocType { - s, _ := g.symbolTable.toSymbol("bar") - barPrec = g.precAndAssoc.terminalPrecedence(s.num()) - barAssoc = g.precAndAssoc.terminalAssociativity(s.num()) + s, _ := g.symbolTable.ToSymbol("bar") + barPrec = g.precAndAssoc.terminalPrecedence(s.Num()) + barAssoc = g.precAndAssoc.terminalAssociativity(s.Num()) } if barPrec != 2 || barAssoc != assocTypeLeft { t.Fatalf("unexpected terminal precedence and associativity: want: (prec: %v, assoc: %v), got: (prec: %v, assoc: %v)", 2, assocTypeLeft, barPrec, barAssoc) @@ -850,7 +850,7 @@ bar var alt2Prec int var alt2Assoc assocType { - s, _ := g.symbolTable.toSymbol("s") + s, _ := 
g.symbolTable.ToSymbol("s") ps, _ := g.productionSet.findByLHS(s) alt1Prec = g.precAndAssoc.productionPredence(ps[0].num) alt1Assoc = g.precAndAssoc.productionAssociativity(ps[0].num) @@ -891,9 +891,9 @@ bar var fooPrec int var fooAssoc assocType { - s, _ := g.symbolTable.toSymbol("foo") - fooPrec = g.precAndAssoc.terminalPrecedence(s.num()) - fooAssoc = g.precAndAssoc.terminalAssociativity(s.num()) + s, _ := g.symbolTable.ToSymbol("foo") + fooPrec = g.precAndAssoc.terminalPrecedence(s.Num()) + fooAssoc = g.precAndAssoc.terminalAssociativity(s.Num()) } if fooPrec != 2 || fooAssoc != assocTypeLeft { t.Fatalf("unexpected terminal precedence and associativity: want: (prec: %v, assoc: %v), got: (prec: %v, assoc: %v)", 2, assocTypeLeft, fooPrec, fooAssoc) @@ -901,9 +901,9 @@ bar var barPrec int var barAssoc assocType { - s, _ := g.symbolTable.toSymbol("bar") - barPrec = g.precAndAssoc.terminalPrecedence(s.num()) - barAssoc = g.precAndAssoc.terminalAssociativity(s.num()) + s, _ := g.symbolTable.ToSymbol("bar") + barPrec = g.precAndAssoc.terminalPrecedence(s.Num()) + barAssoc = g.precAndAssoc.terminalAssociativity(s.Num()) } if barPrec != 3 || barAssoc != assocTypeRight { t.Fatalf("unexpected terminal precedence and associativity: want: (prec: %v, assoc: %v), got: (prec: %v, assoc: %v)", 3, assocTypeRight, barPrec, barAssoc) @@ -913,7 +913,7 @@ bar var alt2Prec int var alt2Assoc assocType { - s, _ := g.symbolTable.toSymbol("s") + s, _ := g.symbolTable.ToSymbol("s") ps, _ := g.productionSet.findByLHS(s) alt1Prec = g.precAndAssoc.productionPredence(ps[0].num) alt1Assoc = g.precAndAssoc.productionAssociativity(ps[0].num) @@ -952,16 +952,16 @@ bar var fooPrec int var fooAssoc assocType { - s, _ := g.symbolTable.toSymbol("foo") - fooPrec = g.precAndAssoc.terminalPrecedence(s.num()) - fooAssoc = g.precAndAssoc.terminalAssociativity(s.num()) + s, _ := g.symbolTable.ToSymbol("foo") + fooPrec = g.precAndAssoc.terminalPrecedence(s.Num()) + fooAssoc = 
g.precAndAssoc.terminalAssociativity(s.Num()) } var barPrec int var barAssoc assocType { - s, _ := g.symbolTable.toSymbol("bar") - barPrec = g.precAndAssoc.terminalPrecedence(s.num()) - barAssoc = g.precAndAssoc.terminalAssociativity(s.num()) + s, _ := g.symbolTable.ToSymbol("bar") + barPrec = g.precAndAssoc.terminalPrecedence(s.Num()) + barAssoc = g.precAndAssoc.terminalAssociativity(s.Num()) } if fooPrec != 1 || fooAssoc != assocTypeLeft { t.Fatalf("unexpected terminal precedence and associativity: want: (prec: %v, assoc: %v), got: (prec: %v, assoc: %v)", 1, assocTypeLeft, fooPrec, fooAssoc) @@ -974,7 +974,7 @@ bar var alt2Prec int var alt2Assoc assocType { - s, _ := g.symbolTable.toSymbol("s") + s, _ := g.symbolTable.ToSymbol("s") ps, _ := g.productionSet.findByLHS(s) alt1Prec = g.precAndAssoc.productionPredence(ps[0].num) alt1Assoc = g.precAndAssoc.productionAssociativity(ps[0].num) @@ -1016,9 +1016,9 @@ bar var barPrec int var barAssoc assocType { - s, _ := g.symbolTable.toSymbol("bar") - barPrec = g.precAndAssoc.terminalPrecedence(s.num()) - barAssoc = g.precAndAssoc.terminalAssociativity(s.num()) + s, _ := g.symbolTable.ToSymbol("bar") + barPrec = g.precAndAssoc.terminalPrecedence(s.Num()) + barAssoc = g.precAndAssoc.terminalAssociativity(s.Num()) } if barPrec != 1 || barAssoc != assocTypeLeft { t.Fatalf("unexpected terminal precedence and associativity: want: (prec: %v, assoc: %v), got: (prec: %v, assoc: %v)", 1, assocTypeLeft, barPrec, barAssoc) @@ -1028,7 +1028,7 @@ bar var alt2Prec int var alt2Assoc assocType { - s, _ := g.symbolTable.toSymbol("s") + s, _ := g.symbolTable.ToSymbol("s") ps, _ := g.productionSet.findByLHS(s) alt1Prec = g.precAndAssoc.productionPredence(ps[0].num) alt1Assoc = g.precAndAssoc.productionAssociativity(ps[0].num) @@ -1052,7 +1052,7 @@ bar for _, test := range tests { t.Run(test.caption, func(t *testing.T) { - ast, err := spec.Parse(strings.NewReader(test.specSrc)) + ast, err := parser.Parse(strings.NewReader(test.specSrc)) if 
err != nil { t.Fatal(err) } @@ -1060,7 +1060,7 @@ bar b := GrammarBuilder{ AST: ast, } - g, err := b.Build() + g, err := b.build() if err != nil { t.Fatalf("unexpected error: %v", err) } @@ -1075,7 +1075,7 @@ func TestGrammarBuilderSpecError(t *testing.T) { type specErrTest struct { caption string specSrc string - errs []*SemanticError + errs []error } spellingInconsistenciesTests := []*specErrTest{ @@ -1094,7 +1094,7 @@ a_1 foo : 'foo'; `, - errs: []*SemanticError{semErrSpellingInconsistency}, + errs: []error{semErrSpellingInconsistency}, }, { caption: "a spelling inconsistency appears among terminal symbols", @@ -1110,7 +1110,7 @@ foo1 foo_1 : 'foo_1'; `, - errs: []*SemanticError{semErrSpellingInconsistency}, + errs: []error{semErrSpellingInconsistency}, }, { caption: "a spelling inconsistency appears among non-terminal and terminal symbols", @@ -1124,7 +1124,7 @@ a1 a_1 : 'a_1'; `, - errs: []*SemanticError{semErrSpellingInconsistency}, + errs: []error{semErrSpellingInconsistency}, }, { caption: "a spelling inconsistency appears among ordered symbols whose precedence is the same", @@ -1145,7 +1145,7 @@ foo bar : 'bar'; `, - errs: []*SemanticError{semErrSpellingInconsistency}, + errs: []error{semErrSpellingInconsistency}, }, { caption: "a spelling inconsistency appears among ordered symbols whose precedence is not the same", @@ -1167,7 +1167,7 @@ foo bar : 'bar'; `, - errs: []*SemanticError{semErrSpellingInconsistency}, + errs: []error{semErrSpellingInconsistency}, }, { caption: "a spelling inconsistency appears among labels the same alternative contains", @@ -1181,7 +1181,7 @@ s foo : 'foo'; `, - errs: []*SemanticError{semErrSpellingInconsistency}, + errs: []error{semErrSpellingInconsistency}, }, { caption: "a spelling inconsistency appears among labels the same production contains", @@ -1198,7 +1198,7 @@ foo bar : 'bar'; `, - errs: []*SemanticError{semErrSpellingInconsistency}, + errs: []error{semErrSpellingInconsistency}, }, { caption: "a spelling inconsistency 
appears among labels different productions contain", @@ -1217,7 +1217,7 @@ foo bar : 'bar'; `, - errs: []*SemanticError{semErrSpellingInconsistency}, + errs: []error{semErrSpellingInconsistency}, }, } @@ -1237,7 +1237,7 @@ b foo : "foo"; `, - errs: []*SemanticError{semErrUnusedProduction}, + errs: []error{semErrUnusedProduction}, }, { caption: "a terminal symbol `bar` is unused", @@ -1253,7 +1253,7 @@ foo bar : "bar"; `, - errs: []*SemanticError{semErrUnusedTerminal}, + errs: []error{semErrUnusedTerminal}, }, { caption: "a production `b` and terminal symbol `bar` is unused", @@ -1272,7 +1272,7 @@ foo bar : "bar"; `, - errs: []*SemanticError{ + errs: []error{ semErrUnusedProduction, semErrUnusedTerminal, }, @@ -1289,7 +1289,7 @@ s #prec foo foo : 'foo'; `, - errs: []*SemanticError{semErrInvalidProdDir}, + errs: []error{semErrInvalidProdDir}, }, { caption: "a lexical production cannot have alternative directives", @@ -1303,7 +1303,7 @@ s foo : 'foo' #skip; `, - errs: []*SemanticError{semErrInvalidAltDir}, + errs: []error{semErrInvalidAltDir}, }, { caption: "a production directive must not be duplicated", @@ -1317,7 +1317,7 @@ s foo #skip #skip : 'foo'; `, - errs: []*SemanticError{semErrDuplicateDir}, + errs: []error{semErrDuplicateDir}, }, { caption: "an alternative directive must not be duplicated", @@ -1333,7 +1333,7 @@ foo bar : 'bar'; `, - errs: []*SemanticError{semErrDuplicateDir}, + errs: []error{semErrDuplicateDir}, }, { caption: "a production must not have a duplicate alternative (non-empty alternatives)", @@ -1348,7 +1348,7 @@ s foo : "foo"; `, - errs: []*SemanticError{semErrDuplicateProduction}, + errs: []error{semErrDuplicateProduction}, }, { caption: "a production must not have a duplicate alternative (non-empty and split alternatives)", @@ -1371,7 +1371,7 @@ foo bar : "bar"; `, - errs: []*SemanticError{semErrDuplicateProduction}, + errs: []error{semErrDuplicateProduction}, }, { caption: "a production must not have a duplicate alternative (empty 
alternatives)", @@ -1390,7 +1390,7 @@ a foo : "foo"; `, - errs: []*SemanticError{semErrDuplicateProduction}, + errs: []error{semErrDuplicateProduction}, }, { caption: "a production must not have a duplicate alternative (empty and split alternatives)", @@ -1412,7 +1412,7 @@ a foo : "foo"; `, - errs: []*SemanticError{semErrDuplicateProduction}, + errs: []error{semErrDuplicateProduction}, }, { caption: "a terminal symbol and a non-terminal symbol (start symbol) are duplicates", @@ -1428,7 +1428,7 @@ foo s : "a"; `, - errs: []*SemanticError{semErrDuplicateName}, + errs: []error{semErrDuplicateName}, }, { caption: "a terminal symbol and a non-terminal symbol (not start symbol) are duplicates", @@ -1450,7 +1450,7 @@ bar a : "a"; `, - errs: []*SemanticError{semErrDuplicateName}, + errs: []error{semErrDuplicateName}, }, { caption: "an invalid top-level directive", @@ -1466,7 +1466,7 @@ s a : 'a'; `, - errs: []*SemanticError{semErrDirInvalidName}, + errs: []error{semErrDirInvalidName}, }, { caption: "a label must be unique in an alternative", @@ -1482,7 +1482,7 @@ foo bar : 'bar'; `, - errs: []*SemanticError{semErrDuplicateLabel}, + errs: []error{semErrDuplicateLabel}, }, { caption: "a label cannot be the same name as terminal symbols", @@ -1498,7 +1498,7 @@ foo bar : 'bar'; `, - errs: []*SemanticError{semErrDuplicateLabel}, + errs: []error{semErrDuplicateLabel}, }, { caption: "a label cannot be the same name as non-terminal symbols", @@ -1518,7 +1518,7 @@ foo bar : 'bar'; `, - errs: []*SemanticError{ + errs: []error{ semErrInvalidLabel, }, }, @@ -1535,7 +1535,7 @@ s foo : 'foo'; `, - errs: []*SemanticError{semErrNoGrammarName}, + errs: []error{semErrNoGrammarName}, }, { caption: "the `#name` directive needs an ID parameter", @@ -1549,7 +1549,7 @@ s foo : 'foo'; `, - errs: []*SemanticError{semErrDirInvalidParam}, + errs: []error{semErrDirInvalidParam}, }, { caption: "the `#name` directive cannot take a pattern parameter", @@ -1563,7 +1563,7 @@ s foo : 'foo'; `, - errs: 
[]*SemanticError{semErrDirInvalidParam}, + errs: []error{semErrDirInvalidParam}, }, { caption: "the `#name` directive cannot take a string parameter", @@ -1577,7 +1577,7 @@ s foo : 'foo'; `, - errs: []*SemanticError{semErrDirInvalidParam}, + errs: []error{semErrDirInvalidParam}, }, { caption: "the `#name` directive takes just one parameter", @@ -1591,7 +1591,7 @@ s foo : 'foo'; `, - errs: []*SemanticError{semErrDirInvalidParam}, + errs: []error{semErrDirInvalidParam}, }, } @@ -1610,7 +1610,7 @@ s foo : 'foo'; `, - errs: []*SemanticError{semErrDirInvalidParam}, + errs: []error{semErrDirInvalidParam}, }, { caption: "the `#prec` directive cannot take an ID parameter", @@ -1626,7 +1626,7 @@ s foo : 'foo'; `, - errs: []*SemanticError{semErrDirInvalidParam}, + errs: []error{semErrDirInvalidParam}, }, { caption: "the `#prec` directive cannot take an ordered symbol parameter", @@ -1642,7 +1642,7 @@ s foo : 'foo'; `, - errs: []*SemanticError{semErrDirInvalidParam}, + errs: []error{semErrDirInvalidParam}, }, { caption: "the `#prec` directive cannot take a pattern parameter", @@ -1658,7 +1658,7 @@ s foo : 'foo'; `, - errs: []*SemanticError{semErrDirInvalidParam}, + errs: []error{semErrDirInvalidParam}, }, { caption: "the `#prec` directive cannot take a string parameter", @@ -1674,7 +1674,7 @@ s foo : 'foo'; `, - errs: []*SemanticError{semErrDirInvalidParam}, + errs: []error{semErrDirInvalidParam}, }, { caption: "the `#prec` directive takes just one directive group parameter", @@ -1690,7 +1690,7 @@ s foo : 'foo'; `, - errs: []*SemanticError{semErrDirInvalidParam}, + errs: []error{semErrDirInvalidParam}, }, } @@ -1711,7 +1711,7 @@ s foo : 'foo'; `, - errs: []*SemanticError{semErrDirInvalidParam}, + errs: []error{semErrDirInvalidParam}, }, { caption: "the `#left` directive cannot be applied to an error symbol", @@ -1732,7 +1732,7 @@ foo semi_colon : ';'; `, - errs: []*SemanticError{semErrDirInvalidParam}, + errs: []error{semErrDirInvalidParam}, }, { caption: "the `#left` 
directive cannot take an undefined symbol", @@ -1750,7 +1750,7 @@ s foo : 'foo'; `, - errs: []*SemanticError{semErrDirInvalidParam}, + errs: []error{semErrDirInvalidParam}, }, { caption: "the `#left` directive cannot take a non-terminal symbol", @@ -1768,7 +1768,7 @@ s foo : 'foo'; `, - errs: []*SemanticError{semErrDirInvalidParam}, + errs: []error{semErrDirInvalidParam}, }, { caption: "the `#left` directive cannot take a pattern parameter", @@ -1786,7 +1786,7 @@ s foo : 'foo'; `, - errs: []*SemanticError{semErrDirInvalidParam}, + errs: []error{semErrDirInvalidParam}, }, { caption: "the `#left` directive cannot take a string parameter", @@ -1804,7 +1804,7 @@ s foo : 'foo'; `, - errs: []*SemanticError{semErrDirInvalidParam}, + errs: []error{semErrDirInvalidParam}, }, { caption: "the `#left` directive cannot take a directive parameter", @@ -1822,7 +1822,7 @@ s foo : 'foo'; `, - errs: []*SemanticError{semErrDirInvalidParam}, + errs: []error{semErrDirInvalidParam}, }, { caption: "the `#left` dirctive cannot be specified multiple times for a terminal symbol", @@ -1840,7 +1840,7 @@ s foo : 'foo'; `, - errs: []*SemanticError{semErrDuplicateAssoc}, + errs: []error{semErrDuplicateAssoc}, }, { caption: "the `#left` dirctive cannot be specified multiple times for an ordered symbol", @@ -1858,7 +1858,7 @@ s foo : 'foo'; `, - errs: []*SemanticError{semErrDuplicateAssoc}, + errs: []error{semErrDuplicateAssoc}, }, { caption: "a terminal symbol cannot have different precedence", @@ -1877,7 +1877,7 @@ s foo : 'foo'; `, - errs: []*SemanticError{semErrDuplicateAssoc}, + errs: []error{semErrDuplicateAssoc}, }, { caption: "an ordered symbol cannot have different precedence", @@ -1896,7 +1896,7 @@ s foo : 'foo'; `, - errs: []*SemanticError{semErrDuplicateAssoc}, + errs: []error{semErrDuplicateAssoc}, }, { caption: "a terminal symbol cannot have different associativity", @@ -1915,7 +1915,7 @@ s foo : 'foo'; `, - errs: []*SemanticError{semErrDuplicateAssoc}, + errs: 
[]error{semErrDuplicateAssoc}, }, { caption: "an ordered symbol cannot have different associativity", @@ -1934,7 +1934,7 @@ s foo : 'foo'; `, - errs: []*SemanticError{semErrDuplicateAssoc}, + errs: []error{semErrDuplicateAssoc}, }, } @@ -1955,7 +1955,7 @@ s foo : 'foo'; `, - errs: []*SemanticError{semErrDirInvalidParam}, + errs: []error{semErrDirInvalidParam}, }, { caption: "the `#right` directive cannot be applied to an error symbol", @@ -1976,7 +1976,7 @@ foo semi_colon : ';'; `, - errs: []*SemanticError{semErrDirInvalidParam}, + errs: []error{semErrDirInvalidParam}, }, { caption: "the `#right` directive cannot take an undefined symbol", @@ -1994,7 +1994,7 @@ s foo : 'foo'; `, - errs: []*SemanticError{semErrDirInvalidParam}, + errs: []error{semErrDirInvalidParam}, }, { caption: "the `#right` directive cannot take a non-terminal symbol", @@ -2012,7 +2012,7 @@ s foo : 'foo'; `, - errs: []*SemanticError{semErrDirInvalidParam}, + errs: []error{semErrDirInvalidParam}, }, { caption: "the `#right` directive cannot take a pattern parameter", @@ -2030,7 +2030,7 @@ s foo : 'foo'; `, - errs: []*SemanticError{semErrDirInvalidParam}, + errs: []error{semErrDirInvalidParam}, }, { caption: "the `#right` directive cannot take a string parameter", @@ -2048,7 +2048,7 @@ s foo : 'foo'; `, - errs: []*SemanticError{semErrDirInvalidParam}, + errs: []error{semErrDirInvalidParam}, }, { caption: "the `#right` directive cannot take a directive group parameter", @@ -2066,7 +2066,7 @@ s foo : 'foo'; `, - errs: []*SemanticError{semErrDirInvalidParam}, + errs: []error{semErrDirInvalidParam}, }, { caption: "the `#right` directive cannot be specified multiple times for a terminal symbol", @@ -2084,7 +2084,7 @@ s foo : 'foo'; `, - errs: []*SemanticError{semErrDuplicateAssoc}, + errs: []error{semErrDuplicateAssoc}, }, { caption: "the `#right` directive cannot be specified multiple times for an ordered symbol", @@ -2102,7 +2102,7 @@ s foo : 'foo'; `, - errs: []*SemanticError{semErrDuplicateAssoc}, 
+ errs: []error{semErrDuplicateAssoc}, }, { caption: "a terminal symbol cannot have different precedence", @@ -2121,7 +2121,7 @@ s foo : 'foo'; `, - errs: []*SemanticError{semErrDuplicateAssoc}, + errs: []error{semErrDuplicateAssoc}, }, { caption: "an ordered symbol cannot have different precedence", @@ -2140,7 +2140,7 @@ s foo : 'foo'; `, - errs: []*SemanticError{semErrDuplicateAssoc}, + errs: []error{semErrDuplicateAssoc}, }, { caption: "a terminal symbol cannot have different associativity", @@ -2159,7 +2159,7 @@ s foo : 'foo'; `, - errs: []*SemanticError{semErrDuplicateAssoc}, + errs: []error{semErrDuplicateAssoc}, }, { caption: "an ordered symbol cannot have different associativity", @@ -2178,7 +2178,7 @@ s foo : 'foo'; `, - errs: []*SemanticError{semErrDuplicateAssoc}, + errs: []error{semErrDuplicateAssoc}, }, } @@ -2199,7 +2199,7 @@ s foo : 'foo'; `, - errs: []*SemanticError{semErrDirInvalidParam}, + errs: []error{semErrDirInvalidParam}, }, { caption: "the `#assign` directive cannot be applied to an error symbol", @@ -2220,7 +2220,7 @@ foo semi_colon : ';'; `, - errs: []*SemanticError{semErrDirInvalidParam}, + errs: []error{semErrDirInvalidParam}, }, { caption: "the `#assign` directive cannot take an undefined symbol", @@ -2238,7 +2238,7 @@ s foo : 'foo'; `, - errs: []*SemanticError{semErrDirInvalidParam}, + errs: []error{semErrDirInvalidParam}, }, { caption: "the `#assign` directive cannot take a non-terminal symbol", @@ -2256,7 +2256,7 @@ s foo : 'foo'; `, - errs: []*SemanticError{semErrDirInvalidParam}, + errs: []error{semErrDirInvalidParam}, }, { caption: "the `#assign` directive cannot take a pattern parameter", @@ -2274,7 +2274,7 @@ s foo : 'foo'; `, - errs: []*SemanticError{semErrDirInvalidParam}, + errs: []error{semErrDirInvalidParam}, }, { caption: "the `#assign` directive cannot take a string parameter", @@ -2292,7 +2292,7 @@ s foo : 'foo'; `, - errs: []*SemanticError{semErrDirInvalidParam}, + errs: []error{semErrDirInvalidParam}, }, { caption: 
"the `#assign` directive cannot take a directive parameter", @@ -2310,7 +2310,7 @@ s foo : 'foo'; `, - errs: []*SemanticError{semErrDirInvalidParam}, + errs: []error{semErrDirInvalidParam}, }, { caption: "the `#assign` dirctive cannot be specified multiple times for a terminal symbol", @@ -2328,7 +2328,7 @@ s foo : 'foo'; `, - errs: []*SemanticError{semErrDuplicateAssoc}, + errs: []error{semErrDuplicateAssoc}, }, { caption: "the `#assign` dirctive cannot be specified multiple times for an ordered symbol", @@ -2346,7 +2346,7 @@ s foo : 'foo'; `, - errs: []*SemanticError{semErrDuplicateAssoc}, + errs: []error{semErrDuplicateAssoc}, }, { caption: "a terminal symbol cannot have different precedence", @@ -2365,7 +2365,7 @@ s foo : 'foo'; `, - errs: []*SemanticError{semErrDuplicateAssoc}, + errs: []error{semErrDuplicateAssoc}, }, { caption: "an ordered symbol cannot have different precedence", @@ -2384,7 +2384,7 @@ s foo : 'foo'; `, - errs: []*SemanticError{semErrDuplicateAssoc}, + errs: []error{semErrDuplicateAssoc}, }, { caption: "a terminal symbol cannot have different associativity", @@ -2403,7 +2403,7 @@ s foo : 'foo'; `, - errs: []*SemanticError{semErrDuplicateAssoc}, + errs: []error{semErrDuplicateAssoc}, }, { caption: "an ordered symbol cannot have different associativity", @@ -2422,7 +2422,7 @@ s foo : 'foo'; `, - errs: []*SemanticError{semErrDuplicateAssoc}, + errs: []error{semErrDuplicateAssoc}, }, } @@ -2441,7 +2441,7 @@ error foo: 'foo'; `, - errs: []*SemanticError{ + errs: []error{ semErrErrSymIsReserved, semErrDuplicateName, }, @@ -2457,7 +2457,7 @@ s error: 'error'; `, - errs: []*SemanticError{semErrErrSymIsReserved}, + errs: []error{semErrErrSymIsReserved}, }, { caption: "cannot use the error symbol as a terminal symbol, even if given the skip directive", @@ -2473,7 +2473,7 @@ foo error #skip : 'error'; `, - errs: []*SemanticError{semErrErrSymIsReserved}, + errs: []error{semErrErrSymIsReserved}, }, } @@ -2490,7 +2490,7 @@ s foo : "foo"; `, - errs: 
[]*SemanticError{semErrDirInvalidParam}, + errs: []error{semErrDirInvalidParam}, }, { caption: "the `#ast` directive cannot take an ordered symbol parameter", @@ -2508,7 +2508,7 @@ s foo : "foo"; `, - errs: []*SemanticError{semErrDirInvalidParam}, + errs: []error{semErrDirInvalidParam}, }, { caption: "the `#ast` directive cannot take a pattern parameter", @@ -2522,7 +2522,7 @@ s foo : "foo"; `, - errs: []*SemanticError{semErrDirInvalidParam}, + errs: []error{semErrDirInvalidParam}, }, { caption: "the `#ast` directive cannot take a string parameter", @@ -2536,7 +2536,7 @@ s foo : "foo"; `, - errs: []*SemanticError{semErrDirInvalidParam}, + errs: []error{semErrDirInvalidParam}, }, { caption: "the `#ast` directive cannot take a directive group parameter", @@ -2550,7 +2550,7 @@ s foo : "foo"; `, - errs: []*SemanticError{semErrDirInvalidParam}, + errs: []error{semErrDirInvalidParam}, }, { caption: "a parameter of the `#ast` directive must be either a symbol or a label in an alternative", @@ -2566,7 +2566,7 @@ foo bar : "bar"; `, - errs: []*SemanticError{semErrDirInvalidParam}, + errs: []error{semErrDirInvalidParam}, }, { caption: "a symbol in a different alternative cannot be a parameter of the `#ast` directive", @@ -2583,7 +2583,7 @@ foo bar : "bar"; `, - errs: []*SemanticError{semErrDirInvalidParam}, + errs: []error{semErrDirInvalidParam}, }, { caption: "a label in a different alternative cannot be a parameter of the `#ast` directive", @@ -2600,7 +2600,7 @@ foo bar : "bar"; `, - errs: []*SemanticError{semErrDirInvalidParam}, + errs: []error{semErrDirInvalidParam}, }, { caption: "a symbol can appear in the `#ast` directive only once", @@ -2614,7 +2614,7 @@ s foo : 'foo'; `, - errs: []*SemanticError{semErrDuplicateElem}, + errs: []error{semErrDuplicateElem}, }, { caption: "a label can appear in the `#ast` directive only once", @@ -2628,7 +2628,7 @@ s foo : 'foo'; `, - errs: []*SemanticError{semErrDuplicateElem}, + errs: []error{semErrDuplicateElem}, }, { caption: "a 
symbol can appear in the `#ast` directive only once, even if the symbol has a label", @@ -2642,7 +2642,7 @@ s foo : 'foo'; `, - errs: []*SemanticError{semErrDuplicateElem}, + errs: []error{semErrDuplicateElem}, }, { caption: "symbol `foo` is ambiguous because it appears in an alternative twice", @@ -2656,7 +2656,7 @@ s foo : 'foo'; `, - errs: []*SemanticError{semErrAmbiguousElem}, + errs: []error{semErrAmbiguousElem}, }, { caption: "symbol `foo` is ambiguous because it appears in an alternative twice, even if one of them has a label", @@ -2670,7 +2670,7 @@ s foo : 'foo'; `, - errs: []*SemanticError{semErrAmbiguousElem}, + errs: []error{semErrAmbiguousElem}, }, { caption: "the expansion operator cannot be applied to a terminal symbol", @@ -2684,7 +2684,7 @@ s foo : "foo"; `, - errs: []*SemanticError{semErrDirInvalidParam}, + errs: []error{semErrDirInvalidParam}, }, } @@ -2701,7 +2701,7 @@ s foo : 'foo'; `, - errs: []*SemanticError{semErrDirInvalidParam}, + errs: []error{semErrDirInvalidParam}, }, { caption: "the `#prec` directive cannot be applied to an error symbol", @@ -2715,7 +2715,7 @@ s foo : 'foo'; `, - errs: []*SemanticError{semErrDirInvalidParam}, + errs: []error{semErrDirInvalidParam}, }, { caption: "the `#prec` directive cannot take an undefined symbol", @@ -2729,7 +2729,7 @@ s foo : 'foo'; `, - errs: []*SemanticError{semErrDirInvalidParam}, + errs: []error{semErrDirInvalidParam}, }, { caption: "the `#prec` directive cannot take a non-terminal symbol", @@ -2752,7 +2752,7 @@ foo bar : 'bar'; `, - errs: []*SemanticError{semErrDirInvalidParam}, + errs: []error{semErrDirInvalidParam}, }, { caption: "the `#prec` directive cannot take an undefined ordered symbol parameter", @@ -2766,7 +2766,7 @@ s foo : 'foo'; `, - errs: []*SemanticError{semErrUndefinedOrdSym}, + errs: []error{semErrUndefinedOrdSym}, }, { caption: "the `#prec` directive cannot take a pattern parameter", @@ -2780,7 +2780,7 @@ s foo : 'foo'; `, - errs: []*SemanticError{semErrDirInvalidParam}, + 
errs: []error{semErrDirInvalidParam}, }, { caption: "the `#prec` directive cannot take a string parameter", @@ -2794,7 +2794,7 @@ s foo : 'foo'; `, - errs: []*SemanticError{semErrDirInvalidParam}, + errs: []error{semErrDirInvalidParam}, }, { caption: "the `#prec` directive cannot take a directive parameter", @@ -2808,7 +2808,7 @@ s foo : 'foo'; `, - errs: []*SemanticError{semErrDirInvalidParam}, + errs: []error{semErrDirInvalidParam}, }, { caption: "a symbol the `#prec` directive takes must be given precedence explicitly", @@ -2824,7 +2824,7 @@ foo bar : 'bar'; `, - errs: []*SemanticError{semErrUndefinedPrec}, + errs: []error{semErrUndefinedPrec}, }, } @@ -2841,7 +2841,7 @@ s foo : 'foo'; `, - errs: []*SemanticError{semErrDirInvalidParam}, + errs: []error{semErrDirInvalidParam}, }, { caption: "the `#recover` directive cannot take an ordered symbol parameter", @@ -2859,7 +2859,7 @@ s foo : 'foo'; `, - errs: []*SemanticError{semErrDirInvalidParam}, + errs: []error{semErrDirInvalidParam}, }, { caption: "the `#recover` directive cannot take a pattern parameter", @@ -2873,7 +2873,7 @@ s foo : 'foo'; `, - errs: []*SemanticError{semErrDirInvalidParam}, + errs: []error{semErrDirInvalidParam}, }, { caption: "the `#recover` directive cannot take a string parameter", @@ -2887,7 +2887,7 @@ s foo : 'foo'; `, - errs: []*SemanticError{semErrDirInvalidParam}, + errs: []error{semErrDirInvalidParam}, }, { caption: "the `#recover` directive cannot take a directive group parameter", @@ -2901,7 +2901,7 @@ s foo : 'foo'; `, - errs: []*SemanticError{semErrDirInvalidParam}, + errs: []error{semErrDirInvalidParam}, }, } @@ -2918,7 +2918,7 @@ s fragment f : 'fragment'; `, - errs: []*SemanticError{semErrUndefinedSym}, + errs: []error{semErrUndefinedSym}, }, { caption: "fragments cannot be duplicated", @@ -2936,7 +2936,7 @@ fragment f fragment f : 'fragment 2'; `, - errs: []*SemanticError{semErrDuplicateFragment}, + errs: []error{semErrDuplicateFragment}, }, } @@ -2955,7 +2955,7 @@ foo #push 
mode_1 bar #mode : 'bar'; `, - errs: []*SemanticError{semErrDirInvalidParam}, + errs: []error{semErrDirInvalidParam}, }, { caption: "the `#mode` directive cannot take an ordered symbol parameter", @@ -2975,7 +2975,7 @@ foo bar #mode $x : 'bar'; `, - errs: []*SemanticError{semErrDirInvalidParam}, + errs: []error{semErrDirInvalidParam}, }, { caption: "the `#mode` directive cannot take a pattern parameter", @@ -2991,7 +2991,7 @@ foo #push mode_1 bar #mode "mode_1" : 'bar'; `, - errs: []*SemanticError{semErrDirInvalidParam}, + errs: []error{semErrDirInvalidParam}, }, { caption: "the `#mode` directive cannot take a string parameter", @@ -3007,7 +3007,7 @@ foo #push mode_1 bar #mode 'mode_1' : 'bar'; `, - errs: []*SemanticError{semErrDirInvalidParam}, + errs: []error{semErrDirInvalidParam}, }, { caption: "the `#mode` directive cannot take a directive group parameter", @@ -3023,7 +3023,7 @@ foo #push mode_1 bar #mode () : 'bar'; `, - errs: []*SemanticError{semErrDirInvalidParam}, + errs: []error{semErrDirInvalidParam}, }, } @@ -3042,7 +3042,7 @@ foo #push bar #mode mode_1 : 'bar'; `, - errs: []*SemanticError{semErrDirInvalidParam}, + errs: []error{semErrDirInvalidParam}, }, { caption: "the `#push` directive takes just one ID parameter", @@ -3058,7 +3058,7 @@ foo #push mode_1 mode_2 bar #mode mode_1 : 'bar'; `, - errs: []*SemanticError{semErrDirInvalidParam}, + errs: []error{semErrDirInvalidParam}, }, { caption: "the `#push` directive cannot take an ordered symbol parameter", @@ -3078,7 +3078,7 @@ foo #push $x bar : 'bar'; `, - errs: []*SemanticError{semErrDirInvalidParam}, + errs: []error{semErrDirInvalidParam}, }, { caption: "the `#push` directive cannot take a pattern parameter", @@ -3094,7 +3094,7 @@ foo #push "mode_1" bar #mode mode_1 : 'bar'; `, - errs: []*SemanticError{semErrDirInvalidParam}, + errs: []error{semErrDirInvalidParam}, }, { caption: "the `#push` directive cannot take a string parameter", @@ -3110,7 +3110,7 @@ foo #push 'mode_1' bar #mode mode_1 : 'bar'; 
`, - errs: []*SemanticError{semErrDirInvalidParam}, + errs: []error{semErrDirInvalidParam}, }, { caption: "the `#push` directive cannot take a directive group parameter", @@ -3126,7 +3126,7 @@ foo #push () bar #mode mode_1 : 'bar'; `, - errs: []*SemanticError{semErrDirInvalidParam}, + errs: []error{semErrDirInvalidParam}, }, } @@ -3147,7 +3147,7 @@ bar #mode mode_1 baz #pop mode_1 : 'baz'; `, - errs: []*SemanticError{semErrDirInvalidParam}, + errs: []error{semErrDirInvalidParam}, }, { caption: "the `#pop` directive cannot take an ordered symbol parameter", @@ -3169,7 +3169,7 @@ bar #mode mode_1 baz #pop $x : 'baz'; `, - errs: []*SemanticError{semErrDirInvalidParam}, + errs: []error{semErrDirInvalidParam}, }, { caption: "the `#pop` directive cannot take a pattern parameter", @@ -3187,7 +3187,7 @@ bar #mode mode_1 baz #pop "mode_1" : 'baz'; `, - errs: []*SemanticError{semErrDirInvalidParam}, + errs: []error{semErrDirInvalidParam}, }, { caption: "the `#pop` directive cannot take a string parameter", @@ -3205,7 +3205,7 @@ bar #mode mode_1 baz #pop 'mode_1' : 'baz'; `, - errs: []*SemanticError{semErrDirInvalidParam}, + errs: []error{semErrDirInvalidParam}, }, { caption: "the `#pop` directive cannot take a directive parameter", @@ -3223,7 +3223,7 @@ bar #mode mode_1 baz #pop () : 'baz'; `, - errs: []*SemanticError{semErrDirInvalidParam}, + errs: []error{semErrDirInvalidParam}, }, } @@ -3242,7 +3242,7 @@ foo #skip bar bar : 'bar'; `, - errs: []*SemanticError{semErrDirInvalidParam}, + errs: []error{semErrDirInvalidParam}, }, { caption: "the `#skip` directive cannot take an ordered symbol parameter", @@ -3262,7 +3262,7 @@ foo #skip $x bar : 'bar'; `, - errs: []*SemanticError{semErrDirInvalidParam}, + errs: []error{semErrDirInvalidParam}, }, { caption: "the `#skip` directive cannot take a pattern parameter", @@ -3278,7 +3278,7 @@ foo #skip "bar" bar : 'bar'; `, - errs: []*SemanticError{semErrDirInvalidParam}, + errs: []error{semErrDirInvalidParam}, }, { caption: "the `#skip` 
directive cannot take a string parameter", @@ -3294,7 +3294,7 @@ foo #skip 'bar' bar : 'bar'; `, - errs: []*SemanticError{semErrDirInvalidParam}, + errs: []error{semErrDirInvalidParam}, }, { caption: "the `#skip` directive cannot take a directive group parameter", @@ -3310,7 +3310,7 @@ foo #skip () bar : 'bar'; `, - errs: []*SemanticError{semErrDirInvalidParam}, + errs: []error{semErrDirInvalidParam}, }, { caption: "a terminal symbol used in productions cannot have the skip directive", @@ -3326,7 +3326,7 @@ foo #skip bar : 'bar'; `, - errs: []*SemanticError{semErrTermCannotBeSkipped}, + errs: []error{semErrTermCannotBeSkipped}, }, } @@ -3349,7 +3349,7 @@ bar tests = append(tests, skipDirTests...) for _, test := range tests { t.Run(test.caption, func(t *testing.T) { - ast, err := spec.Parse(strings.NewReader(test.specSrc)) + ast, err := parser.Parse(strings.NewReader(test.specSrc)) if err != nil { t.Fatal(err) } @@ -3357,7 +3357,7 @@ bar b := GrammarBuilder{ AST: ast, } - _, err = b.Build() + _, err = b.build() if err == nil { t.Fatal("an expected error didn't occur") } diff --git a/grammar/item.go b/grammar/item.go index 100d920..84c4911 100644 --- a/grammar/item.go +++ b/grammar/item.go @@ -6,6 +6,8 @@ import ( "fmt" "sort" "strconv" + + "github.com/nihei9/vartan/grammar/symbol" ) type lrItemID [32]byte @@ -19,7 +21,7 @@ func (id lrItemID) num() uint32 { } type lookAhead struct { - symbols map[symbol]struct{} + symbols map[symbol.Symbol]struct{} // When propagation is true, an item propagates look-ahead symbols to other items. propagation bool @@ -38,7 +40,7 @@ type lrItem struct { // 2 | T | E → E +・T // 3 | Nil | E → E + T・ dot int - dottedSymbol symbol + dottedSymbol symbol.Symbol // When initial is true, the LHS of the production is the augmented start symbol and dot is 0. // It looks like S' →・S. 
@@ -74,13 +76,13 @@ func newLR0Item(prod *production, dot int) (*lrItem, error) { id = sha256.Sum256(b) } - dottedSymbol := symbolNil + dottedSymbol := symbol.SymbolNil if dot < prod.rhsLen { dottedSymbol = prod.rhs[dot] } initial := false - if prod.lhs.isStart() && dot == 0 { + if prod.lhs.IsStart() && dot == 0 { initial = true } @@ -176,7 +178,7 @@ func (n stateNum) next() stateNum { type lrState struct { *kernel num stateNum - next map[symbol]kernelID + next map[symbol.Symbol]kernelID reducible map[productionID]struct{} // emptyProdItems stores items that have an empty production like `p → ε` and is reducible. diff --git a/grammar/lalr1.go b/grammar/lalr1.go index f1b8149..1667d84 100644 --- a/grammar/lalr1.go +++ b/grammar/lalr1.go @@ -1,6 +1,10 @@ package grammar -import "fmt" +import ( + "fmt" + + "github.com/nihei9/vartan/grammar/symbol" +) type stateAndLRItem struct { kernelID kernelID @@ -19,8 +23,8 @@ type lalr1Automaton struct { func genLALR1Automaton(lr0 *lr0Automaton, prods *productionSet, first *firstSet) (*lalr1Automaton, error) { // Set the look-ahead symbol <EOF> to the initial item: [S' → ・S, $] iniState := lr0.states[lr0.initialState] - iniState.items[0].lookAhead.symbols = map[symbol]struct{}{ - symbolEOF: {}, + iniState.items[0].lookAhead.symbols = map[symbol.Symbol]struct{}{ + symbol.SymbolEOF: {}, } var props []*propagation @@ -55,7 +59,7 @@ func genLALR1Automaton(lr0 *lr0Automaton, prods *productionSet, first *firstSet) return nil, fmt.Errorf("reducible item not found: %v", item.id) } if reducibleItem.lookAhead.symbols == nil { - reducibleItem.lookAhead.symbols = map[symbol]struct{}{} + reducibleItem.lookAhead.symbols = map[symbol.Symbol]struct{}{} } for a := range item.lookAhead.symbols { reducibleItem.lookAhead.symbols[a] = struct{}{} @@ -104,7 +108,7 @@ func genLALR1Automaton(lr0 *lr0Automaton, prods *productionSet, first *firstSet) } if nextItem.lookAhead.symbols == nil { - nextItem.lookAhead.symbols = map[symbol]struct{}{} + 
nextItem.lookAhead.symbols = map[symbol.Symbol]struct{}{} } for a := range item.lookAhead.symbols { @@ -138,7 +142,7 @@ func genLALR1Automaton(lr0 *lr0Automaton, prods *productionSet, first *firstSet) func genLALR1Closure(srcItem *lrItem, prods *productionSet, first *firstSet) ([]*lrItem, error) { items := []*lrItem{} - knownItems := map[lrItemID]map[symbol]struct{}{} + knownItems := map[lrItemID]map[symbol.Symbol]struct{}{} knownItemsProp := map[lrItemID]struct{}{} uncheckedItems := []*lrItem{} items = append(items, srcItem) @@ -146,7 +150,7 @@ func genLALR1Closure(srcItem *lrItem, prods *productionSet, first *firstSet) ([] for len(uncheckedItems) > 0 { nextUncheckedItems := []*lrItem{} for _, item := range uncheckedItems { - if item.dottedSymbol.isTerminal() { + if item.dottedSymbol.IsTerminal() { continue } @@ -155,7 +159,7 @@ func genLALR1Closure(srcItem *lrItem, prods *productionSet, first *firstSet) ([] return nil, fmt.Errorf("production not found: %v", item.prod) } - var fstSyms []symbol + var fstSyms []symbol.Symbol var isFstNullable bool { fst, err := first.find(p, item.dot+1) @@ -163,7 +167,7 @@ func genLALR1Closure(srcItem *lrItem, prods *productionSet, first *firstSet) ([] return nil, err } - fstSyms = make([]symbol, len(fst.symbols)) + fstSyms = make([]symbol.Symbol, len(fst.symbols)) i := 0 for s := range fst.symbols { fstSyms[i] = s @@ -176,7 +180,7 @@ func genLALR1Closure(srcItem *lrItem, prods *productionSet, first *firstSet) ([] ps, _ := prods.findByLHS(item.dottedSymbol) for _, prod := range ps { - var lookAhead []symbol + var lookAhead []symbol.Symbol { var lookAheadCount int if isFstNullable { @@ -185,7 +189,7 @@ func genLALR1Closure(srcItem *lrItem, prods *productionSet, first *firstSet) ([] lookAheadCount = len(fstSyms) } - lookAhead = make([]symbol, lookAheadCount) + lookAhead = make([]symbol.Symbol, lookAheadCount) i := 0 for _, s := range fstSyms { lookAhead[i] = s @@ -210,13 +214,13 @@ func genLALR1Closure(srcItem *lrItem, prods 
*productionSet, first *firstSet) ([] } } - newItem.lookAhead.symbols = map[symbol]struct{}{ + newItem.lookAhead.symbols = map[symbol.Symbol]struct{}{ a: {}, } items = append(items, newItem) if knownItems[newItem.id] == nil { - knownItems[newItem.id] = map[symbol]struct{}{} + knownItems[newItem.id] = map[symbol.Symbol]struct{}{} } knownItems[newItem.id][a] = struct{}{} nextUncheckedItems = append(nextUncheckedItems, newItem) @@ -297,7 +301,7 @@ func propagateLookAhead(lr0 *lr0Automaton, props []*propagation) error { } if destItem.lookAhead.symbols == nil { - destItem.lookAhead.symbols = map[symbol]struct{}{} + destItem.lookAhead.symbols = map[symbol.Symbol]struct{}{} } destItem.lookAhead.symbols[a] = struct{}{} diff --git a/grammar/lalr1_test.go b/grammar/lalr1_test.go index d6d0371..c57dc5c 100644 --- a/grammar/lalr1_test.go +++ b/grammar/lalr1_test.go @@ -4,7 +4,8 @@ import ( "strings" "testing" - spec "github.com/nihei9/vartan/spec/grammar" + "github.com/nihei9/vartan/grammar/symbol" + "github.com/nihei9/vartan/spec/grammar/parser" ) func TestGenLALR1Automaton(t *testing.T) { @@ -23,15 +24,14 @@ id: "[A-Za-z0-9_]+"; var gram *Grammar var automaton *lalr1Automaton { - ast, err := spec.Parse(strings.NewReader(src)) + ast, err := parser.Parse(strings.NewReader(src)) if err != nil { t.Fatal(err) } b := GrammarBuilder{ AST: ast, } - - gram, err = b.Build() + gram, err = b.build() if err != nil { t.Fatal(err) } @@ -66,42 +66,42 @@ id: "[A-Za-z0-9_]+"; expectedKernels := map[int][]*lrItem{ 0: { - withLookAhead(genLR0Item("s'", 0, "s"), symbolEOF), + withLookAhead(genLR0Item("s'", 0, "s"), symbol.SymbolEOF), }, 1: { - withLookAhead(genLR0Item("s'", 1, "s"), symbolEOF), + withLookAhead(genLR0Item("s'", 1, "s"), symbol.SymbolEOF), }, 2: { - withLookAhead(genLR0Item("s", 1, "l", "eq", "r"), symbolEOF), - withLookAhead(genLR0Item("r", 1, "l"), symbolEOF), + withLookAhead(genLR0Item("s", 1, "l", "eq", "r"), symbol.SymbolEOF), + withLookAhead(genLR0Item("r", 1, "l"), 
symbol.SymbolEOF), }, 3: { - withLookAhead(genLR0Item("s", 1, "r"), symbolEOF), + withLookAhead(genLR0Item("s", 1, "r"), symbol.SymbolEOF), }, 4: { - withLookAhead(genLR0Item("l", 1, "ref", "r"), genSym("eq"), symbolEOF), + withLookAhead(genLR0Item("l", 1, "ref", "r"), genSym("eq"), symbol.SymbolEOF), }, 5: { - withLookAhead(genLR0Item("l", 1, "id"), genSym("eq"), symbolEOF), + withLookAhead(genLR0Item("l", 1, "id"), genSym("eq"), symbol.SymbolEOF), }, 6: { - withLookAhead(genLR0Item("s", 2, "l", "eq", "r"), symbolEOF), + withLookAhead(genLR0Item("s", 2, "l", "eq", "r"), symbol.SymbolEOF), }, 7: { - withLookAhead(genLR0Item("l", 2, "ref", "r"), genSym("eq"), symbolEOF), + withLookAhead(genLR0Item("l", 2, "ref", "r"), genSym("eq"), symbol.SymbolEOF), }, 8: { - withLookAhead(genLR0Item("r", 1, "l"), genSym("eq"), symbolEOF), + withLookAhead(genLR0Item("r", 1, "l"), genSym("eq"), symbol.SymbolEOF), }, 9: { - withLookAhead(genLR0Item("s", 3, "l", "eq", "r"), symbolEOF), + withLookAhead(genLR0Item("s", 3, "l", "eq", "r"), symbol.SymbolEOF), }, } expectedStates := []*expectedLRState{ { kernelItems: expectedKernels[0], - nextStates: map[symbol][]*lrItem{ + nextStates: map[symbol.Symbol][]*lrItem{ genSym("s"): expectedKernels[1], genSym("l"): expectedKernels[2], genSym("r"): expectedKernels[3], @@ -112,14 +112,14 @@ id: "[A-Za-z0-9_]+"; }, { kernelItems: expectedKernels[1], - nextStates: map[symbol][]*lrItem{}, + nextStates: map[symbol.Symbol][]*lrItem{}, reducibleProds: []*production{ genProd("s'", "s"), }, }, { kernelItems: expectedKernels[2], - nextStates: map[symbol][]*lrItem{ + nextStates: map[symbol.Symbol][]*lrItem{ genSym("eq"): expectedKernels[6], }, reducibleProds: []*production{ @@ -128,14 +128,14 @@ id: "[A-Za-z0-9_]+"; }, { kernelItems: expectedKernels[3], - nextStates: map[symbol][]*lrItem{}, + nextStates: map[symbol.Symbol][]*lrItem{}, reducibleProds: []*production{ genProd("s", "r"), }, }, { kernelItems: expectedKernels[4], - nextStates: 
map[symbol][]*lrItem{ + nextStates: map[symbol.Symbol][]*lrItem{ genSym("r"): expectedKernels[7], genSym("l"): expectedKernels[8], genSym("ref"): expectedKernels[4], @@ -145,14 +145,14 @@ id: "[A-Za-z0-9_]+"; }, { kernelItems: expectedKernels[5], - nextStates: map[symbol][]*lrItem{}, + nextStates: map[symbol.Symbol][]*lrItem{}, reducibleProds: []*production{ genProd("l", "id"), }, }, { kernelItems: expectedKernels[6], - nextStates: map[symbol][]*lrItem{ + nextStates: map[symbol.Symbol][]*lrItem{ genSym("r"): expectedKernels[9], genSym("l"): expectedKernels[8], genSym("ref"): expectedKernels[4], @@ -162,21 +162,21 @@ id: "[A-Za-z0-9_]+"; }, { kernelItems: expectedKernels[7], - nextStates: map[symbol][]*lrItem{}, + nextStates: map[symbol.Symbol][]*lrItem{}, reducibleProds: []*production{ genProd("l", "ref", "r"), }, }, { kernelItems: expectedKernels[8], - nextStates: map[symbol][]*lrItem{}, + nextStates: map[symbol.Symbol][]*lrItem{}, reducibleProds: []*production{ genProd("r", "l"), }, }, { kernelItems: expectedKernels[9], - nextStates: map[symbol][]*lrItem{}, + nextStates: map[symbol.Symbol][]*lrItem{}, reducibleProds: []*production{ genProd("s", "l", "eq", "r"), }, diff --git a/grammar/lexical/compiler.go b/grammar/lexical/compiler.go new file mode 100644 index 0000000..61aa3f2 --- /dev/null +++ b/grammar/lexical/compiler.go @@ -0,0 +1,413 @@ +package lexical + +import ( + "bytes" + "fmt" + + "github.com/nihei9/vartan/compressor" + "github.com/nihei9/vartan/grammar/lexical/dfa" + psr "github.com/nihei9/vartan/grammar/lexical/parser" + spec "github.com/nihei9/vartan/spec/grammar" +) + +type CompileError struct { + Kind spec.LexKindName + Fragment bool + Cause error + Detail string +} + +func Compile(lexspec *LexSpec, compLv int) (*spec.LexicalSpec, error, []*CompileError) { + err := lexspec.Validate() + if err != nil { + return nil, fmt.Errorf("invalid lexical specification:\n%w", err), nil + } + + modeEntries, modeNames, modeName2ID, fragmetns := 
groupEntriesByLexMode(lexspec.Entries) + + modeSpecs := []*spec.CompiledLexModeSpec{ + nil, + } + for i, es := range modeEntries[1:] { + modeName := modeNames[i+1] + modeSpec, err, cerrs := compile(es, modeName2ID, fragmetns, compLv) + if err != nil { + return nil, fmt.Errorf("failed to compile in %v mode: %w", modeName, err), cerrs + } + modeSpecs = append(modeSpecs, modeSpec) + } + + var kindNames []spec.LexKindName + var name2ID map[spec.LexKindName]spec.LexKindID + { + name2ID = map[spec.LexKindName]spec.LexKindID{} + id := spec.LexKindIDMin + for _, modeSpec := range modeSpecs[1:] { + for _, name := range modeSpec.KindNames[1:] { + if _, ok := name2ID[name]; ok { + continue + } + name2ID[name] = id + id++ + } + } + + kindNames = make([]spec.LexKindName, len(name2ID)+1) + for name, id := range name2ID { + kindNames[id] = name + } + } + + var kindIDs [][]spec.LexKindID + { + kindIDs = make([][]spec.LexKindID, len(modeSpecs)) + for i, modeSpec := range modeSpecs[1:] { + ids := make([]spec.LexKindID, len(modeSpec.KindNames)) + for modeID, name := range modeSpec.KindNames { + if modeID == 0 { + continue + } + ids[modeID] = name2ID[name] + } + kindIDs[i+1] = ids + } + } + + return &spec.LexicalSpec{ + InitialModeID: spec.LexModeIDDefault, + ModeNames: modeNames, + KindNames: kindNames, + KindIDs: kindIDs, + CompressionLevel: compLv, + Specs: modeSpecs, + }, nil, nil +} + +func groupEntriesByLexMode(entries []*LexEntry) ([][]*LexEntry, []spec.LexModeName, map[spec.LexModeName]spec.LexModeID, map[spec.LexKindName]*LexEntry) { + modeNames := []spec.LexModeName{ + spec.LexModeNameNil, + spec.LexModeNameDefault, + } + modeName2ID := map[spec.LexModeName]spec.LexModeID{ + spec.LexModeNameNil: spec.LexModeIDNil, + spec.LexModeNameDefault: spec.LexModeIDDefault, + } + lastModeID := spec.LexModeIDDefault + modeEntries := [][]*LexEntry{ + nil, + {}, + } + fragments := map[spec.LexKindName]*LexEntry{} + for _, e := range entries { + if e.Fragment { + fragments[e.Kind] = e + 
continue + } + ms := e.Modes + if len(ms) == 0 { + ms = []spec.LexModeName{ + spec.LexModeNameDefault, + } + } + for _, modeName := range ms { + modeID, ok := modeName2ID[modeName] + if !ok { + modeID = lastModeID + 1 + lastModeID = modeID + modeName2ID[modeName] = modeID + modeNames = append(modeNames, modeName) + modeEntries = append(modeEntries, []*LexEntry{}) + } + modeEntries[modeID] = append(modeEntries[modeID], e) + } + } + return modeEntries, modeNames, modeName2ID, fragments +} + +func compile( + entries []*LexEntry, + modeName2ID map[spec.LexModeName]spec.LexModeID, + fragments map[spec.LexKindName]*LexEntry, + compLv int, +) (*spec.CompiledLexModeSpec, error, []*CompileError) { + var kindNames []spec.LexKindName + kindIDToName := map[spec.LexModeKindID]spec.LexKindName{} + var patterns map[spec.LexModeKindID][]byte + { + kindNames = append(kindNames, spec.LexKindNameNil) + patterns = map[spec.LexModeKindID][]byte{} + for i, e := range entries { + kindID := spec.LexModeKindID(i + 1) + + kindNames = append(kindNames, e.Kind) + kindIDToName[kindID] = e.Kind + patterns[kindID] = []byte(e.Pattern) + } + } + + push := []spec.LexModeID{ + spec.LexModeIDNil, + } + pop := []int{ + 0, + } + for _, e := range entries { + pushV := spec.LexModeIDNil + if e.Push != "" { + pushV = modeName2ID[e.Push] + } + push = append(push, pushV) + popV := 0 + if e.Pop { + popV = 1 + } + pop = append(pop, popV) + } + + fragmentPatterns := map[spec.LexKindName][]byte{} + for k, e := range fragments { + fragmentPatterns[k] = []byte(e.Pattern) + } + + fragmentCPTrees := make(map[spec.LexKindName]psr.CPTree, len(fragmentPatterns)) + { + var cerrs []*CompileError + for kind, pat := range fragmentPatterns { + p := psr.NewParser(kind, bytes.NewReader(pat)) + t, err := p.Parse() + if err != nil { + if err == psr.ParseErr { + detail, cause := p.Error() + cerrs = append(cerrs, &CompileError{ + Kind: kind, + Fragment: true, + Cause: cause, + Detail: detail, + }) + } else { + cerrs = 
append(cerrs, &CompileError{ + Kind: kind, + Fragment: true, + Cause: err, + }) + } + continue + } + fragmentCPTrees[kind] = t + } + if len(cerrs) > 0 { + return nil, fmt.Errorf("compile error"), cerrs + } + + err := psr.CompleteFragments(fragmentCPTrees) + if err != nil { + if err == psr.ParseErr { + for _, frag := range fragmentCPTrees { + kind, frags, err := frag.Describe() + if err != nil { + return nil, err, nil + } + + cerrs = append(cerrs, &CompileError{ + Kind: kind, + Fragment: true, + Cause: fmt.Errorf("fragment contains undefined fragments or cycles"), + Detail: fmt.Sprintf("%v", frags), + }) + } + + return nil, fmt.Errorf("compile error"), cerrs + } + + return nil, err, nil + } + } + + cpTrees := map[spec.LexModeKindID]psr.CPTree{} + { + pats := make([]*psr.PatternEntry, len(patterns)+1) + pats[spec.LexModeKindIDNil] = &psr.PatternEntry{ + ID: spec.LexModeKindIDNil, + } + for id, pattern := range patterns { + pats[id] = &psr.PatternEntry{ + ID: id, + Pattern: pattern, + } + } + + var cerrs []*CompileError + for _, pat := range pats { + if pat.ID == spec.LexModeKindIDNil { + continue + } + + p := psr.NewParser(kindIDToName[pat.ID], bytes.NewReader(pat.Pattern)) + t, err := p.Parse() + if err != nil { + if err == psr.ParseErr { + detail, cause := p.Error() + cerrs = append(cerrs, &CompileError{ + Kind: kindIDToName[pat.ID], + Fragment: false, + Cause: cause, + Detail: detail, + }) + } else { + cerrs = append(cerrs, &CompileError{ + Kind: kindIDToName[pat.ID], + Fragment: false, + Cause: err, + }) + } + continue + } + + complete, err := psr.ApplyFragments(t, fragmentCPTrees) + if err != nil { + return nil, err, nil + } + if !complete { + _, frags, err := t.Describe() + if err != nil { + return nil, err, nil + } + + cerrs = append(cerrs, &CompileError{ + Kind: kindIDToName[pat.ID], + Fragment: false, + Cause: fmt.Errorf("pattern contains undefined fragments"), + Detail: fmt.Sprintf("%v", frags), + }) + continue + } + + cpTrees[pat.ID] = t + } + if 
len(cerrs) > 0 { + return nil, fmt.Errorf("compile error"), cerrs + } + } + + var tranTab *spec.TransitionTable + { + root, symTab, err := dfa.ConvertCPTreeToByteTree(cpTrees) + if err != nil { + return nil, err, nil + } + d := dfa.GenDFA(root, symTab) + tranTab, err = dfa.GenTransitionTable(d) + if err != nil { + return nil, err, nil + } + } + + var err error + switch compLv { + case 2: + tranTab, err = compressTransitionTableLv2(tranTab) + if err != nil { + return nil, err, nil + } + case 1: + tranTab, err = compressTransitionTableLv1(tranTab) + if err != nil { + return nil, err, nil + } + } + + return &spec.CompiledLexModeSpec{ + KindNames: kindNames, + Push: push, + Pop: pop, + DFA: tranTab, + }, nil, nil +} + +const ( + CompressionLevelMin = 0 + CompressionLevelMax = 2 +) + +func compressTransitionTableLv2(tranTab *spec.TransitionTable) (*spec.TransitionTable, error) { + ueTab := compressor.NewUniqueEntriesTable() + { + orig, err := compressor.NewOriginalTable(convertStateIDSliceToIntSlice(tranTab.UncompressedTransition), tranTab.ColCount) + if err != nil { + return nil, err + } + err = ueTab.Compress(orig) + if err != nil { + return nil, err + } + } + + rdTab := compressor.NewRowDisplacementTable(0) + { + orig, err := compressor.NewOriginalTable(ueTab.UniqueEntries, ueTab.OriginalColCount) + if err != nil { + return nil, err + } + err = rdTab.Compress(orig) + if err != nil { + return nil, err + } + } + + tranTab.Transition = &spec.UniqueEntriesTable{ + UniqueEntries: &spec.RowDisplacementTable{ + OriginalRowCount: rdTab.OriginalRowCount, + OriginalColCount: rdTab.OriginalColCount, + EmptyValue: spec.StateIDNil, + Entries: convertIntSliceToStateIDSlice(rdTab.Entries), + Bounds: rdTab.Bounds, + RowDisplacement: rdTab.RowDisplacement, + }, + RowNums: ueTab.RowNums, + OriginalRowCount: ueTab.OriginalRowCount, + OriginalColCount: ueTab.OriginalColCount, + } + tranTab.UncompressedTransition = nil + + return tranTab, nil +} + +func compressTransitionTableLv1(tranTab 
*spec.TransitionTable) (*spec.TransitionTable, error) { + ueTab := compressor.NewUniqueEntriesTable() + { + orig, err := compressor.NewOriginalTable(convertStateIDSliceToIntSlice(tranTab.UncompressedTransition), tranTab.ColCount) + if err != nil { + return nil, err + } + err = ueTab.Compress(orig) + if err != nil { + return nil, err + } + } + + tranTab.Transition = &spec.UniqueEntriesTable{ + UncompressedUniqueEntries: convertIntSliceToStateIDSlice(ueTab.UniqueEntries), + RowNums: ueTab.RowNums, + OriginalRowCount: ueTab.OriginalRowCount, + OriginalColCount: ueTab.OriginalColCount, + } + tranTab.UncompressedTransition = nil + + return tranTab, nil +} + +func convertStateIDSliceToIntSlice(s []spec.StateID) []int { + is := make([]int, len(s)) + for i, v := range s { + is[i] = v.Int() + } + return is +} + +func convertIntSliceToStateIDSlice(s []int) []spec.StateID { + ss := make([]spec.StateID, len(s)) + for i, v := range s { + ss[i] = spec.StateID(v) + } + return ss +} diff --git a/grammar/lexical/compiler_test.go b/grammar/lexical/compiler_test.go new file mode 100644 index 0000000..f2ef0f2 --- /dev/null +++ b/grammar/lexical/compiler_test.go @@ -0,0 +1,338 @@ +package lexical + +import ( + "encoding/json" + "fmt" + "testing" + + spec "github.com/nihei9/vartan/spec/grammar" +) + +func TestLexSpec_Validate(t *testing.T) { + // We expect that the spelling inconsistency error will occur. + spec := &LexSpec{ + Entries: []*LexEntry{ + { + Modes: []spec.LexModeName{ + // 'Default' is the spelling inconsistency because 'default' is predefined. 
+ "Default", + }, + Kind: "foo", + Pattern: "foo", + }, + }, + } + err := spec.Validate() + if err == nil { + t.Fatalf("expected error didn't occur") + } +} + +func TestSnakeCaseToUpperCamelCase(t *testing.T) { + tests := []struct { + snake string + camel string + }{ + { + snake: "foo", + camel: "Foo", + }, + { + snake: "foo_bar", + camel: "FooBar", + }, + { + snake: "foo_bar_baz", + camel: "FooBarBaz", + }, + { + snake: "Foo", + camel: "Foo", + }, + { + snake: "fooBar", + camel: "FooBar", + }, + { + snake: "FOO", + camel: "FOO", + }, + { + snake: "FOO_BAR", + camel: "FOOBAR", + }, + { + snake: "_foo_bar_", + camel: "FooBar", + }, + { + snake: "___foo___bar___", + camel: "FooBar", + }, + } + for _, tt := range tests { + c := SnakeCaseToUpperCamelCase(tt.snake) + if c != tt.camel { + t.Errorf("unexpected string; want: %v, got: %v", tt.camel, c) + } + } +} + +func TestFindSpellingInconsistencies(t *testing.T) { + tests := []struct { + ids []string + duplicated [][]string + }{ + { + ids: []string{"foo", "foo"}, + duplicated: nil, + }, + { + ids: []string{"foo", "Foo"}, + duplicated: [][]string{{"Foo", "foo"}}, + }, + { + ids: []string{"foo", "foo", "Foo"}, + duplicated: [][]string{{"Foo", "foo"}}, + }, + { + ids: []string{"foo_bar_baz", "FooBarBaz"}, + duplicated: [][]string{{"FooBarBaz", "foo_bar_baz"}}, + }, + { + ids: []string{"foo", "Foo", "bar", "Bar"}, + duplicated: [][]string{{"Bar", "bar"}, {"Foo", "foo"}}, + }, + { + ids: []string{"foo", "Foo", "bar", "Bar", "baz", "bra"}, + duplicated: [][]string{{"Bar", "bar"}, {"Foo", "foo"}}, + }, + } + for i, tt := range tests { + t.Run(fmt.Sprintf("#%v", i), func(t *testing.T) { + duplicated := FindSpellingInconsistencies(tt.ids) + if len(duplicated) != len(tt.duplicated) { + t.Fatalf("unexpected IDs; want: %#v, got: %#v", tt.duplicated, duplicated) + } + for i, dupIDs := range duplicated { + if len(dupIDs) != len(tt.duplicated[i]) { + t.Fatalf("unexpected IDs; want: %#v, got: %#v", tt.duplicated[i], dupIDs) + } + for 
j, id := range dupIDs { + if id != tt.duplicated[i][j] { + t.Fatalf("unexpected IDs; want: %#v, got: %#v", tt.duplicated[i], dupIDs) + } + } + } + }) + } +} + +func TestCompile(t *testing.T) { + tests := []struct { + Caption string + Spec string + Err bool + }{ + { + Caption: "allow duplicates names between fragments and non-fragments", + Spec: ` +{ + "name": "test", + "entries": [ + { + "kind": "a2z", + "pattern": "\\f{a2z}" + }, + { + "fragment": true, + "kind": "a2z", + "pattern": "[a-z]" + } + ] +} +`, + }, + { + Caption: "don't allow duplicates names in non-fragments", + Spec: ` +{ + "name": "test", + "entries": [ + { + "kind": "a2z", + "pattern": "a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z" + }, + { + "kind": "a2z", + "pattern": "[a-z]" + } + ] +} +`, + Err: true, + }, + { + Caption: "don't allow duplicates names in fragments", + Spec: ` +{ + "name": "test", + "entries": [ + { + "kind": "a2z", + "pattern": "\\f{a2z}" + }, + { + "fragments": true, + "kind": "a2z", + "pattern": "a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z" + }, + { + "fragments": true, + "kind": "a2z", + "pattern": "[a-z]" + } + ] +} +`, + Err: true, + }, + { + Caption: "don't allow kind names in the same mode to contain spelling inconsistencies", + Spec: ` +{ + "name": "test", + "entries": [ + { + "kind": "foo_1", + "pattern": "foo_1" + }, + { + "kind": "foo1", + "pattern": "foo1" + } + ] +} +`, + Err: true, + }, + { + Caption: "don't allow kind names across modes to contain spelling inconsistencies", + Spec: ` +{ + "name": "test", + "entries": [ + { + "modes": ["default"], + "kind": "foo_1", + "pattern": "foo_1" + }, + { + "modes": ["other_mode"], + "kind": "foo1", + "pattern": "foo1" + } + ] +} +`, + Err: true, + }, + { + Caption: "don't allow mode names to contain spelling inconsistencies", + Spec: ` +{ + "name": "test", + "entries": [ + { + "modes": ["foo_1"], + "kind": "a", + "pattern": "a" + }, + { + "modes": ["foo1"], + "kind": "b", + "pattern": "b" + } + ] +} +`, + 
Err: true, + }, + { + Caption: "allow fragment names in the same mode to contain spelling inconsistencies because fragments will not appear in output files", + Spec: ` +{ + "name": "test", + "entries": [ + { + "kind": "a", + "pattern": "a" + }, + { + "fragment": true, + "kind": "foo_1", + "pattern": "foo_1" + }, + { + "fragment": true, + "kind": "foo1", + "pattern": "foo1" + } + ] +} +`, + }, + { + Caption: "allow fragment names across modes to contain spelling inconsistencies because fragments will not appear in output files", + Spec: ` +{ + "name": "test", + "entries": [ + { + "modes": ["default"], + "kind": "a", + "pattern": "a" + }, + { + "modes": ["default"], + "fragment": true, + "kind": "foo_1", + "pattern": "foo_1" + }, + { + "modes": ["other_mode"], + "fragment": true, + "kind": "foo1", + "pattern": "foo1" + } + ] +} +`, + }, + } + for i, tt := range tests { + t.Run(fmt.Sprintf("#%v %s", i, tt.Caption), func(t *testing.T) { + lspec := &LexSpec{} + err := json.Unmarshal([]byte(tt.Spec), lspec) + if err != nil { + t.Fatalf("%v", err) + } + clspec, err, _ := Compile(lspec, CompressionLevelMin) + if tt.Err { + if err == nil { + t.Fatalf("expected an error") + } + if clspec != nil { + t.Fatalf("Compile function mustn't return a compiled specification") + } + } else { + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if clspec == nil { + t.Fatalf("Compile function must return a compiled specification") + } + } + }) + } +} diff --git a/grammar/lexical/dfa/dfa.go b/grammar/lexical/dfa/dfa.go new file mode 100644 index 0000000..850264a --- /dev/null +++ b/grammar/lexical/dfa/dfa.go @@ -0,0 +1,173 @@ +package dfa + +import ( + "sort" + + spec "github.com/nihei9/vartan/spec/grammar" +) + +type symbolTable struct { + symPos2Byte map[symbolPosition]byteRange + endPos2ID map[symbolPosition]spec.LexModeKindID +} + +func genSymbolTable(root byteTree) *symbolTable { + symTab := &symbolTable{ + symPos2Byte: map[symbolPosition]byteRange{}, + endPos2ID: 
map[symbolPosition]spec.LexModeKindID{}, + } + return genSymTab(symTab, root) +} + +func genSymTab(symTab *symbolTable, node byteTree) *symbolTable { + if node == nil { + return symTab + } + + switch n := node.(type) { + case *symbolNode: + symTab.symPos2Byte[n.pos] = byteRange{ + from: n.from, + to: n.to, + } + case *endMarkerNode: + symTab.endPos2ID[n.pos] = n.id + default: + left, right := node.children() + genSymTab(symTab, left) + genSymTab(symTab, right) + } + return symTab +} + +type DFA struct { + States []string + InitialState string + AcceptingStatesTable map[string]spec.LexModeKindID + TransitionTable map[string][256]string +} + +func GenDFA(root byteTree, symTab *symbolTable) *DFA { + initialState := root.first() + initialStateHash := initialState.hash() + stateMap := map[string]*symbolPositionSet{ + initialStateHash: initialState, + } + tranTab := map[string][256]string{} + { + follow := genFollowTable(root) + unmarkedStates := map[string]*symbolPositionSet{ + initialStateHash: initialState, + } + for len(unmarkedStates) > 0 { + nextUnmarkedStates := map[string]*symbolPositionSet{} + for hash, state := range unmarkedStates { + tranTabOfState := [256]*symbolPositionSet{} + for _, pos := range state.set() { + if pos.isEndMark() { + continue + } + valRange := symTab.symPos2Byte[pos] + for symVal := valRange.from; symVal <= valRange.to; symVal++ { + if tranTabOfState[symVal] == nil { + tranTabOfState[symVal] = newSymbolPositionSet() + } + tranTabOfState[symVal].merge(follow[pos]) + } + } + for _, t := range tranTabOfState { + if t == nil { + continue + } + h := t.hash() + if _, ok := stateMap[h]; ok { + continue + } + stateMap[h] = t + nextUnmarkedStates[h] = t + } + tabOfState := [256]string{} + for v, t := range tranTabOfState { + if t == nil { + continue + } + tabOfState[v] = t.hash() + } + tranTab[hash] = tabOfState + } + unmarkedStates = nextUnmarkedStates + } + } + + accTab := map[string]spec.LexModeKindID{} + { + for h, s := range stateMap { + for 
_, pos := range s.set() { + if !pos.isEndMark() { + continue + } + priorID, ok := accTab[h] + if !ok { + accTab[h] = symTab.endPos2ID[pos] + } else { + id := symTab.endPos2ID[pos] + if id < priorID { + accTab[h] = id + } + } + } + } + } + + var states []string + { + for s := range stateMap { + states = append(states, s) + } + sort.Slice(states, func(i, j int) bool { + return states[i] < states[j] + }) + } + + return &DFA{ + States: states, + InitialState: initialStateHash, + AcceptingStatesTable: accTab, + TransitionTable: tranTab, + } +} + +func GenTransitionTable(dfa *DFA) (*spec.TransitionTable, error) { + stateHash2ID := map[string]spec.StateID{} + for i, s := range dfa.States { + // Since 0 represents an invalid value in a transition table, + // assign a number greater than or equal to 1 to states. + stateHash2ID[s] = spec.StateID(i + spec.StateIDMin.Int()) + } + + acc := make([]spec.LexModeKindID, len(dfa.States)+1) + for _, s := range dfa.States { + id, ok := dfa.AcceptingStatesTable[s] + if !ok { + continue + } + acc[stateHash2ID[s]] = id + } + + rowCount := len(dfa.States) + 1 + colCount := 256 + tran := make([]spec.StateID, rowCount*colCount) + for s, tab := range dfa.TransitionTable { + for v, to := range tab { + tran[stateHash2ID[s].Int()*256+v] = stateHash2ID[to] + } + } + + return &spec.TransitionTable{ + InitialStateID: stateHash2ID[dfa.InitialState], + AcceptingStates: acc, + UncompressedTransition: tran, + RowCount: rowCount, + ColCount: colCount, + }, nil +} diff --git a/grammar/lexical/dfa/dfa_test.go b/grammar/lexical/dfa/dfa_test.go new file mode 100644 index 0000000..ae71875 --- /dev/null +++ b/grammar/lexical/dfa/dfa_test.go @@ -0,0 +1,121 @@ +package dfa + +import ( + "strings" + "testing" + + "github.com/nihei9/vartan/grammar/lexical/parser" + spec "github.com/nihei9/vartan/spec/grammar" +) + +func TestGenDFA(t *testing.T) { + p := parser.NewParser(spec.LexKindName("test"), strings.NewReader("(a|b)*abb")) + cpt, err := p.Parse() + if err != 
nil { + t.Fatal(err) + } + bt, symTab, err := ConvertCPTreeToByteTree(map[spec.LexModeKindID]parser.CPTree{ + spec.LexModeKindIDMin: cpt, + }) + if err != nil { + t.Fatal(err) + } + dfa := GenDFA(bt, symTab) + if dfa == nil { + t.Fatalf("DFA is nil") + } + + symPos := func(n uint16) symbolPosition { + pos, err := newSymbolPosition(n, false) + if err != nil { + panic(err) + } + return pos + } + + endPos := func(n uint16) symbolPosition { + pos, err := newSymbolPosition(n, true) + if err != nil { + panic(err) + } + return pos + } + + s0 := newSymbolPositionSet().add(symPos(1)).add(symPos(2)).add(symPos(3)) + s1 := newSymbolPositionSet().add(symPos(1)).add(symPos(2)).add(symPos(3)).add(symPos(4)) + s2 := newSymbolPositionSet().add(symPos(1)).add(symPos(2)).add(symPos(3)).add(symPos(5)) + s3 := newSymbolPositionSet().add(symPos(1)).add(symPos(2)).add(symPos(3)).add(endPos(6)) + + rune2Int := func(char rune, index int) uint8 { + return uint8([]byte(string(char))[index]) + } + + tranS0 := [256]string{} + tranS0[rune2Int('a', 0)] = s1.hash() + tranS0[rune2Int('b', 0)] = s0.hash() + + tranS1 := [256]string{} + tranS1[rune2Int('a', 0)] = s1.hash() + tranS1[rune2Int('b', 0)] = s2.hash() + + tranS2 := [256]string{} + tranS2[rune2Int('a', 0)] = s1.hash() + tranS2[rune2Int('b', 0)] = s3.hash() + + tranS3 := [256]string{} + tranS3[rune2Int('a', 0)] = s1.hash() + tranS3[rune2Int('b', 0)] = s0.hash() + + expectedTranTab := map[string][256]string{ + s0.hash(): tranS0, + s1.hash(): tranS1, + s2.hash(): tranS2, + s3.hash(): tranS3, + } + if len(dfa.TransitionTable) != len(expectedTranTab) { + t.Errorf("transition table is mismatched: want: %v entries, got: %v entries", len(expectedTranTab), len(dfa.TransitionTable)) + } + for h, eTranTab := range expectedTranTab { + tranTab, ok := dfa.TransitionTable[h] + if !ok { + t.Errorf("no entry; hash: %v", h) + continue + } + if len(tranTab) != len(eTranTab) { + t.Errorf("transition table is mismatched: hash: %v, want: %v entries, got: %v 
entries", h, len(eTranTab), len(tranTab)) + } + for c, eNext := range eTranTab { + if eNext == "" { + continue + } + + next := tranTab[c] + if next == "" { + t.Errorf("no enatry: hash: %v, char: %v", h, c) + } + if next != eNext { + t.Errorf("next state is mismatched: want: %v, got: %v", eNext, next) + } + } + } + + if dfa.InitialState != s0.hash() { + t.Errorf("initial state is mismatched: want: %v, got: %v", s0.hash(), dfa.InitialState) + } + + accTab := map[string]spec.LexModeKindID{ + s3.hash(): 1, + } + if len(dfa.AcceptingStatesTable) != len(accTab) { + t.Errorf("accepting states are mismatched: want: %v entries, got: %v entries", len(accTab), len(dfa.AcceptingStatesTable)) + } + for eState, eID := range accTab { + id, ok := dfa.AcceptingStatesTable[eState] + if !ok { + t.Errorf("accepting state is not found: state: %v", eState) + } + if id != eID { + t.Errorf("ID is mismatched: state: %v, want: %v, got: %v", eState, eID, id) + } + } +} diff --git a/grammar/lexical/dfa/symbol_position.go b/grammar/lexical/dfa/symbol_position.go new file mode 100644 index 0000000..f154251 --- /dev/null +++ b/grammar/lexical/dfa/symbol_position.go @@ -0,0 +1,182 @@ +package dfa + +import ( + "encoding/binary" + "fmt" + "strings" +) + +type symbolPosition uint16 + +const ( + symbolPositionNil symbolPosition = 0x0000 + + symbolPositionMin uint16 = 0x0001 + symbolPositionMax uint16 = 0x7fff + + symbolPositionMaskSymbol uint16 = 0x0000 + symbolPositionMaskEndMark uint16 = 0x8000 + + symbolPositionMaskValue uint16 = 0x7fff +) + +func newSymbolPosition(n uint16, endMark bool) (symbolPosition, error) { + if n < symbolPositionMin || n > symbolPositionMax { + return symbolPositionNil, fmt.Errorf("symbol position must be within %v to %v: n: %v, endMark: %v", symbolPositionMin, symbolPositionMax, n, endMark) + } + if endMark { + return symbolPosition(n | symbolPositionMaskEndMark), nil + } + return symbolPosition(n | symbolPositionMaskSymbol), nil +} + +func (p symbolPosition) String() 
string { + if p.isEndMark() { + return fmt.Sprintf("end#%v", uint16(p)&symbolPositionMaskValue) + } + return fmt.Sprintf("sym#%v", uint16(p)&symbolPositionMaskValue) +} + +func (p symbolPosition) isEndMark() bool { + return uint16(p)&symbolPositionMaskEndMark > 1 +} + +func (p symbolPosition) describe() (uint16, bool) { + v := uint16(p) & symbolPositionMaskValue + if p.isEndMark() { + return v, true + } + return v, false +} + +type symbolPositionSet struct { + // `s` represents a set of symbol positions. + // However, immediately after adding a symbol position, the elements may be duplicated. + // When you need an aligned set with no duplicates, you can get such value via the set function. + s []symbolPosition + sorted bool +} + +func newSymbolPositionSet() *symbolPositionSet { + return &symbolPositionSet{ + s: []symbolPosition{}, + sorted: false, + } +} + +func (s *symbolPositionSet) String() string { + if len(s.s) <= 0 { + return "{}" + } + ps := s.sortAndRemoveDuplicates() + var b strings.Builder + fmt.Fprintf(&b, "{") + for i, p := range ps { + if i <= 0 { + fmt.Fprintf(&b, "%v", p) + continue + } + fmt.Fprintf(&b, ", %v", p) + } + fmt.Fprintf(&b, "}") + return b.String() +} + +func (s *symbolPositionSet) set() []symbolPosition { + s.sortAndRemoveDuplicates() + return s.s +} + +func (s *symbolPositionSet) add(pos symbolPosition) *symbolPositionSet { + s.s = append(s.s, pos) + s.sorted = false + return s +} + +func (s *symbolPositionSet) merge(t *symbolPositionSet) *symbolPositionSet { + s.s = append(s.s, t.s...) + s.sorted = false + return s +} + +func (s *symbolPositionSet) hash() string { + if len(s.s) <= 0 { + return "" + } + sorted := s.sortAndRemoveDuplicates() + var buf []byte + for _, p := range sorted { + b := make([]byte, 8) + binary.PutUvarint(b, uint64(p)) + buf = append(buf, b...) + } + // Convert to a string to be able to use it as a key of a map. 
+ // But note this byte sequence is made from values of symbol positions, + // so this is not a well-formed UTF-8 sequence. + return string(buf) +} + +func (s *symbolPositionSet) sortAndRemoveDuplicates() []symbolPosition { + if s.sorted { + return s.s + } + + sortSymbolPositions(s.s, 0, len(s.s)-1) + + // Remove duplicates. + lastV := s.s[0] + nextIdx := 1 + for _, v := range s.s[1:] { + if v == lastV { + continue + } + s.s[nextIdx] = v + nextIdx++ + lastV = v + } + s.s = s.s[:nextIdx] + s.sorted = true + + return s.s +} + +// sortSymbolPositions sorts a slice of symbol positions as it uses quick sort. +func sortSymbolPositions(ps []symbolPosition, left, right int) { + if left >= right { + return + } + var pivot symbolPosition + { + // Use a median as a pivot. + p1 := ps[left] + p2 := ps[(left+right)/2] + p3 := ps[right] + if p1 > p2 { + p1, p2 = p2, p1 + } + if p2 > p3 { + p2 = p3 + if p1 > p2 { + p2 = p1 + } + } + pivot = p2 + } + i := left + j := right + for i <= j { + for ps[i] < pivot { + i++ + } + for ps[j] > pivot { + j-- + } + if i <= j { + ps[i], ps[j] = ps[j], ps[i] + i++ + j-- + } + } + sortSymbolPositions(ps, left, j) + sortSymbolPositions(ps, i, right) +} diff --git a/grammar/lexical/dfa/symbol_position_test.go b/grammar/lexical/dfa/symbol_position_test.go new file mode 100644 index 0000000..c867f64 --- /dev/null +++ b/grammar/lexical/dfa/symbol_position_test.go @@ -0,0 +1,79 @@ +package dfa + +import ( + "fmt" + "testing" +) + +func TestNewSymbolPosition(t *testing.T) { + tests := []struct { + n uint16 + endMark bool + err bool + }{ + { + n: 0, + endMark: false, + err: true, + }, + { + n: 0, + endMark: true, + err: true, + }, + { + n: symbolPositionMin - 1, + endMark: false, + err: true, + }, + { + n: symbolPositionMin - 1, + endMark: true, + err: true, + }, + { + n: symbolPositionMin, + endMark: false, + }, + { + n: symbolPositionMin, + endMark: true, + }, + { + n: symbolPositionMax, + endMark: false, + }, + { + n: symbolPositionMax, + endMark: 
true, + }, + { + n: symbolPositionMax + 1, + endMark: false, + err: true, + }, + { + n: symbolPositionMax + 1, + endMark: true, + err: true, + }, + } + for i, tt := range tests { + t.Run(fmt.Sprintf("#%v n: %v, endMark: %v", i, tt.n, tt.endMark), func(t *testing.T) { + pos, err := newSymbolPosition(tt.n, tt.endMark) + if tt.err { + if err == nil { + t.Fatal("err is nil") + } + return + } + if err != nil { + t.Fatal(err) + } + n, endMark := pos.describe() + if n != tt.n || endMark != tt.endMark { + t.Errorf("unexpected symbol position: want: n: %v, endMark: %v, got: n: %v, endMark: %v", tt.n, tt.endMark, n, endMark) + } + }) + } +} diff --git a/grammar/lexical/dfa/tree.go b/grammar/lexical/dfa/tree.go new file mode 100644 index 0000000..cd6081e --- /dev/null +++ b/grammar/lexical/dfa/tree.go @@ -0,0 +1,567 @@ +package dfa + +import ( + "fmt" + "io" + "sort" + + "github.com/nihei9/vartan/grammar/lexical/parser" + spec "github.com/nihei9/vartan/spec/grammar" + "github.com/nihei9/vartan/utf8" +) + +type byteTree interface { + fmt.Stringer + children() (byteTree, byteTree) + nullable() bool + first() *symbolPositionSet + last() *symbolPositionSet + clone() byteTree +} + +var ( + _ byteTree = &symbolNode{} + _ byteTree = &endMarkerNode{} + _ byteTree = &concatNode{} + _ byteTree = &altNode{} + _ byteTree = &repeatNode{} + _ byteTree = &optionNode{} +) + +type byteRange struct { + from byte + to byte +} + +type symbolNode struct { + byteRange + pos symbolPosition + firstMemo *symbolPositionSet + lastMemo *symbolPositionSet +} + +func newSymbolNode(value byte) *symbolNode { + return &symbolNode{ + byteRange: byteRange{ + from: value, + to: value, + }, + pos: symbolPositionNil, + } +} + +func newRangeSymbolNode(from, to byte) *symbolNode { + return &symbolNode{ + byteRange: byteRange{ + from: from, + to: to, + }, + pos: symbolPositionNil, + } +} + +func (n *symbolNode) String() string { + return fmt.Sprintf("symbol: value: %v-%v, pos: %v", n.from, n.to, n.pos) +} + +func (n 
*symbolNode) children() (byteTree, byteTree) { + return nil, nil +} + +func (n *symbolNode) nullable() bool { + return false +} + +func (n *symbolNode) first() *symbolPositionSet { + if n.firstMemo == nil { + n.firstMemo = newSymbolPositionSet() + n.firstMemo.add(n.pos) + } + return n.firstMemo +} + +func (n *symbolNode) last() *symbolPositionSet { + if n.lastMemo == nil { + n.lastMemo = newSymbolPositionSet() + n.lastMemo.add(n.pos) + } + return n.lastMemo +} + +func (n *symbolNode) clone() byteTree { + return newRangeSymbolNode(n.from, n.to) +} + +type endMarkerNode struct { + id spec.LexModeKindID + pos symbolPosition + firstMemo *symbolPositionSet + lastMemo *symbolPositionSet +} + +func newEndMarkerNode(id spec.LexModeKindID) *endMarkerNode { + return &endMarkerNode{ + id: id, + pos: symbolPositionNil, + } +} + +func (n *endMarkerNode) String() string { + return fmt.Sprintf("end: pos: %v", n.pos) +} + +func (n *endMarkerNode) children() (byteTree, byteTree) { + return nil, nil +} + +func (n *endMarkerNode) nullable() bool { + return false +} + +func (n *endMarkerNode) first() *symbolPositionSet { + if n.firstMemo == nil { + n.firstMemo = newSymbolPositionSet() + n.firstMemo.add(n.pos) + } + return n.firstMemo +} + +func (n *endMarkerNode) last() *symbolPositionSet { + if n.lastMemo == nil { + n.lastMemo = newSymbolPositionSet() + n.lastMemo.add(n.pos) + } + return n.lastMemo +} + +func (n *endMarkerNode) clone() byteTree { + return newEndMarkerNode(n.id) +} + +type concatNode struct { + left byteTree + right byteTree + firstMemo *symbolPositionSet + lastMemo *symbolPositionSet +} + +func newConcatNode(left, right byteTree) *concatNode { + return &concatNode{ + left: left, + right: right, + } +} + +func (n *concatNode) String() string { + return "concat" +} + +func (n *concatNode) children() (byteTree, byteTree) { + return n.left, n.right +} + +func (n *concatNode) nullable() bool { + return n.left.nullable() && n.right.nullable() +} + +func (n *concatNode) 
first() *symbolPositionSet { + if n.firstMemo == nil { + n.firstMemo = newSymbolPositionSet() + n.firstMemo.merge(n.left.first()) + if n.left.nullable() { + n.firstMemo.merge(n.right.first()) + } + n.firstMemo.sortAndRemoveDuplicates() + } + return n.firstMemo +} + +func (n *concatNode) last() *symbolPositionSet { + if n.lastMemo == nil { + n.lastMemo = newSymbolPositionSet() + n.lastMemo.merge(n.right.last()) + if n.right.nullable() { + n.lastMemo.merge(n.left.last()) + } + n.lastMemo.sortAndRemoveDuplicates() + } + return n.lastMemo +} + +func (n *concatNode) clone() byteTree { + return newConcatNode(n.left.clone(), n.right.clone()) +} + +type altNode struct { + left byteTree + right byteTree + firstMemo *symbolPositionSet + lastMemo *symbolPositionSet +} + +func newAltNode(left, right byteTree) *altNode { + return &altNode{ + left: left, + right: right, + } +} + +func (n *altNode) String() string { + return "alt" +} + +func (n *altNode) children() (byteTree, byteTree) { + return n.left, n.right +} + +func (n *altNode) nullable() bool { + return n.left.nullable() || n.right.nullable() +} + +func (n *altNode) first() *symbolPositionSet { + if n.firstMemo == nil { + n.firstMemo = newSymbolPositionSet() + n.firstMemo.merge(n.left.first()) + n.firstMemo.merge(n.right.first()) + n.firstMemo.sortAndRemoveDuplicates() + } + return n.firstMemo +} + +func (n *altNode) last() *symbolPositionSet { + if n.lastMemo == nil { + n.lastMemo = newSymbolPositionSet() + n.lastMemo.merge(n.left.last()) + n.lastMemo.merge(n.right.last()) + n.lastMemo.sortAndRemoveDuplicates() + } + return n.lastMemo +} + +func (n *altNode) clone() byteTree { + return newAltNode(n.left.clone(), n.right.clone()) +} + +type repeatNode struct { + left byteTree + firstMemo *symbolPositionSet + lastMemo *symbolPositionSet +} + +func newRepeatNode(left byteTree) *repeatNode { + return &repeatNode{ + left: left, + } +} + +func (n *repeatNode) String() string { + return "repeat" +} + +func (n *repeatNode) 
children() (byteTree, byteTree) { + return n.left, nil +} + +func (n *repeatNode) nullable() bool { + return true +} + +func (n *repeatNode) first() *symbolPositionSet { + if n.firstMemo == nil { + n.firstMemo = newSymbolPositionSet() + n.firstMemo.merge(n.left.first()) + n.firstMemo.sortAndRemoveDuplicates() + } + return n.firstMemo +} + +func (n *repeatNode) last() *symbolPositionSet { + if n.lastMemo == nil { + n.lastMemo = newSymbolPositionSet() + n.lastMemo.merge(n.left.last()) + n.lastMemo.sortAndRemoveDuplicates() + } + return n.lastMemo +} + +func (n *repeatNode) clone() byteTree { + return newRepeatNode(n.left.clone()) +} + +type optionNode struct { + left byteTree + firstMemo *symbolPositionSet + lastMemo *symbolPositionSet +} + +func newOptionNode(left byteTree) *optionNode { + return &optionNode{ + left: left, + } +} + +func (n *optionNode) String() string { + return "option" +} + +func (n *optionNode) children() (byteTree, byteTree) { + return n.left, nil +} + +func (n *optionNode) nullable() bool { + return true +} + +func (n *optionNode) first() *symbolPositionSet { + if n.firstMemo == nil { + n.firstMemo = newSymbolPositionSet() + n.firstMemo.merge(n.left.first()) + n.firstMemo.sortAndRemoveDuplicates() + } + return n.firstMemo +} + +func (n *optionNode) last() *symbolPositionSet { + if n.lastMemo == nil { + n.lastMemo = newSymbolPositionSet() + n.lastMemo.merge(n.left.last()) + n.lastMemo.sortAndRemoveDuplicates() + } + return n.lastMemo +} + +func (n *optionNode) clone() byteTree { + return newOptionNode(n.left.clone()) +} + +type followTable map[symbolPosition]*symbolPositionSet + +func genFollowTable(root byteTree) followTable { + follow := followTable{} + calcFollow(follow, root) + return follow +} + +func calcFollow(follow followTable, ast byteTree) { + if ast == nil { + return + } + left, right := ast.children() + calcFollow(follow, left) + calcFollow(follow, right) + switch n := ast.(type) { + case *concatNode: + l, r := n.children() + for 
_, p := range l.last().set() { + if _, ok := follow[p]; !ok { + follow[p] = newSymbolPositionSet() + } + follow[p].merge(r.first()) + } + case *repeatNode: + for _, p := range n.last().set() { + if _, ok := follow[p]; !ok { + follow[p] = newSymbolPositionSet() + } + follow[p].merge(n.first()) + } + } +} + +func positionSymbols(node byteTree, n uint16) (uint16, error) { + if node == nil { + return n, nil + } + + l, r := node.children() + p := n + p, err := positionSymbols(l, p) + if err != nil { + return p, err + } + p, err = positionSymbols(r, p) + if err != nil { + return p, err + } + switch n := node.(type) { + case *symbolNode: + n.pos, err = newSymbolPosition(p, false) + if err != nil { + return p, err + } + p++ + case *endMarkerNode: + n.pos, err = newSymbolPosition(p, true) + if err != nil { + return p, err + } + p++ + } + node.first() + node.last() + return p, nil +} + +func concat(ts ...byteTree) byteTree { + nonNilNodes := []byteTree{} + for _, t := range ts { + if t == nil { + continue + } + nonNilNodes = append(nonNilNodes, t) + } + if len(nonNilNodes) <= 0 { + return nil + } + if len(nonNilNodes) == 1 { + return nonNilNodes[0] + } + concat := newConcatNode(nonNilNodes[0], nonNilNodes[1]) + for _, t := range nonNilNodes[2:] { + concat = newConcatNode(concat, t) + } + return concat +} + +func oneOf(ts ...byteTree) byteTree { + nonNilNodes := []byteTree{} + for _, t := range ts { + if t == nil { + continue + } + nonNilNodes = append(nonNilNodes, t) + } + if len(nonNilNodes) <= 0 { + return nil + } + if len(nonNilNodes) == 1 { + return nonNilNodes[0] + } + alt := newAltNode(nonNilNodes[0], nonNilNodes[1]) + for _, t := range nonNilNodes[2:] { + alt = newAltNode(alt, t) + } + return alt +} + +//nolint:unused +func printByteTree(w io.Writer, t byteTree, ruledLine string, childRuledLinePrefix string, withAttrs bool) { + if t == nil { + return + } + fmt.Fprintf(w, "%v%v", ruledLine, t) + if withAttrs { + fmt.Fprintf(w, ", nullable: %v, first: %v, last: %v", 
t.nullable(), t.first(), t.last()) + } + fmt.Fprintf(w, "\n") + left, right := t.children() + children := []byteTree{} + if left != nil { + children = append(children, left) + } + if right != nil { + children = append(children, right) + } + num := len(children) + for i, child := range children { + line := "└─ " + if num > 1 { + if i == 0 { + line = "├─ " + } else if i < num-1 { + line = "│ " + } + } + prefix := "│ " + if i >= num-1 { + prefix = " " + } + printByteTree(w, child, childRuledLinePrefix+line, childRuledLinePrefix+prefix, withAttrs) + } +} + +func ConvertCPTreeToByteTree(cpTrees map[spec.LexModeKindID]parser.CPTree) (byteTree, *symbolTable, error) { + var ids []spec.LexModeKindID + for id := range cpTrees { + ids = append(ids, id) + } + sort.Slice(ids, func(i, j int) bool { + return ids[i] < ids[j] + }) + + var bt byteTree + for _, id := range ids { + cpTree := cpTrees[id] + t, err := convCPTreeToByteTree(cpTree) + if err != nil { + return nil, nil, err + } + bt = oneOf(bt, concat(t, newEndMarkerNode(id))) + } + _, err := positionSymbols(bt, symbolPositionMin) + if err != nil { + return nil, nil, err + } + + return bt, genSymbolTable(bt), nil +} + +func convCPTreeToByteTree(cpTree parser.CPTree) (byteTree, error) { + if from, to, ok := cpTree.Range(); ok { + bs, err := utf8.GenCharBlocks(from, to) + if err != nil { + return nil, err + } + var a byteTree + for _, b := range bs { + var c byteTree + for i := 0; i < len(b.From); i++ { + c = concat(c, newRangeSymbolNode(b.From[i], b.To[i])) + } + a = oneOf(a, c) + } + return a, nil + } + + if tree, ok := cpTree.Repeatable(); ok { + t, err := convCPTreeToByteTree(tree) + if err != nil { + return nil, err + } + return newRepeatNode(t), nil + } + + if tree, ok := cpTree.Optional(); ok { + t, err := convCPTreeToByteTree(tree) + if err != nil { + return nil, err + } + return newOptionNode(t), nil + } + + if left, right, ok := cpTree.Concatenation(); ok { + l, err := convCPTreeToByteTree(left) + if err != nil { + 
return nil, err + } + r, err := convCPTreeToByteTree(right) + if err != nil { + return nil, err + } + return newConcatNode(l, r), nil + } + + if left, right, ok := cpTree.Alternatives(); ok { + l, err := convCPTreeToByteTree(left) + if err != nil { + return nil, err + } + r, err := convCPTreeToByteTree(right) + if err != nil { + return nil, err + } + return newAltNode(l, r), nil + } + + return nil, fmt.Errorf("invalid tree type: %T", cpTree) +} diff --git a/grammar/lexical/dfa/tree_test.go b/grammar/lexical/dfa/tree_test.go new file mode 100644 index 0000000..e0abe64 --- /dev/null +++ b/grammar/lexical/dfa/tree_test.go @@ -0,0 +1,257 @@ +package dfa + +import ( + "fmt" + "strings" + "testing" + + "github.com/nihei9/vartan/grammar/lexical/parser" + spec "github.com/nihei9/vartan/spec/grammar" +) + +func TestByteTree(t *testing.T) { + tests := []struct { + root byteTree + nullable bool + first *symbolPositionSet + last *symbolPositionSet + }{ + { + root: newSymbolNodeWithPos(0, 1), + nullable: false, + first: newSymbolPositionSet().add(1), + last: newSymbolPositionSet().add(1), + }, + { + root: newEndMarkerNodeWithPos(1, 1), + nullable: false, + first: newSymbolPositionSet().add(1), + last: newSymbolPositionSet().add(1), + }, + { + root: newConcatNode( + newSymbolNodeWithPos(0, 1), + newSymbolNodeWithPos(0, 2), + ), + nullable: false, + first: newSymbolPositionSet().add(1), + last: newSymbolPositionSet().add(2), + }, + { + root: newConcatNode( + newRepeatNode(newSymbolNodeWithPos(0, 1)), + newSymbolNodeWithPos(0, 2), + ), + nullable: false, + first: newSymbolPositionSet().add(1).add(2), + last: newSymbolPositionSet().add(2), + }, + { + root: newConcatNode( + newSymbolNodeWithPos(0, 1), + newRepeatNode(newSymbolNodeWithPos(0, 2)), + ), + nullable: false, + first: newSymbolPositionSet().add(1), + last: newSymbolPositionSet().add(1).add(2), + }, + { + root: newConcatNode( + newRepeatNode(newSymbolNodeWithPos(0, 1)), + newRepeatNode(newSymbolNodeWithPos(0, 2)), + ), + 
nullable: true, + first: newSymbolPositionSet().add(1).add(2), + last: newSymbolPositionSet().add(1).add(2), + }, + { + root: newAltNode( + newSymbolNodeWithPos(0, 1), + newSymbolNodeWithPos(0, 2), + ), + nullable: false, + first: newSymbolPositionSet().add(1).add(2), + last: newSymbolPositionSet().add(1).add(2), + }, + { + root: newAltNode( + newRepeatNode(newSymbolNodeWithPos(0, 1)), + newSymbolNodeWithPos(0, 2), + ), + nullable: true, + first: newSymbolPositionSet().add(1).add(2), + last: newSymbolPositionSet().add(1).add(2), + }, + { + root: newAltNode( + newSymbolNodeWithPos(0, 1), + newRepeatNode(newSymbolNodeWithPos(0, 2)), + ), + nullable: true, + first: newSymbolPositionSet().add(1).add(2), + last: newSymbolPositionSet().add(1).add(2), + }, + { + root: newAltNode( + newRepeatNode(newSymbolNodeWithPos(0, 1)), + newRepeatNode(newSymbolNodeWithPos(0, 2)), + ), + nullable: true, + first: newSymbolPositionSet().add(1).add(2), + last: newSymbolPositionSet().add(1).add(2), + }, + { + root: newRepeatNode(newSymbolNodeWithPos(0, 1)), + nullable: true, + first: newSymbolPositionSet().add(1), + last: newSymbolPositionSet().add(1), + }, + { + root: newOptionNode(newSymbolNodeWithPos(0, 1)), + nullable: true, + first: newSymbolPositionSet().add(1), + last: newSymbolPositionSet().add(1), + }, + } + for i, tt := range tests { + t.Run(fmt.Sprintf("#%v", i), func(t *testing.T) { + if tt.root.nullable() != tt.nullable { + t.Errorf("unexpected nullable attribute; want: %v, got: %v", tt.nullable, tt.root.nullable()) + } + if tt.first.hash() != tt.root.first().hash() { + t.Errorf("unexpected first positions attribute; want: %v, got: %v", tt.first, tt.root.first()) + } + if tt.last.hash() != tt.root.last().hash() { + t.Errorf("unexpected last positions attribute; want: %v, got: %v", tt.last, tt.root.last()) + } + }) + } +} + +func newSymbolNodeWithPos(v byte, pos symbolPosition) *symbolNode { + n := newSymbolNode(v) + n.pos = pos + return n +} + +func newEndMarkerNodeWithPos(id 
int, pos symbolPosition) *endMarkerNode { + n := newEndMarkerNode(spec.LexModeKindID(id)) + n.pos = pos + return n +} + +func TestFollowAndSymbolTable(t *testing.T) { + symPos := func(n uint16) symbolPosition { + pos, err := newSymbolPosition(n, false) + if err != nil { + panic(err) + } + return pos + } + + endPos := func(n uint16) symbolPosition { + pos, err := newSymbolPosition(n, true) + if err != nil { + panic(err) + } + return pos + } + + p := parser.NewParser(spec.LexKindName("test"), strings.NewReader("(a|b)*abb")) + cpt, err := p.Parse() + if err != nil { + t.Fatal(err) + } + + bt, symTab, err := ConvertCPTreeToByteTree(map[spec.LexModeKindID]parser.CPTree{ + spec.LexModeKindIDMin: cpt, + }) + if err != nil { + t.Fatal(err) + } + + { + followTab := genFollowTable(bt) + if followTab == nil { + t.Fatal("follow table is nil") + } + expectedFollowTab := followTable{ + 1: newSymbolPositionSet().add(symPos(1)).add(symPos(2)).add(symPos(3)), + 2: newSymbolPositionSet().add(symPos(1)).add(symPos(2)).add(symPos(3)), + 3: newSymbolPositionSet().add(symPos(4)), + 4: newSymbolPositionSet().add(symPos(5)), + 5: newSymbolPositionSet().add(endPos(6)), + } + testFollowTable(t, expectedFollowTab, followTab) + } + + { + entry := func(v byte) byteRange { + return byteRange{ + from: v, + to: v, + } + } + + expectedSymTab := &symbolTable{ + symPos2Byte: map[symbolPosition]byteRange{ + symPos(1): entry(byte('a')), + symPos(2): entry(byte('b')), + symPos(3): entry(byte('a')), + symPos(4): entry(byte('b')), + symPos(5): entry(byte('b')), + }, + endPos2ID: map[symbolPosition]spec.LexModeKindID{ + endPos(6): 1, + }, + } + testSymbolTable(t, expectedSymTab, symTab) + } +} + +func testFollowTable(t *testing.T, expected, actual followTable) { + if len(actual) != len(expected) { + t.Errorf("unexpected number of the follow table entries; want: %v, got: %v", len(expected), len(actual)) + } + for ePos, eSet := range expected { + aSet, ok := actual[ePos] + if !ok { + t.Fatalf("follow entry 
is not found: position: %v, follow: %v", ePos, eSet) + } + if aSet.hash() != eSet.hash() { + t.Fatalf("follow entry of position %v is mismatched: want: %v, got: %v", ePos, eSet, aSet) + } + } +} + +func testSymbolTable(t *testing.T, expected, actual *symbolTable) { + t.Helper() + + if len(actual.symPos2Byte) != len(expected.symPos2Byte) { + t.Errorf("unexpected symPos2Byte entries: want: %v entries, got: %v entries", len(expected.symPos2Byte), len(actual.symPos2Byte)) + } + for ePos, eByte := range expected.symPos2Byte { + byte, ok := actual.symPos2Byte[ePos] + if !ok { + t.Errorf("a symbol position entry is not found: %v -> %v", ePos, eByte) + continue + } + if byte.from != eByte.from || byte.to != eByte.to { + t.Errorf("unexpected symbol position entry: want: %v -> %v, got: %v -> %v", ePos, eByte, ePos, byte) + } + } + + if len(actual.endPos2ID) != len(expected.endPos2ID) { + t.Errorf("unexpected endPos2ID entries: want: %v entries, got: %v entries", len(expected.endPos2ID), len(actual.endPos2ID)) + } + for ePos, eID := range expected.endPos2ID { + id, ok := actual.endPos2ID[ePos] + if !ok { + t.Errorf("an end position entry is not found: %v -> %v", ePos, eID) + continue + } + if id != eID { + t.Errorf("unexpected end position entry: want: %v -> %v, got: %v -> %v", ePos, eID, ePos, id) + } + } +} diff --git a/grammar/lexical/entry.go b/grammar/lexical/entry.go new file mode 100644 index 0000000..c9f8691 --- /dev/null +++ b/grammar/lexical/entry.go @@ -0,0 +1,171 @@ +package lexical + +import ( + "fmt" + "sort" + "strings" + + spec "github.com/nihei9/vartan/spec/grammar" +) + +type LexEntry struct { + Kind spec.LexKindName + Pattern string + Modes []spec.LexModeName + Push spec.LexModeName + Pop bool + Fragment bool +} + +type LexSpec struct { + Entries []*LexEntry +} + +func (s *LexSpec) Validate() error { + if len(s.Entries) <= 0 { + return fmt.Errorf("the lexical specification must have at least one entry") + } + { + ks := map[string]struct{}{} + fks := 
map[string]struct{}{} + for _, e := range s.Entries { + // Allow duplicate names between fragments and non-fragments. + if e.Fragment { + if _, exist := fks[e.Kind.String()]; exist { + return fmt.Errorf("kinds `%v` are duplicates", e.Kind) + } + fks[e.Kind.String()] = struct{}{} + } else { + if _, exist := ks[e.Kind.String()]; exist { + return fmt.Errorf("kinds `%v` are duplicates", e.Kind) + } + ks[e.Kind.String()] = struct{}{} + } + } + } + { + kinds := []string{} + modes := []string{ + spec.LexModeNameDefault.String(), // This is a predefined mode. + } + for _, e := range s.Entries { + if e.Fragment { + continue + } + + kinds = append(kinds, e.Kind.String()) + + for _, m := range e.Modes { + modes = append(modes, m.String()) + } + } + + kindErrs := findSpellingInconsistenciesErrors(kinds, nil) + modeErrs := findSpellingInconsistenciesErrors(modes, func(ids []string) error { + if SnakeCaseToUpperCamelCase(ids[0]) == SnakeCaseToUpperCamelCase(spec.LexModeNameDefault.String()) { + var b strings.Builder + fmt.Fprintf(&b, "%+v", ids[0]) + for _, id := range ids[1:] { + fmt.Fprintf(&b, ", %+v", id) + } + return fmt.Errorf("these identifiers are treated as the same. please use the same spelling as predefined '%v': %v", spec.LexModeNameDefault, b.String()) + } + return nil + }) + errs := append(kindErrs, modeErrs...) 
+		if len(errs) > 0 {
+			var b strings.Builder
+			fmt.Fprintf(&b, "%v", errs[0])
+			for _, err := range errs[1:] {
+				fmt.Fprintf(&b, "\n%v", err)
+			}
+			return fmt.Errorf("%s", b.String()) // "%s" keeps user-derived text from being parsed as a format string (go vet printf).
+		}
+	}
+
+	return nil
+}
+
+func findSpellingInconsistenciesErrors(ids []string, hook func(ids []string) error) []error {
+	duplicated := FindSpellingInconsistencies(ids)
+	if len(duplicated) == 0 {
+		return nil
+	}
+
+	var errs []error
+	for _, dup := range duplicated {
+		if hook != nil {
+			err := hook(dup)
+			if err != nil {
+				errs = append(errs, err)
+				continue
+			}
+		}
+
+		var b strings.Builder
+		fmt.Fprintf(&b, "%+v", dup[0])
+		for _, id := range dup[1:] {
+			fmt.Fprintf(&b, ", %+v", id)
+		}
+		err := fmt.Errorf("these identifiers are treated as the same. please use the same spelling: %v", b.String())
+		errs = append(errs, err)
+	}
+
+	return errs
+}
+
+// FindSpellingInconsistencies finds spelling inconsistencies in identifiers. The identifiers are considered to be the same
+// if they are spelled the same when expressed in UpperCamelCase. For example, `left_paren` and `LeftParen` are spelled the same
+// in UpperCamelCase. Thus they are considered to be a spelling inconsistency.
+func FindSpellingInconsistencies(ids []string) [][]string { + m := map[string][]string{} + for _, id := range removeDuplicates(ids) { + c := SnakeCaseToUpperCamelCase(id) + m[c] = append(m[c], id) + } + + var duplicated [][]string + for _, camels := range m { + if len(camels) == 1 { + continue + } + duplicated = append(duplicated, camels) + } + + for _, dup := range duplicated { + sort.Slice(dup, func(i, j int) bool { + return dup[i] < dup[j] + }) + } + sort.Slice(duplicated, func(i, j int) bool { + return duplicated[i][0] < duplicated[j][0] + }) + + return duplicated +} + +func removeDuplicates(s []string) []string { + m := map[string]struct{}{} + for _, v := range s { + m[v] = struct{}{} + } + + var unique []string + for v := range m { + unique = append(unique, v) + } + + return unique +} + +func SnakeCaseToUpperCamelCase(snake string) string { + elems := strings.Split(snake, "_") + for i, e := range elems { + if len(e) == 0 { + continue + } + elems[i] = strings.ToUpper(string(e[0])) + e[1:] + } + + return strings.Join(elems, "") +} diff --git a/grammar/lexical/parser/error.go b/grammar/lexical/parser/error.go new file mode 100644 index 0000000..be81da4 --- /dev/null +++ b/grammar/lexical/parser/error.go @@ -0,0 +1,36 @@ +package parser + +import "fmt" + +var ( + ParseErr = fmt.Errorf("parse error") + + // lexical errors + synErrIncompletedEscSeq = fmt.Errorf("incompleted escape sequence; unexpected EOF following \\") + synErrInvalidEscSeq = fmt.Errorf("invalid escape sequence") + synErrInvalidCodePoint = fmt.Errorf("code points must consist of just 4 or 6 hex digits") + synErrCharPropInvalidSymbol = fmt.Errorf("invalid character property symbol") + SynErrFragmentInvalidSymbol = fmt.Errorf("invalid fragment symbol") + + // syntax errors + synErrUnexpectedToken = fmt.Errorf("unexpected token") + synErrNullPattern = fmt.Errorf("a pattern must be a non-empty byte sequence") + synErrUnmatchablePattern = fmt.Errorf("a pattern cannot match any characters") + 
synErrAltLackOfOperand = fmt.Errorf("an alternation expression must have operands") + synErrRepNoTarget = fmt.Errorf("a repeat expression must have an operand") + synErrGroupNoElem = fmt.Errorf("a grouping expression must include at least one character") + synErrGroupUnclosed = fmt.Errorf("unclosed grouping expression") + synErrGroupNoInitiator = fmt.Errorf(") needs preceding (") + synErrGroupInvalidForm = fmt.Errorf("invalid grouping expression") + synErrBExpNoElem = fmt.Errorf("a bracket expression must include at least one character") + synErrBExpUnclosed = fmt.Errorf("unclosed bracket expression") + synErrBExpInvalidForm = fmt.Errorf("invalid bracket expression") + synErrRangeInvalidOrder = fmt.Errorf("a range expression with invalid order") + synErrRangePropIsUnavailable = fmt.Errorf("a property expression is unavailable in a range expression") + synErrRangeInvalidForm = fmt.Errorf("invalid range expression") + synErrCPExpInvalidForm = fmt.Errorf("invalid code point expression") + synErrCPExpOutOfRange = fmt.Errorf("a code point must be between U+0000 to U+10FFFF") + synErrCharPropExpInvalidForm = fmt.Errorf("invalid character property expression") + synErrCharPropUnsupported = fmt.Errorf("unsupported character property") + synErrFragmentExpInvalidForm = fmt.Errorf("invalid fragment expression") +) diff --git a/grammar/lexical/parser/fragment.go b/grammar/lexical/parser/fragment.go new file mode 100644 index 0000000..fc6f16b --- /dev/null +++ b/grammar/lexical/parser/fragment.go @@ -0,0 +1,72 @@ +package parser + +import ( + "fmt" + + spec "github.com/nihei9/vartan/spec/grammar" +) + +type incompleteFragment struct { + kind spec.LexKindName + root *rootNode +} + +func CompleteFragments(fragments map[spec.LexKindName]CPTree) error { + if len(fragments) == 0 { + return nil + } + + completeFragments := map[spec.LexKindName]CPTree{} + incompleteFragments := []*incompleteFragment{} + for kind, tree := range fragments { + root, ok := tree.(*rootNode) + if !ok { + 
return fmt.Errorf("CompleteFragments can take only *rootNode: %T", tree) + } + if root.incomplete() { + incompleteFragments = append(incompleteFragments, &incompleteFragment{ + kind: kind, + root: root, + }) + } else { + completeFragments[kind] = root + } + } + for len(incompleteFragments) > 0 { + lastIncompCount := len(incompleteFragments) + remainingFragments := []*incompleteFragment{} + for _, e := range incompleteFragments { + complete, err := ApplyFragments(e.root, completeFragments) + if err != nil { + return err + } + if !complete { + remainingFragments = append(remainingFragments, e) + } else { + completeFragments[e.kind] = e.root + } + } + incompleteFragments = remainingFragments + if len(incompleteFragments) == lastIncompCount { + return ParseErr + } + } + + return nil +} + +func ApplyFragments(t CPTree, fragments map[spec.LexKindName]CPTree) (bool, error) { + root, ok := t.(*rootNode) + if !ok { + return false, fmt.Errorf("ApplyFragments can take only *rootNode type: %T", t) + } + + for name, frag := range fragments { + err := root.applyFragment(name, frag) + if err != nil { + return false, err + } + } + + return !root.incomplete(), nil +} diff --git a/grammar/lexical/parser/lexer.go b/grammar/lexical/parser/lexer.go new file mode 100644 index 0000000..3861825 --- /dev/null +++ b/grammar/lexical/parser/lexer.go @@ -0,0 +1,594 @@ +package parser + +import ( + "bufio" + "fmt" + "io" + "strings" +) + +type tokenKind string + +const ( + tokenKindChar tokenKind = "char" + tokenKindAnyChar tokenKind = "." + tokenKindRepeat tokenKind = "*" + tokenKindRepeatOneOrMore tokenKind = "+" + tokenKindOption tokenKind = "?" 
+ tokenKindAlt tokenKind = "|" + tokenKindGroupOpen tokenKind = "(" + tokenKindGroupClose tokenKind = ")" + tokenKindBExpOpen tokenKind = "[" + tokenKindInverseBExpOpen tokenKind = "[^" + tokenKindBExpClose tokenKind = "]" + tokenKindCharRange tokenKind = "-" + tokenKindCodePointLeader tokenKind = "\\u" + tokenKindCharPropLeader tokenKind = "\\p" + tokenKindFragmentLeader tokenKind = "\\f" + tokenKindLBrace tokenKind = "{" + tokenKindRBrace tokenKind = "}" + tokenKindEqual tokenKind = "=" + tokenKindCodePoint tokenKind = "code point" + tokenKindCharPropSymbol tokenKind = "character property symbol" + tokenKindFragmentSymbol tokenKind = "fragment symbol" + tokenKindEOF tokenKind = "eof" +) + +type token struct { + kind tokenKind + char rune + propSymbol string + codePoint string + fragmentSymbol string +} + +const nullChar = '\u0000' + +func newToken(kind tokenKind, char rune) *token { + return &token{ + kind: kind, + char: char, + } +} + +func newCodePointToken(codePoint string) *token { + return &token{ + kind: tokenKindCodePoint, + codePoint: codePoint, + } +} + +func newCharPropSymbolToken(propSymbol string) *token { + return &token{ + kind: tokenKindCharPropSymbol, + propSymbol: propSymbol, + } +} + +func newFragmentSymbolToken(fragmentSymbol string) *token { + return &token{ + kind: tokenKindFragmentSymbol, + fragmentSymbol: fragmentSymbol, + } +} + +type lexerMode string + +const ( + lexerModeDefault lexerMode = "default" + lexerModeBExp lexerMode = "bracket expression" + lexerModeCPExp lexerMode = "code point expression" + lexerModeCharPropExp lexerMode = "character property expression" + lexerModeFragmentExp lexerMode = "fragment expression" +) + +type lexerModeStack struct { + stack []lexerMode +} + +func newLexerModeStack() *lexerModeStack { + return &lexerModeStack{ + stack: []lexerMode{ + lexerModeDefault, + }, + } +} + +func (s *lexerModeStack) top() lexerMode { + return s.stack[len(s.stack)-1] +} + +func (s *lexerModeStack) push(m lexerMode) { + 
s.stack = append(s.stack, m) +} + +func (s *lexerModeStack) pop() { + s.stack = s.stack[:len(s.stack)-1] +} + +type rangeState string + +// [a-z] +// ^^^^ +// |||`-- ready +// ||`-- expect range terminator +// |`-- read range initiator +// `-- ready +const ( + rangeStateReady rangeState = "ready" + rangeStateReadRangeInitiator rangeState = "read range initiator" + rangeStateExpectRangeTerminator rangeState = "expect range terminator" +) + +type lexer struct { + src *bufio.Reader + peekChar2 rune + peekEOF2 bool + peekChar1 rune + peekEOF1 bool + lastChar rune + reachedEOF bool + prevChar1 rune + prevEOF1 bool + prevChar2 rune + pervEOF2 bool + modeStack *lexerModeStack + rangeState rangeState + + errCause error + errDetail string +} + +func newLexer(src io.Reader) *lexer { + return &lexer{ + src: bufio.NewReader(src), + peekChar2: nullChar, + peekEOF2: false, + peekChar1: nullChar, + peekEOF1: false, + lastChar: nullChar, + reachedEOF: false, + prevChar1: nullChar, + prevEOF1: false, + prevChar2: nullChar, + pervEOF2: false, + modeStack: newLexerModeStack(), + rangeState: rangeStateReady, + } +} + +func (l *lexer) error() (string, error) { + return l.errDetail, l.errCause +} + +func (l *lexer) next() (*token, error) { + c, eof, err := l.read() + if err != nil { + return nil, err + } + if eof { + return newToken(tokenKindEOF, nullChar), nil + } + + switch l.modeStack.top() { + case lexerModeBExp: + tok, err := l.nextInBExp(c) + if err != nil { + return nil, err + } + if tok.kind == tokenKindChar || tok.kind == tokenKindCodePointLeader || tok.kind == tokenKindCharPropLeader { + switch l.rangeState { + case rangeStateReady: + l.rangeState = rangeStateReadRangeInitiator + case rangeStateExpectRangeTerminator: + l.rangeState = rangeStateReady + } + } + switch tok.kind { + case tokenKindBExpClose: + l.modeStack.pop() + case tokenKindCharRange: + l.rangeState = rangeStateExpectRangeTerminator + case tokenKindCodePointLeader: + l.modeStack.push(lexerModeCPExp) + case 
tokenKindCharPropLeader: + l.modeStack.push(lexerModeCharPropExp) + } + return tok, nil + case lexerModeCPExp: + tok, err := l.nextInCodePoint(c) + if err != nil { + return nil, err + } + switch tok.kind { + case tokenKindRBrace: + l.modeStack.pop() + } + return tok, nil + case lexerModeCharPropExp: + tok, err := l.nextInCharProp(c) + if err != nil { + return nil, err + } + switch tok.kind { + case tokenKindRBrace: + l.modeStack.pop() + } + return tok, nil + case lexerModeFragmentExp: + tok, err := l.nextInFragment(c) + if err != nil { + return nil, err + } + switch tok.kind { + case tokenKindRBrace: + l.modeStack.pop() + } + return tok, nil + default: + tok, err := l.nextInDefault(c) + if err != nil { + return nil, err + } + switch tok.kind { + case tokenKindBExpOpen: + l.modeStack.push(lexerModeBExp) + l.rangeState = rangeStateReady + case tokenKindInverseBExpOpen: + l.modeStack.push(lexerModeBExp) + l.rangeState = rangeStateReady + case tokenKindCodePointLeader: + l.modeStack.push(lexerModeCPExp) + case tokenKindCharPropLeader: + l.modeStack.push(lexerModeCharPropExp) + case tokenKindFragmentLeader: + l.modeStack.push(lexerModeFragmentExp) + } + return tok, nil + } +} + +func (l *lexer) nextInDefault(c rune) (*token, error) { + switch c { + case '*': + return newToken(tokenKindRepeat, nullChar), nil + case '+': + return newToken(tokenKindRepeatOneOrMore, nullChar), nil + case '?': + return newToken(tokenKindOption, nullChar), nil + case '.': + return newToken(tokenKindAnyChar, nullChar), nil + case '|': + return newToken(tokenKindAlt, nullChar), nil + case '(': + return newToken(tokenKindGroupOpen, nullChar), nil + case ')': + return newToken(tokenKindGroupClose, nullChar), nil + case '[': + c1, eof, err := l.read() + if err != nil { + return nil, err + } + if eof { + err := l.restore() + if err != nil { + return nil, err + } + return newToken(tokenKindBExpOpen, nullChar), nil + } + if c1 != '^' { + err := l.restore() + if err != nil { + return nil, err + } + 
return newToken(tokenKindBExpOpen, nullChar), nil + } + c2, eof, err := l.read() + if err != nil { + return nil, err + } + if eof { + err := l.restore() + if err != nil { + return nil, err + } + return newToken(tokenKindInverseBExpOpen, nullChar), nil + } + if c2 != ']' { + err := l.restore() + if err != nil { + return nil, err + } + return newToken(tokenKindInverseBExpOpen, nullChar), nil + } + err = l.restore() + if err != nil { + return nil, err + } + err = l.restore() + if err != nil { + return nil, err + } + return newToken(tokenKindBExpOpen, nullChar), nil + case '\\': + c, eof, err := l.read() + if err != nil { + return nil, err + } + if eof { + l.errCause = synErrIncompletedEscSeq + return nil, ParseErr + } + if c == 'u' { + return newToken(tokenKindCodePointLeader, nullChar), nil + } + if c == 'p' { + return newToken(tokenKindCharPropLeader, nullChar), nil + } + if c == 'f' { + return newToken(tokenKindFragmentLeader, nullChar), nil + } + if c == '\\' || c == '.' || c == '*' || c == '+' || c == '?' 
|| c == '|' || c == '(' || c == ')' || c == '[' || c == ']' { + return newToken(tokenKindChar, c), nil + } + l.errCause = synErrInvalidEscSeq + l.errDetail = fmt.Sprintf("\\%v is not supported", string(c)) + return nil, ParseErr + default: + return newToken(tokenKindChar, c), nil + } +} + +func (l *lexer) nextInBExp(c rune) (*token, error) { + switch c { + case '-': + if l.rangeState != rangeStateReadRangeInitiator { + return newToken(tokenKindChar, c), nil + } + c1, eof, err := l.read() + if err != nil { + return nil, err + } + if eof { + err := l.restore() + if err != nil { + return nil, err + } + return newToken(tokenKindChar, c), nil + } + if c1 != ']' { + err := l.restore() + if err != nil { + return nil, err + } + return newToken(tokenKindCharRange, nullChar), nil + } + err = l.restore() + if err != nil { + return nil, err + } + return newToken(tokenKindChar, c), nil + case ']': + return newToken(tokenKindBExpClose, nullChar), nil + case '\\': + c, eof, err := l.read() + if err != nil { + return nil, err + } + if eof { + l.errCause = synErrIncompletedEscSeq + return nil, ParseErr + } + if c == 'u' { + return newToken(tokenKindCodePointLeader, nullChar), nil + } + if c == 'p' { + return newToken(tokenKindCharPropLeader, nullChar), nil + } + if c == '\\' || c == '^' || c == '-' || c == ']' { + return newToken(tokenKindChar, c), nil + } + l.errCause = synErrInvalidEscSeq + l.errDetail = fmt.Sprintf("\\%v is not supported in a bracket expression", string(c)) + return nil, ParseErr + default: + return newToken(tokenKindChar, c), nil + } +} + +func (l *lexer) nextInCodePoint(c rune) (*token, error) { + switch c { + case '{': + return newToken(tokenKindLBrace, nullChar), nil + case '}': + return newToken(tokenKindRBrace, nullChar), nil + default: + if !isHexDigit(c) { + l.errCause = synErrInvalidCodePoint + return nil, ParseErr + } + var b strings.Builder + fmt.Fprint(&b, string(c)) + n := 1 + for { + c, eof, err := l.read() + if err != nil { + return nil, err + } + 
+			if eof {
+				err := l.restore()
+				if err != nil {
+					return nil, err
+				}
+				break
+			}
+			if c == '}' {
+				err := l.restore()
+				if err != nil {
+					return nil, err
+				}
+				break
+			}
+			if !isHexDigit(c) || n >= 6 {
+				l.errCause = synErrInvalidCodePoint
+				return nil, ParseErr
+			}
+			fmt.Fprint(&b, string(c))
+			n++
+		}
+		cp := b.String()
+		cpLen := len(cp)
+		if !(cpLen == 4 || cpLen == 6) {
+			l.errCause = synErrInvalidCodePoint
+			return nil, ParseErr
+		}
+		return newCodePointToken(b.String()), nil
+	}
+}
+
+func isHexDigit(c rune) bool {
+	if c >= '0' && c <= '9' || c >= 'A' && c <= 'F' || c >= 'a' && c <= 'f' { // hex digits only; A-Z/a-z would accept e.g. 'g'
+		return true
+	}
+	return false
+}
+
+func (l *lexer) nextInCharProp(c rune) (*token, error) {
+	switch c {
+	case '{':
+		return newToken(tokenKindLBrace, nullChar), nil
+	case '}':
+		return newToken(tokenKindRBrace, nullChar), nil
+	case '=':
+		return newToken(tokenKindEqual, nullChar), nil
+	default:
+		var b strings.Builder
+		fmt.Fprint(&b, string(c))
+		n := 1
+		for {
+			c, eof, err := l.read()
+			if err != nil {
+				return nil, err
+			}
+			if eof {
+				err := l.restore()
+				if err != nil {
+					return nil, err
+				}
+				break
+			}
+			if c == '}' || c == '=' {
+				err := l.restore()
+				if err != nil {
+					return nil, err
+				}
+				break
+			}
+			fmt.Fprint(&b, string(c))
+			n++
+		}
+		sym := strings.TrimSpace(b.String())
+		if len(sym) == 0 {
+			l.errCause = synErrCharPropInvalidSymbol
+			return nil, ParseErr
+		}
+		return newCharPropSymbolToken(sym), nil
+	}
+}
+
+func (l *lexer) nextInFragment(c rune) (*token, error) {
+	switch c {
+	case '{':
+		return newToken(tokenKindLBrace, nullChar), nil
+	case '}':
+		return newToken(tokenKindRBrace, nullChar), nil
+	default:
+		var b strings.Builder
+		fmt.Fprint(&b, string(c))
+		n := 1
+		for {
+			c, eof, err := l.read()
+			if err != nil {
+				return nil, err
+			}
+			if eof {
+				err := l.restore()
+				if err != nil {
+					return nil, err
+				}
+				break
+			}
+			if c == '}' {
+				err := l.restore()
+				if err != nil {
+					return nil, err
+				}
+				break
+			}
+			fmt.Fprint(&b, string(c))
+			n++
} + sym := strings.TrimSpace(b.String()) + if len(sym) == 0 { + l.errCause = SynErrFragmentInvalidSymbol + return nil, ParseErr + } + return newFragmentSymbolToken(sym), nil + } +} + +func (l *lexer) read() (rune, bool, error) { + if l.reachedEOF { + return l.lastChar, l.reachedEOF, nil + } + if l.peekChar1 != nullChar || l.peekEOF1 { + l.prevChar2 = l.prevChar1 + l.pervEOF2 = l.prevEOF1 + l.prevChar1 = l.lastChar + l.prevEOF1 = l.reachedEOF + l.lastChar = l.peekChar1 + l.reachedEOF = l.peekEOF1 + l.peekChar1 = l.peekChar2 + l.peekEOF1 = l.peekEOF2 + l.peekChar2 = nullChar + l.peekEOF2 = false + return l.lastChar, l.reachedEOF, nil + } + c, _, err := l.src.ReadRune() + if err != nil { + if err == io.EOF { + l.prevChar2 = l.prevChar1 + l.pervEOF2 = l.prevEOF1 + l.prevChar1 = l.lastChar + l.prevEOF1 = l.reachedEOF + l.lastChar = nullChar + l.reachedEOF = true + return l.lastChar, l.reachedEOF, nil + } + return nullChar, false, err + } + l.prevChar2 = l.prevChar1 + l.pervEOF2 = l.prevEOF1 + l.prevChar1 = l.lastChar + l.prevEOF1 = l.reachedEOF + l.lastChar = c + l.reachedEOF = false + return l.lastChar, l.reachedEOF, nil +} + +func (l *lexer) restore() error { + if l.lastChar == nullChar && !l.reachedEOF { + return fmt.Errorf("failed to call restore() because the last character is null") + } + l.peekChar2 = l.peekChar1 + l.peekEOF2 = l.peekEOF1 + l.peekChar1 = l.lastChar + l.peekEOF1 = l.reachedEOF + l.lastChar = l.prevChar1 + l.reachedEOF = l.prevEOF1 + l.prevChar1 = l.prevChar2 + l.prevEOF1 = l.pervEOF2 + l.prevChar2 = nullChar + l.pervEOF2 = false + return nil +} diff --git a/grammar/lexical/parser/lexer_test.go b/grammar/lexical/parser/lexer_test.go new file mode 100644 index 0000000..055466e --- /dev/null +++ b/grammar/lexical/parser/lexer_test.go @@ -0,0 +1,524 @@ +package parser + +import ( + "strings" + "testing" +) + +func TestLexer(t *testing.T) { + tests := []struct { + caption string + src string + tokens []*token + err error + }{ + { + caption: "lexer can 
recognize ordinaly characters", + src: "123abcいろは", + tokens: []*token{ + newToken(tokenKindChar, '1'), + newToken(tokenKindChar, '2'), + newToken(tokenKindChar, '3'), + newToken(tokenKindChar, 'a'), + newToken(tokenKindChar, 'b'), + newToken(tokenKindChar, 'c'), + newToken(tokenKindChar, 'い'), + newToken(tokenKindChar, 'ろ'), + newToken(tokenKindChar, 'は'), + newToken(tokenKindEOF, nullChar), + }, + }, + { + caption: "lexer can recognize the special characters in default mode", + src: ".*+?|()[\\u", + tokens: []*token{ + newToken(tokenKindAnyChar, nullChar), + newToken(tokenKindRepeat, nullChar), + newToken(tokenKindRepeatOneOrMore, nullChar), + newToken(tokenKindOption, nullChar), + newToken(tokenKindAlt, nullChar), + newToken(tokenKindGroupOpen, nullChar), + newToken(tokenKindGroupClose, nullChar), + newToken(tokenKindBExpOpen, nullChar), + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindEOF, nullChar), + }, + }, + { + caption: "lexer can recognize the escape sequences in default mode", + src: "\\\\\\.\\*\\+\\?\\|\\(\\)\\[", + tokens: []*token{ + newToken(tokenKindChar, '\\'), + newToken(tokenKindChar, '.'), + newToken(tokenKindChar, '*'), + newToken(tokenKindChar, '+'), + newToken(tokenKindChar, '?'), + newToken(tokenKindChar, '|'), + newToken(tokenKindChar, '('), + newToken(tokenKindChar, ')'), + newToken(tokenKindChar, '['), + newToken(tokenKindEOF, nullChar), + }, + }, + { + caption: "], {, and } are treated as an ordinary character in default mode", + src: "]{}", + tokens: []*token{ + newToken(tokenKindChar, ']'), + newToken(tokenKindChar, '{'), + newToken(tokenKindChar, '}'), + newToken(tokenKindEOF, nullChar), + }, + }, + { + caption: "lexer can recognize the special characters in bracket expression mode", + src: "[a-z\\u{09AF}][^a-z\\u{09abcf}]", + tokens: []*token{ + newToken(tokenKindBExpOpen, nullChar), + newToken(tokenKindChar, 'a'), + newToken(tokenKindCharRange, nullChar), + newToken(tokenKindChar, 'z'), + 
newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("09AF"), + newToken(tokenKindRBrace, nullChar), + newToken(tokenKindBExpClose, nullChar), + newToken(tokenKindInverseBExpOpen, nullChar), + newToken(tokenKindChar, 'a'), + newToken(tokenKindCharRange, nullChar), + newToken(tokenKindChar, 'z'), + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("09abcf"), + newToken(tokenKindRBrace, nullChar), + newToken(tokenKindBExpClose, nullChar), + newToken(tokenKindEOF, nullChar), + }, + }, + { + caption: "lexer can recognize the escape sequences in bracket expression mode", + src: "[\\^a\\-z]", + tokens: []*token{ + newToken(tokenKindBExpOpen, nullChar), + newToken(tokenKindChar, '^'), + newToken(tokenKindChar, 'a'), + newToken(tokenKindChar, '-'), + newToken(tokenKindChar, 'z'), + newToken(tokenKindBExpClose, nullChar), + newToken(tokenKindEOF, nullChar), + }, + }, + { + caption: "in a bracket expression, the special characters are also handled as normal characters", + src: "[\\\\.*+?|()[", + tokens: []*token{ + newToken(tokenKindBExpOpen, nullChar), + newToken(tokenKindChar, '\\'), + newToken(tokenKindChar, '.'), + newToken(tokenKindChar, '*'), + newToken(tokenKindChar, '+'), + newToken(tokenKindChar, '?'), + newToken(tokenKindChar, '|'), + newToken(tokenKindChar, '('), + newToken(tokenKindChar, ')'), + newToken(tokenKindChar, '['), + newToken(tokenKindEOF, nullChar), + }, + }, + { + caption: "hyphen symbols that appear in bracket expressions are handled as the character range symbol or ordinary characters", + // [...-...][...-][-...][-] + // ~~~~~~~ ~ ~ ~ + // ^ ^ ^ ^ + // | | | `-- Ordinary Character (b) + // | | `-- Ordinary Character (b) + // | `-- Ordinary Character (b) + // `-- Character Range (a) + // + // a. *-* is handled as a character-range expression. + // b. *-, -*, or - are handled as ordinary characters. 
+ src: "[a-z][a-][-z][-][--][---][^a-z][^a-][^-z][^-][^--][^---]", + tokens: []*token{ + newToken(tokenKindBExpOpen, nullChar), + newToken(tokenKindChar, 'a'), + newToken(tokenKindCharRange, nullChar), + newToken(tokenKindChar, 'z'), + newToken(tokenKindBExpClose, nullChar), + newToken(tokenKindBExpOpen, nullChar), + newToken(tokenKindChar, 'a'), + newToken(tokenKindChar, '-'), + newToken(tokenKindBExpClose, nullChar), + newToken(tokenKindBExpOpen, nullChar), + newToken(tokenKindChar, '-'), + newToken(tokenKindChar, 'z'), + newToken(tokenKindBExpClose, nullChar), + newToken(tokenKindBExpOpen, nullChar), + newToken(tokenKindChar, '-'), + newToken(tokenKindBExpClose, nullChar), + newToken(tokenKindBExpOpen, nullChar), + newToken(tokenKindChar, '-'), + newToken(tokenKindChar, '-'), + newToken(tokenKindBExpClose, nullChar), + newToken(tokenKindBExpOpen, nullChar), + newToken(tokenKindChar, '-'), + newToken(tokenKindCharRange, nullChar), + newToken(tokenKindChar, '-'), + newToken(tokenKindBExpClose, nullChar), + + newToken(tokenKindInverseBExpOpen, nullChar), + newToken(tokenKindChar, 'a'), + newToken(tokenKindCharRange, nullChar), + newToken(tokenKindChar, 'z'), + newToken(tokenKindBExpClose, nullChar), + newToken(tokenKindInverseBExpOpen, nullChar), + newToken(tokenKindChar, 'a'), + newToken(tokenKindChar, '-'), + newToken(tokenKindBExpClose, nullChar), + newToken(tokenKindInverseBExpOpen, nullChar), + newToken(tokenKindChar, '-'), + newToken(tokenKindChar, 'z'), + newToken(tokenKindBExpClose, nullChar), + newToken(tokenKindInverseBExpOpen, nullChar), + newToken(tokenKindChar, '-'), + newToken(tokenKindBExpClose, nullChar), + newToken(tokenKindInverseBExpOpen, nullChar), + newToken(tokenKindChar, '-'), + newToken(tokenKindChar, '-'), + newToken(tokenKindBExpClose, nullChar), + newToken(tokenKindInverseBExpOpen, nullChar), + newToken(tokenKindChar, '-'), + newToken(tokenKindCharRange, nullChar), + newToken(tokenKindChar, '-'), + newToken(tokenKindBExpClose, nullChar), 
+ + newToken(tokenKindEOF, nullChar), + }, + }, + { + caption: "caret symbols that appear in bracket expressions are handled as the logical inverse symbol or ordinary characters", + // [^...^...][^] + // ~~ ~ ~~ + // ^ ^ ^^ + // | | |`-- Ordinary Character (c) + // | | `-- Bracket Expression + // | `-- Ordinary Character (b) + // `-- Inverse Bracket Expression (a) + // + // a. Bracket expressions that have a caret symbol at the beginning are handled as logical inverse expressions. + // b. caret symbols that appear as the second and the subsequent symbols are handled as ordinary symbols. + // c. When a bracket expression has just one symbol, a caret symbol at the beginning is handled as an ordinary character. + src: "[^^][^]", + tokens: []*token{ + newToken(tokenKindInverseBExpOpen, nullChar), + newToken(tokenKindChar, '^'), + newToken(tokenKindBExpClose, nullChar), + newToken(tokenKindBExpOpen, nullChar), + newToken(tokenKindChar, '^'), + newToken(tokenKindBExpClose, nullChar), + newToken(tokenKindEOF, nullChar), + }, + }, + { + caption: "lexer raises an error when an invalid escape sequence appears", + src: "\\@", + err: synErrInvalidEscSeq, + }, + { + caption: "lexer raises an error when the incomplete escape sequence (EOF following \\) appears", + src: "\\", + err: synErrIncompletedEscSeq, + }, + { + caption: "lexer raises an error when an invalid escape sequence appears", + src: "[\\@", + tokens: []*token{ + newToken(tokenKindBExpOpen, nullChar), + }, + err: synErrInvalidEscSeq, + }, + { + caption: "lexer raises an error when the incomplete escape sequence (EOF following \\) appears", + src: "[\\", + tokens: []*token{ + newToken(tokenKindBExpOpen, nullChar), + }, + err: synErrIncompletedEscSeq, + }, + { + caption: "lexer can recognize the special characters and code points in code point expression mode", + src: 
"\\u{0123}\\u{4567}\\u{89abcd}\\u{efAB}\\u{CDEF01}[\\u{0123}\\u{4567}\\u{89abcd}\\u{efAB}\\u{CDEF01}][^\\u{0123}\\u{4567}\\u{89abcd}\\u{efAB}\\u{CDEF01}]", + tokens: []*token{ + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("0123"), + newToken(tokenKindRBrace, nullChar), + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("4567"), + newToken(tokenKindRBrace, nullChar), + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("89abcd"), + newToken(tokenKindRBrace, nullChar), + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("efAB"), + newToken(tokenKindRBrace, nullChar), + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("CDEF01"), + newToken(tokenKindRBrace, nullChar), + + newToken(tokenKindBExpOpen, nullChar), + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("0123"), + newToken(tokenKindRBrace, nullChar), + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("4567"), + newToken(tokenKindRBrace, nullChar), + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("89abcd"), + newToken(tokenKindRBrace, nullChar), + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("efAB"), + newToken(tokenKindRBrace, nullChar), + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("CDEF01"), + newToken(tokenKindRBrace, nullChar), + newToken(tokenKindBExpClose, nullChar), + + newToken(tokenKindInverseBExpOpen, nullChar), + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("0123"), + 
newToken(tokenKindRBrace, nullChar), + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("4567"), + newToken(tokenKindRBrace, nullChar), + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("89abcd"), + newToken(tokenKindRBrace, nullChar), + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("efAB"), + newToken(tokenKindRBrace, nullChar), + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("CDEF01"), + newToken(tokenKindRBrace, nullChar), + newToken(tokenKindBExpClose, nullChar), + + newToken(tokenKindEOF, nullChar), + }, + }, + { + caption: "a one digit hex string isn't a valid code point", + src: "\\u{0", + tokens: []*token{ + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + }, + err: synErrInvalidCodePoint, + }, + { + caption: "a two digits hex string isn't a valid code point", + src: "\\u{01", + tokens: []*token{ + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + }, + err: synErrInvalidCodePoint, + }, + { + caption: "a three digits hex string isn't a valid code point", + src: "\\u{012", + tokens: []*token{ + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + }, + err: synErrInvalidCodePoint, + }, + { + caption: "a four digits hex string is a valid code point", + src: "\\u{0123}", + tokens: []*token{ + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("0123"), + newToken(tokenKindRBrace, nullChar), + }, + }, + { + caption: "a five digits hex string isn't a valid code point", + src: "\\u{01234", + tokens: []*token{ + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + }, + err: synErrInvalidCodePoint, + }, + { + caption: "a six digits hex string is a 
valid code point", + src: "\\u{012345}", + tokens: []*token{ + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCodePointToken("012345"), + newToken(tokenKindRBrace, nullChar), + }, + }, + { + caption: "a seven digits hex string isn't a valid code point", + src: "\\u{0123456", + tokens: []*token{ + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + }, + err: synErrInvalidCodePoint, + }, + { + caption: "a code point must be hex digits", + src: "\\u{g", + tokens: []*token{ + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + }, + err: synErrInvalidCodePoint, + }, + { + caption: "a code point must be hex digits", + src: "\\u{G", + tokens: []*token{ + newToken(tokenKindCodePointLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + }, + err: synErrInvalidCodePoint, + }, + { + caption: "lexer can recognize the special characters and symbols in character property expression mode", + src: "\\p{Letter}\\p{General_Category=Letter}[\\p{Letter}\\p{General_Category=Letter}][^\\p{Letter}\\p{General_Category=Letter}]", + tokens: []*token{ + newToken(tokenKindCharPropLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCharPropSymbolToken("Letter"), + newToken(tokenKindRBrace, nullChar), + newToken(tokenKindCharPropLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCharPropSymbolToken("General_Category"), + newToken(tokenKindEqual, nullChar), + newCharPropSymbolToken("Letter"), + newToken(tokenKindRBrace, nullChar), + + newToken(tokenKindBExpOpen, nullChar), + newToken(tokenKindCharPropLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCharPropSymbolToken("Letter"), + newToken(tokenKindRBrace, nullChar), + newToken(tokenKindCharPropLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCharPropSymbolToken("General_Category"), + newToken(tokenKindEqual, nullChar), + newCharPropSymbolToken("Letter"), + 
newToken(tokenKindRBrace, nullChar), + newToken(tokenKindBExpClose, nullChar), + + newToken(tokenKindInverseBExpOpen, nullChar), + newToken(tokenKindCharPropLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCharPropSymbolToken("Letter"), + newToken(tokenKindRBrace, nullChar), + newToken(tokenKindCharPropLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newCharPropSymbolToken("General_Category"), + newToken(tokenKindEqual, nullChar), + newCharPropSymbolToken("Letter"), + newToken(tokenKindRBrace, nullChar), + newToken(tokenKindBExpClose, nullChar), + + newToken(tokenKindEOF, nullChar), + }, + }, + { + caption: "lexer can recognize the special characters and symbols in fragment expression mode", + src: "\\f{integer}", + tokens: []*token{ + newToken(tokenKindFragmentLeader, nullChar), + newToken(tokenKindLBrace, nullChar), + newFragmentSymbolToken("integer"), + newToken(tokenKindRBrace, nullChar), + + newToken(tokenKindEOF, nullChar), + }, + }, + { + caption: "a fragment expression is not supported in a bracket expression", + src: "[\\f", + tokens: []*token{ + newToken(tokenKindBExpOpen, nullChar), + }, + err: synErrInvalidEscSeq, + }, + { + caption: "a fragment expression is not supported in an inverse bracket expression", + src: "[^\\f", + tokens: []*token{ + newToken(tokenKindInverseBExpOpen, nullChar), + }, + err: synErrInvalidEscSeq, + }, + } + for _, tt := range tests { + t.Run(tt.caption, func(t *testing.T) { + lex := newLexer(strings.NewReader(tt.src)) + var err error + var tok *token + i := 0 + for { + tok, err = lex.next() + if err != nil { + break + } + if i >= len(tt.tokens) { + break + } + eTok := tt.tokens[i] + i++ + testToken(t, tok, eTok) + + if tok.kind == tokenKindEOF { + break + } + } + if tt.err != nil { + if err != ParseErr { + t.Fatalf("unexpected error: want: %v, got: %v", ParseErr, err) + } + detail, cause := lex.error() + if cause != tt.err { + t.Fatalf("unexpected error: want: %v, got: %v (%v)", tt.err, cause, detail) + 
} + } else { + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + } + if i < len(tt.tokens) { + t.Fatalf("expecte more tokens") + } + }) + } +} + +func testToken(t *testing.T, a, e *token) { + t.Helper() + if e.kind != a.kind || e.char != a.char || e.codePoint != a.codePoint { + t.Fatalf("unexpected token: want: %+v, got: %+v", e, a) + } +} diff --git a/grammar/lexical/parser/parser.go b/grammar/lexical/parser/parser.go new file mode 100644 index 0000000..89362b8 --- /dev/null +++ b/grammar/lexical/parser/parser.go @@ -0,0 +1,531 @@ +package parser + +import ( + "bytes" + "fmt" + "io" + "strconv" + + spec "github.com/nihei9/vartan/spec/grammar" + "github.com/nihei9/vartan/ucd" +) + +type PatternEntry struct { + ID spec.LexModeKindID + Pattern []byte +} + +type parser struct { + kind spec.LexKindName + lex *lexer + peekedTok *token + lastTok *token + + // If and only if isContributoryPropertyExposed is true, the parser interprets contributory properties that + // appear in property expressions. + // + // The contributory properties are not exposed, and users cannot use those properties because the parser + // follows [UAX #44 5.13 Property APIs]. For instance, \p{Other_Alphabetic} is invalid. + // + // isContributoryPropertyExposed is set to true when the parser is generated recursively. The parser needs to + // interpret derived properties internally because the derived properties consist of other properties that + // may contain the contributory properties. + // + // [UAX #44 5.13 Property APIs] says: + // > The following subtypes of Unicode character properties should generally not be exposed in APIs, + // > except in limited circumstances. They may not be useful, particularly in public API collections, + // > and may instead prove misleading to the users of such API collections. + // > * Contributory properties are not recommended for public APIs. + // > ... 
// NewParser returns a parser that reads a lexical pattern for the kind named
// kind from src. Contributory Unicode properties are not exposed by default
// (see isContributoryPropertyExposed on the parser struct).
func NewParser(kind spec.LexKindName, src io.Reader) *parser {
	return &parser{
		kind:                          kind,
		lex:                           newLexer(src),
		isContributoryPropertyExposed: false,
	}
}

// exposeContributoryProperty allows this parser to interpret contributory
// Unicode properties. It is called only on parsers created internally (see
// parseCharProp), never on user-facing parsers.
func (p *parser) exposeContributoryProperty() {
	p.isContributoryPropertyExposed = true
}

// Error returns the detail message and the cause of the last parse error
// recorded by raiseParseError.
func (p *parser) Error() (string, error) {
	return p.errDetail, p.errCause
}

// Parse parses the whole pattern and returns its tree wrapped in a root node.
//
// Parse errors are reported internally via panic (see raiseParseError); the
// deferred recover converts any panicked error value into the returned error.
// Panics carrying a non-error value are considered programmer bugs and are
// re-raised.
func (p *parser) Parse() (root CPTree, retErr error) {
	defer func() {
		err := recover()
		if err != nil {
			var ok bool
			retErr, ok = err.(error)
			if !ok {
				panic(err)
			}
			return
		}
	}()

	return newRootNode(p.kind, p.parseRegexp()), nil
}

// parseRegexp parses a complete regular expression: an alternation followed by
// EOF. A stray ')' (with or without a preceding expression) raises
// synErrGroupNoInitiator; an empty pattern raises synErrNullPattern.
func (p *parser) parseRegexp() CPTree {
	alt := p.parseAlt()
	if alt == nil {
		if p.consume(tokenKindGroupClose) {
			p.raiseParseError(synErrGroupNoInitiator, "")
		}
		p.raiseParseError(synErrNullPattern, "")
	}
	if p.consume(tokenKindGroupClose) {
		p.raiseParseError(synErrGroupNoInitiator, "")
	}
	p.expect(tokenKindEOF)
	return alt
}

// parseAlt parses a '|'-separated alternation of concatenations, building a
// left-leaning chain of alt nodes. A '|' with a missing operand on either side
// raises synErrAltLackOfOperand. Returns nil when no expression is present.
func (p *parser) parseAlt() CPTree {
	left := p.parseConcat()
	if left == nil {
		if p.consume(tokenKindAlt) {
			p.raiseParseError(synErrAltLackOfOperand, "")
		}
		return nil
	}
	for {
		if !p.consume(tokenKindAlt) {
			break
		}
		right := p.parseConcat()
		if right == nil {
			p.raiseParseError(synErrAltLackOfOperand, "")
		}
		left = newAltNode(left, right)
	}
	return left
}

// parseConcat parses a sequence of repeat expressions and folds them into a
// left-leaning chain of concat nodes. Returns nil when no expression is
// present.
func (p *parser) parseConcat() CPTree {
	left := p.parseRepeat()
	for {
		right := p.parseRepeat()
		if right == nil {
			break
		}
		left = newConcatNode(left, right)
	}
	return left
}

// parseRepeat parses a group optionally followed by one repetition operator:
// '*' (zero or more), '+' (one or more), or '?' (optional). A repetition
// operator with no preceding operand raises synErrRepNoTarget with an
// operator-specific detail message.
func (p *parser) parseRepeat() CPTree {
	group := p.parseGroup()
	if group == nil {
		if p.consume(tokenKindRepeat) {
			p.raiseParseError(synErrRepNoTarget, "* needs an operand")
		}
		if p.consume(tokenKindRepeatOneOrMore) {
			p.raiseParseError(synErrRepNoTarget, "+ needs an operand")
		}
		if p.consume(tokenKindOption) {
			p.raiseParseError(synErrRepNoTarget, "? needs an operand")
		}
		return nil
	}
	if p.consume(tokenKindRepeat) {
		return newRepeatNode(group)
	}
	if p.consume(tokenKindRepeatOneOrMore) {
		return newRepeatOneOrMoreNode(group)
	}
	if p.consume(tokenKindOption) {
		return newOptionNode(group)
	}
	return group
}

// parseGroup parses a parenthesized alternation '( alt )', or falls through to
// a single-character expression when no '(' is present. Empty groups raise
// synErrGroupNoElem; a group left open at EOF raises synErrGroupUnclosed
// (checked before the empty-group error so '(<EOF>' reports unclosed).
func (p *parser) parseGroup() CPTree {
	if p.consume(tokenKindGroupOpen) {
		alt := p.parseAlt()
		if alt == nil {
			if p.consume(tokenKindEOF) {
				p.raiseParseError(synErrGroupUnclosed, "")
			}
			p.raiseParseError(synErrGroupNoElem, "")
		}
		if p.consume(tokenKindEOF) {
			p.raiseParseError(synErrGroupUnclosed, "")
		}
		if !p.consume(tokenKindGroupClose) {
			p.raiseParseError(synErrGroupInvalidForm, "")
		}
		return alt
	}
	return p.parseSingleChar()
}

// parseSingleChar parses one character-level expression:
//   - '.'            -> the any-character range
//   - '[...]'        -> a bracket expression (union of its elements)
//   - '[^...]'       -> an inverse bracket expression, built by excluding each
//     element from the any-character range; if the exclusion
//     leaves nothing, synErrUnmatchablePattern is raised
//   - '\u{...}'      -> a code point expression
//   - '\p{...}'      -> a character property expression
//   - '\f{...}'      -> a fragment expression
//   - otherwise      -> an ordinary character, or nil when none is present
//     (a stray ']' here raises synErrBExpInvalidForm)
func (p *parser) parseSingleChar() CPTree {
	if p.consume(tokenKindAnyChar) {
		return genAnyCharAST()
	}
	if p.consume(tokenKindBExpOpen) {
		left := p.parseBExpElem()
		if left == nil {
			if p.consume(tokenKindEOF) {
				p.raiseParseError(synErrBExpUnclosed, "")
			}
			p.raiseParseError(synErrBExpNoElem, "")
		}
		for {
			right := p.parseBExpElem()
			if right == nil {
				break
			}
			left = newAltNode(left, right)
		}
		if p.consume(tokenKindEOF) {
			p.raiseParseError(synErrBExpUnclosed, "")
		}
		p.expect(tokenKindBExpClose)
		return left
	}
	if p.consume(tokenKindInverseBExpOpen) {
		elem := p.parseBExpElem()
		if elem == nil {
			if p.consume(tokenKindEOF) {
				p.raiseParseError(synErrBExpUnclosed, "")
			}
			p.raiseParseError(synErrBExpNoElem, "")
		}
		inverse := exclude(elem, genAnyCharAST())
		if inverse == nil {
			p.raiseParseError(synErrUnmatchablePattern, "")
		}
		for {
			elem := p.parseBExpElem()
			if elem == nil {
				break
			}
			inverse = exclude(elem, inverse)
			if inverse == nil {
				p.raiseParseError(synErrUnmatchablePattern, "")
			}
		}
		if p.consume(tokenKindEOF) {
			p.raiseParseError(synErrBExpUnclosed, "")
		}
		p.expect(tokenKindBExpClose)
		return inverse
	}
	if p.consume(tokenKindCodePointLeader) {
		return p.parseCodePoint()
	}
	if p.consume(tokenKindCharPropLeader) {
		return p.parseCharProp()
	}
	if p.consume(tokenKindFragmentLeader) {
		return p.parseFragment()
	}
	c := p.parseNormalChar()
	if c == nil {
		if p.consume(tokenKindBExpClose) {
			p.raiseParseError(synErrBExpInvalidForm, "")
		}
		return nil
	}
	return c
}

// parseBExpElem parses one element of a bracket expression: a single
// character (ordinary, code point, or character property) or a range
// 'lo-hi'. Character properties cannot be a range endpoint
// (synErrRangePropIsUnavailable). A range with a missing right endpoint
// raises synErrRangeInvalidForm, and one whose endpoints are out of order
// raises synErrRangeInvalidOrder. Returns nil when no element is present.
func (p *parser) parseBExpElem() CPTree {
	var left CPTree
	switch {
	case p.consume(tokenKindCodePointLeader):
		left = p.parseCodePoint()
	case p.consume(tokenKindCharPropLeader):
		left = p.parseCharProp()
		if p.consume(tokenKindCharRange) {
			p.raiseParseError(synErrRangePropIsUnavailable, "")
		}
	default:
		left = p.parseNormalChar()
	}
	if left == nil {
		return nil
	}
	if !p.consume(tokenKindCharRange) {
		return left
	}
	var right CPTree
	switch {
	case p.consume(tokenKindCodePointLeader):
		right = p.parseCodePoint()
	case p.consume(tokenKindCharPropLeader):
		p.raiseParseError(synErrRangePropIsUnavailable, "")
	default:
		right = p.parseNormalChar()
	}
	if right == nil {
		p.raiseParseError(synErrRangeInvalidForm, "")
	}
	// The range runs from the lower bound of the left endpoint to the upper
	// bound of the right endpoint.
	from, _, _ := left.Range()
	_, to, _ := right.Range()
	if !isValidOrder(from, to) {
		p.raiseParseError(synErrRangeInvalidOrder, fmt.Sprintf("%X..%X", from, to))
	}
	return newRangeSymbolNode(from, to)
}

// parseCodePoint parses the '{hex}' tail of a code point expression (the
// '\u' leader was already consumed by the caller) and returns a symbol node
// for that code point. The hex value must lie in the Unicode code space
// 0x0000..0x10FFFF (synErrCPExpOutOfRange otherwise); any malformed shape
// raises synErrCPExpInvalidForm.
func (p *parser) parseCodePoint() CPTree {
	if !p.consume(tokenKindLBrace) {
		p.raiseParseError(synErrCPExpInvalidForm, "")
	}
	if !p.consume(tokenKindCodePoint) {
		p.raiseParseError(synErrCPExpInvalidForm, "")
	}

	// The lexer guarantees codePoint is a hex string, so a parse failure here
	// is a programmer bug, not a user error -> panic instead of raising.
	n, err := strconv.ParseInt(p.lastTok.codePoint, 16, 64)
	if err != nil {
		panic(fmt.Errorf("failed to decode a code point (%v) into a int: %v", p.lastTok.codePoint, err))
	}
	if n < 0x0000 || n > 0x10FFFF {
		p.raiseParseError(synErrCPExpOutOfRange, "")
	}

	sym := newSymbolNode(rune(n))

	if !p.consume(tokenKindRBrace) {
		p.raiseParseError(synErrCPExpInvalidForm, "")
	}

	return sym
}
*parser) parseCharProp() CPTree { + if !p.consume(tokenKindLBrace) { + p.raiseParseError(synErrCharPropExpInvalidForm, "") + } + var sym1, sym2 string + if !p.consume(tokenKindCharPropSymbol) { + p.raiseParseError(synErrCharPropExpInvalidForm, "") + } + sym1 = p.lastTok.propSymbol + if p.consume(tokenKindEqual) { + if !p.consume(tokenKindCharPropSymbol) { + p.raiseParseError(synErrCharPropExpInvalidForm, "") + } + sym2 = p.lastTok.propSymbol + } + + var alt CPTree + var propName, propVal string + if sym2 != "" { + propName = sym1 + propVal = sym2 + } else { + propName = "" + propVal = sym1 + } + if !p.isContributoryPropertyExposed && ucd.IsContributoryProperty(propName) { + p.raiseParseError(synErrCharPropUnsupported, propName) + } + pat, err := ucd.NormalizeCharacterProperty(propName, propVal) + if err != nil { + p.raiseParseError(synErrCharPropUnsupported, err.Error()) + } + if pat != "" { + p := NewParser(p.kind, bytes.NewReader([]byte(pat))) + p.exposeContributoryProperty() + ast, err := p.Parse() + if err != nil { + panic(err) + } + alt = ast + } else { + cpRanges, inverse, err := ucd.FindCodePointRanges(propName, propVal) + if err != nil { + p.raiseParseError(synErrCharPropUnsupported, err.Error()) + } + if inverse { + r := cpRanges[0] + alt = exclude(newRangeSymbolNode(r.From, r.To), genAnyCharAST()) + if alt == nil { + p.raiseParseError(synErrUnmatchablePattern, "") + } + for _, r := range cpRanges[1:] { + alt = exclude(newRangeSymbolNode(r.From, r.To), alt) + if alt == nil { + p.raiseParseError(synErrUnmatchablePattern, "") + } + } + } else { + for _, r := range cpRanges { + alt = genAltNode( + alt, + newRangeSymbolNode(r.From, r.To), + ) + } + } + } + + if !p.consume(tokenKindRBrace) { + p.raiseParseError(synErrCharPropExpInvalidForm, "") + } + + return alt +} + +func (p *parser) parseFragment() CPTree { + if !p.consume(tokenKindLBrace) { + p.raiseParseError(synErrFragmentExpInvalidForm, "") + } + if !p.consume(tokenKindFragmentSymbol) { + 
// parseNormalChar consumes one ordinary character token and returns a symbol
// node for it, or nil when the next token is not an ordinary character.
func (p *parser) parseNormalChar() CPTree {
	if !p.consume(tokenKindChar) {
		return nil
	}
	return newSymbolNode(p.lastTok.char)
}

// exclude returns base with every code point of symbol removed, or nil when
// nothing remains. Both trees must consist only of alternatives and ranges:
// alternatives on either side are handled by recursing into their branches,
// and the leaf case subtracts the symbol range [sFrom, sTo] from the base
// range [bFrom, bTo]:
//
//   - symbol strictly inside base      -> two ranges (the left and right remainders)
//   - symbol covers base's low end     -> the remainder above sTo
//   - symbol covers base's high end    -> the remainder below sFrom
//   - symbol covers all of base        -> nil
//   - no overlap                       -> base unchanged
//
// A leaf that is neither an alternative nor a range indicates a malformed
// tree and panics.
func exclude(symbol, base CPTree) CPTree {
	if left, right, ok := symbol.Alternatives(); ok {
		// Subtract each branch of symbol in turn.
		return exclude(right, exclude(left, base))
	}

	if left, right, ok := base.Alternatives(); ok {
		// Subtract symbol from each branch of base independently.
		return genAltNode(
			exclude(symbol, left),
			exclude(symbol, right),
		)
	}

	if bFrom, bTo, ok := base.Range(); ok {
		sFrom, sTo, ok := symbol.Range()
		if !ok {
			panic(fmt.Errorf("invalid symbol tree: %T", symbol))
		}

		switch {
		case sFrom > bFrom && sTo < bTo:
			// symbol strictly inside base: split base in two.
			return genAltNode(
				newRangeSymbolNode(bFrom, sFrom-1),
				newRangeSymbolNode(sTo+1, bTo),
			)
		case sFrom <= bFrom && sTo >= bFrom && sTo < bTo:
			// symbol overlaps base's low end.
			return newRangeSymbolNode(sTo+1, bTo)
		case sFrom > bFrom && sFrom <= bTo && sTo >= bTo:
			// symbol overlaps base's high end.
			return newRangeSymbolNode(bFrom, sFrom-1)
		case sFrom <= bFrom && sTo >= bTo:
			// symbol covers base entirely: nothing remains.
			return nil
		default:
			// Disjoint ranges: base is untouched.
			return base
		}
	}

	panic(fmt.Errorf("invalid base tree: %T", base))
}

// genAnyCharAST returns a range node covering the whole Unicode code space
// (U+0000..U+10FFFF), i.e. the tree for the '.' expression.
func genAnyCharAST() CPTree {
	return newRangeSymbolNode(0x0, 0x10FFFF)
}

// isValidOrder reports whether from..to is a non-empty, correctly ordered
// code point range.
func isValidOrder(from, to rune) bool {
	return from <= to
}

// genConcatNode folds the non-nil trees in cs into a left-leaning chain of
// concat nodes. It returns nil when every argument is nil, and the single
// tree unchanged when only one is non-nil.
func genConcatNode(cs ...CPTree) CPTree {
	nonNilNodes := []CPTree{}
	for _, c := range cs {
		if c == nil {
			continue
		}
		nonNilNodes = append(nonNilNodes, c)
	}
	if len(nonNilNodes) <= 0 {
		return nil
	}
	if len(nonNilNodes) == 1 {
		return nonNilNodes[0]
	}
	concat := newConcatNode(nonNilNodes[0], nonNilNodes[1])
	for _, c := range nonNilNodes[2:] {
		concat = newConcatNode(concat, c)
	}
	return concat
}

// genAltNode folds the non-nil trees in cs into a left-leaning chain of alt
// nodes. It returns nil when every argument is nil, and the single tree
// unchanged when only one is non-nil.
func genAltNode(cs ...CPTree) CPTree {
	nonNilNodes := []CPTree{}
	for _, c := range cs {
		if c == nil {
			continue
		}
		nonNilNodes = append(nonNilNodes, c)
	}
	if len(nonNilNodes) <= 0 {
		return nil
	}
	if len(nonNilNodes) == 1 {
		return nonNilNodes[0]
	}
	alt := newAltNode(nonNilNodes[0], nonNilNodes[1])
	for _, c := range nonNilNodes[2:] {
		alt = newAltNode(alt, c)
	}
	return alt
}

// expect consumes the next token and raises synErrUnexpectedToken when its
// kind differs from expected. On mismatch the offending token has been pushed
// back by consume, so it is read from peekedTok for the error message.
func (p *parser) expect(expected tokenKind) {
	if !p.consume(expected) {
		tok := p.peekedTok
		p.raiseParseError(synErrUnexpectedToken, fmt.Sprintf("expected: %v, actual: %v", expected, tok.kind))
	}
}

// consume reads the next token (preferring a previously pushed-back token in
// peekedTok) and reports whether its kind matches expected. On a match the
// token is recorded in lastTok for callers that need its payload; on a
// mismatch the token is pushed back into peekedTok and lastTok is cleared.
// Lexer errors of kind ParseErr are converted into a parse error; any other
// lexer error panics.
func (p *parser) consume(expected tokenKind) bool {
	var tok *token
	var err error
	if p.peekedTok != nil {
		tok = p.peekedTok
		p.peekedTok = nil
	} else {
		tok, err = p.lex.next()
		if err != nil {
			if err == ParseErr {
				detail, cause := p.lex.error()
				p.raiseParseError(cause, detail)
			}
			panic(err)
		}
	}
	p.lastTok = tok
	if tok.kind == expected {
		return true
	}
	p.peekedTok = tok
	p.lastTok = nil

	return false
}

// raiseParseError records the error cause and detail on the parser, then
// panics with ParseErr. The panic unwinds to the deferred recover in Parse,
// which returns ParseErr to the caller; the cause/detail are retrievable via
// Error.
func (p *parser) raiseParseError(err error, detail string) {
	p.errCause = err
	p.errDetail = detail
	panic(ParseErr)
}
+ skipTestAST bool + }{ + { + pattern: "a", + ast: newSymbolNode('a'), + }, + { + pattern: "abc", + ast: genConcatNode( + newSymbolNode('a'), + newSymbolNode('b'), + newSymbolNode('c'), + ), + }, + { + pattern: "a?", + ast: newOptionNode( + newSymbolNode('a'), + ), + }, + { + pattern: "[abc]?", + ast: newOptionNode( + genAltNode( + newSymbolNode('a'), + newSymbolNode('b'), + newSymbolNode('c'), + ), + ), + }, + { + pattern: "\\u{3042}?", + ast: newOptionNode( + newSymbolNode('\u3042'), + ), + }, + { + pattern: "\\p{Letter}?", + skipTestAST: true, + }, + { + pattern: "\\f{a2c}?", + fragments: map[spec.LexKindName]string{ + "a2c": "abc", + }, + ast: newOptionNode( + newFragmentNode("a2c", + genConcatNode( + newSymbolNode('a'), + newSymbolNode('b'), + newSymbolNode('c'), + ), + ), + ), + }, + { + pattern: "(a)?", + ast: newOptionNode( + newSymbolNode('a'), + ), + }, + { + pattern: "((a?)?)?", + ast: newOptionNode( + newOptionNode( + newOptionNode( + newSymbolNode('a'), + ), + ), + ), + }, + { + pattern: "(abc)?", + ast: newOptionNode( + genConcatNode( + newSymbolNode('a'), + newSymbolNode('b'), + newSymbolNode('c'), + ), + ), + }, + { + pattern: "(a|b)?", + ast: newOptionNode( + genAltNode( + newSymbolNode('a'), + newSymbolNode('b'), + ), + ), + }, + { + pattern: "?", + syntaxError: synErrRepNoTarget, + }, + { + pattern: "(?)", + syntaxError: synErrRepNoTarget, + }, + { + pattern: "a|?", + syntaxError: synErrRepNoTarget, + }, + { + pattern: "?|b", + syntaxError: synErrRepNoTarget, + }, + { + pattern: "a??", + syntaxError: synErrRepNoTarget, + }, + { + pattern: "a*", + ast: newRepeatNode( + newSymbolNode('a'), + ), + }, + { + pattern: "[abc]*", + ast: newRepeatNode( + genAltNode( + newSymbolNode('a'), + newSymbolNode('b'), + newSymbolNode('c'), + ), + ), + }, + { + pattern: "\\u{3042}*", + ast: newRepeatNode( + newSymbolNode('\u3042'), + ), + }, + { + pattern: "\\p{Letter}*", + skipTestAST: true, + }, + { + pattern: "\\f{a2c}*", + fragments: 
map[spec.LexKindName]string{ + "a2c": "abc", + }, + ast: newRepeatNode( + newFragmentNode("a2c", + genConcatNode( + newSymbolNode('a'), + newSymbolNode('b'), + newSymbolNode('c'), + ), + ), + ), + }, + { + pattern: "((a*)*)*", + ast: newRepeatNode( + newRepeatNode( + newRepeatNode( + newSymbolNode('a'), + ), + ), + ), + }, + { + pattern: "(abc)*", + ast: newRepeatNode( + genConcatNode( + newSymbolNode('a'), + newSymbolNode('b'), + newSymbolNode('c'), + ), + ), + }, + { + pattern: "(a|b)*", + ast: newRepeatNode( + genAltNode( + newSymbolNode('a'), + newSymbolNode('b'), + ), + ), + }, + { + pattern: "*", + syntaxError: synErrRepNoTarget, + }, + { + pattern: "(*)", + syntaxError: synErrRepNoTarget, + }, + { + pattern: "a|*", + syntaxError: synErrRepNoTarget, + }, + { + pattern: "*|b", + syntaxError: synErrRepNoTarget, + }, + { + pattern: "a**", + syntaxError: synErrRepNoTarget, + }, + { + pattern: "a+", + ast: genConcatNode( + newSymbolNode('a'), + newRepeatNode( + newSymbolNode('a'), + ), + ), + }, + { + pattern: "[abc]+", + ast: genConcatNode( + genAltNode( + newSymbolNode('a'), + newSymbolNode('b'), + newSymbolNode('c'), + ), + newRepeatNode( + genAltNode( + newSymbolNode('a'), + newSymbolNode('b'), + newSymbolNode('c'), + ), + ), + ), + }, + { + pattern: "\\u{3042}+", + ast: genConcatNode( + newSymbolNode('\u3042'), + newRepeatNode( + newSymbolNode('\u3042'), + ), + ), + }, + { + pattern: "\\p{Letter}+", + skipTestAST: true, + }, + { + pattern: "\\f{a2c}+", + fragments: map[spec.LexKindName]string{ + "a2c": "abc", + }, + ast: genConcatNode( + newFragmentNode("a2c", + genConcatNode( + newSymbolNode('a'), + newSymbolNode('b'), + newSymbolNode('c'), + ), + ), + newRepeatNode( + newFragmentNode("a2c", + genConcatNode( + newSymbolNode('a'), + newSymbolNode('b'), + newSymbolNode('c'), + ), + ), + ), + ), + }, + { + pattern: "((a+)+)+", + ast: genConcatNode( + genConcatNode( + genConcatNode( + genConcatNode( + newSymbolNode('a'), + newRepeatNode( + newSymbolNode('a'), + 
), + ), + newRepeatNode( + genConcatNode( + newSymbolNode('a'), + newRepeatNode( + newSymbolNode('a'), + ), + ), + ), + ), + newRepeatNode( + genConcatNode( + genConcatNode( + newSymbolNode('a'), + newRepeatNode( + newSymbolNode('a'), + ), + ), + newRepeatNode( + genConcatNode( + newSymbolNode('a'), + newRepeatNode( + newSymbolNode('a'), + ), + ), + ), + ), + ), + ), + ), + }, + { + pattern: "(abc)+", + ast: genConcatNode( + genConcatNode( + newSymbolNode('a'), + newSymbolNode('b'), + newSymbolNode('c'), + ), + newRepeatNode( + genConcatNode( + newSymbolNode('a'), + newSymbolNode('b'), + newSymbolNode('c'), + ), + ), + ), + }, + { + pattern: "(a|b)+", + ast: genConcatNode( + genAltNode( + newSymbolNode('a'), + newSymbolNode('b'), + ), + newRepeatNode( + genAltNode( + newSymbolNode('a'), + newSymbolNode('b'), + ), + ), + ), + }, + { + pattern: "+", + syntaxError: synErrRepNoTarget, + }, + { + pattern: "(+)", + syntaxError: synErrRepNoTarget, + }, + { + pattern: "a|+", + syntaxError: synErrRepNoTarget, + }, + { + pattern: "+|b", + syntaxError: synErrRepNoTarget, + }, + { + pattern: "a++", + syntaxError: synErrRepNoTarget, + }, + { + pattern: ".", + ast: newRangeSymbolNode(0x00, 0x10FFFF), + }, + { + pattern: "[a]", + ast: newSymbolNode('a'), + }, + { + pattern: "[abc]", + ast: genAltNode( + newSymbolNode('a'), + newSymbolNode('b'), + newSymbolNode('c'), + ), + }, + { + pattern: "[a-z]", + ast: newRangeSymbolNode('a', 'z'), + }, + { + pattern: "[A-Za-z]", + ast: genAltNode( + newRangeSymbolNode('A', 'Z'), + newRangeSymbolNode('a', 'z'), + ), + }, + { + pattern: "[\\u{004E}]", + ast: newSymbolNode('N'), + }, + { + pattern: "[\\u{0061}-\\u{007A}]", + ast: newRangeSymbolNode('a', 'z'), + }, + { + pattern: "[\\p{Lu}]", + skipTestAST: true, + }, + { + pattern: "[a-\\p{Lu}]", + syntaxError: synErrRangePropIsUnavailable, + }, + { + pattern: "[\\p{Lu}-z]", + syntaxError: synErrRangePropIsUnavailable, + }, + { + pattern: "[\\p{Lu}-\\p{Ll}]", + syntaxError: 
synErrRangePropIsUnavailable, + }, + { + pattern: "[z-a]", + syntaxError: synErrRangeInvalidOrder, + }, + { + pattern: "a[]", + syntaxError: synErrBExpNoElem, + }, + { + pattern: "[]a", + syntaxError: synErrBExpNoElem, + }, + { + pattern: "[]", + syntaxError: synErrBExpNoElem, + }, + { + pattern: "[^\\u{004E}]", + ast: genAltNode( + newRangeSymbolNode(0x00, '\u004E'-1), + newRangeSymbolNode('\u004E'+1, 0x10FFFF), + ), + }, + { + pattern: "[^\\u{0061}-\\u{007A}]", + ast: genAltNode( + newRangeSymbolNode(0x00, '\u0061'-1), + newRangeSymbolNode('\u007A'+1, 0x10FFFF), + ), + }, + { + pattern: "[^\\p{Lu}]", + skipTestAST: true, + }, + { + pattern: "[^a-\\p{Lu}]", + syntaxError: synErrRangePropIsUnavailable, + }, + { + pattern: "[^\\p{Lu}-z]", + syntaxError: synErrRangePropIsUnavailable, + }, + { + pattern: "[^\\p{Lu}-\\p{Ll}]", + syntaxError: synErrRangePropIsUnavailable, + }, + { + pattern: "[^\\u{0000}-\\u{10FFFF}]", + syntaxError: synErrUnmatchablePattern, + }, + { + pattern: "[^\\u{0000}-\\u{FFFF}\\u{010000}-\\u{10FFFF}]", + syntaxError: synErrUnmatchablePattern, + }, + { + pattern: "[^]", + ast: newSymbolNode('^'), + }, + { + pattern: "[", + syntaxError: synErrBExpUnclosed, + }, + { + pattern: "([", + syntaxError: synErrBExpUnclosed, + }, + { + pattern: "[a", + syntaxError: synErrBExpUnclosed, + }, + { + pattern: "([a", + syntaxError: synErrBExpUnclosed, + }, + { + pattern: "[a-", + syntaxError: synErrBExpUnclosed, + }, + { + pattern: "([a-", + syntaxError: synErrBExpUnclosed, + }, + { + pattern: "[^", + syntaxError: synErrBExpUnclosed, + }, + { + pattern: "([^", + syntaxError: synErrBExpUnclosed, + }, + { + pattern: "[^a", + syntaxError: synErrBExpUnclosed, + }, + { + pattern: "([^a", + syntaxError: synErrBExpUnclosed, + }, + { + pattern: "[^a-", + syntaxError: synErrBExpUnclosed, + }, + { + pattern: "([^a-", + syntaxError: synErrBExpUnclosed, + }, + { + pattern: "]", + ast: newSymbolNode(']'), + }, + { + pattern: "(]", + syntaxError: synErrGroupUnclosed, + }, + { 
+ pattern: "a]", + ast: genConcatNode( + newSymbolNode('a'), + newSymbolNode(']'), + ), + }, + { + pattern: "(a]", + syntaxError: synErrGroupUnclosed, + }, + { + pattern: "([)", + syntaxError: synErrBExpUnclosed, + }, + { + pattern: "([a)", + syntaxError: synErrBExpUnclosed, + }, + { + pattern: "[a-]", + ast: genAltNode( + newSymbolNode('a'), + newSymbolNode('-'), + ), + }, + { + pattern: "[^a-]", + ast: genAltNode( + newRangeSymbolNode(0x00, 0x2C), + newRangeSymbolNode(0x2E, 0x60), + newRangeSymbolNode(0x62, 0x10FFFF), + ), + }, + { + pattern: "[-z]", + ast: genAltNode( + newSymbolNode('-'), + newSymbolNode('z'), + ), + }, + { + pattern: "[^-z]", + ast: newAltNode( + newRangeSymbolNode(0x00, 0x2C), + newAltNode( + newRangeSymbolNode(0x2E, 0x79), + newRangeSymbolNode(0x7B, 0x10FFFF), + ), + ), + }, + { + pattern: "[-]", + ast: newSymbolNode('-'), + }, + { + pattern: "[^-]", + ast: genAltNode( + newRangeSymbolNode(0x00, 0x2C), + newRangeSymbolNode(0x2E, 0x10FFFF), + ), + }, + { + pattern: "[^01]", + ast: genAltNode( + newRangeSymbolNode(0x00, '0'-1), + newRangeSymbolNode('1'+1, 0x10FFFF), + ), + }, + { + pattern: "[^10]", + ast: genAltNode( + newRangeSymbolNode(0x00, '0'-1), + newRangeSymbolNode('1'+1, 0x10FFFF), + ), + }, + { + pattern: "[^a-z]", + ast: genAltNode( + newRangeSymbolNode(0x00, 'a'-1), + newRangeSymbolNode('z'+1, 0x10FFFF), + ), + }, + { + pattern: "[^az]", + ast: genAltNode( + newRangeSymbolNode(0x00, 'a'-1), + genAltNode( + newRangeSymbolNode('a'+1, 'z'-1), + newRangeSymbolNode('z'+1, 0x10FFFF), + ), + ), + }, + { + pattern: "\\u{006E}", + ast: newSymbolNode('\u006E'), + }, + { + pattern: "\\u{03BD}", + ast: newSymbolNode('\u03BD'), + }, + { + pattern: "\\u{306B}", + ast: newSymbolNode('\u306B'), + }, + { + pattern: "\\u{01F638}", + ast: newSymbolNode('\U0001F638'), + }, + { + pattern: "\\u{0000}", + ast: newSymbolNode('\u0000'), + }, + { + pattern: "\\u{10FFFF}", + ast: newSymbolNode('\U0010FFFF'), + }, + { + pattern: "\\u{110000}", + syntaxError: 
synErrCPExpOutOfRange, + }, + { + pattern: "\\u", + syntaxError: synErrCPExpInvalidForm, + }, + { + pattern: "\\u{", + syntaxError: synErrCPExpInvalidForm, + }, + { + pattern: "\\u{03BD", + syntaxError: synErrCPExpInvalidForm, + }, + { + pattern: "\\u{}", + syntaxError: synErrCPExpInvalidForm, + }, + { + pattern: "\\p{Letter}", + skipTestAST: true, + }, + { + pattern: "\\p{General_Category=Letter}", + skipTestAST: true, + }, + { + pattern: "\\p{ Letter }", + skipTestAST: true, + }, + { + pattern: "\\p{ General_Category = Letter }", + skipTestAST: true, + }, + { + pattern: "\\p", + syntaxError: synErrCharPropExpInvalidForm, + }, + { + pattern: "\\p{", + syntaxError: synErrCharPropExpInvalidForm, + }, + { + pattern: "\\p{Letter", + syntaxError: synErrCharPropExpInvalidForm, + }, + { + pattern: "\\p{General_Category=}", + syntaxError: synErrCharPropExpInvalidForm, + }, + { + pattern: "\\p{General_Category= }", + syntaxError: synErrCharPropInvalidSymbol, + }, + { + pattern: "\\p{=Letter}", + syntaxError: synErrCharPropExpInvalidForm, + }, + { + pattern: "\\p{ =Letter}", + syntaxError: synErrCharPropInvalidSymbol, + }, + { + pattern: "\\p{=}", + syntaxError: synErrCharPropExpInvalidForm, + }, + { + pattern: "\\p{}", + syntaxError: synErrCharPropExpInvalidForm, + }, + { + pattern: "\\f{a2c}", + fragments: map[spec.LexKindName]string{ + "a2c": "abc", + }, + ast: newFragmentNode("a2c", + genConcatNode( + newSymbolNode('a'), + newSymbolNode('b'), + newSymbolNode('c'), + ), + ), + }, + { + pattern: "\\f{ a2c }", + fragments: map[spec.LexKindName]string{ + "a2c": "abc", + }, + ast: newFragmentNode("a2c", + genConcatNode( + newSymbolNode('a'), + newSymbolNode('b'), + newSymbolNode('c'), + ), + ), + }, + { + pattern: "\\f", + syntaxError: synErrFragmentExpInvalidForm, + }, + { + pattern: "\\f{", + syntaxError: synErrFragmentExpInvalidForm, + }, + { + pattern: "\\f{a2c", + fragments: map[spec.LexKindName]string{ + "a2c": "abc", + }, + syntaxError: synErrFragmentExpInvalidForm, + 
}, + { + pattern: "(a)", + ast: newSymbolNode('a'), + }, + { + pattern: "(((a)))", + ast: newSymbolNode('a'), + }, + { + pattern: "a()", + syntaxError: synErrGroupNoElem, + }, + { + pattern: "()a", + syntaxError: synErrGroupNoElem, + }, + { + pattern: "()", + syntaxError: synErrGroupNoElem, + }, + { + pattern: "(", + syntaxError: synErrGroupUnclosed, + }, + { + pattern: "a(", + syntaxError: synErrGroupUnclosed, + }, + { + pattern: "(a", + syntaxError: synErrGroupUnclosed, + }, + { + pattern: "((", + syntaxError: synErrGroupUnclosed, + }, + { + pattern: "((a)", + syntaxError: synErrGroupUnclosed, + }, + { + pattern: ")", + syntaxError: synErrGroupNoInitiator, + }, + { + pattern: "a)", + syntaxError: synErrGroupNoInitiator, + }, + { + pattern: ")a", + syntaxError: synErrGroupNoInitiator, + }, + { + pattern: "))", + syntaxError: synErrGroupNoInitiator, + }, + { + pattern: "(a))", + syntaxError: synErrGroupNoInitiator, + }, + { + pattern: "Mulder|Scully", + ast: genAltNode( + genConcatNode( + newSymbolNode('M'), + newSymbolNode('u'), + newSymbolNode('l'), + newSymbolNode('d'), + newSymbolNode('e'), + newSymbolNode('r'), + ), + genConcatNode( + newSymbolNode('S'), + newSymbolNode('c'), + newSymbolNode('u'), + newSymbolNode('l'), + newSymbolNode('l'), + newSymbolNode('y'), + ), + ), + }, + { + pattern: "Langly|Frohike|Byers", + ast: genAltNode( + genConcatNode( + newSymbolNode('L'), + newSymbolNode('a'), + newSymbolNode('n'), + newSymbolNode('g'), + newSymbolNode('l'), + newSymbolNode('y'), + ), + genConcatNode( + newSymbolNode('F'), + newSymbolNode('r'), + newSymbolNode('o'), + newSymbolNode('h'), + newSymbolNode('i'), + newSymbolNode('k'), + newSymbolNode('e'), + ), + genConcatNode( + newSymbolNode('B'), + newSymbolNode('y'), + newSymbolNode('e'), + newSymbolNode('r'), + newSymbolNode('s'), + ), + ), + }, + { + pattern: "|", + syntaxError: synErrAltLackOfOperand, + }, + { + pattern: "||", + syntaxError: synErrAltLackOfOperand, + }, + { + pattern: "Mulder|", + 
syntaxError: synErrAltLackOfOperand, + }, + { + pattern: "|Scully", + syntaxError: synErrAltLackOfOperand, + }, + { + pattern: "Langly|Frohike|", + syntaxError: synErrAltLackOfOperand, + }, + { + pattern: "Langly||Byers", + syntaxError: synErrAltLackOfOperand, + }, + { + pattern: "|Frohike|Byers", + syntaxError: synErrAltLackOfOperand, + }, + { + pattern: "|Frohike|", + syntaxError: synErrAltLackOfOperand, + }, + { + pattern: "Fox(|)Mulder", + syntaxError: synErrAltLackOfOperand, + }, + { + pattern: "(Fox|)Mulder", + syntaxError: synErrAltLackOfOperand, + }, + { + pattern: "Fox(|Mulder)", + syntaxError: synErrAltLackOfOperand, + }, + } + for i, tt := range tests { + t.Run(fmt.Sprintf("#%v %v", i, tt.pattern), func(t *testing.T) { + fragmentTrees := map[spec.LexKindName]CPTree{} + for kind, pattern := range tt.fragments { + p := NewParser(kind, strings.NewReader(pattern)) + root, err := p.Parse() + if err != nil { + t.Fatal(err) + } + + fragmentTrees[kind] = root + } + err := CompleteFragments(fragmentTrees) + if err != nil { + t.Fatal(err) + } + + p := NewParser(spec.LexKindName("test"), strings.NewReader(tt.pattern)) + root, err := p.Parse() + if tt.syntaxError != nil { + // printCPTree(os.Stdout, root, "", "") + if err != ParseErr { + t.Fatalf("unexpected error: want: %v, got: %v", ParseErr, err) + } + _, synErr := p.Error() + if synErr != tt.syntaxError { + t.Fatalf("unexpected syntax error: want: %v, got: %v", tt.syntaxError, synErr) + } + if root != nil { + t.Fatalf("tree must be nil") + } + } else { + if err != nil { + detail, cause := p.Error() + t.Fatalf("%v: %v: %v", err, cause, detail) + } + if root == nil { + t.Fatal("tree must be non-nil") + } + + complete, err := ApplyFragments(root, fragmentTrees) + if err != nil { + t.Fatal(err) + } + if !complete { + t.Fatalf("incomplete fragments") + } + + // printCPTree(os.Stdout, root, "", "") + if !tt.skipTestAST { + r := root.(*rootNode) + testAST(t, tt.ast, r.tree) + } + } + }) + } +} + +func 
// TestParse_ContributoryPropertyIsNotExposed verifies that every contributory
// Unicode property reported by ucd.ContributoryProperties is rejected by a
// user-facing parser: parsing '\p{<prop>=yes}' must fail with
// synErrCharPropUnsupported and yield no tree.
func TestParse_ContributoryPropertyIsNotExposed(t *testing.T) {
	for _, cProp := range ucd.ContributoryProperties() {
		t.Run(fmt.Sprintf("%v", cProp), func(t *testing.T) {
			p := NewParser(spec.LexKindName("test"), strings.NewReader(fmt.Sprintf(`\p{%v=yes}`, cProp)))
			root, err := p.Parse()
			if err == nil {
				t.Fatalf("expected syntax error: got: nil")
			}
			_, synErr := p.Error()
			if synErr != synErrCharPropUnsupported {
				t.Fatalf("unexpected syntax error: want: %v, got: %v", synErrCharPropUnsupported, synErr)
			}
			if root != nil {
				t.Fatalf("tree is not nil")
			}
		})
	}
}
// Result (b - t): +--+--+ + { + caption: "|b.From - t.From| = 0 && |t.To - b.From| = 0 && |b.To - t.To| > 1", + target: newSymbolNode('0'), + base: newRangeSymbolNode('0', '2'), + result: newRangeSymbolNode('1', '2'), + }, + // |b.From - t.From| = 0 + // |t.To - b.From| > 0 + // |b.To - t.To| = 1 + // + // Target (t): +--+--+ + // Base (b): +--+--+--+ + // Result (b - t): +--+ + { + caption: "|b.From - t.From| = 0 && |t.To - b.From| > 0 && |b.To - t.To| = 1", + target: newRangeSymbolNode('0', '1'), + base: newRangeSymbolNode('0', '2'), + result: newSymbolNode('2'), + }, + // |b.From - t.From| = 0 + // |t.To - b.From| > 0 + // |b.To - t.To| > 1 + // + // Target (t): +--+--+ + // Base (b): +--+--+--+--+ + // Result (b - t): +--+--+ + { + caption: "|b.From - t.From| = 0 && |t.To - b.From| > 0 && |b.To - t.To| > 1", + target: newRangeSymbolNode('0', '1'), + base: newRangeSymbolNode('0', '3'), + result: newRangeSymbolNode('2', '3'), + }, + // |b.From - t.From| > 0 + // |t.To - b.From| = 0 + // |b.To - t.To| = 1 + // + // Target (t): +--+--+ + // Base (b): +--+--+ + // Result (b - t): +--+ + { + caption: "|b.From - t.From| > 0 && |t.To - b.From| = 0 && |b.To - t.To| = 1", + target: newRangeSymbolNode('0', '1'), + base: newRangeSymbolNode('1', '2'), + result: newSymbolNode('2'), + }, + // |b.From - t.From| > 0 + // |t.To - b.From| = 0 + // |b.To - t.To| > 1 + // + // Target (t): +--+--+ + // Base (b): +--+--+--+ + // Result (b - t): +--+--+ + { + caption: "|b.From - t.From| > 0 && |t.To - b.From| = 0 && |b.To - t.To| > 1", + target: newRangeSymbolNode('0', '1'), + base: newRangeSymbolNode('1', '3'), + result: newRangeSymbolNode('2', '3'), + }, + // |b.From - t.From| > 0 + // |t.To - b.From| > 0 + // |b.To - t.To| = 1 + // + // Target (t): +--+--+--+ + // Base (b): +--+--+--+ + // Result (b - t): +--+ + { + caption: "|b.From - t.From| > 0 && |t.To - b.From| > 0 && |b.To - t.To| = 1", + target: newRangeSymbolNode('0', '2'), + base: newRangeSymbolNode('1', '3'), + result: 
newSymbolNode('3'), + }, + // |b.From - t.From| > 0 + // |t.To - b.From| > 0 + // |b.To - t.To| > 1 + // + // Target (t): +--+--+--+ + // Base (b): +--+--+--+--+ + // Result (b - t): +--+--+ + { + caption: "|b.From - t.From| > 0 && |t.To - b.From| > 0 && |b.To - t.To| > 1", + target: newRangeSymbolNode('0', '2'), + base: newRangeSymbolNode('1', '4'), + result: newRangeSymbolNode('3', '4'), + }, + + // t.From > b.From && t.From <= b.To && t.To >= b.To + + // |t.From - b.From| = 1 + // |b.To - t.From| = 0 + // |t.To - b.To| = 0 + // + // Target (t): +--+ + // Base (b): +--+--+ + // Result (b - t): +--+ + { + caption: "|t.From - b.From| = 1 && |b.To - t.From| = 0 && |t.To - b.To| = 0", + target: newSymbolNode('1'), + base: newRangeSymbolNode('0', '1'), + result: newSymbolNode('0'), + }, + // |t.From - b.From| = 1 + // |b.To - t.From| = 0 + // |t.To - b.To| > 0 + // + // Target (t): +--+--+ + // Base (b): +--+--+ + // Result (b - t): +--+ + { + caption: "|t.From - b.From| = 1 && |b.To - t.From| = 0 && |t.To - b.To| > 0", + target: newRangeSymbolNode('1', '2'), + base: newRangeSymbolNode('0', '1'), + result: newSymbolNode('0'), + }, + // |t.From - b.From| = 1 + // |b.To - t.From| > 0 + // |t.To - b.To| = 0 + // + // Target (t): +--+--+ + // Base (b): +--+--+--+ + // Result (b - t): +--+ + { + caption: "|t.From - b.From| = 1 && |b.To - t.From| > 0 && |t.To - b.To| = 0", + target: newRangeSymbolNode('1', '2'), + base: newRangeSymbolNode('0', '2'), + result: newSymbolNode('0'), + }, + // |t.From - b.From| = 1 + // |b.To - t.From| > 0 + // |t.To - b.To| > 0 + // + // Target (t): +--+--+--+ + // Base (b): +--+--+--+ + // Result (b - t): +--+ + { + caption: "|t.From - b.From| = 1 && |b.To - t.From| > 0 && |t.To - b.To| > 0", + target: newRangeSymbolNode('1', '3'), + base: newRangeSymbolNode('0', '2'), + result: newSymbolNode('0'), + }, + // |t.From - b.From| > 1 + // |b.To - t.From| = 0 + // |t.To - b.To| = 0 + // + // Target (t): +--+ + // Base (b): +--+--+--+ + // Result (b 
- t): +--+--+ + { + caption: "|t.From - b.From| > 1 && |b.To - t.From| = 0 && |t.To - b.To| = 0", + target: newSymbolNode('2'), + base: newRangeSymbolNode('0', '2'), + result: newRangeSymbolNode('0', '1'), + }, + // |t.From - b.From| > 1 + // |b.To - t.From| = 0 + // |t.To - b.To| > 0 + // + // Target (t): +--+--+ + // Base (b): +--+--+--+ + // Result (b - t): +--+--+ + { + caption: "|t.From - b.From| > 1 && |b.To - t.From| = 0 && |t.To - b.To| > 0", + target: newRangeSymbolNode('2', '3'), + base: newRangeSymbolNode('0', '2'), + result: newRangeSymbolNode('0', '1'), + }, + // |t.From - b.From| > 1 + // |b.To - t.From| > 0 + // |t.To - b.To| = 0 + // + // Target (t): +--+--+ + // Base (b): +--+--+--+--+ + // Result (b - t): +--+--+ + { + caption: "|t.From - b.From| > 1 && |b.To - t.From| > 0 && |t.To - b.To| = 0", + target: newRangeSymbolNode('2', '3'), + base: newRangeSymbolNode('0', '3'), + result: newRangeSymbolNode('0', '1'), + }, + // |t.From - b.From| > 1 + // |b.To - t.From| > 0 + // |t.To - b.To| > 0 + // + // Target (t): +--+--+--+ + // Base (b): +--+--+--+--+ + // Result (b - t): +--+--+ + { + caption: "|t.From - b.From| > 1 && |b.To - t.From| > 0 && |t.To - b.To| > 0", + target: newRangeSymbolNode('2', '4'), + base: newRangeSymbolNode('0', '3'), + result: newRangeSymbolNode('0', '1'), + }, + + // t.From <= b.From && t.To >= b.To + + // |b.From - t.From| = 0 + // |t.To - b.To| = 0 + // + // Target (t): +--+ + // Base (b): +--+ + // Result (b - t): N/A + { + caption: "|b.From - t.From| = 0 && |t.To - b.To| = 0", + target: newSymbolNode('0'), + base: newSymbolNode('0'), + result: nil, + }, + // |b.From - t.From| = 0 + // |t.To - b.To| > 0 + // + // Target (t): +--+--+ + // Base (b): +--+ + // Result (b - t): N/A + { + caption: "|b.From - t.From| = 0 && |t.To - b.To| > 0", + target: newRangeSymbolNode('0', '1'), + base: newSymbolNode('0'), + result: nil, + }, + // |b.From - t.From| > 0 + // |t.To - b.To| = 0 + // + // Target (t): +--+--+ + // Base (b): +--+ + 
// Result (b - t): N/A + { + caption: "|b.From - t.From| > 0 && |t.To - b.To| = 0", + target: newRangeSymbolNode('0', '1'), + base: newSymbolNode('1'), + result: nil, + }, + // |b.From - t.From| > 0 + // |t.To - b.To| > 0 + // + // Target (t): +--+--+--+ + // Base (b): +--+ + // Result (b - t): N/A + { + caption: "|b.From - t.From| > 0 && |t.To - b.To| > 0", + target: newRangeSymbolNode('0', '2'), + base: newSymbolNode('1'), + result: nil, + }, + + // Others + + // |b.From - t.From| = 1 + // + // Target (t): +--+ + // Base (b): +--+ + // Result (b - t): +--+ + { + caption: "|b.From - t.From| = 1", + target: newSymbolNode('0'), + base: newSymbolNode('1'), + result: newSymbolNode('1'), + }, + // |b.From - t.From| > 1 + // + // Target (t): +--+ + // Base (b): +--+ + // Result (b - t): +--+ + { + caption: "|b.From - t.From| > 1", + target: newSymbolNode('0'), + base: newSymbolNode('2'), + result: newSymbolNode('2'), + }, + // |t.To - b.To| = 1 + // + // Target (t): +--+ + // Base (b): +--+ + // Result (b - t): +--+ + { + caption: "|t.To - b.To| = 1", + target: newSymbolNode('1'), + base: newSymbolNode('0'), + result: newSymbolNode('0'), + }, + // |t.To - b.To| > 1 + // + // Target (t): +--+ + // Base (b): +--+ + // Result (b - t): +--+ + { + caption: "|t.To - b.To| > 1", + target: newSymbolNode('2'), + base: newSymbolNode('0'), + result: newSymbolNode('0'), + }, + } { + t.Run(test.caption, func(t *testing.T) { + r := exclude(test.target, test.base) + testAST(t, test.result, r) + }) + } +} + +func testAST(t *testing.T, expected, actual CPTree) { + t.Helper() + + aTy := reflect.TypeOf(actual) + eTy := reflect.TypeOf(expected) + if eTy != aTy { + t.Fatalf("unexpected node: want: %+v, got: %+v", eTy, aTy) + } + + if actual == nil { + return + } + + switch e := expected.(type) { + case *symbolNode: + a := actual.(*symbolNode) + if a.From != e.From || a.To != e.To { + t.Fatalf("unexpected node: want: %+v, got: %+v", e, a) + } + } + eLeft, eRight := expected.children() + 
aLeft, aRight := actual.children() + testAST(t, eLeft, aLeft) + testAST(t, eRight, aRight) +} diff --git a/grammar/lexical/parser/tree.go b/grammar/lexical/parser/tree.go new file mode 100644 index 0000000..3d9d197 --- /dev/null +++ b/grammar/lexical/parser/tree.go @@ -0,0 +1,459 @@ +package parser + +import ( + "fmt" + "io" + "sort" + + spec "github.com/nihei9/vartan/spec/grammar" +) + +type CPRange struct { + From rune + To rune +} + +type CPTree interface { + fmt.Stringer + Range() (rune, rune, bool) + Optional() (CPTree, bool) + Repeatable() (CPTree, bool) + Concatenation() (CPTree, CPTree, bool) + Alternatives() (CPTree, CPTree, bool) + Describe() (spec.LexKindName, []spec.LexKindName, error) + + children() (CPTree, CPTree) + clone() CPTree +} + +var ( + _ CPTree = &rootNode{} + _ CPTree = &symbolNode{} + _ CPTree = &concatNode{} + _ CPTree = &altNode{} + _ CPTree = &quantifierNode{} + _ CPTree = &fragmentNode{} +) + +type rootNode struct { + kind spec.LexKindName + tree CPTree + fragments map[spec.LexKindName][]*fragmentNode +} + +func newRootNode(kind spec.LexKindName, t CPTree) *rootNode { + fragments := map[spec.LexKindName][]*fragmentNode{} + collectFragments(t, fragments) + + return &rootNode{ + kind: kind, + tree: t, + fragments: fragments, + } +} + +func collectFragments(n CPTree, fragments map[spec.LexKindName][]*fragmentNode) { + if n == nil { + return + } + + if f, ok := n.(*fragmentNode); ok { + fragments[f.kind] = append(fragments[f.kind], f) + return + } + + l, r := n.children() + collectFragments(l, fragments) + collectFragments(r, fragments) +} + +func (n *rootNode) String() string { + return fmt.Sprintf("root: %v: %v fragments", n.kind, len(n.fragments)) +} + +func (n *rootNode) Range() (rune, rune, bool) { + return n.tree.Range() +} + +func (n *rootNode) Optional() (CPTree, bool) { + return n.tree.Optional() +} + +func (n *rootNode) Repeatable() (CPTree, bool) { + return n.tree.Repeatable() +} + +func (n *rootNode) Concatenation() (CPTree, 
CPTree, bool) { + return n.tree.Concatenation() +} + +func (n *rootNode) Alternatives() (CPTree, CPTree, bool) { + return n.tree.Alternatives() +} + +func (n *rootNode) Describe() (spec.LexKindName, []spec.LexKindName, error) { + var frags []spec.LexKindName + for f := range n.fragments { + frags = append(frags, spec.LexKindName(f)) + } + sort.Slice(frags, func(i, j int) bool { + return frags[i] < frags[j] + }) + + return n.kind, frags, nil +} + +func (n *rootNode) children() (CPTree, CPTree) { + return n.tree.children() +} + +func (n *rootNode) clone() CPTree { + return n.tree.clone() +} + +func (n *rootNode) incomplete() bool { + return len(n.fragments) > 0 +} + +func (n *rootNode) applyFragment(kind spec.LexKindName, fragment CPTree) error { + root, ok := fragment.(*rootNode) + if !ok { + return fmt.Errorf("applyFragment can take only *rootNode: %T", fragment) + } + if root.incomplete() { + return fmt.Errorf("fragment is incomplete") + } + + fs, ok := n.fragments[kind] + if !ok { + return nil + } + for _, f := range fs { + f.tree = root.clone() + } + delete(n.fragments, kind) + + return nil +} + +type symbolNode struct { + CPRange +} + +func newSymbolNode(cp rune) *symbolNode { + return &symbolNode{ + CPRange: CPRange{ + From: cp, + To: cp, + }, + } +} + +func newRangeSymbolNode(from, to rune) *symbolNode { + return &symbolNode{ + CPRange: CPRange{ + From: from, + To: to, + }, + } +} + +func (n *symbolNode) String() string { + return fmt.Sprintf("symbol: %X..%X", n.From, n.To) +} + +func (n *symbolNode) Range() (rune, rune, bool) { + return n.From, n.To, true +} + +func (n *symbolNode) Optional() (CPTree, bool) { + return nil, false +} + +func (n *symbolNode) Repeatable() (CPTree, bool) { + return nil, false +} + +func (n *symbolNode) Concatenation() (CPTree, CPTree, bool) { + return nil, nil, false +} + +func (n *symbolNode) Alternatives() (CPTree, CPTree, bool) { + return nil, nil, false +} + +func (n *symbolNode) Describe() (spec.LexKindName, 
[]spec.LexKindName, error) { + return spec.LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n) +} + +func (n *symbolNode) children() (CPTree, CPTree) { + return nil, nil +} + +func (n *symbolNode) clone() CPTree { + return newRangeSymbolNode(n.From, n.To) +} + +type concatNode struct { + left CPTree + right CPTree +} + +func newConcatNode(left, right CPTree) *concatNode { + return &concatNode{ + left: left, + right: right, + } +} + +func (n *concatNode) String() string { + return "concat" +} + +func (n *concatNode) Range() (rune, rune, bool) { + return 0, 0, false +} + +func (n *concatNode) Optional() (CPTree, bool) { + return nil, false +} + +func (n *concatNode) Repeatable() (CPTree, bool) { + return nil, false +} + +func (n *concatNode) Concatenation() (CPTree, CPTree, bool) { + return n.left, n.right, true +} + +func (n *concatNode) Alternatives() (CPTree, CPTree, bool) { + return nil, nil, false +} + +func (n *concatNode) Describe() (spec.LexKindName, []spec.LexKindName, error) { + return spec.LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n) +} + +func (n *concatNode) children() (CPTree, CPTree) { + return n.left, n.right +} + +func (n *concatNode) clone() CPTree { + if n == nil { + return nil + } + return newConcatNode(n.left.clone(), n.right.clone()) +} + +type altNode struct { + left CPTree + right CPTree +} + +func newAltNode(left, right CPTree) *altNode { + return &altNode{ + left: left, + right: right, + } +} + +func (n *altNode) String() string { + return "alt" +} + +func (n *altNode) Range() (rune, rune, bool) { + return 0, 0, false +} + +func (n *altNode) Optional() (CPTree, bool) { + return nil, false +} + +func (n *altNode) Repeatable() (CPTree, bool) { + return nil, false +} + +func (n *altNode) Concatenation() (CPTree, CPTree, bool) { + return nil, nil, false +} + +func (n *altNode) Alternatives() (CPTree, CPTree, bool) { + return n.left, n.right, true +} + +func (n *altNode) Describe() (spec.LexKindName, []spec.LexKindName, error) { 
+ return spec.LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n) +} + +func (n *altNode) children() (CPTree, CPTree) { + return n.left, n.right +} + +func (n *altNode) clone() CPTree { + return newAltNode(n.left.clone(), n.right.clone()) +} + +type quantifierNode struct { + optional bool + repeatable bool + tree CPTree +} + +func (n *quantifierNode) String() string { + switch { + case n.repeatable: + return "repeatable (>= 0 times)" + case n.optional: + return "optional (0 or 1 times)" + default: + return "invalid quantifier" + } +} + +func newRepeatNode(t CPTree) *quantifierNode { + return &quantifierNode{ + repeatable: true, + tree: t, + } +} + +func newRepeatOneOrMoreNode(t CPTree) *concatNode { + return newConcatNode( + t, + &quantifierNode{ + repeatable: true, + tree: t.clone(), + }) +} + +func newOptionNode(t CPTree) *quantifierNode { + return &quantifierNode{ + optional: true, + tree: t, + } +} + +func (n *quantifierNode) Range() (rune, rune, bool) { + return 0, 0, false +} + +func (n *quantifierNode) Optional() (CPTree, bool) { + return n.tree, n.optional +} + +func (n *quantifierNode) Repeatable() (CPTree, bool) { + return n.tree, n.repeatable +} + +func (n *quantifierNode) Concatenation() (CPTree, CPTree, bool) { + return nil, nil, false +} + +func (n *quantifierNode) Alternatives() (CPTree, CPTree, bool) { + return nil, nil, false +} + +func (n *quantifierNode) Describe() (spec.LexKindName, []spec.LexKindName, error) { + return spec.LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n) +} + +func (n *quantifierNode) children() (CPTree, CPTree) { + return n.tree, nil +} + +func (n *quantifierNode) clone() CPTree { + if n.repeatable { + return newRepeatNode(n.tree.clone()) + } + return newOptionNode(n.tree.clone()) +} + +type fragmentNode struct { + kind spec.LexKindName + tree CPTree +} + +func newFragmentNode(kind spec.LexKindName, t CPTree) *fragmentNode { + return &fragmentNode{ + kind: kind, + tree: t, + } +} + +func (n *fragmentNode) 
String() string { + return fmt.Sprintf("fragment: %v", n.kind) +} + +func (n *fragmentNode) Range() (rune, rune, bool) { + return n.tree.Range() +} + +func (n *fragmentNode) Optional() (CPTree, bool) { + return n.tree.Optional() +} + +func (n *fragmentNode) Repeatable() (CPTree, bool) { + return n.tree.Repeatable() +} + +func (n *fragmentNode) Concatenation() (CPTree, CPTree, bool) { + return n.tree.Concatenation() +} + +func (n *fragmentNode) Alternatives() (CPTree, CPTree, bool) { + return n.tree.Alternatives() +} + +func (n *fragmentNode) Describe() (spec.LexKindName, []spec.LexKindName, error) { + return spec.LexKindNameNil, nil, fmt.Errorf("%T cannot describe", n) +} + +func (n *fragmentNode) children() (CPTree, CPTree) { + return n.tree.children() +} + +func (n *fragmentNode) clone() CPTree { + if n.tree == nil { + return newFragmentNode(n.kind, nil) + } + return newFragmentNode(n.kind, n.tree.clone()) +} + +//nolint:unused +func printCPTree(w io.Writer, t CPTree, ruledLine string, childRuledLinePrefix string) { + if t == nil { + return + } + fmt.Fprintf(w, "%v%v\n", ruledLine, t) + children := []CPTree{} + switch n := t.(type) { + case *rootNode: + children = append(children, n.tree) + case *fragmentNode: + children = append(children, n.tree) + default: + left, right := t.children() + if left != nil { + children = append(children, left) + } + if right != nil { + children = append(children, right) + } + } + num := len(children) + for i, child := range children { + line := "└─ " + if num > 1 { + if i == 0 { + line = "├─ " + } else if i < num-1 { + line = "│ " + } + } + prefix := "│ " + if i >= num-1 { + prefix = " " + } + printCPTree(w, child, childRuledLinePrefix+line, childRuledLinePrefix+prefix) + } +} diff --git a/grammar/lr0.go b/grammar/lr0.go index dea5254..77ad2e0 100644 --- a/grammar/lr0.go +++ b/grammar/lr0.go @@ -3,6 +3,8 @@ package grammar import ( "fmt" "sort" + + "github.com/nihei9/vartan/grammar/symbol" ) type lr0Automaton struct { @@ -10,8 
+12,8 @@ type lr0Automaton struct { states map[kernelID]*lrState } -func genLR0Automaton(prods *productionSet, startSym symbol, errSym symbol) (*lr0Automaton, error) { - if !startSym.isStart() { +func genLR0Automaton(prods *productionSet, startSym symbol.Symbol, errSym symbol.Symbol) (*lr0Automaton, error) { + if !startSym.IsStart() { return nil, fmt.Errorf("passed symbold is not a start symbol") } @@ -67,7 +69,7 @@ func genLR0Automaton(prods *productionSet, startSym symbol, errSym symbol) (*lr0 return automaton, nil } -func genStateAndNeighbourKernels(k *kernel, prods *productionSet, errSym symbol) (*lrState, []*kernel, error) { +func genStateAndNeighbourKernels(k *kernel, prods *productionSet, errSym symbol.Symbol) (*lrState, []*kernel, error) { items, err := genLR0Closure(k, prods) if err != nil { return nil, nil, err @@ -77,7 +79,7 @@ func genStateAndNeighbourKernels(k *kernel, prods *productionSet, errSym symbol) return nil, nil, err } - next := map[symbol]kernelID{} + next := map[symbol.Symbol]kernelID{} kernels := []*kernel{} for _, n := range neighbours { next[n.symbol] = n.kernel.id @@ -125,7 +127,7 @@ func genLR0Closure(k *kernel, prods *productionSet) ([]*lrItem, error) { for len(uncheckedItems) > 0 { nextUncheckedItems := []*lrItem{} for _, item := range uncheckedItems { - if item.dottedSymbol.isTerminal() { + if item.dottedSymbol.IsTerminal() { continue } @@ -150,14 +152,14 @@ func genLR0Closure(k *kernel, prods *productionSet) ([]*lrItem, error) { } type neighbourKernel struct { - symbol symbol + symbol symbol.Symbol kernel *kernel } func genNeighbourKernels(items []*lrItem, prods *productionSet) ([]*neighbourKernel, error) { - kItemMap := map[symbol][]*lrItem{} + kItemMap := map[symbol.Symbol][]*lrItem{} for _, item := range items { - if item.dottedSymbol.isNil() { + if item.dottedSymbol.IsNil() { continue } prod, ok := prods.findByID(item.prod) @@ -171,7 +173,7 @@ func genNeighbourKernels(items []*lrItem, prods *productionSet) ([]*neighbourKer 
kItemMap[item.dottedSymbol] = append(kItemMap[item.dottedSymbol], kItem) } - nextSyms := []symbol{} + nextSyms := []symbol.Symbol{} for sym := range kItemMap { nextSyms = append(nextSyms, sym) } diff --git a/grammar/lr0_test.go b/grammar/lr0_test.go index 0d0b134..99d4e5b 100644 --- a/grammar/lr0_test.go +++ b/grammar/lr0_test.go @@ -5,12 +5,13 @@ import ( "strings" "testing" - spec "github.com/nihei9/vartan/spec/grammar" + "github.com/nihei9/vartan/grammar/symbol" + "github.com/nihei9/vartan/spec/grammar/parser" ) type expectedLRState struct { kernelItems []*lrItem - nextStates map[symbol][]*lrItem + nextStates map[symbol.Symbol][]*lrItem reducibleProds []*production emptyProdItems []*lrItem } @@ -41,15 +42,14 @@ id: "[A-Za-z_][0-9A-Za-z_]*"; var gram *Grammar var automaton *lr0Automaton { - ast, err := spec.Parse(strings.NewReader(src)) + ast, err := parser.Parse(strings.NewReader(src)) if err != nil { t.Fatal(err) } b := GrammarBuilder{ AST: ast, } - - gram, err = b.Build() + gram, err = b.build() if err != nil { t.Fatal(err) } @@ -118,7 +118,7 @@ id: "[A-Za-z_][0-9A-Za-z_]*"; expectedStates := []*expectedLRState{ { kernelItems: expectedKernels[0], - nextStates: map[symbol][]*lrItem{ + nextStates: map[symbol.Symbol][]*lrItem{ genSym("expr"): expectedKernels[1], genSym("term"): expectedKernels[2], genSym("factor"): expectedKernels[3], @@ -129,7 +129,7 @@ id: "[A-Za-z_][0-9A-Za-z_]*"; }, { kernelItems: expectedKernels[1], - nextStates: map[symbol][]*lrItem{ + nextStates: map[symbol.Symbol][]*lrItem{ genSym("add"): expectedKernels[6], }, reducibleProds: []*production{ @@ -138,7 +138,7 @@ id: "[A-Za-z_][0-9A-Za-z_]*"; }, { kernelItems: expectedKernels[2], - nextStates: map[symbol][]*lrItem{ + nextStates: map[symbol.Symbol][]*lrItem{ genSym("mul"): expectedKernels[7], }, reducibleProds: []*production{ @@ -147,14 +147,14 @@ id: "[A-Za-z_][0-9A-Za-z_]*"; }, { kernelItems: expectedKernels[3], - nextStates: map[symbol][]*lrItem{}, + nextStates: 
map[symbol.Symbol][]*lrItem{}, reducibleProds: []*production{ genProd("term", "factor"), }, }, { kernelItems: expectedKernels[4], - nextStates: map[symbol][]*lrItem{ + nextStates: map[symbol.Symbol][]*lrItem{ genSym("expr"): expectedKernels[8], genSym("term"): expectedKernels[2], genSym("factor"): expectedKernels[3], @@ -165,14 +165,14 @@ id: "[A-Za-z_][0-9A-Za-z_]*"; }, { kernelItems: expectedKernels[5], - nextStates: map[symbol][]*lrItem{}, + nextStates: map[symbol.Symbol][]*lrItem{}, reducibleProds: []*production{ genProd("factor", "id"), }, }, { kernelItems: expectedKernels[6], - nextStates: map[symbol][]*lrItem{ + nextStates: map[symbol.Symbol][]*lrItem{ genSym("term"): expectedKernels[9], genSym("factor"): expectedKernels[3], genSym("l_paren"): expectedKernels[4], @@ -182,7 +182,7 @@ id: "[A-Za-z_][0-9A-Za-z_]*"; }, { kernelItems: expectedKernels[7], - nextStates: map[symbol][]*lrItem{ + nextStates: map[symbol.Symbol][]*lrItem{ genSym("factor"): expectedKernels[10], genSym("l_paren"): expectedKernels[4], genSym("id"): expectedKernels[5], @@ -191,7 +191,7 @@ id: "[A-Za-z_][0-9A-Za-z_]*"; }, { kernelItems: expectedKernels[8], - nextStates: map[symbol][]*lrItem{ + nextStates: map[symbol.Symbol][]*lrItem{ genSym("add"): expectedKernels[6], genSym("r_paren"): expectedKernels[11], }, @@ -199,7 +199,7 @@ id: "[A-Za-z_][0-9A-Za-z_]*"; }, { kernelItems: expectedKernels[9], - nextStates: map[symbol][]*lrItem{ + nextStates: map[symbol.Symbol][]*lrItem{ genSym("mul"): expectedKernels[7], }, reducibleProds: []*production{ @@ -208,14 +208,14 @@ id: "[A-Za-z_][0-9A-Za-z_]*"; }, { kernelItems: expectedKernels[10], - nextStates: map[symbol][]*lrItem{}, + nextStates: map[symbol.Symbol][]*lrItem{}, reducibleProds: []*production{ genProd("term", "term", "mul", "factor"), }, }, { kernelItems: expectedKernels[11], - nextStates: map[symbol][]*lrItem{}, + nextStates: map[symbol.Symbol][]*lrItem{}, reducibleProds: []*production{ genProd("factor", "l_paren", "expr", "r_paren"), }, @@ 
-246,7 +246,7 @@ b: "bar"; var gram *Grammar var automaton *lr0Automaton { - ast, err := spec.Parse(strings.NewReader(src)) + ast, err := parser.Parse(strings.NewReader(src)) if err != nil { t.Fatal(err) } @@ -254,7 +254,7 @@ b: "bar"; b := GrammarBuilder{ AST: ast, } - gram, err = b.Build() + gram, err = b.build() if err != nil { t.Fatal(err) } @@ -298,7 +298,7 @@ b: "bar"; expectedStates := []*expectedLRState{ { kernelItems: expectedKernels[0], - nextStates: map[symbol][]*lrItem{ + nextStates: map[symbol.Symbol][]*lrItem{ genSym("s"): expectedKernels[1], genSym("foo"): expectedKernels[2], }, @@ -311,14 +311,14 @@ b: "bar"; }, { kernelItems: expectedKernels[1], - nextStates: map[symbol][]*lrItem{}, + nextStates: map[symbol.Symbol][]*lrItem{}, reducibleProds: []*production{ genProd("s'", "s"), }, }, { kernelItems: expectedKernels[2], - nextStates: map[symbol][]*lrItem{ + nextStates: map[symbol.Symbol][]*lrItem{ genSym("bar"): expectedKernels[3], genSym("b"): expectedKernels[4], }, @@ -331,14 +331,14 @@ b: "bar"; }, { kernelItems: expectedKernels[3], - nextStates: map[symbol][]*lrItem{}, + nextStates: map[symbol.Symbol][]*lrItem{}, reducibleProds: []*production{ genProd("s", "foo", "bar"), }, }, { kernelItems: expectedKernels[4], - nextStates: map[symbol][]*lrItem{}, + nextStates: map[symbol.Symbol][]*lrItem{}, reducibleProds: []*production{ genProd("bar", "b"), }, diff --git a/grammar/parsing_table.go b/grammar/parsing_table.go index 93033a3..53f692e 100644 --- a/grammar/parsing_table.go +++ b/grammar/parsing_table.go @@ -4,6 +4,7 @@ import ( "fmt" "sort" + "github.com/nihei9/vartan/grammar/symbol" spec "github.com/nihei9/vartan/spec/grammar" ) @@ -82,7 +83,7 @@ type conflict interface { type shiftReduceConflict struct { state stateNum - sym symbol + sym symbol.Symbol nextState stateNum prodNum productionNum resolvedBy conflictResolutionMethod @@ -93,7 +94,7 @@ func (c *shiftReduceConflict) conflict() { type reduceReduceConflict struct { state stateNum - sym symbol 
+ sym symbol.Symbol prodNum1 productionNum prodNum2 productionNum resolvedBy conflictResolutionMethod @@ -123,12 +124,12 @@ type ParsingTable struct { InitialState stateNum } -func (t *ParsingTable) getAction(state stateNum, sym symbolNum) (ActionType, stateNum, productionNum) { +func (t *ParsingTable) getAction(state stateNum, sym symbol.SymbolNum) (ActionType, stateNum, productionNum) { pos := state.Int()*t.terminalCount + sym.Int() return t.actionTable[pos].describe() } -func (t *ParsingTable) getGoTo(state stateNum, sym symbolNum) (GoToType, stateNum) { +func (t *ParsingTable) getGoTo(state stateNum, sym symbol.SymbolNum) (GoToType, stateNum) { pos := state.Int()*t.nonTerminalCount + sym.Int() return t.goToTable[pos].describe() } @@ -141,8 +142,8 @@ func (t *ParsingTable) writeAction(row int, col int, act actionEntry) { t.actionTable[row*t.terminalCount+col] = act } -func (t *ParsingTable) writeGoTo(state stateNum, sym symbol, nextState stateNum) { - pos := state.Int()*t.nonTerminalCount + sym.num().Int() +func (t *ParsingTable) writeGoTo(state stateNum, sym symbol.Symbol, nextState stateNum) { + pos := state.Int()*t.nonTerminalCount + sym.Num().Int() t.goToTable[pos] = newGoToEntry(nextState) } @@ -151,7 +152,7 @@ type lrTableBuilder struct { prods *productionSet termCount int nonTermCount int - symTab *symbolTableReader + symTab *symbol.SymbolTableReader precAndAssoc *precAndAssoc conflicts []conflict @@ -179,7 +180,7 @@ func (b *lrTableBuilder) build() (*ParsingTable, error) { for sym, kID := range state.next { nextState := b.automaton.states[kID] - if sym.isTerminal() { + if sym.IsTerminal() { b.writeShiftAction(ptab, state.num, sym, nextState.num) } else { ptab.writeGoTo(state.num, sym, nextState.num) @@ -226,12 +227,12 @@ func (b *lrTableBuilder) build() (*ParsingTable, error) { // writeShiftAction writes a shift action to the parsing table. When a shift/reduce conflict occurred, // we prioritize the shift action. 
-func (b *lrTableBuilder) writeShiftAction(tab *ParsingTable, state stateNum, sym symbol, nextState stateNum) { - act := tab.readAction(state.Int(), sym.num().Int()) +func (b *lrTableBuilder) writeShiftAction(tab *ParsingTable, state stateNum, sym symbol.Symbol, nextState stateNum) { + act := tab.readAction(state.Int(), sym.Num().Int()) if !act.isEmpty() { ty, _, p := act.describe() if ty == ActionTypeReduce { - act, method := b.resolveSRConflict(sym.num(), p) + act, method := b.resolveSRConflict(sym.Num(), p) b.conflicts = append(b.conflicts, &shiftReduceConflict{ state: state, sym: sym, @@ -240,19 +241,19 @@ func (b *lrTableBuilder) writeShiftAction(tab *ParsingTable, state stateNum, sym resolvedBy: method, }) if act == ActionTypeShift { - tab.writeAction(state.Int(), sym.num().Int(), newShiftActionEntry(nextState)) + tab.writeAction(state.Int(), sym.Num().Int(), newShiftActionEntry(nextState)) } return } } - tab.writeAction(state.Int(), sym.num().Int(), newShiftActionEntry(nextState)) + tab.writeAction(state.Int(), sym.Num().Int(), newShiftActionEntry(nextState)) } // writeReduceAction writes a reduce action to the parsing table. When a shift/reduce conflict occurred, // we prioritize the shift action, and when a reduce/reduce conflict we prioritize the action that reduces // the production with higher priority. Productions defined earlier in the grammar file have a higher priority. 
-func (b *lrTableBuilder) writeReduceAction(tab *ParsingTable, state stateNum, sym symbol, prod productionNum) { - act := tab.readAction(state.Int(), sym.num().Int()) +func (b *lrTableBuilder) writeReduceAction(tab *ParsingTable, state stateNum, sym symbol.Symbol, prod productionNum) { + act := tab.readAction(state.Int(), sym.Num().Int()) if !act.isEmpty() { ty, s, p := act.describe() switch ty { @@ -269,12 +270,12 @@ func (b *lrTableBuilder) writeReduceAction(tab *ParsingTable, state stateNum, sy resolvedBy: ResolvedByProdOrder, }) if p < prod { - tab.writeAction(state.Int(), sym.num().Int(), newReduceActionEntry(p)) + tab.writeAction(state.Int(), sym.Num().Int(), newReduceActionEntry(p)) } else { - tab.writeAction(state.Int(), sym.num().Int(), newReduceActionEntry(prod)) + tab.writeAction(state.Int(), sym.Num().Int(), newReduceActionEntry(prod)) } case ActionTypeShift: - act, method := b.resolveSRConflict(sym.num(), prod) + act, method := b.resolveSRConflict(sym.Num(), prod) b.conflicts = append(b.conflicts, &shiftReduceConflict{ state: state, sym: sym, @@ -283,15 +284,15 @@ func (b *lrTableBuilder) writeReduceAction(tab *ParsingTable, state stateNum, sy resolvedBy: method, }) if act == ActionTypeReduce { - tab.writeAction(state.Int(), sym.num().Int(), newReduceActionEntry(prod)) + tab.writeAction(state.Int(), sym.Num().Int(), newReduceActionEntry(prod)) } } return } - tab.writeAction(state.Int(), sym.num().Int(), newReduceActionEntry(prod)) + tab.writeAction(state.Int(), sym.Num().Int(), newReduceActionEntry(prod)) } -func (b *lrTableBuilder) resolveSRConflict(sym symbolNum, prod productionNum) (ActionType, conflictResolutionMethod) { +func (b *lrTableBuilder) resolveSRConflict(sym symbol.SymbolNum, prod productionNum) (ActionType, conflictResolutionMethod) { symPrec := b.precAndAssoc.terminalPrecedence(sym) prodPrec := b.precAndAssoc.productionPredence(prod) if symPrec == 0 || prodPrec == 0 { @@ -313,26 +314,26 @@ func (b *lrTableBuilder) resolveSRConflict(sym 
symbolNum, prod productionNum) (A func (b *lrTableBuilder) genReport(tab *ParsingTable, gram *Grammar) (*spec.Report, error) { var terms []*spec.Terminal { - termSyms := b.symTab.terminalSymbols() + termSyms := b.symTab.TerminalSymbols() terms = make([]*spec.Terminal, len(termSyms)+1) for _, sym := range termSyms { - name, ok := b.symTab.toText(sym) + name, ok := b.symTab.ToText(sym) if !ok { return nil, fmt.Errorf("failed to generate terminals: symbol not found: %v", sym) } term := &spec.Terminal{ - Number: sym.num().Int(), + Number: sym.Num().Int(), Name: name, } - prec := b.precAndAssoc.terminalPrecedence(sym.num()) + prec := b.precAndAssoc.terminalPrecedence(sym.Num()) if prec != precNil { term.Precedence = prec } - assoc := b.precAndAssoc.terminalAssociativity(sym.num()) + assoc := b.precAndAssoc.terminalAssociativity(sym.Num()) switch assoc { case assocTypeLeft: term.Associativity = "l" @@ -340,22 +341,22 @@ func (b *lrTableBuilder) genReport(tab *ParsingTable, gram *Grammar) (*spec.Repo term.Associativity = "r" } - terms[sym.num()] = term + terms[sym.Num()] = term } } var nonTerms []*spec.NonTerminal { - nonTermSyms := b.symTab.nonTerminalSymbols() + nonTermSyms := b.symTab.NonTerminalSymbols() nonTerms = make([]*spec.NonTerminal, len(nonTermSyms)+1) for _, sym := range nonTermSyms { - name, ok := b.symTab.toText(sym) + name, ok := b.symTab.ToText(sym) if !ok { return nil, fmt.Errorf("failed to generate non-terminals: symbol not found: %v", sym) } - nonTerms[sym.num()] = &spec.NonTerminal{ - Number: sym.num().Int(), + nonTerms[sym.Num()] = &spec.NonTerminal{ + Number: sym.Num().Int(), Name: name, } } @@ -368,16 +369,16 @@ func (b *lrTableBuilder) genReport(tab *ParsingTable, gram *Grammar) (*spec.Repo for _, p := range ps { rhs := make([]int, len(p.rhs)) for i, e := range p.rhs { - if e.isTerminal() { - rhs[i] = e.num().Int() + if e.IsTerminal() { + rhs[i] = e.Num().Int() } else { - rhs[i] = e.num().Int() * -1 + rhs[i] = e.Num().Int() * -1 } } prod := 
&spec.Production{ Number: p.num.Int(), - LHS: p.lhs.num().Int(), + LHS: p.lhs.Num().Int(), RHS: rhs, } @@ -441,33 +442,33 @@ func (b *lrTableBuilder) genReport(tab *ParsingTable, gram *Grammar) (*spec.Repo var goTo []*spec.Transition { TERMINALS_LOOP: - for _, t := range b.symTab.terminalSymbols() { - act, next, prod := tab.getAction(s.num, t.num()) + for _, t := range b.symTab.TerminalSymbols() { + act, next, prod := tab.getAction(s.num, t.Num()) switch act { case ActionTypeShift: shift = append(shift, &spec.Transition{ - Symbol: t.num().Int(), + Symbol: t.Num().Int(), State: next.Int(), }) case ActionTypeReduce: for _, r := range reduce { if r.Production == prod.Int() { - r.LookAhead = append(r.LookAhead, t.num().Int()) + r.LookAhead = append(r.LookAhead, t.Num().Int()) continue TERMINALS_LOOP } } reduce = append(reduce, &spec.Reduce{ - LookAhead: []int{t.num().Int()}, + LookAhead: []int{t.Num().Int()}, Production: prod.Int(), }) } } - for _, n := range b.symTab.nonTerminalSymbols() { - ty, next := tab.getGoTo(s.num, n.num()) + for _, n := range b.symTab.NonTerminalSymbols() { + ty, next := tab.getGoTo(s.num, n.Num()) if ty == GoToTypeRegistered { goTo = append(goTo, &spec.Transition{ - Symbol: n.num().Int(), + Symbol: n.Num().Int(), State: next.Int(), }) } @@ -489,13 +490,13 @@ func (b *lrTableBuilder) genReport(tab *ParsingTable, gram *Grammar) (*spec.Repo { for _, c := range srConflicts[s.num] { conflict := &spec.SRConflict{ - Symbol: c.sym.num().Int(), + Symbol: c.sym.Num().Int(), State: c.nextState.Int(), Production: c.prodNum.Int(), ResolvedBy: c.resolvedBy.Int(), } - ty, s, p := tab.getAction(s.num, c.sym.num()) + ty, s, p := tab.getAction(s.num, c.sym.Num()) switch ty { case ActionTypeShift: n := s.Int() @@ -514,13 +515,13 @@ func (b *lrTableBuilder) genReport(tab *ParsingTable, gram *Grammar) (*spec.Repo for _, c := range rrConflicts[s.num] { conflict := &spec.RRConflict{ - Symbol: c.sym.num().Int(), + Symbol: c.sym.Num().Int(), Production1: 
c.prodNum1.Int(), Production2: c.prodNum2.Int(), ResolvedBy: c.resolvedBy.Int(), } - _, _, p := tab.getAction(s.num, c.sym.num()) + _, _, p := tab.getAction(s.num, c.sym.Num()) conflict.AdoptedProduction = p.Int() rr = append(rr, conflict) diff --git a/grammar/parsing_table_test.go b/grammar/parsing_table_test.go index fe56722..ae829e6 100644 --- a/grammar/parsing_table_test.go +++ b/grammar/parsing_table_test.go @@ -5,13 +5,14 @@ import ( "strings" "testing" - spec "github.com/nihei9/vartan/spec/grammar" + "github.com/nihei9/vartan/grammar/symbol" + "github.com/nihei9/vartan/spec/grammar/parser" ) type expectedState struct { kernelItems []*lrItem - acts map[symbol]testActionEntry - goTos map[symbol][]*lrItem + acts map[symbol.Symbol]testActionEntry + goTos map[symbol.Symbol][]*lrItem } func TestGenLALRParsingTable(t *testing.T) { @@ -32,14 +33,14 @@ id: "[A-Za-z0-9_]+"; var nonTermCount int var termCount int { - ast, err := spec.Parse(strings.NewReader(src)) + ast, err := parser.Parse(strings.NewReader(src)) if err != nil { t.Fatal(err) } b := GrammarBuilder{ AST: ast, } - gram, err = b.Build() + gram, err = b.build() if err != nil { t.Fatal(err) } @@ -56,11 +57,11 @@ id: "[A-Za-z0-9_]+"; t.Fatal(err) } - nonTermTexts, err := gram.symbolTable.nonTerminalTexts() + nonTermTexts, err := gram.symbolTable.NonTerminalTexts() if err != nil { t.Fatal(err) } - termTexts, err := gram.symbolTable.terminalTexts() + termTexts, err := gram.symbolTable.TerminalTexts() if err != nil { t.Fatal(err) } @@ -89,42 +90,42 @@ id: "[A-Za-z0-9_]+"; expectedKernels := map[int][]*lrItem{ 0: { - withLookAhead(genLR0Item("s'", 0, "s"), symbolEOF), + withLookAhead(genLR0Item("s'", 0, "s"), symbol.SymbolEOF), }, 1: { - withLookAhead(genLR0Item("s'", 1, "s"), symbolEOF), + withLookAhead(genLR0Item("s'", 1, "s"), symbol.SymbolEOF), }, 2: { - withLookAhead(genLR0Item("s", 1, "l", "eq", "r"), symbolEOF), - withLookAhead(genLR0Item("r", 1, "l"), symbolEOF), + withLookAhead(genLR0Item("s", 1, "l", 
"eq", "r"), symbol.SymbolEOF), + withLookAhead(genLR0Item("r", 1, "l"), symbol.SymbolEOF), }, 3: { - withLookAhead(genLR0Item("s", 1, "r"), symbolEOF), + withLookAhead(genLR0Item("s", 1, "r"), symbol.SymbolEOF), }, 4: { - withLookAhead(genLR0Item("l", 1, "ref", "r"), genSym("eq"), symbolEOF), + withLookAhead(genLR0Item("l", 1, "ref", "r"), genSym("eq"), symbol.SymbolEOF), }, 5: { - withLookAhead(genLR0Item("l", 1, "id"), genSym("eq"), symbolEOF), + withLookAhead(genLR0Item("l", 1, "id"), genSym("eq"), symbol.SymbolEOF), }, 6: { - withLookAhead(genLR0Item("s", 2, "l", "eq", "r"), symbolEOF), + withLookAhead(genLR0Item("s", 2, "l", "eq", "r"), symbol.SymbolEOF), }, 7: { - withLookAhead(genLR0Item("l", 2, "ref", "r"), genSym("eq"), symbolEOF), + withLookAhead(genLR0Item("l", 2, "ref", "r"), genSym("eq"), symbol.SymbolEOF), }, 8: { - withLookAhead(genLR0Item("r", 1, "l"), genSym("eq"), symbolEOF), + withLookAhead(genLR0Item("r", 1, "l"), genSym("eq"), symbol.SymbolEOF), }, 9: { - withLookAhead(genLR0Item("s", 3, "l", "eq", "r"), symbolEOF), + withLookAhead(genLR0Item("s", 3, "l", "eq", "r"), symbol.SymbolEOF), }, } expectedStates := []expectedState{ { kernelItems: expectedKernels[0], - acts: map[symbol]testActionEntry{ + acts: map[symbol.Symbol]testActionEntry{ genSym("ref"): { ty: ActionTypeShift, nextState: expectedKernels[4], @@ -134,7 +135,7 @@ id: "[A-Za-z0-9_]+"; nextState: expectedKernels[5], }, }, - goTos: map[symbol][]*lrItem{ + goTos: map[symbol.Symbol][]*lrItem{ genSym("s"): expectedKernels[1], genSym("l"): expectedKernels[2], genSym("r"): expectedKernels[3], @@ -142,8 +143,8 @@ id: "[A-Za-z0-9_]+"; }, { kernelItems: expectedKernels[1], - acts: map[symbol]testActionEntry{ - symbolEOF: { + acts: map[symbol.Symbol]testActionEntry{ + symbol.SymbolEOF: { ty: ActionTypeReduce, production: genProd("s'", "s"), }, @@ -151,12 +152,12 @@ id: "[A-Za-z0-9_]+"; }, { kernelItems: expectedKernels[2], - acts: map[symbol]testActionEntry{ + acts: 
map[symbol.Symbol]testActionEntry{ genSym("eq"): { ty: ActionTypeShift, nextState: expectedKernels[6], }, - symbolEOF: { + symbol.SymbolEOF: { ty: ActionTypeReduce, production: genProd("r", "l"), }, @@ -164,8 +165,8 @@ id: "[A-Za-z0-9_]+"; }, { kernelItems: expectedKernels[3], - acts: map[symbol]testActionEntry{ - symbolEOF: { + acts: map[symbol.Symbol]testActionEntry{ + symbol.SymbolEOF: { ty: ActionTypeReduce, production: genProd("s", "r"), }, @@ -173,7 +174,7 @@ id: "[A-Za-z0-9_]+"; }, { kernelItems: expectedKernels[4], - acts: map[symbol]testActionEntry{ + acts: map[symbol.Symbol]testActionEntry{ genSym("ref"): { ty: ActionTypeShift, nextState: expectedKernels[4], @@ -183,19 +184,19 @@ id: "[A-Za-z0-9_]+"; nextState: expectedKernels[5], }, }, - goTos: map[symbol][]*lrItem{ + goTos: map[symbol.Symbol][]*lrItem{ genSym("r"): expectedKernels[7], genSym("l"): expectedKernels[8], }, }, { kernelItems: expectedKernels[5], - acts: map[symbol]testActionEntry{ + acts: map[symbol.Symbol]testActionEntry{ genSym("eq"): { ty: ActionTypeReduce, production: genProd("l", "id"), }, - symbolEOF: { + symbol.SymbolEOF: { ty: ActionTypeReduce, production: genProd("l", "id"), }, @@ -203,7 +204,7 @@ id: "[A-Za-z0-9_]+"; }, { kernelItems: expectedKernels[6], - acts: map[symbol]testActionEntry{ + acts: map[symbol.Symbol]testActionEntry{ genSym("ref"): { ty: ActionTypeShift, nextState: expectedKernels[4], @@ -213,19 +214,19 @@ id: "[A-Za-z0-9_]+"; nextState: expectedKernels[5], }, }, - goTos: map[symbol][]*lrItem{ + goTos: map[symbol.Symbol][]*lrItem{ genSym("l"): expectedKernels[8], genSym("r"): expectedKernels[9], }, }, { kernelItems: expectedKernels[7], - acts: map[symbol]testActionEntry{ + acts: map[symbol.Symbol]testActionEntry{ genSym("eq"): { ty: ActionTypeReduce, production: genProd("l", "ref", "r"), }, - symbolEOF: { + symbol.SymbolEOF: { ty: ActionTypeReduce, production: genProd("l", "ref", "r"), }, @@ -233,12 +234,12 @@ id: "[A-Za-z0-9_]+"; }, { kernelItems: 
expectedKernels[8], - acts: map[symbol]testActionEntry{ + acts: map[symbol.Symbol]testActionEntry{ genSym("eq"): { ty: ActionTypeReduce, production: genProd("r", "l"), }, - symbolEOF: { + symbol.SymbolEOF: { ty: ActionTypeReduce, production: genProd("r", "l"), }, @@ -246,8 +247,8 @@ id: "[A-Za-z0-9_]+"; }, { kernelItems: expectedKernels[9], - acts: map[symbol]testActionEntry{ - symbolEOF: { + acts: map[symbol.Symbol]testActionEntry{ + symbol.SymbolEOF: { ty: ActionTypeReduce, production: genProd("s", "l", "eq", "r"), }, @@ -287,11 +288,11 @@ id: "[A-Za-z0-9_]+"; } func testAction(t *testing.T, expectedState *expectedState, state *lrState, ptab *ParsingTable, automaton *lr0Automaton, gram *Grammar, termCount int) { - nonEmptyEntries := map[symbolNum]struct{}{} + nonEmptyEntries := map[symbol.SymbolNum]struct{}{} for eSym, eAct := range expectedState.acts { - nonEmptyEntries[eSym.num()] = struct{}{} + nonEmptyEntries[eSym.Num()] = struct{}{} - ty, stateNum, prodNum := ptab.getAction(state.num, eSym.num()) + ty, stateNum, prodNum := ptab.getAction(state.num, eSym.Num()) if ty != eAct.ty { t.Fatalf("action type is mismatched; want: %v, got: %v", eAct.ty, ty) } @@ -319,10 +320,10 @@ func testAction(t *testing.T, expectedState *expectedState, state *lrState, ptab } } for symNum := 0; symNum < termCount; symNum++ { - if _, checked := nonEmptyEntries[symbolNum(symNum)]; checked { + if _, checked := nonEmptyEntries[symbol.SymbolNum(symNum)]; checked { continue } - ty, stateNum, prodNum := ptab.getAction(state.num, symbolNum(symNum)) + ty, stateNum, prodNum := ptab.getAction(state.num, symbol.SymbolNum(symNum)) if ty != ActionTypeError { t.Errorf("unexpected ACTION entry; state: #%v, symbol: #%v, action type: %v, next state: #%v, prodction: #%v", state.num, symNum, ty, stateNum, prodNum) } @@ -330,15 +331,15 @@ func testAction(t *testing.T, expectedState *expectedState, state *lrState, ptab } func testGoTo(t *testing.T, expectedState *expectedState, state *lrState, ptab 
*ParsingTable, automaton *lr0Automaton, nonTermCount int) { - nonEmptyEntries := map[symbolNum]struct{}{} + nonEmptyEntries := map[symbol.SymbolNum]struct{}{} for eSym, eGoTo := range expectedState.goTos { - nonEmptyEntries[eSym.num()] = struct{}{} + nonEmptyEntries[eSym.Num()] = struct{}{} eNextState, err := newKernel(eGoTo) if err != nil { t.Fatal(err) } - ty, stateNum := ptab.getGoTo(state.num, eSym.num()) + ty, stateNum := ptab.getGoTo(state.num, eSym.Num()) if ty != GoToTypeRegistered { t.Fatalf("GOTO entry was not found; state: #%v, symbol: #%v", state.num, eSym) } @@ -351,10 +352,10 @@ func testGoTo(t *testing.T, expectedState *expectedState, state *lrState, ptab * } } for symNum := 0; symNum < nonTermCount; symNum++ { - if _, checked := nonEmptyEntries[symbolNum(symNum)]; checked { + if _, checked := nonEmptyEntries[symbol.SymbolNum(symNum)]; checked { continue } - ty, _ := ptab.getGoTo(state.num, symbolNum(symNum)) + ty, _ := ptab.getGoTo(state.num, symbol.SymbolNum(symNum)) if ty != GoToTypeError { t.Errorf("unexpected GOTO entry; state: #%v, symbol: #%v", state.num, symNum) } diff --git a/grammar/production.go b/grammar/production.go index 87b392f..1978039 100644 --- a/grammar/production.go +++ b/grammar/production.go @@ -4,6 +4,8 @@ import ( "crypto/sha256" "encoding/hex" "fmt" + + "github.com/nihei9/vartan/grammar/symbol" ) type productionID [32]byte @@ -12,10 +14,10 @@ func (id productionID) String() string { return hex.EncodeToString(id[:]) } -func genProductionID(lhs symbol, rhs []symbol) productionID { - seq := lhs.byte() +func genProductionID(lhs symbol.Symbol, rhs []symbol.Symbol) productionID { + seq := lhs.Byte() for _, sym := range rhs { - seq = append(seq, sym.byte()...) + seq = append(seq, sym.Byte()...) 
} return productionID(sha256.Sum256(seq)) } @@ -35,17 +37,17 @@ func (n productionNum) Int() int { type production struct { id productionID num productionNum - lhs symbol - rhs []symbol + lhs symbol.Symbol + rhs []symbol.Symbol rhsLen int } -func newProduction(lhs symbol, rhs []symbol) (*production, error) { - if lhs.isNil() { +func newProduction(lhs symbol.Symbol, rhs []symbol.Symbol) (*production, error) { + if lhs.IsNil() { return nil, fmt.Errorf("LHS must be a non-nil symbol; LHS: %v, RHS: %v", lhs, rhs) } for _, sym := range rhs { - if sym.isNil() { + if sym.IsNil() { return nil, fmt.Errorf("a symbol of RHS must be a non-nil symbol; LHS: %v, RHS: %v", lhs, rhs) } } @@ -63,14 +65,14 @@ func (p *production) isEmpty() bool { } type productionSet struct { - lhs2Prods map[symbol][]*production + lhs2Prods map[symbol.Symbol][]*production id2Prod map[productionID]*production num productionNum } func newProductionSet() *productionSet { return &productionSet{ - lhs2Prods: map[symbol][]*production{}, + lhs2Prods: map[symbol.Symbol][]*production{}, id2Prod: map[productionID]*production{}, num: productionNumMin, } @@ -81,7 +83,7 @@ func (ps *productionSet) append(prod *production) { return } - if prod.lhs.isStart() { + if prod.lhs.IsStart() { prod.num = productionNumStart } else { prod.num = ps.num @@ -101,8 +103,8 @@ func (ps *productionSet) findByID(id productionID) (*production, bool) { return prod, ok } -func (ps *productionSet) findByLHS(lhs symbol) ([]*production, bool) { - if lhs.isNil() { +func (ps *productionSet) findByLHS(lhs symbol.Symbol) ([]*production, bool) { + if lhs.IsNil() { return nil, false } diff --git a/grammar/semantic_error.go b/grammar/semantic_error.go index 589e324..88a6b17 100644 --- a/grammar/semantic_error.go +++ b/grammar/semantic_error.go @@ -1,42 +1,30 @@ package grammar -type SemanticError struct { - message string -} - -func newSemanticError(message string) *SemanticError { - return &SemanticError{ - message: message, - } -} - -func (e 
*SemanticError) Error() string { - return e.message -} +import "errors" var ( - semErrNoGrammarName = newSemanticError("name is missing") - semErrSpellingInconsistency = newSemanticError("the identifiers are treated as the same. please use the same spelling") - semErrDuplicateAssoc = newSemanticError("associativity and precedence cannot be specified multiple times for a symbol") - semErrUndefinedPrec = newSemanticError("symbol must has precedence") - semErrUndefinedOrdSym = newSemanticError("undefined ordered symbol") - semErrUnusedProduction = newSemanticError("unused production") - semErrUnusedTerminal = newSemanticError("unused terminal") - semErrTermCannotBeSkipped = newSemanticError("a terminal used in productions cannot be skipped") - semErrNoProduction = newSemanticError("a grammar needs at least one production") - semErrUndefinedSym = newSemanticError("undefined symbol") - semErrDuplicateProduction = newSemanticError("duplicate production") - semErrDuplicateTerminal = newSemanticError("duplicate terminal") - semErrDuplicateFragment = newSemanticError("duplicate fragment") - semErrDuplicateName = newSemanticError("duplicate names are not allowed between terminals and non-terminals") - semErrErrSymIsReserved = newSemanticError("symbol 'error' is reserved as a terminal symbol") - semErrDuplicateLabel = newSemanticError("a label must be unique in an alternative") - semErrInvalidLabel = newSemanticError("a label must differ from terminal symbols or non-terminal symbols") - semErrDirInvalidName = newSemanticError("invalid directive name") - semErrDirInvalidParam = newSemanticError("invalid parameter") - semErrDuplicateDir = newSemanticError("a directive must not be duplicated") - semErrDuplicateElem = newSemanticError("duplicate element") - semErrAmbiguousElem = newSemanticError("ambiguous element") - semErrInvalidProdDir = newSemanticError("invalid production directive") - semErrInvalidAltDir = newSemanticError("invalid alternative directive") + 
semErrNoGrammarName = errors.New("name is missing") + semErrSpellingInconsistency = errors.New("the identifiers are treated as the same. please use the same spelling") + semErrDuplicateAssoc = errors.New("associativity and precedence cannot be specified multiple times for a symbol") + semErrUndefinedPrec = errors.New("symbol must has precedence") + semErrUndefinedOrdSym = errors.New("undefined ordered symbol") + semErrUnusedProduction = errors.New("unused production") + semErrUnusedTerminal = errors.New("unused terminal") + semErrTermCannotBeSkipped = errors.New("a terminal used in productions cannot be skipped") + semErrNoProduction = errors.New("a grammar needs at least one production") + semErrUndefinedSym = errors.New("undefined symbol") + semErrDuplicateProduction = errors.New("duplicate production") + semErrDuplicateTerminal = errors.New("duplicate terminal") + semErrDuplicateFragment = errors.New("duplicate fragment") + semErrDuplicateName = errors.New("duplicate names are not allowed between terminals and non-terminals") + semErrErrSymIsReserved = errors.New("symbol 'error' is reserved as a terminal symbol") + semErrDuplicateLabel = errors.New("a label must be unique in an alternative") + semErrInvalidLabel = errors.New("a label must differ from terminal symbols or non-terminal symbols") + semErrDirInvalidName = errors.New("invalid directive name") + semErrDirInvalidParam = errors.New("invalid parameter") + semErrDuplicateDir = errors.New("a directive must not be duplicated") + semErrDuplicateElem = errors.New("duplicate element") + semErrAmbiguousElem = errors.New("ambiguous element") + semErrInvalidProdDir = errors.New("invalid production directive") + semErrInvalidAltDir = errors.New("invalid alternative directive") ) diff --git a/grammar/symbol.go b/grammar/symbol/symbol.go index 9eba032..f9e6a93 100644 --- a/grammar/symbol.go +++ b/grammar/symbol/symbol.go @@ -1,4 +1,4 @@ -package grammar +package symbol import ( "fmt" @@ -16,15 +16,15 @@ func (t 
symbolKind) String() string { return string(t) } -type symbolNum uint16 +type SymbolNum uint16 -func (n symbolNum) Int() int { +func (n SymbolNum) Int() int { return int(n) } -type symbol uint16 +type Symbol uint16 -func (s symbol) String() string { +func (s Symbol) String() string { kind, isStart, isEOF, num := s.describe() var prefix string switch { @@ -56,24 +56,24 @@ const ( symbolNumStart = uint16(0x0001) // 0000 0000 0000 0001 symbolNumEOF = uint16(0x0001) // 0000 0000 0000 0001 - symbolNil = symbol(0) // 0000 0000 0000 0000 - symbolStart = symbol(maskNonTerminal | maskStartOrEOF | symbolNumStart) // 0100 0000 0000 0001 - symbolEOF = symbol(maskTerminal | maskStartOrEOF | symbolNumEOF) // 1100 0000 0000 0001: The EOF symbol is treated as a terminal symbol. + SymbolNil = Symbol(0) // 0000 0000 0000 0000 + symbolStart = Symbol(maskNonTerminal | maskStartOrEOF | symbolNumStart) // 0100 0000 0000 0001 + SymbolEOF = Symbol(maskTerminal | maskStartOrEOF | symbolNumEOF) // 1100 0000 0000 0001: The EOF symbol is treated as a terminal symbol. // The symbol name contains `<` and `>` to avoid conflicting with user-defined symbols. symbolNameEOF = "<eof>" - nonTerminalNumMin = symbolNum(2) // The number 1 is used by a start symbol. - terminalNumMin = symbolNum(2) // The number 1 is used by the EOF symbol. - symbolNumMax = symbolNum(0xffff) >> 2 // 0011 1111 1111 1111 + nonTerminalNumMin = SymbolNum(2) // The number 1 is used by a start symbol. + terminalNumMin = SymbolNum(2) // The number 1 is used by the EOF symbol. 
+ symbolNumMax = SymbolNum(0xffff) >> 2 // 0011 1111 1111 1111 ) -func newSymbol(kind symbolKind, isStart bool, num symbolNum) (symbol, error) { +func newSymbol(kind symbolKind, isStart bool, num SymbolNum) (Symbol, error) { if num > symbolNumMax { - return symbolNil, fmt.Errorf("a symbol number exceeds the limit; limit: %v, passed: %v", symbolNumMax, num) + return SymbolNil, fmt.Errorf("a symbol number exceeds the limit; limit: %v, passed: %v", symbolNumMax, num) } if kind == symbolKindTerminal && isStart { - return symbolNil, fmt.Errorf("a start symbol must be a non-terminal symbol") + return SymbolNil, fmt.Errorf("a start symbol must be a non-terminal symbol") } kindMask := maskNonTerminal @@ -84,58 +84,58 @@ func newSymbol(kind symbolKind, isStart bool, num symbolNum) (symbol, error) { if isStart { startMask = maskStartOrEOF } - return symbol(kindMask | startMask | uint16(num)), nil + return Symbol(kindMask | startMask | uint16(num)), nil } -func (s symbol) num() symbolNum { +func (s Symbol) Num() SymbolNum { _, _, _, num := s.describe() return num } -func (s symbol) byte() []byte { - if s.isNil() { +func (s Symbol) Byte() []byte { + if s.IsNil() { return []byte{0, 0} } return []byte{byte(uint16(s) >> 8), byte(uint16(s) & 0x00ff)} } -func (s symbol) isNil() bool { +func (s Symbol) IsNil() bool { _, _, _, num := s.describe() return num == 0 } -func (s symbol) isStart() bool { - if s.isNil() { +func (s Symbol) IsStart() bool { + if s.IsNil() { return false } _, isStart, _, _ := s.describe() return isStart } -func (s symbol) isEOF() bool { - if s.isNil() { +func (s Symbol) isEOF() bool { + if s.IsNil() { return false } _, _, isEOF, _ := s.describe() return isEOF } -func (s symbol) isNonTerminal() bool { - if s.isNil() { +func (s Symbol) isNonTerminal() bool { + if s.IsNil() { return false } kind, _, _, _ := s.describe() return kind == symbolKindNonTerminal } -func (s symbol) isTerminal() bool { - if s.isNil() { +func (s Symbol) IsTerminal() bool { + if s.IsNil() { 
return false } return !s.isNonTerminal() } -func (s symbol) describe() (symbolKind, bool, bool, symbolNum) { +func (s Symbol) describe() (symbolKind, bool, bool, SymbolNum) { kind := symbolKindNonTerminal if uint16(s)&maskKindPart > 0 { kind = symbolKindTerminal @@ -149,34 +149,34 @@ func (s symbol) describe() (symbolKind, bool, bool, symbolNum) { isEOF = true } } - num := symbolNum(uint16(s) & maskNumberPart) + num := SymbolNum(uint16(s) & maskNumberPart) return kind, isStart, isEOF, num } -type symbolTable struct { - text2Sym map[string]symbol - sym2Text map[symbol]string +type SymbolTable struct { + text2Sym map[string]Symbol + sym2Text map[Symbol]string nonTermTexts []string termTexts []string - nonTermNum symbolNum - termNum symbolNum + nonTermNum SymbolNum + termNum SymbolNum } -type symbolTableWriter struct { - *symbolTable +type SymbolTableWriter struct { + *SymbolTable } -type symbolTableReader struct { - *symbolTable +type SymbolTableReader struct { + *SymbolTable } -func newSymbolTable() *symbolTable { - return &symbolTable{ - text2Sym: map[string]symbol{ - symbolNameEOF: symbolEOF, +func NewSymbolTable() *SymbolTable { + return &SymbolTable{ + text2Sym: map[string]Symbol{ + symbolNameEOF: SymbolEOF, }, - sym2Text: map[symbol]string{ - symbolEOF: symbolNameEOF, + sym2Text: map[Symbol]string{ + SymbolEOF: symbolNameEOF, }, termTexts: []string{ "", // Nil @@ -191,32 +191,32 @@ func newSymbolTable() *symbolTable { } } -func (t *symbolTable) writer() *symbolTableWriter { - return &symbolTableWriter{ - symbolTable: t, +func (t *SymbolTable) Writer() *SymbolTableWriter { + return &SymbolTableWriter{ + SymbolTable: t, } } -func (t *symbolTable) reader() *symbolTableReader { - return &symbolTableReader{ - symbolTable: t, +func (t *SymbolTable) Reader() *SymbolTableReader { + return &SymbolTableReader{ + SymbolTable: t, } } -func (w *symbolTableWriter) registerStartSymbol(text string) (symbol, error) { +func (w *SymbolTableWriter) RegisterStartSymbol(text string) 
(Symbol, error) { w.text2Sym[text] = symbolStart w.sym2Text[symbolStart] = text - w.nonTermTexts[symbolStart.num().Int()] = text + w.nonTermTexts[symbolStart.Num().Int()] = text return symbolStart, nil } -func (w *symbolTableWriter) registerNonTerminalSymbol(text string) (symbol, error) { +func (w *SymbolTableWriter) RegisterNonTerminalSymbol(text string) (Symbol, error) { if sym, ok := w.text2Sym[text]; ok { return sym, nil } sym, err := newSymbol(symbolKindNonTerminal, false, w.nonTermNum) if err != nil { - return symbolNil, err + return SymbolNil, err } w.nonTermNum++ w.text2Sym[text] = sym @@ -225,13 +225,13 @@ func (w *symbolTableWriter) registerNonTerminalSymbol(text string) (symbol, erro return sym, nil } -func (w *symbolTableWriter) registerTerminalSymbol(text string) (symbol, error) { +func (w *SymbolTableWriter) RegisterTerminalSymbol(text string) (Symbol, error) { if sym, ok := w.text2Sym[text]; ok { return sym, nil } sym, err := newSymbol(symbolKindTerminal, false, w.termNum) if err != nil { - return symbolNil, err + return SymbolNil, err } w.termNum++ w.text2Sym[text] = sym @@ -240,22 +240,22 @@ func (w *symbolTableWriter) registerTerminalSymbol(text string) (symbol, error) return sym, nil } -func (r *symbolTableReader) toSymbol(text string) (symbol, bool) { +func (r *SymbolTableReader) ToSymbol(text string) (Symbol, bool) { if sym, ok := r.text2Sym[text]; ok { return sym, true } - return symbolNil, false + return SymbolNil, false } -func (r *symbolTableReader) toText(sym symbol) (string, bool) { +func (r *SymbolTableReader) ToText(sym Symbol) (string, bool) { text, ok := r.sym2Text[sym] return text, ok } -func (r *symbolTableReader) terminalSymbols() []symbol { - syms := make([]symbol, 0, r.termNum.Int()-terminalNumMin.Int()) +func (r *SymbolTableReader) TerminalSymbols() []Symbol { + syms := make([]Symbol, 0, r.termNum.Int()-terminalNumMin.Int()) for sym := range r.sym2Text { - if !sym.isTerminal() || sym.isNil() { + if !sym.IsTerminal() || 
sym.IsNil() { continue } syms = append(syms, sym) @@ -266,17 +266,17 @@ func (r *symbolTableReader) terminalSymbols() []symbol { return syms } -func (r *symbolTableReader) terminalTexts() ([]string, error) { +func (r *SymbolTableReader) TerminalTexts() ([]string, error) { if r.termNum == terminalNumMin { return nil, fmt.Errorf("symbol table has no terminals") } return r.termTexts, nil } -func (r *symbolTableReader) nonTerminalSymbols() []symbol { - syms := make([]symbol, 0, r.nonTermNum.Int()-nonTerminalNumMin.Int()) +func (r *SymbolTableReader) NonTerminalSymbols() []Symbol { + syms := make([]Symbol, 0, r.nonTermNum.Int()-nonTerminalNumMin.Int()) for sym := range r.sym2Text { - if !sym.isNonTerminal() || sym.isNil() { + if !sym.isNonTerminal() || sym.IsNil() { continue } syms = append(syms, sym) @@ -287,8 +287,8 @@ func (r *symbolTableReader) nonTerminalSymbols() []symbol { return syms } -func (r *symbolTableReader) nonTerminalTexts() ([]string, error) { - if r.nonTermNum == nonTerminalNumMin || r.nonTermTexts[symbolStart.num().Int()] == "" { +func (r *SymbolTableReader) NonTerminalTexts() ([]string, error) { + if r.nonTermNum == nonTerminalNumMin || r.nonTermTexts[symbolStart.Num().Int()] == "" { return nil, fmt.Errorf("symbol table has no terminals or no start symbol") } return r.nonTermTexts, nil diff --git a/grammar/symbol_test.go b/grammar/symbol/symbol_test.go index b9bcbdf..31c3edd 100644 --- a/grammar/symbol_test.go +++ b/grammar/symbol/symbol_test.go @@ -1,19 +1,19 @@ -package grammar +package symbol import "testing" func TestSymbol(t *testing.T) { - tab := newSymbolTable() - w := tab.writer() - _, _ = w.registerStartSymbol("expr'") - _, _ = w.registerNonTerminalSymbol("expr") - _, _ = w.registerNonTerminalSymbol("term") - _, _ = w.registerNonTerminalSymbol("factor") - _, _ = w.registerTerminalSymbol("id") - _, _ = w.registerTerminalSymbol("add") - _, _ = w.registerTerminalSymbol("mul") - _, _ = w.registerTerminalSymbol("l_paren") - _, _ = 
w.registerTerminalSymbol("r_paren") + tab := NewSymbolTable() + w := tab.Writer() + _, _ = w.RegisterStartSymbol("expr'") + _, _ = w.RegisterNonTerminalSymbol("expr") + _, _ = w.RegisterNonTerminalSymbol("term") + _, _ = w.RegisterNonTerminalSymbol("factor") + _, _ = w.RegisterTerminalSymbol("id") + _, _ = w.RegisterTerminalSymbol("add") + _, _ = w.RegisterTerminalSymbol("mul") + _, _ = w.RegisterTerminalSymbol("l_paren") + _, _ = w.RegisterTerminalSymbol("r_paren") nonTermTexts := []string{ "", // Nil @@ -81,13 +81,13 @@ func TestSymbol(t *testing.T) { } for _, tt := range tests { t.Run(tt.text, func(t *testing.T) { - r := tab.reader() - sym, ok := r.toSymbol(tt.text) + r := tab.Reader() + sym, ok := r.ToSymbol(tt.text) if !ok { t.Fatalf("symbol was not found") } testSymbolProperty(t, sym, tt.isNil, tt.isStart, tt.isEOF, tt.isNonTerminal, tt.isTerminal) - text, ok := r.toText(sym) + text, ok := r.ToText(sym) if !ok { t.Fatalf("text was not found") } @@ -98,16 +98,16 @@ func TestSymbol(t *testing.T) { } t.Run("EOF", func(t *testing.T) { - testSymbolProperty(t, symbolEOF, false, false, true, false, true) + testSymbolProperty(t, SymbolEOF, false, false, true, false, true) }) t.Run("Nil", func(t *testing.T) { - testSymbolProperty(t, symbolNil, true, false, false, false, false) + testSymbolProperty(t, SymbolNil, true, false, false, false, false) }) t.Run("texts of non-terminals", func(t *testing.T) { - r := tab.reader() - ts, err := r.nonTerminalTexts() + r := tab.Reader() + ts, err := r.NonTerminalTexts() if err != nil { t.Fatal(err) } @@ -122,8 +122,8 @@ func TestSymbol(t *testing.T) { }) t.Run("texts of terminals", func(t *testing.T) { - r := tab.reader() - ts, err := r.terminalTexts() + r := tab.Reader() + ts, err := r.TerminalTexts() if err != nil { t.Fatal(err) } @@ -138,13 +138,13 @@ func TestSymbol(t *testing.T) { }) } -func testSymbolProperty(t *testing.T, sym symbol, isNil, isStart, isEOF, isNonTerminal, isTerminal bool) { +func testSymbolProperty(t 
*testing.T, sym Symbol, isNil, isStart, isEOF, isNonTerminal, isTerminal bool) { t.Helper() - if v := sym.isNil(); v != isNil { + if v := sym.IsNil(); v != isNil { t.Fatalf("isNil property is mismatched; want: %v, got: %v", isNil, v) } - if v := sym.isStart(); v != isStart { + if v := sym.IsStart(); v != isStart { t.Fatalf("isStart property is mismatched; want: %v, got: %v", isStart, v) } if v := sym.isEOF(); v != isEOF { @@ -153,7 +153,7 @@ func testSymbolProperty(t *testing.T, sym symbol, isNil, isStart, isEOF, isNonTe if v := sym.isNonTerminal(); v != isNonTerminal { t.Fatalf("isNonTerminal property is mismatched; want: %v, got: %v", isNonTerminal, v) } - if v := sym.isTerminal(); v != isTerminal { + if v := sym.IsTerminal(); v != isTerminal { t.Fatalf("isTerminal property is mismatched; want: %v, got: %v", isTerminal, v) } } diff --git a/grammar/test_helper_test.go b/grammar/test_helper_test.go index 1dcdede..297a9a3 100644 --- a/grammar/test_helper_test.go +++ b/grammar/test_helper_test.go @@ -1,14 +1,18 @@ package grammar -import "testing" +import ( + "testing" -type testSymbolGenerator func(text string) symbol + "github.com/nihei9/vartan/grammar/symbol" +) -func newTestSymbolGenerator(t *testing.T, symTab *symbolTableReader) testSymbolGenerator { - return func(text string) symbol { +type testSymbolGenerator func(text string) symbol.Symbol + +func newTestSymbolGenerator(t *testing.T, symTab *symbol.SymbolTableReader) testSymbolGenerator { + return func(text string) symbol.Symbol { t.Helper() - sym, ok := symTab.toSymbol(text) + sym, ok := symTab.ToSymbol(text) if !ok { t.Fatalf("symbol was not found: %v", text) } @@ -22,7 +26,7 @@ func newTestProductionGenerator(t *testing.T, genSym testSymbolGenerator) testPr return func(lhs string, rhs ...string) *production { t.Helper() - rhsSym := []symbol{} + rhsSym := []symbol.Symbol{} for _, text := range rhs { rhsSym = append(rhsSym, genSym(text)) } @@ -51,9 +55,9 @@ func newTestLR0ItemGenerator(t *testing.T, genProd 
testProductionGenerator) test } } -func withLookAhead(item *lrItem, lookAhead ...symbol) *lrItem { +func withLookAhead(item *lrItem, lookAhead ...symbol.Symbol) *lrItem { if item.lookAhead.symbols == nil { - item.lookAhead.symbols = map[symbol]struct{}{} + item.lookAhead.symbols = map[symbol.Symbol]struct{}{} } for _, a := range lookAhead { |