diff options
Diffstat (limited to 'src/urubu/grammar/lexical.go')
-rw-r--r-- | src/urubu/grammar/lexical.go | 577 |
1 files changed, 577 insertions, 0 deletions
diff --git a/src/urubu/grammar/lexical.go b/src/urubu/grammar/lexical.go new file mode 100644 index 0000000..515e491 --- /dev/null +++ b/src/urubu/grammar/lexical.go @@ -0,0 +1,577 @@ +package lexical + +import ( + "bytes" + "fmt" + "sort" + "strings" + + "urubu/compressor" + "urubu/grammar/lexical/dfa" + psr "urubu/grammar/lexical/parser" + spec "urubu/spec/grammar" +) + +type CompileError struct { + Kind spec.LexKindName + Fragment bool + Cause error + Detail string +} + +func Compile(lexspec *LexSpec, compLv int) (*spec.LexicalSpec, error, []*CompileError) { + err := lexspec.Validate() + if err != nil { + return nil, fmt.Errorf("invalid lexical specification:\n%w", err), nil + } + + modeEntries, modeNames, modeName2ID, fragmetns := groupEntriesByLexMode(lexspec.Entries) + + modeSpecs := []*spec.CompiledLexModeSpec{ + nil, + } + for i, es := range modeEntries[1:] { + modeName := modeNames[i+1] + modeSpec, err, cerrs := compile(es, modeName2ID, fragmetns, compLv) + if err != nil { + return nil, fmt.Errorf("failed to compile in %v mode: %w", modeName, err), cerrs + } + modeSpecs = append(modeSpecs, modeSpec) + } + + var kindNames []spec.LexKindName + var name2ID map[spec.LexKindName]spec.LexKindID + { + name2ID = map[spec.LexKindName]spec.LexKindID{} + id := spec.LexKindIDMin + for _, modeSpec := range modeSpecs[1:] { + for _, name := range modeSpec.KindNames[1:] { + if _, ok := name2ID[name]; ok { + continue + } + name2ID[name] = id + id++ + } + } + + kindNames = make([]spec.LexKindName, len(name2ID)+1) + for name, id := range name2ID { + kindNames[id] = name + } + } + + var kindIDs [][]spec.LexKindID + { + kindIDs = make([][]spec.LexKindID, len(modeSpecs)) + for i, modeSpec := range modeSpecs[1:] { + ids := make([]spec.LexKindID, len(modeSpec.KindNames)) + for modeID, name := range modeSpec.KindNames { + if modeID == 0 { + continue + } + ids[modeID] = name2ID[name] + } + kindIDs[i+1] = ids + } + } + + return &spec.LexicalSpec{ + InitialModeID: spec.LexModeIDDefault, + ModeNames: modeNames, + KindNames: kindNames, + KindIDs: kindIDs, + CompressionLevel: compLv, + Specs: modeSpecs, + }, nil, nil +} + +func groupEntriesByLexMode(entries []*LexEntry) ([][]*LexEntry, []spec.LexModeName, map[spec.LexModeName]spec.LexModeID, map[spec.LexKindName]*LexEntry) { + modeNames := []spec.LexModeName{ + spec.LexModeNameNil, + spec.LexModeNameDefault, + } + modeName2ID := map[spec.LexModeName]spec.LexModeID{ + spec.LexModeNameNil: spec.LexModeIDNil, + spec.LexModeNameDefault: spec.LexModeIDDefault, + } + lastModeID := spec.LexModeIDDefault + modeEntries := [][]*LexEntry{ + nil, + {}, + } + fragments := map[spec.LexKindName]*LexEntry{} + for _, e := range entries { + if e.Fragment { + fragments[e.Kind] = e + continue + } + ms := e.Modes + if len(ms) == 0 { + ms = []spec.LexModeName{ + spec.LexModeNameDefault, + } + } + for _, modeName := range ms { + modeID, ok := modeName2ID[modeName] + if !ok { + modeID = lastModeID + 1 + lastModeID = modeID + modeName2ID[modeName] = modeID + modeNames = append(modeNames, modeName) + modeEntries = append(modeEntries, []*LexEntry{}) + } + modeEntries[modeID] = append(modeEntries[modeID], e) + } + } + return modeEntries, modeNames, modeName2ID, fragments +} + +func compile( + entries []*LexEntry, + modeName2ID map[spec.LexModeName]spec.LexModeID, + fragments map[spec.LexKindName]*LexEntry, + compLv int, +) (*spec.CompiledLexModeSpec, error, []*CompileError) { + var kindNames []spec.LexKindName + kindIDToName := map[spec.LexModeKindID]spec.LexKindName{} + var patterns map[spec.LexModeKindID][]byte + { + kindNames = append(kindNames, spec.LexKindNameNil) + patterns = map[spec.LexModeKindID][]byte{} + for i, e := range entries { + kindID := spec.LexModeKindID(i + 1) + + kindNames = append(kindNames, e.Kind) + kindIDToName[kindID] = e.Kind + patterns[kindID] = []byte(e.Pattern) + } + } + + push := []spec.LexModeID{ + spec.LexModeIDNil, + } + pop := []int{ + 0, + } + for _, e := range entries { + pushV := spec.LexModeIDNil + if e.Push != "" { + pushV = modeName2ID[e.Push] + } + push = append(push, pushV) + popV := 0 + if e.Pop { + popV = 1 + } + pop = append(pop, popV) + } + + fragmentPatterns := map[spec.LexKindName][]byte{} + for k, e := range fragments { + fragmentPatterns[k] = []byte(e.Pattern) + } + + fragmentCPTrees := make(map[spec.LexKindName]psr.CPTree, len(fragmentPatterns)) + { + var cerrs []*CompileError + for kind, pat := range fragmentPatterns { + p := psr.NewParser(kind, bytes.NewReader(pat)) + t, err := p.Parse() + if err != nil { + if err == psr.ParseErr { + detail, cause := p.Error() + cerrs = append(cerrs, &CompileError{ + Kind: kind, + Fragment: true, + Cause: cause, + Detail: detail, + }) + } else { + cerrs = append(cerrs, &CompileError{ + Kind: kind, + Fragment: true, + Cause: err, + }) + } + continue + } + fragmentCPTrees[kind] = t + } + if len(cerrs) > 0 { + return nil, fmt.Errorf("compile error"), cerrs + } + + err := psr.CompleteFragments(fragmentCPTrees) + if err != nil { + if err == psr.ParseErr { + for _, frag := range fragmentCPTrees { + kind, frags, err := frag.Describe() + if err != nil { + return nil, err, nil + } + + cerrs = append(cerrs, &CompileError{ + Kind: kind, + Fragment: true, + Cause: fmt.Errorf("fragment contains undefined fragments or cycles"), + Detail: fmt.Sprintf("%v", frags), + }) + } + + return nil, fmt.Errorf("compile error"), cerrs + } + + return nil, err, nil + } + } + + cpTrees := map[spec.LexModeKindID]psr.CPTree{} + { + pats := make([]*psr.PatternEntry, len(patterns)+1) + pats[spec.LexModeKindIDNil] = &psr.PatternEntry{ + ID: spec.LexModeKindIDNil, + } + for id, pattern := range patterns { + pats[id] = &psr.PatternEntry{ + ID: id, + Pattern: pattern, + } + } + + var cerrs []*CompileError + for _, pat := range pats { + if pat.ID == spec.LexModeKindIDNil { + continue + } + + p := psr.NewParser(kindIDToName[pat.ID], bytes.NewReader(pat.Pattern)) + t, err := p.Parse() + if err != nil { + if err == psr.ParseErr { + detail, cause := p.Error() + cerrs = append(cerrs, &CompileError{ + Kind: kindIDToName[pat.ID], + Fragment: false, + Cause: cause, + Detail: detail, + }) + } else { + cerrs = append(cerrs, &CompileError{ + Kind: kindIDToName[pat.ID], + Fragment: false, + Cause: err, + }) + } + continue + } + + complete, err := psr.ApplyFragments(t, fragmentCPTrees) + if err != nil { + return nil, err, nil + } + if !complete { + _, frags, err := t.Describe() + if err != nil { + return nil, err, nil + } + + cerrs = append(cerrs, &CompileError{ + Kind: kindIDToName[pat.ID], + Fragment: false, + Cause: fmt.Errorf("pattern contains undefined fragments"), + Detail: fmt.Sprintf("%v", frags), + }) + continue + } + + cpTrees[pat.ID] = t + } + if len(cerrs) > 0 { + return nil, fmt.Errorf("compile error"), cerrs + } + } + + var tranTab *spec.TransitionTable + { + root, symTab, err := dfa.ConvertCPTreeToByteTree(cpTrees) + if err != nil { + return nil, err, nil + } + d := dfa.GenDFA(root, symTab) + tranTab, err = dfa.GenTransitionTable(d) + if err != nil { + return nil, err, nil + } + } + + var err error + switch compLv { + case 2: + tranTab, err = compressTransitionTableLv2(tranTab) + if err != nil { + return nil, err, nil + } + case 1: + tranTab, err = compressTransitionTableLv1(tranTab) + if err != nil { + return nil, err, nil + } + } + + return &spec.CompiledLexModeSpec{ + KindNames: kindNames, + Push: push, + Pop: pop, + DFA: tranTab, + }, nil, nil +} + +const ( + CompressionLevelMin = 0 + CompressionLevelMax = 2 +) + +func compressTransitionTableLv2(tranTab *spec.TransitionTable) (*spec.TransitionTable, error) { + ueTab := compressor.NewUniqueEntriesTable() + { + orig, err := compressor.NewOriginalTable(convertStateIDSliceToIntSlice(tranTab.UncompressedTransition), tranTab.ColCount) + if err != nil { + return nil, err + } + err = ueTab.Compress(orig) + if err != nil { + return nil, err + } + } + + rdTab := compressor.NewRowDisplacementTable(0) + { + orig, err := compressor.NewOriginalTable(ueTab.UniqueEntries, ueTab.OriginalColCount) + if err != nil { + return nil, err + } + err = rdTab.Compress(orig) + if err != nil { + return nil, err + } + } + + tranTab.Transition = &spec.UniqueEntriesTable{ + UniqueEntries: &spec.RowDisplacementTable{ + OriginalRowCount: rdTab.OriginalRowCount, + OriginalColCount: rdTab.OriginalColCount, + EmptyValue: spec.StateIDNil, + Entries: convertIntSliceToStateIDSlice(rdTab.Entries), + Bounds: rdTab.Bounds, + RowDisplacement: rdTab.RowDisplacement, + }, + RowNums: ueTab.RowNums, + OriginalRowCount: ueTab.OriginalRowCount, + OriginalColCount: ueTab.OriginalColCount, + } + tranTab.UncompressedTransition = nil + + return tranTab, nil +} + +func compressTransitionTableLv1(tranTab *spec.TransitionTable) (*spec.TransitionTable, error) { + ueTab := compressor.NewUniqueEntriesTable() + { + orig, err := compressor.NewOriginalTable(convertStateIDSliceToIntSlice(tranTab.UncompressedTransition), tranTab.ColCount) + if err != nil { + return nil, err + } + err = ueTab.Compress(orig) + if err != nil { + return nil, err + } + } + + tranTab.Transition = &spec.UniqueEntriesTable{ + UncompressedUniqueEntries: convertIntSliceToStateIDSlice(ueTab.UniqueEntries), + RowNums: ueTab.RowNums, + OriginalRowCount: ueTab.OriginalRowCount, + OriginalColCount: ueTab.OriginalColCount, + } + tranTab.UncompressedTransition = nil + + return tranTab, nil +} + +func convertStateIDSliceToIntSlice(s []spec.StateID) []int { + is := make([]int, len(s)) + for i, v := range s { + is[i] = v.Int() + } + return is +} + +func convertIntSliceToStateIDSlice(s []int) []spec.StateID { + ss := make([]spec.StateID, len(s)) + for i, v := range s { + ss[i] = spec.StateID(v) + } + return ss +} + +type LexEntry struct { + Kind spec.LexKindName + Pattern string + Modes []spec.LexModeName + Push spec.LexModeName + Pop bool + Fragment bool +} + +type LexSpec struct { + Entries []*LexEntry +} + +func (s *LexSpec) Validate() error { + if len(s.Entries) <= 0 { + return fmt.Errorf("the lexical specification must have at least one entry") + } + { + ks := map[string]struct{}{} + fks := map[string]struct{}{} + for _, e := range s.Entries { + // Allow duplicate names between fragments and non-fragments. + if e.Fragment { + if _, exist := fks[e.Kind.String()]; exist { + return fmt.Errorf("kinds `%v` are duplicates", e.Kind) + } + fks[e.Kind.String()] = struct{}{} + } else { + if _, exist := ks[e.Kind.String()]; exist { + return fmt.Errorf("kinds `%v` are duplicates", e.Kind) + } + ks[e.Kind.String()] = struct{}{} + } + } + } + { + kinds := []string{} + modes := []string{ + spec.LexModeNameDefault.String(), // This is a predefined mode. + } + for _, e := range s.Entries { + if e.Fragment { + continue + } + + kinds = append(kinds, e.Kind.String()) + + for _, m := range e.Modes { + modes = append(modes, m.String()) + } + } + + kindErrs := findSpellingInconsistenciesErrors(kinds, nil) + modeErrs := findSpellingInconsistenciesErrors(modes, func(ids []string) error { + if SnakeCaseToUpperCamelCase(ids[0]) == SnakeCaseToUpperCamelCase(spec.LexModeNameDefault.String()) { + var b strings.Builder + fmt.Fprintf(&b, "%+v", ids[0]) + for _, id := range ids[1:] { + fmt.Fprintf(&b, ", %+v", id) + } + return fmt.Errorf("these identifiers are treated as the same. please use the same spelling as predefined '%v': %v", spec.LexModeNameDefault, b.String()) + } + return nil + }) + errs := append(kindErrs, modeErrs...) + if len(errs) > 0 { + var b strings.Builder + fmt.Fprintf(&b, "%v", errs[0]) + for _, err := range errs[1:] { + fmt.Fprintf(&b, "\n%v", err) + } + return fmt.Errorf(b.String()) + } + } + + return nil +} + +func findSpellingInconsistenciesErrors(ids []string, hook func(ids []string) error) []error { + duplicated := FindSpellingInconsistencies(ids) + if len(duplicated) == 0 { + return nil + } + + var errs []error + for _, dup := range duplicated { + if hook != nil { + err := hook(dup) + if err != nil { + errs = append(errs, err) + continue + } + } + + var b strings.Builder + fmt.Fprintf(&b, "%+v", dup[0]) + for _, id := range dup[1:] { + fmt.Fprintf(&b, ", %+v", id) + } + err := fmt.Errorf("these identifiers are treated as the same. please use the same spelling: %v", b.String()) + errs = append(errs, err) + } + + return errs +} + +// FindSpellingInconsistencies finds spelling inconsistencies in identifiers. The identifiers are considered to be the same +// if they are spelled the same when expressed in UpperCamelCase. For example, `left_paren` and `LeftParen` are spelled the same +// in UpperCamelCase. Thus they are considere to be spelling inconsistency. +func FindSpellingInconsistencies(ids []string) [][]string { + m := map[string][]string{} + for _, id := range removeDuplicates(ids) { + c := SnakeCaseToUpperCamelCase(id) + m[c] = append(m[c], id) + } + + var duplicated [][]string + for _, camels := range m { + if len(camels) == 1 { + continue + } + duplicated = append(duplicated, camels) + } + + for _, dup := range duplicated { + sort.Slice(dup, func(i, j int) bool { + return dup[i] < dup[j] + }) + } + sort.Slice(duplicated, func(i, j int) bool { + return duplicated[i][0] < duplicated[j][0] + }) + + return duplicated +} + +func removeDuplicates(s []string) []string { + m := map[string]struct{}{} + for _, v := range s { + m[v] = struct{}{} + } + + var unique []string + for v := range m { + unique = append(unique, v) + } + + return unique +} + +func SnakeCaseToUpperCamelCase(snake string) string { + elems := strings.Split(snake, "_") + for i, e := range elems { + if len(e) == 0 { + continue + } + elems[i] = strings.ToUpper(string(e[0])) + e[1:] + } + + return strings.Join(elems, "") +} |