package lexical

import (
	"bytes"
	"fmt"
	"sort"
	"strings"

	"urubu/compressor"
	"urubu/grammar/lexical/dfa"
	psr "urubu/grammar/lexical/parser"
	spec "urubu/spec/grammar"
)

// CompileError describes a compilation failure of a single lexical kind
// (either a regular kind or a fragment).
type CompileError struct {
	Kind     spec.LexKindName
	Fragment bool
	Cause    error
	Detail   string
}

// Compile validates lexspec and compiles it into a lexical specification.
// compLv selects the transition-table compression level
// (CompressionLevelMin..CompressionLevelMax). On failure it returns a
// summary error and, when available, per-kind compile errors.
func Compile(lexspec *LexSpec, compLv int) (*spec.LexicalSpec, error, []*CompileError) {
	err := lexspec.Validate()
	if err != nil {
		return nil, fmt.Errorf("invalid lexical specification:\n%w", err), nil
	}

	modeEntries, modeNames, modeName2ID, fragments := groupEntriesByLexMode(lexspec.Entries)

	// Index 0 is reserved for the nil mode, so the slice starts with a
	// nil placeholder and real modes are compiled from index 1 onward.
	modeSpecs := []*spec.CompiledLexModeSpec{
		nil,
	}
	for i, es := range modeEntries[1:] {
		modeName := modeNames[i+1]
		modeSpec, err, cerrs := compile(es, modeName2ID, fragments, compLv)
		if err != nil {
			return nil, fmt.Errorf("failed to compile in %v mode: %w", modeName, err), cerrs
		}
		modeSpecs = append(modeSpecs, modeSpec)
	}

	// Assign a globally unique ID to each kind name. A kind appearing in
	// multiple modes keeps a single ID.
	var kindNames []spec.LexKindName
	var name2ID map[spec.LexKindName]spec.LexKindID
	{
		name2ID = map[spec.LexKindName]spec.LexKindID{}
		id := spec.LexKindIDMin
		for _, modeSpec := range modeSpecs[1:] {
			for _, name := range modeSpec.KindNames[1:] {
				if _, ok := name2ID[name]; ok {
					continue
				}
				name2ID[name] = id
				id++
			}
		}

		// Slot 0 stays the zero value: it corresponds to the nil kind.
		kindNames = make([]spec.LexKindName, len(name2ID)+1)
		for name, id := range name2ID {
			kindNames[id] = name
		}
	}

	// For each mode, map mode-local kind IDs to the global kind IDs
	// assigned above.
	var kindIDs [][]spec.LexKindID
	{
		kindIDs = make([][]spec.LexKindID, len(modeSpecs))
		for i, modeSpec := range modeSpecs[1:] {
			ids := make([]spec.LexKindID, len(modeSpec.KindNames))
			for modeID, name := range modeSpec.KindNames {
				if modeID == 0 {
					// Mode-local ID 0 is the nil kind.
					continue
				}
				ids[modeID] = name2ID[name]
			}
			kindIDs[i+1] = ids
		}
	}

	return &spec.LexicalSpec{
		InitialModeID:    spec.LexModeIDDefault,
		ModeNames:        modeNames,
		KindNames:        kindNames,
		KindIDs:          kindIDs,
		CompressionLevel: compLv,
		Specs:            modeSpecs,
	}, nil, nil
}

// groupEntriesByLexMode splits entries into per-mode entry lists, registering
// new modes in order of first appearance. It returns the per-mode entries
// (index 0 reserved for the nil mode), the mode names, the name-to-ID
// mapping, and the fragment entries keyed by kind name. An entry with no
// explicit modes is assigned to the default mode.
func groupEntriesByLexMode(entries []*LexEntry) ([][]*LexEntry, []spec.LexModeName, map[spec.LexModeName]spec.LexModeID, map[spec.LexKindName]*LexEntry) {
	modeNames := []spec.LexModeName{
		spec.LexModeNameNil,
		spec.LexModeNameDefault,
	}
	modeName2ID := map[spec.LexModeName]spec.LexModeID{
		spec.LexModeNameNil:     spec.LexModeIDNil,
		spec.LexModeNameDefault: spec.LexModeIDDefault,
	}
	lastModeID := spec.LexModeIDDefault
	modeEntries := [][]*LexEntry{
		nil,
		{},
	}
	fragments := map[spec.LexKindName]*LexEntry{}
	for _, e := range entries {
		if e.Fragment {
			// Fragments belong to no mode; they are collected separately
			// and substituted into patterns later.
			fragments[e.Kind] = e
			continue
		}
		ms := e.Modes
		if len(ms) == 0 {
			ms = []spec.LexModeName{
				spec.LexModeNameDefault,
			}
		}
		for _, modeName := range ms {
			modeID, ok := modeName2ID[modeName]
			if !ok {
				modeID = lastModeID + 1
				lastModeID = modeID
				modeName2ID[modeName] = modeID
				modeNames = append(modeNames, modeName)
				modeEntries = append(modeEntries, []*LexEntry{})
			}
			modeEntries[modeID] = append(modeEntries[modeID], e)
		}
	}
	return modeEntries, modeNames, modeName2ID, fragments
}

// compile compiles the entries of a single lex mode: it parses every pattern,
// resolves fragment references, generates a DFA, and optionally compresses
// its transition table. Per-kind parse/resolution failures are accumulated
// and returned as []*CompileError alongside a generic "compile error".
func compile(
	entries []*LexEntry,
	modeName2ID map[spec.LexModeName]spec.LexModeID,
	fragments map[spec.LexKindName]*LexEntry,
	compLv int,
) (*spec.CompiledLexModeSpec, error, []*CompileError) {
	var kindNames []spec.LexKindName
	kindIDToName := map[spec.LexModeKindID]spec.LexKindName{}
	var patterns map[spec.LexModeKindID][]byte
	{
		// Mode-local kind ID 0 is the nil kind; real kinds start at 1.
		kindNames = append(kindNames, spec.LexKindNameNil)
		patterns = map[spec.LexModeKindID][]byte{}
		for i, e := range entries {
			kindID := spec.LexModeKindID(i + 1)

			kindNames = append(kindNames, e.Kind)
			kindIDToName[kindID] = e.Kind
			patterns[kindID] = []byte(e.Pattern)
		}
	}

	// Per-kind mode transitions: push[i] is the mode pushed when kind i is
	// matched (nil if none), pop[i] is 1 when matching kind i pops a mode.
	push := []spec.LexModeID{
		spec.LexModeIDNil,
	}
	pop := []int{
		0,
	}
	for _, e := range entries {
		pushV := spec.LexModeIDNil
		if e.Push != "" {
			pushV = modeName2ID[e.Push]
		}
		push = append(push, pushV)

		popV := 0
		if e.Pop {
			popV = 1
		}
		pop = append(pop, popV)
	}

	fragmentPatterns := map[spec.LexKindName][]byte{}
	for k, e := range fragments {
		fragmentPatterns[k] = []byte(e.Pattern)
	}

	// Parse every fragment pattern, then resolve fragment-to-fragment
	// references (detecting undefined fragments and cycles).
	fragmentCPTrees := make(map[spec.LexKindName]psr.CPTree, len(fragmentPatterns))
	{
		var cerrs []*CompileError
		for kind, pat := range fragmentPatterns {
			p := psr.NewParser(kind, bytes.NewReader(pat))
			t, err := p.Parse()
			if err != nil {
				if err == psr.ParseErr {
					// A syntax error: the parser carries the details.
					detail, cause := p.Error()
					cerrs = append(cerrs, &CompileError{
						Kind:     kind,
						Fragment: true,
						Cause:    cause,
						Detail:   detail,
					})
				} else {
					cerrs = append(cerrs, &CompileError{
						Kind:     kind,
						Fragment: true,
						Cause:    err,
					})
				}
				continue
			}
			fragmentCPTrees[kind] = t
		}
		if len(cerrs) > 0 {
			return nil, fmt.Errorf("compile error"), cerrs
		}

		err := psr.CompleteFragments(fragmentCPTrees)
		if err != nil {
			if err == psr.ParseErr {
				for _, frag := range fragmentCPTrees {
					kind, frags, err := frag.Describe()
					if err != nil {
						return nil, err, nil
					}

					cerrs = append(cerrs, &CompileError{
						Kind:     kind,
						Fragment: true,
						Cause:    fmt.Errorf("fragment contains undefined fragments or cycles"),
						Detail:   fmt.Sprintf("%v", frags),
					})
				}
				return nil, fmt.Errorf("compile error"), cerrs
			}
			return nil, err, nil
		}
	}

	// Parse every kind pattern and substitute the resolved fragments.
	cpTrees := map[spec.LexModeKindID]psr.CPTree{}
	{
		pats := make([]*psr.PatternEntry, len(patterns)+1)
		pats[spec.LexModeKindIDNil] = &psr.PatternEntry{
			ID: spec.LexModeKindIDNil,
		}
		for id, pattern := range patterns {
			pats[id] = &psr.PatternEntry{
				ID:      id,
				Pattern: pattern,
			}
		}

		var cerrs []*CompileError
		for _, pat := range pats {
			if pat.ID == spec.LexModeKindIDNil {
				continue
			}

			p := psr.NewParser(kindIDToName[pat.ID], bytes.NewReader(pat.Pattern))
			t, err := p.Parse()
			if err != nil {
				if err == psr.ParseErr {
					detail, cause := p.Error()
					cerrs = append(cerrs, &CompileError{
						Kind:     kindIDToName[pat.ID],
						Fragment: false,
						Cause:    cause,
						Detail:   detail,
					})
				} else {
					cerrs = append(cerrs, &CompileError{
						Kind:     kindIDToName[pat.ID],
						Fragment: false,
						Cause:    err,
					})
				}
				continue
			}

			complete, err := psr.ApplyFragments(t, fragmentCPTrees)
			if err != nil {
				return nil, err, nil
			}
			if !complete {
				_, frags, err := t.Describe()
				if err != nil {
					return nil, err, nil
				}

				cerrs = append(cerrs, &CompileError{
					Kind:     kindIDToName[pat.ID],
					Fragment: false,
					Cause:    fmt.Errorf("pattern contains undefined fragments"),
					Detail:   fmt.Sprintf("%v", frags),
				})
				continue
			}

			cpTrees[pat.ID] = t
		}
		if len(cerrs) > 0 {
			return nil, fmt.Errorf("compile error"), cerrs
		}
	}

	// Generate the DFA and its transition table from the parse trees.
	var tranTab *spec.TransitionTable
	{
		root, symTab, err := dfa.ConvertCPTreeToByteTree(cpTrees)
		if err != nil {
			return nil, err, nil
		}
		d := dfa.GenDFA(root, symTab)
		tranTab, err = dfa.GenTransitionTable(d)
		if err != nil {
			return nil, err, nil
		}
	}

	// Compression level 0 leaves the table uncompressed.
	var err error
	switch compLv {
	case 2:
		tranTab, err = compressTransitionTableLv2(tranTab)
		if err != nil {
			return nil, err, nil
		}
	case 1:
		tranTab, err = compressTransitionTableLv1(tranTab)
		if err != nil {
			return nil, err, nil
		}
	}

	return &spec.CompiledLexModeSpec{
		KindNames: kindNames,
		Push:      push,
		Pop:       pop,
		DFA:       tranTab,
	}, nil, nil
}

// Supported transition-table compression levels.
const (
	CompressionLevelMin = 0
	CompressionLevelMax = 2
)

// compressTransitionTableLv2 applies level-2 compression: unique-entries
// deduplication followed by row-displacement compression. The uncompressed
// table is dropped from tranTab afterwards.
func compressTransitionTableLv2(tranTab *spec.TransitionTable) (*spec.TransitionTable, error) {
	ueTab := compressor.NewUniqueEntriesTable()
	{
		orig, err := compressor.NewOriginalTable(convertStateIDSliceToIntSlice(tranTab.UncompressedTransition), tranTab.ColCount)
		if err != nil {
			return nil, err
		}
		err = ueTab.Compress(orig)
		if err != nil {
			return nil, err
		}
	}

	rdTab := compressor.NewRowDisplacementTable(0)
	{
		orig, err := compressor.NewOriginalTable(ueTab.UniqueEntries, ueTab.OriginalColCount)
		if err != nil {
			return nil, err
		}
		err = rdTab.Compress(orig)
		if err != nil {
			return nil, err
		}
	}

	tranTab.Transition = &spec.UniqueEntriesTable{
		UniqueEntries: &spec.RowDisplacementTable{
			OriginalRowCount: rdTab.OriginalRowCount,
			OriginalColCount: rdTab.OriginalColCount,
			EmptyValue:       spec.StateIDNil,
			Entries:          convertIntSliceToStateIDSlice(rdTab.Entries),
			Bounds:           rdTab.Bounds,
			RowDisplacement:  rdTab.RowDisplacement,
		},
		RowNums:          ueTab.RowNums,
		OriginalRowCount: ueTab.OriginalRowCount,
		OriginalColCount: ueTab.OriginalColCount,
	}
	tranTab.UncompressedTransition = nil

	return tranTab, nil
}

// compressTransitionTableLv1 applies level-1 compression: unique-entries
// deduplication only. The uncompressed table is dropped from tranTab
// afterwards.
func compressTransitionTableLv1(tranTab *spec.TransitionTable) (*spec.TransitionTable, error) {
	ueTab := compressor.NewUniqueEntriesTable()
	{
		orig, err := compressor.NewOriginalTable(convertStateIDSliceToIntSlice(tranTab.UncompressedTransition), tranTab.ColCount)
		if err != nil {
			return nil, err
		}
		err = ueTab.Compress(orig)
		if err != nil {
			return nil, err
		}
	}

	tranTab.Transition = &spec.UniqueEntriesTable{
		UncompressedUniqueEntries: convertIntSliceToStateIDSlice(ueTab.UniqueEntries),
		RowNums:                   ueTab.RowNums,
		OriginalRowCount:          ueTab.OriginalRowCount,
		OriginalColCount:          ueTab.OriginalColCount,
	}
	tranTab.UncompressedTransition = nil

	return tranTab, nil
}

// convertStateIDSliceToIntSlice copies s into a new []int.
func convertStateIDSliceToIntSlice(s []spec.StateID) []int {
	is := make([]int, len(s))
	for i, v := range s {
		is[i] = v.Int()
	}
	return is
}

// convertIntSliceToStateIDSlice copies s into a new []spec.StateID.
func convertIntSliceToStateIDSlice(s []int) []spec.StateID {
	ss := make([]spec.StateID, len(s))
	for i, v := range s {
		ss[i] = spec.StateID(v)
	}
	return ss
}

// LexEntry is a single rule of a lexical specification: a kind name, its
// pattern, the modes it belongs to, and its mode push/pop behavior.
// Fragment entries define reusable sub-patterns instead of tokens.
type LexEntry struct {
	Kind     spec.LexKindName
	Pattern  string
	Modes    []spec.LexModeName
	Push     spec.LexModeName
	Pop      bool
	Fragment bool
}

// LexSpec is an uncompiled lexical specification.
type LexSpec struct {
	Entries []*LexEntry
}

// Validate checks that the specification is non-empty, that kind names are
// unique (fragments and non-fragments are counted separately), and that
// identifiers are spelled consistently.
func (s *LexSpec) Validate() error {
	if len(s.Entries) <= 0 {
		return fmt.Errorf("the lexical specification must have at least one entry")
	}
	{
		ks := map[string]struct{}{}
		fks := map[string]struct{}{}
		for _, e := range s.Entries {
			// Allow duplicate names between fragments and non-fragments.
			if e.Fragment {
				if _, exist := fks[e.Kind.String()]; exist {
					return fmt.Errorf("kinds `%v` are duplicates", e.Kind)
				}
				fks[e.Kind.String()] = struct{}{}
			} else {
				if _, exist := ks[e.Kind.String()]; exist {
					return fmt.Errorf("kinds `%v` are duplicates", e.Kind)
				}
				ks[e.Kind.String()] = struct{}{}
			}
		}
	}
	{
		kinds := []string{}
		modes := []string{
			spec.LexModeNameDefault.String(), // This is a predefined mode.
		}
		for _, e := range s.Entries {
			if e.Fragment {
				continue
			}

			kinds = append(kinds, e.Kind.String())

			for _, m := range e.Modes {
				modes = append(modes, m.String())
			}
		}

		kindErrs := findSpellingInconsistenciesErrors(kinds, nil)
		modeErrs := findSpellingInconsistenciesErrors(modes, func(ids []string) error {
			// Give a more specific message when an identifier collides
			// with the predefined default mode.
			if SnakeCaseToUpperCamelCase(ids[0]) == SnakeCaseToUpperCamelCase(spec.LexModeNameDefault.String()) {
				var b strings.Builder
				fmt.Fprintf(&b, "%+v", ids[0])
				for _, id := range ids[1:] {
					fmt.Fprintf(&b, ", %+v", id)
				}
				return fmt.Errorf("these identifiers are treated as the same. please use the same spelling as predefined '%v': %v", spec.LexModeNameDefault, b.String())
			}
			return nil
		})
		errs := append(kindErrs, modeErrs...)
		if len(errs) > 0 {
			var b strings.Builder
			fmt.Fprintf(&b, "%v", errs[0])
			for _, err := range errs[1:] {
				fmt.Fprintf(&b, "\n%v", err)
			}
			// Use the "%s" verb so a literal '%' inside a user-supplied
			// identifier cannot be misinterpreted as a format directive
			// (this also satisfies go vet's printf check).
			return fmt.Errorf("%s", b.String())
		}
	}

	return nil
}

// findSpellingInconsistenciesErrors converts each group of inconsistently
// spelled identifiers into an error. When hook is non-nil and returns a
// non-nil error for a group, that error replaces the generic one.
func findSpellingInconsistenciesErrors(ids []string, hook func(ids []string) error) []error {
	duplicated := FindSpellingInconsistencies(ids)
	if len(duplicated) == 0 {
		return nil
	}

	var errs []error
	for _, dup := range duplicated {
		if hook != nil {
			err := hook(dup)
			if err != nil {
				errs = append(errs, err)
				continue
			}
		}

		var b strings.Builder
		fmt.Fprintf(&b, "%+v", dup[0])
		for _, id := range dup[1:] {
			fmt.Fprintf(&b, ", %+v", id)
		}
		err := fmt.Errorf("these identifiers are treated as the same. please use the same spelling: %v", b.String())
		errs = append(errs, err)
	}

	return errs
}

// FindSpellingInconsistencies finds spelling inconsistencies in identifiers. The identifiers are considered to be the same
// if they are spelled the same when expressed in UpperCamelCase. For example, `left_paren` and `LeftParen` are spelled the same
// in UpperCamelCase. Thus they are considered to be spelling inconsistency.
func FindSpellingInconsistencies(ids []string) [][]string {
	m := map[string][]string{}
	for _, id := range removeDuplicates(ids) {
		c := SnakeCaseToUpperCamelCase(id)
		m[c] = append(m[c], id)
	}

	var duplicated [][]string
	for _, camels := range m {
		if len(camels) == 1 {
			continue
		}
		duplicated = append(duplicated, camels)
	}

	// Sort groups and their members for deterministic output (map
	// iteration order is random).
	for _, dup := range duplicated {
		sort.Slice(dup, func(i, j int) bool {
			return dup[i] < dup[j]
		})
	}
	sort.Slice(duplicated, func(i, j int) bool {
		return duplicated[i][0] < duplicated[j][0]
	})

	return duplicated
}

// removeDuplicates returns the unique values of s in unspecified order.
func removeDuplicates(s []string) []string {
	m := map[string]struct{}{}
	for _, v := range s {
		m[v] = struct{}{}
	}

	var unique []string
	for v := range m {
		unique = append(unique, v)
	}

	return unique
}

// SnakeCaseToUpperCamelCase converts a snake_case identifier to
// UpperCamelCase (e.g. "left_paren" -> "LeftParen").
func SnakeCaseToUpperCamelCase(snake string) string {
	elems := strings.Split(snake, "_")
	for i, e := range elems {
		if len(e) == 0 {
			continue
		}
		elems[i] = strings.ToUpper(string(e[0])) + e[1:]
	}

	return strings.Join(elems, "")
}