aboutsummaryrefslogtreecommitdiff
path: root/src/urubu/grammar/lexical.go
diff options
context:
space:
mode:
Diffstat (limited to 'src/urubu/grammar/lexical.go')
-rw-r--r--src/urubu/grammar/lexical.go577
1 files changed, 577 insertions, 0 deletions
diff --git a/src/urubu/grammar/lexical.go b/src/urubu/grammar/lexical.go
new file mode 100644
index 0000000..515e491
--- /dev/null
+++ b/src/urubu/grammar/lexical.go
@@ -0,0 +1,577 @@
+package lexical
+
+import (
+ "bytes"
+ "fmt"
+ "sort"
+ "strings"
+
+ "urubu/compressor"
+ "urubu/grammar/lexical/dfa"
+ psr "urubu/grammar/lexical/parser"
+ spec "urubu/spec/grammar"
+)
+
+type CompileError struct {
+ Kind spec.LexKindName
+ Fragment bool
+ Cause error
+ Detail string
+}
+
+func Compile(lexspec *LexSpec, compLv int) (*spec.LexicalSpec, error, []*CompileError) {
+ err := lexspec.Validate()
+ if err != nil {
+ return nil, fmt.Errorf("invalid lexical specification:\n%w", err), nil
+ }
+
+ modeEntries, modeNames, modeName2ID, fragmetns := groupEntriesByLexMode(lexspec.Entries)
+
+ modeSpecs := []*spec.CompiledLexModeSpec{
+ nil,
+ }
+ for i, es := range modeEntries[1:] {
+ modeName := modeNames[i+1]
+ modeSpec, err, cerrs := compile(es, modeName2ID, fragmetns, compLv)
+ if err != nil {
+ return nil, fmt.Errorf("failed to compile in %v mode: %w", modeName, err), cerrs
+ }
+ modeSpecs = append(modeSpecs, modeSpec)
+ }
+
+ var kindNames []spec.LexKindName
+ var name2ID map[spec.LexKindName]spec.LexKindID
+ {
+ name2ID = map[spec.LexKindName]spec.LexKindID{}
+ id := spec.LexKindIDMin
+ for _, modeSpec := range modeSpecs[1:] {
+ for _, name := range modeSpec.KindNames[1:] {
+ if _, ok := name2ID[name]; ok {
+ continue
+ }
+ name2ID[name] = id
+ id++
+ }
+ }
+
+ kindNames = make([]spec.LexKindName, len(name2ID)+1)
+ for name, id := range name2ID {
+ kindNames[id] = name
+ }
+ }
+
+ var kindIDs [][]spec.LexKindID
+ {
+ kindIDs = make([][]spec.LexKindID, len(modeSpecs))
+ for i, modeSpec := range modeSpecs[1:] {
+ ids := make([]spec.LexKindID, len(modeSpec.KindNames))
+ for modeID, name := range modeSpec.KindNames {
+ if modeID == 0 {
+ continue
+ }
+ ids[modeID] = name2ID[name]
+ }
+ kindIDs[i+1] = ids
+ }
+ }
+
+ return &spec.LexicalSpec{
+ InitialModeID: spec.LexModeIDDefault,
+ ModeNames: modeNames,
+ KindNames: kindNames,
+ KindIDs: kindIDs,
+ CompressionLevel: compLv,
+ Specs: modeSpecs,
+ }, nil, nil
+}
+
+func groupEntriesByLexMode(entries []*LexEntry) ([][]*LexEntry, []spec.LexModeName, map[spec.LexModeName]spec.LexModeID, map[spec.LexKindName]*LexEntry) {
+ modeNames := []spec.LexModeName{
+ spec.LexModeNameNil,
+ spec.LexModeNameDefault,
+ }
+ modeName2ID := map[spec.LexModeName]spec.LexModeID{
+ spec.LexModeNameNil: spec.LexModeIDNil,
+ spec.LexModeNameDefault: spec.LexModeIDDefault,
+ }
+ lastModeID := spec.LexModeIDDefault
+ modeEntries := [][]*LexEntry{
+ nil,
+ {},
+ }
+ fragments := map[spec.LexKindName]*LexEntry{}
+ for _, e := range entries {
+ if e.Fragment {
+ fragments[e.Kind] = e
+ continue
+ }
+ ms := e.Modes
+ if len(ms) == 0 {
+ ms = []spec.LexModeName{
+ spec.LexModeNameDefault,
+ }
+ }
+ for _, modeName := range ms {
+ modeID, ok := modeName2ID[modeName]
+ if !ok {
+ modeID = lastModeID + 1
+ lastModeID = modeID
+ modeName2ID[modeName] = modeID
+ modeNames = append(modeNames, modeName)
+ modeEntries = append(modeEntries, []*LexEntry{})
+ }
+ modeEntries[modeID] = append(modeEntries[modeID], e)
+ }
+ }
+ return modeEntries, modeNames, modeName2ID, fragments
+}
+
+func compile(
+ entries []*LexEntry,
+ modeName2ID map[spec.LexModeName]spec.LexModeID,
+ fragments map[spec.LexKindName]*LexEntry,
+ compLv int,
+) (*spec.CompiledLexModeSpec, error, []*CompileError) {
+ var kindNames []spec.LexKindName
+ kindIDToName := map[spec.LexModeKindID]spec.LexKindName{}
+ var patterns map[spec.LexModeKindID][]byte
+ {
+ kindNames = append(kindNames, spec.LexKindNameNil)
+ patterns = map[spec.LexModeKindID][]byte{}
+ for i, e := range entries {
+ kindID := spec.LexModeKindID(i + 1)
+
+ kindNames = append(kindNames, e.Kind)
+ kindIDToName[kindID] = e.Kind
+ patterns[kindID] = []byte(e.Pattern)
+ }
+ }
+
+ push := []spec.LexModeID{
+ spec.LexModeIDNil,
+ }
+ pop := []int{
+ 0,
+ }
+ for _, e := range entries {
+ pushV := spec.LexModeIDNil
+ if e.Push != "" {
+ pushV = modeName2ID[e.Push]
+ }
+ push = append(push, pushV)
+ popV := 0
+ if e.Pop {
+ popV = 1
+ }
+ pop = append(pop, popV)
+ }
+
+ fragmentPatterns := map[spec.LexKindName][]byte{}
+ for k, e := range fragments {
+ fragmentPatterns[k] = []byte(e.Pattern)
+ }
+
+ fragmentCPTrees := make(map[spec.LexKindName]psr.CPTree, len(fragmentPatterns))
+ {
+ var cerrs []*CompileError
+ for kind, pat := range fragmentPatterns {
+ p := psr.NewParser(kind, bytes.NewReader(pat))
+ t, err := p.Parse()
+ if err != nil {
+ if err == psr.ParseErr {
+ detail, cause := p.Error()
+ cerrs = append(cerrs, &CompileError{
+ Kind: kind,
+ Fragment: true,
+ Cause: cause,
+ Detail: detail,
+ })
+ } else {
+ cerrs = append(cerrs, &CompileError{
+ Kind: kind,
+ Fragment: true,
+ Cause: err,
+ })
+ }
+ continue
+ }
+ fragmentCPTrees[kind] = t
+ }
+ if len(cerrs) > 0 {
+ return nil, fmt.Errorf("compile error"), cerrs
+ }
+
+ err := psr.CompleteFragments(fragmentCPTrees)
+ if err != nil {
+ if err == psr.ParseErr {
+ for _, frag := range fragmentCPTrees {
+ kind, frags, err := frag.Describe()
+ if err != nil {
+ return nil, err, nil
+ }
+
+ cerrs = append(cerrs, &CompileError{
+ Kind: kind,
+ Fragment: true,
+ Cause: fmt.Errorf("fragment contains undefined fragments or cycles"),
+ Detail: fmt.Sprintf("%v", frags),
+ })
+ }
+
+ return nil, fmt.Errorf("compile error"), cerrs
+ }
+
+ return nil, err, nil
+ }
+ }
+
+ cpTrees := map[spec.LexModeKindID]psr.CPTree{}
+ {
+ pats := make([]*psr.PatternEntry, len(patterns)+1)
+ pats[spec.LexModeKindIDNil] = &psr.PatternEntry{
+ ID: spec.LexModeKindIDNil,
+ }
+ for id, pattern := range patterns {
+ pats[id] = &psr.PatternEntry{
+ ID: id,
+ Pattern: pattern,
+ }
+ }
+
+ var cerrs []*CompileError
+ for _, pat := range pats {
+ if pat.ID == spec.LexModeKindIDNil {
+ continue
+ }
+
+ p := psr.NewParser(kindIDToName[pat.ID], bytes.NewReader(pat.Pattern))
+ t, err := p.Parse()
+ if err != nil {
+ if err == psr.ParseErr {
+ detail, cause := p.Error()
+ cerrs = append(cerrs, &CompileError{
+ Kind: kindIDToName[pat.ID],
+ Fragment: false,
+ Cause: cause,
+ Detail: detail,
+ })
+ } else {
+ cerrs = append(cerrs, &CompileError{
+ Kind: kindIDToName[pat.ID],
+ Fragment: false,
+ Cause: err,
+ })
+ }
+ continue
+ }
+
+ complete, err := psr.ApplyFragments(t, fragmentCPTrees)
+ if err != nil {
+ return nil, err, nil
+ }
+ if !complete {
+ _, frags, err := t.Describe()
+ if err != nil {
+ return nil, err, nil
+ }
+
+ cerrs = append(cerrs, &CompileError{
+ Kind: kindIDToName[pat.ID],
+ Fragment: false,
+ Cause: fmt.Errorf("pattern contains undefined fragments"),
+ Detail: fmt.Sprintf("%v", frags),
+ })
+ continue
+ }
+
+ cpTrees[pat.ID] = t
+ }
+ if len(cerrs) > 0 {
+ return nil, fmt.Errorf("compile error"), cerrs
+ }
+ }
+
+ var tranTab *spec.TransitionTable
+ {
+ root, symTab, err := dfa.ConvertCPTreeToByteTree(cpTrees)
+ if err != nil {
+ return nil, err, nil
+ }
+ d := dfa.GenDFA(root, symTab)
+ tranTab, err = dfa.GenTransitionTable(d)
+ if err != nil {
+ return nil, err, nil
+ }
+ }
+
+ var err error
+ switch compLv {
+ case 2:
+ tranTab, err = compressTransitionTableLv2(tranTab)
+ if err != nil {
+ return nil, err, nil
+ }
+ case 1:
+ tranTab, err = compressTransitionTableLv1(tranTab)
+ if err != nil {
+ return nil, err, nil
+ }
+ }
+
+ return &spec.CompiledLexModeSpec{
+ KindNames: kindNames,
+ Push: push,
+ Pop: pop,
+ DFA: tranTab,
+ }, nil, nil
+}
+
+const (
+ CompressionLevelMin = 0
+ CompressionLevelMax = 2
+)
+
+func compressTransitionTableLv2(tranTab *spec.TransitionTable) (*spec.TransitionTable, error) {
+ ueTab := compressor.NewUniqueEntriesTable()
+ {
+ orig, err := compressor.NewOriginalTable(convertStateIDSliceToIntSlice(tranTab.UncompressedTransition), tranTab.ColCount)
+ if err != nil {
+ return nil, err
+ }
+ err = ueTab.Compress(orig)
+ if err != nil {
+ return nil, err
+ }
+ }
+
+ rdTab := compressor.NewRowDisplacementTable(0)
+ {
+ orig, err := compressor.NewOriginalTable(ueTab.UniqueEntries, ueTab.OriginalColCount)
+ if err != nil {
+ return nil, err
+ }
+ err = rdTab.Compress(orig)
+ if err != nil {
+ return nil, err
+ }
+ }
+
+ tranTab.Transition = &spec.UniqueEntriesTable{
+ UniqueEntries: &spec.RowDisplacementTable{
+ OriginalRowCount: rdTab.OriginalRowCount,
+ OriginalColCount: rdTab.OriginalColCount,
+ EmptyValue: spec.StateIDNil,
+ Entries: convertIntSliceToStateIDSlice(rdTab.Entries),
+ Bounds: rdTab.Bounds,
+ RowDisplacement: rdTab.RowDisplacement,
+ },
+ RowNums: ueTab.RowNums,
+ OriginalRowCount: ueTab.OriginalRowCount,
+ OriginalColCount: ueTab.OriginalColCount,
+ }
+ tranTab.UncompressedTransition = nil
+
+ return tranTab, nil
+}
+
+func compressTransitionTableLv1(tranTab *spec.TransitionTable) (*spec.TransitionTable, error) {
+ ueTab := compressor.NewUniqueEntriesTable()
+ {
+ orig, err := compressor.NewOriginalTable(convertStateIDSliceToIntSlice(tranTab.UncompressedTransition), tranTab.ColCount)
+ if err != nil {
+ return nil, err
+ }
+ err = ueTab.Compress(orig)
+ if err != nil {
+ return nil, err
+ }
+ }
+
+ tranTab.Transition = &spec.UniqueEntriesTable{
+ UncompressedUniqueEntries: convertIntSliceToStateIDSlice(ueTab.UniqueEntries),
+ RowNums: ueTab.RowNums,
+ OriginalRowCount: ueTab.OriginalRowCount,
+ OriginalColCount: ueTab.OriginalColCount,
+ }
+ tranTab.UncompressedTransition = nil
+
+ return tranTab, nil
+}
+
+func convertStateIDSliceToIntSlice(s []spec.StateID) []int {
+ is := make([]int, len(s))
+ for i, v := range s {
+ is[i] = v.Int()
+ }
+ return is
+}
+
+func convertIntSliceToStateIDSlice(s []int) []spec.StateID {
+ ss := make([]spec.StateID, len(s))
+ for i, v := range s {
+ ss[i] = spec.StateID(v)
+ }
+ return ss
+}
+
+type LexEntry struct {
+ Kind spec.LexKindName
+ Pattern string
+ Modes []spec.LexModeName
+ Push spec.LexModeName
+ Pop bool
+ Fragment bool
+}
+
+type LexSpec struct {
+ Entries []*LexEntry
+}
+
+func (s *LexSpec) Validate() error {
+ if len(s.Entries) <= 0 {
+ return fmt.Errorf("the lexical specification must have at least one entry")
+ }
+ {
+ ks := map[string]struct{}{}
+ fks := map[string]struct{}{}
+ for _, e := range s.Entries {
+ // Allow duplicate names between fragments and non-fragments.
+ if e.Fragment {
+ if _, exist := fks[e.Kind.String()]; exist {
+ return fmt.Errorf("kinds `%v` are duplicates", e.Kind)
+ }
+ fks[e.Kind.String()] = struct{}{}
+ } else {
+ if _, exist := ks[e.Kind.String()]; exist {
+ return fmt.Errorf("kinds `%v` are duplicates", e.Kind)
+ }
+ ks[e.Kind.String()] = struct{}{}
+ }
+ }
+ }
+ {
+ kinds := []string{}
+ modes := []string{
+ spec.LexModeNameDefault.String(), // This is a predefined mode.
+ }
+ for _, e := range s.Entries {
+ if e.Fragment {
+ continue
+ }
+
+ kinds = append(kinds, e.Kind.String())
+
+ for _, m := range e.Modes {
+ modes = append(modes, m.String())
+ }
+ }
+
+ kindErrs := findSpellingInconsistenciesErrors(kinds, nil)
+ modeErrs := findSpellingInconsistenciesErrors(modes, func(ids []string) error {
+ if SnakeCaseToUpperCamelCase(ids[0]) == SnakeCaseToUpperCamelCase(spec.LexModeNameDefault.String()) {
+ var b strings.Builder
+ fmt.Fprintf(&b, "%+v", ids[0])
+ for _, id := range ids[1:] {
+ fmt.Fprintf(&b, ", %+v", id)
+ }
+ return fmt.Errorf("these identifiers are treated as the same. please use the same spelling as predefined '%v': %v", spec.LexModeNameDefault, b.String())
+ }
+ return nil
+ })
+ errs := append(kindErrs, modeErrs...)
+ if len(errs) > 0 {
+ var b strings.Builder
+ fmt.Fprintf(&b, "%v", errs[0])
+ for _, err := range errs[1:] {
+ fmt.Fprintf(&b, "\n%v", err)
+ }
+ return fmt.Errorf(b.String())
+ }
+ }
+
+ return nil
+}
+
+func findSpellingInconsistenciesErrors(ids []string, hook func(ids []string) error) []error {
+ duplicated := FindSpellingInconsistencies(ids)
+ if len(duplicated) == 0 {
+ return nil
+ }
+
+ var errs []error
+ for _, dup := range duplicated {
+ if hook != nil {
+ err := hook(dup)
+ if err != nil {
+ errs = append(errs, err)
+ continue
+ }
+ }
+
+ var b strings.Builder
+ fmt.Fprintf(&b, "%+v", dup[0])
+ for _, id := range dup[1:] {
+ fmt.Fprintf(&b, ", %+v", id)
+ }
+ err := fmt.Errorf("these identifiers are treated as the same. please use the same spelling: %v", b.String())
+ errs = append(errs, err)
+ }
+
+ return errs
+}
+
+// FindSpellingInconsistencies finds spelling inconsistencies in identifiers. The identifiers are considered to be the same
+// if they are spelled the same when expressed in UpperCamelCase. For example, `left_paren` and `LeftParen` are spelled the same
+// in UpperCamelCase. Thus they are considere to be spelling inconsistency.
+func FindSpellingInconsistencies(ids []string) [][]string {
+ m := map[string][]string{}
+ for _, id := range removeDuplicates(ids) {
+ c := SnakeCaseToUpperCamelCase(id)
+ m[c] = append(m[c], id)
+ }
+
+ var duplicated [][]string
+ for _, camels := range m {
+ if len(camels) == 1 {
+ continue
+ }
+ duplicated = append(duplicated, camels)
+ }
+
+ for _, dup := range duplicated {
+ sort.Slice(dup, func(i, j int) bool {
+ return dup[i] < dup[j]
+ })
+ }
+ sort.Slice(duplicated, func(i, j int) bool {
+ return duplicated[i][0] < duplicated[j][0]
+ })
+
+ return duplicated
+}
+
+func removeDuplicates(s []string) []string {
+ m := map[string]struct{}{}
+ for _, v := range s {
+ m[v] = struct{}{}
+ }
+
+ var unique []string
+ for v := range m {
+ unique = append(unique, v)
+ }
+
+ return unique
+}
+
+func SnakeCaseToUpperCamelCase(snake string) string {
+ elems := strings.Split(snake, "_")
+ for i, e := range elems {
+ if len(e) == 0 {
+ continue
+ }
+ elems[i] = strings.ToUpper(string(e[0])) + e[1:]
+ }
+
+ return strings.Join(elems, "")
+}