1 files changed, 577 insertions, 0 deletions
diff --git a/src/urubu/grammar/lexical.go b/src/urubu/grammar/lexical.go
new file mode 100644
index 0000000..515e491
--- /dev/null
+++ b/src/urubu/grammar/lexical.go
@@ -0,0 +1,577 @@
+package lexical
+
+import (
+	"bytes"
+	"fmt"
+	"sort"
+	"strings"
+
+	"urubu/compressor"
+	"urubu/grammar/lexical/dfa"
+	psr "urubu/grammar/lexical/parser"
+	spec "urubu/spec/grammar"
+)
+
+type CompileError struct {
+	Kind     spec.LexKindName
+	Fragment bool
+	Cause    error
+	Detail   string
+}
+
+func Compile(lexspec *LexSpec, compLv int) (*spec.LexicalSpec, error, []*CompileError) {
+	err := lexspec.Validate()
+	if err != nil {
+		return nil, fmt.Errorf("invalid lexical specification:\n%w", err), nil
+	}
+
+	modeEntries, modeNames, modeName2ID, fragmetns := groupEntriesByLexMode(lexspec.Entries)
+
+	modeSpecs := []*spec.CompiledLexModeSpec{
+		nil,
+	}
+	for i, es := range modeEntries[1:] {
+		modeName := modeNames[i+1]
+		modeSpec, err, cerrs := compile(es, modeName2ID, fragmetns, compLv)
+		if err != nil {
+			return nil, fmt.Errorf("failed to compile in %v mode: %w", modeName, err), cerrs
+		}
+		modeSpecs = append(modeSpecs, modeSpec)
+	}
+
+	var kindNames []spec.LexKindName
+	var name2ID map[spec.LexKindName]spec.LexKindID
+	{
+		name2ID = map[spec.LexKindName]spec.LexKindID{}
+		id := spec.LexKindIDMin
+		for _, modeSpec := range modeSpecs[1:] {
+			for _, name := range modeSpec.KindNames[1:] {
+				if _, ok := name2ID[name]; ok {
+					continue
+				}
+				name2ID[name] = id
+				id++
+			}
+		}
+
+		kindNames = make([]spec.LexKindName, len(name2ID)+1)
+		for name, id := range name2ID {
+			kindNames[id] = name
+		}
+	}
+
+	var kindIDs [][]spec.LexKindID
+	{
+		kindIDs = make([][]spec.LexKindID, len(modeSpecs))
+		for i, modeSpec := range modeSpecs[1:] {
+			ids := make([]spec.LexKindID, len(modeSpec.KindNames))
+			for modeID, name := range modeSpec.KindNames {
+				if modeID == 0 {
+					continue
+				}
+				ids[modeID] = name2ID[name]
+			}
+			kindIDs[i+1] = ids
+		}
+	}
+
+	return &spec.LexicalSpec{
+		InitialModeID:    spec.LexModeIDDefault,
+		ModeNames:        modeNames,
+		KindNames:        kindNames,
+		KindIDs:          kindIDs,
+		CompressionLevel: compLv,
+		Specs:            modeSpecs,
+	}, nil, nil
+}
+
+func groupEntriesByLexMode(entries []*LexEntry) ([][]*LexEntry, []spec.LexModeName, map[spec.LexModeName]spec.LexModeID, map[spec.LexKindName]*LexEntry) {
+	modeNames := []spec.LexModeName{
+		spec.LexModeNameNil,
+		spec.LexModeNameDefault,
+	}
+	modeName2ID := map[spec.LexModeName]spec.LexModeID{
+		spec.LexModeNameNil:     spec.LexModeIDNil,
+		spec.LexModeNameDefault: spec.LexModeIDDefault,
+	}
+	lastModeID := spec.LexModeIDDefault
+	modeEntries := [][]*LexEntry{
+		nil,
+		{},
+	}
+	fragments := map[spec.LexKindName]*LexEntry{}
+	for _, e := range entries {
+		if e.Fragment {
+			fragments[e.Kind] = e
+			continue
+		}
+		ms := e.Modes
+		if len(ms) == 0 {
+			ms = []spec.LexModeName{
+				spec.LexModeNameDefault,
+			}
+		}
+		for _, modeName := range ms {
+			modeID, ok := modeName2ID[modeName]
+			if !ok {
+				modeID = lastModeID + 1
+				lastModeID = modeID
+				modeName2ID[modeName] = modeID
+				modeNames = append(modeNames, modeName)
+				modeEntries = append(modeEntries, []*LexEntry{})
+			}
+			modeEntries[modeID] = append(modeEntries[modeID], e)
+		}
+	}
+	return modeEntries, modeNames, modeName2ID, fragments
+}
+
+func compile(
+	entries []*LexEntry,
+	modeName2ID map[spec.LexModeName]spec.LexModeID,
+	fragments map[spec.LexKindName]*LexEntry,
+	compLv int,
+) (*spec.CompiledLexModeSpec, error, []*CompileError) {
+	var kindNames []spec.LexKindName
+	kindIDToName := map[spec.LexModeKindID]spec.LexKindName{}
+	var patterns map[spec.LexModeKindID][]byte
+	{
+		kindNames = append(kindNames, spec.LexKindNameNil)
+		patterns = map[spec.LexModeKindID][]byte{}
+		for i, e := range entries {
+			kindID := spec.LexModeKindID(i + 1)
+
+			kindNames = append(kindNames, e.Kind)
+			kindIDToName[kindID] = e.Kind
+			patterns[kindID] = []byte(e.Pattern)
+		}
+	}
+
+	push := []spec.LexModeID{
+		spec.LexModeIDNil,
+	}
+	pop := []int{
+		0,
+	}
+	for _, e := range entries {
+		pushV := spec.LexModeIDNil
+		if e.Push != "" {
+			pushV = modeName2ID[e.Push]
+		}
+		push = append(push, pushV)
+		popV := 0
+		if e.Pop {
+			popV = 1
+		}
+		pop = append(pop, popV)
+	}
+
+	fragmentPatterns := map[spec.LexKindName][]byte{}
+	for k, e := range fragments {
+		fragmentPatterns[k] = []byte(e.Pattern)
+	}
+
+	fragmentCPTrees := make(map[spec.LexKindName]psr.CPTree, len(fragmentPatterns))
+	{
+		var cerrs []*CompileError
+		for kind, pat := range fragmentPatterns {
+			p := psr.NewParser(kind, bytes.NewReader(pat))
+			t, err := p.Parse()
+			if err != nil {
+				if err == psr.ParseErr {
+					detail, cause := p.Error()
+					cerrs = append(cerrs, &CompileError{
+						Kind:     kind,
+						Fragment: true,
+						Cause:    cause,
+						Detail:   detail,
+					})
+				} else {
+					cerrs = append(cerrs, &CompileError{
+						Kind:     kind,
+						Fragment: true,
+						Cause:    err,
+					})
+				}
+				continue
+			}
+			fragmentCPTrees[kind] = t
+		}
+		if len(cerrs) > 0 {
+			return nil, fmt.Errorf("compile error"), cerrs
+		}
+
+		err := psr.CompleteFragments(fragmentCPTrees)
+		if err != nil {
+			if err == psr.ParseErr {
+				for _, frag := range fragmentCPTrees {
+					kind, frags, err := frag.Describe()
+					if err != nil {
+						return nil, err, nil
+					}
+
+					cerrs = append(cerrs, &CompileError{
+						Kind:     kind,
+						Fragment: true,
+						Cause:    fmt.Errorf("fragment contains undefined fragments or cycles"),
+						Detail:   fmt.Sprintf("%v", frags),
+					})
+				}
+
+				return nil, fmt.Errorf("compile error"), cerrs
+			}
+
+			return nil, err, nil
+		}
+	}
+
+	cpTrees := map[spec.LexModeKindID]psr.CPTree{}
+	{
+		pats := make([]*psr.PatternEntry, len(patterns)+1)
+		pats[spec.LexModeKindIDNil] = &psr.PatternEntry{
+			ID: spec.LexModeKindIDNil,
+		}
+		for id, pattern := range patterns {
+			pats[id] = &psr.PatternEntry{
+				ID:      id,
+				Pattern: pattern,
+			}
+		}
+
+		var cerrs []*CompileError
+		for _, pat := range pats {
+			if pat.ID == spec.LexModeKindIDNil {
+				continue
+			}
+
+			p := psr.NewParser(kindIDToName[pat.ID], bytes.NewReader(pat.Pattern))
+			t, err := p.Parse()
+			if err != nil {
+				if err == psr.ParseErr {
+					detail, cause := p.Error()
+					cerrs = append(cerrs, &CompileError{
+						Kind:     kindIDToName[pat.ID],
+						Fragment: false,
+						Cause:    cause,
+						Detail:   detail,
+					})
+				} else {
+					cerrs = append(cerrs, &CompileError{
+						Kind:     kindIDToName[pat.ID],
+						Fragment: false,
+						Cause:    err,
+					})
+				}
+				continue
+			}
+
+			complete, err := psr.ApplyFragments(t, fragmentCPTrees)
+			if err != nil {
+				return nil, err, nil
+			}
+			if !complete {
+				_, frags, err := t.Describe()
+				if err != nil {
+					return nil, err, nil
+				}
+
+				cerrs = append(cerrs, &CompileError{
+					Kind:     kindIDToName[pat.ID],
+					Fragment: false,
+					Cause:    fmt.Errorf("pattern contains undefined fragments"),
+					Detail:   fmt.Sprintf("%v", frags),
+				})
+				continue
+			}
+
+			cpTrees[pat.ID] = t
+		}
+		if len(cerrs) > 0 {
+			return nil, fmt.Errorf("compile error"), cerrs
+		}
+	}
+
+	var tranTab *spec.TransitionTable
+	{
+		root, symTab, err := dfa.ConvertCPTreeToByteTree(cpTrees)
+		if err != nil {
+			return nil, err, nil
+		}
+		d := dfa.GenDFA(root, symTab)
+		tranTab, err = dfa.GenTransitionTable(d)
+		if err != nil {
+			return nil, err, nil
+		}
+	}
+
+	var err error
+	switch compLv {
+	case 2:
+		tranTab, err = compressTransitionTableLv2(tranTab)
+		if err != nil {
+			return nil, err, nil
+		}
+	case 1:
+		tranTab, err = compressTransitionTableLv1(tranTab)
+		if err != nil {
+			return nil, err, nil
+		}
+	}
+
+	return &spec.CompiledLexModeSpec{
+		KindNames: kindNames,
+		Push:      push,
+		Pop:       pop,
+		DFA:       tranTab,
+	}, nil, nil
+}
+
+const (
+	CompressionLevelMin = 0
+	CompressionLevelMax = 2
+)
+
+func compressTransitionTableLv2(tranTab *spec.TransitionTable) (*spec.TransitionTable, error) {
+	ueTab := compressor.NewUniqueEntriesTable()
+	{
+		orig, err := compressor.NewOriginalTable(convertStateIDSliceToIntSlice(tranTab.UncompressedTransition), tranTab.ColCount)
+		if err != nil {
+			return nil, err
+		}
+		err = ueTab.Compress(orig)
+		if err != nil {
+			return nil, err
+		}
+	}
+
+	rdTab := compressor.NewRowDisplacementTable(0)
+	{
+		orig, err := compressor.NewOriginalTable(ueTab.UniqueEntries, ueTab.OriginalColCount)
+		if err != nil {
+			return nil, err
+		}
+		err = rdTab.Compress(orig)
+		if err != nil {
+			return nil, err
+		}
+	}
+
+	tranTab.Transition = &spec.UniqueEntriesTable{
+		UniqueEntries: &spec.RowDisplacementTable{
+			OriginalRowCount: rdTab.OriginalRowCount,
+			OriginalColCount: rdTab.OriginalColCount,
+			EmptyValue:       spec.StateIDNil,
+			Entries:          convertIntSliceToStateIDSlice(rdTab.Entries),
+			Bounds:           rdTab.Bounds,
+			RowDisplacement:  rdTab.RowDisplacement,
+		},
+		RowNums:          ueTab.RowNums,
+		OriginalRowCount: ueTab.OriginalRowCount,
+		OriginalColCount: ueTab.OriginalColCount,
+	}
+	tranTab.UncompressedTransition = nil
+
+	return tranTab, nil
+}
+
+func compressTransitionTableLv1(tranTab *spec.TransitionTable) (*spec.TransitionTable, error) {
+	ueTab := compressor.NewUniqueEntriesTable()
+	{
+		orig, err := compressor.NewOriginalTable(convertStateIDSliceToIntSlice(tranTab.UncompressedTransition), tranTab.ColCount)
+		if err != nil {
+			return nil, err
+		}
+		err = ueTab.Compress(orig)
+		if err != nil {
+			return nil, err
+		}
+	}
+
+	tranTab.Transition = &spec.UniqueEntriesTable{
+		UncompressedUniqueEntries: convertIntSliceToStateIDSlice(ueTab.UniqueEntries),
+		RowNums:                   ueTab.RowNums,
+		OriginalRowCount:          ueTab.OriginalRowCount,
+		OriginalColCount:          ueTab.OriginalColCount,
+	}
+	tranTab.UncompressedTransition = nil
+
+	return tranTab, nil
+}
+
+func convertStateIDSliceToIntSlice(s []spec.StateID) []int {
+	is := make([]int, len(s))
+	for i, v := range s {
+		is[i] = v.Int()
+	}
+	return is
+}
+
+func convertIntSliceToStateIDSlice(s []int) []spec.StateID {
+	ss := make([]spec.StateID, len(s))
+	for i, v := range s {
+		ss[i] = spec.StateID(v)
+	}
+	return ss
+}
+
+type LexEntry struct {
+	Kind     spec.LexKindName
+	Pattern  string
+	Modes    []spec.LexModeName
+	Push     spec.LexModeName
+	Pop      bool
+	Fragment bool
+}
+
+type LexSpec struct {
+	Entries []*LexEntry
+}
+
+func (s *LexSpec) Validate() error {
+	if len(s.Entries) <= 0 {
+		return fmt.Errorf("the lexical specification must have at least one entry")
+	}
+	{
+		ks := map[string]struct{}{}
+		fks := map[string]struct{}{}
+		for _, e := range s.Entries {
+			// Allow duplicate names between fragments and non-fragments.
+			if e.Fragment {
+				if _, exist := fks[e.Kind.String()]; exist {
+					return fmt.Errorf("kinds `%v` are duplicates", e.Kind)
+				}
+				fks[e.Kind.String()] = struct{}{}
+			} else {
+				if _, exist := ks[e.Kind.String()]; exist {
+					return fmt.Errorf("kinds `%v` are duplicates", e.Kind)
+				}
+				ks[e.Kind.String()] = struct{}{}
+			}
+		}
+	}
+	{
+		kinds := []string{}
+		modes := []string{
+			spec.LexModeNameDefault.String(), // This is a predefined mode.
+		}
+		for _, e := range s.Entries {
+			if e.Fragment {
+				continue
+			}
+
+			kinds = append(kinds, e.Kind.String())
+
+			for _, m := range e.Modes {
+				modes = append(modes, m.String())
+			}
+		}
+
+		kindErrs := findSpellingInconsistenciesErrors(kinds, nil)
+		modeErrs := findSpellingInconsistenciesErrors(modes, func(ids []string) error {
+			if SnakeCaseToUpperCamelCase(ids[0]) == SnakeCaseToUpperCamelCase(spec.LexModeNameDefault.String()) {
+				var b strings.Builder
+				fmt.Fprintf(&b, "%+v", ids[0])
+				for _, id := range ids[1:] {
+					fmt.Fprintf(&b, ", %+v", id)
+				}
+				return fmt.Errorf("these identifiers are treated as the same. please use the same spelling as predefined '%v': %v", spec.LexModeNameDefault, b.String())
+			}
+			return nil
+		})
+		errs := append(kindErrs, modeErrs...)
+		if len(errs) > 0 {
+			var b strings.Builder
+			fmt.Fprintf(&b, "%v", errs[0])
+			for _, err := range errs[1:] {
+				fmt.Fprintf(&b, "\n%v", err)
+			}
+			return fmt.Errorf(b.String())
+		}
+	}
+
+	return nil
+}
+
+func findSpellingInconsistenciesErrors(ids []string, hook func(ids []string) error) []error {
+	duplicated := FindSpellingInconsistencies(ids)
+	if len(duplicated) == 0 {
+		return nil
+	}
+
+	var errs []error
+	for _, dup := range duplicated {
+		if hook != nil {
+			err := hook(dup)
+			if err != nil {
+				errs = append(errs, err)
+				continue
+			}
+		}
+
+		var b strings.Builder
+		fmt.Fprintf(&b, "%+v", dup[0])
+		for _, id := range dup[1:] {
+			fmt.Fprintf(&b, ", %+v", id)
+		}
+		err := fmt.Errorf("these identifiers are treated as the same. please use the same spelling: %v", b.String())
+		errs = append(errs, err)
+	}
+
+	return errs
+}
+
+// FindSpellingInconsistencies finds spelling inconsistencies in identifiers. The identifiers are considered to be the same
+// if they are spelled the same when expressed in UpperCamelCase. For example, `left_paren` and `LeftParen` are spelled the same
+// in UpperCamelCase. Thus they are considere to be spelling inconsistency.
+func FindSpellingInconsistencies(ids []string) [][]string {
+	m := map[string][]string{}
+	for _, id := range removeDuplicates(ids) {
+		c := SnakeCaseToUpperCamelCase(id)
+		m[c] = append(m[c], id)
+	}
+
+	var duplicated [][]string
+	for _, camels := range m {
+		if len(camels) == 1 {
+			continue
+		}
+		duplicated = append(duplicated, camels)
+	}
+
+	for _, dup := range duplicated {
+		sort.Slice(dup, func(i, j int) bool {
+			return dup[i] < dup[j]
+		})
+	}
+	sort.Slice(duplicated, func(i, j int) bool {
+		return duplicated[i][0] < duplicated[j][0]
+	})
+
+	return duplicated
+}
+
+func removeDuplicates(s []string) []string {
+	m := map[string]struct{}{}
+	for _, v := range s {
+		m[v] = struct{}{}
+	}
+
+	var unique []string
+	for v := range m {
+		unique = append(unique, v)
+	}
+
+	return unique
+}
+
+func SnakeCaseToUpperCamelCase(snake string) string {
+	elems := strings.Split(snake, "_")
+	for i, e := range elems {
+		if len(e) == 0 {
+			continue
+		}
+		elems[i] = strings.ToUpper(string(e[0])) + e[1:]
+	}
+
+	return strings.Join(elems, "")
+}