1 files changed, 407 insertions, 14 deletions
diff --git a/src/tre.go b/src/tre.go
index cab7784..2350a52 100644
--- a/src/tre.go
+++ b/src/tre.go
@@ -3,6 +3,8 @@ package tre
 import (
 	"encoding/binary"
 	"fmt"
+	"regexp"
+	"strconv"
 	"strings"
 	"sort"
 )
@@ -174,33 +176,33 @@ type Compressor interface {
 }
 
 var (
-	_ Compressor = &UniqueEntriesTable{}
-	_ Compressor = &RowDisplacementTable{}
+	_ Compressor = &CompressorUniqueEntriesTable{}
+	_ Compressor = &CompressorRowDisplacementTable{}
 )
 
-type UniqueEntriesTable struct {
+type CompressorUniqueEntriesTable struct {
 	UniqueEntries    []int
 	RowNums          []int
 	OriginalRowCount int
 	OriginalColCount int
 }
 
-func NewUniqueEntriesTable() *UniqueEntriesTable {
-	return &UniqueEntriesTable{}
+func NewCompressorUniqueEntriesTable() *CompressorUniqueEntriesTable {
+	return &CompressorUniqueEntriesTable{}
 }
 
-func (tab *UniqueEntriesTable) Lookup(row, col int) (int, error) {
+func (tab *CompressorUniqueEntriesTable) Lookup(row, col int) (int, error) {
 	if row < 0 || row >= tab.OriginalRowCount || col < 0 || col >= tab.OriginalColCount {
 		return 0, fmt.Errorf("indexes are out of range: [%v, %v]", row, col)
 	}
 	return tab.UniqueEntries[tab.RowNums[row]*tab.OriginalColCount+col], nil
 }
 
-func (tab *UniqueEntriesTable) OriginalTableSize() (int, int) {
+func (tab *CompressorUniqueEntriesTable) OriginalTableSize() (int, int) {
 	return tab.OriginalRowCount, tab.OriginalColCount
 }
 
-func (tab *UniqueEntriesTable) Compress(orig *OriginalTable) error {
+func (tab *CompressorUniqueEntriesTable) Compress(orig *OriginalTable) error {
 	var uniqueEntries []int
 	rowNums := make([]int, orig.rowCount)
 	hash2RowNum := map[string]int{}
@@ -238,7 +240,7 @@ func (tab *UniqueEntriesTable) Compress(orig *OriginalTable) error {
 
 const ForbiddenValue = -1
 
-type RowDisplacementTable struct {
+type CompressorRowDisplacementTable struct {
 	OriginalRowCount int
 	OriginalColCount int
 	EmptyValue       int
@@ -247,13 +249,13 @@ type RowDisplacementTable struct {
 	RowDisplacement  []int
 }
 
-func NewRowDisplacementTable(emptyValue int) *RowDisplacementTable {
-	return &RowDisplacementTable{
+func NewCompressorRowDisplacementTable(emptyValue int) *CompressorRowDisplacementTable {
+	return &CompressorRowDisplacementTable{
 		EmptyValue: emptyValue,
 	}
 }
 
-func (tab *RowDisplacementTable) Lookup(row int, col int) (int, error) {
+func (tab *CompressorRowDisplacementTable) Lookup(row int, col int) (int, error) {
 	if row < 0 || row >= tab.OriginalRowCount || col < 0 || col >= tab.OriginalColCount {
 		return tab.EmptyValue, fmt.Errorf("indexes are out of range: [%v, %v]", row, col)
 	}
@@ -264,7 +266,7 @@ func (tab *RowDisplacementTable) Lookup(row int, col int) (int, error) {
 	return tab.Entries[d+col], nil
 }
 
-func (tab *RowDisplacementTable) OriginalTableSize() (int, int) {
+func (tab *CompressorRowDisplacementTable) OriginalTableSize() (int, int) {
 	return tab.OriginalRowCount, tab.OriginalColCount
 }
 
@@ -274,7 +276,7 @@ type rowInfo struct {
 	nonEmptyCol   []int
 }
 
-func (tab *RowDisplacementTable) Compress(orig *OriginalTable) error {
+func (tab *CompressorRowDisplacementTable) Compress(orig *OriginalTable) error {
 	rowInfo := make([]rowInfo, orig.rowCount)
 	{
 		row := 0
@@ -354,3 +356,394 @@ func (tab *RowDisplacementTable) Compress(orig *OriginalTable) error {
 
 func Main() {
 }
+
+var rep = strings.NewReplacer(
+	`.`, `\.`,
+	`*`, `\*`,
+	`+`, `\+`,
+	`?`, `\?`,
+	`|`, `\|`,
+	`(`, `\(`,
+	`)`, `\)`,
+	`[`, `\[`,
+	`\`, `\\`,
+)
+
+// EscapePattern escapes the special characters.
+// For example, EscapePattern(`+`) returns `\+`.
+func EscapePattern(s string) string {
+	return rep.Replace(s)
+}
+
+// LexKindID represents an ID of a lexical kind and is unique across all modes.
+type LexKindID int
+
+const (
+	LexKindIDNil = LexKindID(0)
+	LexKindIDMin = LexKindID(1)
+)
+
+func (id LexKindID) Int() int {
+	return int(id)
+}
+
+// LexModeKindID represents an ID of a lexical kind and is unique within a mode.
+// Use LexKindID to identify a kind across all modes uniquely.
+type LexModeKindID int
+
+const (
+	LexModeKindIDNil = LexModeKindID(0)
+	LexModeKindIDMin = LexModeKindID(1)
+)
+
+func (id LexModeKindID) Int() int {
+	return int(id)
+}
+
+// LexKindName represents a name of a lexical kind.
+type LexKindName string
+
+const LexKindNameNil = LexKindName("")
+
+func (k LexKindName) String() string {
+	return string(k)
+}
+
+func (k LexKindName) validate() error {
+	err := validateIdentifier(k.String())
+	if err != nil {
+		return fmt.Errorf("invalid kind name: %v", err)
+	}
+	return nil
+}
+
+// LexPattern represents a pattern of a lexeme.
+// The pattern is written in regular expression.
+type LexPattern string
+
+func (p LexPattern) validate() error {
+	if p == "" {
+		return fmt.Errorf("pattern doesn't allow to be the empty string")
+	}
+	return nil
+}
+
+// LexModeID represents an ID of a lex mode.
+type LexModeID int
+
+const (
+	LexModeIDNil     = LexModeID(0)
+	LexModeIDDefault = LexModeID(1)
+)
+
+func (n LexModeID) String() string {
+	return strconv.Itoa(int(n))
+}
+
+func (n LexModeID) Int() int {
+	return int(n)
+}
+
+func (n LexModeID) IsNil() bool {
+	return n == LexModeIDNil
+}
+
+// LexModeName represents a name of a lex mode.
+type LexModeName string
+
+const (
+	LexModeNameNil     = LexModeName("")
+	LexModeNameDefault = LexModeName("default")
+)
+
+func (m LexModeName) String() string {
+	return string(m)
+}
+
+func (m LexModeName) validate() error {
+	err := validateIdentifier(m.String())
+	if err != nil {
+		return fmt.Errorf("invalid mode name: %v", err)
+	}
+	return nil
+}
+
+const idPattern = `^[a-z](_?[0-9a-z]+)*$`
+
+var idRE = regexp.MustCompile(idPattern)
+
+func validateIdentifier(id string) error {
+	if id == "" {
+		return fmt.Errorf("identifier doesn't allow to be the empty string")
+	}
+	if !idRE.MatchString(id) {
+		return fmt.Errorf("identifier must be %v", idPattern)
+	}
+	return nil
+}
+
+func SnakeCaseToUpperCamelCase(snake string) string {
+	elems := strings.Split(snake, "_")
+	for i, e := range elems {
+		if len(e) == 0 {
+			continue
+		}
+		elems[i] = strings.ToUpper(string(e[0])) + e[1:]
+	}
+
+	return strings.Join(elems, "")
+}
+
+type LexEntry struct {
+	Kind     LexKindName   `json:"kind"`
+	Pattern  LexPattern    `json:"pattern"`
+	Modes    []LexModeName `json:"modes"`
+	Push     LexModeName   `json:"push"`
+	Pop      bool          `json:"pop"`
+	Fragment bool          `json:"fragment"`
+}
+
+func (e *LexEntry) validate() error {
+	err := e.Kind.validate()
+	if err != nil {
+		return err
+	}
+	err = e.Pattern.validate()
+	if err != nil {
+		return err
+	}
+	if len(e.Modes) > 0 {
+		for _, mode := range e.Modes {
+			err = mode.validate()
+			if err != nil {
+				return err
+			}
+		}
+	}
+	return nil
+}
+
+type LexSpec struct {
+	Name    string      `json:"name"`
+	Entries []*LexEntry `json:"entries"`
+}
+
+func (s *LexSpec) Validate() error {
+	err := validateIdentifier(s.Name)
+	if err != nil {
+		return fmt.Errorf("invalid specification name: %v", err)
+	}
+
+	if len(s.Entries) <= 0 {
+		return fmt.Errorf("the lexical specification must have at least one entry")
+	}
+	{
+		var errs []error
+		for i, e := range s.Entries {
+			err := e.validate()
+			if err != nil {
+				errs = append(errs, fmt.Errorf("entry #%v: %w", i+1, err))
+			}
+		}
+		if len(errs) > 0 {
+			var b strings.Builder
+			fmt.Fprintf(&b, "%v", errs[0])
+			for _, err := range errs[1:] {
+				fmt.Fprintf(&b, "\n%v", err)
+			}
+			return fmt.Errorf(b.String())
+		}
+	}
+	{
+		ks := map[string]struct{}{}
+		fks := map[string]struct{}{}
+		for _, e := range s.Entries {
+			// Allow duplicate names between fragments and non-fragments.
+			if e.Fragment {
+				if _, exist := fks[e.Kind.String()]; exist {
+					return fmt.Errorf("kinds `%v` are duplicates", e.Kind)
+				}
+				fks[e.Kind.String()] = struct{}{}
+			} else {
+				if _, exist := ks[e.Kind.String()]; exist {
+					return fmt.Errorf("kinds `%v` are duplicates", e.Kind)
+				}
+				ks[e.Kind.String()] = struct{}{}
+			}
+		}
+	}
+	{
+		kinds := []string{}
+		modes := []string{
+			LexModeNameDefault.String(), // This is a predefined mode.
+		}
+		for _, e := range s.Entries {
+			if e.Fragment {
+				continue
+			}
+
+			kinds = append(kinds, e.Kind.String())
+
+			for _, m := range e.Modes {
+				modes = append(modes, m.String())
+			}
+		}
+
+		kindErrs := findSpellingInconsistenciesErrors(kinds, nil)
+		modeErrs := findSpellingInconsistenciesErrors(modes, func(ids []string) error {
+			if SnakeCaseToUpperCamelCase(ids[0]) == SnakeCaseToUpperCamelCase(LexModeNameDefault.String()) {
+				var b strings.Builder
+				fmt.Fprintf(&b, "%+v", ids[0])
+				for _, id := range ids[1:] {
+					fmt.Fprintf(&b, ", %+v", id)
+				}
+				return fmt.Errorf("these identifiers are treated as the same. please use the same spelling as predefined '%v': %v", LexModeNameDefault, b.String())
+			}
+			return nil
+		})
+		errs := append(kindErrs, modeErrs...)
+		if len(errs) > 0 {
+			var b strings.Builder
+			fmt.Fprintf(&b, "%v", errs[0])
+			for _, err := range errs[1:] {
+				fmt.Fprintf(&b, "\n%v", err)
+			}
+			return fmt.Errorf(b.String())
+		}
+	}
+
+	return nil
+}
+
+func findSpellingInconsistenciesErrors(ids []string, hook func(ids []string) error) []error {
+	duplicated := FindSpellingInconsistencies(ids)
+	if len(duplicated) == 0 {
+		return nil
+	}
+
+	var errs []error
+	for _, dup := range duplicated {
+		if hook != nil {
+			err := hook(dup)
+			if err != nil {
+				errs = append(errs, err)
+				continue
+			}
+		}
+
+		var b strings.Builder
+		fmt.Fprintf(&b, "%+v", dup[0])
+		for _, id := range dup[1:] {
+			fmt.Fprintf(&b, ", %+v", id)
+		}
+		err := fmt.Errorf("these identifiers are treated as the same. please use the same spelling: %v", b.String())
+		errs = append(errs, err)
+	}
+
+	return errs
+}
+
+// FindSpellingInconsistencies finds spelling inconsistencies in identifiers. The identifiers are considered to be the same
+// if they are spelled the same when expressed in UpperCamelCase. For example, `left_paren` and `LeftParen` are spelled the same
+// in UpperCamelCase. Thus they are considere to be spelling inconsistency.
+func FindSpellingInconsistencies(ids []string) [][]string {
+	m := map[string][]string{}
+	for _, id := range removeDuplicates(ids) {
+		c := SnakeCaseToUpperCamelCase(id)
+		m[c] = append(m[c], id)
+	}
+
+	var duplicated [][]string
+	for _, camels := range m {
+		if len(camels) == 1 {
+			continue
+		}
+		duplicated = append(duplicated, camels)
+	}
+
+	for _, dup := range duplicated {
+		sort.Slice(dup, func(i, j int) bool {
+			return dup[i] < dup[j]
+		})
+	}
+	sort.Slice(duplicated, func(i, j int) bool {
+		return duplicated[i][0] < duplicated[j][0]
+	})
+
+	return duplicated
+}
+
+func removeDuplicates(s []string) []string {
+	m := map[string]struct{}{}
+	for _, v := range s {
+		m[v] = struct{}{}
+	}
+
+	var unique []string
+	for v := range m {
+		unique = append(unique, v)
+	}
+
+	return unique
+}
+
+// StateID represents an ID of a state of a transition table.
+type StateID int
+
+const (
+	// StateIDNil represents an empty entry of a transition table.
+	// When the driver reads this value, it raises an error meaning lexical analysis failed.
+	StateIDNil = StateID(0)
+
+	// StateIDMin is the minimum value of the state ID. All valid state IDs are represented as
+	// sequential numbers starting from this value.
+	StateIDMin = StateID(1)
+)
+
+func (id StateID) Int() int {
+	return int(id)
+}
+
+type SpecRowDisplacementTable struct {
+	OriginalRowCount int       `json:"original_row_count"`
+	OriginalColCount int       `json:"original_col_count"`
+	EmptyValue       StateID   `json:"empty_value"`
+	Entries          []StateID `json:"entries"`
+	Bounds           []int     `json:"bounds"`
+	RowDisplacement  []int     `json:"row_displacement"`
+}
+
+type SpecUniqueEntriesTable struct {
+	UniqueEntries             *SpecRowDisplacementTable `json:"unique_entries,omitempty"`
+	UncompressedUniqueEntries []StateID             `json:"uncompressed_unique_entries,omitempty"`
+	RowNums                   []int                 `json:"row_nums"`
+	OriginalRowCount          int                   `json:"original_row_count"`
+	OriginalColCount          int                   `json:"original_col_count"`
+	EmptyValue                int                   `json:"empty_value"`
+}
+
+type TransitionTable struct {
+	InitialStateID         StateID             `json:"initial_state_id"`
+	AcceptingStates        []LexModeKindID     `json:"accepting_states"`
+	RowCount               int                 `json:"row_count"`
+	ColCount               int                 `json:"col_count"`
+	Transition             *SpecUniqueEntriesTable `json:"transition,omitempty"`
+	UncompressedTransition []StateID           `json:"uncompressed_transition,omitempty"`
+}
+
+type CompiledLexModeSpec struct {
+	KindNames []LexKindName    `json:"kind_names"`
+	Push      []LexModeID      `json:"push"`
+	Pop       []int            `json:"pop"`
+	DFA       *TransitionTable `json:"dfa"`
+}
+
+type CompiledLexSpec struct {
+	Name             string                 `json:"name"`
+	InitialModeID    LexModeID              `json:"initial_mode_id"`
+	ModeNames        []LexModeName          `json:"mode_names"`
+	KindNames        []LexKindName          `json:"kind_names"`
+	KindIDs          [][]LexKindID          `json:"kind_ids"`
+	CompressionLevel int                    `json:"compression_level"`
+	Specs            []*CompiledLexModeSpec `json:"specs"`
+}