Diffstat (limited to 'src/tre.go')
-rw-r--r--  src/tre.go  421
1 file changed, 407 insertions(+), 14 deletions(-)
diff --git a/src/tre.go b/src/tre.go
index cab7784..2350a52 100644
--- a/src/tre.go
+++ b/src/tre.go
@@ -3,6 +3,8 @@ package tre
import (
"encoding/binary"
"fmt"
+ "regexp"
+ "strconv"
"strings"
"sort"
)
@@ -174,33 +176,33 @@ type Compressor interface {
}
var (
- _ Compressor = &UniqueEntriesTable{}
- _ Compressor = &RowDisplacementTable{}
+ _ Compressor = &CompressorUniqueEntriesTable{}
+ _ Compressor = &CompressorRowDisplacementTable{}
)
-type UniqueEntriesTable struct {
+type CompressorUniqueEntriesTable struct {
UniqueEntries []int
RowNums []int
OriginalRowCount int
OriginalColCount int
}
-func NewUniqueEntriesTable() *UniqueEntriesTable {
- return &UniqueEntriesTable{}
+func NewCompressorUniqueEntriesTable() *CompressorUniqueEntriesTable {
+ return &CompressorUniqueEntriesTable{}
}
-func (tab *UniqueEntriesTable) Lookup(row, col int) (int, error) {
+func (tab *CompressorUniqueEntriesTable) Lookup(row, col int) (int, error) {
if row < 0 || row >= tab.OriginalRowCount || col < 0 || col >= tab.OriginalColCount {
return 0, fmt.Errorf("indexes are out of range: [%v, %v]", row, col)
}
return tab.UniqueEntries[tab.RowNums[row]*tab.OriginalColCount+col], nil
}
-func (tab *UniqueEntriesTable) OriginalTableSize() (int, int) {
+func (tab *CompressorUniqueEntriesTable) OriginalTableSize() (int, int) {
return tab.OriginalRowCount, tab.OriginalColCount
}
-func (tab *UniqueEntriesTable) Compress(orig *OriginalTable) error {
+func (tab *CompressorUniqueEntriesTable) Compress(orig *OriginalTable) error {
var uniqueEntries []int
rowNums := make([]int, orig.rowCount)
hash2RowNum := map[string]int{}
@@ -238,7 +240,7 @@ func (tab *UniqueEntriesTable) Compress(orig *OriginalTable) error {
const ForbiddenValue = -1
-type RowDisplacementTable struct {
+type CompressorRowDisplacementTable struct {
OriginalRowCount int
OriginalColCount int
EmptyValue int
@@ -247,13 +249,13 @@ type RowDisplacementTable struct {
RowDisplacement []int
}
-func NewRowDisplacementTable(emptyValue int) *RowDisplacementTable {
- return &RowDisplacementTable{
+func NewCompressorRowDisplacementTable(emptyValue int) *CompressorRowDisplacementTable {
+ return &CompressorRowDisplacementTable{
EmptyValue: emptyValue,
}
}
-func (tab *RowDisplacementTable) Lookup(row int, col int) (int, error) {
+func (tab *CompressorRowDisplacementTable) Lookup(row int, col int) (int, error) {
if row < 0 || row >= tab.OriginalRowCount || col < 0 || col >= tab.OriginalColCount {
return tab.EmptyValue, fmt.Errorf("indexes are out of range: [%v, %v]", row, col)
}
@@ -264,7 +266,7 @@ func (tab *RowDisplacementTable) Lookup(row int, col int) (int, error) {
return tab.Entries[d+col], nil
}
-func (tab *RowDisplacementTable) OriginalTableSize() (int, int) {
+func (tab *CompressorRowDisplacementTable) OriginalTableSize() (int, int) {
return tab.OriginalRowCount, tab.OriginalColCount
}
@@ -274,7 +276,7 @@ type rowInfo struct {
nonEmptyCol []int
}
-func (tab *RowDisplacementTable) Compress(orig *OriginalTable) error {
+func (tab *CompressorRowDisplacementTable) Compress(orig *OriginalTable) error {
rowInfo := make([]rowInfo, orig.rowCount)
{
row := 0
@@ -354,3 +356,394 @@ func (tab *RowDisplacementTable) Compress(orig *OriginalTable) error {
func Main() {
}
+
+var rep = strings.NewReplacer(
+ `.`, `\.`,
+ `*`, `\*`,
+ `+`, `\+`,
+ `?`, `\?`,
+ `|`, `\|`,
+ `(`, `\(`,
+ `)`, `\)`,
+ `[`, `\[`,
+ `\`, `\\`,
+)
+
+// EscapePattern escapes special characters in a pattern so that they match literally.
+// For example, EscapePattern(`+`) returns `\+`.
+func EscapePattern(s string) string {
+ return rep.Replace(s)
+}
+
+// LexKindID represents an ID of a lexical kind and is unique across all modes.
+type LexKindID int
+
+const (
+ LexKindIDNil = LexKindID(0)
+ LexKindIDMin = LexKindID(1)
+)
+
+func (id LexKindID) Int() int {
+ return int(id)
+}
+
+// LexModeKindID represents an ID of a lexical kind and is unique within a mode.
+// Use LexKindID to identify a kind across all modes uniquely.
+type LexModeKindID int
+
+const (
+ LexModeKindIDNil = LexModeKindID(0)
+ LexModeKindIDMin = LexModeKindID(1)
+)
+
+func (id LexModeKindID) Int() int {
+ return int(id)
+}
+
+// LexKindName represents a name of a lexical kind.
+type LexKindName string
+
+const LexKindNameNil = LexKindName("")
+
+func (k LexKindName) String() string {
+ return string(k)
+}
+
+func (k LexKindName) validate() error {
+ err := validateIdentifier(k.String())
+ if err != nil {
+ return fmt.Errorf("invalid kind name: %v", err)
+ }
+ return nil
+}
+
+// LexPattern represents a pattern of a lexeme.
+// The pattern is written as a regular expression.
+type LexPattern string
+
+func (p LexPattern) validate() error {
+ if p == "" {
+ return fmt.Errorf("pattern doesn't allow to be the empty string")
+ }
+ return nil
+}
+
+// LexModeID represents an ID of a lex mode.
+type LexModeID int
+
+const (
+ LexModeIDNil = LexModeID(0)
+ LexModeIDDefault = LexModeID(1)
+)
+
+func (n LexModeID) String() string {
+ return strconv.Itoa(int(n))
+}
+
+func (n LexModeID) Int() int {
+ return int(n)
+}
+
+func (n LexModeID) IsNil() bool {
+ return n == LexModeIDNil
+}
+
+// LexModeName represents a name of a lex mode.
+type LexModeName string
+
+const (
+ LexModeNameNil = LexModeName("")
+ LexModeNameDefault = LexModeName("default")
+)
+
+func (m LexModeName) String() string {
+ return string(m)
+}
+
+func (m LexModeName) validate() error {
+ err := validateIdentifier(m.String())
+ if err != nil {
+ return fmt.Errorf("invalid mode name: %v", err)
+ }
+ return nil
+}
+
+const idPattern = `^[a-z](_?[0-9a-z]+)*$`
+
+var idRE = regexp.MustCompile(idPattern)
+
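+// validateIdentifier checks that id matches idPattern: it must start with a lowercase letter
+// and consist of lowercase letters and digits, optionally separated by single underscores.
+// For example, `left_paren` and `mode1` are valid, while `LeftParen` and `_foo` are not.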
+func validateIdentifier(id string) error {
+ if id == "" {
+ return fmt.Errorf("identifier doesn't allow to be the empty string")
+ }
+ if !idRE.MatchString(id) {
+ return fmt.Errorf("identifier must be %v", idPattern)
+ }
+ return nil
+}
+
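+// SnakeCaseToUpperCamelCase converts a snake_case identifier into UpperCamelCase.
+// For example, SnakeCaseToUpperCamelCase(`lex_mode`) returns `LexMode`.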
+func SnakeCaseToUpperCamelCase(snake string) string {
+ elems := strings.Split(snake, "_")
+ for i, e := range elems {
+ if len(e) == 0 {
+ continue
+ }
+ elems[i] = strings.ToUpper(string(e[0])) + e[1:]
+ }
+
+ return strings.Join(elems, "")
+}
+
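+// LexEntry represents a single entry of a lexical specification: a kind name, its pattern,
+// the modes in which the entry applies, and optional push/pop mode transitions and fragment flag.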
+type LexEntry struct {
+ Kind LexKindName `json:"kind"`
+ Pattern LexPattern `json:"pattern"`
+ Modes []LexModeName `json:"modes"`
+ Push LexModeName `json:"push"`
+ Pop bool `json:"pop"`
+ Fragment bool `json:"fragment"`
+}
+
+func (e *LexEntry) validate() error {
+ err := e.Kind.validate()
+ if err != nil {
+ return err
+ }
+ err = e.Pattern.validate()
+ if err != nil {
+ return err
+ }
+ if len(e.Modes) > 0 {
+ for _, mode := range e.Modes {
+ err = mode.validate()
+ if err != nil {
+ return err
+ }
+ }
+ }
+ return nil
+}
+
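+// LexSpec represents a lexical specification: a name and a list of entries.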
+type LexSpec struct {
+ Name string `json:"name"`
+ Entries []*LexEntry `json:"entries"`
+}
+
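+// Validate checks the specification: the name must be a valid identifier, there must be at
+// least one entry, every entry must be valid, kind names must not be duplicated (fragments
+// and non-fragments are checked separately), and kind and mode names must be spelled
+// consistently.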
+func (s *LexSpec) Validate() error {
+ err := validateIdentifier(s.Name)
+ if err != nil {
+ return fmt.Errorf("invalid specification name: %v", err)
+ }
+
+ if len(s.Entries) <= 0 {
+ return fmt.Errorf("the lexical specification must have at least one entry")
+ }
+ {
+ var errs []error
+ for i, e := range s.Entries {
+ err := e.validate()
+ if err != nil {
+ errs = append(errs, fmt.Errorf("entry #%v: %w", i+1, err))
+ }
+ }
+ if len(errs) > 0 {
+ var b strings.Builder
+ fmt.Fprintf(&b, "%v", errs[0])
+ for _, err := range errs[1:] {
+ fmt.Fprintf(&b, "\n%v", err)
+ }
+ return fmt.Errorf("%s", b.String())
+ }
+ }
+ {
+ ks := map[string]struct{}{}
+ fks := map[string]struct{}{}
+ for _, e := range s.Entries {
+ // Allow duplicate names between fragments and non-fragments.
+ if e.Fragment {
+ if _, exist := fks[e.Kind.String()]; exist {
+ return fmt.Errorf("kinds `%v` are duplicates", e.Kind)
+ }
+ fks[e.Kind.String()] = struct{}{}
+ } else {
+ if _, exist := ks[e.Kind.String()]; exist {
+ return fmt.Errorf("kinds `%v` are duplicates", e.Kind)
+ }
+ ks[e.Kind.String()] = struct{}{}
+ }
+ }
+ }
+ {
+ kinds := []string{}
+ modes := []string{
+ LexModeNameDefault.String(), // This is a predefined mode.
+ }
+ for _, e := range s.Entries {
+ if e.Fragment {
+ continue
+ }
+
+ kinds = append(kinds, e.Kind.String())
+
+ for _, m := range e.Modes {
+ modes = append(modes, m.String())
+ }
+ }
+
+ kindErrs := findSpellingInconsistenciesErrors(kinds, nil)
+ modeErrs := findSpellingInconsistenciesErrors(modes, func(ids []string) error {
+ if SnakeCaseToUpperCamelCase(ids[0]) == SnakeCaseToUpperCamelCase(LexModeNameDefault.String()) {
+ var b strings.Builder
+ fmt.Fprintf(&b, "%+v", ids[0])
+ for _, id := range ids[1:] {
+ fmt.Fprintf(&b, ", %+v", id)
+ }
+ return fmt.Errorf("these identifiers are treated as the same. please use the same spelling as predefined '%v': %v", LexModeNameDefault, b.String())
+ }
+ return nil
+ })
+ errs := append(kindErrs, modeErrs...)
+ if len(errs) > 0 {
+ var b strings.Builder
+ fmt.Fprintf(&b, "%v", errs[0])
+ for _, err := range errs[1:] {
+ fmt.Fprintf(&b, "\n%v", err)
+ }
+ return fmt.Errorf("%s", b.String())
+ }
+ }
+
+ return nil
+}
+
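+// findSpellingInconsistenciesErrors builds one error for each group of inconsistently spelled
+// identifiers. When hook is non-nil and returns an error for a group, that error is used
+// instead of the default message.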
+func findSpellingInconsistenciesErrors(ids []string, hook func(ids []string) error) []error {
+ duplicated := FindSpellingInconsistencies(ids)
+ if len(duplicated) == 0 {
+ return nil
+ }
+
+ var errs []error
+ for _, dup := range duplicated {
+ if hook != nil {
+ err := hook(dup)
+ if err != nil {
+ errs = append(errs, err)
+ continue
+ }
+ }
+
+ var b strings.Builder
+ fmt.Fprintf(&b, "%+v", dup[0])
+ for _, id := range dup[1:] {
+ fmt.Fprintf(&b, ", %+v", id)
+ }
+ err := fmt.Errorf("these identifiers are treated as the same. please use the same spelling: %v", b.String())
+ errs = append(errs, err)
+ }
+
+ return errs
+}
+
+// FindSpellingInconsistencies finds spelling inconsistencies in identifiers. The identifiers are considered to be the same
+// if they are spelled the same when expressed in UpperCamelCase. For example, `left_paren` and `LeftParen` are spelled the same
+// in UpperCamelCase, so they are considered a spelling inconsistency.
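+// Accordingly, FindSpellingInconsistencies([]string{`left_paren`, `LeftParen`}) returns [[`LeftParen`, `left_paren`]].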
+func FindSpellingInconsistencies(ids []string) [][]string {
+ m := map[string][]string{}
+ for _, id := range removeDuplicates(ids) {
+ c := SnakeCaseToUpperCamelCase(id)
+ m[c] = append(m[c], id)
+ }
+
+ var duplicated [][]string
+ for _, camels := range m {
+ if len(camels) == 1 {
+ continue
+ }
+ duplicated = append(duplicated, camels)
+ }
+
+ for _, dup := range duplicated {
+ sort.Slice(dup, func(i, j int) bool {
+ return dup[i] < dup[j]
+ })
+ }
+ sort.Slice(duplicated, func(i, j int) bool {
+ return duplicated[i][0] < duplicated[j][0]
+ })
+
+ return duplicated
+}
+
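+// removeDuplicates returns the distinct elements of s. The order of the result is unspecified
+// because the elements are collected from a map.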
+func removeDuplicates(s []string) []string {
+ m := map[string]struct{}{}
+ for _, v := range s {
+ m[v] = struct{}{}
+ }
+
+ var unique []string
+ for v := range m {
+ unique = append(unique, v)
+ }
+
+ return unique
+}
+
+// StateID represents an ID of a state of a transition table.
+type StateID int
+
+const (
+ // StateIDNil represents an empty entry of a transition table.
+ // When the driver reads this value, it raises an error meaning lexical analysis failed.
+ StateIDNil = StateID(0)
+
+ // StateIDMin is the minimum value of the state ID. All valid state IDs are represented as
+ // sequential numbers starting from this value.
+ StateIDMin = StateID(1)
+)
+
+func (id StateID) Int() int {
+ return int(id)
+}
+
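+// SpecRowDisplacementTable is the JSON form of a transition table compressed by row
+// displacement; its entries are state IDs.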
+type SpecRowDisplacementTable struct {
+ OriginalRowCount int `json:"original_row_count"`
+ OriginalColCount int `json:"original_col_count"`
+ EmptyValue StateID `json:"empty_value"`
+ Entries []StateID `json:"entries"`
+ Bounds []int `json:"bounds"`
+ RowDisplacement []int `json:"row_displacement"`
+}
+
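+// SpecUniqueEntriesTable is the JSON form of a unique-entries-compressed table. The unique
+// entries themselves are stored either row-displacement-compressed (UniqueEntries) or
+// uncompressed (UncompressedUniqueEntries).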
+type SpecUniqueEntriesTable struct {
+ UniqueEntries *SpecRowDisplacementTable `json:"unique_entries,omitempty"`
+ UncompressedUniqueEntries []StateID `json:"uncompressed_unique_entries,omitempty"`
+ RowNums []int `json:"row_nums"`
+ OriginalRowCount int `json:"original_row_count"`
+ OriginalColCount int `json:"original_col_count"`
+ EmptyValue int `json:"empty_value"`
+}
+
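+// TransitionTable represents a DFA transition table along with its initial state and
+// accepting states. The table is stored either compressed (Transition) or uncompressed
+// (UncompressedTransition).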
+type TransitionTable struct {
+ InitialStateID StateID `json:"initial_state_id"`
+ AcceptingStates []LexModeKindID `json:"accepting_states"`
+ RowCount int `json:"row_count"`
+ ColCount int `json:"col_count"`
+ Transition *SpecUniqueEntriesTable `json:"transition,omitempty"`
+ UncompressedTransition []StateID `json:"uncompressed_transition,omitempty"`
+}
+
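+// CompiledLexModeSpec is the compiled form of a single lex mode: the kind names defined in
+// the mode, the push/pop mode actions, and the DFA transition table.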
+type CompiledLexModeSpec struct {
+ KindNames []LexKindName `json:"kind_names"`
+ Push []LexModeID `json:"push"`
+ Pop []int `json:"pop"`
+ DFA *TransitionTable `json:"dfa"`
+}
+
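+// CompiledLexSpec is the fully compiled lexical specification: the spec name, the initial
+// mode, the mode and kind names, the kind ID mapping, the compression level, and the
+// compiled per-mode specs.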
+type CompiledLexSpec struct {
+ Name string `json:"name"`
+ InitialModeID LexModeID `json:"initial_mode_id"`
+ ModeNames []LexModeName `json:"mode_names"`
+ KindNames []LexKindName `json:"kind_names"`
+ KindIDs [][]LexKindID `json:"kind_ids"`
+ CompressionLevel int `json:"compression_level"`
+ Specs []*CompiledLexModeSpec `json:"specs"`
+}