package tre import ( "encoding/binary" "fmt" "regexp" "strconv" "strings" "sort" ) type CharBlock struct { From []byte To []byte } type cpRange struct { from rune to rune } func (b *CharBlock) String() string { var s strings.Builder fmt.Fprint(&s, "<") fmt.Fprintf(&s, "%X", b.From[0]) for i := 1; i < len(b.From); i++ { fmt.Fprintf(&s, " %X", b.From[i]) } fmt.Fprint(&s, "..") fmt.Fprintf(&s, "%X", b.To[0]) for i := 1; i < len(b.To); i++ { fmt.Fprintf(&s, " %X", b.To[i]) } fmt.Fprint(&s, ">") return s.String() } func GenCharBlocks(from, to rune) ([]*CharBlock, error) { rs, err := splitCodePoint(from, to) if err != nil { return nil, err } blks := make([]*CharBlock, len(rs)) for i, r := range rs { blks[i] = &CharBlock{ From: []byte(string(r.from)), To: []byte(string(r.to)), } } return blks, nil } /// `splitCodePoint` splits a code point range represented by into /// some blocks. The code points that the block contains will be a continuous /// byte sequence when encoded into UTF-8. For instance, this function splits /// into and because /// is continuous on the code point but non-continuous in the /// UTF-8 byte sequence (In UTF-8, is encoded <00..7F>, and /// is encoded ). /// /// The blocks don't contain surrogate code points because byte /// sequences encoding them are ill-formed in UTF-8. For instance, /// is split into and . /// However, when `from` or `to` itself is the surrogate code point, this /// function returns an error. func splitCodePoint(from, to rune) ([]*cpRange, error) { if from > to { return nil, fmt.Errorf( "code point range must be from <= to: U+%X..U+%X", from, to, ) } if from < 0x0000 || from > 0x10ffff || to < 0x0000 || to > 0x10ffff { return nil, fmt.Errorf( "code point must be >=U+0000 and <=U+10FFFF:" + "U+%X..U+%X", from, to, ) } // https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf // > 3.9 Unicode Encoding Forms // > UTF-8 D92 // > Because surrogate code points are not Unicode scalar values, // > any UTF-8 byte sequence that would otherwise // > map to code points U+D800..U+DFFF is ill-formed. if from >= 0xd800 && from <= 0xdfff || to >= 0xd800 && to <= 0xdfff { return nil, fmt.Errorf( "surrogate code points U+D800..U+DFFF " + "are not allowed in UTF-8: U+%X..U+%X", from, to, ) } in := &cpRange{ from: from, to: to, } var rs []*cpRange for in.from <= in.to { r := &cpRange{ from: in.from, to: in.to, } // https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf // > 3.9 Unicode Encoding Forms // > UTF-8 Table 3-7. // > Well-Formed UTF-8 Byte Sequences switch { case in.from <= 0x007f && in.to > 0x007f: r.to = 0x007f case in.from <= 0x07ff && in.to > 0x07ff: r.to = 0x07ff case in.from <= 0x0fff && in.to > 0x0fff: r.to = 0x0fff case in.from <= 0xcfff && in.to > 0xcfff: r.to = 0xcfff case in.from <= 0xd7ff && in.to > 0xd7ff: r.to = 0xd7ff case in.from <= 0xffff && in.to > 0xffff: r.to = 0xffff case in.from <= 0x3ffff && in.to > 0x3ffff: r.to = 0x3ffff case in.from <= 0xfffff && in.to > 0xfffff: r.to = 0xfffff } rs = append(rs, r) in.from = r.to + 1 // Skip surrogate code points U+D800..U+DFFF. if in.from >= 0xd800 && in.from <= 0xdfff { in.from = 0xe000 } } return rs, nil } type OriginalTable struct { entries []int rowCount int colCount int } func NewOriginalTable(entries []int, colCount int) (*OriginalTable, error) { if len(entries) == 0 { return nil, fmt.Errorf("enries is empty") } if colCount <= 0 { return nil, fmt.Errorf("colCount must be >=1") } if len(entries)%colCount != 0 { return nil, fmt.Errorf("entries length or column count are incorrect; entries length: %v, column count: %v", len(entries), colCount) } return &OriginalTable{ entries: entries, rowCount: len(entries) / colCount, colCount: colCount, }, nil } type Compressor interface { Compress(orig *OriginalTable) error Lookup(row, col int) (int, error) OriginalTableSize() (int, int) } var ( _ Compressor = &CompressorUniqueEntriesTable{} _ Compressor = &CompressorRowDisplacementTable{} ) type CompressorUniqueEntriesTable struct { UniqueEntries []int RowNums []int OriginalRowCount int OriginalColCount int } func NewCompressorUniqueEntriesTable() *CompressorUniqueEntriesTable { return &CompressorUniqueEntriesTable{} } func (tab *CompressorUniqueEntriesTable) Lookup(row, col int) (int, error) { if row < 0 || row >= tab.OriginalRowCount || col < 0 || col >= tab.OriginalColCount { return 0, fmt.Errorf("indexes are out of range: [%v, %v]", row, col) } return tab.UniqueEntries[tab.RowNums[row]*tab.OriginalColCount+col], nil } func (tab *CompressorUniqueEntriesTable) OriginalTableSize() (int, int) { return tab.OriginalRowCount, tab.OriginalColCount } func (tab *CompressorUniqueEntriesTable) Compress(orig *OriginalTable) error { var uniqueEntries []int rowNums := make([]int, orig.rowCount) hash2RowNum := map[string]int{} nextRowNum := 0 for row := 0; row < orig.rowCount; row++ { var rowHash string { buf := make([]byte, 0, orig.colCount*8) for col := 0; col < orig.colCount; col++ { b := make([]byte, 8) binary.PutUvarint(b, uint64(orig.entries[row*orig.colCount+col])) buf = append(buf, b...) } rowHash = string(buf) } rowNum, ok := hash2RowNum[rowHash] if !ok { rowNum = nextRowNum nextRowNum++ hash2RowNum[rowHash] = rowNum start := row * orig.colCount entry := append([]int{}, orig.entries[start:start+orig.colCount]...) uniqueEntries = append(uniqueEntries, entry...) } rowNums[row] = rowNum } tab.UniqueEntries = uniqueEntries tab.RowNums = rowNums tab.OriginalRowCount = orig.rowCount tab.OriginalColCount = orig.colCount return nil } const ForbiddenValue = -1 type CompressorRowDisplacementTable struct { OriginalRowCount int OriginalColCount int EmptyValue int Entries []int Bounds []int RowDisplacement []int } func NewCompressorRowDisplacementTable(emptyValue int) *CompressorRowDisplacementTable { return &CompressorRowDisplacementTable{ EmptyValue: emptyValue, } } func (tab *CompressorRowDisplacementTable) Lookup(row int, col int) (int, error) { if row < 0 || row >= tab.OriginalRowCount || col < 0 || col >= tab.OriginalColCount { return tab.EmptyValue, fmt.Errorf("indexes are out of range: [%v, %v]", row, col) } d := tab.RowDisplacement[row] if tab.Bounds[d+col] != row { return tab.EmptyValue, nil } return tab.Entries[d+col], nil } func (tab *CompressorRowDisplacementTable) OriginalTableSize() (int, int) { return tab.OriginalRowCount, tab.OriginalColCount } type rowInfo struct { rowNum int nonEmptyCount int nonEmptyCol []int } func (tab *CompressorRowDisplacementTable) Compress(orig *OriginalTable) error { rowInfo := make([]rowInfo, orig.rowCount) { row := 0 col := 0 rowInfo[0].rowNum = 0 for _, v := range orig.entries { if col == orig.colCount { row++ col = 0 rowInfo[row].rowNum = row } if v != tab.EmptyValue { rowInfo[row].nonEmptyCount++ rowInfo[row].nonEmptyCol = append(rowInfo[row].nonEmptyCol, col) } col++ } sort.SliceStable(rowInfo, func(i int, j int) bool { return rowInfo[i].nonEmptyCount > rowInfo[j].nonEmptyCount }) } origEntriesLen := len(orig.entries) entries := make([]int, origEntriesLen) bounds := make([]int, origEntriesLen) resultBottom := orig.colCount rowDisplacement := make([]int, orig.rowCount) { for i := 0; i < origEntriesLen; i++ { entries[i] = tab.EmptyValue bounds[i] = ForbiddenValue } nextRowDisplacement := 0 for _, rInfo := range rowInfo { if rInfo.nonEmptyCount <= 0 { continue } for { isOverlapped := false for _, col := range rInfo.nonEmptyCol { if entries[nextRowDisplacement+col] == tab.EmptyValue { continue } nextRowDisplacement++ isOverlapped = true break } if isOverlapped { continue } rowDisplacement[rInfo.rowNum] = nextRowDisplacement for _, col := range rInfo.nonEmptyCol { entries[nextRowDisplacement+col] = orig.entries[(rInfo.rowNum*orig.colCount)+col] bounds[nextRowDisplacement+col] = rInfo.rowNum } resultBottom = nextRowDisplacement + orig.colCount nextRowDisplacement++ break } } } tab.OriginalRowCount = orig.rowCount tab.OriginalColCount = orig.colCount tab.Entries = entries[:resultBottom] tab.Bounds = bounds[:resultBottom] tab.RowDisplacement = rowDisplacement return nil } func Main() { } var rep = strings.NewReplacer( `.`, `\.`, `*`, `\*`, `+`, `\+`, `?`, `\?`, `|`, `\|`, `(`, `$`, `)`, `$`, `[`, `\[`, `\`, `\\`, ) // EscapePattern escapes the special characters. // For example, EscapePattern(`+`) returns `\+`. func EscapePattern(s string) string { return rep.Replace(s) } // LexKindID represents an ID of a lexical kind and is unique across all modes. type LexKindID int const ( LexKindIDNil = LexKindID(0) LexKindIDMin = LexKindID(1) ) func (id LexKindID) Int() int { return int(id) } // LexModeKindID represents an ID of a lexical kind and is unique within a mode. // Use LexKindID to identify a kind across all modes uniquely. type LexModeKindID int const ( LexModeKindIDNil = LexModeKindID(0) LexModeKindIDMin = LexModeKindID(1) ) func (id LexModeKindID) Int() int { return int(id) } // LexKindName represents a name of a lexical kind. type LexKindName string const LexKindNameNil = LexKindName("") func (k LexKindName) String() string { return string(k) } func (k LexKindName) validate() error { err := validateIdentifier(k.String()) if err != nil { return fmt.Errorf("invalid kind name: %v", err) } return nil } // LexPattern represents a pattern of a lexeme. // The pattern is written in regular expression. type LexPattern string func (p LexPattern) validate() error { if p == "" { return fmt.Errorf("pattern doesn't allow to be the empty string") } return nil } // LexModeID represents an ID of a lex mode. type LexModeID int const ( LexModeIDNil = LexModeID(0) LexModeIDDefault = LexModeID(1) ) func (n LexModeID) String() string { return strconv.Itoa(int(n)) } func (n LexModeID) Int() int { return int(n) } func (n LexModeID) IsNil() bool { return n == LexModeIDNil } // LexModeName represents a name of a lex mode. type LexModeName string const ( LexModeNameNil = LexModeName("") LexModeNameDefault = LexModeName("default") ) func (m LexModeName) String() string { return string(m) } func (m LexModeName) validate() error { err := validateIdentifier(m.String()) if err != nil { return fmt.Errorf("invalid mode name: %v", err) } return nil } const idPattern = `^[a-z](_?[0-9a-z]+)*$` var idRE = regexp.MustCompile(idPattern) func validateIdentifier(id string) error { if id == "" { return fmt.Errorf("identifier doesn't allow to be the empty string") } if !idRE.MatchString(id) { return fmt.Errorf("identifier must be %v", idPattern) } return nil } func SnakeCaseToUpperCamelCase(snake string) string { elems := strings.Split(snake, "_") for i, e := range elems { if len(e) == 0 { continue } elems[i] = strings.ToUpper(string(e[0])) + e[1:] } return strings.Join(elems, "") } type LexEntry struct { Kind LexKindName `json:"kind"` Pattern LexPattern `json:"pattern"` Modes []LexModeName `json:"modes"` Push LexModeName `json:"push"` Pop bool `json:"pop"` Fragment bool `json:"fragment"` } func (e *LexEntry) validate() error { err := e.Kind.validate() if err != nil { return err } err = e.Pattern.validate() if err != nil { return err } if len(e.Modes) > 0 { for _, mode := range e.Modes { err = mode.validate() if err != nil { return err } } } return nil } type LexSpec struct { Name string `json:"name"` Entries []*LexEntry `json:"entries"` } func (s *LexSpec) Validate() error { err := validateIdentifier(s.Name) if err != nil { return fmt.Errorf("invalid specification name: %v", err) } if len(s.Entries) <= 0 { return fmt.Errorf("the lexical specification must have at least one entry") } { var errs []error for i, e := range s.Entries { err := e.validate() if err != nil { errs = append(errs, fmt.Errorf("entry #%v: %w", i+1, err)) } } if len(errs) > 0 { var b strings.Builder fmt.Fprintf(&b, "%v", errs[0]) for _, err := range errs[1:] { fmt.Fprintf(&b, "\n%v", err) } return fmt.Errorf(b.String()) } } { ks := map[string]struct{}{} fks := map[string]struct{}{} for _, e := range s.Entries { // Allow duplicate names between fragments and non-fragments. if e.Fragment { if _, exist := fks[e.Kind.String()]; exist { return fmt.Errorf("kinds `%v` are duplicates", e.Kind) } fks[e.Kind.String()] = struct{}{} } else { if _, exist := ks[e.Kind.String()]; exist { return fmt.Errorf("kinds `%v` are duplicates", e.Kind) } ks[e.Kind.String()] = struct{}{} } } } { kinds := []string{} modes := []string{ LexModeNameDefault.String(), // This is a predefined mode. } for _, e := range s.Entries { if e.Fragment { continue } kinds = append(kinds, e.Kind.String()) for _, m := range e.Modes { modes = append(modes, m.String()) } } kindErrs := findSpellingInconsistenciesErrors(kinds, nil) modeErrs := findSpellingInconsistenciesErrors(modes, func(ids []string) error { if SnakeCaseToUpperCamelCase(ids[0]) == SnakeCaseToUpperCamelCase(LexModeNameDefault.String()) { var b strings.Builder fmt.Fprintf(&b, "%+v", ids[0]) for _, id := range ids[1:] { fmt.Fprintf(&b, ", %+v", id) } return fmt.Errorf("these identifiers are treated as the same. please use the same spelling as predefined '%v': %v", LexModeNameDefault, b.String()) } return nil }) errs := append(kindErrs, modeErrs...) if len(errs) > 0 { var b strings.Builder fmt.Fprintf(&b, "%v", errs[0]) for _, err := range errs[1:] { fmt.Fprintf(&b, "\n%v", err) } return fmt.Errorf(b.String()) } } return nil } func findSpellingInconsistenciesErrors(ids []string, hook func(ids []string) error) []error { duplicated := FindSpellingInconsistencies(ids) if len(duplicated) == 0 { return nil } var errs []error for _, dup := range duplicated { if hook != nil { err := hook(dup) if err != nil { errs = append(errs, err) continue } } var b strings.Builder fmt.Fprintf(&b, "%+v", dup[0]) for _, id := range dup[1:] { fmt.Fprintf(&b, ", %+v", id) } err := fmt.Errorf("these identifiers are treated as the same. please use the same spelling: %v", b.String()) errs = append(errs, err) } return errs } // FindSpellingInconsistencies finds spelling inconsistencies in identifiers. The identifiers are considered to be the same // if they are spelled the same when expressed in UpperCamelCase. For example, `left_paren` and `LeftParen` are spelled the same // in UpperCamelCase. Thus they are considere to be spelling inconsistency. func FindSpellingInconsistencies(ids []string) [][]string { m := map[string][]string{} for _, id := range removeDuplicates(ids) { c := SnakeCaseToUpperCamelCase(id) m[c] = append(m[c], id) } var duplicated [][]string for _, camels := range m { if len(camels) == 1 { continue } duplicated = append(duplicated, camels) } for _, dup := range duplicated { sort.Slice(dup, func(i, j int) bool { return dup[i] < dup[j] }) } sort.Slice(duplicated, func(i, j int) bool { return duplicated[i][0] < duplicated[j][0] }) return duplicated } func removeDuplicates(s []string) []string { m := map[string]struct{}{} for _, v := range s { m[v] = struct{}{} } var unique []string for v := range m { unique = append(unique, v) } return unique } // StateID represents an ID of a state of a transition table. type StateID int const ( // StateIDNil represents an empty entry of a transition table. // When the driver reads this value, it raises an error meaning lexical analysis failed. StateIDNil = StateID(0) // StateIDMin is the minimum value of the state ID. All valid state IDs are represented as // sequential numbers starting from this value. StateIDMin = StateID(1) ) func (id StateID) Int() int { return int(id) } type SpecRowDisplacementTable struct { OriginalRowCount int `json:"original_row_count"` OriginalColCount int `json:"original_col_count"` EmptyValue StateID `json:"empty_value"` Entries []StateID `json:"entries"` Bounds []int `json:"bounds"` RowDisplacement []int `json:"row_displacement"` } type SpecUniqueEntriesTable struct { UniqueEntries *SpecRowDisplacementTable `json:"unique_entries,omitempty"` UncompressedUniqueEntries []StateID `json:"uncompressed_unique_entries,omitempty"` RowNums []int `json:"row_nums"` OriginalRowCount int `json:"original_row_count"` OriginalColCount int `json:"original_col_count"` EmptyValue int `json:"empty_value"` } type TransitionTable struct { InitialStateID StateID `json:"initial_state_id"` AcceptingStates []LexModeKindID `json:"accepting_states"` RowCount int `json:"row_count"` ColCount int `json:"col_count"` Transition *SpecUniqueEntriesTable `json:"transition,omitempty"` UncompressedTransition []StateID `json:"uncompressed_transition,omitempty"` } type CompiledLexModeSpec struct { KindNames []LexKindName `json:"kind_names"` Push []LexModeID `json:"push"` Pop []int `json:"pop"` DFA *TransitionTable `json:"dfa"` } type CompiledLexSpec struct { Name string `json:"name"` InitialModeID LexModeID `json:"initial_mode_id"` ModeNames []LexModeName `json:"mode_names"` KindNames []LexKindName `json:"kind_names"` KindIDs [][]LexKindID `json:"kind_ids"` CompressionLevel int `json:"compression_level"` Specs []*CompiledLexModeSpec `json:"specs"` }