diff options
author | Ryo Nihei <nihei.dev@gmail.com> | 2021-05-01 01:23:54 +0900 |
---|---|---|
committer | Ryo Nihei <nihei.dev@gmail.com> | 2021-05-02 14:40:46 +0900 |
commit | edbca717ae82aea7cd4b693c907581531ba37706 (patch) | |
tree | 5d363fdabd6df90e7f4fbd619be759fc6ae55724 /compiler/symbol_position.go | |
parent | Add character property expression (Meet RL1.2 of UTS #18 partially) (diff) | |
download | tre-edbca717ae82aea7cd4b693c907581531ba37706.tar.gz tre-edbca717ae82aea7cd4b693c907581531ba37706.tar.xz |
Improve compilation time a little
A pattern like \p{Letter} generates an AST with many symbols concatenated by alt operators,
which results in a large number of symbol positions in one state of the DFA.
Such a pattern increases the compilation time. This commit improves the compilation time a little better.
- To avoid calling astNode#first and astNode#last recursively, memoize the result of them.
- Use a byte sequence that symbol positions are encoded to as a hash value to avoid using fmt.Fprintf function.
- Implement a sort function for symbol positions instead of using sort.Slice function.
Diffstat (limited to 'compiler/symbol_position.go')
-rw-r--r-- | compiler/symbol_position.go | 171 |
1 files changed, 171 insertions, 0 deletions
diff --git a/compiler/symbol_position.go b/compiler/symbol_position.go new file mode 100644 index 0000000..1b400bd --- /dev/null +++ b/compiler/symbol_position.go @@ -0,0 +1,171 @@ +package compiler + +import ( + "encoding/binary" + "fmt" + "strings" +) + +type symbolPosition uint16 + +const ( + symbolPositionNil = symbolPosition(0x0000) // 0000 0000 0000 0000 + + symbolPositionMin = uint16(0x0001) // 0000 0000 0000 0001 + symbolPositionMax = uint16(0x7fff) // 0111 1111 1111 1111 + + symbolPositionMaskSymbol = uint16(0x0000) // 0000 0000 0000 0000 + symbolPositionMaskEndMark = uint16(0x8000) // 1000 0000 0000 0000 + + symbolPositionMaskValue = uint16(0x7fff) // 0111 1111 1111 1111 +) + +func newSymbolPosition(n uint16, endMark bool) (symbolPosition, error) { + if n < symbolPositionMin || n > symbolPositionMax { + return symbolPositionNil, fmt.Errorf("symbol position must be within %v to %v; n: %v, endMark: %v", symbolPositionMin, symbolPositionMax, n, endMark) + } + if endMark { + return symbolPosition(n | symbolPositionMaskEndMark), nil + } + return symbolPosition(n | symbolPositionMaskSymbol), nil +} + +func (p symbolPosition) String() string { + if p.isEndMark() { + return fmt.Sprintf("end#%v", uint16(p)&symbolPositionMaskValue) + } + return fmt.Sprintf("sym#%v", uint16(p)&symbolPositionMaskValue) +} + +func (p symbolPosition) isEndMark() bool { + if uint16(p)&symbolPositionMaskEndMark > 1 { + return true + } + return false +} + +func (p symbolPosition) describe() (uint16, bool) { + v := uint16(p) & symbolPositionMaskValue + if p.isEndMark() { + return v, true + } + return v, false +} + +type symbolPositionSet map[symbolPosition]struct{} + +func newSymbolPositionSet() symbolPositionSet { + return map[symbolPosition]struct{}{} +} + +func (s symbolPositionSet) String() string { + if len(s) <= 0 { + return "{}" + } + ps := s.sort() + var b strings.Builder + fmt.Fprintf(&b, "{") + for i, p := range ps { + if i <= 0 { + fmt.Fprintf(&b, "%v", p) + continue + } + fmt.Fprintf(&b, ", %v", p) + } + fmt.Fprintf(&b, "}") + return b.String() +} + +func (s symbolPositionSet) add(pos symbolPosition) symbolPositionSet { + s[pos] = struct{}{} + return s +} + +func (s symbolPositionSet) merge(t symbolPositionSet) symbolPositionSet { + for p := range t { + s.add(p) + } + return s +} + +func (s symbolPositionSet) intersect(set symbolPositionSet) symbolPositionSet { + in := newSymbolPositionSet() + for p1 := range s { + for p2 := range set { + if p1 != p2 { + continue + } + in.add(p1) + } + } + return in +} + +func (s symbolPositionSet) hash() string { + if len(s) <= 0 { + return "" + } + sorted := s.sort() + var buf []byte + for _, p := range sorted { + b := make([]byte, 8) + binary.PutUvarint(b, uint64(p)) + buf = append(buf, b...) + } + // Convert to a string to be able to use it as a key of a map. + // But note this byte sequence is made from values of symbol positions, + // so this is not a well-formed UTF-8 sequence. + return string(buf) +} + +func (s symbolPositionSet) sort() []symbolPosition { + sorted := make([]symbolPosition, len(s)) + i := 0 + for p := range s { + sorted[i] = p + i++ + } + sortSymbolPositions(sorted, 0, len(sorted)-1) + return sorted +} + +// sortSymbolPositions sorts a slice of symbol positions as it uses quick sort. +func sortSymbolPositions(ps []symbolPosition, left, right int) { + if left >= right { + return + } + var pivot symbolPosition + { + // Use a median as a pivot. + p1 := ps[left] + p2 := ps[(left+right)/2] + p3 := ps[right] + if p1 > p2 { + p1, p2 = p2, p1 + } + if p2 > p3 { + p2, p3 = p3, p2 + if p1 > p2 { + p1, p2 = p2, p1 + } + } + pivot = p2 + } + i := left + j := right + for i <= j { + for ps[i] < pivot { + i++ + } + for ps[j] > pivot { + j-- + } + if i <= j { + ps[i], ps[j] = ps[j], ps[i] + i++ + j-- + } + } + sortSymbolPositions(ps, left, j) + sortSymbolPositions(ps, i, right) +} |