From a0fd0d5a9077947d09777cf7631ca721ffabf9ec Mon Sep 17 00:00:00 2001 From: Ryo Nihei Date: Tue, 4 May 2021 17:45:24 +0900 Subject: Improve performance of the symbolPositionSet When using a map to represent a set, performance degrades due to the increased number of calls of runtime.mapassign. Especially when the number of symbols is large, as in compiling a pattern that contains character properties like \p{Letter}, adding elements to the set alone may take several tens of seconds of CPU time. Therefore, this commit solves this problem by changing the representation of the set from map to array. --- compiler/symbol_position.go | 79 ++++++++++++++++++++++++++++++--------------- 1 file changed, 53 insertions(+), 26 deletions(-) (limited to 'compiler/symbol_position.go') diff --git a/compiler/symbol_position.go b/compiler/symbol_position.go index 1b400bd..d35dfde 100644 --- a/compiler/symbol_position.go +++ b/compiler/symbol_position.go @@ -52,17 +52,26 @@ func (p symbolPosition) describe() (uint16, bool) { return v, false } -type symbolPositionSet map[symbolPosition]struct{} +type symbolPositionSet struct { + // `s` represents a set of symbol positions. + // However, immediately after adding a symbol position, the elements may be duplicated. + // When you need an aligned set with no duplicates, you can get such value via the set function. + s []symbolPosition + sorted bool +} -func newSymbolPositionSet() symbolPositionSet { - return map[symbolPosition]struct{}{} +func newSymbolPositionSet() *symbolPositionSet { + return &symbolPositionSet{ + s: []symbolPosition{}, + sorted: false, + } } -func (s symbolPositionSet) String() string { - if len(s) <= 0 { +func (s *symbolPositionSet) String() string { + if len(s.s) <= 0 { return "{}" } - ps := s.sort() + ps := s.sortAndRemoveDuplicates() var b strings.Builder fmt.Fprintf(&b, "{") for i, p := range ps { @@ -76,22 +85,27 @@ func (s symbolPositionSet) String() string { return b.String() } -func (s symbolPositionSet) add(pos symbolPosition) symbolPositionSet { - s[pos] = struct{}{} +func (s *symbolPositionSet) set() []symbolPosition { + s.sortAndRemoveDuplicates() + return s.s +} + +func (s *symbolPositionSet) add(pos symbolPosition) *symbolPositionSet { + s.s = append(s.s, pos) + s.sorted = false return s } -func (s symbolPositionSet) merge(t symbolPositionSet) symbolPositionSet { - for p := range t { - s.add(p) - } +func (s *symbolPositionSet) merge(t *symbolPositionSet) *symbolPositionSet { + s.s = append(s.s, t.s...) + s.sorted = false return s } -func (s symbolPositionSet) intersect(set symbolPositionSet) symbolPositionSet { +func (s *symbolPositionSet) intersect(set *symbolPositionSet) *symbolPositionSet { in := newSymbolPositionSet() - for p1 := range s { - for p2 := range set { + for _, p1 := range s.s { + for _, p2 := range set.s { if p1 != p2 { continue } @@ -101,11 +115,11 @@ func (s symbolPositionSet) intersect(set symbolPositionSet) symbolPositionSet { return in } -func (s symbolPositionSet) hash() string { - if len(s) <= 0 { +func (s *symbolPositionSet) hash() string { + if len(s.s) <= 0 { return "" } - sorted := s.sort() + sorted := s.sortAndRemoveDuplicates() var buf []byte for _, p := range sorted { b := make([]byte, 8) @@ -118,15 +132,28 @@ func (s symbolPositionSet) hash() string { return string(buf) } -func (s symbolPositionSet) sort() []symbolPosition { - sorted := make([]symbolPosition, len(s)) - i := 0 - for p := range s { - sorted[i] = p - i++ +func (s *symbolPositionSet) sortAndRemoveDuplicates() []symbolPosition { + if s.sorted { + return s.s } - sortSymbolPositions(sorted, 0, len(sorted)-1) - return sorted + + sortSymbolPositions(s.s, 0, len(s.s)-1) + + // Remove duplicates. + lastV := s.s[0] + nextIdx := 1 + for _, v := range s.s[1:] { + if v == lastV { + continue + } + s.s[nextIdx] = v + nextIdx++ + lastV = v + } + s.s = s.s[:nextIdx] + s.sorted = true + + return s.s } // sortSymbolPositions sorts a slice of symbol positions as it uses quick sort. -- cgit v1.2.3