WARNING: THIS SITE IS A MIRROR OF GITHUB.COM / LOGIN AND ACCOUNT REGISTRATION ARE NOT AVAILABLE HERE / THE CONTENTS ARE PROVIDED AS-IS / THIS SITE ASSUMES NO RESPONSIBILITY FOR ANY DISPLAYED CONTENT OR LINKS / IF YOU FIND SOMETHING THAT MAY NOT BE SUITABLE FOR EVERYONE, CONTACT THE ADMIN AT ilovescratch@foxmail.com
Skip to content

Commit e54f64b

Browse files
committed
Improve removeExistingDocuments performance by adding cache
Signed-off-by: Gao Hongtao <[email protected]>
1 parent 279547f commit e54f64b

File tree

11 files changed

+456
-229
lines changed

11 files changed

+456
-229
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,3 +17,4 @@ vendor/**
1717
/search/query/y.output
1818
*.test
1919
tags
20+
*.prof

batch.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,14 +20,14 @@ import (
2020

2121
const _idField = "_id"
2222

23-
type Identifier string
23+
type Identifier []byte
2424

2525
func (i Identifier) Field() string {
2626
return _idField
2727
}
2828

2929
func (i Identifier) Term() []byte {
30-
return []byte(i)
30+
return i
3131
}
3232

3333
// NewBatch creates a new empty batch.

go.mod

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,10 @@ require (
2222
)
2323

2424
require (
25+
github.com/VictoriaMetrics/fastcache v1.12.2 // indirect
26+
github.com/cespare/xxhash/v2 v2.2.0 // indirect
2527
github.com/dgryski/go-metro v0.0.0-20211217172704-adc40b04c140 // indirect
28+
github.com/golang/snappy v0.0.4 // indirect
2629
github.com/inconshreveable/mousetrap v1.1.0 // indirect
2730
github.com/klauspost/compress v1.17.11 // indirect
2831
github.com/leesper/go_rng v0.0.0-20190531154944-a612b043e353 // indirect

go.sum

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,10 @@ github.com/RoaringBitmap/roaring v1.9.4 h1:yhEIoH4YezLYT04s1nHehNO64EKFTop/wBhxv
33
github.com/RoaringBitmap/roaring v1.9.4/go.mod h1:6AXUsoIEzDTFFQCe1RbGA6uFONMhvejWj5rqITANK90=
44
github.com/SkyAPM/ice v0.0.0-20241108011032-c3d8eea75118 h1:Ja62sgOCp2qPTd8Xmldv1U83v11IRIsh6KlB7UaFLj4=
55
github.com/SkyAPM/ice v0.0.0-20241108011032-c3d8eea75118/go.mod h1:DoQeb0Ee86LyruZSL77Ddscfk/THJ38x453CRCnGEPI=
6+
github.com/VictoriaMetrics/fastcache v1.12.2 h1:N0y9ASrJ0F6h0QaC3o6uJb3NIZ9VKLjCM7NQbSmF7WI=
7+
github.com/VictoriaMetrics/fastcache v1.12.2/go.mod h1:AmC+Nzz1+3G2eCPapF6UcsnkThDcMsQicp4xDukwJYI=
68
github.com/ajstarks/svgo v0.0.0-20180226025133-644b8db467af/go.mod h1:K08gAheRH3/J6wwsYMMT4xOr94bZjxIelGM0+d/wbFw=
9+
github.com/allegro/bigcache v1.2.1-0.20190218064605-e24eb225f156/go.mod h1:Cb/ax3seSYIx7SuZdm2G2xzfwmv3TPSk2ucNfQESPXM=
710
github.com/axiomhq/hyperloglog v0.2.0 h1:u1XT3yyY1rjzlWuP6NQIrV4bRYHOaqZaovqjcBEvZJo=
811
github.com/axiomhq/hyperloglog v0.2.0/go.mod h1:GcgMjz9gaDKZ3G0UMS6Fq/VkZ4l7uGgcJyxA7M+omIM=
912
github.com/bits-and-blooms/bitset v1.2.0/go.mod h1:gIdJ4wp64HaoK2YrL1Q5/N7Y16edYb8uY+O0FJTyyDA=
@@ -22,6 +25,8 @@ github.com/blevesearch/vellum v1.0.10 h1:HGPJDT2bTva12hrHepVT3rOyIKFFF4t7Gf6yMxy
2225
github.com/blevesearch/vellum v1.0.10/go.mod h1:ul1oT0FhSMDIExNjIxHqJoGpVrBpKCdgDQNxfqgJt7k=
2326
github.com/caio/go-tdigest v3.1.0+incompatible h1:uoVMJ3Q5lXmVLCCqaMGHLBWnbGoN6Lpu7OAUPR60cds=
2427
github.com/caio/go-tdigest v3.1.0+incompatible/go.mod h1:sHQM/ubZStBUmF1WbB8FAm8q9GjDajLC5T7ydxE3JHI=
28+
github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44=
29+
github.com/cespare/xxhash/v2 v2.2.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
2530
github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
2631
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
2732
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
@@ -30,6 +35,8 @@ github.com/dgryski/go-metro v0.0.0-20211217172704-adc40b04c140 h1:y7y0Oa6UawqTFP
3035
github.com/dgryski/go-metro v0.0.0-20211217172704-adc40b04c140/go.mod h1:c9O8+fpSOX1DM8cPNSkX/qsBWdkD4yd2dpciOWQjpBw=
3136
github.com/fogleman/gg v1.2.1-0.20190220221249-0403632d5b90/go.mod h1:R/bRT+9gY/C5z7JzPU0zXsXHKM4/ayA+zqcVNZzPa1k=
3237
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k=
38+
github.com/golang/snappy v0.0.4 h1:yAGX7huGHXlcLOEtBnF4w7FQwA26wojNCwOYAEhLjQM=
39+
github.com/golang/snappy v0.0.4/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
3340
github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8=
3441
github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw=
3542
github.com/jung-kurt/gofpdf v1.0.3-0.20190309125859-24315acbbda5/go.mod h1:7Id9E/uU8ce6rXgefFLlgrJj/GYY22cpxn+r32jIOes=
@@ -47,6 +54,7 @@ github.com/spf13/cobra v1.8.1/go.mod h1:wHxEcudfqmLYa8iTfL+OuZPbBZkmvliBWKIezN3k
4754
github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA=
4855
github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
4956
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
57+
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
5058
github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4=
5159
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
5260
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
@@ -58,6 +66,7 @@ golang.org/x/exp v0.0.0-20180807140117-3d87b88a115f/go.mod h1:CJ0aWSM057203Lf6IL
5866
golang.org/x/exp v0.0.0-20190125153040-c74c464bbbf2/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
5967
golang.org/x/image v0.0.0-20180708004352-c73c2afc3b81/go.mod h1:ux5Hcp/YLpHSI86hEcLt0YII63i6oz57MZXIpbrjZUs=
6068
golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
69+
golang.org/x/sys v0.14.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
6170
golang.org/x/sys v0.26.0 h1:KHjCJyddX0LoSTb3J+vWpupP9p0oznkqVk/IfjymZbo=
6271
golang.org/x/sys v0.26.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
6372
golang.org/x/text v0.19.0 h1:kTxAhCbGbxhK0IwgSKiMO5awPoDQ0RpfiVYBfK860YM=

index/batch.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,12 @@ func (b *Batch) Reset() {
5454
b.persistedCallback = nil
5555
b.unparsedDocuments = b.unparsedDocuments[:0]
5656
b.unparsedIDs = b.unparsedIDs[:0]
57+
b.fieldNames = b.fieldNames[:0]
58+
}
59+
60+
func (b *Batch) ResetDoc() {
61+
b.documents = b.documents[:0]
62+
b.ids = b.ids[:0]
5763
}
5864

5965
func (b *Batch) SetPersistedCallback(f func(error)) {

index/config.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,8 @@ type Config struct {
7878
ValidateSnapshotCRC bool
7979

8080
virtualFields map[string][]segment.Field
81+
82+
CacheMaxBytes int
8183
}
8284

8385
func (config Config) WithSegmentType(typ string) Config {

index/stats.go

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ package index
1717
import (
1818
"reflect"
1919
"sync/atomic"
20+
21+
"github.com/VictoriaMetrics/fastcache"
2022
)
2123

2224
func (s *Writer) DirectoryStats() (numFilesOnDisk, numBytesUsedDisk uint64) {
@@ -30,7 +32,19 @@ func (s *Writer) Stats() Stats {
3032
// Update the stats atomically
3133
atomic.StoreUint64(&s.stats.CurOnDiskBytes, numBytesUsedDisk)
3234
atomic.StoreUint64(&s.stats.CurOnDiskFiles, numFilesOnDisk)
33-
return s.stats.Clone()
35+
stats := s.stats.Clone()
36+
c := s.cache.Load()
37+
if c != nil {
38+
var cs fastcache.Stats
39+
c.UpdateStats(&cs)
40+
stats.CacheGetCalls = cs.GetCalls
41+
stats.CacheSetCalls = cs.SetCalls
42+
stats.CacheMisses = cs.Misses
43+
stats.CacheEntriesCount = cs.EntriesCount
44+
stats.CacheBytesSize = cs.BytesSize
45+
stats.CacheMaxBytesSize = cs.MaxBytesSize
46+
}
47+
return stats
3448
}
3549

3650
// Stats tracks statistics about the index, fields that are
@@ -152,6 +166,13 @@ type Stats struct {
152166
newSegBufBytesRemoved uint64
153167
analysisBytesAdded uint64
154168
analysisBytesRemoved uint64
169+
170+
CacheGetCalls uint64
171+
CacheSetCalls uint64
172+
CacheMisses uint64
173+
CacheEntriesCount uint64
174+
CacheBytesSize uint64
175+
CacheMaxBytesSize uint64
155176
}
156177

157178
func (s *Stats) ToMap() map[string]interface{} {

index/writer.go

Lines changed: 71 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@ import (
2424
"sync/atomic"
2525
"time"
2626

27+
"github.com/VictoriaMetrics/fastcache"
28+
"github.com/bits-and-blooms/bitset"
2729
segment "github.com/blugelabs/bluge_segment_api"
2830

2931
"github.com/RoaringBitmap/roaring"
@@ -53,6 +55,8 @@ type Writer struct {
5355
asyncTasks sync.WaitGroup
5456

5557
closeOnce sync.Once
58+
59+
cache atomic.Pointer[fastcache.Cache]
5660
}
5761

5862
func OpenWriter(config Config) (*Writer, error) {
@@ -63,6 +67,10 @@ func OpenWriter(config Config) (*Writer, error) {
6367
closeCh: make(chan struct{}),
6468
}
6569

70+
if config.CacheMaxBytes > 0 {
71+
rv.cache.Store(fastcache.New(config.CacheMaxBytes))
72+
}
73+
6674
// start the requested number of analysis workers
6775
for i := 0; i < config.NumAnalysisWorkers; i++ {
6876
config.GoFunc(func() {
@@ -195,10 +203,19 @@ func (s *Writer) fireAsyncError(err error) {
195203
func (s *Writer) Close() (err error) {
196204
s.closeOnce.Do(func() {
197205
err = s.close()
206+
s.ResetCache()
198207
})
199208
return err
200209
}
201210

211+
func (s *Writer) ResetCache() {
212+
c := s.cache.Load()
213+
if c != nil {
214+
c.Reset()
215+
s.cache.Store(nil)
216+
}
217+
}
218+
202219
func (s *Writer) close() (err error) {
203220
startTime := time.Now()
204221
defer func() {
@@ -296,38 +313,76 @@ func (s *Writer) Batch(batch *Batch) (err error) {
296313
return err
297314
}
298315

316+
var id = "_id"
317+
299318
func (s *Writer) removeExistingDocuments(batch *Batch) error {
319+
if len(batch.unparsedIDs) == 0 {
320+
return nil
321+
}
322+
300323
root := s.currentSnapshot()
301324
defer func() { _ = root.Close() }()
325+
removeIDMap := bitset.New(uint(len(batch.unparsedIDs)))
302326

327+
var dict segment.Dictionary
328+
var err error
303329
for _, seg := range root.segment {
304-
dict, err := seg.segment.Dictionary(batch.unparsedIDs[0].Field())
305-
if err != nil {
306-
return err
307-
}
308-
309-
for i := 0; i < len(batch.unparsedIDs); i++ {
310-
if ok, _ := dict.Contains(batch.unparsedIDs[i].Term()); !ok {
330+
dict = nil
331+
ff := seg.segment.Fields()
332+
for i := uint(0); i < uint(len(batch.unparsedIDs)); i++ {
333+
if removeIDMap.Test(i) {
311334
continue
312335
}
336+
idTerm := batch.unparsedIDs[i].Term()
337+
c := s.cache.Load()
338+
if c != nil {
339+
if !c.Has(idTerm) {
340+
if dict == nil {
341+
dict, err = seg.segment.Dictionary(id)
342+
if err != nil {
343+
return err
344+
}
345+
}
346+
if ok, _ := dict.Contains(idTerm); !ok {
347+
continue
348+
}
349+
c.Set(idTerm, nil)
350+
}
351+
} else {
352+
if dict == nil {
353+
dict, err = seg.segment.Dictionary(id)
354+
if err != nil {
355+
return err
356+
}
357+
}
358+
if ok, _ := dict.Contains(idTerm); !ok {
359+
continue
360+
}
361+
}
362+
313363
fn := batch.fieldNames[i]
314364
if len(fn) > 0 {
315-
if anyItemNotExist(fn, seg.segment.Fields()) {
365+
if anyItemNotExist(fn, ff) {
316366
continue
317367
}
318368
}
319-
batch.unparsedDocuments = append(batch.unparsedDocuments[:i], batch.unparsedDocuments[i+1:]...)
320-
batch.unparsedIDs = append(batch.unparsedIDs[:i], batch.unparsedIDs[i+1:]...)
321-
batch.fieldNames = append(batch.fieldNames[:i], batch.fieldNames[i+1:]...)
322-
i--
323-
if len(batch.unparsedDocuments) == 0 {
369+
removeIDMap.Set(i)
370+
if removeIDMap.All() {
324371
return nil
325372
}
326373
}
327374
}
328-
if len(batch.unparsedDocuments) > 0 {
329-
batch.documents = append(batch.documents, batch.unparsedDocuments...)
330-
batch.ids = append(batch.ids, batch.unparsedIDs...)
375+
if removeIDMap.Any() {
376+
for i := uint(0); i < uint(len(batch.unparsedIDs)); i++ {
377+
if removeIDMap.Test(i) {
378+
continue
379+
}
380+
batch.documents = append(batch.documents, batch.unparsedDocuments[i])
381+
batch.ids = append(batch.ids, batch.unparsedIDs[i])
382+
}
383+
} else {
384+
batch.documents = batch.unparsedDocuments
385+
batch.ids = batch.unparsedIDs
331386
}
332387
return nil
333388
}

index/writer_benchmark_test.go

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
// Copyright (c) 2020 Couchbase, Inc.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package index
16+
17+
import (
18+
"fmt"
19+
"os"
20+
"runtime/pprof"
21+
"testing"
22+
"time"
23+
)
24+
25+
// goos: darwin
26+
// goarch: arm64
27+
// pkg: github.com/blugelabs/bluge/index
28+
// cpu: Apple M1 Pro
29+
// BenchmarkWriter_removeExistingDocuments-NoCache 24632 41085 ns/op 18711 B/op 204 allocs/op
30+
// BenchmarkWriter_removeExistingDocuments-Cache 161628 6865 ns/op 2456 B/op 102 allocs/op
31+
func BenchmarkWriter_removeExistingDocuments(b *testing.B) {
32+
cfg, cleanup := CreateConfig("BenchmarkWriter_removeExistingDocuments")
33+
cfg.CacheMaxBytes = 100 << 20
34+
defer func() {
35+
err := cleanup()
36+
if err != nil {
37+
b.Log(err)
38+
}
39+
}()
40+
41+
idx, err := OpenWriter(cfg)
42+
if err != nil {
43+
b.Fatal(err)
44+
}
45+
defer func() {
46+
err = idx.Close()
47+
if err != nil {
48+
b.Fatal(err)
49+
}
50+
}()
51+
for i := 0; i < 3000; i += 100 {
52+
batch := NewBatch()
53+
for j := 0; j < 100; j++ {
54+
serviceName := fmt.Sprintf("service-%d", (i+j)%10) // 10 different service names
55+
ipAddress := fmt.Sprintf("192.168.%d.%d", (i+j)/256, (i+j)%256) // IP addresses
56+
docID := fmt.Sprintf("%s-%s", serviceName, ipAddress)
57+
doc := &FakeDocument{
58+
NewFakeField("_id", docID, true, false, true),
59+
NewFakeField("title", fmt.Sprintf("mister-%d", i), true, false, true),
60+
}
61+
batch.Insert(doc)
62+
}
63+
if err := idx.Batch(batch); err != nil {
64+
b.Fatalf("failed to apply batch: %v", err)
65+
}
66+
}
67+
time.Sleep(1 * time.Second)
68+
batchRemove := NewBatch()
69+
for j := 0; j < 100; j++ {
70+
serviceName := fmt.Sprintf("service-%d", j%10) // 10 different service names
71+
ipAddress := fmt.Sprintf("192.168.%d.%d", j/256, j%256) // IP addresses
72+
docID := fmt.Sprintf("%s-%s", serviceName, ipAddress) // Document ID composed of service name and IP address
73+
doc := &FakeDocument{
74+
NewFakeField("_id", docID, true, false, true),
75+
NewFakeField("title", fmt.Sprintf("mister-%d", j), true, false, true),
76+
}
77+
batchRemove.InsertIfAbsent(testIdentifier(docID), []string{"title"}, doc)
78+
}
79+
80+
// Start profiling
81+
f, err := os.Create("cpu.prof")
82+
if err != nil {
83+
b.Fatal(err)
84+
}
85+
defer f.Close()
86+
if err := pprof.StartCPUProfile(f); err != nil {
87+
b.Fatal(err)
88+
}
89+
defer pprof.StopCPUProfile()
90+
b.ResetTimer()
91+
for i := 0; i < b.N; i++ {
92+
batchRemove.ResetDoc()
93+
err := idx.removeExistingDocuments(batchRemove)
94+
if err != nil {
95+
b.Fatal(err)
96+
}
97+
}
98+
}

0 commit comments

Comments
 (0)