diff --git a/Tiltfile b/Tiltfile index 675573c30c3..82fa2e13d94 100644 --- a/Tiltfile +++ b/Tiltfile @@ -157,10 +157,21 @@ k8s_yaml( 'helm template --set-file rustFrontendService.configuration=rust/frontend/sample_configs/distributed.yaml,rustLogService.configuration=rust/worker/chroma_config.yaml,heapTenderService.configuration=rust/worker/chroma_config.yaml,compactionService.configuration=rust/worker/chroma_config.yaml,queryService.configuration=rust/worker/chroma_config.yaml,garbageCollector.configuration=rust/worker/chroma_config.yaml --values k8s/distributed-chroma/values.yaml,k8s/distributed-chroma/values.dev.yaml k8s/distributed-chroma' ), ) + +k8s_yaml( + local( + 'helm template --set-file rustFrontendService.configuration=rust/frontend/sample_configs/distributed2.yaml,rustLogService.configuration=rust/worker/chroma_config2.yaml,heapTenderService.configuration=rust/worker/chroma_config2.yaml,compactionService.configuration=rust/worker/chroma_config2.yaml,queryService.configuration=rust/worker/chroma_config2.yaml,garbageCollector.configuration=rust/worker/chroma_config2.yaml --values k8s/distributed-chroma/values2.yaml,k8s/distributed-chroma/values2.dev.yaml k8s/distributed-chroma' + ), +) + watch_file('rust/frontend/sample_configs/distributed.yaml') +watch_file('rust/frontend/sample_configs/distributed2.yaml') watch_file('rust/worker/chroma_config.yaml') +watch_file('rust/worker/chroma_config2.yaml') watch_file('k8s/distributed-chroma/values.yaml') watch_file('k8s/distributed-chroma/values.dev.yaml') +watch_file('k8s/distributed-chroma/values2.yaml') +watch_file('k8s/distributed-chroma/values2.dev.yaml') watch_file('k8s/distributed-chroma/*.yaml') @@ -177,60 +188,151 @@ k8s_yaml([ 'k8s/test/prometheus.yaml', 'k8s/test/test-memberlist-cr.yaml', 'k8s/test/postgres.yaml', + 'k8s/test/postgres2.yaml', ]) # Lots of things assume the cluster is in a basic state. Get it into a basic # state before deploying anything else. 
k8s_resource( objects=[ - 'pod-watcher:Role', 'memberlists.chroma.cluster:CustomResourceDefinition', - 'query-service-memberlist:MemberList', - 'compaction-service-memberlist:MemberList', - 'garbage-collection-service-memberlist:MemberList', - 'rust-log-service-memberlist:MemberList', - - 'sysdb-serviceaccount:ServiceAccount', - 'sysdb-serviceaccount-rolebinding:RoleBinding', - 'sysdb-query-service-memberlist-binding:RoleBinding', - 'sysdb-compaction-service-memberlist-binding:RoleBinding', - - 'query-service-serviceaccount:ServiceAccount', - 'query-service-serviceaccount-rolebinding:RoleBinding', - 'query-service-memberlist-readerwriter:Role', - 'query-service-query-service-memberlist-binding:RoleBinding', - 'query-service-memberlist-readerwriter-binding:RoleBinding', - - 'compaction-service-memberlist-readerwriter:Role', - 'compaction-service-compaction-service-memberlist-binding:RoleBinding', - 'compaction-service-memberlist-readerwriter-binding:RoleBinding', - 'compaction-service-serviceaccount:ServiceAccount', - 'compaction-service-serviceaccount-rolebinding:RoleBinding', - - 'test-memberlist:MemberList', - 'test-memberlist-reader:Role', - 'test-memberlist-reader-binding:RoleBinding', - 'lease-watcher:Role', - 'rust-frontend-service-config:ConfigMap', + + 'pod-watcher:Role:chroma', + 'query-service-memberlist:MemberList:chroma', + 'compaction-service-memberlist:MemberList:chroma', + 'garbage-collection-service-memberlist:MemberList:chroma', + 'rust-log-service-memberlist:MemberList:chroma', + + 'sysdb-serviceaccount:ServiceAccount:chroma', + 'sysdb-serviceaccount-rolebinding:RoleBinding:chroma', + 'sysdb-query-service-memberlist-binding:RoleBinding:chroma', + 'sysdb-compaction-service-memberlist-binding:RoleBinding:chroma', + + 'query-service-serviceaccount:ServiceAccount:chroma', + 'query-service-serviceaccount-rolebinding:RoleBinding:chroma', + 'query-service-memberlist-readerwriter:Role:chroma', + 
'query-service-query-service-memberlist-binding:RoleBinding:chroma', + 'query-service-memberlist-readerwriter-binding:RoleBinding:chroma', + + 'compaction-service-memberlist-readerwriter:Role:chroma', + 'compaction-service-compaction-service-memberlist-binding:RoleBinding:chroma', + 'compaction-service-memberlist-readerwriter-binding:RoleBinding:chroma', + 'compaction-service-serviceaccount:ServiceAccount:chroma', + 'compaction-service-serviceaccount-rolebinding:RoleBinding:chroma', + + 'test-memberlist:MemberList:chroma', + 'test-memberlist-reader:Role:chroma', + 'test-memberlist-reader-binding:RoleBinding:chroma', + 'lease-watcher:Role:chroma', + 'rust-frontend-service-config:ConfigMap:chroma', ], new_name='k8s_setup', labels=["infrastructure"], ) +# Lots of things assume the cluster is in a basic state. Get it into a basic +# state before deploying anything else. +k8s_resource( + objects=[ + 'pod-watcher:Role:chroma2', + 'query-service-memberlist:MemberList:chroma2', + 'compaction-service-memberlist:MemberList:chroma2', + 'garbage-collection-service-memberlist:MemberList:chroma2', + 'rust-log-service-memberlist:MemberList:chroma2', + + 'sysdb-serviceaccount:ServiceAccount:chroma2', + 'sysdb-serviceaccount-rolebinding:RoleBinding:chroma2', + 'sysdb-query-service-memberlist-binding:RoleBinding:chroma2', + 'sysdb-compaction-service-memberlist-binding:RoleBinding:chroma2', + + 'query-service-serviceaccount:ServiceAccount:chroma2', + 'query-service-serviceaccount-rolebinding:RoleBinding:chroma2', + 'query-service-memberlist-readerwriter:Role:chroma2', + 'query-service-query-service-memberlist-binding:RoleBinding:chroma2', + 'query-service-memberlist-readerwriter-binding:RoleBinding:chroma2', + + 'compaction-service-memberlist-readerwriter:Role:chroma2', + 'compaction-service-compaction-service-memberlist-binding:RoleBinding:chroma2', + 'compaction-service-memberlist-readerwriter-binding:RoleBinding:chroma2', + 
'compaction-service-serviceaccount:ServiceAccount:chroma2', + 'compaction-service-serviceaccount-rolebinding:RoleBinding:chroma2', + + 'lease-watcher:Role:chroma2', + 'rust-frontend-service-config:ConfigMap:chroma2', + ], + new_name='k8s_setup2', + labels=["infrastructure2"], +) + # Production Chroma -k8s_resource('postgres', resource_deps=['k8s_setup'], labels=["infrastructure"], port_forwards='5432:5432') +k8s_resource('postgres:deployment:chroma', resource_deps=['k8s_setup'], labels=["infrastructure"], port_forwards='5432:5432') # Jobs are suffixed with the image tag to ensure they are unique. In this context, the image tag is defined in k8s/distributed-chroma/values.yaml. -k8s_resource('sysdb-migration-latest', resource_deps=['postgres'], labels=["infrastructure"]) -k8s_resource('rust-log-service', labels=["chroma"], port_forwards=['50054:50051', '50052:50052'], resource_deps=['minio-deployment']) -k8s_resource('sysdb', resource_deps=['sysdb-migration-latest'], labels=["chroma"], port_forwards='50051:50051') -k8s_resource('rust-frontend-service', resource_deps=['sysdb', 'rust-log-service'], labels=["chroma"], port_forwards='8000:8000') -k8s_resource('query-service', resource_deps=['sysdb'], labels=["chroma"], port_forwards='50053:50051') -k8s_resource('compaction-service', resource_deps=['sysdb'], labels=["chroma"]) +k8s_resource('sysdb-migration-latest:job:chroma', resource_deps=['postgres:deployment:chroma'], labels=["infrastructure"]) +k8s_resource('rust-log-service:statefulset:chroma', labels=["chroma"], port_forwards=['50054:50051', '50052:50052'], resource_deps=['minio-deployment']) +k8s_resource('sysdb:deployment:chroma', resource_deps=['sysdb-migration-latest:job:chroma'], labels=["chroma"], port_forwards='50051:50051') +k8s_resource('rust-frontend-service:deployment:chroma', resource_deps=['sysdb:deployment:chroma', 'rust-log-service:statefulset:chroma'], labels=["chroma"], port_forwards='8000:8000') +k8s_resource('query-service:statefulset:chroma', 
resource_deps=['sysdb:deployment:chroma'], labels=["chroma"], port_forwards='50053:50051') +k8s_resource('compaction-service:statefulset:chroma', resource_deps=['sysdb:deployment:chroma'], labels=["chroma"]) +k8s_resource('garbage-collector:statefulset:chroma', resource_deps=['k8s_setup', 'minio-deployment'], labels=["chroma"], port_forwards='50055:50055') k8s_resource('load-service', resource_deps=['k8s_setup'], labels=["infrastructure"], port_forwards='3001:3001') + +# Production Chroma 2 +k8s_resource('postgres:deployment:chroma2', resource_deps=['k8s_setup2'], labels=["infrastructure2"], port_forwards='6432:5432') +# Jobs are suffixed with the image tag to ensure they are unique. In this context, the image tag is defined in k8s/distributed-chroma/values2.yaml. +k8s_resource('sysdb-migration-latest:job:chroma2', resource_deps=['postgres:deployment:chroma2'], labels=["infrastructure2"]) +k8s_resource('rust-log-service:statefulset:chroma2', labels=["chroma2"], port_forwards=['60054:50051', '60052:50052'], resource_deps=['minio-deployment']) +k8s_resource('sysdb:deployment:chroma2', resource_deps=['sysdb-migration-latest:job:chroma2'], labels=["chroma2"], port_forwards='60051:50051') +k8s_resource('rust-frontend-service:deployment:chroma2', resource_deps=['sysdb:deployment:chroma2', 'rust-log-service:statefulset:chroma2'], labels=["chroma2"], port_forwards='8001:8000') +k8s_resource('query-service:statefulset:chroma2', resource_deps=['sysdb:deployment:chroma2'], labels=["chroma2"], port_forwards='60053:50051') +k8s_resource('compaction-service:statefulset:chroma2', resource_deps=['sysdb:deployment:chroma2'], labels=["chroma2"]) +k8s_resource('garbage-collector:statefulset:chroma2', resource_deps=['k8s_setup2', 'minio-deployment'], labels=["chroma2"], port_forwards='60055:50055') + +# Observability k8s_resource('jaeger', resource_deps=['k8s_setup'], labels=["observability"]) k8s_resource('grafana', resource_deps=['k8s_setup'], labels=["observability"]) 
k8s_resource('prometheus', resource_deps=['k8s_setup'], labels=["observability"]) k8s_resource('otel-collector', resource_deps=['k8s_setup'], labels=["observability"]) -k8s_resource('garbage-collector', resource_deps=['k8s_setup', 'minio-deployment'], labels=["chroma"], port_forwards='50055:50055') + # Local S3 k8s_resource('minio-deployment', resource_deps=['k8s_setup'], labels=["debug"], port_forwards=['9000:9000', '9005:9005']) + + +# Set the enabled resources +# - Basic resources are always enabled. +# - Multi-region resources are only enabled if the env var MULTI_REGION is set to true. +config.clear_enabled_resources() + +groups = { + 'basic': [ + 'k8s_setup', + 'postgres:deployment:chroma', + 'sysdb-migration-latest:job:chroma', + 'rust-log-service:statefulset:chroma', + 'sysdb:deployment:chroma', + 'rust-frontend-service:deployment:chroma', + 'query-service:statefulset:chroma', + 'compaction-service:statefulset:chroma', + 'load-service', + 'garbage-collector:statefulset:chroma', + 'jaeger', + 'grafana', + 'prometheus', + 'otel-collector', + 'minio-deployment', + ], + 'multi_region': [ + 'postgres:deployment:chroma2', + 'sysdb-migration-latest:job:chroma2', + 'rust-log-service:statefulset:chroma2', + 'sysdb:deployment:chroma2', + 'rust-frontend-service:deployment:chroma2', + 'query-service:statefulset:chroma2', + 'compaction-service:statefulset:chroma2', + 'garbage-collector:statefulset:chroma2', + ], +} + +if os.environ.get('MULTI_REGION') == 'true': + config.set_enabled_resources(groups['basic'] + groups['multi_region']) +else: + config.set_enabled_resources(groups['basic']) diff --git a/go/pkg/memberlist_manager/memberlist_store.go b/go/pkg/memberlist_manager/memberlist_store.go index 7f59277f5d4..1feaa9cb5ca 100644 --- a/go/pkg/memberlist_manager/memberlist_store.go +++ b/go/pkg/memberlist_manager/memberlist_store.go @@ -144,7 +144,7 @@ func (s *CRMemberlistStore) UpdateMemberlist(ctx context.Context, memberlist Mem log.Debug("Updating memberlist store", 
zap.Any("memberlist", memberlist)) unstructured := memberlist.toCr(s.coordinatorNamespace, s.memberlistCustomResource, resourceVersion) log.Debug("Setting memberlist to unstructured object", zap.Any("unstructured", unstructured)) - _, err := s.dynamicClient.Resource(gvr).Namespace("chroma").Update(context.Background(), unstructured, metav1.UpdateOptions{}) + _, err := s.dynamicClient.Resource(gvr).Namespace(s.coordinatorNamespace).Update(context.Background(), unstructured, metav1.UpdateOptions{}) if err != nil { return err } diff --git a/k8s/distributed-chroma/values.dev.yaml b/k8s/distributed-chroma/values.dev.yaml index 46f66739009..8e5e50112a3 100644 --- a/k8s/distributed-chroma/values.dev.yaml +++ b/k8s/distributed-chroma/values.dev.yaml @@ -1,11 +1,12 @@ sysdb: flags: version-file-enabled: true - s3-endpoint: "http://minio:9000" + s3-endpoint: "http://minio.chroma.svc.cluster.local:9000" s3-access-key-id: "minio" s3-secret-access-key: "minio123" s3-force-path-style: true create-bucket-if-not-exists: true + kubernetes-namespace: chroma rustFrontendService: # We have to specify the command, because the Dockerfile uses the CLI since its shared with # single node, so in values.dev we pass the CONFIG_PATH into the chroma run command diff --git a/k8s/distributed-chroma/values.yaml b/k8s/distributed-chroma/values.yaml index d7ba8d932df..a91bf6825d0 100644 --- a/k8s/distributed-chroma/values.yaml +++ b/k8s/distributed-chroma/values.yaml @@ -20,7 +20,7 @@ sysdb: replicaCount: 1 env: - name: OPTL_TRACING_ENDPOINT - value: 'value: "jaeger:4317"' + value: 'value: "jaeger.chroma.svc.cluster.local:4317"' resources: limits: cpu: '2000m' diff --git a/k8s/distributed-chroma/values2.dev.yaml b/k8s/distributed-chroma/values2.dev.yaml new file mode 100644 index 00000000000..7ea3b5ada6e --- /dev/null +++ b/k8s/distributed-chroma/values2.dev.yaml @@ -0,0 +1,36 @@ +sysdb: + flags: + version-file-enabled: true + s3-endpoint: "http://minio.chroma.svc.cluster.local:9000" + 
s3-access-key-id: "minio" + s3-secret-access-key: "minio123" + s3-force-path-style: true + create-bucket-if-not-exists: true + kubernetes-namespace: chroma2 +rustFrontendService: + # We have to specify the command, because the Dockerfile uses the CLI since it's shared with + # single node, so in values.dev we pass the CONFIG_PATH into the chroma run command + command: '["chroma", "run", "$(CONFIG_PATH)"]' + otherEnvConfig: | + - name: CHROMA_ALLOW_RESET + value: "true" + - name: RUST_BACKTRACE + value: 'value: "1"' + +queryService: + env: + - name: RUST_BACKTRACE + value: 'value: "1"' + jemallocConfig: "prof:true,prof_active:true,lg_prof_sample:19" + +compactionService: + env: + - name: RUST_BACKTRACE + value: 'value: "1"' + jemallocConfig: "prof:true,prof_active:true,lg_prof_sample:19" + +rustLogService: + replicaCount: 1 + +garbageCollector: + jemallocConfig: "prof:true,prof_active:true,lg_prof_sample:19" diff --git a/k8s/distributed-chroma/values2.yaml b/k8s/distributed-chroma/values2.yaml new file mode 100644 index 00000000000..4a50a90d57f --- /dev/null +++ b/k8s/distributed-chroma/values2.yaml @@ -0,0 +1,87 @@ +# Default values for distributed-chroma. +# Strongly prefer single quotes. 
+namespace: 'chroma2' +rustFrontendService: + image: + repository: 'rust-frontend-service' + tag: 'latest' + replicaCount: 1 + resources: + limits: + cpu: '2000m' + memory: '1Gi' + requests: + cpu: '1000m' + memory: '512Mi' +sysdb: + image: + repository: 'sysdb' + tag: 'latest' + replicaCount: 1 + env: + - name: OPTL_TRACING_ENDPOINT + value: 'value: "jaeger.chroma.svc.cluster.local:4317"' + resources: + limits: + cpu: '2000m' + memory: '1Gi' + requests: + cpu: '1000m' + memory: '512Mi' +rustLogService: + image: + repository: 'rust-log-service' + tag: 'latest' + cache: + hostPath: '/local/cache2/chroma-log-service' + type: DirectoryOrCreate + mountPath: '/cache/' + additionalVolumes: [] +queryService: + image: + repository: 'query-service' + tag: 'latest' + env: + cache: + hostPath: '/local/cache2/chroma-query-service' + type: DirectoryOrCreate + mountPath: '/cache/' + additionalVolumes: [] + replicaCount: 2 +compactionService: + image: + repository: 'compaction-service' + tag: 'latest' + env: + cache: + hostPath: '/local/cache2/chroma-compaction-service' + type: DirectoryOrCreate + mountPath: '/cache/' + additionalVolumes: [] + replicaCount: 1 +sysdbMigration: + image: + repository: 'sysdb-migration' + tag: 'latest' + username: chroma + password: chroma + netloc: postgres + port: 5432 + dbName: sysdb + sslmode: disable +# Add the garbage collector configuration +garbageCollector: + image: + repository: 'garbage-collector' + tag: 'latest' + replicaCount: 1 + resources: + limits: + cpu: '200m' + memory: '256Mi' + requests: + cpu: '100m' + memory: '128Mi' + cache: + hostPath: '/local/cache2/chroma-garbage-collector' + mountPath: '/cache/' diff --git a/k8s/test/postgres2.yaml b/k8s/test/postgres2.yaml new file mode 100644 index 00000000000..ade9b98f3e0 --- /dev/null +++ b/k8s/test/postgres2.yaml @@ -0,0 +1,50 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: postgres + namespace: chroma2 +spec: + replicas: 1 + selector: + matchLabels: + app: postgres + 
template: + metadata: + labels: + app: postgres + spec: + containers: + - name: postgres + image: chroma-postgres + env: + - name: POSTGRES_MULTIPLE_DATABASES + value: "sysdb,log" + - name: POSTGRES_USER + value: chroma + - name: POSTGRES_PASSWORD + value: chroma + ports: + - containerPort: 5432 + readinessProbe: + exec: + command: + - pg_isready + - -U + - chroma + periodSeconds: 1 + failureThreshold: 10 + +--- +apiVersion: v1 +kind: Service +metadata: + name: postgres + namespace: chroma2 +spec: + ports: + - name: postgres-port + port: 5432 + targetPort: 5432 + selector: + app: postgres + type: ClusterIP diff --git a/rust/frontend/sample_configs/distributed.yaml b/rust/frontend/sample_configs/distributed.yaml index 8c549f3e9bd..c59f4fda148 100644 --- a/rust/frontend/sample_configs/distributed.yaml +++ b/rust/frontend/sample_configs/distributed.yaml @@ -2,7 +2,7 @@ # distributed chroma. It is used in our Tiltfile as well. open_telemetry: service_name: "rust-frontend-service" - endpoint: "http://otel-collector:4317" + endpoint: "http://otel-collector.chroma.svc.cluster.local:4317" filters: - crate_name: "chroma_frontend" filter_level: "trace" @@ -28,6 +28,11 @@ log: port: 50051 connect_timeout_ms: 5000 request_timeout_ms: 60000 # 1 minute + memberlist_provider: + custom_resource: + kube_namespace: "chroma" + memberlist_name: "rust-log-service-memberlist" + queue_size: 100 executor: distributed: diff --git a/rust/frontend/sample_configs/distributed2.yaml b/rust/frontend/sample_configs/distributed2.yaml new file mode 100644 index 00000000000..2ff160a2532 --- /dev/null +++ b/rust/frontend/sample_configs/distributed2.yaml @@ -0,0 +1,71 @@ +# This file is an example of configuring the frontend for +# distributed chroma. It is used in our Tiltfile as well. 
+open_telemetry: + service_name: "rust-frontend-service" + endpoint: "http://otel-collector.chroma.svc.cluster.local:4317" + filters: + - crate_name: "chroma_frontend" + filter_level: "trace" +sysdb: + grpc: + host: "sysdb.chroma2" + port: 50051 + connect_timeout_ms: 60000 + request_timeout_ms: 60000 +collections_with_segments_provider: + cache: + lru: + name: "collections_with_segments_cache" + capacity: 1000 + cache_ttl_secs: 60 + permitted_parallelism: 180 + cache_invalidation_retry_policy: + delay_ms: 0 + max_retries: 3 +log: + grpc: + host: "rust-log-service.chroma2" + port: 50051 + connect_timeout_ms: 5000 + request_timeout_ms: 60000 # 1 minute + memberlist_provider: + custom_resource: + kube_namespace: "chroma2" + memberlist_name: "rust-log-service-memberlist" + queue_size: 100 + +executor: + distributed: + connections_per_node: 5 + replication_factor: 2 + connect_timeout_ms: 5000 + request_timeout_ms: 60000 # 1 minute + assignment: + rendezvous_hashing: + hasher: Murmur3 + memberlist_provider: + custom_resource: + kube_namespace: "chroma2" + memberlist_name: "query-service-memberlist" + queue_size: 100 + client_selection: + first_attempt_weights: [0.8, 0.2] + uniform_on_retry: true +scorecard_enabled: true +scorecard: + - patterns: + - "op:read" + - "collection:*" + score: 10 + - patterns: + - "op:write" + - "collection:*" + score: 1 +circuit_breaker: + requests: 1000 +enable_span_indexing: true +default_knn_index: "spann" +enable_schema: true +tenants_to_migrate_immediately: +- "default_tenant" +tenants_to_migrate_immediately_threshold: "ffffffff-ffff-ffff-ffff-ffffffffffff" diff --git a/rust/worker/chroma_config.yaml b/rust/worker/chroma_config.yaml index 4c3b4037ea1..d2439e65335 100644 --- a/rust/worker/chroma_config.yaml +++ b/rust/worker/chroma_config.yaml @@ -4,7 +4,7 @@ # for now we nest it in the worker directory query_service: service_name: "query-service" - otel_endpoint: "http://otel-collector:4317" + otel_endpoint: 
"http://otel-collector.chroma.svc.cluster.local:4317" otel_filters: - crate_name: "worker" filter_level: "trace" @@ -43,6 +43,11 @@ query_service: port: 50051 connect_timeout_ms: 5000 request_timeout_ms: 60000 # 1 minute + memberlist_provider: + custom_resource: + kube_namespace: "chroma" + memberlist_name: "rust-log-service-memberlist" + queue_size: 100 dispatcher: num_worker_threads: 4 dispatcher_queue_size: 1000 @@ -87,7 +92,7 @@ query_service: fetch_log_batch_size: 1000 compaction_service: service_name: "compaction-service" - otel_endpoint: "http://otel-collector:4317" + otel_endpoint: "http://otel-collector.chroma.svc.cluster.local:4317" otel_filters: - crate_name: "worker" filter_level: "trace" @@ -126,6 +131,11 @@ compaction_service: port: 50051 connect_timeout_ms: 5000 request_timeout_ms: 60000 # 1 minute + memberlist_provider: + custom_resource: + kube_namespace: "chroma" + memberlist_name: "rust-log-service-memberlist" + queue_size: 100 dispatcher: num_worker_threads: 4 dispatcher_queue_size: 1000 @@ -179,7 +189,7 @@ log_service: reinsert_threshold: 0 opentelemetry: service_name: "rust-log-service" - endpoint: "http://otel-collector:4317" + endpoint: "http://otel-collector.chroma.svc.cluster.local:4317" filters: - crate_name: "chroma_log" filter_level: "trace" @@ -209,7 +219,7 @@ log_service: headroom: 200 garbage_collector: service_name: "garbage-collector" - otel_endpoint: "http://otel-collector:4317" + otel_endpoint: "http://otel-collector.chroma.svc.cluster.local:4317" otel_filters: - crate_name: "garbage_collector" filter_level: "debug" @@ -244,3 +254,8 @@ garbage_collector: port: 50051 connect_timeout_ms: 5000 request_timeout_ms: 5000 + memberlist_provider: + custom_resource: + kube_namespace: "chroma" + memberlist_name: "rust-log-service-memberlist" + queue_size: 100 diff --git a/rust/worker/chroma_config2.yaml b/rust/worker/chroma_config2.yaml new file mode 100644 index 00000000000..1487b2877f1 --- /dev/null +++ b/rust/worker/chroma_config2.yaml 
@@ -0,0 +1,261 @@ +# Default configuration for query and compaction service +# In the long term, every service should have an entry in this file +# and this can become the global configuration file for Chroma +# for now we nest it in the worker directory +query_service: + service_name: "query-service" + otel_endpoint: "http://otel-collector.chroma.svc.cluster.local:4317" + otel_filters: + - crate_name: "worker" + filter_level: "trace" + my_member_id: "query-service-0" + my_port: 50051 + assignment_policy: + rendezvous_hashing: + hasher: Murmur3 + memberlist_provider: + custom_resource: + kube_namespace: "chroma2" + memberlist_name: "query-service-memberlist" + queue_size: 100 + sysdb: + grpc: + host: "sysdb.chroma2" + port: 50051 + connect_timeout_ms: 5000 + request_timeout_ms: 5000 + storage: + admission_controlled_s3: + s3_config: + bucket: "chroma-storage2" + credentials: "Minio" + connect_timeout_ms: 5000 + request_timeout_ms: 30000 # 30 seconds + upload_part_size_bytes: 536870912 # 512MiB + download_part_size_bytes: 8388608 # 8MiB + rate_limiting_policy: + count_based_policy: + max_concurrent_requests: 30 + bandwidth_allocation: [0.7, 0.3] + log: + grpc: + host: "rust-log-service.chroma2" + port: 50051 + connect_timeout_ms: 5000 + request_timeout_ms: 60000 # 1 minute + memberlist_provider: + custom_resource: + kube_namespace: "chroma2" + memberlist_name: "rust-log-service-memberlist" + queue_size: 100 + dispatcher: + num_worker_threads: 4 + dispatcher_queue_size: 1000 + worker_queue_size: 100 + task_queue_limit: 10000 + active_io_tasks: 10000 + blockfile_provider: + arrow: + block_manager_config: + max_block_size_bytes: 8388608 # 8MB + block_cache_config: + disk: + dir: "/cache/chroma/query-service/block-cache" + name: "block_cache" + # 1k blocks * 8MiB = 8GiB, this is actually ignored in the disk cache config. Leaving it set to 1k for consistency. 
+ capacity: 1000 + mem: 8000 # 8GiB + disk: 12884 # 12GiB + file_size: 256 # 256 MiB + flushers: 4 + flush: false + reclaimers: 2 + recover_concurrency: 16 + admission_rate_limit: 256 # 256MiB/s + shards: 64 + eviction: lru + num_concurrent_block_flushes: 40 + sparse_index_manager_config: + sparse_index_cache_config: + lru: + name: "sparse_index_cache" + capacity: 1000 + hnsw_provider: + hnsw_temporary_path: "~/tmp" + hnsw_cache_config: + weighted_lru: + name: "hnsw_cache" + capacity: 8589934592 # 8GB + permitted_parallelism: 180 + spann_provider: + adaptive_search_nprobe: false + fetch_log_batch_size: 1000 +compaction_service: + service_name: "compaction-service" + otel_endpoint: "http://otel-collector.chroma.svc.cluster.local:4317" + otel_filters: + - crate_name: "worker" + filter_level: "trace" + my_member_id: "compaction-service-0" + my_port: 50051 + assignment_policy: + rendezvous_hashing: + hasher: Murmur3 + memberlist_provider: + custom_resource: + kube_namespace: "chroma2" + memberlist_name: "compaction-service-memberlist" + queue_size: 100 + sysdb: + grpc: + host: "sysdb.chroma2" + port: 50051 + connect_timeout_ms: 5000 + request_timeout_ms: 5000 + storage: + admission_controlled_s3: + s3_config: + bucket: "chroma-storage2" + credentials: "Minio" + connect_timeout_ms: 5000 + request_timeout_ms: 60000 # 1 minute + upload_part_size_bytes: 536870912 # 512MiB + download_part_size_bytes: 8388608 # 8MiB + rate_limiting_policy: + count_based_policy: + max_concurrent_requests: 30 + bandwidth_allocation: [0.7, 0.3] + log: + grpc: + host: "rust-log-service.chroma2" + port: 50051 + connect_timeout_ms: 5000 + request_timeout_ms: 60000 # 1 minute + memberlist_provider: + custom_resource: + kube_namespace: "chroma2" + memberlist_name: "rust-log-service-memberlist" + queue_size: 100 + dispatcher: + num_worker_threads: 4 + dispatcher_queue_size: 1000 + worker_queue_size: 100 + task_queue_limit: 10000 + active_io_tasks: 10000 + compactor: + compaction_manager_queue_size: 
1000 + max_concurrent_jobs: 50 + compaction_interval_sec: 10 + min_compaction_size: 10 + max_compaction_size: 10000 + max_partition_size: 5000 + disabled_collections: [] # uuids to disable compaction for + fetch_log_batch_size: 1000 + task_runner: + enabled: true + blockfile_provider: + arrow: + block_manager_config: + max_block_size_bytes: 8388608 # 8MB + block_cache_config: + lru: + name: "block_cache" + capacity: 1000 + num_concurrent_block_flushes: 40 + sparse_index_manager_config: + sparse_index_cache_config: + lru: + name: "sparse_index_cache" + capacity: 1000 + hnsw_provider: + hnsw_temporary_path: "~/tmp" + hnsw_cache_config: + weighted_lru: + name: "hnsw_cache" + capacity: 8192 # 8192 MiB = 8GB + permitted_parallelism: 180 + spann_provider: + pl_block_size: 5242880 # 5MiB + pl_garbage_collection: + enabled: true + policy: + random_sample: + sample_size: 0.1 + hnsw_garbage_collection: + enabled: true + policy: "full_rebuild" +log_service: + num_records_before_backpressure: 100000 + reinsert_threshold: 0 + opentelemetry: + service_name: "rust-log-service" + endpoint: "http://otel-collector.chroma.svc.cluster.local:4317" + filters: + - crate_name: "chroma_log" + filter_level: "trace" + - crate_name: "wal3" + filter_level: "trace" + storage: + admission_controlled_s3: + s3_config: + bucket: "chroma-storage2" + credentials: "Minio" + connect_timeout_ms: 5000 + request_timeout_ms: 60000 # 1 minute + upload_part_size_bytes: 536870912 # 512MiB + download_part_size_bytes: 8388608 # 8MiB + rate_limiting_policy: + count_based_policy: + max_concurrent_requests: 500 + bandwidth_allocation: [1.0] + cache: + memory: + capacity: 100000000 # 100 MB + writer: + throttle_fragment: + batch_interval_us: 100000 + batch_size_bytes: 8388608 # 8MiB + throughput: 3300 + headroom: 200 +garbage_collector: + service_name: "garbage-collector" + otel_endpoint: "http://otel-collector.chroma.svc.cluster.local:4317" + otel_filters: + - crate_name: "garbage_collector" + filter_level: 
"debug" + relative_cutoff_time_seconds: 60 # GC all versions created at time < now() - relative_cutoff_time_seconds (1 minute) + max_collections_to_gc: 1000 + gc_interval_mins: 1 + disallow_collections: [] + default_mode: "deletev2" + sysdb_config: + host: "sysdb.chroma2" + port: 50051 + connect_timeout_ms: 60000 + request_timeout_ms: 60000 + dispatcher_config: + num_worker_threads: 4 + dispatcher_queue_size: 100 + worker_queue_size: 100 + storage_config: + s3: + bucket: "chroma-storage2" + assignment_policy: + rendezvous_hashing: + hasher: Murmur3 + memberlist_provider: + custom_resource: + kube_namespace: "chroma2" + memberlist_name: "garbage-collection-service-memberlist" + queue_size: 100 + log: + grpc: + host: "rust-log-service.chroma2" + port: 50051 + connect_timeout_ms: 5000 + request_timeout_ms: 5000 + memberlist_provider: + custom_resource: + kube_namespace: "chroma2" + memberlist_name: "rust-log-service-memberlist" + queue_size: 100