WARNING: THIS SITE IS A MIRROR OF GITHUB.COM / YOU CANNOT LOG IN OR REGISTER ACCOUNTS HERE / THE CONTENTS ARE PROVIDED AS-IS / THIS SITE ASSUMES NO RESPONSIBILITY FOR ANY DISPLAYED CONTENT OR LINKS / IF YOU FIND SOMETHING THAT MAY NOT BE SUITABLE FOR EVERYONE, CONTACT THE ADMIN AT ilovescratch@foxmail.com
Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/ct-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ chart-repos:
- prometheus-community=https://prometheus-community.github.io/helm-charts
- opentelemetry=https://open-telemetry.github.io/opentelemetry-helm-charts
- kuberay=https://ray-project.github.io/kuberay-helm
- splunk=https://splunk.github.io/splunk-operator/

# Target branch for comparison (used in PRs)
target-branch: main
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/helm-lint-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ jobs:
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
helm repo add opentelemetry https://open-telemetry.github.io/opentelemetry-helm-charts
helm repo add kuberay https://ray-project.github.io/kuberay-helm
helm repo add splunk https://splunk.github.io/splunk-operator/
helm repo update

- name: Lint splunk-ai-operator chart
Expand Down
1 change: 1 addition & 0 deletions config/manager/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ patches:
\ - name: RELATED_IMAGE_WEAVIATE\n value: \"semitechnologies/weaviate:stable-v1.28-007846a\"\n
\ - name: RELATED_IMAGE_SAIA_API\n value: \"667741767953.dkr.ecr.us-west-2.amazonaws.com/ml-platform/saia/saia-api:build-1\"\n
\ - name: RELATED_IMAGE_POST_INSTALL_HOOK\n value: \"667741767953.dkr.ecr.us-west-2.amazonaws.com/ml-platform/saia/saia-data-loader:build-1\"\n
\ - name: RELATED_IMAGE_OTEL_COLLECTOR\n value: \"otel/opentelemetry-collector-contrib:0.122.1\"\n
\ - name: RELATED_IMAGE_FLUENT_BIT\n value: \"fluent/fluent-bit:1.9.6\"\n -
name: MODEL_VERSION\n value: \"v0.3.14-36-g1549f5a\"\n - name: RAY_VERSION\n
\ value: \"2.44.0\""
Expand Down
33 changes: 33 additions & 0 deletions helm-chart/splunk-ai-operator/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -56,3 +56,36 @@ version: "0.1.0"
# follow Semantic Versioning. They should reflect the version the application is using.
# It is recommended to use it with quotes.
appVersion: "0.1.0"

# Dependencies required by the Splunk AI Operator
# These operators must be installed for the AI Operator to function correctly
dependencies:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe we have to add these chart .tgz files to the charts/ directory to avoid errors during helm installation.

# cert-manager - Required for webhook certificates and mTLS
- name: cert-manager
version: "1.18.0"
repository: "https://charts.jetstack.io"
condition: cert-manager.enabled

# KubeRay Operator - Required for Ray cluster management
- name: kuberay-operator
version: "1.2.2"
repository: "https://ray-project.github.io/kuberay-helm"
condition: kuberay-operator.enabled

# OpenTelemetry Operator - Required for observability sidecars
- name: opentelemetry-operator
version: "0.88.6"
repository: "https://open-telemetry.github.io/opentelemetry-helm-charts"
condition: opentelemetry-operator.enabled

# Kube Prometheus Stack - Required for Prometheus monitoring
- name: kube-prometheus-stack
version: "72.4.0"
repository: "https://prometheus-community.github.io/helm-charts"
condition: kube-prometheus-stack.enabled

# Splunk Operator - Required for managing Splunk Enterprise instances
- name: splunk-operator
version: "3.0.0"
repository: "https://splunk.github.io/splunk-operator"
condition: splunk-operator.enabled
2 changes: 2 additions & 0 deletions helm-chart/splunk-ai-operator/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,8 @@ spec:
value: {{ .Values.saiaApiImage }}
- name: RELATED_IMAGE_POST_INSTALL_HOOK
value: {{ .Values.saiaSchemaImage }}
- name: RELATED_IMAGE_OTEL_COLLECTOR
value: {{ .Values.otelCollectorImage }}
- name: MODEL_VERSION
value: v0.3.14-36-g1549f5a
- name: RAY_VERSION
Expand Down
58 changes: 55 additions & 3 deletions helm-chart/splunk-ai-operator/values.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,52 @@
# ============================================================================
# Dependency Management
# ============================================================================
# These operators are required by the Splunk AI Operator and are installed
# automatically as dependencies. Set to false if already installed in your cluster.

# cert-manager - Required for webhook certificates and mTLS
cert-manager:
enabled: true
crds:
enabled: true

# KubeRay Operator - Required for Ray cluster management
kuberay-operator:
enabled: true

# OpenTelemetry Operator - Required for observability sidecars
opentelemetry-operator:
enabled: true
manager:
collectorImage:
repository: "otel/opentelemetry-collector-k8s"
tag: "0.102.1"

# Kube Prometheus Stack - Required for Prometheus monitoring
kube-prometheus-stack:
enabled: true
prometheus:
enabled: true
grafana:
enabled: false # Minimal installation
alertmanager:
enabled: false # Minimal installation

# Splunk Operator - Required for managing Splunk Enterprise instances
splunk-operator:
enabled: true
image:
repository: docker.io/splunk/splunk-operator
tag: 3.0.0
# Environment variables for splunk-operator
env:
- name: SPLUNK_GENERAL_TERMS
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We cannot set this by default in the Helm chart: the user has to add it manually to accept the terms after reading the Splunk General Terms (SGT). Can you set the value to empty and add a comment pointing to the Splunk Operator for Kubernetes (SOK) README for the appropriate value?

value: "--accept-sgt-current-at-splunk-com"

# ============================================================================
# Splunk AI Operator Configuration
# ============================================================================

# Metadata overrides:
# Override the Splunk AI Operator helm chart name
nameOverride: ""
Expand Down Expand Up @@ -44,6 +93,9 @@ weaviateImage: "docker.io/semitechnologies/weaviate:stable-v1.28-007846a"
saiaApiImage: "docker.io/splunk/saia-api:1.1.0"
saiaSchemaImage: "docker.io/splunk/saia-data-loader:1.1.0"

# OpenTelemetry Collector image for observability sidecars
otelCollectorImage: "otel/opentelemetry-collector-contrib:0.122.1"

# Set security context for Splunk Operator pod
# reference: https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.24/#podsecuritycontext-v1-core
securityContext:
Expand All @@ -53,10 +105,10 @@ securityContext:

# Splunk AI Operator image and pull policy
# reference: https://github.com/splunk/splunk-ai-operator
# Default uses GitHub Container Registry (ghcr.io) for official releases
# You can also use Docker Hub: docker.io/splunk/splunk-ai-operator
# Default uses Docker Hub for public releases
# You can also use GitHub Container Registry: ghcr.io/splunk/splunk-ai-operator
image:
repository: ghcr.io/splunk/splunk-ai-operator
repository: docker.io/splunk/splunk-ai-operator
tag: "v0.1.0"
pullPolicy: IfNotPresent

Expand Down
19 changes: 3 additions & 16 deletions helm-chart/splunk-ai-platform/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -46,24 +46,11 @@ annotations:
- kind: added
description: Initial release with full observability stack

# Dependencies
# The splunk-ai-operator chart includes all required operator dependencies
# (cert-manager, kuberay, opentelemetry, prometheus, splunk-operator)
dependencies:
- name: splunk-ai-operator
version: "0.1.0"
repository: "file://../splunk-ai-operator"
condition: splunk-ai-operator.enabled
- name: kuberay-operator
version: "1.3.2"
repository: "https://ray-project.github.io/kuberay-helm"
condition: kuberay-operator.enabled
- name: cert-manager
version: "1.18.0"
repository: "https://charts.jetstack.io"
condition: cert-manager.enabled
- name: kube-prometheus-stack
version: "72.4.0"
repository: "https://prometheus-community.github.io/helm-charts"
condition: prometheus.enabled
- name: opentelemetry-operator
version: "0.88.6"
repository: "https://open-telemetry.github.io/opentelemetry-helm-charts"
condition: opentelemetry-operator.enabled
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
82 changes: 36 additions & 46 deletions helm-chart/splunk-ai-platform/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,34 +3,22 @@
# This is a YAML-formatted file.
# Users can override these (e.g. via --values) to configure both the CR and the Ingress.

# Deploy Splunk AI Operator chart
# Disable if the Splunk Operator is already deployed
splunk-ai-operator:
enabled: true

# Deploy KubeRay Operator chart
# Disable if the KubeRay Operator is already deployed
kuberay-operator:
enabled: true
# ============================================================================
# Operator Installation
# ============================================================================
# The Splunk AI Operator and all its dependencies (cert-manager, kuberay,
# opentelemetry, prometheus, splunk-operator) are installed automatically.
# Set to false if the operator is already installed in your cluster.

# Deploy Cert Manager chart
# Disable if the Cert Manager is already deployed
cert-manager:
enabled: true

# Deploy Kube Prometheus Stack chart
# Disable if the Kube Prometheus Stack is already deployed
prometheus:
splunk-ai-operator:
enabled: true

# Deploy OpenTelemetry Operator chart
# Disable if the OpenTelemetry Operator is already deployed
opentelemetry-operator:
enabled: true
manager:
collectorImage:
repository: "otel/opentelemetry-collector-k8s"
tag: "0.102.1"
# You can override dependency settings here if needed
# For example, to disable specific operators:
# cert-manager:
# enabled: false
# kuberay-operator:
# enabled: false

# Metadata overrides:
# Override the Splunk AI Platform helm chart name
Expand Down Expand Up @@ -133,27 +121,29 @@ images:
# Default GPU type to use for Ray worker groups
defaultAcceleratorType: ""

# SplunkConfigurationSpec instance reference
splunkConfiguration:
# CR of the SplunkConfiguration instance
splunkCustomResourceRef:
{}
# kind: "Standalone"
# namespace: "default"
# name: "splunk-standalone-stdln-0"
# apiVersion: "enterprise.splunk.com/v4"
# Splunk secret reference
secretRef:
{}
# name: "splunk-secret"
# namespace: "default"
endpoint: "https://splunk.default.svc.cluster.local:8089"
# Optional, if not using secretRef
token: ""
# Whether token comes from Kubernetes Secret or Vault Agent, e.g. "kubernetes" or "vault"
secretSource: "kubernetes"
# Path where Vault Agent injects the Splunk HEC token
vaultFilePath: ""
# SplunkConfigurationSpec instance reference (optional)
# Leave as {} (or set to null / comment out) if Splunk is not configured.
# When Splunk integration is needed, replace {} with a configuration like the commented example below.
splunkConfiguration: {}
# Example configuration:
# splunkConfiguration:
# # CR of the SplunkConfiguration instance
# splunkCustomResourceRef:
# kind: "Standalone"
# namespace: "default"
# name: "splunk-standalone-stdln-0"
# apiVersion: "enterprise.splunk.com/v4"
# # Splunk secret reference
# secretRef:
# name: "splunk-secret"
# namespace: "default"
# endpoint: "https://splunk.default.svc.cluster.local:8089"
# # Optional, if not using secretRef
# token: ""
# # Whether token comes from Kubernetes Secret or Vault Agent, e.g. "kubernetes" or "vault"
# secretSource: "kubernetes"
# # Path where Vault Agent injects the Splunk HEC token
# vaultFilePath: ""

# Specifies object storage for AI artifacts (e.g., S3 or GCS bucket)
storage:
Expand Down
8 changes: 3 additions & 5 deletions internal/webhook/v1/aiplatform_webhook.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,11 +76,9 @@ func (d *AIPlatformCustomDefaulter) Default(_ context.Context, obj runtime.Objec
aiplatform.Spec.ClusterDomain = "cluster.local"
}

// Default Sidecars
if !aiplatform.Spec.Sidecars.Otel && !aiplatform.Spec.Sidecars.PrometheusOperator {
aiplatform.Spec.Sidecars.Otel = true
aiplatform.Spec.Sidecars.PrometheusOperator = true
}
// Sidecars are intentionally not defaulted here. Forcing both to true whenever
// both were false made it impossible to disable them, so users must now enable
// sidecars explicitly in their configuration.

// Default Storage size for VectorDB if not specified
if aiplatform.Spec.Storage.VectorDB.Size == "" && aiplatform.Spec.Storage.VectorDB.PVCName == "" {
Expand Down
36 changes: 27 additions & 9 deletions pkg/ai/sidecars/builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -135,10 +135,16 @@ func (s *Builder) reconcileOpenTelemetryCollector(ctx context.Context, p *aiApi.
return fmt.Errorf("json unmarshal: %w", err)
}

// Get OTEL collector image from environment variable
otelImage := os.Getenv("RELATED_IMAGE_OTEL_COLLECTOR")
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Some of these changes were added as part of #75. Should we cancel that PR in favor of the changes here?

if otelImage == "" {
otelImage = "otel/opentelemetry-collector-contrib:0.122.1" // fallback default
}

// construct spec
specMap := map[string]interface{}{
"mode": "sidecar",
"image": "otel/opentelemetry-collector-contrib:0.122.1",
"image": otelImage,
"env": []map[string]interface{}{
{"name": "SPLUNK_ACCESS_TOKEN", "valueFrom": map[string]interface{}{"secretKeyRef": map[string]interface{}{"name": s.ai.Spec.SplunkConfiguration.SecretRef.Name, "key": "hec_token"}}},
{"name": "POD_NAME", "valueFrom": map[string]interface{}{"fieldRef": map[string]interface{}{"fieldPath": "metadata.name"}}},
Expand Down Expand Up @@ -183,7 +189,10 @@ func (s *Builder) reconcileOtelConfigMap(ctx context.Context, p *aiApi.AIPlatfor
cm.Data = map[string]string{}
}
if _, exists := cm.Data["otel-config.yaml"]; !exists {
content := s.renderOtelConf(ctx, p)
content, err := s.renderOtelConf(ctx, p)
if err != nil {
return fmt.Errorf("rendering otel config: %w", err)
}
yamlBytes, err := syaml.Marshal(content)
if err != nil {
return fmt.Errorf("marshaling otel config: %w", err)
Expand All @@ -199,16 +208,23 @@ func (s *Builder) reconcileOtelConfigMap(ctx context.Context, p *aiApi.AIPlatfor
}

// renderOtelConf builds the OpenTelemetry Collector config map data.
func (s *Builder) renderOtelConf(ctx context.Context, cr *aiApi.AIPlatform) map[string]interface{} {
// Returns the config and an error if validation fails.
func (s *Builder) renderOtelConf(ctx context.Context, cr *aiApi.AIPlatform) (map[string]interface{}, error) {
// Validate that the secret reference is provided
if cr.Spec.SplunkConfiguration.SecretRef.Name == "" {
return nil, fmt.Errorf("SplunkConfiguration.SecretRef.Name is required for OTEL sidecar")
}

// Validate that the secret exists
secret := &corev1.Secret{}
key := types.NamespacedName{Name: cr.Spec.SplunkConfiguration.SecretRef.Name, Namespace: cr.Namespace}
if err := s.Client.Get(ctx, key, secret); err != nil {
return map[string]interface{}{"error": fmt.Sprintf("loading secret %q: %v", key.Name, err)}
return nil, fmt.Errorf("failed to validate secret %q: %w", key.Name, err)
}

token, ok := secret.Data["hec_token"]
if !ok {
return map[string]interface{}{"error": "hec_token field not found in secret"}
// Verify the secret has the required key
if _, ok := secret.Data["hec_token"]; !ok {
return nil, fmt.Errorf("secret %q does not contain required key 'hec_token'", key.Name)
}

endpoint := fmt.Sprintf("%s/services/collector", cr.Spec.SplunkConfiguration.Endpoint)
Expand All @@ -219,7 +235,9 @@ func (s *Builder) renderOtelConf(ctx context.Context, cr *aiApi.AIPlatform) map[
return map[string]interface{}{
"exporters": map[string]interface{}{
"splunk_hec": map[string]interface{}{
"token": string(token),
// Use environment variable reference instead of embedding the token
// The SPLUNK_ACCESS_TOKEN env var is injected by the OpenTelemetryCollector CR
"token": "${SPLUNK_ACCESS_TOKEN}",
"endpoint": endpoint,
"source": "otel",
"sourcetype": "otel",
Expand Down Expand Up @@ -275,7 +293,7 @@ func (s *Builder) renderOtelConf(ctx context.Context, cr *aiApi.AIPlatform) map[
},
},
},
}
}, nil
}

// renderEnvoyConf generates the Envoy configuration for the given AIPlatform.
Expand Down
Loading
Loading