WARNING: THIS SITE IS A MIRROR OF GITHUB.COM / IT CANNOT LOGIN OR REGISTER ACCOUNTS / THE CONTENTS ARE PROVIDED AS-IS / THIS SITE ASSUMES NO RESPONSIBILITY FOR ANY DISPLAYED CONTENT OR LINKS / IF YOU FOUND SOMETHING MAY NOT GOOD FOR EVERYONE, CONTACT ADMIN AT ilovescratch@foxmail.com
Skip to content

Commit e0ba4ab

Browse files
authored
Merge pull request #1949 from rahulait/fix-disabling-nvidiadriver
2 parents ba5eabe + 85ddbc1 commit e0ba4ab

File tree

10 files changed

+215
-6
lines changed

10 files changed

+215
-6
lines changed

api/nvidia/v1/clusterpolicy_types.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1912,8 +1912,8 @@ func (d *DriverSpec) IsEnabled() bool {
19121912
return *d.Enabled
19131913
}
19141914

1915-
// UseNvdiaDriverCRDType returns true if the driver installation is managed by NVIDIADriver CRD type
1916-
func (d *DriverSpec) UseNvdiaDriverCRDType() bool {
1915+
// UseNvidiaDriverCRDType returns true if the driver installation is managed by NVIDIADriver CRD type
1916+
func (d *DriverSpec) UseNvidiaDriverCRDType() bool {
19171917
if d.UseNvidiaDriverCRD == nil {
19181918
// default is false if not specified by user
19191919
return false

api/nvidia/v1alpha1/nvidiadriver_types.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -476,7 +476,7 @@ const (
476476
type NVIDIADriverStatus struct {
477477
// INSERT ADDITIONAL STATUS FIELD - define observed state of cluster
478478
// Important: Run "make" to regenerate code after modifying this file
479-
// +kubebuilder:validation:Enum=ignored;ready;notReady
479+
// +kubebuilder:validation:Enum=ignored;ready;notReady;disabled
480480
// State indicates status of NVIDIADriver instance
481481
State State `json:"state"`
482482
// Namespace indicates a namespace in which the operator and driver are installed

bundle/manifests/nvidia.com_nvidiadrivers.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -807,6 +807,7 @@ spec:
807807
- ignored
808808
- ready
809809
- notReady
810+
- disabled
810811
type: string
811812
required:
812813
- state

config/crd/bases/nvidia.com_nvidiadrivers.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -807,6 +807,7 @@ spec:
807807
- ignored
808808
- ready
809809
- notReady
810+
- disabled
810811
type: string
811812
required:
812813
- state

controllers/nvidiadriver_controller.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,17 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
120120
}
121121
clusterPolicyInstance := clusterPolicyList.Items[0]
122122

123+
// Ensure that ClusterPolicy is configured to use NVIDIADriver CRD
124+
if !clusterPolicyInstance.Spec.Driver.UseNvidiaDriverCRDType() {
125+
msg := "useNvidiaDriverCRD is not enabled in ClusterPolicy"
126+
logger.V(consts.LogLevelWarning).Info("NVIDIADriver reconciliation skipped", "reason", msg)
127+
instance.Status.State = nvidiav1alpha1.Disabled
128+
if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.Reconciled, msg); condErr != nil {
129+
logger.Error(condErr, "failed to set condition")
130+
}
131+
return reconcile.Result{}, nil
132+
}
133+
123134
// Create a new InfoCatalog which is a generic interface for passing information to state managers
124135
infoCatalog := state.NewInfoCatalog()
125136

Lines changed: 195 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,195 @@
1+
/**
2+
# Copyright (c) NVIDIA CORPORATION. All rights reserved.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
**/
16+
17+
package controllers
18+
19+
import (
20+
"bytes"
21+
"context"
22+
"errors"
23+
"testing"
24+
25+
"github.com/go-logr/logr"
26+
"github.com/go-logr/zapr"
27+
"github.com/stretchr/testify/require"
28+
"go.uber.org/zap"
29+
"go.uber.org/zap/zapcore"
30+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
31+
"k8s.io/apimachinery/pkg/runtime"
32+
"k8s.io/apimachinery/pkg/types"
33+
"k8s.io/utils/ptr"
34+
ctrl "sigs.k8s.io/controller-runtime"
35+
"sigs.k8s.io/controller-runtime/pkg/client/fake"
36+
37+
gpuv1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1"
38+
nvidiav1alpha1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1alpha1"
39+
"github.com/NVIDIA/gpu-operator/internal/validator"
40+
)
41+
42+
// FakeConditionUpdater implements conditions.Updater
43+
// It always returns CustomError if set
44+
type FakeConditionUpdater struct {
45+
CustomError error
46+
}
47+
48+
// SetConditionsError always returns CustomError if set
49+
func (f *FakeConditionUpdater) SetConditionsError(ctx context.Context, obj any, condType, msg string) error {
50+
return f.CustomError
51+
}
52+
53+
// SetConditionsReady always returns CustomError if set
54+
func (f *FakeConditionUpdater) SetConditionsReady(ctx context.Context, obj any, condType, msg string) error {
55+
return f.CustomError
56+
}
57+
58+
// FakeNodeSelectorValidator always returns CustomError if set
59+
type FakeNodeSelectorValidator struct {
60+
CustomError error
61+
}
62+
63+
// Validate always returns CustomError if set
64+
func (f *FakeNodeSelectorValidator) Validate(ctx context.Context, cr *nvidiav1alpha1.NVIDIADriver) error {
65+
return f.CustomError
66+
}
67+
68+
// newTestLogger creates a zap.Logger that writes to an in-memory buffer for testing
69+
func newTestLogger() (logr.Logger, *bytes.Buffer) {
70+
buf := &bytes.Buffer{}
71+
72+
encoder := zapcore.NewConsoleEncoder(zapcore.EncoderConfig{
73+
MessageKey: "msg",
74+
LevelKey: "level",
75+
NameKey: "logger",
76+
CallerKey: "caller",
77+
EncodeLevel: zapcore.CapitalLevelEncoder,
78+
EncodeCaller: zapcore.ShortCallerEncoder,
79+
EncodeName: zapcore.FullNameEncoder,
80+
})
81+
82+
core := zapcore.NewCore(encoder, zapcore.AddSync(buf), zapcore.DebugLevel)
83+
zapLogger := zap.New(core)
84+
85+
return zapr.NewLogger(zapLogger), buf
86+
}
87+
88+
// TestReconcile tests that reconciliation proceeds or skips based on the
89+
// ClusterPolicy and NVIDIADriver. Since Reconcile() does in-memory updates,
90+
// we just ensure it does not error out or the error is handled gracefully.
91+
func TestReconcile(t *testing.T) {
92+
scheme := runtime.NewScheme()
93+
require.NoError(t, nvidiav1alpha1.AddToScheme(scheme))
94+
require.NoError(t, gpuv1.AddToScheme(scheme))
95+
96+
tests := []struct {
97+
name string
98+
useCRD *bool
99+
spec nvidiav1alpha1.NVIDIADriverSpec
100+
validator validator.Validator
101+
error error
102+
expectedLog string
103+
}{
104+
{
105+
name: "ClusterPolicy has driver CRD false → reconciliation skips driver",
106+
useCRD: ptr.To(false),
107+
validator: &FakeNodeSelectorValidator{
108+
CustomError: errors.New("fake list error"),
109+
},
110+
error: nil,
111+
expectedLog: "useNvidiaDriverCRD is not enabled in ClusterPolicy",
112+
},
113+
{
114+
name: "ClusterPolicy has driver CRD true but validator errors",
115+
useCRD: ptr.To(true),
116+
validator: &FakeNodeSelectorValidator{
117+
CustomError: errors.New("fake list error"),
118+
},
119+
error: nil,
120+
expectedLog: "nodeSelector validation failed",
121+
},
122+
{
123+
name: "driver CRD true, no validator errors, use precompiled drivers and GDS enabled",
124+
useCRD: ptr.To(true),
125+
spec: nvidiav1alpha1.NVIDIADriverSpec{
126+
UsePrecompiled: ptr.To(true),
127+
GPUDirectStorage: &nvidiav1alpha1.GPUDirectStorageSpec{
128+
Enabled: ptr.To(true),
129+
},
130+
},
131+
validator: &FakeNodeSelectorValidator{
132+
CustomError: nil,
133+
},
134+
error: nil,
135+
expectedLog: "GPUDirect Storage driver (nvidia-fs) and/or GDRCopy driver is not supported along with pre-compiled NVIDIA drivers",
136+
},
137+
}
138+
139+
for _, tc := range tests {
140+
t.Run(tc.name, func(t *testing.T) {
141+
logger, buf := newTestLogger()
142+
ctx := ctrl.LoggerInto(context.Background(), logger)
143+
driver := &nvidiav1alpha1.NVIDIADriver{
144+
ObjectMeta: metav1.ObjectMeta{
145+
Name: "test-driver",
146+
Namespace: "default",
147+
},
148+
Spec: tc.spec,
149+
}
150+
151+
cp := &gpuv1.ClusterPolicy{
152+
ObjectMeta: metav1.ObjectMeta{Name: "default"},
153+
Spec: gpuv1.ClusterPolicySpec{
154+
Driver: gpuv1.DriverSpec{
155+
UseNvidiaDriverCRD: tc.useCRD,
156+
},
157+
},
158+
}
159+
160+
// Initialize fake client with ClusterPolicy (driver optional)
161+
clientBuilder := fake.NewClientBuilder().WithScheme(scheme).WithObjects(cp, driver)
162+
client := clientBuilder.Build()
163+
164+
updater := &FakeConditionUpdater{}
165+
166+
reconciler := &NVIDIADriverReconciler{
167+
Client: client,
168+
Scheme: scheme,
169+
conditionUpdater: updater,
170+
nodeSelectorValidator: tc.validator,
171+
}
172+
173+
req := ctrl.Request{
174+
NamespacedName: types.NamespacedName{
175+
Name: driver.Name,
176+
Namespace: driver.Namespace,
177+
},
178+
}
179+
180+
_, err := reconciler.Reconcile(ctx, req)
181+
182+
if tc.error != nil {
183+
require.Error(t, err)
184+
require.EqualError(t, err, tc.error.Error())
185+
} else {
186+
require.NoError(t, err)
187+
}
188+
189+
logs := buf.String()
190+
if tc.expectedLog != "" {
191+
require.Contains(t, logs, tc.expectedLog)
192+
}
193+
})
194+
}
195+
}

controllers/state_manager.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -942,7 +942,7 @@ func (n *ClusterPolicyController) step() (gpuv1.State, error) {
942942
// before managing them. Clusterpolicy controller should not be creating /
943943
// updating / deleting objects owned by another controller.
944944
if (n.stateNames[n.idx] == "state-driver" || n.stateNames[n.idx] == "state-vgpu-manager") &&
945-
n.singleton.Spec.Driver.UseNvdiaDriverCRDType() {
945+
n.singleton.Spec.Driver.UseNvidiaDriverCRDType() {
946946
n.logger.Info("NVIDIADriver CRD is enabled, cleaning up all NVIDIA driver daemonsets owned by ClusterPolicy")
947947
n.idx++
948948
// Cleanup all driver daemonsets owned by ClusterPolicy.

controllers/upgrade_controller.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,7 @@ func (r *UpgradeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
130130
driverLabelKey := DriverLabelKey
131131
driverLabelValue := DriverLabelValue
132132

133-
if clusterPolicy.Spec.Driver.UseNvdiaDriverCRDType() {
133+
if clusterPolicy.Spec.Driver.UseNvidiaDriverCRDType() {
134134
// app component label is added for all new driver daemonsets deployed by NVIDIADriver controller
135135
driverLabelKey = AppComponentLabelKey
136136
driverLabelValue = AppComponentLabelValue

deployments/gpu-operator/crds/nvidia.com_nvidiadrivers.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -807,6 +807,7 @@ spec:
807807
- ignored
808808
- ready
809809
- notReady
810+
- disabled
810811
type: string
811812
required:
812813
- state

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ require (
1010
github.com/NVIDIA/nvidia-container-toolkit v1.18.1
1111
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc
1212
github.com/go-logr/logr v1.4.3
13+
github.com/go-logr/zapr v1.3.0
1314
github.com/onsi/ginkgo/v2 v2.27.2
1415
github.com/onsi/gomega v1.38.2
1516
github.com/openshift/api v0.0.0-20251021124544-a2cb0c5d994d
@@ -52,7 +53,6 @@ require (
5253
github.com/fsnotify/fsnotify v1.9.0 // indirect
5354
github.com/fxamacker/cbor/v2 v2.9.0 // indirect
5455
github.com/go-errors/errors v1.5.1 // indirect
55-
github.com/go-logr/zapr v1.3.0 // indirect
5656
github.com/go-openapi/jsonpointer v0.21.1 // indirect
5757
github.com/go-openapi/jsonreference v0.21.0 // indirect
5858
github.com/go-openapi/swag v0.23.1 // indirect

0 commit comments

Comments
 (0)