diff --git a/.ko.yaml b/.ko.yaml index 2c2a063d9..177154fea 100644 --- a/.ko.yaml +++ b/.ko.yaml @@ -18,7 +18,6 @@ platforms: [linux/amd64, linux/arm64] env: [CGO_ENABLED=0] builds: - - id: fault-quarantine dir: fault-quarantine main: . @@ -69,7 +68,7 @@ builds: org.opencontainers.image.version: "{{.Env.VERSION}}" org.opencontainers.image.revision: "{{.Env.GIT_COMMIT}}" org.opencontainers.image.created: "{{.Env.BUILD_DATE}}" - + - id: csp-health-monitor dir: health-monitors/csp-health-monitor main: ./cmd/csp-health-monitor @@ -155,6 +154,91 @@ builds: org.opencontainers.image.revision: "{{.Env.GIT_COMMIT}}" org.opencontainers.image.created: "{{.Env.BUILD_DATE}}" + - id: janitor-provider-aws + dir: providers/aws + main: . + ldflags: + - "-s -w" + - "-X main.version={{.Env.VERSION}} -X main.commit={{.Env.GIT_COMMIT}} -X main.date={{.Env.BUILD_DATE}}" + annotations: + org.opencontainers.image.description: "AWS provider for the NVSentinel Janitor" + labels: + org.opencontainers.image.source: "https://github.com/nvidia/nvsentinel" + org.opencontainers.image.licenses: "Apache-2.0" + org.opencontainers.image.title: "janitor-provider-aws" + org.opencontainers.image.description: "AWS implementation of the CSP provider interface for Janitor" + org.opencontainers.image.version: "{{.Env.VERSION}}" + org.opencontainers.image.revision: "{{.Env.GIT_COMMIT}}" + org.opencontainers.image.created: "{{.Env.BUILD_DATE}}" + + - id: janitor-provider-azure + dir: providers/azure + main: . 
+ ldflags: + - "-s -w" + - "-X main.version={{.Env.VERSION}} -X main.commit={{.Env.GIT_COMMIT}} -X main.date={{.Env.BUILD_DATE}}" + annotations: + org.opencontainers.image.description: "Azure provider for the NVSentinel Janitor" + labels: + org.opencontainers.image.source: "https://github.com/nvidia/nvsentinel" + org.opencontainers.image.licenses: "Apache-2.0" + org.opencontainers.image.title: "janitor-provider-azure" + org.opencontainers.image.description: "Azure implementation of the CSP provider interface for Janitor" + org.opencontainers.image.version: "{{.Env.VERSION}}" + org.opencontainers.image.revision: "{{.Env.GIT_COMMIT}}" + org.opencontainers.image.created: "{{.Env.BUILD_DATE}}" + + - id: janitor-provider-gcp + dir: providers/gcp + main: . + ldflags: + - "-s -w" + - "-X main.version={{.Env.VERSION}} -X main.commit={{.Env.GIT_COMMIT}} -X main.date={{.Env.BUILD_DATE}}" + annotations: + org.opencontainers.image.description: "GCP provider for the NVSentinel Janitor" + labels: + org.opencontainers.image.source: "https://github.com/nvidia/nvsentinel" + org.opencontainers.image.licenses: "Apache-2.0" + org.opencontainers.image.title: "janitor-provider-gcp" + org.opencontainers.image.description: "GCP implementation of the CSP provider interface for Janitor" + org.opencontainers.image.version: "{{.Env.VERSION}}" + org.opencontainers.image.revision: "{{.Env.GIT_COMMIT}}" + org.opencontainers.image.created: "{{.Env.BUILD_DATE}}" + + - id: janitor-provider-kwok + dir: providers/kwok + main: . 
+ ldflags: + - "-s -w" + - "-X main.version={{.Env.VERSION}} -X main.commit={{.Env.GIT_COMMIT}} -X main.date={{.Env.BUILD_DATE}}" + annotations: + org.opencontainers.image.description: "KWOK provider for the NVSentinel Janitor" + labels: + org.opencontainers.image.source: "https://github.com/nvidia/nvsentinel" + org.opencontainers.image.licenses: "Apache-2.0" + org.opencontainers.image.title: "janitor-provider-kwok" + org.opencontainers.image.description: "KWOK implementation of the CSP provider interface for Janitor" + org.opencontainers.image.version: "{{.Env.VERSION}}" + org.opencontainers.image.revision: "{{.Env.GIT_COMMIT}}" + org.opencontainers.image.created: "{{.Env.BUILD_DATE}}" + + - id: janitor-provider-oci + dir: providers/oci + main: . + ldflags: + - "-s -w" + - "-X main.version={{.Env.VERSION}} -X main.commit={{.Env.GIT_COMMIT}} -X main.date={{.Env.BUILD_DATE}}" + annotations: + org.opencontainers.image.description: "OCI provider for the NVSentinel Janitor" + labels: + org.opencontainers.image.source: "https://github.com/nvidia/nvsentinel" + org.opencontainers.image.licenses: "Apache-2.0" + org.opencontainers.image.title: "janitor-provider-oci" + org.opencontainers.image.description: "OCI implementation of the CSP provider interface for Janitor" + org.opencontainers.image.version: "{{.Env.VERSION}}" + org.opencontainers.image.revision: "{{.Env.GIT_COMMIT}}" + org.opencontainers.image.created: "{{.Env.BUILD_DATE}}" + - id: platform-connectors dir: platform-connectors main: . diff --git a/api/gen/go/csp/v1alpha1/provider.pb.go b/api/gen/go/csp/v1alpha1/provider.pb.go new file mode 100644 index 000000000..ae5a9f030 --- /dev/null +++ b/api/gen/go/csp/v1alpha1/provider.pb.go @@ -0,0 +1,383 @@ +// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by protoc-gen-go. DO NOT EDIT. +// versions: +// protoc-gen-go v1.36.10 +// protoc v6.33.0 +// source: csp/v1alpha1/provider.proto + +package cspv1alpha1 + +import ( + protoreflect "google.golang.org/protobuf/reflect/protoreflect" + protoimpl "google.golang.org/protobuf/runtime/protoimpl" + reflect "reflect" + sync "sync" + unsafe "unsafe" +) + +const ( + // Verify that this generated code is sufficiently up-to-date. + _ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion) + // Verify that runtime/protoimpl is sufficiently up-to-date. + _ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20) +) + +type SendRebootSignalRequest struct { + state protoimpl.MessageState `protogen:"open.v1"` + NodeName string `protobuf:"bytes,1,opt,name=node_name,json=nodeName,proto3" json:"node_name,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *SendRebootSignalRequest) Reset() { + *x = SendRebootSignalRequest{} + mi := &file_csp_v1alpha1_provider_proto_msgTypes[0] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *SendRebootSignalRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*SendRebootSignalRequest) ProtoMessage() {} + +func (x *SendRebootSignalRequest) ProtoReflect() protoreflect.Message { + mi := &file_csp_v1alpha1_provider_proto_msgTypes[0] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return 
mi.MessageOf(x) +} + +// Deprecated: Use SendRebootSignalRequest.ProtoReflect.Descriptor instead. +func (*SendRebootSignalRequest) Descriptor() ([]byte, []int) { + return file_csp_v1alpha1_provider_proto_rawDescGZIP(), []int{0} +} + +func (x *SendRebootSignalRequest) GetNodeName() string { + if x != nil { + return x.NodeName + } + return "" +} + +type SendRebootSignalResponse struct { + state protoimpl.MessageState `protogen:"open.v1"` + RequestId string `protobuf:"bytes,1,opt,name=request_id,json=requestId,proto3" json:"request_id,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *SendRebootSignalResponse) Reset() { + *x = SendRebootSignalResponse{} + mi := &file_csp_v1alpha1_provider_proto_msgTypes[1] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *SendRebootSignalResponse) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*SendRebootSignalResponse) ProtoMessage() {} + +func (x *SendRebootSignalResponse) ProtoReflect() protoreflect.Message { + mi := &file_csp_v1alpha1_provider_proto_msgTypes[1] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use SendRebootSignalResponse.ProtoReflect.Descriptor instead. 
+func (*SendRebootSignalResponse) Descriptor() ([]byte, []int) { + return file_csp_v1alpha1_provider_proto_rawDescGZIP(), []int{1} +} + +func (x *SendRebootSignalResponse) GetRequestId() string { + if x != nil { + return x.RequestId + } + return "" +} + +type IsNodeReadyRequest struct { + state protoimpl.MessageState `protogen:"open.v1"` + NodeName string `protobuf:"bytes,1,opt,name=node_name,json=nodeName,proto3" json:"node_name,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *IsNodeReadyRequest) Reset() { + *x = IsNodeReadyRequest{} + mi := &file_csp_v1alpha1_provider_proto_msgTypes[2] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *IsNodeReadyRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*IsNodeReadyRequest) ProtoMessage() {} + +func (x *IsNodeReadyRequest) ProtoReflect() protoreflect.Message { + mi := &file_csp_v1alpha1_provider_proto_msgTypes[2] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use IsNodeReadyRequest.ProtoReflect.Descriptor instead. 
+func (*IsNodeReadyRequest) Descriptor() ([]byte, []int) { + return file_csp_v1alpha1_provider_proto_rawDescGZIP(), []int{2} +} + +func (x *IsNodeReadyRequest) GetNodeName() string { + if x != nil { + return x.NodeName + } + return "" +} + +type IsNodeReadyResponse struct { + state protoimpl.MessageState `protogen:"open.v1"` + IsReady bool `protobuf:"varint,1,opt,name=is_ready,json=isReady,proto3" json:"is_ready,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *IsNodeReadyResponse) Reset() { + *x = IsNodeReadyResponse{} + mi := &file_csp_v1alpha1_provider_proto_msgTypes[3] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *IsNodeReadyResponse) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*IsNodeReadyResponse) ProtoMessage() {} + +func (x *IsNodeReadyResponse) ProtoReflect() protoreflect.Message { + mi := &file_csp_v1alpha1_provider_proto_msgTypes[3] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use IsNodeReadyResponse.ProtoReflect.Descriptor instead. 
+func (*IsNodeReadyResponse) Descriptor() ([]byte, []int) { + return file_csp_v1alpha1_provider_proto_rawDescGZIP(), []int{3} +} + +func (x *IsNodeReadyResponse) GetIsReady() bool { + if x != nil { + return x.IsReady + } + return false +} + +type SendTerminateSignalRequest struct { + state protoimpl.MessageState `protogen:"open.v1"` + NodeName string `protobuf:"bytes,1,opt,name=node_name,json=nodeName,proto3" json:"node_name,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *SendTerminateSignalRequest) Reset() { + *x = SendTerminateSignalRequest{} + mi := &file_csp_v1alpha1_provider_proto_msgTypes[4] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *SendTerminateSignalRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*SendTerminateSignalRequest) ProtoMessage() {} + +func (x *SendTerminateSignalRequest) ProtoReflect() protoreflect.Message { + mi := &file_csp_v1alpha1_provider_proto_msgTypes[4] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use SendTerminateSignalRequest.ProtoReflect.Descriptor instead. 
+func (*SendTerminateSignalRequest) Descriptor() ([]byte, []int) { + return file_csp_v1alpha1_provider_proto_rawDescGZIP(), []int{4} +} + +func (x *SendTerminateSignalRequest) GetNodeName() string { + if x != nil { + return x.NodeName + } + return "" +} + +type SendTerminateSignalResponse struct { + state protoimpl.MessageState `protogen:"open.v1"` + RequestId string `protobuf:"bytes,1,opt,name=request_id,json=requestId,proto3" json:"request_id,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *SendTerminateSignalResponse) Reset() { + *x = SendTerminateSignalResponse{} + mi := &file_csp_v1alpha1_provider_proto_msgTypes[5] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *SendTerminateSignalResponse) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*SendTerminateSignalResponse) ProtoMessage() {} + +func (x *SendTerminateSignalResponse) ProtoReflect() protoreflect.Message { + mi := &file_csp_v1alpha1_provider_proto_msgTypes[5] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use SendTerminateSignalResponse.ProtoReflect.Descriptor instead. 
+func (*SendTerminateSignalResponse) Descriptor() ([]byte, []int) { + return file_csp_v1alpha1_provider_proto_rawDescGZIP(), []int{5} +} + +func (x *SendTerminateSignalResponse) GetRequestId() string { + if x != nil { + return x.RequestId + } + return "" +} + +var File_csp_v1alpha1_provider_proto protoreflect.FileDescriptor + +const file_csp_v1alpha1_provider_proto_rawDesc = "" + + "\n" + + "\x1bcsp/v1alpha1/provider.proto\x12\"nvidia.nvsentinel.janitor.v1alpha1\"6\n" + + "\x17SendRebootSignalRequest\x12\x1b\n" + + "\tnode_name\x18\x01 \x01(\tR\bnodeName\"9\n" + + "\x18SendRebootSignalResponse\x12\x1d\n" + + "\n" + + "request_id\x18\x01 \x01(\tR\trequestId\"1\n" + + "\x12IsNodeReadyRequest\x12\x1b\n" + + "\tnode_name\x18\x01 \x01(\tR\bnodeName\"0\n" + + "\x13IsNodeReadyResponse\x12\x19\n" + + "\bis_ready\x18\x01 \x01(\bR\aisReady\"9\n" + + "\x1aSendTerminateSignalRequest\x12\x1b\n" + + "\tnode_name\x18\x01 \x01(\tR\bnodeName\"<\n" + + "\x1bSendTerminateSignalResponse\x12\x1d\n" + + "\n" + + "request_id\x18\x01 \x01(\tR\trequestId2\xc4\x03\n" + + "\x12CSPProviderService\x12\x8f\x01\n" + + "\x10SendRebootSignal\x12;.nvidia.nvsentinel.janitor.v1alpha1.SendRebootSignalRequest\x1a<.nvidia.nvsentinel.janitor.v1alpha1.SendRebootSignalResponse\"\x00\x12\x80\x01\n" + + "\vIsNodeReady\x126.nvidia.nvsentinel.janitor.v1alpha1.IsNodeReadyRequest\x1a7.nvidia.nvsentinel.janitor.v1alpha1.IsNodeReadyResponse\"\x00\x12\x98\x01\n" + + "\x13SendTerminateSignal\x12>.nvidia.nvsentinel.janitor.v1alpha1.SendTerminateSignalRequest\x1a?.nvidia.nvsentinel.janitor.v1alpha1.SendTerminateSignalResponse\"\x00BCZAgithub.com/nvidia/nvsentinel/janitor/api/csp/v1alpha1;cspv1alpha1b\x06proto3" + +var ( + file_csp_v1alpha1_provider_proto_rawDescOnce sync.Once + file_csp_v1alpha1_provider_proto_rawDescData []byte +) + +func file_csp_v1alpha1_provider_proto_rawDescGZIP() []byte { + file_csp_v1alpha1_provider_proto_rawDescOnce.Do(func() { + file_csp_v1alpha1_provider_proto_rawDescData = 
protoimpl.X.CompressGZIP(unsafe.Slice(unsafe.StringData(file_csp_v1alpha1_provider_proto_rawDesc), len(file_csp_v1alpha1_provider_proto_rawDesc))) + }) + return file_csp_v1alpha1_provider_proto_rawDescData +} + +var file_csp_v1alpha1_provider_proto_msgTypes = make([]protoimpl.MessageInfo, 6) +var file_csp_v1alpha1_provider_proto_goTypes = []any{ + (*SendRebootSignalRequest)(nil), // 0: nvidia.nvsentinel.janitor.v1alpha1.SendRebootSignalRequest + (*SendRebootSignalResponse)(nil), // 1: nvidia.nvsentinel.janitor.v1alpha1.SendRebootSignalResponse + (*IsNodeReadyRequest)(nil), // 2: nvidia.nvsentinel.janitor.v1alpha1.IsNodeReadyRequest + (*IsNodeReadyResponse)(nil), // 3: nvidia.nvsentinel.janitor.v1alpha1.IsNodeReadyResponse + (*SendTerminateSignalRequest)(nil), // 4: nvidia.nvsentinel.janitor.v1alpha1.SendTerminateSignalRequest + (*SendTerminateSignalResponse)(nil), // 5: nvidia.nvsentinel.janitor.v1alpha1.SendTerminateSignalResponse +} +var file_csp_v1alpha1_provider_proto_depIdxs = []int32{ + 0, // 0: nvidia.nvsentinel.janitor.v1alpha1.CSPProviderService.SendRebootSignal:input_type -> nvidia.nvsentinel.janitor.v1alpha1.SendRebootSignalRequest + 2, // 1: nvidia.nvsentinel.janitor.v1alpha1.CSPProviderService.IsNodeReady:input_type -> nvidia.nvsentinel.janitor.v1alpha1.IsNodeReadyRequest + 4, // 2: nvidia.nvsentinel.janitor.v1alpha1.CSPProviderService.SendTerminateSignal:input_type -> nvidia.nvsentinel.janitor.v1alpha1.SendTerminateSignalRequest + 1, // 3: nvidia.nvsentinel.janitor.v1alpha1.CSPProviderService.SendRebootSignal:output_type -> nvidia.nvsentinel.janitor.v1alpha1.SendRebootSignalResponse + 3, // 4: nvidia.nvsentinel.janitor.v1alpha1.CSPProviderService.IsNodeReady:output_type -> nvidia.nvsentinel.janitor.v1alpha1.IsNodeReadyResponse + 5, // 5: nvidia.nvsentinel.janitor.v1alpha1.CSPProviderService.SendTerminateSignal:output_type -> nvidia.nvsentinel.janitor.v1alpha1.SendTerminateSignalResponse + 3, // [3:6] is the sub-list for method output_type + 0, // 
[0:3] is the sub-list for method input_type + 0, // [0:0] is the sub-list for extension type_name + 0, // [0:0] is the sub-list for extension extendee + 0, // [0:0] is the sub-list for field type_name +} + +func init() { file_csp_v1alpha1_provider_proto_init() } +func file_csp_v1alpha1_provider_proto_init() { + if File_csp_v1alpha1_provider_proto != nil { + return + } + type x struct{} + out := protoimpl.TypeBuilder{ + File: protoimpl.DescBuilder{ + GoPackagePath: reflect.TypeOf(x{}).PkgPath(), + RawDescriptor: unsafe.Slice(unsafe.StringData(file_csp_v1alpha1_provider_proto_rawDesc), len(file_csp_v1alpha1_provider_proto_rawDesc)), + NumEnums: 0, + NumMessages: 6, + NumExtensions: 0, + NumServices: 1, + }, + GoTypes: file_csp_v1alpha1_provider_proto_goTypes, + DependencyIndexes: file_csp_v1alpha1_provider_proto_depIdxs, + MessageInfos: file_csp_v1alpha1_provider_proto_msgTypes, + }.Build() + File_csp_v1alpha1_provider_proto = out.File + file_csp_v1alpha1_provider_proto_goTypes = nil + file_csp_v1alpha1_provider_proto_depIdxs = nil +} diff --git a/api/gen/go/csp/v1alpha1/provider_grpc.pb.go b/api/gen/go/csp/v1alpha1/provider_grpc.pb.go new file mode 100644 index 000000000..1e5ea5abf --- /dev/null +++ b/api/gen/go/csp/v1alpha1/provider_grpc.pb.go @@ -0,0 +1,211 @@ +// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by protoc-gen-go-grpc. DO NOT EDIT. 
+// versions: +// - protoc-gen-go-grpc v1.5.1 +// - protoc v6.33.0 +// source: csp/v1alpha1/provider.proto + +package cspv1alpha1 + +import ( + context "context" + grpc "google.golang.org/grpc" + codes "google.golang.org/grpc/codes" + status "google.golang.org/grpc/status" +) + +// This is a compile-time assertion to ensure that this generated file +// is compatible with the grpc package it is being compiled against. +// Requires gRPC-Go v1.64.0 or later. +const _ = grpc.SupportPackageIsVersion9 + +const ( + CSPProviderService_SendRebootSignal_FullMethodName = "/nvidia.nvsentinel.janitor.v1alpha1.CSPProviderService/SendRebootSignal" + CSPProviderService_IsNodeReady_FullMethodName = "/nvidia.nvsentinel.janitor.v1alpha1.CSPProviderService/IsNodeReady" + CSPProviderService_SendTerminateSignal_FullMethodName = "/nvidia.nvsentinel.janitor.v1alpha1.CSPProviderService/SendTerminateSignal" +) + +// CSPProviderServiceClient is the client API for CSPProviderService service. +// +// For semantics around ctx use and closing/ending streaming RPCs, please refer to https://pkg.go.dev/google.golang.org/grpc/?tab=doc#ClientConn.NewStream. 
+type CSPProviderServiceClient interface { + SendRebootSignal(ctx context.Context, in *SendRebootSignalRequest, opts ...grpc.CallOption) (*SendRebootSignalResponse, error) + IsNodeReady(ctx context.Context, in *IsNodeReadyRequest, opts ...grpc.CallOption) (*IsNodeReadyResponse, error) + SendTerminateSignal(ctx context.Context, in *SendTerminateSignalRequest, opts ...grpc.CallOption) (*SendTerminateSignalResponse, error) +} + +type cSPProviderServiceClient struct { + cc grpc.ClientConnInterface +} + +func NewCSPProviderServiceClient(cc grpc.ClientConnInterface) CSPProviderServiceClient { + return &cSPProviderServiceClient{cc} +} + +func (c *cSPProviderServiceClient) SendRebootSignal(ctx context.Context, in *SendRebootSignalRequest, opts ...grpc.CallOption) (*SendRebootSignalResponse, error) { + cOpts := append([]grpc.CallOption{grpc.StaticMethod()}, opts...) + out := new(SendRebootSignalResponse) + err := c.cc.Invoke(ctx, CSPProviderService_SendRebootSignal_FullMethodName, in, out, cOpts...) + if err != nil { + return nil, err + } + return out, nil +} + +func (c *cSPProviderServiceClient) IsNodeReady(ctx context.Context, in *IsNodeReadyRequest, opts ...grpc.CallOption) (*IsNodeReadyResponse, error) { + cOpts := append([]grpc.CallOption{grpc.StaticMethod()}, opts...) + out := new(IsNodeReadyResponse) + err := c.cc.Invoke(ctx, CSPProviderService_IsNodeReady_FullMethodName, in, out, cOpts...) + if err != nil { + return nil, err + } + return out, nil +} + +func (c *cSPProviderServiceClient) SendTerminateSignal(ctx context.Context, in *SendTerminateSignalRequest, opts ...grpc.CallOption) (*SendTerminateSignalResponse, error) { + cOpts := append([]grpc.CallOption{grpc.StaticMethod()}, opts...) + out := new(SendTerminateSignalResponse) + err := c.cc.Invoke(ctx, CSPProviderService_SendTerminateSignal_FullMethodName, in, out, cOpts...) + if err != nil { + return nil, err + } + return out, nil +} + +// CSPProviderServiceServer is the server API for CSPProviderService service. 
+// All implementations must embed UnimplementedCSPProviderServiceServer +// for forward compatibility. +type CSPProviderServiceServer interface { + SendRebootSignal(context.Context, *SendRebootSignalRequest) (*SendRebootSignalResponse, error) + IsNodeReady(context.Context, *IsNodeReadyRequest) (*IsNodeReadyResponse, error) + SendTerminateSignal(context.Context, *SendTerminateSignalRequest) (*SendTerminateSignalResponse, error) + mustEmbedUnimplementedCSPProviderServiceServer() +} + +// UnimplementedCSPProviderServiceServer must be embedded to have +// forward compatible implementations. +// +// NOTE: this should be embedded by value instead of pointer to avoid a nil +// pointer dereference when methods are called. +type UnimplementedCSPProviderServiceServer struct{} + +func (UnimplementedCSPProviderServiceServer) SendRebootSignal(context.Context, *SendRebootSignalRequest) (*SendRebootSignalResponse, error) { + return nil, status.Errorf(codes.Unimplemented, "method SendRebootSignal not implemented") +} +func (UnimplementedCSPProviderServiceServer) IsNodeReady(context.Context, *IsNodeReadyRequest) (*IsNodeReadyResponse, error) { + return nil, status.Errorf(codes.Unimplemented, "method IsNodeReady not implemented") +} +func (UnimplementedCSPProviderServiceServer) SendTerminateSignal(context.Context, *SendTerminateSignalRequest) (*SendTerminateSignalResponse, error) { + return nil, status.Errorf(codes.Unimplemented, "method SendTerminateSignal not implemented") +} +func (UnimplementedCSPProviderServiceServer) mustEmbedUnimplementedCSPProviderServiceServer() {} +func (UnimplementedCSPProviderServiceServer) testEmbeddedByValue() {} + +// UnsafeCSPProviderServiceServer may be embedded to opt out of forward compatibility for this service. +// Use of this interface is not recommended, as added methods to CSPProviderServiceServer will +// result in compilation errors. 
+type UnsafeCSPProviderServiceServer interface { + mustEmbedUnimplementedCSPProviderServiceServer() +} + +func RegisterCSPProviderServiceServer(s grpc.ServiceRegistrar, srv CSPProviderServiceServer) { + // If the following call pancis, it indicates UnimplementedCSPProviderServiceServer was + // embedded by pointer and is nil. This will cause panics if an + // unimplemented method is ever invoked, so we test this at initialization + // time to prevent it from happening at runtime later due to I/O. + if t, ok := srv.(interface{ testEmbeddedByValue() }); ok { + t.testEmbeddedByValue() + } + s.RegisterService(&CSPProviderService_ServiceDesc, srv) +} + +func _CSPProviderService_SendRebootSignal_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { + in := new(SendRebootSignalRequest) + if err := dec(in); err != nil { + return nil, err + } + if interceptor == nil { + return srv.(CSPProviderServiceServer).SendRebootSignal(ctx, in) + } + info := &grpc.UnaryServerInfo{ + Server: srv, + FullMethod: CSPProviderService_SendRebootSignal_FullMethodName, + } + handler := func(ctx context.Context, req interface{}) (interface{}, error) { + return srv.(CSPProviderServiceServer).SendRebootSignal(ctx, req.(*SendRebootSignalRequest)) + } + return interceptor(ctx, in, info, handler) +} + +func _CSPProviderService_IsNodeReady_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { + in := new(IsNodeReadyRequest) + if err := dec(in); err != nil { + return nil, err + } + if interceptor == nil { + return srv.(CSPProviderServiceServer).IsNodeReady(ctx, in) + } + info := &grpc.UnaryServerInfo{ + Server: srv, + FullMethod: CSPProviderService_IsNodeReady_FullMethodName, + } + handler := func(ctx context.Context, req interface{}) (interface{}, error) { + return srv.(CSPProviderServiceServer).IsNodeReady(ctx, 
req.(*IsNodeReadyRequest)) + } + return interceptor(ctx, in, info, handler) +} + +func _CSPProviderService_SendTerminateSignal_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { + in := new(SendTerminateSignalRequest) + if err := dec(in); err != nil { + return nil, err + } + if interceptor == nil { + return srv.(CSPProviderServiceServer).SendTerminateSignal(ctx, in) + } + info := &grpc.UnaryServerInfo{ + Server: srv, + FullMethod: CSPProviderService_SendTerminateSignal_FullMethodName, + } + handler := func(ctx context.Context, req interface{}) (interface{}, error) { + return srv.(CSPProviderServiceServer).SendTerminateSignal(ctx, req.(*SendTerminateSignalRequest)) + } + return interceptor(ctx, in, info, handler) +} + +// CSPProviderService_ServiceDesc is the grpc.ServiceDesc for CSPProviderService service. +// It's only intended for direct use with grpc.RegisterService, +// and not to be introspected or modified (even as a copy) +var CSPProviderService_ServiceDesc = grpc.ServiceDesc{ + ServiceName: "nvidia.nvsentinel.janitor.v1alpha1.CSPProviderService", + HandlerType: (*CSPProviderServiceServer)(nil), + Methods: []grpc.MethodDesc{ + { + MethodName: "SendRebootSignal", + Handler: _CSPProviderService_SendRebootSignal_Handler, + }, + { + MethodName: "IsNodeReady", + Handler: _CSPProviderService_IsNodeReady_Handler, + }, + { + MethodName: "SendTerminateSignal", + Handler: _CSPProviderService_SendTerminateSignal_Handler, + }, + }, + Streams: []grpc.StreamDesc{}, + Metadata: "csp/v1alpha1/provider.proto", +} diff --git a/api/proto/csp/v1alpha1/provider.proto b/api/proto/csp/v1alpha1/provider.proto new file mode 100644 index 000000000..c452b3153 --- /dev/null +++ b/api/proto/csp/v1alpha1/provider.proto @@ -0,0 +1,48 @@ +// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto3"; +package nvidia.nvsentinel.janitor.v1alpha1; + +option go_package = "github.com/nvidia/nvsentinel/janitor/api/csp/v1alpha1;cspv1alpha1"; + +service CSPProviderService { + rpc SendRebootSignal(SendRebootSignalRequest) returns (SendRebootSignalResponse) {} + rpc IsNodeReady(IsNodeReadyRequest) returns (IsNodeReadyResponse) {} + rpc SendTerminateSignal(SendTerminateSignalRequest) returns (SendTerminateSignalResponse) {} +} + +message SendRebootSignalRequest { + string node_name = 1; +} + +message SendRebootSignalResponse { + string request_id = 1; +} + +message IsNodeReadyRequest { + string node_name = 1; +} + +message IsNodeReadyResponse { + bool is_ready = 1; +} + +message SendTerminateSignalRequest { + string node_name = 1; +} + +message SendTerminateSignalResponse { + string request_id = 1; +} diff --git a/distros/kubernetes/nvsentinel/Chart.yaml b/distros/kubernetes/nvsentinel/Chart.yaml index 14a586f2d..c180d1939 100644 --- a/distros/kubernetes/nvsentinel/Chart.yaml +++ b/distros/kubernetes/nvsentinel/Chart.yaml @@ -55,6 +55,9 @@ dependencies: - name: janitor version: "0.1.0" condition: global.janitor.enabled + - name: janitor-provider-kwok + version: "0.1.0" + condition: global.janitorProviderKwok.enabled,global.janitor.enabled - name: metadata-collector version: "0.1.0" condition: global.metadataCollector.enabled diff --git 
a/distros/kubernetes/nvsentinel/charts/janitor-provider-kwok/Chart.yaml b/distros/kubernetes/nvsentinel/charts/janitor-provider-kwok/Chart.yaml new file mode 100644 index 000000000..b037bfdbe --- /dev/null +++ b/distros/kubernetes/nvsentinel/charts/janitor-provider-kwok/Chart.yaml @@ -0,0 +1,19 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v2 +name: janitor-provider-kwok +description: KWOK provider for NVSentinel Janitor +type: application +version: 0.1.0 diff --git a/distros/kubernetes/nvsentinel/charts/janitor-provider-kwok/templates/_helpers.tpl b/distros/kubernetes/nvsentinel/charts/janitor-provider-kwok/templates/_helpers.tpl new file mode 100644 index 000000000..e43ceab3d --- /dev/null +++ b/distros/kubernetes/nvsentinel/charts/janitor-provider-kwok/templates/_helpers.tpl @@ -0,0 +1,62 @@ +{{/* +Expand the name of the chart. +*/}} +{{- define "provider.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. 
+*/}} +{{- define "provider.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "provider.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "provider.labels" -}} +helm.sh/chart: {{ include "provider.chart" . }} +{{ include "provider.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "provider.selectorLabels" -}} +app.kubernetes.io/name: {{ include "provider.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + +{{/* +Create the name of the service account to use +*/}} +{{- define "provider.serviceAccountName" -}} +{{- if .Values.serviceAccount.create }} +{{- default (include "provider.fullname" .) .Values.serviceAccount.name }} +{{- else }} +{{- default "default" .Values.serviceAccount.name }} +{{- end }} +{{- end }} diff --git a/distros/kubernetes/nvsentinel/charts/janitor-provider-kwok/templates/deployment.yaml b/distros/kubernetes/nvsentinel/charts/janitor-provider-kwok/templates/deployment.yaml new file mode 100644 index 000000000..ead09c8c2 --- /dev/null +++ b/distros/kubernetes/nvsentinel/charts/janitor-provider-kwok/templates/deployment.yaml @@ -0,0 +1,64 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "provider.fullname" . }} + labels: + {{- include "provider.labels" . | nindent 4}} +spec: + replicas: {{ .Values.replicaCount }} + selector: + matchLabels: + {{- include "provider.selectorLabels" . | nindent 6 }} + template: + metadata: + {{- with ((.Values.global).podAnnotations | default .Values.podAnnotations) }} + annotations: + {{- toYaml . | nindent 8 }} + {{- end }} + labels: + {{- include "provider.selectorLabels" . | nindent 8 }} + spec: + {{- with ((.Values.global).imagePullSecrets | default .Values.imagePullSecrets) }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + serviceAccountName: {{ include "provider.serviceAccountName" . }} + containers: + - name: janitor-provider-kwok + image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default ((.Values.global).image).tag | default .Chart.AppVersion }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} + resources: + {{- toYaml .Values.resources | nindent 12 }} + ports: + - name: metrics + containerPort: {{ ((.Values.global).metricsPort) | default 2112 }} + - name: service + containerPort: {{ .Values.service.port }} + protocol: TCP + restartPolicy: Always + {{- with (((.Values.global).systemNodeSelector) | default .Values.nodeSelector) }} + nodeSelector: + {{- toYaml . 
| nindent 8 }} + {{- end }} + {{- with (((.Values.global).affinity) | default .Values.affinity) }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with (((.Values.global).systemNodeTolerations) | default .Values.tolerations) }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} + diff --git a/distros/kubernetes/nvsentinel/charts/janitor-provider-kwok/templates/networkpolicy.yaml b/distros/kubernetes/nvsentinel/charts/janitor-provider-kwok/templates/networkpolicy.yaml new file mode 100644 index 000000000..a5866759f --- /dev/null +++ b/distros/kubernetes/nvsentinel/charts/janitor-provider-kwok/templates/networkpolicy.yaml @@ -0,0 +1,30 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: {{ include "provider.fullname" . }}-allow-service-traffic + labels: + {{- include "provider.labels" . | nindent 4 }} +spec: + podSelector: + matchLabels: + {{- include "provider.selectorLabels" . 
| nindent 6 }} + policyTypes: + - Ingress + ingress: + - ports: + - port: {{ .Values.service.port | default 50051 }} + protocol: TCP + diff --git a/distros/kubernetes/nvsentinel/charts/janitor-provider-kwok/templates/service.yaml b/distros/kubernetes/nvsentinel/charts/janitor-provider-kwok/templates/service.yaml new file mode 100644 index 000000000..c4418efdc --- /dev/null +++ b/distros/kubernetes/nvsentinel/charts/janitor-provider-kwok/templates/service.yaml @@ -0,0 +1,27 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +apiVersion: v1 +kind: Service +metadata: + name: {{ include "provider.fullname" . }} + labels: + {{- include "provider.labels" . | nindent 4}} +spec: + type: {{ .Values.service.type | default "ClusterIP" }} + selector: + {{- include "provider.selectorLabels" . | nindent 4 }} + ports: + - name: service + port: {{ .Values.service.port }} + targetPort: {{ .Values.service.port }} diff --git a/distros/kubernetes/nvsentinel/charts/janitor-provider-kwok/templates/serviceaccount.yaml b/distros/kubernetes/nvsentinel/charts/janitor-provider-kwok/templates/serviceaccount.yaml new file mode 100644 index 000000000..a5fc0d0f2 --- /dev/null +++ b/distros/kubernetes/nvsentinel/charts/janitor-provider-kwok/templates/serviceaccount.yaml @@ -0,0 +1,19 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "provider.serviceAccountName" . }} + labels: + {{- include "provider.labels" . | nindent 4 }} diff --git a/distros/kubernetes/nvsentinel/charts/janitor-provider-kwok/values.yaml b/distros/kubernetes/nvsentinel/charts/janitor-provider-kwok/values.yaml new file mode 100644 index 000000000..839e4f14c --- /dev/null +++ b/distros/kubernetes/nvsentinel/charts/janitor-provider-kwok/values.yaml @@ -0,0 +1,107 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +replicaCount: 1 + +# Image configuration +image: + repository: ghcr.io/nvidia/nvsentinel/janitor-provider-kwok + pullPolicy: IfNotPresent + tag: "" + +nameOverride: "" +fullnameOverride: "janitor-provider-kwok" + +serviceAccount: + # Specifies whether a service account should be created + create: true + # Automatically mount a ServiceAccount's API credentials? + automount: true + # Annotations to add to the service account + annotations: {} + # The name of the service account to use. + # If not set and create is true, a name is generated using the fullname template + name: "" + +podAnnotations: {} +podLabels: {} + +podSecurityContext: {} + # fsGroup: 2000 + +securityContext: {} + # capabilities: + # drop: + # - ALL + # readOnlyRootFilesystem: true + # runAsNonRoot: true + # runAsUser: 1000 + +resources: {} + # We usually recommend not to specify default resources and to leave this as a conscious + # choice for the user. This also increases chances charts run on environments with little + # resources, such as Minikube. If you do want to specify resources, uncomment the following + # lines, adjust them as necessary, and remove the curly braces after 'resources:'. 
+ # limits: + # cpu: 100m + # memory: 128Mi + # requests: + # cpu: 100m + # memory: 128Mi + +livenessProbe: + httpGet: + path: / + port: http +readinessProbe: + httpGet: + path: / + port: http + +autoscaling: + enabled: false + minReplicas: 1 + maxReplicas: 100 + targetCPUUtilizationPercentage: 80 + # targetMemoryUtilizationPercentage: 80 + +nodeSelector: {} + +tolerations: [] + +affinity: {} + +service: + port: 50051 + +# Metrics Configuration +metrics: + # TLS configuration for metrics server + tls: + # Enable TLS for metrics endpoint + enabled: false + # Directory containing metrics server TLS certs + certDir: "/tmp/k8s-metrics-server/metrics-certs" + # Cert-manager issuer name (defaults to webhook.certIssuer if not set) + issuerName: "" + # Cert-manager issuer kind (Issuer or ClusterIssuer) + issuerKind: "Issuer" + # Cert-manager issuer group (optional) + issuerGroup: "" + # Certificate duration (default: 90 days) + duration: "2160h" + # Certificate renewal time before expiry (default: 30 days) + renewBefore: "720h" + # Organization name for certificate subject + organization: "NVIDIA" diff --git a/distros/kubernetes/nvsentinel/charts/janitor/templates/configmap.yaml b/distros/kubernetes/nvsentinel/charts/janitor/templates/configmap.yaml index 4b6ab4274..688af360a 100644 --- a/distros/kubernetes/nvsentinel/charts/janitor/templates/configmap.yaml +++ b/distros/kubernetes/nvsentinel/charts/janitor/templates/configmap.yaml @@ -47,13 +47,20 @@ data: {{- end }} {{- end }} {{- end }} + cspProviderHost: {{ .Values.config.cspProviderHost }} rebootNodeController: enabled: {{ if (hasKey .Values.config.controllers.rebootNode "enabled") }}{{ .Values.config.controllers.rebootNode.enabled }}{{ else }}true{{ end }} timeout: {{ .Values.config.controllers.rebootNode.timeout | default .Values.config.timeout | default "25m" }} manualMode: {{ .Values.config.manualMode | default false }} - + {{- if .Values.config.controllers.rebootNode.cspProviderHost }} + cspProviderHost: {{ 
.Values.config.controllers.rebootNode.cspProviderHost }} + {{- end }} + terminateNodeController: enabled: {{ if (hasKey .Values.config.controllers.terminateNode "enabled") }}{{ .Values.config.controllers.terminateNode.enabled }}{{ else }}true{{ end }} timeout: {{ .Values.config.controllers.terminateNode.timeout | default .Values.config.timeout | default "25m" }} manualMode: {{ .Values.config.manualMode | default false }} + {{- if .Values.config.controllers.terminateNode.cspProviderHost }} + cspProviderHost: {{ .Values.config.controllers.terminateNode.cspProviderHost }} + {{- end }} diff --git a/distros/kubernetes/nvsentinel/values-tilt.yaml b/distros/kubernetes/nvsentinel/values-tilt.yaml index 3e40641c3..f6d68e238 100755 --- a/distros/kubernetes/nvsentinel/values-tilt.yaml +++ b/distros/kubernetes/nvsentinel/values-tilt.yaml @@ -22,17 +22,17 @@ global: # This ensures proper configuration for both database backends nodeSelector: {} - - tolerations: - - operator: Exists - + + tolerations: + - operator: Exists + affinity: {} - + systemNodeSelector: node-role.kubernetes.io/control-plane: "" - + systemNodeTolerations: - - operator: Exists + - operator: Exists gpuHealthMonitor: enabled: true @@ -62,9 +62,10 @@ global: inclusterFileServer: enabled: true - + janitor: enabled: true + provider: "kwok" mongodbStore: enabled: true @@ -77,13 +78,13 @@ global: eventExporter: enabled: true - + metricsPort: 2112 mongodb-store: useBitnami: true usePerconaOperator: false - + job: nodeSelector: node-role.kubernetes.io/control-plane: "" @@ -124,26 +125,26 @@ mongodb-store: - operator: Exists podDisruptionBudget: maxUnavailable: 1 - + sharding: enabled: false - + logcollector: enabled: false - + tls: mode: requireTLS secrets: keyFile: mongodb-keyfile encryptionKey: mongodb-encryption-key - + backup: enabled: false storages: {} tasks: [] volumeMounts: [] - + finalizers: [] psmdb: @@ -156,18 +157,18 @@ mongodb-store: repository: ghcr.io/rtsp/docker-mongosh tag: "2.5.2" pullPolicy: 
IfNotPresent - + # Bitnami MongoDB configuration mongodb: replicaCount: 1 nodeSelector: node-role.kubernetes.io/control-plane: "" - + tolerations: - - operator: Exists + - operator: Exists jobTolerations: - - operator: Exists + - operator: Exists image: registry: "docker.io" @@ -179,7 +180,7 @@ mongodb-store: replicaset: existingSecrets: - "mongo-server-cert-0" - + image: registry: "docker.io" repository: "bitnamilegacy/nginx" @@ -206,13 +207,13 @@ fault-quarantine: affinity: podAntiAffinity: requiredDuringSchedulingIgnoredDuringExecution: - - labelSelector: - matchLabels: - app.kubernetes.io/name: kwok - namespaceSelector: - matchLabels: - kubernetes.io/metadata.name: kube-system - topologyKey: kubernetes.io/hostname + - labelSelector: + matchLabels: + app.kubernetes.io/name: kwok + namespaceSelector: + matchLabels: + kubernetes.io/metadata.name: kube-system + topologyKey: kubernetes.io/hostname fault-remediation: logLevel: debug @@ -221,22 +222,21 @@ fault-remediation: enabled: true image: repository: localhost:5001/ghcr.io_nvidia_nvsentinel_log-collector - tag: latest - pullPolicy: Always - timeout: "10s" # Short timeout for faster testing (production default: "10m") + tag: latest + pullPolicy: Always + timeout: "10s" # Short timeout for faster testing (production default: "10m") env: - MOCK_MODE: "true" + MOCK_MODE: "true" affinity: podAntiAffinity: requiredDuringSchedulingIgnoredDuringExecution: - - labelSelector: - matchLabels: - app.kubernetes.io/name: kwok - namespaceSelector: - matchLabels: - kubernetes.io/metadata.name: kube-system - topologyKey: kubernetes.io/hostname - + - labelSelector: + matchLabels: + app.kubernetes.io/name: kwok + namespaceSelector: + matchLabels: + kubernetes.io/metadata.name: kube-system + topologyKey: kubernetes.io/hostname node-drainer: logLevel: debug @@ -244,14 +244,13 @@ node-drainer: affinity: podAntiAffinity: requiredDuringSchedulingIgnoredDuringExecution: - - labelSelector: - matchLabels: - app.kubernetes.io/name: kwok - 
namespaceSelector: - matchLabels: - kubernetes.io/metadata.name: kube-system - topologyKey: kubernetes.io/hostname - + - labelSelector: + matchLabels: + app.kubernetes.io/name: kwok + namespaceSelector: + matchLabels: + kubernetes.io/metadata.name: kube-system + topologyKey: kubernetes.io/hostname health-events-analyzer: logLevel: debug @@ -259,15 +258,17 @@ health-events-analyzer: affinity: podAntiAffinity: requiredDuringSchedulingIgnoredDuringExecution: - - labelSelector: - matchLabels: - app.kubernetes.io/name: kwok - namespaceSelector: - matchLabels: - kubernetes.io/metadata.name: kube-system - topologyKey: kubernetes.io/hostname + - labelSelector: + matchLabels: + app.kubernetes.io/name: kwok + namespaceSelector: + matchLabels: + kubernetes.io/metadata.name: kube-system + topologyKey: kubernetes.io/hostname janitor: + config: + cspProviderHost: "janitor-provider-kwok.nvsentinel.svc.cluster.local:50051" webhook: certIssuer: "janitor-selfsigned-issuer" @@ -311,13 +312,13 @@ kubernetes-object-monitor: affinity: podAntiAffinity: requiredDuringSchedulingIgnoredDuringExecution: - - labelSelector: - matchLabels: - app.kubernetes.io/name: kwok - namespaceSelector: - matchLabels: - kubernetes.io/metadata.name: kube-system - topologyKey: kubernetes.io/hostname + - labelSelector: + matchLabels: + app.kubernetes.io/name: kwok + namespaceSelector: + matchLabels: + kubernetes.io/metadata.name: kube-system + topologyKey: kubernetes.io/hostname event-exporter: replicaCount: 1 @@ -341,13 +342,13 @@ event-exporter: affinity: podAntiAffinity: requiredDuringSchedulingIgnoredDuringExecution: - - labelSelector: - matchLabels: - app.kubernetes.io/name: kwok - namespaceSelector: - matchLabels: - kubernetes.io/metadata.name: kube-system - topologyKey: kubernetes.io/hostname + - labelSelector: + matchLabels: + app.kubernetes.io/name: kwok + namespaceSelector: + matchLabels: + kubernetes.io/metadata.name: kube-system + topologyKey: kubernetes.io/hostname exporter: metadata: diff --git 
a/janitor/go.mod b/janitor/go.mod index e2b1d5a98..fc667b4a5 100644 --- a/janitor/go.mod +++ b/janitor/go.mod @@ -1,6 +1,6 @@ module github.com/nvidia/nvsentinel/janitor -go 1.25 +go 1.25.0 toolchain go1.25.3 @@ -12,6 +12,7 @@ require ( github.com/aws/aws-sdk-go-v2/config v1.31.18 github.com/aws/aws-sdk-go-v2/service/ec2 v1.254.1 github.com/go-logr/logr v1.4.3 + github.com/nvidia/nvsentinel/api v0.0.0 github.com/nvidia/nvsentinel/commons v0.0.0 github.com/onsi/ginkgo/v2 v2.26.0 github.com/onsi/gomega v1.38.2 @@ -118,7 +119,7 @@ require ( go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.34.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.34.0 // indirect go.opentelemetry.io/otel/metric v1.38.0 // indirect - go.opentelemetry.io/otel/sdk v1.37.0 // indirect + go.opentelemetry.io/otel/sdk v1.38.0 // indirect go.opentelemetry.io/otel/trace v1.38.0 // indirect go.opentelemetry.io/proto/otlp v1.5.0 // indirect go.uber.org/automaxprocs v1.6.0 // indirect @@ -141,7 +142,7 @@ require ( google.golang.org/genproto v0.0.0-20250603155806-513f23925822 // indirect google.golang.org/genproto/googleapis/api v0.0.0-20251111163417-95abcf5c77ba // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20251111163417-95abcf5c77ba // indirect - google.golang.org/grpc v1.76.0 // indirect + google.golang.org/grpc v1.77.0 // indirect google.golang.org/protobuf v1.36.10 // indirect gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect @@ -165,3 +166,5 @@ replace github.com/nvidia/nvsentinel/store-client => ../store-client replace github.com/nvidia/nvsentinel/data-models => ../data-models replace github.com/nvidia/nvsentinel/commons => ../commons + +replace github.com/nvidia/nvsentinel/api => ../api diff --git a/janitor/go.sum b/janitor/go.sum index c22d81047..e375506ad 100644 --- a/janitor/go.sum +++ b/janitor/go.sum @@ -291,10 +291,10 @@ go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.34.0 h1:tgJ0u 
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.34.0/go.mod h1:U7HYyW0zt/a9x5J1Kjs+r1f/d4ZHnYFclhYY2+YbeoE= go.opentelemetry.io/otel/metric v1.38.0 h1:Kl6lzIYGAh5M159u9NgiRkmoMKjvbsKtYRwgfrA6WpA= go.opentelemetry.io/otel/metric v1.38.0/go.mod h1:kB5n/QoRM8YwmUahxvI3bO34eVtQf2i4utNVLr9gEmI= -go.opentelemetry.io/otel/sdk v1.37.0 h1:ItB0QUqnjesGRvNcmAcU0LyvkVyGJ2xftD29bWdDvKI= -go.opentelemetry.io/otel/sdk v1.37.0/go.mod h1:VredYzxUvuo2q3WRcDnKDjbdvmO0sCzOvVAiY+yUkAg= -go.opentelemetry.io/otel/sdk/metric v1.37.0 h1:90lI228XrB9jCMuSdA0673aubgRobVZFhbjxHHspCPc= -go.opentelemetry.io/otel/sdk/metric v1.37.0/go.mod h1:cNen4ZWfiD37l5NhS+Keb5RXVWZWpRE+9WyVCpbo5ps= +go.opentelemetry.io/otel/sdk v1.38.0 h1:l48sr5YbNf2hpCUj/FoGhW9yDkl+Ma+LrVl8qaM5b+E= +go.opentelemetry.io/otel/sdk v1.38.0/go.mod h1:ghmNdGlVemJI3+ZB5iDEuk4bWA3GkTpW+DOoZMYBVVg= +go.opentelemetry.io/otel/sdk/metric v1.38.0 h1:aSH66iL0aZqo//xXzQLYozmWrXxyFkBJ6qT5wthqPoM= +go.opentelemetry.io/otel/sdk/metric v1.38.0/go.mod h1:dg9PBnW9XdQ1Hd6ZnRz689CbtrUp0wMMs9iPcgT9EZA= go.opentelemetry.io/otel/trace v1.38.0 h1:Fxk5bKrDZJUH+AMyyIXGcFAPah0oRcT+LuNtJrmcNLE= go.opentelemetry.io/otel/trace v1.38.0/go.mod h1:j1P9ivuFsTceSWe1oY+EeW3sc+Pp42sO++GHkg4wwhs= go.opentelemetry.io/proto/otlp v1.5.0 h1:xJvq7gMzB31/d406fB8U5CBdyQGw4P399D1aQWU/3i4= @@ -371,8 +371,8 @@ google.golang.org/genproto/googleapis/api v0.0.0-20251111163417-95abcf5c77ba h1: google.golang.org/genproto/googleapis/api v0.0.0-20251111163417-95abcf5c77ba/go.mod h1:G5IanEx8/PgI9w6CFcYQf7jMtHQhZruvfM1i3qOqk5U= google.golang.org/genproto/googleapis/rpc v0.0.0-20251111163417-95abcf5c77ba h1:UKgtfRM7Yh93Sya0Fo8ZzhDP4qBckrrxEr2oF5UIVb8= google.golang.org/genproto/googleapis/rpc v0.0.0-20251111163417-95abcf5c77ba/go.mod h1:7i2o+ce6H/6BluujYR+kqX3GKH+dChPTQU19wjRPiGk= -google.golang.org/grpc v1.76.0 h1:UnVkv1+uMLYXoIz6o7chp59WfQUYA2ex/BXQ9rHZu7A= -google.golang.org/grpc v1.76.0/go.mod h1:Ju12QI8M6iQJtbcsV+awF5a4hfJMLi4X0JLo94ULZ6c= 
+google.golang.org/grpc v1.77.0 h1:wVVY6/8cGA6vvffn+wWK5ToddbgdU3d8MNENr4evgXM= +google.golang.org/grpc v1.77.0/go.mod h1:z0BY1iVj0q8E1uSQCjL9cppRj+gnZjzDnzV0dHhrNig= google.golang.org/protobuf v1.36.10 h1:AYd7cD/uASjIL6Q9LiTjz8JLcrh/88q5UObnmY3aOOE= google.golang.org/protobuf v1.36.10/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= diff --git a/janitor/pkg/config/config.go b/janitor/pkg/config/config.go index 3507a3ecf..738111e48 100644 --- a/janitor/pkg/config/config.go +++ b/janitor/pkg/config/config.go @@ -31,9 +31,10 @@ type Config struct { // GlobalConfig contains global janitor settings type GlobalConfig struct { - Timeout time.Duration `mapstructure:"timeout" json:"timeout"` - ManualMode bool `mapstructure:"manualMode" json:"manualMode"` - Nodes NodeConfig `mapstructure:"nodes" json:"nodes"` + Timeout time.Duration `mapstructure:"timeout" json:"timeout"` + ManualMode bool `mapstructure:"manualMode" json:"manualMode"` + Nodes NodeConfig `mapstructure:"nodes" json:"nodes"` + CSPProviderHost string `mapstructure:"cspProviderHost" json:"cspProviderHost"` } // NodeConfig contains configuration for nodes @@ -52,6 +53,8 @@ type RebootNodeControllerConfig struct { // NodeExclusions defines label selectors for nodes that should be excluded from reboot operations // Nodes matching any of these label selectors will be rejected by the admission webhook NodeExclusions []metav1.LabelSelector + // CSPProviderHost is the host of the CSP provider + CSPProviderHost string } // TerminateNodeControllerConfig contains configuration for terminate node controller @@ -65,6 +68,8 @@ type TerminateNodeControllerConfig struct { // NodeExclusions defines label selectors for nodes that should be excluded from terminate operations // Nodes matching any of these label selectors will be rejected by the admission webhook NodeExclusions []metav1.LabelSelector + // 
CSPProviderHost is the host of the CSP provider + CSPProviderHost string } // LoadConfig loads configuration from a YAML file using Viper @@ -98,5 +103,13 @@ func LoadConfig(configPath string) (*Config, error) { config.RebootNode.NodeExclusions = config.Global.Nodes.Exclusions config.TerminateNode.NodeExclusions = config.Global.Nodes.Exclusions + // If CSPProviderHost is not set, use the global CSPProviderHost + if config.RebootNode.CSPProviderHost == "" { + config.RebootNode.CSPProviderHost = config.Global.CSPProviderHost + } + if config.TerminateNode.CSPProviderHost == "" { + config.TerminateNode.CSPProviderHost = config.Global.CSPProviderHost + } + return &config, nil } diff --git a/janitor/pkg/controller/rebootnode_controller.go b/janitor/pkg/controller/rebootnode_controller.go index d8460150e..498c3e469 100644 --- a/janitor/pkg/controller/rebootnode_controller.go +++ b/janitor/pkg/controller/rebootnode_controller.go @@ -21,6 +21,8 @@ import ( "fmt" "time" + "google.golang.org/grpc" + "google.golang.org/grpc/credentials/insecure" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" @@ -29,11 +31,10 @@ import ( "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" "sigs.k8s.io/controller-runtime/pkg/log" + cspv1alpha1 "github.com/nvidia/nvsentinel/api/gen/go/csp/v1alpha1" janitordgxcnvidiacomv1alpha1 "github.com/nvidia/nvsentinel/janitor/api/v1alpha1" "github.com/nvidia/nvsentinel/janitor/pkg/config" - "github.com/nvidia/nvsentinel/janitor/pkg/csp" "github.com/nvidia/nvsentinel/janitor/pkg/metrics" - "github.com/nvidia/nvsentinel/janitor/pkg/model" ) const ( @@ -74,7 +75,7 @@ type RebootNodeReconciler struct { client.Client Scheme *runtime.Scheme Config *config.RebootNodeControllerConfig - CSPClient model.CSPClient + CSPClient cspv1alpha1.CSPProviderServiceClient } // +kubebuilder:rbac:groups=janitor.dgxc.nvidia.com,resources=rebootnodes,verbs=get;list;watch;create;update;patch;delete @@ -190,7 
+191,10 @@ func (r *RebootNodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) cspCtx, cancel := context.WithTimeout(ctx, CSPOperationTimeout) defer cancel() - cspReady, nodeReadyErr = r.CSPClient.IsNodeReady(cspCtx, node, rebootNode.GetCSPReqRef()) + rsp, rpcErr := r.CSPClient.IsNodeReady(cspCtx, &cspv1alpha1.IsNodeReadyRequest{ + NodeName: node.Name, + }) + cspReady, nodeReadyErr = rsp.GetIsReady(), rpcErr // Check for timeout specifically if errors.Is(nodeReadyErr, context.DeadlineExceeded) { @@ -339,7 +343,9 @@ func (r *RebootNodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) cspCtx, cancel := context.WithTimeout(ctx, CSPOperationTimeout) defer cancel() - reqRef, rebootErr := r.CSPClient.SendRebootSignal(cspCtx, node) + rsp, rebootErr := r.CSPClient.SendRebootSignal(cspCtx, &cspv1alpha1.SendRebootSignalRequest{ + NodeName: node.Name, + }) // Check for timeout if errors.Is(rebootErr, context.DeadlineExceeded) { @@ -367,7 +373,7 @@ func (r *RebootNodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) Type: janitordgxcnvidiacomv1alpha1.RebootNodeConditionSignalSent, Status: metav1.ConditionTrue, Reason: "Succeeded", - Message: string(reqRef), + Message: rsp.GetRequestId(), LastTransitionTime: metav1.Now(), } // Continue monitoring if signal was sent successfully @@ -401,16 +407,11 @@ func (r *RebootNodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) // SetupWithManager sets up the controller with the Manager.
func (r *RebootNodeReconciler) SetupWithManager(mgr ctrl.Manager) error { - // Use background context for client initialization during controller setup - // This is synchronous and happens before the controller starts processing events - ctx := context.Background() - - var err error - - r.CSPClient, err = csp.New(ctx) + conn, err := grpc.NewClient(r.Config.CSPProviderHost, grpc.WithTransportCredentials(insecure.NewCredentials())) if err != nil { return fmt.Errorf("failed to create CSP client: %w", err) } + r.CSPClient = cspv1alpha1.NewCSPProviderServiceClient(conn) // Note: We use RequeueAfter in the reconcile loop rather than the controller's // rate limiter because we need per-resource (per-node) backoff based on each diff --git a/providers/aws/go.mod b/providers/aws/go.mod new file mode 100644 index 000000000..212572371 --- /dev/null +++ b/providers/aws/go.mod @@ -0,0 +1,20 @@ +module github.com/nvidia/nvsentinel/providers/aws + +go 1.25.0 + +toolchain go1.25.3 + +require ( + github.com/nvidia/nvsentinel/api v0.0.0-00010101000000-000000000000 + google.golang.org/grpc v1.77.0 +) + +require ( + golang.org/x/net v0.46.1-0.20251013234738-63d1a5100f82 // indirect + golang.org/x/sys v0.37.0 // indirect + golang.org/x/text v0.30.0 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20251103181224-f26f9409b101 // indirect + google.golang.org/protobuf v1.36.10 // indirect +) + +replace github.com/nvidia/nvsentinel/api => ../../api diff --git a/providers/aws/go.sum b/providers/aws/go.sum new file mode 100644 index 000000000..2b39a17fb --- /dev/null +++ b/providers/aws/go.sum @@ -0,0 +1,36 @@ +github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= +github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= +github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= +github.com/golang/protobuf v1.5.4 
h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= +github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= +github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= +github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ64= +go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y= +go.opentelemetry.io/otel v1.38.0 h1:RkfdswUDRimDg0m2Az18RKOsnI8UDzppJAtj01/Ymk8= +go.opentelemetry.io/otel v1.38.0/go.mod h1:zcmtmQ1+YmQM9wrNsTGV/q/uyusom3P8RxwExxkZhjM= +go.opentelemetry.io/otel/metric v1.38.0 h1:Kl6lzIYGAh5M159u9NgiRkmoMKjvbsKtYRwgfrA6WpA= +go.opentelemetry.io/otel/metric v1.38.0/go.mod h1:kB5n/QoRM8YwmUahxvI3bO34eVtQf2i4utNVLr9gEmI= +go.opentelemetry.io/otel/sdk v1.38.0 h1:l48sr5YbNf2hpCUj/FoGhW9yDkl+Ma+LrVl8qaM5b+E= +go.opentelemetry.io/otel/sdk v1.38.0/go.mod h1:ghmNdGlVemJI3+ZB5iDEuk4bWA3GkTpW+DOoZMYBVVg= +go.opentelemetry.io/otel/sdk/metric v1.38.0 h1:aSH66iL0aZqo//xXzQLYozmWrXxyFkBJ6qT5wthqPoM= +go.opentelemetry.io/otel/sdk/metric v1.38.0/go.mod h1:dg9PBnW9XdQ1Hd6ZnRz689CbtrUp0wMMs9iPcgT9EZA= +go.opentelemetry.io/otel/trace v1.38.0 h1:Fxk5bKrDZJUH+AMyyIXGcFAPah0oRcT+LuNtJrmcNLE= +go.opentelemetry.io/otel/trace v1.38.0/go.mod h1:j1P9ivuFsTceSWe1oY+EeW3sc+Pp42sO++GHkg4wwhs= +golang.org/x/net v0.46.1-0.20251013234738-63d1a5100f82 h1:6/3JGEh1C88g7m+qzzTbl3A0FtsLguXieqofVLU/JAo= +golang.org/x/net v0.46.1-0.20251013234738-63d1a5100f82/go.mod h1:Q9BGdFy1y4nkUwiLvT5qtyhAnEHgnQ/zd8PfU6nc210= +golang.org/x/sys v0.37.0 h1:fdNQudmxPjkdUTPnLn5mdQv7Zwvbvpaxqs831goi9kQ= +golang.org/x/sys v0.37.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= +golang.org/x/text v0.30.0 
h1:yznKA/E9zq54KzlzBEAWn1NXSQ8DIp/NYMy88xJjl4k= +golang.org/x/text v0.30.0/go.mod h1:yDdHFIX9t+tORqspjENWgzaCVXgk0yYnYuSZ8UzzBVM= +gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk= +gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E= +google.golang.org/genproto/googleapis/rpc v0.0.0-20251103181224-f26f9409b101 h1:tRPGkdGHuewF4UisLzzHHr1spKw92qLM98nIzxbC0wY= +google.golang.org/genproto/googleapis/rpc v0.0.0-20251103181224-f26f9409b101/go.mod h1:7i2o+ce6H/6BluujYR+kqX3GKH+dChPTQU19wjRPiGk= +google.golang.org/grpc v1.77.0 h1:wVVY6/8cGA6vvffn+wWK5ToddbgdU3d8MNENr4evgXM= +google.golang.org/grpc v1.77.0/go.mod h1:z0BY1iVj0q8E1uSQCjL9cppRj+gnZjzDnzV0dHhrNig= +google.golang.org/protobuf v1.36.10 h1:AYd7cD/uASjIL6Q9LiTjz8JLcrh/88q5UObnmY3aOOE= +google.golang.org/protobuf v1.36.10/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco= diff --git a/providers/aws/main.go b/providers/aws/main.go new file mode 100644 index 000000000..7e7dac021 --- /dev/null +++ b/providers/aws/main.go @@ -0,0 +1,53 @@ +// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package main + +import ( + "context" + "log/slog" + "net" + "os" + + "google.golang.org/grpc" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" + + cspv1alpha1 "github.com/nvidia/nvsentinel/api/gen/go/csp/v1alpha1" +) + +type awsProviderServer struct { + cspv1alpha1.UnimplementedCSPProviderServiceServer +} + +func (s *awsProviderServer) SendRebootSignal(ctx context.Context, req *cspv1alpha1.SendRebootSignalRequest) (*cspv1alpha1.SendRebootSignalResponse, error) { + return nil, status.Errorf(codes.Unimplemented, "method SendRebootSignal not implemented") +} + +func main() { + slog.Info("Starting AWS provider") + + lis, err := net.Listen("tcp", ":50051") + if err != nil { + slog.Error("Failed to listen", "error", err) + os.Exit(1) + } + + svr := grpc.NewServer() + cspv1alpha1.RegisterCSPProviderServiceServer(svr, &awsProviderServer{}) + if err := svr.Serve(lis); err != nil { + slog.Error("Failed to serve", "error", err) + os.Exit(1) + } +} diff --git a/providers/azure/go.mod b/providers/azure/go.mod new file mode 100644 index 000000000..bec48dd2f --- /dev/null +++ b/providers/azure/go.mod @@ -0,0 +1,20 @@ +module github.com/nvidia/nvsentinel/providers/azure + +go 1.25.0 + +toolchain go1.25.3 + +require ( + github.com/nvidia/nvsentinel/api v0.4.0 + google.golang.org/grpc v1.77.0 +) + +require ( + golang.org/x/net v0.46.1-0.20251013234738-63d1a5100f82 // indirect + golang.org/x/sys v0.37.0 // indirect + golang.org/x/text v0.30.0 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20251103181224-f26f9409b101 // indirect + google.golang.org/protobuf v1.36.10 // indirect +) + +replace github.com/nvidia/nvsentinel/api => ../../api diff --git a/providers/azure/go.sum b/providers/azure/go.sum new file mode 100644 index 000000000..2b39a17fb --- /dev/null +++ b/providers/azure/go.sum @@ -0,0 +1,36 @@ +github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= +github.com/go-logr/logr v1.4.3/go.mod 
h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= +github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= +github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= +github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= +github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= +github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ64= +go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y= +go.opentelemetry.io/otel v1.38.0 h1:RkfdswUDRimDg0m2Az18RKOsnI8UDzppJAtj01/Ymk8= +go.opentelemetry.io/otel v1.38.0/go.mod h1:zcmtmQ1+YmQM9wrNsTGV/q/uyusom3P8RxwExxkZhjM= +go.opentelemetry.io/otel/metric v1.38.0 h1:Kl6lzIYGAh5M159u9NgiRkmoMKjvbsKtYRwgfrA6WpA= +go.opentelemetry.io/otel/metric v1.38.0/go.mod h1:kB5n/QoRM8YwmUahxvI3bO34eVtQf2i4utNVLr9gEmI= +go.opentelemetry.io/otel/sdk v1.38.0 h1:l48sr5YbNf2hpCUj/FoGhW9yDkl+Ma+LrVl8qaM5b+E= +go.opentelemetry.io/otel/sdk v1.38.0/go.mod h1:ghmNdGlVemJI3+ZB5iDEuk4bWA3GkTpW+DOoZMYBVVg= +go.opentelemetry.io/otel/sdk/metric v1.38.0 h1:aSH66iL0aZqo//xXzQLYozmWrXxyFkBJ6qT5wthqPoM= +go.opentelemetry.io/otel/sdk/metric v1.38.0/go.mod h1:dg9PBnW9XdQ1Hd6ZnRz689CbtrUp0wMMs9iPcgT9EZA= +go.opentelemetry.io/otel/trace v1.38.0 h1:Fxk5bKrDZJUH+AMyyIXGcFAPah0oRcT+LuNtJrmcNLE= +go.opentelemetry.io/otel/trace v1.38.0/go.mod h1:j1P9ivuFsTceSWe1oY+EeW3sc+Pp42sO++GHkg4wwhs= +golang.org/x/net v0.46.1-0.20251013234738-63d1a5100f82 h1:6/3JGEh1C88g7m+qzzTbl3A0FtsLguXieqofVLU/JAo= +golang.org/x/net v0.46.1-0.20251013234738-63d1a5100f82/go.mod 
h1:Q9BGdFy1y4nkUwiLvT5qtyhAnEHgnQ/zd8PfU6nc210= +golang.org/x/sys v0.37.0 h1:fdNQudmxPjkdUTPnLn5mdQv7Zwvbvpaxqs831goi9kQ= +golang.org/x/sys v0.37.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= +golang.org/x/text v0.30.0 h1:yznKA/E9zq54KzlzBEAWn1NXSQ8DIp/NYMy88xJjl4k= +golang.org/x/text v0.30.0/go.mod h1:yDdHFIX9t+tORqspjENWgzaCVXgk0yYnYuSZ8UzzBVM= +gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk= +gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E= +google.golang.org/genproto/googleapis/rpc v0.0.0-20251103181224-f26f9409b101 h1:tRPGkdGHuewF4UisLzzHHr1spKw92qLM98nIzxbC0wY= +google.golang.org/genproto/googleapis/rpc v0.0.0-20251103181224-f26f9409b101/go.mod h1:7i2o+ce6H/6BluujYR+kqX3GKH+dChPTQU19wjRPiGk= +google.golang.org/grpc v1.77.0 h1:wVVY6/8cGA6vvffn+wWK5ToddbgdU3d8MNENr4evgXM= +google.golang.org/grpc v1.77.0/go.mod h1:z0BY1iVj0q8E1uSQCjL9cppRj+gnZjzDnzV0dHhrNig= +google.golang.org/protobuf v1.36.10 h1:AYd7cD/uASjIL6Q9LiTjz8JLcrh/88q5UObnmY3aOOE= +google.golang.org/protobuf v1.36.10/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco= diff --git a/providers/azure/main.go b/providers/azure/main.go new file mode 100644 index 000000000..6d28f7678 --- /dev/null +++ b/providers/azure/main.go @@ -0,0 +1,53 @@ +// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package main + +import ( + "context" + "log/slog" + "net" + "os" + + "google.golang.org/grpc" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" + + cspv1alpha1 "github.com/nvidia/nvsentinel/api/gen/go/csp/v1alpha1" +) + +type azureProviderServer struct { + cspv1alpha1.UnimplementedCSPProviderServiceServer +} + +func (s *azureProviderServer) SendRebootSignal(ctx context.Context, req *cspv1alpha1.SendRebootSignalRequest) (*cspv1alpha1.SendRebootSignalResponse, error) { + return nil, status.Errorf(codes.Unimplemented, "method SendRebootSignal not implemented") +} + +func main() { + slog.Info("Starting Azure provider") + + lis, err := net.Listen("tcp", ":50051") + if err != nil { + slog.Error("Failed to listen", "error", err) + os.Exit(1) + } + + svr := grpc.NewServer() + cspv1alpha1.RegisterCSPProviderServiceServer(svr, &azureProviderServer{}) + if err := svr.Serve(lis); err != nil { + slog.Error("Failed to serve", "error", err) + os.Exit(1) + } +} diff --git a/providers/gcp/go.mod b/providers/gcp/go.mod new file mode 100644 index 000000000..bec48dd2f --- /dev/null +++ b/providers/gcp/go.mod @@ -0,0 +1,20 @@ +module github.com/nvidia/nvsentinel/providers/gcp + +go 1.25.0 + +toolchain go1.25.3 + +require ( + github.com/nvidia/nvsentinel/api v0.4.0 + google.golang.org/grpc v1.77.0 +) + +require ( + golang.org/x/net v0.46.1-0.20251013234738-63d1a5100f82 // indirect + golang.org/x/sys v0.37.0 // indirect + golang.org/x/text v0.30.0 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20251103181224-f26f9409b101 // indirect + google.golang.org/protobuf v1.36.10 // indirect +) + +replace github.com/nvidia/nvsentinel/api => ../../api diff --git a/providers/gcp/go.sum b/providers/gcp/go.sum new file mode 100644 index 000000000..2b39a17fb --- /dev/null +++ b/providers/gcp/go.sum @@ -0,0 +1,36 @@ +github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= +github.com/go-logr/logr v1.4.3/go.mod 
h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= +github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= +github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= +github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= +github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= +github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ64= +go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y= +go.opentelemetry.io/otel v1.38.0 h1:RkfdswUDRimDg0m2Az18RKOsnI8UDzppJAtj01/Ymk8= +go.opentelemetry.io/otel v1.38.0/go.mod h1:zcmtmQ1+YmQM9wrNsTGV/q/uyusom3P8RxwExxkZhjM= +go.opentelemetry.io/otel/metric v1.38.0 h1:Kl6lzIYGAh5M159u9NgiRkmoMKjvbsKtYRwgfrA6WpA= +go.opentelemetry.io/otel/metric v1.38.0/go.mod h1:kB5n/QoRM8YwmUahxvI3bO34eVtQf2i4utNVLr9gEmI= +go.opentelemetry.io/otel/sdk v1.38.0 h1:l48sr5YbNf2hpCUj/FoGhW9yDkl+Ma+LrVl8qaM5b+E= +go.opentelemetry.io/otel/sdk v1.38.0/go.mod h1:ghmNdGlVemJI3+ZB5iDEuk4bWA3GkTpW+DOoZMYBVVg= +go.opentelemetry.io/otel/sdk/metric v1.38.0 h1:aSH66iL0aZqo//xXzQLYozmWrXxyFkBJ6qT5wthqPoM= +go.opentelemetry.io/otel/sdk/metric v1.38.0/go.mod h1:dg9PBnW9XdQ1Hd6ZnRz689CbtrUp0wMMs9iPcgT9EZA= +go.opentelemetry.io/otel/trace v1.38.0 h1:Fxk5bKrDZJUH+AMyyIXGcFAPah0oRcT+LuNtJrmcNLE= +go.opentelemetry.io/otel/trace v1.38.0/go.mod h1:j1P9ivuFsTceSWe1oY+EeW3sc+Pp42sO++GHkg4wwhs= +golang.org/x/net v0.46.1-0.20251013234738-63d1a5100f82 h1:6/3JGEh1C88g7m+qzzTbl3A0FtsLguXieqofVLU/JAo= +golang.org/x/net v0.46.1-0.20251013234738-63d1a5100f82/go.mod 
h1:Q9BGdFy1y4nkUwiLvT5qtyhAnEHgnQ/zd8PfU6nc210= +golang.org/x/sys v0.37.0 h1:fdNQudmxPjkdUTPnLn5mdQv7Zwvbvpaxqs831goi9kQ= +golang.org/x/sys v0.37.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= +golang.org/x/text v0.30.0 h1:yznKA/E9zq54KzlzBEAWn1NXSQ8DIp/NYMy88xJjl4k= +golang.org/x/text v0.30.0/go.mod h1:yDdHFIX9t+tORqspjENWgzaCVXgk0yYnYuSZ8UzzBVM= +gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk= +gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E= +google.golang.org/genproto/googleapis/rpc v0.0.0-20251103181224-f26f9409b101 h1:tRPGkdGHuewF4UisLzzHHr1spKw92qLM98nIzxbC0wY= +google.golang.org/genproto/googleapis/rpc v0.0.0-20251103181224-f26f9409b101/go.mod h1:7i2o+ce6H/6BluujYR+kqX3GKH+dChPTQU19wjRPiGk= +google.golang.org/grpc v1.77.0 h1:wVVY6/8cGA6vvffn+wWK5ToddbgdU3d8MNENr4evgXM= +google.golang.org/grpc v1.77.0/go.mod h1:z0BY1iVj0q8E1uSQCjL9cppRj+gnZjzDnzV0dHhrNig= +google.golang.org/protobuf v1.36.10 h1:AYd7cD/uASjIL6Q9LiTjz8JLcrh/88q5UObnmY3aOOE= +google.golang.org/protobuf v1.36.10/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco= diff --git a/providers/gcp/main.go b/providers/gcp/main.go new file mode 100644 index 000000000..3ac2e9a38 --- /dev/null +++ b/providers/gcp/main.go @@ -0,0 +1,53 @@ +// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package main + +import ( + "context" + "log/slog" + "net" + "os" + + "google.golang.org/grpc" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" + + cspv1alpha1 "github.com/nvidia/nvsentinel/api/gen/go/csp/v1alpha1" +) + +type gcpProviderServer struct { + cspv1alpha1.UnimplementedCSPProviderServiceServer +} + +func (s *gcpProviderServer) SendRebootSignal(ctx context.Context, req *cspv1alpha1.SendRebootSignalRequest) (*cspv1alpha1.SendRebootSignalResponse, error) { + return nil, status.Errorf(codes.Unimplemented, "method SendRebootSignal not implemented") +} + +func main() { + slog.Info("Starting GCP provider") + + lis, err := net.Listen("tcp", ":50051") + if err != nil { + slog.Error("Failed to listen", "error", err) + os.Exit(1) + } + + svr := grpc.NewServer() + cspv1alpha1.RegisterCSPProviderServiceServer(svr, &gcpProviderServer{}) + if err := svr.Serve(lis); err != nil { + slog.Error("Failed to serve", "error", err) + os.Exit(1) + } +} diff --git a/providers/kwok/Tiltfile b/providers/kwok/Tiltfile new file mode 100644 index 000000000..dded593af --- /dev/null +++ b/providers/kwok/Tiltfile @@ -0,0 +1,21 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Build and deploy +custom_build( + 'ghcr.io/nvidia/nvsentinel/janitor-provider-kwok', + '../../scripts/ko-tilt-build.sh . 
$EXPECTED_REF', + deps=['./', '../../api'], + skips_local_docker=True +) diff --git a/providers/kwok/go.mod b/providers/kwok/go.mod new file mode 100644 index 000000000..bec48dd2f --- /dev/null +++ b/providers/kwok/go.mod @@ -0,0 +1,20 @@ +module github.com/nvidia/nvsentinel/providers/kwok + +go 1.25.0 + +toolchain go1.25.3 + +require ( + github.com/nvidia/nvsentinel/api v0.4.0 + google.golang.org/grpc v1.77.0 +) + +require ( + golang.org/x/net v0.46.1-0.20251013234738-63d1a5100f82 // indirect + golang.org/x/sys v0.37.0 // indirect + golang.org/x/text v0.30.0 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20251103181224-f26f9409b101 // indirect + google.golang.org/protobuf v1.36.10 // indirect +) + +replace github.com/nvidia/nvsentinel/api => ../../api diff --git a/providers/kwok/go.sum b/providers/kwok/go.sum new file mode 100644 index 000000000..2b39a17fb --- /dev/null +++ b/providers/kwok/go.sum @@ -0,0 +1,36 @@ +github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= +github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= +github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= +github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= +github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= +github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= +github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ64= +go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y= +go.opentelemetry.io/otel v1.38.0 
h1:RkfdswUDRimDg0m2Az18RKOsnI8UDzppJAtj01/Ymk8= +go.opentelemetry.io/otel v1.38.0/go.mod h1:zcmtmQ1+YmQM9wrNsTGV/q/uyusom3P8RxwExxkZhjM= +go.opentelemetry.io/otel/metric v1.38.0 h1:Kl6lzIYGAh5M159u9NgiRkmoMKjvbsKtYRwgfrA6WpA= +go.opentelemetry.io/otel/metric v1.38.0/go.mod h1:kB5n/QoRM8YwmUahxvI3bO34eVtQf2i4utNVLr9gEmI= +go.opentelemetry.io/otel/sdk v1.38.0 h1:l48sr5YbNf2hpCUj/FoGhW9yDkl+Ma+LrVl8qaM5b+E= +go.opentelemetry.io/otel/sdk v1.38.0/go.mod h1:ghmNdGlVemJI3+ZB5iDEuk4bWA3GkTpW+DOoZMYBVVg= +go.opentelemetry.io/otel/sdk/metric v1.38.0 h1:aSH66iL0aZqo//xXzQLYozmWrXxyFkBJ6qT5wthqPoM= +go.opentelemetry.io/otel/sdk/metric v1.38.0/go.mod h1:dg9PBnW9XdQ1Hd6ZnRz689CbtrUp0wMMs9iPcgT9EZA= +go.opentelemetry.io/otel/trace v1.38.0 h1:Fxk5bKrDZJUH+AMyyIXGcFAPah0oRcT+LuNtJrmcNLE= +go.opentelemetry.io/otel/trace v1.38.0/go.mod h1:j1P9ivuFsTceSWe1oY+EeW3sc+Pp42sO++GHkg4wwhs= +golang.org/x/net v0.46.1-0.20251013234738-63d1a5100f82 h1:6/3JGEh1C88g7m+qzzTbl3A0FtsLguXieqofVLU/JAo= +golang.org/x/net v0.46.1-0.20251013234738-63d1a5100f82/go.mod h1:Q9BGdFy1y4nkUwiLvT5qtyhAnEHgnQ/zd8PfU6nc210= +golang.org/x/sys v0.37.0 h1:fdNQudmxPjkdUTPnLn5mdQv7Zwvbvpaxqs831goi9kQ= +golang.org/x/sys v0.37.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= +golang.org/x/text v0.30.0 h1:yznKA/E9zq54KzlzBEAWn1NXSQ8DIp/NYMy88xJjl4k= +golang.org/x/text v0.30.0/go.mod h1:yDdHFIX9t+tORqspjENWgzaCVXgk0yYnYuSZ8UzzBVM= +gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk= +gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E= +google.golang.org/genproto/googleapis/rpc v0.0.0-20251103181224-f26f9409b101 h1:tRPGkdGHuewF4UisLzzHHr1spKw92qLM98nIzxbC0wY= +google.golang.org/genproto/googleapis/rpc v0.0.0-20251103181224-f26f9409b101/go.mod h1:7i2o+ce6H/6BluujYR+kqX3GKH+dChPTQU19wjRPiGk= +google.golang.org/grpc v1.77.0 h1:wVVY6/8cGA6vvffn+wWK5ToddbgdU3d8MNENr4evgXM= +google.golang.org/grpc v1.77.0/go.mod h1:z0BY1iVj0q8E1uSQCjL9cppRj+gnZjzDnzV0dHhrNig= 
+google.golang.org/protobuf v1.36.10 h1:AYd7cD/uASjIL6Q9LiTjz8JLcrh/88q5UObnmY3aOOE= +google.golang.org/protobuf v1.36.10/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco= diff --git a/providers/kwok/main.go b/providers/kwok/main.go new file mode 100644 index 000000000..deb039394 --- /dev/null +++ b/providers/kwok/main.go @@ -0,0 +1,68 @@ +// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package main + +import ( + "context" + "log/slog" + "net" + "os" + + "google.golang.org/grpc" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" + + cspv1alpha1 "github.com/nvidia/nvsentinel/api/gen/go/csp/v1alpha1" +) + +type kwokProviderServer struct { + cspv1alpha1.UnimplementedCSPProviderServiceServer +} + +func (s *kwokProviderServer) SendRebootSignal(ctx context.Context, req *cspv1alpha1.SendRebootSignalRequest) (*cspv1alpha1.SendRebootSignalResponse, error) { + slog.Info("Sending reboot signal", "node", req.NodeName) + return &cspv1alpha1.SendRebootSignalResponse{ + RequestId: "1234567890", + }, nil +} + +func (s *kwokProviderServer) IsNodeReady(ctx context.Context, req *cspv1alpha1.IsNodeReadyRequest) (*cspv1alpha1.IsNodeReadyResponse, error) { + slog.Info("Checking if node is ready", "node", req.NodeName) + return &cspv1alpha1.IsNodeReadyResponse{ + IsReady: true, + }, nil +} + +func (s *kwokProviderServer) SendTerminateSignal(ctx context.Context, req 
*cspv1alpha1.SendTerminateSignalRequest) (*cspv1alpha1.SendTerminateSignalResponse, error) { + slog.Info("Sending terminate signal", "node", req.NodeName) + return nil, status.Errorf(codes.Unimplemented, "method SendTerminateSignal not implemented") +} + +func main() { + slog.Info("Starting Kwok provider") + + lis, err := net.Listen("tcp", ":50051") + if err != nil { + slog.Error("Failed to listen", "error", err) + os.Exit(1) + } + + svr := grpc.NewServer() + cspv1alpha1.RegisterCSPProviderServiceServer(svr, &kwokProviderServer{}) + if err := svr.Serve(lis); err != nil { + slog.Error("Failed to serve", "error", err) + os.Exit(1) + } +} diff --git a/providers/oci/go.mod b/providers/oci/go.mod new file mode 100644 index 000000000..bec48dd2f --- /dev/null +++ b/providers/oci/go.mod @@ -0,0 +1,20 @@ +module github.com/nvidia/nvsentinel/providers/oci + +go 1.25.0 + +toolchain go1.25.3 + +require ( + github.com/nvidia/nvsentinel/api v0.4.0 + google.golang.org/grpc v1.77.0 +) + +require ( + golang.org/x/net v0.46.1-0.20251013234738-63d1a5100f82 // indirect + golang.org/x/sys v0.37.0 // indirect + golang.org/x/text v0.30.0 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20251103181224-f26f9409b101 // indirect + google.golang.org/protobuf v1.36.10 // indirect +) + +replace github.com/nvidia/nvsentinel/api => ../../api diff --git a/providers/oci/go.sum b/providers/oci/go.sum new file mode 100644 index 000000000..2b39a17fb --- /dev/null +++ b/providers/oci/go.sum @@ -0,0 +1,36 @@ +github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= +github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= +github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= +github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= +github.com/golang/protobuf v1.5.4/go.mod 
h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= +github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= +github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ64= +go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y= +go.opentelemetry.io/otel v1.38.0 h1:RkfdswUDRimDg0m2Az18RKOsnI8UDzppJAtj01/Ymk8= +go.opentelemetry.io/otel v1.38.0/go.mod h1:zcmtmQ1+YmQM9wrNsTGV/q/uyusom3P8RxwExxkZhjM= +go.opentelemetry.io/otel/metric v1.38.0 h1:Kl6lzIYGAh5M159u9NgiRkmoMKjvbsKtYRwgfrA6WpA= +go.opentelemetry.io/otel/metric v1.38.0/go.mod h1:kB5n/QoRM8YwmUahxvI3bO34eVtQf2i4utNVLr9gEmI= +go.opentelemetry.io/otel/sdk v1.38.0 h1:l48sr5YbNf2hpCUj/FoGhW9yDkl+Ma+LrVl8qaM5b+E= +go.opentelemetry.io/otel/sdk v1.38.0/go.mod h1:ghmNdGlVemJI3+ZB5iDEuk4bWA3GkTpW+DOoZMYBVVg= +go.opentelemetry.io/otel/sdk/metric v1.38.0 h1:aSH66iL0aZqo//xXzQLYozmWrXxyFkBJ6qT5wthqPoM= +go.opentelemetry.io/otel/sdk/metric v1.38.0/go.mod h1:dg9PBnW9XdQ1Hd6ZnRz689CbtrUp0wMMs9iPcgT9EZA= +go.opentelemetry.io/otel/trace v1.38.0 h1:Fxk5bKrDZJUH+AMyyIXGcFAPah0oRcT+LuNtJrmcNLE= +go.opentelemetry.io/otel/trace v1.38.0/go.mod h1:j1P9ivuFsTceSWe1oY+EeW3sc+Pp42sO++GHkg4wwhs= +golang.org/x/net v0.46.1-0.20251013234738-63d1a5100f82 h1:6/3JGEh1C88g7m+qzzTbl3A0FtsLguXieqofVLU/JAo= +golang.org/x/net v0.46.1-0.20251013234738-63d1a5100f82/go.mod h1:Q9BGdFy1y4nkUwiLvT5qtyhAnEHgnQ/zd8PfU6nc210= +golang.org/x/sys v0.37.0 h1:fdNQudmxPjkdUTPnLn5mdQv7Zwvbvpaxqs831goi9kQ= +golang.org/x/sys v0.37.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= +golang.org/x/text v0.30.0 h1:yznKA/E9zq54KzlzBEAWn1NXSQ8DIp/NYMy88xJjl4k= +golang.org/x/text v0.30.0/go.mod h1:yDdHFIX9t+tORqspjENWgzaCVXgk0yYnYuSZ8UzzBVM= 
+gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk= +gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E= +google.golang.org/genproto/googleapis/rpc v0.0.0-20251103181224-f26f9409b101 h1:tRPGkdGHuewF4UisLzzHHr1spKw92qLM98nIzxbC0wY= +google.golang.org/genproto/googleapis/rpc v0.0.0-20251103181224-f26f9409b101/go.mod h1:7i2o+ce6H/6BluujYR+kqX3GKH+dChPTQU19wjRPiGk= +google.golang.org/grpc v1.77.0 h1:wVVY6/8cGA6vvffn+wWK5ToddbgdU3d8MNENr4evgXM= +google.golang.org/grpc v1.77.0/go.mod h1:z0BY1iVj0q8E1uSQCjL9cppRj+gnZjzDnzV0dHhrNig= +google.golang.org/protobuf v1.36.10 h1:AYd7cD/uASjIL6Q9LiTjz8JLcrh/88q5UObnmY3aOOE= +google.golang.org/protobuf v1.36.10/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco= diff --git a/providers/oci/main.go b/providers/oci/main.go new file mode 100644 index 000000000..d6de6702e --- /dev/null +++ b/providers/oci/main.go @@ -0,0 +1,53 @@ +// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package main + +import ( + "context" + "log/slog" + "net" + "os" + + "google.golang.org/grpc" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" + + cspv1alpha1 "github.com/nvidia/nvsentinel/api/gen/go/csp/v1alpha1" +) + +type ociProviderServer struct { + cspv1alpha1.UnimplementedCSPProviderServiceServer +} + +func (s *ociProviderServer) SendRebootSignal(ctx context.Context, req *cspv1alpha1.SendRebootSignalRequest) (*cspv1alpha1.SendRebootSignalResponse, error) { + return nil, status.Errorf(codes.Unimplemented, "method SendRebootSignal not implemented") +} + +func main() { + slog.Info("Starting OCI provider") + + lis, err := net.Listen("tcp", ":50051") + if err != nil { + slog.Error("Failed to listen", "error", err) + os.Exit(1) + } + + svr := grpc.NewServer() + cspv1alpha1.RegisterCSPProviderServiceServer(svr, &ociProviderServer{}) + if err := svr.Serve(lis); err != nil { + slog.Error("Failed to serve", "error", err) + os.Exit(1) + } +} diff --git a/tilt/Tiltfile b/tilt/Tiltfile index 124be48e5..00752fb69 100755 --- a/tilt/Tiltfile +++ b/tilt/Tiltfile @@ -112,6 +112,7 @@ k8s_yaml('./nvidia-dcgm-daemonset.yaml') include('../fault-quarantine/Tiltfile') include('../fault-remediation/Tiltfile') include('../janitor/Tiltfile') +include('../providers/kwok/Tiltfile') include('../node-drainer/Tiltfile') include('../platform-connectors/Tiltfile') include('./simple-health-client/Tiltfile') @@ -214,6 +215,11 @@ k8s_resource( resource_deps=['wait-for-janitor-cert'], ) +k8s_resource( + 'janitor-provider-kwok', + resource_deps=['janitor'], +) + if use_percona: k8s_resource( 'nvsentinel-psmdb-operator',