28 changes: 24 additions & 4 deletions tools/cluster_setup/EKS_README.md
@@ -424,6 +424,26 @@ cd /path/to/splunk-ai-operator/tools/cluster_setup
- Valid AWS credentials with appropriate permissions
- Existing VPC with public and private subnets in multiple AZs **OR** let eksctl create a new VPC automatically
- Required tools installed: `eksctl`, `kubectl`, `helm`, `jq`, `yq`
- **If you are setting your cluster up without eksctl** (see the sketch after this list for one way to create these resources manually)
  - An AWS EKS cluster with OIDC enabled, two managed node groups (one for CPU nodes and one for GPU nodes), and the add-ons `vpc-cni`, `kube-proxy`, `coredns`, and `eks-pod-identity-agent`
  - An IAM OIDC provider for the cluster
  - An IAM service account in the `kube-system` namespace with the name `ebs-csi-controller-sa`, role name `EBSCSIDriverRole-${CLUSTER_NAME}`, and attached policy ARN `arn:aws:iam::aws:policy/service-role/AmazonEBSCSIDriverPolicy`
  - An IAM service account in the `kube-system` namespace with the name `cluster-autoscaler`, role name `ClusterAutoscalerRole-${CLUSTER_NAME}`, and attached policy ARN `arn:aws:iam::aws:policy/AutoScalingFullAccess`
  - An IAM policy with the name `S3Access-${CLUSTER_NAME}-ai-platform`, policy ARN `arn:aws:iam::${ACCOUNT_ID}:policy/S3Access-${CLUSTER_NAME}-ai-platform`, and the following policy document:
```
{
  "Version": "2012-10-17",
  "Statement": [
    { "Sid": "ListBucket", "Effect": "Allow", "Action": ["s3:ListBucket"], "Resource": "arn:aws:s3:::${S3_BUCKET_NAME}" },
    { "Sid": "ObjectRW", "Effect": "Allow", "Action": ["s3:GetObject", "s3:PutObject", "s3:DeleteObject", "s3:AbortMultipartUpload", "s3:ListMultipartUploadParts", "s3:ListBucketMultipartUploads"], "Resource": "arn:aws:s3:::${S3_BUCKET_NAME}/*" }
  ]
}
```
  - An IAM service account in the `ai-platform` namespace with the name `saia-service-sa`, role name `IRSA-${CLUSTER_NAME}-saia-service-sa`, and attached policy ARN `arn:aws:iam::${ACCOUNT_ID}:policy/S3Access-${CLUSTER_NAME}-ai-platform`
  - An IAM service account in the `ai-platform` namespace with the name `ray-head-sa`, role name `IRSA-${CLUSTER_NAME}-ray-head-sa`, and attached policy ARN `arn:aws:iam::${ACCOUNT_ID}:policy/S3Access-${CLUSTER_NAME}-ai-platform`
  - An IAM service account in the `ai-platform` namespace with the name `ray-worker-sa`, role name `IRSA-${CLUSTER_NAME}-ray-worker-sa`, and attached policy ARN `arn:aws:iam::${ACCOUNT_ID}:policy/S3Access-${CLUSTER_NAME}-ai-platform`
  - The `aws-ebs-csi-driver` add-on with the service-account-role-arn `arn:aws:iam::${ACCOUNT_ID}:role/EBSCSIDriverRole-${CLUSTER_NAME}`
- **NOTE:** eksctl is required for automated teardown using the script
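
The list above mirrors what eksctl's `create iamserviceaccount` and `create addon` commands would otherwise do for you. As an illustration only, below is a minimal sketch of how one of the IRSA service accounts (here `saia-service-sa`) could be wired up by hand with the AWS CLI and kubectl. It assumes the IAM OIDC provider is already associated with the cluster and that `CLUSTER_NAME`, `REGION`, and the S3 access policy already exist; verify the trust-policy conditions against your environment before using it.

```bash
# Sketch only: manually create the IRSA role + service account for saia-service-sa.
# Assumes the cluster's IAM OIDC provider already exists.
set -euo pipefail

CLUSTER_NAME="my-cluster"     # assumption: replace with your cluster name
REGION="us-west-2"            # assumption: replace with your region
NAMESPACE="ai-platform"
SA_NAME="saia-service-sa"
ROLE_NAME="IRSA-${CLUSTER_NAME}-${SA_NAME}"
ACCOUNT_ID="$(aws sts get-caller-identity --query Account --output text)"
POLICY_ARN="arn:aws:iam::${ACCOUNT_ID}:policy/S3Access-${CLUSTER_NAME}-ai-platform"

# OIDC issuer without the https:// prefix, e.g. oidc.eks.<region>.amazonaws.com/id/XXXX
OIDC_ISSUER="$(aws eks describe-cluster --name "${CLUSTER_NAME}" --region "${REGION}" \
  --query 'cluster.identity.oidc.issuer' --output text | sed 's|^https://||')"

# Trust policy restricting the role to this namespace/service account
cat > /tmp/trust-policy.json <<EOF
{
  "Version": "2012-10-17",
  "Statement": [
    {
      "Effect": "Allow",
      "Principal": { "Federated": "arn:aws:iam::${ACCOUNT_ID}:oidc-provider/${OIDC_ISSUER}" },
      "Action": "sts:AssumeRoleWithWebIdentity",
      "Condition": {
        "StringEquals": {
          "${OIDC_ISSUER}:sub": "system:serviceaccount:${NAMESPACE}:${SA_NAME}",
          "${OIDC_ISSUER}:aud": "sts.amazonaws.com"
        }
      }
    }
  ]
}
EOF

aws iam create-role --role-name "${ROLE_NAME}" \
  --assume-role-policy-document file:///tmp/trust-policy.json
aws iam attach-role-policy --role-name "${ROLE_NAME}" --policy-arn "${POLICY_ARN}"

# Create the Kubernetes service account and annotate it with the role ARN
kubectl create serviceaccount "${SA_NAME}" -n "${NAMESPACE}" --dry-run=client -o yaml | kubectl apply -f -
kubectl annotate serviceaccount "${SA_NAME}" -n "${NAMESPACE}" \
  "eks.amazonaws.com/role-arn=arn:aws:iam::${ACCOUNT_ID}:role/${ROLE_NAME}" --overwrite
```

The remaining IRSA service accounts and the EBS CSI role follow the same pattern with their respective role names and policy ARNs.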

**🔐 Set AWS Credentials:**
```bash
@@ -1033,14 +1053,14 @@ aiPlatform:
# Install EKS cluster and AI Platform
./eks_cluster_with_stack.sh install

# Install AI Platform on a cluster created using a tool other than eksctl
./eks_cluster_with_stack.sh install --no-eksctl

# Delete entire cluster and all AWS resources
./eks_cluster_with_stack.sh delete

# Full cleanup (including S3 buckets, IAM roles)
./eks_cluster_with_stack.sh delete-full

# Check AIPlatform status
./eks_cluster_with_stack.sh status
```

### Post-Installation Tasks
@@ -1103,7 +1123,7 @@ kubectl get secret splunk-splunk-standalone-standalone-secret-v1 \

# Port forward Splunk Web UI
kubectl port-forward -n ai-platform \
svc/splunk-standalone-standalone-service 8000:8000
svc/splunk-splunk-standalone-standalone-service 8000:8000

# Access at http://localhost:8000
# Username: admin
198 changes: 137 additions & 61 deletions tools/cluster_setup/eks_cluster_with_stack.sh
@@ -17,6 +17,9 @@ aws() { command /usr/bin/env aws "$@"; }
# ====== CONFIG FILE LOCATION ======
CONFIG_FILE="${CONFIG_FILE:-$(dirname "$0")/cluster-config.yaml}"

# ====== GLOBAL FLAGS ======
USE_EKSCTL=true

# ====== LOAD CONFIGURATION FROM YAML ======
load_config() {
local cfg="$CONFIG_FILE"
@@ -847,27 +850,35 @@ ensure_oidc() {
# First check if cluster has OIDC issuer configured
local issuer; issuer=$(aws eks describe-cluster --name "${CLUSTER_NAME}" --region "${REGION}" --query 'cluster.identity.oidc.issuer' --output text 2>/dev/null || true)
if [[ -z "$issuer" || "$issuer" == "None" ]]; then
log "Cluster does not have OIDC issuer configured. Associating OIDC provider..."
if ! eksctl utils associate-iam-oidc-provider --region "${REGION}" --cluster "${CLUSTER_NAME}" --approve; then
err "Failed to associate OIDC provider with cluster"
if [[ "$USE_EKSCTL" == "true" ]]; then
log "Cluster does not have OIDC issuer configured. Associating OIDC provider..."
if ! eksctl utils associate-iam-oidc-provider --region "${REGION}" --cluster "${CLUSTER_NAME}" --approve; then
err "Failed to associate OIDC provider with cluster"
fi
# Re-fetch issuer after association
issuer=$(aws eks describe-cluster --name "${CLUSTER_NAME}" --region "${REGION}" --query 'cluster.identity.oidc.issuer' --output text 2>/dev/null || true)
else
err "Cluster does not have OIDC issuer configured. Please associate the OIDC cluster issuer with the cluster."
fi
# Re-fetch issuer after association
issuer=$(aws eks describe-cluster --name "${CLUSTER_NAME}" --region "${REGION}" --query 'cluster.identity.oidc.issuer' --output text 2>/dev/null || true)
fi

log "Cluster OIDC issuer: ${issuer}"

# Check if IAM OIDC provider actually exists
log "Checking if IAM OIDC provider exists..."
local oidc_arn; oidc_arn="$(get_oidc_provider_arn || true)"

if [[ -z "$oidc_arn" ]]; then
log "OIDC provider ARN not found. Creating IAM OIDC provider..."
if ! eksctl utils associate-iam-oidc-provider --region "${REGION}" --cluster "${CLUSTER_NAME}" --approve; then
err "Failed to create IAM OIDC provider"
if [[ "$USE_EKSCTL" == "true" ]]; then
log "OIDC provider ARN not found. Creating IAM OIDC provider..."
if ! eksctl utils associate-iam-oidc-provider --region "${REGION}" --cluster "${CLUSTER_NAME}" --approve; then
err "Failed to create IAM OIDC provider"
fi
# Re-fetch ARN after creation
oidc_arn="$(get_oidc_provider_arn || true)"
else
err "OIDC provider ARN not found. Please create the IAM OIDC provider."
fi
# Re-fetch ARN after creation
oidc_arn="$(get_oidc_provider_arn || true)"
fi

# Verify OIDC provider exists in IAM
@@ -877,15 +888,19 @@
fi

if ! aws iam get-open-id-connect-provider --open-id-connect-provider-arn "$oidc_arn" >/dev/null 2>&1; then
log "IAM OIDC provider not found in IAM. Creating it now..."
if ! eksctl utils associate-iam-oidc-provider --region "${REGION}" --cluster "${CLUSTER_NAME}" --approve; then
err "Failed to create IAM OIDC provider even after retry"
fi
if [[ "$USE_EKSCTL" == "true" ]]; then
log "IAM OIDC provider not found in IAM. Creating it now..."
if ! eksctl utils associate-iam-oidc-provider --region "${REGION}" --cluster "${CLUSTER_NAME}" --approve; then
err "Failed to create IAM OIDC provider even after retry"
fi

# Final verification
sleep 5 # Give IAM a moment to propagate
if ! aws iam get-open-id-connect-provider --open-id-connect-provider-arn "$oidc_arn" >/dev/null 2>&1; then
err "OIDC provider ARN $oidc_arn not found in IAM after creation. IAM propagation may be delayed."
# Final verification
sleep 5 # Give IAM a moment to propagate
if ! aws iam get-open-id-connect-provider --open-id-connect-provider-arn "$oidc_arn" >/dev/null 2>&1; then
err "OIDC provider ARN $oidc_arn not found in IAM after creation. IAM propagation may be delayed."
fi
else
err "IAM OIDC provider not found in IAM. Please create the IAM OIDC provider."
fi
fi

Expand All @@ -904,19 +919,28 @@ install_ebs_csi_addon() {
fi
log "✓ IAM role ${EBS_IRSA_ROLE_NAME} exists"

# Use eksctl to create addon with IRSA
log "Creating aws-ebs-csi-driver addon..."
if ! eksctl create addon \
--cluster "${CLUSTER_NAME}" \
--name aws-ebs-csi-driver \
--service-account-role-arn "arn:aws:iam::${ACCOUNT_ID}:role/${EBS_IRSA_ROLE_NAME}" \
--force; then
warn "Addon creation command failed. Checking if addon already exists..."
# Check if addon exists (idempotent behavior)
if aws eks describe-addon --cluster-name "${CLUSTER_NAME}" --addon-name aws-ebs-csi-driver >/dev/null 2>&1; then
log "Addon already exists, continuing..."
# Check addon with IRSA
log "Checking for aws-ebs-csi-driver addon..."
if aws eks describe-addon --cluster-name "${CLUSTER_NAME}" --addon-name aws-ebs-csi-driver >/dev/null 2>&1; then
log "Addon already exists, continuing..."
else
if [[ "$USE_EKSCTL" == "true" ]]; then
log "Creating aws-ebs-csi-driver addon..."
if ! eksctl create addon \
--cluster "${CLUSTER_NAME}" \
--name aws-ebs-csi-driver \
--service-account-role-arn "arn:aws:iam::${ACCOUNT_ID}:role/${EBS_IRSA_ROLE_NAME}" \
--force; then
warn "Addon creation command failed. Checking if addon already exists..."
# Check if addon exists (idempotent behavior)
if aws eks describe-addon --cluster-name "${CLUSTER_NAME}" --addon-name aws-ebs-csi-driver >/dev/null 2>&1; then
log "Addon created, continuing..."
else
err "Failed to create EBS CSI addon. Check: aws eks describe-addon --cluster-name ${CLUSTER_NAME} --addon-name aws-ebs-csi-driver"
fi
fi
else
err "Failed to create EBS CSI addon. Check: aws eks describe-addon --cluster-name ${CLUSTER_NAME} --addon-name aws-ebs-csi-driver"
err "EBS CSI addon does not exist. Check: aws eks describe-addon --cluster-name ${CLUSTER_NAME} --addon-name aws-ebs-csi-driver"
fi
fi

@@ -987,15 +1011,19 @@ install_ebs_csi_addon() {
ensure_ebs_irsa_role() {
log "Ensuring EBS CSI IRSA role and service account..."

# Create IRSA for EBS CSI using eksctl (handles role creation, trust policy, and SA annotation)
eksctl create iamserviceaccount \
--cluster "${CLUSTER_NAME}" \
--namespace "${EBS_NS}" \
--name "${EBS_SA}" \
--role-name "${EBS_IRSA_ROLE_NAME}" \
--attach-policy-arn "arn:aws:iam::aws:policy/service-role/AmazonEBSCSIDriverPolicy" \
--approve \
--override-existing-serviceaccounts
if [[ "$USE_EKSCTL" == "true" ]]; then
# Create IRSA for EBS CSI using eksctl (handles role creation, trust policy, and SA annotation)
eksctl create iamserviceaccount \
--cluster "${CLUSTER_NAME}" \
--namespace "${EBS_NS}" \
--name "${EBS_SA}" \
--role-name "${EBS_IRSA_ROLE_NAME}" \
--attach-policy-arn "arn:aws:iam::aws:policy/service-role/AmazonEBSCSIDriverPolicy" \
--approve \
--override-existing-serviceaccounts
fi

wait_resource_exists "${EBS_NS}" sa "${EBS_SA}" 180

log "✓ EBS CSI IRSA role and service account configured"
}
@@ -1072,14 +1100,17 @@ get_autoscaler_version() {

install_cluster_autoscaler() {
log "Installing Cluster Autoscaler with IRSA..."
eksctl create iamserviceaccount \
--cluster "${CLUSTER_NAME}" \
--name "${AUTOSCALER_SA}" \
--namespace "${AUTOSCALER_NS}" \
--role-name "${AUTOSCALER_ROLE_NAME}" \
--attach-policy-arn arn:aws:iam::aws:policy/AutoScalingFullAccess \
--approve \
--override-existing-serviceaccounts

if [[ "$USE_EKSCTL" == "true" ]]; then
eksctl create iamserviceaccount \
--cluster "${CLUSTER_NAME}" \
--name "${AUTOSCALER_SA}" \
--namespace "${AUTOSCALER_NS}" \
--role-name "${AUTOSCALER_ROLE_NAME}" \
--attach-policy-arn arn:aws:iam::aws:policy/AutoScalingFullAccess \
--approve \
--override-existing-serviceaccounts
fi

helm repo add autoscaler https://kubernetes.github.io/autoscaler
helm repo update
@@ -1387,14 +1418,16 @@ ensure_irsa_for_sa() {

# Ensure SA+Role via eksctl (idempotent)
log "Ensuring IRSA (role ${role}) for ${ns}/${sa} with policy ${policy_arn}"
eksctl create iamserviceaccount \
--cluster "${CLUSTER_NAME}" \
--namespace "${ns}" \
--name "${sa}" \
--role-name "${role}" \
--attach-policy-arn "${policy_arn}" \
--approve \
--override-existing-serviceaccounts
if [[ "$USE_EKSCTL" == "true" ]]; then
eksctl create iamserviceaccount \
--cluster "${CLUSTER_NAME}" \
--namespace "${ns}" \
--name "${sa}" \
--role-name "${role}" \
--attach-policy-arn "${policy_arn}" \
--approve \
--override-existing-serviceaccounts
fi

wait_resource_exists "${ns}" sa "${sa}" 180

@@ -2419,10 +2452,14 @@ preflight_env() {
fi

pf_header "Tools"
for t in aws eksctl kubectl helm git jq; do
for t in aws kubectl helm git jq; do
if command -v "$t" >/dev/null 2>&1; then pf_ok "$t found ($(command -v $t))"; else pf_fail "$t not found in PATH"; fi
done

if [[ "$USE_EKSCTL" == "true" ]]; then
if command -v "eksctl" >/dev/null 2>&1; then pf_ok "eksctl found ($(command -v eksctl))"; else pf_fail "eksctl not found in PATH"; fi
fi

pf_header "AWS identity & region"
local acct region_id
acct="$(aws sts get-caller-identity --query Account --output text 2>/dev/null || true)"
@@ -2710,7 +2747,10 @@ reconcile_flow() {

# ---------- MAIN ----------
main_install() {
for t in aws eksctl kubectl helm git jq; do need "$t"; done
for t in aws kubectl helm git jq; do need "$t"; done
if [[ "$USE_EKSCTL" == "true" ]]; then
need eksctl
fi

# Load configuration from YAML file
load_config
@@ -2751,13 +2791,49 @@ main_install() {
}

usage() {
echo "Usage: $0 {install|delete|delete-full}"
echo "Usage: $0 {install|delete|delete-full} [OPTIONS]"
echo ""
echo "Commands:"
echo " install preflight + create/reconcile cluster and components (idempotent)"
echo " delete delete cluster and ALL AWS resources/roles/policies created by this script"
echo " delete-full uninstall CRs/operators then run comprehensive AWS cleanup"
}
echo ""
echo "Options:"
echo " --no-eksctl Disable the use of eksctl for cluster operations, only available with install command"
echo ""
echo "Examples:"
echo " $0 install --no-eksctl"
echo " $0 delete"
}

# ====== PARSE FLAGS ======
COMMAND=""
while [[ $# -gt 0 ]]; do
case "$1" in
--no-eksctl)
USE_EKSCTL=false
shift
;;
install|delete|delete-full)
COMMAND="$1"
shift
;;
--help|-h)
usage
exit 0
;;
*)
echo "Unknown option or command: $1"
usage
exit 1
;;
esac
done

# Default to install if no command specified
COMMAND="${COMMAND:-install}"

case "${1:-install}" in
case "$COMMAND" in
install)
main_install
;;