freeleaps-ops/docs/bootstrap-k8s-cluster.sh

395 lines
11 KiB
Bash
Raw Normal View History

2025-09-03 23:59:04 +00:00
#!/bin/bash
# Freeleaps Kubernetes Cluster Bootstrap Script
# This script bootstraps a complete Kubernetes cluster from Azure VMs
# Abort on the first unhandled command failure.
set -e

# --- Repository layout -------------------------------------------------------
# Every path is derived from the directory that contains this script, so the
# script can be invoked from any working directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
FREELEAPS_OPS_DIR="$(dirname "$SCRIPT_DIR")"
KUBESPRAY_DIR="${FREELEAPS_OPS_DIR}/3rd/kubespray"
INVENTORY_FILE="${FREELEAPS_OPS_DIR}/cluster/ansible/manifests/inventory.ini"
MANIFESTS_DIR="${FREELEAPS_OPS_DIR}/cluster/manifests"
BIN_DIR="${FREELEAPS_OPS_DIR}/cluster/bin"

# --- Terminal colours --------------------------------------------------------
# Stored as literal backslash sequences; they only become real escape
# characters when a consumer expands them (e.g. `echo -e`).
NC='\033[0m' # No Color (reset)
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
# Print an informational message to stdout, prefixed with a blue [INFO] tag.
# Arguments: $1 - message text (backslash escapes in it are expanded)
# Globals:   BLUE, NC (read)
print_status() {
  printf '%b\n' "${BLUE}[INFO]${NC} $1"
}
# Print a success message to stdout, prefixed with a green [SUCCESS] tag.
# Arguments: $1 - message text (backslash escapes in it are expanded)
# Globals:   GREEN, NC (read)
print_success() {
  printf '%b\n' "${GREEN}[SUCCESS]${NC} $1"
}
# Print a warning, prefixed with a yellow [WARNING] tag.
# Warnings are diagnostics, so they are written to stderr; this keeps stdout
# clean for anyone capturing or piping the script's regular output.
# Arguments: $1 - message text (backslash escapes in it are expanded)
# Globals:   YELLOW, NC (read)
print_warning() {
  echo -e "${YELLOW}[WARNING]${NC} ${1-}" >&2
}
# Print an error, prefixed with a red [ERROR] tag.
# Errors are written to stderr so they remain visible when stdout is
# redirected or captured.
# Arguments: $1 - message text (backslash escapes in it are expanded)
# Globals:   RED, NC (read)
print_error() {
  echo -e "${RED}[ERROR]${NC} ${1-}" >&2
}
# Verify that the expected repository files and all required CLI tools exist
# before any long-running step starts.
# Globals: INVENTORY_FILE, KUBESPRAY_DIR (read)
# Outputs: progress and error messages via the print_* helpers
# Exits:   1 if the inventory file, the Kubespray directory, or any required
#          tool is missing
check_prerequisites() {
  print_status "Checking prerequisites..."

  # The inventory path is derived from the script location; if it is missing
  # the repository layout is not what this script expects.
  if [[ ! -f "$INVENTORY_FILE" ]]; then
    print_error "Inventory file not found: $INVENTORY_FILE"
    print_error "Please run this script from the freeleaps-ops/docs directory"
    exit 1
  fi

  if [[ ! -d "$KUBESPRAY_DIR" ]]; then
    print_error "Kubespray directory not found: $KUBESPRAY_DIR"
    exit 1
  fi

  # "executable:label" pairs. The label is what is shown to the user (the
  # package name may differ from the binary, e.g. az -> azure-cli).
  # ansible-playbook and ssh are invoked by later steps of this script, so
  # they are checked up front rather than failing mid-bootstrap.
  local -a required_tools=(
    "ansible:ansible"
    "ansible-playbook:ansible-playbook"
    "az:azure-cli"
    "kubectl:kubectl"
    "ssh:ssh"
  )
  local -a missing_tools=()
  local entry
  for entry in "${required_tools[@]}"; do
    if ! command -v "${entry%%:*}" &> /dev/null; then
      missing_tools+=("${entry#*:}")
    fi
  done

  if [[ ${#missing_tools[@]} -gt 0 ]]; then
    print_error "Missing required tools: ${missing_tools[*]}"
    print_warning "Please install missing tools before proceeding"
    exit 1
  fi

  print_success "All prerequisites are met"
}
# Check that every host listed in the Ansible inventory is a running Azure VM
# and offer to start the ones that are not.
# Globals:  INVENTORY_FILE (read)
#           AZURE_RESOURCE_GROUP (read, optional; defaults to "k8s")
# Inputs:   one keypress from stdin per stopped VM (y/Y starts it)
# Outputs:  progress messages via the print_* helpers
# Returns:  0 normally; under `set -e` a failing `az vm start` aborts the script
verify_azure_vms() {
  print_status "Verifying Azure VMs..."

  local resource_group="${AZURE_RESOURCE_GROUP:-k8s}"
  local -a vms=()
  local line vm_name
  local section=""
  local seen=" " # space-delimited list of names already collected

  # Collect unique host names from the INI inventory. Only host entries count:
  # group headers, comments, "[group:vars]" assignments and "[group:children]"
  # group references are skipped, and a host that is listed in several groups
  # is reported once.
  while IFS= read -r line || [[ -n "$line" ]]; do
    # Trim leading and trailing whitespace (also drops a trailing CR).
    line="${line#"${line%%[![:space:]]*}"}"
    line="${line%"${line##*[![:space:]]}"}"

    [[ -z "$line" || "$line" == "#"* || "$line" == ";"* ]] && continue

    if [[ "$line" == "["*"]" ]]; then
      section="${line#\[}"
      section="${section%\]}"
      continue
    fi
    [[ "$section" == *:vars || "$section" == *:children ]] && continue

    vm_name="${line%%[[:space:]]*}"
    [[ "$vm_name" =~ ^[a-zA-Z0-9][a-zA-Z0-9._-]*$ ]] || continue
    [[ "$seen" == *" $vm_name "* ]] && continue
    seen+="$vm_name "
    vms+=("$vm_name")
  done < "$INVENTORY_FILE"

  if [[ ${#vms[@]} -eq 0 ]]; then
    print_warning "No VMs found in inventory: $INVENTORY_FILE"
    return 0
  fi
  print_status "Found VMs in inventory: ${vms[*]}"

  local vm power_state
  for vm in "${vms[@]}"; do
    # powerState is part of the instance view and is only returned when
    # --show-details is given. A failed lookup (unknown VM, not logged in) is
    # treated as "not running" instead of aborting the script.
    power_state=$(az vm show --show-details --resource-group "$resource_group" \
      --name "$vm" --query "powerState" -o tsv 2> /dev/null) || power_state=""

    if [[ "$power_state" == "VM running" ]]; then
      print_success "VM $vm is running"
      continue
    fi

    print_warning "VM $vm is not running (state: ${power_state:-unknown})"
    REPLY=""
    # EOF on stdin (non-interactive run) counts as "no".
    read -p "Do you want to start VM $vm? (y/N): " -n 1 -r || true
    echo
    if [[ $REPLY =~ ^[Yy]$ ]]; then
      print_status "Starting VM $vm..."
      az vm start --resource-group "$resource_group" --name "$vm"
      # Grace period after the start request before later steps try to connect.
      sleep 30
    fi
  done
}
# Ping every inventory host through Ansible to confirm SSH access works.
# Globals: INVENTORY_FILE (read)
# Effects: changes the working directory to the inventory's directory and
#          runs `ansible` from there (with -kK, so it prompts for passwords)
# Exits:   1 when the ping fails
test_connectivity() {
  print_status "Testing connectivity to all VMs..."
  cd "$(dirname "$INVENTORY_FILE")"

  if ! ansible -i inventory.ini all -m ping -kK; then
    print_error "Connectivity test failed"
    print_warning "Please check:"
    local hint
    for hint in \
      "1. VMs are running" \
      "2. Network security groups allow SSH (port 22)" \
      "3. SSH credentials are correct"; do
      print_warning "$hint"
    done
    exit 1
  fi

  print_success "Connectivity to all VMs verified"
}
# Install Kubernetes on the inventory hosts by running Kubespray's cluster.yml.
# Globals: KUBESPRAY_DIR (read)
# Effects: changes the working directory to KUBESPRAY_DIR; the inventory is
#          addressed relative to that directory. ansible-playbook runs with
#          -kK -b (password prompts, privilege escalation).
# Exits:   1 when the playbook fails
bootstrap_cluster() {
  print_status "Bootstrapping Kubernetes cluster..."
  cd "$KUBESPRAY_DIR"

  print_status "Running Kubespray cluster installation..."
  print_warning "This process may take 15-30 minutes..."

  if ! ansible-playbook -i ../../cluster/ansible/manifests/inventory.ini ./cluster.yml -kK -b; then
    print_error "Cluster bootstrap failed"
    print_warning "Check the Ansible output for errors"
    exit 1
  fi

  print_success "Kubernetes cluster bootstrapped successfully"
}
# Copy the cluster admin kubeconfig from the first control-plane node to
# $HOME/.kube/config.
# Globals:  INVENTORY_FILE (read)
#           K8S_SSH_USER (read, optional; defaults to "wwwadmin@mathmast.com")
# Effects:  creates $HOME/.kube if needed; an existing config is kept as a
#           timestamped "config.bak.*" copy before being replaced; the new
#           file is installed with mode 600.
# Exits:    1 if no control-plane host is found in the inventory or the
#           download fails (the existing kubeconfig is left untouched)
get_kubeconfig() {
  print_status "Retrieving kubeconfig..."

  local ssh_user="${K8S_SSH_USER:-wwwadmin@mathmast.com}"
  local kube_dir="$HOME/.kube"
  local kubeconfig="$kube_dir/config"

  # Take the first host listed under [kube_control_plane] and resolve its
  # ansible_host value from wherever that host is defined in the inventory
  # (its own group line or e.g. the [all] section). Without an ansible_host
  # the inventory name itself is used as the address.
  local master_ip
  master_ip=$(awk '
    /^[[:space:]]*(#|;|$)/ { next }
    /^[[:space:]]*\[/ {
      in_cp = ($0 ~ /^[[:space:]]*\[kube_control_plane\][[:space:]]*$/)
      next
    }
    {
      host = $1
      for (i = 2; i <= NF; i++) {
        if ($i ~ /^ansible_host=/ && !(host in addrs)) {
          addrs[host] = substr($i, 14)
        }
      }
      if (in_cp && first == "") first = host
    }
    END {
      if (first != "") print ((first in addrs) ? addrs[first] : first)
    }
  ' "$INVENTORY_FILE") || master_ip=""

  if [[ -z "$master_ip" ]]; then
    print_error "Could not find master node IP in inventory"
    exit 1
  fi
  print_status "Getting kubeconfig from master node: $master_ip"

  mkdir -p "$kube_dir"

  # Download into a private temp file first: redirecting straight into the
  # real config would truncate it before ssh has even connected.
  local tmp_config
  if ! tmp_config=$(mktemp "$kube_dir/config.XXXXXX"); then
    print_error "Failed to retrieve kubeconfig"
    exit 1
  fi

  if ! ssh "${ssh_user}@${master_ip}" "sudo cat /etc/kubernetes/admin.conf" > "$tmp_config" \
    || [[ ! -s "$tmp_config" ]]; then
    rm -f -- "$tmp_config"
    print_error "Failed to retrieve kubeconfig"
    exit 1
  fi

  if [[ -f "$kubeconfig" ]]; then
    local backup
    backup="${kubeconfig}.bak.$(date +%Y%m%d%H%M%S)"
    cp -p -- "$kubeconfig" "$backup"
    print_warning "Existing kubeconfig backed up to $backup"
  fi

  # The file holds cluster-admin credentials: owner-only access.
  chmod 600 "$tmp_config"
  mv -f -- "$tmp_config" "$kubeconfig"
  print_success "Kubeconfig retrieved successfully"
}
# Wait until the API server answers, then report node readiness and the state
# of the core control-plane pods.
# Outputs: kubectl listings on stdout plus messages via the print_* helpers
# Exits:   1 if `kubectl get nodes` never succeeds within 30 attempts
#          (30 s apart). Nodes or pods that are not ready only produce
#          warnings.
verify_cluster() {
  print_status "Verifying cluster installation..."

  # Poll until the cluster answers.
  local max_attempts=30
  local attempt=1
  while [[ $attempt -le $max_attempts ]]; do
    if kubectl get nodes &> /dev/null; then
      print_success "Cluster is accessible"
      break
    fi
    print_status "Waiting for cluster to be ready... (attempt $attempt/$max_attempts)"
    sleep 30
    attempt=$((attempt + 1))
  done

  if [[ $attempt -gt $max_attempts ]]; then
    print_error "Cluster verification failed"
    print_warning "Troubleshooting steps:"
    print_warning "1. Check VM resources (CPU, memory)"
    print_warning "2. Check network connectivity between nodes"
    print_warning "3. Check Ansible logs for errors"
    print_warning "4. Verify inventory file configuration"
    exit 1
  fi

  print_status "Checking node status..."
  kubectl get nodes

  # Count nodes from a single listing. The STATUS column (field 2) must be
  # exactly "Ready" or start with "Ready," (e.g. "Ready,SchedulingDisabled");
  # a plain substring match would also count "NotReady".
  local node_lines total_nodes ready_nodes
  node_lines=$(kubectl get nodes --no-headers) || node_lines=""
  total_nodes=$(awk 'NF { n++ } END { print n + 0 }' <<< "$node_lines")
  ready_nodes=$(awk '$2 == "Ready" || $2 ~ /^Ready,/ { n++ } END { print n + 0 }' <<< "$node_lines")

  if [[ $total_nodes -gt 0 && $ready_nodes -eq $total_nodes ]]; then
    print_success "All $total_nodes nodes are ready"
  else
    print_warning "Only $ready_nodes/$total_nodes nodes are ready"
    kubectl get nodes
  fi

  print_status "Checking system pods..."
  kubectl get pods -n kube-system

  # Give each control-plane component up to 20 checks, 10 s apart.
  print_status "Waiting for critical system pods..."
  local -a critical_pods=("kube-apiserver" "kube-controller-manager" "kube-scheduler" "etcd")
  local max_pod_attempts=20
  local pod_prefix pod_attempt
  for pod_prefix in "${critical_pods[@]}"; do
    pod_attempt=1
    while [[ $pod_attempt -le $max_pod_attempts ]]; do
      if kubectl get pods -n kube-system | grep -q "$pod_prefix.*Running"; then
        print_success "$pod_prefix is running"
        break
      fi
      if [[ $pod_attempt -eq $max_pod_attempts ]]; then
        print_warning "$pod_prefix is not running"
        # Diagnostic listing only: grep exits 1 when the pod does not exist
        # at all, which must not abort the script under `set -e`.
        kubectl get pods -n kube-system | grep "$pod_prefix" || true
        break
      fi
      sleep 10
      pod_attempt=$((pod_attempt + 1))
    done
  done

  print_status "Checking cluster info..."
  kubectl cluster-info
}
# Apply the infrastructure manifests, one component directory at a time and
# in a fixed order, pausing 30 s after each apply.
# Globals: MANIFESTS_DIR (read)
# Effects: changes the working directory to MANIFESTS_DIR; runs
#          `kubectl apply -f <component>/` for every directory that exists.
#          Missing component directories are reported and skipped.
deploy_infrastructure() {
  print_status "Deploying infrastructure components..."
  cd "$MANIFESTS_DIR"

  local component
  for component in \
    "freeleaps-controls-system" \
    "freeleaps-devops-system" \
    "freeleaps-monitoring-system" \
    "freeleaps-logging-system" \
    "freeleaps-data-platform"; do
    if [[ ! -d "$component" ]]; then
      print_warning "Component directory not found: $component"
      continue
    fi

    print_status "Deploying $component..."
    kubectl apply -f "$component/"

    # Fixed pause before moving on to the next component.
    print_status "Waiting for $component to stabilize..."
    sleep 30
  done

  print_success "Infrastructure deployment completed"
}
# Run the cluster authenticator helper from BIN_DIR when it is present.
# Globals: BIN_DIR (read)
# Effects: changes the working directory to BIN_DIR and executes
#          `./freeleaps-cluster-authenticator auth`; when the helper file is
#          absent only warnings are printed.
setup_authentication() {
  print_status "Setting up authentication..."
  cd "$BIN_DIR"

  if [[ ! -f "freeleaps-cluster-authenticator" ]]; then
    print_warning "Authentication script not found"
    print_warning "Please run authentication setup manually"
    return
  fi

  print_status "Running authentication setup..."
  ./freeleaps-cluster-authenticator auth
}
# Print a closing summary: node list, kube-system pods, the infrastructure
# pods matching argocd/cert-manager/prometheus/grafana, and follow-up steps.
# Globals: BIN_DIR (read, shown in the follow-up instructions)
# Outputs: summary on stdout; a warning via print_warning when no
#          infrastructure pods are listed
display_final_status() {
  print_success "Kubernetes cluster bootstrap completed!"
  echo
  echo "=== Cluster Status ==="
  kubectl get nodes
  echo
  echo "=== System Pods ==="
  kubectl get pods -n kube-system
  echo
  echo "=== Infrastructure Status ==="
  # grep exits 1 when nothing matches (e.g. right after --bootstrap, before
  # any infrastructure is deployed). Without the guard `set -e` would end the
  # script here and the next-steps section would never be printed.
  kubectl get pods --all-namespaces \
    | grep -E "(argocd|cert-manager|prometheus|grafana)" \
    || print_warning "No argocd/cert-manager/prometheus/grafana pods found"
  echo
  echo "=== Next Steps ==="
  echo "1. Verify all components are running: kubectl get pods --all-namespaces"
  echo "2. Access ArgoCD: kubectl port-forward svc/argocd-server -n freeleaps-devops-system 8080:80"
  echo "3. Access Grafana: kubectl port-forward svc/kube-prometheus-stack-grafana -n freeleaps-monitoring-system 3000:80"
  echo "4. Setup authentication: cd $BIN_DIR && ./freeleaps-cluster-authenticator auth"
  echo "5. Deploy applications via ArgoCD"
}
# Full bootstrap: print the banner, then run every stage in order, from
# prerequisite checks through to the final status summary.
main() {
  printf '%s\n' \
    "==========================================" \
    "Freeleaps Kubernetes Cluster Bootstrap" \
    "==========================================" \
    ""

  # Stages run strictly in this order.
  local stage
  for stage in \
    check_prerequisites \
    verify_azure_vms \
    test_connectivity \
    bootstrap_cluster \
    get_kubeconfig \
    verify_cluster \
    deploy_infrastructure \
    setup_authentication \
    display_final_status; do
    "$stage"
  done
}
# Command-line dispatch. With no arguments the full bootstrap runs; otherwise
# only the first argument is inspected and selects a reduced mode.
if [[ $# -gt 0 ]]; then
  case "$1" in
    -h | --help)
      printf '%s\n' \
        "Usage: $0 [OPTIONS]" \
        "" \
        "Options:" \
        " --help, -h Show this help message" \
        " --verify Only verify prerequisites and connectivity" \
        " --bootstrap Only bootstrap the cluster (skip infrastructure)" \
        "" \
        "This script bootstraps a complete Kubernetes cluster from Azure VMs."
      exit 0
      ;;
    --verify)
      # Checks only; nothing is installed.
      for stage in check_prerequisites verify_azure_vms test_connectivity; do
        "$stage"
      done
      print_success "Verification completed successfully"
      ;;
    --bootstrap)
      # Everything up to a verified cluster, without the infrastructure
      # deployment and authentication stages.
      for stage in check_prerequisites verify_azure_vms test_connectivity \
        bootstrap_cluster get_kubeconfig verify_cluster; do
        "$stage"
      done
      print_success "Cluster bootstrap completed"
      ;;
    *)
      print_error "Unknown option: $1"
      echo "Use --help for usage information"
      exit 1
      ;;
  esac
else
  main
fi