I have deployed three servers in Azure using a Terraform script, and I want to set them up as one master node and two workers. I am creating the cluster with kubeadm.
My problem is that after I configure the master node and then reboot it, all of the config files disappear for some reason.
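To be precise about "disappear": what I check after a reboot looks roughly like this (these are the default kubeadm and kubelet locations; I have not customised any paths):
# Post-reboot checks: services back up, kubeadm files still present, swap still off
systemctl status kubelet containerd --no-pager
ls -la /etc/kubernetes /etc/kubernetes/manifests
ls -la $HOME/.kube
swapon --show
sudo journalctl -u kubelet --no-pager | tail -n 50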
The commands I have run on the master node to install and initialise the cluster are these:
sudo apt update
# Disable swap memory
sudo swapoff -a
sudo sed -i '/ swap / s/^/#/' /etc/fstab
# Load kernel modules and set sysctl parameters
cat <<EOF | sudo tee /etc/modules-load.d/k8s.conf
overlay
br_netfilter
EOF
sudo modprobe overlay
sudo modprobe br_netfilter
cat <<EOF | sudo tee /etc/sysctl.d/k8s.conf
net.bridge.bridge-nf-call-iptables = 1
net.bridge.bridge-nf-call-ip6tables = 1
net.ipv4.ip_forward = 1
EOF
sudo sysctl --system
# Install Docker and containerd
sudo apt-get install ca-certificates curl
sudo install -m 0755 -d /etc/apt/keyrings
sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
sudo chmod a+r /etc/apt/keyrings/docker.asc
echo \
  "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
  $(. /etc/os-release && echo "${UBUNTU_CODENAME:-$VERSION_CODENAME}") stable" | \
  sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
sudo apt-get update
sudo apt-get install docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin -y
# Set up containerd
sudo mkdir -p /etc/containerd
containerd config default | sudo tee /etc/containerd/config.toml
sudo sed -i 's/SystemdCgroup = false/SystemdCgroup = true/g' /etc/containerd/config.toml
sudo systemctl restart containerd
sudo systemctl enable containerd
# Install Kubernetes
sudo apt-get install -y apt-transport-https ca-certificates curl gnupg
curl -fsSL https://pkgs.k8s.io/core:/stable:/v1.33/deb/Release.key | sudo gpg --dearmor -o /etc/apt/keyrings/kubernetes-apt-keyring.gpg
sudo chmod 644 /etc/apt/keyrings/kubernetes-apt-keyring.gpg
echo 'deb [signed-by=/etc/apt/keyrings/kubernetes-apt-keyring.gpg] https://pkgs.k8s.io/core:/stable:/v1.33/deb/ /' | sudo tee /etc/apt/sources.list.d/kubernetes.list
sudo chmod 644 /etc/apt/sources.list.d/kubernetes.list
sudo apt-get update
# Install the tools
sudo apt install -y kubelet kubeadm kubectl
sudo apt-mark hold kubelet kubeadm kubectl
sudo systemctl enable kubelet
sudo systemctl start kubelet
# Init the master node
sudo kubeadm init --pod-network-cidr=10.244.0.0/16
# Configure kubectl for the regular user
mkdir -p $HOME/.kube
sudo cp -i /etc/kubernetes/admin.conf $HOME/.kube/config
sudo chown $(id -u):$(id -g) $HOME/.kube/config
# Deploy a pod network
kubectl apply -f https://raw.githubusercontent.com/coreos/flannel/master/Documentation/kube-flannel.yml
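Right after this setup, and before any reboot, the cluster comes up fine; the checks I use to confirm that are just the usual ones:
# Sanity checks after kubeadm init and the flannel apply
kubectl get nodes -o wide
kubectl get pods -A -o wide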
And this is what I have in my Terraform main.tf file (the provider and variables are in other files):
resource "azurerm_resource_group" "rg" {
name = "k8s-cluster-rg"
location = var.location
}
resource "azurerm_virtual_network" "vnet" {
name = "k8s-vnet"
address_space = ["10.0.0.0/16"]
location = var.location
resource_group_name = azurerm_resource_group.rg.name
}
resource "azurerm_subnet" "subnet" {
name = "k8s-subnet"
resource_group_name = azurerm_resource_group.rg.name
virtual_network_name = azurerm_virtual_network.vnet.name
address_prefixes = ["10.0.1.0/24"]
}
resource "azurerm_public_ip" "pip" {
count = 3
name = "k8s-pip-${count.index}"
location = var.location
resource_group_name = azurerm_resource_group.rg.name
allocation_method = "Static"
sku = "Basic"
}
resource "azurerm_network_security_group" "nsg" {
name = "k8s-nsg"
location = var.location
resource_group_name = azurerm_resource_group.rg.name
security_rule {
name = "All-traffic"
priority = 1001
direction = "Inbound"
access = "Allow"
protocol = "Tcp"
source_port_range = "22"
destination_port_range = ""
source_address_prefix = ""
destination_address_prefix = "*"
}
}
resource "azurerm_network_interface" "nic" {
count = 3
name = "k8s-nic-${count.index}"
location = var.location
resource_group_name = azurerm_resource_group.rg.name
ip_configuration {
name = "ipconfig"
subnet_id = azurerm_subnet.subnet.id
private_ip_address_allocation = "Static"
private_ip_address = "10.0.1.${count.index + 4}"
public_ip_address_id = azurerm_public_ip.pip[count.index].id
}
}
resource "azurerm_network_interface_security_group_association" "nic_nsg_assoc" {
count = 3
network_interface_id = azurerm_network_interface.nic[count.index].id
network_security_group_id = azurerm_network_security_group.nsg.id
}
resource "azurerm_linux_virtual_machine" "vms" {
count = 3
name = "k8s-vm-${count.index}"
resource_group_name = azurerm_resource_group.rg.name
location = var.location
size = "Standard_B2s"
admin_username = var.admin_username
network_interface_ids = [azurerm_network_interface.nic[count.index].id]
admin_ssh_key {
username = var.admin_username
public_key = file(var.ssh_public_key_path)
}
os_disk {
name = "osdisk-${count.index}"
caching = "ReadWrite"
storage_account_type = "Premium_LRS"
}
source_image_reference {
publisher = "Canonical"
offer = "0001-com-ubuntu-server-jammy"
sku = "22_04-lts-gen2"
version = "latest"
}
disable_password_authentication = true
}
There has also been one case where I rebooted the master and kubectl get nodes still worked, but as soon as I joined a worker node everything broke and the configuration files were gone again.
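For the join itself I just use the standard command that kubeadm init prints, run on each worker. Roughly like this (token and hash are placeholders here, and I am assuming the first VM, 10.0.1.4, as the master):
# Run on each worker with the real token/hash from kubeadm init
sudo kubeadm join 10.0.1.4:6443 --token <token> --discovery-token-ca-cert-hash sha256:<hash>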
At first I suspected the issue was the private IP of the instances, because it was originally dynamically assigned, but even after I switched the allocation to static I still got the same result.
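To rule the IP out I compare the address inside the VM before and after a reboot against what Terraform assigns (10.0.1.4 to 10.0.1.6):
# The private IP should stay fixed across reboots
ip -4 addr show
hostname -I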