#!/bin/bash
# Copyright 2016 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This script performs disaster recovery of etcd from the backup data.
# Assumptions:
# - backup was done using the etcdctl command (illustrative invocations below):
#   a) in case of etcd2
#      $ etcdctl backup --data-dir=<data-dir>
#      produced .snap and .wal files
#   b) in case of etcd3
#      $ etcdctl --endpoints=<address> snapshot save <file>
#      produced a .db file
# - version.txt file is in the current directory (if it isn't, it will be
#   defaulted to "2.2.1/etcd2"). Based on this file, the script will
#   decide to which version we are restoring (procedures are different
#   for etcd2 and etcd3).
# - in case of etcd2 - *.snap and *.wal files are in the current directory
# - in case of etcd3 - a *.db file is in the current directory
# - the script is run as root
# - for event etcd, we only support clearing it - to do it, you need to
#   set the RESET_EVENT_ETCD=true env var.
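#
# For reference, a backup matching the assumptions above could have been taken
# with commands along these lines (the data dir, endpoint, and file name are
# illustrative, not something this script reads):
#   etcd2:  etcdctl backup --data-dir=/var/etcd/data --backup-dir=/var/tmp/etcd-backup
#   etcd3:  ETCDCTL_API=3 etcdctl --endpoints=http://127.0.0.1:2379 snapshot save snapshot.db
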
set -o errexit
set -o nounset
set -o pipefail

# Version file contains information about current version in the format:
# <etcd version>/<etcd api mode> (e.g. "3.0.12/etcd3").
#
# If the file doesn't exist we assume "2.2.1/etcd2" configuration is
# the current one and create a file with such configuration.
# The restore procedure is chosen based on this information.
VERSION_FILE="version.txt"

# Make it possible to overwrite version file (or default version)
# with VERSION_CONTENTS env var.
if [ -n "${VERSION_CONTENTS:-}" ]; then
  echo "${VERSION_CONTENTS}" > "${VERSION_FILE}"
fi
if [ ! -f "${VERSION_FILE}" ]; then
  echo "2.2.1/etcd2" > "${VERSION_FILE}"
fi
VERSION_CONTENTS="$(cat "${VERSION_FILE}")"
ETCD_VERSION="$(echo "${VERSION_CONTENTS}" | cut -d '/' -f 1)"
ETCD_API="$(echo "${VERSION_CONTENTS}" | cut -d '/' -f 2)"
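
# Example invocation overriding the version file via VERSION_CONTENTS
# (the script name and the version string are illustrative):
#   VERSION_CONTENTS="3.0.12/etcd3" ./restore-from-backup.sh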

# Name is used only in etcd3 mode, to appropriately set the metadata
# for the etcd data.
# NOTE: NAME HAS TO BE EQUAL TO WHAT WE USE IN --name flag when starting etcd.
NAME="${NAME:-etcd-$(hostname)}"

# Ports on which etcd and event etcd are exposed.
etcd_port=2379
event_etcd_port=4002

# Wait until the etcd instance exposed on the given port is up.
wait_for_etcd_up() {
  port=$1
  # TODO: As of 3.0.x etcd versions, all 2.* and 3.* versions return
  # {"health": "true"} on the /health endpoint when healthy.
  # However, we should come up with a regex for it to avoid future breakage.
  health_ok="{\"health\": \"true\"}"
  for i in $(seq 120); do
    # TODO: Is it enough to look at the /health endpoint?
    health=$(curl --silent "http://127.0.0.1:${port}/health")
    if [ "${health}" == "${health_ok}" ]; then
      return 0
    fi
    sleep 1
  done
  return 1
}
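
# For illustration, the probe above amounts to the following (port and
# healthy-member response shown as examples):
#   $ curl --silent http://127.0.0.1:2379/health
#   {"health": "true"}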

# Wait until apiserver is up.
wait_for_cluster_healthy() {
  for i in $(seq 120); do
    cs_status=$(kubectl get componentstatuses -o template --template='{{range .items}}{{with index .conditions 0}}{{.type}}:{{.status}}{{end}}{{"\n"}}{{end}}') || true
    componentstatuses=$(echo "${cs_status}" | grep -c 'Healthy:') || true
    healthy=$(echo "${cs_status}" | grep -c 'Healthy:True') || true
    if [ "${componentstatuses}" -eq "${healthy}" ]; then
      return 0
    fi
    sleep 1
  done
  return 1
}
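
# The template above prints one "<condition type>:<status>" line per component,
# so output for a healthy cluster looks like (illustrative):
#   Healthy:True
#   Healthy:True
#   Healthy:True
# The function succeeds only once every counted "Healthy:" line is "Healthy:True".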

# Wait until etcd and apiserver pods are down.
wait_for_etcd_and_apiserver_down() {
  for i in $(seq 120); do
    etcd=$(docker ps | grep etcd | grep -v etcd-empty-dir | grep -v etcd-monitor | wc -l)
    apiserver=$(docker ps | grep apiserver | wc -l)
    # TODO: Theoretically it is possible that apiserver and/or etcd
    # are currently down, but Kubelet is now restarting them and they
    # will reappear again. We should avoid that.
    if [ "${etcd}" -eq "0" ] && [ "${apiserver}" -eq "0" ]; then
      return 0
    fi
    sleep 1
  done
  return 1
}
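
# Note: the pipelines above count matching `docker ps` lines; etcd-empty-dir
# and etcd-monitor containers are excluded from the etcd count, presumably
# because they are not expected to stop during the restore.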

# Move the manifest files to stop etcd and kube-apiserver
# while we swap the data out from under them.
MANIFEST_DIR="/etc/kubernetes/manifests"
MANIFEST_BACKUP_DIR="/etc/kubernetes/manifests-backups"
mkdir -p "${MANIFEST_BACKUP_DIR}"
echo "Moving etcd(s) & apiserver manifest files to ${MANIFEST_BACKUP_DIR}"
# If those files were already moved (e.g. during a previous
# run of this script), don't fail on it.
mv "${MANIFEST_DIR}/kube-apiserver.manifest" "${MANIFEST_BACKUP_DIR}" || true
mv "${MANIFEST_DIR}/etcd.manifest" "${MANIFEST_BACKUP_DIR}" || true
mv "${MANIFEST_DIR}/etcd-events.manifest" "${MANIFEST_BACKUP_DIR}" || true

# Wait for the pods to be stopped.
echo "Waiting for etcd and kube-apiserver to be down"
if ! wait_for_etcd_and_apiserver_down; then
  # Couldn't kill etcd and apiserver.
  echo "Downing etcd and apiserver failed"
  exit 1
fi

# Create the sort of directory structure that etcd expects.
# If this directory already exists, remove it.
BACKUP_DIR="/var/tmp/backup"
rm -rf "${BACKUP_DIR}"
if test $(ETCD_API) == "etcd2" {
echo "Preparing etcd backup data for restore"
# In v2 mode, we simply copy both snap and wal files to a newly created
# directory. After that, we start etcd with --force-new-cluster option
# that (according to the etcd documentation) is required to recover from
# a backup.
echo "Copying data to $(BACKUP_DIR) and restoring there"
mkdir -p "$(BACKUP_DIR)/member/snap"
mkdir -p "$(BACKUP_DIR)/member/wal"
# If the cluster is relatively new, there can be no .snap file.
mv *.snap "$(BACKUP_DIR)/member/snap/" || true
mv *.wal "$(BACKUP_DIR)/member/wal/"
# TODO(jsz): This won't work with HA setups (e.g. do we need to set --name flag)?
echo "Starting etcd $(ETCD_VERSION) to restore data"
setglobal image = $[docker run -d -v $(BACKUP_DIR):/var/etcd/data \
--net=host -p $(etcd_port):$(etcd_port) \
"gcr.io/google_containers/etcd:$(ETCD_VERSION)" /bin/sh -c \
"/usr/local/bin/etcd --data-dir /var/etcd/data --force-new-cluster]
if test "$Status" -ne "0" {
echo "Docker container didn't started correctly"
exit 1
}
echo "Container $(image) created, waiting for etcd to report as healthy"
if ! wait_for_etcd_up $(etcd_port) {
echo "Etcd didn't come back correctly"
exit 1
}
# Kill that etcd instance.
echo "Etcd healthy - killing $(image) container"
docker kill $(image)
elif [ "${ETCD_API}" == "etcd3" ]; then
  echo "Preparing etcd snapshot for restore"
  mkdir -p "${BACKUP_DIR}"
  echo "Copying data to ${BACKUP_DIR} and restoring there"
  number_files=$(find . -maxdepth 1 -type f -name "*.db" | wc -l)
  if [ "${number_files}" -ne "1" ]; then
    echo "Incorrect number of *.db files - expected 1"
    exit 1
  fi
  mv *.db "${BACKUP_DIR}/"
  snapshot="$(ls "${BACKUP_DIR}")"

  # Run the etcdctl snapshot restore command and wait until it is finished.
  # The --name flag has to match the --name setting in the etcd manifest file,
  # otherwise etcd won't pick up the restored data.
  # TODO(jsz): This command may not work in case of HA.
  if ! image=$(docker run -d -v "${BACKUP_DIR}":/var/tmp/backup --env ETCDCTL_API=3 \
    "gcr.io/google_containers/etcd:${ETCD_VERSION}" /bin/sh -c \
    "/usr/local/bin/etcdctl snapshot restore ${BACKUP_DIR}/${snapshot} --name ${NAME} --initial-cluster ${NAME}=http://localhost:2380; mv /${NAME}.etcd/member /var/tmp/backup/"); then
    echo "Docker container didn't start correctly"
    exit 1
  fi
  echo "Prepare container exit code: $(docker wait "${image}")"
  rm -f "${BACKUP_DIR}/${snapshot}"
fi
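
# In the etcd3 case, ${BACKUP_DIR} should now hold the restored data in etcd's
# native layout, roughly (illustrative paths):
#   /var/tmp/backup/member/snap/db
#   /var/tmp/backup/member/wal/<...>.wal
# plus the version.txt copied below.
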
# Also copy the version.txt file.
cp "${VERSION_FILE}" "${BACKUP_DIR}"

# Find out if we are running GCI vs CVM.
export CVM=$(curl "http://metadata/computeMetadata/v1/instance/attributes/" -H "Metadata-Flavor: Google" |& grep -q gci; echo $?)
if [[ "$CVM" == "1" ]]; then
  export MNT_DISK="/mnt/master-pd"
else
  export MNT_DISK="/mnt/disks/master-pd"
fi

# Save the corrupted data (clean the directory if it is already non-empty).
rm -rf "${MNT_DISK}/var/etcd-corrupted"
mkdir -p "${MNT_DISK}/var/etcd-corrupted"
echo "Saving corrupted data to ${MNT_DISK}/var/etcd-corrupted"
mv /var/etcd/data "${MNT_DISK}/var/etcd-corrupted"

# Replace the corrupted data dir with the restored data.
echo "Copying restored data to /var/etcd/data"
mv "${BACKUP_DIR}" /var/etcd/data

if [ "${RESET_EVENT_ETCD:-}" == "true" ]; then
  echo "Removing event-etcd corrupted data"
  EVENTS_CORRUPTED_DIR="${MNT_DISK}/var/etcd-events-corrupted"
  # Save the corrupted data (clean the directory if it is already non-empty).
  rm -rf "${EVENTS_CORRUPTED_DIR}"
  mkdir -p "${EVENTS_CORRUPTED_DIR}"
  mv /var/etcd/data-events "${EVENTS_CORRUPTED_DIR}"
fi

# Start etcd and kube-apiserver again.
echo "Restarting etcd and apiserver from restored snapshot"
mv "${MANIFEST_BACKUP_DIR}"/* "${MANIFEST_DIR}/"
rm -rf "${MANIFEST_BACKUP_DIR}"

# Verify that etcd is back.
echo "Waiting for etcd to come back"
if ! wait_for_etcd_up "${etcd_port}"; then
  echo "Etcd didn't come back correctly"
  exit 1
fi

# Verify that event etcd is back.
echo "Waiting for event etcd to come back"
if ! wait_for_etcd_up "${event_etcd_port}"; then
  echo "Event etcd didn't come back correctly"
  exit 1
fi

# Verify that kube-apiserver is back and cluster is healthy.
echo "Waiting for apiserver to come back"
if ! wait_for_cluster_healthy; then
  echo "Apiserver didn't come back correctly"
  exit 1
fi

echo "Cluster successfully restored!"