#!/bin/bash
# Copyright 2016 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This script performs disaster recovery of etcd from the backup data.
# Assumptions:
# - backup was done using etcdctl command:
# a) in case of etcd2
# $ etcdctl backup --data-dir=<dir>
# produced .snap and .wal files
# b) in case of etcd3
# $ etcdctl --endpoints=<address> snapshot save
# produced .db file
# - version.txt file is in the current directory (if it isn't it will be
# defaulted to "2.2.1/etcd2"). Based on this file, the script will
# decide to which version we are restoring (procedures are different
# for etcd2 and etcd3).
# - in case of etcd2 - *.snap and *.wal files are in current directory
# - in case of etcd3 - *.db file is in the current directory
# - the script is run as root
# - for event etcd, we only support clearing it - to do it, you need to
# set RESET_EVENT_ETCD=true env var.
# Strict mode: stop on the first failing command, on any reference to an
# unset variable, and when any stage of a pipeline fails.
set -o errexit
set -o nounset
set -o pipefail
# Version file contains information about current version in the format:
# <etcd version>/<etcd API> (e.g. "3.0.12/etcd3").
#
# If the file doesn't exist we assume "2.2.1/etcd2" configuration is
# the current one and create a file with such configuration.
# The restore procedure is chosen based on this information.
# NOTE(review): the quoting of the literal below looks unbalanced; the parse
# tree dumped at the end of this file records the value as version.txt.
global VERSION_FILE := '"version.txt'"
# Make it possible to overwrite version file (or default version)
# with VERSION_CONTENTS env var.
# The ":-" default makes the expansion empty instead of an error when
# VERSION_CONTENTS is not set.
if test -n $(VERSION_CONTENTS:-) {
echo $(VERSION_CONTENTS) > $(VERSION_FILE)
}
# Fall back to the default version when no version file exists yet.
if test ! -f $(VERSION_FILE) {
echo "2.2.1/etcd2" > $(VERSION_FILE)
}
# Re-read the file and split it on "/" into the etcd version (field 1)
# and the API mode (field 2).
global VERSION_CONTENTS := $[cat $(VERSION_FILE)]
global ETCD_VERSION := $[echo $VERSION_CONTENTS | cut -d '/' -f 1]
global ETCD_API := $[echo $VERSION_CONTENTS | cut -d '/' -f 2]
# Name is used only in case of etcd3 mode, to appropriately set the metadata
# for the etcd data.
# NOTE: NAME HAS TO BE EQUAL TO WHAT WE USE IN --name flag when starting etcd.
# An already set NAME is kept; otherwise the default is etcd- followed by the
# hostname (the parse tree at the end of this file shows hostname as a
# command substitution).
global NAME := $(NAME:-etcd-$(hostname))
# Port on which etcd is exposed.
global etcd_port := '2379'
# Port used when waiting for the event etcd instance to come back.
global event_etcd_port := '4002'
# Wait until the etcd instance listening on the given local port reports
# itself healthy. Each call checks a single port.
# Arguments:
#   $1 - port of the etcd instance on 127.0.0.1
# Returns:
#   0 as soon as the /health response equals the expected healthy body,
#   1 when that has not happened after 120 attempts, one second apart.
# Note: port, health_ok and health are assigned with the global keyword, so
# they are presumably not function-local - TODO confirm.
proc wait_for_etcd_up {
global port := $1
# TODO: As of 3.0.x etcd versions, all 2.* and 3.* versions return
# {"health": "true"} on /health endpoint in healthy case.
# However, we should come with a regex for it to avoid future break.
# NOTE(review): the quoting of the literal below looks unbalanced; the parse
# tree dumped at the end of this file records the value as the JSON body
# quoted in the TODO above.
global health_ok := '"{\"health\": \"true\"}'"
for i in [$[seq 120]] {
# TODO: Is it enough to look into /health endpoint?
global health := $[curl --silent http://127.0.0.1:$(port)/health]
# Exact string comparison against the expected body.
if test $(health) == $(health_ok) {
return 0
}
sleep 1
}
# Never saw the healthy response within the attempt budget.
return 1
}
# Wait until apiserver is up and every component status it reports is
# healthy.
# Arguments:
#   none
# Returns:
#   0 as soon as the count of lines containing Healthy: equals the count of
#   lines containing Healthy:True, 1 after 120 attempts, one second apart.
proc wait_for_cluster_healthy {
for i in [$[seq 120]] {
# Print one <condition type>:<status> line per component, taken from the
# first condition of each item. Failures of kubectl and of the two grep
# counts are tolerated via "|| true".
# NOTE(review): the single-quoted arguments on the next three lines have no
# closing quote before the "]"; the parse tree at the end of this file shows
# the intended grep patterns Healthy: and Healthy:True.
global cs_status := $[kubectl get componentstatuses -o template --template='{{range .items}}{{with index .conditions 0}}{{.type}}:{{.status}}{{end}}{{"\n"}}{{end}}] || true
global componentstatuses := $[echo $(cs_status) | grep -c 'Healthy:] || true
global healthy := $[echo $(cs_status) | grep -c 'Healthy:True] || true
# NOTE(review): if kubectl prints nothing (for example because the apiserver
# is still unreachable), both counts are 0 and this comparison succeeds.
# Confirm that reporting healthy in that case is intended.
if test $(componentstatuses) -eq $(healthy) {
return 0
}
sleep 1
}
# The counts never matched within the attempt budget.
return 1
}
# Wait until etcd and apiserver pods are down.
# Arguments:
#   none
# Returns:
#   0 as soon as docker ps lists no etcd and no apiserver containers,
#   1 when some are still listed after 120 attempts, one second apart.
proc wait_for_etcd_and_apiserver_down {
for i in [$[seq 120]] {
# Count docker ps lines mentioning etcd, leaving out the etcd-empty-dir and
# etcd-monitor entries.
global etcd := $[docker ps | grep etcd | grep -v etcd-empty-dir | grep -v etcd-monitor | wc -l]
# Count docker ps lines mentioning apiserver.
global apiserver := $[docker ps | grep apiserver | wc -l]
# TODO: Theoretically it is possible, that apiserver and or etcd
# are currently down, but Kubelet is now restarting them and they
# will reappear again. We should avoid it.
if test $(etcd) -eq "0" -a $(apiserver) -eq "0" {
return 0
}
sleep 1
}
# Containers were still listed after the last attempt.
return 1
}
# Move the manifest files to stop etcd and kube-apiserver
# while we swap the data out from under them.
# NOTE(review): the quoting of the two literals below looks unbalanced; the
# parse tree dumped at the end of this file records the values as
# /etc/kubernetes/manifests and /etc/kubernetes/manifests-backups.
global MANIFEST_DIR := '"/etc/kubernetes/manifests'"
global MANIFEST_BACKUP_DIR := '"/etc/kubernetes/manifests-backups'"
mkdir -p $(MANIFEST_BACKUP_DIR)
echo "Moving etcd(s) & apiserver manifest files to $(MANIFEST_BACKUP_DIR)"
# If those files were already moved (e.g. during previous
# try of backup) don't fail on it.
mv "$(MANIFEST_DIR)/kube-apiserver.manifest" $(MANIFEST_BACKUP_DIR) || true
mv "$(MANIFEST_DIR)/etcd.manifest" $(MANIFEST_BACKUP_DIR) || true
mv "$(MANIFEST_DIR)/etcd-events.manifest" $(MANIFEST_BACKUP_DIR) || true
# Wait for the pods to be stopped
echo "Waiting for etcd and kube-apiserver to be down"
# Give up with exit status 1 when the containers are still listed after the
# wait helper has used all of its attempts.
if ! wait_for_etcd_and_apiserver_down {
# Couldn't kill etcd and apiserver.
echo "Downing etcd and apiserver failed"
exit 1
}
# Create the sort of directory structure that etcd expects.
# If this directory already exists, remove it.
# NOTE(review): the quoting of the literal below looks unbalanced; the parse
# tree dumped at the end of this file records the value as /var/tmp/backup.
global BACKUP_DIR := '"/var/tmp/backup'"
rm -rf $(BACKUP_DIR)
# NOTE(review): there is no else arm. For any ETCD_API value other than
# etcd2 or etcd3 nothing is prepared, BACKUP_DIR stays removed and the
# script carries on - confirm this is acceptable.
if test $(ETCD_API) == "etcd2" {
echo "Preparing etcd backup data for restore"
# In v2 mode, we simply copy both snap and wal files to a newly created
# directory. After that, we start etcd with --force-new-cluster option
# that (according to the etcd documentation) is required to recover from
# a backup.
echo "Copying data to $(BACKUP_DIR) and restoring there"
mkdir -p "$(BACKUP_DIR)/member/snap"
mkdir -p "$(BACKUP_DIR)/member/wal"
# If the cluster is relatively new, there can be no .snap file.
mv *.snap "$(BACKUP_DIR)/member/snap/" || true
# There is no "|| true" here, so with errexit a failed move of the .wal
# files stops the script.
mv *.wal "$(BACKUP_DIR)/member/wal/"
# TODO(jsz): This won't work with HA setups (e.g. do we need to set --name flag)?
echo "Starting etcd $(ETCD_VERSION) to restore data"
# Start a detached etcd container on the host network with BACKUP_DIR mounted
# as its data dir; image receives what docker run -d prints.
# NOTE(review): the quoted command on the last line of this statement has no
# closing quote before the "]".
global image := $[docker run -d -v $(BACKUP_DIR):/var/etcd/data \
--net=host -p $(etcd_port):$(etcd_port) \
"gcr.io/google_containers/etcd:$(ETCD_VERSION)" /bin/sh -c \
"/usr/local/bin/etcd --data-dir /var/etcd/data --force-new-cluster]
# Status stands where the parse tree at the end of this file shows $?.
# NOTE(review): errexit is enabled at the top of the script, so a failing
# docker run presumably aborts before this check is reached - confirm.
if test "$Status" -ne "0" {
echo "Docker container didn't started correctly"
exit 1
}
echo "Container $(image) created, waiting for etcd to report as healthy"
if ! wait_for_etcd_up $(etcd_port) {
echo "Etcd didn't come back correctly"
exit 1
}
# Kill that etcd instance.
echo "Etcd healthy - killing $(image) container"
docker kill $(image)
} elif test $(ETCD_API) == "etcd3" {
echo "Preparing etcd snapshot for restore"
mkdir -p $(BACKUP_DIR)
echo "Copying data to $(BACKUP_DIR) and restoring there"
# Exactly one *.db file must be present directly in the current directory.
global number_files := $[find . -maxdepth 1 -type f -name "*.db" | wc -l]
if test $(number_files) -ne "1" {
echo "Incorrect number of *.db files - expected 1"
exit 1
}
mv *.db "$(BACKUP_DIR)/"
# BACKUP_DIR was freshly created above, so its listing is the name of the
# snapshot file that was just moved in.
global snapshot := $[ls $(BACKUP_DIR)]
# Run etcdctl snapshot restore command and wait until it is finished.
# setting with --name in the etcd manifest file and then it seems to work.
# TODO(jsz): This command may not work in case of HA.
# The container restores the snapshot and then moves the resulting member
# directory into the mounted /var/tmp/backup.
# NOTE(review): the quoted command on the last line of this statement has no
# closing quote before the "]".
global image := $[docker run -d -v $(BACKUP_DIR):/var/tmp/backup --env ETCDCTL_API=3 \
"gcr.io/google_containers/etcd:$(ETCD_VERSION)" /bin/sh -c \
"/usr/local/bin/etcdctl snapshot restore $(BACKUP_DIR)/$(snapshot) --name $(NAME) --initial-cluster $(NAME)=http://localhost:2380; mv /$(NAME).etcd/member /var/tmp/backup/]
# NOTE(review): as in the etcd2 arm, errexit presumably makes this check
# unreachable on failure - confirm.
if test "$Status" -ne "0" {
echo "Docker container didn't started correctly"
exit 1
}
# docker wait blocks until the container stops and prints its exit code.
# The code is only echoed here; it is not checked.
echo "Prepare container exit code: $[docker wait $(image)]"
# The snapshot file itself is not part of the restored data.
rm -f "$(BACKUP_DIR)/$(snapshot)"
}
# Also copy version.txt file.
cp $(VERSION_FILE) $(BACKUP_DIR)
# Find out if we are running GCI vs CVM.
# CVM receives the status echoed after the curl | grep -q gci pipeline over
# the instance attributes. The value 1 (typically grep finding no match)
# selects /mnt/master-pd; any other value selects /mnt/disks/master-pd.
export CVM=$[curl "http://metadata/computeMetadata/v1/instance/attributes/" -H "Metadata-Flavor: Google" |& grep -q gci; echo $Status]
if [[ "$CVM" == "1" ]] {
export MNT_DISK="/mnt/master-pd"
} else {
export MNT_DISK="/mnt/disks/master-pd"
}
# Save the corrupted data (clean directory if it is already non-empty).
rm -rf "$(MNT_DISK)/var/etcd-corrupted"
mkdir -p "$(MNT_DISK)/var/etcd-corrupted"
echo "Saving corrupted data to $(MNT_DISK)/var/etcd-corrupted"
# The target directory was just created, so the old data dir is moved into it.
mv /var/etcd/data "$(MNT_DISK)/var/etcd-corrupted"
# Replace the corrupted data dir with the restored data.
echo "Copying restored data to /var/etcd/data"
mv $(BACKUP_DIR) /var/etcd/data
# Optional step, taken only when RESET_EVENT_ETCD is set to true (an unset
# variable expands to empty thanks to the ":-" default). The event etcd data
# is moved away; nothing is restored in its place here.
if test $(RESET_EVENT_ETCD:-) == "true" {
echo "Removing event-etcd corrupted data"
# NOTE(review): the doubled quotes around the value below look like a
# translation artifact - confirm the intended quoting.
global EVENTS_CORRUPTED_DIR := ""$(MNT_DISK)/var/etcd-events-corrupted""
# Save the corrupted data (clean directory if it is already non-empty).
rm -rf $(EVENTS_CORRUPTED_DIR)
mkdir -p $(EVENTS_CORRUPTED_DIR)
mv /var/etcd/data-events $(EVENTS_CORRUPTED_DIR)
}
# Start etcd and kube-apiserver again.
echo "Restarting etcd and apiserver from restored snapshot"
# Put every saved manifest back into the manifest directory, then drop the
# now empty backup directory.
mv "$(MANIFEST_BACKUP_DIR)"/* "$(MANIFEST_DIR)/"
rm -rf $(MANIFEST_BACKUP_DIR)
# Each check below exits with status 1 when its wait helper gives up.
# Verify that etcd is back.
echo "Waiting for etcd to come back"
if ! wait_for_etcd_up $(etcd_port) {
echo "Etcd didn't come back correctly"
exit 1
}
# Verify that event etcd is back.
echo "Waiting for event etcd to come back"
if ! wait_for_etcd_up $(event_etcd_port) {
echo "Event etcd didn't come back correctly"
exit 1
}
# Verify that kube-apiserver is back and cluster is healthy.
echo "Waiting for apiserver to come back"
if ! wait_for_cluster_healthy {
echo "Apiserver didn't come back correctly"
exit 1
}
echo "Cluster successfully restored!"
(CommandList
children: [
(C {(set)} {(-o)} {(errexit)})
(C {(set)} {(-o)} {(nounset)})
(C {(set)} {(-o)} {(pipefail)})
(Assignment
keyword: Assign_None
pairs: [
(assign_pair
lhs: (LhsName name:VERSION_FILE)
op: Equal
rhs: {(DQ (version.txt))}
spids: [136]
)
]
spids: [136]
)
(If
arms: [
(if_arm
cond: [
(Sentence
child:
(C {(Lit_Other "[")} {(-n)}
{
(DQ
(BracedVarSub
token:
suffix_op: (StringUnary op_id:VTest_ColonHyphen arg_word:{(SQ )})
spids: [155 158]
)
)
} {(Lit_Other "]")}
)
terminator:
)
]
action: [
(SimpleCommand
words: [{(echo)} {(DQ (${ VSub_Name VERSION_CONTENTS))}]
redirects: [
(Redir
op_id: Redir_Great
fd: -1
arg_word: {(DQ (${ VSub_Name VERSION_FILE))}
spids: [175]
)
]
)
]
spids: [-1 164]
)
]
spids: [-1 183]
)
(If
arms: [
(if_arm
cond: [
(Sentence
child:
(C {(Lit_Other "[")} {(KW_Bang "!")} {(-f)} {(DQ (${ VSub_Name VERSION_FILE))}
{(Lit_Other "]")}
)
terminator:
)
]
action: [
(SimpleCommand
words: [{(echo)} {(DQ (2.2.1/etcd2))}]
redirects: [
(Redir
op_id: Redir_Great
fd: -1
arg_word: {(DQ (${ VSub_Name VERSION_FILE))}
spids: [211]
)
]
)
]
spids: [-1 202]
)
]
spids: [-1 219]
)
(Assignment
keyword: Assign_None
pairs: [
(assign_pair
lhs: (LhsName name:VERSION_CONTENTS)
op: Equal
rhs:
{
(DQ
(CommandSubPart
command_list: (CommandList children:[(C {(cat)} {(${ VSub_Name VERSION_FILE)})])
left_token:
spids: [223 229]
)
)
}
spids: [221]
)
]
spids: [221]
)
(Assignment
keyword: Assign_None
pairs: [
(assign_pair
lhs: (LhsName name:ETCD_VERSION)
op: Equal
rhs:
{
(DQ
(CommandSubPart
command_list:
(CommandList
children: [
(Pipeline
children: [
(C {(echo)} {($ VSub_Name "$VERSION_CONTENTS")})
(C {(cut)} {(-d)} {(SQ >)} {(-f)} {(1)})
]
negated: False
)
]
)
left_token:
spids: [234 252]
)
)
}
spids: [232]
)
]
spids: [232]
)
(Assignment
keyword: Assign_None
pairs: [
(assign_pair
lhs: (LhsName name:ETCD_API)
op: Equal
rhs:
{
(DQ
(CommandSubPart
command_list:
(CommandList
children: [
(Pipeline
children: [
(C {(echo)} {($ VSub_Name "$VERSION_CONTENTS")})
(C {(cut)} {(-d)} {(SQ >)} {(-f)} {(2)})
]
negated: False
)
]
)
left_token:
spids: [257 275]
)
)
}
spids: [255]
)
]
spids: [255]
)
(Assignment
keyword: Assign_None
pairs: [
(assign_pair
lhs: (LhsName name:NAME)
op: Equal
rhs:
{
(DQ
(BracedVarSub
token:
suffix_op:
(StringUnary
op_id: VTest_ColonHyphen
arg_word:
{(etcd-)
(CommandSubPart
command_list: (CommandList children:[(C {(hostname)})])
left_token:
spids: [294 296]
)
}
)
spids: [290 297]
)
)
}
spids: [288]
)
]
spids: [288]
)
(Assignment
keyword: Assign_None
pairs: [(assign_pair lhs:(LhsName name:etcd_port) op:Equal rhs:{(2379)} spids:[304])]
spids: [304]
)
(Assignment
keyword: Assign_None
pairs: [(assign_pair lhs:(LhsName name:event_etcd_port) op:Equal rhs:{(4002)} spids:[307])]
spids: [307]
)
(FuncDef
name: wait_for_etcd_up
body:
(BraceGroup
children: [
(Assignment
keyword: Assign_None
pairs: [
(assign_pair
lhs: (LhsName name:port)
op: Equal
rhs: {($ VSub_Number "$1")}
spids: [321]
)
]
spids: [321]
)
(Assignment
keyword: Assign_None
pairs: [
(assign_pair
lhs: (LhsName name:health_ok)
op: Equal
rhs:
{
(DQ ("{") (EscapedLiteralPart token:) (health)
(EscapedLiteralPart token:) (": ") (EscapedLiteralPart token:) (true)
(EscapedLiteralPart token:) ("}")
)
}
spids: [337]
)
]
spids: [337]
)
(ForEach
iter_name: i
iter_words: [
{
(CommandSubPart
command_list: (CommandList children:[(C {(seq)} {(120)})])
left_token:
spids: [357 361]
)
}
]
do_arg_iter: False
body:
(DoGroup
children: [
(Assignment
keyword: Assign_None
pairs: [
(assign_pair
lhs: (LhsName name:health)
op: Equal
rhs:
{
(CommandSubPart
command_list:
(CommandList
children: [
(C {(curl)} {(--silent)}
{(http) (Lit_Other ":") (//127.0.0.1) (Lit_Other ":")
(${ VSub_Name port) (/health)
}
)
]
)
left_token:
spids: [372 385]
)
}
spids: [371]
)
]
spids: [371]
)
(If
arms: [
(if_arm
cond: [
(Sentence
child:
(C {(Lit_Other "[")} {(DQ (${ VSub_Name health))}
{(Lit_Other "=") (Lit_Other "=")} {(DQ (${ VSub_Name health_ok))} {(Lit_Other "]")}
)
terminator:
)
]
action: [(ControlFlow token: arg_word:{(0)})]
spids: [-1 410]
)
]
spids: [-1 418]
)
(C {(sleep)} {(1)})
]
spids: [364 426]
)
spids: [356 362]
)
(ControlFlow token: arg_word:{(1)})
]
spids: [318]
)
spids: [314 317]
)
(FuncDef
name: wait_for_cluster_healthy
body:
(BraceGroup
children: [
(ForEach
iter_name: i
iter_words: [
{
(CommandSubPart
command_list: (CommandList children:[(C {(seq)} {(120)})])
left_token:
spids: [452 456]
)
}
]
do_arg_iter: False
body:
(DoGroup
children: [
(AndOr
children: [
(Assignment
keyword: Assign_None
pairs: [
(assign_pair
lhs: (LhsName name:cs_status)
op: Equal
rhs:
{
(CommandSubPart
command_list:
(CommandList
children: [
(C {(kubectl)} {(get)} {(componentstatuses)} {(-o)} {(template)}
{(--template) (Lit_Other "=")
(SQ
<
"{{range .items}}{{with index .conditions 0}}{{.type}}:{{.status}}{{end}}{{\"\\n\"}}{{end}}"
>
)
}
)
]
)
left_token:
spids: [463 479]
)
}
spids: [462]
)
]
spids: [462]
)
(C {(true)})
]
op_id: Op_DPipe
)
(AndOr
children: [
(Assignment
keyword: Assign_None
pairs: [
(assign_pair
lhs: (LhsName name:componentstatuses)
op: Equal
rhs:
{
(CommandSubPart
command_list:
(CommandList
children: [
(Pipeline
children: [
(C {(echo)} {(DQ (${ VSub_Name cs_status))})
(C {(grep)} {(-c)} {(SQ <"Healthy:">)})
]
negated: False
)
]
)
left_token:
spids: [487 505]
)
}
spids: [486]
)
]
spids: [486]
)
(C {(true)})
]
op_id: Op_DPipe
)
(AndOr
children: [
(Assignment
keyword: Assign_None
pairs: [
(assign_pair
lhs: (LhsName name:healthy)
op: Equal
rhs:
{
(CommandSubPart
command_list:
(CommandList
children: [
(Pipeline
children: [
(C {(echo)} {(DQ (${ VSub_Name cs_status))})
(C {(grep)} {(-c)} {(SQ <"Healthy:True">)})
]
negated: False
)
]
)
left_token:
spids: [513 531]
)
}
spids: [512]
)
]
spids: [512]
)
(C {(true)})
]
op_id: Op_DPipe
)
(If
arms: [
(if_arm
cond: [
(Sentence
child:
(C {(Lit_Other "[")} {(DQ (${ VSub_Name componentstatuses))} {(-eq)}
{(DQ (${ VSub_Name healthy))} {(Lit_Other "]")}
)
terminator:
)
]
action: [(ControlFlow token: arg_word:{(0)})]
spids: [-1 559]
)
]
spids: [-1 567]
)
(C {(sleep)} {(1)})
]
spids: [459 575]
)
spids: [451 457]
)
(ControlFlow token: arg_word:{(1)})
]
spids: [443]
)
spids: [439 442]
)
(FuncDef
name: wait_for_etcd_and_apiserver_down
body:
(BraceGroup
children: [
(ForEach
iter_name: i
iter_words: [
{
(CommandSubPart
command_list: (CommandList children:[(C {(seq)} {(120)})])
left_token:
spids: [601 605]
)
}
]
do_arg_iter: False
body:
(DoGroup
children: [
(Assignment
keyword: Assign_None
pairs: [
(assign_pair
lhs: (LhsName name:etcd)
op: Equal
rhs:
{
(CommandSubPart
command_list:
(CommandList
children: [
(Pipeline
children: [
(C {(docker)} {(ps)})
(C {(grep)} {(etcd)})
(C {(grep)} {(-v)} {(etcd-empty-dir)})
(C {(grep)} {(-v)} {(etcd-monitor)})
(C {(wc)} {(-l)})
]
negated: False
)
]
)
left_token:
spids: [612 644]
)
}
spids: [611]
)
]
spids: [611]
)
(Assignment
keyword: Assign_None
pairs: [
(assign_pair
lhs: (LhsName name:apiserver)
op: Equal
rhs:
{
(CommandSubPart
command_list:
(CommandList
children: [
(Pipeline
children: [
(C {(docker)} {(ps)})
(C {(grep)} {(apiserver)})
(C {(wc)} {(-l)})
]
negated: False
)
]
)
left_token:
spids: [648 664]
)
}
spids: [647]
)
]
spids: [647]
)
(If
arms: [
(if_arm
cond: [
(Sentence
child:
(C {(Lit_Other "[")} {(DQ (${ VSub_Name etcd))} {(-eq)} {(DQ (0))} {(-a)}
{(DQ (${ VSub_Name apiserver))} {(-eq)} {(DQ (0))} {(Lit_Other "]")}
)
terminator:
)
]
action: [(ControlFlow token: arg_word:{(0)})]
spids: [-1 712]
)
]
spids: [-1 720]
)
(C {(sleep)} {(1)})
]
spids: [608 728]
)
spids: [600 606]
)
(ControlFlow token: arg_word:{(1)})
]
spids: [592]
)
spids: [588 591]
)
(Assignment
keyword: Assign_None
pairs: [
(assign_pair
lhs: (LhsName name:MANIFEST_DIR)
op: Equal
rhs: {(DQ (/etc/kubernetes/manifests))}
spids: [744]
)
]
spids: [744]
)
(Assignment
keyword: Assign_None
pairs: [
(assign_pair
lhs: (LhsName name:MANIFEST_BACKUP_DIR)
op: Equal
rhs: {(DQ (/etc/kubernetes/manifests-backups))}
spids: [749]
)
]
spids: [749]
)
(C {(mkdir)} {(-p)} {(DQ (${ VSub_Name MANIFEST_BACKUP_DIR))})
(C {(echo)}
{(DQ ("Moving etcd(s) & apiserver manifest files to ") (${ VSub_Name MANIFEST_BACKUP_DIR))}
)
(AndOr
children: [
(C {(mv)} {(DQ (${ VSub_Name MANIFEST_DIR) (/kube-apiserver.manifest))}
{(DQ (${ VSub_Name MANIFEST_BACKUP_DIR))}
)
(C {(true)})
]
op_id: Op_DPipe
)
(AndOr
children: [
(C {(mv)} {(DQ (${ VSub_Name MANIFEST_DIR) (/etcd.manifest))}
{(DQ (${ VSub_Name MANIFEST_BACKUP_DIR))}
)
(C {(true)})
]
op_id: Op_DPipe
)
(AndOr
children: [
(C {(mv)} {(DQ (${ VSub_Name MANIFEST_DIR) (/etcd-events.manifest))}
{(DQ (${ VSub_Name MANIFEST_BACKUP_DIR))}
)
(C {(true)})
]
op_id: Op_DPipe
)
(C {(echo)} {(DQ ("Waiting for etcd and kube-apiserver to be down"))})
(If
arms: [
(if_arm
cond: [
(Sentence
child: (Pipeline children:[(C {(wait_for_etcd_and_apiserver_down)})] negated:True)
terminator:
)
]
action: [(C {(echo)} {(DQ ("Downing etcd and apiserver failed"))}) (C {(exit)} {(1)})]
spids: [-1 853]
)
]
spids: [-1 871]
)
(Assignment
keyword: Assign_None
pairs: [
(assign_pair
lhs: (LhsName name:BACKUP_DIR)
op: Equal
rhs: {(DQ (/var/tmp/backup))}
spids: [880]
)
]
spids: [880]
)
(C {(rm)} {(-rf)} {(DQ (${ VSub_Name BACKUP_DIR))})
(If
arms: [
(if_arm
cond: [
(Sentence
child:
(C {(Lit_Other "[")} {(DQ (${ VSub_Name ETCD_API))} {(Lit_Other "=") (Lit_Other "=")}
{(DQ (etcd2))} {(Lit_Other "]")}
)
terminator:
)
]
action: [
(C {(echo)} {(DQ ("Preparing etcd backup data for restore"))})
(C {(echo)} {(DQ ("Copying data to ") (${ VSub_Name BACKUP_DIR) (" and restoring there"))})
(C {(mkdir)} {(-p)} {(DQ (${ VSub_Name BACKUP_DIR) (/member/snap))})
(C {(mkdir)} {(-p)} {(DQ (${ VSub_Name BACKUP_DIR) (/member/wal))})
(AndOr
children: [
(C {(mv)} {(Lit_Other "*") (.snap)} {(DQ (${ VSub_Name BACKUP_DIR) (/member/snap/))})
(C {(true)})
]
op_id: Op_DPipe
)
(C {(mv)} {(Lit_Other "*") (.wal)} {(DQ (${ VSub_Name BACKUP_DIR) (/member/wal/))})
(C {(echo)} {(DQ ("Starting etcd ") (${ VSub_Name ETCD_VERSION) (" to restore data"))})
(Assignment
keyword: Assign_None
pairs: [
(assign_pair
lhs: (LhsName name:image)
op: Equal
rhs:
{
(CommandSubPart
command_list:
(CommandList
children: [
(C {(docker)} {(run)} {(-d)} {(-v)}
{(${ VSub_Name BACKUP_DIR) (Lit_Other ":") (/var/etcd/data)} {(--net) (Lit_Other "=") (host)} {(-p)}
{(${ VSub_Name etcd_port) (Lit_Other ":") (${ VSub_Name etcd_port)} {(DQ ("gcr.io/google_containers/etcd:") (${ VSub_Name ETCD_VERSION))} {(/bin/sh)} {(-c)}
{
(DQ
(
"/usr/local/bin/etcd --data-dir /var/etcd/data --force-new-cluster"
)
)
}
)
]
)
left_token:
spids: [1027 1076]
)
}
spids: [1026]
)
]
spids: [1026]
)
(If
arms: [
(if_arm
cond: [
(Sentence
child:
(C {(Lit_Other "[")} {(DQ ($ VSub_QMark "$?"))} {(-ne)} {(DQ (0))}
{(Lit_Other "]")}
)
terminator:
)
]
action: [
(C {(echo)} {(DQ ("Docker container didn't started correctly"))})
(C {(exit)} {(1)})
]
spids: [-1 1096]
)
]
spids: [-1 1111]
)
(C {(echo)}
{
(DQ ("Container ") (${ VSub_Name image)
(" created, waiting for etcd to report as healthy")
)
}
)
(If
arms: [
(if_arm
cond: [
(Sentence
child:
(Pipeline
children: [(C {(wait_for_etcd_up)} {(DQ (${ VSub_Name etcd_port))})]
negated: True
)
terminator:
)
]
action: [(C {(echo)} {(DQ ("Etcd didn't come back correctly"))}) (C {(exit)} {(1)})]
spids: [-1 1139]
)
]
spids: [-1 1154]
)
(C {(echo)} {(DQ ("Etcd healthy - killing ") (${ VSub_Name image) (" container"))})
(C {(docker)} {(kill)} {(DQ (${ VSub_Name image))})
]
spids: [-1 915]
)
(if_arm
cond: [
(Sentence
child:
(C {(Lit_Other "[")} {(DQ (${ VSub_Name ETCD_API))} {(Lit_Other "=") (Lit_Other "=")}
{(DQ (etcd3))} {(Lit_Other "]")}
)
terminator:
)
]
action: [
(C {(echo)} {(DQ ("Preparing etcd snapshot for restore"))})
(C {(mkdir)} {(-p)} {(DQ (${ VSub_Name BACKUP_DIR))})
(C {(echo)} {(DQ ("Copying data to ") (${ VSub_Name BACKUP_DIR) (" and restoring there"))})
(Assignment
keyword: Assign_None
pairs: [
(assign_pair
lhs: (LhsName name:number_files)
op: Equal
rhs:
{
(CommandSubPart
command_list:
(CommandList
children: [
(Pipeline
children: [
(C {(find)} {(.)} {(-maxdepth)} {(1)} {(-type)} {(f)} {(-name)}
{(DQ ("*.db"))}
)
(C {(wc)} {(-l)})
]
negated: False
)
]
)
left_token:
spids: [1236 1260]
)
}
spids: [1235]
)
]
spids: [1235]
)
(If
arms: [
(if_arm
cond: [
(Sentence
child:
(C {(Lit_Other "[")} {(DQ (${ VSub_Name number_files))} {(-ne)} {(DQ (1))}
{(Lit_Other "]")}
)
terminator:
)
]
action: [
(C {(echo)} {(DQ ("Incorrect number of *.db files - expected 1"))})
(C {(exit)} {(1)})
]
spids: [-1 1282]
)
]
spids: [-1 1297]
)
(C {(mv)} {(Lit_Other "*") (.db)} {(DQ (${ VSub_Name BACKUP_DIR) (/))})
(Assignment
keyword: Assign_None
pairs: [
(assign_pair
lhs: (LhsName name:snapshot)
op: Equal
rhs:
{
(DQ
(CommandSubPart
command_list:
(CommandList
children: [(C {(ls)} {(${ VSub_Name BACKUP_DIR)})]
)
left_token:
spids: [1315 1321]
)
)
}
spids: [1313]
)
]
spids: [1313]
)
(Assignment
keyword: Assign_None
pairs: [
(assign_pair
lhs: (LhsName name:image)
op: Equal
rhs:
{
(CommandSubPart
command_list:
(CommandList
children: [
(C {(docker)} {(run)} {(-d)} {(-v)}
{(${ VSub_Name BACKUP_DIR) (Lit_Other ":") (/var/tmp/backup)} {(--env)} {(Lit_VarLike "ETCDCTL_API=") (3)}
{(DQ ("gcr.io/google_containers/etcd:") (${ VSub_Name ETCD_VERSION))} {(/bin/sh)} {(-c)}
{
(DQ ("/usr/local/bin/etcdctl snapshot restore ")
(${ VSub_Name BACKUP_DIR) (/) (${ VSub_Name snapshot) (" --name ") (${ VSub_Name NAME) (" --initial-cluster ") (${ VSub_Name NAME)
("=http://localhost:2380; mv /") (${ VSub_Name NAME) (".etcd/member /var/tmp/backup/")
)
}
)
]
)
left_token:
spids: [1339 1397]
)
}
spids: [1338]
)
]
spids: [1338]
)
(If
arms: [
(if_arm
cond: [
(Sentence
child:
(C {(Lit_Other "[")} {(DQ ($ VSub_QMark "$?"))} {(-ne)} {(DQ (0))}
{(Lit_Other "]")}
)
terminator:
)
]
action: [
(C {(echo)} {(DQ ("Docker container didn't started correctly"))})
(C {(exit)} {(1)})
]
spids: [-1 1417]
)
]
spids: [-1 1432]
)
(C {(echo)}
{
(DQ ("Prepare container exit code: ")
(CommandSubPart
command_list:
(CommandList
children: [(C {(docker)} {(wait)} {(${ VSub_Name image)})]
)
left_token:
spids: [1439 1447]
)
)
}
)
(C {(rm)} {(-f)} {(DQ (${ VSub_Name BACKUP_DIR) (/) (${ VSub_Name snapshot))})
]
spids: [1183 1203]
)
]
spids: [-1 1466]
)
(C {(cp)} {(DQ (${ VSub_Name VERSION_FILE))} {(DQ (${ VSub_Name BACKUP_DIR))})
(C {(export)}
{(Lit_VarLike "CVM=")
(CommandSubPart
command_list:
(CommandList
children: [
(Sentence
child:
(Pipeline
children: [
(C {(curl)} {(DQ ("http://metadata/computeMetadata/v1/instance/attributes/"))}
{(-H)} {(DQ ("Metadata-Flavor: Google"))}
)
(C {(grep)} {(-q)} {(gci)})
]
negated: False
stderr_indices: [0]
)
terminator:
)
(C {(echo)} {($ VSub_QMark "$?")})
]
)
left_token:
spids: [1492 1517]
)
}
)
(If
arms: [
(if_arm
cond: [
(Sentence
child:
(DBracket
expr:
(BoolBinary
op_id: BoolBinary_GlobDEqual
left: {(DQ ($ VSub_Name "$CVM"))}
right: {(DQ (1))}
)
)
terminator:
)
]
action: [(C {(export)} {(Lit_VarLike "MNT_DISK=") (DQ (/mnt/master-pd))})]
spids: [-1 1536]
)
]
else_action: [(C {(export)} {(Lit_VarLike "MNT_DISK=") (DQ (/mnt/disks/master-pd))})]
spids: [1546 1556]
)
(C {(rm)} {(-rf)} {(DQ (${ VSub_Name MNT_DISK) (/var/etcd-corrupted))})
(C {(mkdir)} {(-p)} {(DQ (${ VSub_Name MNT_DISK) (/var/etcd-corrupted))})
(C {(echo)} {(DQ ("Saving corrupted data to ") (${ VSub_Name MNT_DISK) (/var/etcd-corrupted))})
(C {(mv)} {(/var/etcd/data)} {(DQ (${ VSub_Name MNT_DISK) (/var/etcd-corrupted))})
(C {(echo)} {(DQ ("Copying restored data to /var/etcd/data"))})
(C {(mv)} {(DQ (${ VSub_Name BACKUP_DIR))} {(/var/etcd/data)})
(If
arms: [
(if_arm
cond: [
(Sentence
child:
(C {(Lit_Other "[")}
{
(DQ
(BracedVarSub
token:
suffix_op: (StringUnary op_id:VTest_ColonHyphen arg_word:{(SQ )})
spids: [1631 1634]
)
)
} {(Lit_Other "=") (Lit_Other "=")} {(DQ (true))} {(Lit_Other "]")}
)
terminator:
)
]
action: [
(C {(echo)} {(DQ ("Removing event-etcd corrupted data"))})
(Assignment
keyword: Assign_None
pairs: [
(assign_pair
lhs: (LhsName name:EVENTS_CORRUPTED_DIR)
op: Equal
rhs: {(DQ (${ VSub_Name MNT_DISK) (/var/etcd-events-corrupted))}
spids: [1657]
)
]
spids: [1657]
)
(C {(rm)} {(-rf)} {(DQ (${ VSub_Name EVENTS_CORRUPTED_DIR))})
(C {(mkdir)} {(-p)} {(DQ (${ VSub_Name EVENTS_CORRUPTED_DIR))})
(C {(mv)} {(/var/etcd/data-events)} {(DQ (${ VSub_Name EVENTS_CORRUPTED_DIR))})
]
spids: [-1 1647]
)
]
spids: [-1 1702]
)
(C {(echo)} {(DQ ("Restarting etcd and apiserver from restored snapshot"))})
(C {(mv)} {(DQ (${ VSub_Name MANIFEST_BACKUP_DIR)) (/) (Lit_Other "*")}
{(DQ (${ VSub_Name MANIFEST_DIR) (/))}
)
(C {(rm)} {(-rf)} {(DQ (${ VSub_Name MANIFEST_BACKUP_DIR))})
(C {(echo)} {(DQ ("Waiting for etcd to come back"))})
(If
arms: [
(if_arm
cond: [
(Sentence
child:
(Pipeline
children: [(C {(wait_for_etcd_up)} {(DQ (${ VSub_Name etcd_port))})]
negated: True
)
terminator:
)
]
action: [(C {(echo)} {(DQ ("Etcd didn't come back correctly"))}) (C {(exit)} {(1)})]
spids: [-1 1764]
)
]
spids: [-1 1778]
)
(C {(echo)} {(DQ ("Waiting for event etcd to come back"))})
(If
arms: [
(if_arm
cond: [
(Sentence
child:
(Pipeline
children: [(C {(wait_for_etcd_up)} {(DQ (${ VSub_Name event_etcd_port))})]
negated: True
)
terminator:
)
]
action: [(C {(echo)} {(DQ ("Event etcd didn't come back correctly"))}) (C {(exit)} {(1)})]
spids: [-1 1803]
)
]
spids: [-1 1817]
)
(C {(echo)} {(DQ ("Waiting for apiserver to come back"))})
(If
arms: [
(if_arm
cond: [
(Sentence
child: (Pipeline children:[(C {(wait_for_cluster_healthy)})] negated:True)
terminator:
)
]
action: [(C {(echo)} {(DQ ("Apiserver didn't come back correctly"))}) (C {(exit)} {(1)})]
spids: [-1 1836]
)
]
spids: [-1 1850]
)
(C {(echo)} {(DQ ("Cluster successfully restored!"))})
]
)