#!/bin/sh
#
#
# Checkpoint_last OCF Resource Agent.
#
# Copyright (c) 2006 Andrew Beekhof
# Copyright (c) 2012. Synology, Inc. All Rights Reserved.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License as
# published by the Free Software Foundation.
#
# This program is distributed in the hope that it would be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# Further, this software is distributed without any warranty that it is
# free of the rightful claim of any third person regarding infringement
# or the like.  Any license provided herein, whether implied or
# otherwise, applies only to this software file.  Patent licenses, if
# any, provided herein do not apply to combinations of this program with
# other software, or any other product whatsoever.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write the Free Software Foundation,
# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA.
#

#######################################################################
# Initialization:

# Load the OCF shell helper library.  OCF_FUNCTIONS_DIR is only assigned
# here when the caller has not already set it.
: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs
# NOTE(review): $prefix is not assigned anywhere in the visible part of this
# file; presumably it comes from the environment or from ocf-shellfuncs --
# confirm.  rc.subr presumably supplies the helpers and variables used below
# that this file never defines (synoha_log, check_ping_server,
# NETWORK_FAILED, PING_FAILED, ROLE_ACTIVE_FILE, WAIT_FSCK_FLAG, ...) --
# confirm.
. $prefix/etc.defaults/rc.subr

# Command used to set/delete this node's master preference.  It is expanded
# unquoted at the call sites so that "-l reboot" is passed as two arguments.
CRM_MASTER="${USR_BIN_DIR}/crm_master -l reboot"
# Paths of the Synology HA helper programs.
SYNOHA_PROG="$prefix/sbin/synoha"
SYNODRBD_PROG="$prefix/sbin/synodrbd"
# NOTE(review): SYNO_HA_RCHA is not referenced anywhere else in this file.
SYNO_HA_RCHA="$prefix/etc.defaults/rc.ha"
# Sub-command options passed to ${SYNOHA_PROG} / ${SYNODRBD_PROG}.  The
# two-word values (e.g. "--notify fail-over") rely on unquoted expansion at
# the call sites to be split into separate arguments.
REMOTE_STATUS="--remote-status"
NOTIFY_HA_FAIL_OVER="--notify fail-over"
NOTIFY_HA_SERVICE_UP="--notify service-up"
CHECK_HA_DATA_SYNC="--check-data-sync"
CLEAN_CRM_STATE="--crm-clean-state"
CHECK_IF_SWITCH_OVER="--check-if-switch-over"
CLEAN_SWITCH_OVER_STATE="--clean-switch-over-state"
UPDATE_HA_CSP="--update-ha-csp"
SAVE_HA_STATE_TIME="--save-ha-state-time"
CHECK_SYNC="--check-sync"
CHECK_PASSIVE_CRASH="--check-passive-crash"
CHECK_SERVICE_FAIL="--check-service-fail"
CHECK_ENABLE_SWITCHOVER="--check-enable-switchover"
CHECK_VER_MISMATCH="--check-ver-mismatch"
# Flag files removed on promote; FLAG_HA_PING_FAILED is touched by the
# monitor action when only the remote node can reach the ping server.
FLAG_HA_NET_FAILED=$NETWORK_FAILED
FLAG_HA_PING_FAILED=$PING_FAILED

# NO_ACTIVE_COUNT holds the epoch second at which the passive node first
# saw no active peer; NO_ACTIVE_SHUTDOWN marks that a timeout shutdown has
# already been started (both are maintained by the monitor action).
NO_ACTIVE_COUNT="/tmp/ha/no_active_cnt"
NO_ACTIVE_SHUTDOWN="/tmp/ha/no_active_shutdown"

# crm shell and the arguments that delete the fail count of resource SERV.
# Only used inside the disabled (if false) section of the promote action.
CRM_PROG="${prefix}/sbin/crm"
CRM_DELETE_FAILCOUNT="resource failcount SERV delete"

# Flag file touched on promote and removed on demote.
FLAG_HA_ROLE_ACTIVE=$ROLE_ACTIVE_FILE
# Lock-style flag file: present while a background check pass of the
# master-side monitor is running.
CHECKPOINT_CHECKING="/tmp/ha/checkpoint_checking"

# Flag file tested by the passive-side monitor before powering off.
FLAG_WAIT_FSCK=$WAIT_FSCK_FLAG
#######################################################################

# Print the OCF resource-agent metadata XML and exit.
# Globals:  HA_RSCTMP, OCF_RESOURCE_INSTANCE (read; expanded into the
#           advertised default of the "state" parameter),
#           OCF_SUCCESS (read)
# Outputs:  the metadata document on stdout
# Exits:    always, with $OCF_SUCCESS -- this function never returns.
meta_data() {
	# The here-document delimiter is unquoted, so ${...} references in the
	# document are expanded.  The default shown for "state" must match the
	# default actually applied to OCF_RESKEY_state at the bottom of this
	# file, hence ${OCF_RESOURCE_INSTANCE} (with the leading '$').
	cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="Checkpoint_last" version="1.0">
<version>1.0</version>

<longdesc lang="en">
This is an example resource agent that impliments two states
</longdesc>
<shortdesc lang="en">Example Checkpoint_last resource agent</shortdesc>

<parameters>

<parameter name="state" unique="1">
<longdesc lang="en">
Location to store the resource state in
</longdesc>
<shortdesc lang="en">State file</shortdesc>
<content type="string" default="${HA_RSCTMP}/Checkpoint_last-${OCF_RESOURCE_INSTANCE}.state" />
</parameter>

</parameters>

<actions>
<action name="start"   timeout="20s" />
<action name="stop"    timeout="20s" />
<action name="promote" timeout="20s" />
<action name="demote"  timeout="20s" />
<action name="monitor" timeout="20" interval="31" role="Slave" depth="0" />
<action name="monitor" timeout="20" interval="29" role="Master" depth="0" />
<action name="meta-data"  timeout="5" />
<action name="validate-all"  timeout="20s" />
</actions>
</resource-agent>
END
	exit $OCF_SUCCESS
}

#######################################################################

# Print the usage text and exit.
# Arguments: $1 - exit status to use (optional).  The dispatcher passes
#                 $OCF_SUCCESS for usage/help and $OCF_ERR_UNIMPLEMENTED
#                 for an unknown action; when omitted or empty the status
#                 is $OCF_ERR_GENERIC.
# Outputs:   usage text on stdout
# Exits:     always -- this function never returns.
checkpoint_last_usage() {
	cat <<END
usage: $0 {start|stop|promote|demote|monitor|validate-all|meta-data}

Expects to have a fully populated OCF RA-compliant environment set.
END
	# Honour the status chosen by the caller instead of always failing.
	exit "${1:-$OCF_ERR_GENERIC}"
}

# Record the current role in the state file, replacing its contents.
# Globals:   OCF_RESKEY_state (read) - path of the state file
# Arguments: $1 - role name to store (callers pass "master" or "slave")
# Returns:   status of the write
checkpoint_last_update() {
	# Quote both the value and the path so that whitespace or glob
	# characters in either cannot break the redirection.
	printf '%s\n' "$1" > "${OCF_RESKEY_state}"
}

# Compare the recorded role with an expected one.
# Globals:   OCF_RESKEY_state (read) - path of the state file
# Arguments: $1 - expected role; omit it (or pass "") to ask "is there no
#                 recorded state?"
# Returns:   $OCF_SUCCESS when the state file exists and its content equals
#            $1, or when the file is absent and $1 is empty;
#            $OCF_ERR_GENERIC otherwise.
checkpoint_last_check_state() {
	local target="$1"
	local state

	# The path is quoted so that a state file whose name contains
	# whitespace is still found and read.
	if [ -f "${OCF_RESKEY_state}" ]; then
		state=$(cat "${OCF_RESKEY_state}")
		if [ "x$target" = "x$state" ]; then
			return $OCF_SUCCESS
		fi
	elif [ -z "$target" ]; then
		# No state file: only the empty expectation matches.
		return $OCF_SUCCESS
	fi

	return $OCF_ERR_GENERIC
}

# Start action: bring the resource up in the slave role.
# Globals:  NO_ACTIVE_COUNT, NO_ACTIVE_SHUTDOWN (files removed),
#           CRM_MASTER (command, expanded unquoted on purpose so that its
#           embedded options are split into arguments)
# Returns:  $OCF_RUNNING_MASTER if the state file already says "master",
#           $OCF_SUCCESS otherwise.
checkpoint_last_start() {
	if checkpoint_last_check_state master; then
		# CRM Error - Should never happen
		return $OCF_RUNNING_MASTER
	fi

	init_peer_lost

	# Forget any "no active node" bookkeeping left from a previous run.
	# Uses "> /dev/null 2>&1" rather than "&>": under a POSIX /bin/sh the
	# latter is parsed as "run in background, then redirect".
	rm -f "$NO_ACTIVE_COUNT" > /dev/null 2>&1
	rm -f "$NO_ACTIVE_SHUTDOWN" > /dev/null 2>&1

	checkpoint_last_update slave
	$CRM_MASTER -v 5
	return $OCF_SUCCESS
}

# Demote action: switch the resource from master back to slave.
# Globals:  NO_ACTIVE_COUNT, NO_ACTIVE_SHUTDOWN, FLAG_HA_ROLE_ACTIVE
#           (files removed), CRM_MASTER (command, expanded unquoted on
#           purpose so that its embedded options are split into arguments)
# Returns:  $OCF_NOT_RUNNING if no state is recorded at all,
#           $OCF_SUCCESS otherwise.
checkpoint_last_demote() {
	if checkpoint_last_check_state; then
		# CRM Error - Should never happen
		return $OCF_NOT_RUNNING
	fi

	# Uses "> /dev/null 2>&1" rather than "&>": under a POSIX /bin/sh the
	# latter is parsed as "run in background, then redirect".
	rm -f "$NO_ACTIVE_COUNT" > /dev/null 2>&1
	rm -f "$NO_ACTIVE_SHUTDOWN" > /dev/null 2>&1

	checkpoint_last_update slave
	# Swap the daemon set: stop the active-side daemons, then make sure
	# the passive-side ones are running (helpers defined outside this file).
	active_daemon_kill
	check_passive_daemon_alive
	$CRM_MASTER -v 5
	rm -f "$FLAG_HA_ROLE_ACTIVE"
	return $OCF_SUCCESS
}

# Send a "service failed" fail-over notification through ${SYNOHA_PROG}.
# Globals:   SYNOHA_PROG, NOTIFY_HA_FAIL_OVER (expanded unquoted on purpose
#            so "--notify fail-over" becomes two arguments),
#            REMOTE_HOST, LOCAL_HOST (read)
# Arguments: $1 - upper-case service key (SAMBA, ISCSI, FTP, ATALK, NFS,
#                 NETWORKING).  Any other key is reported with an empty
#                 service name.
# Returns:   immediately, doing nothing, unless exactly one argument is
#            given; otherwise the status of the notification command.
notify_failed_service()
{
	if [ $# -ne 1 ]; then
		return
	fi
	local failed_service=""
	# Must correspond to SZ_SERVICE_* in ha_serv.cc
	case $1 in
		SAMBA)      failed_service="samba"      ;;
		ISCSI)      failed_service="iscsitrg"   ;;
		FTP)        failed_service="ftpd"       ;;
		ATALK)      failed_service="atalk"      ;;
		NFS)        failed_service="nfsd"       ;;
		NETWORKING) failed_service="networking" ;;
	esac
	# "> /dev/null 2>&1" instead of "&>": under a POSIX /bin/sh the latter
	# is parsed as "run in background, then redirect".
	${SYNOHA_PROG} ${NOTIFY_HA_FAIL_OVER} "service failed" "$failed_service" "$REMOTE_HOST" "$LOCAL_HOST" > /dev/null 2>&1
}

# Promote action: switch the resource from slave to master.
# Globals:  SYNOHA_PROG, SYNOLOGSET1_BIN, CRM_MASTER and the option
#           variables (expanded unquoted on purpose so multi-word values
#           are split into arguments); CHECKPOINT_CHECKING,
#           FLAG_HA_NET_FAILED, FLAG_HA_PING_FAILED (files removed);
#           FLAG_HA_ROLE_ACTIVE (file created); REMOTE_SHUTDOWN_FLAG,
#           REMOTE_HOST, LOCAL_HOST (read)
# Returns:  $OCF_NOT_RUNNING if no state is recorded at all,
#           $OCF_SUCCESS otherwise.  Logging and notifications run in a
#           background job and do not influence the return value.
# Note:     all "&>" redirections were replaced by "> /dev/null 2>&1";
#           under a POSIX /bin/sh "cmd &> file" is parsed as "run cmd in
#           background, then redirect".
checkpoint_last_promote() {
	${SYNOHA_PROG} ${CLEAN_SWITCH_OVER_STATE} > /dev/null 2>&1
	# Drop stale flags from the previous role before anything else.
	rm -f "$CHECKPOINT_CHECKING"
	rm -f "$FLAG_HA_NET_FAILED"
	rm -f "$FLAG_HA_PING_FAILED"
	if checkpoint_last_check_state; then
		return $OCF_NOT_RUNNING
	fi

	${SYNOHA_PROG} ${UPDATE_HA_CSP} > /dev/null 2>&1
	${SYNOHA_PROG} ${SAVE_HA_STATE_TIME} active > /dev/null 2>&1
	checkpoint_last_update master
	# Swap the daemon set: stop the passive-side daemons, then make sure
	# the active-side ones are running (helpers defined outside this file).
	passive_daemon_kill
	check_active_daemon_alive
	{

# Need to review the flow to prevent: disconnect and reconnect within a short time (10~15s).
# The passive mounts the fs but does not run the services;
# in this case the original active can still be active and overwrite the passive.
#	if [ -f "${FLAG_HA_HAS_BUMPED_UP_ADMIN_EPOCH}.pre" ]; then
#		rm -f ${FLAG_HA_HAS_BUMPED_UP_ADMIN_EPOCH}.pre
#		touch ${FLAG_HA_HAS_BUMPED_UP_ADMIN_EPOCH}
#		bump_admin_epoch
#	fi

	# Remote reported offline and no remote-shutdown flag file: record the
	# "active server disappeared" event.  Otherwise look for a failed
	# service in the remote node info and send a notification for it.
	if ${SYNOHA_PROG} ${REMOTE_STATUS} | grep -q offline > /dev/null 2>&1 \
		&& [ ! -f "$REMOTE_SHUTDOWN_FLAG" ]; then
		${SYNOLOGSET1_BIN} sys info 0x13400055 "active server disappeared" "${REMOTE_HOST}" "${LOCAL_HOST}" > /dev/null 2>&1
	else
		# Last NODE_SERV_<NAME>=yes entry in the remote node info file.
		failed_service_on_passive="$(sed -n 's/^NODE_SERV_\(.*\)=yes$/\1/p' /tmp/ha/info_node.remote | tail -n 1)"
		if [ -n "$failed_service_on_passive" ]; then
			notify_failed_service "$failed_service_on_passive"
		fi
	fi
	${SYNOHA_PROG} ${NOTIFY_HA_SERVICE_UP} > /dev/null 2>&1
	${SYNOHA_PROG} ${CLEAN_CRM_STATE} > /dev/null 2>&1
	}&

	# TODO: delete failcount when success to recovery from monitor fail
	# We can't put delete failcount here. Crm may run Checkpoint_last:promote
	# and SERV:monitor concurrently, such that we may clear failcount before counting failcount
	if false; then
		local HostName=`${SYNOHA_PROG} --local-name | awk '{print $3}'`
		${CRM_PROG} ${CRM_DELETE_FAILCOUNT} ${HostName}
	fi

	$CRM_MASTER -v 10
	touch "$FLAG_HA_ROLE_ACTIVE"
	return $OCF_SUCCESS
}

# Stop action: withdraw the master preference, stop both daemon sets and
# remove the state file.
# Globals:  CRM_MASTER (command, expanded unquoted on purpose so that its
#           embedded options are split into arguments),
#           OCF_RESKEY_state (state file, removed)
# Returns:  $OCF_RUNNING_MASTER if the state file still says "master"
#           (the master preference has already been deleted by then),
#           $OCF_SUCCESS otherwise.
checkpoint_last_stop() {
	$CRM_MASTER -D
	if checkpoint_last_check_state master; then
		# CRM Error - Should never happen
		return $OCF_RUNNING_MASTER
	fi

	active_daemon_kill
	passive_daemon_kill

	# Quoted so that a state path containing whitespace is really removed;
	# a leftover file would make the next monitor report a stale role.
	if [ -f "${OCF_RESKEY_state}" ]; then
		rm -- "${OCF_RESKEY_state}"
	fi
	return $OCF_SUCCESS
}

# Remove the "bumped up admin epoch" flag files once they are no longer
# needed, based on the second space-separated word printed by
# "${SYNOHA_PROG} --remote-status".
#   online  - remove the flags only if "drbdadm cstate all" reports
#             Connected; otherwise leave them alone
#   none    - remove the flags
#   warning_online, offline, warning_offline - do nothing
#   anything else - log the unexpected status via synoha_log
# Globals:  SYNOHA_PROG (read), FLAG_HA_HAS_BUMPED_UP_ADMIN_EPOCH (files
#           "<flag>" and "<flag>.pre" removed)
# Note:     written without the bash-only ";&" fall-through and "&>"
#           redirection so that the function also parses under a POSIX
#           /bin/sh.
check_and_operate_bumped_up_flag() {
	local remoteStatus
	remoteStatus=$(${SYNOHA_PROG} --remote-status | cut -d" " -f2)
	case $remoteStatus in
		online|none)
			# "online" additionally requires a Connected DRBD link.
			if [ "$remoteStatus" = "online" ] && ! drbdadm cstate all | grep -q Connected; then
				return
			fi
			rm -f "$FLAG_HA_HAS_BUMPED_UP_ADMIN_EPOCH" > /dev/null 2>&1
			rm -f "${FLAG_HA_HAS_BUMPED_UP_ADMIN_EPOCH}.pre" > /dev/null 2>&1
			;;
		warning_online|offline|warning_offline)
			;;
		unknown|standalone|error|*)
			synoha_log "Unexpected status for remote: $remoteStatus"
			;;
	esac
}

# Monitor action: run the periodic health checks and report the role that
# is recorded in the state file.
# Returns: $OCF_RUNNING_MASTER when the state file reads "master",
#          $OCF_SUCCESS        when it reads "slave",
#          $OCF_ERR_GENERIC    when it exists with any other content,
#          $OCF_NOT_RUNNING    when there is no state file.
# Most checks are started as background jobs ({ ... }&); their outcome does
# not influence the return value.  An `exit` inside such a job ends only
# that job, because a backgrounded group runs in a subshell.
# NOTE(review): SYNOHA_BIN, DRBDADM, HA_SBIN_DIR, MAX_HA_PASSIVE_ONLY,
# FLAG_HA_REMOTE_BUSY, LOCAL_HOST and REMOTE_HOST are not assigned in this
# file; presumably they come from rc.subr -- confirm.
# NOTE(review): `&>` and `local` are not POSIX sh although the file starts
# with #!/bin/sh; this presumably relies on /bin/sh being bash-compatible
# on the target -- confirm.
checkpoint_last_monitor() {
	# Result of the local ping-server probe; it is compared against 1
	# ("cannot see the ping server") and 0 further down.
	check_ping_server $LOCAL_HOST
	local ping_server_local=$?

	check_and_operate_bumped_up_flag
	check_daemon_alive
	check_if_skip_seq_io_for_ssd_cache
	# ---- master (active) side ------------------------------------------
	if checkpoint_last_check_state "master"; then
		# Without a remote host name there is nothing to check against.
		[ -z "$REMOTE_HOST" ] && return $OCF_RUNNING_MASTER
		{
		# $CHECKPOINT_CHECKING works as a lock file: a new background pass
		# is skipped while the file left by a previous pass still exists.
		if [ ! -e $CHECKPOINT_CHECKING ]; then
			touch $CHECKPOINT_CHECKING
			${SYNODRBD_PROG} ${CHECK_HA_DATA_SYNC} &> /dev/null
			${SYNODRBD_PROG} ${CHECK_SYNC} &> /dev/null
			${SYNOHA_PROG} ${CHECK_PASSIVE_CRASH} &> /dev/null
			${SYNOHA_PROG} ${CHECK_SERVICE_FAIL} &> /dev/null
			${SYNOHA_PROG} ${CHECK_ENABLE_SWITCHOVER} &> /dev/null
			${SYNOHA_PROG} ${CHECK_VER_MISMATCH} &> /dev/null
			push_network_settings

			# Same probe as above, but for the remote node.
			check_ping_server $REMOTE_HOST
			local ping_server_remote=$?
			if [ 1 -eq $ping_server_remote ]; then
				set_ping_server_constraint $REMOTE_HOST
			fi
			if [ 1 -eq $ping_server_local ]; then
				if [ 0 -eq $ping_server_remote ]; then
					# I can't see ping server, but remote can. Switch over!
					touch $FLAG_HA_PING_FAILED
				fi
			fi

			${SYNOHA_PROG} ${CHECK_IF_SWITCH_OVER} &> /dev/null
			rm -f $CHECKPOINT_CHECKING
		fi
		}&
		check_active_daemon_alive
		return $OCF_RUNNING_MASTER
	fi

	# ---- slave (passive) side ------------------------------------------
	if checkpoint_last_check_state "slave"; then
		# Job 1: resume DRBD resync when sh-status shows
		# _flags_user_isp=1 (presumably "sync paused by user" -- confirm).
		{
			if $DRBDADM sh-status all | grep -q "_flags_user_isp=1"; then
				synoha_log notice "resume-sync all"
				$DRBDADM resume-sync all
			fi
		}&
		# Job 2: lift the ping-server constraint once the ping server is
		# visible from this node again.
		{
			# Check if the passive is constrained due to ping server invisibility
			if ! get_ping_server_constraint; then
				# No, keep going
				exit
			fi
			# I did not see ping server before. How about now?
			if [ 0 -eq ${ping_server_local} ] && $SYNOHA_BIN --check-connectivity "$(get_ping_server)"; then
				# Yes, I can see it now. Set me free from RULE_PING_SERVER!
				synoha_log "Bring up passive due to ping server availability."
				unset_ping_server_constraint
			fi
		}&
		# Job 3: power this node off when no active peer has been seen for
		# at least $MAX_HA_PASSIVE_ONLY seconds.
		{
		# A timeout shutdown has already been started; do not start another.
		if [ -r $NO_ACTIVE_SHUTDOWN ]; then
			exit
		fi
		check_passive_daemon_alive
		if $SYNOHA_BIN --remote-role | grep -q "Active"; then
			# The peer is active: forget the "no active" timestamp.
			if [ -r $NO_ACTIVE_COUNT ]; then
				rm $NO_ACTIVE_COUNT
			fi
		else
			if [ -r $NO_ACTIVE_COUNT ]; then
				local timeout=$MAX_HA_PASSIVE_ONLY
				#if ${HA_SBIN_DIR}/synoha --remote-status | grep -q "normal"; then
				#	timeout=600 # two passive, 600s
				#elif ${HA_SBIN_DIR}/synoha --remote-status | grep -q "warning_online"; then
				#	timeout=600 # two passive, 600s
				#else
				#	timeout=600 # I am the only node in cluster
				#fi

				# $NO_ACTIVE_COUNT holds the epoch second written by the
				# else-branch below; cntDiff is the time elapsed since then.
				local cntStart=`cat $NO_ACTIVE_COUNT`
				local cntCur=`date +%s`
				local cntDiff=`expr $cntCur - $cntStart`
				if [ $cntDiff -ge $timeout ]; then
					if [ -f $FLAG_WAIT_FSCK ]; then
						synoha_log warning "Waiting file system check. Shutdown local."
						synoha --poweroff-ds
					elif [ -f "$FLAG_HA_REMOTE_BUSY" ]; then
						synoha_log warning "Waiting busy active. Shutdown local."
						synoha --poweroff-ds
					else
					{
						synoha_log "Waiting active node timeout. Shutdown local."
						# Mark the shutdown as started so later monitor runs
						# leave this job early.
						touch $NO_ACTIVE_SHUTDOWN

						${HA_SBIN_DIR}/synoha --notify passive-timeout $LOCAL_HOST $REMOTE_HOST &> /dev/null &
						${HA_SBIN_DIR}/synoha --poweroff-ds &> /dev/null &
						# wait reboot after unbind local
						sleep 300
						# Still running after 300 s: force the power-off.
						poweroff -f
					}&
					fi
				fi
			else
				# First monitor run without an active peer: start the clock.
				date +%s > $NO_ACTIVE_COUNT 2>/dev/null
			fi
		fi
		}&
		# Job 4: delete the cli-standby-DUMMY_START location constraint when
		# the remote reports a constraint and the CIB contains that entry.
		{
			if $SYNOHA_BIN --remote-has-constraint && cibadmin -Q --xpath '//rsc_location[@id="cli-standby-DUMMY_START"]'; then
				synoha_log warning "Passive server is constrained during switching/failing over. Try to resume";
				crm configure delete cli-standby-DUMMY_START
			fi
		}&
		return $OCF_SUCCESS
	fi

	# ---- neither master nor slave --------------------------------------
	if [ -f ${OCF_RESKEY_state} ]; then
		echo "File '${OCF_RESKEY_state}' exists but contains unexpected contents"
		cat ${OCF_RESKEY_state}
		return $OCF_ERR_GENERIC
	fi
	return $OCF_NOT_RUNNING
}

# validate-all action.  No check is performed: the script is terminated
# right away with status $OCF_SUCCESS (the function never returns).
checkpoint_last_validate()
{
	exit ${OCF_SUCCESS}
}

# Default location of the state file, applied only when the "state"
# parameter (OCF_RESKEY_state) has not been set by the caller.
: ${OCF_RESKEY_state=${HA_RSCTMP}/Checkpoint_last-${OCF_RESOURCE_INSTANCE}.state}

# Dispatch on the requested action.  Every action except monitor,
# meta-data, validate-all and the usage variants is logged first.
case "$__OCF_ACTION" in
	meta-data)
		meta_data
		;;
	start)
		synoha_log notice "checkpoint_last start"
		checkpoint_last_start
		;;
	promote)
		synoha_log notice "checkpoint_last promote"
		checkpoint_last_promote
		;;
	demote)
		synoha_log notice "checkpoint_last demote"
		checkpoint_last_demote
		;;
	stop)
		synoha_log notice "checkpoint_last stop"
		checkpoint_last_stop
		;;
	monitor)
		checkpoint_last_monitor
		;;
	validate-all)
		checkpoint_last_validate
		;;
	usage|help)
		checkpoint_last_usage $OCF_SUCCESS
		;;
	*)
		checkpoint_last_usage $OCF_ERR_UNIMPLEMENTED
		;;
esac

# The status of the action handler becomes the status of the agent.
exit $?

