#!/bin/bash

# --------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for license information.
# --------------------------------------------------------------------------------------------

#
# Watchdog for monitoring unmounts and stunnel processes
#

# mtime of MOUNTMAPv4 as last recorded by the watchdog. process_nfsv4_mounts
# stores it when it reads the map, and cleanup_mount passes it to
# ensure_mountmapv4_not_exist so that an entry is only deleted if the file has
# not been modified since (it is then refreshed to the post-update mtime).
mtime_mountmap=0

#
# Kill stunnel process and clean up stunnel files generated by aznfs mount helper
#
# Arguments:
#   $1 - path of the stunnel conf file
#   $2 - path of the stunnel log file
#   $3 - path of the stunnel pid file
#
# Every step is best effort: a failure is logged through eecho and the
# remaining steps are still attempted. The caller in this file does not look
# at the exit status.
#
cleanup_stunnel_files()
{
    local l_conf=$1
    local l_log=$2
    local l_pid=$3
    local accept_port
    local pid

    # Kill stunnel process first.
    pid=$(cat "$l_pid")
    accept_port=$(grep accept "$l_conf" | cut -d ':' -f 2)
    pecho "killing stunnel process with pid: $pid on port: $accept_port"

    #
    # Only ever signal a plain positive process id. An empty pid file, or one
    # holding 0 or a negative number, must not reach kill as those values
    # address whole process groups rather than the stunnel process.
    #
    if ! [[ "$pid" =~ ^[1-9][0-9]*$ ]] || ! kill -9 "$pid"; then
        eecho "Unable to kill stunnel process $pid!"
    fi

    # Cleanup stunnel files
    if ! rm -- "$l_log"; then
        eecho "[FATAL] Unable to delete stunnel log file $l_log!"
    fi

    if ! rm -- "$l_pid"; then
        eecho "[FATAL] Unable to delete stunnel pid file $l_pid!"
    fi

    # Drop the immutable attribute (if any) so that the conf file can be removed.
    chattr -if "$l_conf"
    if ! rm -- "$l_conf"; then
        eecho "[FATAL] Unable to delete stunnel conf file $l_conf!"
    fi
}

#
# Delete entry from MOUNTMAPv4.
#
# Arguments:
#   $1 - the MOUNTMAPv4 line to delete (matched literally, as a whole line)
#   $2 - optional: the mtime the caller last observed for MOUNTMAPv4. When
#        given, the entry is deleted only if the file still has that mtime.
#
# Outputs: on success, the mtime of MOUNTMAPv4 after the update, on stdout.
# Returns: 0 on success, 1 if the mtime check failed or the file could not be
#          updated.
#
# NOTE(review): callers in this file capture stdout to obtain the new mtime,
# so this assumes pecho does not write to stdout - confirm against common.sh.
#
ensure_mountmapv4_not_exist()
{
    local entry=$1
    local mountmap_mtime=$2
    local mtime
    local out
    local ret

    #
    # If watchdog wants to delete the entry only if MOUNTMAPv4 has not changed since
    # watchdog looked up, honour that.
    #
    if [ -n "$mountmap_mtime" ]; then
        mtime=$(stat -c%Y "$MOUNTMAPv4")
        if [ "$mtime" != "$mountmap_mtime" ]; then
            eecho "[$entry] Refusing to remove from ${MOUNTMAPv4} as $mtime != $mountmap_mtime!"
            return 1
        fi
    fi

    chattr -f -i "$MOUNTMAPv4"

    #
    # We overwrite the file instead of replacing it (as an in-place edit tool
    # would do), since that has the very bad side-effect of creating a new
    # MOUNTMAPv4 file. This breaks any locking that depends on the old file.
    #
    # The entry is matched as a fixed string against whole lines, so regex
    # metacharacters in it (the dots of the IP address and file names, or
    # anything found in a corrupt line) can neither delete other entries nor
    # break the match. -a keeps grep from treating the map as binary data.
    #
    out=$(grep -avxF -e "$entry" -- "$MOUNTMAPv4")
    ret=$?

    # grep exits with 1 when no line is left to print, that's not an error.
    if [ $ret -eq 1 ]; then
        ret=0
    fi

    if [ $ret -eq 0 ]; then
        #
        # If this write fails then MOUNTMAPv4 could be truncated.
        #
        printf '%s\n' "$out" > "$MOUNTMAPv4"
        ret=$?
        if [ $ret -ne 0 ]; then
            eecho "*** [FATAL] MOUNTMAPv4 may be in inconsistent state, contact Microsoft support ***"
        fi
    fi

    chattr -f +i "$MOUNTMAPv4"

    if [ $ret -ne 0 ]; then
        eecho "[$entry] failed to remove from ${MOUNTMAPv4}!"
        return 1
    fi

    pecho "[$entry] removed from ${MOUNTMAPv4} successfully!"

    # Return the mtime after our mods.
    stat -c%Y "$MOUNTMAPv4"
}

#
# Remove one entry from MOUNTMAPv4 and, if that succeeded, kill the stunnel
# process and delete the stunnel files belonging to it. Everything is done
# while holding an exclusive lock on MOUNTMAPv4.
#
# Arguments:
#   $1 - path of the stunnel conf file
#   $2 - path of the stunnel log file
#   $3 - path of the stunnel pid file
#   $4 - the MOUNTMAPv4 line to delete
#
# Globals: mtime_mountmap is read, and updated on success.
# Returns: 0 if the entry was removed, 1 otherwise (nothing is cleaned up then).
#
cleanup_mount()
{
    local l_conf=$1
    local l_log=$2
    local l_pid=$3
    local line=$4
    local l_mtime
    local fd2

    # Need to lock MOUNTMAPv4 since the mountscript could modify them as well.
    if ! exec {fd2}<"$MOUNTMAPv4"; then
        eecho "Failed to open ${MOUNTMAPv4} for locking! Entry: [$line]"
        return 1
    fi
    flock -e "$fd2"

    #
    # Delete IFF mountmap is not changed since we read it.
    #
    # NOTE(review): l_mtime is whatever ensure_mountmapv4_not_exist printed on
    # stdout, this assumes that is only the new mtime - confirm.
    #
    if ! l_mtime=$(ensure_mountmapv4_not_exist "$line" "$mtime_mountmap"); then
        # If the mountmap file is changed since we read it, we need to read it again - don't modify anything.
        eecho "Failed to delete entry from ${MOUNTMAPv4}! Entry: [$line]"
        flock -u "$fd2"
        exec {fd2}<&-
        return 1
    fi

    #
    # Update mountmap mtime in case of successful update of MOUNTMAPv4,
    # so that we can distinguish between MOUNTMAPv4 mtime changing because
    # of our action or some mount helper changing it. In the former case
    # it's safe to update the MOUNTMAPv4, so update mtime_mountmap to the
    # mtime after this update.
    #
    mtime_mountmap=$l_mtime

    cleanup_stunnel_files "$l_conf" "$l_log" "$l_pid"

    flock -u "$fd2"
    exec {fd2}<&-
    return 0
}

#
# Run one watchdog pass over MOUNTMAPv4.
#
# For every entry:
#   - entries missing the IP, conf path or pid path are deleted from the map,
#   - entries in 'waiting' state whose timeout has not passed are skipped,
#   - entries whose accept port is not seen in any mount are cleaned up,
#   - entries whose stunnel conf file no longer matches the recorded checksum
#     are cleaned up,
#   - otherwise stunnel is restarted if it does not appear to be running.
#
# Globals: sets mtime_mountmap and epoch_now, sets IFS to newline (see below);
#          reads MOUNTMAPv4, LOCALHOST and NETSTATCOMMAND.
# Exits the script with status 1 if findmnt fails or NETSTATCOMMAND is empty.
#
process_nfsv4_mounts()
{
    local l_ip l_conf l_log l_pid l_checksumhash l_status l_timeout
    local l_mtime
    local -a entries
    local line
    local fd fd2
    local findmnt_out findmnt_rc mounts
    local accept_port checksumHash
    local stunnel_pid is_stunnel_running stunnel_status used_port

    # Not read in this function. NOTE(review): kept as a global in case sourced helpers use it - confirm.
    epoch_now=$(date +%s)

    #
    # Go over all lines in MOUNTMAPv4 and check them for two things:
    # 1. Is that entry still in use by at least one aznfs mount, if not remove the entry.
    # 2. Is stunnel process running?
    #
    # We store the mtime of MOUNTMAPv4 while inside the lock so that if any mount helper process
    # updates it after this we will skip modification for sake of safety. We will come to it
    # in the next iteration when it's safer.
    #
    exec {fd}<"$MOUNTMAPv4"
    flock -e "$fd"
    mtime_mountmap=$(stat -c%Y "$MOUNTMAPv4")

    #
    # This assignment is not attached to a command, so it changes IFS for the
    # rest of the script. It is deliberately kept as unquoted expansions
    # elsewhere in this file have been splitting on newlines only because of it.
    #
    IFS=$'\n'
    mapfile -t entries < "$MOUNTMAPv4"
    flock -u "$fd"
    exec {fd}<&-

    #
    # findmnt must be done after reading MOUNTMAPv4 so that if we come across a
    # MOUNTMAPv4 entry whose all nfs file shares are unmounted, we know
    # for sure that it's not in use by any mount and can be removed.
    #
    # The exit status must be taken from findmnt itself and not from a filter
    # piped after it, else a failing findmnt looks like "nothing is mounted"
    # and every entry below would get cleaned up.
    #
    findmnt_out=$(findmnt 2>&1)
    findmnt_rc=$?

    if [ $findmnt_rc -ne 0 ] && [ -n "$findmnt_out" ]; then
        eecho "${findmnt_out}."
        eecho "[FATAL] findmnt failed unexpectedly!"
        eecho "[FATAL] aznfswatchdogv4 service is exiting, will not monitor Azure NFS file shares."
        eecho "[FATAL] Please contact Microsoft support before using any NFS File shares."
        # This usually indicates some non-transient issue, bail out.
        exit 1
    fi

    #
    # Keep only the nfs4 mounts and the mounts mentioning LOCALHOST. An empty
    # LOCALHOST must not be used as a pattern as it would match every line.
    #
    if [ -n "$LOCALHOST" ]; then
        mounts=$(grep -F -e 'nfs4' -e "$LOCALHOST" <<< "$findmnt_out")
    else
        mounts=$(grep -F -e 'nfs4' <<< "$findmnt_out")
    fi

    if [ -z "$NETSTATCOMMAND" ]; then
        eecho "[FATAL] No socket statistics command (netstat or ss) found! Aznfswatchdogv4 service is exiting. Please contact Microsoft support"
        exit 1
    fi

    for line in "${entries[@]}"; do
        if [ -z "$line" ]; then
            continue
        fi

        #
        # MOUNTMAPv4 line is of the form:
        # <IP>;<stunnel_account.file.preprod.core.windows.net.conf path>;<stunnel_account.file.preprod.core.windows.net.log path>;<stunnel_account.file.preprod.core.windows.net.pid path>;<checksumHash>;<status>;<timeout>
        #
        IFS=";" read -r l_ip l_conf l_log l_pid l_checksumhash l_status l_timeout <<< "$line"

        if [ -z "$l_ip" ] || [ -z "$l_conf" ] || [ -z "$l_pid" ]; then
            wecho "[FATAL] Deleting invalid line in $MOUNTMAPv4: [$line]!"
            exec {fd2}<"$MOUNTMAPv4"
            flock -e "$fd2"
            # No mtime is passed, so the invalid line is deleted unconditionally.
            if l_mtime=$(ensure_mountmapv4_not_exist "$line"); then
                mtime_mountmap=$l_mtime
            fi
            flock -u "$fd2"
            exec {fd2}<&-
            continue
        fi

        # Skip if the status is waiting, which means the mountscript is still processing the mount.
        if [ "$l_status" == "waiting" ]; then
            # vecho "Skipping entry with status 'waiting': [$line]"
            if [[ $l_timeout -ge $(date +%s) ]]; then
                # Timeout is in future, skip this entry.
                # If a mount entry stays in the 'waiting' state for a long time (greater than the mount timeout), it's safe to clean it up.
                # vecho "Timeout is in future, skipping entry with status 'waiting': [$line]"
                continue
            fi
        fi

        accept_port=$(grep accept "$l_conf" | cut -d ':' -f 2)
        # vecho "accept_port: $accept_port"

        #
        # Delete entry from MOUNTMAPv4 if there are no mounted shares on that host.
        # As long as we have at least one mount using the MOUNTMAPv4 entry, we leave
        # it around.
        # Note that an empty accept_port (conf file missing or w/o an accept
        # line) matches anything, so such an entry is treated as in use.
        #
        if ! grep -q -e "$accept_port" <<< "$mounts"; then
            pecho "No mounted shares for host $l_ip with accept port $accept_port, deleting from ${MOUNTMAPv4} [$line]."

            cleanup_mount "$l_conf" "$l_log" "$l_pid" "$line"
            continue
        fi

        # vecho "Mounted shares found for host $l_ip with accept port $accept_port."

        #
        # Check if checksumHash for stunnel.conf file has changed.
        # Customers should not modify stunnel.conf files created by aznfs mount helper.
        #
        # The cleanup below kills the stunnel process of a mounted share, so it
        # is only done when both hashes are available and differ. A checksum
        # that cannot be computed is logged but does not trigger the cleanup.
        #
        checksumHash=$(cksum "$l_conf" | awk '{print $1}')
        if [ -z "$checksumHash" ]; then
            eecho "Failed to get the checksum hash of file: '${l_conf}'!"
        elif [ -n "$l_checksumhash" ] && [ "$checksumHash" != "$l_checksumhash" ]; then
            eecho "'${l_conf}' file has modified!"
            eecho "It's not recommended to modify '${l_conf}' file created by aznfs mount helper!"
            eecho "watchdog service will do cleanup, kill stunnel process with pid:$(cat "$l_pid") and remove '${l_conf}'; '${l_log}'; '${l_pid}'!"
            eecho "Please remount the shares from ${l_ip} using aznfs mount helper."

            cleanup_mount "$l_conf" "$l_log" "$l_pid" "$line"
            continue
        fi

        #
        # stunnel is considered running if the socket statistics o/p has a
        # stunnel line containing the pid from the pid file. W/o a pid there is
        # nothing to look for, treat it as not running.
        #
        stunnel_pid=$(cat "$l_pid")
        is_stunnel_running=
        if [ -n "$stunnel_pid" ]; then
            is_stunnel_running=$($NETSTATCOMMAND -anp | grep stunnel | grep -e "$stunnel_pid")
        fi

        if [ -n "$is_stunnel_running" ]; then
            continue
        fi

        vecho "Watchdog: stunnel is not running! Restarting the stunnel"

        # Any o/p from stunnel is treated as a failure to start.
        stunnel_status=$(stunnel "$l_conf" 2>&1)
        if [ -n "$stunnel_status" ]; then
            used_port=$accept_port
            if grep -qF -e "$LOCALHOST:$used_port: Address already in use" <<< "$stunnel_status"; then
                eecho "[FATAL] Restarting stunnel failed.$used_port port is already being used by other process.!"
                eecho "It's recommended to unmount all shares from $l_ip and then remount shares using aznfs mount helper!"
            elif grep -qF -e "certificate verify failed" <<< "$stunnel_status"; then
                eecho "[FATAL] Restarting stunnel failed. CA root certificate is either missing or is unable to authenticate TLS server certificate."
                eecho "Please download the CA root certificate from https://learn.microsoft.com/en-us/azure/security/fundamentals/azure-ca-details"
                eecho "It's recommended to unmount all shares from $l_ip and then remount shares using aznfs mount helper!"
            else
                eecho "[FATAL] watchdog service is unable to start stunnel process for '${l_conf}'!"
                eecho "Please check the stunnel logs in $l_log for more details."
                eecho "It's recommended to unmount all shares from $l_ip and then remount shares using aznfs mount helper!"
            fi
        fi
    done
}

# Pull in the shared aznfs helpers; AZNFS_VERSION is set before sourcing them.
AZNFS_VERSION=4
source /opt/microsoft/aznfs/common.sh

vecho "Starting aznfswatchdog for NFSv4..."

# Log the distro, bash and AZNFS-mount versions.
log_version_info

# Try to mark MOUNTMAPv4 immutable, only warn if that is not possible.
chattr -f +i $MOUNTMAPv4 || wecho "chattr does not work for ${MOUNTMAPv4}!"

# Watchdog loop: wait for the monitor interval, then run one pass, forever.
while true; do
    sleep $MONITOR_INTERVAL_SECS
    process_nfsv4_mounts
done