#!/bin/bash

# --------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for license information.
# --------------------------------------------------------------------------------------------

# How often (in seconds) do we check for change in FQDN->IP?
IP_CHANGE_DETECTION_FREQUENCY=60

# Determines the frequency (in seconds) at which we check for updates of AZNFS and run the daily function.
DAILY_CHECK_AFTER_SECONDS=86400

#
# Remove unmounted entries only if MOUNTMAP has not been changed for MOUNTMAP_INACTIVITY_SECS seconds.
# Don't set it below 3 minutes.
#
MOUNTMAP_INACTIVITY_SECS=300

#
# Don't perform mountmap and iptables rule cleanup for unmounted filesystems.
# This can be set if you want lazy umount to work.
# Taken from the environment if set, else defaults to 0 (cleanup is performed).
#
AZNFS_SKIP_UNMOUNT_CLEANUP="${AZNFS_SKIP_UNMOUNT_CLEANUP:-0}"

# TIMEWAIT timeout (in seconds) to be used for conntrack entries. Overridable from the environment.
AZNFS_TIMEWAIT_TIMEOUT="${AZNFS_TIMEWAIT_TIMEOUT:-65}"

#
# Environment variable to control skipping of IP change detection for regional accounts.
# By default we want to skip IP change detection for regional accounts, but if we want to
# disable skipping we can set this environment variable to 0.
#
AZNFS_SKIP_IP_CHANGE_DETECTION_FOR_REGIONAL_ACCOUNTS="${AZNFS_SKIP_IP_CHANGE_DETECTION_FOR_REGIONAL_ACCOUNTS:-1}"

# Epoch time at/after which the next IP change detection pass is due. 0 means "due right away".
next_ip_change_detection_epoch=0

#
# To track the time for daily_run_once function with a 1-minute deferral for update detection.
# The deferral is intended to prevent any issues with auto-update,
# and ensures that updates are deferred until 1 minute after the start of the aznfswatchdog.
#
next_daily_run_epoch=$(( $(date +%s) + 60 ))

# Load common aznfs helpers. AZNFS_VERSION is set before sourcing so that it is visible to common.sh.
AZNFS_VERSION=3
. /opt/microsoft/aznfs/common.sh

#
# Create /opt/microsoft/aznfs/data/randbytes if not already created.
# RANDBYTES is not defined in this file; presumably common.sh sets it.
#
# Sources are tried in order of preference (urandom, uuidgen, md5 of date, plain date).
# Each one is attempted only if the file is still missing or empty after the previous one.
#
if [ ! -s "$RANDBYTES" ]; then
    dd if=/dev/urandom of="$RANDBYTES" bs=256 count=1
fi
if [ ! -s "$RANDBYTES" ]; then
    uuidgen > "$RANDBYTES"
fi
if [ ! -s "$RANDBYTES" ]; then
    date | md5sum | awk '{print $1}' > "$RANDBYTES"
fi
if [ ! -s "$RANDBYTES" ]; then
    date > "$RANDBYTES"
fi

# Mark the file immutable so that it's not modified/deleted by mistake.
chattr -f +i "$RANDBYTES"

# Per-FQDN count of IP changes seen in the current detection window (see check_for_regional_account).
declare -A ip_change_count
# Per-FQDN epoch time at which the current detection window started.
declare -A last_ip_change_time
# FQDNs that have been marked as "Regional Account" (value is 1 for marked accounts).
declare -A regional_accounts

#
# Tells if IP change detection has to be skipped for the passed in account FQDN.
#
# $1 - account FQDN.
#
# Returns 0 (skip) only if the FQDN is marked in regional_accounts and
# AZNFS_SKIP_IP_CHANGE_DETECTION_FOR_REGIONAL_ACCOUNTS is not "0".
# Returns 1 (don't skip) in all other cases.
#
should_skip_regional_account()
{
    local l_host=$1

    # Not marked as a regional account, never skip.
    [[ ${regional_accounts["$l_host"]} -eq 1 ]] || return 1

    # Regional account: skip, unless the environment variable prohibits skipping.
    [ "$AZNFS_SKIP_IP_CHANGE_DETECTION_FOR_REGIONAL_ACCOUNTS" != "0" ]
}

#
# To check if an account should be marked as a "Regional Account".
# Must be called once for every detected IP change of the account FQDN.
#
# $1 - account FQDN whose IP just changed.
#
# The first change (or a change seen more than 5 minutes after the window started)
# starts a new 5-minute window with count 1. If the count reaches 3 within the
# window, the FQDN is marked in regional_accounts. Already marked FQDNs are left untouched.
#
check_for_regional_account()
{
    local l_host=$1
    local current_time
    local change_count
    local window_start

    if [[ ${regional_accounts["$l_host"]} -eq 1 ]]; then
        return
    fi

    # Unset count is treated as 0, so the very first change yields 1.
    change_count=$(( ${ip_change_count["$l_host"]:-0} + 1 ))
    ip_change_count["$l_host"]=$change_count

    current_time=$(date +%s)
    window_start=${last_ip_change_time["$l_host"]:-0}

    # Check if it's the first IP change or if the time since the window started is greater than 5 minutes.
    if (( change_count == 1 || current_time - window_start > 300 )); then
        last_ip_change_time["$l_host"]=$current_time

        #
        # Ensures that the count is reset to 1 when either it's the first IP change or
        # when the time since the window started is greater than 5 minutes.
        #
        ip_change_count["$l_host"]=1

    elif (( change_count >= 3 )); then
        # If the IP changed 3 or more times in less than 5 minutes, mark it as "Regional Account" FQDN.
        wecho "Marking $l_host as Regional Account FQDN due to frequent IP changes."
        regional_accounts["$l_host"]=1  # Set to true
    fi
}

#
# Hash for tracking conntrack entries seen in SYN_SENT state.
# Key is "<dst ip>:<sport>:<dport>:<nfs ip>" and value is the timeout (seconds remaining)
# that the entry had when we first saw it.
# Used for finding if some entry is stuck in SYN_SENT state due to a bug in older
# kernels. If we find an entry stuck for more than a certain time in SYN_SENT state
# we delete the entry so that kernel looks up fresh NAT rules and creates a new entry.
#
declare -A cthash_synsent

#
# Track one conntrack entry seen in SYN_SENT state and delete it if it looks stuck.
#
# $1 - original direction destination IP (passed to conntrack as -d).
# $2 - source port.
# $3 - destination port.
# $4 - reply direction source IP (passed to conntrack as -r).
# $5 - seconds remaining before the conntrack entry times out.
#
reconcile_conntrack_synsent()
{
    local l_ip=$1
    local l_sport=$2
    local l_dport=$3
    local l_nfsip=$4
    local seconds_remaining=$5
    local key="${l_ip}:${l_sport}:${l_dport}:${l_nfsip}"
    local age_seconds
    local cmd
    local -a cmd_argv

    # First time we are seeing this conntrack entry.
    if [[ ! -v cthash_synsent[$key] ]]; then
        cthash_synsent["$key"]=$seconds_remaining
        return
    fi

    #
    # How long has this entry been around?
    # If it's around for more than 25-30 secs, we consider the entry as "stuck" and delete it to cause fresh entry to
    # be created, and help make progress.
    #
    age_seconds=$(( ${cthash_synsent["$key"]} - seconds_remaining ))

    if [ "$age_seconds" -ge 25 ]; then
        #
        # Run the command from an argv array so that it doesn't depend on the
        # current IFS and doesn't need eval. The string form is only for logging.
        #
        cmd_argv=(conntrack -D -p tcp -d "$l_ip" -r "$l_nfsip" --sport "$l_sport" --dport "$l_dport")
        cmd="conntrack -D -p tcp -d $l_ip -r $l_nfsip --sport $l_sport --dport $l_dport"
        wecho "Deleting conntrack entry stuck in SYN_SENT state for $age_seconds seconds [$cmd]"

        if "${cmd_argv[@]}"; then
            # Forget the entry only on successful delete, else we retry next time.
            unset "cthash_synsent[$key]"
        else
            eecho "Failed to delete conntrack entry [$cmd]!"
        fi
    fi
}

#
# Hash for tracking conntrack entries seen in UNREPLIED state.
# Key is "<dst ip>:<sport>:<dport>:<reply src ip>" and value is the timeout (seconds remaining)
# that the entry had when we first saw it.
# Used for finding if some entry is stuck in UNREPLIED state due to no response from
# NFS server or some n/w issue. We want to delete the entry as it prevents creation of
# a new conntrack entry, if the IP were to change again soon.
#
declare -A cthash_unreplied

#
# Track one conntrack entry seen in UNREPLIED state and delete it if it looks stuck.
#
# $1 - original direction destination IP (passed to conntrack as -d).
# $2 - source port.
# $3 - destination port.
# $4 - reply direction source IP (passed to conntrack as -r).
# $5 - seconds remaining before the conntrack entry times out.
#
reconcile_conntrack_unreplied()
{
    local l_ip=$1
    local l_sport=$2
    local l_dport=$3
    local l_reply_srcip=$4
    local seconds_remaining=$5
    local key="${l_ip}:${l_sport}:${l_dport}:${l_reply_srcip}"
    local age_seconds
    local cmd
    local -a cmd_argv

    # First time we are seeing this conntrack entry.
    if [[ ! -v cthash_unreplied[$key] ]]; then
        cthash_unreplied["$key"]=$seconds_remaining
        return
    fi

    #
    # How long has this entry been around?
    # If it's around for more than 25-30 secs, we consider the entry as "stuck" and delete it to cause fresh entry to
    # be created, and help make progress.
    #
    age_seconds=$(( ${cthash_unreplied["$key"]} - seconds_remaining ))

    if [ "$age_seconds" -ge 25 ]; then
        #
        # Run the command from an argv array so that it doesn't depend on the
        # current IFS and doesn't need eval. The string form is only for logging.
        #
        cmd_argv=(conntrack -D -p tcp -d "$l_ip" -r "$l_reply_srcip" --sport "$l_sport" --dport "$l_dport")
        cmd="conntrack -D -p tcp -d $l_ip -r $l_reply_srcip --sport $l_sport --dport $l_dport"
        wecho "Deleting conntrack entry stuck in UNREPLIED state for $age_seconds seconds [$cmd]"

        if "${cmd_argv[@]}"; then
            # Forget the entry only on successful delete, else we retry next time.
            unset "cthash_unreplied[$key]"
        else
            eecho "Failed to delete conntrack entry [$cmd]!"
        fi
    fi
}

#
# Look for conntrack entries towards the given proxy IP (ports 111 and 2048) which are in
# SYN_SENT or UNREPLIED state and hand each of them to reconcile_conntrack_synsent() or
# reconcile_conntrack_unreplied() respectively.
#
# $1 - original direction destination IP of the conntrack entries (conntrack -d).
# $2 - reply direction source IP, used to filter SYN_SENT entries (conntrack -r).
#
# Doesn't modify IFS or any other global variable of the caller.
#
reconcile_conntrack()
{
    local l_ip=$1
    local l_nfsip=$2
    # $l_ip with dots escaped, for use inside the regexes below.
    local l_ip_re=${l_ip//./\\.}
    local output111 output2048 output entry matchstr
    local l_seconds_remaining l_sport l_dport l_reply_srcip
    local -a entries

    #
    # For mounts with nconnect, there could be more than one conntrack entries to the same
    # proxy IP, but with different local ports. We must track them separately.
    #
    output111=$(conntrack -L -p tcp -d "$l_ip" -r "$l_nfsip" --dport 111 --state SYN_SENT 2>/dev/null)
    output2048=$(conntrack -L -p tcp -d "$l_ip" -r "$l_nfsip" --dport 2048 --state SYN_SENT 2>/dev/null)
    output="$output111"$'\n'"$output2048"

    # Sample conntrack entry.
    # tcp      6 114 SYN_SENT src=10.20.0.17 dst=10.161.100.101 sport=819 dport=2048 [UNREPLIED] src=20.150.35.196 dst=10.20.0.17 sport=2048 dport=819 mark=0 use=1
    matchstr="tcp\s+[0-9]+\s+([0-9]+)\s+SYN_SENT\s+src=[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+\s+dst=${l_ip_re}\s+sport=([0-9]+)\s+dport=([0-9]+).*"

    # One array element per line, entries are never word-split or glob-expanded.
    mapfile -t entries <<< "$output"
    for entry in "${entries[@]}"; do
        # Joining the two outputs (and the here-string) can yield empty lines.
        [ -n "$entry" ] || continue

        # XXX Remove this log after running for few days (not too noisy!)
        vecho "$entry"

        if [[ "$entry" =~ $matchstr ]]; then
            l_seconds_remaining=${BASH_REMATCH[1]}
            l_sport=${BASH_REMATCH[2]}
            l_dport=${BASH_REMATCH[3]}
            reconcile_conntrack_synsent "$l_ip" "$l_sport" "$l_dport" "$l_nfsip" "$l_seconds_remaining"
        fi
    done

    # UNREPLIED entries which are not in SYN_SENT state (those are handled above).
    output111=$(conntrack -L -p tcp -d "$l_ip" --dport 111 2>/dev/null | grep -v "SYN_SENT" | grep "\[UNREPLIED\]")
    output2048=$(conntrack -L -p tcp -d "$l_ip" --dport 2048 2>/dev/null | grep -v "SYN_SENT" | grep "\[UNREPLIED\]")
    output="$output111"$'\n'"$output2048"

    # Sample conntrack entry.
    # tcp      6 299 ESTABLISHED src=10.2.4.4 dst=10.161.100.100 sport=1015 dport=2048 [UNREPLIED] src=20.60.236.11 dst=10.2.4.4 sport=2048 dport=1015 mark=0 use=1
    matchstr="tcp\s+[0-9]+\s+([0-9]+) .* dst=${l_ip_re} sport=([0-9]+) dport=([0-9]+) \[UNREPLIED\] src=([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+).*"

    mapfile -t entries <<< "$output"
    for entry in "${entries[@]}"; do
        [ -n "$entry" ] || continue

        # XXX Remove this log after running for few days (too noisy!)
        vecho "$entry"

        if [[ "$entry" =~ $matchstr ]]; then
            l_seconds_remaining=${BASH_REMATCH[1]}
            l_sport=${BASH_REMATCH[2]}
            l_dport=${BASH_REMATCH[3]}
            l_reply_srcip=${BASH_REMATCH[4]}
            reconcile_conntrack_unreplied "$l_ip" "$l_sport" "$l_dport" "$l_reply_srcip" "$l_seconds_remaining"
        fi
    done
}

#
# Function for running stat for mountpoint so that every time DNAT rule is updated, it is
# used to make sure to send at least one packet that matches the DNAT rule.
# This will make sure that the connection gets a TCP reset and outstanding NFS RPC requests
# are retransmitted right away w/o waiting for the 1 min timeout.
#
# $1 - mountpoint to stat.
#
# Two stat calls are made, the second one 35 seconds after the first.
#
ping_new_endpoint()
{
    local target="$1"
    local attempt

    for attempt in 1 2; do
        # Sleep before the second stat to let dir attributes timeout.
        (( attempt == 1 )) || sleep 35

        vecho "[$BASHPID] stat($target) #${attempt} start"
        stat "$target"
        vecho "[$BASHPID] stat($target) #${attempt} done"
    done
}

#
# Daily housekeeping, called when next_daily_run_epoch is reached.
#
# - Schedules the next run DAILY_CHECK_AFTER_SECONDS seconds from now.
# - Clears the regional account tracking state.
# - Sets check_for_latest_update to true, unless AKS_USER is "true".
#
run_daily_once()
{
    next_daily_run_epoch=$(( $(date +%s) + DAILY_CHECK_AFTER_SECONDS ))

    #
    # To increase resilience in case the account is marked as regional due to a bad state,
    # the arrays are cleared. This ensures that we can check and mark the regional account on a daily basis.
    # This helps prevent falsely marking an account as a regional account.
    #
    ip_change_count=()
    last_ip_change_time=()
    regional_accounts=()

    # Since AKS users don't require auto-update, there's no need to check for updates.
    if [ "$AKS_USER" != "true" ]; then
        check_for_latest_update=true
    fi
}

#
# Watchdog for monitoring unmounts and more importantly change in blob endpoint
# addresses possibly as a result of migration.
#
# Every call performs one monitoring pass:
# 1. Runs run_daily_once() if due and, if it asks for it, runs $INSTALLSCRIPT "auto-update".
# 2. Reads MOUNTMAPv3 under an exclusive flock.
# 3. For every entry, validates it, deletes it if no mount is using it (unmount GC),
#    else reconciles conntrack and iptables state for it.
# 4. Every IP_CHANGE_DETECTION_FREQUENCY seconds, re-resolves the FQDN and updates the
#    entry (and DNAT rule) if the Blob endpoint IP changed.
#
# Exits the script if findmnt fails with some error output.
#
# Note: IFS is left set to newline after this function is called.
#
process_nfsv3_mounts()
{
    local findmnt_status

    #
    # TODO: Add a function reconcile_mountmap() and call it from here. This
    #       should reconstruct the MOUNTMAPv3 file from findmnt and output of
    #       iptables. This will be added in subsequent revisions.
    #

    epoch_now=$(date +%s)

    # run_daily_once() sets this to true if we need to check for updates in this pass.
    check_for_latest_update=false
    if [ "$epoch_now" -ge "$next_daily_run_epoch" ]; then
        run_daily_once
    fi

    if $check_for_latest_update; then
        if [ -e "$INSTALLSCRIPT" ]; then
            "$INSTALLSCRIPT" "auto-update"
        else
            wecho "[FATAL] $INSTALLSCRIPT not found. This is an unexpected error!"
            wecho "[FATAL] Please contact Microsoft support."
        fi
    fi

    #
    # Go over all lines in MOUNTMAPv3 and check them for two things:
    # 1. Is that entry still in use by at least one aznfs mount, if not remove the entry.
    # 2. Has the Blob endpoint address changed from what is stored?
    #    If yes, update DNAT rule to point to the new address and update entry accordingly.
    #
    # Sample line in MOUNTMAPv3.
    # account.blob.preprod.core.windows.net 10.100.100.100 52.230.170.200
    #
    # where the format is
    # blobendpoint_fqdn proxy_ip blobendpoint_ip
    #
    # We store the mtime of MOUNTMAPv3 while inside the lock so that if any mount helper process
    # updates it after this we will skip modification for sake of safety. We will come to it
    # in the next iteration when it's safer.
    #
    exec {fd}<"$MOUNTMAPv3"
    flock -e $fd
    mtime_mountmap=$(stat -c%Y "$MOUNTMAPv3")
    #
    # Both are plain assignments, so IFS stays set to newline from here on.
    # The "for line in $lines" loop below depends on that.
    #
    IFS=$'\n' lines=$(cat "$MOUNTMAPv3")
    flock -u $fd
    exec {fd}<&-

    do_ip_change_detection=false
    if [ "$epoch_now" -ge "$next_ip_change_detection_epoch" ]; then
        do_ip_change_detection=true
        next_ip_change_detection_epoch=$(( $(date +%s) + IP_CHANGE_DETECTION_FREQUENCY ))
    fi

    #
    # Do unmount GC only if MOUNTMAPv3 file is not modified in the last
    # MOUNTMAP_INACTIVITY_SECS seconds. We don't want to incorrectly delete an
    # entry while some aznfs mount is ongoing.
    # If we could not get the mtime we cannot tell, so we don't GC.
    #
    do_unmount_gc=false
    if [ "$AZNFS_SKIP_UNMOUNT_CLEANUP" == "0" ] && [ -n "$mtime_mountmap" ]; then
        if [ "$epoch_now" -ge $(( mtime_mountmap + MOUNTMAP_INACTIVITY_SECS )) ]; then
            do_unmount_gc=true
        fi
    fi

    #
    # findmnt must be done after reading MOUNTMAPv3 so that if we come accross a
    # MOUNTMAPv3 entry whose proxy_ip is not used by any existing mount, we know
    # for sure that it's not in use by any mount and can be removed.
    #
    findmnt=$(findmnt --raw --noheading -o MAJ:MIN,FSTYPE,SOURCE,TARGET,OPTIONS -t nfs 2>&1)
    findmnt_status=$?

    #
    # For no matching mounts also, findmnt exits with a failure return, so check
    # for both exit status and non-empty error o/p.
    #
    if [ "$findmnt_status" -ne 0 ] && [ -n "$findmnt" ]; then
        eecho "${findmnt}."
        eecho "[FATAL] findmnt failed unexpectedly!"
        eecho "[FATAL] Aznfswatchdog service is exiting, will not monitor Azure NFS shares."
        eecho "[FATAL] Please contact Microsoft support before using any Blob NFS shares."
        # This usually indicates some non-transient issue, bail out.
        exit 1
    fi

    for line in $lines; do
        if [ -z "$line" ]; then
            continue
        fi

        #
        # MOUNTMAPv3 line is of the form:
        # account.blob.preprod.core.windows.net <local ip> <public ip> [<PID>]
        #
        # NOTE(review): read assigns the remainder of the line to the last variable, so a
        #               trailing <PID> field, if present, ends up in l_nfsip - confirm
        #               whether such lines can reach here.
        #
        IFS=" " read -r l_host l_ip l_nfsip <<< "$line"

        if [ -z "$l_host" ] || [ -z "$l_ip" ] || [ -z "$l_nfsip" ]; then
            wecho "[FATAL] Deleting invalid line in $MOUNTMAPv3: [$line]!"
            l_mtime=$(ensure_mountmapv3_not_exist "$line")
            [ $? -eq 0 ] && mtime_mountmap=$l_mtime
            continue
        fi

        # Since we added it to the MOUNTMAPv3 file, it cannot be invalid.
        if ! is_private_ip "$l_ip"; then
            wecho "[FATAL] local ip ($l_ip) is invalid!"
            l_mtime=$(ensure_mountmapv3_not_exist "$line")
            [ $? -eq 0 ] && mtime_mountmap=$l_mtime
            continue
        fi

        # Since we added it to the MOUNTMAPv3 file, it cannot be invalid.
        if ! is_valid_ipv4_address "$l_nfsip"; then
            wecho "[FATAL] Blob endpoint ip ($l_nfsip) is invalid!"
            l_mtime=$(ensure_mountmapv3_not_exist "$line")
            [ $? -eq 0 ] && mtime_mountmap=$l_mtime
            continue
        fi

        #
        # Delete entry from MOUNTMAPv3 if there are no mounted shares on that host.
        # As long as we have at least one mount using the MOUNTMAPv3 entry, we leave
        # it around.
        # Fixed string match, so that dots in the IP are not treated as regex wildcards.
        #
        if ! grep -qF -- " nfs ${l_ip}:" <<< "$findmnt"; then
            if $do_unmount_gc; then
                pecho "No mounted shares for host $l_host, deleting from ${MOUNTMAPv3} [$line]."

                # Delete IFF mountmap is not changed since we read it above.
                l_mtime=$(ensure_mountmapv3_not_exist "$line" "$mtime_mountmap")

                #
                # Update ifmatch time in case of successful updation of MOUNTMAPv3,
                # so that we can distinguish between MOUNTMAPv3 mtime changing because
                # of our action or some mount helper changing it. In the former case
                # it's safe to update the MOUNTMAPv3, so update mtime_mountmap to the
                # mtime after this update.
                #
                [ $? -eq 0 ] && mtime_mountmap=$l_mtime
                continue
            fi
        else
            #
            # Verify that iptable entry should be present for corresponding
            # MOUNTMAPv3 entry if the share is not unmounted.
            #
            # Note: This is extra protection in case user flushes the iptable
            #       entries or removes it by mistake. This should not be
            #       required normally.
            #
            # We also reconcile conntrack entries stuck in some bad states which
            # may hamper communication, f.e., in older kernels there's a bug due to
            # which conntrack entry may get stuck in SYN_SENT state if client
            # reuse the source port and keep retransmitting SYNs before the entry
            # can timeout.
            #
            reconcile_conntrack "$l_ip" "$l_nfsip"
            verify_iptable_entry "$l_ip" "$l_nfsip"

        fi

        #
        # We do IP change detection less frequently than unmount detection
        # since it will cause DNS calls on network.
        #
        if ! $do_ip_change_detection; then
            continue
        fi

        #
        # See if the account is regional account and we need to skip IP change detection for that.
        #
        if should_skip_regional_account "$l_host"; then
            continue
        fi

        #
        # Check if blob endpoint IP address changed.
        # This is the migration check.
        #
        new_ip=$(resolve_ipv4 "$l_host" "false")

        # If we fail to resolve the host name, try next time.
        if [ $? -ne 0 ]; then
            #
            # If account is deleted then we need to delete the MOUNTMAPv3 entry along
            # with the proxy iptable entry created for that account.
            # Note that we don't delete if the MOUNTMAPv3 was changed recently since
            # the account may have been re-created after the dns lookup failure.
            #
            if [ "$new_ip" == "NXDOMAIN" ]; then
                pecho "Account corresponding to $l_host seems to have been deleted, deleting from ${MOUNTMAPv3} [$line]!"

                l_mtime=$(ensure_mountmapv3_not_exist "$line" "$mtime_mountmap")
                [ $? -eq 0 ] && mtime_mountmap=$l_mtime
            else
                eecho "resolve_ipv4($l_host) failed: $new_ip"
            fi
            continue
        fi

        #
        # If the IP changed for the Blob endpoint, we need to update the DNAT rule.
        # This will take care of migration/failover causing the Blob endpoint IP to change.
        #
        if [ "$new_ip" != "$l_nfsip" ]; then
            pecho "IP for $l_host changed [$l_nfsip -> $new_ip]."
            check_for_regional_account "$l_host"

            # This will update DNAT rule as well.
            if ! update_mountmapv3_entry "$line" "$l_host $l_ip $new_ip"; then
                eecho "Will reattempt the operation in next iteration."
            else
                # First port=2048 mount using this proxy IP, 4th column is the mount target.
                mountpoint2048=$(echo "$findmnt" 2>/dev/null | grep -E -m1 " nfs ${l_ip}:.*\<port=2048\>" | awk '{print $4}')
                # Expand backslash escapes that may be present in the raw findmnt output.
                mountpoint2048=$(echo -e "$mountpoint2048")
                if [ -n "$mountpoint2048" ]; then
                    # Runs in background as it sleeps between the two stat calls.
                    ping_new_endpoint "$mountpoint2048" &
                fi
            fi
        fi
    done
}

vecho "Starting aznfswatchdog for NFSv3..."

# Detect and log distro, bash and AZNFS-mount version
log_version_info

# Dump NAT table once on startup in case we have reported conflicts.
vecho "NAT table:\n$(iptables-save -t nat)"

# NOTE(review): output is discarded; presumably run only for its side effects - confirm.
conntrack -L > /dev/null

#
# conntrack timewait timeout higher than the TCP timewait timeout value isn't very valuable.
# Lower it to AZNFS_TIMEWAIT_TIMEOUT if the current value could be read and is higher.
# Each check is a separate test so that an empty value is never compared numerically.
#
conntrack_timeo_timew=$(cat /proc/sys/net/netfilter/nf_conntrack_tcp_timeout_time_wait 2>/dev/null)
if [ $? -eq 0 ] && [ -n "$conntrack_timeo_timew" ] && [ "$conntrack_timeo_timew" -gt "$AZNFS_TIMEWAIT_TIMEOUT" ]; then
    vecho "Changing /proc/sys/net/netfilter/nf_conntrack_tcp_timeout_time_wait [$conntrack_timeo_timew -> $AZNFS_TIMEWAIT_TIMEOUT]"
    echo "$AZNFS_TIMEWAIT_TIMEOUT" > /proc/sys/net/netfilter/nf_conntrack_tcp_timeout_time_wait
fi

if ! chattr -f +i "$MOUNTMAPv3"; then
    wecho "chattr does not work for ${MOUNTMAPv3}!"
fi

#
# If we are not skipping IP change detection for regional accounts, then mountmap file will be updated every minute
# due to the change in IP for regional account FQDN. If we use standard MOUNTMAP_INACTIVITY_SECS,
# then this will prevent mountmap entry to be deleted. We set this to less than 1 minute in this case.
#
if [ "$AZNFS_SKIP_IP_CHANGE_DETECTION_FOR_REGIONAL_ACCOUNTS" == "0" ]; then
    MOUNTMAP_INACTIVITY_SECS=45
fi

# Main loop: one monitoring pass every MONITOR_INTERVAL_SECS seconds, forever.
while :; do
    sleep "$MONITOR_INTERVAL_SECS"
    process_nfsv3_mounts
done
