#!/bin/sh
#
# dpinger_watchdog.sh - Detects hung/missing/zombie dpinger processes and restarts them
#
# Detection covers three failure modes:
#   1. Missing: dpinger process gone entirely (no socket, no proc)
#   2. Hung: socket exists but unresponsive (usocket_thread dead)
#   3. Zombie: socket responds but reports 100% loss while manual ping
#      from bind_addr to monitor_addr succeeds (send/recv threads orphaned
#      after ipsec interface destroy/recreate)
#
# Usage:
#   dpinger_watchdog.sh           # quiet (cron mode) - logs to syslog only
#   dpinger_watchdog.sh -v        # verbose - prints progress to stdout
#   When run on a TTY, verbose mode is enabled automatically.

# --- Tunables ---
# Shell words used to send a message to syslog (expanded unquoted by say()).
LOGCMD="logger -t dpinger_watchdog"
# Directory that is scanned for dpinger_*.sock control sockets.
SOCK_DIR="/var/run"
# Seconds allowed for a dpinger socket to answer before it counts as hung.
TIMEOUT_SEC=5
# Manual probe: number of echo requests, and per-reply wait in seconds.
PING_COUNT=4
PING_WAIT=1

# --- Verbosity ---
# VERBOSE becomes 1 when the first argument is -v or when stdout is a
# terminal; in every other case it is 0.
case "${1-}" in
    -v)
        VERBOSE=1
        ;;
    *)
        if [ -t 1 ]; then
            VERBOSE=1
        else
            VERBOSE=0
        fi
        ;;
esac

# say MESSAGE
#   Sends MESSAGE to the command held in $LOGCMD and, when VERBOSE is 1,
#   also prints it to stdout prefixed with an [HH:MM:SS] timestamp.
#   Returns 0 in quiet mode; an `if` is used instead of `[ ... ] && ...`
#   so that a false verbosity test is not reported as a failure.
say() {
    # $LOGCMD is deliberately unquoted: it holds a command plus its options.
    $LOGCMD "$1"
    if [ "${VERBOSE:-0}" -eq 1 ]; then
        printf '[%s] %s\n' "$(date '+%H:%M:%S')" "$1"
    fi
}

# vsay MESSAGE
#   Prints MESSAGE to stdout with an [HH:MM:SS] timestamp when VERBOSE is 1;
#   does nothing otherwise. Nothing is sent to syslog.
#   Returns 0 in quiet mode: written with `if` rather than `[ ... ] && ...`
#   so that a script ending in a vsay call does not exit with status 1.
vsay() {
    if [ "${VERBOSE:-0}" -eq 1 ]; then
        printf '[%s] %s\n' "$(date '+%H:%M:%S')" "$1"
    fi
}

# Accumulated verdict: restart_needed flips to 1 as soon as any check finds a
# problem; problem_gateways collects " name(reason)" entries for the log line.
restart_needed=0
problem_gateways=""

vsay "=== dpinger watchdog start ==="

# gateway_has_socket NAME
#   Succeeds when at least one entry matching dpinger_NAME~*.sock exists in
#   $SOCK_DIR. The glob is expanded by the shell (no ls parsing); when nothing
#   matches, the pattern stays literal and the -e test fails.
gateway_has_socket() {
    set -- "${SOCK_DIR}/dpinger_${1}~"*.sock
    [ -e "$1" ]
}

# --- Check 1: Missing dpinger processes ---
vsay "Check 1: querying pfSense for expected gateways..."
expected_gateways=$(/usr/local/bin/php -r '
require_once("config.inc");
require_once("gwlb.inc");
$gateways = get_gateways();
foreach ($gateways as $gw) {
    if (isset($gw["disabled"])) continue;
    if (isset($gw["monitor_disable"])) continue;
    echo $gw["name"] . "\n";
}
' 2>/dev/null)
php_rc=$?
# PHP's stderr is discarded above, so a failed query would otherwise look
# exactly like "no gateways configured" and silently disable this check.
if [ "$php_rc" -ne 0 ]; then
    say "WARNING: gateway query failed (php rc=${php_rc}) - missing-process check may be incomplete"
fi

# Unquoted on purpose: word splitting joins the names with single spaces.
vsay "Expected gateways: $(echo $expected_gateways)"

for gwname in $expected_gateways; do
    if gateway_has_socket "$gwname"; then
        vsay "  ${gwname}: socket present"
    # The name must be followed by a space or end of line; otherwise a
    # gateway "WAN" would be satisfied by a process started with "-i WAN2".
    elif pgrep -f "dpinger.*-i ${gwname}( |\$)" > /dev/null 2>&1; then
        vsay "  ${gwname}: proc exists but socket missing (transient?)"
    else
        restart_needed=1
        problem_gateways="${problem_gateways} ${gwname}(missing)"
        say "MISSING: dpinger process gone for gateway ${gwname}"
    fi
done

# count_ping_replies
#   Reads ping output on stdin and prints the number taken from the
#   "N packets received" field of the summary line. Prints 0 when no summary
#   line is present (for example when ping was killed before finishing).
count_ping_replies() {
    awk -F, '/packets received/ { n = $2 + 0 } END { print n + 0 }'
}

# --- Checks 2 & 3: hung sockets and zombie processes ---
vsay "Checks 2/3: probing each socket and checking for zombies..."

# Hard limit for the manual probe, derived from the ping settings so that a
# larger PING_COUNT cannot get ping killed before it prints its summary.
# With PING_COUNT=4 and PING_WAIT=1 this is 8 seconds.
probe_timeout=$((PING_COUNT + PING_WAIT + 3))

for sock in "${SOCK_DIR}"/dpinger_*.sock; do
    # An unmatched glob stays literal; skip it.
    [ -e "$sock" ] || continue

    # Socket file name layout: dpinger_<gateway>~<bind_addr>~<monitor_addr>.sock
    base=${sock##*/}
    base=${base%.sock}
    IFS='~' read -r gwname bind_addr monitor_addr _rest <<EOF
${base#dpinger_}
EOF

    result=$(timeout "${TIMEOUT_SEC}" nc -U "$sock" < /dev/null 2>/dev/null)
    rc=$?

    # Check 2: hung socket (connect failed, timed out, or returned nothing)
    if [ "$rc" -ne 0 ] || [ -z "$result" ]; then
        restart_needed=1
        problem_gateways="${problem_gateways} ${gwname}(hung)"
        say "HUNG: socket unresponsive for ${gwname} (rc=${rc})"
        continue
    fi

    # Fields 2-4 of the reply are used as latency, stddev and loss.
    read -r _name latency stddev loss _rest <<EOF
$result
EOF
    vsay "  ${gwname} (bind=${bind_addr} mon=${monitor_addr}): latency=${latency}us stddev=${stddev}us loss=${loss}%"

    # Check 3: zombie state - dpinger claims total loss; verify independently.
    [ "$loss" = "100" ] || continue

    if [ -z "$bind_addr" ] || [ -z "$monitor_addr" ]; then
        vsay "    100% loss reported but addresses could not be parsed from socket name - skipping probe"
        continue
    fi

    # A colon in the bind address means IPv6.
    case "$bind_addr" in
        *:*) ping_cmd=ping6 ;;
        *)   ping_cmd=ping ;;
    esac
    vsay "    100% loss reported - probing manually with $ping_cmd -c $PING_COUNT -S $bind_addr $monitor_addr"
    ping_out=$(timeout "$probe_timeout" "$ping_cmd" -c "$PING_COUNT" \
        -W "$((PING_WAIT * 1000))" -S "$bind_addr" "$monitor_addr" 2>/dev/null)
    replies=$(printf '%s\n' "$ping_out" | count_ping_replies)
    vsay "    probe result: ${replies}/${PING_COUNT} replies"

    if [ "$replies" -gt 0 ]; then
        restart_needed=1
        problem_gateways="${problem_gateways} ${gwname}(zombie)"
        say "ZOMBIE: ${gwname} reports 100% loss but ${replies}/${PING_COUNT} pings succeeded from ${bind_addr} to ${monitor_addr}"
    else
        vsay "    real outage (probe also fails) - skipping"
    fi
done

# --- Restart if needed ---
# exit_status becomes the script's exit code: 0 when nothing had to be done or
# the restart succeeded, 1 when the restart command failed.
exit_status=0
if [ "$restart_needed" -eq 1 ]; then
    say "Restarting gateway monitoring for:${problem_gateways}"
    # Capture PHP's output instead of letting it reach stdout, so that quiet
    # (cron) runs stay silent and anything PHP printed ends up in syslog.
    restart_out=$(/usr/local/bin/php -r '
        require_once("config.inc");
        require_once("gwlb.inc");
        setup_gateways_monitor();
    ' 2>&1)
    rc=$?
    if [ -n "$restart_out" ]; then
        say "setup_gateways_monitor output: ${restart_out}"
    fi
    if [ "$rc" -eq 0 ]; then
        say "Gateway monitoring restarted successfully"
    else
        say "ERROR: failed to restart gateway monitoring (rc=${rc})"
        exit_status=1
    fi
else
    vsay "All gateways healthy - no action"
fi

vsay "=== done ==="

# Exit explicitly so the script's status reflects the restart outcome rather
# than the status of the last logging call.
exit "$exit_status"
