Project

General

Profile

Bug #16824 » dpinger_watchdog.sh

workaround cron script - Chris Baker, 05/04/2026 08:31 PM

 
1
#!/bin/sh
2
#
3
# dpinger_watchdog.sh - Detects hung/missing/zombie dpinger processes and restarts them
4
#
5
# Detection covers three failure modes:
6
#   1. Missing: dpinger process gone entirely (no socket, no proc)
7
#   2. Hung: socket exists but unresponsive (usocket_thread dead)
8
#   3. Zombie: socket responds but reports 100% loss while manual ping
9
#      from bind_addr to monitor_addr succeeds (send/recv threads orphaned
10
#      after ipsec interface destroy/recreate)
11
#
12
# Usage:
13
#   dpinger_watchdog.sh           # quiet (cron mode) - logs to syslog only
14
#   dpinger_watchdog.sh -v        # verbose - prints progress to stdout
15
#   When run on a TTY, verbose mode is enabled automatically.
16

    
17
# --- Tunables ---
# Directory searched for dpinger_*.sock control sockets.
SOCK_DIR=/var/run
# Limit (passed to timeout) for a dpinger socket to answer before it counts as hung.
TIMEOUT_SEC=5
# Number of echo requests sent by the manual probe in the zombie check.
PING_COUNT=4
# Reply wait for the manual probe; multiplied by 1000 before being passed to ping -W.
PING_WAIT=1
# Command prefix used for syslog output; it is expanded unquoted so that it
# splits into the command and its options.
LOGCMD='logger -t dpinger_watchdog'
22

    
23
# Verbose output is enabled by an explicit -v as the first argument or when
# stdout is a terminal; otherwise the script stays quiet.
if [ "$1" = "-v" ]; then
    VERBOSE=1
elif [ -t 1 ]; then
    VERBOSE=1
else
    VERBOSE=0
fi
28

    
29
# say MESSAGE
# Logs MESSAGE through $LOGCMD unconditionally and, when $VERBOSE is 1, also
# prints it to stdout prefixed with the current HH:MM:SS time.
# An explicit `if` is used so the function returns 0 in quiet mode; a trailing
# `[ ... ] && echo` would make it return 1 whenever VERBOSE is 0.
say() {
    # $LOGCMD is intentionally unquoted: it holds a command plus its options.
    $LOGCMD "$1"
    if [ "$VERBOSE" -eq 1 ]; then
        printf '[%s] %s\n' "$(date '+%H:%M:%S')" "$1"
    fi
}
33

    
34
# vsay MESSAGE
# Prints MESSAGE to stdout with an HH:MM:SS prefix when $VERBOSE is 1;
# does nothing otherwise (never writes to syslog).
# Always returns 0 when quiet: this function is the script's last command, so
# a `[ ... ] && echo` body would make every quiet run exit with status 1.
vsay() {
    if [ "$VERBOSE" -eq 1 ]; then
        printf '[%s] %s\n' "$(date '+%H:%M:%S')" "$1"
    fi
}
37

    
38
# Accumulated findings: the checks below set restart_needed to 1 and append
# " <gateway>(<reason>)" entries to problem_gateways.
problem_gateways=
restart_needed=0
40

    
41
vsay "=== dpinger watchdog start ==="

# --- Check 1: Missing dpinger processes ---
# Ask pfSense for the gateways that are neither disabled nor have monitoring
# disabled, then verify each one has a control socket or a live process.
vsay "Check 1: querying pfSense for expected gateways..."
expected_gateways=$(/usr/local/bin/php -r '
require_once("config.inc");
require_once("gwlb.inc");
$gateways = get_gateways();
foreach ($gateways as $gw) {
    if (isset($gw["disabled"])) continue;
    if (isset($gw["monitor_disable"])) continue;
    echo $gw["name"] . "\n";
}
' 2>/dev/null)
php_rc=$?
if [ "$php_rc" -ne 0 ]; then
    # The helper's stderr is discarded above, so report the failure here
    # instead of silently running Check 1 on an empty or partial list.
    say "WARNING: gateway query failed (php rc=${php_rc}); missing-process check may be incomplete"
fi

# Unquoted on purpose: word splitting collapses the list onto one line.
vsay "Expected gateways: $(echo $expected_gateways | tr '\n' ' ')"

for gwname in $expected_gateways; do
    # Look for dpinger_<gwname>~*.sock with a glob rather than parsing ls.
    # The "~" right after the name keeps e.g. WAN from matching WAN2's socket.
    sock_found=0
    for candidate in "${SOCK_DIR}"/dpinger_"${gwname}"~*.sock; do
        if [ -e "$candidate" ]; then
            sock_found=1
            break
        fi
    done

    if [ "$sock_found" -eq 1 ]; then
        vsay "  ${gwname}: socket present"
    # Anchor the name with a trailing space or end-of-line so "-i WAN" does
    # not match a running "-i WAN2" and hide a missing process.
    elif pgrep -f "dpinger.*-i ${gwname}( |\$)" > /dev/null 2>&1; then
        vsay "  ${gwname}: proc exists but socket missing (transient?)"
    else
        restart_needed=1
        problem_gateways="${problem_gateways} ${gwname}(missing)"
        say "MISSING: dpinger process gone for gateway ${gwname}"
    fi
done
72

    
73
# --- Checks 2 & 3: hung sockets and zombie processes ---
vsay "Checks 2/3: probing each socket and checking for zombies..."

# Hard limit for the manual ping probe, derived from the probe settings so it
# keeps up if they are raised: roughly one second per packet (ping's default
# send interval), plus the reply wait, plus 3s of slack. This evaluates to 8
# with PING_COUNT=4 and PING_WAIT=1. If ping were killed before printing its
# summary, replies would parse as 0 and a zombie would go undetected.
probe_timeout=$((PING_COUNT + PING_WAIT + 3))

for sock in "${SOCK_DIR}"/dpinger_*.sock; do
    # An unmatched glob stays literal; skip it.
    [ -e "$sock" ] || continue

    # Socket file name layout: dpinger_<gwname>~<bind_addr>~<monitor_addr>.sock
    base=$(basename "$sock" .sock)
    gwname=$(printf '%s\n' "$base" | sed 's|^dpinger_||; s|~.*||')
    bind_addr=$(printf '%s\n' "$base" | awk -F~ '{print $2}')
    monitor_addr=$(printf '%s\n' "$base" | awk -F~ '{print $3}')

    # Connect with empty stdin and read whatever status the socket returns.
    result=$(timeout "${TIMEOUT_SEC}" nc -U "$sock" < /dev/null 2>/dev/null)
    rc=$?

    # Check 2: hung socket (connect/timeout failure or no data returned)
    if [ "$rc" -ne 0 ] || [ -z "$result" ]; then
        restart_needed=1
        problem_gateways="${problem_gateways} ${gwname}(hung)"
        say "HUNG: socket unresponsive for ${gwname} (rc=${rc})"
        continue
    fi

    # Fields 2-4 of the status line are used as latency, stddev and loss.
    latency=$(printf '%s\n' "$result" | awk '{print $2}')
    stddev=$(printf '%s\n' "$result" | awk '{print $3}')
    loss=$(printf '%s\n' "$result" | awk '{print $4}')
    vsay "  ${gwname} (bind=${bind_addr} mon=${monitor_addr}): latency=${latency}us stddev=${stddev}us loss=${loss}%"

    # Check 3: zombie state - dpinger claims total loss; verify independently.
    if [ "$loss" = "100" ]; then
        # A colon in the bind address means IPv6.
        case "$bind_addr" in
            *:*) PING=ping6 ;;
            *)   PING=ping ;;
        esac
        vsay "    100% loss reported - probing manually with $PING -c $PING_COUNT -S $bind_addr $monitor_addr"
        ping_out=$(timeout "$probe_timeout" "$PING" -c "$PING_COUNT" -W "$((PING_WAIT * 1000))" -S "$bind_addr" "$monitor_addr" 2>/dev/null)
        # Extract N from the ", N packets received," part of ping's summary.
        replies=$(printf '%s\n' "$ping_out" | awk -F, '/packets received/ {gsub(/ /, "", $2); print $2}' | grep -oE '^[0-9]+')
        replies=${replies:-0}
        vsay "    probe result: ${replies}/${PING_COUNT} replies"
        if [ "$replies" -gt 0 ]; then
            restart_needed=1
            problem_gateways="${problem_gateways} ${gwname}(zombie)"
            say "ZOMBIE: ${gwname} reports 100% loss but ${replies}/${PING_COUNT} pings succeeded from ${bind_addr} to ${monitor_addr}"
        else
            vsay "    real outage (probe also fails) - skipping"
        fi
    fi
done
119

    
120
# --- Restart if needed ---
# Exit status of the script: 0 when nothing needed fixing or the restart
# succeeded, otherwise the PHP helper's non-zero status.
exit_status=0
if [ "$restart_needed" -eq 1 ]; then
    say "Restarting gateway monitoring for:${problem_gateways}"
    # Capture the helper's output instead of letting it reach stdout, so quiet
    # (cron) runs stay syslog-only; each line is relayed through say.
    restart_output=$(/usr/local/bin/php -r '
        require_once("config.inc");
        require_once("gwlb.inc");
        setup_gateways_monitor();
    ' 2>&1)
    rc=$?
    if [ -n "$restart_output" ]; then
        printf '%s\n' "$restart_output" | while IFS= read -r line; do
            say "php: ${line}"
        done
    fi
    if [ "$rc" -eq 0 ]; then
        say "Gateway monitoring restarted successfully"
    else
        say "ERROR: failed to restart gateway monitoring (rc=${rc})"
        exit_status=$rc
    fi
else
    vsay "All gateways healthy - no action"
fi

vsay "=== done ==="
# Exit explicitly so the status of the final vsay call never becomes the
# script's exit code.
exit "$exit_status"
    (1-1/1)