#!/bin/sh
#
# dpinger_watchdog.sh - Detects hung/missing/zombie dpinger processes and restarts them
#
# Detection covers three failure modes:
#   1. Missing: dpinger process gone entirely (no socket, no proc)
#   2. Hung:    socket exists but unresponsive (usocket_thread dead)
#   3. Zombie:  socket responds but reports 100% loss while manual ping
#               from bind_addr to monitor_addr succeeds (send/recv threads orphaned
#               after ipsec interface destroy/recreate)
#
# Usage:
#   dpinger_watchdog.sh       # quiet (cron mode) - logs to syslog only
#   dpinger_watchdog.sh -v    # verbose - prints progress to stdout
#   When run on a TTY, verbose mode is enabled automatically.
|
|
16
|
|
|
17
|
# Tunables used by the checks below.
SOCK_DIR="/var/run"                  # directory holding the dpinger_*.sock files
TIMEOUT_SEC=5                        # max seconds to wait for a socket to answer
PING_COUNT=4                         # echo requests sent by the manual probe
PING_WAIT=1                          # per-reply wait of the manual probe, in seconds
LOGCMD="logger -t dpinger_watchdog"  # command (plus arguments) used to log messages
|
|
22
|
|
|
23
|
# Verbose if -v passed or stdout is a TTY.
# ${1:-} keeps the test well-defined when the script is run without arguments.
VERBOSE=0
if [ "${1:-}" = "-v" ] || [ -t 1 ]; then
    VERBOSE=1
fi
|
|
28
|
|
|
29
|
# say MESSAGE
# Logs MESSAGE through $LOGCMD and, when VERBOSE is 1, also prints it to
# stdout prefixed with the current time as [HH:MM:SS].
# Always returns 0 so that a quiet run does not leave a failing status behind.
say() {
    # LOGCMD is deliberately unquoted: it holds a command plus its arguments.
    $LOGCMD "$1"
    if [ "${VERBOSE:-0}" -eq 1 ]; then
        printf '[%s] %s\n' "$(date '+%H:%M:%S')" "$1"
    fi
    return 0
}
|
|
33
|
|
|
34
|
# vsay MESSAGE
# Prints MESSAGE to stdout prefixed with [HH:MM:SS], but only when VERBOSE is 1.
# Nothing is logged. Always returns 0: this function is the last command the
# script runs, so a non-zero status here would become the script's exit status
# in quiet mode.
vsay() {
    if [ "${VERBOSE:-0}" -eq 1 ]; then
        printf '[%s] %s\n' "$(date '+%H:%M:%S')" "$1"
    fi
    return 0
}
|
|
37
|
|
|
38
|
# State accumulated by the checks below.
restart_needed=0      # set to 1 as soon as any check finds a problem
problem_gateways=""   # space-separated "name(reason)" entries for the log message
|
|
40
|
|
|
41
|
vsay "=== dpinger watchdog start ==="

# --- Check 1: Missing dpinger processes ---
# Ask pfSense for every gateway that is neither disabled nor has monitoring
# disabled; each of those is expected to have a dpinger socket or process.
vsay "Check 1: querying pfSense for expected gateways..."
expected_gateways=$(/usr/local/bin/php -r '
require_once("config.inc");
require_once("gwlb.inc");
$gateways = get_gateways();
foreach ($gateways as $gw) {
    if (isset($gw["disabled"])) continue;
    if (isset($gw["monitor_disable"])) continue;
    echo $gw["name"] . "\n";
}
' 2>/dev/null)
php_rc=$?
if [ "$php_rc" -ne 0 ]; then
    # A failed PHP run may have written error text to stdout; treating those
    # words as gateway names would report bogus "missing" gateways and trigger
    # a restart, so discard the output and skip this check instead.
    say "WARNING: could not query expected gateways (php rc=${php_rc}) - skipping missing-process check"
    expected_gateways=""
fi

# Unquoted on purpose: word splitting joins the names onto one line.
vsay "Expected gateways: $(echo $expected_gateways | tr '\n' ' ')"

for gwname in $expected_gateways; do
    # Look for a socket named dpinger_<gateway>~<bind>~<monitor>.sock. When the
    # glob matches nothing it stays literal and the -e test fails.
    sock_found=0
    for candidate in "${SOCK_DIR}/dpinger_${gwname}"~*.sock; do
        if [ -e "$candidate" ]; then
            sock_found=1
            break
        fi
    done

    if [ "$sock_found" -eq 1 ]; then
        vsay " ${gwname}: socket present"
        continue
    fi

    # The identifier must be followed by a space or end of line; otherwise a
    # gateway such as WAN_DHCP would be matched by the process of WAN_DHCP6.
    if pgrep -f "dpinger.*-i ${gwname}( |\$)" > /dev/null 2>&1; then
        vsay " ${gwname}: proc exists but socket missing (transient?)"
    else
        restart_needed=1
        problem_gateways="${problem_gateways} ${gwname}(missing)"
        say "MISSING: dpinger process gone for gateway ${gwname}"
    fi
done
|
|
72
|
|
|
73
|
# --- Checks 2 & 3: hung sockets and zombie processes ---
# Every dpinger socket is queried once. No answer means the process is hung.
# An answer reporting 100% loss is cross-checked with a manual ping using the
# same source and monitor address; if that ping gets replies, dpinger is
# considered a zombie.
vsay "Checks 2/3: probing each socket and checking for zombies..."

# Hard limit for the manual probe, derived from the tunables so that raising
# PING_COUNT or PING_WAIT cannot get ping killed before it prints its summary
# (which would be misread as a real outage). Evaluates to 8 with the defaults.
# NOTE(review): assumes ping's default one-second send interval.
ping_timeout=$((PING_COUNT + PING_WAIT + 3))

for sock in "${SOCK_DIR}"/dpinger_*.sock; do
    [ -e "$sock" ] || continue

    # Socket files are named dpinger_<gateway>~<bind_addr>~<monitor_addr>.sock
    base=${sock##*/}
    base=${base%.sock}
    IFS='~' read -r gw_field bind_addr monitor_addr extra_fields <<EOF
$base
EOF
    gwname=${gw_field#dpinger_}

    result=$(timeout "$TIMEOUT_SEC" nc -U "$sock" < /dev/null 2>/dev/null)
    rc=$?

    # Check 2: hung socket
    if [ "$rc" -ne 0 ] || [ -z "$result" ]; then
        restart_needed=1
        problem_gateways="${problem_gateways} ${gwname}(hung)"
        say "HUNG: socket unresponsive for ${gwname} (rc=${rc})"
        continue
    fi

    # The reply is read as whitespace-separated fields: the first is skipped,
    # then latency, stddev and loss.
    read -r report_name latency stddev loss extra_fields <<EOF
$result
EOF
    vsay " ${gwname} (bind=${bind_addr} mon=${monitor_addr}): latency=${latency}us stddev=${stddev}us loss=${loss}%"

    # Check 3: zombie state
    if [ "$loss" = "100" ]; then
        # A colon in the bind address means IPv6.
        case "$bind_addr" in
            *:*) ping_cmd=ping6 ;;
            *)   ping_cmd=ping ;;
        esac
        vsay " 100% loss reported - probing manually with $ping_cmd -c $PING_COUNT -S $bind_addr $monitor_addr"
        ping_out=$(timeout "$ping_timeout" "$ping_cmd" -c "$PING_COUNT" \
            -W "$((PING_WAIT * 1000))" -S "$bind_addr" "$monitor_addr" 2>/dev/null)

        # Summary line looks like "N packets transmitted, M packets received, ...";
        # adding 0 makes awk keep only the leading number of the second field.
        replies=$(printf '%s\n' "$ping_out" \
            | awk -F, '/packets received/ { print $2 + 0; exit }')
        replies=${replies:-0}
        vsay " probe result: ${replies}/${PING_COUNT} replies"

        if [ "$replies" -gt 0 ]; then
            restart_needed=1
            problem_gateways="${problem_gateways} ${gwname}(zombie)"
            say "ZOMBIE: ${gwname} reports 100% loss but ${replies}/${PING_COUNT} pings succeeded from ${bind_addr} to ${monitor_addr}"
        else
            vsay " real outage (probe also fails) - skipping"
        fi
    fi
done
|
|
119
|
|
|
120
|
# --- Restart if needed ---
# A single call to pfSense's setup_gateways_monitor() is made when any check
# above flagged a gateway.
if [ "$restart_needed" -eq 1 ]; then
    say "Restarting gateway monitoring for:${problem_gateways}"
    # Capture PHP's output instead of letting it reach stdout, so quiet (cron)
    # runs stay silent; it is forwarded line by line through say below.
    restart_out=$(/usr/local/bin/php -r '
require_once("config.inc");
require_once("gwlb.inc");
setup_gateways_monitor();
' 2>&1)
    rc=$?

    if [ -n "$restart_out" ]; then
        printf '%s\n' "$restart_out" | while IFS= read -r out_line; do
            say "php: ${out_line}"
        done
    fi

    if [ "$rc" -eq 0 ]; then
        say "Gateway monitoring restarted successfully"
    else
        say "ERROR: failed to restart gateway monitoring (rc=${rc})"
    fi
else
    vsay "All gateways healthy - no action"
fi

vsay "=== done ==="
|