@@ -57,8 +57,16 @@ fi |
| 57 | 57 | if (( SECS_SINCE > 300 )); then |
| 58 | 58 | fail "pg_stat_archiver last_archived_time is ${SECS_SINCE}s ago (>300s); archiver may be wedged" |
| 59 | 59 | fi |
| 60 | | -if (( FAILED_COUNT > 0 )); then |
| 61 | | - fail "pg_stat_archiver.failed_count=$FAILED_COUNT (last_failed_time epoch=$LAST_FAILED_TIME); inspect journalctl -u postgresql@16-main" |
| 60 | +# failed_count is cumulative since the last pg_stat_reset_shared('archiver') |
| 61 | +# — a non-zero count is fine if the failures pre-date the most recent |
| 62 | +# success. We only flag when the most recent FAILURE is newer than the |
| 63 | +# most recent SUCCESS (genuine ongoing breakage) AND that failure
| 64 | +# happened less than 600s ago, so it is recent enough to still be relevant.
| 65 | +if (( FAILED_COUNT > 0 && LAST_FAILED_TIME > LAST_TIME )); then |
| 66 | + SECS_SINCE_FAIL="$((NOW - LAST_FAILED_TIME))" |
| 67 | + if (( SECS_SINCE_FAIL < 600 )); then |
| 68 | + fail "pg_stat_archiver: most recent failure (${SECS_SINCE_FAIL}s ago) is newer than most recent success; archive_command is broken — inspect journalctl -u postgresql@16-main" |
| 69 | + fi |
| 62 | 70 | fi |
| 63 | 71 | |
| 64 | 72 | # 3. The most-recent segment is visible in Spaces. We list today's |
@@ -67,14 +75,14 @@ fi |
| 67 | 75 | # means rclone reported success but the bucket lost the object — |
| 68 | 76 | # rare but worth flagging. |
| 69 | 77 | TODAY="$(date -u +%Y/%m/%d)" |
| 70 | | -COUNT="$(rclone --config /root/.config/rclone/rclone.conf --s3-no-check-bucket \ |
| 78 | +COUNT="$(rclone --config /etc/rclone-shithub.conf --s3-no-check-bucket \ |
| 71 | 79 | lsf "spaces-prod:shithub-wal/$TODAY/" 2>/dev/null | wc -l)" |
| 72 | 80 | if (( COUNT == 0 )); then |
| 73 | 81 | # Edge case: it's a few minutes after UTC midnight and today's |
| 74 | 82 | # prefix is genuinely empty. Look at yesterday too. |
| 75 | 83 | YDAY="$(date -u -d 'yesterday' +%Y/%m/%d 2>/dev/null || \ |
| 76 | 84 | date -u -v-1d +%Y/%m/%d)" |
| 77 | | - YCOUNT="$(rclone --config /root/.config/rclone/rclone.conf --s3-no-check-bucket \ |
| 85 | + YCOUNT="$(rclone --config /etc/rclone-shithub.conf --s3-no-check-bucket \ |
| 78 | 86 | lsf "spaces-prod:shithub-wal/$YDAY/" 2>/dev/null | wc -l)" |
| 79 | 87 | if (( YCOUNT == 0 )); then |
| 80 | 88 | fail "no WAL segments visible in spaces-prod:shithub-wal/$TODAY or /$YDAY despite recent archive success" |