HxHippy

Monitoring Checks

Daily health checks and alerting verification.

Last updated: 2025-01-15

Monitoring Checks

Daily health checks and alerting verification.

Quick Health Check

# System snapshot: uptime and load, memory, and usage of filesystems whose
# device name starts with /dev (pseudo-filesystems such as tmpfs are filtered out)
uptime
free -h
df -h | grep -E "^/dev"

# Services: list the units systemd currently reports as failed
systemctl --failed

# Recent errors: last 10 journal entries of priority "err" or more severe
# logged within the past hour
journalctl -p err --since "1 hour ago" | tail -10

System Health

CPU & Load

# Load average (1, 5 and 15 minutes); /proc/loadavg exposes the same figures
# in raw form
uptime
cat /proc/loadavg

# CPU usage: three samples taken one second apart, followed by the first five
# lines of a single batch-mode top run
# (mpstat is typically shipped in the sysstat package)
mpstat 1 3
top -bn1 | head -5

Memory

# Memory usage in human-readable units
free -h

# Swap usage (should be minimal)
swapon --show

# Memory hogs: processes sorted by descending %MEM; head -10 keeps the header
# line plus the top nine processes
ps aux --sort=-%mem | head -10

Disk

# Disk usage per mounted filesystem, human-readable sizes
df -h

# Inode usage: a filesystem can run out of inodes while it still has free
# space, so check both
df -i

# Disk I/O: extended per-device statistics, three reports one second apart
# (iostat is typically shipped in the sysstat package)
iostat -x 1 3

Service Checks

Essential Services

# Check specific services.
# --quiet suppresses systemctl's own state word ("active", "inactive", ...) so
# each service produces exactly one line; only the exit status decides the
# verdict. An explicit if/else is used instead of `a && b || c`, which would
# also run the FAILED branch if the OK branch itself returned non-zero.
for svc in nginx mysql redis; do
    if systemctl is-active --quiet "$svc"; then
        echo "$svc: OK"
    else
        echo "$svc: FAILED"
    fi
done

# Failed services: every unit systemd currently reports as failed
systemctl --failed

# Recent restarts: journal lines for the nginx unit from the past hour that
# mention "start" (case-insensitive)
journalctl -u nginx --since "1 hour ago" | grep -i start

Port Checks

# Listening ports (TCP and UDP, numeric addresses).
# -l already restricts the output to listening sockets. Do not filter on the
# word "LISTEN": listening UDP sockets are reported with state UNCONN and
# would silently disappear from the list.
ss -tuln

# Specific ports: -z probes without sending data, -v reports the result
nc -zv localhost 80
nc -zv localhost 443
nc -zv localhost 3306

Application Health

HTTP Endpoints

# Health check endpoint: fetch the response and pretty-print it as JSON
curl -s http://localhost:8080/health | jq .

# Response time: discard the body and print only the total request time
curl -w "Time: %{time_total}s\n" -o /dev/null -s http://localhost/

# Status codes: discard the body and print only the HTTP status.
# The trailing \n keeps the code on its own line instead of running into the
# shell prompt.
curl -s -o /dev/null -w "%{http_code}\n" http://localhost/

Database

# PostgreSQL: report whether the server on localhost is accepting connections
pg_isready -h localhost

# MySQL: check whether the server is alive
mysqladmin ping

# Redis: a reachable server answers PONG
redis-cli ping

Alerting Verification

Check Alert Rules

# Prometheus rules: walk every rule in every group and print only those whose
# state is "firing"
curl -s localhost:9090/api/v1/rules | jq '.data.groups[].rules[] | select(.state == "firing")'

# Alert count: number of entries in the alerts list returned by Prometheus
curl -s localhost:9090/api/v1/alerts | jq '.data.alerts | length'

Test Alerts

# Trigger test alert.
# Uses the Alertmanager v2 API; the v1 endpoints are deprecated and no longer
# served by newer Alertmanager releases. The payload is JSON, so the
# Content-Type header must say so (curl -d alone would send it as
# application/x-www-form-urlencoded).
curl -X POST localhost:9093/api/v2/alerts \
  -H 'Content-Type: application/json' \
  -d '[{"labels":{"alertname":"TestAlert"}}]'

# Check the alerts Alertmanager currently holds (the test alert should appear)
curl -s localhost:9093/api/v2/alerts | jq .

Daily Checklist Script

#!/bin/bash
# Daily health check: prints a short report covering system load, memory,
# disk usage, key services, recent journal errors and HTTP endpoints.
#
# Deliberately no `set -e`: every check is best-effort, and one failing
# command must not prevent the remaining sections from being reported.

# Services whose systemd state is reported.
readonly SERVICES=(nginx mysql redis)
# Upper bound (seconds) per HTTP probe so a hung endpoint cannot stall the run.
readonly CURL_TIMEOUT=5

printf '=== Daily Health Check %s ===\n' "$(date)"

printf '\n--- System ---\n'
uptime
free -h | grep Mem
# Device name and use% of every /dev-backed filesystem
df -h | grep -E "^/dev" | awk '{print $1, $5}'

printf '\n--- Services ---\n'
for svc in "${SERVICES[@]}"; do
    # is-active prints the state word; stderr is silenced on purpose so a
    # missing unit does not clutter the report.
    status=$(systemctl is-active "$svc" 2>/dev/null)
    # Fall back to "unknown" when systemctl produced no output at all.
    echo "$svc: ${status:-unknown}"
done

printf '\n--- Errors (last hour) ---\n'
journalctl -p err --since "1 hour ago" --no-pager | tail -5

printf '\n--- Endpoints ---\n'
# curl reports 000 as the status code when no HTTP response was received
# (connection refused, timeout, ...).
curl -s -o /dev/null --max-time "$CURL_TIMEOUT" -w "Web: %{http_code}\n" http://localhost/
curl -s -o /dev/null --max-time "$CURL_TIMEOUT" -w "API: %{http_code}\n" http://localhost:8080/health

printf '\n--- Done ---\n'

Metrics to Watch

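| Metric        | Warning            | Critical              |
|---------------|--------------------|-----------------------|
| CPU           | > 70%              | > 90%                 |
| Memory        | > 80%              | > 95%                 |
| Disk          | > 75%              | > 90%                 |
| Load          | > number of CPUs   | > 2x number of CPUs   |
| Error rate    | > 1%               | > 5%                  |
| Response time | > 1s               | > 5s                  |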
beginner Daily Operations Updated 2025-01-15
  • monitoring
  • health check
  • alerting
  • metrics
  • observability