Monitoring Checks
Daily health checks and alerting verification.
Quick Health Check
# System snapshot
uptime
free -h
df -h | grep -E "^/dev"
# Services
systemctl --failed
# Recent errors
journalctl -p err --since "1 hour ago" | tail -10
System Health
CPU & Load
# Load average
uptime
cat /proc/loadavg
# CPU usage
mpstat 1 3
top -bn1 | head -5
Memory
# Memory usage
free -h
# Swap usage (should be minimal)
swapon --show
# Memory hogs
ps aux --sort=-%mem | head -10
Disk
# Disk usage
df -h
# Inode usage
df -i
# Disk I/O
iostat -x 1 3
Service Checks
Essential Services
# Check specific services.
# --quiet suppresses systemctl's own state line so only the OK/FAILED report
# is printed; an explicit if/else is used because `a && b || c` would also run
# `c` whenever `b` itself fails.
for svc in nginx mysql redis; do
  if systemctl is-active --quiet "$svc"; then
    echo "$svc: OK"
  else
    echo "$svc: FAILED"
  fi
done
# Failed services
systemctl --failed
# Recent restarts
journalctl -u nginx --since "1 hour ago" | grep -i start
Port Checks
# Listening ports (TCP and UDP).
# -l already limits the output to listening sockets. UDP listeners are shown
# with state UNCONN, so filtering on "LISTEN" would silently hide all of them.
ss -tuln
# Specific ports
nc -zv localhost 80
nc -zv localhost 443
nc -zv localhost 3306
Application Health
HTTP Endpoints
# Health check endpoint
curl -s http://localhost:8080/health | jq .
# Response time
curl -w "Time: %{time_total}s\n" -o /dev/null -s http://localhost/
# Status codes
curl -s -o /dev/null -w "%{http_code}" http://localhost/
Database
# PostgreSQL
pg_isready -h localhost
# MySQL
mysqladmin ping
# Redis
redis-cli ping
Alerting Verification
Check Alert Rules
# Prometheus rules
curl -s localhost:9090/api/v1/rules | jq '.data.groups[].rules[] | select(.state == "firing")'
# Alert count
curl -s localhost:9090/api/v1/alerts | jq '.data.alerts | length'
Test Alerts
# Trigger test alert.
# Uses the Alertmanager v2 API (the v1 API was removed in Alertmanager 0.27).
# The v2 endpoint only accepts JSON, so the Content-Type header is required;
# without it curl sends application/x-www-form-urlencoded.
curl -X POST localhost:9093/api/v2/alerts \
  -H 'Content-Type: application/json' \
  -d '[{"labels":{"alertname":"TestAlert"}}]'
# Check alert history (lists the alerts Alertmanager currently holds)
curl -s localhost:9093/api/v2/alerts | jq .
Daily Checklist Script
#!/bin/bash
# Daily health check: prints a summary of system load, memory, disk usage,
# service states, recent error-level journal entries and HTTP endpoint status.
#
# Deliberately runs without `set -e`: a failing probe must show up in the
# report rather than abort the remaining checks.

printf '=== Daily Health Check %s ===\n' "$(date)"

printf '\n--- System ---\n'
uptime
free -h | grep Mem
# Device and Use% columns for real block devices only.
df -h | awk '/^\/dev/ {print $1, $5}'

printf '\n--- Services ---\n'
for svc in nginx mysql redis; do
  # is-active exits non-zero for any state other than "active" but still
  # prints the state name; fall back to "unknown" if nothing was printed.
  status=$(systemctl is-active "$svc" 2>/dev/null)
  printf '%s: %s\n' "$svc" "${status:-unknown}"
done

printf '\n--- Errors (last hour) ---\n'
journalctl -p err --since "1 hour ago" --no-pager | tail -5

printf '\n--- Endpoints ---\n'
# --max-time bounds each probe so a hung service cannot stall the report;
# curl prints 000 as the status code when no HTTP response was received.
curl -s -o /dev/null --max-time 10 -w "Web: %{http_code}\n" http://localhost/
curl -s -o /dev/null --max-time 10 -w "API: %{http_code}\n" http://localhost:8080/health

printf '\n--- Done ---\n'
Metrics to Watch
| Metric | Warning | Critical |
|---|---|---|
| CPU | > 70% | > 90% |
| Memory | > 80% | > 95% |
| Disk | > 75% | > 90% |
| Load | > CPUs | > 2x CPUs |
| Error rate | > 1% | > 5% |
| Response time | > 1s | > 5s |
- monitoring
- health check
- alerting
- metrics
- observability