Full sync - all projects, memory, configs
This commit is contained in:
239
tools/self-healing-server.py
Normal file
239
tools/self-healing-server.py
Normal file
@ -0,0 +1,239 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Self-Healing Server — Always-on infrastructure monitoring & auto-repair.
|
||||
|
||||
Monitors services, disk, memory, CPU. Auto-restarts failed services.
|
||||
Logs all actions. Run via cron every 5 minutes or as a daemon.
|
||||
|
||||
Usage:
|
||||
python3 self-healing-server.py # Run once (check & heal)
|
||||
python3 self-healing-server.py --daemon # Run continuously (every 5 min)
|
||||
python3 self-healing-server.py --status # Show current health
|
||||
"""
|
||||
|
||||
import json
|
||||
import subprocess
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import psutil
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
# All runtime artefacts (event log, config, status file) live in
# data/self-healing, resolved two directory levels above this file.
DATA_DIR = Path(__file__).parent.parent.joinpath("data", "self-healing")
# Created at import time so later reads and writes never hit a missing directory.
DATA_DIR.mkdir(parents=True, exist_ok=True)
# Append-only event log, one JSON object per line.
LOG_FILE = DATA_DIR.joinpath("heal-log.jsonl")
# User-editable configuration; seeded from DEFAULT_CONFIG when absent.
CONFIG_FILE = DATA_DIR.joinpath("config.json")
|
||||
|
||||
# Built-in configuration; written to CONFIG_FILE on first run by load_config().
DEFAULT_CONFIG = {
    # systemd unit names checked with `systemctl is-active`.
    "monitored_services": ["nexus", "nginx", "ssh"],
    # TCP port (as a string, because JSON object keys are strings) -> label.
    "monitored_ports": {
        "8000": "control-panel",
        "8888": "feed-hunter",
        "8889": "market-watch",
        "8890": "ticker",
        "80": "nginx",
        "3000": "nexus",
    },
    # Percentages at or above which a resource is reported as an issue.
    "thresholds": {
        "disk_percent": 90,
        "memory_percent": 90,
        "cpu_percent": 95,
        "swap_percent": 80,
    },
    # When true, services found down are restarted automatically.
    "auto_restart": True,
    # Sleep between checks in --daemon mode.
    "check_interval_seconds": 300,
}
|
||||
|
||||
|
||||
def load_config():
    """Return the monitoring configuration as a fresh dict.

    If CONFIG_FILE does not exist, DEFAULT_CONFIG is written to it and a
    copy of the defaults is returned. If it exists, its top-level keys
    override the defaults, so a partial config file still yields every
    key the checks read. An unreadable or malformed file (or one whose
    top level is not a JSON object) falls back to the defaults with a
    warning on stderr instead of raising, so one bad edit cannot stop
    monitoring.

    The returned dict never aliases DEFAULT_CONFIG, so callers may mutate it.
    """
    # JSON round-trip is a deep copy here: the defaults are pure JSON data.
    defaults = json.loads(json.dumps(DEFAULT_CONFIG))

    if not CONFIG_FILE.exists():
        CONFIG_FILE.write_text(json.dumps(DEFAULT_CONFIG, indent=2))
        return defaults

    try:
        loaded = json.loads(CONFIG_FILE.read_text())
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"⚠️ Could not load {CONFIG_FILE} ({exc}); using defaults", file=sys.stderr)
        return defaults

    if not isinstance(loaded, dict):
        print(f"⚠️ {CONFIG_FILE} is not a JSON object; using defaults", file=sys.stderr)
        return defaults

    # Shallow merge: keys present in the file win, missing keys come from defaults.
    return {**defaults, **loaded}
|
||||
|
||||
|
||||
def log_event(event_type, message, action=None, success=None):
    """Append one event to LOG_FILE (JSON Lines) and echo it to stdout.

    Args:
        event_type: Short event tag, stored under "type".
        message: Human-readable description.
        action: Optional remediation name; stored only when truthy.
        success: Optional outcome; stored only when not None. It also
            picks the console icon: truthy -> ✅, None -> ⚠️, otherwise ❌.
    """
    record = {"ts": datetime.now().isoformat(), "type": event_type, "message": message}
    if action:
        record["action"] = action
    if success is not None:
        record["success"] = success

    with open(LOG_FILE, "a") as log:
        log.write(json.dumps(record) + "\n")

    if success is None:
        icon = "⚠️"
    elif success:
        icon = "✅"
    else:
        icon = "❌"
    print(f" {icon} [{event_type}] {message}")
|
||||
|
||||
|
||||
def run_cmd(cmd, timeout=10):
    """Run a command and return ``(returncode, stdout, stderr)``.

    Args:
        cmd: Either a shell command line (``str``), executed through the
            shell so pipes, ``||`` and redirections work, or a sequence of
            arguments, executed directly without a shell.
        timeout: Seconds to wait before giving up.

    Returns:
        A 3-tuple of the exit status and the stripped stdout and stderr
        text. On timeout the result is ``(-1, "", "timeout")``; if the
        process cannot be started at all it is ``(-1, "", <error text>)``.
    """
    try:
        proc = subprocess.run(
            cmd,
            shell=isinstance(cmd, str),  # argv sequences bypass the shell
            capture_output=True,
            text=True,
            timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        return -1, "", "timeout"
    except OSError as exc:
        # e.g. executable not found when cmd is an argv sequence
        return -1, "", str(exc)
    return proc.returncode, proc.stdout.strip(), proc.stderr.strip()
|
||||
|
||||
|
||||
def check_services(config):
    """Check every monitored systemd service and optionally restart dead ones.

    For each name in ``config["monitored_services"]`` (none if the key is
    missing) the user unit is queried first, then the system unit. A
    service counts as healthy only if one of the queries reports exactly
    ``active``. Each result is recorded via log_event; when
    ``config["auto_restart"]`` is truthy a restart is attempted for every
    service found down and its outcome is logged.

    Returns:
        A list of ``("service_down", service_name)`` tuples.
    """
    issues = []
    for svc in config.get("monitored_services", []):
        # Try systemd user units first, then system
        _, out, _ = run_cmd(
            f"systemctl --user is-active {svc} 2>/dev/null || systemctl is-active {svc} 2>/dev/null"
        )
        # Compare whole status words: a substring test would treat
        # "inactive" and "activating" as healthy because both contain "active".
        if "active" in out.split():
            log_event("service_ok", f"Service '{svc}' is active", success=True)
            continue

        issues.append(("service_down", svc))
        log_event("service_down", f"Service '{svc}' is not active (status: {out})")
        if config.get("auto_restart"):
            # Try user unit first, then system
            code, _, _ = run_cmd(
                f"systemctl --user restart {svc} 2>/dev/null || sudo systemctl restart {svc} 2>/dev/null"
            )
            log_event(
                "auto_restart",
                f"Attempted restart of '{svc}'",
                action="restart",
                success=code == 0,
            )
    return issues
|
||||
|
||||
|
||||
def check_ports(config):
    """Report configured ports that have no listening socket.

    Collects the local ports of all inet sockets in LISTEN state, then
    compares them with the keys of ``config["monitored_ports"]`` (port
    number as a string -> label). Every port is logged as ok or down.

    Returns:
        A list of ``("port_down", "<label> (:<port>)")`` tuples.
    """
    listening = set()
    for conn in psutil.net_connections(kind='inet'):
        if conn.status == 'LISTEN':
            listening.add(str(conn.laddr.port))

    problems = []
    for port, name in config.get("monitored_ports", {}).items():
        if port in listening:
            log_event("port_ok", f"Port {port} ({name}) is listening", success=True)
            continue
        problems.append(("port_down", f"{name} (:{port})"))
        log_event("port_down", f"Nothing listening on port {port} ({name})")
    return problems
|
||||
|
||||
|
||||
def check_resources(config):
    """Check disk, memory, CPU load and swap against the configured thresholds.

    Thresholds come from ``config["thresholds"]`` and default to 90
    (disk), 90 (memory), 95 (cpu) and 80 (swap) percent. A value at or
    above its threshold is reported as an issue. High disk usage triggers
    a journal vacuum and ``apt-get autoremove``; high memory usage
    triggers a page-cache drop. The logged ``success`` of each cleanup
    reflects the actual exit status of the commands that were run.

    Returns:
        A list of ``(issue_type, "<value>%")`` tuples, where issue_type is
        one of ``disk_high``, ``memory_high``, ``cpu_high``, ``swap_high``.
    """
    issues = []
    thresholds = config.get("thresholds", {})

    # Disk
    disk_limit = thresholds.get("disk_percent", 90)
    disk = psutil.disk_usage('/')
    if disk.percent >= disk_limit:
        issues.append(("disk_high", f"{disk.percent}%"))
        log_event("disk_high", f"Disk usage at {disk.percent}% (threshold: {disk_limit}%)")
        # Auto-clean: old journal logs, unused apt packages
        vacuum_code, _, _ = run_cmd("sudo journalctl --vacuum-time=3d 2>/dev/null")
        apt_code, _, _ = run_cmd("sudo apt-get autoremove -y 2>/dev/null")
        log_event(
            "auto_clean",
            "Ran journal vacuum and apt autoremove",
            action="clean",
            success=vacuum_code == 0 and apt_code == 0,
        )
    else:
        log_event("disk_ok", f"Disk usage at {disk.percent}%", success=True)

    # Memory
    mem = psutil.virtual_memory()
    if mem.percent >= thresholds.get("memory_percent", 90):
        issues.append(("memory_high", f"{mem.percent}%"))
        log_event("memory_high", f"Memory usage at {mem.percent}%")
        # Clear caches
        drop_code, _, _ = run_cmd("sync && echo 3 | sudo tee /proc/sys/vm/drop_caches 2>/dev/null")
        log_event(
            "auto_clean",
            "Dropped filesystem caches",
            action="drop_caches",
            success=drop_code == 0,
        )
    else:
        log_event("memory_ok", f"Memory usage at {mem.percent}%", success=True)

    # CPU (1-min load avg)
    load_1, _, _ = psutil.getloadavg()
    # cpu_count() may return None when the count cannot be determined.
    cpu_count = psutil.cpu_count() or 1
    cpu_pct = (load_1 / cpu_count) * 100
    if cpu_pct >= thresholds.get("cpu_percent", 95):
        issues.append(("cpu_high", f"{cpu_pct:.0f}%"))
        log_event("cpu_high", f"CPU load at {cpu_pct:.0f}% (load avg: {load_1})")
    else:
        log_event("cpu_ok", f"CPU load at {cpu_pct:.0f}%", success=True)

    # Swap
    swap = psutil.swap_memory()
    if swap.percent >= thresholds.get("swap_percent", 80):
        issues.append(("swap_high", f"{swap.percent}%"))
        log_event("swap_high", f"Swap usage at {swap.percent}%")
    else:
        # Logged for parity with the disk/memory/cpu branches.
        log_event("swap_ok", f"Swap usage at {swap.percent}%", success=True)

    return issues
|
||||
|
||||
|
||||
def check_zombie_processes():
    """Report an issue when more than five zombie processes exist.

    Returns:
        ``[("zombies", "<n> zombie processes")]`` when the count exceeds
        five (the event is also logged), otherwise an empty list.
    """
    zombie_count = sum(
        1
        for proc in psutil.process_iter(['pid', 'name', 'status'])
        if proc.info['status'] == psutil.STATUS_ZOMBIE
    )
    if zombie_count <= 5:
        return []

    found = [("zombies", f"{zombie_count} zombie processes")]
    log_event("zombies", f"Found {zombie_count} zombie processes")
    return found
|
||||
|
||||
|
||||
def run_health_check():
    """Run every check once, print a report and persist a status snapshot.

    Loads the configuration, runs the service, port, resource and zombie
    checks in that order, prints a summary, and writes ``status.json``
    into DATA_DIR.

    Returns:
        The combined list of ``(issue_type, detail)`` tuples from all checks.
    """
    config = load_config()
    stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print(f"\n🏥 Self-Healing Server Check — {stamp}")
    print("=" * 60)

    all_issues = []

    print("\n📡 Services:")
    all_issues += check_services(config)

    print("\n🔌 Ports:")
    all_issues += check_ports(config)

    print("\n💾 Resources:")
    all_issues += check_resources(config)

    print("\n👻 Processes:")
    all_issues += check_zombie_processes()
    # The zombie check logs only on failure, so print the "ok" line here.
    if all(kind != "zombies" for kind, _ in all_issues):
        print(" ✅ No zombie process issues")

    # Summary
    print(f"\n{'=' * 60}")
    if not all_issues:
        print("✅ All systems healthy!")
    else:
        print(f"⚠️ {len(all_issues)} issue(s) found:")
        for itype, detail in all_issues:
            print(f" - {itype}: {detail}")

    # Write status file
    snapshot = {
        "last_check": datetime.now().isoformat(),
        "healthy": not all_issues,
        "issues": [{"type": itype, "detail": detail} for itype, detail in all_issues],
        "disk_pct": psutil.disk_usage('/').percent,
        "mem_pct": psutil.virtual_memory().percent,
        "cpu_count": psutil.cpu_count(),
    }
    status_path = DATA_DIR / "status.json"
    status_path.write_text(json.dumps(snapshot, indent=2))

    return all_issues
|
||||
|
||||
|
||||
def show_status():
    """Print the contents of DATA_DIR/status.json as indented JSON.

    Prints a hint instead when no status file has been written yet.
    """
    status_path = DATA_DIR / "status.json"
    if status_path.exists():
        snapshot = json.loads(status_path.read_text())
        print(json.dumps(snapshot, indent=2))
    else:
        print("No status yet. Run a health check first.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # CLI entry point: --status prints the last snapshot, --daemon loops
    # forever, and no flag runs a single check-and-heal pass.
    if "--status" in sys.argv:
        show_status()
    elif "--daemon" in sys.argv:
        # The interval is read once at start-up; run_health_check() reloads
        # the rest of the configuration on every pass.
        interval = load_config().get("check_interval_seconds", 300)
        print(f"🔄 Running in daemon mode (every {interval}s). Ctrl+C to stop.")
        try:
            while True:
                try:
                    run_health_check()
                except Exception as exc:
                    # One failed pass must not take the monitor down;
                    # report it and retry on the next interval.
                    # KeyboardInterrupt is not an Exception and still stops the loop.
                    print(f"❌ Health check failed: {exc!r}", file=sys.stderr)
                time.sleep(interval)
        except KeyboardInterrupt:
            print("\nStopped.")
    else:
        run_health_check()
|
||||
Reference in New Issue
Block a user