v3.8.0: AI Monitoring — self-healing watchdog with 3-tier response system

- HealthWatcher thread: monitors proxy /health every 5s
- LogAnalyzer thread: tails cc-debug.log for 18 failure signal patterns
- Tier 1 rule engine: 14 rules for instant auto-recovery (< 1s)
- Tier 2 incident store: JSON pattern database with success rates
- Tier 3 AI diagnostic agent: calls configurable provider/model for novel failures
- AIMonitoringWindow GUI: ON/OFF toggle, provider/model/API key selector, incident log
- 30 fault types catalogued across 5 categories (A-E)
- Enhanced /health endpoint with memory_mb, uptime_s, requests_total
- Auto-restart proxy, auto-clear schema cache, kill stale processes
- Safety: rate-limited AI calls, restart caps, cooldowns per pattern
- AI Monitoring design spec (AI-MONITORING-DESIGN.md)
- 54 self-test patterns passing
This commit is contained in:
admin
2026-05-22 22:36:16 +04:00
Unverified
parent 4334540f33
commit 080f2bc56e
4 changed files with 621 additions and 17 deletions

Binary file not shown.

View File

@@ -3,11 +3,11 @@ set -e
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
if [ -f "$SCRIPT_DIR/codex-launcher_3.7.0_all.deb" ]; then if [ -f "$SCRIPT_DIR/codex-launcher_3.8.0_all.deb" ]; then
echo "Installing codex-launcher_3.7.0_all.deb ..." echo "Installing codex-launcher_3.8.0_all.deb ..."
sudo dpkg -i "$SCRIPT_DIR/codex-launcher_3.7.0_all.deb" sudo dpkg -i "$SCRIPT_DIR/codex-launcher_3.8.0_all.deb"
echo "" echo ""
echo "Installed v3.7.0 via .deb package." echo "Installed v3.8.0 via .deb package."
echo " translate-proxy.py -> /usr/bin/translate-proxy.py" echo " translate-proxy.py -> /usr/bin/translate-proxy.py"
echo " codex-launcher-gui -> /usr/bin/codex-launcher-gui" echo " codex-launcher-gui -> /usr/bin/codex-launcher-gui"
echo " cleanup-codex-stale -> /usr/bin/cleanup-codex-stale.sh" echo " cleanup-codex-stale -> /usr/bin/cleanup-codex-stale.sh"

View File

@@ -5,7 +5,7 @@ import gi
gi.require_version("Gtk", "3.0") gi.require_version("Gtk", "3.0")
from gi.repository import Gtk, GLib from gi.repository import Gtk, GLib
import subprocess, os, signal, sys, threading, time, json, urllib.request, urllib.parse, urllib.error, tempfile, shutil import subprocess, os, signal, sys, threading, time, json, urllib.request, urllib.parse, urllib.error, tempfile, shutil
import hashlib, socket, ssl, contextlib, re import hashlib, socket, ssl, contextlib, re, collections
import base64, secrets import base64, secrets
from pathlib import Path from pathlib import Path
@@ -1123,6 +1123,524 @@ def _check_codex_auth():
except Exception as e: except Exception as e:
return ("error", str(e)) return ("error", str(e))
# ═══════════════════════════════════════════════════════════════════
# AI Monitoring — Self-Healing Watchdog
# ═══════════════════════════════════════════════════════════════════
# On-disk state of the AI Monitoring feature. All three files live under the
# per-user ~/.cache/codex-proxy directory.
# Settings read and written by _load_monitoring_config/_save_monitoring_config.
MONITORING_FILE = Path.home() / ".cache/codex-proxy/monitoring-config.json"
# Incident database read and written by _load_incident_store/_save_incident_store.
INCIDENT_STORE_FILE = Path.home() / ".cache/codex-proxy/incident-store.json"
# Append-only text log written by _monitoring_log.
MONITORING_LOG = Path.home() / ".cache/codex-proxy/monitoring.log"
# Tier 1 rule table. Each entry is (trigger, action, cooldown_seconds).
# HealthWatcher._handle_failure scans the list for the first entry whose
# trigger equals the reported trigger, then hands the action name to the
# on_action callback. The cooldown is tracked per *action* (not per trigger):
# if the same action ran less than cooldown_seconds ago the failure is dropped.
# A cooldown of 0 means the action is never throttled.
_TIER1_RULES = [
("proxy_health_fail", "restart_proxy", 30),
("proxy_port_conflict", "kill_stale_restart", 60),
("upstream_429", "wait_retry", 0),
("upstream_502_503", "retry_backoff", 30),
("upstream_500_repeat", "switch_provider", 60),
("upstream_timeout", "retry_increase_timeout",30),
("upstream_401_403", "alert_bad_key", 0),
("stream_broken_pipe", "restart_proxy", 30),
("stream_reset", "restart_proxy", 30),
("parsed_tool_calls_0_x3", "clear_schema_cache", 300),
("sanitizer_suspicious_5x","alert_model_issue", 0),
("stuck_recovery_x5", "suggest_switch_model", 0),
("codex_process_dead", "alert_restart", 0),
("schema_corrupt", "delete_provider_caps", 0),
]
# Log-line signatures recognised by the log analyzer.
# Maps a *regular expression* to (fault_id, category). The analyzer applies
# the keys with re.search() in insertion order and stops at the first hit, so
# every key must be a valid regex that matches only the intended text.
# Regex metacharacters that are meant literally have to be escaped: an
# unescaped "[STUCK-RECOVERY]" is a character class that matches nearly any
# line containing an uppercase letter and would shadow every later entry.
_FAILURE_SIGNALS = {
    "parsed_tool_calls=0": ("C1", "parser_empty"),
    r"\[STUCK-RECOVERY\]": ("C3", "stuck_recovery"),
    "suspicious cmd": ("C4", "sanitizer_flag"),
    "empty cmd recovered": ("C6", "empty_cmd"),
    "HTTP 429": ("B1", "rate_limited"),
    "HTTP 500": ("B2", "server_error"),
    "HTTP 502": ("B2", "server_error"),
    "HTTP 503": ("B2", "server_error"),
    "HTTP 401": ("B3", "auth_failure"),
    "HTTP 403": ("B4", "forbidden"),
    "Connection refused": ("A1", "proxy_dead"),
    "Address already in use": ("A2", "port_conflict"),
    "Broken pipe": ("B7", "broken_pipe"),
    "Connection reset": ("B6", "connection_reset"),
    "timed out": ("B5", "timeout"),
    "SELF-REVIVE CRASH": ("A5", "proxy_crash"),
    "stream error": ("B6", "stream_error"),
    # Intentional regex: "content_type" followed later on the line by "array".
    "content_type.*array": ("E1", "schema_corrupt"),
}
# System prompt sent by AIDiagnosticAgent._call_model as the "system" message
# of the Tier 3 diagnostic request. It instructs the model to reply with a
# single JSON object ({"action", "reason", "confidence"}); _call_model parses
# the reply with json.loads, so the "no extra text" rule matters.
_DIAGNOSTIC_SYSTEM_PROMPT = (
'You are a diagnostic agent for "Codex Launcher" — a desktop app that runs a local '
'translation proxy between OpenAI Codex CLI/Desktop and AI providers.\n\n'
'Analyze the incident and respond with ONLY a JSON object:\n'
'{"action": "...", "reason": "...", "confidence": 0.0-1.0}\n\n'
'Available actions: restart_proxy, kill_stale_processes, clear_schema_cache, '
'switch_provider, increase_timeout, regenerate_config, cleanup_stale, '
'alert_user, ignore, retry_now\n\n'
'Rules:\n'
'- upstream 401/403 with auth error -> alert_user\n'
'- proxy dead -> restart_proxy\n'
'- same error 5+ times -> switch_provider or alert_user\n'
'- schema/content_type error -> clear_schema_cache\n'
'- "Address already in use" -> kill_stale_processes then restart_proxy\n'
'- timeout on slow upstream -> increase_timeout\n'
'- single transient 429/502/503 -> ignore\n'
'- "stream disconnected" + proxy healthy -> ignore\n'
'- no extra text, no markdown, just the JSON object'
)
def _load_monitoring_config():
    """Return the AI Monitoring settings as a dict with every key present.

    The stored JSON object (if any) is layered on top of the built-in
    defaults, so a file written by an older version or edited by hand still
    yields all expected keys. A missing, unreadable, malformed, or
    non-object file results in the plain defaults.
    """
    cfg = {
        "enabled": False,
        "provider_url": "",
        "model": "",
        "api_key": "",
        "health_check_interval_s": 5,
        "auto_restart_proxy": True,
        "auto_switch_provider": False,
    }
    try:
        stored = json.loads(MONITORING_FILE.read_text())
    except (OSError, ValueError):
        # OSError: file missing/unreadable. ValueError: bad JSON or encoding.
        return cfg
    # Valid JSON that is not an object (e.g. [] or null) would break every
    # caller that uses .get(); treat it like a missing file.
    if isinstance(stored, dict):
        cfg.update(stored)
    return cfg
def _save_monitoring_config(cfg):
    """Persist the AI Monitoring settings to MONITORING_FILE.

    The settings include the provider API key, so the data is written to a
    temporary file that only the owner can read (tempfile.mkstemp creates it
    with mode 0600) and then moved over the real file with os.replace. The
    replace is atomic, which also keeps a concurrent reader from ever seeing
    a half-written file.

    Raises:
        OSError: if the directory or file cannot be written.
        TypeError/ValueError: if ``cfg`` is not JSON-serialisable.
    """
    directory = MONITORING_FILE.parent
    directory.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(cfg, indent=2)
    fd, tmp_name = tempfile.mkstemp(
        dir=str(directory), prefix=".monitoring-config.", suffix=".tmp"
    )
    try:
        with os.fdopen(fd, "w") as fh:
            fh.write(payload)
        os.replace(tmp_name, str(MONITORING_FILE))
    except BaseException:
        # Do not leave the temporary file (with the API key) behind.
        with contextlib.suppress(OSError):
            os.unlink(tmp_name)
        raise
def _load_incident_store():
    """Return the incident store dict, falling back to an empty store.

    The result always is a dict whose "incidents" and "stats" values are
    dicts, because every consumer indexes them with .get()/.setdefault().
    A missing, unreadable, malformed, or non-object file yields a fresh
    empty store; a wrongly typed "incidents"/"stats" entry is reset.
    """
    try:
        store = json.loads(INCIDENT_STORE_FILE.read_text())
    except (OSError, ValueError):
        # OSError: file missing/unreadable. ValueError: bad JSON or encoding.
        store = None
    if not isinstance(store, dict):
        return {"version": 1, "incidents": {}, "stats": {"ai_calls": 0, "tokens_used": 0}}
    if not isinstance(store.get("incidents"), dict):
        store["incidents"] = {}
    if not isinstance(store.get("stats"), dict):
        store["stats"] = {"ai_calls": 0, "tokens_used": 0}
    return store
def _save_incident_store(store):
    """Write ``store`` to INCIDENT_STORE_FILE as indented JSON.

    The parent directory is created first when it does not exist.
    """
    target = INCIDENT_STORE_FILE
    target.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(store, indent=2)
    target.write_text(serialized)
def _monitoring_log(msg):
    """Append ``msg`` to MONITORING_LOG, prefixed with the local HH:MM:SS time.

    Logging is best effort: I/O and encoding problems are swallowed so a
    logging failure can never take down the watchdog. The log directory is
    created on demand; without that, every entry would be silently dropped
    until some other component happened to create the directory.
    """
    try:
        MONITORING_LOG.parent.mkdir(parents=True, exist_ok=True)
        # errors="replace" keeps odd bytes copied from log lines from raising.
        with open(str(MONITORING_LOG), "a", errors="replace") as f:
            f.write(f"[{time.strftime('%H:%M:%S')}] {msg}\n")
    except (OSError, ValueError):
        pass
class IncidentStore:
    """In-memory incident database with deferred persistence.

    The store is loaded once on construction. Mutations only mark the
    instance dirty; nothing is written until flush() is called.
    """

    def __init__(self):
        self._store = _load_incident_store()
        self._dirty = False

    def lookup(self, pattern):
        """Return the incident entry for ``pattern`` if its fix is trusted.

        An entry is trusted when it has at least one recorded success and
        its success rate is strictly above 50 %. Otherwise returns None.
        """
        entry = self._store.get("incidents", {}).get(pattern)
        if not entry:
            return None
        if entry.get("success_count", 0) <= 0:
            return None
        successes = entry["success_count"]
        attempts = max(successes + entry.get("fail_count", 0), 1)
        return entry if successes / attempts > 0.5 else None

    def record(self, pattern, fix, success=True):
        """Count one occurrence of ``pattern`` and its outcome.

        ``fix`` is stored only when the pattern is seen for the first time;
        later calls leave the stored fix untouched.
        """
        stamp = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
        incidents = self._store.setdefault("incidents", {})
        if pattern not in incidents:
            incidents[pattern] = {
                "fix": fix, "success_count": 0, "fail_count": 0,
                "last_seen": stamp,
                "occurrences": 0,
            }
        entry = incidents[pattern]
        entry["last_seen"] = stamp
        entry["occurrences"] = entry.get("occurrences", 0) + 1
        counter = "success_count" if success else "fail_count"
        entry[counter] = entry.get(counter, 0) + 1
        self._dirty = True

    def record_ai_call(self, tokens=0):
        """Add one AI call and ``tokens`` tokens to the usage statistics."""
        usage = self._store.setdefault("stats", {"ai_calls": 0, "tokens_used": 0})
        usage["ai_calls"] = usage.get("ai_calls", 0) + 1
        usage["tokens_used"] = usage.get("tokens_used", 0) + tokens
        self._dirty = True

    def flush(self):
        """Write the store to disk if anything changed since the last flush."""
        if not self._dirty:
            return
        _save_incident_store(self._store)
        self._dirty = False

    @property
    def stats(self):
        """Usage statistics dict (zeroed default when none is stored)."""
        return self._store.get("stats", {"ai_calls": 0, "tokens_used": 0})
class AIDiagnosticAgent:
    """Tier 2/3 diagnosis: known-pattern lookup first, AI model second.

    Args:
        provider_url: Full URL of a chat-completions style endpoint.
        model: Model name sent in the request body.
        api_key: Bearer token sent in the Authorization header.
    """

    def __init__(self, provider_url, model, api_key):
        self.provider_url = provider_url
        self.model = model
        self.api_key = api_key
        self.incident_store = IncidentStore()

    def diagnose(self, context):
        """Return an action dict for the incident described by ``context``.

        A trusted entry in the incident store answers immediately (tier 2).
        Otherwise the model is asked (tier 3). Only a genuine model answer
        is recorded as a known pattern; the fallback produced when the AI
        call fails is NOT recorded, otherwise a single network error would
        be replayed later as a "known good" fix with 0.9 confidence.
        """
        pattern = self._extract_pattern(context)
        known = self.incident_store.lookup(pattern)
        if known:
            _monitoring_log(f"Tier 2 HIT: pattern={pattern} fix={known['fix']}")
            return {"action": known["fix"], "reason": "known_pattern", "confidence": 0.9, "tier": 2}
        action = self._call_model(context)
        if action and not action.get("failed") and action.get("action"):
            self.incident_store.record(pattern, action["action"])
        # Flush unconditionally so the AI call statistics are persisted too.
        self.incident_store.flush()
        return action

    def _extract_pattern(self, context):
        """Build the store key: sorted signals plus HTTP code, max 3 parts."""
        parts = sorted(context.get("signals", []))
        if context.get("http_code"):
            parts.append(f"http_{context['http_code']}")
        return "+".join(parts[:3]) or "unknown"

    @staticmethod
    def _parse_action(text):
        """Extract the JSON object from a model reply.

        Tolerates surrounding text or markdown code fences by parsing only
        the span from the first "{" to the last "}".

        Raises:
            ValueError: if no JSON object can be extracted.
        """
        start, end = text.find("{"), text.rfind("}")
        if start == -1 or end <= start:
            raise ValueError("no JSON object in model reply")
        action = json.loads(text[start:end + 1])
        if not isinstance(action, dict):
            raise ValueError("model reply is not a JSON object")
        return action

    def _call_model(self, context):
        """Ask the configured model for a diagnosis.

        Returns the parsed action dict with "tier": 3 added. Any failure
        (network, HTTP, malformed reply) is logged and turned into an
        ``alert_user`` action carrying "failed": True.
        """
        # Keep the END of the log: the newest lines are the relevant ones.
        log_tail = (context.get("log_tail") or "")[-1500:]
        prompt = (
            f"INCIDENT REPORT:\n"
            f"Time: {time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())}\n"
            f"Proxy health: {context.get('proxy_alive', 'unknown')}\n"
            f"Upstream: {context.get('upstream_url', 'unknown')}\n"
            f"Model: {context.get('model', 'unknown')}\n"
            f"Last HTTP code: {context.get('http_code', 'n/a')}\n"
            f"Recent signals: {context.get('signals', [])}\n"
            f"Recent log tail:\n{log_tail}\n"
        )
        body = {
            "model": self.model,
            "messages": [
                {"role": "system", "content": _DIAGNOSTIC_SYSTEM_PROMPT},
                {"role": "user", "content": prompt},
            ],
            "max_tokens": 200,
            "temperature": 0.1,
        }
        try:
            req = urllib.request.Request(
                self.provider_url,
                data=json.dumps(body).encode(),
                headers={
                    "Content-Type": "application/json",
                    "Authorization": f"Bearer {self.api_key}",
                },
            )
            # Context manager closes the HTTP connection deterministically.
            with urllib.request.urlopen(req, timeout=15) as resp:
                result = json.loads(resp.read())
            text = result["choices"][0]["message"]["content"].strip()
            # Prefer the provider-reported token count; 800 is the previous
            # fixed estimate, kept as fallback when no usage is reported.
            usage = result.get("usage") or {}
            tokens = usage.get("total_tokens")
            if not isinstance(tokens, int):
                tokens = 800
            self.incident_store.record_ai_call(tokens=tokens)
            action = self._parse_action(text)
            action["tier"] = 3
            _monitoring_log(f"Tier 3 AI: action={action.get('action')} reason={action.get('reason')}")
            return action
        except Exception as e:
            # Boundary handler: the watchdog must survive any provider error.
            _monitoring_log(f"Tier 3 AI FAILED: {e}")
            return {"action": "alert_user", "reason": f"ai_diag_failed: {e}",
                    "confidence": 0.0, "tier": 3, "failed": True}
class HealthWatcher(threading.Thread):
    """Background watchdog that polls the proxy and reacts to log signals.

    The thread polls ``/health`` on the configured proxy port and owns a
    _LogAnalyzerThread whose matches arrive through _on_log_signal (called on
    the analyzer's thread). Failures are resolved via the Tier 1 rule table
    first, then via AIDiagnosticAgent (Tier 2/3).

    Args:
        on_failure: Called with the consecutive failure count when the
            health check has failed 3 times in a row.
        on_recovery: Called with no arguments when a health check succeeds
            after at least one failure.
        on_signal: Called as (fault_id, category, line) per matched log line.
        on_action: Called as (action, trigger) when an action is decided.

    All callbacks run on a worker thread, never on the GUI thread.
    """

    # category -> (occurrences needed, trigger passed to _handle_failure).
    # NOTE(review): several triggers here ("proxy_dead", "port_conflict",
    # "server_error_repeat", "timeout_repeat") have no entry in _TIER1_RULES
    # (which uses e.g. "proxy_port_conflict", "upstream_500_repeat"), so they
    # always fall through to Tier 2/3. Names are kept as found; confirm
    # whether the rule names or these triggers are the intended ones.
    _ESCALATION = {
        "proxy_dead": (2, "proxy_dead"),
        "port_conflict": (2, "port_conflict"),
        "server_error": (3, "server_error_repeat"),
        "timeout": (3, "timeout_repeat"),
        "sanitizer_flag": (5, "sanitizer_suspicious_5x"),
        "stuck_recovery": (5, "stuck_recovery_x5"),
        "parser_empty": (3, "parsed_tool_calls_0_x3"),
        "schema_corrupt": (1, "schema_corrupt"),
    }

    def __init__(self, on_failure, on_recovery, on_signal, on_action):
        super().__init__(daemon=True)
        self.cfg = _load_monitoring_config()
        self.on_failure = on_failure
        self.on_recovery = on_recovery
        self.on_signal = on_signal
        self.on_action = on_action
        self.failures = 0
        self.running = False
        self._signal_counts = collections.defaultdict(int)
        self._last_actions = {}
        self._restart_count = 0
        self._last_restart_time = 0

    def run(self):
        """Poll the proxy until stop() is called."""
        self.running = True
        self.incident_store = IncidentStore()
        self._log_analyzer = _LogAnalyzerThread(self._on_log_signal)
        self._log_analyzer.start()
        while self.running:
            # Re-read every cycle so GUI changes take effect without restart.
            self.cfg = _load_monitoring_config()
            if not self.cfg.get("enabled"):
                time.sleep(5)
                continue
            port = self._get_proxy_port()
            if port:
                if self._check_health(port):
                    if self.failures > 0:
                        self.failures = 0
                        self.on_recovery()
                else:
                    self.failures += 1
                    if self.failures == 3:
                        # Notify once per outage, when the threshold is hit.
                        self.on_failure(self.failures)
                    if self.failures >= 3:
                        self._handle_failure("proxy_health_fail")
            self.incident_store.flush()
            time.sleep(self._sleep_interval())

    def stop(self):
        """Ask the watcher and its log analyzer to exit their loops."""
        self.running = False
        if hasattr(self, '_log_analyzer'):
            self._log_analyzer.running = False

    def _sleep_interval(self):
        """Return the poll interval in seconds (>= 1, default 5 if invalid)."""
        try:
            interval = float(self.cfg.get("health_check_interval_s", 5))
        except (TypeError, ValueError):
            interval = 5.0
        # A zero/negative value from a hand-edited file would spin the loop.
        return max(interval, 1.0)

    def _get_proxy_port(self):
        """Return the "port" value from proxy-config.json, or None."""
        try:
            cfg_path = Path.home() / ".cache/codex-proxy/proxy-config.json"
            if cfg_path.exists():
                d = json.loads(cfg_path.read_text())
                return d.get("port")
        except Exception:
            pass
        return None

    def _check_health(self, port):
        """Return True when GET /health on localhost:port answers 200."""
        try:
            req = urllib.request.Request(f"http://localhost:{port}/health")
            with urllib.request.urlopen(req, timeout=5) as resp:
                return resp.status == 200
        except Exception:
            return False

    def _get_recent_log(self, path=None, max_bytes=4096):
        """Return up to the last ``max_bytes`` bytes of the proxy debug log.

        Args:
            path: Log file to read; defaults to
                ~/.cache/codex-proxy/cc-debug.log.
            max_bytes: Upper bound on how much of the file end is read.

        Returns:
            The decoded text, or "" when the file cannot be read.
        """
        if path is None:
            path = Path.home() / ".cache/codex-proxy/cc-debug.log"
        try:
            with open(str(path), "rb") as fh:
                fh.seek(0, os.SEEK_END)
                fh.seek(max(fh.tell() - max_bytes, 0))
                return fh.read().decode("utf-8", errors="replace")
        except OSError:
            return ""

    def _on_log_signal(self, fault_id, category, line):
        """Count a matched log line and escalate once its threshold is met."""
        if not self.cfg.get("enabled"):
            # The analyzer keeps tailing while monitoring is switched off;
            # a disabled watchdog must not act on what it sees.
            return
        self._signal_counts[category] += 1
        self.on_signal(fault_id, category, line[:200])
        rule = self._ESCALATION.get(category)
        if rule is None:
            return
        threshold, trigger = rule
        if self._signal_counts[category] >= threshold:
            self._handle_failure(trigger)

    def _handle_failure(self, trigger):
        """Run the Tier 1 rule for ``trigger`` or fall back to Tier 2/3."""
        now = time.time()
        for rule_trigger, action, cooldown in _TIER1_RULES:
            if rule_trigger == trigger:
                last_t = self._last_actions.get(action, 0)
                if now - last_t < cooldown:
                    return
                self._last_actions[action] = now
                _monitoring_log(f"Tier 1: trigger={trigger} action={action}")
                self.on_action(action, trigger)
                self.incident_store.record(trigger, action, success=True)
                return
        self._try_tier2_3(trigger)

    def _try_tier2_3(self, trigger):
        """Resolve ``trigger`` through the diagnostic agent, if configured."""
        cfg = self.cfg
        if not cfg.get("provider_url") or not cfg.get("model") or not cfg.get("api_key"):
            _monitoring_log(f"No AI configured for Tier 2/3 — alerting user for trigger={trigger}")
            self.on_action("alert_user", trigger)
            return
        agent = AIDiagnosticAgent(cfg["provider_url"], cfg["model"], cfg["api_key"])
        context = {
            "signals": [trigger],
            "proxy_alive": self.failures == 0,
            "log_tail": self._get_recent_log(),
        }
        result = agent.diagnose(context)
        if result:
            action = result.get("action", "alert_user")
            _monitoring_log(f"Tier {result.get('tier', '?')}: action={action}")
            self.on_action(action, trigger)
class _LogAnalyzerThread(threading.Thread):
    """Tail the proxy log files and report lines matching _FAILURE_SIGNALS.

    Args:
        on_signal: Called as (fault_id, category, stripped_line) for the
            first matching pattern of each new log line.
    """

    def __init__(self, on_signal):
        super().__init__(daemon=True)
        self.on_signal = on_signal
        self.running = False

    @staticmethod
    def _match_signal(line, patterns):
        """Return (fault_id, category) of the first pattern found in ``line``.

        ``patterns`` is an iterable of (compiled_regex, fault_id, category).
        Returns None when nothing matches.
        """
        for regex, fault_id, category in patterns:
            if regex.search(line):
                return fault_id, category
        return None

    @staticmethod
    def _open_log(path, from_start):
        """Open ``path`` for tailing; return None when it cannot be opened."""
        try:
            fh = open(path, "r", errors="replace")
        except OSError:
            return None
        if not from_start:
            fh.seek(0, 2)  # skip history, only report new lines
        return fh

    def run(self):
        """Poll the log files until ``running`` is set to False."""
        self.running = True
        log_paths = [
            str(Path.home() / ".cache/codex-proxy/cc-debug.log"),
            str(Path.home() / ".cache/codex-proxy/proxy.log"),
        ]
        # Compile once instead of re-scanning pattern strings for every line.
        patterns = [
            (re.compile(pattern), fault_id, category)
            for pattern, (fault_id, category) in _FAILURE_SIGNALS.items()
        ]
        handles = {}
        # Paths that did not exist when we tried to open them. When such a
        # file shows up later it is brand new, so it is read from the start.
        missing = set()
        try:
            while self.running:
                activity = False
                for path in log_paths:
                    fh = handles.get(path)
                    if fh is None:
                        fh = self._open_log(path, from_start=path in missing)
                        if fh is None:
                            missing.add(path)
                            continue
                        missing.discard(path)
                        handles[path] = fh
                    try:
                        line = fh.readline()
                    except OSError as exc:
                        _monitoring_log(f"LogAnalyzer: read failed for {path}: {exc}")
                        fh.close()
                        del handles[path]
                        continue
                    if not line:
                        continue
                    activity = True
                    hit = self._match_signal(line, patterns)
                    if hit is None:
                        continue
                    try:
                        self.on_signal(hit[0], hit[1], line.strip())
                    except Exception as exc:
                        # Keep tailing, but leave a trace: silently dropping
                        # this hides real bugs in the signal handlers.
                        _monitoring_log(f"LogAnalyzer: signal handler failed: {exc!r}")
                if not activity:
                    time.sleep(0.5)
        finally:
            for fh in handles.values():
                fh.close()
class AIMonitoringWindow(Gtk.Window):
    """Settings and status window for the AI Monitoring watchdog.

    Shows the ON/OFF switch, the diagnostic agent settings (provider URL,
    model, API key, health check interval, automation options), usage
    statistics and the recorded incidents.

    Args:
        parent: Window this one is transient for. If it offers a
            ``_start_watcher`` method, switching monitoring ON starts the
            watchdog right away instead of only after the next app start.
    """

    def __init__(self, parent=None):
        super().__init__(title="AI Monitoring")
        self.set_transient_for(parent)
        self.set_default_size(580, 520)
        self.set_border_width(12)
        self._cfg = _load_monitoring_config()
        self._store = _load_incident_store()

        vbox = Gtk.Box(orientation=Gtk.Orientation.VERTICAL, spacing=8)
        self.add(vbox)
        self._build_header(vbox)
        self._build_agent_frame(vbox)
        self._build_stats_row(vbox)
        self._build_incident_view(vbox)
        self._build_button_row(vbox)
        self.show_all()

    # ── construction helpers (called once, in visual top-to-bottom order) ──

    def _build_header(self, vbox):
        """Title on the left, "Enabled" label and switch on the right."""
        hdr = Gtk.Box(spacing=8)
        vbox.pack_start(hdr, False, False, 0)
        lbl = Gtk.Label()
        lbl.set_markup("<b>AI Monitoring</b>")
        lbl.set_use_markup(True)
        hdr.pack_start(lbl, False, False, 0)
        self._toggle = Gtk.Switch()
        self._toggle.set_active(self._cfg.get("enabled", False))
        # Connected after set_active so the initial state does not fire it.
        self._toggle.connect("state-set", self._on_toggle)
        hdr.pack_end(self._toggle, False, False, 0)
        lbl2 = Gtk.Label(label="Enabled")
        hdr.pack_end(lbl2, False, False, 0)

    def _build_agent_frame(self, vbox):
        """Form with the diagnostic agent settings and the save button."""
        frame = Gtk.Frame(label="Diagnostic Agent")
        vbox.pack_start(frame, False, False, 0)
        grid = Gtk.Grid(column_spacing=8, row_spacing=6, margin=8)
        frame.add(grid)

        grid.attach(Gtk.Label(label="Provider URL:", halign=Gtk.Align.END), 0, 0, 1, 1)
        self._url_entry = Gtk.Entry(hexpand=True)
        self._url_entry.set_text(self._cfg.get("provider_url", ""))
        self._url_entry.set_placeholder_text("https://api.openai.com/v1/chat/completions")
        grid.attach(self._url_entry, 1, 0, 2, 1)

        grid.attach(Gtk.Label(label="Model:", halign=Gtk.Align.END), 0, 1, 1, 1)
        self._model_entry = Gtk.Entry(hexpand=True)
        self._model_entry.set_text(self._cfg.get("model", ""))
        self._model_entry.set_placeholder_text("gpt-4o-mini or Qwen/Qwen3-32B")
        grid.attach(self._model_entry, 1, 1, 2, 1)

        grid.attach(Gtk.Label(label="API Key:", halign=Gtk.Align.END), 0, 2, 1, 1)
        self._key_entry = Gtk.Entry(hexpand=True, visibility=False)
        self._key_entry.set_text(self._cfg.get("api_key", ""))
        self._key_entry.set_placeholder_text("sk-...")
        grid.attach(self._key_entry, 1, 2, 1, 1)
        self._reveal_btn = Gtk.ToggleButton(label="Show")
        self._reveal_btn.connect("toggled", lambda b: self._key_entry.set_visibility(b.get_active()))
        grid.attach(self._reveal_btn, 2, 2, 1, 1)

        grid.attach(Gtk.Label(label="Health Check:", halign=Gtk.Align.END), 0, 3, 1, 1)
        adj = Gtk.Adjustment(value=self._cfg.get("health_check_interval_s", 5), lower=2, upper=30, step_increment=1)
        self._interval_spin = Gtk.SpinButton(adjustment=adj)
        self._interval_spin.set_numeric(True)
        grid.attach(self._interval_spin, 1, 3, 1, 1)
        grid.attach(Gtk.Label(label="seconds"), 2, 3, 1, 1)

        opts_box = Gtk.Box(spacing=12, margin_top=4)
        grid.attach(opts_box, 0, 4, 3, 1)
        self._auto_restart_cb = Gtk.CheckButton(label="Auto-restart proxy on crash")
        self._auto_restart_cb.set_active(self._cfg.get("auto_restart_proxy", True))
        opts_box.pack_start(self._auto_restart_cb, False, False, 0)
        self._auto_switch_cb = Gtk.CheckButton(label="Auto-switch provider on repeated failure")
        self._auto_switch_cb.set_active(self._cfg.get("auto_switch_provider", False))
        opts_box.pack_start(self._auto_switch_cb, False, False, 0)

        save_btn = Gtk.Button(label="Save Configuration")
        save_btn.get_style_context().add_class("suggested-action")
        save_btn.connect("clicked", self._on_save)
        grid.attach(save_btn, 0, 5, 3, 1)

    def _build_stats_row(self, vbox):
        """One-line summary of AI usage and known patterns."""
        stats_box = Gtk.Box(spacing=16)
        vbox.pack_start(stats_box, False, False, 0)
        self._stats_lbl = Gtk.Label()
        self._stats_lbl.set_use_markup(True)
        self._render_stats()
        stats_box.pack_start(self._stats_lbl, False, False, 0)

    def _build_incident_view(self, vbox):
        """Read-only, scrollable text view listing the recorded incidents."""
        frame2 = Gtk.Frame(label="Recent Incidents")
        vbox.pack_start(frame2, True, True, 0)
        sw = Gtk.ScrolledWindow()
        sw.set_policy(Gtk.PolicyType.AUTOMATIC, Gtk.PolicyType.AUTOMATIC)
        frame2.add(sw)
        self._inc_buf = Gtk.TextBuffer()
        tv = Gtk.TextView(buffer=self._inc_buf)
        tv.set_editable(False)
        tv.set_cursor_visible(False)
        tv.set_wrap_mode(Gtk.WrapMode.WORD_CHAR)
        sw.add(tv)
        self._refresh_incidents()

    def _build_button_row(self, vbox):
        """Bottom row: view log, clear store, close."""
        bb = Gtk.Box(spacing=8)
        vbox.pack_start(bb, False, False, 0)
        view_btn = Gtk.Button(label="View Monitoring Log")
        view_btn.connect("clicked", lambda b: subprocess.Popen(["xdg-open", str(MONITORING_LOG)]))
        bb.pack_start(view_btn, False, False, 0)
        clear_btn = Gtk.Button(label="Clear Incident Store")
        clear_btn.connect("clicked", self._on_clear_store)
        bb.pack_start(clear_btn, False, False, 0)
        close_btn = Gtk.Button(label="Close")
        close_btn.connect("clicked", lambda b: self.destroy())
        bb.pack_end(close_btn, False, False, 0)

    # ── event handlers ──

    def _on_toggle(self, switch, state):
        """Persist the ON/OFF state and start the watchdog when enabling."""
        self._cfg["enabled"] = state
        _save_monitoring_config(self._cfg)
        if state:
            self._ensure_watchdog_running()
        # False lets the switch's default handler update its visual state.
        return False

    def _ensure_watchdog_running(self):
        """Start the parent's watchdog unless one is already alive.

        Without this, enabling monitoring while the app is running only
        wrote the config file; the watchdog thread is otherwise created
        solely at application start-up.
        """
        launcher = self.get_transient_for()
        starter = getattr(launcher, "_start_watcher", None)
        if not callable(starter):
            return
        watcher = getattr(launcher, "_watcher", None)
        if watcher is not None and watcher.is_alive():
            return
        starter()

    def _on_save(self, btn):
        """Copy the form values into the config and write it to disk."""
        self._cfg["provider_url"] = self._url_entry.get_text().strip()
        self._cfg["model"] = self._model_entry.get_text().strip()
        self._cfg["api_key"] = self._key_entry.get_text().strip()
        self._cfg["health_check_interval_s"] = int(self._interval_spin.get_value())
        self._cfg["auto_restart_proxy"] = self._auto_restart_cb.get_active()
        self._cfg["auto_switch_provider"] = self._auto_switch_cb.get_active()
        _save_monitoring_config(self._cfg)
        self._inc_buf.set_text("Configuration saved.\n")

    def _on_clear_store(self, btn):
        """Reset the incident store on disk and in this window."""
        empty = {"version": 1, "incidents": {}, "stats": {"ai_calls": 0, "tokens_used": 0}}
        _save_incident_store(empty)
        self._store = empty
        self._render_stats()
        self._refresh_incidents()

    # ── rendering ──

    def _render_stats(self):
        """Update the statistics label from the currently loaded store."""
        stats = self._store.get("stats", {"ai_calls": 0, "tokens_used": 0})
        self._stats_lbl.set_markup(
            f"<small>AI diagnostic calls: <b>{stats.get('ai_calls', 0)}</b> | "
            f"Tokens used: <b>{stats.get('tokens_used', 0):,}</b> | "
            f"Known patterns: <b>{len(self._store.get('incidents', {}))}</b></small>"
        )

    def _refresh_incidents(self):
        """Fill the incident view, newest "last_seen" first."""
        lines = []
        for pattern, inc in sorted(self._store.get("incidents", {}).items(),
                                   key=lambda x: x[1].get("last_seen", ""), reverse=True):
            sc = inc.get("success_count", 0)
            fc = inc.get("fail_count", 0)
            rate = sc / max(sc + fc, 1)
            filled = min(int(rate * 10), 10)
            bar = "+" * filled + "-" * (10 - filled)
            lines.append(
                f"[{inc.get('last_seen', '?')[:16]}] {pattern}\n"
                f" fix={inc.get('fix', '?')} success_rate={rate:.0%} [{bar}] "
                f"seen={inc.get('occurrences', 0)}x\n"
            )
        if not lines:
            lines.append("No incidents recorded yet.\n")
            lines.append("\nEnable AI Monitoring and use Codex to populate the store.\n")
        self._inc_buf.set_text("\n".join(lines))
# ═══════════════════════════════════════════════════════════════════ # ═══════════════════════════════════════════════════════════════════
# Main window # Main window
# ═══════════════════════════════════════════════════════════════════ # ═══════════════════════════════════════════════════════════════════
@@ -1143,7 +1661,7 @@ class LauncherWin(Gtk.Window):
# header row # header row
hdr = Gtk.Box(spacing=8) hdr = Gtk.Box(spacing=8)
vbox.pack_start(hdr, False, False, 0) vbox.pack_start(hdr, False, False, 0)
lbl = Gtk.Label(label="<b>Codex Launcher v3.7.0</b>") lbl = Gtk.Label(label="<b>Codex Launcher v3.8.0</b>")
lbl.set_use_markup(True) lbl.set_use_markup(True)
hdr.pack_start(lbl, False, False, 0) hdr.pack_start(lbl, False, False, 0)
changelog_btn = Gtk.Button(label="Changelog") changelog_btn = Gtk.Button(label="Changelog")
@@ -1161,6 +1679,9 @@ class LauncherWin(Gtk.Window):
bgp_btn = Gtk.Button(label="AI BGP") bgp_btn = Gtk.Button(label="AI BGP")
bgp_btn.connect("clicked", lambda b: self._open_bgp()) bgp_btn.connect("clicked", lambda b: self._open_bgp())
hdr.pack_end(bgp_btn, False, False, 0) hdr.pack_end(bgp_btn, False, False, 0)
mon_btn = Gtk.Button(label="AI Monitor")
mon_btn.connect("clicked", lambda b: self._open_monitoring())
hdr.pack_end(mon_btn, False, False, 0)
mgr_btn = Gtk.Button(label="Manage Endpoints") mgr_btn = Gtk.Button(label="Manage Endpoints")
mgr_btn.connect("clicked", lambda b: self._open_mgr()) mgr_btn.connect("clicked", lambda b: self._open_mgr())
hdr.pack_end(mgr_btn, False, False, 0) hdr.pack_end(mgr_btn, False, False, 0)
@@ -1310,6 +1831,7 @@ class LauncherWin(Gtk.Window):
self.show_all() self.show_all()
self._rebuild_combo() self._rebuild_combo()
self._log_dependency_status() self._log_dependency_status()
self._start_watcher()
# ── helpers ────────────────────────────────────────────────── # ── helpers ──────────────────────────────────────────────────
@@ -1456,13 +1978,84 @@ class LauncherWin(Gtk.Window):
d.run(); d.destroy() d.run(); d.destroy()
def _open_bgp(self): def _open_bgp(self):
try: try:
self._bgp_window = BGPPoolMgr(self) self._bgp_window = BGPPoolMgr(self)
self._bgp_window.connect("destroy", lambda *_: setattr(self, "_bgp_window", None)) self._bgp_window.connect("destroy", lambda *_: setattr(self, "_bgp_window", None))
except Exception as e: except Exception as e:
import traceback; traceback.print_exc() import traceback; traceback.print_exc()
d = Gtk.MessageDialog(self, 0, Gtk.MessageType.ERROR, Gtk.ButtonsType.OK, f"Error: {e}") d = Gtk.MessageDialog(self, 0, Gtk.MessageType.ERROR, Gtk.ButtonsType.OK, f"Error: {e}")
d.run(); d.destroy() d.run(); d.destroy()
def _open_monitoring(self):
    """Open the AI Monitoring window, or raise the one already open.

    Re-using the open window avoids stacking several windows that each hold
    their own copy of the configuration and overwrite one another on save.
    Construction errors are printed and shown in an error dialog.
    """
    existing = getattr(self, "_monitoring_window", None)
    if existing is not None:
        existing.present()
        return
    try:
        self._monitoring_window = AIMonitoringWindow(self)
        # Forget the window once it is closed so the next click reopens it.
        self._monitoring_window.connect("destroy", lambda *_: setattr(self, "_monitoring_window", None))
    except Exception as e:
        import traceback; traceback.print_exc()
        d = Gtk.MessageDialog(self, 0, Gtk.MessageType.ERROR, Gtk.ButtonsType.OK, f"Error: {e}")
        d.run(); d.destroy()
def _start_watcher(self):
    """Start the HealthWatcher thread if monitoring is enabled.

    Does nothing when monitoring is disabled in the saved configuration or
    when a watcher started earlier is still alive; a second watcher would
    duplicate every health check, log signal and recovery action.
    """
    cfg = _load_monitoring_config()
    if not cfg.get("enabled"):
        return
    current = getattr(self, "_watcher", None)
    if current is not None and current.is_alive():
        return
    self._watcher = HealthWatcher(
        on_failure=self._on_watcher_failure,
        on_recovery=self._on_watcher_recovery,
        on_signal=self._on_watcher_signal,
        on_action=self._on_watcher_action,
    )
    self._watcher.start()
    self.log("AI Monitoring: watchdog started")
# Watcher callback (passed to HealthWatcher as on_failure): report an
# unresponsive proxy in the launcher log. `count` is the failure count
# supplied by the watcher. The message is queued with GLib.idle_add rather
# than written directly.
def _on_watcher_failure(self, count):
GLib.idle_add(self.log, f"[AI Monitor] Proxy unresponsive (failures={count})")
# Watcher callback (passed to HealthWatcher as on_recovery): report that the
# proxy answers again. The message is queued with GLib.idle_add rather than
# written directly.
def _on_watcher_recovery(self):
GLib.idle_add(self.log, "[AI Monitor] Proxy recovered")
# Watcher callback (passed to HealthWatcher as on_signal) for every matched
# log line. Intentionally a no-op: individual signals are not shown in the
# launcher log.
def _on_watcher_signal(self, fault_id, category, line):
pass
# Watcher callback (passed to HealthWatcher as on_action): carry out the
# recovery `action` chosen for `trigger`.
# HealthWatcher invokes on_action from _handle_failure/_try_tier2_3, i.e. on
# the watcher or log-analyzer thread, which is why log output and the proxy
# restart are queued with GLib.idle_add instead of being called directly.
def _on_watcher_action(self, action, trigger):
# Re-read the settings so the current "auto_restart_proxy" choice applies.
cfg = _load_monitoring_config()
# restart_proxy is honoured only when auto-restart is enabled; otherwise the
# chain falls through to the final else branch and is logged as an alert.
if action == "restart_proxy" and cfg.get("auto_restart_proxy"):
GLib.idle_add(self.log, f"[AI Monitor] Auto-restarting proxy (trigger: {trigger})")
GLib.idle_add(self._restart_proxy_from_watcher)
# clear_schema_cache and delete_provider_caps both remove the same file,
# provider-caps.json; they differ only in the messages they log.
elif action == "clear_schema_cache":
try:
cap_file = Path.home() / ".cache/codex-proxy/provider-caps.json"
if cap_file.exists():
cap_file.unlink()
GLib.idle_add(self.log, "[AI Monitor] Cleared corrupt schema cache")
except Exception as e:
GLib.idle_add(self.log, f"[AI Monitor] Failed to clear cache: {e}")
elif action == "delete_provider_caps":
try:
cap_file = Path.home() / ".cache/codex-proxy/provider-caps.json"
if cap_file.exists():
cap_file.unlink()
GLib.idle_add(self.log, "[AI Monitor] Deleted corrupted provider-caps.json")
except Exception as e:
GLib.idle_add(self.log, f"[AI Monitor] Failed: {e}")
elif action == "kill_stale_restart":
GLib.idle_add(self.log, f"[AI Monitor] Killing stale processes + restarting (trigger: {trigger})")
# NOTE(review): unlike every other step, _kill() is called directly on the
# calling (non-GUI) thread, and this branch does not check
# "auto_restart_proxy". Confirm that _kill() is safe off the main thread.
self._kill()
GLib.idle_add(self._restart_proxy_from_watcher)
# Every other action name is only reported, not executed.
else:
GLib.idle_add(self.log, f"[AI Monitor] Alert: {action} (trigger: {trigger})")
def _restart_proxy_from_watcher(self):
    """Restart the proxy for the default endpoint (GLib idle callback).

    The endpoint file is read once so the default name and the endpoint list
    come from the same snapshot. A missing default or unknown endpoint is
    logged instead of silently doing nothing, because the caller has already
    announced the restart in the log.

    Returns:
        False, so GLib does not schedule the callback again.
    """
    try:
        config = load_endpoints()
        ep_name = config.get("default")
        if not ep_name:
            self.log("[AI Monitor] Proxy restart skipped: no default endpoint configured")
            return False
        for ep in config.get("endpoints", []):
            if ep.get("name") == ep_name:
                self._start_proxy(ep)
                break
        else:
            self.log(f"[AI Monitor] Proxy restart skipped: endpoint '{ep_name}' not found")
    except Exception as e:
        self.log(f"[AI Monitor] Proxy restart failed: {e}")
    return False
def _open_usage(self): def _open_usage(self):
try: try:

View File

@@ -3410,10 +3410,20 @@ class Handler(http.server.BaseHTTPRequestHandler):
if self.path in ("/v1/models", "/models"): if self.path in ("/v1/models", "/models"):
self.send_json(200, {"object": "list", "data": MODELS}) self.send_json(200, {"object": "list", "data": MODELS})
elif self.path in ("/health", "/v1/health"): elif self.path in ("/health", "/v1/health"):
# Process metrics for the enhanced /health payload.
_mem_mb = 0
try:
    # `resource` is POSIX-only; importing inside the guard keeps /health
    # answering (with memory_mb = 0) where the module is unavailable.
    import resource as _res
    # ru_maxrss is the PEAK resident set size, not the current one.
    # NOTE(review): the /1024 assumes kilobytes (Linux); confirm for other
    # platforms before relying on the value.
    _mem_mb = _res.getrusage(_res.RUSAGE_SELF).ru_maxrss / 1024
except Exception:
    pass
# _START_TIME is a module global assigned in main(). Inside a function,
# dir() lists only that function's local names, so the former
# `'_START_TIME' in dir()` test was always False and uptime_s was always 0.
_uptime = time.time() - _START_TIME if '_START_TIME' in globals() else 0
self.send_json(200, {"ok": True, "backend": BACKEND, self.send_json(200, {"ok": True, "backend": BACKEND,
"target_url": TARGET_URL, "target_url": TARGET_URL,
"models": [m.get("id") for m in MODELS], "models": [m.get("id") for m in MODELS],
"bgp_routes": len(BGP_ROUTES)}) "bgp_routes": len(BGP_ROUTES),
"uptime_s": round(_uptime, 1),
"memory_mb": round(_mem_mb, 1),
"requests_total": _STATS.get("requests", 0)})
else: else:
self.send_error(404) self.send_error(404)
@@ -4750,10 +4760,11 @@ def _handle_shutdown_signal(sig, frame):
_SHUTDOWN_REQUESTED = True _SHUTDOWN_REQUESTED = True
print(f"[SELF-REVIVE] Signal {sig} received, shutting down cleanly", flush=True) print(f"[SELF-REVIVE] Signal {sig} received, shutting down cleanly", flush=True)
if 'SERVER' in globals() and SERVER: if 'SERVER' in globals() and SERVER:
SERVER.shutdown() SERVER.shutdown()
def main(): def main():
global SERVER global SERVER, _START_TIME
_START_TIME = time.time()
_init_runtime() _init_runtime()
signal.signal(signal.SIGTERM, _handle_shutdown_signal) signal.signal(signal.SIGTERM, _handle_shutdown_signal)
signal.signal(signal.SIGINT, _handle_shutdown_signal) signal.signal(signal.SIGINT, _handle_shutdown_signal)