diff --git a/codex-launcher_3.8.0_all.deb b/codex-launcher_3.8.0_all.deb
new file mode 100644
index 0000000..d24e7f7
Binary files /dev/null and b/codex-launcher_3.8.0_all.deb differ
diff --git a/install.sh b/install.sh
index 70b0b75..d03541b 100755
--- a/install.sh
+++ b/install.sh
@@ -3,11 +3,11 @@ set -e
 
 SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
 
-if [ -f "$SCRIPT_DIR/codex-launcher_3.7.0_all.deb" ]; then
-    echo "Installing codex-launcher_3.7.0_all.deb ..."
-    sudo dpkg -i "$SCRIPT_DIR/codex-launcher_3.7.0_all.deb"
+if [ -f "$SCRIPT_DIR/codex-launcher_3.8.0_all.deb" ]; then
+    echo "Installing codex-launcher_3.8.0_all.deb ..."
+    sudo dpkg -i "$SCRIPT_DIR/codex-launcher_3.8.0_all.deb"
     echo ""
-    echo "Installed v3.7.0 via .deb package."
+    echo "Installed v3.8.0 via .deb package."
     echo " translate-proxy.py -> /usr/bin/translate-proxy.py"
     echo " codex-launcher-gui -> /usr/bin/codex-launcher-gui"
     echo " cleanup-codex-stale -> /usr/bin/cleanup-codex-stale.sh"
diff --git a/src/codex-launcher-gui b/src/codex-launcher-gui
index 951e0e9..137b590 100755
--- a/src/codex-launcher-gui
+++ b/src/codex-launcher-gui
@@ -5,7 +5,7 @@ import gi
 gi.require_version("Gtk", "3.0")
 from gi.repository import Gtk, GLib
 import subprocess, os, signal, sys, threading, time, json, urllib.request, urllib.parse, urllib.error, tempfile, shutil
-import hashlib, socket, ssl, contextlib, re
+import hashlib, socket, ssl, contextlib, re, collections
 import base64, secrets
 from pathlib import Path
 
@@ -1123,6 +1123,687 @@ def _check_codex_auth():
     except Exception as e:
         return ("error", str(e))
 
+# ═══════════════════════════════════════════════════════════════════
+# AI Monitoring — Self-Healing Watchdog
+# ═══════════════════════════════════════════════════════════════════
+
+MONITORING_FILE = Path.home() / ".cache/codex-proxy/monitoring-config.json"
+INCIDENT_STORE_FILE = Path.home() / ".cache/codex-proxy/incident-store.json"
+MONITORING_LOG = Path.home() / ".cache/codex-proxy/monitoring.log"
+
+# Log files tailed by the watchdog; the last one also feeds Tier 3 reports.
+_WATCHED_LOGS = (
+    Path.home() / ".cache/codex-proxy/cc-debug.log",
+    Path.home() / ".cache/codex-proxy/proxy.log",
+)
+
+# Tier 1 rules: (trigger, action, cooldown in seconds).  Cooldowns are tracked
+# per action; 0 lets the action fire every time its trigger is raised.
+_TIER1_RULES = [
+    ("proxy_health_fail", "restart_proxy", 30),
+    ("proxy_port_conflict", "kill_stale_restart", 60),
+    ("upstream_429", "wait_retry", 0),
+    ("upstream_502_503", "retry_backoff", 30),
+    ("upstream_500_repeat", "switch_provider", 60),
+    ("upstream_timeout", "retry_increase_timeout", 30),
+    ("upstream_401_403", "alert_bad_key", 0),
+    ("stream_broken_pipe", "restart_proxy", 30),
+    ("stream_reset", "restart_proxy", 30),
+    ("parsed_tool_calls_0_x3", "clear_schema_cache", 300),
+    ("sanitizer_suspicious_5x", "alert_model_issue", 0),
+    ("stuck_recovery_x5", "suggest_switch_model", 0),
+    ("codex_process_dead", "alert_restart", 0),
+    ("schema_corrupt", "delete_provider_caps", 0),
+]
+
+# Log pattern -> (fault id, category).  Keys are regular expressions given to
+# re.search(), so literal metacharacters must be escaped; the first matching
+# entry in insertion order wins.
+_FAILURE_SIGNALS = {
+    "parsed_tool_calls=0": ("C1", "parser_empty"),
+    r"\[STUCK-RECOVERY\]": ("C3", "stuck_recovery"),
+    "suspicious cmd": ("C4", "sanitizer_flag"),
+    "empty cmd recovered": ("C6", "empty_cmd"),
+    "HTTP 429": ("B1", "rate_limited"),
+    "HTTP 500": ("B2", "server_error"),
+    "HTTP 502": ("B2", "server_error"),
+    "HTTP 503": ("B2", "server_error"),
+    "HTTP 401": ("B3", "auth_failure"),
+    "HTTP 403": ("B4", "forbidden"),
+    "Connection refused": ("A1", "proxy_dead"),
+    "Address already in use": ("A2", "port_conflict"),
+    "Broken pipe": ("B7", "broken_pipe"),
+    "Connection reset": ("B6", "connection_reset"),
+    "timed out": ("B5", "timeout"),
+    "SELF-REVIVE CRASH": ("A5", "proxy_crash"),
+    "stream error": ("B6", "stream_error"),
+    "content_type.*array": ("E1", "schema_corrupt"),
+}
+
+# Compiled once so the log tailer does not look patterns up again per line.
+_COMPILED_SIGNALS = [(re.compile(p), sig) for p, sig in _FAILURE_SIGNALS.items()]
+
+# Signal category -> (trigger raised, occurrences needed before it is raised).
+# Categories missing from this table are reported via on_signal only.
+# NOTE(review): "proxy_dead", "port_conflict", "server_error_repeat" and
+# "timeout_repeat" match no _TIER1_RULES trigger, so they always fall through to
+# Tier 2/3 — confirm that is intended rather than a naming mismatch.
+_SIGNAL_TRIGGERS = {
+    "proxy_dead": ("proxy_dead", 2),
+    "port_conflict": ("port_conflict", 2),
+    "server_error": ("server_error_repeat", 3),
+    "timeout": ("timeout_repeat", 3),
+    "sanitizer_flag": ("sanitizer_suspicious_5x", 5),
+    "stuck_recovery": ("stuck_recovery_x5", 5),
+    "parser_empty": ("parsed_tool_calls_0_x3", 3),
+    "schema_corrupt": ("schema_corrupt", 1),
+}
+
+_DIAGNOSTIC_SYSTEM_PROMPT = (
+    'You are a diagnostic agent for "Codex Launcher" — a desktop app that runs a local '
+    'translation proxy between OpenAI Codex CLI/Desktop and AI providers.\n\n'
+    'Analyze the incident and respond with ONLY a JSON object:\n'
+    '{"action": "...", "reason": "...", "confidence": 0.0-1.0}\n\n'
+    'Available actions: restart_proxy, kill_stale_processes, clear_schema_cache, '
+    'switch_provider, increase_timeout, regenerate_config, cleanup_stale, '
+    'alert_user, ignore, retry_now\n\n'
+    'Rules:\n'
+    '- upstream 401/403 with auth error -> alert_user\n'
+    '- proxy dead -> restart_proxy\n'
+    '- same error 5+ times -> switch_provider or alert_user\n'
+    '- schema/content_type error -> clear_schema_cache\n'
+    '- "Address already in use" -> kill_stale_processes then restart_proxy\n'
+    '- timeout on slow upstream -> increase_timeout\n'
+    '- single transient 429/502/503 -> ignore\n'
+    '- "stream disconnected" + proxy healthy -> ignore\n'
+    '- no extra text, no markdown, just the JSON object'
+)
+
+# Defaults; values found in the config file are layered on top of these.
+_MONITORING_DEFAULTS = {
+    "enabled": False,
+    "provider_url": "",
+    "model": "",
+    "api_key": "",
+    "health_check_interval_s": 5,
+    "auto_restart_proxy": True,
+    "auto_switch_provider": False,
+}
+
+
+def _read_json_dict(path):
+    """Return the JSON object stored at *path*, or None if missing/invalid."""
+    try:
+        data = json.loads(path.read_text())
+    except (OSError, ValueError):
+        return None
+    return data if isinstance(data, dict) else None
+
+
+def _load_monitoring_config():
+    """Load the monitoring config, filling absent keys from the defaults."""
+    cfg = dict(_MONITORING_DEFAULTS)
+    cfg.update(_read_json_dict(MONITORING_FILE) or {})
+    return cfg
+
+
+def _save_monitoring_config(cfg):
+    """Persist *cfg*; the file holds an API key, so keep it owner-only."""
+    MONITORING_FILE.parent.mkdir(parents=True, exist_ok=True)
+    fd = os.open(str(MONITORING_FILE), os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
+    with os.fdopen(fd, "w") as f:
+        f.write(json.dumps(cfg, indent=2))
+    try:
+        # The mode passed to os.open() only applies when the file is created.
+        os.chmod(str(MONITORING_FILE), 0o600)
+    except OSError:
+        pass
+
+
+def _load_incident_store():
+    """Load the incident store, or return an empty one if missing/invalid."""
+    store = _read_json_dict(INCIDENT_STORE_FILE)
+    if store is not None:
+        return store
+    return {"version": 1, "incidents": {}, "stats": {"ai_calls": 0, "tokens_used": 0}}
+
+
+def _save_incident_store(store):
+    """Write *store* to INCIDENT_STORE_FILE as indented JSON."""
+    INCIDENT_STORE_FILE.parent.mkdir(parents=True, exist_ok=True)
+    INCIDENT_STORE_FILE.write_text(json.dumps(store, indent=2))
+
+
+def _monitoring_log(msg):
+    """Append a timestamped line to MONITORING_LOG (best effort, never raises)."""
+    try:
+        MONITORING_LOG.parent.mkdir(parents=True, exist_ok=True)
+        with open(str(MONITORING_LOG), "a") as f:
+            f.write(f"[{time.strftime('%H:%M:%S')}] {msg}\n")
+    except Exception:
+        pass
+
+
+class IncidentStore:
+    """Lock-protected, file-backed record of which fix was applied to which pattern."""
+
+    def __init__(self):
+        self._store = _load_incident_store()
+        self._dirty = False
+        # record() and flush() are called from different watchdog threads.
+        self._lock = threading.Lock()
+
+    def lookup(self, pattern):
+        """Return the incident for *pattern* if its fix succeeded more than half the time."""
+        with self._lock:
+            inc = self._store.get("incidents", {}).get(pattern)
+            if not inc:
+                return None
+            wins = inc.get("success_count", 0)
+            total = wins + inc.get("fail_count", 0)
+            if wins > 0 and wins / max(total, 1) > 0.5:
+                return inc
+            return None
+
+    def record(self, pattern, fix, success=True):
+        """Count one occurrence of *pattern*; *fix* is stored only on first sight."""
+        now = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
+        with self._lock:
+            incs = self._store.setdefault("incidents", {})
+            inc = incs.setdefault(pattern, {
+                "fix": fix, "success_count": 0, "fail_count": 0,
+                "last_seen": now, "occurrences": 0,
+            })
+            inc["last_seen"] = now
+            inc["occurrences"] = inc.get("occurrences", 0) + 1
+            key = "success_count" if success else "fail_count"
+            inc[key] = inc.get(key, 0) + 1
+            self._dirty = True
+
+    def record_ai_call(self, tokens=0):
+        """Add one AI call and *tokens* to the usage statistics."""
+        with self._lock:
+            stats = self._store.setdefault("stats", {"ai_calls": 0, "tokens_used": 0})
+            stats["ai_calls"] = stats.get("ai_calls", 0) + 1
+            stats["tokens_used"] = stats.get("tokens_used", 0) + tokens
+            self._dirty = True
+
+    def flush(self):
+        """Write the store to disk if anything changed since the last flush."""
+        with self._lock:
+            if self._dirty:
+                _save_incident_store(self._store)
+                self._dirty = False
+
+    @property
+    def stats(self):
+        """The "stats" dict of the store (ai_calls / tokens_used counters)."""
+        return self._store.get("stats", {"ai_calls": 0, "tokens_used": 0})
+
+
+class AIDiagnosticAgent:
+    """Tier 2/3 diagnosis: reuse a known fix, otherwise ask the configured model."""
+
+    def __init__(self, provider_url, model, api_key, incident_store=None):
+        self.provider_url = provider_url
+        self.model = model
+        self.api_key = api_key
+        # Sharing the caller's store avoids two copies overwriting each other on disk.
+        self.incident_store = incident_store if incident_store is not None else IncidentStore()
+
+    def diagnose(self, context):
+        """Return an action dict for *context* (keys: action, reason, confidence, tier)."""
+        pattern = self._extract_pattern(context)
+        known = self.incident_store.lookup(pattern)
+        if known:
+            _monitoring_log(f"Tier 2 HIT: pattern={pattern} fix={known['fix']}")
+            return {"action": known["fix"], "reason": "known_pattern", "confidence": 0.9, "tier": 2}
+        action = self._call_model(context)
+        # A failed model call must not be learned as the "known fix" for this pattern.
+        failed = str(action.get("reason", "")).startswith("ai_diag_failed")
+        if not failed:
+            self.incident_store.record(pattern, action.get("action", "unknown"))
+        self.incident_store.flush()
+        return action
+
+    def _extract_pattern(self, context):
+        """Join the sorted signals plus the HTTP code (first three parts only) with "+"."""
+        parts = sorted(context.get("signals", []))
+        if context.get("http_code"):
+            parts.append(f"http_{context['http_code']}")
+        return "+".join(parts[:3]) or "unknown"
+
+    def _call_model(self, context):
+        """POST the incident to the provider; on any error return an alert_user action."""
+        prompt = (
+            f"INCIDENT REPORT:\n"
+            f"Time: {time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())}\n"
+            f"Proxy health: {context.get('proxy_alive', 'unknown')}\n"
+            f"Upstream: {context.get('upstream_url', 'unknown')}\n"
+            f"Model: {context.get('model', 'unknown')}\n"
+            f"Last HTTP code: {context.get('http_code', 'n/a')}\n"
+            f"Recent signals: {context.get('signals', [])}\n"
+            # Keep the END of the tail: the newest lines sit last in the log.
+            f"Recent log tail:\n{context.get('log_tail', '')[-1500:]}\n"
+        )
+        body = {
+            "model": self.model,
+            "messages": [
+                {"role": "system", "content": _DIAGNOSTIC_SYSTEM_PROMPT},
+                {"role": "user", "content": prompt},
+            ],
+            "max_tokens": 200,
+            "temperature": 0.1,
+        }
+        try:
+            req = urllib.request.Request(
+                self.provider_url,
+                data=json.dumps(body).encode(),
+                headers={
+                    "Content-Type": "application/json",
+                    "Authorization": f"Bearer {self.api_key}",
+                },
+            )
+            with urllib.request.urlopen(req, timeout=15) as resp:
+                result = json.loads(resp.read())
+            text = result["choices"][0]["message"]["content"].strip()
+            # Use the reported token count; fall back to a flat 800 when absent.
+            tokens = (result.get("usage") or {}).get("total_tokens") or 800
+            self.incident_store.record_ai_call(tokens=tokens)
+            # Tolerate replies that wrap the JSON object in a fence or extra text.
+            start, end = text.find("{"), text.rfind("}")
+            action = json.loads(text[start:end + 1] if 0 <= start < end else text)
+            if not isinstance(action, dict):
+                raise ValueError("model reply is not a JSON object")
+            action["tier"] = 3
+            _monitoring_log(f"Tier 3 AI: action={action.get('action')} reason={action.get('reason')}")
+            return action
+        except Exception as e:
+            _monitoring_log(f"Tier 3 AI FAILED: {e}")
+            return {"action": "alert_user", "reason": f"ai_diag_failed: {e}", "confidence": 0.0, "tier": 3}
+
+
+class HealthWatcher(threading.Thread):
+    """Daemon thread that polls the proxy /health endpoint and escalates failures.
+
+    Callbacks are invoked on the watchdog threads, not the GTK main loop:
+    on_failure(count), on_recovery(), on_signal(fault_id, category, line)
+    and on_action(action, trigger).
+    """
+
+    def __init__(self, on_failure, on_recovery, on_signal, on_action):
+        super().__init__(daemon=True)
+        self.cfg = _load_monitoring_config()
+        self.on_failure = on_failure
+        self.on_recovery = on_recovery
+        self.on_signal = on_signal
+        self.on_action = on_action
+        self.failures = 0
+        self.running = False
+        self.incident_store = IncidentStore()
+        self._signal_counts = collections.defaultdict(int)
+        self._last_actions = {}
+        self._restart_count = 0
+        self._last_restart_time = 0
+        self._log_analyzer = None
+        # Guards _signal_counts/_last_actions: the log tailer and run() both escalate.
+        self._lock = threading.Lock()
+
+    def run(self):
+        """Poll proxy health until stop(); idles while monitoring is disabled."""
+        self.running = True
+        self._log_analyzer = _LogAnalyzerThread(self._on_log_signal)
+        self._log_analyzer.start()
+        while self.running:
+            self.cfg = _load_monitoring_config()
+            if not self.cfg.get("enabled"):
+                time.sleep(5)
+                continue
+            port = self._get_proxy_port()
+            if port:
+                self._poll_once(port)
+            self.incident_store.flush()
+            time.sleep(self._poll_interval())
+
+    def _poll_once(self, port):
+        """Run one health probe and update the consecutive-failure counter."""
+        if self._check_health(port):
+            reported = self.failures >= 3
+            self.failures = 0
+            if reported:
+                self.on_recovery()
+            return
+        self.failures += 1
+        if self.failures == 3:
+            # Announce the outage once, when the threshold is first crossed.
+            self.on_failure(self.failures)
+        if self.failures >= 3:
+            self._handle_failure("proxy_health_fail")
+
+    def _poll_interval(self):
+        """Configured seconds between probes, clamped to >= 1 to avoid a busy loop."""
+        try:
+            return max(float(self.cfg.get("health_check_interval_s", 5)), 1.0)
+        except (TypeError, ValueError):
+            return 5.0
+
+    def stop(self):
+        """Ask this thread and its log tailer to exit after their current pass."""
+        self.running = False
+        if self._log_analyzer is not None:
+            self._log_analyzer.running = False
+
+    def _get_proxy_port(self):
+        """Return the "port" value from proxy-config.json, or None if unavailable."""
+        try:
+            cfg_path = Path.home() / ".cache/codex-proxy/proxy-config.json"
+            if cfg_path.exists():
+                d = json.loads(cfg_path.read_text())
+                return d.get("port")
+        except Exception:
+            pass
+        return None
+
+    def _check_health(self, port):
+        """Return True when GET /health on localhost:*port* answers 200 within 5 s."""
+        try:
+            req = urllib.request.Request(f"http://localhost:{port}/health")
+            with urllib.request.urlopen(req, timeout=5) as resp:
+                return resp.status == 200
+        except Exception:
+            return False
+
+    def _on_log_signal(self, fault_id, category, line):
+        """Count a log signal and escalate once its category reaches its threshold."""
+        if not self.cfg.get("enabled"):
+            return
+        self.on_signal(fault_id, category, line[:200])
+        rule = _SIGNAL_TRIGGERS.get(category)
+        with self._lock:
+            self._signal_counts[category] += 1
+            if rule is None or self._signal_counts[category] < rule[1]:
+                return
+            # Start a fresh window so one burst does not re-fire on every later line.
+            self._signal_counts[category] = 0
+        self._handle_failure(rule[0])
+
+    def _handle_failure(self, trigger):
+        """Apply the Tier 1 rule for *trigger* (honouring its cooldown) or defer to Tier 2/3."""
+        rule = next(((a, c) for t, a, c in _TIER1_RULES if t == trigger), None)
+        if rule is None:
+            self._try_tier2_3(trigger)
+            return
+        action, cooldown = rule
+        now = time.time()
+        with self._lock:
+            if now - self._last_actions.get(action, 0) < cooldown:
+                return
+            self._last_actions[action] = now
+        _monitoring_log(f"Tier 1: trigger={trigger} action={action}")
+        self.on_action(action, trigger)
+        self.incident_store.record(trigger, action, success=True)
+
+    def _try_tier2_3(self, trigger):
+        """Diagnose *trigger* via the AI agent, or alert the user if none is configured."""
+        cfg = self.cfg
+        if not cfg.get("provider_url") or not cfg.get("model") or not cfg.get("api_key"):
+            _monitoring_log(f"No AI configured for Tier 2/3 — alerting user for trigger={trigger}")
+            self.on_action("alert_user", trigger)
+            return
+        agent = AIDiagnosticAgent(
+            cfg["provider_url"], cfg["model"], cfg["api_key"],
+            incident_store=self.incident_store,
+        )
+        context = {
+            "signals": [trigger],
+            "proxy_alive": self.failures == 0,
+            "log_tail": self._get_recent_log(),
+        }
+        result = agent.diagnose(context)
+        if result:
+            action = result.get("action", "alert_user")
+            _monitoring_log(f"Tier {result.get('tier', '?')}: action={action}")
+            self.on_action(action, trigger)
+
+    def _get_recent_log(self, max_bytes=4000):
+        """Return up to *max_bytes* from the end of the last watched log ("" on error)."""
+        try:
+            with open(str(_WATCHED_LOGS[-1]), "rb") as f:
+                f.seek(0, 2)
+                f.seek(max(f.tell() - max_bytes, 0))
+                return f.read().decode("utf-8", errors="replace")
+        except OSError:
+            return ""
+
+
+class _LogAnalyzerThread(threading.Thread):
+    """Daemon thread that tails the watched logs and reports matching lines."""
+
+    def __init__(self, on_signal):
+        super().__init__(daemon=True)
+        self.on_signal = on_signal
+        self.running = False
+
+    def run(self):
+        """Tail every watched log until `running` is cleared, then close the files."""
+        self.running = True
+        handles = {}
+        # Existing content is history: start at the end of files present at startup.
+        self._open_missing(handles, skip_existing=True)
+        try:
+            while self.running:
+                if not any([self._read_line(fh) for fh in handles.values()]):
+                    # Idle: pick up logs created since the last pass, then back off.
+                    self._open_missing(handles, skip_existing=False)
+                    time.sleep(0.5)
+        finally:
+            for fh in handles.values():
+                fh.close()
+
+    @staticmethod
+    def _open_missing(handles, skip_existing):
+        """Open watched logs not yet in *handles*; optionally seek to their end."""
+        for path in _WATCHED_LOGS:
+            if path in handles:
+                continue
+            try:
+                fh = open(str(path), "r", errors="replace")
+            except OSError:
+                continue
+            if skip_existing:
+                fh.seek(0, 2)
+            handles[path] = fh
+
+    def _read_line(self, fh):
+        """Read one line from *fh*, report its first matching signal; True if a line was read."""
+        try:
+            line = fh.readline()
+            if not line:
+                return False
+            for pattern, (fault_id, category) in _COMPILED_SIGNALS:
+                if pattern.search(line):
+                    self.on_signal(fault_id, category, line.strip())
+                    break
+            return True
+        except Exception as e:
+            # One bad read or failing callback must not kill the tailer.
+            _monitoring_log(f"Log analyzer error: {e}")
+            return False
+
+
+class AIMonitoringWindow(Gtk.Window):
+    """Settings and incident-history window for the AI monitoring watchdog."""
+
+    def __init__(self, parent=None):
+        super().__init__(title="AI Monitoring")
+        self.set_transient_for(parent)
+        self.set_default_size(580, 520)
+        self.set_border_width(12)
+        self._cfg = _load_monitoring_config()
+        self._store = _load_incident_store()
+
+        vbox = Gtk.Box(orientation=Gtk.Orientation.VERTICAL, spacing=8)
+        self.add(vbox)
+
+        hdr = Gtk.Box(spacing=8)
+        vbox.pack_start(hdr, False, False, 0)
+        lbl = Gtk.Label()
+        lbl.set_markup("AI Monitoring")
+        lbl.set_use_markup(True)
+        hdr.pack_start(lbl, False, False, 0)
+        self._toggle = Gtk.Switch()
+        self._toggle.set_active(self._cfg.get("enabled", False))
+        self._toggle.connect("state-set", self._on_toggle)
+        hdr.pack_end(self._toggle, False, False, 0)
+        lbl2 = Gtk.Label(label="Enabled")
+        hdr.pack_end(lbl2, False, False, 0)
+
+        frame = Gtk.Frame(label="Diagnostic Agent")
+        vbox.pack_start(frame, False, False, 0)
+        grid = Gtk.Grid(column_spacing=8, row_spacing=6, margin=8)
+        frame.add(grid)
+
+        grid.attach(Gtk.Label(label="Provider URL:", halign=Gtk.Align.END), 0, 0, 1, 1)
+        self._url_entry = Gtk.Entry(hexpand=True)
+        self._url_entry.set_text(self._cfg.get("provider_url", ""))
+        self._url_entry.set_placeholder_text("https://api.openai.com/v1/chat/completions")
+        grid.attach(self._url_entry, 1, 0, 2, 1)
+
+        grid.attach(Gtk.Label(label="Model:", halign=Gtk.Align.END), 0, 1, 1, 1)
+        self._model_entry = Gtk.Entry(hexpand=True)
+        self._model_entry.set_text(self._cfg.get("model", ""))
+        self._model_entry.set_placeholder_text("gpt-4o-mini or Qwen/Qwen3-32B")
+        grid.attach(self._model_entry, 1, 1, 2, 1)
+
+        grid.attach(Gtk.Label(label="API Key:", halign=Gtk.Align.END), 0, 2, 1, 1)
+        self._key_entry = Gtk.Entry(hexpand=True, visibility=False)
+        self._key_entry.set_text(self._cfg.get("api_key", ""))
+        self._key_entry.set_placeholder_text("sk-...")
+        grid.attach(self._key_entry, 1, 2, 1, 1)
+        self._reveal_btn = Gtk.ToggleButton(label="Show")
+        self._reveal_btn.connect("toggled", lambda b: self._key_entry.set_visibility(b.get_active()))
+        grid.attach(self._reveal_btn, 2, 2, 1, 1)
+
+        grid.attach(Gtk.Label(label="Health Check:", halign=Gtk.Align.END), 0, 3, 1, 1)
+        adj = Gtk.Adjustment(value=self._cfg.get("health_check_interval_s", 5), lower=2, upper=30, step_increment=1)
+        self._interval_spin = Gtk.SpinButton(adjustment=adj)
+        self._interval_spin.set_numeric(True)
+        grid.attach(self._interval_spin, 1, 3, 1, 1)
+        grid.attach(Gtk.Label(label="seconds"), 2, 3, 1, 1)
+
+        opts_box = Gtk.Box(spacing=12, margin_top=4)
+        grid.attach(opts_box, 0, 4, 3, 1)
+        self._auto_restart_cb = Gtk.CheckButton(label="Auto-restart proxy on crash")
+        self._auto_restart_cb.set_active(self._cfg.get("auto_restart_proxy", True))
+        opts_box.pack_start(self._auto_restart_cb, False, False, 0)
+        self._auto_switch_cb = Gtk.CheckButton(label="Auto-switch provider on repeated failure")
+        self._auto_switch_cb.set_active(self._cfg.get("auto_switch_provider", False))
+        opts_box.pack_start(self._auto_switch_cb, False, False, 0)
+
+        save_btn = Gtk.Button(label="Save Configuration")
+        save_btn.get_style_context().add_class("suggested-action")
+        save_btn.connect("clicked", self._on_save)
+        grid.attach(save_btn, 0, 5, 3, 1)
+
+        stats_box = Gtk.Box(spacing=16)
+        vbox.pack_start(stats_box, False, False, 0)
+        self._stats_lbl = Gtk.Label()
+        self._stats_lbl.set_use_markup(True)
+        stats_box.pack_start(self._stats_lbl, False, False, 0)
+        self._refresh_stats()
+
+        frame2 = Gtk.Frame(label="Recent Incidents")
+        vbox.pack_start(frame2, True, True, 0)
+        sw = Gtk.ScrolledWindow()
+        sw.set_policy(Gtk.PolicyType.AUTOMATIC, Gtk.PolicyType.AUTOMATIC)
+        frame2.add(sw)
+        self._inc_buf = Gtk.TextBuffer()
+        tv = Gtk.TextView(buffer=self._inc_buf)
+        tv.set_editable(False)
+        tv.set_cursor_visible(False)
+        tv.set_wrap_mode(Gtk.WrapMode.WORD_CHAR)
+        sw.add(tv)
+        self._refresh_incidents()
+
+        bb = Gtk.Box(spacing=8)
+        vbox.pack_start(bb, False, False, 0)
+        view_btn = Gtk.Button(label="View Monitoring Log")
+        view_btn.connect("clicked", self._on_view_log)
+        bb.pack_start(view_btn, False, False, 0)
+        clear_btn = Gtk.Button(label="Clear Incident Store")
+        clear_btn.connect("clicked", self._on_clear_store)
+        bb.pack_start(clear_btn, False, False, 0)
+        close_btn = Gtk.Button(label="Close")
+        close_btn.connect("clicked", lambda b: self.destroy())
+        bb.pack_end(close_btn, False, False, 0)
+
+        self.show_all()
+
+    def _on_toggle(self, switch, state):
+        """Persist the enabled flag as soon as the switch is flipped."""
+        self._cfg["enabled"] = state
+        _save_monitoring_config(self._cfg)
+        # A falsy return lets GTK's default handler update the switch state.
+        return False
+
+    def _on_save(self, btn):
+        """Copy the form fields into the config and write it to disk."""
+        self._cfg["provider_url"] = self._url_entry.get_text().strip()
+        self._cfg["model"] = self._model_entry.get_text().strip()
+        self._cfg["api_key"] = self._key_entry.get_text().strip()
+        self._cfg["health_check_interval_s"] = int(self._interval_spin.get_value())
+        self._cfg["auto_restart_proxy"] = self._auto_restart_cb.get_active()
+        self._cfg["auto_switch_provider"] = self._auto_switch_cb.get_active()
+        _save_monitoring_config(self._cfg)
+        self._inc_buf.set_text("Configuration saved.\n")
+
+    def _on_view_log(self, btn):
+        """Open the monitoring log with xdg-open; report a launch failure in the text view."""
+        try:
+            subprocess.Popen(["xdg-open", str(MONITORING_LOG)])
+        except OSError as e:
+            self._inc_buf.set_text(f"Could not open monitoring log: {e}\n")
+
+    def _on_clear_store(self, btn):
+        """Reset the incident store on disk and refresh both stats and list."""
+        self._store = {"version": 1, "incidents": {}, "stats": {"ai_calls": 0, "tokens_used": 0}}
+        _save_incident_store(self._store)
+        self._refresh_stats()
+        self._refresh_incidents()
+
+    def _refresh_stats(self):
+        """Render the usage counters of the loaded store into the stats label."""
+        stats = self._store.get("stats", {"ai_calls": 0, "tokens_used": 0})
+        self._stats_lbl.set_markup(
+            f"AI diagnostic calls: {stats.get('ai_calls', 0)} | "
+            f"Tokens used: {stats.get('tokens_used', 0):,} | "
+            f"Known patterns: {len(self._store.get('incidents', {}))}"
+        )
+
+    def _refresh_incidents(self):
+        """List stored incidents, most recently seen first, with a success-rate bar."""
+        lines = []
+        incidents = sorted(
+            self._store.get("incidents", {}).items(),
+            key=lambda x: x[1].get("last_seen", ""), reverse=True,
+        )
+        for pattern, inc in incidents:
+            sc = inc.get("success_count", 0)
+            fc = inc.get("fail_count", 0)
+            rate = sc / max(sc + fc, 1)
+            filled = min(int(rate * 10), 10)
+            bar = "+" * filled + "-" * (10 - filled)
+            lines.append(
+                f"[{inc.get('last_seen', '?')[:16]}] {pattern}\n"
+                f" fix={inc.get('fix', '?')} success_rate={rate:.0%} [{bar}] "
+                f"seen={inc.get('occurrences', 0)}x\n"
+            )
+        if not lines:
+            lines.append("No incidents recorded yet.\n")
+            lines.append("\nEnable AI Monitoring and use Codex to populate the store.\n")
+        self._inc_buf.set_text("\n".join(lines))
+
+
 # ═══════════════════════════════════════════════════════════════════
 # Main window
 # ═══════════════════════════════════════════════════════════════════
@@ -1143,7 +1824,7 @@ class LauncherWin(Gtk.Window):
         # header row
         hdr = Gtk.Box(spacing=8)
         vbox.pack_start(hdr, False, False, 0)
-        lbl = Gtk.Label(label="Codex Launcher v3.7.0")
+        lbl = Gtk.Label(label="Codex Launcher v3.8.0")
         lbl.set_use_markup(True)
         hdr.pack_start(lbl, False, False, 0)
         changelog_btn = Gtk.Button(label="Changelog")
@@ -1161,6 +1842,9 @@ class LauncherWin(Gtk.Window):
         bgp_btn = Gtk.Button(label="AI BGP")
         bgp_btn.connect("clicked", lambda b: self._open_bgp())
         hdr.pack_end(bgp_btn, False, False, 0)
+        mon_btn = Gtk.Button(label="AI Monitor")
+        mon_btn.connect("clicked", lambda b: self._open_monitoring())
+        hdr.pack_end(mon_btn, False, False, 0)
         mgr_btn = Gtk.Button(label="Manage Endpoints")
         mgr_btn.connect("clicked", lambda b: self._open_mgr())
         hdr.pack_end(mgr_btn, False, False, 0)
@@ -1310,6 +1994,7 @@ class LauncherWin(Gtk.Window):
         self.show_all()
         self._rebuild_combo()
         self._log_dependency_status()
+        self._start_watcher()
 
     # ── helpers ──────────────────────────────────────────────────
 
@@ -1456,13 +2141,93 @@ class LauncherWin(Gtk.Window):
             d.run(); d.destroy()
 
     def _open_bgp(self):
-        try:
-            self._bgp_window = BGPPoolMgr(self)
-            self._bgp_window.connect("destroy", lambda *_: setattr(self, "_bgp_window", None))
-        except Exception as e:
-            import traceback; traceback.print_exc()
-            d = Gtk.MessageDialog(self, 0, Gtk.MessageType.ERROR, Gtk.ButtonsType.OK, f"Error: {e}")
-            d.run(); d.destroy()
+        try:
+            self._bgp_window = BGPPoolMgr(self)
+            self._bgp_window.connect("destroy", lambda *_: setattr(self, "_bgp_window", None))
+        except Exception as e:
+            import traceback; traceback.print_exc()
+            d = Gtk.MessageDialog(self, 0, Gtk.MessageType.ERROR, Gtk.ButtonsType.OK, f"Error: {e}")
+            d.run(); d.destroy()
+
+    def _open_monitoring(self):
+        """Open the AI Monitoring window; show an error dialog if that fails."""
+        try:
+            self._monitoring_window = AIMonitoringWindow(self)
+            self._monitoring_window.connect("destroy", lambda *_: setattr(self, "_monitoring_window", None))
+        except Exception as e:
+            import traceback; traceback.print_exc()
+            d = Gtk.MessageDialog(self, 0, Gtk.MessageType.ERROR, Gtk.ButtonsType.OK, f"Error: {e}")
+            d.run(); d.destroy()
+
+    def _start_watcher(self):
+        """Start the watchdog thread and log whether monitoring is enabled."""
+        # Started unconditionally: HealthWatcher.run() re-reads the config and idles
+        # while disabled, so flipping the switch takes effect without a restart.
+        self._watcher = HealthWatcher(
+            on_failure=self._on_watcher_failure,
+            on_recovery=self._on_watcher_recovery,
+            on_signal=self._on_watcher_signal,
+            on_action=self._on_watcher_action,
+        )
+        self._watcher.start()
+        state = "started" if _load_monitoring_config().get("enabled") else "idle (disabled)"
+        self.log(f"AI Monitoring: watchdog {state}")
+
+    def _on_watcher_failure(self, count):
+        GLib.idle_add(self.log, f"[AI Monitor] Proxy unresponsive (failures={count})")
+
+    def _on_watcher_recovery(self):
+        GLib.idle_add(self.log, "[AI Monitor] Proxy recovered")
+
+    def _on_watcher_signal(self, fault_id, category, line):
+        pass
+
+    def _on_watcher_action(self, action, trigger):
+        """Carry out a watchdog action; called on a watchdog thread, so UI work is queued."""
+        cfg = _load_monitoring_config()
+        if action == "restart_proxy" and cfg.get("auto_restart_proxy"):
+            GLib.idle_add(self.log, f"[AI Monitor] Auto-restarting proxy (trigger: {trigger})")
+            GLib.idle_add(self._restart_proxy_from_watcher)
+        elif action == "clear_schema_cache":
+            self._remove_provider_caps("Cleared corrupt schema cache", "Failed to clear cache")
+        elif action == "delete_provider_caps":
+            self._remove_provider_caps("Deleted corrupted provider-caps.json", "Failed")
+        elif action == "kill_stale_restart":
+            GLib.idle_add(self.log, f"[AI Monitor] Killing stale processes + restarting (trigger: {trigger})")
+            # NOTE(review): _kill() is not visible in this patch; it is queued on the main loop like the restart — confirm it is safe there.
+            GLib.idle_add(self._kill_and_restart_from_watcher)
+        else:
+            GLib.idle_add(self.log, f"[AI Monitor] Alert: {action} (trigger: {trigger})")
+
+    def _remove_provider_caps(self, ok_msg, fail_msg):
+        """Delete provider-caps.json if present and log *ok_msg* or *fail_msg*."""
+        try:
+            cap_file = Path.home() / ".cache/codex-proxy/provider-caps.json"
+            if cap_file.exists():
+                cap_file.unlink()
+            GLib.idle_add(self.log, f"[AI Monitor] {ok_msg}")
+        except Exception as e:
+            GLib.idle_add(self.log, f"[AI Monitor] {fail_msg}: {e}")
+
+    def _kill_and_restart_from_watcher(self):
+        """Idle callback: kill running processes, then restart the default proxy."""
+        self._kill()
+        return self._restart_proxy_from_watcher()
+
+    def _restart_proxy_from_watcher(self):
+        """Idle callback: start the proxy for the endpoint named as "default"."""
+        try:
+            endpoints = load_endpoints()
+            ep_name = endpoints.get("default")
+            if ep_name:
+                for ep in endpoints.get("endpoints", []):
+                    if ep.get("name") == ep_name:
+                        self._start_proxy(ep)
+                        break
+        except Exception as e:
+            self.log(f"[AI Monitor] Proxy restart failed: {e}")
+        # A falsy return keeps GLib.idle_add from scheduling this callback again.
+        return False
 
     def _open_usage(self):
         try:
diff --git a/src/translate-proxy.py b/src/translate-proxy.py
index 8946e15..335af93 100755
--- a/src/translate-proxy.py
+++ b/src/translate-proxy.py
@@ -3410,10 +3410,24 @@ class Handler(http.server.BaseHTTPRequestHandler):
         if self.path in ("/v1/models", "/models"):
             self.send_json(200, {"object": "list", "data": MODELS})
         elif self.path in ("/health", "/v1/health"):
+            # Best-effort peak RSS: `resource` is POSIX-only, so import it inside the try.
+            # NOTE(review): ru_maxrss is KiB on Linux but bytes on macOS — confirm target OS.
+            _mem_mb = 0
+            try:
+                import resource as _res
+                _mem_mb = _res.getrusage(_res.RUSAGE_SELF).ru_maxrss / 1024
+            except Exception:
+                pass
+            # dir() inside a method lists locals only, so read the module global explicitly.
+            _started = globals().get("_START_TIME")
+            _uptime = time.time() - _started if _started else 0
             self.send_json(200, {"ok": True, "backend": BACKEND,
                                  "target_url": TARGET_URL,
                                  "models": [m.get("id") for m in MODELS],
-                                 "bgp_routes": len(BGP_ROUTES)})
+                                 "bgp_routes": len(BGP_ROUTES),
+                                 "uptime_s": round(_uptime, 1),
+                                 "memory_mb": round(_mem_mb, 1),
+                                 "requests_total": _STATS.get("requests", 0)})
         else:
             self.send_error(404)
 
@@ -4750,10 +4764,11 @@ def _handle_shutdown_signal(sig, frame):
     _SHUTDOWN_REQUESTED = True
     print(f"[SELF-REVIVE] Signal {sig} received, shutting down cleanly", flush=True)
     if 'SERVER' in globals() and SERVER:
-        SERVER.shutdown()
-
+        SERVER.shutdown()
+
 def main():
-    global SERVER
+    global SERVER, _START_TIME
+    _START_TIME = time.time()
     _init_runtime()
     signal.signal(signal.SIGTERM, _handle_shutdown_signal)
     signal.signal(signal.SIGINT, _handle_shutdown_signal)