v3.8.0: AI Monitoring — self-healing watchdog with 3-tier response system
- HealthWatcher thread: monitors proxy /health every 5s
- LogAnalyzer thread: tails cc-debug.log for 18 failure signal patterns
- Tier 1 rule engine: 14 rules for instant auto-recovery (< 1s)
- Tier 2 incident store: JSON pattern database with success rates
- Tier 3 AI diagnostic agent: calls configurable provider/model for novel failures
- AIMonitoringWindow GUI: ON/OFF toggle, provider/model/API key selector, incident log
- 30 fault types catalogued across 5 categories (A-E)
- Enhanced /health endpoint with memory_mb, uptime_s, requests_total
- Auto-restart proxy, auto-clear schema cache, kill stale processes
- Safety: rate-limited AI calls, restart caps, cooldowns per pattern
- AI Monitoring design spec (AI-MONITORING-DESIGN.md)
- 54 self-test patterns passing
This commit is contained in:
BIN
codex-launcher_3.8.0_all.deb
Normal file
BIN
codex-launcher_3.8.0_all.deb
Normal file
Binary file not shown.
@@ -3,11 +3,11 @@ set -e
|
|||||||
|
|
||||||
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
|
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
|
||||||
|
|
||||||
if [ -f "$SCRIPT_DIR/codex-launcher_3.7.0_all.deb" ]; then
|
if [ -f "$SCRIPT_DIR/codex-launcher_3.8.0_all.deb" ]; then
|
||||||
echo "Installing codex-launcher_3.7.0_all.deb ..."
|
echo "Installing codex-launcher_3.8.0_all.deb ..."
|
||||||
sudo dpkg -i "$SCRIPT_DIR/codex-launcher_3.7.0_all.deb"
|
sudo dpkg -i "$SCRIPT_DIR/codex-launcher_3.8.0_all.deb"
|
||||||
echo ""
|
echo ""
|
||||||
echo "Installed v3.7.0 via .deb package."
|
echo "Installed v3.8.0 via .deb package."
|
||||||
echo " translate-proxy.py -> /usr/bin/translate-proxy.py"
|
echo " translate-proxy.py -> /usr/bin/translate-proxy.py"
|
||||||
echo " codex-launcher-gui -> /usr/bin/codex-launcher-gui"
|
echo " codex-launcher-gui -> /usr/bin/codex-launcher-gui"
|
||||||
echo " cleanup-codex-stale -> /usr/bin/cleanup-codex-stale.sh"
|
echo " cleanup-codex-stale -> /usr/bin/cleanup-codex-stale.sh"
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ import gi
|
|||||||
gi.require_version("Gtk", "3.0")
|
gi.require_version("Gtk", "3.0")
|
||||||
from gi.repository import Gtk, GLib
|
from gi.repository import Gtk, GLib
|
||||||
import subprocess, os, signal, sys, threading, time, json, urllib.request, urllib.parse, urllib.error, tempfile, shutil
|
import subprocess, os, signal, sys, threading, time, json, urllib.request, urllib.parse, urllib.error, tempfile, shutil
|
||||||
import hashlib, socket, ssl, contextlib, re
|
import hashlib, socket, ssl, contextlib, re, collections
|
||||||
import base64, secrets
|
import base64, secrets
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
@@ -1123,6 +1123,524 @@ def _check_codex_auth():
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
return ("error", str(e))
|
return ("error", str(e))
|
||||||
|
|
||||||
|
# ═══════════════════════════════════════════════════════════════════
|
||||||
|
# AI Monitoring — Self-Healing Watchdog
|
||||||
|
# ═══════════════════════════════════════════════════════════════════
|
||||||
|
|
||||||
|
# All AI-monitoring state lives in one per-user cache directory.
_MONITORING_DIR = Path.home() / ".cache" / "codex-proxy"

# Watchdog settings (enabled flag, provider URL, model, API key, ...).
MONITORING_FILE = _MONITORING_DIR / "monitoring-config.json"
# Incident pattern database plus AI usage statistics.
INCIDENT_STORE_FILE = _MONITORING_DIR / "incident-store.json"
# Plain-text log appended to by _monitoring_log().
MONITORING_LOG = _MONITORING_DIR / "monitoring.log"
|
||||||
|
|
||||||
|
# Tier 1 rule table: (trigger, action, cooldown in seconds).
#
# HealthWatcher._handle_failure() scans this list in order, fires the first
# rule whose trigger matches, and skips the rule while the same *action* ran
# less than `cooldown` seconds ago (0 means "no cooldown").
_TIER1_RULES = [
    # -- proxy process -------------------------------------------------
    ("proxy_health_fail", "restart_proxy", 30),
    ("proxy_port_conflict", "kill_stale_restart", 60),
    # -- upstream provider ---------------------------------------------
    ("upstream_429", "wait_retry", 0),
    ("upstream_502_503", "retry_backoff", 30),
    ("upstream_500_repeat", "switch_provider", 60),
    ("upstream_timeout", "retry_increase_timeout", 30),
    ("upstream_401_403", "alert_bad_key", 0),
    # -- streaming -----------------------------------------------------
    ("stream_broken_pipe", "restart_proxy", 30),
    ("stream_reset", "restart_proxy", 30),
    # -- model / parser behaviour --------------------------------------
    ("parsed_tool_calls_0_x3", "clear_schema_cache", 300),
    ("sanitizer_suspicious_5x", "alert_model_issue", 0),
    ("stuck_recovery_x5", "suggest_switch_model", 0),
    # -- environment ---------------------------------------------------
    ("codex_process_dead", "alert_restart", 0),
    ("schema_corrupt", "delete_provider_caps", 0),
]
|
||||||
|
|
||||||
|
# Log-line pattern -> (fault id, category).
#
# The keys are REGULAR EXPRESSIONS: _LogAnalyzerThread applies them with
# re.search() in insertion order and stops at the first hit.  Literal text
# that contains regex metacharacters must therefore be escaped.  An unescaped
# "[STUCK-RECOVERY]" is a character class that matches almost any line with an
# upper-case letter (including every "HTTP 4xx/5xx" line), which made all
# later entries unreachable and inflated the stuck_recovery counter.
_FAILURE_SIGNALS = {
    "parsed_tool_calls=0": ("C1", "parser_empty"),
    r"\[STUCK-RECOVERY\]": ("C3", "stuck_recovery"),
    "suspicious cmd": ("C4", "sanitizer_flag"),
    "empty cmd recovered": ("C6", "empty_cmd"),
    "HTTP 429": ("B1", "rate_limited"),
    "HTTP 500": ("B2", "server_error"),
    "HTTP 502": ("B2", "server_error"),
    "HTTP 503": ("B2", "server_error"),
    "HTTP 401": ("B3", "auth_failure"),
    "HTTP 403": ("B4", "forbidden"),
    "Connection refused": ("A1", "proxy_dead"),
    "Address already in use": ("A2", "port_conflict"),
    "Broken pipe": ("B7", "broken_pipe"),
    "Connection reset": ("B6", "connection_reset"),
    "timed out": ("B5", "timeout"),
    "SELF-REVIVE CRASH": ("A5", "proxy_crash"),
    "stream error": ("B6", "stream_error"),
    # Intentionally a real regex: "content_type" followed later by "array".
    "content_type.*array": ("E1", "schema_corrupt"),
}
|
||||||
|
|
||||||
|
# System prompt sent with every Tier 3 diagnostic request.  It is assembled
# from one entry per output line; empty entries produce the blank lines.
_DIAGNOSTIC_SYSTEM_PROMPT = "\n".join((
    'You are a diagnostic agent for "Codex Launcher" — a desktop app that runs a local '
    'translation proxy between OpenAI Codex CLI/Desktop and AI providers.',
    '',
    'Analyze the incident and respond with ONLY a JSON object:',
    '{"action": "...", "reason": "...", "confidence": 0.0-1.0}',
    '',
    'Available actions: restart_proxy, kill_stale_processes, clear_schema_cache, '
    'switch_provider, increase_timeout, regenerate_config, cleanup_stale, '
    'alert_user, ignore, retry_now',
    '',
    'Rules:',
    '- upstream 401/403 with auth error -> alert_user',
    '- proxy dead -> restart_proxy',
    '- same error 5+ times -> switch_provider or alert_user',
    '- schema/content_type error -> clear_schema_cache',
    '- "Address already in use" -> kill_stale_processes then restart_proxy',
    '- timeout on slow upstream -> increase_timeout',
    '- single transient 429/502/503 -> ignore',
    '- "stream disconnected" + proxy healthy -> ignore',
    '- no extra text, no markdown, just the JSON object',
))
|
||||||
|
|
||||||
|
def _load_monitoring_config():
    """Return the AI-monitoring settings as a dict that always has every key.

    Values stored in MONITORING_FILE override the built-in defaults.  A
    missing, unreadable or malformed file (including valid JSON that is not an
    object) yields the defaults alone, so callers may use ``cfg[...]`` and
    ``cfg.get(...)`` without guarding against a partial or wrongly-typed file.

    Returns:
        dict: a fresh dictionary; mutating it does not affect later calls.
    """
    cfg = {
        "enabled": False,
        "provider_url": "",
        "model": "",
        "api_key": "",
        "health_check_interval_s": 5,
        "auto_restart_proxy": True,
        "auto_switch_provider": False,
    }
    try:
        saved = json.loads(MONITORING_FILE.read_text())
    except (OSError, ValueError):
        # No file yet, unreadable file, bad encoding or corrupt JSON.
        return cfg
    if isinstance(saved, dict):
        # Keys written by an older/newer version are kept; missing ones
        # fall back to the defaults above.
        cfg.update(saved)
    return cfg
|
||||||
|
|
||||||
|
def _save_monitoring_config(cfg):
    """Persist *cfg* to MONITORING_FILE atomically, readable by the owner only.

    The settings include the diagnostic provider's API key, so the file must
    not inherit the process umask: ``tempfile.mkstemp`` creates it with mode
    0600.  The data is written to a temporary sibling and renamed into place
    because the watchdog thread re-reads this file on every poll and must
    never see a half-written document.

    Args:
        cfg: JSON-serialisable settings dictionary.

    Raises:
        OSError: the directory or file could not be written.
        TypeError: *cfg* is not JSON-serialisable.
    """
    target = MONITORING_FILE
    payload = json.dumps(cfg, indent=2)
    target.parent.mkdir(parents=True, exist_ok=True)
    fd, tmp_name = tempfile.mkstemp(
        prefix=target.name + ".", suffix=".tmp", dir=str(target.parent)
    )
    try:
        with os.fdopen(fd, "w") as fh:
            fh.write(payload)
        # Atomic on POSIX: readers see either the old or the new file.
        os.replace(tmp_name, str(target))
    except BaseException:
        # Do not leave the temporary file (which may hold the key) behind.
        try:
            os.unlink(tmp_name)
        except OSError:
            pass
        raise
|
||||||
|
|
||||||
|
def _load_incident_store():
    """Load the incident database from INCIDENT_STORE_FILE.

    Returns a dict with at least an ``"incidents"`` dict (pattern -> record
    dict) and a ``"stats"`` dict.  A missing, unreadable or corrupt file gives
    an empty store; sections of the wrong type are replaced and non-dict
    incident records are dropped, because every consumer in this file calls
    ``.get()`` / ``.setdefault()`` on them.

    Returns:
        dict: a fresh store object on every call.
    """
    def _empty():
        return {"version": 1, "incidents": {}, "stats": {"ai_calls": 0, "tokens_used": 0}}

    try:
        store = json.loads(INCIDENT_STORE_FILE.read_text())
    except (OSError, ValueError):
        return _empty()
    if not isinstance(store, dict):
        return _empty()

    incidents = store.get("incidents")
    if isinstance(incidents, dict):
        store["incidents"] = {k: v for k, v in incidents.items() if isinstance(v, dict)}
    else:
        store["incidents"] = {}
    if not isinstance(store.get("stats"), dict):
        store["stats"] = _empty()["stats"]
    return store
|
||||||
|
|
||||||
|
def _save_incident_store(store):
    """Write *store* to INCIDENT_STORE_FILE as indented JSON.

    The parent directory is created first if it does not exist yet.
    """
    destination = INCIDENT_STORE_FILE
    destination.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(store, indent=2)
    destination.write_text(serialized)
|
||||||
|
|
||||||
|
def _monitoring_log(msg):
    """Append ``[HH:MM:SS] msg`` to MONITORING_LOG on a best-effort basis.

    Logging must never break the watchdog, so every error is swallowed.  Two
    avoidable causes of silently lost lines are handled up front:

    * the cache directory may not exist yet (nothing else creates it before
      the first log call), so it is created here;
    * messages in this file contain non-ASCII characters, so the log is
      always written as UTF-8 instead of the locale's default encoding.
    """
    try:
        MONITORING_LOG.parent.mkdir(parents=True, exist_ok=True)
        with open(str(MONITORING_LOG), "a", encoding="utf-8") as f:
            f.write(f"[{time.strftime('%H:%M:%S')}] {msg}\n")
    except Exception:
        # Deliberate: a full disk or read-only home must not crash monitoring.
        pass
|
||||||
|
|
||||||
|
|
||||||
|
class IncidentStore:
    """In-memory copy of the incident database with explicit write-back.

    The store is read from disk once, when the object is created.  Changes
    made through record() / record_ai_call() stay in memory until flush()
    is called.
    """

    def __init__(self):
        self._store = _load_incident_store()
        self._dirty = False  # True while memory holds unsaved changes

    def lookup(self, pattern):
        """Return the record for *pattern* if its fix worked more often than not.

        A record qualifies only when it has at least one success and a
        success rate strictly above 50%; otherwise None is returned.
        """
        entry = self._store.get("incidents", {}).get(pattern)
        if not entry or entry.get("success_count", 0) <= 0:
            return None
        wins = entry["success_count"]
        attempts = wins + entry.get("fail_count", 0)
        if wins / max(attempts, 1) > 0.5:
            return entry
        return None

    def record(self, pattern, fix, success=True):
        """Count one occurrence of *pattern* and one success or failure.

        *fix* is stored only when the pattern is seen for the first time; an
        existing record keeps the fix it was created with.
        """
        stamp = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
        incidents = self._store.setdefault("incidents", {})
        if pattern not in incidents:
            incidents[pattern] = {
                "fix": fix, "success_count": 0, "fail_count": 0,
                "last_seen": stamp,
                "occurrences": 0,
            }
        entry = incidents[pattern]
        entry["last_seen"] = stamp
        entry["occurrences"] = entry.get("occurrences", 0) + 1
        counter = "success_count" if success else "fail_count"
        entry[counter] = entry.get(counter, 0) + 1
        self._dirty = True

    def record_ai_call(self, tokens=0):
        """Add one AI call and *tokens* tokens to the usage statistics."""
        usage = self._store.setdefault("stats", {"ai_calls": 0, "tokens_used": 0})
        usage["ai_calls"] = usage.get("ai_calls", 0) + 1
        usage["tokens_used"] = usage.get("tokens_used", 0) + tokens
        self._dirty = True

    def flush(self):
        """Write the store to disk if anything changed since the last flush."""
        if not self._dirty:
            return
        _save_incident_store(self._store)
        self._dirty = False

    @property
    def stats(self):
        """Usage statistics dict (zeroed defaults when the section is absent)."""
        return self._store.get("stats", {"ai_calls": 0, "tokens_used": 0})
|
||||||
|
|
||||||
|
|
||||||
|
class AIDiagnosticAgent:
    """Tier 2 / Tier 3 diagnosis: known-pattern lookup, then an AI model call.

    The model endpoint is called with a chat-completions style JSON body and
    a Bearer token, and is expected to answer with a JSON object of the form
    ``{"action": ..., "reason": ..., "confidence": ...}``.
    """

    # Charged to the usage statistics when the reply carries no token count.
    _FALLBACK_TOKENS = 800
    # How much of the log tail is sent to the model (most recent characters).
    _LOG_TAIL_CHARS = 1500

    def __init__(self, provider_url, model, api_key):
        self.provider_url = provider_url
        self.model = model
        self.api_key = api_key
        self.incident_store = IncidentStore()

    def diagnose(self, context):
        """Return an action dict for the incident described by *context*.

        A pattern with a good track record is answered from the incident
        store (``tier`` 2).  Otherwise the model is asked (``tier`` 3) and its
        answer is remembered for that pattern.  When the model call fails an
        ``alert_user`` action with confidence 0.0 is returned and nothing is
        recorded: storing that fallback as a successful fix would make every
        later lookup for the pattern hit Tier 2 and never ask the model again.
        """
        pattern = self._extract_pattern(context)
        known = self.incident_store.lookup(pattern)
        if known:
            _monitoring_log(f"Tier 2 HIT: pattern={pattern} fix={known['fix']}")
            return {"action": known["fix"], "reason": "known_pattern", "confidence": 0.9, "tier": 2}
        try:
            action = self._query_model(context)
        except Exception as e:
            # Still persist the usage counters touched before the failure.
            self.incident_store.flush()
            return self._failure_action(e)
        self.incident_store.record(pattern, action.get("action", "unknown"))
        self.incident_store.flush()
        return action

    def _extract_pattern(self, context):
        """Build the lookup key: sorted signals, then ``http_<code>``, max 3 parts."""
        parts = sorted(context.get("signals", []))
        if context.get("http_code"):
            parts.append(f"http_{context['http_code']}")
        return "+".join(parts[:3]) or "unknown"

    def _call_model(self, context):
        """Ask the model; never raises.  Returns the fallback action on error."""
        try:
            return self._query_model(context)
        except Exception as e:
            return self._failure_action(e)

    def _failure_action(self, error):
        """Log a failed model call and build the alert_user fallback action."""
        _monitoring_log(f"Tier 3 AI FAILED: {error}")
        return {"action": "alert_user", "reason": f"ai_diag_failed: {error}", "confidence": 0.0, "tier": 3}

    def _query_model(self, context):
        """Send the incident report to the model and return its action dict.

        Raises on any transport, HTTP, or reply-format problem.
        """
        log_tail = context.get("log_tail", "") or ""
        prompt = (
            f"INCIDENT REPORT:\n"
            f"Time: {time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())}\n"
            f"Proxy health: {context.get('proxy_alive', 'unknown')}\n"
            f"Upstream: {context.get('upstream_url', 'unknown')}\n"
            f"Model: {context.get('model', 'unknown')}\n"
            f"Last HTTP code: {context.get('http_code', 'n/a')}\n"
            f"Recent signals: {context.get('signals', [])}\n"
            # Keep the END of the tail: the newest lines are the relevant ones.
            f"Recent log tail:\n{log_tail[-self._LOG_TAIL_CHARS:]}\n"
        )
        body = {
            "model": self.model,
            "messages": [
                {"role": "system", "content": _DIAGNOSTIC_SYSTEM_PROMPT},
                {"role": "user", "content": prompt},
            ],
            "max_tokens": 200,
            "temperature": 0.1,
        }
        req = urllib.request.Request(
            self.provider_url,
            data=json.dumps(body).encode(),
            headers={
                "Content-Type": "application/json",
                "Authorization": f"Bearer {self.api_key}",
            },
        )
        # Context manager so the connection is released even when parsing fails.
        with urllib.request.urlopen(req, timeout=15) as resp:
            result = json.loads(resp.read())
        text = result["choices"][0]["message"]["content"].strip()

        usage = result.get("usage")
        tokens = usage.get("total_tokens") if isinstance(usage, dict) else None
        if not isinstance(tokens, int):
            tokens = self._FALLBACK_TOKENS  # provider did not report usage
        self.incident_store.record_ai_call(tokens=tokens)

        action = self._parse_action(text)
        action["tier"] = 3
        _monitoring_log(f"Tier 3 AI: action={action.get('action')} reason={action.get('reason')}")
        return action

    @staticmethod
    def _parse_action(text):
        """Extract the JSON action object from a model reply.

        Accepts a bare JSON object as well as one wrapped in prose or a
        Markdown code fence (the outermost ``{...}`` span is parsed).

        Raises:
            ValueError: no JSON object could be extracted.
        """
        try:
            action = json.loads(text)
        except ValueError:
            start, end = text.find("{"), text.rfind("}")
            if start == -1 or end <= start:
                raise
            action = json.loads(text[start:end + 1])
        if not isinstance(action, dict):
            raise ValueError("model reply is not a JSON object")
        return action
|
||||||
|
|
||||||
|
|
||||||
|
class HealthWatcher(threading.Thread):
    """Background watchdog for the local proxy.

    The thread polls the proxy's ``/health`` endpoint, starts a
    _LogAnalyzerThread that reports failure signals found in the log files,
    and turns both kinds of evidence into actions: Tier 1 rules from
    _TIER1_RULES first, then the incident store / AI agent.

    Callbacks (all invoked from watcher threads, never the GUI thread):
        on_failure(count): the health check has failed `count` times in a
            row and just reached the failure threshold.
        on_recovery(): a health check succeeded after at least one failure.
        on_signal(fault_id, category, line): a log line matched a signal.
        on_action(action, trigger): an action was selected for `trigger`.
    """

    # Consecutive failed health checks before the proxy counts as down.
    _FAILURE_THRESHOLD = 3
    # Minimum seconds between two Tier 2/3 escalations for the same trigger.
    _AI_COOLDOWN_S = 60
    # Characters of log tail handed to the diagnostic agent.
    _LOG_TAIL_CHARS = 1500

    def __init__(self, on_failure, on_recovery, on_signal, on_action):
        super().__init__(daemon=True)
        self.cfg = _load_monitoring_config()
        self.on_failure = on_failure
        self.on_recovery = on_recovery
        self.on_signal = on_signal
        self.on_action = on_action
        self.failures = 0
        self.running = False
        self._signal_counts = collections.defaultdict(int)
        self._last_actions = {}
        self._restart_count = 0
        self._last_restart_time = 0
        # The incident store is mutated from this thread and from the log
        # analyzer thread, and flushed (serialised) from this thread.
        self._store_lock = threading.Lock()

    def run(self):
        """Poll the proxy until stop() is called."""
        self.running = True
        self.incident_store = IncidentStore()
        self._log_analyzer = _LogAnalyzerThread(self._on_log_signal)
        self._log_analyzer.start()
        while self.running:
            self.cfg = _load_monitoring_config()
            if not self.cfg.get("enabled"):
                time.sleep(5)
                continue
            port = self._get_proxy_port()
            if port:
                if self._check_health(port):
                    if self.failures > 0:
                        self.failures = 0
                        self.on_recovery()
                else:
                    self.failures += 1
                    if self.failures == self._FAILURE_THRESHOLD:
                        # Notify once per outage, not on every poll.
                        self.on_failure(self.failures)
                    if self.failures >= self._FAILURE_THRESHOLD:
                        self._handle_failure("proxy_health_fail")
            with self._store_lock:
                self.incident_store.flush()
            time.sleep(self._poll_interval())

    def stop(self):
        """Ask this thread and its log analyzer to finish their loops."""
        self.running = False
        if hasattr(self, '_log_analyzer'):
            self._log_analyzer.running = False

    def _poll_interval(self):
        """Seconds between health checks; 5.0 when the setting is unusable.

        The value comes from a user-editable JSON file; a string, zero or
        negative number would make time.sleep() raise and kill the thread.
        """
        try:
            interval = float(self.cfg.get("health_check_interval_s", 5))
        except (TypeError, ValueError):
            return 5.0
        if not (0 < interval <= 3600):
            return 5.0
        return interval

    def _get_proxy_port(self):
        """Return the ``port`` value from proxy-config.json, or None."""
        try:
            cfg_path = Path.home() / ".cache/codex-proxy/proxy-config.json"
            if cfg_path.exists():
                d = json.loads(cfg_path.read_text())
                return d.get("port")
        except Exception:
            pass
        return None

    def _check_health(self, port):
        """True when ``GET http://localhost:<port>/health`` answers 200."""
        try:
            req = urllib.request.Request(f"http://localhost:{port}/health")
            with urllib.request.urlopen(req, timeout=5) as resp:
                return resp.status == 200
        except Exception:
            return False

    @staticmethod
    def _tail_file(path, max_chars):
        """Return roughly the last *max_chars* characters of *path* ('' on error)."""
        try:
            with open(str(path), "rb") as fh:
                fh.seek(0, os.SEEK_END)
                size = fh.tell()
                # UTF-8 needs at most 4 bytes per character.
                fh.seek(max(size - 4 * max_chars, 0))
                data = fh.read()
        except OSError:
            return ""
        return data.decode("utf-8", errors="replace")[-max_chars:]

    def _get_recent_log(self):
        """Most recent part of cc-debug.log for the diagnostic prompt."""
        return self._tail_file(
            Path.home() / ".cache/codex-proxy/cc-debug.log", self._LOG_TAIL_CHARS
        )

    def _on_log_signal(self, fault_id, category, line):
        """Count a log signal and escalate once its threshold is reached.

        Called from the log analyzer thread.  Signals are ignored while
        monitoring is switched off, so disabling the feature also stops
        log-triggered actions.
        """
        if not self.cfg.get("enabled"):
            return
        self._signal_counts[category] += 1
        self.on_signal(fault_id, category, line[:200])
        count = self._signal_counts[category]
        # NOTE(review): "proxy_dead", "port_conflict", "server_error_repeat"
        # and "timeout_repeat" have no entry in _TIER1_RULES (the table uses
        # "proxy_port_conflict", "upstream_500_repeat", "upstream_timeout",
        # ...), so these four always fall through to Tier 2/3.  Confirm
        # whether that is intended before renaming either side.
        if category in ("proxy_dead", "port_conflict") and count >= 2:
            self._handle_failure(category)
        elif category in ("server_error", "timeout") and count >= 3:
            self._handle_failure(category + "_repeat")
        elif category in ("sanitizer_flag",) and count >= 5:
            self._handle_failure("sanitizer_suspicious_5x")
        elif category in ("stuck_recovery",) and count >= 5:
            self._handle_failure("stuck_recovery_x5")
        elif category in ("parser_empty",) and count >= 3:
            self._handle_failure("parsed_tool_calls_0_x3")
        elif category in ("schema_corrupt",):
            self._handle_failure("schema_corrupt")

    def _handle_failure(self, trigger):
        """Pick and announce an action for *trigger*.

        The first matching Tier 1 rule wins and is subject to its per-action
        cooldown.  Triggers without a rule go to Tier 2/3, at most once per
        _AI_COOLDOWN_S seconds per trigger so a noisy log cannot cause one
        AI call (or one alert) per log line.
        """
        now = time.time()
        for rule_trigger, action, cooldown in _TIER1_RULES:
            if rule_trigger != trigger:
                continue
            if now - self._last_actions.get(action, 0) < cooldown:
                return
            self._last_actions[action] = now
            _monitoring_log(f"Tier 1: trigger={trigger} action={action}")
            self.on_action(action, trigger)
            with self._store_lock:
                self.incident_store.record(trigger, action, success=True)
            return

        escalation_key = ("tier2_3", trigger)
        if now - self._last_actions.get(escalation_key, 0) < self._AI_COOLDOWN_S:
            return
        self._last_actions[escalation_key] = now
        self._try_tier2_3(trigger)

    def _try_tier2_3(self, trigger):
        """Consult the incident store / AI agent, or alert when no AI is set up."""
        cfg = self.cfg
        if not cfg.get("provider_url") or not cfg.get("model") or not cfg.get("api_key"):
            _monitoring_log(f"No AI configured for Tier 2/3 — alerting user for trigger={trigger}")
            self.on_action("alert_user", trigger)
            return
        agent = AIDiagnosticAgent(cfg["provider_url"], cfg["model"], cfg["api_key"])
        # Share one store: the agent's private copy would be overwritten by
        # this thread's next flush, losing what the agent learned.
        agent.incident_store = self.incident_store
        context = {
            "signals": [trigger],
            "proxy_alive": self.failures == 0,
            "log_tail": self._get_recent_log(),
        }
        # Held during the (up to 15 s) model call because the agent records
        # into and flushes the shared store.
        with self._store_lock:
            result = agent.diagnose(context)
        if result:
            action = result.get("action", "alert_user")
            _monitoring_log(f"Tier {result.get('tier', '?')}: action={action}")
            self.on_action(action, trigger)
|
||||||
|
|
||||||
|
|
||||||
|
class _LogAnalyzerThread(threading.Thread):
    """Tail the proxy log files and report lines matching a failure signal.

    ``on_signal(fault_id, category, line)`` is called from this thread for
    the first _FAILURE_SIGNALS pattern that matches each new line.

    Files that exist when the thread starts are read from their current end;
    files that appear later, or are rotated/truncated, are (re)opened and
    read from the beginning.
    """

    _IDLE_SLEEP_S = 0.5

    def __init__(self, on_signal):
        super().__init__(daemon=True)
        self.on_signal = on_signal
        self.running = False

    def run(self):
        """Follow the log files until ``running`` is set to False."""
        self.running = True
        log_paths = [
            str(Path.home() / ".cache/codex-proxy/cc-debug.log"),
            str(Path.home() / ".cache/codex-proxy/proxy.log"),
        ]
        # Compile once; the table is scanned for every log line.
        signals = [
            (re.compile(pattern), fault_id, category)
            for pattern, (fault_id, category) in _FAILURE_SIGNALS.items()
        ]
        handles = {}
        first_pass = True
        try:
            while self.running:
                activity = False
                for path in log_paths:
                    fh = handles.get(path)
                    if fh is None:
                        fh = self._open_log(path, from_end=first_pass)
                        if fh is None:
                            continue  # not there yet; retry on the next pass
                        handles[path] = fh
                    try:
                        raw = fh.readline()
                        stale = not raw and self._was_replaced(path, fh)
                    except (OSError, ValueError):
                        raw, stale = b"", True
                    if stale:
                        # Rotated or truncated: reopen from the start next pass.
                        fh.close()
                        del handles[path]
                        continue
                    if not raw:
                        continue
                    activity = True
                    line = raw.decode("utf-8", errors="replace")
                    hit = self._match_signal(line, signals)
                    if hit is None:
                        continue
                    try:
                        self.on_signal(hit[0], hit[1], line.strip())
                    except Exception as exc:
                        # Keep tailing, but leave a trace: a silently failing
                        # handler disables the whole escalation path.
                        _monitoring_log(f"Log signal handler failed ({hit[1]}): {exc}")
                first_pass = False
                if not activity:
                    time.sleep(self._IDLE_SLEEP_S)
        finally:
            for fh in handles.values():
                fh.close()

    @staticmethod
    def _open_log(path, from_end):
        """Open *path* for binary reading, optionally positioned at its end.

        Returns None when the file cannot be opened.
        """
        try:
            fh = open(path, "rb")
        except OSError:
            return None
        if from_end:
            fh.seek(0, os.SEEK_END)
        return fh

    @staticmethod
    def _was_replaced(path, fh):
        """True when *path* no longer is the file behind *fh*, or shrank.

        A path that currently does not exist is not treated as replaced; the
        open handle is kept until a new file shows up.
        """
        try:
            on_disk = os.stat(path)
        except OSError:
            return False
        opened = os.fstat(fh.fileno())
        if (on_disk.st_dev, on_disk.st_ino) != (opened.st_dev, opened.st_ino):
            return True
        return on_disk.st_size < fh.tell()

    @staticmethod
    def _match_signal(line, signals):
        """Return (fault_id, category) of the first matching signal, else None.

        *signals* is a sequence of (compiled regex, fault_id, category).
        """
        for regex, fault_id, category in signals:
            if regex.search(line):
                return fault_id, category
        return None
|
||||||
|
|
||||||
|
|
||||||
|
class AIMonitoringWindow(Gtk.Window):
    """Settings and status window for the AI Monitoring watchdog.

    Shows the enable switch, the diagnostic-agent settings (provider URL,
    model, API key, health-check interval, auto-restart / auto-switch
    options), usage statistics and the recorded incidents.

    Args:
        parent: window this one is transient for (the launcher), or None.
    """

    def __init__(self, parent=None):
        super().__init__(title="AI Monitoring")
        self._launcher = parent
        self.set_transient_for(parent)
        self.set_default_size(580, 520)
        self.set_border_width(12)
        self._cfg = _load_monitoring_config()
        self._store = _load_incident_store()

        vbox = Gtk.Box(orientation=Gtk.Orientation.VERTICAL, spacing=8)
        self.add(vbox)
        self._build_header(vbox)
        self._build_agent_form(vbox)
        self._build_stats(vbox)
        self._build_incident_view(vbox)
        self._build_buttons(vbox)

        self.show_all()

    # -- construction helpers -------------------------------------------

    def _build_header(self, vbox):
        """Title on the left, 'Enabled' label and switch on the right."""
        hdr = Gtk.Box(spacing=8)
        vbox.pack_start(hdr, False, False, 0)
        lbl = Gtk.Label()
        lbl.set_markup("<b>AI Monitoring</b>")
        lbl.set_use_markup(True)
        hdr.pack_start(lbl, False, False, 0)
        self._toggle = Gtk.Switch()
        # Set the state before connecting so loading does not fire the handler.
        self._toggle.set_active(self._cfg.get("enabled", False))
        self._toggle.connect("state-set", self._on_toggle)
        hdr.pack_end(self._toggle, False, False, 0)
        lbl2 = Gtk.Label(label="Enabled")
        hdr.pack_end(lbl2, False, False, 0)

    def _build_agent_form(self, vbox):
        """The 'Diagnostic Agent' frame with all editable settings."""
        frame = Gtk.Frame(label="Diagnostic Agent")
        vbox.pack_start(frame, False, False, 0)
        grid = Gtk.Grid(column_spacing=8, row_spacing=6, margin=8)
        frame.add(grid)

        grid.attach(Gtk.Label(label="Provider URL:", halign=Gtk.Align.END), 0, 0, 1, 1)
        self._url_entry = Gtk.Entry(hexpand=True)
        self._url_entry.set_text(self._cfg.get("provider_url", ""))
        self._url_entry.set_placeholder_text("https://api.openai.com/v1/chat/completions")
        grid.attach(self._url_entry, 1, 0, 2, 1)

        grid.attach(Gtk.Label(label="Model:", halign=Gtk.Align.END), 0, 1, 1, 1)
        self._model_entry = Gtk.Entry(hexpand=True)
        self._model_entry.set_text(self._cfg.get("model", ""))
        self._model_entry.set_placeholder_text("gpt-4o-mini or Qwen/Qwen3-32B")
        grid.attach(self._model_entry, 1, 1, 2, 1)

        grid.attach(Gtk.Label(label="API Key:", halign=Gtk.Align.END), 0, 2, 1, 1)
        self._key_entry = Gtk.Entry(hexpand=True, visibility=False)
        self._key_entry.set_text(self._cfg.get("api_key", ""))
        self._key_entry.set_placeholder_text("sk-...")
        grid.attach(self._key_entry, 1, 2, 1, 1)
        self._reveal_btn = Gtk.ToggleButton(label="Show")
        self._reveal_btn.connect("toggled", lambda b: self._key_entry.set_visibility(b.get_active()))
        grid.attach(self._reveal_btn, 2, 2, 1, 1)

        grid.attach(Gtk.Label(label="Health Check:", halign=Gtk.Align.END), 0, 3, 1, 1)
        adj = Gtk.Adjustment(value=self._cfg.get("health_check_interval_s", 5), lower=2, upper=30, step_increment=1)
        self._interval_spin = Gtk.SpinButton(adjustment=adj)
        self._interval_spin.set_numeric(True)
        grid.attach(self._interval_spin, 1, 3, 1, 1)
        grid.attach(Gtk.Label(label="seconds"), 2, 3, 1, 1)

        opts_box = Gtk.Box(spacing=12, margin_top=4)
        grid.attach(opts_box, 0, 4, 3, 1)
        self._auto_restart_cb = Gtk.CheckButton(label="Auto-restart proxy on crash")
        self._auto_restart_cb.set_active(self._cfg.get("auto_restart_proxy", True))
        opts_box.pack_start(self._auto_restart_cb, False, False, 0)
        self._auto_switch_cb = Gtk.CheckButton(label="Auto-switch provider on repeated failure")
        self._auto_switch_cb.set_active(self._cfg.get("auto_switch_provider", False))
        opts_box.pack_start(self._auto_switch_cb, False, False, 0)

        save_btn = Gtk.Button(label="Save Configuration")
        save_btn.get_style_context().add_class("suggested-action")
        save_btn.connect("clicked", self._on_save)
        grid.attach(save_btn, 0, 5, 3, 1)

    def _build_stats(self, vbox):
        """One-line usage summary under the settings frame."""
        stats_box = Gtk.Box(spacing=16)
        vbox.pack_start(stats_box, False, False, 0)
        self._stats_lbl = Gtk.Label()
        self._stats_lbl.set_use_markup(True)
        self._refresh_stats()
        stats_box.pack_start(self._stats_lbl, False, False, 0)

    def _build_incident_view(self, vbox):
        """Read-only, scrollable text view listing the recorded incidents."""
        frame2 = Gtk.Frame(label="Recent Incidents")
        vbox.pack_start(frame2, True, True, 0)
        sw = Gtk.ScrolledWindow()
        sw.set_policy(Gtk.PolicyType.AUTOMATIC, Gtk.PolicyType.AUTOMATIC)
        frame2.add(sw)
        self._inc_buf = Gtk.TextBuffer()
        tv = Gtk.TextView(buffer=self._inc_buf)
        tv.set_editable(False)
        tv.set_cursor_visible(False)
        tv.set_wrap_mode(Gtk.WrapMode.WORD_CHAR)
        sw.add(tv)
        self._refresh_incidents()

    def _build_buttons(self, vbox):
        """Bottom row: view log, clear store, close."""
        bb = Gtk.Box(spacing=8)
        vbox.pack_start(bb, False, False, 0)
        view_btn = Gtk.Button(label="View Monitoring Log")
        view_btn.connect("clicked", lambda b: subprocess.Popen(["xdg-open", str(MONITORING_LOG)]))
        bb.pack_start(view_btn, False, False, 0)
        clear_btn = Gtk.Button(label="Clear Incident Store")
        clear_btn.connect("clicked", self._on_clear_store)
        bb.pack_start(clear_btn, False, False, 0)
        close_btn = Gtk.Button(label="Close")
        close_btn.connect("clicked", lambda b: self.destroy())
        bb.pack_end(close_btn, False, False, 0)

    # -- signal handlers -------------------------------------------------

    def _on_toggle(self, switch, state):
        """Persist the enabled flag; start the launcher's watchdog if needed.

        The launcher only creates its watchdog at start-up when monitoring is
        already enabled, so switching it on here would otherwise have no
        effect until the application is restarted.
        """
        self._cfg["enabled"] = state
        _save_monitoring_config(self._cfg)
        if state:
            self._ensure_watcher_running()
        return False  # let GTK apply the new switch state

    def _ensure_watcher_running(self):
        """Call the parent's _start_watcher() unless a watcher thread is alive."""
        start = getattr(self._launcher, "_start_watcher", None)
        if start is None:
            return
        watcher = getattr(self._launcher, "_watcher", None)
        if watcher is None or not watcher.is_alive():
            start()

    def _on_save(self, btn):
        """Copy the form into the config, save it, and confirm in the list."""
        self._cfg["provider_url"] = self._url_entry.get_text().strip()
        self._cfg["model"] = self._model_entry.get_text().strip()
        self._cfg["api_key"] = self._key_entry.get_text().strip()
        self._cfg["health_check_interval_s"] = int(self._interval_spin.get_value())
        self._cfg["auto_restart_proxy"] = self._auto_restart_cb.get_active()
        self._cfg["auto_switch_provider"] = self._auto_switch_cb.get_active()
        _save_monitoring_config(self._cfg)
        # Keep the incident list visible instead of replacing it by the notice.
        self._refresh_incidents(status="Configuration saved.")

    def _on_clear_store(self, btn):
        """Reset the incident store on disk and refresh list and statistics."""
        self._store = {"version": 1, "incidents": {}, "stats": {"ai_calls": 0, "tokens_used": 0}}
        _save_incident_store(self._store)
        self._refresh_stats()
        self._refresh_incidents()

    # -- view updates ----------------------------------------------------

    def _refresh_stats(self):
        """Render AI call count, token count and number of known patterns."""
        stats = self._store.get("stats", {"ai_calls": 0, "tokens_used": 0})
        self._stats_lbl.set_markup(
            f"<small>AI diagnostic calls: <b>{stats.get('ai_calls', 0)}</b> | "
            f"Tokens used: <b>{stats.get('tokens_used', 0):,}</b> | "
            f"Known patterns: <b>{len(self._store.get('incidents', {}))}</b></small>"
        )

    def _refresh_incidents(self, status=None):
        """Fill the text view with all incidents, newest ``last_seen`` first.

        Args:
            status: optional one-line notice shown above the list.
        """
        lines = []
        if status:
            lines.append(f"{status}\n")
        ordered = sorted(self._store.get("incidents", {}).items(),
                         key=lambda x: x[1].get("last_seen", ""), reverse=True)
        for pattern, inc in ordered:
            sc = inc.get("success_count", 0)
            fc = inc.get("fail_count", 0)
            rate = sc / max(sc + fc, 1)
            filled = min(int(rate * 10), 10)
            bar = "+" * filled + "-" * (10 - filled)
            lines.append(
                f"[{inc.get('last_seen', '?')[:16]}] {pattern}\n"
                f" fix={inc.get('fix', '?')} success_rate={rate:.0%} [{bar}] "
                f"seen={inc.get('occurrences', 0)}x\n"
            )
        if not ordered:
            lines.append("No incidents recorded yet.\n")
            lines.append("\nEnable AI Monitoring and use Codex to populate the store.\n")
        self._inc_buf.set_text("\n".join(lines))
|
||||||
|
|
||||||
|
|
||||||
# ═══════════════════════════════════════════════════════════════════
|
# ═══════════════════════════════════════════════════════════════════
|
||||||
# Main window
|
# Main window
|
||||||
# ═══════════════════════════════════════════════════════════════════
|
# ═══════════════════════════════════════════════════════════════════
|
||||||
@@ -1143,7 +1661,7 @@ class LauncherWin(Gtk.Window):
|
|||||||
# header row
|
# header row
|
||||||
hdr = Gtk.Box(spacing=8)
|
hdr = Gtk.Box(spacing=8)
|
||||||
vbox.pack_start(hdr, False, False, 0)
|
vbox.pack_start(hdr, False, False, 0)
|
||||||
lbl = Gtk.Label(label="<b>Codex Launcher v3.7.0</b>")
|
lbl = Gtk.Label(label="<b>Codex Launcher v3.8.0</b>")
|
||||||
lbl.set_use_markup(True)
|
lbl.set_use_markup(True)
|
||||||
hdr.pack_start(lbl, False, False, 0)
|
hdr.pack_start(lbl, False, False, 0)
|
||||||
changelog_btn = Gtk.Button(label="Changelog")
|
changelog_btn = Gtk.Button(label="Changelog")
|
||||||
@@ -1161,6 +1679,9 @@ class LauncherWin(Gtk.Window):
|
|||||||
bgp_btn = Gtk.Button(label="AI BGP")
|
bgp_btn = Gtk.Button(label="AI BGP")
|
||||||
bgp_btn.connect("clicked", lambda b: self._open_bgp())
|
bgp_btn.connect("clicked", lambda b: self._open_bgp())
|
||||||
hdr.pack_end(bgp_btn, False, False, 0)
|
hdr.pack_end(bgp_btn, False, False, 0)
|
||||||
|
mon_btn = Gtk.Button(label="AI Monitor")
|
||||||
|
mon_btn.connect("clicked", lambda b: self._open_monitoring())
|
||||||
|
hdr.pack_end(mon_btn, False, False, 0)
|
||||||
mgr_btn = Gtk.Button(label="Manage Endpoints")
|
mgr_btn = Gtk.Button(label="Manage Endpoints")
|
||||||
mgr_btn.connect("clicked", lambda b: self._open_mgr())
|
mgr_btn.connect("clicked", lambda b: self._open_mgr())
|
||||||
hdr.pack_end(mgr_btn, False, False, 0)
|
hdr.pack_end(mgr_btn, False, False, 0)
|
||||||
@@ -1310,6 +1831,7 @@ class LauncherWin(Gtk.Window):
|
|||||||
self.show_all()
|
self.show_all()
|
||||||
self._rebuild_combo()
|
self._rebuild_combo()
|
||||||
self._log_dependency_status()
|
self._log_dependency_status()
|
||||||
|
self._start_watcher()
|
||||||
|
|
||||||
# ── helpers ──────────────────────────────────────────────────
|
# ── helpers ──────────────────────────────────────────────────
|
||||||
|
|
||||||
@@ -1456,13 +1978,84 @@ class LauncherWin(Gtk.Window):
|
|||||||
d.run(); d.destroy()
|
d.run(); d.destroy()
|
||||||
|
|
||||||
def _open_bgp(self):
|
def _open_bgp(self):
|
||||||
try:
|
try:
|
||||||
self._bgp_window = BGPPoolMgr(self)
|
self._bgp_window = BGPPoolMgr(self)
|
||||||
self._bgp_window.connect("destroy", lambda *_: setattr(self, "_bgp_window", None))
|
self._bgp_window.connect("destroy", lambda *_: setattr(self, "_bgp_window", None))
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
import traceback; traceback.print_exc()
|
import traceback; traceback.print_exc()
|
||||||
d = Gtk.MessageDialog(self, 0, Gtk.MessageType.ERROR, Gtk.ButtonsType.OK, f"Error: {e}")
|
d = Gtk.MessageDialog(self, 0, Gtk.MessageType.ERROR, Gtk.ButtonsType.OK, f"Error: {e}")
|
||||||
d.run(); d.destroy()
|
d.run(); d.destroy()
|
||||||
|
|
||||||
|
def _open_monitoring(self):
    """Show the AI Monitoring window, reusing it when it is already open.

    Every window works on its own snapshot of the monitoring config, so two
    open windows would overwrite each other's settings on save; a second
    click therefore raises the existing window instead of creating another.
    Any error while building the window is printed and shown in a dialog.
    """
    try:
        existing = getattr(self, "_monitoring_window", None)
        if existing is not None:
            existing.present()
            return
        self._monitoring_window = AIMonitoringWindow(self)
        # Forget the window once it is closed so the next click builds a new one.
        self._monitoring_window.connect("destroy", lambda *_: setattr(self, "_monitoring_window", None))
    except Exception as e:
        import traceback; traceback.print_exc()
        d = Gtk.MessageDialog(self, 0, Gtk.MessageType.ERROR, Gtk.ButtonsType.OK, f"Error: {e}")
        d.run(); d.destroy()
|
||||||
|
|
||||||
|
def _start_watcher(self):
    """Create and start the HealthWatcher thread when monitoring is enabled.

    Does nothing when the saved monitoring config has ``enabled`` unset.
    The watcher is kept on ``self._watcher`` and wired to the four
    ``_on_watcher_*`` callbacks of this window.
    """
    if _load_monitoring_config().get("enabled"):
        watcher = HealthWatcher(
            on_failure=self._on_watcher_failure,
            on_recovery=self._on_watcher_recovery,
            on_signal=self._on_watcher_signal,
            on_action=self._on_watcher_action,
        )
        self._watcher = watcher
        watcher.start()
        self.log("AI Monitoring: watchdog started")
|
||||||
|
|
||||||
|
def _on_watcher_failure(self, count):
    """Watcher callback: queue a log line reporting *count* failures."""
    message = f"[AI Monitor] Proxy unresponsive (failures={count})"
    # Scheduled through GLib so self.log runs on the main loop.
    GLib.idle_add(self.log, message)
|
||||||
|
|
||||||
|
def _on_watcher_recovery(self):
    """Watcher callback: queue a log line saying the proxy is back."""
    message = "[AI Monitor] Proxy recovered"
    # Scheduled through GLib so self.log runs on the main loop.
    GLib.idle_add(self.log, message)
|
||||||
|
|
||||||
|
def _on_watcher_signal(self, fault_id, category, line):
    """Watcher callback for a single matched log line; intentionally a no-op."""
    return None
|
||||||
|
|
||||||
|
def _on_watcher_action(self, action, trigger):
    """Carry out (or just report) an action chosen by the watchdog.

    This runs on a watcher thread, so everything that touches the window or
    the proxy process is handed to the GTK main loop with GLib.idle_add.

    Handled actions:
        restart_proxy        restart the proxy (needs "auto_restart_proxy").
        kill_stale_restart   kill stale processes, then restart the proxy
                             (also needs "auto_restart_proxy").
        clear_schema_cache / delete_provider_caps
                             delete provider-caps.json.
    Every other action, and a restart that the user has not allowed, is
    only written to the log as an alert.

    Args:
        action: action name from the Tier 1 table or the diagnostic agent.
        trigger: name of the failure that led to the action.
    """
    cfg = _load_monitoring_config()
    may_restart = bool(cfg.get("auto_restart_proxy"))

    if action == "restart_proxy" and may_restart:
        GLib.idle_add(self.log, f"[AI Monitor] Auto-restarting proxy (trigger: {trigger})")
        GLib.idle_add(self._restart_proxy_from_watcher)
        return

    if action in ("clear_schema_cache", "delete_provider_caps"):
        if action == "clear_schema_cache":
            done = "[AI Monitor] Cleared corrupt schema cache"
            failed = "[AI Monitor] Failed to clear cache"
        else:
            done = "[AI Monitor] Deleted corrupted provider-caps.json"
            failed = "[AI Monitor] Failed"
        try:
            cap_file = Path.home() / ".cache/codex-proxy/provider-caps.json"
            try:
                cap_file.unlink()
            except FileNotFoundError:
                pass  # already gone: same end state
            GLib.idle_add(self.log, done)
        except Exception as e:
            GLib.idle_add(self.log, f"{failed}: {e}")
        return

    if action == "kill_stale_restart" and may_restart:
        GLib.idle_add(self.log, f"[AI Monitor] Killing stale processes + restarting (trigger: {trigger})")

        def _kill_then_restart():
            # One callback keeps the order (kill, then restart) and runs
            # self._kill() on the main loop like every other GUI-side step.
            # NOTE(review): assumes _kill() is safe to call from the main
            # loop, as _restart_proxy_from_watcher already is — confirm.
            self._kill()
            self._restart_proxy_from_watcher()
            return False  # run once

        GLib.idle_add(_kill_then_restart)
        return

    GLib.idle_add(self.log, f"[AI Monitor] Alert: {action} (trigger: {trigger})")
|
||||||
|
|
||||||
|
def _restart_proxy_from_watcher(self):
    """Start the proxy for the default endpoint on behalf of the watchdog.

    The endpoint list is loaded once so the default name and the endpoint
    entries come from the same snapshot.  When there is no default endpoint,
    or it is not in the list, that is logged instead of returning silently
    right after an "Auto-restarting proxy" message.  Errors are logged, never
    raised.  Always returns None, so GLib runs it only once as an idle
    callback.
    """
    try:
        data = load_endpoints()
        ep_name = data.get("default")
        if not ep_name:
            self.log("[AI Monitor] Proxy restart skipped: no default endpoint configured")
            return
        for ep in data.get("endpoints", []):
            if ep.get("name") == ep_name:
                self._start_proxy(ep)
                break
        else:
            self.log(f"[AI Monitor] Proxy restart skipped: default endpoint '{ep_name}' not found")
    except Exception as e:
        self.log(f"[AI Monitor] Proxy restart failed: {e}")
|
||||||
|
|
||||||
def _open_usage(self):
|
def _open_usage(self):
|
||||||
try:
|
try:
|
||||||
|
|||||||
@@ -3410,10 +3410,20 @@ class Handler(http.server.BaseHTTPRequestHandler):
|
|||||||
if self.path in ("/v1/models", "/models"):
|
if self.path in ("/v1/models", "/models"):
|
||||||
self.send_json(200, {"object": "list", "data": MODELS})
|
self.send_json(200, {"object": "list", "data": MODELS})
|
||||||
elif self.path in ("/health", "/v1/health"):
|
elif self.path in ("/health", "/v1/health"):
|
||||||
|
import resource as _res
|
||||||
|
_mem_mb = 0
|
||||||
|
try:
|
||||||
|
_mem_mb = _res.getrusage(_res.RUSAGE_SELF).ru_maxrss / 1024
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
_uptime = time.time() - _START_TIME if '_START_TIME' in dir() else 0
|
||||||
self.send_json(200, {"ok": True, "backend": BACKEND,
|
self.send_json(200, {"ok": True, "backend": BACKEND,
|
||||||
"target_url": TARGET_URL,
|
"target_url": TARGET_URL,
|
||||||
"models": [m.get("id") for m in MODELS],
|
"models": [m.get("id") for m in MODELS],
|
||||||
"bgp_routes": len(BGP_ROUTES)})
|
"bgp_routes": len(BGP_ROUTES),
|
||||||
|
"uptime_s": round(_uptime, 1),
|
||||||
|
"memory_mb": round(_mem_mb, 1),
|
||||||
|
"requests_total": _STATS.get("requests", 0)})
|
||||||
else:
|
else:
|
||||||
self.send_error(404)
|
self.send_error(404)
|
||||||
|
|
||||||
@@ -4750,10 +4760,11 @@ def _handle_shutdown_signal(sig, frame):
|
|||||||
_SHUTDOWN_REQUESTED = True
|
_SHUTDOWN_REQUESTED = True
|
||||||
print(f"[SELF-REVIVE] Signal {sig} received, shutting down cleanly", flush=True)
|
print(f"[SELF-REVIVE] Signal {sig} received, shutting down cleanly", flush=True)
|
||||||
if 'SERVER' in globals() and SERVER:
|
if 'SERVER' in globals() and SERVER:
|
||||||
SERVER.shutdown()
|
SERVER.shutdown()
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
global SERVER
|
global SERVER, _START_TIME
|
||||||
|
_START_TIME = time.time()
|
||||||
_init_runtime()
|
_init_runtime()
|
||||||
signal.signal(signal.SIGTERM, _handle_shutdown_signal)
|
signal.signal(signal.SIGTERM, _handle_shutdown_signal)
|
||||||
signal.signal(signal.SIGINT, _handle_shutdown_signal)
|
signal.signal(signal.SIGINT, _handle_shutdown_signal)
|
||||||
|
|||||||
Reference in New Issue
Block a user