v3.11.0: merge cobra PR, smart-continue, hot-reload, XML extraction

- Merge PR #5 from cobra91: concurrency semaphore, auto-continue, SO_REUSEADDR, proxy-stderr.log, stream diagnostics, timeout handler, restart proxy fix - Tool call argument normalizer, smart-continue loop, XML extraction - API key hot-reload with mtime tracking + /admin/ endpoints - GUI hot-reload on endpoint edit with upstream verification - Synthetic tool-results disabled (caused deepseek-v4-pro truncation) - Version bump 3.10.12 -> 3.11.0, rebuild .deb
2026-05-26 15:02:02 +04:00
parent cae161023f
commit c3ba3286ff
7 changed files with 1100 additions and 191 deletions
--- a/translate-proxy.py
+++ b/translate-proxy.py
@@ -188,6 +188,7 @@ DEFAULT_MODELS = {
 }

 def load_config():
+    global _CONFIG_PATH, _CONFIG_MTIME
    p = argparse.ArgumentParser(description="Responses API translation proxy")
    p.add_argument("--config", help="JSON config file path")
    p.add_argument("--port", type=int, default=None)
@@ -195,16 +196,21 @@ def load_config():
    p.add_argument("--target-url", default=None)
    p.add_argument("--api-key", default=None)
    p.add_argument("--models-file", default=None, help="JSON file with model list array")
-    args = p.parse_args()
+    _args = p.parse_args()

    cfg = {}
-    if args.config:
-        with open(args.config) as f:
+    if _args.config:
+        _CONFIG_PATH = os.path.abspath(_args.config)
+        with open(_args.config) as f:
            cfg = json.load(f)
+        try:
+            _CONFIG_MTIME = os.path.getmtime(_CONFIG_PATH)
+        except OSError:
+            pass

    for ck, ak in [("port", "port"), ("backend_type", "backend"),
                    ("target_url", "target_url"), ("api_key", "api_key")]:
-        v = getattr(args, ak, None)
+        v = getattr(_args, ak, None)
        if v is not None:
            cfg[ck] = v

@@ -226,8 +232,8 @@ def load_config():
    cfg.setdefault("api_key", "")

    models = cfg.get("models", [])
-    if not models and args.models_file:
-        with open(args.models_file) as f:
+    if not models and _args.models_file:
+        with open(_args.models_file) as f:
            models = json.load(f)
    if not models:
        models = DEFAULT_MODELS.get(cfg["backend_type"], [])
@@ -236,6 +242,8 @@ def load_config():
    return cfg

 CONFIG = None
+_CONFIG_PATH = None
+_CONFIG_MTIME = 0
 PORT = 8080
 BACKEND = "openai-compat"
 TARGET_URL = ""
@@ -315,6 +323,8 @@ _conn_pool_lock = threading.Lock()
 _conn_pool = {}

 _STREAM_IDLE_TIMEOUT = 300
+_MAX_CONCURRENT_REQUESTS = 3
+_request_semaphore = threading.Semaphore(_MAX_CONCURRENT_REQUESTS)

 _CODEBUFF_AUTH_URL = "https://www.codebuff.com"
 _CODEBUFF_API_URL = "https://www.codebuff.com"
@@ -616,6 +626,51 @@ class APIKeyPool(AccountPool):
 _cb_pool = CodebuffAccountPool("codebuff")
 _google_antigravity_pool = GoogleAccountPool("antigravity")
 _google_cli_pool = GoogleAccountPool("cli")
+_antigravity_preferred_endpoint = None
+_antigravity_endpoint_lock = threading.Lock()
+
+def _classify_antigravity_error(status_code, body):
+    lower = body.lower()
+    if status_code == 400:
+        return "bad_request"
+    if status_code == 401:
+        if any(x in lower for x in ["invalid_grant", "token revoked", "token_revoked", "invalid_client"]):
+            return "auth_permanent"
+        return "auth_transient"
+    if status_code == 403:
+        if "validation_required" in lower or "account_disabled" in lower:
+            return "validation_required"
+        if "has been disabled" in lower and "violation of terms of service" in lower:
+            return "account_banned"
+        if "service_disabled" in lower:
+            return "service_disabled"
+        return "forbidden"
+    if status_code in (429, 503, 529):
+        if any(x in lower for x in ["model_capacity_exhausted", "capacity_exhausted", "model is currently overloaded", "service temporarily unavailable"]):
+            return "capacity_exhausted"
+        if any(x in lower for x in ["quota_exhausted", "resource_exhausted", "daily limit", "quota exceeded", "quotaresetdelay"]):
+            return "quota_exhausted"
+        return "rate_limited"
+    if status_code >= 500:
+        return "server_error"
+    return "unknown"
+
+def _parse_rate_limit_reset(body):
+    import re as _re
+    m = _re.search(r'quotaResetDelay[:"\s]+(\d+(?:\.\d+)?)(ms|s)', body, _re.IGNORECASE)
+    if m:
+        val = float(m.group(1))
+        return val / 1000 if m.group(2) == 'ms' else val
+    m = _re.search(r'(\d+)h(\d+)m(\d+)s', body, _re.IGNORECASE)
+    if m:
+        return int(m.group(1)) * 3600 + int(m.group(2)) * 60 + int(m.group(3))
+    m = _re.search(r'Resets in ~(\d+)h(\d+)m', body, _re.IGNORECASE)
+    if m:
+        return int(m.group(1)) * 3600 + int(m.group(2)) * 60
+    m = _re.search(r'retry[-_]?after[:\s]+(\d+)\s*(?:sec|s\b)', body, _re.IGNORECASE)
+    if m:
+        return int(m.group(1))
+    return None

 def _get_codebuff_account():
    """Return (token, account_dict) for best available codebuff account."""
@@ -714,14 +769,21 @@ def _gemini_reattach_sigs(contents):

 # Gemini follow-through guardrail
 _GEMINI_AGENT_GUARDRAIL = (
-    "You are running inside Codex as an autonomous coding agent. "
-    "When the user asks for a change to existing files, do not merely describe the previous work or summarize. "
-    "You must inspect the existing files, apply edits with tools, and verify the result. "
-    "If a file path is known from prior context, reuse it. "
-    "If unsure, list files first. "
-    "After tool results, continue until the requested change is actually implemented. "
-    "Never answer only with a plan such as 'I will start by...' or 'I am going to...'. "
-    "Always emit the actual tool call in the same response."
+    "!!! ABSOLUTELY CRITICAL - DO NOT IGNORE THIS UNDER ANY CIRCUMSTANCES !!! "
+    "YOU ARE RUNNING INSIDE CODEX AS AN AUTONOMOUS CODING AGENT. "
+    "!!!!!! NEVER EVER CONTINUE, PARAPHRASE, COMPLETE, OR ADD ANYTHING TO THE USER'S INSTRUCTIONS !!!!!! "
+    "!!!!!! NEVER SAY 'LET\\'S FIRST VIEW' OR 'LET\\'S FIRST FIND' OR SIMILAR PHRASES - EMIT THE ACTUAL TOOL CALL NOW !!!!!! "
+    "WHEN THE USER ASKS FOR A CHANGE TO EXISTING FILES, YOU MUST "
+    "1. IMMEDIATELY INSPECT EXISTING FILES USING exec_command OR read_files TOOLS RIGHT NOW, "
+    "2. THEN APPLY EDITS USING write OR exec_command TOOLS, "
+    "3. THEN VERIFY THE RESULT. "
+    "IF A FILE PATH IS KNOWN, REUSE IT IMMEDIATELY. "
+    "IF UNSURE, LIST FILES FIRST USING exec_command (ls -la). "
+    "AFTER TOOL RESULTS, CONTINUE UNTIL THE REQUESTED CHANGE IS FULLY IMPLEMENTED AND FILES ARE MODIFIED. "
+    "NEVER ANSWER ONLY WITH A PLAN LIKE 'I WILL START BY...' OR 'I AM GOING TO...'. "
+    "NEVER SUMMARIZE THE USER'S REQUEST. NEVER CONTINUE THEIR SENTENCE. "
+    "ALWAYS, ALWAYS, ALWAYS EMIT THE ACTUAL TOOL CALL IN THE SAME RESPONSE. "
+    "!!! FAILURE TO FOLLOW THESE INSTRUCTIONS WILL RESULT IN A BROKEN USER EXPERIENCE !!!"
 )

 _LOG_FILE_LOCK = threading.Lock()
@@ -771,6 +833,20 @@ def _ensure_antigravity_version():
        _antigravity_version_checked = time.time()
        return _antigravity_version

+_antigravity_client_version = "1.110.0"
+_antigravity_client_version_checked = 0
+
+def _ensure_antigravity_client_version():
+    global _antigravity_client_version, _antigravity_client_version_checked
+    env_ver = os.environ.get("ANTIGRAVITY_CLIENT_VERSION", "").strip()
+    if env_ver:
+        return env_ver
+    if time.time() - _antigravity_client_version_checked < 6 * 3600:
+        return _antigravity_client_version
+    _antigravity_client_version = os.environ.get("ANTIGRAVITY_CLIENT_VERSION_FALLBACK", "1.110.0")
+    _antigravity_client_version_checked = time.time()
+    return _antigravity_client_version
+
 def _init_runtime():
    global CONFIG, PORT, BACKEND, TARGET_URL, API_KEY, OAUTH_PROVIDER, _antigravity_version
    global MODELS, CC_VERSION, REASONING_ENABLED, REASONING_EFFORT, BGP_ROUTES
@@ -801,6 +877,68 @@ def _init_runtime():
        _antigravity_version = _ensure_antigravity_version()
        print(f"[antigravity] version={_antigravity_version}", file=sys.stderr)

+def _verify_api_key(key, target_url):
+    if not key or not target_url:
+        return {"valid": False, "error": "missing key or url"}
+    test_url = upstream_target(target_url, "/models")
+    if not test_url:
+        return {"valid": False, "error": "invalid target url"}
+    try:
+        req = urllib.request.Request(test_url, headers={
+            "Authorization": f"Bearer {key}",
+            "Content-Type": "application/json",
+        })
+        resp = urllib.request.urlopen(req, timeout=10)
+        body = resp.read().decode()
+        model_count = 0
+        try:
+            data = json.loads(body)
+            model_count = len(data.get("data", []))
+        except Exception:
+            pass
+        return {"valid": True, "status": resp.status, "models": model_count}
+    except urllib.error.HTTPError as e:
+        err = e.read().decode()[:200]
+        return {"valid": False, "status": e.code, "error": err}
+    except Exception as e:
+        return {"valid": False, "error": str(e)[:200]}
+
+_HOT_RELOAD_LOCK = threading.Lock()
+
+def _hot_reload_api_key():
+    global API_KEY, _api_key_pool, _CONFIG_MTIME
+    if not _CONFIG_PATH:
+        return False
+    try:
+        cur_mtime = os.path.getmtime(_CONFIG_PATH)
+    except OSError:
+        return False
+    if cur_mtime <= _CONFIG_MTIME:
+        return False
+    with _HOT_RELOAD_LOCK:
+        try:
+            cur_mtime2 = os.path.getmtime(_CONFIG_PATH)
+            if cur_mtime2 <= _CONFIG_MTIME:
+                return False
+            with open(_CONFIG_PATH) as f:
+                new_cfg = json.load(f)
+            new_key = (new_cfg.get("api_key") or "").strip()
+            if not new_key or new_key == API_KEY:
+                _CONFIG_MTIME = cur_mtime2
+                return False
+            old_preview = API_KEY[:8] + "..." if len(API_KEY) > 8 else "(empty)"
+            new_preview = new_key[:8] + "..." if len(new_key) > 8 else "(empty)"
+            API_KEY = new_key
+            _CONFIG_MTIME = cur_mtime2
+            if API_KEY and "," in API_KEY and not OAUTH_PROVIDER.startswith("google") and BACKEND not in ("codebuff", "freebuff"):
+                _api_key_pool = APIKeyPool(BACKEND, API_KEY)
+                print(f"[hot-reload] API key pool refreshed: {len(_api_key_pool._accounts)} keys", file=sys.stderr)
+            print(f"[hot-reload] API key updated: {old_preview} -> {new_preview}", file=sys.stderr)
+            return True
+        except Exception as e:
+            print(f"[hot-reload] error: {e}", file=sys.stderr)
+            return False
+
    bgp_models = []
    for _r in BGP_ROUTES:
        for _m in _r.get("models", [{"id": _r.get("model", "unknown")}]):
@@ -1623,7 +1761,8 @@ _PROVIDER_POLICIES = {
    "openrouter": {"reasoning_mode": "provider_default", "max_tokens": 32768, "strip_reasoning": True,
                   "tool_output_limit": 6000, "max_input_items": 35, "compaction": "balanced"},
    "openadapter": {"reasoning_mode": "off", "max_tokens": 32768, "strip_reasoning": True,
-                    "tool_output_limit": 6000, "max_input_items": 30, "compaction": "balanced"},
+                    "tool_output_limit": 1000, "max_input_items": 10, "compaction": "aggressive",
+                    "synthetic_tool_results": True},
    "cloudcode-pa": {"compaction": "aggressive", "context_size": 1000000,
                     "tool_output_limit": 6000, "max_input_items": 60},
    "googleapis": {"compaction": "balanced", "context_size": 1000000,
@@ -2054,6 +2193,75 @@ def _inject_stored_reasoning(messages):
            msg["reasoning_content"] = reasoning
    return messages

+def _normalize_tool_args(raw_args):
+    if not raw_args or raw_args == "{}":
+        return raw_args
+    try:
+        parsed = json.loads(raw_args)
+        if isinstance(parsed, dict):
+            if "Arguments" in parsed and "arguments" not in parsed:
+                inner = parsed["Arguments"]
+                if isinstance(inner, str):
+                    inner = inner.strip()
+                    for pfx in ("```json", "```"):
+                        if inner.startswith(pfx):
+                            inner = inner[len(pfx):].strip()
+                    if inner.endswith("```"):
+                        inner = inner[:-3].strip()
+                    try:
+                        inner_parsed = json.loads(inner)
+                        if isinstance(inner_parsed, dict):
+                            return json.dumps(inner_parsed)
+                    except json.JSONDecodeError:
+                        pass
+            if "cmd" not in parsed and "Arguments" in parsed:
+                inner = parsed["Arguments"]
+                if isinstance(inner, str):
+                    inner = inner.strip()
+                    for pfx in ("```json", "```"):
+                        if inner.startswith(pfx):
+                            inner = inner[len(pfx):].strip()
+                    if inner.endswith("```"):
+                        inner = inner[:-3].strip()
+                    try:
+                        inner_parsed = json.loads(inner)
+                        if isinstance(inner_parsed, dict):
+                            return json.dumps(inner_parsed)
+                    except json.JSONDecodeError:
+                        pass
+        return raw_args
+    except json.JSONDecodeError:
+        return raw_args
+
+_XML_TC_RE = re.compile(r'<tool_call>(\w+)(.*?)</tool_call>', re.DOTALL)
+_XML_ARG_VALUE_RE = re.compile(r'</?arg_value>\s*')
+
+def _extract_xml_tool_calls(text):
+    if not text:
+        return []
+    results = []
+    for m in _XML_TC_RE.finditer(text):
+        name = m.group(1)
+        rest = _XML_ARG_VALUE_RE.sub("", m.group(2)).strip()
+        args_str = "{}"
+        try:
+            for pfx in ("```json", "```"):
+                if rest.startswith(pfx):
+                    rest = rest[len(pfx):].strip()
+            if rest.endswith("```"):
+                rest = rest[:-3].strip()
+            if rest.startswith("{"):
+                json.loads(rest)
+                args_str = rest
+            else:
+                json.loads(rest)
+                args_str = rest
+        except Exception:
+            if rest.startswith("{"):
+                args_str = rest
+        results.append({"name": name, "args": args_str, "call_id": f"xml_{len(results)}"})
+    return results
+
 def oa_input_to_messages(input_data):
    msgs = []
    tool_name_by_id = {}
@@ -2066,11 +2274,13 @@ def oa_input_to_messages(input_data):
            t = item.get("type")
            if t == "function_call":
                tcid = item.get("call_id") or item.get("id") or uid("tc")
+                raw_args = item.get("arguments", "{}")
+                normalized_args = _normalize_tool_args(raw_args)
                pending_tool_calls.append(
                    {"id": tcid,
                     "type": "function",
                     "function": {"name": item.get("name", ""),
-                                   "arguments": item.get("arguments", "{}")}})
+                                   "arguments": normalized_args}})
                tool_name_by_id[tcid] = item.get("name", "")
                continue
            if pending_tool_calls:
@@ -4280,9 +4490,10 @@ def _antigravity_is_simple_user(text):
            return True
    return False

-def _antigravity_normalize_context(input_data):
+def _antigravity_normalize_context(input_data, model=""):
    if not isinstance(input_data, list) or len(input_data) < 2:
        return input_data
+    is_claude_model = "claude" in model.lower()

    latest_user = ""
    latest_user_idx = -1
@@ -4310,13 +4521,19 @@ def _antigravity_normalize_context(input_data):
    if os.environ.get("ANTIGRAVITY_AUTO_RESET_POLLUTED_CONTEXT", "1") != "1":
        auto_reset = False

-    if is_simple and (auto_reset or n_tool_outputs == 0):
+    has_compaction_summary = any(
+        isinstance(it, dict) and it.get("type") == "message" and it.get("role") == "user"
+        and ("Auto-compacted" in str(it.get("content", "")) or "auto-compacted" in str(it.get("content", "")).lower())
+        for it in input_data
+    )
+
+    if is_simple and auto_reset and not has_compaction_summary:
        system_items = [it for it in input_data if isinstance(it, dict) and it.get("type") == "message" and it.get("role") in ("developer", "system")]
        user_item = input_data[latest_user_idx]
        result = system_items + [user_item] if system_items else [user_item]
        print(f"[antigravity-context] raw_items={n_raw} compacted_items={n_raw} final_items={len(result)}", file=sys.stderr)
        print(f"[antigravity-context] raw_tool_outputs={n_tool_outputs} kept_tool_outputs=0", file=sys.stderr)
-        print(f"[antigravity-context] simple_latest_user=true auto_reset={auto_reset}", file=sys.stderr)
+        print(f"[antigravity-context] simple_latest_user=true auto_reset={auto_reset} has_compaction={has_compaction_summary}", file=sys.stderr)
        return result

    dev_messages = []
@@ -4340,9 +4557,15 @@ def _antigravity_normalize_context(input_data):
    latest_words = set(latest_user.strip().lower().split())
    has_edit_intent = bool(latest_words.intersection(_ANTIGRAVITY_EDIT_WORDS))
    has_ref_intent = bool(latest_words.intersection(_ANTIGRAVITY_REFERENCE_WORDS))
-    keep_tools = 2 if (has_edit_intent or has_ref_intent) else 1
+    if is_claude_model:
+        keep_tools = len(tool_outputs)
+    else:
+        keep_tools = 2 if (has_edit_intent or has_ref_intent) else 1

-    kept_tools = tool_outputs[-keep_tools:] if tool_outputs and (has_edit_intent or has_ref_intent) else []
+    if is_claude_model:
+        kept_tools = tool_outputs
+    else:
+        kept_tools = tool_outputs[-keep_tools:] if tool_outputs and (has_edit_intent or has_ref_intent) else []

    for idx_t, t_item in enumerate(kept_tools):
        orig = t_item[1]
@@ -4357,6 +4580,22 @@ def _antigravity_normalize_context(input_data):
    tail_start = max(0, len(recent_items) - 6)
    recent_tail = recent_items[tail_start:]

+    deduped_tail = []
+    seen_goal_context = False
+    for idx, msg_item in recent_tail:
+        content_str = ""
+        c = msg_item.get("content", "")
+        if isinstance(c, str):
+            content_str = c
+        elif isinstance(c, list):
+            content_str = " ".join(p.get("text", p.get("input_text", "")) for p in c if isinstance(p, dict))
+        if "<goal_context>" in content_str:
+            if seen_goal_context:
+                continue
+            seen_goal_context = True
+        deduped_tail.append((idx, msg_item))
+    recent_tail = deduped_tail if deduped_tail else recent_tail
+
    tool_call_ids = set()
    for _, t_item in kept_tools:
        cid = t_item.get("call_id", t_item.get("id", ""))
@@ -4371,6 +4610,15 @@ def _antigravity_normalize_context(input_data):

    result = list(dev_messages)

+    compaction_summaries = []
+    for idx, msg_item in recent_items:
+        if msg_item is input_data[latest_user_idx]:
+            continue
+        c = msg_item.get("content", "")
+        content_str = c if isinstance(c, str) else " ".join(p.get("text", p.get("input_text", "")) for p in c if isinstance(p, dict)) if isinstance(c, list) else ""
+        if "Auto-compacted" in content_str or "auto-compacted" in content_str.lower():
+            compaction_summaries.append(msg_item)
+
    if n_summarized > 0:
        summary_text = f"[Tool history summary: {n_summarized} older tool outputs omitted. {n_tool_calls} prior function calls were made for file inspection/editing.]"
        result.append({"type": "message", "role": "user", "content": [{"type": "input_text", "text": summary_text}]})
@@ -4381,6 +4629,9 @@ def _antigravity_normalize_context(input_data):
    for _, tool_item in kept_tools:
        result.append(tool_item)

+    for cs_item in compaction_summaries:
+        result.append(cs_item)
+
    for _, msg_item in recent_tail:
        if msg_item is not input_data[latest_user_idx]:
            result.append(msg_item)
@@ -4408,7 +4659,10 @@ def _antigravity_normalize_context(input_data):

    if total_chars > _ANTIGRAVITY_EMERGENCY_CHARS:
        print(f"[antigravity-context] EMERGENCY: {total_chars} chars exceeds limit, resetting to minimal", file=sys.stderr)
-        result = list(dev_messages) + [input_data[latest_user_idx]]
+        result = list(dev_messages)
+        if compaction_summaries:
+            result.extend(compaction_summaries)
+        result.append(input_data[latest_user_idx])
        total_chars = sum(len(json.dumps(it, ensure_ascii=False)) for it in result)

    while len(result) > _ANTIGRAVITY_MAX_CONTENTS and total_chars > _ANTIGRAVITY_SOFT_CHARS:
@@ -4475,12 +4729,23 @@ class Handler(http.server.BaseHTTPRequestHandler):
                pass
            _uptime = time.time() - _START_TIME if '_START_TIME' in dir() else 0
            self.send_json(200, {"ok": True, "backend": BACKEND,
-                                 "target_url": TARGET_URL,
-                                 "models": [m.get("id") for m in MODELS],
-                                 "bgp_routes": len(BGP_ROUTES),
-                                 "uptime_s": round(_uptime, 1),
-                                 "memory_mb": round(_mem_mb, 1),
-                                 "requests_total": _STATS.get("requests", 0)})
+                                  "target_url": TARGET_URL,
+                                  "models": [m.get("id") for m in MODELS],
+                                  "bgp_routes": len(BGP_ROUTES),
+                                  "uptime_s": round(_uptime, 1),
+                                  "memory_mb": round(_mem_mb, 1),
+                                   "requests_total": _STATS.get("requests", 0)})
+         elif self.path == "/admin/reload":
+            reloaded = _hot_reload_api_key()
+            key_preview = API_KEY[:8] + "..." if len(API_KEY) > 8 else "(empty)"
+            self.send_json(200, {"ok": True, "reloaded": reloaded,
+                                 "api_key_preview": key_preview,
+                                 "config_path": _CONFIG_PATH or "none"})
+         elif self.path == "/admin/verify-key":
+            result = _verify_api_key(API_KEY, TARGET_URL)
+            key_preview = API_KEY[:8] + "..." if len(API_KEY) > 8 else "(empty)"
+            result["api_key_preview"] = key_preview
+            self.send_json(200, result)
         else:
             self.send_error(404)

@@ -4502,6 +4767,7 @@ class Handler(http.server.BaseHTTPRequestHandler):
    _logf = None

    def _handle(self):
+        _hot_reload_api_key()
        try:
            clen = int(self.headers.get("Content-Length", 0))
            body = json.loads(self.rfile.read(clen))
@@ -4565,6 +4831,11 @@ class Handler(http.server.BaseHTTPRequestHandler):
                        _last_user_urls.append(url_m.group(0))
        save_request_snapshot(request_id, body)
        _req_t0 = time.time()
+        wait_start = time.monotonic()
+        _request_semaphore.acquire()
+        wait_ms = (time.monotonic() - wait_start) * 1000
+        if wait_ms > 100:
+            print(f"[{_sid}] waited {wait_ms:.0f}ms for upstream slot (concurrency gate)", file=sys.stderr)
        try:
            with RequestTracker(request_id) as tracker:
                if BACKEND == "auto":
@@ -4583,6 +4854,8 @@ class Handler(http.server.BaseHTTPRequestHandler):
        except Exception as _snap_err:
            update_snapshot_response(request_id, "error", time.time() - _req_t0, _snap_err)
            raise
+        finally:
+            _request_semaphore.release()

    def _handle_openai_compat(self, body, model, stream, tracker=None):
        input_data = body.get("input", "")
@@ -4595,7 +4868,8 @@ class Handler(http.server.BaseHTTPRequestHandler):
            body = dict(body)
            body["input"] = input_data

-        if (policy.get("synthetic_tool_results") or _provider_cap(model, "synthetic_tool_results", False)) and isinstance(input_data, list):
+        # synthetic tool-results disabled: causes deepseek-v4-pro truncation on opencode.ai
+        if False and (policy.get("synthetic_tool_results") or _provider_cap(model, "synthetic_tool_results", False)) and isinstance(input_data, list):
            input_data, synthesized = synthesize_tool_results_for_chat(input_data)
            if synthesized:
                print("[provider-adapter] using synthetic tool-result continuation", file=sys.stderr)
@@ -4603,7 +4877,7 @@ class Handler(http.server.BaseHTTPRequestHandler):
                body["input"] = input_data

        compacted = False
-        if policy.get("compaction") and isinstance(input_data, list):
+        if policy.get("compaction") and isinstance(input_data, list) and "claude" not in model.lower():
            input_data, compacted = _adaptive_compact(input_data, model, policy)
            if compacted:
                body = dict(body)
@@ -4652,6 +4926,22 @@ class Handler(http.server.BaseHTTPRequestHandler):
                    upstream = urllib.request.urlopen(req, timeout=_upstream_timeout(body, stream))
                except urllib.error.HTTPError as e:
                    err_body = e.read().decode()
+                    if "context_length_exceeded" in err_body and attempt < max_retries:
+                        print(f"[{self._session_id}] context_length_exceeded (attempt {attempt+1}/{max_retries}), retrying with extreme compaction!", file=sys.stderr)
+                        policy = provider_policy()
+                        if isinstance(input_data, list):
+                            print(f"[{self._session_id}] applying extreme compaction to {len(input_data)} items", file=sys.stderr)
+                            input_data = _crof_compact_for_retry(input_data, model)
+                            body = dict(body)
+                            body["input"] = input_data
+                            messages = oa_input_to_messages(input_data)
+                            messages = _inject_stored_reasoning(messages)
+                            instructions = body.get("instructions", "").strip()
+                            if instructions:
+                                messages.insert(0, {"role": "system", "content": instructions})
+                            chat_body = self._build_chat_body(model, messages, body, stream)
+                            chat_body_b = json.dumps(chat_body).encode()
+                            continue
                    if e.code in (429, 502, 503) and attempt < max_retries:
                        if e.code == 429 and _api_key_pool:
                            pool_acct = _api_key_pool.get()
@@ -4782,7 +5072,7 @@ class Handler(http.server.BaseHTTPRequestHandler):
            body["input"] = input_data

        compacted = False
-        if policy.get("compaction") and isinstance(input_data, list):
+        if policy.get("compaction") and isinstance(input_data, list) and "claude" not in model.lower():
            input_data, compacted = _adaptive_compact(input_data, model, policy)
            if compacted:
                body = dict(body)
@@ -4793,8 +5083,8 @@ class Handler(http.server.BaseHTTPRequestHandler):
            body = dict(body)
            body["input"] = input_data

-        if OAUTH_PROVIDER == "google-antigravity" and isinstance(input_data, list):
-            input_data = _antigravity_normalize_context(input_data)
+        if OAUTH_PROVIDER == "google-antigravity" and isinstance(input_data, list) and "claude" not in model.lower():
+            input_data = _antigravity_normalize_context(input_data, model)
            body = dict(body)
            body["input"] = input_data

@@ -4872,7 +5162,7 @@ class Handler(http.server.BaseHTTPRequestHandler):
                        resp_part["functionResponse"]["id"] = call_id
                    contents.append({"role": "user", "parts": [resp_part]})

-        if OAUTH_PROVIDER.startswith("google"):
+        if OAUTH_PROVIDER.startswith("google") and "claude" not in model.lower():
            sanitized = []
            last_user_text = None
            last_role = None
@@ -4963,8 +5253,18 @@ class Handler(http.server.BaseHTTPRequestHandler):
            contents = _gemini_reattach_sigs(contents)

        if OAUTH_PROVIDER == "google-antigravity":
-            guardrail_found = any("autonomous coding agent" in json.dumps(c.get("parts", []), ensure_ascii=False) for c in contents[:2])
-            if not guardrail_found:
+            latest_user = ""
+            if isinstance(input_data, list):
+                for item in reversed(input_data):
+                    if item.get("type") == "message" and item.get("role") == "user":
+                        c = item.get("content", "")
+                        if isinstance(c, str):
+                            latest_user = c
+                        elif isinstance(c, list):
+                            latest_user = "\n".join(p.get("text", p.get("input_text", "")) for p in c if isinstance(p, dict))
+                        break
+            is_latest_simple = _antigravity_is_simple_user(latest_user)
+            if not is_latest_simple:
                contents.insert(0, {"role": "user", "parts": [{"text": _GEMINI_AGENT_GUARDRAIL}]})

        if OAUTH_PROVIDER == "google-antigravity" and isinstance(input_data, list):
@@ -4976,11 +5276,10 @@ class Handler(http.server.BaseHTTPRequestHandler):
                    if isinstance(c, str): latest_lower = c.lower()
                    elif isinstance(c, list): latest_lower = " ".join(p.get("text", p.get("input_text", "")) for p in c if isinstance(p, dict)).lower()
                    break
-            if latest_lower and any(w in latest_lower for w in _EDIT_WORDS) and len(input_data) > 6:
+            if latest_lower and any(w in latest_lower for w in _EDIT_WORDS):
                n_tool_calls = sum(1 for it in input_data if isinstance(it, dict) and it.get("type") == "function_call")
-                if n_tool_calls > 0:
-                    contents.append({"role": "user", "parts": [{"text": "IMPORTANT: The user is requesting a modification to existing files. You MUST use tools (exec_command, write, etc.) to make the changes. Do NOT just describe what to do — actually call the tools to modify the files now."}]})
-                    print(f"[antigravity] edit-intent detected with {n_tool_calls} prior tool calls; injected tool-use nudge", file=sys.stderr)
+                contents.append({"role": "user", "parts": [{"text": "!!! ABSOLUTELY NO PLANNING - EMIT THE TOOL CALL NOW !!! IMPORTANT: The user is requesting a modification to existing files. You MUST use tools (exec_command, read_files, write, etc.) to make the changes RIGHT NOW. Do NOT just describe what to do — actually CALL THE TOOLS IN THIS RESPONSE. IMMEDIATELY INSPECT THE FILE OR LIST FILES USING exec_command TOOL CALL."}]})
+                print(f"[antigravity] edit-intent detected; injected tool-use nudge", file=sys.stderr)

        if OAUTH_PROVIDER == "google-antigravity" and isinstance(input_data, list):
            latest_user = ""
@@ -5031,10 +5330,14 @@ class Handler(http.server.BaseHTTPRequestHandler):
            wrapped["requestType"] = "agent"
            wrapped["userAgent"] = "antigravity"
            wrapped["requestId"] = f"agent-{uuid.uuid4().hex[:12]}"
+            wrapped["request"]["sessionId"] = f"{uuid.uuid4().hex}{int(time.time()*1000)}"

        _allow_staging = os.environ.get("ALLOW_ANTIGRAVITY_STAGING", "0") == "1"
        if OAUTH_PROVIDER == "google-antigravity":
-            _antigravity_endpoints = ["https://cloudcode-pa.googleapis.com"]
+            _antigravity_endpoints = [
+                "https://cloudcode-pa.googleapis.com",
+                "https://daily-cloudcode-pa.googleapis.com",
+            ]
            if _allow_staging:
                _antigravity_endpoints.extend([
                    "https://daily-cloudcode-pa.sandbox.googleapis.com",
@@ -5052,7 +5355,13 @@ class Handler(http.server.BaseHTTPRequestHandler):
        }
        if OAUTH_PROVIDER == "google-antigravity":
            version = _ensure_antigravity_version()
-            headers["User-Agent"] = f"antigravity/{version} darwin/arm64"
+            import platform as _plat
+            _os_name = _plat.system().lower()
+            _os_arch = _plat.machine().lower().replace("x86_64", "x64").replace("aarch64", "arm64")
+            headers["User-Agent"] = f"antigravity/{version} {_os_name}/{_os_arch}"
+            headers["X-Client-Name"] = "antigravity"
+            headers["X-Client-Version"] = _ensure_antigravity_client_version()
+            headers["x-goog-api-client"] = "gl-node/18.18.2 fire/0.8.6 grpc/1.10.x"
        else:
            headers["User-Agent"] = "google-api-nodejs-client/9.15.1"
            headers["X-Goog-Api-Client"] = "gl-node/22.17.0"
@@ -5072,14 +5381,33 @@ class Handler(http.server.BaseHTTPRequestHandler):
        if OAUTH_PROVIDER == "google-antigravity":
            print(f"[antigravity-endpoint] endpoints={[e.replace('https://','') for e in endpoints]} project={project_id}", file=sys.stderr)

-        for ep in endpoints:
+        upstream = None
+        chosen_ep = None
+        global _antigravity_preferred_endpoint
+
+        with _antigravity_endpoint_lock:
+            _pref = _antigravity_preferred_endpoint
+
+        if _pref and _pref in endpoints:
+            ordered = [_pref] + [e for e in endpoints if e != _pref]
+        else:
+            ordered = list(endpoints)
+
+        for ep in ordered:
            target = f"{ep}/{url_suffix}"
            req = urllib.request.Request(target, data=body_b, headers=headers)
            try:
                upstream = urllib.request.urlopen(req, timeout=_upstream_timeout(body, stream))
+                chosen_ep = ep
+                with _antigravity_endpoint_lock:
+                    _antigravity_preferred_endpoint = ep
+                if ep != _pref:
+                    print(f"[{self._session_id}] fallback OK: {ep.replace('https://','')}", file=sys.stderr)
                break
            except urllib.error.HTTPError as e:
                err_body = e.read().decode()
+                err_class = _classify_antigravity_error(e.code, err_body)
+                print(f"[{self._session_id}] {ep.replace('https://','')} {e.code} class={err_class}", file=sys.stderr)
                if e.code == 400 and OAUTH_PROVIDER.startswith("google"):
                    try:
                        debug_path = os.path.join(_LOG_DIR, "gemini-last-400-request.json")
@@ -5088,23 +5416,38 @@ class Handler(http.server.BaseHTTPRequestHandler):
                        print(f"[{self._session_id}] saved 400 debug request to {debug_path}", file=sys.stderr)
                    except Exception:
                        pass
-                if e.code == 403 and "SERVICE_DISABLED" in err_body[:500] and ep != endpoints[-1]:
-                    print(f"[{self._session_id}] {ep} SERVICE_DISABLED, trying next endpoint", file=sys.stderr)
+                    return self.send_json(e.code, {"error": {"type": "upstream_error", "message": _sanitize_err_body(err_body)}})
+                if err_class == "auth_permanent":
+                    return self.send_json(e.code, {"error": {"type": "upstream_error", "message": _sanitize_err_body(err_body)}})
+                if err_class in ("quota_exhausted", "rate_limited"):
+                    reset_s = _parse_rate_limit_reset(err_body)
+                    if ep == ordered[-1]:
+                        pool = _google_antigravity_pool if OAUTH_PROVIDER == "google-antigravity" else _google_cli_pool
+                        _, acct = _get_google_account(OAUTH_PROVIDER)
+                        if acct:
+                            cooldown = reset_s if reset_s and reset_s > 10 else 60
+                            pool.mark_rate_limited(acct, cooldown)
+                            print(f"[{self._session_id}] quota reset in ~{reset_s}s, cooldown={cooldown}s", file=sys.stderr)
+                        return self.send_json(e.code, {"error": {"type": "upstream_error", "message": _sanitize_err_body(err_body)}})
+                    print(f"[{self._session_id}] {ep.replace('https://','')} 429, trying next", file=sys.stderr)
+                    with _antigravity_endpoint_lock:
+                        _antigravity_preferred_endpoint = None
                    continue
-                if e.code == 429 and ep != endpoints[-1] and _allow_staging:
-                    print(f"[{self._session_id}] {ep} HTTP 429, trying next endpoint", file=sys.stderr)
+                if err_class in ("service_disabled", "forbidden", "account_banned", "validation_required"):
+                    if ep == ordered[-1]:
+                        return self.send_json(e.code, {"error": {"type": "upstream_error", "message": _sanitize_err_body(err_body)}})
                    continue
-                if e.code == 429:
-                    pool = _google_antigravity_pool if OAUTH_PROVIDER == "google-antigravity" else _google_cli_pool
-                    _, acct = _get_google_account(OAUTH_PROVIDER)
-                    if acct:
-                        pool.mark_rate_limited(acct, 60)
-                return self.send_json(e.code, {"error": {"type": "upstream_error", "message": _sanitize_err_body(err_body)}})
-            except Exception as e:
-                if ep == endpoints[-1]:
-                    return self.send_json(502, {"error": {"type": "proxy_error", "message": str(e)}})
-                print(f"[{self._session_id}] {ep} failed: {e}, trying next", file=sys.stderr)
+                if ep == ordered[-1]:
+                    return self.send_json(e.code, {"error": {"type": "upstream_error", "message": _sanitize_err_body(err_body)}})
                continue
+            except Exception as e:
+                print(f"[{self._session_id}] {ep.replace('https://','')} conn failed: {e}", file=sys.stderr)
+                if ep == ordered[-1]:
+                    return self.send_json(502, {"error": {"type": "proxy_error", "message": str(e)}})
+                continue
+
+        if upstream is None:
+            return self.send_json(502, {"error": {"type": "proxy_error", "message": "All endpoints failed"}})

        if stream:
            self._forward_gemini_sse(upstream, model, body, input_data, tracker)
@@ -5406,11 +5749,25 @@ class Handler(http.server.BaseHTTPRequestHandler):
                        break
                    collected_events.append(event)
                    _observe_event(event)
+                print(f"[{self._session_id}] stream ended: events={len(collected_events)} finish={finish_reason} has_content={has_content} elapsed={time.time()-t0:.1f}s", file=sys.stderr)
            except (ConnectionResetError, BrokenPipeError, ConnectionAbortedError):
                print("[translate-proxy] client disconnected during stream", file=sys.stderr)
                _crof_record(model, n_items, False)
                _log_resp(last_resp_id, "client_disconnect", last_output)
                return
+            except (TimeoutError, OSError, urllib.error.URLError) as e:
+                print(f"[translate-proxy] upstream error during stream: {type(e).__name__}: {e}", file=sys.stderr)
+                err_resp_id = body.get("request_id") or body.get("id") or uid("resp")
+                try:
+                    self.wfile.write(emit("response.failed", {"type": "response.failed",
+                        "response": {"id": err_resp_id, "error": {"type": "upstream_error",
+                        "code": "stream_interrupted", "message": str(e)[:200]}}}).encode())
+                    self.wfile.flush()
+                except Exception:
+                    pass
+                _crof_record(model, n_items, False)
+                _log_resp(last_resp_id, "upstream_error", last_output)
+                return

            # Record outcome
            success = (finish_reason != "length")
@@ -5486,6 +5843,171 @@ class Handler(http.server.BaseHTTPRequestHandler):
                    except Exception as e:
                        print(f"[crof-adaptive] retry failed: {e}", file=sys.stderr)

+            # ── Auto-continue for truncated responses ── (cobra PR)
+            _ac_did_run = False
+            if stream and collected_events:
+                _ac_text = ""
+                _ac_msg_id = _ac_resp_id = None
+                for _ev in collected_events:
+                    for _ln in _ev.strip().split("\n"):
+                        if not _ln.startswith("data: "):
+                            continue
+                        try:
+                            _d = json.loads(_ln[6:])
+                            _t = _d.get("type")
+                            if _t == "response.output_text.done":
+                                _ac_text = _d.get("text", "")
+                            elif _t == "response.output_item.added" and _d.get("item",{}).get("type") == "message":
+                                _ac_msg_id = _d.get("item",{}).get("id")
+                            elif _t == "response.completed":
+                                _ac_resp_id = _d.get("response",{}).get("id")
+                        except Exception:
+                            pass
+
+                _ac_tc = reasoning_out.get("tool_calls", [])
+                _ac_truncated = False
+                if not _ac_tc and _ac_text:
+                    _ac_stripped = _ac_text.rstrip()
+                    if finish_reason == "length":
+                        _ac_truncated = True
+                    elif len(_ac_stripped) > 10 and _ac_stripped[-1] in "(:,;…":
+                        _ac_truncated = True
+
+                if _ac_truncated and _ac_text:
+                    print(f"[{self._session_id}] auto-continue: truncated (finish={finish_reason}, ends '{_ac_text.rstrip()[-10:]}')", file=sys.stderr)
+                    _ac_did_run = True
+                    _ac_cut = len(collected_events)
+                    for _i, _ev2 in enumerate(collected_events):
+                        if "response.output_text.done" in _ev2:
+                            _ac_cut = _i
+                            break
+                    collected_events = collected_events[:_ac_cut]
+
+                    _ac_accumulated = _ac_text
+                    _ac_max = 3
+                    for _ac_attempt in range(_ac_max):
+                        try:
+                            _ac_cont_msgs = list(chat_body.get("messages", []))
+                            _ac_cont_msgs.append({"role": "assistant", "content": _ac_accumulated})
+                            _ac_cont_msgs.append({"role": "user", "content": "Continue exactly where you left off. Do not repeat anything already written."})
+                            _ac_cont_body = dict(chat_body)
+                            _ac_cont_body["messages"] = _ac_cont_msgs
+                            _ac_cont_body["stream"] = False
+                            _ac_cont_req = urllib.request.Request(target, data=json.dumps(_ac_cont_body).encode(), headers=fwd)
+                            _ac_cont_resp = json.loads(urllib.request.urlopen(_ac_cont_req, timeout=120).read())
+                            _ac_choices = _ac_cont_resp.get("choices", [])
+                            if _ac_choices:
+                                _ac_chunk = _ac_choices[0].get("message",{}).get("content","")
+                                if not _ac_chunk:
+                                    _ac_chunk = _ac_choices[0].get("delta",{}).get("content","")
+                                _ac_finish = _ac_choices[0].get("finish_reason")
+                                if _ac_chunk:
+                                    _ac_accumulated += _ac_chunk
+                                    collected_events.append(emit("response.output_text.delta", {
+                                        "type": "response.output_text.delta",
+                                        "delta": _ac_chunk, "item_id": _ac_msg_id, "content_index": 0}))
+                                if _ac_finish != "length":
+                                    break
+                                _ac_text = _ac_accumulated
+                        except Exception as _ac_e:
+                            print(f"[{self._session_id}] auto-continue attempt {_ac_attempt+1} failed: {_ac_e}", file=sys.stderr)
+                            break
+
+                    if _ac_msg_id:
+                        collected_events.append(emit("response.output_text.done", {
+                            "type": "response.output_text.done",
+                            "text": _ac_accumulated, "item_id": _ac_msg_id, "content_index": 0}))
+                        collected_events.append(emit("response.content_part.done", {
+                            "type": "response.content_part.done",
+                            "part": {"type": "output_text", "text": _ac_accumulated, "annotations": []}, "item_id": _ac_msg_id}))
+                        collected_events.append(emit("response.output_item.done", {
+                            "type": "response.output_item.done",
+                            "item": {"type": "message", "id": _ac_msg_id, "role": "assistant", "status": "completed",
+                                     "content": [{"type": "output_text", "text": _ac_accumulated, "annotations": []}]}}))
+                    if _ac_resp_id:
+                        collected_events.append(emit("response.completed", {
+                            "type": "response.completed",
+                            "response": {"id": _ac_resp_id, "object": "response", "model": model,
+                                         "status": "completed", "created": int(time.time()),
+                                         "output": [{"type": "message", "id": _ac_msg_id, "role": "assistant",
+                                                     "status": "completed",
+                                                     "content": [{"type": "output_text", "text": _ac_accumulated, "annotations": []}]}]}}))
+                    has_content = True
+                    finish_reason = "stop"
+                    print(f"[{self._session_id}] auto-continue done: {len(_ac_text)} -> {len(_ac_accumulated)} chars", file=sys.stderr)
+
+            # Smart continuation: loop with escalating nudges when model stops text-only mid-task.
+            # Skip if auto-continue already handled the response.
+            if not _ac_did_run:
+                _smart_max = 2
+                _smart_attempt = 0
+                while _smart_attempt < _smart_max:
+                    _has_tool_calls_in_output = any(o.get("type") == "function_call" for o in (last_output or []))
+                    if not (finish_reason == "stop" and has_content and not _has_tool_calls_in_output
+                            and isinstance(input_data, list) and len(input_data) >= 3
+                            and has_function_call_output(input_data)):
+                        break
+                    _smart_attempt += 1
+                    _nudges = [
+                        "Continue with the task using tool calls. Do NOT describe what to do — call the appropriate functions.",
+                        "You MUST use tool calls to complete the task. Read files, run commands, and make changes using tools. Do NOT output XML tool calls as text.",
+                    ]
+                    nudge_text = _nudges[min(_smart_attempt - 1, len(_nudges) - 1)]
+                    # Try extracting XML tool calls from text as fallback before nudging
+                    last_text = ""
+                    for o in (last_output or []):
+                        if o.get("type") == "message":
+                            for c in (o.get("content") or []):
+                                if isinstance(c, dict) and c.get("type") == "output_text":
+                                    last_text += c.get("text", "")
+                    xml_fc = _extract_xml_tool_calls(last_text)
+                    if xml_fc:
+                        print(f"[{self._session_id}] [smart-continue] extracted {len(xml_fc)} XML tool calls from text, injecting and retrying", file=sys.stderr)
+                        fake_input = list(input_data)
+                        for xfc in xml_fc:
+                            fake_input.append({"type": "function_call", "id": uid("fcx"), "call_id": uid("fcx"),
+                                               "name": xfc["name"], "arguments": xfc["args"], "status": "completed"})
+                        fake_messages = oa_input_to_messages(fake_input)
+                        instructions = body.get("instructions", "").strip()
+                        if instructions:
+                            fake_messages.insert(0, {"role": "system", "content": instructions})
+                        fake_chat_body = self._build_chat_body(model, fake_messages, body, stream)
+                        fake_req = urllib.request.Request(target, data=json.dumps(fake_chat_body).encode(), headers=fwd)
+                        try:
+                            retry_upstream = urllib.request.urlopen(fake_req, timeout=_upstream_timeout(body, True))
+                            collected_events = []
+                            last_resp_id = last_output = last_status = None
+                            finish_reason = None
+                            has_content = False
+                            for event in oa_stream_to_sse(retry_upstream, model, body.get("request_id") or body.get("id")):
+                                collected_events.append(event)
+                                _observe_event(event)
+                            input_data = fake_input
+                            continue
+                        except Exception as e:
+                            print(f"[{self._session_id}] [smart-continue] XML injection retry failed: {e}", file=sys.stderr)
+                            break
+                    _nudge_msg = {"role": "user", "content": nudge_text}
+                    nudge_messages = oa_input_to_messages(input_data) + [_nudge_msg]
+                    instructions = body.get("instructions", "").strip()
+                    if instructions:
+                        nudge_messages.insert(0, {"role": "system", "content": instructions})
+                    nudge_chat_body = self._build_chat_body(model, nudge_messages, body, stream)
+                    nudge_req = urllib.request.Request(target, data=json.dumps(nudge_chat_body).encode(), headers=fwd)
+                    print(f"[{self._session_id}] [smart-continue] attempt {_smart_attempt}/{_smart_max}: model stopped mid-task, nudging", file=sys.stderr)
+                    try:
+                        retry_upstream = urllib.request.urlopen(nudge_req, timeout=_upstream_timeout(body, True))
+                        collected_events = []
+                        last_resp_id = last_output = last_status = None
+                        finish_reason = None
+                        has_content = False
+                        for event in oa_stream_to_sse(retry_upstream, model, body.get("request_id") or body.get("id")):
+                            collected_events.append(event)
+                            _observe_event(event)
+                    except Exception as e:
+                        print(f"[{self._session_id}] [smart-continue] nudge attempt {_smart_attempt} failed: {e}", file=sys.stderr)
+                        break
+
            self.stream_buffered_events(collected_events)
        else:
            result = oa_resp_to_responses(json.loads(upstream.read()), model)
@@ -6392,12 +6914,45 @@ def _handle_shutdown_signal(sig, frame):
    if 'SERVER' in globals() and SERVER:
         SERVER.shutdown()
 
+def _anti_stall_cleanup():
+    my_pid = os.getpid()
+    my_port = PORT
+    killed = []
+    try:
+        import subprocess as _sp
+        out = _sp.run(["pgrep", "-f", "translate-proxy"], capture_output=True, text=True, timeout=5).stdout.strip()
+        for pid_str in out.splitlines():
+            pid_str = pid_str.strip()
+            if not pid_str or not pid_str.isdigit():
+                continue
+            pid = int(pid_str)
+            if pid == my_pid:
+                continue
+            try:
+                os.kill(pid, signal.SIGTERM)
+                killed.append(pid)
+            except (ProcessLookupError, PermissionError):
+                pass
+    except Exception:
+        pass
+    try:
+        _cache_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "__pycache__")
+        if os.path.isdir(_cache_dir):
+            import shutil
+            shutil.rmtree(_cache_dir, ignore_errors=True)
+    except Exception:
+        pass
+    if killed:
+        print(f"[anti-stall] killed {len(killed)} stale proxy process(es): {killed}", flush=True)
+        time.sleep(1)
+
 def main():
    global SERVER, _START_TIME
    _START_TIME = time.time()
+    _anti_stall_cleanup()
    _init_runtime()
    try:
-        _current_cfg = os.path.basename(args.config) if args.config else ""
+        _current_cfg = os.path.basename(_CONFIG_PATH) if _CONFIG_PATH else ""
        for _f in os.listdir(_LOG_DIR):
            if _f.startswith("proxy-") and _f.endswith(".json") and _f != _current_cfg:
                os.remove(os.path.join(_LOG_DIR, _f))