v10.13.8: intelligent model profiles — dynamic limits per model capability

2026-05-27 18:04:01 +04:00
parent eebffaab4b
commit d273bf2518
1 changed file with 110 additions and 21 deletions
--- a/translate-proxy.py
+++ b/translate-proxy.py
@@ -382,12 +382,89 @@ _conn_pool = {}
 # Module-wide stream idle timeout (presumably seconds — TODO confirm at the use site).
 _STREAM_IDLE_TIMEOUT = 300
 def _idle_timeout_for_model(model, default=300):
    """Return the ``idle_timeout`` entry of the profile resolved for *model*.

    ``default`` is returned only when the resolved profile has no
    ``"idle_timeout"`` key.

    NOTE(review): every profile visible in this change defines that key, and
    ``_model_profile`` hands back the default profile for empty or unmatched
    names, so ``default`` appears to have no effect any more. Confirm that no
    caller passing a custom ``default`` relies on it.
    """
    return _model_profile(model).get("idle_timeout", default)
 # Per-model limit profiles, keyed by a substring of the model name.
 #
 # _model_profile() walks this dict in insertion order and returns a copy of
 # the first entry whose normalized key occurs in the normalized model name,
 # so ORDER MATTERS:
 #   * "flash" precedes "gemini-3.5-flash" and "gemini-3.1-pro" precedes "pro";
 #     each pair currently holds identical values, so the overlap is harmless.
 #   * The short key "pro" precedes "deepseek", "qwen" and "gpt-oss", so a name
 #     that contains "pro" resolves to the "pro" profile even when it also
 #     contains one of those later keys.
 #
 # Keys read in the hunks shown with this change: idle_timeout, max_tool_calls,
 # warn_tool_calls, max_reads_no_write, warn_reads_no_write. The remaining keys
 # (max_input_items, tool_output_limit, compaction, reasoning_budget,
 # max_tokens) are not consumed in the portion of the file visible here.
 _MODEL_PROFILES = {
    "flash": {
        "idle_timeout": 120, "max_tool_calls": 100, "warn_tool_calls": 60,
        "max_reads_no_write": 10, "warn_reads_no_write": 6,
        "max_input_items": 120, "tool_output_limit": 8000, "compaction": "balanced",
        "reasoning_budget": 8192, "max_tokens": 65536,
    },
    "gemini-3.5-flash": {
        "idle_timeout": 120, "max_tool_calls": 100, "warn_tool_calls": 60,
        "max_reads_no_write": 10, "warn_reads_no_write": 6,
        "max_input_items": 120, "tool_output_limit": 8000, "compaction": "balanced",
        "reasoning_budget": 8192, "max_tokens": 65536,
    },
    "gemini-3.1-pro": {
        "idle_timeout": 300, "max_tool_calls": 150, "warn_tool_calls": 80,
        "max_reads_no_write": 12, "warn_reads_no_write": 8,
        "max_input_items": 200, "tool_output_limit": 8000, "compaction": "conservative",
        "reasoning_budget": 24576, "max_tokens": 65536,
    },
    "pro": {
        "idle_timeout": 300, "max_tool_calls": 150, "warn_tool_calls": 80,
        "max_reads_no_write": 12, "warn_reads_no_write": 8,
        "max_input_items": 200, "tool_output_limit": 8000, "compaction": "conservative",
        "reasoning_budget": 24576, "max_tokens": 65536,
    },
    "sonnet": {
        "idle_timeout": 300, "max_tool_calls": 150, "warn_tool_calls": 80,
        "max_reads_no_write": 10, "warn_reads_no_write": 7,
        "max_input_items": 180, "tool_output_limit": 8000, "compaction": "balanced",
        "reasoning_budget": 16384, "max_tokens": 65536,
    },
    "opus": {
        "idle_timeout": 600, "max_tool_calls": 200, "warn_tool_calls": 100,
        "max_reads_no_write": 8, "warn_reads_no_write": 5,
        "max_input_items": 250, "tool_output_limit": 10000, "compaction": "conservative",
        "reasoning_budget": 32768, "max_tokens": 131072,
    },
    "deepseek": {
        "idle_timeout": 300, "max_tool_calls": 120, "warn_tool_calls": 70,
        "max_reads_no_write": 10, "warn_reads_no_write": 7,
        "max_input_items": 150, "tool_output_limit": 6000, "compaction": "balanced",
        "reasoning_budget": 16384, "max_tokens": 65536,
    },
    "qwen": {
        "idle_timeout": 300, "max_tool_calls": 120, "warn_tool_calls": 70,
        "max_reads_no_write": 10, "warn_reads_no_write": 7,
        "max_input_items": 150, "tool_output_limit": 6000, "compaction": "balanced",
        "reasoning_budget": 16384, "max_tokens": 65536,
    },
    "gpt-oss": {
        "idle_timeout": 300, "max_tool_calls": 100, "warn_tool_calls": 60,
        "max_reads_no_write": 10, "warn_reads_no_write": 6,
        "max_input_items": 120, "tool_output_limit": 6000, "compaction": "balanced",
        "reasoning_budget": 8192, "max_tokens": 32768,
    },
 }
 # Fallback profile: _model_profile() returns a copy of this dict when the model
 # name is empty/None or when neither the profile table nor its keyword
 # heuristics match the name.
 _DEFAULT_MODEL_PROFILE = {
    "idle_timeout": 300, "max_tool_calls": 150, "warn_tool_calls": 80,
    "max_reads_no_write": 12, "warn_reads_no_write": 8,
    "max_input_items": 150, "tool_output_limit": 6000, "compaction": "balanced",
    "reasoning_budget": 16384, "max_tokens": 65536,
 }
 def _model_profile(model):
    """Resolve *model* (a model-name string, or a falsy value) to a limits profile.

    The name is lower-cased and stripped of "-", "_" and spaces, then compared
    against each key of ``_MODEL_PROFILES`` (normalized the same way, except
    that keys are not lower-cased). The first key that occurs as a substring of
    the name wins, in dict insertion order. If no key matches, a few keyword
    heuristics are tried, and finally ``_DEFAULT_MODEL_PROFILE`` is used.

    Returns a new shallow ``dict`` copy each time, so callers may mutate the
    result without affecting the module-level tables.

    NOTE(review): matching is first-hit rather than longest-hit, so the short
    key "pro" shadows the later keys "deepseek", "qwen" and "gpt-oss" for any
    name that contains both (for example a name containing "deepseek" and
    "prover"). Consider trying longer keys first.
    """
    if not model:
-        return default
+        return dict(_DEFAULT_MODEL_PROFILE)
-    m = model.lower()
+    m = model.lower().replace("-", "").replace("_", "").replace(" ", "")
-    if "flash" in m or "mini" in m or "haiku" in m:
    # Table lookup: first normalized key found inside the normalized name wins.
+    for key, profile in _MODEL_PROFILES.items():
-        return 120
+        key_norm = key.replace("-", "").replace("_", "").replace(" ", "")
-    return default
+        if key_norm in m:
            return dict(profile)
    # Keyword fallbacks for names that matched no table key.
    # NOTE(review): "flash", "opus", "sonnet" and "pro" are themselves table
    # keys, so any name containing them has already returned from the loop
    # above. Only the "mini"/"haiku"/"tiny" and "ultra" tests can decide the
    # outcome here; the "sonnet" and "pro" branches are unreachable.
    if "flash" in m or "mini" in m or "haiku" in m or "tiny" in m:
        return dict(_MODEL_PROFILES["flash"])
    if "opus" in m or "ultra" in m:
        return dict(_MODEL_PROFILES["opus"])
    if "sonnet" in m:
        return dict(_MODEL_PROFILES["sonnet"])
    if "pro" in m and "flash" not in m:
        return dict(_MODEL_PROFILES["pro"])
    return dict(_DEFAULT_MODEL_PROFILE)
 # Semaphore created with _MAX_CONCURRENT_REQUESTS permits. Presumably it caps
 # the number of in-flight upstream requests — confirm at the acquire sites,
 # which are not part of the lines shown here.
 _MAX_CONCURRENT_REQUESTS = 3
 _request_semaphore = threading.Semaphore(_MAX_CONCURRENT_REQUESTS)
@@ -5882,18 +5959,24 @@ class Handler(http.server.BaseHTTPRequestHandler):
            cumulative_calls = ag_state.get("total_tool_calls", 0) + n_tool_calls
            ag_state["total_tool_calls"] = cumulative_calls
-            if cumulative_calls > _ANTIGRAVITY_MAX_TOOL_CALLS_PER_TASK:
+            _mp = _model_profile(model)
-                print(f"[{getattr(self, '_session_id', '?')}] [antigravity-budget] HARD CAP: {cumulative_calls} calls, injecting force-write directive", file=sys.stderr)
+            _mp_max_calls = _mp["max_tool_calls"]
            _mp_warn_calls = _mp["warn_tool_calls"]
            _mp_max_reads = _mp["max_reads_no_write"]
            _mp_warn_reads = _mp["warn_reads_no_write"]
            if cumulative_calls > _mp_max_calls:
                print(f"[{getattr(self, '_session_id', '?')}] [antigravity-budget] HARD CAP: {cumulative_calls}/{_mp_max_calls} calls (model={model}), injecting force-write directive", file=sys.stderr)
                contents.append({"role": "user", "parts": [{"text":
                    f"CRITICAL BUDGET LIMIT: {cumulative_calls} tool calls made. "
                    f"YOU MUST STOP NOW. Do NOT call any more tools. "
                    f"Write your FINAL answer immediately using the information you already have. "
                    f"If you have file edits, apply them in this response using exec_command with a write command. "
                    f"DO NOT READ ANY MORE FILES."}]})
-            elif cumulative_calls > _ANTIGRAVITY_WARN_TOOL_CALLS_PER_TASK:
+            elif cumulative_calls > _mp_warn_calls:
                contents.append({"role": "user", "parts": [{"text":
                    f"WARNING: {cumulative_calls} tool calls made. "
-                    f"{_ANTIGRAVITY_MAX_TOOL_CALLS_PER_TASK - cumulative_calls} remaining before forced stop. "
+                    f"{_mp_max_calls - cumulative_calls} remaining before forced stop. "
                    f"STOP READING FILES AND APPLY YOUR EDITS NOW."}]})
            # CHANGE 2: Read-vs-write loop detection
@@ -5918,10 +6001,10 @@ class Handler(http.server.BaseHTTPRequestHandler):
                            ft["reads"] += 1
                n_reads = ft["reads"]
                n_writes = ft["writes"]
-                if n_reads >= 12 and n_writes == 0:
+                if n_reads >= _mp_max_reads and n_writes == 0:
                    ag_state["force_finalize"] = True
-                    print(f"[antigravity-loop] READ-WRITE IMBALANCE: {n_reads} reads, {n_writes} writes — model never writes", file=sys.stderr)
+                    print(f"[antigravity-loop] READ-WRITE IMBALANCE: {n_reads} reads, {n_writes} writes (model={model}, limit={_mp_max_reads})", file=sys.stderr)
-                elif n_reads >= 8 and n_writes == 0 and not ag_state.get("force_finalize"):
+                elif n_reads >= _mp_warn_reads and n_writes == 0 and not ag_state.get("force_finalize"):
                    contents.append({"role": "user", "parts": [{"text":
                        f"WARNING: You have made {n_reads} tool calls and ZERO writes. "
                        f"You MUST apply your edit NOW using exec_command with a python write. "
@@ -5975,7 +6058,7 @@ class Handler(http.server.BaseHTTPRequestHandler):
        request_body["systemInstruction"] = {"role": "user", "parts": system_parts}
        if gen_config:
            request_body["generationConfig"] = gen_config
-        _budget_exceeded = ag_state.get("total_tool_calls", 0) > _ANTIGRAVITY_MAX_TOOL_CALLS_PER_TASK
+        _budget_exceeded = ag_state.get("total_tool_calls", 0) > _mp.get("max_tool_calls", 150)
        if gemini_tools and not _budget_exceeded and not ag_state.get("force_finalize"):
            request_body["tools"] = gemini_tools
        elif _budget_exceeded or ag_state.get("force_finalize"):
@@ -6727,16 +6810,22 @@ class Handler(http.server.BaseHTTPRequestHandler):
                    cumulative_calls = ag_state.get("total_tool_calls", 0) + n_tool_calls
                    ag_state["total_tool_calls"] = cumulative_calls
-                    if cumulative_calls > _ANTIGRAVITY_MAX_TOOL_CALLS_PER_TASK:
+                    _mp_oa = _model_profile(model)
-                        print(f"[antigravity-budget] HARD CAP: {cumulative_calls} calls, injecting force-write", file=sys.stderr)
+                    _mp_max = _mp_oa["max_tool_calls"]
                    _mp_warn = _mp_oa["warn_tool_calls"]
                    _mp_maxr = _mp_oa["max_reads_no_write"]
                    _mp_warnr = _mp_oa["warn_reads_no_write"]
                    if cumulative_calls > _mp_max:
                        print(f"[antigravity-budget] HARD CAP: {cumulative_calls}/{_mp_max} calls (model={model}), injecting force-write", file=sys.stderr)
                        contents.append({"role": "user", "parts": [{"text":
                            f"CRITICAL BUDGET LIMIT: {cumulative_calls} tool calls. "
                            f"STOP ALL TOOL CALLS. Write your FINAL answer now. "
                            f"Apply any edits using exec_command with a write command in this response."}]})
-                    elif cumulative_calls > _ANTIGRAVITY_WARN_TOOL_CALLS_PER_TASK:
+                    elif cumulative_calls > _mp_warn:
                        contents.append({"role": "user", "parts": [{"text":
                            f"WARNING: {cumulative_calls} tool calls. "
-                            f"{_ANTIGRAVITY_MAX_TOOL_CALLS_PER_TASK - cumulative_calls} remaining. "
+                            f"{_mp_max - cumulative_calls} remaining. "
                            f"STOP READING AND WRITE NOW."}]})
                    with _ANTIGRAVITY_LOOP_TRACKER_LOCK:
@@ -6758,10 +6847,10 @@ class Handler(http.server.BaseHTTPRequestHandler):
                                    ft["reads"] += 1
                        n_reads = ft["reads"]
                        n_writes = ft["writes"]
-                        if n_reads >= 12 and n_writes == 0:
+                        if n_reads >= _mp_maxr and n_writes == 0:
                            ag_state["force_finalize"] = True
-                            print(f"[antigravity-loop] READ-WRITE IMBALANCE: {n_reads} reads, {n_writes} writes — model never writes", file=sys.stderr)
+                            print(f"[antigravity-loop] READ-WRITE IMBALANCE: {n_reads} reads, {n_writes} writes (model={model}, limit={_mp_maxr})", file=sys.stderr)
-                        elif n_reads >= 8 and n_writes == 0 and not ag_state.get("force_finalize"):
+                        elif n_reads >= _mp_warnr and n_writes == 0 and not ag_state.get("force_finalize"):
                            contents.append({"role": "user", "parts": [{"text":
                                f"WARNING: You have made {n_reads} tool calls and ZERO writes. "
                                f"You MUST apply your edit NOW using exec_command with a python write. "
@@ -6851,7 +6940,7 @@ class Handler(http.server.BaseHTTPRequestHandler):
            request_body["systemInstruction"] = {"parts": system_parts}
        if gen_config:
            request_body["generationConfig"] = gen_config
-        _budget_exceeded_oa = ag_state.get("total_tool_calls", 0) > _ANTIGRAVITY_MAX_TOOL_CALLS_PER_TASK
+        _budget_exceeded_oa = ag_state.get("total_tool_calls", 0) > _mp_oa.get("max_tool_calls", 150)
        if gemini_tools and not _budget_exceeded_oa and not ag_state.get("force_finalize"):
            request_body["tools"] = gemini_tools
        elif _budget_exceeded_oa or ag_state.get("force_finalize"):