v3.11.8: fix _XML_TC_RE regex (was crashing Antigravity XML tool-call extraction)

v3.11.8: fix _XML_TC_RE regex bug, fix test paths, 177/177 tests pass
v3.11.8: vision cache persistence (PR #8 merge)
2026-05-26 20:25:28 +04:00 · 2026-05-26 20:21:31 +04:00 · 2026-05-26 19:20:02 +04:00 · 2026-05-26 19:15:48 +04:00
7 changed files with 205 additions and 117 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,14 @@
 # Changelog

+## v3.11.8 (2026-05-26)
+
+**Vision Cache Persistence, PR #8 Merge**
+
+### New Features
+
+- **Vision description cache persisted across requests**: Image descriptions from the vision fallback API are now kept in a process-wide, bounded in-memory cache (up to 256 entries) instead of a per-request dict, so the same image URL is not described again on later requests — saves API calls and latency
+- **Merge PR #8**: `fix: persist vision description cache across requests` (cobra91)
+
 ## v3.11.7 (2026-05-26)

 **Vision Auto-Detect, Proactive Non-Vision Model Detection, Unit Tests, Bug Fixes**
--- a/codex-launcher_3.11.7_all.deb
+++ b/codex-launcher_3.11.7_all.deb
--- a/codex-launcher_3.11.8_all.deb
+++ b/codex-launcher_3.11.8_all.deb
--- a/src/codex-launcher-gui
+++ b/src/codex-launcher-gui
@@ -27,6 +27,10 @@ model_catalog_json = ""
 """

 CHANGELOG = [
+    ("3.11.8", "2026-05-26", [
+        "Vision cache persisted across requests (PR #8 merge)",
+        "No redundant vision API calls for same image URL",
+    ]),
    ("3.11.7", "2026-05-26", [
        "Vision auto-detect: uses provider's vision model for image description",
        "Vision preprocessing replaces image stripping",
--- a/src/codex_launcher_lib.py
+++ b/src/codex_launcher_lib.py
@@ -83,6 +83,10 @@ model_catalog_json = ""
 """

 CHANGELOG = [
+    ("3.11.8", "2026-05-26", [
+        "Vision description cache persisted across requests (no redundant API calls for same image)",
+        "Merge PR #8: fix vision cache persistence across requests",
+    ]),
    ("3.11.7", "2026-05-26", [
        "Vision auto-detect: uses provider's own vision model (e.g. 0G-Qwen-VL) as fallback for image description",
        "Vision preprocessing replaces image stripping: images described via API instead of just removed",
--- a/src/translate-proxy.py
+++ b/src/translate-proxy.py
@@ -868,6 +868,10 @@ def _auto_detect_vision_fallback(target_url, api_key, models):
        chat_url = base + "/v1/chat/completions"
    vision_model = ""
    for m in (models or []):
+        if isinstance(m, dict):
+            m = m.get("name", m.get("id", str(m)))
+        if not isinstance(m, str):
+            continue
        ml = m.lower()
        if any(kw in ml for kw in _VISION_MODEL_KEYWORDS):
            vision_model = m
@@ -2346,7 +2350,7 @@ def _normalize_tool_args(raw_args):
    except json.JSONDecodeError:
        return raw_args

-_XML_TC_RE = re.compile(r'exec_command(.*?)</invoke>', re.DOTALL)
+_XML_TC_RE = re.compile(r'<invoke><(\w+)(?:_command)?>(.*?)</\1(?:_command)?></invoke>', re.DOTALL)
 _XML_ARG_VALUE_RE = re.compile(r'</?arg_value>\s*')

 _PAREN_TC_RE = re.compile(
@@ -2403,116 +2407,43 @@ def _mark_vision_fail(model):
        with _vision_fail_lock:
            _vision_fail_cache.add(model)

-def _vision_describe_image(img_data, cache):
-    """Call vision fallback API to describe a single image."""
-    if not VISION_FALLBACK_URL:
-        return None
-    if isinstance(img_data, dict):
-        img_url = img_data.get("url", "")
-        if not img_url:
-            inner = img_data.get("image_url", img_data)
-            img_url = inner.get("url", "") if isinstance(inner, dict) else str(inner)
-    else:
-        img_url = str(img_data)
-    if not img_url:
-        return None
-    img_hash = hashlib.md5(img_url.encode("utf-8", errors="replace")).hexdigest()
-    if img_hash in cache:
-        return cache[img_hash]
-    try:
-        payload = json.dumps({
-            "model": VISION_FALLBACK_MODEL,
-            "messages": [{"role": "user", "content": [
-                {"type": "text", "text": "Describe the content of this image in detail. If it contains text, transcribe it fully."},
-                {"type": "image_url", "image_url": {"url": img_url}},
-            ]}],
-            "max_tokens": 1024,
-            "stream": False,
-        }).encode()
-        headers = {"Content-Type": "application/json"}
-        if VISION_FALLBACK_KEY:
-            headers["Authorization"] = f"Bearer {VISION_FALLBACK_KEY}"
-        req = urllib.request.Request(VISION_FALLBACK_URL, data=payload, headers=headers)
-        resp = urllib.request.urlopen(req, timeout=30)
-        body = json.loads(resp.read().decode())
-        choices = body.get("choices", [])
-        if choices:
-            msg = choices[0].get("message", {})
-            desc = msg.get("content", "")
-            if desc:
-                cache[img_hash] = desc
-                return desc
-    except Exception as e:
-        print(f"[vision-fallback] error describing image: {e}", file=sys.stderr)
-    return None
-
-
-def _preprocess_vision(messages, schema):
-    """Replace image blocks with text descriptions when provider lacks vision support."""
-    if schema.supports_vision:
-        return messages
-    cache = {}
-    for msg in messages:
-        content = msg.get("content")
-        if not isinstance(content, list):
-            continue
-        new_parts = []
-        changed = False
-        for part in content:
-            if isinstance(part, dict) and part.get("type") in ("image_url", "input_image"):
-                changed = True
-                img_data = part.get("image_url", part)
-                description = _vision_describe_image(img_data, cache)
-                if description:
-                    new_parts.append({"type": "text", "text": f"[Image: {description}]"})
-                else:
-                    new_parts.append({"type": "text", "text": "[Image: description unavailable - text-only model]"})
-            else:
-                new_parts.append(part)
-        if changed:
-            msg["content"] = new_parts
-    return messages
-
-
-def _preprocess_vision_input(input_data, schema):
-    """Replace input_image blocks in Responses API input format with text descriptions."""
-    if schema.supports_vision:
+def _strip_images_from_input(input_data, model):
+    if not isinstance(input_data, list) or _model_supports_vision(model):
        return input_data
-    if not isinstance(input_data, list):
-        return input_data
-    cache = {}
-    changed_any = False
+    modified = False
+    result = []
    for item in input_data:
        if item.get("type") != "message":
+            result.append(item)
            continue
-        content = item.get("content")
-        if not isinstance(content, list):
+        content = item.get("content", [])
+        if isinstance(content, str):
+            result.append(item)
            continue
-        new_parts = []
-        changed = False
+        new_content = []
+        has_img = False
        for part in content:
-            if isinstance(part, dict) and part.get("type") in ("input_image", "image_url"):
-                changed = True
-                img_url = ""
-                iu = part.get("image_url")
-                if isinstance(iu, dict):
-                    img_url = iu.get("url", "")
-                elif isinstance(iu, str):
-                    img_url = iu
-                elif part.get("type") == "input_image":
-                    img_url = part.get("url", "")
-                else:
-                    img_url = part.get("url", "")
-                desc = _vision_describe_image({"url": img_url}, cache)
-                if desc:
-                    new_parts.append({"type": "input_text", "text": f"[Image: {desc}]"})
-                else:
-                    new_parts.append({"type": "input_text", "text": "[Image: description unavailable - text-only model]"})
+            if isinstance(part, str):
+                new_content.append(part)
+                continue
+            pt = part.get("type", "")
+            if pt in ("input_image", "image_url"):
+                if not has_img:
+                    fname = part.get("image_url", {}).get("url", part.get("url", "image.png"))
+                    if fname.startswith("data:"):
+                        fname = "screenshot.png"
+                    new_content.append({"type": "output_text", "text": f"[User attached image: {fname} — this model does not support vision]"})
+                    has_img = True
+                    modified = True
            else:
-                new_parts.append(part)
-        if changed:
-            item["content"] = new_parts
-            changed_any = True
+                new_content.append(part)
+        if modified:
+            result.append({**item, "content": new_content})
+        else:
+            result.append(item)
+    if modified:
+        print(f"[vision-filter] stripped {sum(1 for i in input_data if i.get('type')=='message' and any(c.get('type') in ('input_image','image_url') for c in (i.get('content') or []) if isinstance(c,dict)))} images for model={model}", file=sys.stderr)
+        return result
    return input_data

 def oa_input_to_messages(input_data):
@@ -4581,6 +4512,148 @@ def _extract_text(content):
    return "".join(parts)


+# Module-level in-memory cache: MD5 of image URL → description (shared across requests)
+_vision_desc_cache = collections.OrderedDict()
+_vision_desc_lock = threading.Lock()  # guards every read/write of _vision_desc_cache
+_VISION_DESC_CACHE_MAX = 256  # beyond this size the front entry of the OrderedDict is evicted
+
+
+def _vision_describe_image(img_data):
+    """Call the vision fallback API to describe a single image.
+
+    Descriptions are kept in a module-level LRU cache keyed by the MD5 of the
+    image URL, so an image repeated across requests is only described once.
+    ``img_data`` may be a URL string, ``{"url": ...}`` or ``{"image_url": ...}``.
+
+    Returns:
+        description string, or None when unconfigured, URL-less, or on failure
+    """
+    if not VISION_FALLBACK_URL:
+        return None
+
+    # Normalize image URL from various formats
+    if isinstance(img_data, dict):
+        img_url = img_data.get("url", "")
+        if not img_url:
+            inner = img_data.get("image_url", img_data)
+            img_url = inner.get("url", "") if isinstance(inner, dict) else str(inner or "")
+    else:
+        img_url = str(img_data)
+
+    if not img_url:
+        return None
+
+    img_hash = hashlib.md5(img_url.encode("utf-8", errors="replace")).hexdigest()
+
+    # Cache hit: no API call; refresh recency so eviction is truly LRU, not FIFO
+    with _vision_desc_lock:
+        if img_hash in _vision_desc_cache:
+            _vision_desc_cache.move_to_end(img_hash)
+            return _vision_desc_cache[img_hash]
+
+    try:
+        payload = json.dumps({
+            "model": VISION_FALLBACK_MODEL,
+            "messages": [{"role": "user", "content": [
+                {"type": "text", "text": "Describe the content of this image in detail. If it contains text, transcribe it fully."},
+                {"type": "image_url", "image_url": {"url": img_url}},
+            ]}],
+            "max_tokens": 1024,
+            "stream": False,
+        }).encode()
+
+        headers = {"Content-Type": "application/json"}
+        if VISION_FALLBACK_KEY:
+            headers["Authorization"] = f"Bearer {VISION_FALLBACK_KEY}"
+
+        req = urllib.request.Request(VISION_FALLBACK_URL, data=payload, headers=headers)
+        with urllib.request.urlopen(req, timeout=30) as resp:  # always release the connection
+            body = json.loads(resp.read().decode())
+
+        choices = body.get("choices", [])
+        if choices:
+            msg = choices[0].get("message", {})
+            desc = msg.get("content", "")
+            if desc:
+                with _vision_desc_lock:
+                    _vision_desc_cache[img_hash] = desc
+                    if len(_vision_desc_cache) > _VISION_DESC_CACHE_MAX:
+                        _vision_desc_cache.popitem(last=False)
+                return desc
+    except Exception as e:
+        print(f"[vision-fallback] error describing image: {e}", file=sys.stderr)
+
+    return None
+
+
+def _preprocess_vision(messages, schema):
+    """Replace image blocks with text descriptions when provider lacks vision support.
+
+    Works on OpenAI Chat Completions message format (post-conversion). Only
+    list-valued "content" is touched; messages are edited in place and returned.
+    """
+    if schema.supports_vision:
+        return messages
+
+    for msg in messages:
+        content = msg.get("content")
+        if not isinstance(content, list):
+            continue
+        new_parts = []
+        changed = False
+        for part in content:
+            if isinstance(part, dict) and part.get("type") in ("image_url", "input_image"):
+                changed = True
+                img_data = part.get("image_url", part)
+                description = _vision_describe_image(img_data)
+                # Fallback wording is English, consistent with the "[Image: ...]" wrapper
+                text = description or "description unavailable - text-only model"
+                new_parts.append({"type": "text", "text": f"[Image: {text}]"})
+            else:
+                new_parts.append(part)
+        if changed:
+            msg["content"] = new_parts
+
+    return messages
+
+
+def _preprocess_vision_input(input_data, schema):
+    """Replace image blocks in Responses API input format with text descriptions.
+
+    This runs BEFORE adapter.convert() so images are replaced before any
+    conversion function can silently drop them. Both "input_image" and
+    "image_url" parts are replaced, mirroring _preprocess_vision, and each
+    placeholder is emitted as an "input_text" part. Items are edited in place
+    and the same list is returned.
+    """
+    if schema.supports_vision:
+        return input_data
+    if not isinstance(input_data, list):
+        return input_data
+
+    for item in input_data:
+        # Skip bare strings / non-message items instead of crashing on .get()
+        if not isinstance(item, dict) or item.get("type") != "message":
+            continue
+        content = item.get("content")
+        if not isinstance(content, list):
+            continue
+        new_parts = []
+        changed = False
+        for part in content:
+            if isinstance(part, dict) and part.get("type") in ("input_image", "image_url"):
+                changed = True
+                img_data = part.get("image_url", part)
+                description = _vision_describe_image(img_data)
+                # Fallback wording is English, consistent with the "[Image: ...]" wrapper
+                text = description or "description unavailable - text-only model"
+                new_parts.append({"type": "input_text", "text": f"[Image: {text}]"})
+            else:
+                new_parts.append(part)
+        if changed:
+            item["content"] = new_parts
+
+    return input_data


 # ═══════════════════════════════════════════════════════════════════
--- a/tests/test_translate_proxy.py
+++ b/tests/test_translate_proxy.py
@@ -6,6 +6,7 @@ Uses only stdlib unittest + unittest.mock (zero pip dependencies).
 """

 import json
+import os
 import sys
 import time
 import unittest
@@ -19,7 +20,7 @@ import importlib

 _spec = importlib.util.spec_from_file_location(
    "translate_proxy",
-    r"C:\dev\Codex-Launcher---Any-AI-Porovider\src\translate-proxy.py",
+    os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "src", "translate-proxy.py"),
 )
 tp = importlib.util.module_from_spec(_spec)
 _spec.loader.exec_module(tp)
@@ -121,36 +122,33 @@ class TestExtractXmlToolCalls(unittest.TestCase):
        self.assertEqual(tp._extract_xml_tool_calls("just plain text"), [])

    def test_single_tool_call(self):
-        # Regex: <tool_call>(\w+)(.*?)</tool_call>
-        # Format: <tool_call>NAME>CONTENT</tool_call>
-        text = '<tool_call>bash>echo hi</tool_call>'
+        text = '<invoke><exec_command>echo hi</exec_command></invoke>'
        results = tp._extract_xml_tool_calls(text)
        self.assertEqual(len(results), 1)
-        self.assertEqual(results[0]["name"], "bash")
+        self.assertEqual(results[0]["name"], "exec_command")
        self.assertIn("call_id", results[0])
        self.assertTrue(results[0]["call_id"].startswith("xml_"))

    def test_multiple_tool_calls(self):
        text = (
-            '<tool_call>bash>echo hi</tool_call>'
-            '<tool_call>edit>test.py</tool_call>'
+            '<invoke><exec_command>echo hi</exec_command></invoke>'
+            '<invoke><exec_command>test.py</exec_command></invoke>'
        )
        results = tp._extract_xml_tool_calls(text)
        self.assertEqual(len(results), 2)
-        self.assertEqual(results[0]["name"], "bash")
-        self.assertEqual(results[1]["name"], "edit")
+        self.assertEqual(results[0]["name"], "exec_command")
+        self.assertEqual(results[1]["name"], "exec_command")

    def test_json_args(self):
-        text = '<tool_call>tool>{"key": "value"}</tool_call>'
+        text = '<invoke><exec_command>{"key": "value"}</exec_command></invoke>'
        results = tp._extract_xml_tool_calls(text)
        self.assertEqual(len(results), 1)
-        self.assertEqual(results[0]["name"], "tool")
+        self.assertEqual(results[0]["name"], "exec_command")
        args = json.loads(results[0]["args"])
-        # JSON parsing of XML content may vary - just check result exists
        self.assertIn("args", results[0])

    def test_code_fenced_args(self):
-        text = '<tool_call>tool>{"a": 1}</tool_call>'
+        text = '<invoke><exec_command>{"a": 1}</exec_command></invoke>'
        results = tp._extract_xml_tool_calls(text)
        self.assertEqual(len(results), 1)