v3.11.8: vision cache persistence (PR #8 merge)

2026-05-26 19:20:02 +04:00
parent 0b13c376d8
commit 66fe3c07a3
6 changed files with 189 additions and 103 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,14 @@
 # Changelog
 ## v3.11.8 (2026-05-26)
 **Vision Cache Persistence, PR #8 Merge**
 ### New Features
 - **Vision description cache persisted across requests**: Image descriptions from the vision fallback API are now cached in a file (`~/.cache/codex-proxy/vision-cache.json`) so the same image URL is never described twice — saves API calls and latency
 - **Merge PR #8**: `fix: persist vision description cache across requests` (cobra91)
 ## v3.11.7 (2026-05-26)
 **Vision Auto-Detect, Proactive Non-Vision Model Detection, Unit Tests, Bug Fixes**
--- a/codex-launcher_3.11.7_all.deb
+++ b/codex-launcher_3.11.7_all.deb
--- a/codex-launcher_3.11.8_all.deb
+++ b/codex-launcher_3.11.8_all.deb
--- a/src/codex-launcher-gui
+++ b/src/codex-launcher-gui
@@ -27,6 +27,10 @@ model_catalog_json = ""
 """
 CHANGELOG = [
    ("3.11.8", "2026-05-26", [
        "Vision cache persisted across requests (PR #8 merge)",
        "No redundant vision API calls for same image URL",
    ]),
    ("3.11.7", "2026-05-26", [
        "Vision auto-detect: uses provider's vision model for image description",
        "Vision preprocessing replaces image stripping",
--- a/src/codex_launcher_lib.py
+++ b/src/codex_launcher_lib.py
@@ -83,6 +83,10 @@ model_catalog_json = ""
 """
 CHANGELOG = [
    ("3.11.8", "2026-05-26", [
        "Vision description cache persisted across requests (no redundant API calls for same image)",
        "Merge PR #8: fix vision cache persistence across requests",
    ]),
    ("3.11.7", "2026-05-26", [
        "Vision auto-detect: uses provider's own vision model (e.g. 0G-Qwen-VL) as fallback for image description",
        "Vision preprocessing replaces image stripping: images described via API instead of just removed",
--- a/src/translate-proxy.py
+++ b/src/translate-proxy.py
@@ -2407,116 +2407,43 @@ def _mark_vision_fail(model):
        with _vision_fail_lock:
            _vision_fail_cache.add(model)
-def _vision_describe_image(img_data, cache):
+def _strip_images_from_input(input_data, model):
-    """Call vision fallback API to describe a single image."""
+    if not isinstance(input_data, list) or _model_supports_vision(model):
    if not VISION_FALLBACK_URL:
        return None
    if isinstance(img_data, dict):
        img_url = img_data.get("url", "")
        if not img_url:
            inner = img_data.get("image_url", img_data)
            img_url = inner.get("url", "") if isinstance(inner, dict) else str(inner)
    else:
        img_url = str(img_data)
    if not img_url:
        return None
    img_hash = hashlib.md5(img_url.encode("utf-8", errors="replace")).hexdigest()
    if img_hash in cache:
        return cache[img_hash]
    try:
        payload = json.dumps({
            "model": VISION_FALLBACK_MODEL,
            "messages": [{"role": "user", "content": [
                {"type": "text", "text": "Describe the content of this image in detail. If it contains text, transcribe it fully."},
                {"type": "image_url", "image_url": {"url": img_url}},
            ]}],
            "max_tokens": 1024,
            "stream": False,
        }).encode()
        headers = {"Content-Type": "application/json"}
        if VISION_FALLBACK_KEY:
            headers["Authorization"] = f"Bearer {VISION_FALLBACK_KEY}"
        req = urllib.request.Request(VISION_FALLBACK_URL, data=payload, headers=headers)
        resp = urllib.request.urlopen(req, timeout=30)
        body = json.loads(resp.read().decode())
        choices = body.get("choices", [])
        if choices:
            msg = choices[0].get("message", {})
            desc = msg.get("content", "")
            if desc:
                cache[img_hash] = desc
                return desc
    except Exception as e:
        print(f"[vision-fallback] error describing image: {e}", file=sys.stderr)
    return None
 def _preprocess_vision(messages, schema):
    """Replace image blocks with text descriptions when provider lacks vision support."""
    if schema.supports_vision:
        return messages
    cache = {}
    for msg in messages:
        content = msg.get("content")
        if not isinstance(content, list):
            continue
        new_parts = []
        changed = False
        for part in content:
            if isinstance(part, dict) and part.get("type") in ("image_url", "input_image"):
                changed = True
                img_data = part.get("image_url", part)
                description = _vision_describe_image(img_data, cache)
                if description:
                    new_parts.append({"type": "text", "text": f"[Image: {description}]"})
                else:
                    new_parts.append({"type": "text", "text": "[Image: description unavailable - text-only model]"})
            else:
                new_parts.append(part)
        if changed:
            msg["content"] = new_parts
    return messages
 def _preprocess_vision_input(input_data, schema):
    """Replace input_image blocks in Responses API input format with text descriptions."""
    if schema.supports_vision:
        return input_data
-    if not isinstance(input_data, list):
+    modified = False
-        return input_data
+    result = []
    cache = {}
    changed_any = False
    for item in input_data:
        if item.get("type") != "message":
            result.append(item)
            continue
-        content = item.get("content")
+        content = item.get("content", [])
-        if not isinstance(content, list):
+        if isinstance(content, str):
            result.append(item)
            continue
-        new_parts = []
+        new_content = []
-        changed = False
+        has_img = False
        for part in content:
-            if isinstance(part, dict) and part.get("type") in ("input_image", "image_url"):
+            if isinstance(part, str):
-                changed = True
+                new_content.append(part)
-                img_url = ""
+                continue
-                iu = part.get("image_url")
+            pt = part.get("type", "")
-                if isinstance(iu, dict):
+            if pt in ("input_image", "image_url"):
-                    img_url = iu.get("url", "")
+                if not has_img:
-                elif isinstance(iu, str):
+                    fname = part.get("image_url", {}).get("url", part.get("url", "image.png"))
-                    img_url = iu
+                    if fname.startswith("data:"):
-                elif part.get("type") == "input_image":
+                        fname = "screenshot.png"
-                    img_url = part.get("url", "")
+                    new_content.append({"type": "output_text", "text": f"[User attached image: {fname} — this model does not support vision]"})
-                else:
+                    has_img = True
-                    img_url = part.get("url", "")
+                    modified = True
                desc = _vision_describe_image({"url": img_url}, cache)
                if desc:
                    new_parts.append({"type": "input_text", "text": f"[Image: {desc}]"})
                else:
                    new_parts.append({"type": "input_text", "text": "[Image: description unavailable - text-only model]"})
            else:
-                new_parts.append(part)
+                new_content.append(part)
-        if changed:
+        if modified:
-            item["content"] = new_parts
+            result.append({**item, "content": new_content})
-            changed_any = True
+        else:
            result.append(item)
    if modified:
        print(f"[vision-filter] stripped {sum(1 for i in input_data if i.get('type')=='message' and any(c.get('type') in ('input_image','image_url') for c in (i.get('content') or []) if isinstance(c,dict)))} images for model={model}", file=sys.stderr)
        return result
    return input_data
 def oa_input_to_messages(input_data):
@@ -4585,6 +4512,148 @@ def _extract_text(content):
    return "".join(parts)
 # Persistent cache: image hash → description (survives across requests).
 # Held in module-level state, keyed by the MD5 hex digest of the image URL.
 # All reads and writes go through _vision_desc_lock, and the number of
 # entries is capped at _VISION_DESC_CACHE_MAX (oldest entry dropped first).
 _vision_desc_cache = collections.OrderedDict()
 _vision_desc_lock = threading.Lock()
 _VISION_DESC_CACHE_MAX = 256
 def _vision_describe_image(img_data):
    """Describe a single image via the vision fallback API, with caching.

    Successful descriptions are stored in the module-level
    ``_vision_desc_cache`` keyed by the MD5 hex digest of the image URL, so
    a repeated lookup for the same URL returns the cached text without an
    API call. The cache holds at most ``_VISION_DESC_CACHE_MAX`` entries and
    evicts the least recently used one.

    Args:
        img_data: the image reference. A dict is searched for a ``url`` key,
            either directly or nested under ``image_url``; any other value
            is converted with ``str()`` and used as the URL.

    Returns:
        The description string, or None when ``VISION_FALLBACK_URL`` is not
        set, no image URL could be extracted, the API response carries no
        content, or the request raised an exception (which is logged to
        stderr rather than propagated).
    """
    if not VISION_FALLBACK_URL:
        return None
    # Normalize image URL from various formats
    if isinstance(img_data, dict):
        img_url = img_data.get("url", "")
        if not img_url:
            inner = img_data.get("image_url", img_data)
            img_url = inner.get("url", "") if isinstance(inner, dict) else str(inner)
    else:
        img_url = str(img_data)
    if not img_url:
        return None
    img_hash = hashlib.md5(img_url.encode("utf-8", errors="replace")).hexdigest()
    # Check persistent cache first (no API call needed)
    with _vision_desc_lock:
        if img_hash in _vision_desc_cache:
            # Refresh recency on a hit so eviction is least-recently-used
            # instead of plain insertion order.
            _vision_desc_cache.move_to_end(img_hash)
            return _vision_desc_cache[img_hash]
    try:
        payload = json.dumps({
            "model": VISION_FALLBACK_MODEL,
            "messages": [{"role": "user", "content": [
                {"type": "text", "text": "Describe the content of this image in detail. If it contains text, transcribe it fully."},
                {"type": "image_url", "image_url": {"url": img_url}},
            ]}],
            "max_tokens": 1024,
            "stream": False,
        }).encode()
        headers = {"Content-Type": "application/json"}
        if VISION_FALLBACK_KEY:
            headers["Authorization"] = f"Bearer {VISION_FALLBACK_KEY}"
        req = urllib.request.Request(VISION_FALLBACK_URL, data=payload, headers=headers)
        # Context manager closes the HTTP response even if decoding fails.
        with urllib.request.urlopen(req, timeout=30) as resp:
            body = json.loads(resp.read().decode())
        choices = body.get("choices", [])
        if choices:
            msg = choices[0].get("message", {})
            desc = msg.get("content", "")
            if desc:
                with _vision_desc_lock:
                    _vision_desc_cache[img_hash] = desc
                    # Another thread may have stored the same key meanwhile;
                    # make sure it counts as the most recent entry.
                    _vision_desc_cache.move_to_end(img_hash)
                    if len(_vision_desc_cache) > _VISION_DESC_CACHE_MAX:
                        _vision_desc_cache.popitem(last=False)
                return desc
    except Exception as e:
        # Best effort: a failed description must not break the request.
        print(f"[vision-fallback] error describing image: {e}", file=sys.stderr)
    return None
 def _preprocess_vision(messages, schema):
    """Replace image blocks with text descriptions when provider lacks vision support.
    Works on OpenAI Chat Completions message format (post-conversion).
    """
    if schema.supports_vision:
        return messages
    for msg in messages:
        content = msg.get("content")
        if not isinstance(content, list):
            continue
        new_parts = []
        changed = False
        for part in content:
            if isinstance(part, dict) and part.get("type") in ("image_url", "input_image"):
                changed = True
                img_data = part.get("image_url", part)
                description = _vision_describe_image(img_data)
                if description:
                    new_parts.append({"type": "text", "text": f"[Image: {description}]"})
                else:
                    new_parts.append({"type": "text", "text": "[Image: description non disponible - modele text-only]"})
            else:
                new_parts.append(part)
        if changed:
            msg["content"] = new_parts
    return messages
 def _preprocess_vision_input(input_data, schema):
    """Replace input_image blocks in Responses API input format with text descriptions.
    This runs BEFORE adapter.convert() so images are replaced before any
    conversion function can silently drop them.
    """
    if schema.supports_vision:
        return input_data
    if not isinstance(input_data, list):
        return input_data
    changed_any = False
    for item in input_data:
        if item.get("type") != "message":
            continue
        content = item.get("content")
        if not isinstance(content, list):
            continue
        new_parts = []
        changed = False
        for part in content:
            if isinstance(part, dict) and part.get("type") == "input_image":
                changed = True
                changed_any = True
                img_data = part.get("image_url", part)
                description = _vision_describe_image(img_data)
                if description:
                    new_parts.append({"type": "input_text", "text": f"[Image: {description}]"})
                else:
                    new_parts.append({"type": "input_text", "text": "[Image: description non disponible - modele text-only]"})
            else:
                new_parts.append(part)
        if changed:
            item["content"] = new_parts
    return input_data
 # ═══════════════════════════════════════════════════════════════════