diff --git a/CHANGELOG.md b/CHANGELOG.md index a8385b0..135230c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,14 @@ # Changelog +## v3.11.8 (2026-05-26) + +**Vision Cache Persistence, PR #8 Merge** + +### New Features + +- **Vision description cache persisted across requests**: Image descriptions from the vision fallback API are now cached in a file (`~/.cache/codex-proxy/vision-cache.json`) so the same image URL is never described twice — saves API calls and latency +- **Merge PR #8**: `fix: persist vision description cache across requests` (cobra91) + ## v3.11.7 (2026-05-26) **Vision Auto-Detect, Proactive Non-Vision Model Detection, Unit Tests, Bug Fixes** diff --git a/codex-launcher_3.11.7_all.deb b/codex-launcher_3.11.7_all.deb deleted file mode 100644 index d1fea6a..0000000 Binary files a/codex-launcher_3.11.7_all.deb and /dev/null differ diff --git a/codex-launcher_3.11.8_all.deb b/codex-launcher_3.11.8_all.deb new file mode 100644 index 0000000..126ad27 Binary files /dev/null and b/codex-launcher_3.11.8_all.deb differ diff --git a/src/codex-launcher-gui b/src/codex-launcher-gui index fa8dbef..6f0dc00 100755 --- a/src/codex-launcher-gui +++ b/src/codex-launcher-gui @@ -27,6 +27,10 @@ model_catalog_json = "" """ CHANGELOG = [ + ("3.11.8", "2026-05-26", [ + "Vision cache persisted across requests (PR #8 merge)", + "No redundant vision API calls for same image URL", + ]), ("3.11.7", "2026-05-26", [ "Vision auto-detect: uses provider's vision model for image description", "Vision preprocessing replaces image stripping", diff --git a/src/codex_launcher_lib.py b/src/codex_launcher_lib.py index c7b6e05..5bedfcc 100644 --- a/src/codex_launcher_lib.py +++ b/src/codex_launcher_lib.py @@ -83,6 +83,10 @@ model_catalog_json = "" """ CHANGELOG = [ + ("3.11.8", "2026-05-26", [ + "Vision description cache persisted across requests (no redundant API calls for same image)", + "Merge PR #8: fix vision cache persistence across requests", + ]), ("3.11.7", "2026-05-26", 
[ "Vision auto-detect: uses provider's own vision model (e.g. 0G-Qwen-VL) as fallback for image description", "Vision preprocessing replaces image stripping: images described via API instead of just removed", diff --git a/src/translate-proxy.py b/src/translate-proxy.py index e4c3128..ecb13a2 100755 --- a/src/translate-proxy.py +++ b/src/translate-proxy.py @@ -2407,116 +2407,43 @@ def _mark_vision_fail(model): with _vision_fail_lock: _vision_fail_cache.add(model) -def _vision_describe_image(img_data, cache): - """Call vision fallback API to describe a single image.""" - if not VISION_FALLBACK_URL: - return None - if isinstance(img_data, dict): - img_url = img_data.get("url", "") - if not img_url: - inner = img_data.get("image_url", img_data) - img_url = inner.get("url", "") if isinstance(inner, dict) else str(inner) - else: - img_url = str(img_data) - if not img_url: - return None - img_hash = hashlib.md5(img_url.encode("utf-8", errors="replace")).hexdigest() - if img_hash in cache: - return cache[img_hash] - try: - payload = json.dumps({ - "model": VISION_FALLBACK_MODEL, - "messages": [{"role": "user", "content": [ - {"type": "text", "text": "Describe the content of this image in detail. 
If it contains text, transcribe it fully."}, - {"type": "image_url", "image_url": {"url": img_url}}, - ]}], - "max_tokens": 1024, - "stream": False, - }).encode() - headers = {"Content-Type": "application/json"} - if VISION_FALLBACK_KEY: - headers["Authorization"] = f"Bearer {VISION_FALLBACK_KEY}" - req = urllib.request.Request(VISION_FALLBACK_URL, data=payload, headers=headers) - resp = urllib.request.urlopen(req, timeout=30) - body = json.loads(resp.read().decode()) - choices = body.get("choices", []) - if choices: - msg = choices[0].get("message", {}) - desc = msg.get("content", "") - if desc: - cache[img_hash] = desc - return desc - except Exception as e: - print(f"[vision-fallback] error describing image: {e}", file=sys.stderr) - return None - - -def _preprocess_vision(messages, schema): - """Replace image blocks with text descriptions when provider lacks vision support.""" - if schema.supports_vision: - return messages - cache = {} - for msg in messages: - content = msg.get("content") - if not isinstance(content, list): - continue - new_parts = [] - changed = False - for part in content: - if isinstance(part, dict) and part.get("type") in ("image_url", "input_image"): - changed = True - img_data = part.get("image_url", part) - description = _vision_describe_image(img_data, cache) - if description: - new_parts.append({"type": "text", "text": f"[Image: {description}]"}) - else: - new_parts.append({"type": "text", "text": "[Image: description unavailable - text-only model]"}) - else: - new_parts.append(part) - if changed: - msg["content"] = new_parts - return messages - - -def _preprocess_vision_input(input_data, schema): - """Replace input_image blocks in Responses API input format with text descriptions.""" - if schema.supports_vision: +def _strip_images_from_input(input_data, model): + if not isinstance(input_data, list) or _model_supports_vision(model): return input_data - if not isinstance(input_data, list): - return input_data - cache = {} - changed_any = 
False + modified = False + result = [] for item in input_data: if item.get("type") != "message": + result.append(item) continue - content = item.get("content") - if not isinstance(content, list): + content = item.get("content", []) + if isinstance(content, str): + result.append(item) continue - new_parts = [] - changed = False + new_content = [] + has_img = False for part in content: - if isinstance(part, dict) and part.get("type") in ("input_image", "image_url"): - changed = True - img_url = "" - iu = part.get("image_url") - if isinstance(iu, dict): - img_url = iu.get("url", "") - elif isinstance(iu, str): - img_url = iu - elif part.get("type") == "input_image": - img_url = part.get("url", "") - else: - img_url = part.get("url", "") - desc = _vision_describe_image({"url": img_url}, cache) - if desc: - new_parts.append({"type": "input_text", "text": f"[Image: {desc}]"}) - else: - new_parts.append({"type": "input_text", "text": "[Image: description unavailable - text-only model]"}) + if isinstance(part, str): + new_content.append(part) + continue + pt = part.get("type", "") + if pt in ("input_image", "image_url"): + if not has_img: + fname = part.get("image_url", {}).get("url", part.get("url", "image.png")) + if fname.startswith("data:"): + fname = "screenshot.png" + new_content.append({"type": "output_text", "text": f"[User attached image: {fname} — this model does not support vision]"}) + has_img = True + modified = True else: - new_parts.append(part) - if changed: - item["content"] = new_parts - changed_any = True + new_content.append(part) + if modified: + result.append({**item, "content": new_content}) + else: + result.append(item) + if modified: + print(f"[vision-filter] stripped {sum(1 for i in input_data if i.get('type')=='message' and any(c.get('type') in ('input_image','image_url') for c in (i.get('content') or []) if isinstance(c,dict)))} images for model={model}", file=sys.stderr) + return result return input_data def oa_input_to_messages(input_data): @@ 
-4585,6 +4512,148 @@ def _extract_text(content): return "".join(parts) +# Persistent cache: image hash → description (survives across requests) +_vision_desc_cache = collections.OrderedDict() +_vision_desc_lock = threading.Lock() +_VISION_DESC_CACHE_MAX = 256 + + +def _vision_describe_image(img_data): + """Call vision fallback API to describe a single image. + + Uses a module-level bounded cache (oldest-inserted entry evicted first) so descriptions survive across requests. + A single image in a multi-turn conversation is only described once. + + Returns: + description string or None on failure + """ + global _vision_desc_cache + + if not VISION_FALLBACK_URL: + return None + + # Normalize image URL from various formats + if isinstance(img_data, dict): + img_url = img_data.get("url", "") + if not img_url: + inner = img_data.get("image_url", img_data) + img_url = inner.get("url", "") if isinstance(inner, dict) else str(inner) + else: + img_url = str(img_data) + + if not img_url: + return None + + img_hash = hashlib.md5(img_url.encode("utf-8", errors="replace")).hexdigest() + + # Check persistent cache first (no API call needed) + with _vision_desc_lock: + if img_hash in _vision_desc_cache: + return _vision_desc_cache[img_hash] + + try: + payload = json.dumps({ + "model": VISION_FALLBACK_MODEL, + "messages": [{"role": "user", "content": [ + {"type": "text", "text": "Describe the content of this image in detail. 
If it contains text, transcribe it fully."}, + {"type": "image_url", "image_url": {"url": img_url}}, + ]}], + "max_tokens": 1024, + "stream": False, + }).encode() + + headers = {"Content-Type": "application/json"} + if VISION_FALLBACK_KEY: + headers["Authorization"] = f"Bearer {VISION_FALLBACK_KEY}" + + req = urllib.request.Request(VISION_FALLBACK_URL, data=payload, headers=headers) + resp = urllib.request.urlopen(req, timeout=30) + body = json.loads(resp.read().decode()) + + choices = body.get("choices", []) + if choices: + msg = choices[0].get("message", {}) + desc = msg.get("content", "") + if desc: + with _vision_desc_lock: + _vision_desc_cache[img_hash] = desc + if len(_vision_desc_cache) > _VISION_DESC_CACHE_MAX: + _vision_desc_cache.popitem(last=False) + return desc + except Exception as e: + print(f"[vision-fallback] error describing image: {e}", file=sys.stderr) + + return None + + +def _preprocess_vision(messages, schema): + """Replace image blocks with text descriptions when provider lacks vision support. + + Works on OpenAI Chat Completions message format (post-conversion). + """ + if schema.supports_vision: + return messages + + for msg in messages: + content = msg.get("content") + if not isinstance(content, list): + continue + new_parts = [] + changed = False + for part in content: + if isinstance(part, dict) and part.get("type") in ("image_url", "input_image"): + changed = True + img_data = part.get("image_url", part) + description = _vision_describe_image(img_data) + if description: + new_parts.append({"type": "text", "text": f"[Image: {description}]"}) + else: + new_parts.append({"type": "text", "text": "[Image: description non disponible - modele text-only]"}) + else: + new_parts.append(part) + if changed: + msg["content"] = new_parts + + return messages + + +def _preprocess_vision_input(input_data, schema): + """Replace input_image blocks in Responses API input format with text descriptions. 
+ + This runs BEFORE adapter.convert() so images are replaced before any + conversion function can silently drop them. + """ + if schema.supports_vision: + return input_data + if not isinstance(input_data, list): + return input_data + + changed_any = False + + for item in input_data: + if item.get("type") != "message": + continue + content = item.get("content") + if not isinstance(content, list): + continue + new_parts = [] + changed = False + for part in content: + if isinstance(part, dict) and part.get("type") == "input_image": + changed = True + changed_any = True + img_data = part.get("image_url", part) + description = _vision_describe_image(img_data) + if description: + new_parts.append({"type": "input_text", "text": f"[Image: {description}]"}) + else: + new_parts.append({"type": "input_text", "text": "[Image: description non disponible - modele text-only]"}) + else: + new_parts.append(part) + if changed: + item["content"] = new_parts + + return input_data # ═══════════════════════════════════════════════════════════════════