v3.11.8: vision cache persistence (PR #8 merge)
This commit is contained in:
@@ -1,5 +1,14 @@
|
||||
# Changelog
|
||||
|
||||
## v3.11.8 (2026-05-26)
|
||||
|
||||
**Vision Cache Persistence, PR #8 Merge**
|
||||
|
||||
### New Features
|
||||
|
||||
- **Vision description cache persisted across requests**: Image descriptions from the vision fallback API are now kept in a process-wide in-memory cache (up to 256 entries) that is shared by all requests, so the same image URL is not described again while it remains cached — saves API calls and latency
|
||||
- **Merge PR #8**: `fix: persist vision description cache across requests` (cobra91)
|
||||
|
||||
## v3.11.7 (2026-05-26)
|
||||
|
||||
**Vision Auto-Detect, Proactive Non-Vision Model Detection, Unit Tests, Bug Fixes**
|
||||
|
||||
Binary file not shown.
BIN
codex-launcher_3.11.8_all.deb
Normal file
BIN
codex-launcher_3.11.8_all.deb
Normal file
Binary file not shown.
@@ -27,6 +27,10 @@ model_catalog_json = ""
|
||||
"""
|
||||
|
||||
CHANGELOG = [
|
||||
("3.11.8", "2026-05-26", [
|
||||
"Vision cache persisted across requests (PR #8 merge)",
|
||||
"No redundant vision API calls for same image URL",
|
||||
]),
|
||||
("3.11.7", "2026-05-26", [
|
||||
"Vision auto-detect: uses provider's vision model for image description",
|
||||
"Vision preprocessing replaces image stripping",
|
||||
|
||||
@@ -83,6 +83,10 @@ model_catalog_json = ""
|
||||
"""
|
||||
|
||||
CHANGELOG = [
|
||||
("3.11.8", "2026-05-26", [
|
||||
"Vision description cache persisted across requests (no redundant API calls for same image)",
|
||||
"Merge PR #8: fix vision cache persistence across requests",
|
||||
]),
|
||||
("3.11.7", "2026-05-26", [
|
||||
"Vision auto-detect: uses provider's own vision model (e.g. 0G-Qwen-VL) as fallback for image description",
|
||||
"Vision preprocessing replaces image stripping: images described via API instead of just removed",
|
||||
|
||||
@@ -2407,116 +2407,43 @@ def _mark_vision_fail(model):
|
||||
with _vision_fail_lock:
|
||||
_vision_fail_cache.add(model)
|
||||
|
||||
def _vision_describe_image(img_data, cache):
    """Call the vision fallback API to describe a single image.

    Args:
        img_data: A URL string, or a dict carrying the URL under ``"url"`` or
            under ``"image_url"`` (a string or a ``{"url": ...}`` dict).
        cache: Dict mapping the MD5 hex digest of an image URL to its
            description. It is consulted before calling the API and updated
            in place after a successful call.

    Returns:
        The description string, or None when no fallback endpoint is
        configured, no URL can be extracted, the API call fails, or the
        response carries no description.
    """
    if not VISION_FALLBACK_URL:
        return None

    # Normalize the image URL from the various shapes callers pass in.
    # None is treated as "no URL": str(None) would otherwise send the literal
    # text "None" to the vision API and cache whatever it answers.
    if isinstance(img_data, dict):
        img_url = img_data.get("url", "")
        if not img_url:
            inner = img_data.get("image_url")
            if isinstance(inner, dict):
                img_url = inner.get("url", "")
            else:
                img_url = "" if inner is None else str(inner)
    else:
        img_url = "" if img_data is None else str(img_data)

    if not img_url:
        return None

    # The URL may be a long data: URI, so the cache is keyed by its digest.
    img_hash = hashlib.md5(img_url.encode("utf-8", errors="replace")).hexdigest()
    if img_hash in cache:
        return cache[img_hash]

    try:
        payload = json.dumps({
            "model": VISION_FALLBACK_MODEL,
            "messages": [{"role": "user", "content": [
                {"type": "text", "text": "Describe the content of this image in detail. If it contains text, transcribe it fully."},
                {"type": "image_url", "image_url": {"url": img_url}},
            ]}],
            "max_tokens": 1024,
            "stream": False,
        }).encode()
        headers = {"Content-Type": "application/json"}
        if VISION_FALLBACK_KEY:
            headers["Authorization"] = f"Bearer {VISION_FALLBACK_KEY}"
        req = urllib.request.Request(VISION_FALLBACK_URL, data=payload, headers=headers)
        # Context manager closes the HTTP response even if decoding fails.
        with urllib.request.urlopen(req, timeout=30) as resp:
            body = json.loads(resp.read().decode())
        choices = body.get("choices", [])
        if choices:
            msg = choices[0].get("message", {})
            desc = msg.get("content", "")
            if desc:
                cache[img_hash] = desc
                return desc
    except Exception as e:
        # Best effort: a failed description must not fail the whole request.
        print(f"[vision-fallback] error describing image: {e}", file=sys.stderr)
    return None
|
||||
|
||||
|
||||
def _preprocess_vision(messages, schema):
    """Swap image parts for text descriptions when the provider has no vision.

    Messages whose ``content`` is a list are rewritten in place: every
    ``image_url`` / ``input_image`` part becomes a ``text`` part holding the
    description returned by ``_vision_describe_image`` (or a fixed placeholder
    when none is available). A single cache dict is shared by all images of
    this call. The same ``messages`` object is returned.
    """
    if schema.supports_vision:
        return messages

    image_types = ("image_url", "input_image")
    described = {}  # per-call cache handed to _vision_describe_image

    for message in messages:
        parts = message.get("content")
        if not isinstance(parts, list):
            continue

        rewritten = []
        saw_image = False
        for entry in parts:
            is_image = isinstance(entry, dict) and entry.get("type") in image_types
            if not is_image:
                rewritten.append(entry)
                continue

            saw_image = True
            text = _vision_describe_image(entry.get("image_url", entry), described)
            if text:
                label = f"[Image: {text}]"
            else:
                label = "[Image: description unavailable - text-only model]"
            rewritten.append({"type": "text", "text": label})

        # Leave the message untouched unless an image part was replaced.
        if saw_image:
            message["content"] = rewritten

    return messages
|
||||
|
||||
|
||||
def _preprocess_vision_input(input_data, schema):
|
||||
"""Replace input_image blocks in Responses API input format with text descriptions."""
|
||||
if schema.supports_vision:
|
||||
def _strip_images_from_input(input_data, model):
|
||||
if not isinstance(input_data, list) or _model_supports_vision(model):
|
||||
return input_data
|
||||
if not isinstance(input_data, list):
|
||||
return input_data
|
||||
cache = {}
|
||||
changed_any = False
|
||||
modified = False
|
||||
result = []
|
||||
for item in input_data:
|
||||
if item.get("type") != "message":
|
||||
result.append(item)
|
||||
continue
|
||||
content = item.get("content")
|
||||
if not isinstance(content, list):
|
||||
content = item.get("content", [])
|
||||
if isinstance(content, str):
|
||||
result.append(item)
|
||||
continue
|
||||
new_parts = []
|
||||
changed = False
|
||||
new_content = []
|
||||
has_img = False
|
||||
for part in content:
|
||||
if isinstance(part, dict) and part.get("type") in ("input_image", "image_url"):
|
||||
changed = True
|
||||
img_url = ""
|
||||
iu = part.get("image_url")
|
||||
if isinstance(iu, dict):
|
||||
img_url = iu.get("url", "")
|
||||
elif isinstance(iu, str):
|
||||
img_url = iu
|
||||
elif part.get("type") == "input_image":
|
||||
img_url = part.get("url", "")
|
||||
else:
|
||||
img_url = part.get("url", "")
|
||||
desc = _vision_describe_image({"url": img_url}, cache)
|
||||
if desc:
|
||||
new_parts.append({"type": "input_text", "text": f"[Image: {desc}]"})
|
||||
else:
|
||||
new_parts.append({"type": "input_text", "text": "[Image: description unavailable - text-only model]"})
|
||||
if isinstance(part, str):
|
||||
new_content.append(part)
|
||||
continue
|
||||
pt = part.get("type", "")
|
||||
if pt in ("input_image", "image_url"):
|
||||
if not has_img:
|
||||
fname = part.get("image_url", {}).get("url", part.get("url", "image.png"))
|
||||
if fname.startswith("data:"):
|
||||
fname = "screenshot.png"
|
||||
new_content.append({"type": "output_text", "text": f"[User attached image: {fname} — this model does not support vision]"})
|
||||
has_img = True
|
||||
modified = True
|
||||
else:
|
||||
new_parts.append(part)
|
||||
if changed:
|
||||
item["content"] = new_parts
|
||||
changed_any = True
|
||||
new_content.append(part)
|
||||
if modified:
|
||||
result.append({**item, "content": new_content})
|
||||
else:
|
||||
result.append(item)
|
||||
if modified:
|
||||
print(f"[vision-filter] stripped {sum(1 for i in input_data if i.get('type')=='message' and any(c.get('type') in ('input_image','image_url') for c in (i.get('content') or []) if isinstance(c,dict)))} images for model={model}", file=sys.stderr)
|
||||
return result
|
||||
return input_data
|
||||
|
||||
def oa_input_to_messages(input_data):
|
||||
@@ -4585,6 +4512,148 @@ def _extract_text(content):
|
||||
return "".join(parts)
|
||||
|
||||
|
||||
# Process-wide in-memory cache: image hash → description (shared across requests; guarded by _vision_desc_lock, capped at _VISION_DESC_CACHE_MAX entries)
|
||||
_vision_desc_cache = collections.OrderedDict()
|
||||
_vision_desc_lock = threading.Lock()
|
||||
_VISION_DESC_CACHE_MAX = 256
|
||||
|
||||
|
||||
def _vision_describe_image(img_data):
    """Call the vision fallback API to describe a single image.

    Descriptions are memoized in the module-level ``_vision_desc_cache`` (an
    LRU keyed by the MD5 of the image URL), so they survive across requests:
    an image repeated on every turn of a conversation is described only once
    while its entry stays in the cache.

    Args:
        img_data: A URL string, or a dict carrying the URL under ``"url"`` or
            under ``"image_url"`` (a string or a ``{"url": ...}`` dict).

    Returns:
        The description string, or None when no fallback endpoint is
        configured, no URL can be extracted, the API call fails, or the
        response carries no description.
    """
    if not VISION_FALLBACK_URL:
        return None

    # Normalize the image URL from the various shapes callers pass in.
    # None is treated as "no URL": str(None) would otherwise send the literal
    # text "None" to the vision API and cache whatever it answers.
    if isinstance(img_data, dict):
        img_url = img_data.get("url", "")
        if not img_url:
            inner = img_data.get("image_url")
            if isinstance(inner, dict):
                img_url = inner.get("url", "")
            else:
                img_url = "" if inner is None else str(inner)
    else:
        img_url = "" if img_data is None else str(img_data)

    if not img_url:
        return None

    # The URL may be a long data: URI, so the cache is keyed by its digest.
    img_hash = hashlib.md5(img_url.encode("utf-8", errors="replace")).hexdigest()

    # Check the shared cache first (no API call needed on a hit).
    with _vision_desc_lock:
        if img_hash in _vision_desc_cache:
            # Refresh recency so the eviction below is least-recently-used
            # rather than first-inserted.
            _vision_desc_cache.move_to_end(img_hash)
            return _vision_desc_cache[img_hash]

    # The lock is not held during the network call, so other requests are
    # not blocked while this image is being described.
    try:
        payload = json.dumps({
            "model": VISION_FALLBACK_MODEL,
            "messages": [{"role": "user", "content": [
                {"type": "text", "text": "Describe the content of this image in detail. If it contains text, transcribe it fully."},
                {"type": "image_url", "image_url": {"url": img_url}},
            ]}],
            "max_tokens": 1024,
            "stream": False,
        }).encode()

        headers = {"Content-Type": "application/json"}
        if VISION_FALLBACK_KEY:
            headers["Authorization"] = f"Bearer {VISION_FALLBACK_KEY}"

        req = urllib.request.Request(VISION_FALLBACK_URL, data=payload, headers=headers)
        # Context manager closes the HTTP response even if decoding fails.
        with urllib.request.urlopen(req, timeout=30) as resp:
            body = json.loads(resp.read().decode())

        choices = body.get("choices", [])
        if choices:
            msg = choices[0].get("message", {})
            desc = msg.get("content", "")
            if desc:
                with _vision_desc_lock:
                    _vision_desc_cache[img_hash] = desc
                    _vision_desc_cache.move_to_end(img_hash)
                    # Drop the least-recently-used entry once over capacity.
                    if len(_vision_desc_cache) > _VISION_DESC_CACHE_MAX:
                        _vision_desc_cache.popitem(last=False)
                return desc
    except Exception as e:
        # Best effort: a failed description must not fail the whole request.
        print(f"[vision-fallback] error describing image: {e}", file=sys.stderr)

    return None
|
||||
|
||||
|
||||
def _preprocess_vision(messages, schema):
    """Swap image parts for text descriptions when the provider has no vision.

    Operates on OpenAI Chat Completions style messages. Messages whose
    ``content`` is a list are rewritten in place: every ``image_url`` /
    ``input_image`` part becomes a ``text`` part holding the description
    returned by ``_vision_describe_image`` (or a fixed placeholder when none
    is available). The same ``messages`` object is returned.
    """
    if schema.supports_vision:
        return messages

    image_types = ("image_url", "input_image")

    for message in messages:
        parts = message.get("content")
        if not isinstance(parts, list):
            continue

        rewritten = []
        saw_image = False
        for entry in parts:
            is_image = isinstance(entry, dict) and entry.get("type") in image_types
            if not is_image:
                rewritten.append(entry)
                continue

            saw_image = True
            text = _vision_describe_image(entry.get("image_url", entry))
            if text:
                label = f"[Image: {text}]"
            else:
                label = "[Image: description non disponible - modele text-only]"
            rewritten.append({"type": "text", "text": label})

        # Leave the message untouched unless an image part was replaced.
        if saw_image:
            message["content"] = rewritten

    return messages
|
||||
|
||||
|
||||
def _preprocess_vision_input(input_data, schema):
    """Replace input_image parts in Responses API input with text descriptions.

    Per the original author's note, this is meant to run before
    ``adapter.convert()`` so images are replaced before a conversion function
    can silently drop them.

    Args:
        input_data: The request ``input``. Anything that is not a list is
            returned untouched. Message items inside the list are modified
            in place.
        schema: Provider schema; only its ``supports_vision`` attribute is
            read here.

    Returns:
        ``input_data`` itself. Each ``input_image`` part of a message item is
        replaced by an ``input_text`` part holding the description returned by
        ``_vision_describe_image``, or a fixed placeholder when none is
        available.
    """
    if schema.supports_vision or not isinstance(input_data, list):
        return input_data

    for item in input_data:
        # Non-dict items (e.g. bare strings) and non-message items carry no
        # image parts; skipping them also avoids calling .get() on a non-dict.
        if not isinstance(item, dict) or item.get("type") != "message":
            continue
        content = item.get("content")
        if not isinstance(content, list):
            continue

        new_parts = []
        changed = False
        for part in content:
            if isinstance(part, dict) and part.get("type") == "input_image":
                changed = True
                img_data = part.get("image_url", part)
                description = _vision_describe_image(img_data)
                if description:
                    new_parts.append({"type": "input_text", "text": f"[Image: {description}]"})
                else:
                    new_parts.append({"type": "input_text", "text": "[Image: description non disponible - modele text-only]"})
            else:
                new_parts.append(part)

        # Leave the item untouched unless an image part was replaced.
        if changed:
            item["content"] = new_parts

    return input_data
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
|
||||
Reference in New Issue
Block a user