Merge PR #6: vision/OCR preprocessing (sync from GitHub)

2026-05-26 18:19:19 +04:00
parent 256b1ac08b
commit 7939de085a
1 changed file with 145 additions and 0 deletions
--- a/src/translate-proxy.py
+++ b/src/translate-proxy.py
@@ -4314,6 +4314,14 @@ class ErrorAnalyzer:
        elif re.search(r"tool-call|tool_call.*format", err):
            hints["tool_decl_format"] = "command_code"

+        # ── Vision support detection ──
+        if re.search(r"unknown variant\b.*image_url", err) or \
+           re.search(r"unexpected.*image_url", err) or \
+           re.search(r"does not support.*image", err) or \
+           re.search(r"image.*not.*support", err) or \
+           re.search(r"unsupported.*content.*type.*image", err):
+            hints["supports_vision"] = False
+
        # ── Response/Stream format hints from content-type or error ──
        # ── Vision support detection ──
        if re.search(r"unknown variant\b.*image_url", err) or \
@@ -4580,6 +4588,141 @@ def _extract_text(content):
    return "".join(parts)


+def _vision_describe_image(img_data, cache):
+    """Ask the vision fallback endpoint for a text description of one image.
+
+    Best-effort: every failure path (fallback not configured, unusable image
+    reference, network/HTTP/JSON error, empty or non-text answer) yields
+    ``None`` so the caller can substitute a placeholder instead of failing
+    the whole request.
+
+    Args:
+        img_data: image reference in any of these shapes:
+            ``{"url": ...}``, ``{"image_url": {"url": ...}}``,
+            ``{"image_url": "..."}``, or a bare URL / data-URI string.
+        cache: dict mapping image hash -> description (request-scoped).
+            Only successful descriptions are stored in it.
+
+    Returns:
+        description string or None on failure
+    """
+    if not VISION_FALLBACK_URL:
+        return None
+
+    # Normalize the image URL from the various accepted shapes.
+    if isinstance(img_data, dict):
+        img_url = img_data.get("url") or img_data.get("image_url") or ""
+        if isinstance(img_url, dict):
+            img_url = img_url.get("url") or ""
+    else:
+        img_url = img_data
+
+    # Reject anything that is not a non-empty string. Stringifying None or
+    # other objects would send a bogus URL such as "None" to the vision API.
+    if not isinstance(img_url, str) or not img_url:
+        return None
+
+    # MD5 is used purely as a compact cache key, not for security.
+    img_hash = hashlib.md5(img_url.encode("utf-8", errors="replace")).hexdigest()
+    if img_hash in cache:
+        return cache[img_hash]
+
+    try:
+        payload = json.dumps({
+            "model": VISION_FALLBACK_MODEL,
+            "messages": [{"role": "user", "content": [
+                {"type": "text", "text": "Describe the content of this image in detail. If it contains text, transcribe it fully."},
+                {"type": "image_url", "image_url": {"url": img_url}},
+            ]}],
+            "max_tokens": 1024,
+            "stream": False,
+        }).encode()
+
+        headers = {"Content-Type": "application/json"}
+        if VISION_FALLBACK_KEY:
+            headers["Authorization"] = f"Bearer {VISION_FALLBACK_KEY}"
+
+        req = urllib.request.Request(VISION_FALLBACK_URL, data=payload, headers=headers)
+        # The context manager closes the HTTP response (and its socket) even
+        # when reading or decoding the body raises.
+        with urllib.request.urlopen(req, timeout=30) as resp:
+            body = json.loads(resp.read().decode())
+
+        choices = body.get("choices") or []
+        if choices:
+            msg = choices[0].get("message") or {}
+            desc = msg.get("content")
+            # Only a non-empty plain string counts as a description; the
+            # caller interpolates it directly into a text block.
+            if isinstance(desc, str) and desc:
+                cache[img_hash] = desc
+                return desc
+    except Exception as e:
+        # Deliberately broad: the fallback must never break the main request.
+        print(f"[vision-fallback] error describing image: {e}", file=sys.stderr)
+
+    return None
+
+
+def _preprocess_vision(messages, schema):
+    """Swap image parts for text descriptions when the provider is text-only.
+
+    Operates on messages in OpenAI Chat Completions format (i.e. after
+    conversion). Messages whose content holds an image part are updated in
+    place; the same ``messages`` object is returned. When
+    ``schema.supports_vision`` is truthy nothing is touched.
+    """
+    if schema.supports_vision:
+        return messages
+
+    image_types = ("image_url", "input_image")
+    # Shared across the whole call so a repeated image is described once.
+    descriptions = {}
+
+    for message in messages:
+        parts = message.get("content")
+        if not isinstance(parts, list):
+            continue
+
+        rewritten = []
+        saw_image = False
+        for block in parts:
+            is_image = isinstance(block, dict) and block.get("type") in image_types
+            if not is_image:
+                rewritten.append(block)
+                continue
+
+            saw_image = True
+            text = _vision_describe_image(block.get("image_url", block), descriptions)
+            if text:
+                replacement = f"[Image: {text}]"
+            else:
+                # Placeholder used when no description could be obtained.
+                replacement = "[Image: description non disponible - modele text-only]"
+            rewritten.append({"type": "text", "text": replacement})
+
+        # Leave image-free messages untouched.
+        if saw_image:
+            message["content"] = rewritten
+
+    return messages
+
+
+def _preprocess_vision_input(input_data, schema):
+    """Replace input_image blocks in Responses API input format with text descriptions.
+
+    This runs BEFORE adapter.convert() so images are replaced before any
+    conversion function can silently drop them.
+
+    Args:
+        input_data: Responses API ``input`` value. Anything that is not a
+            list (e.g. a plain string prompt) is returned unchanged.
+        schema: provider schema object; only ``supports_vision`` is read.
+
+    Returns:
+        The same ``input_data`` object. Message items that contained
+        ``input_image`` parts have their ``content`` list replaced in place.
+    """
+    if schema.supports_vision:
+        return input_data
+    if not isinstance(input_data, list):
+        return input_data
+
+    # Shared across the whole call so a repeated image is described once.
+    cache = {}
+
+    for item in input_data:
+        # Skip non-dict entries rather than crashing on ``.get``.
+        if not isinstance(item, dict):
+            continue
+        # "type" is optional on message items, so a missing type still
+        # means "message"; explicit other types are left alone.
+        if (item.get("type") or "message") != "message":
+            continue
+        content = item.get("content")
+        if not isinstance(content, list):
+            continue
+        new_parts = []
+        changed = False
+        for part in content:
+            if isinstance(part, dict) and part.get("type") == "input_image":
+                changed = True
+                img_data = part.get("image_url", part)
+                description = _vision_describe_image(img_data, cache)
+                if description:
+                    new_parts.append({"type": "input_text", "text": f"[Image: {description}]"})
+                else:
+                    # Placeholder used when no description could be obtained.
+                    new_parts.append({"type": "input_text", "text": "[Image: description non disponible - modele text-only]"})
+            else:
+                new_parts.append(part)
+        if changed:
+            item["content"] = new_parts
+
+    return input_data
+
+
 # ═══════════════════════════════════════════════════════════════════
 # HTTP Server
 # ═══════════════════════════════════════════════════════════════════
@@ -7014,6 +7157,8 @@ class Handler(http.server.BaseHTTPRequestHandler):
        max_retries = 3
        prev_content_type = None  # for oscillation detection
        for attempt in range(max_retries + 1):
+            # Preprocess images for text-only providers BEFORE conversion
+            processed_input = _preprocess_vision_input(input_data, schema) if not schema.supports_vision else input_data
            adapter = SchemaAdapter(schema)
            processed_input = _preprocess_vision_input(input_data, schema) if not schema.supports_vision else input_data
            messages = adapter.convert(processed_input, instructions)