def _vision_describe_image(img_data, cache):
    """Describe a single image through the vision fallback API.

    Args:
        img_data: the image reference in any of the accepted shapes: a dict
            with a ``url`` key, a dict wrapping one under ``image_url``
            (either a nested dict or a plain string), or a bare URL /
            data-URI string.
        cache: request-scoped dict mapping image hash -> description.
            Failed lookups are stored as ``None`` so an image repeated
            within the same request is not sent to the fallback again.

    Returns:
        The description string, or ``None`` when no fallback URL is
        configured, no image URL can be extracted, or the call fails.
    """
    if not VISION_FALLBACK_URL:
        return None

    # Normalize the image URL from the various accepted shapes.
    if isinstance(img_data, dict):
        img_url = img_data.get("url") or ""
        if not img_url:
            inner = img_data.get("image_url", img_data)
            if isinstance(inner, dict):
                img_url = inner.get("url") or ""
            else:
                img_url = inner or ""
    else:
        # A missing reference must not turn into the literal URL "None".
        img_url = img_data or ""

    if not img_url:
        return None
    if not isinstance(img_url, str):
        img_url = str(img_url)

    # MD5 is used only as a compact cache key, not for security.
    img_hash = hashlib.md5(img_url.encode("utf-8", errors="replace")).hexdigest()
    if img_hash in cache:
        return cache[img_hash]

    description = None
    try:
        payload = json.dumps({
            "model": VISION_FALLBACK_MODEL,
            "messages": [{"role": "user", "content": [
                {"type": "text", "text": "Describe the content of this image in detail. If it contains text, transcribe it fully."},
                {"type": "image_url", "image_url": {"url": img_url}},
            ]}],
            "max_tokens": 1024,
            "stream": False,
        }).encode()

        headers = {"Content-Type": "application/json"}
        if VISION_FALLBACK_KEY:
            headers["Authorization"] = f"Bearer {VISION_FALLBACK_KEY}"

        req = urllib.request.Request(VISION_FALLBACK_URL, data=payload, headers=headers)
        # The context manager closes the HTTP response even if parsing fails.
        with urllib.request.urlopen(req, timeout=30) as resp:
            body = json.loads(resp.read().decode())

        choices = body.get("choices", [])
        if choices:
            msg = choices[0].get("message") or {}
            description = msg.get("content") or None
    except Exception as e:
        # Best effort: a failing fallback must never break the main request.
        print(f"[vision-fallback] error describing image: {e}", file=sys.stderr)
        description = None

    # Cache failures too, so a broken image costs at most one call per request.
    cache[img_hash] = description
    return description


def _replace_image_parts(content, image_types, text_type, cache):
    """Swap image parts in a content list for text parts.

    Args:
        content: list of content parts (non-dict parts are kept as-is).
        image_types: tuple of part ``type`` values that denote an image.
        text_type: ``type`` value given to the replacement text parts.
        cache: request-scoped description cache passed to
            ``_vision_describe_image``.

    Returns:
        A new list with every image part replaced, or ``None`` when
        *content* contains no image part (nothing to change).
    """
    placeholder = "[Image: description non disponible - modele text-only]"
    new_parts = []
    changed = False
    for part in content:
        if isinstance(part, dict) and part.get("type") in image_types:
            changed = True
            description = _vision_describe_image(part.get("image_url", part), cache)
            text = f"[Image: {description}]" if description else placeholder
            new_parts.append({"type": text_type, "text": text})
        else:
            new_parts.append(part)
    return new_parts if changed else None


def _preprocess_vision(messages, schema):
    """Replace image blocks with text descriptions when provider lacks vision support.

    Works on OpenAI Chat Completions message format (post-conversion).
    Message dicts are updated in place and the same *messages* object is
    returned. Entries that are not dicts, or whose ``content`` is not a
    list, are left untouched.
    """
    if schema.supports_vision:
        return messages

    cache = {}
    for msg in messages:
        if not isinstance(msg, dict):
            continue
        content = msg.get("content")
        if not isinstance(content, list):
            continue
        replaced = _replace_image_parts(
            content, ("image_url", "input_image"), "text", cache
        )
        if replaced is not None:
            msg["content"] = replaced

    return messages


def _preprocess_vision_input(input_data, schema):
    """Replace input_image blocks in Responses API input format with text descriptions.

    This runs BEFORE adapter.convert() so images are replaced before any
    conversion function can silently drop them. Items are updated in place
    and the same *input_data* object is returned. Non-list input and
    non-dict items are passed through unchanged.
    """
    if schema.supports_vision or not isinstance(input_data, list):
        return input_data

    cache = {}
    for item in input_data:
        if not isinstance(item, dict):
            continue
        item_type = item.get("type")
        # Message items may omit "type" and carry only role + content, so
        # treat those as messages too instead of letting their images through.
        if item_type != "message" and not (item_type is None and "role" in item):
            continue
        content = item.get("content")
        if not isinstance(content, list):
            continue
        replaced = _replace_image_parts(content, ("input_image",), "input_text", cache)
        if replaced is not None:
            item["content"] = replaced

    return input_data
═══════════════════════════════════════════════════════════════════ @@ -7014,6 +7157,8 @@ class Handler(http.server.BaseHTTPRequestHandler): max_retries = 3 prev_content_type = None # for oscillation detection for attempt in range(max_retries + 1): + # Preprocess images for text-only providers BEFORE conversion + processed_input = _preprocess_vision_input(input_data, schema) if not schema.supports_vision else input_data adapter = SchemaAdapter(schema) processed_input = _preprocess_vision_input(input_data, schema) if not schema.supports_vision else input_data messages = adapter.convert(processed_input, instructions)