Merge PR #6: vision/OCR preprocessing (sync from GitHub)
This commit is contained in:
@@ -4314,6 +4314,14 @@ class ErrorAnalyzer:
|
|||||||
elif re.search(r"tool-call|tool_call.*format", err):
|
elif re.search(r"tool-call|tool_call.*format", err):
|
||||||
hints["tool_decl_format"] = "command_code"
|
hints["tool_decl_format"] = "command_code"
|
||||||
|
|
||||||
|
# ── Vision support detection ──
|
||||||
|
if re.search(r"unknown variant\b.*image_url", err) or \
|
||||||
|
re.search(r"unexpected.*image_url", err) or \
|
||||||
|
re.search(r"does not support.*image", err) or \
|
||||||
|
re.search(r"image.*not.*support", err) or \
|
||||||
|
re.search(r"unsupported.*content.*type.*image", err):
|
||||||
|
hints["supports_vision"] = False
|
||||||
|
|
||||||
# ── Response/Stream format hints from content-type or error ──
|
# ── Response/Stream format hints from content-type or error ──
|
||||||
# ── Vision support detection ──
|
# ── Vision support detection ──
|
||||||
if re.search(r"unknown variant\b.*image_url", err) or \
|
if re.search(r"unknown variant\b.*image_url", err) or \
|
||||||
@@ -4580,6 +4588,141 @@ def _extract_text(content):
|
|||||||
return "".join(parts)
|
return "".join(parts)
|
||||||
|
|
||||||
|
|
||||||
|
def _vision_describe_image(img_data, cache):
    """Call the vision fallback API to describe a single image.

    Args:
        img_data: one of
            - a dict carrying a ``url`` key,
            - a dict carrying an ``image_url`` key (itself a dict with
              ``url``, or a plain value),
            - any other value, which is used as the URL via ``str()``.
        cache: dict mapping the MD5 hex digest of the image URL to its
            description. Successful descriptions are stored in it.

    Returns:
        The description taken from ``choices[0].message.content`` of the
        fallback response, or None when the fallback is not configured, no
        usable URL can be extracted, the request fails, or the response
        carries no description.
    """
    if not VISION_FALLBACK_URL:
        return None

    # Normalize the image URL from the various accepted shapes.
    if isinstance(img_data, dict):
        img_url = img_data.get("url", "")
        if not img_url:
            inner = img_data.get("image_url", img_data)
            if isinstance(inner, dict):
                img_url = inner.get("url", "")
            else:
                # A None value must not be stringified into the URL "None".
                img_url = "" if inner is None else str(inner)
    elif img_data is None:
        img_url = ""
    else:
        img_url = str(img_data)

    # Reject empty values and non-string URLs: the latter would otherwise
    # raise on .encode() below, outside the best-effort try block.
    if not img_url or not isinstance(img_url, str):
        return None

    # The hash is only a cache key, not a security measure.
    img_hash = hashlib.md5(img_url.encode("utf-8", errors="replace")).hexdigest()
    if img_hash in cache:
        return cache[img_hash]

    try:
        payload = json.dumps({
            "model": VISION_FALLBACK_MODEL,
            "messages": [{"role": "user", "content": [
                {"type": "text", "text": "Describe the content of this image in detail. If it contains text, transcribe it fully."},
                {"type": "image_url", "image_url": {"url": img_url}},
            ]}],
            "max_tokens": 1024,
            "stream": False,
        }).encode()

        headers = {"Content-Type": "application/json"}
        if VISION_FALLBACK_KEY:
            headers["Authorization"] = f"Bearer {VISION_FALLBACK_KEY}"

        req = urllib.request.Request(VISION_FALLBACK_URL, data=payload, headers=headers)
        # Context manager guarantees the connection is released even when
        # reading or decoding the body fails.
        with urllib.request.urlopen(req, timeout=30) as resp:
            body = json.loads(resp.read().decode())

        choices = body.get("choices", [])
        if choices:
            msg = choices[0].get("message", {})
            desc = msg.get("content", "")
            if desc:
                cache[img_hash] = desc
                return desc
    except Exception as e:
        # Best-effort: any network/parse failure degrades to "no description"
        # so the caller can substitute a placeholder.
        print(f"[vision-fallback] error describing image: {e}", file=sys.stderr)

    return None
|
||||||
|
|
||||||
|
|
||||||
|
def _preprocess_vision(messages, schema):
|
||||||
|
"""Replace image blocks with text descriptions when provider lacks vision support.
|
||||||
|
|
||||||
|
Works on OpenAI Chat Completions message format (post-conversion).
|
||||||
|
"""
|
||||||
|
if schema.supports_vision:
|
||||||
|
return messages
|
||||||
|
|
||||||
|
cache = {}
|
||||||
|
|
||||||
|
for msg in messages:
|
||||||
|
content = msg.get("content")
|
||||||
|
if not isinstance(content, list):
|
||||||
|
continue
|
||||||
|
new_parts = []
|
||||||
|
changed = False
|
||||||
|
for part in content:
|
||||||
|
if isinstance(part, dict) and part.get("type") in ("image_url", "input_image"):
|
||||||
|
changed = True
|
||||||
|
img_data = part.get("image_url", part)
|
||||||
|
description = _vision_describe_image(img_data, cache)
|
||||||
|
if description:
|
||||||
|
new_parts.append({"type": "text", "text": f"[Image: {description}]"})
|
||||||
|
else:
|
||||||
|
new_parts.append({"type": "text", "text": "[Image: description non disponible - modele text-only]"})
|
||||||
|
else:
|
||||||
|
new_parts.append(part)
|
||||||
|
if changed:
|
||||||
|
msg["content"] = new_parts
|
||||||
|
|
||||||
|
return messages
|
||||||
|
|
||||||
|
|
||||||
|
def _preprocess_vision_input(input_data, schema):
|
||||||
|
"""Replace input_image blocks in Responses API input format with text descriptions.
|
||||||
|
|
||||||
|
This runs BEFORE adapter.convert() so images are replaced before any
|
||||||
|
conversion function can silently drop them.
|
||||||
|
"""
|
||||||
|
if schema.supports_vision:
|
||||||
|
return input_data
|
||||||
|
if not isinstance(input_data, list):
|
||||||
|
return input_data
|
||||||
|
|
||||||
|
cache = {}
|
||||||
|
changed_any = False
|
||||||
|
|
||||||
|
for item in input_data:
|
||||||
|
if item.get("type") != "message":
|
||||||
|
continue
|
||||||
|
content = item.get("content")
|
||||||
|
if not isinstance(content, list):
|
||||||
|
continue
|
||||||
|
new_parts = []
|
||||||
|
changed = False
|
||||||
|
for part in content:
|
||||||
|
if isinstance(part, dict) and part.get("type") == "input_image":
|
||||||
|
changed = True
|
||||||
|
changed_any = True
|
||||||
|
img_data = part.get("image_url", part)
|
||||||
|
description = _vision_describe_image(img_data, cache)
|
||||||
|
if description:
|
||||||
|
new_parts.append({"type": "input_text", "text": f"[Image: {description}]"})
|
||||||
|
else:
|
||||||
|
new_parts.append({"type": "input_text", "text": "[Image: description non disponible - modele text-only]"})
|
||||||
|
else:
|
||||||
|
new_parts.append(part)
|
||||||
|
if changed:
|
||||||
|
item["content"] = new_parts
|
||||||
|
|
||||||
|
return input_data
|
||||||
|
|
||||||
|
|
||||||
# ═══════════════════════════════════════════════════════════════════
|
# ═══════════════════════════════════════════════════════════════════
|
||||||
# HTTP Server
|
# HTTP Server
|
||||||
# ═══════════════════════════════════════════════════════════════════
|
# ═══════════════════════════════════════════════════════════════════
|
||||||
@@ -7014,6 +7157,8 @@ class Handler(http.server.BaseHTTPRequestHandler):
|
|||||||
max_retries = 3
|
max_retries = 3
|
||||||
prev_content_type = None # for oscillation detection
|
prev_content_type = None # for oscillation detection
|
||||||
for attempt in range(max_retries + 1):
|
for attempt in range(max_retries + 1):
|
||||||
|
# Preprocess images for text-only providers BEFORE conversion
|
||||||
|
processed_input = _preprocess_vision_input(input_data, schema) if not schema.supports_vision else input_data
|
||||||
adapter = SchemaAdapter(schema)
|
adapter = SchemaAdapter(schema)
|
||||||
processed_input = _preprocess_vision_input(input_data, schema) if not schema.supports_vision else input_data
|
processed_input = _preprocess_vision_input(input_data, schema) if not schema.supports_vision else input_data
|
||||||
messages = adapter.convert(processed_input, instructions)
|
messages = adapter.convert(processed_input, instructions)
|
||||||
|
|||||||
Reference in New Issue
Block a user