v3.11.7: vision auto-detect, proactive non-vision detection, unit tests (PR #7), auth fix

2026-05-26 18:57:58 +04:00
parent 7939de085a
commit ddb5f3dddc
8 changed files with 1257 additions and 185 deletions
--- a/src/codex-launcher-gui
+++ b/src/codex-launcher-gui
@@ -27,6 +27,13 @@ model_catalog_json = ""
 """

 CHANGELOG = [
+    ("3.11.7", "2026-05-26", [
+        "Vision auto-detect: uses provider's vision model for image description",
+        "Vision preprocessing replaces image stripping",
+        "Fix AttributeError in image_url string handling",
+        "Merge PR #6: vision/OCR preprocessing, PR #7: 177 unit tests",
+        "Auth os error 2 fix: proper config-missing message in GUI",
+    ]),
    ("3.11.6", "2026-05-26", [
        "Antigravity loop breakers: per-session tracking, repeated tool detection",
        "has_content fix: function_call counts as valid output",
@@ -1303,6 +1310,9 @@ def _check_codex_auth():
        if out.returncode == 0 and text:
            return ("logged_in", text)
        if text:
+            _tl = text.lower()
+            if "no such file" in _tl or "os error 2" in _tl or "not found" in _tl:
+                return ("not_configured", "Config missing — launch once to create")
            return ("error", text)
        return ("unknown", "No output from codex login status")
    except FileNotFoundError:
--- a/src/codex_launcher_lib.py
+++ b/src/codex_launcher_lib.py
@@ -83,6 +83,14 @@ model_catalog_json = ""
 """

 CHANGELOG = [
+    ("3.11.7", "2026-05-26", [
+        "Vision auto-detect: uses provider's own vision model (e.g. 0G-Qwen-VL) as fallback for image description",
+        "Vision preprocessing replaces image stripping: images described via API instead of just removed",
+        "Fix AttributeError in image_url handling when value is string not dict",
+        "Merge PR #6: vision/OCR preprocessing for text-only models",
+        "Merge PR #7: 177 unit tests for translate-proxy.py",
+        "Auth os error 2 fix: GUI shows config-missing message instead of raw error",
+    ]),
    ("3.11.6", "2026-05-26", [
        "Antigravity loop breakers: per-session tracking, edit-intent nudge (first turn only)",
        "Loop breaker: same tool+args repeated 5+ times triggers force finalization",
--- a/src/translate-proxy.py
+++ b/src/translate-proxy.py
@@ -857,6 +857,25 @@ def _ensure_antigravity_client_version():
    _antigravity_client_version_checked = time.time()
    return _antigravity_client_version

+# Substrings that mark a model id as vision-capable; matched against the lower-cased id.
+_VISION_MODEL_KEYWORDS = ("vl", "vision", "gpt-4o", "gpt-5", "claude-3", "claude-4", "gemini", "qwen-vl", "kimi-vl", "pixtral", "llava")
+
+def _auto_detect_vision_fallback(target_url, api_key, models):
+    """Auto-detect a vision-capable model from the current provider for image description.
+
+    Returns (chat_url, model, api_key); ("", "", "") when no string entry of
+    models contains a vision keyword. chat_url is "" when target_url is empty.
+    """
+    # Non-string entries cannot be matched; skip them instead of raising during startup.
+    names = (m for m in (models or []) if isinstance(m, str))
+    vision_model = next((m for m in names if any(kw in m.lower() for kw in _VISION_MODEL_KEYWORDS)), "")
+    if not vision_model:
+        return "", "", ""
+    base = (target_url or "").rstrip("/")
+    # Everything from the first "/v1" onward is replaced by the chat-completions path.
+    chat_url = (base.split("/v1")[0] + "/v1/chat/completions") if base else ""
+    return chat_url, vision_model, api_key
+
 def _init_runtime():
    global CONFIG, PORT, BACKEND, TARGET_URL, API_KEY, OAUTH_PROVIDER, _antigravity_version
    global MODELS, CC_VERSION, REASONING_ENABLED, REASONING_EFFORT, BGP_ROUTES
@@ -879,9 +898,17 @@ def _init_runtime():
    PROMPT_ENHANCER_MODEL = CONFIG.get("prompt_enhancer_model", "")
    PROMPT_ENHANCER_URL = CONFIG.get("prompt_enhancer_url", "")
    PROMPT_ENHANCER_KEY = CONFIG.get("prompt_enhancer_key", "")
-    VISION_FALLBACK_URL = CONFIG.get("vision_fallback_url") or "https://api.kilo.ai/api/gateway/chat/completions"
-    VISION_FALLBACK_MODEL = CONFIG.get("vision_fallback_model") or "kilo-auto/small"
+    VISION_FALLBACK_URL = CONFIG.get("vision_fallback_url") or ""
+    VISION_FALLBACK_MODEL = CONFIG.get("vision_fallback_model") or ""
    VISION_FALLBACK_KEY = CONFIG.get("vision_fallback_key") or ""
+    if not VISION_FALLBACK_URL or not VISION_FALLBACK_MODEL:
+        _vision_url, _vision_model, _vision_key = _auto_detect_vision_fallback(TARGET_URL, API_KEY, MODELS)
+        if not VISION_FALLBACK_URL:
+            VISION_FALLBACK_URL = _vision_url
+        if not VISION_FALLBACK_MODEL:
+            VISION_FALLBACK_MODEL = _vision_model
+        if not VISION_FALLBACK_KEY:
+            VISION_FALLBACK_KEY = _vision_key
    BGP_ROUTES = CONFIG.get("bgp_routes", [])
    _api_key_pool = None
    if API_KEY and "," in API_KEY and not OAUTH_PROVIDER.startswith("google") and BACKEND not in ("codebuff", "freebuff"):
@@ -2467,10 +2494,15 @@ def _preprocess_vision_input(input_data, schema):
            if isinstance(part, dict) and part.get("type") in ("input_image", "image_url"):
                changed = True
                img_url = ""
-                if part.get("type") == "input_image":
-                    img_url = part.get("image_url", {}).get("url", "")
+                iu = part.get("image_url")
+                if isinstance(iu, dict):
+                    img_url = iu.get("url", "")
+                elif isinstance(iu, str):
+                    img_url = iu
+                elif part.get("type") == "input_image":
+                    img_url = part.get("url", "")
                else:
-                    img_url = part.get("image_url", {}).get("url", part.get("url", ""))
+                    img_url = part.get("url", "")
                desc = _vision_describe_image({"url": img_url}, cache)
                if desc:
                    new_parts.append({"type": "input_text", "text": f"[Image: {desc}]"})
@@ -2483,45 +2515,6 @@ def _preprocess_vision_input(input_data, schema):
            changed_any = True
    return input_data

-def _strip_images_from_input(input_data, model):
-    if not isinstance(input_data, list) or _model_supports_vision(model):
-        return input_data
-    modified = False
-    result = []
-    for item in input_data:
-        if item.get("type") != "message":
-            result.append(item)
-            continue
-        content = item.get("content", [])
-        if isinstance(content, str):
-            result.append(item)
-            continue
-        new_content = []
-        has_img = False
-        for part in content:
-            if isinstance(part, str):
-                new_content.append(part)
-                continue
-            pt = part.get("type", "")
-            if pt in ("input_image", "image_url"):
-                if not has_img:
-                    fname = part.get("image_url", {}).get("url", part.get("url", "image.png"))
-                    if fname.startswith("data:"):
-                        fname = "screenshot.png"
-                    new_content.append({"type": "output_text", "text": f"[User attached image: {fname} — this model does not support vision]"})
-                    has_img = True
-                    modified = True
-            else:
-                new_content.append(part)
-        if modified:
-            result.append({**item, "content": new_content})
-        else:
-            result.append(item)
-    if modified:
-        print(f"[vision-filter] stripped {sum(1 for i in input_data if i.get('type')=='message' and any(c.get('type') in ('input_image','image_url') for c in (i.get('content') or []) if isinstance(c,dict)))} images for model={model}", file=sys.stderr)
-        return result
-    return input_data
-
 def oa_input_to_messages(input_data):
    msgs = []
    tool_name_by_id = {}
@@ -4588,139 +4581,6 @@ def _extract_text(content):
    return "".join(parts)


-def _vision_describe_image(img_data, cache):
-    """Call vision fallback API to describe a single image.
-
-    Args:
-        img_data: dict with image_url field, or raw image_url dict
-        cache: dict mapping image hash -> description (request-scoped)
-
-    Returns:
-        description string or None on failure
-    """
-    if not VISION_FALLBACK_URL:
-        return None
-
-    # Normalize image URL from various formats
-    if isinstance(img_data, dict):
-        img_url = img_data.get("url", "")
-        if not img_url:
-            inner = img_data.get("image_url", img_data)
-            img_url = inner.get("url", "") if isinstance(inner, dict) else str(inner)
-    else:
-        img_url = str(img_data)
-
-    if not img_url:
-        return None
-
-    # Check cache
-    img_hash = hashlib.md5(img_url.encode("utf-8", errors="replace")).hexdigest()
-    if img_hash in cache:
-        return cache[img_hash]
-
-    try:
-        payload = json.dumps({
-            "model": VISION_FALLBACK_MODEL,
-            "messages": [{"role": "user", "content": [
-                {"type": "text", "text": "Describe the content of this image in detail. If it contains text, transcribe it fully."},
-                {"type": "image_url", "image_url": {"url": img_url}},
-            ]}],
-            "max_tokens": 1024,
-            "stream": False,
-        }).encode()
-
-        headers = {"Content-Type": "application/json"}
-        if VISION_FALLBACK_KEY:
-            headers["Authorization"] = f"Bearer {VISION_FALLBACK_KEY}"
-
-        req = urllib.request.Request(VISION_FALLBACK_URL, data=payload, headers=headers)
-        resp = urllib.request.urlopen(req, timeout=30)
-        body = json.loads(resp.read().decode())
-
-        choices = body.get("choices", [])
-        if choices:
-            msg = choices[0].get("message", {})
-            desc = msg.get("content", "")
-            if desc:
-                cache[img_hash] = desc
-                return desc
-    except Exception as e:
-        print(f"[vision-fallback] error describing image: {e}", file=sys.stderr)
-
-    return None
-
-
-def _preprocess_vision(messages, schema):
-    """Replace image blocks with text descriptions when provider lacks vision support.
-
-    Works on OpenAI Chat Completions message format (post-conversion).
-    """
-    if schema.supports_vision:
-        return messages
-
-    cache = {}
-
-    for msg in messages:
-        content = msg.get("content")
-        if not isinstance(content, list):
-            continue
-        new_parts = []
-        changed = False
-        for part in content:
-            if isinstance(part, dict) and part.get("type") in ("image_url", "input_image"):
-                changed = True
-                img_data = part.get("image_url", part)
-                description = _vision_describe_image(img_data, cache)
-                if description:
-                    new_parts.append({"type": "text", "text": f"[Image: {description}]"})
-                else:
-                    new_parts.append({"type": "text", "text": "[Image: description non disponible - modele text-only]"})
-            else:
-                new_parts.append(part)
-        if changed:
-            msg["content"] = new_parts
-
-    return messages
-
-
-def _preprocess_vision_input(input_data, schema):
-    """Replace input_image blocks in Responses API input format with text descriptions.
-
-    This runs BEFORE adapter.convert() so images are replaced before any
-    conversion function can silently drop them.
-    """
-    if schema.supports_vision:
-        return input_data
-    if not isinstance(input_data, list):
-        return input_data
-
-    cache = {}
-    changed_any = False
-
-    for item in input_data:
-        if item.get("type") != "message":
-            continue
-        content = item.get("content")
-        if not isinstance(content, list):
-            continue
-        new_parts = []
-        changed = False
-        for part in content:
-            if isinstance(part, dict) and part.get("type") == "input_image":
-                changed = True
-                changed_any = True
-                img_data = part.get("image_url", part)
-                description = _vision_describe_image(img_data, cache)
-                if description:
-                    new_parts.append({"type": "input_text", "text": f"[Image: {description}]"})
-                else:
-                    new_parts.append({"type": "input_text", "text": "[Image: description non disponible - modele text-only]"})
-            else:
-                new_parts.append(part)
-        if changed:
-            item["content"] = new_parts
-
-    return input_data


 # ═══════════════════════════════════════════════════════════════════
@@ -5322,14 +5182,22 @@ class Handler(http.server.BaseHTTPRequestHandler):
                body = dict(body)
                body["input"] = input_data

-        # Strip images for non-vision models
-        input_data = _strip_images_from_input(input_data, model)
-        body["input"] = input_data
+        # Vision preprocessing for non-vision models
+        _schema = _load_schema(model=model)
+        _needs_vision_preprocess = False
+        if _schema and not _schema.supports_vision:
+            _needs_vision_preprocess = True
+        elif not _model_supports_vision(model):
+            print(f"[vision] model {model} detected as non-vision via name pattern, preprocessing images", file=sys.stderr)
+            if _schema:
+                _schema.supports_vision = False
+                _save_schema(_schema, model=model)
+            _needs_vision_preprocess = True
+        if _needs_vision_preprocess:
+            input_data = _preprocess_vision_input(input_data, _schema)
+            body["input"] = input_data

        messages = oa_input_to_messages(input_data)
-        _schema = _load_schema(model=model)
-        if _schema and not _schema.supports_vision:
-            messages = _preprocess_vision(messages, _schema)
        messages = _inject_stored_reasoning(messages)
        instructions = body.get("instructions", "").strip()
        if instructions:
@@ -5384,7 +5252,7 @@ class Handler(http.server.BaseHTTPRequestHandler):
                            input_data = _crof_compact_for_retry(input_data, model, aggression=attempt)
                            body = dict(body)
                            body["input"] = input_data
-                            messages = oa_input_to_messages(_strip_images_from_input(input_data, model))
+                            messages = oa_input_to_messages(_preprocess_vision_input(input_data, _schema) if _schema and not _schema.supports_vision else input_data)
                            messages = _inject_stored_reasoning(messages)
                            instructions = body.get("instructions", "").strip()
                            if instructions:
@@ -6517,7 +6385,7 @@ class Handler(http.server.BaseHTTPRequestHandler):
                            print(f"[{self._session_id}] [smart-continue] XML injection retry failed: {e}", file=sys.stderr)
                            break
                    _nudge_msg = {"role": "user", "content": nudge_text}
-                    nudge_messages = oa_input_to_messages(_strip_images_from_input(input_data, model)) + [_nudge_msg]
+                    nudge_messages = oa_input_to_messages(_preprocess_vision_input(input_data, _schema) if _schema and not _schema.supports_vision else input_data) + [_nudge_msg]
                    instructions = body.get("instructions", "").strip()
                    if instructions:
                        nudge_messages.insert(0, {"role": "system", "content": instructions})