v3.11.7: vision auto-detect, proactive non-vision detection, unit tests (PR #7), auth fix
This commit is contained in:
18
CHANGELOG.md
18
CHANGELOG.md
@@ -1,5 +1,23 @@
|
|||||||
# Changelog
|
# Changelog
|
||||||
|
|
||||||
|
## v3.11.7 (2026-05-26)
|
||||||
|
|
||||||
|
**Vision Auto-Detect, Proactive Non-Vision Model Detection, Unit Tests, Bug Fixes**
|
||||||
|
|
||||||
|
### New Features
|
||||||
|
|
||||||
|
- **Vision auto-detect fallback**: When no explicit vision fallback is configured, automatically uses the current provider's own vision model (e.g., `0G-Qwen-VL` for OpenAdapter) as the image description API — no separate API key needed
|
||||||
|
- **Proactive non-vision model detection**: Models matching name patterns (`glm`, `deepseek`, `llama`, `qwen` without `vl`, etc.) are detected as non-vision on first request without waiting for an error from the provider
|
||||||
|
- **Vision preprocessing is now the primary image handling solution**: Replaces old `_strip_images_from_input()` (which just removed images with a placeholder). Images are now described via API and sent as rich text descriptions to text-only models
|
||||||
|
- **Merge PR #6**: Vision/OCR preprocessing for text-only models (cobra91)
|
||||||
|
- **Merge PR #7**: 177 unit tests for translate-proxy.py (cobra91)
|
||||||
|
|
||||||
|
### Bug Fixes
|
||||||
|
|
||||||
|
- **AttributeError fix**: `image_url` field can be a string (bare URL) not always a dict — fixed in both `_preprocess_vision_input()` and old strip function
|
||||||
|
- **Auth os error 2 fix**: GUI shows "Config missing" message instead of raw OSError when `~/.codex/` directory doesn't exist
|
||||||
|
- **Removed duplicate vision functions**: Cleaned up duplicate `_vision_describe_image()`, `_preprocess_vision()`, `_preprocess_vision_input()` from merge
|
||||||
|
|
||||||
## v3.11.6 (2026-05-26)
|
## v3.11.6 (2026-05-26)
|
||||||
|
|
||||||
**Antigravity Loop Breakers, Vision/OCR Preprocessing, has_content Fix, Auth Error Fix**
|
**Antigravity Loop Breakers, Vision/OCR Preprocessing, has_content Fix, Auth Error Fix**
|
||||||
|
|||||||
Binary file not shown.
BIN
codex-launcher_3.11.7_all.deb
Normal file
BIN
codex-launcher_3.11.7_all.deb
Normal file
Binary file not shown.
@@ -27,6 +27,13 @@ model_catalog_json = ""
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
CHANGELOG = [
|
CHANGELOG = [
|
||||||
|
("3.11.7", "2026-05-26", [
|
||||||
|
"Vision auto-detect: uses provider's vision model for image description",
|
||||||
|
"Vision preprocessing replaces image stripping",
|
||||||
|
"Fix AttributeError in image_url string handling",
|
||||||
|
"Merge PR #6: vision/OCR preprocessing, PR #7: 177 unit tests",
|
||||||
|
"Auth os error 2 fix: proper config-missing message in GUI",
|
||||||
|
]),
|
||||||
("3.11.6", "2026-05-26", [
|
("3.11.6", "2026-05-26", [
|
||||||
"Antigravity loop breakers: per-session tracking, repeated tool detection",
|
"Antigravity loop breakers: per-session tracking, repeated tool detection",
|
||||||
"has_content fix: function_call counts as valid output",
|
"has_content fix: function_call counts as valid output",
|
||||||
@@ -1303,6 +1310,9 @@ def _check_codex_auth():
|
|||||||
if out.returncode == 0 and text:
|
if out.returncode == 0 and text:
|
||||||
return ("logged_in", text)
|
return ("logged_in", text)
|
||||||
if text:
|
if text:
|
||||||
|
_tl = text.lower()
|
||||||
|
if "no such file" in _tl or "os error 2" in _tl or "not found" in _tl:
|
||||||
|
return ("not_configured", "Config missing — launch once to create")
|
||||||
return ("error", text)
|
return ("error", text)
|
||||||
return ("unknown", "No output from codex login status")
|
return ("unknown", "No output from codex login status")
|
||||||
except FileNotFoundError:
|
except FileNotFoundError:
|
||||||
|
|||||||
@@ -83,6 +83,14 @@ model_catalog_json = ""
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
CHANGELOG = [
|
CHANGELOG = [
|
||||||
|
("3.11.7", "2026-05-26", [
|
||||||
|
"Vision auto-detect: uses provider's own vision model (e.g. 0G-Qwen-VL) as fallback for image description",
|
||||||
|
"Vision preprocessing replaces image stripping: images described via API instead of just removed",
|
||||||
|
"Fix AttributeError in image_url handling when value is string not dict",
|
||||||
|
"Merge PR #6: vision/OCR preprocessing for text-only models",
|
||||||
|
"Merge PR #7: 177 unit tests for translate-proxy.py",
|
||||||
|
"Auth os error 2 fix: GUI shows config-missing message instead of raw error",
|
||||||
|
]),
|
||||||
("3.11.6", "2026-05-26", [
|
("3.11.6", "2026-05-26", [
|
||||||
"Antigravity loop breakers: per-session tracking, edit-intent nudge (first turn only)",
|
"Antigravity loop breakers: per-session tracking, edit-intent nudge (first turn only)",
|
||||||
"Loop breaker: same tool+args repeated 5+ times triggers force finalization",
|
"Loop breaker: same tool+args repeated 5+ times triggers force finalization",
|
||||||
|
|||||||
@@ -857,6 +857,25 @@ def _ensure_antigravity_client_version():
|
|||||||
_antigravity_client_version_checked = time.time()
|
_antigravity_client_version_checked = time.time()
|
||||||
return _antigravity_client_version
|
return _antigravity_client_version
|
||||||
|
|
||||||
|
# Lowercase substrings; a model whose lowercased name contains any of them is
# treated as vision-capable by _auto_detect_vision_fallback().
_VISION_MODEL_KEYWORDS = ("vl", "vision", "gpt-4o", "gpt-5", "claude-3", "claude-4", "gemini", "qwen-vl", "kimi-vl", "pixtral", "llava")


def _auto_detect_vision_fallback(target_url, api_key, models):
    """Auto-detect a vision-capable model from the current provider for image description.

    Args:
        target_url: Base URL of the current provider. Everything from the
            first "/v1" in the URL *path* onward is replaced by
            "/v1/chat/completions"; when the path has no "/v1", that suffix
            is appended. May be empty or None.
        api_key: Key for the current provider; returned unchanged so the
            caller can reuse it for the description request.
        models: Iterable of model names (None is treated as empty). Entries
            that are not strings are ignored.

    Returns:
        A ``(chat_url, vision_model, api_key)`` tuple for the first model
        whose name contains one of ``_VISION_MODEL_KEYWORDS``, or
        ``("", "", "")`` when no model matches. ``chat_url`` is ``""`` when
        ``target_url`` is empty, because no usable endpoint can be derived.
    """
    vision_model = next(
        (
            m for m in (models or [])
            if isinstance(m, str) and any(kw in m.lower() for kw in _VISION_MODEL_KEYWORDS)
        ),
        "",
    )
    if not vision_model:
        return "", "", ""

    base = (target_url or "").rstrip("/")
    if not base:
        return "", vision_model, api_key

    # Search for "/v1" only after the "scheme://" prefix: a plain substring
    # test also matches hosts such as "https://v1.example.com" (the "//v1"
    # contains "/v1") and would truncate the URL to "https:/".
    scheme_sep = base.find("://")
    # A genuine scheme separator holds the very first "/" of the URL.
    has_scheme = scheme_sep != -1 and base.find("/") == scheme_sep + 1
    v1_at = base.find("/v1", scheme_sep + 3 if has_scheme else 0)
    if v1_at != -1:
        base = base[:v1_at]
    return base + "/v1/chat/completions", vision_model, api_key
|
||||||
|
|
||||||
def _init_runtime():
|
def _init_runtime():
|
||||||
global CONFIG, PORT, BACKEND, TARGET_URL, API_KEY, OAUTH_PROVIDER, _antigravity_version
|
global CONFIG, PORT, BACKEND, TARGET_URL, API_KEY, OAUTH_PROVIDER, _antigravity_version
|
||||||
global MODELS, CC_VERSION, REASONING_ENABLED, REASONING_EFFORT, BGP_ROUTES
|
global MODELS, CC_VERSION, REASONING_ENABLED, REASONING_EFFORT, BGP_ROUTES
|
||||||
@@ -879,9 +898,17 @@ def _init_runtime():
|
|||||||
PROMPT_ENHANCER_MODEL = CONFIG.get("prompt_enhancer_model", "")
|
PROMPT_ENHANCER_MODEL = CONFIG.get("prompt_enhancer_model", "")
|
||||||
PROMPT_ENHANCER_URL = CONFIG.get("prompt_enhancer_url", "")
|
PROMPT_ENHANCER_URL = CONFIG.get("prompt_enhancer_url", "")
|
||||||
PROMPT_ENHANCER_KEY = CONFIG.get("prompt_enhancer_key", "")
|
PROMPT_ENHANCER_KEY = CONFIG.get("prompt_enhancer_key", "")
|
||||||
VISION_FALLBACK_URL = CONFIG.get("vision_fallback_url") or "https://api.kilo.ai/api/gateway/chat/completions"
|
VISION_FALLBACK_URL = CONFIG.get("vision_fallback_url") or ""
|
||||||
VISION_FALLBACK_MODEL = CONFIG.get("vision_fallback_model") or "kilo-auto/small"
|
VISION_FALLBACK_MODEL = CONFIG.get("vision_fallback_model") or ""
|
||||||
VISION_FALLBACK_KEY = CONFIG.get("vision_fallback_key") or ""
|
VISION_FALLBACK_KEY = CONFIG.get("vision_fallback_key") or ""
|
||||||
|
if not VISION_FALLBACK_URL or not VISION_FALLBACK_MODEL:
|
||||||
|
_vision_url, _vision_model, _vision_key = _auto_detect_vision_fallback(TARGET_URL, API_KEY, MODELS)
|
||||||
|
if not VISION_FALLBACK_URL:
|
||||||
|
VISION_FALLBACK_URL = _vision_url
|
||||||
|
if not VISION_FALLBACK_MODEL:
|
||||||
|
VISION_FALLBACK_MODEL = _vision_model
|
||||||
|
if not VISION_FALLBACK_KEY:
|
||||||
|
VISION_FALLBACK_KEY = _vision_key
|
||||||
BGP_ROUTES = CONFIG.get("bgp_routes", [])
|
BGP_ROUTES = CONFIG.get("bgp_routes", [])
|
||||||
_api_key_pool = None
|
_api_key_pool = None
|
||||||
if API_KEY and "," in API_KEY and not OAUTH_PROVIDER.startswith("google") and BACKEND not in ("codebuff", "freebuff"):
|
if API_KEY and "," in API_KEY and not OAUTH_PROVIDER.startswith("google") and BACKEND not in ("codebuff", "freebuff"):
|
||||||
@@ -2467,10 +2494,15 @@ def _preprocess_vision_input(input_data, schema):
|
|||||||
if isinstance(part, dict) and part.get("type") in ("input_image", "image_url"):
|
if isinstance(part, dict) and part.get("type") in ("input_image", "image_url"):
|
||||||
changed = True
|
changed = True
|
||||||
img_url = ""
|
img_url = ""
|
||||||
if part.get("type") == "input_image":
|
iu = part.get("image_url")
|
||||||
img_url = part.get("image_url", {}).get("url", "")
|
if isinstance(iu, dict):
|
||||||
|
img_url = iu.get("url", "")
|
||||||
|
elif isinstance(iu, str):
|
||||||
|
img_url = iu
|
||||||
|
elif part.get("type") == "input_image":
|
||||||
|
img_url = part.get("url", "")
|
||||||
else:
|
else:
|
||||||
img_url = part.get("image_url", {}).get("url", part.get("url", ""))
|
img_url = part.get("url", "")
|
||||||
desc = _vision_describe_image({"url": img_url}, cache)
|
desc = _vision_describe_image({"url": img_url}, cache)
|
||||||
if desc:
|
if desc:
|
||||||
new_parts.append({"type": "input_text", "text": f"[Image: {desc}]"})
|
new_parts.append({"type": "input_text", "text": f"[Image: {desc}]"})
|
||||||
@@ -2483,45 +2515,6 @@ def _preprocess_vision_input(input_data, schema):
|
|||||||
changed_any = True
|
changed_any = True
|
||||||
return input_data
|
return input_data
|
||||||
|
|
||||||
def _strip_images_from_input(input_data, model):
    """Replace image parts with a text placeholder for non-vision models.

    Args:
        input_data: Request input. Only a list is processed; any other value
            is returned unchanged.
        model: Model name, passed to ``_model_supports_vision()``; when that
            returns true the input is returned unchanged.

    Returns:
        ``input_data`` itself when nothing was stripped, otherwise a new list
        in which each affected "message" item is a shallow copy with its
        images removed. Per message, the first image becomes one
        ``output_text`` placeholder naming the image; further images in the
        same message are dropped without an extra placeholder.
    """
    if not isinstance(input_data, list) or _model_supports_vision(model):
        return input_data

    result = []
    stripped = 0
    for item in input_data:
        # Anything that is not a message dict with list content passes through.
        if not isinstance(item, dict) or item.get("type") != "message":
            result.append(item)
            continue
        content = item.get("content", [])
        if not isinstance(content, list):
            result.append(item)
            continue

        new_content = []
        images_here = 0
        for part in content:
            if not isinstance(part, dict) or part.get("type", "") not in ("input_image", "image_url"):
                new_content.append(part)
                continue
            images_here += 1
            if images_here > 1:
                continue  # one placeholder per message is enough
            # "image_url" may be a {"url": ...} dict or a bare URL string.
            ref = part.get("image_url")
            if isinstance(ref, dict):
                ref = ref.get("url")
            fname = ref or part.get("url") or "image.png"
            if not isinstance(fname, str):
                fname = "image.png"
            if fname.startswith("data:"):
                # Inline data URIs carry no file name and would bloat the prompt.
                fname = "screenshot.png"
            new_content.append({"type": "output_text", "text": f"[User attached image: {fname} — this model does not support vision]"})

        if images_here:
            stripped += images_here
            result.append({**item, "content": new_content})
        else:
            result.append(item)

    if not stripped:
        return input_data
    # Counts image parts, so the number matches the word "images" in the log.
    print(f"[vision-filter] stripped {stripped} images for model={model}", file=sys.stderr)
    return result
|
|
||||||
|
|
||||||
def oa_input_to_messages(input_data):
|
def oa_input_to_messages(input_data):
|
||||||
msgs = []
|
msgs = []
|
||||||
tool_name_by_id = {}
|
tool_name_by_id = {}
|
||||||
@@ -4588,139 +4581,6 @@ def _extract_text(content):
|
|||||||
return "".join(parts)
|
return "".join(parts)
|
||||||
|
|
||||||
|
|
||||||
def _vision_describe_image(img_data, cache):
    """Call vision fallback API to describe a single image.

    Args:
        img_data: Image reference in one of these shapes: a dict with a
            top-level "url"; a dict whose "image_url" is a ``{"url": ...}``
            dict or a bare URL string; or a bare URL / data-URI string.
        cache: dict mapping image hash -> description (request-scoped).
            Updated in place when a description is obtained.

    Returns:
        description string or None on failure (no fallback URL configured,
        no usable image URL, request error, or empty response).
    """
    if not VISION_FALLBACK_URL:
        return None

    # Normalize image URL from various formats
    if isinstance(img_data, dict):
        img_url = img_data.get("url", "")
        if not img_url:
            inner = img_data.get("image_url")
            if isinstance(inner, dict):
                img_url = inner.get("url", "")
            elif isinstance(inner, str):
                img_url = inner
    elif img_data is None:
        img_url = ""
    else:
        img_url = str(img_data)

    # A missing or non-string URL cannot be hashed or sent; treat as failure
    # instead of describing a bogus URL such as "None".
    if not img_url or not isinstance(img_url, str):
        return None

    # Check cache (md5 is used only as a cache key, not for security)
    img_hash = hashlib.md5(img_url.encode("utf-8", errors="replace")).hexdigest()
    if img_hash in cache:
        return cache[img_hash]

    try:
        payload = json.dumps({
            "model": VISION_FALLBACK_MODEL,
            "messages": [{"role": "user", "content": [
                {"type": "text", "text": "Describe the content of this image in detail. If it contains text, transcribe it fully."},
                {"type": "image_url", "image_url": {"url": img_url}},
            ]}],
            "max_tokens": 1024,
            "stream": False,
        }).encode()

        headers = {"Content-Type": "application/json"}
        if VISION_FALLBACK_KEY:
            headers["Authorization"] = f"Bearer {VISION_FALLBACK_KEY}"

        req = urllib.request.Request(VISION_FALLBACK_URL, data=payload, headers=headers)
        # Context manager closes the HTTP response even if reading/parsing fails.
        with urllib.request.urlopen(req, timeout=30) as resp:
            body = json.loads(resp.read().decode())

        choices = body.get("choices", [])
        if choices:
            desc = choices[0].get("message", {}).get("content", "")
            if desc:
                cache[img_hash] = desc
                return desc
    except Exception as e:
        # Best effort: any failure is logged and reported as "no description".
        print(f"[vision-fallback] error describing image: {e}", file=sys.stderr)

    return None
|
|
||||||
|
|
||||||
|
|
||||||
def _preprocess_vision(messages, schema):
    """Replace image blocks with text descriptions when provider lacks vision support.

    Works on OpenAI Chat Completions message format (post-conversion).

    Args:
        messages: List of message dicts. Messages whose "content" is a list
            are rewritten in place; entries that are not dicts are skipped.
        schema: Object exposing a ``supports_vision`` attribute; when it is
            truthy nothing is changed.

    Returns:
        The same ``messages`` object. Every "image_url" / "input_image" part
        is replaced by a "text" part holding the description from
        ``_vision_describe_image()``, or a fixed placeholder when no
        description is available.
    """
    if schema.supports_vision:
        return messages

    cache = {}  # shared across all images of this call

    for msg in messages:
        if not isinstance(msg, dict):
            continue
        content = msg.get("content")
        if not isinstance(content, list):
            continue
        new_parts = []
        changed = False
        for part in content:
            if isinstance(part, dict) and part.get("type") in ("image_url", "input_image"):
                changed = True
                # Fall back to the part itself when "image_url" is absent or
                # empty, so a top-level "url" is still found.
                description = _vision_describe_image(part.get("image_url") or part, cache)
                if description:
                    new_parts.append({"type": "text", "text": f"[Image: {description}]"})
                else:
                    new_parts.append({"type": "text", "text": "[Image: description non disponible - modele text-only]"})
            else:
                new_parts.append(part)
        if changed:
            msg["content"] = new_parts

    return messages
|
|
||||||
|
|
||||||
|
|
||||||
def _preprocess_vision_input(input_data, schema):
    """Replace input_image blocks in Responses API input format with text descriptions.

    This runs BEFORE adapter.convert() so images are replaced before any
    conversion function can silently drop them.

    Args:
        input_data: Request input. Only a list is processed; "message" items
            whose "content" is a list are rewritten in place. Entries that
            are not dicts are skipped.
        schema: Object exposing a ``supports_vision`` attribute; when it is
            truthy nothing is changed.

    Returns:
        The same ``input_data`` object. Every "input_image" part is replaced
        by an "input_text" part holding the description from
        ``_vision_describe_image()``, or a fixed placeholder when no
        description is available.
    """
    if schema.supports_vision:
        return input_data
    if not isinstance(input_data, list):
        return input_data

    cache = {}  # shared across all images of this call

    for item in input_data:
        if not isinstance(item, dict) or item.get("type") != "message":
            continue
        content = item.get("content")
        if not isinstance(content, list):
            continue
        new_parts = []
        changed = False
        for part in content:
            if isinstance(part, dict) and part.get("type") == "input_image":
                changed = True
                # Fall back to the part itself when "image_url" is absent or
                # empty, so a top-level "url" is still found.
                description = _vision_describe_image(part.get("image_url") or part, cache)
                if description:
                    new_parts.append({"type": "input_text", "text": f"[Image: {description}]"})
                else:
                    new_parts.append({"type": "input_text", "text": "[Image: description non disponible - modele text-only]"})
            else:
                new_parts.append(part)
        if changed:
            item["content"] = new_parts

    return input_data
|
|
||||||
|
|
||||||
|
|
||||||
# ═══════════════════════════════════════════════════════════════════
|
# ═══════════════════════════════════════════════════════════════════
|
||||||
@@ -5322,14 +5182,22 @@ class Handler(http.server.BaseHTTPRequestHandler):
|
|||||||
body = dict(body)
|
body = dict(body)
|
||||||
body["input"] = input_data
|
body["input"] = input_data
|
||||||
|
|
||||||
# Strip images for non-vision models
|
# Vision preprocessing for non-vision models
|
||||||
input_data = _strip_images_from_input(input_data, model)
|
_schema = _load_schema(model=model)
|
||||||
body["input"] = input_data
|
_needs_vision_preprocess = False
|
||||||
|
if _schema and not _schema.supports_vision:
|
||||||
|
_needs_vision_preprocess = True
|
||||||
|
elif not _model_supports_vision(model):
|
||||||
|
print(f"[vision] model {model} detected as non-vision via name pattern, preprocessing images", file=sys.stderr)
|
||||||
|
if _schema:
|
||||||
|
_schema.supports_vision = False
|
||||||
|
_save_schema(_schema, model=model)
|
||||||
|
_needs_vision_preprocess = True
|
||||||
|
if _needs_vision_preprocess:
|
||||||
|
input_data = _preprocess_vision_input(input_data, _schema)
|
||||||
|
body["input"] = input_data
|
||||||
|
|
||||||
messages = oa_input_to_messages(input_data)
|
messages = oa_input_to_messages(input_data)
|
||||||
_schema = _load_schema(model=model)
|
|
||||||
if _schema and not _schema.supports_vision:
|
|
||||||
messages = _preprocess_vision(messages, _schema)
|
|
||||||
messages = _inject_stored_reasoning(messages)
|
messages = _inject_stored_reasoning(messages)
|
||||||
instructions = body.get("instructions", "").strip()
|
instructions = body.get("instructions", "").strip()
|
||||||
if instructions:
|
if instructions:
|
||||||
@@ -5384,7 +5252,7 @@ class Handler(http.server.BaseHTTPRequestHandler):
|
|||||||
input_data = _crof_compact_for_retry(input_data, model, aggression=attempt)
|
input_data = _crof_compact_for_retry(input_data, model, aggression=attempt)
|
||||||
body = dict(body)
|
body = dict(body)
|
||||||
body["input"] = input_data
|
body["input"] = input_data
|
||||||
messages = oa_input_to_messages(_strip_images_from_input(input_data, model))
|
messages = oa_input_to_messages(_preprocess_vision_input(input_data, _schema) if _schema and not _schema.supports_vision else input_data)
|
||||||
messages = _inject_stored_reasoning(messages)
|
messages = _inject_stored_reasoning(messages)
|
||||||
instructions = body.get("instructions", "").strip()
|
instructions = body.get("instructions", "").strip()
|
||||||
if instructions:
|
if instructions:
|
||||||
@@ -6517,7 +6385,7 @@ class Handler(http.server.BaseHTTPRequestHandler):
|
|||||||
print(f"[{self._session_id}] [smart-continue] XML injection retry failed: {e}", file=sys.stderr)
|
print(f"[{self._session_id}] [smart-continue] XML injection retry failed: {e}", file=sys.stderr)
|
||||||
break
|
break
|
||||||
_nudge_msg = {"role": "user", "content": nudge_text}
|
_nudge_msg = {"role": "user", "content": nudge_text}
|
||||||
nudge_messages = oa_input_to_messages(_strip_images_from_input(input_data, model)) + [_nudge_msg]
|
nudge_messages = oa_input_to_messages(_preprocess_vision_input(input_data, _schema) if _schema and not _schema.supports_vision else input_data) + [_nudge_msg]
|
||||||
instructions = body.get("instructions", "").strip()
|
instructions = body.get("instructions", "").strip()
|
||||||
if instructions:
|
if instructions:
|
||||||
nudge_messages.insert(0, {"role": "system", "content": instructions})
|
nudge_messages.insert(0, {"role": "system", "content": instructions})
|
||||||
|
|||||||
0
tests/__init__.py
Normal file
0
tests/__init__.py
Normal file
1168
tests/test_translate_proxy.py
Normal file
1168
tests/test_translate_proxy.py
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user