v3.11.6: Antigravity loop breakers, vision/OCR preprocessing, has_content fix, auth config error fix, install.ps1
This commit is contained in:
@@ -157,7 +157,7 @@ Architecture:
|
||||
|
||||
import json, http.server, socketserver, urllib.request, urllib.parse, urllib.error, re
|
||||
import time, uuid, os, sys, argparse, threading, socket, collections, contextlib, signal
|
||||
import secrets, string
|
||||
import secrets, string, hashlib
|
||||
import dataclasses
|
||||
import http.client
|
||||
import selectors
|
||||
@@ -219,6 +219,9 @@ def load_config():
|
||||
"backend_type": ("PROXY_BACKEND", None, str),
|
||||
"target_url": ("PROXY_TARGET_URL", "ZAI_BASE_URL", str),
|
||||
"api_key": ("PROXY_API_KEY", "ZAI_API_KEY", str),
|
||||
"vision_fallback_url": ("VISION_FALLBACK_URL", None, str),
|
||||
"vision_fallback_model": ("VISION_FALLBACK_MODEL", None, str),
|
||||
"vision_fallback_key": ("VISION_FALLBACK_KEY", None, str),
|
||||
}
|
||||
for ck, (ev1, ev2, conv) in env_map.items():
|
||||
if ck not in cfg:
|
||||
@@ -260,6 +263,9 @@ PROMPT_ENHANCER_MODE = "offline"
|
||||
PROMPT_ENHANCER_MODEL = ""
|
||||
PROMPT_ENHANCER_URL = ""
|
||||
PROMPT_ENHANCER_KEY = ""
|
||||
VISION_FALLBACK_URL = ""
|
||||
VISION_FALLBACK_MODEL = ""
|
||||
VISION_FALLBACK_KEY = ""
|
||||
SERVER = None
|
||||
|
||||
if _IS_WINDOWS:
|
||||
@@ -855,6 +861,7 @@ def _init_runtime():
|
||||
global CONFIG, PORT, BACKEND, TARGET_URL, API_KEY, OAUTH_PROVIDER, _antigravity_version
|
||||
global MODELS, CC_VERSION, REASONING_ENABLED, REASONING_EFFORT, BGP_ROUTES
|
||||
global _api_key_pool, PROMPT_ENHANCER
|
||||
global VISION_FALLBACK_URL, VISION_FALLBACK_MODEL, VISION_FALLBACK_KEY
|
||||
|
||||
CONFIG = load_config()
|
||||
PORT = CONFIG["port"]
|
||||
@@ -872,6 +879,9 @@ def _init_runtime():
|
||||
PROMPT_ENHANCER_MODEL = CONFIG.get("prompt_enhancer_model", "")
|
||||
PROMPT_ENHANCER_URL = CONFIG.get("prompt_enhancer_url", "")
|
||||
PROMPT_ENHANCER_KEY = CONFIG.get("prompt_enhancer_key", "")
|
||||
VISION_FALLBACK_URL = CONFIG.get("vision_fallback_url") or "https://api.kilo.ai/api/gateway/chat/completions"
|
||||
VISION_FALLBACK_MODEL = CONFIG.get("vision_fallback_model") or "kilo-auto/small"
|
||||
VISION_FALLBACK_KEY = CONFIG.get("vision_fallback_key") or ""
|
||||
BGP_ROUTES = CONFIG.get("bgp_routes", [])
|
||||
_api_key_pool = None
|
||||
if API_KEY and "," in API_KEY and not OAUTH_PROVIDER.startswith("google") and BACKEND not in ("codebuff", "freebuff"):
|
||||
@@ -2366,6 +2376,113 @@ def _mark_vision_fail(model):
|
||||
with _vision_fail_lock:
|
||||
_vision_fail_cache.add(model)
|
||||
|
||||
def _vision_describe_image(img_data, cache):
    """Ask the vision fallback endpoint for a text description of one image.

    Args:
        img_data: The image reference. Either a bare URL string, or a dict
            carrying the URL under ``"url"`` or nested under ``"image_url"``
            (as a string or as ``{"url": ...}``).
        cache: Mutable dict mapping the MD5 hex digest of the URL to a
            previously obtained description. It is read and updated here.

    Returns:
        The description string, or ``None`` when no fallback URL is
        configured, no image URL could be resolved, the request failed, or
        the response carried no usable text.
    """
    if not VISION_FALLBACK_URL:
        return None

    # Resolve the URL from the accepted shapes.
    if isinstance(img_data, dict):
        img_url = img_data.get("url") or ""
        if not img_url:
            inner = img_data.get("image_url", img_data)
            img_url = inner.get("url") if isinstance(inner, dict) else inner
    else:
        img_url = img_data

    # A missing reference must stay empty; str(None) would otherwise send the
    # literal URL "None" to the fallback endpoint.
    img_url = "" if img_url is None else str(img_url)
    if not img_url:
        return None

    # MD5 is only a compact cache key here (data URIs can be very long); it is
    # not used for anything security-sensitive.
    img_hash = hashlib.md5(img_url.encode("utf-8", errors="replace")).hexdigest()
    if img_hash in cache:
        return cache[img_hash]

    try:
        payload = json.dumps({
            "model": VISION_FALLBACK_MODEL,
            "messages": [{"role": "user", "content": [
                {"type": "text", "text": "Describe the content of this image in detail. If it contains text, transcribe it fully."},
                {"type": "image_url", "image_url": {"url": img_url}},
            ]}],
            "max_tokens": 1024,
            "stream": False,
        }).encode()
        headers = {"Content-Type": "application/json"}
        if VISION_FALLBACK_KEY:
            headers["Authorization"] = f"Bearer {VISION_FALLBACK_KEY}"
        req = urllib.request.Request(VISION_FALLBACK_URL, data=payload, headers=headers)
        # Context manager closes the connection even when reading/parsing fails.
        with urllib.request.urlopen(req, timeout=30) as resp:
            body = json.loads(resp.read().decode())

        choices = body.get("choices", [])
        if choices:
            msg = choices[0].get("message") or {}
            desc = msg.get("content", "")
            # Some backends return content as a list of parts; keep the text.
            if isinstance(desc, list):
                desc = "".join(
                    p["text"] for p in desc
                    if isinstance(p, dict) and isinstance(p.get("text"), str)
                )
            if isinstance(desc, str) and desc:
                cache[img_hash] = desc
                return desc
    except Exception as e:
        # Best effort: any failure degrades to "no description" for the caller.
        print(f"[vision-fallback] error describing image: {e}", file=sys.stderr)
    return None
|
||||
|
||||
|
||||
def _preprocess_vision(messages, schema):
|
||||
"""Replace image blocks with text descriptions when provider lacks vision support."""
|
||||
if schema.supports_vision:
|
||||
return messages
|
||||
cache = {}
|
||||
for msg in messages:
|
||||
content = msg.get("content")
|
||||
if not isinstance(content, list):
|
||||
continue
|
||||
new_parts = []
|
||||
changed = False
|
||||
for part in content:
|
||||
if isinstance(part, dict) and part.get("type") in ("image_url", "input_image"):
|
||||
changed = True
|
||||
img_data = part.get("image_url", part)
|
||||
description = _vision_describe_image(img_data, cache)
|
||||
if description:
|
||||
new_parts.append({"type": "text", "text": f"[Image: {description}]"})
|
||||
else:
|
||||
new_parts.append({"type": "text", "text": "[Image: description unavailable - text-only model]"})
|
||||
else:
|
||||
new_parts.append(part)
|
||||
if changed:
|
||||
msg["content"] = new_parts
|
||||
return messages
|
||||
|
||||
|
||||
def _preprocess_vision_input(input_data, schema):
|
||||
"""Replace input_image blocks in Responses API input format with text descriptions."""
|
||||
if schema.supports_vision:
|
||||
return input_data
|
||||
if not isinstance(input_data, list):
|
||||
return input_data
|
||||
cache = {}
|
||||
changed_any = False
|
||||
for item in input_data:
|
||||
if item.get("type") != "message":
|
||||
continue
|
||||
content = item.get("content")
|
||||
if not isinstance(content, list):
|
||||
continue
|
||||
new_parts = []
|
||||
changed = False
|
||||
for part in content:
|
||||
if isinstance(part, dict) and part.get("type") in ("input_image", "image_url"):
|
||||
changed = True
|
||||
img_url = ""
|
||||
if part.get("type") == "input_image":
|
||||
img_url = part.get("image_url", {}).get("url", "")
|
||||
else:
|
||||
img_url = part.get("image_url", {}).get("url", part.get("url", ""))
|
||||
desc = _vision_describe_image({"url": img_url}, cache)
|
||||
if desc:
|
||||
new_parts.append({"type": "input_text", "text": f"[Image: {desc}]"})
|
||||
else:
|
||||
new_parts.append({"type": "input_text", "text": "[Image: description unavailable - text-only model]"})
|
||||
else:
|
||||
new_parts.append(part)
|
||||
if changed:
|
||||
item["content"] = new_parts
|
||||
changed_any = True
|
||||
return input_data
|
||||
|
||||
def _strip_images_from_input(input_data, model):
|
||||
if not isinstance(input_data, list) or _model_supports_vision(model):
|
||||
return input_data
|
||||
@@ -4014,6 +4131,7 @@ class ProviderSchema:
|
||||
})
|
||||
response_format: str = "auto" # "sse" | "raw_json" | "ndjson" | "auto"
|
||||
stream_format: str = "auto" # "sse_data" | "sse_event" | "raw_lines" | "json_lines"
|
||||
supports_vision: bool = True
|
||||
|
||||
def hints(self) -> dict:
|
||||
"""Return a dict for storing in provider-caps.json."""
|
||||
@@ -4023,7 +4141,10 @@ class ProviderSchema:
|
||||
continue
|
||||
if isinstance(v, dict) and not v:
|
||||
continue
|
||||
if v is False:
|
||||
if k == "supports_vision":
|
||||
if v is not False:
|
||||
continue
|
||||
elif v is False:
|
||||
continue
|
||||
if v == "":
|
||||
continue
|
||||
@@ -4193,6 +4314,15 @@ class ErrorAnalyzer:
|
||||
elif re.search(r"tool-call|tool_call.*format", err):
|
||||
hints["tool_decl_format"] = "command_code"
|
||||
|
||||
# ── Response/Stream format hints from content-type or error ──
|
||||
# ── Vision support detection ──
|
||||
if re.search(r"unknown variant\b.*image_url", err) or \
|
||||
re.search(r"unexpected.*image_url", err) or \
|
||||
re.search(r"does not support.*image", err) or \
|
||||
re.search(r"image.*not.*support", err) or \
|
||||
re.search(r"unsupported.*content.*type.*image", err):
|
||||
hints["supports_vision"] = False
|
||||
|
||||
# ── Response/Stream format hints from content-type or error ──
|
||||
if re.search(r"content.type.*text/event.stream", err) or \
|
||||
re.search(r"stream.*sse|sse.*expected", err):
|
||||
@@ -4253,6 +4383,7 @@ def _load_schema(target_url=None, backend=None, model=None):
|
||||
})),
|
||||
response_format=data.get("response_format", "auto"),
|
||||
stream_format=data.get("stream_format", "auto"),
|
||||
supports_vision=data.get("supports_vision", True),
|
||||
)
|
||||
|
||||
|
||||
@@ -5053,6 +5184,9 @@ class Handler(http.server.BaseHTTPRequestHandler):
|
||||
body["input"] = input_data
|
||||
|
||||
messages = oa_input_to_messages(input_data)
|
||||
_schema = _load_schema(model=model)
|
||||
if _schema and not _schema.supports_vision:
|
||||
messages = _preprocess_vision(messages, _schema)
|
||||
messages = _inject_stored_reasoning(messages)
|
||||
instructions = body.get("instructions", "").strip()
|
||||
if instructions:
|
||||
@@ -5082,6 +5216,18 @@ class Handler(http.server.BaseHTTPRequestHandler):
|
||||
upstream = urllib.request.urlopen(req, timeout=_upstream_timeout(body, stream))
|
||||
except urllib.error.HTTPError as e:
|
||||
err_body = e.read().decode()
|
||||
if re.search(r"unknown variant\b.*image_url", err_body.lower()) or \
|
||||
re.search(r"unexpected.*image_url", err_body.lower()) or \
|
||||
re.search(r"does not support.*image", err_body.lower()):
|
||||
_schema = _load_schema(model=model)
|
||||
if _schema:
|
||||
_schema.supports_vision = False
|
||||
if attempt < max_retries:
|
||||
print(f"[{self._session_id}] vision not supported, retrying with image preprocessing", file=sys.stderr)
|
||||
messages = _preprocess_vision(messages, _schema) if _schema else messages
|
||||
chat_body = self._build_chat_body(model, messages, body, stream)
|
||||
chat_body_b = json.dumps(chat_body).encode()
|
||||
continue
|
||||
if "context_length_exceeded" in err_body and attempt < max_retries:
|
||||
import re as _re
|
||||
_tok_m = _re.search(r'~?(\d+)\s*tokens', err_body)
|
||||
@@ -6869,7 +7015,8 @@ class Handler(http.server.BaseHTTPRequestHandler):
|
||||
prev_content_type = None # for oscillation detection
|
||||
for attempt in range(max_retries + 1):
|
||||
adapter = SchemaAdapter(schema)
|
||||
messages = adapter.convert(input_data, instructions)
|
||||
processed_input = _preprocess_vision_input(input_data, schema) if not schema.supports_vision else input_data
|
||||
messages = adapter.convert(processed_input, instructions)
|
||||
use_cc_wrap = schema.cc_body_wrap or is_cc
|
||||
|
||||
# Build auth header from schema
|
||||
|
||||
Reference in New Issue
Block a user