v3.11.6: Antigravity loop breakers, vision/OCR preprocessing, has_content fix, auth config error fix, install.ps1

This commit is contained in:
Roman | RyzenAdvanced
2026-05-26 18:07:42 +04:00
Unverified
parent b029e7cb5e
commit e59ef6f28a
8 changed files with 340 additions and 10 deletions

View File

@@ -157,7 +157,7 @@ Architecture:
import json, http.server, socketserver, urllib.request, urllib.parse, urllib.error, re
import time, uuid, os, sys, argparse, threading, socket, collections, contextlib, signal
import secrets, string
import secrets, string, hashlib
import dataclasses
import http.client
import selectors
@@ -219,6 +219,9 @@ def load_config():
"backend_type": ("PROXY_BACKEND", None, str),
"target_url": ("PROXY_TARGET_URL", "ZAI_BASE_URL", str),
"api_key": ("PROXY_API_KEY", "ZAI_API_KEY", str),
"vision_fallback_url": ("VISION_FALLBACK_URL", None, str),
"vision_fallback_model": ("VISION_FALLBACK_MODEL", None, str),
"vision_fallback_key": ("VISION_FALLBACK_KEY", None, str),
}
for ck, (ev1, ev2, conv) in env_map.items():
if ck not in cfg:
@@ -260,6 +263,9 @@ PROMPT_ENHANCER_MODE = "offline"
PROMPT_ENHANCER_MODEL = ""
PROMPT_ENHANCER_URL = ""
PROMPT_ENHANCER_KEY = ""
VISION_FALLBACK_URL = ""
VISION_FALLBACK_MODEL = ""
VISION_FALLBACK_KEY = ""
SERVER = None
if _IS_WINDOWS:
@@ -855,6 +861,7 @@ def _init_runtime():
global CONFIG, PORT, BACKEND, TARGET_URL, API_KEY, OAUTH_PROVIDER, _antigravity_version
global MODELS, CC_VERSION, REASONING_ENABLED, REASONING_EFFORT, BGP_ROUTES
global _api_key_pool, PROMPT_ENHANCER
global VISION_FALLBACK_URL, VISION_FALLBACK_MODEL, VISION_FALLBACK_KEY
CONFIG = load_config()
PORT = CONFIG["port"]
@@ -872,6 +879,9 @@ def _init_runtime():
PROMPT_ENHANCER_MODEL = CONFIG.get("prompt_enhancer_model", "")
PROMPT_ENHANCER_URL = CONFIG.get("prompt_enhancer_url", "")
PROMPT_ENHANCER_KEY = CONFIG.get("prompt_enhancer_key", "")
VISION_FALLBACK_URL = CONFIG.get("vision_fallback_url") or "https://api.kilo.ai/api/gateway/chat/completions"
VISION_FALLBACK_MODEL = CONFIG.get("vision_fallback_model") or "kilo-auto/small"
VISION_FALLBACK_KEY = CONFIG.get("vision_fallback_key") or ""
BGP_ROUTES = CONFIG.get("bgp_routes", [])
_api_key_pool = None
if API_KEY and "," in API_KEY and not OAUTH_PROVIDER.startswith("google") and BACKEND not in ("codebuff", "freebuff"):
@@ -2366,6 +2376,113 @@ def _mark_vision_fail(model):
with _vision_fail_lock:
_vision_fail_cache.add(model)
def _vision_describe_image(img_data, cache):
    """Ask the vision fallback endpoint for a text description of one image.

    Args:
        img_data: The image reference. Either a URL / data-URI string, or a
            dict carrying the URL under ``"url"`` or under an ``"image_url"``
            entry (which may itself be a string or a dict with a ``"url"``
            key).
        cache: Mutable dict mapping the MD5 hex digest of the image URL to a
            previously obtained description. It is consulted before any
            request is made and updated after a successful one.

    Returns:
        The description text, or ``None`` when no fallback URL is configured,
        no image URL can be extracted, the request fails, or the response
        carries no usable content.
    """
    if not VISION_FALLBACK_URL:
        return None

    # Unwrap the supported shapes down to a single URL value. Missing or
    # null references resolve to "" instead of being stringified to "None".
    candidate = img_data
    if isinstance(candidate, dict):
        candidate = candidate.get("url") or candidate.get("image_url")
        if isinstance(candidate, dict):
            candidate = candidate.get("url")
    img_url = str(candidate) if candidate else ""
    if not img_url:
        return None

    # The digest is only a cache key (data URIs can be very large), not a
    # security measure.
    img_hash = hashlib.md5(img_url.encode("utf-8", errors="replace")).hexdigest()
    if img_hash in cache:
        return cache[img_hash]

    try:
        payload = json.dumps({
            "model": VISION_FALLBACK_MODEL,
            "messages": [{"role": "user", "content": [
                {"type": "text", "text": "Describe the content of this image in detail. If it contains text, transcribe it fully."},
                {"type": "image_url", "image_url": {"url": img_url}},
            ]}],
            "max_tokens": 1024,
            "stream": False,
        }).encode()
        headers = {"Content-Type": "application/json"}
        if VISION_FALLBACK_KEY:
            headers["Authorization"] = f"Bearer {VISION_FALLBACK_KEY}"
        req = urllib.request.Request(VISION_FALLBACK_URL, data=payload, headers=headers)
        # Context manager guarantees the connection is released even when
        # reading or decoding the body raises.
        with urllib.request.urlopen(req, timeout=30) as resp:
            body = json.loads(resp.read().decode())
        choices = body.get("choices") or []
        if choices:
            msg = choices[0].get("message") or {}
            desc = msg.get("content") or ""
            if isinstance(desc, list):
                # Some OpenAI-compatible backends return content as a list
                # of parts; flatten the text parts into one string.
                desc = "".join(
                    p.get("text", "") for p in desc if isinstance(p, dict)
                )
            if desc:
                cache[img_hash] = desc
                return desc
    except Exception as e:
        # Best effort: a failed description must never break the request
        # being proxied, so log and fall through to None.
        print(f"[vision-fallback] error describing image: {e}", file=sys.stderr)
    return None
def _preprocess_vision(messages, schema):
    """Replace image parts in chat messages with text descriptions.

    Does nothing when ``schema.supports_vision`` is truthy. Otherwise every
    content part of type ``"image_url"`` or ``"input_image"`` is swapped for
    a ``{"type": "text"}`` part holding the description returned by
    ``_vision_describe_image``, or a fixed placeholder when no description
    could be obtained.

    Args:
        messages: List of chat message dicts. Messages whose ``content`` is
            a list containing image parts are updated in place.
        schema: Object exposing a ``supports_vision`` attribute.

    Returns:
        The same ``messages`` object that was passed in.
    """
    if schema.supports_vision:
        return messages

    # One cache per call so a repeated image is described only once.
    cache = {}
    for msg in messages or ():
        # Skip malformed entries instead of failing the whole request.
        if not isinstance(msg, dict):
            continue
        content = msg.get("content")
        if not isinstance(content, list):
            continue
        new_parts = []
        changed = False
        for part in content:
            if isinstance(part, dict) and part.get("type") in ("image_url", "input_image"):
                changed = True
                # `or part` covers both a missing and a null "image_url",
                # so a null value is never stringified into a bogus URL.
                description = _vision_describe_image(part.get("image_url") or part, cache)
                if description:
                    new_parts.append({"type": "text", "text": f"[Image: {description}]"})
                else:
                    new_parts.append({"type": "text", "text": "[Image: description unavailable - text-only model]"})
            else:
                new_parts.append(part)
        if changed:
            msg["content"] = new_parts
    return messages
def _preprocess_vision_input(input_data, schema):
    """Replace image parts in Responses-API style input with text descriptions.

    Does nothing when ``schema.supports_vision`` is truthy or when
    ``input_data`` is not a list. Otherwise, for every message item whose
    ``content`` is a list, parts of type ``"input_image"`` or ``"image_url"``
    are swapped for ``{"type": "input_text"}`` parts holding the description
    returned by ``_vision_describe_image``, or a fixed placeholder when no
    description could be obtained.

    Args:
        input_data: The request's ``input`` value. Message items containing
            image parts are updated in place.
        schema: Object exposing a ``supports_vision`` attribute.

    Returns:
        The same ``input_data`` object that was passed in.
    """
    if schema.supports_vision:
        return input_data
    if not isinstance(input_data, list):
        return input_data

    # One cache per call so a repeated image is described only once.
    cache = {}
    for item in input_data:
        # Skip malformed entries instead of failing the whole request.
        if not isinstance(item, dict):
            continue
        item_type = item.get("type")
        # "type" is optional on Responses API messages; an item that has a
        # role but no type is still a message.
        if item_type != "message" and not (item_type is None and "role" in item):
            continue
        content = item.get("content")
        if not isinstance(content, list):
            continue
        new_parts = []
        changed = False
        for part in content:
            if isinstance(part, dict) and part.get("type") in ("input_image", "image_url"):
                changed = True
                # "image_url" is a plain string in the Responses API and a
                # {"url": ...} dict in chat-completions style parts; accept
                # both, then fall back to a top-level "url".
                ref = part.get("image_url")
                if isinstance(ref, dict):
                    ref = ref.get("url")
                img_url = ref or part.get("url") or ""
                desc = _vision_describe_image({"url": img_url}, cache)
                if desc:
                    new_parts.append({"type": "input_text", "text": f"[Image: {desc}]"})
                else:
                    new_parts.append({"type": "input_text", "text": "[Image: description unavailable - text-only model]"})
            else:
                new_parts.append(part)
        if changed:
            item["content"] = new_parts
    return input_data
def _strip_images_from_input(input_data, model):
if not isinstance(input_data, list) or _model_supports_vision(model):
return input_data
@@ -4014,6 +4131,7 @@ class ProviderSchema:
})
response_format: str = "auto" # "sse" | "raw_json" | "ndjson" | "auto"
stream_format: str = "auto" # "sse_data" | "sse_event" | "raw_lines" | "json_lines"
supports_vision: bool = True
def hints(self) -> dict:
"""Return a dict for storing in provider-caps.json."""
@@ -4023,7 +4141,10 @@ class ProviderSchema:
continue
if isinstance(v, dict) and not v:
continue
if v is False:
if k == "supports_vision":
if v is not False:
continue
elif v is False:
continue
if v == "":
continue
@@ -4193,6 +4314,15 @@ class ErrorAnalyzer:
elif re.search(r"tool-call|tool_call.*format", err):
hints["tool_decl_format"] = "command_code"
# ── Response/Stream format hints from content-type or error ──
# ── Vision support detection ──
if re.search(r"unknown variant\b.*image_url", err) or \
re.search(r"unexpected.*image_url", err) or \
re.search(r"does not support.*image", err) or \
re.search(r"image.*not.*support", err) or \
re.search(r"unsupported.*content.*type.*image", err):
hints["supports_vision"] = False
# ── Response/Stream format hints from content-type or error ──
if re.search(r"content.type.*text/event.stream", err) or \
re.search(r"stream.*sse|sse.*expected", err):
@@ -4253,6 +4383,7 @@ def _load_schema(target_url=None, backend=None, model=None):
})),
response_format=data.get("response_format", "auto"),
stream_format=data.get("stream_format", "auto"),
supports_vision=data.get("supports_vision", True),
)
@@ -5053,6 +5184,9 @@ class Handler(http.server.BaseHTTPRequestHandler):
body["input"] = input_data
messages = oa_input_to_messages(input_data)
_schema = _load_schema(model=model)
if _schema and not _schema.supports_vision:
messages = _preprocess_vision(messages, _schema)
messages = _inject_stored_reasoning(messages)
instructions = body.get("instructions", "").strip()
if instructions:
@@ -5082,6 +5216,18 @@ class Handler(http.server.BaseHTTPRequestHandler):
upstream = urllib.request.urlopen(req, timeout=_upstream_timeout(body, stream))
except urllib.error.HTTPError as e:
err_body = e.read().decode()
if re.search(r"unknown variant\b.*image_url", err_body.lower()) or \
re.search(r"unexpected.*image_url", err_body.lower()) or \
re.search(r"does not support.*image", err_body.lower()):
_schema = _load_schema(model=model)
if _schema:
_schema.supports_vision = False
if attempt < max_retries:
print(f"[{self._session_id}] vision not supported, retrying with image preprocessing", file=sys.stderr)
messages = _preprocess_vision(messages, _schema) if _schema else messages
chat_body = self._build_chat_body(model, messages, body, stream)
chat_body_b = json.dumps(chat_body).encode()
continue
if "context_length_exceeded" in err_body and attempt < max_retries:
import re as _re
_tok_m = _re.search(r'~?(\d+)\s*tokens', err_body)
@@ -6869,7 +7015,8 @@ class Handler(http.server.BaseHTTPRequestHandler):
prev_content_type = None # for oscillation detection
for attempt in range(max_retries + 1):
adapter = SchemaAdapter(schema)
messages = adapter.convert(input_data, instructions)
processed_input = _preprocess_vision_input(input_data, schema) if not schema.supports_vision else input_data
messages = adapter.convert(processed_input, instructions)
use_cc_wrap = schema.cc_body_wrap or is_cc
# Build auth header from schema