v3.11.5: token-aware compaction, vision filter, universal adaptive compaction, smart-continue text detection
This commit is contained in:
@@ -1469,6 +1469,53 @@ _CROF_ADAPTIVE = {
|
||||
"min_keep_recent": 6,
|
||||
}
|
||||
|
||||
_model_max_tokens = {}
|
||||
_model_max_tokens_lock = threading.Lock()
|
||||
|
||||
def _estimate_tokens(item):
|
||||
if not isinstance(item, dict):
|
||||
return 4
|
||||
t = item.get("type", "")
|
||||
if t == "message":
|
||||
content = item.get("content", "")
|
||||
if isinstance(content, str):
|
||||
return max(4, len(content) // 4)
|
||||
elif isinstance(content, list):
|
||||
total = 4
|
||||
for part in content:
|
||||
pt = part.get("type", "")
|
||||
if pt in ("input_text", "output_text"):
|
||||
total += max(4, len(part.get("text", "")) // 4)
|
||||
elif pt == "input_image":
|
||||
total += 800
|
||||
elif pt in ("function_call",):
|
||||
total += max(20, len(part.get("arguments", "{}")) // 2)
|
||||
elif pt == "function_call_output":
|
||||
total += max(8, len(part.get("output", "")) // 4)
|
||||
return total
|
||||
elif t in ("function_call_output",):
|
||||
return max(8, len(item.get("output", "")) // 4)
|
||||
elif t == "function_call":
|
||||
return max(20, len(item.get("arguments", "{}")) // 2)
|
||||
return 4
|
||||
|
||||
def _estimate_input_tokens(input_data):
|
||||
if not isinstance(input_data, list):
|
||||
return 0
|
||||
return sum(_estimate_tokens(i) for i in input_data)
|
||||
|
||||
def _get_model_max_tokens(model):
|
||||
with _model_max_tokens_lock:
|
||||
return _model_max_tokens.get(model)
|
||||
|
||||
def _set_model_max_tokens(model, tokens):
|
||||
if model and tokens:
|
||||
with _model_max_tokens_lock:
|
||||
existing = _model_max_tokens.get(model)
|
||||
if existing is None or tokens < existing:
|
||||
_model_max_tokens[model] = tokens
|
||||
print(f"[ctx-limit] learned {model} max ~{tokens} tokens", file=sys.stderr)
|
||||
|
||||
_BGP_STATS_PATH = os.path.join(_LOG_DIR, "bgp-route-stats.json")
|
||||
_bgp_stats_lock = threading.Lock()
|
||||
|
||||
@@ -1534,8 +1581,6 @@ def _sorted_bgp_routes():
|
||||
return sorted(BGP_ROUTES, key=lambda r: _score_route(r, stats))
|
||||
|
||||
def _crof_record(model, n_items, success):
|
||||
if TARGET_URL and "crof.ai" not in TARGET_URL:
|
||||
return
|
||||
if not isinstance(n_items, int) or n_items < 1:
|
||||
return
|
||||
entry = {"model": model, "items": n_items, "ok": success}
|
||||
@@ -1561,20 +1606,36 @@ def _crof_record(model, n_items, success):
|
||||
global_limit = v["limit"]
|
||||
_CROF_ADAPTIVE["global_item_limit"] = global_limit
|
||||
|
||||
if TARGET_URL and "crof.ai" in TARGET_URL:
|
||||
print(f"[crof-adaptive] model={model} items={n_items} {'OK' if success else 'FAIL'} -> limit={ml.get('limit',30)} global={global_limit}", file=sys.stderr)
|
||||
print(f"[crof-adaptive] model={model} items={n_items} {'OK' if success else 'FAIL'} -> limit={ml.get('limit',30)} global={global_limit}", file=sys.stderr)
|
||||
|
||||
def _crof_item_limit(model):
|
||||
ml = _CROF_ADAPTIVE["model_limits"].get(model, {})
|
||||
per_model = ml.get("limit", 30)
|
||||
return min(per_model, _CROF_ADAPTIVE["global_item_limit"])
|
||||
|
||||
def _crof_compact_for_retry(input_data, model):
|
||||
def _crof_compact_for_retry(input_data, model, aggression=0):
|
||||
limit = _crof_item_limit(model)
|
||||
if not isinstance(input_data, list) or len(input_data) <= limit:
|
||||
if not isinstance(input_data, list) or len(input_data) < 2:
|
||||
return input_data
|
||||
|
||||
max_tok = _get_model_max_tokens(model)
|
||||
est = _estimate_input_tokens(input_data)
|
||||
over_item_limit = len(input_data) > limit
|
||||
over_token_limit = max_tok and est >= max_tok * 0.9
|
||||
|
||||
if not over_item_limit and not over_token_limit:
|
||||
return input_data
|
||||
|
||||
keep = max(_CROF_ADAPTIVE["min_keep_recent"], limit // 3)
|
||||
if over_token_limit:
|
||||
ratio = est / max_tok
|
||||
if aggression >= 1 or ratio > 1.5:
|
||||
keep = max(2, _CROF_ADAPTIVE["min_keep_recent"] // 2)
|
||||
elif ratio > 1.2:
|
||||
keep = max(3, keep // 2)
|
||||
print(f"[ctx-limit] model={model} est={est}tok max={max_tok}tok ratio={ratio:.2f} -> keep={keep}", file=sys.stderr)
|
||||
elif over_item_limit:
|
||||
keep = max(keep, 6)
|
||||
head_end = 0
|
||||
for i, item in enumerate(input_data):
|
||||
t = item.get("type")
|
||||
@@ -1607,8 +1668,7 @@ def _crof_compact_for_retry(input_data, model):
|
||||
summary_lines.append(_item_summary(item, max_len=120))
|
||||
|
||||
summary_msg = {"type": "message", "role": "user", "content": [{"type": "input_text", "text": "\n".join(summary_lines)}]}
|
||||
if TARGET_URL and "crof.ai" in TARGET_URL:
|
||||
print(f"[crof-adaptive] RETRY compact: {len(input_data)} -> {len(head)+1+len(tail)} (limit={limit}, keep={len(tail)})", file=sys.stderr)
|
||||
print(f"[crof-adaptive] RETRY compact: {len(input_data)} -> {len(head)+1+len(tail)} (limit={limit}, keep={len(tail)}, agg={aggression})", file=sys.stderr)
|
||||
return head + [summary_msg] + tail
|
||||
|
||||
def _item_summary(item, max_len=200):
|
||||
@@ -2051,6 +2111,18 @@ def synthesize_tool_results_for_chat(input_items):
|
||||
def has_function_call_output(input_items):
|
||||
return isinstance(input_items, list) and any(i.get("type") == "function_call_output" for i in input_items)
|
||||
|
||||
_TOOL_CALL_TEXT_PATTERNS = re.compile(
|
||||
r'(?:^|\n)[\s•\-\*]*\(?'
|
||||
r'(?:exec_command|write_to_file|exec_bash|bash|run_command|shell|edit_file|read_file|search_files|list_files)'
|
||||
r'[\s:]',
|
||||
re.I | re.MULTILINE
|
||||
)
|
||||
|
||||
def _text_looks_like_tool_calls(text):
|
||||
if not text or len(text) < 6:
|
||||
return False
|
||||
return bool(_TOOL_CALL_TEXT_PATTERNS.search(text))
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
# Log redaction
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
@@ -2233,9 +2305,14 @@ def _normalize_tool_args(raw_args):
|
||||
except json.JSONDecodeError:
|
||||
return raw_args
|
||||
|
||||
_XML_TC_RE = re.compile(r'<tool_call>(\w+)(.*?)</tool_call>', re.DOTALL)
|
||||
_XML_TC_RE = re.compile(r'exec_command(.*?)</invoke>', re.DOTALL)
|
||||
_XML_ARG_VALUE_RE = re.compile(r'</?arg_value>\s*')
|
||||
|
||||
_PAREN_TC_RE = re.compile(
|
||||
r'(?:^|[\n•\-\*]\s*)\(\s*(exec_command|write_to_file|exec_bash|bash|run_command|shell|edit_file|read_file|search_files|list_files)\b\s*(.*?)\)',
|
||||
re.DOTALL | re.I
|
||||
)
|
||||
|
||||
def _extract_xml_tool_calls(text):
|
||||
if not text:
|
||||
return []
|
||||
@@ -2262,6 +2339,68 @@ def _extract_xml_tool_calls(text):
|
||||
results.append({"name": name, "args": args_str, "call_id": f"xml_{len(results)}"})
|
||||
return results
|
||||
|
||||
_NON_VISION_MODEL_PATTERNS = re.compile(
|
||||
r'\b(deepseek|glm|mixtral|llama\b(?!.*vision)|command|dbrx|qwen\b(?!.*vl)|phi-?3(?!.*vision))',
|
||||
re.I
|
||||
)
|
||||
|
||||
_vision_fail_cache = set()
|
||||
_vision_fail_lock = threading.Lock()
|
||||
|
||||
def _model_supports_vision(model):
|
||||
if not model:
|
||||
return True
|
||||
with _vision_fail_lock:
|
||||
if model in _vision_fail_cache:
|
||||
return False
|
||||
if _NON_VISION_MODEL_PATTERNS.search(model):
|
||||
return False
|
||||
return True
|
||||
|
||||
def _mark_vision_fail(model):
|
||||
if model:
|
||||
with _vision_fail_lock:
|
||||
_vision_fail_cache.add(model)
|
||||
|
||||
def _strip_images_from_input(input_data, model):
|
||||
if not isinstance(input_data, list) or _model_supports_vision(model):
|
||||
return input_data
|
||||
modified = False
|
||||
result = []
|
||||
for item in input_data:
|
||||
if item.get("type") != "message":
|
||||
result.append(item)
|
||||
continue
|
||||
content = item.get("content", [])
|
||||
if isinstance(content, str):
|
||||
result.append(item)
|
||||
continue
|
||||
new_content = []
|
||||
has_img = False
|
||||
for part in content:
|
||||
if isinstance(part, str):
|
||||
new_content.append(part)
|
||||
continue
|
||||
pt = part.get("type", "")
|
||||
if pt in ("input_image", "image_url"):
|
||||
if not has_img:
|
||||
fname = part.get("image_url", {}).get("url", part.get("url", "image.png"))
|
||||
if fname.startswith("data:"):
|
||||
fname = "screenshot.png"
|
||||
new_content.append({"type": "output_text", "text": f"[User attached image: {fname} — this model does not support vision]"})
|
||||
has_img = True
|
||||
modified = True
|
||||
else:
|
||||
new_content.append(part)
|
||||
if modified:
|
||||
result.append({**item, "content": new_content})
|
||||
else:
|
||||
result.append(item)
|
||||
if modified:
|
||||
print(f"[vision-filter] stripped {sum(1 for i in input_data if i.get('type')=='message' and any(c.get('type') in ('input_image','image_url') for c in (i.get('content') or []) if isinstance(c,dict)))} images for model={model}", file=sys.stderr)
|
||||
return result
|
||||
return input_data
|
||||
|
||||
def oa_input_to_messages(input_data):
|
||||
msgs = []
|
||||
tool_name_by_id = {}
|
||||
@@ -4889,12 +5028,25 @@ class Handler(http.server.BaseHTTPRequestHandler):
|
||||
body["input"] = input_data
|
||||
|
||||
crof_limit = _crof_item_limit(model)
|
||||
_crof_eligible = TARGET_URL and "crof.ai" in TARGET_URL
|
||||
if _crof_eligible and not compacted and isinstance(input_data, list) and len(input_data) > crof_limit:
|
||||
print(f"[crof-adaptive] proactive compact: {len(input_data)} items > limit {crof_limit}", file=sys.stderr)
|
||||
input_data = _crof_compact_for_retry(input_data, model)
|
||||
body = dict(body)
|
||||
body["input"] = input_data
|
||||
_crof_eligible = True
|
||||
if _crof_eligible and not compacted and isinstance(input_data, list):
|
||||
_needs_compact = len(input_data) > crof_limit
|
||||
max_tok = _get_model_max_tokens(model)
|
||||
est_tok = _estimate_input_tokens(input_data) if max_tok else 0
|
||||
if not _needs_compact and max_tok and est_tok > max_tok * 0.8:
|
||||
_needs_compact = True
|
||||
if _needs_compact:
|
||||
_agg = 0
|
||||
if max_tok and est_tok > max_tok:
|
||||
_agg = 1
|
||||
print(f"[crof-adaptive] proactive compact: {len(input_data)} items, est={est_tok}tok max={max_tok}tok agg={_agg}", file=sys.stderr)
|
||||
input_data = _crof_compact_for_retry(input_data, model, aggression=_agg)
|
||||
body = dict(body)
|
||||
body["input"] = input_data
|
||||
|
||||
# Strip images for non-vision models
|
||||
input_data = _strip_images_from_input(input_data, model)
|
||||
body["input"] = input_data
|
||||
|
||||
messages = oa_input_to_messages(input_data)
|
||||
messages = _inject_stored_reasoning(messages)
|
||||
@@ -4927,14 +5079,19 @@ class Handler(http.server.BaseHTTPRequestHandler):
|
||||
except urllib.error.HTTPError as e:
|
||||
err_body = e.read().decode()
|
||||
if "context_length_exceeded" in err_body and attempt < max_retries:
|
||||
print(f"[{self._session_id}] context_length_exceeded (attempt {attempt+1}/{max_retries}), retrying with extreme compaction!", file=sys.stderr)
|
||||
import re as _re
|
||||
_tok_m = _re.search(r'~?(\d+)\s*tokens', err_body)
|
||||
if _tok_m:
|
||||
_set_model_max_tokens(model, int(_tok_m.group(1)))
|
||||
print(f"[{self._session_id}] context_length_exceeded (attempt {attempt+1}/{max_retries}), retrying with compaction (agg={attempt})!", file=sys.stderr)
|
||||
policy = provider_policy()
|
||||
if isinstance(input_data, list):
|
||||
print(f"[{self._session_id}] applying extreme compaction to {len(input_data)} items", file=sys.stderr)
|
||||
input_data = _crof_compact_for_retry(input_data, model)
|
||||
est = _estimate_input_tokens(input_data)
|
||||
print(f"[{self._session_id}] applying compaction to {len(input_data)} items ~{est}tok", file=sys.stderr)
|
||||
input_data = _crof_compact_for_retry(input_data, model, aggression=attempt)
|
||||
body = dict(body)
|
||||
body["input"] = input_data
|
||||
messages = oa_input_to_messages(input_data)
|
||||
messages = oa_input_to_messages(_strip_images_from_input(input_data, model))
|
||||
messages = _inject_stored_reasoning(messages)
|
||||
instructions = body.get("instructions", "").strip()
|
||||
if instructions:
|
||||
@@ -5725,9 +5882,11 @@ class Handler(http.server.BaseHTTPRequestHandler):
|
||||
last_status = None
|
||||
finish_reason = None
|
||||
has_content = False
|
||||
has_message = False
|
||||
has_tool_call = False
|
||||
|
||||
def _observe_event(event):
|
||||
nonlocal last_resp_id, last_output, last_status, finish_reason, has_content
|
||||
nonlocal last_resp_id, last_output, last_status, finish_reason, has_content, has_message, has_tool_call
|
||||
for line in event.strip().split("\n"):
|
||||
if line.startswith("data: "):
|
||||
try:
|
||||
@@ -5737,7 +5896,9 @@ class Handler(http.server.BaseHTTPRequestHandler):
|
||||
last_output = d.get("response", {}).get("output", [])
|
||||
last_status = d.get("response", {}).get("status")
|
||||
finish_reason = "length" if last_status == "incomplete" else "stop"
|
||||
has_content = any(o.get("type") == "message" for o in (last_output or []))
|
||||
has_tool_call = any(o.get("type") == "function_call" for o in (last_output or []))
|
||||
has_message = any(o.get("type") == "message" for o in (last_output or []))
|
||||
has_content = has_message or has_tool_call
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
@@ -5749,7 +5910,7 @@ class Handler(http.server.BaseHTTPRequestHandler):
|
||||
break
|
||||
collected_events.append(event)
|
||||
_observe_event(event)
|
||||
print(f"[{self._session_id}] stream ended: events={len(collected_events)} finish={finish_reason} has_content={has_content} elapsed={time.time()-t0:.1f}s", file=sys.stderr)
|
||||
print(f"[{self._session_id}] stream ended: events={len(collected_events)} finish={finish_reason} has_content={has_content} has_message={has_message} has_tool_call={has_tool_call} elapsed={time.time()-t0:.1f}s", file=sys.stderr)
|
||||
except (ConnectionResetError, BrokenPipeError, ConnectionAbortedError):
|
||||
print("[translate-proxy] client disconnected during stream", file=sys.stderr)
|
||||
_crof_record(model, n_items, False)
|
||||
@@ -5805,6 +5966,8 @@ class Handler(http.server.BaseHTTPRequestHandler):
|
||||
last_resp_id = last_output = last_status = None
|
||||
finish_reason = None
|
||||
has_content = False
|
||||
has_message = False
|
||||
has_tool_call = False
|
||||
for event in oa_stream_to_sse(retry_upstream, model, body.get("request_id") or body.get("id")):
|
||||
collected_events.append(event)
|
||||
_observe_event(event)
|
||||
@@ -5813,7 +5976,7 @@ class Handler(http.server.BaseHTTPRequestHandler):
|
||||
print(f"[provider-sensor] synthetic retry failed: {e}", file=sys.stderr)
|
||||
|
||||
# Auto-retry on finish_reason=length with no content due to too much context.
|
||||
if finish_reason == "length" and not has_content and isinstance(input_data, list) and len(input_data) > 5 and TARGET_URL and "crof.ai" in TARGET_URL:
|
||||
if finish_reason == "length" and not has_content and isinstance(input_data, list) and len(input_data) > 5:
|
||||
print(f"[crof-adaptive] RETRY: finish_reason=length with no content, compacting {n_items} items", file=sys.stderr)
|
||||
new_input = _crof_compact_for_retry(input_data, model)
|
||||
if len(new_input) < len(input_data):
|
||||
@@ -5836,6 +5999,8 @@ class Handler(http.server.BaseHTTPRequestHandler):
|
||||
last_resp_id = last_output = last_status = None
|
||||
finish_reason = None
|
||||
has_content = False
|
||||
has_message = False
|
||||
has_tool_call = False
|
||||
for event in oa_stream_to_sse(retry_upstream, model, body.get("request_id") or body.get("id")):
|
||||
collected_events.append(event)
|
||||
_observe_event(event)
|
||||
@@ -5943,9 +6108,17 @@ class Handler(http.server.BaseHTTPRequestHandler):
|
||||
_smart_attempt = 0
|
||||
while _smart_attempt < _smart_max:
|
||||
_has_tool_calls_in_output = any(o.get("type") == "function_call" for o in (last_output or []))
|
||||
last_text = ""
|
||||
for o in (last_output or []):
|
||||
if o.get("type") == "message":
|
||||
for c in (o.get("content") or []):
|
||||
if isinstance(c, dict) and c.get("type") == "output_text":
|
||||
last_text += c.get("text", "")
|
||||
_looks_like_tools = _text_looks_like_tool_calls(last_text)
|
||||
_has_prior_tool_ctx = has_function_call_output(input_data)
|
||||
if not (finish_reason == "stop" and has_content and not _has_tool_calls_in_output
|
||||
and isinstance(input_data, list) and len(input_data) >= 3
|
||||
and has_function_call_output(input_data)):
|
||||
and (_has_prior_tool_ctx or _looks_like_tools)):
|
||||
break
|
||||
_smart_attempt += 1
|
||||
_nudges = [
|
||||
@@ -5954,12 +6127,6 @@ class Handler(http.server.BaseHTTPRequestHandler):
|
||||
]
|
||||
nudge_text = _nudges[min(_smart_attempt - 1, len(_nudges) - 1)]
|
||||
# Try extracting XML tool calls from text as fallback before nudging
|
||||
last_text = ""
|
||||
for o in (last_output or []):
|
||||
if o.get("type") == "message":
|
||||
for c in (o.get("content") or []):
|
||||
if isinstance(c, dict) and c.get("type") == "output_text":
|
||||
last_text += c.get("text", "")
|
||||
xml_fc = _extract_xml_tool_calls(last_text)
|
||||
if xml_fc:
|
||||
print(f"[{self._session_id}] [smart-continue] extracted {len(xml_fc)} XML tool calls from text, injecting and retrying", file=sys.stderr)
|
||||
@@ -5979,6 +6146,8 @@ class Handler(http.server.BaseHTTPRequestHandler):
|
||||
last_resp_id = last_output = last_status = None
|
||||
finish_reason = None
|
||||
has_content = False
|
||||
has_message = False
|
||||
has_tool_call = False
|
||||
for event in oa_stream_to_sse(retry_upstream, model, body.get("request_id") or body.get("id")):
|
||||
collected_events.append(event)
|
||||
_observe_event(event)
|
||||
@@ -5988,19 +6157,21 @@ class Handler(http.server.BaseHTTPRequestHandler):
|
||||
print(f"[{self._session_id}] [smart-continue] XML injection retry failed: {e}", file=sys.stderr)
|
||||
break
|
||||
_nudge_msg = {"role": "user", "content": nudge_text}
|
||||
nudge_messages = oa_input_to_messages(input_data) + [_nudge_msg]
|
||||
nudge_messages = oa_input_to_messages(_strip_images_from_input(input_data, model)) + [_nudge_msg]
|
||||
instructions = body.get("instructions", "").strip()
|
||||
if instructions:
|
||||
nudge_messages.insert(0, {"role": "system", "content": instructions})
|
||||
nudge_chat_body = self._build_chat_body(model, nudge_messages, body, stream)
|
||||
nudge_req = urllib.request.Request(target, data=json.dumps(nudge_chat_body).encode(), headers=fwd)
|
||||
print(f"[{self._session_id}] [smart-continue] attempt {_smart_attempt}/{_smart_max}: model stopped mid-task, nudging", file=sys.stderr)
|
||||
print(f"[{self._session_id}] [smart-continue] attempt {_smart_attempt}/{_smart_max}: model stopped mid-task (prior_ctx={_has_prior_tool_ctx} text_tools={_looks_like_tools}), nudging", file=sys.stderr)
|
||||
try:
|
||||
retry_upstream = urllib.request.urlopen(nudge_req, timeout=_upstream_timeout(body, True))
|
||||
collected_events = []
|
||||
last_resp_id = last_output = last_status = None
|
||||
finish_reason = None
|
||||
has_content = False
|
||||
has_message = False
|
||||
has_tool_call = False
|
||||
for event in oa_stream_to_sse(retry_upstream, model, body.get("request_id") or body.get("id")):
|
||||
collected_events.append(event)
|
||||
_observe_event(event)
|
||||
|
||||
Reference in New Issue
Block a user