v10.13.8: FIX D force_finalize skip Gemini, FIX A status=failed, FIX B stream timeout, FIX C lock scope, threshold 8/40

This commit is contained in:
Roman | RyzenAdvanced
2026-05-27 17:52:17 +04:00
Unverified
parent 6861700c0d
commit 5055ff894d
4 changed files with 199 additions and 159 deletions

View File

@@ -198,9 +198,10 @@ if [ "$RUN_TASK" = "1" ]; then
CLI_VERSION=$(codex --version 2>/dev/null || echo "unknown")
log_info "Codex CLI: $CLI_VERSION"
TASK_PROMPT='Redesign the <div class="vdb-universe" id="vectordb"> section in site/index.html. Create a bold, innovative Steve Jobs-style design: boxy approach with contrasting boxes (one side white, one black), custom art seamless background that blends the two halves, think out of the box. Use pure CSS + HTML only, no external images. Make it visually stunning with geometric precision. The section is inside the existing page so keep the outer wrapper class vdb-universe with id=vectordb. Do NOT touch anything outside that section.'
TASK_PROMPT='Create a file /tmp/e2e-test-output.txt with the text "Hello from Codex CLI E2E test" followed by the current date. Then read it back and confirm the content is correct. This is a simple smoke test.'
TASK_WORKSPACE="/home/roman/Codex-Launcher-Any-AI-Provider"
TASK_WORKSPACE="/tmp/e2e-test-workspace"
mkdir -p "$TASK_WORKSPACE"
mkdir -p /tmp/antigravity-task-logs
TASK_PROXY_LOG="/tmp/antigravity-task-logs/proxy-$(date +%s).log"
@@ -218,26 +219,16 @@ if [ "$RUN_TASK" = "1" ]; then
# Generate model catalog
CATALOG_PATH="$HOME/.cache/codex-proxy/models-Antigravity-Test.json"
python3 -c "
import json
import json, os
models = ['gemini-3.5-flash-high', 'gemini-3.5-flash-medium', 'gemini-3.5-flash-low',
'gemini-3.1-pro-high', 'gemini-3.1-pro-low',
'claude-sonnet-4-6', 'claude-opus-4-6-thinking', 'gpt-oss-120b-medium']
catalog = []
for m in models:
catalog.append({
'slug': m, 'model': m, 'display_name': m,
'description': f'Antigravity {m}', 'hidden': False,
'isDefault': m == 'gemini-3.5-flash-high',
'shell_type': 'shell_command', 'visibility': 'list',
'default_reasoning_level': 'medium',
'supported_reasoning_levels': [
{'effort': 'low', 'description': 'Fast'},
{'effort': 'medium', 'description': 'Balanced'},
{'effort': 'high', 'description': 'Deep'},
],
})
json.dump(catalog, open('$CATALOG_PATH', 'w'), indent=2)
"
catalog.append({'slug':m,'model':m,'display_name':m,'description':'Antigravity '+m,'hidden':False,'isDefault':m=='gemini-3.5-flash-high','shell_type':'shell_command','visibility':'list','default_reasoning_level':'medium','supported_reasoning_levels':[{'effort':'low','description':'Fast'},{'effort':'medium','description':'Balanced'},{'effort':'high','description':'Deep'}]})
os.makedirs(os.path.dirname('$CATALOG_PATH'), exist_ok=True)
json.dump(catalog, open('$CATALOG_PATH','w'), indent=2)
" || log_fail "Failed to create model catalog"
# Write main config
cat > "$CONFIG_FILE" <<CONFEOF
@@ -351,16 +342,15 @@ PROFEOF
# ── Launch Codex CLI with the task ──
log_info "Launching Codex CLI with real task..."
log_info "Task: Redesign vectordb section (boxy black/white approach)"
log_info "Task: Create and verify a simple test file"
log_info "Monitor log: $TASK_MONITOR_LOG"
cd "$TASK_WORKSPACE"
# Run codex non-interactively with --quiet flag
set +e
codex --profile Antigravity-Test -c "model=gemini-3.5-flash-high" \
-s danger-full-access -a never \
-q "$TASK_PROMPT" \
codex exec --profile Antigravity-Test -c "model=gemini-3.5-flash-high" \
-c 'sandbox_permissions=["disk-full-read-access","disk-full-write-access"]' \
"$TASK_PROMPT" \
> "$TASK_CLI_LOG" 2>&1
CLI_EXIT=$?
set -e
@@ -429,21 +419,41 @@ PROFEOF
fi
fi
# Check if the file was actually modified
# Check if the file was actually created
echo ""; echo "─── Test 4d: Task Output Quality ───"
if [ -f "$TASK_WORKSPACE/site/index.html" ]; then
VDB_LINES=$(grep -c "vectordb\|vdb-" "$TASK_WORKSPACE/site/index.html" || echo 0)
log_info "vectordb section has $VDB_LINES vdb-related lines"
# Check for common issues in the output
MALFORMED=$(grep -c "&lt;\|&gt;\|&amp;" "$TASK_WORKSPACE/site/index.html" || echo 0)
[ "$MALFORMED" -gt 100 ] && log_fail "Possible HTML encoding issue: $MALFORMED escaped entities"
# Check section is still intact
if grep -q 'id="vectordb"' "$TASK_WORKSPACE/site/index.html"; then
log_pass "vectordb section preserved"
if [ -f "/tmp/e2e-test-output.txt" ]; then
CONTENT=$(cat /tmp/e2e-test-output.txt 2>/dev/null)
if echo "$CONTENT" | grep -q "Hello from Codex CLI E2E test"; then
log_pass "Task output file created with correct content"
else
log_fail "vectordb section missing or corrupted"
log_fail "Task output file exists but content is wrong: $CONTENT"
fi
else
log_fail "Task output file /tmp/e2e-test-output.txt was NOT created"
fi
# ── Test 4e: Anti-Loop Defense Verification ──
# Scan the proxy log for anti-loop defense markers. For a simple smoke-test
# task, neither the budget hard cap nor tool stripping should have fired.
# The remaining counters (null-tool, read-loop, force_finalize) are only
# reported via log_info and do not affect pass/fail.

# count_log_hits PATTERN FILE
# Print the number of lines in FILE matching PATTERN as exactly one integer.
# `grep -c` already prints "0" when nothing matches but exits 1; the previous
# `grep -c ... || echo 0` form therefore produced "0<newline>0", which broke
# the numeric `[ ... -gt 0 ]` comparisons below ("integer expression expected")
# and split the summary log line. An unreadable FILE yields 0.
count_log_hits() {
  local hits
  hits=$(grep -c -- "$1" "$2" 2>/dev/null) || true
  printf '%s\n' "${hits:-0}"
}

echo ""; echo "─── Test 4e: Anti-Loop Defense Verification ───"
ANTILOOP_PROXY_LOG="/tmp/antigravity-test-proxy.log"
if [ -f "$ANTILOOP_PROXY_LOG" ]; then
  NULL_TOOL_LOOPS=$(count_log_hits "NULL-TOOL LOOP" "$ANTILOOP_PROXY_LOG")
  TOOL_STRIPPED=$(count_log_hits "TOOLS STRIPPED" "$ANTILOOP_PROXY_LOG")
  BUDGET_HIT=$(count_log_hits "HARD CAP" "$ANTILOOP_PROXY_LOG")
  READ_LOOP=$(count_log_hits "FILE READ LOOP" "$ANTILOOP_PROXY_LOG")
  FORCE_FINALIZE=$(count_log_hits "force_finalize" "$ANTILOOP_PROXY_LOG")
  log_info "Anti-loop events: null-tool=$NULL_TOOL_LOOPS stripped=$TOOL_STRIPPED budget=$BUDGET_HIT read-loop=$READ_LOOP finalize=$FORCE_FINALIZE"
  # For a simple task, none of these should fire
  if [ "$BUDGET_HIT" -gt 0 ]; then
    log_fail "Budget cap hit on simple task — model looping"
  else
    log_pass "No budget cap triggered (task completed cleanly)"
  fi
  if [ "$TOOL_STRIPPED" -gt 0 ]; then
    log_fail "Tools were stripped — model hit hard limit"
  else
    log_pass "No tool stripping needed (model behaved)"
  fi
fi