v10.13.8: FIX D force_finalize skip Gemini, FIX A status=failed, FIX B stream timeout, FIX C lock scope, threshold 8/40
This commit is contained in:
@@ -198,9 +198,10 @@ if [ "$RUN_TASK" = "1" ]; then
|
||||
CLI_VERSION=$(codex --version 2>/dev/null || echo "unknown")
|
||||
log_info "Codex CLI: $CLI_VERSION"
|
||||
|
||||
TASK_PROMPT='Redesign the <div class="vdb-universe" id="vectordb"> section in site/index.html. Create a bold, innovative Steve Jobs-style design: boxy approach with contrasting boxes (one side white, one black), custom art seamless background that blends the two halves, think out of the box. Use pure CSS + HTML only, no external images. Make it visually stunning with geometric precision. The section is inside the existing page so keep the outer wrapper class vdb-universe with id=vectordb. Do NOT touch anything outside that section.'
|
||||
TASK_PROMPT='Create a file /tmp/e2e-test-output.txt with the text "Hello from Codex CLI E2E test" followed by the current date. Then read it back and confirm the content is correct. This is a simple smoke test.'
|
||||
|
||||
TASK_WORKSPACE="/home/roman/Codex-Launcher-Any-AI-Provider"
|
||||
TASK_WORKSPACE="/tmp/e2e-test-workspace"
|
||||
mkdir -p "$TASK_WORKSPACE"
|
||||
|
||||
mkdir -p /tmp/antigravity-task-logs
|
||||
TASK_PROXY_LOG="/tmp/antigravity-task-logs/proxy-$(date +%s).log"
|
||||
@@ -218,26 +219,16 @@ if [ "$RUN_TASK" = "1" ]; then
|
||||
# Generate model catalog
|
||||
CATALOG_PATH="$HOME/.cache/codex-proxy/models-Antigravity-Test.json"
|
||||
python3 -c "
|
||||
import json
|
||||
import json, os
|
||||
models = ['gemini-3.5-flash-high', 'gemini-3.5-flash-medium', 'gemini-3.5-flash-low',
|
||||
'gemini-3.1-pro-high', 'gemini-3.1-pro-low',
|
||||
'claude-sonnet-4-6', 'claude-opus-4-6-thinking', 'gpt-oss-120b-medium']
|
||||
catalog = []
|
||||
for m in models:
|
||||
catalog.append({
|
||||
'slug': m, 'model': m, 'display_name': m,
|
||||
'description': f'Antigravity {m}', 'hidden': False,
|
||||
'isDefault': m == 'gemini-3.5-flash-high',
|
||||
'shell_type': 'shell_command', 'visibility': 'list',
|
||||
'default_reasoning_level': 'medium',
|
||||
'supported_reasoning_levels': [
|
||||
{'effort': 'low', 'description': 'Fast'},
|
||||
{'effort': 'medium', 'description': 'Balanced'},
|
||||
{'effort': 'high', 'description': 'Deep'},
|
||||
],
|
||||
})
|
||||
json.dump(catalog, open('$CATALOG_PATH', 'w'), indent=2)
|
||||
"
|
||||
catalog.append({'slug':m,'model':m,'display_name':m,'description':'Antigravity '+m,'hidden':False,'isDefault':m=='gemini-3.5-flash-high','shell_type':'shell_command','visibility':'list','default_reasoning_level':'medium','supported_reasoning_levels':[{'effort':'low','description':'Fast'},{'effort':'medium','description':'Balanced'},{'effort':'high','description':'Deep'}]})
|
||||
os.makedirs(os.path.dirname('$CATALOG_PATH'), exist_ok=True)
|
||||
json.dump(catalog, open('$CATALOG_PATH','w'), indent=2)
|
||||
" || log_fail "Failed to create model catalog"
|
||||
|
||||
# Write main config
|
||||
cat > "$CONFIG_FILE" <<CONFEOF
|
||||
@@ -351,16 +342,15 @@ PROFEOF
|
||||
|
||||
# ── Launch Codex CLI with the task ──
|
||||
log_info "Launching Codex CLI with real task..."
|
||||
log_info "Task: Redesign vectordb section (boxy black/white approach)"
|
||||
log_info "Task: Create and verify a simple test file"
|
||||
log_info "Monitor log: $TASK_MONITOR_LOG"
|
||||
|
||||
cd "$TASK_WORKSPACE"
|
||||
|
||||
# Run codex non-interactively with --quiet flag
|
||||
set +e
|
||||
codex --profile Antigravity-Test -c "model=gemini-3.5-flash-high" \
|
||||
-s danger-full-access -a never \
|
||||
-q "$TASK_PROMPT" \
|
||||
codex exec --profile Antigravity-Test -c "model=gemini-3.5-flash-high" \
|
||||
-c 'sandbox_permissions=["disk-full-read-access","disk-full-write-access"]' \
|
||||
"$TASK_PROMPT" \
|
||||
> "$TASK_CLI_LOG" 2>&1
|
||||
CLI_EXIT=$?
|
||||
set -e
|
||||
@@ -429,21 +419,41 @@ PROFEOF
|
||||
fi
|
||||
fi
|
||||
|
||||
# Check if the file was actually modified
|
||||
# Check if the file was actually created
|
||||
echo ""; echo "─── Test 4d: Task Output Quality ───"
|
||||
if [ -f "$TASK_WORKSPACE/site/index.html" ]; then
|
||||
VDB_LINES=$(grep -c "vectordb\|vdb-" "$TASK_WORKSPACE/site/index.html" || echo 0)
|
||||
log_info "vectordb section has $VDB_LINES vdb-related lines"
|
||||
|
||||
# Check for common issues in the output
|
||||
MALFORMED=$(grep -c "<\|>\|&" "$TASK_WORKSPACE/site/index.html" || echo 0)
|
||||
[ "$MALFORMED" -gt 100 ] && log_fail "Possible HTML encoding issue: $MALFORMED escaped entities"
|
||||
|
||||
# Check section is still intact
|
||||
if grep -q 'id="vectordb"' "$TASK_WORKSPACE/site/index.html"; then
|
||||
log_pass "vectordb section preserved"
|
||||
if [ -f "/tmp/e2e-test-output.txt" ]; then
|
||||
CONTENT=$(cat /tmp/e2e-test-output.txt 2>/dev/null)
|
||||
if echo "$CONTENT" | grep -q "Hello from Codex CLI E2E test"; then
|
||||
log_pass "Task output file created with correct content"
|
||||
else
|
||||
log_fail "vectordb section missing or corrupted"
|
||||
log_fail "Task output file exists but content is wrong: $CONTENT"
|
||||
fi
|
||||
else
|
||||
log_fail "Task output file /tmp/e2e-test-output.txt was NOT created"
|
||||
fi
|
||||
|
||||
# Check proxy log for tool-strip events (budget cap defense)
|
||||
echo ""; echo "─── Test 4e: Anti-Loop Defense Verification ───"
|
||||
if [ -f "/tmp/antigravity-test-proxy.log" ]; then
|
||||
NULL_TOOL_LOOPS=$(grep -c "NULL-TOOL LOOP" /tmp/antigravity-test-proxy.log || echo 0)
|
||||
TOOL_STRIPPED=$(grep -c "TOOLS STRIPPED" /tmp/antigravity-test-proxy.log || echo 0)
|
||||
BUDGET_HIT=$(grep -c "HARD CAP" /tmp/antigravity-test-proxy.log || echo 0)
|
||||
READ_LOOP=$(grep -c "FILE READ LOOP" /tmp/antigravity-test-proxy.log || echo 0)
|
||||
FORCE_FINALIZE=$(grep -c "force_finalize" /tmp/antigravity-test-proxy.log || echo 0)
|
||||
|
||||
log_info "Anti-loop events: null-tool=$NULL_TOOL_LOOPS stripped=$TOOL_STRIPPED budget=$BUDGET_HIT read-loop=$READ_LOOP finalize=$FORCE_FINALIZE"
|
||||
|
||||
# For a simple task, none of these should fire
|
||||
if [ "$BUDGET_HIT" -gt 0 ]; then
|
||||
log_fail "Budget cap hit on simple task — model looping"
|
||||
else
|
||||
log_pass "No budget cap triggered (task completed cleanly)"
|
||||
fi
|
||||
|
||||
if [ "$TOOL_STRIPPED" -gt 0 ]; then
|
||||
log_fail "Tools were stripped — model hit hard limit"
|
||||
else
|
||||
log_pass "No tool stripping needed (model behaved)"
|
||||
fi
|
||||
fi
|
||||
|
||||
|
||||
Reference in New Issue
Block a user