v10.13.8: FIX D force_finalize skip Gemini, FIX A status=failed, FIX B stream timeout, FIX C lock scope, threshold 8/40

2026-05-27 17:52:17 +04:00
parent 6861700c0d
commit 5055ff894d
4 changed files with 199 additions and 159 deletions
--- a/test-antigravity.sh
+++ b/test-antigravity.sh
@@ -198,9 +198,10 @@ if [ "$RUN_TASK" = "1" ]; then
        CLI_VERSION=$(codex --version 2>/dev/null || echo "unknown")
        log_info "Codex CLI: $CLI_VERSION"

-        TASK_PROMPT='Redesign the <div class="vdb-universe" id="vectordb"> section in site/index.html. Create a bold, innovative Steve Jobs-style design: boxy approach with contrasting boxes (one side white, one black), custom art seamless background that blends the two halves, think out of the box. Use pure CSS + HTML only, no external images. Make it visually stunning with geometric precision. The section is inside the existing page so keep the outer wrapper class vdb-universe with id=vectordb. Do NOT touch anything outside that section.'
+        TASK_PROMPT='Create a file /tmp/e2e-test-output.txt with the text "Hello from Codex CLI E2E test" followed by the current date. Then read it back and confirm the content is correct. This is a simple smoke test.'

-        TASK_WORKSPACE="/home/roman/Codex-Launcher-Any-AI-Provider"
+        TASK_WORKSPACE="/tmp/e2e-test-workspace"
+        mkdir -p "$TASK_WORKSPACE"; rm -f /tmp/e2e-test-output.txt  # drop stale output so Test 4d cannot pass on a file left by an earlier run

        mkdir -p /tmp/antigravity-task-logs
        TASK_PROXY_LOG="/tmp/antigravity-task-logs/proxy-$(date +%s).log"
@@ -218,26 +219,16 @@ if [ "$RUN_TASK" = "1" ]; then
        # Generate model catalog
        CATALOG_PATH="$HOME/.cache/codex-proxy/models-Antigravity-Test.json"
        python3 -c "
-import json
+import json, os
 models = ['gemini-3.5-flash-high', 'gemini-3.5-flash-medium', 'gemini-3.5-flash-low',
          'gemini-3.1-pro-high', 'gemini-3.1-pro-low',
          'claude-sonnet-4-6', 'claude-opus-4-6-thinking', 'gpt-oss-120b-medium']
 catalog = []
 for m in models:
-    catalog.append({
-        'slug': m, 'model': m, 'display_name': m,
-        'description': f'Antigravity {m}', 'hidden': False,
-        'isDefault': m == 'gemini-3.5-flash-high',
-        'shell_type': 'shell_command', 'visibility': 'list',
-        'default_reasoning_level': 'medium',
-        'supported_reasoning_levels': [
-            {'effort': 'low', 'description': 'Fast'},
-            {'effort': 'medium', 'description': 'Balanced'},
-            {'effort': 'high', 'description': 'Deep'},
-        ],
-    })
-json.dump(catalog, open('$CATALOG_PATH', 'w'), indent=2)
-"
+    catalog.append({'slug':m,'model':m,'display_name':m,'description':'Antigravity '+m,'hidden':False,'isDefault':m=='gemini-3.5-flash-high','shell_type':'shell_command','visibility':'list','default_reasoning_level':'medium','supported_reasoning_levels':[{'effort':'low','description':'Fast'},{'effort':'medium','description':'Balanced'},{'effort':'high','description':'Deep'}]})
+os.makedirs(os.path.dirname('$CATALOG_PATH'), exist_ok=True)
+json.dump(catalog, open('$CATALOG_PATH','w'), indent=2)
+" || log_fail "Failed to create model catalog"

        # Write main config
        cat > "$CONFIG_FILE" <<CONFEOF
@@ -351,16 +342,15 @@ PROFEOF

        # ── Launch Codex CLI with the task ──
        log_info "Launching Codex CLI with real task..."
-        log_info "Task: Redesign vectordb section (boxy black/white approach)"
+        log_info "Task: Create and verify a simple test file"
        log_info "Monitor log: $TASK_MONITOR_LOG"

        cd "$TASK_WORKSPACE"

-        # Run codex non-interactively with --quiet flag
        set +e
-        codex --profile Antigravity-Test -c "model=gemini-3.5-flash-high" \
-            -s danger-full-access -a never \
-            -q "$TASK_PROMPT" \
+        codex exec --profile Antigravity-Test -c "model=gemini-3.5-flash-high" \
+            -c 'sandbox_permissions=["disk-full-read-access","disk-full-write-access"]' \
+            "$TASK_PROMPT" \
            > "$TASK_CLI_LOG" 2>&1
        CLI_EXIT=$?
        set -e
@@ -429,21 +419,41 @@ PROFEOF
            fi
        fi

-        # Check if the file was actually modified
+        # Check if the file was actually created
        echo ""; echo "─── Test 4d: Task Output Quality ───"
-        if [ -f "$TASK_WORKSPACE/site/index.html" ]; then
-            VDB_LINES=$(grep -c "vectordb\|vdb-" "$TASK_WORKSPACE/site/index.html" || echo 0)
-            log_info "vectordb section has $VDB_LINES vdb-related lines"
-
-            # Check for common issues in the output
-            MALFORMED=$(grep -c "&lt;\|&gt;\|&amp;" "$TASK_WORKSPACE/site/index.html" || echo 0)
-            [ "$MALFORMED" -gt 100 ] && log_fail "Possible HTML encoding issue: $MALFORMED escaped entities"
-
-            # Check section is still intact
-            if grep -q 'id="vectordb"' "$TASK_WORKSPACE/site/index.html"; then
-                log_pass "vectordb section preserved"
+        if [ -f "/tmp/e2e-test-output.txt" ]; then
+            CONTENT=$(cat /tmp/e2e-test-output.txt 2>/dev/null)
+            if echo "$CONTENT" | grep -q "Hello from Codex CLI E2E test"; then
+                log_pass "Task output file created with correct content"
            else
-                log_fail "vectordb section missing or corrupted"
+                log_fail "Task output file exists but content is wrong: $CONTENT"
+            fi
+        else
+            log_fail "Task output file /tmp/e2e-test-output.txt was NOT created"
+        fi
+
+        # Check proxy log for tool-strip events (budget cap defense)
+        echo ""; echo "─── Test 4e: Anti-Loop Defense Verification ───"
+        if [ -f "/tmp/antigravity-test-proxy.log" ]; then
+            # grep -c already prints 0 when nothing matches but exits 1; "|| true" only neutralizes that status under set -e, keeping each value a single integer.
+            NULL_TOOL_LOOPS=$(grep -c "NULL-TOOL LOOP" /tmp/antigravity-test-proxy.log || true)
+            TOOL_STRIPPED=$(grep -c "TOOLS STRIPPED" /tmp/antigravity-test-proxy.log || true)
+            BUDGET_HIT=$(grep -c "HARD CAP" /tmp/antigravity-test-proxy.log || true)
+            READ_LOOP=$(grep -c "FILE READ LOOP" /tmp/antigravity-test-proxy.log || true)
+            FORCE_FINALIZE=$(grep -c "force_finalize" /tmp/antigravity-test-proxy.log || true)
+            log_info "Anti-loop events: null-tool=$NULL_TOOL_LOOPS stripped=$TOOL_STRIPPED budget=$BUDGET_HIT read-loop=$READ_LOOP finalize=$FORCE_FINALIZE"
+
+            # For a simple task, none of these should fire
+            if [ "$BUDGET_HIT" -gt 0 ]; then
+                log_fail "Budget cap hit on simple task — model looping"
+            else
+                log_pass "No budget cap triggered (task completed cleanly)"
+            fi
+
+            if [ "$TOOL_STRIPPED" -gt 0 ]; then
+                log_fail "Tools were stripped — model hit hard limit"
+            else
+                log_pass "No tool stripping needed (model behaved)"
            fi
        fi