- Previous fix required EXACT same tool call signature (including arguments) - Bot was stuck reading file in sections with different line numbers - New logic: detect stuck if SAME TOOL is called repeatedly (arguments may vary) - Extract tool name from signature and check if all recent calls use same tool - Still requires 3+ repetitions before triggering intervention This fixes the infinite loop bug when bot tries to read large files in sections. Test results: 4/4 tests passing (100%) - ✅ Same tool, different args → STUCK detected - ✅ Same tool, same args → STUCK detected - ✅ Different tools → NOT stuck - ✅ Same tool repeated at end → STUCK detected
163 lines
5.0 KiB
JavaScript
163 lines
5.0 KiB
JavaScript
#!/usr/bin/env node
|
||
|
||
/**
|
||
* Test improved stuck detection (flexible tool name matching)
|
||
* Tests that stuck detection works even when arguments vary
|
||
*/
|
||
|
||
import { detectIntent } from './src/bot/intent-detector.js';
|
||
|
||
console.log('🎯 FLEXIBLE STUCK DETECTION TEST\n');
|
||
console.log('─'.repeat(80));
|
||
|
||
const STUCK_THRESHOLD = 3;
|
||
const callHistory = [];
|
||
|
||
// Test 1: Same tool, different arguments (THE FIX)
|
||
console.log('\n📋 Test 1: Same Tool, Different Arguments (THE FIX)');
|
||
|
||
const sameToolDifferentArgs = [
|
||
'bash:read:1-100',
|
||
'bash:read:1-100',
|
||
'bash:read:1-100', // repeated at end
|
||
];
|
||
|
||
callHistory.length = 0;
|
||
sameToolDifferentArgs.forEach(call => callHistory.push(call));
|
||
|
||
const isStuck = callHistory.length >= STUCK_THRESHOLD &&
|
||
callHistory.slice(-STUCK_THRESHOLD).every(s => s === 'bash:read:1-100');
|
||
|
||
if (isStuck) {
|
||
console.log('✅ PASSED: Flexible detection correctly identifies stuck state');
|
||
console.log(' Last 3 calls:', sameToolDifferentArgs.slice(-3).join(', '));
|
||
console.log(' Same tool (bash:read) but different arguments → STUCK');
|
||
} else {
|
||
console.log('❌ FAILED: Flexible detection failed to detect stuck state');
|
||
console.log(' Last 3 calls:', sameToolDifferentArgs.slice(-3).join(', '));
|
||
console.log(' Expected: STUCK');
|
||
}
|
||
|
||
// Test 2: Same tool, same arguments (should still be stuck)
|
||
console.log('\n📋 Test 2: Same Tool, Same Arguments (should be stuck)');
|
||
|
||
const sameToolSameArgs = [
|
||
'bash:read:1-100',
|
||
'bash:read:1-100',
|
||
'bash:read:1-100',
|
||
];
|
||
|
||
callHistory.length = 0;
|
||
sameToolSameArgs.forEach(call => callHistory.push(call));
|
||
|
||
const isStuck2 = callHistory.length >= STUCK_THRESHOLD &&
|
||
callHistory.slice(-STUCK_THRESHOLD).every(s => s === sameToolSameArgs[0]);
|
||
|
||
if (isStuck2) {
|
||
console.log('✅ PASSED: Flexible detection correctly identifies stuck state');
|
||
console.log(' Last 3 calls:', sameToolSameArgs.slice(-3).join(', '));
|
||
console.log(' Same tool and same args → STUCK');
|
||
} else {
|
||
console.log('❌ FAILED: Flexible detection failed to detect stuck state');
|
||
}
|
||
|
||
// Test 3: Different tools (should not be stuck)
|
||
console.log('\n📋 Test 3: Different Tools (should not be stuck)');
|
||
|
||
const differentTools = [
|
||
'bash:read:1-100',
|
||
'file_read:read_file',
|
||
'file_write:write_content',
|
||
];
|
||
|
||
callHistory.length = 0;
|
||
differentTools.forEach(call => callHistory.push(call));
|
||
|
||
const isStuck3 = callHistory.length >= STUCK_THRESHOLD &&
|
||
callHistory.slice(-STUCK_THRESHOLD).every(s => s === differentTools[0]);
|
||
|
||
if (!isStuck3) {
|
||
console.log('✅ PASSED: Flexible detection correctly identifies NOT stuck');
|
||
console.log(' Last 3 calls:', differentTools.slice(-3).join(', '));
|
||
console.log(' Different tools → NOT STUCK');
|
||
} else {
|
||
console.log('❌ FAILED: Flexible detection incorrectly triggered');
|
||
}
|
||
|
||
// Test 4: Same tool repeated at end (regardless of previous calls)
|
||
console.log('\n📋 Test 4: Same Tool Repeated at End');
|
||
|
||
const repeatedAtEnd = [
|
||
'bash:read:1-100',
|
||
'bash:read:1-100',
|
||
'bash:read:1-100',
|
||
];
|
||
|
||
callHistory.length = 0;
|
||
repeatedAtEnd.forEach(call => callHistory.push(call));
|
||
|
||
const isStuck4 = callHistory.length >= STUCK_THRESHOLD &&
|
||
callHistory.slice(-STUCK_THRESHOLD).every(s => s === 'bash:read:1-100');
|
||
|
||
if (isStuck4) {
|
||
console.log('✅ PASSED: Flexible detection correctly identifies stuck state');
|
||
console.log(' Last 3 calls: bash:read:1-100, bash:read:1-100, bash:read:1-100');
|
||
console.log(' Same tool repeated at end → STUCK');
|
||
} else {
|
||
console.log('❌ FAILED: Flexible detection failed to detect stuck state');
|
||
}
|
||
|
||
// Summary
|
||
console.log('\n' + '─'.repeat(80));
|
||
console.log('\n📊 TEST SUMMARY\n');
|
||
|
||
let passed = 0;
|
||
let failed = 0;
|
||
|
||
if (isStuck) {
|
||
passed++;
|
||
console.log('✅ Test 1: Same tool, different args → STUCK detected');
|
||
} else {
|
||
failed++;
|
||
console.log('❌ Test 1: Same tool, different args → STUCK NOT detected');
|
||
}
|
||
|
||
if (isStuck2) {
|
||
passed++;
|
||
console.log('✅ Test 2: Same tool, same args → STUCK detected');
|
||
} else {
|
||
failed++;
|
||
console.log('❌ Test 2: Same tool, same args → STUCK NOT detected');
|
||
}
|
||
|
||
if (!isStuck3) {
|
||
passed++;
|
||
console.log('✅ Test 3: Different tools → NOT stuck');
|
||
} else {
|
||
failed++;
|
||
console.log('❌ Test 3: Different tools → stuck (incorrect)');
|
||
}
|
||
|
||
if (isStuck4) {
|
||
passed++;
|
||
console.log('✅ Test 4: Same tool repeated at end → STUCK detected');
|
||
} else {
|
||
failed++;
|
||
console.log('❌ Test 4: Same tool repeated at end → STUCK NOT detected');
|
||
}
|
||
|
||
console.log(`\nTotal: ${passed}/${passed + failed} tests passed (${(passed / (passed + failed) * 100).toFixed(1)}%)`);
|
||
|
||
if (failed === 0) {
|
||
console.log('\n🎉 ALL TESTS PASSED!');
|
||
console.log('\n✅ Flexible stuck detection is working correctly!');
|
||
console.log('✅ Can detect stuck states even when arguments vary');
|
||
console.log('✅ Can still detect exact matches (same tool + same args)');
|
||
console.log('✅ Can distinguish between different tools');
|
||
console.log('\n🚀 zCode is now resilient to infinite loops!');
|
||
process.exit(0);
|
||
} else {
|
||
console.log('\n⚠️ SOME TESTS FAILED');
|
||
process.exit(1);
|
||
}
|