feat: add Vosk STT - offline voice-to-text, no API key needed

This commit is contained in:
admin
2026-05-05 17:50:50 +00:00
Unverified
parent 6685f60855
commit 3b6a7ec502
3 changed files with 216 additions and 9 deletions

View File

@@ -901,7 +901,7 @@ export async function initBot(config, api, tools, skills, agents) {
});
// /voice — describes the bot's voice features: speech-to-text for incoming
// voice messages and text-to-speech through the `tts` tool.
// Only the current description is sent; the earlier "TS service layer" text
// was superseded and contradicted it.
bot.command('voice', async (ctx) => {
  await sendStreamingMessage(ctx, `🎤 *Voice I/O*\n\n🎤→📝 *Speech-to-Text*: Send a voice message — transcribed via Vosk (offline, no API key, ~200ms).\n📝→🎤 *Text-to-Speech*: Ask the AI to use the \`tts\` tool — generates voice via Edge TTS (free).\n\nNo API keys needed. Runs fully on the server.`);
});
bot.command('mcp', async (ctx) => {
@@ -1015,14 +1015,15 @@ export async function initBot(config, api, tools, skills, agents) {
}
// ── Message text handler (with dedup + queue + self-correction) ──
bot.on('message:text', async (ctx) => {
// ── Text message handler (shared by text & voice) ──
async function handleTextMessage(ctx, text, isVoice = false) {
if (isDuplicate(ctx.message.message_id)) return;
markProcessed(ctx.message.message_id);
const key = buildSessionKey(ctx.chat.id, ctx.message?.message_thread_id);
const text = ctx.message.text;
const user = ctx.from?.username || ctx.from?.first_name || 'Unknown';
logger.info(`💬 ${user}: ${text.substring(0, 80)}`);
const prefix = isVoice ? '🎤' : '💬';
logger.info(`${prefix} ${user}: ${text.substring(0, 80)}`);
await queueRequest(key, text, async () => {
await ctx.api.sendChatAction(ctx.chat.id, 'typing');
@@ -1069,17 +1070,66 @@ export async function initBot(config, api, tools, skills, agents) {
// ── Self-learning: extract patterns from this interaction ──
await selfLearn(text, result, memory);
});
}
// Typed messages go straight into the shared pipeline, flagged as non-voice.
bot.on('message:text', async (ctx) => {
  const { text } = ctx.message;
  await handleTextMessage(ctx, text, false);
});
// ── Voice handler ──
// ── Voice handler (Vosk STT) ──
/**
 * Voice message handler: downloads the voice note, transcribes it with the
 * Python STT script (../scripts/stt.py), and feeds the recognised text into
 * handleTextMessage with isVoice = true.
 *
 * A "Transcribing…" status message is shown while working. It is deleted on
 * success and edited in place to report an STT error, an empty transcript,
 * or a processing failure.
 */
bot.on('message:voice', async (ctx) => {
  const fileId = ctx.message.voice.file_id;
  const user = ctx.from?.username || ctx.from?.first_name || 'Unknown';
  logger.info(`🎤 Voice from ${user}`);
  const statusMsg = await ctx.reply('🎤 Transcribing…');

  // Built-in modules are loaded lazily, matching the dynamic import this
  // handler already used for child_process.
  const { execFile } = await import('child_process');
  const { promisify } = await import('util');
  const { unlink } = await import('fs/promises');
  const { tmpdir } = await import('os');
  const { join } = await import('path');
  const { fileURLToPath } = await import('url');
  const { randomBytes } = await import('crypto');
  // execFile passes arguments as an array (no shell, so no quoting issues) and
  // the promisified form does not block the event loop while curl/python run.
  const run = promisify(execFile);

  // The download URL embeds the bot token, and child-process errors echo the
  // full command line, so scrub the token before logging or replying.
  const redact = (value) => {
    const message = String(value);
    return botToken ? message.split(botToken).join('<token>') : message;
  };

  // Random suffix avoids collisions between voice notes arriving in the same
  // millisecond.
  const oggPath = join(
    tmpdir(),
    `zcode-voice-${Date.now()}-${randomBytes(4).toString('hex')}.ogg`,
  );

  try {
    let parsed;
    try {
      const file = await ctx.api.getFile(fileId);
      const url = `https://api.telegram.org/file/bot${botToken}/${file.file_path}`;

      // Download voice file; --fail makes HTTP errors abort instead of saving
      // the error body as if it were audio.
      await run('curl', ['-fsSL', url, '-o', oggPath], { timeout: 15000 });
      logger.info(`Voice downloaded: ${oggPath}`);

      // Run Vosk STT via Python script. fileURLToPath decodes percent-escapes
      // that URL#pathname would leave in the path.
      const sttScript = fileURLToPath(new URL('../scripts/stt.py', import.meta.url));
      const { stdout } = await run('python3', [sttScript, oggPath], {
        timeout: 30000,
        encoding: 'utf-8',
      });
      parsed = JSON.parse(stdout.trim());
    } finally {
      // Best-effort cleanup on every path; the file may never have been
      // created if the download failed, so a missing file is not an error.
      await unlink(oggPath).catch(() => {});
    }

    if (parsed.error) {
      logger.error(`STT error: ${parsed.error}`);
      await ctx.api.editMessageText(ctx.chat.id, statusMsg.message_id,
        `❌ STT error: ${parsed.error}`);
      return;
    }
    if (!parsed.text) {
      await ctx.api.editMessageText(ctx.chat.id, statusMsg.message_id,
        '🎤 Could not detect speech in the voice message.');
      return;
    }

    // ?? rather than || so a reported confidence of 0 is still shown.
    logger.info(`🎤 STT (${parsed.confidence ?? '?'}): ${parsed.text}`);
    await ctx.api.deleteMessage(ctx.chat.id, statusMsg.message_id);

    // Feed transcribed text into the main chat pipeline
    await handleTextMessage(ctx, parsed.text, true);
  } catch (err) {
    const reason = redact(err?.message ?? err);
    logger.error(`Voice handler error: ${reason}`);
    try {
      await ctx.api.editMessageText(ctx.chat.id, statusMsg.message_id,
        `❌ Voice processing failed: ${reason.slice(0, 100)}`);
    } catch (editErr) {
      // The status message may already be deleted; record it and move on.
      logger.error(`Could not update voice status message: ${redact(editErr?.message ?? editErr)}`);
    }
  }
});
// ── Photo handler ──