feat: add Vosk STT - offline voice-to-text, no API key needed
This commit is contained in:
@@ -901,7 +901,7 @@ export async function initBot(config, api, tools, skills, agents) {
|
||||
});
|
||||
|
||||
// /voice command: replies through sendStreamingMessage with a short help text
// describing the bot's voice input/output features.
// NOTE(review): this hunk is rendered without +/- markers and shows two
// consecutive sendStreamingMessage calls. Presumably the first is the
// pre-change line and only the second exists after the commit — confirm
// against the repository before relying on either.
bot.command('voice', async (ctx) => {
|
||||
// Presumably the superseded help text (refers to a "TS service layer") — TODO confirm.
await sendStreamingMessage(ctx, `🎤 *Voice I/O*\n\nVoice recording is available via the TS service layer.\nSend me a voice message and I will transcribe it.`);
|
||||
// Help text naming Vosk for speech-to-text and the `tts` tool / Edge TTS for text-to-speech.
await sendStreamingMessage(ctx, `🎤 *Voice I/O*\n\n🎤→📝 *Speech-to-Text*: Send a voice message — transcribed via Vosk (offline, no API key, ~200ms).\n📝→🎤 *Text-to-Speech*: Ask the AI to use the \`tts\` tool — generates voice via Edge TTS (free).\n\nNo API keys needed. Runs fully on the server.`);
|
||||
});
|
||||
|
||||
bot.command('mcp', async (ctx) => {
|
||||
@@ -1015,14 +1015,15 @@ export async function initBot(config, api, tools, skills, agents) {
|
||||
}
|
||||
|
||||
// ── Message text handler (with dedup + queue + self-correction) ──
|
||||
bot.on('message:text', async (ctx) => {
|
||||
// ── Text message handler (shared by text & voice) ──
|
||||
async function handleTextMessage(ctx, text, isVoice = false) {
|
||||
if (isDuplicate(ctx.message.message_id)) return;
|
||||
markProcessed(ctx.message.message_id);
|
||||
|
||||
const key = buildSessionKey(ctx.chat.id, ctx.message?.message_thread_id);
|
||||
const text = ctx.message.text;
|
||||
const user = ctx.from?.username || ctx.from?.first_name || 'Unknown';
|
||||
logger.info(`💬 ${user}: ${text.substring(0, 80)}…`);
|
||||
const prefix = isVoice ? '🎤' : '💬';
|
||||
logger.info(`${prefix} ${user}: ${text.substring(0, 80)}…`);
|
||||
|
||||
await queueRequest(key, text, async () => {
|
||||
await ctx.api.sendChatAction(ctx.chat.id, 'typing');
|
||||
@@ -1069,17 +1070,66 @@ export async function initBot(config, api, tools, skills, agents) {
|
||||
// ── Self-learning: extract patterns from this interaction ──
|
||||
await selfLearn(text, result, memory);
|
||||
});
|
||||
}
|
||||
|
||||
// Plain text updates are handed to the shared message pipeline unchanged,
// with the isVoice flag off.
bot.on('message:text', async (ctx) => {
  const { text } = ctx.message;
  await handleTextMessage(ctx, text, false);
});
|
||||
|
||||
// ── Voice handler ──
|
||||
// ── Voice handler (Vosk STT) ──
|
||||
/**
 * Voice message handler (Vosk STT).
 *
 * Flow: post a "Transcribing…" status message, download the Telegram voice
 * file to a temporary .ogg, run ../scripts/stt.py on it, then pass the
 * transcript to handleTextMessage with isVoice = true. The script's stdout is
 * parsed as JSON and its `error`, `text` and `confidence` fields are read.
 *
 * Failures are reported by editing the status message; nothing is rethrown.
 *
 * Review notes:
 * - External commands run through execFile with argument arrays (no shell),
 *   so file paths are never interpreted as shell syntax, and asynchronously,
 *   so the event loop is not blocked while curl / python3 run.
 * - The download URL contains the bot token. It is never logged, and the
 *   token is stripped from any error text before it is logged or sent to chat
 *   (child-process error messages include the full command line).
 * - The temporary file is removed in a `finally`, including on failure.
 */
bot.on('message:voice', async (ctx) => {
  const fileId = ctx.message.voice.file_id;
  const user = ctx.from?.username || ctx.from?.first_name || 'Unknown';
  logger.info(`🎤 Voice from ${user}`);
  const statusMsg = await ctx.reply('🎤 Transcribing…');

  // Removes the bot token from arbitrary text before it leaves this handler.
  const redact = (value) => {
    const str = String(value);
    return botToken ? str.split(botToken).join('[redacted]') : str;
  };

  // Replaces the text of the "Transcribing…" status message.
  const setStatus = (message) =>
    ctx.api.editMessageText(ctx.chat.id, statusMsg.message_id, message);

  try {
    const { execFile } = await import('child_process');
    const { randomUUID } = await import('crypto');
    const { rm } = await import('fs/promises');
    const { tmpdir } = await import('os');
    const { join } = await import('path');
    const { fileURLToPath } = await import('url');
    const { promisify } = await import('util');
    const run = promisify(execFile);

    const file = await ctx.api.getFile(fileId);
    const url = `https://api.telegram.org/file/bot${botToken}/${file.file_path}`;
    // Random suffix: two voice notes arriving in the same millisecond must not
    // share a file name.
    const oggPath = join(tmpdir(), `zcode-voice-${randomUUID()}.ogg`);

    let parsed;
    try {
      // Download voice file (-f: treat HTTP errors as failures instead of
      // saving the error body as if it were audio).
      await run('curl', ['-fsL', url, '-o', oggPath], { timeout: 15000 });
      logger.info(`Voice downloaded: ${oggPath}`);

      // Run Vosk STT via Python script. fileURLToPath decodes percent-escapes
      // that URL#pathname would leave in the path.
      const sttScript = fileURLToPath(new URL('../scripts/stt.py', import.meta.url));
      const { stdout } = await run('python3', [sttScript, oggPath], {
        timeout: 30000,
        encoding: 'utf-8',
      });
      parsed = JSON.parse(stdout.trim());
    } finally {
      // Cleanup runs on success and on failure; a cleanup error must not mask
      // the real outcome.
      await rm(oggPath, { force: true }).catch((rmErr) => {
        logger.error(`Voice temp file cleanup failed: ${rmErr.message}`);
      });
    }

    if (parsed.error) {
      logger.error(`STT error: ${parsed.error}`);
      await setStatus(`❌ STT error: ${parsed.error}`);
      return;
    }

    if (!parsed.text) {
      await setStatus('🎤 Could not detect speech in the voice message.');
      return;
    }

    // `??` keeps a legitimate confidence of 0 visible in the log.
    logger.info(`🎤 STT (${parsed.confidence ?? '?'}): ${parsed.text}`);

    // Removing the status message is cosmetic; a failure here should not stop
    // the transcript from being processed.
    try {
      await ctx.api.deleteMessage(ctx.chat.id, statusMsg.message_id);
    } catch (deleteErr) {
      logger.error(`Could not delete voice status message: ${redact(deleteErr?.message ?? deleteErr)}`);
    }

    // Feed transcribed text into the main chat pipeline
    await handleTextMessage(ctx, parsed.text, true);
  } catch (err) {
    const reason = redact(err?.message ?? err);
    logger.error(`Voice handler error: ${reason}`);
    try {
      await setStatus(`❌ Voice processing failed: ${reason.slice(0, 100)}`);
    } catch (editErr) {
      // Best effort: the status message may already be gone.
      logger.error(`Could not update voice status message: ${redact(editErr?.message ?? editErr)}`);
    }
  }
});
|
||||
|
||||
// ── Photo handler ──
|
||||
|
||||
Reference in New Issue
Block a user