fix: voice STT path fix + execSync error handling + fetch download
This commit is contained in:
108
scripts/stt.py
108
scripts/stt.py
@@ -1,83 +1,70 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
"""
|
"""Vosk STT — transcribe audio file to text. Optimized for speed."""
|
||||||
Vosk STT — Transcribe OGG/voice to text.
|
|
||||||
Usage: python3 stt.py <input_file> [language]
|
|
||||||
input_file: path to audio file (ogg, wav, mp3, etc.)
|
|
||||||
language: 'en' (default) or 'ge' — Georgian model
|
|
||||||
Output: JSON to stdout: {"text": "...", "confidence": 0.95}
|
|
||||||
Exit codes: 0=success, 1=no speech, 2=error
|
|
||||||
"""
|
|
||||||
import sys, os, json, subprocess, tempfile, wave
|
import sys, os, json, subprocess, tempfile, wave
|
||||||
|
|
||||||
|
os.environ['VOSK_LOG_LEVEL'] = '-1'
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
if len(sys.argv) < 2:
|
if len(sys.argv) < 2:
|
||||||
print(json.dumps({"error": "Usage: stt.py <audio_file> [en|ge]"}))
|
print(json.dumps({"error": "Usage: stt.py <audio_file>"}))
|
||||||
sys.exit(2)
|
sys.exit(2)
|
||||||
|
|
||||||
audio_file = sys.argv[1]
|
audio_file = sys.argv[1]
|
||||||
lang = sys.argv[2] if len(sys.argv) > 2 else 'en'
|
model_path = '/home/uroma2/vosk-model'
|
||||||
|
|
||||||
# Suppress vosk logging
|
# Convert to 16kHz mono WAV via ffmpeg, reading the result from its stdout pipe (a temp file is still written afterwards for the wave module)
|
||||||
os.environ['VOSK_LOG_LEVEL'] = '-1'
|
try:
|
||||||
|
proc = subprocess.Popen(
|
||||||
model_path = {
|
['ffmpeg', '-y', '-i', audio_file, '-ar', '16000', '-ac', '1',
|
||||||
'en': '/home/uroma2/vosk-model',
|
'-f', 'wav', '-v', 'error', '-'],
|
||||||
'ge': '/home/uroma2/vosk-model-ge',
|
stdout=subprocess.PIPE, stderr=subprocess.PIPE
|
||||||
}.get(lang, '/home/uroma2/vosk-model')
|
)
|
||||||
|
wav_data = proc.stdout.read()
|
||||||
if not os.path.isdir(model_path):
|
proc.wait(timeout=15)
|
||||||
print(json.dumps({"error": f"Model not found: {model_path}"}))
|
if proc.returncode != 0 or len(wav_data) < 44:
|
||||||
|
print(json.dumps({"error": "ffmpeg conversion failed"}))
|
||||||
|
sys.exit(2)
|
||||||
|
except Exception as e:
|
||||||
|
print(json.dumps({"error": str(e)}))
|
||||||
sys.exit(2)
|
sys.exit(2)
|
||||||
|
|
||||||
# Convert to 16kHz mono WAV using ffmpeg
|
# Write wav_data to a temp file so the wave module can open it by path (wave.open also accepts a file-like object, e.g. io.BytesIO)
|
||||||
with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
|
tmp = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
|
||||||
wav_path = tmp.name
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
result = subprocess.run(
|
tmp.write(wav_data)
|
||||||
['ffmpeg', '-y', '-i', audio_file, '-ar', '16000', '-ac', '1', '-f', 'wav', wav_path],
|
tmp.close()
|
||||||
capture_output=True, timeout=30
|
|
||||||
)
|
|
||||||
if result.returncode != 0:
|
|
||||||
print(json.dumps({"error": f"ffmpeg failed: {result.stderr.decode()[:200]}"}))
|
|
||||||
sys.exit(2)
|
|
||||||
|
|
||||||
import vosk
|
import vosk
|
||||||
model = vosk.Model(model_path)
|
model = vosk.Model(model_path)
|
||||||
rec = vosk.KaldiRecognizer(model, 16000)
|
rec = vosk.KaldiRecognizer(model, 16000)
|
||||||
|
|
||||||
wf = wave.open(wav_path, 'rb')
|
wf = wave.open(tmp.name, 'rb')
|
||||||
if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getframerate() != 16000:
|
|
||||||
print(json.dumps({"error": "Audio format mismatch after conversion"}))
|
|
||||||
sys.exit(2)
|
|
||||||
|
|
||||||
results = []
|
|
||||||
while True:
|
|
||||||
data = wf.readframes(4000)
|
|
||||||
if len(data) == 0:
|
|
||||||
break
|
|
||||||
if rec.AcceptWaveform(data):
|
|
||||||
results.append(json.loads(rec.Result()))
|
|
||||||
|
|
||||||
# Final result
|
|
||||||
final = json.loads(rec.FinalResult())
|
|
||||||
results.append(final)
|
|
||||||
|
|
||||||
# Extract text
|
|
||||||
text_parts = []
|
text_parts = []
|
||||||
total_conf = 0
|
total_conf = 0
|
||||||
conf_count = 0
|
conf_count = 0
|
||||||
for r in results:
|
|
||||||
t = r.get('text', '').strip()
|
while True:
|
||||||
if t:
|
data = wf.readframes(4000)
|
||||||
text_parts.append(t)
|
if not data:
|
||||||
# Confidence from final result
|
break
|
||||||
if 'result' in r:
|
if rec.AcceptWaveform(data):
|
||||||
for word in r.get('result', []):
|
r = json.loads(rec.Result())
|
||||||
if 'conf' in word:
|
t = r.get('text', '').strip()
|
||||||
total_conf += word['conf']
|
if t:
|
||||||
|
text_parts.append(t)
|
||||||
|
for w in r.get('result', []):
|
||||||
|
total_conf += w.get('conf', 0)
|
||||||
conf_count += 1
|
conf_count += 1
|
||||||
|
|
||||||
|
# Final partial
|
||||||
|
r = json.loads(rec.FinalResult())
|
||||||
|
t = r.get('text', '').strip()
|
||||||
|
if t:
|
||||||
|
text_parts.append(t)
|
||||||
|
for w in r.get('result', []):
|
||||||
|
total_conf += w.get('conf', 0)
|
||||||
|
conf_count += 1
|
||||||
|
|
||||||
text = ' '.join(text_parts).strip()
|
text = ' '.join(text_parts).strip()
|
||||||
confidence = round(total_conf / conf_count, 2) if conf_count > 0 else 0.0
|
confidence = round(total_conf / conf_count, 2) if conf_count > 0 else 0.0
|
||||||
|
|
||||||
@@ -86,13 +73,12 @@ def main():
|
|||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
print(json.dumps({"text": text, "confidence": confidence}))
|
print(json.dumps({"text": text, "confidence": confidence}))
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(json.dumps({"error": str(e)}))
|
print(json.dumps({"error": str(e)}))
|
||||||
sys.exit(2)
|
sys.exit(2)
|
||||||
finally:
|
finally:
|
||||||
if os.path.exists(wav_path):
|
try: os.unlink(tmp.name)
|
||||||
os.unlink(wav_path)
|
except: pass
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
main()
|
main()
|
||||||
|
|||||||
@@ -1088,21 +1088,36 @@ export async function initBot(config, api, tools, skills, agents) {
|
|||||||
const url = `https://api.telegram.org/file/bot${botToken}/${file.file_path}`;
|
const url = `https://api.telegram.org/file/bot${botToken}/${file.file_path}`;
|
||||||
const oggPath = `/tmp/zcode-voice-${Date.now()}.ogg`;
|
const oggPath = `/tmp/zcode-voice-${Date.now()}.ogg`;
|
||||||
|
|
||||||
// Download voice file
|
// Download voice file via fetch (faster than curl subprocess)
|
||||||
const { execSync } = await import('child_process');
|
const { execSync } = await import('child_process');
|
||||||
execSync(`curl -sL "${url}" -o "${oggPath}"`, { timeout: 15000 });
|
const voiceResp = await fetch(url);
|
||||||
|
if (!voiceResp.ok) throw new Error(`Download failed: ${voiceResp.status}`);
|
||||||
|
const { writeFileSync, unlinkSync } = await import('fs');
|
||||||
|
writeFileSync(oggPath, Buffer.from(await voiceResp.arrayBuffer()));
|
||||||
logger.info(`Voice downloaded: ${oggPath}`);
|
logger.info(`Voice downloaded: ${oggPath}`);
|
||||||
|
|
||||||
// Run Vosk STT via Python script
|
// Run Vosk STT — path is ../../scripts/stt.py from src/bot/
|
||||||
const sttScript = new URL('../scripts/stt.py', import.meta.url).pathname;
|
const sttScript = new URL('../../scripts/stt.py', import.meta.url).pathname;
|
||||||
const result = execSync(
|
let parsed;
|
||||||
`python3 "${sttScript}" "${oggPath}" 2>/dev/null`,
|
try {
|
||||||
{ timeout: 30000, encoding: 'utf-8' }
|
const result = execSync(
|
||||||
);
|
`python3 "${sttScript}" "${oggPath}"`,
|
||||||
const parsed = JSON.parse(result.trim());
|
{ timeout: 30000, encoding: 'utf-8', stdio: ['pipe', 'pipe', 'pipe'] }
|
||||||
|
);
|
||||||
// Cleanup
|
parsed = JSON.parse(result.trim());
|
||||||
execSync(`rm -f "${oggPath}"`);
|
} catch (e) {
|
||||||
|
// exit code 1 = no speech detected, stdout still has JSON
|
||||||
|
const stdout = e.stdout?.trim();
|
||||||
|
if (stdout) {
|
||||||
|
try { parsed = JSON.parse(stdout); } catch { parsed = null; }
|
||||||
|
}
|
||||||
|
if (!parsed || !parsed.text) {
|
||||||
|
await ctx.api.editMessageText(ctx.chat.id, statusMsg.message_id,
|
||||||
|
'🎤 Could not detect speech in the voice message.');
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
unlinkSync(oggPath);
|
||||||
|
|
||||||
if (parsed.error) {
|
if (parsed.error) {
|
||||||
logger.error(`STT error: ${parsed.error}`);
|
logger.error(`STT error: ${parsed.error}`);
|
||||||
|
|||||||
Reference in New Issue
Block a user