fix: voice STT path fix + execSync error handling + fetch download

This commit is contained in:
admin
2026-05-05 17:56:22 +00:00
Unverified
parent 3b6a7ec502
commit 3bfd842998
2 changed files with 74 additions and 73 deletions

View File

@@ -1,83 +1,70 @@
#!/usr/bin/env python3
"""
Vosk STT — transcribe an audio file (ogg, wav, mp3, ...) to text.

Usage: python3 stt.py <input_file>
Output: JSON to stdout: {"text": "...", "confidence": 0.95}
Exit codes: 0=success, 1=no speech, 2=error
"""
import sys, os, json, subprocess, tempfile, wave

# Silence vosk/Kaldi logging; must be set before vosk is imported in main().
os.environ['VOSK_LOG_LEVEL'] = '-1'


def main():
    """Transcribe sys.argv[1] with Vosk and print a JSON result to stdout."""
    if len(sys.argv) < 2:
        print(json.dumps({"error": "Usage: stt.py <audio_file>"}))
        sys.exit(2)
    audio_file = sys.argv[1]
    model_path = '/home/uroma2/vosk-model'
    if not os.path.isdir(model_path):
        print(json.dumps({"error": f"Model not found: {model_path}"}))
        sys.exit(2)

    # Convert to 16 kHz mono WAV via an ffmpeg pipe — no intermediate file.
    try:
        proc = subprocess.Popen(
            ['ffmpeg', '-y', '-i', audio_file, '-ar', '16000', '-ac', '1',
             '-f', 'wav', '-v', 'error', '-'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
        # communicate() drains both pipes; reading only stdout while stderr
        # is also piped can deadlock if ffmpeg fills its stderr buffer.
        wav_data, _ = proc.communicate(timeout=15)
        if proc.returncode != 0 or len(wav_data) < 44:  # 44 = minimal WAV header
            print(json.dumps({"error": "ffmpeg conversion failed"}))
            sys.exit(2)
    except Exception as e:
        print(json.dumps({"error": str(e)}))
        sys.exit(2)

    # The wave module needs a seekable file path, so spill the piped WAV to a
    # temp file; delete=False because it is reopened by name below.
    tmp = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
    try:
        tmp.write(wav_data)
        tmp.close()
        # Deferred import: model construction is the slow part; skip it
        # entirely on argument/conversion errors above.
        import vosk
        model = vosk.Model(model_path)
        rec = vosk.KaldiRecognizer(model, 16000)
        text_parts = []
        total_conf = 0.0
        conf_count = 0
        with wave.open(tmp.name, 'rb') as wf:
            while True:
                data = wf.readframes(4000)
                if not data:
                    break
                if rec.AcceptWaveform(data):
                    r = json.loads(rec.Result())
                    t = r.get('text', '').strip()
                    if t:
                        text_parts.append(t)
                    # Per-word 'conf' values feed the averaged confidence.
                    for w in r.get('result', []):
                        total_conf += w.get('conf', 0)
                        conf_count += 1
        # Flush the final partial utterance.
        r = json.loads(rec.FinalResult())
        t = r.get('text', '').strip()
        if t:
            text_parts.append(t)
        for w in r.get('result', []):
            total_conf += w.get('conf', 0)
            conf_count += 1
        text = ' '.join(text_parts).strip()
        confidence = round(total_conf / conf_count, 2) if conf_count > 0 else 0.0
        if not text:
            # Exit code 1 = "no speech" per the usage contract above.
            # NOTE(review): the exact message text fell in the diff gap —
            # confirm against the committed file.
            print(json.dumps({"error": "No speech detected"}))
            sys.exit(1)
        print(json.dumps({"text": text, "confidence": confidence}))
    except Exception as e:
        print(json.dumps({"error": str(e)}))
        sys.exit(2)
    finally:
        # Best-effort cleanup of the temp WAV; ignore races/missing file only.
        try:
            os.unlink(tmp.name)
        except OSError:
            pass


if __name__ == '__main__':
    main()