# -rw-r--r-- 4.7 KiB View raw
                                                                                
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
#!/usr/bin/env python3

#  Copyright (C) 2021
#
#  This is free software: you can redistribute it and/or modify
#  it under the terms of the Affero GNU General Public License as published
#  by the Free Software Foundation, either version 3 of the License, or
#  (at your option) any later version.

"""
Uses VOSK speech-to-text.

Thanks to VOSK, of course. Some code is from Campbell Barton's https://github.com/ideasman42/nerd-dictation; it is heavily modified and stripped down. (I'll be looking at its command-line options when I re-add features.)

This code is way more stripped down, you might want to re-add features like parsing numbers.

`final_transcribed` is the final result.

`interim_transcribed` is intermediate for quicker responses or maybe trying to catch on to things the final responses false-positives on.

nerd-dictation is better in that it generates text/x events based on partial results and can undo them too. I actually wrote this to make a more elaborate system of do/undo, commands, etc.
"""

import time
# Wall-clock time at script start; used further down to print how long startup took.
initial_t = time.time()

import vosk  # Provides the speech model and the recognizer used below.

# NOTE(review): presumably -1 silences VOSK's own log output — confirm against the VOSK docs.
vosk.SetLogLevel(-1)

# Read model from file, if available. TODO: nerd-dictation was more elaborate on it.
import os
# Directory the VOSK model is loaded from.
# `expanduser` is used instead of `os.getenv('HOME') + ...` because the latter
# raises TypeError (None + str) when $HOME is not set in the environment.
vosk_model_dir = os.path.expanduser("~/.config/nerd-dictation/model/")

# Sample rate (Hz) requested from the recorder and handed to the recognizer.
RECORD_SAMPLE_RATE = 44100
# Optional PulseAudio device name; when falsy no `--device` option is passed to the recorder.
pulse_device_name = None
# Minimum time in seconds between two iterations of the main loop.
min_dt = 0.001

def figure_recog():
    """Load the VOSK model and return a recognizer configured for it.

    The model is read from `vosk_model_dir` and the recognizer is created for
    `RECORD_SAMPLE_RATE`, with up to 10 alternatives requested and the
    recognizer's `SetWords` option switched on.

    Returns:
        The configured `vosk.KaldiRecognizer`.

    Raises:
        SystemExit: with status 1 when `vosk_model_dir` does not exist.
    """
    # Imported locally on purpose: this function is called at module level
    # before the file's top-level `import sys` has executed, so relying on the
    # global name would raise NameError exactly when the help message below
    # is needed.
    import sys

    if not os.path.exists(vosk_model_dir):
        sys.stderr.write(
            f"Please download the model from https://alphacephei.com/vosk/models and unpack it to {vosk_model_dir}.\n")
        sys.stderr.write(" (or use some other source of models)\n")
        sys.exit(1)

    model = vosk.Model(vosk_model_dir)
    recog = vosk.KaldiRecognizer(model, RECORD_SAMPLE_RATE)
    recog.SetMaxAlternatives(10)
    recog.SetWords(True)

    return recog

# Build the recognizer once, up front.
recog = figure_recog()

# Upper bound in bytes for a single read from the recorder: 1 MiB, generous
# so that audio which piled up while waiting can be fetched in one go.
block_size = 1024 * 1024

import json
import subprocess
import sys

from typing import IO

# Command line for the PulseAudio recorder; its stdout is read as audio data
# by the main loop. The `--device` option is only present when a device name
# has been configured.
parec_cmd = tuple(
    option
    for option in (
        "parec",
        "--record",
        f"--rate={RECORD_SAMPLE_RATE}",
        "--channels=1",
        f"--device={pulse_device_name}" if pulse_device_name else None,
        "--format=s16ne",
        "--latency=10",
    )
    if option is not None
)
print('using pulseaudio recorder command:', parec_cmd)

def file_handle_make_non_blocking(file_handle: IO[bytes]) -> None:
    """Put *file_handle* into non-blocking mode.

    Adds ``O_NONBLOCK`` to the status flags of the handle's file descriptor,
    leaving the flags that are already set untouched.
    """
    import fcntl

    descriptor = file_handle.fileno()
    # Read the current status flags first so only O_NONBLOCK is added.
    current_flags = fcntl.fcntl(descriptor, fcntl.F_GETFL)
    fcntl.fcntl(descriptor, fcntl.F_SETFL, current_flags | os.O_NONBLOCK)

def final_transcribed(json_data):
    """Report a final recognition result.

    Prints an 'ok' marker and then one line per entry of
    ``json_data['alternatives']`` (none when the key is missing), showing
    that entry's confidence and text.
    """
    print('ok')
    alternatives = json_data.get('alternatives', [])
    for alternative in alternatives:
        confidence = alternative.get('confidence')
        text = alternative.get('text')
        print('.', confidence, text)

def interim_transcribed(json_data):
    """Handle a partial (interim) recognition result.

    Prints the result and, when the partial text is exactly "exit now",
    closes stdout and terminates the program. The main loop only calls this
    when the partial result differs from the previous one; without that
    filtering there would be many calls with the same partial text.

    Args:
        json_data: Parsed partial result; its 'partial' entry, if present,
            is compared against the exit phrase.

    Raises:
        SystemExit: with status 0 when the partial text is "exit now".
    """
    print('bad', json_data)
    if json_data.get('partial') == "exit now":
        sys.stdout.close()
        # Use `sys.exit` rather than the builtin `exit`: the latter is a
        # helper injected by the `site` module for interactive use and is
        # missing under `python -S` or in frozen applications.
        sys.exit(0)

def interim_transcribed_unchanged(json_data):
    """Hook for partial results identical to the previous one; does nothing."""
    return None

if True:
    # Start the recorder; its stdout is fed to the recognizer below.
    ps = subprocess.Popen(parec_cmd, stdout=subprocess.PIPE)

    try:
        stdout = ps.stdout
        assert stdout is not None

        # Needed so whatever is available can be read (without waiting).
        file_handle_make_non_blocking(stdout)

        # Last partial result as (raw JSON text, parsed data). Tracked to
        # prevent excessive load when the "partial" result doesn't change.
        prev_json_interim = ("", None)

        print('startup took', time.time() - initial_t)

        prev_t = time.time()

        while True:  # Main loop.
            # Throttle: make sure at least `min_dt` seconds pass between
            # iterations by sleeping for the *remaining* time.
            dt = time.time() - prev_t
            if dt < min_dt:
                time.sleep(min_dt - dt)
            prev_t = time.time()

            # Sample the recorder's state *before* reading, so anything it
            # wrote before exiting is still picked up by the read below.
            recorder_exited = ps.poll() is not None

            # Mostly the data read is quite small (under 1k); a non-blocking
            # read with nothing available yields a falsy value.
            data = stdout.read(block_size)

            if not data:
                if recorder_exited:
                    # No more audio will ever arrive; stop instead of spinning.
                    break
                continue

            if recog.AcceptWaveform(data):  # Final result.
                json_text = recog.Result()
                # Forget the last partial so the next one is always reported.
                prev_json_interim = ("", None)
                final_transcribed(json.loads(json_text))
                continue

            # Interim result: the raw text is kept only for comparison, to
            # detect change without re-parsing identical results.
            json_text = recog.PartialResult()
            if prev_json_interim[0] != json_text:
                prev_json_interim = (json_text, json.loads(json_text))
                interim_transcribed(prev_json_interim[1])
            else:
                # Pass the parsed data, like the other handlers receive.
                interim_transcribed_unchanged(prev_json_interim[1])
    finally:
        # Always stop and reap the recorder, including when a handler exits
        # the program via SystemExit or the user interrupts it.
        if ps.poll() is None:
            ps.terminate()
        ps.wait()