Last active
June 2, 2023 19:07
-
-
Save apetenchea/beb6d427c38cae6cabe57f051e117325 to your computer and use it in GitHub Desktop.
Speech to text in Python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# importing libraries | |
import pathlib | |
import sys | |
import speech_recognition as sr | |
import os | |
from pydub import AudioSegment | |
from pydub.silence import split_on_silence | |
# module-level recognizer instance, used by transcribe_audio
r = sr.Recognizer()
# a function to recognize speech in the audio file | |
# so that we don't repeat ourselves in other functions
def transcribe_audio(path):
    """Return the text recognized in the audio file at `path`.

    The file is opened as a `sr.AudioFile`, read in full with the
    module-level recognizer `r`, and passed to `r.recognize_google`.
    Exceptions raised by the recognizer are not handled here; they
    propagate to the caller.
    """
    with sr.AudioFile(path) as audio_source:
        # read the entire file into an audio data object
        recording = r.record(audio_source)
        # convert the recording to text
        return r.recognize_google(recording)
# a function that splits the audio file into chunks on silence | |
# and applies speech recognition | |
def get_large_audio_transcription_on_silence(path):
    """Transcribe a large audio file by splitting it into chunks on silence.

    The file at `path` is loaded with pydub and split with
    `split_on_silence`. Each chunk is exported as ``chunk<i>.wav`` into the
    ``audio-chunks`` directory (created if missing) and passed to
    `transcribe_audio`.

    Returns the recognized text of all chunks in order, each one capitalized
    and followed by ". ". Chunks that raise `sr.UnknownValueError` are
    reported on stdout and skipped.
    """
    # open the audio file using pydub
    sound = AudioSegment.from_file(path)
    # split the sound where silence lasts 500 milliseconds or more
    chunks = split_on_silence(
        sound,
        # experiment with this value for your target audio file
        min_silence_len=500,
        # threshold is 14 dB below the sound's own dBFS level; adjust per requirement
        silence_thresh=sound.dBFS - 14,
        # keep 500 ms of silence around each chunk, adjustable as well
        keep_silence=500,
    )
    folder_name = "audio-chunks"
    # create the directory for the audio chunks; exist_ok avoids the
    # check-then-create race of isdir() followed by mkdir()
    os.makedirs(folder_name, exist_ok=True)
    # collect the pieces and join once instead of growing a string with +=
    texts = []
    for i, audio_chunk in enumerate(chunks, start=1):
        # export the chunk into the `folder_name` directory
        chunk_filename = os.path.join(folder_name, f"chunk{i}.wav")
        audio_chunk.export(chunk_filename, format="wav")
        # recognize the chunk
        try:
            text = transcribe_audio(chunk_filename)
        except sr.UnknownValueError as e:
            print("Error:", str(e))
        else:
            text = f"{text.capitalize()}. "
            print(chunk_filename, ":", text)
            texts.append(text)
    # return the text for all chunks detected
    return "".join(texts)
def convert_mp3_files(path):
    """Convert every MP3 file found under `path` to WAV in ``converted/``.

    `path` is searched recursively for ``*.mp3`` files. Each file is loaded
    with `AudioSegment.from_mp3` and exported as ``converted/<name>.wav``,
    relative to the current working directory. The ``converted`` directory
    is created when there is at least one file to convert.

    NOTE: only the base name of each file is kept, so MP3 files that share
    a name in different sub-directories are written to the same output file.
    """
    mp3_files = list(pathlib.Path(path).glob('**/*.mp3'))
    if not mp3_files:
        return
    # make sure the target directory exists before exporting into it
    os.makedirs("converted", exist_ok=True)
    for mp3_file in mp3_files:
        sound = AudioSegment.from_mp3(str(mp3_file))
        # swap only the final suffix; str.replace() would also rewrite any
        # ".mp3" that appears earlier in the file name
        wav_name = mp3_file.with_suffix(".wav").name
        sound.export(f"converted/{wav_name}", format="wav")
def find_audio_files(path):
    """Return the paths, as strings, of all ``.wav`` files below `path`.

    The search is recursive.
    """
    root = pathlib.Path(path)
    wav_paths = root.glob('**/*.wav')
    return list(map(str, wav_paths))
# Usage | |
if __name__ == '__main__':
    # the directory to process is the only command-line argument
    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} <audio-directory>")
    p = sys.argv[1]
    convert_mp3_files(p)
    # NOTE(review): convert_mp3_files writes into ./converted relative to the
    # current working directory; those files are only found below if that
    # directory lies inside `p` -- confirm this is intended.
    transcripts = {}
    for wav_path in find_audio_files(p):
        # skip files the recognizer cannot understand instead of aborting
        # the whole run before anything has been written
        try:
            transcripts[wav_path] = transcribe_audio(wav_path)
        except sr.UnknownValueError as e:
            print("Error:", wav_path, str(e))
    # explicit encoding so non-ASCII transcriptions are written the same way
    # on every platform
    with open('merged-audio.txt', 'w', encoding='utf-8') as out:
        for wav_path, text in transcripts.items():
            print(wav_path, text, file=out)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment