Skip to content

Instantly share code, notes, and snippets.

View tikhonova's full-sized avatar
👾

Tatiana Tikhonova tikhonova

👾
View GitHub Profile
@tikhonova
tikhonova / us_state_abbrev.py
Created October 30, 2020 11:29 — forked from rogerallen/us_state_abbrev.py
A Python Dictionary to translate US States to Two letter codes
# United States of America Python Dictionary to translate States,
# Districts & Territories to Two-Letter codes and vice versa.
#
# https://gist.github.com/rogerallen/1583593
#
# Dedicated to the public domain. To the extent possible under law,
# Roger Allen has waived all copyright and related or neighboring
# rights to this code.
us_state_abbrev = {
def convert_audio_to_wav(filename: str, filepath: str, dest_path: str, src_format: str = "mp3") -> None:
    """Convert one audio file to WAV via pydub.

    Parameters
    ----------
    filename : name of the source file inside ``filepath``.
    filepath : directory containing the source file.
    dest_path : directory the converted ``.wav`` file is written into.
    src_format : container format of the source file. Defaults to ``"mp3"``;
        pass e.g. ``"mp4"`` or ``"avi"`` for other inputs (the original had
        this hard-coded with a "replace with mp4 or avi" comment).
    """
    # Don't rebind the `filepath` parameter; keep the joined path separate.
    src_path = os.path.join(filepath, filename)
    # splitext strips any extension correctly -- the original filename[:-4]
    # silently mangled names whose extension is not exactly 3 characters.
    base, _ext = os.path.splitext(filename)
    dest_filepath = os.path.join(dest_path, f"{base}.wav")
    given_audio = AudioSegment.from_file(src_path, format=src_format)
    given_audio.export(dest_filepath, format="wav")
# create a list of input arguments for the function:
# every file in the source directory that has not already been written
# to dest_path.
# Snapshot dest_path ONCE -- the original called os.listdir(dest_path)
# inside the comprehension, rescanning the directory for every source
# file (O(n^2) directory reads).
already_converted = set(os.listdir(dest_path))
inputs = [
    (filename, filepath, dest_path)
    for filename in os.listdir(filepath)
    if filename not in already_converted
]
# create a Process pool with 16 worker processes
def reduce_noise(file_path: str, file: str, dest_path: str) -> None:
    """Denoise one WAV file and write the result under the same name.

    Reads ``file`` from ``file_path``, runs noisereduce over the samples,
    and writes the cleaned audio into ``dest_path``. Write failures are
    reported and skipped (best effort), preserving the original batch
    behaviour of not aborting the whole run on one bad file.
    """
    # load data
    rate, data = wavfile.read(os.path.join(file_path, file))
    # perform noise reduction (comment was misplaced below this call before)
    reduced_noise = nr.reduce_noise(y=data, sr=rate)
    try:
        wavfile.write(os.path.join(dest_path, file), rate, reduced_noise)
    except Exception:
        # BUG FIX: the original used `except Exception():` -- catching an
        # *instance*, which itself raises "TypeError: catching classes that
        # do not inherit from BaseException is not allowed" as soon as any
        # error occurs. Keep best-effort semantics, but surface the failure
        # instead of silently passing.
        print(f"reduce_noise: failed to write {file}")
# Trim long silences from an audio file by detecting non-silent spans and
# merging spans separated by short gaps.
# NOTE(review): `format` shadows a common name but is part of the public
# signature, so it is left as-is.
def remove_sil(file_path: str, file: str, dest_path: str, format="wav"):
sound = AudioSegment.from_file(os.path.join(file_path, file), format=format)
# silence_thresh is relative to this clip's own average loudness; dBFS is
# negative, so * 1.5 pushes the threshold further below average.
non_sil_times = detect_nonsilent(sound, min_silence_len=50, silence_thresh=sound.dBFS * 1.5)
if len(non_sil_times) == 0:
# the whole file is silence -- nothing to export
return None
elif len(non_sil_times) > 0:
# seed the merged list with the first non-silent span
non_sil_times_concat = [non_sil_times[0]]
if len(non_sil_times) > 1:
for t in non_sil_times[1:]:
# merge spans separated by less than 200 ms of silence
if t[0] - non_sil_times_concat[-1][-1] < 200:
# NOTE(review): the snippet is truncated here in this excerpt -- the body
# of this `if` and the rest of the function are not visible.
# using AudioSegment
def get_duration(self):
    """Return the length of the wrapped audio clip, in seconds."""
    seconds = self.audio.duration_seconds
    return seconds
# Cut one segment out of self.audio and resample it to 22050 Hz.
# NOTE(review): `split_filename` is unused in the visible lines -- the
# export step is presumably in the truncated remainder.
def single_split(self, from_min, to_min, split_filename):
# NOTE(review): the inline comment says "convert to milliseconds", but
# minutes -> ms would be * 60 * 1000; the * 7 * 1000 factor makes each
# unit 7 seconds. Possibly a deliberate chunk size -- confirm with caller.
t1 = from_min * 7 * 1000 # convert to milliseconds
t2 = to_min * 7 * 1000
split_audio = self.audio[t1:t2]
resampled = split_audio.set_frame_rate(22050) # change the sampling rate to 22050 Hz
# NOTE(review): excerpt ends here -- what happens to `resampled` (export /
# return) is not visible in this chunk.
# snippet of instantiating a client
# Google Cloud Speech-to-Text client plus a RecognitionConfig matching the
# audio produced earlier in this pipeline: 16-bit linear PCM WAV, 22050 Hz,
# mono, English.
client = speech.SpeechClient()
config = speech.RecognitionConfig(
encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
sample_rate_hertz=22050,
audio_channel_count=1,
model='phone_call', # recognizes low quality audio better than default
use_enhanced=1, # if available; NOTE(review): the API expects a bool -- 1 is truthy and works, True would be clearer
language_code="en-US")
''' Make metadata.csv and filelists via https://jaimeleal.github.io/how-to-speech-synthesis '''
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
# Directory of per-utterance transcript files on the local E: drive;
# `rows` accumulates records for metadata.csv (built in code not shown here).
filepath = 'E:/AlanWatts/dataset/transcripts2/'
files = os.listdir(filepath)
rows = []
# Attention parameters
# NOTE(review): this is a keyword-argument excerpt from a hyperparameter
# config (looks like a Tacotron 2-style hparams block -- confirm); it is
# not runnable on its own.
attention_rnn_dim=1024, # sets the number of units in the RNN
attention_dim=128, # sets the number of units in the attention mechanism
# These two values are relatively large and may require a significant amount of GPU memory during training and inference.
# Location Layer parameters
attention_location_n_filters=32, # sets the number of filters in the CNN
attention_location_kernel_size=31, # sets the size of the filters
# This means that the CNN has 32 filters and each filter has a kernel size of 31.
''' ffmpeg not found when using pydub utils
___
On Windows, download ffmpeg from the official website, add it to your PATH, then restart Git Bash.
https://github.com/jiaaro/pydub/issues/348
'''
''' AssertionError: Distributed mode requires CUDA
___
a MUST-read to confirm that both your GPU and your drivers support the CUDA version you've installed (or are about to install):
https://stackoverflow.com/questions/60987997/why-torch-cuda-is-available-returns-false-even-after-installing-pytorch-with
# NOTE(review): truncated training call (looks like aitextgen's ai.train --
# confirm); the closing parenthesis and any further arguments are outside
# this excerpt.
ai.train('result.txt',
line_by_line=False, # treat the file as one continuous text, not one sample per line
from_cache=False, # tokenize from the raw file rather than a cached dataset
num_steps=100000,
generate_every=5000, # print sample generations every 5k steps
save_every=5000, # checkpoint every 5k steps
save_gdrive=True, # copy checkpoints to Google Drive
learning_rate=1e-3,
fp16=False, # full-precision training
batch_size=1,