Last active
November 2, 2020 08:40
-
-
Save r9y9/9adf93b51cf04bd5a1cc60bf6d308724 to your computer and use it in GitHub Desktop.
Musical context features-v2
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 68, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Populating the interactive namespace from numpy and matplotlib\n" | |
] | |
} | |
], | |
"source": [ | |
"%pylab inline\n", | |
"import matplotlib\n", | |
"rcParams['figure.figsize'] = (16, 5)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 69, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"The autoreload extension is already loaded. To reload it, use:\n", | |
" %reload_ext autoreload\n" | |
] | |
} | |
], | |
"source": [ | |
"%load_ext autoreload\n", | |
"%autoreload" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 70, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import librosa\n", | |
"import librosa.display\n", | |
"import torch\n", | |
"from torch import nn\n", | |
"from torch.nn import functional as F\n", | |
"import torchaudio\n", | |
"from IPython.display import Audio\n", | |
"import IPython\n", | |
"from nnmnkwii import preprocessing as P\n", | |
"import pysptk\n", | |
"import pyworld\n", | |
"from scipy import signal\n", | |
"from scipy.io import wavfile\n", | |
"from nnmnkwii.io import hts\n", | |
"from glob import glob\n", | |
"from nnmnkwii.frontend import merlin as fe" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 71, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"question_path = \"../egs/_common/hed/jp_qst002_nnsvs.hed\"\n", | |
"binary_dict, continuous_dict = hts.load_question_set(\n", | |
" question_path, append_hat_for_LL=False)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## 0. Collect label files" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 72, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"394" | |
] | |
}, | |
"execution_count": 72, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"files = glob(\"../egs/nit-song070/00-svs-world/data/acoustic/label_phone_align/*.lab\")\n", | |
"files2 = glob(\"../egs/kiritan_singing/00-svs-world/data/acoustic/label_phone_align/*.lab\")\n", | |
"files3 = glob(\"../egs/jsut-song/00-svs-world/data/acoustic/label_phone_align/*.lab\")\n", | |
"\n", | |
"files.extend(files2)\n", | |
"files.extend(files3)\n", | |
"len(files)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## 1. Compute stats for musical/linguistic features" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 73, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"from tqdm import tqdm_notebook as tqdm" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 75, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"(360,)\n" | |
] | |
}, | |
{ | |
"data": { | |
"application/vnd.jupyter.widget-view+json": { | |
"model_id": "7a828c42d91245ad86b566c539a2be20", | |
"version_major": 2, | |
"version_minor": 0 | |
}, | |
"text/plain": [ | |
"HBox(children=(IntProgress(value=0, max=394), HTML(value='')))" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"labels = hts.load(files[0])\n", | |
"features = fe.linguistic_features(\n", | |
" labels, binary_dict, continuous_dict,\n", | |
" add_frame_features=False,\n", | |
" subphone_features=None)\n", | |
"freq = np.zeros(features.shape[1], dtype=np.int)\n", | |
"print(freq.shape)\n", | |
"\n", | |
"for f in tqdm(files):\n", | |
" labels = hts.load(f)\n", | |
" features = fe.linguistic_features(\n", | |
" labels, binary_dict, continuous_dict,\n", | |
" add_frame_features=False,\n", | |
" subphone_features=None)\n", | |
" \n", | |
" # binary features\n", | |
" b_freq = (features[:, :len(binary_dict)] == 1).sum(0)\n", | |
" \n", | |
" # continuous features\n", | |
" c_freq = (features[:, len(binary_dict):] != -1).sum(0)\n", | |
" \n", | |
" #print(b_freq.shape, c_freq.shape)\n", | |
" \n", | |
" freq[:len(b_freq)] += b_freq\n", | |
" freq[len(b_freq):] += c_freq" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## 3. Find rate features" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 76, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"indices = np.argsort(freq)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 77, | |
"metadata": { | |
"scrolled": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Idx 68, freq 0, regex: [re.compile('\\\\^dy\\\\-')]\n", | |
"Idx 212, freq 0, regex: [re.compile('\\\\+E\\\\='), re.compile('\\\\+O\\\\=')]\n", | |
"Idx 209, freq 0, regex: [re.compile('\\\\+I\\\\='), re.compile('\\\\+U\\\\=')]\n", | |
"Idx 206, freq 0, regex: [re.compile('\\\\+A\\\\='), re.compile('\\\\+U\\\\='), re.compile('\\\\+O\\\\=')]\n", | |
"Idx 203, freq 0, regex: [re.compile('\\\\+I\\\\='), re.compile('\\\\+E\\\\=')]\n", | |
"Idx 200, freq 0, regex: [re.compile('\\\\+A\\\\='), re.compile('\\\\+I\\\\='), re.compile('\\\\+U\\\\='), re.compile('\\\\+E\\\\='), re.compile('\\\\+O\\\\=')]\n", | |
"Idx 191, freq 0, regex: [re.compile('\\\\-v\\\\+')]\n", | |
"Idx 56, freq 0, regex: [re.compile('\\\\^E\\\\-')]\n", | |
"Idx 55, freq 0, regex: [re.compile('\\\\^A\\\\-')]\n", | |
"Idx 96, freq 0, regex: [re.compile('\\\\^v\\\\-')]\n", | |
"Idx 114, freq 0, regex: [re.compile('\\\\-I\\\\+'), re.compile('\\\\-U\\\\+')]\n", | |
"Idx 105, freq 0, regex: [re.compile('\\\\-A\\\\+'), re.compile('\\\\-I\\\\+'), re.compile('\\\\-U\\\\+'), re.compile('\\\\-E\\\\+'), re.compile('\\\\-O\\\\+')]\n", | |
"Idx 163, freq 0, regex: [re.compile('\\\\-dy\\\\+')]\n", | |
"Idx 159, freq 0, regex: [re.compile('\\\\-by\\\\+')]\n", | |
"Idx 155, freq 0, regex: [re.compile('\\\\-U\\\\+')]\n", | |
"Idx 154, freq 0, regex: [re.compile('\\\\-O\\\\+')]\n", | |
"Idx 152, freq 0, regex: [re.compile('\\\\-I\\\\+')]\n", | |
"Idx 151, freq 0, regex: [re.compile('\\\\-E\\\\+')]\n", | |
"Idx 150, freq 0, regex: [re.compile('\\\\-A\\\\+')]\n", | |
"Idx 119, freq 0, regex: [re.compile('\\\\-A\\\\+')]\n", | |
"Idx 117, freq 0, regex: [re.compile('\\\\-E\\\\+'), re.compile('\\\\-O\\\\+')]\n", | |
"Idx 108, freq 0, regex: [re.compile('\\\\-I\\\\+'), re.compile('\\\\-E\\\\+')]\n", | |
"Idx 214, freq 0, regex: [re.compile('\\\\+A\\\\=')]\n", | |
"Idx 24, freq 0, regex: [re.compile('\\\\^A\\\\-')]\n", | |
"Idx 245, freq 0, regex: [re.compile('\\\\+A\\\\=')]\n", | |
"Idx 286, freq 0, regex: [re.compile('\\\\+v\\\\=')]\n", | |
"Idx 64, freq 0, regex: [re.compile('\\\\^by\\\\-')]\n", | |
"Idx 60, freq 0, regex: [re.compile('\\\\^U\\\\-')]\n", | |
"Idx 59, freq 0, regex: [re.compile('\\\\^O\\\\-')]\n", | |
"Idx 57, freq 0, regex: [re.compile('\\\\^I\\\\-')]\n", | |
"Idx 22, freq 0, regex: [re.compile('\\\\^E\\\\-'), re.compile('\\\\^O\\\\-')]\n", | |
"Idx 258, freq 0, regex: [re.compile('\\\\+dy\\\\=')]\n", | |
"Idx 10, freq 0, regex: [re.compile('\\\\^A\\\\-'), re.compile('\\\\^I\\\\-'), re.compile('\\\\^U\\\\-'), re.compile('\\\\^E\\\\-'), re.compile('\\\\^O\\\\-')]\n", | |
"Idx 254, freq 0, regex: [re.compile('\\\\+by\\\\=')]\n", | |
"Idx 246, freq 0, regex: [re.compile('\\\\+E\\\\=')]\n", | |
"Idx 19, freq 0, regex: [re.compile('\\\\^I\\\\-'), re.compile('\\\\^U\\\\-')]\n", | |
"Idx 247, freq 0, regex: [re.compile('\\\\+I\\\\=')]\n", | |
"Idx 13, freq 0, regex: [re.compile('\\\\^I\\\\-'), re.compile('\\\\^E\\\\-')]\n", | |
"Idx 16, freq 0, regex: [re.compile('\\\\^A\\\\-'), re.compile('\\\\^U\\\\-'), re.compile('\\\\^O\\\\-')]\n", | |
"Idx 249, freq 0, regex: [re.compile('\\\\+O\\\\=')]\n", | |
"Idx 250, freq 0, regex: [re.compile('\\\\+U\\\\=')]\n", | |
"Idx 111, freq 0, regex: [re.compile('\\\\-A\\\\+'), re.compile('\\\\-U\\\\+'), re.compile('\\\\-O\\\\+')]\n", | |
"Idx 264, freq 1, regex: [re.compile('\\\\+hy\\\\=')]\n", | |
"Idx 167, freq 2, regex: [re.compile('\\\\-gy\\\\+')]\n", | |
"Idx 169, freq 2, regex: [re.compile('\\\\-hy\\\\+')]\n", | |
"Idx 262, freq 2, regex: [re.compile('\\\\+gy\\\\=')]\n", | |
"Idx 74, freq 2, regex: [re.compile('\\\\^hy\\\\-')]\n", | |
"Idx 72, freq 2, regex: [re.compile('\\\\^gy\\\\-')]\n", | |
"Idx 82, freq 3, regex: [re.compile('\\\\^ny\\\\-')]\n", | |
"Idx 80, freq 3, regex: [re.compile('\\\\^my\\\\-')]\n", | |
"Idx 177, freq 3, regex: [re.compile('\\\\-ny\\\\+')]\n", | |
"Idx 175, freq 3, regex: [re.compile('\\\\-my\\\\+')]\n", | |
"Idx 272, freq 3, regex: [re.compile('\\\\+ny\\\\=')]\n", | |
"Idx 270, freq 3, regex: [re.compile('\\\\+my\\\\=')]\n", | |
"Idx 281, freq 16, regex: [re.compile('\\\\+sil\\\\=')]\n", | |
"Idx 86, freq 17, regex: [re.compile('\\\\^py\\\\-')]\n", | |
"Idx 181, freq 17, regex: [re.compile('\\\\-py\\\\+')]\n", | |
"Idx 276, freq 17, regex: [re.compile('\\\\+py\\\\=')]\n", | |
"Idx 186, freq 19, regex: [re.compile('\\\\-sil\\\\+')]\n", | |
"Idx 0, freq 19, regex: [re.compile('\\\\As\\\\@')]\n", | |
"Idx 268, freq 38, regex: [re.compile('\\\\+ky\\\\=')]\n", | |
"Idx 173, freq 40, regex: [re.compile('\\\\-ky\\\\+')]\n", | |
"Idx 78, freq 40, regex: [re.compile('\\\\^ky\\\\-')]\n", | |
"Idx 91, freq 92, regex: [re.compile('\\\\^sil\\\\-')]\n", | |
"Idx 189, freq 100, regex: [re.compile('\\\\-ty\\\\+')]\n", | |
"Idx 284, freq 100, regex: [re.compile('\\\\+ty\\\\=')]\n", | |
"Idx 94, freq 100, regex: [re.compile('\\\\^ty\\\\-')]\n" | |
] | |
} | |
], | |
"source": [ | |
"for idx in np.argsort(freq):\n", | |
" if freq[idx] > 100:\n", | |
" continue\n", | |
" if idx < len(binary_dict):\n", | |
" print(f\"Idx {idx:3d}, freq {freq[idx]:3d}, regex: {binary_dict[idx]}\")\n", | |
" else:\n", | |
" print(f\"Idx {idx:3d}, freq {freq[idx]:3d}, regex: {continuous_dict[idx-len(binary_dict)]}\")" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"I have done manual feature selection by checking the above information. \n", | |
"\n", | |
"### Note\n", | |
"\n", | |
"Before: 420 dim\n", | |
"after: 360 dim\n", | |
"\n", | |
"- Beat features are removed\n", | |
"- Some rare continuous features are removed.\n", | |
"- Phoneme flag features are removed\n", | |
"- Time signature features are removed\n", | |
"- Rare phoneme identity features are preserved as is.\n", | |
"\n", | |
"### Note for future:\n", | |
"\n", | |
"- We might want to revisit phoneme identity features. Removing unused ones, merging similear phones into a single one, etc." | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.11" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment