Last active
December 8, 2023 04:36
-
-
Save naturale0/b0c15b0940c23e40d8775acfdb5a575e to your computer and use it in GitHub Desktop.
implementation of NPLM (pytorch & tensorflow)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"id": "healthy-wheel", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import tensorflow as tf\n", | |
"import numpy as np\n", | |
"import matplotlib.pyplot as plt\n", | |
"import random\n", | |
"%matplotlib inline\n", | |
"%config Completer.use_jedi = False" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"id": "alive-permit", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'2.4.0-rc0'" | |
] | |
}, | |
"execution_count": 2, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"tf.random.set_seed(0)\n", | |
"tf.__version__" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "oriental-cycling", | |
"metadata": { | |
"toc": true | |
}, | |
"source": [ | |
"<h1>Implementation: Neural Probabilistic Language Model<span class=\"tocSkip\"></span></h1>\n", | |
"<div class=\"toc\"><ul class=\"toc-item\"><li><span><a href=\"#Create-dataset-from-nltk-data\" data-toc-modified-id=\"Create-dataset-from-nltk-data-1\"><span class=\"toc-item-num\">1 </span>Create dataset from <code>nltk</code> data</a></span></li><li><span><a href=\"#Build-the-model\" data-toc-modified-id=\"Build-the-model-2\"><span class=\"toc-item-num\">2 </span>Build the model</a></span></li><li><span><a href=\"#Train-the-model\" data-toc-modified-id=\"Train-the-model-3\"><span class=\"toc-item-num\">3 </span>Train the model</a></span></li><li><span><a href=\"#Embedding-result\" data-toc-modified-id=\"Embedding-result-4\"><span class=\"toc-item-num\">4 </span>Embedding result</a></span></li></ul></div>" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "digital-filename", | |
"metadata": {}, | |
"source": [ | |
"## Create dataset from `nltk` data" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"id": "animated-stretch", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['[', 'emma', 'by', 'jane', 'austen', '1816', ']', 'volume', 'i', 'chapter']" | |
] | |
}, | |
"execution_count": 3, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"from nltk.corpus import gutenberg\n", | |
"data = [w.lower() for w in gutenberg.words()]\n", | |
"data[:10]" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "respective-illustration", | |
"metadata": {}, | |
"source": [ | |
"* build vocabulary" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"id": "rising-donna", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"42339" | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"vocab = list(set(data))\n", | |
"VOCAB_SIZE = len(vocab)\n", | |
"VOCAB_SIZE" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"id": "smaller-envelope", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"word_to_ix = {word: ix for ix, word in enumerate(vocab)}" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "unavailable-classic", | |
"metadata": {}, | |
"source": [ | |
"* build a custom dataset with `tf.data.Dataset.from_tensor_slices`." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"id": "minor-detective", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"CONTEXT_SIZE=3\n", | |
"\n", | |
"x, y = [], []\n", | |
"for i in range(len(data)-CONTEXT_SIZE):\n", | |
" x.append(list(map(lambda x: word_to_ix[x], data[i:i+CONTEXT_SIZE])))\n", | |
" y.append(word_to_ix[data[i+CONTEXT_SIZE]])\n", | |
"x = np.array(x)\n", | |
"y = np.array(y)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"id": "shaped-cowboy", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"(array([27532, 13282, 10722]), 23537)" | |
] | |
}, | |
"execution_count": 8, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"x[0], y[0]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"id": "parallel-recipe", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"trainset = tf.data.Dataset.from_tensor_slices((x, y)).shuffle(50000).batch(128)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "quick-steel", | |
"metadata": {}, | |
"source": [ | |
"## Build the model" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "willing-release", | |
"metadata": {}, | |
"source": [ | |
"![bengio et al.png](https://miro.medium.com/max/2408/1*EqKiy4-6tuLSoPP_kub33Q.png)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"id": "thorough-treasury", | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [], | |
"source": [ | |
"EMBEDDING_DIM = 128\n", | |
"HIDDEN_DIM = 128" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"id": "dense-writer", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"class NPLM(tf.keras.Model):\n", | |
" \n", | |
" def __init__(self, vocab_size, context_size, embedding_dim, hidden_dim):\n", | |
" super(NPLM, self).__init__()\n", | |
" self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)\n", | |
" # affine layers for tanh\n", | |
" self.flatten = tf.keras.layers.Flatten()\n", | |
" self.linear1 = tf.keras.layers.Dense(hidden_dim, activation=\"tanh\")\n", | |
" self.linear2 = tf.keras.layers.Dense(vocab_size, use_bias=False)\n", | |
" # affine layer for residual connection\n", | |
" self.linear3 = tf.keras.layers.Dense(vocab_size)\n", | |
" \n", | |
" def call(self, x):\n", | |
" x = self.embedding(x)\n", | |
" x = self.flatten(x)\n", | |
" \n", | |
" x1 = self.linear1(x)\n", | |
" x1 = self.linear2(x1)\n", | |
" \n", | |
" x2 = self.linear3(x)\n", | |
" \n", | |
" x = x1 + x2\n", | |
" return x\n", | |
" " | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "afraid-chaos", | |
"metadata": {}, | |
"source": [ | |
"## Train the model" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"id": "decimal-executive", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"criterion = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "neural-relations", | |
"metadata": {}, | |
"source": [ | |
"**train with [1-cycle learning rate policy](https://arxiv.org/abs/1708.07120)**" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "adjustable-chance", | |
"metadata": {}, | |
"source": [ | |
"1. write the scheduler" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"id": "important-sampling", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"N_EPOCH = 3\n", | |
"MAX_LR = 0.1\n", | |
"MIN_LR = 0.01\n", | |
"\n", | |
"def scheduler(epoch, lr):\n", | |
" step_size = (MAX_LR - MIN_LR) / (N_EPOCH // 2.1)\n", | |
" half_cycle = N_EPOCH // 2.1\n", | |
" \n", | |
" # exploration\n", | |
" if epoch < half_cycle:\n", | |
" return lr + step_size\n", | |
" \n", | |
" # exploitation\n", | |
" elif epoch < 2 * half_cycle:\n", | |
" return lr - step_size\n", | |
" else:\n", | |
" return lr * tf.math.exp(-1.)\n", | |
"\n", | |
"lrscheduler = tf.keras.callbacks.LearningRateScheduler(scheduler)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "deadly-prague", | |
"metadata": {}, | |
"source": [ | |
"2. train the model" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"id": "delayed-profit", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"model = NPLM(VOCAB_SIZE, CONTEXT_SIZE, EMBEDDING_DIM, HIDDEN_DIM)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"id": "tested-witch", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"optimizer = tf.keras.optimizers.SGD(learning_rate=MIN_LR, momentum=0.9)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"id": "numerous-renaissance", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"model.compile(\n", | |
" optimizer=optimizer,\n", | |
" loss=criterion,\n", | |
" metrics=[\"accuracy\"]\n", | |
")" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "legal-execution", | |
"metadata": {}, | |
"source": [ | |
"```python\n", | |
"history = model.fit(trainset, epochs=N_EPOCH, callbacks=[lrscheduler])\n", | |
"\n", | |
"plt.plot(history.history[\"loss\"])\n", | |
"```" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "orange-programmer", | |
"metadata": {}, | |
"source": [ | |
"* save the model" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "electoral-blond", | |
"metadata": {}, | |
"source": [ | |
"```python\n", | |
"model.save(\"./NPLM_SGD_lr0.01-0.1_momentum0.9_epoch3\")\n", | |
"```" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "possible-attention", | |
"metadata": {}, | |
"source": [ | |
"## Embedding result" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "gentle-length", | |
"metadata": {}, | |
"source": [ | |
"* load the pre-trained final model" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "center-burke", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"model = tf.keras.models.load_model(\"./NPLM_SGD_lr0.01-1.0_momentum0.9_epoch50/\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 26, | |
"id": "buried-vermont", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"embedding = model.embedding" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 32, | |
"id": "changing-luxury", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from collections import Counter\n", | |
"\n", | |
"test_words = Counter(data).most_common(100)\n", | |
"test_words_raw = [w.lower() for w, _ in test_words]\n", | |
"test_words = [word_to_ix[w.lower()] for w in test_words_raw]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 37, | |
"id": "liberal-people", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from sklearn.manifold import TSNE\n", | |
"embed_xy = embedding(np.array(test_words)).numpy()\n", | |
"embed_xy = TSNE(2).fit_transform(embed_xy)\n", | |
"embed_x, embed_y = list(zip(*embed_xy))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 38, | |
"id": "atlantic-qatar", | |
"metadata": { | |
"scrolled": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"<matplotlib.lines.Line2D at 0x333a02d30>" | |
] | |
}, | |
"execution_count": 38, | |
"metadata": {}, | |
"output_type": "execute_result" | |
}, | |
{ | |
"data": { | |
"image/png": "\n", | |
"text/plain": [ | |
"<Figure size 720x720 with 1 Axes>" | |
] | |
}, | |
"metadata": { | |
"needs_background": "light" | |
}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"plt.figure(figsize=(10,10))\n", | |
"\n", | |
"for xy, word in zip(embed_xy, test_words_raw):\n", | |
" plt.annotate(word, xy, clip_on=True)\n", | |
"\n", | |
"plt.title(\"Word Embedding\")\n", | |
"plt.scatter(embed_x, embed_y, alpha=.3)\n", | |
"plt.axhline([0], ls=\":\", c=\"grey\")\n", | |
"plt.axvline([0], ls=\":\", c=\"grey\")" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "ongoing-teach", | |
"metadata": {}, | |
"source": [ | |
"* 5-most similar words" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 39, | |
"id": "duplicate-chess", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from sklearn.metrics.pairwise import cosine_distances" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 54, | |
"id": "urban-attendance", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def find_similar(word, n=5, from_total=5000):\n", | |
" distance = []\n", | |
" y = embedding(word_to_ix[word]).numpy().reshape(1, -1)\n", | |
" total = Counter(data).most_common(from_total)\n", | |
" for w, _ in total:\n", | |
" x = embedding(word_to_ix[w]).numpy().reshape(1, -1)\n", | |
" distance.append(cosine_distances(x, y)[0][0])\n", | |
" \n", | |
" distance = np.array(distance)\n", | |
" top_n = distance.argsort()[1:n+1]\n", | |
" \n", | |
" return [total[ix][0] for ix in top_n]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 55, | |
"id": "clear-carpet", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"she : ['emma', 'alice', 'anne', 'he', 'elinor']\n", | |
"heart: ['wisdom', 'mouth', 'righteousness', 'tongue', 'flag']\n", | |
"love : ['kingdom', 'honour', 'praise', 'grace', 'serve']\n", | |
"death: ['life', 'hell', 'truth', 'wisdom', 'law']\n" | |
] | |
} | |
], | |
"source": [ | |
"print(f\"she : {find_similar('she')}\")\n", | |
"print(f\"heart: {find_similar('heart')}\")\n", | |
"print(f\"love : {find_similar('love')}\")\n", | |
"print(f\"death: {find_similar('death')}\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "geographic-candy", | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "tf_macos", | |
"language": "python", | |
"name": "tf_macos" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.8.6" | |
}, | |
"toc": { | |
"base_numbering": 1, | |
"nav_menu": {}, | |
"number_sections": true, | |
"sideBar": true, | |
"skip_h1_title": false, | |
"title_cell": "Implementation: Neural Probabilistic Language Model", | |
"title_sidebar": "Contents", | |
"toc_cell": true, | |
"toc_position": { | |
"height": "calc(100% - 180px)", | |
"left": "10px", | |
"top": "150px", | |
"width": "250.43478393554688px" | |
}, | |
"toc_section_display": true, | |
"toc_window_display": false | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash | |
# slightly modified version of https://github.com/ratsgo/embedding/blob/master/preprocess.sh | |
COMMAND=$1 | |
function gdrive_download () { | |
    # Download a Google Drive file: $1 is the file id, $2 the output path. | |
    # The first request stores the session cookies and extracts the confirm | |
    # token from the response; the second request uses both to fetch the file. | |
    # A private temp file holds the cookies so parallel runs cannot clobber it. | |
    local cookies | |
    cookies=$(mktemp) | |
    CONFIRM=$(wget --quiet --save-cookies "$cookies" --keep-session-cookies --no-check-certificate "https://docs.google.com/uc?export=download&id=$1" -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p') | |
    wget --load-cookies "$cookies" "https://docs.google.com/uc?export=download&confirm=$CONFIRM&id=$1" -O "$2" | |
    rm -f "$cookies" | |
} | |
case $COMMAND in | |
dump-raw-wiki) | |
echo "download ko-wikipedia..." | |
wget https://dumps.wikimedia.org/kowiki/latest/kowiki-latest-pages-articles.xml.bz2 -P $HOME/data/raw | |
mkdir -p $HOME/data/processed | |
;; | |
dump-raw-korquad) | |
echo "download KorQuAD data..." | |
wget https://korquad.github.io/dataset/KorQuAD_v1.0_train.json -P $HOME/data/raw | |
wget https://korquad.github.io/dataset/KorQuAD_v1.0_dev.json -P $HOME/data/raw | |
mkdir -p $HOME/data/processed | |
;; | |
dump-raw-nsmc) | |
echo "download naver movie corpus..." | |
wget https://github.com/e9t/nsmc/raw/master/ratings.txt -P $HOME/data/raw | |
wget https://github.com/e9t/nsmc/raw/master/ratings_train.txt -P $HOME/data/raw | |
wget https://github.com/e9t/nsmc/raw/master/ratings_test.txt -P $HOME/data/raw | |
mkdir -p $HOME/data/processed | |
;; | |
dump-blog) | |
echo "download blog data.." | |
mkdir -p $HOME/data/processed | |
gdrive_download 1Few7-Mh3JypQN3rjnuXD8yAXrkxUwmjS $HOME/data/processed/processed_blog.txt | |
;; | |
dump-raw) | |
    echo "make directories..." | |
    # -p on every mkdir so the command can be re-run when the directories exist | |
    mkdir -p "$HOME/data" | |
    mkdir -p "$HOME/data/processed" | |
    mkdir -p "$HOME/data/tokenized" | |
    echo "download similar sentence data..." | |
    wget https://github.com/songys/Question_pair/raw/master/kor_pair_train.csv -P "$HOME/data/raw" | |
    wget https://github.com/songys/Question_pair/raw/master/kor_Pair_test.csv -P "$HOME/data/raw" | |
    ;; | |
dump-word-embeddings) | |
echo "download word embeddings..." | |
mkdir -p $HOME/data/processed | |
cd $HOME/data | |
gdrive_download 1FeGIbSz2E1A63JZP_XIxnGaSRt7AhXFf $HOME/data/word-embeddings.zip | |
unzip word-embeddings.zip | |
rm word-embeddings.zip | |
;; | |
dump-sentence-embeddings) | |
echo "download sentence embeddings..." | |
mkdir -p $HOME/data/processed | |
cd $HOME/data | |
gdrive_download 1jL3Q5H1vwATewHrx0PJgJ8YoUCtEkaGW $HOME/data/sentence-embeddings.zip | |
unzip sentence-embeddings.zip | |
rm sentence-embeddings.zip | |
;; | |
dump-tokenized) | |
echo "download tokenized data..." | |
mkdir -p $HOME/data/processed | |
cd $HOME/data | |
gdrive_download 1Ybp_DmzNEpsBrUKZ1-NoPDzCMO39f-fx $HOME/data/tokenized.zip | |
unzip tokenized.zip | |
rm tokenized.zip | |
;; | |
dump-processed) | |
echo "download processed data..." | |
mkdir -p $HOME/data | |
cd $HOME/data | |
gdrive_download 1kUecR7xO7bsHFmUI6AExtY5u2XXlObOG $HOME/data/processed.zip | |
unzip processed.zip | |
rm processed.zip | |
;; | |
process-wiki) | |
echo "processing ko-wikipedia..." | |
mkdir -p $HOME/data/processed | |
python preprocess/dump.py --preprocess_mode wiki \ | |
--input_path $HOME/data/raw/kowiki-latest-pages-articles.xml.bz2 \ | |
--output_path $HOME/data/processed/processed_wiki_ko.txt | |
;; | |
process-nsmc) | |
echo "processing naver movie corpus..." | |
mkdir -p $HOME/data/processed | |
python preprocess/dump.py --preprocess_mode nsmc \ | |
--input_path $HOME/data/raw/ratings.txt \ | |
--output_path $HOME/data/processed/processed_ratings.txt \ | |
--with_label False | |
python preprocess/dump.py --preprocess_mode nsmc \ | |
--input_path $HOME/data/raw/ratings_train.txt \ | |
--output_path $HOME/data/processed/processed_ratings_train.txt \ | |
--with_label True | |
python preprocess/dump.py --preprocess_mode nsmc \ | |
--input_path $HOME/data/raw/ratings_test.txt \ | |
--output_path $HOME/data/processed/processed_ratings_test.txt \ | |
--with_label True | |
;; | |
process-korquad) | |
echo "processing KorQuAD corpus..." | |
mkdir -p $HOME/data/processed | |
python preprocess/dump.py --preprocess_mode korquad \ | |
--input_path $HOME/data/raw/KorQuAD_v1.0_train.json \ | |
--output_path $HOME/data/processed/processed_korquad_train.txt | |
python preprocess/dump.py --preprocess_mode korquad \ | |
--input_path $HOME/data/raw/KorQuAD_v1.0_dev.json \ | |
--output_path $HOME/data/processed/processed_korquad_dev.txt | |
cat $HOME/data/processed/processed_korquad_train.txt $HOME/data/processed/processed_korquad_dev.txt > $HOME/data/processed/processed_korquad.txt | |
rm $HOME/data/processed/processed_korquad_*.txt | |
;; | |
mecab-tokenize) | |
    echo "mecab, tokenizing..." | |
    # Write under $HOME/data/tokenized like the other tokenize commands, | |
    # and make sure that directory exists before the tokenizer runs. | |
    mkdir -p "$HOME/data/tokenized" | |
    python preprocess/supervised_nlputils.py --tokenizer mecab \ | |
        --input_path "$HOME/data/processed/processed_wiki_ko.txt" \ | |
        --output_path "$HOME/data/tokenized/wiki_ko_mecab.txt" | |
    python preprocess/supervised_nlputils.py --tokenizer mecab \ | |
        --input_path "$HOME/data/processed/processed_ratings.txt" \ | |
        --output_path "$HOME/data/tokenized/ratings_mecab.txt" | |
    python preprocess/supervised_nlputils.py --tokenizer mecab \ | |
        --input_path "$HOME/data/processed/processed_korquad.txt" \ | |
        --output_path "$HOME/data/tokenized/korquad_mecab.txt" | |
    ;; | |
process-jamo) | |
echo "processing jamo sentences..." | |
python preprocess/unsupervised_nlputils.py --preprocess_mode jamo \ | |
--input_path $HOME/data/tokenized/corpus_mecab.txt \ | |
--output_path $HOME/data/tokenized/corpus_mecab_jamo.txt | |
;; | |
space-correct) | |
echo "train & apply space correct..." | |
python preprocess/unsupervised_nlputils.py --preprocess_mode train_space \ | |
--input_path $HOME/data/processed/processed_ratings.txt \ | |
--model_path $HOME/data/processed/space-correct.model | |
python preprocess/unsupervised_nlputils.py --preprocess_mode apply_space_correct \ | |
--input_path $HOME/data/processed/processed_ratings.txt \ | |
--model_path $HOME/data/processed/space-correct.model \ | |
--output_path $HOME/data/processed/corrected_ratings_corpus.txt \ | |
--with_label False | |
python preprocess/unsupervised_nlputils.py --preprocess_mode apply_space_correct \ | |
--input_path $HOME/data/processed/processed_ratings_train.txt \ | |
--model_path $HOME/data/processed/space-correct.model \ | |
--output_path $HOME/data/processed/corrected_ratings_train.txt \ | |
--with_label True | |
python preprocess/unsupervised_nlputils.py --preprocess_mode apply_space_correct \ | |
--input_path $HOME/data/processed/processed_ratings_test.txt \ | |
--model_path $HOME/data/processed/space-correct.model \ | |
--output_path $HOME/data/processed/corrected_ratings_test.txt \ | |
--with_label True | |
;; | |
soy-tokenize) | |
echo "soynlp, LTokenizing..." | |
mkdir -p $HOME/data/tokenized | |
python preprocess/unsupervised_nlputils.py --preprocess_mode compute_soy_word_score \ | |
--input_path $HOME/data/processed/corrected_ratings_corpus.txt \ | |
--model_path $HOME/data/processed/soyword.model | |
python preprocess/unsupervised_nlputils.py --preprocess_mode soy_tokenize \ | |
--input_path $HOME/data/processed/corrected_ratings_corpus.txt \ | |
--model_path $HOME/data/processed/soyword.model \ | |
--output_path $HOME/data/tokenized/ratings_soynlp.txt | |
;; | |
komoran-tokenize) | |
echo "komoran, tokenizing..." | |
mkdir -p $HOME/data/tokenized | |
python preprocess/supervised_nlputils.py --tokenizer komoran \ | |
--input_path $HOME/data/processed/corrected_ratings_corpus.txt \ | |
--output_path $HOME/data/tokenized/ratings_komoran.txt | |
;; | |
okt-tokenize) | |
echo "okt, tokenizing..." | |
mkdir -p $HOME/data/tokenized | |
python preprocess/supervised_nlputils.py --tokenizer okt \ | |
--input_path $HOME/data/processed/corrected_ratings_corpus.txt \ | |
--output_path $HOME/data/tokenized/ratings_okt.txt | |
;; | |
hannanum-tokenize) | |
echo "hannanum, tokenizing..." | |
mkdir -p $HOME/data/tokenized | |
python preprocess/supervised_nlputils.py --tokenizer hannanum \ | |
--input_path $HOME/data/processed/corrected_ratings_corpus.txt \ | |
--output_path $HOME/data/tokenized/ratings_hannanum.txt | |
;; | |
khaiii-tokenize) | |
echo "khaiii, tokenizing..." | |
mkdir -p $HOME/data/tokenized | |
python preprocess/supervised_nlputils.py --tokenizer khaiii \ | |
--input_path $HOME/data/processed/corrected_ratings_corpus.txt \ | |
--output_path $HOME/data/tokenized/ratings_khaiii.txt | |
;; | |
bert-tokenize) | |
mkdir -p $HOME/data/tokenized | |
python preprocess/unsupervised_nlputils.py --preprocess_mode bert_tokenize \ | |
--vocab_path $HOME/data/sentence-embeddings/bert/pretrain-ckpt/vocab.txt \ | |
--input_path $HOME/data/processed/corrected_ratings_corpus.txt \ | |
--output_path $HOME/data/tokenized/ratings_sentpiece.txt | |
;; | |
mecab-user-dic) | |
    echo "insert mecab user dictionary..." | |
    # stop if the dictionary source tree is missing rather than running make elsewhere | |
    cd /tmp/mecab-ko-dic-2.1.1-20180720 || exit 1 | |
    cp -f "$HOME/preprocess/mecab-user-dic.csv" /tmp/mecab-ko-dic-2.1.1-20180720/user-dic/nnp.csv | |
    ./tools/add-userdic.sh | |
    make install | |
    # go back to $HOME, the directory that holds preprocess/ | |
    cd "$HOME" | |
    ;; | |
make-bert-vocab) | |
    echo "making BERT vocabulary..." | |
    mkdir -p "$HOME/data" | |
    cd "$HOME/data" || exit 1 | |
    gdrive_download 1kUecR7xO7bsHFmUI6AExtY5u2XXlObOG "$HOME/data/processed.zip" | |
    unzip processed.zip | |
    rm processed.zip | |
    # the preprocess/ scripts are addressed relative to $HOME | |
    cd "$HOME" || exit 1 | |
    python preprocess/unsupervised_nlputils.py --preprocess_mode make_bert_vocab \ | |
        --input_path "$HOME/data/processed/processed_wiki_ko.txt" \ | |
        --vocab_path "$HOME/data/processed/bert.vocab" | |
    mv sentpiece* "$HOME/data/processed" | |
    ;; | |
esac |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment