A Python script that removes stopwords from tweets and segments the remaining text into space-separated words (wakachi-gaki).
'''
requirement : janome, pandas
'''
import re
import unicodedata
from html import unescape

import pandas as pd
from janome.tokenizer import Tokenizer


class Pretweet(object):

    def __init__(self):
        self.tokenizer = Tokenizer()

    # remove retweet markers, @mentions, hashtags, and URLs from a tweet
    def cleaning(self, text):
        text = ' ' + unescape(text) + ' '
        stopwords = []
        if text[1:3] == 'RT':
            stopwords.append('RT')
        stopwords += re.findall(r'\s@[a-zA-Z0-9_]+\s', text)
        stopwords += re.findall(r'\s#\w+\s', text)
        stopwords += [g[0] for g in re.findall(r"((https?|ftp)(://[-_.!~*'()a-zA-Z0-9;/?:@&=+$,%#]+))", text)]
        # escape each stopword so URL metacharacters (?, +, $, ...) are matched literally
        return re.sub('(' + '|'.join(re.escape(w) for w in stopwords) + ')', '', text)[1:-1]

    # return (base form, part-of-speech features) pairs from janome
    def annotation(self, text):
        return [(token.base_form, token.part_of_speech.split(',')) for token in self.tokenizer.tokenize(text)]

    # NFKC normalization folds full-width alphanumerics into their half-width forms
    def normalization(self, text):
        return unicodedata.normalize('NFKC', text)

    # keep proper/common/verbal nouns: single-character words only if kanji, others only if 2+ characters
    def is_not_my_stopword(self, word, attr):
        if attr[0] == '名詞' and attr[1] in ['固有名詞', '一般', 'サ変接続']:
            if (re.match(r'[一-龥ぁ-んァ-ンa-zA-Z0-9]', word) and len(word) >= 2) or (re.match(r'[一-龥]', word) and len(word) == 1):
                return True
        return False

    # clean, tokenize, and filter a tweet, returning space-separated content words
    def segmentation(self, text, cleaning=True, cleaning_pos=True, normalization=True):
        text = self.cleaning(text) if cleaning else text
        segment = []
        for word, attr in self.annotation(text):
            word = self.normalization(word) if normalization else word
            if cleaning_pos and self.is_not_my_stopword(word, attr):
                segment.append(word)
        return ' '.join(segment)


if __name__ == "__main__":
    pretweet = Pretweet()
    lines = pd.read_csv('tweets.csv', encoding='utf-8').text.tolist()
    with open('processed.txt', 'w', encoding='utf-8') as f:
        for line in lines:
            f.write('{}\n'.format(pretweet.segmentation(line)))
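
For reference, a minimal usage sketch with an inline tweet string instead of the tweets.csv input. The example tweet is invented for illustration, and the exact tokens produced depend on janome's dictionary:

pretweet = Pretweet()
tweet = 'RT @someone 東京で機械学習の勉強会に参加した #ml https://example.com/abc'
print(pretweet.segmentation(tweet))
# the RT marker, mention, hashtag, and URL are stripped; the remaining noun tokens
# are printed space-separated, e.g. something like: 東京 機械 学習 勉強 参加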