Created
May 21, 2018 06:02
-
-
Save leojojo/d4b6990100ab746e59e6e8c996e18395 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys, getopt | |
import csv, re, MeCab | |
from os import path | |
from wordcloud import WordCloud | |
import matplotlib.pyplot as plt | |
def clean_tweet(tweet):
    """Return *tweet* with every span matched by the noise patterns removed.

    The patterns are OR-ed into one regular expression, in the order
    listed below, and each match is replaced by the empty string.

    NOTE(review): the second kaomoji alternative,
    ``([^a-np-zA-Z...]{2,12})``, has unescaped parentheses, so it is a
    plain group: any run of 2-12 characters outside the listed letter
    ranges (digits, spaces, symbols, and the letter ``o``) is removed as
    well.  That may not be intended -- confirm before changing it.
    """
    patterns = (
        # @mention / reply target
        r'@[A-Za-z0-9\_\-]+',
        # hashtag
        r'#\w+',
        # anything that looks like scheme://...
        r'\w+:\/\/\S+',
        # the literal letters "RT" anywhere in the text
        r'RT',
        # a small set of ASCII and Japanese punctuation marks
        r'[:\[\]ー,\.、。\']',
        # kaomoji / emoticon shapes
        r'm?\\?\?₍{0,2} ?\([^a-np-zA-Zぁ-ふほ-んァ-ン一-龥]{2,14}\) ?\/?/?₎{0,2}m?.?☞?|([^a-np-zA-Zぁ-んァ-ン一-龥]{2,12})|\(.3\[.{1,5}.|・ω.{1,2}|¯\\_\(ツ\)_/¯|⊂.*[⊃₎]|_\(:3.*\)_|.*\)\"\"|.*ฅ|ʕ.*ʔ| ҉.*\*҉|\(.*੭ु| *_人*_| * ̄Y.* ̄|【| 】|>.*<|>.*<',
    )
    return re.sub('|'.join(patterns), '', tweet)
# Lazily created MeCab tagger, shared by every mecab_analysis() call so the
# dictionary is loaded once instead of once per tweet.
_TAGGER = None


def mecab_analysis(tweet):
    """Return the first token of *tweet* whose tag starts with 名, 形 or 動.

    The tweet is first passed through ``clean_tweet`` and then parsed by
    MeCab with ``-Ochasen`` output.  Each output line is split on tabs
    into six fields; the first is taken as the surface form and the
    fourth as the part-of-speech tag (with IPADIC these prefixes are
    presumably 名詞 / 形容詞 / 動詞, i.e. noun / adjective / verb).

    Args:
        tweet: Raw tweet text.

    Returns:
        The surface form of the first matching token, or ``None`` when
        no token matches.
    """
    global _TAGGER
    if _TAGGER is None:
        _TAGGER = MeCab.Tagger('-Ochasen -d /usr/local/lib/mecab/dic/ipadic')

    parsed = _TAGGER.parse(clean_tweet(tweet))
    # The last output line is dropped (presumably MeCab's "EOS" marker).
    for chunk in parsed.splitlines()[:-1]:
        surface, _, _, feature, _, _ = chunk.split('\t')
        if feature.startswith(('名', '形', '動')):
            return surface
    return None
def parse_csv(csvfile):
    """Build the word-cloud input text from a CSV file of tweets.

    The sixth column (index 5) of every row is passed to
    ``mecab_analysis``; the words it returns are concatenated, each
    followed by a single space.

    Rows with fewer than six columns (for example blank lines) are
    skipped, and tweets for which ``mecab_analysis`` returns ``None``
    contribute nothing, so the literal word "None" no longer ends up in
    the result.

    Args:
        csvfile: Path of the CSV file to read.

    Returns:
        A string of space-terminated words; empty when nothing matched.
    """
    words = []
    with open(csvfile, newline='') as f:
        for row in csv.reader(f):
            if len(row) <= 5:
                continue
            word = mecab_analysis(row[5])
            if word is not None:
                words.append(word)
    # Keep the original "word + space" layout, built in one pass.
    return ''.join(word + ' ' for word in words)
def draw_cloud(txt):
    """Generate a 900x500 word cloud from *txt* and show it with pyplot.

    The cloud is rendered with the font file at the hard-coded path
    below, drawn on a new figure with the axes switched off, and then
    displayed through ``plt.show()``.
    """
    cloud = WordCloud(
        font_path='/Library/Fonts/ヒラギノ丸ゴ ProN W4.ttc',
        width=900,
        height=500,
    )
    image = cloud.generate(txt)

    plt.figure()
    plt.imshow(image)
    plt.axis("off")
    plt.show()
def main(argv):
    """Parse the command line and draw a word cloud for the given CSV file.

    Recognised options:
        -i FILE, --ifile FILE   CSV file to read (required).
        -h                      Print the usage line and exit with status 0.
        -o VALUE                Accepted for compatibility; the value is ignored.

    Args:
        argv: Command-line arguments without the program name.

    Raises:
        SystemExit: With status 2 on an unknown option or a missing
            input file, with status 0 after ``-h``.
    """
    usage = 'python3 twitter_analysis.py -i <inputfile>'
    try:
        opts, _args = getopt.getopt(argv, "hi:o:", ["ifile="])
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)

    inputfile = ''
    for opt, arg in opts:
        if opt == '-h':
            print(usage)
            sys.exit(0)
        # A real tuple: ("-i") is just the string "-i", which turned the
        # check into a substring test and ignored --ifile.
        if opt in ("-i", "--ifile"):
            inputfile = arg

    if not inputfile:
        # Without this, open('') fails later with a traceback.
        print(usage)
        sys.exit(2)

    draw_cloud(parse_csv(inputfile))
if __name__ == "__main__":
    # Script entry point: announce start-up, then pass the command-line
    # arguments (without the program name) to main().
    print("starting...")
    cli_args = sys.argv[1:]
    main(cli_args)
Author
leojojo
commented
May 21, 2018
実行
$ python3 twi_wordcloud.py -i tweets.csv
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment