This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
####################################
# Download Graphite 0.9.9 components (web app, carbon daemon, whisper DB).
# Last tested & updated 27/10/2013
# NOTE(review): these Launchpad 0.9.9 tarball URLs are from 2013 and are
# almost certainly dead -- verify / update before running.
# FIX: stripped trailing " | |" table-scrape artifacts, which are shell
# syntax errors (empty pipe commands).
####################################
sudo apt-get update
sudo apt-get upgrade
wget http://launchpad.net/graphite/0.9/0.9.9/+download/graphite-web-0.9.9.tar.gz
wget http://launchpad.net/graphite/0.9/0.9.9/+download/carbon-0.9.9.tar.gz
wget http://launchpad.net/graphite/0.9/0.9.9/+download/whisper-0.9.9.tar.gz
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
##########################
# Initialise the current directory with a Vagrantfile for the Ubuntu 12.04
# "precise32" base box (downloads the box if not cached).
# FIX: stripped trailing " | |" table-scrape artifacts (shell syntax errors).
##########################
vagrant init precise32 http://files.vagrantup.com/precise32.box
##########################
# ssh to vagrant machine
##########################
vagrant ssh
##########################
# to configure directory to use vagrant
# NOTE(review): this trailing comment is cut off mid-sentence in the source.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Provision nginx on a Debian/Ubuntu host.
# FIX: stripped trailing " | |" table-scrape artifacts (shell syntax errors).
# exit on any error
set -o errexit
# exit if an uninitialised variable is used.
set -o nounset
# install nginx.
sudo apt-get install -y nginx
# NOTE(review): `upgrade` lacks -y, so it will prompt interactively -- add -y
# if this is meant to run unattended.
sudo apt-get upgrade
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# NOTE(review): every line below carries a trailing " | |" table-scrape
# artifact, indentation has been lost, and build_dataset is TRUNCATED after
# the BeautifulSoup call (extraction loop and return are missing). This
# snippet is not runnable as-is; recover the original source before use.
# Seed pages: one inshorts.com category listing per news category.
seed_urls = ['https://inshorts.com/en/read/technology', | |
'https://inshorts.com/en/read/sports', | |
'https://inshorts.com/en/read/world'] | |
# Fetches each seed page, derives the category name from the last URL path
# segment, and parses the HTML -- presumably to accumulate per-article
# records in news_data (the accumulation code is in the missing tail).
def build_dataset(seed_urls): | |
news_data = [] | |
for url in seed_urls: | |
news_category = url.split('/')[-1] | |
data = requests.get(url) | |
soup = BeautifulSoup(data.content, 'html.parser') |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def remove_accented_chars(text):
    """Transliterate accented characters to their closest ASCII equivalents.

    E.g. 'café' -> 'cafe'. Characters with no ASCII decomposition after
    NFKD normalisation are silently dropped.

    Args:
        text: input string.

    Returns:
        An ASCII-only string.
    """
    # NFKD splits each accented char into base char + combining mark; the
    # ASCII encode with errors='ignore' then drops the combining marks.
    # FIX: stripped trailing " | |" scrape artifacts that broke the syntax.
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# NOTE(review): every line below carries a trailing " | |" scrape artifact,
# indentation has been lost, and the function is TRUNCATED mid-expression:
# the leading-character case restore (first_char is computed but never used
# here), the pattern.sub(...) call, and the return statement are all in the
# missing tail. Recover the original source before use.
# Intended contract: expand contractions ("don't" -> "do not") via the
# CONTRACTION_MAP dict (defined elsewhere), matching case-insensitively.
def expand_contractions(text, contraction_mapping=CONTRACTION_MAP): | |
# One alternation branch per contraction key; DOTALL+IGNORECASE so matches
# survive casing differences.
contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())), | |
flags=re.IGNORECASE|re.DOTALL) | |
def expand_match(contraction): | |
match = contraction.group(0) | |
first_char = match[0] | |
# Try the exact-case key first, fall back to the lower-cased key.
expanded_contraction = contraction_mapping.get(match)\ | |
if contraction_mapping.get(match)\ | |
else contraction_mapping.get(match.lower()) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def remove_special_characters(text, remove_digits=False):
    """Strip every character that is not a letter, digit, or whitespace.

    Args:
        text: input string.
        remove_digits: when True, digits are stripped as well.

    Returns:
        The filtered string (whitespace is always preserved).
    """
    # BUG FIX: the original ranges used 'a-zA-z' -- lowercase 'z' at the top
    # end makes the range also match the punctuation between 'Z' and 'a'
    # ([ \ ] ^ _ `), so those characters leaked through. 'a-zA-Z' is what
    # was intended. Also stripped trailing " | |" scrape artifacts.
    pattern = r'[^a-zA-Z0-9\s]' if not remove_digits else r'[^a-zA-Z\s]'
    text = re.sub(pattern, '', text)
    return text
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def simple_stemmer(text):
    """Reduce each whitespace-separated token to its Porter stem.

    E.g. 'jumping jumps' -> 'jump jump'. Tokens are re-joined with single
    spaces, so original spacing/punctuation attachment is not preserved.

    Args:
        text: input string.

    Returns:
        The stemmed string.
    """
    # FIX: stripped trailing " | |" scrape artifacts and restored indentation.
    # NOTE(review): relies on a module-level `import nltk` elsewhere in the
    # original file -- confirm it is in scope.
    ps = nltk.porter.PorterStemmer()
    text = ' '.join(ps.stem(word) for word in text.split())
    return text
def lemmatize_text(text):
    """Lemmatise text using the module-level `nlp` pipeline.

    Tokens whose lemma is '-PRON-' (a placeholder some pipelines emit for
    pronouns) keep their surface form instead of the placeholder.

    Args:
        text: input string.

    Returns:
        Space-joined lemmas of the input tokens.
    """
    # FIX: stripped trailing " | |" scrape artifacts and restored indentation.
    # NOTE(review): `nlp` is presumably a spaCy pipeline loaded elsewhere in
    # the original file -- confirm before running.
    doc = nlp(text)
    text = ' '.join(tok.lemma_ if tok.lemma_ != '-PRON-' else tok.text for tok in doc)
    return text
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def remove_stopwords(text, is_lower_case=False):
    """Remove stopword tokens from *text*.

    Args:
        text: input string.
        is_lower_case: when True, tokens are compared against the stopword
            list as-is (input assumed already lower-cased); otherwise each
            token is lower-cased for the membership test only -- original
            casing is kept in the output.

    Returns:
        The filtered tokens re-joined with single spaces.
    """
    # FIX: stripped trailing " | |" scrape artifacts and restored indentation.
    # NOTE(review): `tokenizer` and `stopword_list` are module-level globals
    # defined elsewhere in the original file -- confirm they are in scope.
    tokens = [token.strip() for token in tokenizer.tokenize(text)]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text
OlderNewer