Skip to content

Instantly share code, notes, and snippets.

@tdiggelm
Created March 24, 2020 00:31
Show Gist options
  • Save tdiggelm/ccd3d87437a843aa83bd032ef9e2c741 to your computer and use it in GitHub Desktop.
Save tdiggelm/ccd3d87437a843aa83bd032ef9e2c741 to your computer and use it in GitHub Desktop.
Analyzer that does a few sane default transformations
import re

import nltk
from nltk.corpus import stopwords
class Analyzer:
    '''This analyzer applies the following transformations to an input text:
    1. tokenize text into words
    2. lowercase transform words
    3. stem words
    4. filter stopwords
    5. replace all numbers with _num_
    6. remove all non-words
    7. compute everygrams
    8. returns transformed tokens
    '''

    # Raw strings avoid invalid-escape-sequence warnings (\w, \.) on
    # modern Python; behavior of the patterns is unchanged.
    # Token made up entirely of word characters.
    RE_WORD = re.compile(r'^\w+$', re.UNICODE)
    # Plain integer or decimal number with optional sign.
    RE_NUMBER = re.compile(r'^[-+]?[0-9]*\.?[0-9]+$')

    def __init__(self, language='english', use_stemmer=True, ngram_range=(1, 1),
                 min_token_len=2):
        '''Build the analyzer.

        Parameters
        ----------
        language : str
            Language name passed to the NLTK stopword corpus.
        use_stemmer : bool
            When True, Porter-stem both input tokens and the stopword list
            (so stemmed tokens still match the stopwords).
        ngram_range : tuple of (int, int)
            (min_len, max_len) forwarded to ``nltk.everygrams``.
        min_token_len : int
            Tokens shorter than this are dropped.
        '''
        self.min_token_len = min_token_len
        self.stemmer = nltk.stem.PorterStemmer()
        self.ngram_range = ngram_range
        self.use_stemmer = use_stemmer
        # BUG FIX: `stopwords` was used without ever being imported, which
        # raised NameError on construction. It is now imported at the top of
        # the file from nltk.corpus.
        words = stopwords.words(language)
        if use_stemmer:
            # Stem the stopwords as well so they match the stemmed tokens.
            words = (self.stemmer.stem(word) for word in words)
        self.stopwords = set(words)

    def __call__(self, text):
        '''Tokenize *text* and return the list of transformed n-gram strings.

        The pipeline below is a chain of lazy generators; nothing is
        materialized until the final list() at the end.
        '''
        tokens = nltk.word_tokenize(text)
        tokens = (tok.lower().strip() for tok in tokens)
        if self.use_stemmer:
            tokens = (self.stemmer.stem(tok) for tok in tokens)
        tokens = (tok for tok in tokens if tok not in self.stopwords)
        # NOTE: the length filter runs before number replacement, so
        # single-digit numbers are dropped rather than mapped to _num_
        # (preserves the original pipeline order).
        tokens = (tok for tok in tokens if len(tok) >= self.min_token_len)
        tokens = (self.RE_NUMBER.sub('_num_', tok) for tok in tokens)
        tokens = (tok for tok in tokens if self.RE_WORD.match(tok))
        tokens = nltk.everygrams(tokens,
                                 min_len=self.ngram_range[0],
                                 max_len=self.ngram_range[1])
        return [' '.join(tok) for tok in tokens]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment