Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save lrvick/1266502 to your computer and use it in GitHub Desktop.
Save lrvick/1266502 to your computer and use it in GitHub Desktop.
Manually train an NLTK NaiveBayes Classifier
from nltk.probability import DictionaryProbDist
from nltk import NaiveBayesClassifier
from nltk import FreqDist
# Toy labelled corpus: each sentence maps to its sentiment label
# ('pos' or 'neg'). The sentences are runtime data and are kept verbatim.
train_samples = {
    'I hate you and you are a bad person': 'neg',
    'I love you and you are a good person': 'pos',
    'I fail at everything and I want to kill people': 'neg',
    'I win at everything and I want to love people': 'pos',
    'sad are things are heppening. fml': 'neg',
    'good are things are heppening. gbu': 'pos',
    'I am so poor': 'neg',
    'I am so rich': 'pos',
    'I hate you mommy ! You are my terrible person': 'neg',
    'I love you mommy ! You are my amazing person': 'pos',
    'I want to kill butterflies since they make me sad': 'neg',
    'I want to chase butterflies since they make me happy': 'pos',
    'I want to hurt bunnies': 'neg',
    'I want to hug bunnies': 'pos',
    'You make me frown': 'neg',
    'You make me smile': 'pos',
}

# Per-label token counts: word_freqs[label][token] -> number of occurrences
# of that whitespace-separated token across all sentences with that label.
# Tokens are case-sensitive and keep any attached punctuation.
word_freqs = {'pos': {}, 'neg': {}}
for text, label in train_samples.items():
    label_counts = word_freqs[label]
    for token in text.split():
        label_counts[token] = label_counts.get(token, 0) + 1

# Single-argument parenthesised form prints identically on Python 2 and 3.
print(word_freqs)
# Unlabelled sentences that are run through the classifier further down.
test_samples = [
    'You are a terrible person and everything you do is bad',
    'I love you all and you make me happy',
    'I frown whenever I see you in a poor state of mind',
    'Finally getting rich from my ideas. They make me smile.',
    'My mommy is poor',
    'I love butterflies. Yay for happy',
    'Everything is fail today and I hate stuff',
]
def gen_bow(s):
    """Return a bag-of-words feature dict for the string *s*.

    Every whitespace-separated token of *s* becomes a key mapped to
    ``True``; duplicate tokens collapse into a single key. Tokens are
    case-sensitive and keep any attached punctuation.

    Splitting is done on runs of whitespace (``str.split()`` with no
    argument), so repeated spaces or an empty string do not produce an
    empty-string feature.
    """
    return dict.fromkeys(s.split(), True)
# Prior over the two labels: both weighted 0.5.
label_probdist = DictionaryProbDist({'pos': 0.5, 'neg': 0.5})

# One distribution object, shared by every (label, word) entry below.
true_probdist = DictionaryProbDist({True: 6})

# Hand-written (label, word) -> distribution table.
# TODO: generate this from train_samples instead of hard-coding it.
feature_probdist = {
    (label, word): true_probdist
    for label, words in (
        ('neg', ('no', 'hate', 'fml', 'poor', 'sad', 'fail', 'kill', 'evil')),
        ('pos', ('bunnies', 'butteryfly', 'pony', 'love', 'smile', 'happy',
                 'amazing', 'yes')),
    )
    for word in words
}
# Build the classifier directly from the hand-written label prior and
# feature table instead of calling a training routine.
classifier = NaiveBayesClassifier(label_probdist, feature_probdist)

# Print one "<sentence> | <predicted label> | <label distribution>" line
# per test sentence.
for sample in test_samples:
    # Build the feature dict once and reuse it for both classifier calls.
    features = gen_bow(sample)
    # Single-argument parenthesised form prints identically on Python 2 and 3.
    print("%s | %s | %s" % (
        sample,
        classifier.classify(features),
        classifier.prob_classify(features),
    ))

# NOTE(review): placed after the loop (run once); the original indentation
# was lost, so confirm this was not meant to run per sample.
classifier.show_most_informative_features()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment