Pickle – saving trained classifiers
Once a classifier has been trained, it is usually quicker in the long run to save the trained model to disk and reload it later, rather than retraining it on every run.
Script to save the classifier:
# Serialise the trained classifier to naivebayes.pickle (binary mode is required by pickle).
# The with-block closes the file even if pickle.dump() raises.
with open("naivebayes.pickle", "wb") as save_classifier:
    pickle.dump(classifier, save_classifier)
Script to load the classifier once it has been pickled:
# Restore the previously pickled classifier from naivebayes.pickle.
# NOTE: only unpickle files you created yourself - pickle.load() can execute
# arbitrary code from a malicious file.
# The with-block closes the file even if pickle.load() raises.
with open("naivebayes.pickle", "rb") as classifier_f:
    classifier = pickle.load(classifier_f)
import nltk
import random
import pickle
from nltk.corpus import movie_reviews
# Build one (list of words, category) pair for every review file in the corpus,
# walking the corpus one category at a time.
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        review_words = list(movie_reviews.words(fileid))
        documents.append((review_words, category))

# The list above is grouped by category; shuffle it in place so that the
# positional train/test split made later contains a mix of categories.
random.shuffle(documents)
# Count how often each word occurs across the whole corpus, normalising
# everything to lower case first.
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())

# Use the 3000 most frequent words as the feature vocabulary.
# most_common() orders by count. Slicing list(all_words.keys()) does not: in
# NLTK 3 FreqDist is a Counter, whose keys are not sorted by frequency, so the
# old slice picked an arbitrary 3000 words rather than the top 3000.
# NOTE: a classifier pickled with the previous vocabulary should be retrained
# and re-pickled after this change, since its feature names will differ.
word_features = [word for word, _count in all_words.most_common(3000)]
def find_fetures(document, vocabulary=None):
    """Map each vocabulary word to whether it occurs in *document*.

    The (misspelt) function name is kept so existing callers keep working.

    Args:
        document: Iterable of word tokens making up one text.
        vocabulary: Iterable of words to test for. When omitted, the
            module-level ``word_features`` list is used.

    Returns:
        A dict with one ``word -> bool`` entry per vocabulary word; the value
        is True when that word is present in ``document``.
    """
    if vocabulary is None:
        vocabulary = word_features
    # A set removes duplicates and makes each membership test O(1).
    present = set(document)
    return {word: (word in present) for word in vocabulary}
# print((find_fetures(movie_reviews.words('neg/cv000_29416.txt'))))
# Convert every (words, label) document into a (feature dict, label) pair.
featuresets = [(find_fetures(review_words), label) for review_words, label in documents]

# Split the feature sets into two separate groups: the first 1900 examples
# are used for training and the remainder for testing.
split_at = 1900
training_set, testing_set = featuresets[:split_at], featuresets[split_at:]
## Naive Bayes algorithm
# classifier = nltk.NaiveBayesClassifier.train(training_set) # training the NaiveBayesClassifier on training data; commented out once naivebayes.pickle is generated

# Load the previously trained classifier instead of retraining it.
# The result must be bound to `classifier` (it was misspelt `classifer`), since
# that is the name used below and the training line above is commented out.
# NOTE: only unpickle files you created yourself - pickle.load() can execute
# arbitrary code from a malicious file.
with open("naivebayes.pickle", "rb") as classifier_f:
    classifier = pickle.load(classifier_f)

# NOTE: the documents are reshuffled on every run, so testing_set may contain
# reviews the pickled classifier was trained on in an earlier run; the
# accuracy printed here can therefore be optimistic.
print("Naive Bayes Algo accuracy:", (nltk.classify.accuracy(classifier, testing_set))*100)
classifier.show_most_informative_features(15) # prints the 15 most informative features
## Pickle allows you to save Python objects and load them again later
# save_classifier = open("naivebayes.pickle","wb") # commented out once naivebayes.pickle has been generated
# pickle.dump(classifier, save_classifier)
# save_classifier.close()