
Load pickle file for CountVectorizer

I have a trained model saved as a pickle file, but when I try to load it on new data I get an error: ">>> Traceback (most recent call last): File "", line 1, in "

Please refer to the script below, in which I trained the data and saved the pickle files.

# Import the pandas package, then use the "read_csv" function to read
# the labeled training data
import os
import pandas as pd       
from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords # Import the stop word list
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.model_selection import GridSearchCV  # sklearn.grid_search was removed in newer scikit-learn
import pickle

##Set working directory
os.getcwd()
os.chdir("C:/Prediction")

##Read history data file
train = pd.read_csv("C:/Prediction/Past.csv",encoding='cp1252')

## Text cleaning: keep only key words / stemming
stemmer = SnowballStemmer('english')

def Description_to_words(raw_Description):
    # 1. Remove HTML.
    Description_text = BeautifulSoup(raw_Description, "html.parser").get_text()
    # 2. Remove non-letters.
    letters_only = re.sub("[^a-zA-Z]", " ", Description_text)
    # 3. Convert to lower case and tokenize.
    words = word_tokenize(letters_only.lower())
    # 4. Remove stop words.
    stops = set(stopwords.words("english"))
    meaningful_words = [w for w in words if w not in stops]
    # 5. Stem the remaining words and rejoin into a single string.
    return " ".join(stemmer.stem(w) for w in meaningful_words)

# Get the number of Descriptions based on the dataframe column size
num_Descriptions = train["Description"].size

# Initialize an empty list to hold the clean Descriptions
clean_train_Descriptions = []

# Loop over each Description; create an index i that goes from 0 to the length
# of the Ticket Description list 

print("Cleaning and parsing the training set ticket Descriptions...\n")
for i in range(0, num_Descriptions):
    # If the index is evenly divisible by 1000, print a message
    if( (i+1)%1000 == 0 ):
        print("Description %d of %d\n" % ( i+1, num_Descriptions ))
    # Call our function for each one, and add the result to the list of
    # clean Descriptions
    clean_train_Descriptions.append(Description_to_words( train["Description"][i] ))

# Initialize the "CountVectorizer" object, which is scikit-learn's
# bag of words tool.  
vectorizer = CountVectorizer(analyzer="word",
                             tokenizer=None,
                             preprocessor=None,
                             stop_words=None,
                             max_features=5000,
                             ngram_range=(1, 2))

# fit_transform() does two things: first, it fits the model
# and learns the vocabulary; second, it transforms our training data
# into feature vectors. The input to fit_transform should be a list of 
# strings.
train_data_features = vectorizer.fit_transform(clean_train_Descriptions)

# Numpy arrays are easy to work with, so convert the result to an 
# array
train_data_features = train_data_features.toarray()

# Random Forest classifier with 100 trees
forest = RandomForestClassifier(n_estimators = 100) 
forest = forest.fit(train_data_features, train["Group"])

### Save pickle files
pickle.dump(train_data_features, open("vector.pickel","wb"))
pickle.dump(forest, open("classifier-rf.pickel","wb"))

But when I load the vector.pickel file to create test_data_features on a new data set, I get an error. Can anyone help me with this? Or do I have to retrain the model every time I predict on a new data set? Please advise.

# Read the test data
test = pd.read_csv("C:/New.csv",encoding='cp1252')

# Create an empty list and append the clean Descriptions one by one
num_Descriptions = len(test["Description"])
clean_test_Descriptions = [] 

print("Cleaning and parsing the test set movie Descriptions...\n")
for i in range(0,num_Descriptions):
    if( (i+1) % 1000 == 0 ):
        print("Description %d of %d\n" % (i+1, num_Descriptions))
    clean_Description = Description_to_words( test["Description"][i] )
    clean_test_Descriptions.append( clean_Description )

# Get a bag of words for the test set, and convert to a numpy array
vect1 = CountVectorizer(analyzer="word",
                        tokenizer=None,
                        preprocessor=None,
                        stop_words=None,
                        max_features=5000,
                        ngram_range=(1, 2))

vect1=pickle.load(open("vector.pickel","rb"))
test_data_features = vect1.transform(clean_test_Descriptions)

1 Answer

You're pickling the wrong object. In the save step you pickle train_data_features, the document-term matrix that fit_transform returns, rather than the fitted CountVectorizer itself.

What you need to do is pickle your vectorizer:

# create CountVectorizer transformer
vectorizer = CountVectorizer(analyzer="word",
                             tokenizer=None,
                             preprocessor=None,
                             stop_words=None,
                             max_features=5000,
                             ngram_range=(1, 2))

# fit on training data
# assuming clean_train_Descriptions is training set
vectorizer.fit(clean_train_Descriptions)

# now pickle
pickle.dump(vectorizer, open("vector.pickel", "wb"))

Now, when you need to score, you just load the pickled vectorizer and transform the new data:

# load pickle
vectorizer = pickle.load(open("vector.pickel", "rb"))

# score
# assuming clean_test_Descriptions is the test set
test_data_features = vectorizer.transform(clean_test_Descriptions)
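For completeness, the classifier the question already pickles correctly (classifier-rf.pickel) can be reloaded the same way. A minimal sketch of the whole scoring step, reusing the file names and variables from the question:

import pickle

# Load the fitted vectorizer and the fitted RandomForest
# (file names taken from the pickle.dump calls in the question)
vectorizer = pickle.load(open("vector.pickel", "rb"))
forest = pickle.load(open("classifier-rf.pickel", "rb"))

# Transform the cleaned test Descriptions with the vocabulary learned at
# training time, then predict a Group for each ticket. transform() returns
# a sparse matrix; RandomForestClassifier.predict accepts sparse input,
# but .toarray() matches the training code exactly.
test_data_features = vectorizer.transform(clean_test_Descriptions).toarray()
predictions = forest.predict(test_data_features)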
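As a design note (a sketch under assumptions, not part of the accepted answer): wrapping the vectorizer and the classifier in a scikit-learn Pipeline leaves a single object, and therefore a single pickle file, to manage.

import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

# Chain the bag-of-words step and the classifier into one estimator
pipeline = Pipeline([
    ("vect", CountVectorizer(max_features=5000, ngram_range=(1, 2))),
    ("clf", RandomForestClassifier(n_estimators=100)),
])

# Fit on the cleaned training text and labels from the question
pipeline.fit(clean_train_Descriptions, train["Group"])
pickle.dump(pipeline, open("model.pickel", "wb"))

# Later, in a fresh session: one load, one call
pipeline = pickle.load(open("model.pickel", "rb"))
predictions = pipeline.predict(clean_test_Descriptions)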