K-fold cross validation implementation python

Tags:

I am trying to implement the k-fold cross-validation algorithm in python. I know SKLearn provides an implementation but still... This is my code as of right now.

from sklearn import metrics
import numpy as np

class Cross_Validation:
    """Minimal k-fold cross-validation helper for scikit-learn style learners.

    Folds are contiguous, unshuffled row ranges of the input. If the rows are
    ordered by class, shuffle examples and labels together before calling,
    otherwise a validation fold can contain classes that are missing from
    the matching training set.
    """

    @staticmethod
    def partition(vector, fold, k):
        """Split ``vector`` into the training and validation parts of one fold.

        Args:
            vector: Row-indexable container exposing ``shape[0]`` and accepting
                both row slices and boolean row masks (the code is used with
                NumPy arrays and SciPy CSR matrices).
            fold: Zero-based index of the fold used for validation.
            k: Total number of folds.

        Returns:
            A ``(training, validation)`` tuple. ``validation`` holds rows
            ``start:end`` of the fold, ``training`` holds every other row in
            their original order.
        """
        size = vector.shape[0]
        # Floor division keeps the bounds integral on Python 2 and Python 3.
        fold_size = size // k
        start = fold_size * fold
        # The last fold runs to the end of the data so the ``size % k``
        # leftover rows are validated once instead of never.
        end = size if fold == k - 1 else fold_size * (fold + 1)
        validation = vector[start:end]
        # A boolean row mask works for dense arrays and sparse matrices alike,
        # so no type-name sniffing is needed to build the training part.
        mask = np.ones(size, dtype=bool)
        mask[start:end] = False
        training = vector[mask]
        return training, validation

    @staticmethod
    def Cross_Validation(learner, k, examples, labels):
        """Run k-fold cross-validation and collect per-fold accuracies.

        Args:
            learner: Object providing ``fit(X, y)`` and ``predict(X)``.
            k: Number of folds.
            examples: Feature rows, partitioned with :meth:`partition`.
            labels: Target values aligned row-for-row with ``examples``.

        Returns:
            A ``(train_folds_score, validation_folds_score)`` tuple of lists,
            each holding one ``metrics.accuracy_score`` value per fold.
        """
        train_folds_score = []
        validation_folds_score = []
        for fold in range(k):
            # Examples and labels are split with the same fold index so the
            # rows stay aligned.
            training_set, validation_set = Cross_Validation.partition(examples, fold, k)
            training_labels, validation_labels = Cross_Validation.partition(labels, fold, k)
            # The learner is refitted on each fold's training part.
            learner.fit(training_set, training_labels)
            training_predicted = learner.predict(training_set)
            validation_predicted = learner.predict(validation_set)
            train_folds_score.append(
                metrics.accuracy_score(training_labels, training_predicted))
            validation_folds_score.append(
                metrics.accuracy_score(validation_labels, validation_predicted))
        return train_folds_score, validation_folds_score

The learner parameter is a classifier from SKlearn library, k is the number of folds, examples is a sparse matrix produced by the CountVectorizer (again SKlearn) that is the representation of the bag of words. For example:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from Cross_Validation import Cross_Validation as cv

# Bag-of-words count features. NOTE(review): the string literal passed to
# fit_transform looks like a placeholder for the real corpus -- confirm that
# the actual call passes a collection of documents.
vectorizer = CountVectorizer(stop_words='english', lowercase=True, min_df=2, analyzer="word")
data = vectorizer.fit_transform("""textual data""")
clfMNB = MultinomialNB(alpha=.0001)
# `labels` is not defined in this snippet; it must be supplied by the caller
# and be aligned row-for-row with `data`.
score = cv.Cross_Validation(clfMNB, 10, data, labels)
# print() with a single parenthesised argument behaves the same on Python 2
# and Python 3, unlike the print statement.
print("Train score" + str(score[0]))
print("Test score" + str(score[1]))

I'm assuming there is a logic error somewhere, since the scores are about 95% on the training set (as expected) but practically 0 on the test set, and I can't find it.

I hope I was clear. Thanks in advance.

________________________________EDIT___________________________________

This is the code that loads the text into the vector that can be passed to the vectorizer. It also returns the label vector.

from nltk.tokenize import word_tokenize
from Categories_Data import categories
import numpy as np
import codecs
import glob
import os
import re

class Data_Preprocessor:
    """Load newsgroup posts from disk and strip headers, quotes and footers."""

    # Compiled once at class creation instead of on every call to
    # strip_newsgroup_quoting. It is applied to one line at a time, so the
    # ``^`` anchors match at the start of each line.
    _QUOTE_RE = re.compile(r'(writes in|writes:|wrote:|says:|said:'
                           r'|^In article|^Quoted from|^\||^>)')

    def tokenize(self, text):
        """Return the tokens of ``text`` that consist only of letters.

        Tokens come from ``word_tokenize``; each one is converted to a text
        string before the ``isalpha`` check.
        """
        try:
            to_text = unicode  # Python 2
        except NameError:
            to_text = str  # Python 3 has no ``unicode`` builtin
        return [t for t in word_tokenize(text) if to_text(t).isalpha()]

    def header_not_fully_removed(self, text):
        """Tell whether the first line of ``text`` still looks like a header.

        A line counts as a header when it contains ``:`` and the part before
        the first ``:`` is exactly one whitespace-delimited word. Empty text
        returns ``False``.
        """
        lines = text.splitlines()
        if not lines:
            return False
        first_line = lines[0]
        if ":" not in first_line:
            return False
        return len(first_line.split(":")[0].split()) == 1

    def strip_newsgroup_header(self, text):
        """Drop everything up to and including the first blank line.

        While what remains still starts with a header-like line, the next
        block is dropped as well. Text without a blank line yields ``''``.
        """
        after = text.partition('\n\n')[2]
        # Iterative rather than recursive so posts with many header-like
        # blocks cannot exhaust the recursion limit.
        while after and self.header_not_fully_removed(after):
            after = after.partition('\n\n')[2]
        return after

    def strip_newsgroup_quoting(self, text):
        """Remove every line that matches the quote/attribution pattern."""
        good_lines = [line for line in text.split('\n')
                      if not self._QUOTE_RE.search(line)]
        return '\n'.join(good_lines)

    def strip_newsgroup_footer(self, text):
        """Cut off the text after the last blank or dashes-only line.

        The stripped text is scanned from the bottom for a line that is empty
        once whitespace and ``-`` are removed. If such a line exists below the
        first line, the lines above it are returned; otherwise ``text`` is
        returned unchanged.
        """
        lines = text.strip().split('\n')
        for line_num in range(len(lines) - 1, -1, -1):
            if lines[line_num].strip().strip('-') == '':
                break
        # line_num is 0 both when the separator is the first line and when no
        # separator was found; in either case nothing is removed.
        if line_num > 0:
            return '\n'.join(lines[:line_num])
        return text

    def raw_to_vector(self, path, to_be_stripped=("header", "footer", "quoting"), noise_threshold=-1):
        """Read every post under ``path`` and return texts with their labels.

        Args:
            path: Directory containing one sub-directory per entry of the
                module-level ``categories`` list.
            to_be_stripped: Collection naming the clean-up steps to apply;
                recognised values are ``"quoting"``, ``"header"`` and
                ``"footer"`` (applied in that order).
            noise_threshold: A document is kept only when its cleaned length
                is greater than this value.

        Returns:
            ``(np.array(texts), np.array(labels))`` where ``labels[i]`` is the
            second element of the category tuple that ``texts[i]`` came from.

        Raises:
            OSError: If a category directory cannot be listed.
        """
        train_data = []
        label_data = []
        for category in categories:
            # List the directory instead of chdir-ing into it, so the process
            # working directory is never modified, even when a read fails.
            category_dir = os.path.join(path, category[0])
            for filename in os.listdir(category_dir):
                # Skip dot-files, as the previous glob("*") did.
                if filename.startswith("."):
                    continue
                file_path = os.path.join(category_dir, filename)
                # Undecodable bytes are replaced rather than raising.
                with codecs.open(file_path, 'r', encoding='utf-8', errors='replace') as target:
                    data = target.read()
                if "quoting" in to_be_stripped:
                    data = self.strip_newsgroup_quoting(data)
                if "header" in to_be_stripped:
                    data = self.strip_newsgroup_header(data)
                if "footer" in to_be_stripped:
                    data = self.strip_newsgroup_footer(data)
                if len(data) > noise_threshold:
                    train_data.append(data)
                    label_data.append(category[1])
        return np.array(train_data), np.array(label_data)

This is what "from Categories_Data import categories" imports...

# Newsgroup directory names; a name's position in this tuple is its integer
# class label.
_NEWSGROUP_NAMES = (
    'alt.atheism',
    'comp.graphics',
    'comp.os.ms-windows.misc',
    'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware',
    'comp.windows.x',
    'misc.forsale',
    'rec.autos',
    'rec.motorcycles',
    'rec.sport.baseball',
    'rec.sport.hockey',
    'sci.crypt',
    'sci.electronics',
    'sci.med',
    'sci.space',
    'soc.religion.christian',
    'talk.politics.guns',
    'talk.politics.mideast',
    'talk.politics.misc',
    'talk.religion.misc',
)

# List of (directory name, integer label) tuples in label order.
categories = list(zip(_NEWSGROUP_NAMES, range(len(_NEWSGROUP_NAMES))))

258

asked Aug 09 '16 12:08

Lorenzo Norcini

1 Answers

The reason why your validation score is low is subtle.

The issue is how you have partitioned the dataset. Remember, when doing cross-validation you should randomly split the dataset. It is the randomness that you are missing.

Your data is loaded category by category, so in the input dataset all examples of one class are stored contiguously, followed by all examples of the next class. Because each fold is a contiguous, unshuffled slice, a validation fold consists almost entirely of one or two classes, and those classes are (almost) completely absent from the corresponding training set. The model never sees them during training, hence the near-zero score in the test/validation phase.

You can solve this by doing a random shuffle. So, do this:

from sklearn.utils import shuffle    

# Load the raw posts and their integer labels from disk.
preprocessor = Data_Preprocessor()
documents, targets = preprocessor.raw_to_vector(path="C:/Users/Pankaj/Downloads/ng/")

# Turn the posts into a bag-of-words count matrix.
bag_of_words = CountVectorizer(stop_words='english', lowercase=True, min_df=2, analyzer="word")
features = bag_of_words.fit_transform(documents)

# Shuffle rows and labels together (fixed seed) so that the contiguous folds
# taken by Cross_Validation are no longer grouped by category.
features, targets = shuffle(features, targets, random_state=0)

classifier = MultinomialNB(alpha=.0001)
train_scores, validation_scores = Cross_Validation.Cross_Validation(classifier, 10, features, targets)

print("Train score" + str(train_scores))
print("Test score" + str(validation_scores))

163

answered Oct 12 '22 19:10

Pankaj Daga

Related questions
                            
                                How does hashing work for python sets [duplicate]
                            
                                Iterating through multidimensional lists?
                            
                                Using ols function with parameters that contain numbers/spaces
                            
                                Python - SkLearn Imputer usage
                            
                                Sqlalchemy: Print contents of table
                            
                                Trigonometric identities
                            
                                How to emit dataChanged in PyQt5
                            
                                Is there a way, in Django, to define routes using Flask-style route syntax?
                            
                                how to fetch a field in ConsumerRecord
                            
                                Add legend to networks plot to explain colouring of nodes
                            
                                Compare pandas dataframes for common rows in two dataframes
                            
                                How to perform an operation on every element in a numpy matrix?
                            
                                Pivot table error:1 ndim Categorical are not supported at this time
                            
                                how to get webpage resource content via chrome remote debugging
                            
                                What does self[identifier] = some_value do in this code?
                            
                                AWS Unable to import module 'app' : no module named Pymysql
                            
                                OpenSSL.crypto.X509.sign() throws " 'bytes' object has no attribute 'encode' "
                            
                                python module names with same name as existing modules
                            
                                Translate/Rotate 2D points to change perspective
                            
                                Python shared read memory

Donate For Us

If you love us, you can donate via PayPal or buy us a coffee so we can maintain and grow the site. Thank you!

Donate Us With

K-fold cross validation implementation python

Tags:

python

machine-learning

scikit-learn

cross-validation

Lorenzo Norcini

People also ask

1 Answers

Pankaj Daga

Recent Activity

Donate For Us