I have a script that randomly generates a set of data and trains several classifiers to compare them against each other (it's very similar to http://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html):
from itertools import product
import numpy as np
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
names = ["Linear SVM", "Decision Tree",
"Random Forest", "AdaBoost", "Naive Bayes", "Linear Discriminant Analysis",
"Quadratic Discriminant Analysis"]
def griddy_mcsearchface(num_samples, num_feats, num_feats_to_remove):
    classifiers = [
        SVC(kernel="linear", C=0.025),
        DecisionTreeClassifier(max_depth=5),
        RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
        AdaBoostClassifier(), GaussianNB(),
        LinearDiscriminantAnalysis(),
        QuadraticDiscriminantAnalysis()]
    classifiers2 = [
        SVC(kernel="linear", C=0.025),
        DecisionTreeClassifier(max_depth=5),
        RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
        AdaBoostClassifier(), GaussianNB(),
        LinearDiscriminantAnalysis(),
        QuadraticDiscriminantAnalysis()]
    X, y = make_classification(n_samples=num_samples, n_features=num_feats, n_redundant=0, n_informative=2,
                               random_state=1, n_clusters_per_class=1)
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

    for name, clf, clf2 in zip(names, classifiers, classifiers2):
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        # Train a second copy with the last num_feats_to_remove features removed.
        clf2.fit(X_train[:, :-num_feats_to_remove], y_train)
        score2 = clf2.score(X_test[:, :-num_feats_to_remove], y_test)
        yield (num_samples, num_feats, num_feats_to_remove, name, score, score2)
And to run it:
_samples = [100, 200, 500, 1000, 2000, 5000, 10000, 20000, 50000, 100000]
_feats = [10, 20, 50, 100, 200, 500, 10000]
_feats_to_rm = [5, 10, 25, 50, 100, 250]
for num_samples, num_feats, num_feats_to_remove in product(_samples, _feats, _feats_to_rm):
    if num_feats <= num_feats_to_remove:
        continue
    for i in griddy_mcsearchface(num_samples, num_feats, num_feats_to_remove):
        print(i)
The script outputs something like:
(100, 10, 5, 'Linear SVM', 1.0, 0.40000000000000002)
(100, 10, 5, 'Decision Tree', 1.0, 0.65000000000000002)
(100, 10, 5, 'Random Forest', 1.0, 0.90000000000000002)
(100, 10, 5, 'AdaBoost', 1.0, 0.65000000000000002)
(100, 10, 5, 'Naive Bayes', 1.0, 0.75)
(100, 10, 5, 'Linear Discriminant Analysis', 1.0, 0.40000000000000002)
(100, 10, 5, 'Quadratic Discriminant Analysis', 1.0, 0.84999999999999998)
(100, 20, 5, 'Linear SVM', 1.0, 1.0)
(100, 20, 5, 'Decision Tree', 0.94999999999999996, 0.94999999999999996)
(100, 20, 5, 'Random Forest', 0.80000000000000004, 0.75)
(100, 20, 5, 'AdaBoost', 1.0, 0.94999999999999996)
(100, 20, 5, 'Naive Bayes', 1.0, 1.0)
(100, 20, 5, 'Linear Discriminant Analysis', 1.0, 1.0)
(100, 20, 5, 'Quadratic Discriminant Analysis', 0.84999999999999998, 0.94999999999999996)
(100, 20, 10, 'Linear SVM', 0.94999999999999996, 0.65000000000000002)
(100, 20, 10, 'Decision Tree', 0.94999999999999996, 0.59999999999999998)
(100, 20, 10, 'Random Forest', 0.75, 0.69999999999999996)
(100, 20, 10, 'AdaBoost', 0.94999999999999996, 0.69999999999999996)
(100, 20, 10, 'Naive Bayes', 0.94999999999999996, 0.75)
but the clf.fit() calls are single-threaded.

Assuming that I have enough threads to run all classifiers for each iteration, how would I be able to train the classifiers using different threads for every iteration of for num_samples, num_feats, num_feats_to_remove in product(_samples, _feats, _feats_to_rm)?
And if I am restricted to 4 or 8 threads but need to train more than 4 or 8 classifiers for each iteration, how is that done?
This is less of an answer and more of a rough sketch of an answer to your first question,
How would I be able to train the classifiers using different threads for every iteration of for num_samples, num_feats, num_feats_to_remove in product(_samples, _feats, _feats_to_rm)?

I assume by this you mean that, for each iteration of for name, clf, clf2 in zip(names, classifiers, classifiers2):, you want clf and clf2 trained on different processors.
Here is some working code as a starting point (it is poorly implemented, but the general idea is there):
from itertools import product
import numpy as np
import multiprocessing
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
names = ["Linear SVM", "Decision Tree",
"Random Forest", "AdaBoost", "Naive Bayes", "Linear Discriminant Analysis",
"Quadratic Discriminant Analysis"]
# def mp_handler():
#     p = multiprocessing.Pool(8)
#     p.map(mp_worker, data)

def mp_worker(args):
    # Unpack here instead of in the signature (tuple parameters are Python 2 only).
    name, clf, X_train, y_train, X_test, y_test, num_feats_to_remove = args
    if num_feats_to_remove is False:
        clf.fit(X_train, y_train)
        return ('score1', clf.score(X_test, y_test))
    clf.fit(X_train[:, :-num_feats_to_remove], y_train)
    return ('score2', clf.score(X_test[:, :-num_feats_to_remove], y_test))
def griddy_mcsearchface(num_samples, num_feats, num_feats_to_remove):
    classifiers = [
        SVC(kernel="linear", C=0.025),
        DecisionTreeClassifier(max_depth=5),
        RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
        AdaBoostClassifier(), GaussianNB(),
        LinearDiscriminantAnalysis(),
        QuadraticDiscriminantAnalysis()]
    classifiers2 = [
        SVC(kernel="linear", C=0.025),
        DecisionTreeClassifier(max_depth=5),
        RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
        AdaBoostClassifier(), GaussianNB(),
        LinearDiscriminantAnalysis(),
        QuadraticDiscriminantAnalysis()]
    X, y = make_classification(n_samples=num_samples, n_features=num_feats, n_redundant=0, n_informative=2,
                               random_state=1, n_clusters_per_class=1)
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

    for name, clf, clf2 in zip(names, classifiers, classifiers2):
        # Set the pool size to 2 to use two processors, one per classifier.
        # The integer you pass to Pool equals the number of SETS of classifiers you have.
        p = multiprocessing.Pool(2)
        data = ((name, clf, X_train, y_train, X_test, y_test, False),
                (name, clf2, X_train, y_train, X_test, y_test, num_feats_to_remove))
        res = p.map(mp_worker, data)  # this splits the two classification tasks across two separate processes
        p.close()
        p.join()
        for i, j in res:  # parse the results
            if i == 'score1':
                score1 = j
            else:
                score2 = j
        yield (num_samples, num_feats, num_feats_to_remove, name, score1, score2)
if __name__ == '__main__':
    _samples = [100, 200]
    _feats = [10, 20]
    _feats_to_rm = [5, 10]
    for num_samples, num_feats, num_feats_to_remove in product(_samples, _feats, _feats_to_rm):
        if num_feats <= num_feats_to_remove:
            continue
        for i in griddy_mcsearchface(num_samples, num_feats, num_feats_to_remove):
            print(i)
If I misunderstood your question, then the general principle in the code above can be modified to suit your needs. In the code above I drew from the accepted answer here.
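As for the second part of the question (being restricted to 4 or 8 workers while needing to fit more than 4 or 8 classifiers per iteration): a process pool with a fixed number of workers queues any extra tasks for you, so you can submit one task per (classifier, feature set) pair and let the pool work through them. Below is a minimal, self-contained sketch of that idea using concurrent.futures instead of multiprocessing.Pool; the fit_and_score helper and the shortened classifier list are only placeholders for illustration, not part of your code.

from concurrent.futures import ProcessPoolExecutor

from sklearn.datasets import make_classification
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier


def fit_and_score(args):
    # Hypothetical helper: fit one classifier (optionally on a truncated
    # feature set) and return its test accuracy.
    name, clf, X_train, y_train, X_test, y_test, n_remove = args
    if n_remove:
        X_train = X_train[:, :-n_remove]
        X_test = X_test[:, :-n_remove]
    clf.fit(X_train, y_train)
    return (name, n_remove, clf.score(X_test, y_test))


if __name__ == '__main__':
    X, y = make_classification(n_samples=200, n_features=20, n_redundant=0,
                               n_informative=2, n_clusters_per_class=1,
                               random_state=1)
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

    classifiers = [("Decision Tree", DecisionTreeClassifier(max_depth=5)),
                   ("AdaBoost", AdaBoostClassifier()),
                   ("Naive Bayes", GaussianNB())]

    # One task per (classifier, feature set) pair; 0 means "keep all features".
    # Here that is 6 tasks for only 4 workers -- the extra tasks simply wait
    # in the queue until a worker frees up.
    tasks = [(name, clf, X_train, y_train, X_test, y_test, n_remove)
             for name, clf in classifiers
             for n_remove in (0, 5)]

    with ProcessPoolExecutor(max_workers=4) as pool:
        for result in pool.map(fit_and_score, tasks):
            print(result)

The same pattern also parallelizes the outer for num_samples, num_feats, num_feats_to_remove in product(_samples, _feats, _feats_to_rm) loop: build one task per (num_samples, num_feats, num_feats_to_remove, classifier) combination and submit them all to a single pool, rather than creating a new Pool inside griddy_mcsearchface on every iteration.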