I am creating a DocumentTermMatrix
using create_matrix()
from RTextTools
and create container
and model
based on that. It is for extremely large datasets.
I do this for each category (factor levels). So for each category it has to run matrix, container and model. When I run the below code in (say 16 core / 64 gb) - it runs only in one core and memory used is less than 10%.
Is there way I can speedup this process? Perhaps using doparallel
& foreach
? Any information would certainly help.
#import the required libraries
library("RTextTools")
library("hash")
library(tm)
for ( n in 1:length(folderaddress)){
#Initialize the variables
traindata = list()
matrix = list()
container = list()
models = list()
trainingdata = list()
results = list()
classifiermodeldiv = 0.80`
#Create the directory to place the models and the output files
pradd = paste(combinedmodelsaveaddress[n],"SelftestClassifierModels",sep="")
if (!file.exists(pradd)){
dir.create(file.path(pradd))
}
Data$CATEGORY <- as.factor(Data$CATEGORY)
#Read the training files
X <- split(Data, Data$CATEGORY)
data <- lapply(seq_along(X), function(x) as.data.frame(X[[x]])[,5])
names(data) <- levels(Data$CATEGORY)
list2env(data, envir = .GlobalEnv)
files=as.matrix(names(data))
fileno=length(files)
fileno=as.integer(fileno)
print(fileno)
#For all the files in the training folder(the number of files in the training folder = Number of categories in Taxonomy)
for(i in 1:fileno){
filename = as.character(files[i,1])
data1 = as.data.frame(data[i])
data1 = as.matrix(data1)
filenamechanged = gsub ("\\.[[:alnum:]]+","",filename)
type = matrix(data = as.character(filenamechanged),nrow = length(data1[,1]),ncol=1 )
data1 = cbind(data1,type)
traindata[[i]] = data1
print(i)
}
for(i in 1:fileno){
#Obtain the unique classified data from the train files for one category
trainingdata1 = as.data.frame(traindata[[i]][,1])
uniquetraintweet = hash()
typetrain1 = matrix(data=as.character(traindata[[i]][1,2]), ncol =1, nrow = length(trainingdata1[,1]))
#If the training data is less than 10 records for a category, do not create a model
#The model created based on a smaller set of data will not be accurate
if (length(trainingdata1[,1])<200){
matrix[[i]] = NULL
next
}
#Obtain the unique classified data from the train files of all the other category except that is considered as training category
trainingdata2=matrix(data="",nrow=0,ncol=1)
for (j in 1:fileno){
if ( j==i) next
trainingdata2dummy = as.data.frame(traindata[[j]][,1])
length(trainingdata1[,1])
colnames(trainingdata2)="feedbacks"
colnames(trainingdata2dummy)="feedbacks"
trainingdata2 = rbind(trainingdata2,trainingdata2dummy)
}
#Consider one category as training set and make the remaining categories as Others
typetrain2 = matrix(data="ZZOther",nrow=length(trainingdata2[,1]),ncol=1)
colnames(trainingdata1)="feedbacks"
trainingdata[[i]]=rbind(trainingdata1,trainingdata2)
colnames(typetrain1)="type"
colnames(typetrain2)="type"
type=rbind(typetrain1,typetrain2)
trainingdata[[i]] = cbind(trainingdata[[i]],type)
trainingdata[[i]]=trainingdata[[i]][sample(nrow(trainingdata[[i]])),]
#Input the training set and other set to the classifier
mindoc = max(1,floor(min(0.001*length(trainingdata[[i]][,1]),3)))
#Create Matrix
matrix[[i]] <- create_matrix(trainingdata[[i]][,1], language="english",
removeNumbers=FALSE, stemWords=FALSE,weighting=weightTf,minWordLength=3, minDocFreq=mindoc, maxDocFreq=floor(0.5*(length(trainingdata[[i]][,1]))))
#rowTotals <- apply(matrix[[i]] , 1, sum) #Find the sum of words in each Document
#matrix[[i]] <- matrix[[i]][rowTotals> 0,]
print(i)
#Create Container
container[[i]] <- create_container(matrix[[i]],trainingdata[[i]][,2],trainSize=1:length(trainingdata[[i]][,1]),virgin=FALSE)
print(i)
#Create Models
models[[i]] <- train_models(container[[i]], algorithms=c("SVM"))
print(i)
}
save(matrix, file = paste(pradd,"/Matrix",sep=""))
save(models, file = paste(pradd,"/Models",sep=""))
}
Here is an example of working with RTextTools
in parallel. I created the dummy function using information to be found here.
The function myFun
follows the introduction in the above link - at the end it writes a csv file (no directory is specified) containing the analytics/summary. Afterwards it is straight forward application of the base R
package parallel
in order to run myFun
in parallel.
library(parallel)
library(RTextTools)
# I. A dummy function
# Uses RTextTools
myFun <- function (trainMethod) {
library(RTextTools)
data(USCongress)
# Create the document-term matrix
doc_matrix <- create_matrix(USCongress$text, language="english", removeNumbers=TRUE,
stemWords=TRUE, removeSparseTerms=.998)
container <- create_container(doc_matrix, USCongress$major, trainSize=1:4000,
testSize=4001:4449, virgin=FALSE)
# Train
model <- train_model(container,trainMethod)
classify <- classify_model(container, model)
# Analytics
analytics <- create_analytics(container,
cbind(classify))
summary(analytics)
# Saving
nameToSave <- paste(trainMethod, 'DocumentSummary.csv', sep = '_')
write.csv(analytics@document_summary, nameToSave)
}
# II. Parallel Processing
#
# 1. Vector for parallelization & number of cores available
trainMethods <- c('SVM','GLMNET','MAXENT','SLDA','BOOSTING')
num_cores <- detectCores() - 1L
# 2. Start a cluster
cl <- makeCluster(num_cores)
# 3. Export Variables needed to the cluster
# specifying exactly which variables should be exported
clusterExport(cl, varlist = c('myFun', 'trainMethods'))
# 4. do in parallel
parLapply(cl, seq_along(trainMethods), function (n) myFun(trainMethod = trainMethods[n]))
# stop the cluster
stopCluster(cl)
In your case, you'd have to turn your code into a function myFun (n, ...)
with n
being an element of seq_along(folderaddress)
and of course substitute seq_along(trainMethods)
for seq_along(folderaddress)
in parLapply
.
Of course chances are there exist ways besides parallelization to enhance your code. The problem is without sample data, any suggested improvement is but conjecture.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With