I am working with the R programming language. I am trying to fit a Random Forest model on a very large dataset (over 100 million rows) with imbalanced classes (i.e. binary response variable ratio 95% to 5%). To do this, the R code I wrote:
Below, I have included the R code for these steps:
Step 1: Create Data for Problem
# Step 1: Randomly create data and make initial training/test set:
library(dplyr)
library(ranger)
original_data = rbind( data_1 = data.frame( class = 1, height = rnorm(10000, 180,10), weight = rnorm(10000, 90,10), salary = rnorm(10000,50000,10000)), data_2 = data.frame(class = 0, height = rnorm(100, 160,10), weight = rnorm(100, 100,10), salary = rnorm(100,40000,10000)) )
original_data$class = as.factor(original_data$class)
original_data$id = 1:nrow(original_data)
test_set= rbind(original_data[ sample( which( original_data$class == "0" ) , replace = FALSE , 30 ) , ], original_data[ sample( which( original_data$class == "1" ) , replace = FALSE, 2000 ) , ])
train_set = anti_join(original_data, test_set)
Step 2: Create "Balanced" Random Subsets:
# Step 2: Create "Balanced" Random Subsets:
results <- list()
for (i in 1:100)
{
iteration_i = i
sample_i = rbind(train_set[ sample( which( train_set$class == "0" ) , replace = TRUE , 50 ) , ], train_set[ sample( which( train_set$class == "1" ) , replace = TRUE, 60 ) , ])
results_tmp = data.frame(iteration_i, sample_i)
results_tmp$iteration_i = as.factor(results_tmp$iteration_i)
results[[i]] <- results_tmp
}
results_df <- do.call(rbind.data.frame, results)
X<-split(results_df, results_df$iteration)
invisible(lapply(seq_along(results),
function(i,x) {assign(paste0("train_set_",i),x[[i]], envir=.GlobalEnv)},
x=results))
Step 3: Train Models on Each Subset
# Step 3: Train Models on Each Subset:
#training
wd = getwd()
results_1 <- list()
for (i in 1:100){
model_i <- ranger(class ~ height + weight + salary, data = X[[i]], probability = TRUE)
saveRDS(model_i, paste0("wd", paste("model_", i, ".RDS")))
results_1[[i]] <- model_i
}
Step 4: Combine All Models and Use Combined Model to Make Predictions on the Test Set:
# Step 4: Combine All Models and Use Combined Model to Make Predictions on the Test Set:
results_2 <- list()
for (i in 1:100){
predict_i <- data.frame(predict(results_1[[i]], data = test_set)$predictions)
predict_i$id = 1:nrow(predict_i)
results_2[[i]] <- predict_i
}
final_predictions = aggregate(.~ id, do.call(rbind, results_2), mean)
My Question: I would like to see if I can incorporate "parallel computing" into Step 2, Step 3 and Step 4 to potentially make the code I have written run faster. I consulted other posts (e.g.https://stackoverflow.com/questions/14106010/parallel-execution-of-random-forest-in-r, https://stats.stackexchange.com/questions/519640/parallelizing-random-forest-learning-in-r-changes-the-class-of-the-rf-object) and I would like to see if I can reformat the code I have written and incorporate similar "parallel computing" functions for improving my code:
library(parallel)
library(doParallel)
library(foreach)
#Try to parallelize
cl <- makeCluster(detectCores()-1)
registerDoParallel(cl)
# Insert Reformatted Step 2 - Step 4 Here:
stopImplicitCluster()
stopCluster(cl)
rm(cl)
But I am still new to the world of parallel computing and still trying to figure out how to reformat my code so that this will work.
Can someone please show me how to do this?
Note:
In the previous questions that I consulted (e.g.parallel execution of random forest in R, https://stats.stackexchange.com/questions/519640/parallelizing-random-forest-learning-in-r-changes-the-class-of-the-rf-object), the "randomForest" package is used instead of "ranger" I am also open to using the "randomForest" package if this will make it easier to parallelize .
I acknowledge that the overall structure of my code might not be optimally written - I am open to suggestions for re-writing my code if this will make it easier to parallelize.
I realize that there are several popular packages in R that can be used to parallelize code (e.g. https://cran.r-project.org/web/packages/doSNOW/index.html) - I am also open to using any of these packages for parallelizing my code.
Noting your openness to a tidymodels approach, you could try this using your original_data and including parallel processing:
library(tidyverse)
library(tidymodels)
library(vip)
library(doParallel)
library(tictoc)
library(themis)
registerDoParallel(cores = 6)
# Supplied data
set.seed(2022)
original_data <- rbind(
data_1 = data.frame(
class = 1,
height = rnorm(10000, 180, 10),
weight = rnorm(10000, 90, 10),
salary = rnorm(10000, 50000, 10000)
),
data_2 = data.frame(
class = 0,
height = rnorm(100, 160, 10),
weight = rnorm(100, 100, 10),
salary = rnorm(100, 40000, 10000)
)
)
original_data$class <- as.factor(original_data$class)
original_data$id <- 1:nrow(original_data)
tic()
# Train / test data
set.seed(2022)
data_split <-
original_data |>
initial_split(strata = class) # stratify by class
train_df <- data_split |> training()
test_df <- data_split |> testing()
# Create a pre-processing recipe
class_recipe <-
train_df |>
recipe() |>
update_role(class, new_role = "outcome") |>
update_role(id, new_role = "id") |>
update_role(-has_role("outcome"), -has_role("id"), new_role = "predictor") |>
step_rose(class)
# Check class balance
class_recipe |> prep() |> bake(new_data = NULL) |> count(class)
#> # A tibble: 2 × 2
#> class n
#> <fct> <int>
#> 1 0 7407
#> 2 1 7589
summary(class_recipe)
#> # A tibble: 5 × 4
#> variable type role source
#> <chr> <chr> <chr> <chr>
#> 1 class nominal outcome original
#> 2 height numeric predictor original
#> 3 weight numeric predictor original
#> 4 salary numeric predictor original
#> 5 id numeric id original
# Create model & workflow
ranger_model <-
rand_forest(mtry = tune()) |>
set_engine("ranger", importance = "impurity") |>
set_mode("classification")
ranger_wflow <- workflow() |>
add_recipe(class_recipe) |>
add_model(ranger_model)
# Tune model with 10-fold Cross Validation
set.seed(2022)
folds <- vfold_cv(train_df, v = 10)
set.seed(2022)
ranger_res <- ranger_wflow |>
tune_grid(
resamples = folds,
grid = crossing(
mtry = seq(1, 3, 1),
),
control = control_grid(verbose = TRUE),
metrics = metric_set(accuracy) # choose a metric, e.g. accuracy
)
# Fit model
best_tune <- ranger_res |> select_best()
set.seed(2022)
ranger_fit <- ranger_wflow |>
finalize_workflow(best_tune) %>%
fit(train_df)
# Test
class_results <- ranger_fit |> augment(new_data = test_df)
class_results |> accuracy(class, .pred_class)
#> # A tibble: 1 × 3
#> .metric .estimator .estimate
#> <chr> <chr> <dbl>
#> 1 accuracy binary 0.912
# Visualise feature importance
ranger_fit |>
extract_fit_parsnip() |>
vip() +
labs(title = "Feature Importance -- Ranger")

toc()
#> 62.393 sec elapsed
Created on 2022-06-21 by the reprex package (v2.0.1)
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With