I want to try the LightGBM algorithm using tidymodels and the treesnip package. Some preprocessing...
# remotes::install_github("curso-r/treesnip")
# install.packages("titanic")
library(tidymodels)
library(stringr)
library(titanic)
data("titanic_train")
df <- titanic_train %>% as_tibble() %>%
  mutate(title = str_extract(Name, "\\w+\\.") %>% str_replace(fixed("."), "")) %>%
  mutate(title = case_when(title %in% c('Mlle', 'Ms') ~ 'Miss',
                           title == 'Mme' ~ 'Mrs',
                           title %in% c('Capt', 'Don', 'Major', 'Sir', 'Jonkheer', 'Col') ~ 'Sir',
                           title %in% c('Dona', 'Lady', 'Countess') ~ 'Lady',
                           TRUE ~ title)) %>%
  mutate(title = as.factor(title),
         Survived = factor(Survived, levels = c(0, 1), labels = c("no", "yes")),
         Sex = as.factor(Sex),
         Pclass = factor(Pclass)) %>%
  select(-c(PassengerId, Ticket, Cabin, Name)) %>%
  mutate(Embarked = as.factor(Embarked))
table(df$title,df$Sex)
trnTst <- initial_split(data = df,prop = .8,strata = Survived)
cv.folds <- training(trnTst) %>%
  vfold_cv(data = ., v = 4, repeats = 1)
cv.folds
rec <- recipe(Survived ~ ., data = training(trnTst)) %>%
  step_nzv(all_predictors()) %>%
  step_knnimpute(Age, neighbors = 3, impute_with = vars(title, Fare, Pclass))
To check that the problem is not in the data, I successfully tuned a Random Forest model first.
m.rf <- rand_forest(trees = 1000, min_n = tune(), mtry = tune()) %>%
  set_mode(mode = 'classification') %>%
  set_engine('ranger')
wf.rf <- workflow() %>% add_recipe(rec) %>% add_model(m.rf)
(cls <- parallel::makeCluster(parallel::detectCores()-1))
doParallel::registerDoParallel(cl = cls)
tn.rf <- tune_grid(wf.rf, resamples = cv.folds, grid = 20,
                   metrics = metric_set(accuracy, roc_auc))
doParallel::stopImplicitCluster()
autoplot(tn.rf)
wf.rf <- finalize_workflow(x = wf.rf,parameters = select_best(tn.rf,metric = 'roc_auc'))
res.rf <- fit_resamples(wf.rf,resamples = cv.folds,metrics = metric_set(accuracy,roc_auc))
res.rf %>% collect_metrics()
But LightGBM raises an error even without tuning and without parallel processing.
According to "How to Use Lightgbm with Tidymodels":

"In contrast to XGBoost, both lightgbm and catboost are very capable of handling categorical variables (factors) and so you don’t need to turn variables into dummies (one hot encode), in fact you shouldn’t do it, it makes everything slower and might give you worse performance."
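For contrast, here is a rough sketch (my own illustration, not from the question or the linked post) of the extra dummy-encoding step an xgboost-style recipe would typically add, and which the lightgbm workflow below deliberately omits:

# hypothetical xgboost variant of the recipe above: factors are one-hot encoded
rec.xgb <- rec %>%
  step_dummy(all_nominal(), -all_outcomes(), one_hot = TRUE)
# for lightgbm/catboost via treesnip, the factor columns in `rec` are left as-is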
library(treesnip) # lightgbm & catboost connector
m.lgbm <- boost_tree() %>%   # trees = tune(), min_n = tune() left out: the default model already fails
  set_mode(mode = 'classification') %>%
  set_engine('lightgbm')
wf.lgbm <- workflow() %>% add_recipe(rec) %>% add_model(m.lgbm)
res.lgbm <- fit_resamples(wf.lgbm,resamples = cv.folds)
Warning message:
All models failed. See the `.notes` column.
res.lgbm$.notes[[1]]
internal: Error in pkg_list[[1]]: subscript out of bounds
Try running tune_grid() (or fit_resamples()) without doParallel - there seems to be a conflict between LightGBM and tune, which both want to run in parallel.
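A minimal sketch of that suggestion, assuming the cluster `cls` created above is still registered (note that stopImplicitCluster() does not stop a cluster made with makeCluster()):

parallel::stopCluster(cls)   # shut down the PSOCK workers
foreach::registerDoSEQ()     # register the sequential backend so tune/fit_resamples run serially
res.lgbm <- fit_resamples(wf.lgbm, resamples = cv.folds,
                          metrics = metric_set(accuracy, roc_auc))
collect_metrics(res.lgbm)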