Random forest output interpretation

1 Answer

The "inTrees" R package might be useful.

Here is an example.

Extract raw rules from a random forest:

# Fit a random forest on iris and pull out the raw decision rules of its trees.
# Each statement is on its own line: the collapsed one-line form does not parse,
# and its first inline comment hides every statement that follows it.
library(inTrees)
library(randomForest)

data(iris)
X <- iris[, seq_len(ncol(iris) - 1)]  # X: predictors (all columns but the last)
target <- iris[, "Species"]           # target: class

rf <- randomForest(X, as.factor(target))
treeList <- RF2List(rf)               # transform rf object to an inTrees' format
exec <- extractRules(treeList, X)     # R-executable conditions
exec[1:2, ]
# Example output (presumably varies between runs, since the forest is random):
#      condition
# [1,] "X[,1]<=5.45 & X[,4]<=0.8"
# [2,] "X[,1]<=5.45 & X[,4]>0.8"

Measure rules. In the resulting table, len is the number of variable-value pairs in a condition; freq is the proportion of data satisfying the condition; pred is the outcome of the rule (i.e., condition => pred); and err is the error rate of the rule.

# Compute the metrics (len, freq, err, pred) for every extracted condition.
# Expects `exec`, `X` and `target` from the rule-extraction step.
ruleMetric <- getRuleMetric(exec, X, target)  # get rule metrics
ruleMetric[1:2, ]
# Example output:
#      len freq    err     condition                  pred
# [1,] "2" "0.3"   "0"     "X[,1]<=5.45 & X[,4]<=0.8" "setosa"
# [2,] "2" "0.047" "0.143" "X[,1]<=5.45 & X[,4]>0.8"  "versicolor"

Prune each rule:

# Prune each rule; `ruleMetric` is overwritten with the pruned version.
# In the example output the first rule shrinks from two conditions to one.
ruleMetric <- pruneRule(ruleMetric, X, target)
ruleMetric[1:2, ]
# Example output:
#      len freq    err     condition                  pred
# [1,] "1" "0.333" "0"     "X[,4]<=0.8"               "setosa"
# [2,] "2" "0.047" "0.143" "X[,1]<=5.45 & X[,4]>0.8"  "versicolor"

Select a compact rule set:

# Reduce the rules to a compact set, overwriting `ruleMetric`, then print it.
# The printed result carries an extra impRRF column next to the usual metrics.
ruleMetric <- selectRuleRRF(ruleMetric, X, target)
ruleMetric
# Example output:
#      len freq    err     condition                                             pred         impRRF
# [1,] "1" "0.333" "0"     "X[,4]<=0.8"                                          "setosa"     "1"
# [2,] "3" "0.313" "0"     "X[,3]<=4.95 & X[,3]>2.6 & X[,4]<=1.65"               "versicolor" "0.806787615686919"
# [3,] "4" "0.333" "0.04"  "X[,1]>4.95 & X[,3]<=5.35 & X[,4]>0.8 & X[,4]<=1.75"  "versicolor" "0.0746284932951366"
# [4,] "2" "0.287" "0.023" "X[,1]<=5.9 & X[,2]>3.05"                             "setosa"     "0.0355855756152103"
# [5,] "1" "0.307" "0.022" "X[,4]>1.75"                                          "virginica"  "0.0329176860493297"
# [6,] "4" "0.027" "0"     "X[,1]>5.45 & X[,3]<=5.45 & X[,4]<=1.75 & X[,4]>1.55" "versicolor" "0.0234818254947883"
# [7,] "3" "0.007" "0"     "X[,1]<=6.05 & X[,3]>5.05 & X[,4]<=1.7"               "versicolor" "0.0132907201116241"

Build an ordered rule list as a classifier:

# Turn the selected rules into an ordered rule list usable as a classifier,
# then print it. In the example output the last row, "X[,1]==X[,1]", is a
# condition that is true for every row.
learner <- buildLearner(ruleMetric, X, target)
learner
# Example output:
#      len freq                 err                  condition                                             pred
# [1,] "1" "0.333333333333333"  "0"                  "X[,4]<=0.8"                                          "setosa"
# [2,] "3" "0.313333333333333"  "0"                  "X[,3]<=4.95 & X[,3]>2.6 & X[,4]<=1.65"               "versicolor"
# [3,] "4" "0.0133333333333333" "0"                  "X[,1]>5.45 & X[,3]<=5.45 & X[,4]<=1.75 & X[,4]>1.55" "versicolor"
# [4,] "1" "0.34"               "0.0196078431372549" "X[,1]==X[,1]"                                        "virginica"

Make rules more readable:

# Rewrite the conditions with the predictors' column names in place of the
# positional X[,i] references, so the rules are easier to read.
readableRules <- presentRules(ruleMetric, colnames(X))
readableRules[1:2, ]
# Example output:
#      len freq    err condition                                                   pred
# [1,] "1" "0.333" "0" "Petal.Width<=0.8"                                          "setosa"
# [2,] "3" "0.313" "0" "Petal.Length<=4.95 & Petal.Length>2.6 & Petal.Width<=1.65" "versicolor"

Extract frequent variable interactions (note the rules are not pruned or selected):

# Start again from a fresh forest: the rules used here are the raw ones,
# neither pruned nor selected.
rf <- randomForest(X, as.factor(target))
treeList <- RF2List(rf)                       # transform rf object to an inTrees' format
exec <- extractRules(treeList, X)             # R-executable conditions
ruleMetric <- getRuleMetric(exec, X, target)  # get rule metrics
freqPattern <- getFreqPattern(ruleMetric)

# interactions of at least two predictor variables
# ("len" is stored as character, hence the as.numeric() conversion)
freqPattern[which(as.numeric(freqPattern[, "len"]) >= 2), ][1:4, ]
# Example output:
#      len sup     conf    condition                  pred
# [1,] "2" "0.045" "0.587" "X[,3]>2.45 & X[,4]<=1.75" "versicolor"
# [2,] "2" "0.041" "0.63"  "X[,3]>4.75 & X[,4]>0.8"   "virginica"
# [3,] "2" "0.039" "0.604" "X[,4]<=1.75 & X[,4]>0.8"  "versicolor"
# [4,] "2" "0.033" "0.675" "X[,4]<=1.65 & X[,4]>0.8"  "versicolor"

One can also present these frequent patterns in a readable form using the presentRules function.

In addition, rules or frequent patterns can be formatted in LaTeX:

# Export the first four multi-variable frequent patterns as a LaTeX table.
library(xtable)

# `freqPatternSelect` was used but never assigned; build it here from
# `freqPattern` with the same filter used to display the frequent patterns.
freqPatternSelect <-
  freqPattern[which(as.numeric(freqPattern[, "len"]) >= 2), ][1:4, ]

print(xtable(freqPatternSelect), include.rownames = FALSE)
# Example output:
# \begin{table}[ht]
# \centering
# \begin{tabular}{lllll}
#   \hline
#   len & sup & conf & condition & pred \\
#   \hline
#   2 & 0.045 & 0.587 & X[,3]$>$2.45 \& X[,4]$<$=1.75 & versicolor \\
#   2 & 0.041 & 0.63 & X[,3]$>$4.75 \& X[,4]$>$0.8 & virginica \\
#   2 & 0.039 & 0.604 & X[,4]$<$=1.75 \& X[,4]$>$0.8 & versicolor \\
#   2 & 0.033 & 0.675 & X[,4]$<$=1.65 \& X[,4]$>$0.8 & versicolor \\
#   \hline
# \end{tabular}
# \end{table}

163

answered Sep 21 '22 03:09

H.D.

Related questions
                            
                                Categorize numeric variable with mutate
                            
                                subset a column in data frame based on another data frame/list
                            
                                Splitting a string on the first space
                            
                                caret train() predicts very different then predict.glm()
                            
                                Find the maximum and minimum value of every column and then find the maximum and minimum value of every row
                            
                                Can rbind be parallelized in R?
                            
                                Formatting ggplot2 axis labels with commas (and K? MM?) if I already have a y-scale
                            
                                Can I calculate z-score with R? [duplicate]
                            
                                Is it possible to swap columns around in a data frame using R?
                            
                                How to remove extra white space between words inside a character vector using?
                            
                                Changing shapes used for scale_shape() in ggplot2
                            
                                How to delete rows from a data.frame, based on an external list, using R?
                            
                                Moving color key in R heatmap.2 (function of gplots package)
                            
                                How to not show all labels on ggplot axis?
                            
                                Initialize an empty tibble with column names and 0 rows
                            
                                Calculate correlation for more than two variables?
                            
                                Selecting a subset of columns in a data.table
                            
                                How to hide or disable in-function printed message
                            
                                How can I rbind vectors matching their column names?
                            
                                Plot polynomial regression curve in R

Donate For Us

If you love our work, you can donate via PayPal or buy us a coffee so we can maintain and grow the site. Thank you!

Donate Us With

Random forest output interpretation

Tags:

output

r

profiling

random-forest

user2061730

People also ask

1 Answer

H.D.

Recent Activity

Donate For Us