Random forest output interpretation

I have run a random forest for my data and got the output in the form of a matrix. What are the rules it applied to classify?

P.S. I want a profile of the customer as output, e.g. Person from New York, works in the technology industry, etc.

How can I interpret the results from a random forest?

The "inTrees" R package might be useful.

Here is an example.

Extract raw rules from a random forest:

# Fit a random forest on the iris data and extract the raw decision rules
# (as R-executable condition strings) from its trees.
# Note: the forest is fitted without a fixed seed, so the exact rules printed
# below may differ from run to run.
library(inTrees)
library(randomForest)

data(iris)
X <- iris[, 1:(ncol(iris) - 1)]  # X: predictors
target <- iris[, "Species"]  # target: class
rf <- randomForest(X, as.factor(target))
treeList <- RF2List(rf)  # transform rf object to an inTrees' format
exec <- extractRules(treeList, X)  # R-executable conditions
# drop = FALSE keeps the one-column matrix layout shown in the output below;
# plain exec[1:2, ] would collapse the single column to a character vector.
exec[1:2, , drop = FALSE]
#       condition
# [1,] "X[,1]<=5.45 & X[,4]<=0.8"
# [2,] "X[,1]<=5.45 & X[,4]>0.8"

Measure rules. "len" is the number of variable-value pairs in a condition; "freq" is the proportion of the data satisfying the condition; "pred" is the outcome of the rule (i.e., condition => pred); and "err" is the error rate of the rule.

# Compute the metrics (len, freq, err, pred) for each extracted condition
# and show the first two rules.
ruleMetric <- getRuleMetric(exec, X, target)  # get rule metrics
ruleMetric[1:2, ]
#      len  freq    err     condition                  pred
# [1,] "2" "0.3"   "0"     "X[,1]<=5.45 & X[,4]<=0.8" "setosa"
# [2,] "2" "0.047" "0.143" "X[,1]<=5.45 & X[,4]>0.8"  "versicolor"

Prune each rule:

# Prune each rule. In the sample output the first rule shrinks from two
# variable-value pairs to one ("X[,4]<=0.8").
ruleMetric <- pruneRule(ruleMetric, X, target)
ruleMetric[1:2, ]
#      len  freq    err     condition                 pred
# [1,] "1" "0.333" "0"     "X[,4]<=0.8"              "setosa"
# [2,] "2" "0.047" "0.143" "X[,1]<=5.45 & X[,4]>0.8" "versicolor"

Select a compact rule set:

# Select a compact rule set. The outer parentheses print the result of the
# assignment; the output gains an "impRRF" column for each retained rule.
(ruleMetric <- selectRuleRRF(ruleMetric, X, target))
#          len freq    err     condition                                             pred         impRRF
# [1,] "1" "0.333" "0"     "X[,4]<=0.8"                                          "setosa"     "1"
# [2,] "3" "0.313" "0"     "X[,3]<=4.95 & X[,3]>2.6 & X[,4]<=1.65"               "versicolor" "0.806787615686919"
# [3,] "4" "0.333" "0.04"  "X[,1]>4.95 & X[,3]<=5.35 & X[,4]>0.8 & X[,4]<=1.75"  "versicolor" "0.0746284932951366"
# [4,] "2" "0.287" "0.023" "X[,1]<=5.9 & X[,2]>3.05"                             "setosa"     "0.0355855756152103"
# [5,] "1" "0.307" "0.022" "X[,4]>1.75"                                          "virginica"  "0.0329176860493297"
# [6,] "4" "0.027" "0"     "X[,1]>5.45 & X[,3]<=5.45 & X[,4]<=1.75 & X[,4]>1.55" "versicolor" "0.0234818254947883"
# [7,] "3" "0.007" "0"     "X[,1]<=6.05 & X[,3]>5.05 & X[,4]<=1.7"               "versicolor" "0.0132907201116241"

Build an ordered rule list as a classifier:

# Build an ordered rule list that can be used as a classifier. The outer
# parentheses print the result of the assignment.
# In the sample output the last condition, "X[,1]==X[,1]", is trivially true,
# so it appears to act as the default (catch-all) rule.
(learner <- buildLearner(ruleMetric, X, target))
#      len freq                 err                  condition                                             pred
# [1,] "1" "0.333333333333333"  "0"                  "X[,4]<=0.8"                                          "setosa"
# [2,] "3" "0.313333333333333"  "0"                  "X[,3]<=4.95 & X[,3]>2.6 & X[,4]<=1.65"               "versicolor"
# [3,] "4" "0.0133333333333333" "0"                  "X[,1]>5.45 & X[,3]<=5.45 & X[,4]<=1.75 & X[,4]>1.55" "versicolor"
# [4,] "1" "0.34"               "0.0196078431372549" "X[,1]==X[,1]"                                        "virginica"

Make rules more readable:

# Replace the positional references (X[,k]) in each condition with the
# corresponding column names of X to make the rules readable.
readableRules <- presentRules(ruleMetric, colnames(X))
readableRules[1:2, ]
#      len  freq    err     condition                                                                       pred
# [1,] "1" "0.333" "0"     "Petal.Width<=0.8"                                                              "setosa"
# [2,] "3" "0.313" "0"     "Petal.Length<=4.95 & Petal.Length>2.6 & Petal.Width<=1.65"                     "versicolor"

Extract frequent variable interactions (note the rules are not pruned or selected):

# Refit the forest and recompute the raw (unpruned, unselected) rule metrics,
# then extract frequent variable interactions from them.
rf <- randomForest(X, as.factor(target))
treeList <- RF2List(rf)  # transform rf object to an inTrees' format
exec <- extractRules(treeList, X)  # R-executable conditions
ruleMetric <- getRuleMetric(exec, X, target)  # get rule metrics
freqPattern <- getFreqPattern(ruleMetric)
# interactions of at least two predictor variables
# ("len" is stored as text in the output, hence the as.numeric conversion)
freqPattern[which(as.numeric(freqPattern[, "len"]) >= 2), ][1:4, ]
#      len sup     conf    condition                  pred
# [1,] "2" "0.045" "0.587" "X[,3]>2.45 & X[,4]<=1.75" "versicolor"
# [2,] "2" "0.041" "0.63"  "X[,3]>4.75 & X[,4]>0.8"   "virginica"
# [3,] "2" "0.039" "0.604" "X[,4]<=1.75 & X[,4]>0.8"  "versicolor"
# [4,] "2" "0.033" "0.675" "X[,4]<=1.65 & X[,4]>0.8"  "versicolor"

One can also present these frequent patterns in a readable form using the function presentRules.

In addition, rules or frequent patterns can be formatted in LaTeX.

# Export the selected frequent patterns as a LaTeX table.
library(xtable)

# freqPatternSelect was used without being defined; build it from freqPattern
# as the first four interactions involving at least two variable-value pairs,
# which matches the rows in the table below.
freqPatternSelect <-
  freqPattern[which(as.numeric(freqPattern[, "len"]) >= 2), ][1:4, ]
print(xtable(freqPatternSelect), include.rownames = FALSE)
# \begin{table}[ht]
# \centering
# \begin{tabular}{lllll}
#   \hline
#   len & sup & conf & condition & pred \\
#   \hline
#   2 & 0.045 & 0.587 & X[,3]$>$2.45 \& X[,4]$<$=1.75 & versicolor \\
#   2 & 0.041 & 0.63 & X[,3]$>$4.75 \& X[,4]$>$0.8 & virginica \\
#   2 & 0.039 & 0.604 & X[,4]$<$=1.75 \& X[,4]$>$0.8 & versicolor \\
#   2 & 0.033 & 0.675 & X[,4]$<$=1.65 \& X[,4]$>$0.8 & versicolor \\
#   \hline
# \end{tabular}
# \end{table}
