I am using the C50 decision tree algorithm. I am able to build the tree and get the summaries, but I cannot figure out how to plot or visualize the tree.
My C50 model is called credit_model
In other decision tree packages, I usually use something like plot(credit_model). In rpart it is rpart.plot(credit_model).
What is the equivalent in the C50 algorithm to plot?
Right now, there are none built in. I've been working on an adapter for the partykit package (e.g. as.party) but have not gotten very far.
Max
You can use the following routine to convert the decision tree directly into the GraphViz DOT language and then plot it with GraphViz. GraphViz (http://www.graphviz.org/) must already be installed.
Edit: Version 2 included hereinafter, which is able to handle multi-branched trees (version 1 could handle trees with only two splits). Version 2.2 corrects a missing initialization.
Example calling within R:
# Fit a C5.0 tree on the churn training data and export it as a DOT file.
library(C50)
data(churn)
# Column 20 is dropped from the predictors and `churn` is used as the
# outcome (presumably column 20 is the churn column -- confirm).
churn_predictors <- churnTrain[, -20]
churn_outcome <- churnTrain$churn
treeModel <- C5.0(x = churn_predictors, y = churn_outcome)
# Writes the GraphViz description of the fitted tree to the given path.
C5.0.graphviz(treeModel, 'C:\\mydotfile.txt')
Then from the operating system (OS, e.g. a Windows command prompt):
dot -Tpng 'C:\mydotfile.txt' > 'C:\mygraph.png'
You can then open the mygraph.png file as a PNG (bitmap) and use it in your application.
For further details, here's the original post: http://r-project-thanos.blogspot.de/2014/09/plot-c50-decision-trees-in-r.html
#' Export a C5.0 decision tree as a GraphViz DOT description
#'
#' Parses the textual tree found in `C5.0.model$output` (the part between
#' the "Decision tree:" header and the "Evaluation on training data"
#' section), turns every split into a question node and every leaf into a
#' conclusion node, and writes the resulting `digraph` to `filename`.
#' The DOT text is also echoed once to the console.
#'
#' Only base R is used; no packages are attached as a side effect.
#'
#' Limitations inherited from the line-based parser: the node label is the
#' first whitespace-separated token of a condition and the edge label is
#' the second and third tokens, so attribute names or values containing
#' spaces or colons are not rendered completely.
#'
#' @param C5.0.model Object with a character `output` element holding the
#'   printed C5.0 tree.
#' @param filename Path of the DOT file to write.
#' @param fontname Font name used for nodes and edge labels.
#' @param col.draw Value of the node `color` attribute.
#' @param col.font Value of the node `fontcolor` attribute.
#' @param col.conclusion Fill colour of conclusion (leaf) nodes.
#' @param col.question Fill colour of question (split) nodes.
#' @param shape.conclusion GraphViz shape of conclusion nodes.
#' @param shape.question GraphViz shape of question nodes.
#' @param bool.substitute Which wording replaces the edge labels `= 0` and
#'   `= 1`: one of `'None'` (keep as is), `'yesno'`, `'truefalse'`, `'TF'`.
#' @param prefix If not `FALSE`, conclusion labels are prefixed with "Class".
#' @param vertical `TRUE` for a top-to-bottom layout (`rankdir="TB"`),
#'   otherwise left-to-right (`rankdir="LR"`).
#' @return The DOT text, invisibly.
C5.0.graphviz <- function(C5.0.model, filename, fontname = 'Arial',
                          col.draw = 'black', col.font = 'blue',
                          col.conclusion = 'lightpink',
                          col.question = 'grey78',
                          shape.conclusion = 'box3d',
                          shape.question = 'diamond',
                          bool.substitute = 'None', prefix = FALSE,
                          vertical = TRUE) {
  # Position of the first literal occurrence of `needle`, or NA if absent.
  first_pos <- function(text, needle) {
    hit <- regexpr(needle, text, fixed = TRUE)[1]
    if (is.na(hit) || hit < 1) NA_integer_ else as.integer(hit)
  }
  trim_left <- function(text) sub('^[[:space:]]+', '', text)
  trim_both <- function(text) sub('[[:space:]]+$', '', trim_left(text))

  # The printed tree indents one level per 4 characters. An ancestor's
  # vertical guide is a colon plus three spaces; a branch start is ':...'.
  # The guide must be the full 4-character marker: a bare ': ' would also
  # match the "condition: class" separator of every leaf line.
  indent_width <- 4
  guide_marker <- ':   '
  branch_marker <- ':...'

  # ---- isolate the tree text ------------------------------------------
  model_output <- C5.0.model$output
  tree_start <- first_pos(model_output, 'Decision tree:')
  if (is.na(tree_start)) {
    stop("could not find a 'Decision tree:' section in C5.0.model$output",
         call. = FALSE)
  }
  treeout <- substr(model_output, tree_start + nchar('Decision tree:'),
                    nchar(model_output))
  tree_end <- first_pos(treeout, 'Evaluation on training data')
  if (is.na(tree_end)) {
    stop("could not find the 'Evaluation on training data' section in ",
         "C5.0.model$output", call. = FALSE)
  }
  treeout <- substr(treeout, 1, tree_end - 2)

  dtreestring <- unlist(strsplit(treeout, '\r?\n'))
  dtreestring <- dtreestring[grepl('[^[:space:]]', dtreestring)]
  if (length(dtreestring) == 0) {
    stop('the decision tree section of C5.0.model$output is empty',
         call. = FALSE)
  }

  # ---- parser state ----------------------------------------------------
  # Rows are pre-allocated; assigning past row 500 extends the frames.
  variables <- data.frame(matrix(nrow = 500, ncol = 4))
  names(variables) <- c('SYMBOL', 'TOKEN', 'TYPE', 'QUERY')
  connectors <- data.frame(matrix(nrow = 500, ncol = 3))
  names(connectors) <- c('TOKEN', 'START', 'END')
  # Stack of question-node symbols, used to find the parent of a branch
  # that appears after one or more leaves.
  theStack <- data.frame(matrix(nrow = 500, ncol = 1))
  names(theStack) <- c('ITEM')
  theStackIndex <- 1
  currentvar <- 1
  currentcon <- 1
  # TRUE while the previous line did not end in a leaf, i.e. the current
  # line introduces a new question node.
  open_connection <- TRUE
  previousindent <- -1
  substitutes <- data.frame(None = c('= 0', '= 1'),
                            yesno = c('no', 'yes'),
                            truefalse = c('false', 'true'),
                            TF = c('F', 'T'),
                            stringsAsFactors = FALSE)
  class_prefix <- if (isTRUE(prefix == FALSE)) '' else 'Class'

  # ---- parse the tree line by line --------------------------------------
  for (linecount in seq_along(dtreestring)) {
    current <- dtreestring[linecount]

    # Depth contributed by plain leading spaces.
    stripped <- trim_left(current)
    lineindent <- (nchar(current) - nchar(stripped)) / indent_width
    current <- stripped

    # Each ancestor guide adds one level, plus any spaces following it.
    repeat {
      guide_at <- first_pos(current, guide_marker)
      if (is.na(guide_at)) {
        break
      }
      remainder <- substr(current, guide_at + indent_width, nchar(current))
      stripped <- trim_left(remainder)
      lineindent <- lineindent + 1 +
        (nchar(remainder) - nchar(stripped)) / indent_width
      current <- stripped
    }

    # The branch marker itself is one more level.
    branch_at <- first_pos(current, branch_marker)
    if (!is.na(branch_at)) {
      lineindent <- lineindent + 1
      current <- substr(current, branch_at + indent_width, nchar(current))
    }
    current <- trim_both(current)

    # "attr op value[: class (n/m)]"
    stringlist <- unlist(strsplit(current, '\\:'))
    stringpart <- unlist(strsplit(stringlist[1], '\\s'))

    if (open_connection) {
      # New question node named after the tested attribute.
      variables[currentvar, 'TOKEN'] <- stringpart[1]
      variables[currentvar, 'SYMBOL'] <- paste0('node', currentvar)
      variables[currentvar, 'TYPE'] <- shape.question
      variables[currentvar, 'QUERY'] <- 1
      theStack[theStackIndex, 'ITEM'] <- variables[currentvar, 'SYMBOL']
      theStackIndex <- theStackIndex + 1
      currentvar <- currentvar + 1
      if (currentvar > 2) {
        # The edge opened by the previous line ends at this new node.
        connectors[currentcon - 1, 'END'] <- variables[currentvar - 1, 'SYMBOL']
      }
    }

    # Edge label: operator and value, optionally reworded for 0/1 tests.
    edge_label <- paste(stringpart[2], stringpart[3])
    if (edge_label == '= 0') {
      edge_label <- as.character(substitutes[1, bool.substitute])
    }
    if (edge_label == '= 1') {
      edge_label <- as.character(substitutes[2, bool.substitute])
    }
    connectors[currentcon, 'TOKEN'] <- edge_label

    # Find the question node this branch hangs off.
    if (open_connection) {
      if (lineindent < previousindent) {
        theStackIndex <- theStackIndex - ((previousindent - lineindent) + 1)
        currentsymbol <- theStack[theStackIndex, 'ITEM']
      } else {
        currentsymbol <- variables[currentvar - 1, 'SYMBOL']
      }
    } else {
      # After a leaf: walk back up the stack by the drop in indentation.
      currentsymbol <-
        theStack[theStackIndex - ((previousindent - lineindent) + 1), 'ITEM']
      theStackIndex <- theStackIndex - (previousindent - lineindent)
    }
    connectors[currentcon, 'START'] <- currentsymbol
    currentcon <- currentcon + 1
    open_connection <- TRUE

    if (length(stringlist) == 2) {
      # Text after the colon is the predicted class: this line is a leaf.
      stringpart2 <- unlist(strsplit(stringlist[2], '\\s'))
      variables[currentvar, 'TOKEN'] <- paste(class_prefix, stringpart2[2])
      variables[currentvar, 'SYMBOL'] <- paste0('node', currentvar)
      variables[currentvar, 'TYPE'] <- shape.conclusion
      variables[currentvar, 'QUERY'] <- 0
      currentvar <- currentvar + 1
      connectors[currentcon - 1, 'END'] <- variables[currentvar - 1, 'SYMBOL']
      open_connection <- FALSE
    }
    previousindent <- lineindent
  }

  # ---- emit DOT ----------------------------------------------------------
  header <- paste0('digraph g {\ngraph  [rankdir="',
                   if (isTRUE(vertical == TRUE)) 'TB' else 'LR', '"]')

  node_rows <- seq_len(currentvar - 1)
  node_fill <- ifelse(variables[node_rows, 'QUERY'] == 0,
                      col.conclusion, col.question)
  node_lines <- paste0(
    variables[node_rows, 'SYMBOL'],
    ' [shape="', variables[node_rows, 'TYPE'],
    '" label ="', variables[node_rows, 'TOKEN'],
    '" style=filled fontcolor= ', col.font,
    '  color= ', col.draw,
    '  fontname= ', fontname,
    '  fillcolor= ', node_fill,
    ' ];'
  )

  edge_rows <- seq_len(currentcon - 1)
  edge_lines <- paste0(
    connectors[edge_rows, 'START'], ' -> ', connectors[edge_rows, 'END'],
    ' [label="', connectors[edge_rows, 'TOKEN'],
    '" fontname= ', fontname,
    ' ];'
  )

  runningstring <- paste(c(header, node_lines, edge_lines), collapse = '\n')
  runningstring <- paste(runningstring, '}')

  # Echo once and write the file directly instead of diverting all console
  # output through sink().
  cat(runningstring)
  cat(runningstring, file = filename)
  invisible(runningstring)
}
thanos
If you love us, you can donate to us via PayPal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With