I am trying to convert a factor column into multiple boolean columns as the image below shows. The data is from weather stations as retrieved using the fine weatherData
package. The factor column I want to convert into multiple boolean columns contains 11 factors. Some of them are single "events", and some of them are a combination of "events".
Here is an image showing what I want to achieve: This is R code which will produce the data frame with combined factors that I want to convert into several boolean columns:
df <- read.table(text="
date Events
1/8/2013 Rain
1/9/2013 Fog
1/10/2013 ''
1/11/2013 Fog-Rain
1/12/2013 Snow
1/13/2013 Rain-Snow
1/14/2013 Rain-Thunderstorm
1/15/2013 Thunderstorm
1/16/2013 Fog-Rain-Thunderstorm
1/17/2013 Fog-Thunderstorm
1/18/2013 Fog-Rain-Thunderstorm-Snow",
header=T)
df$date <- as.character(as.Date(df$date, "%m/%d/%Y"))
Thanks in advance.
You could try:
lst <- strsplit(as.character(df$Events),"-")
lvl <- unique(unlist(lst))
res <- data.frame(date=df$date,
do.call(rbind,lapply(lst, function(x) table(factor(x, levels=lvl)))),
stringsAsFactors=FALSE)
res
# date Rain Fog Snow Thunderstorm
#1 2013-01-08 1 0 0 0
#2 2013-01-09 0 1 0 0
#3 2013-01-10 0 0 0 0
#4 2013-01-11 1 1 0 0
#5 2013-01-12 0 0 1 0
#6 2013-01-13 1 0 1 0
#7 2013-01-14 1 0 0 1
#8 2013-01-15 0 0 0 1
#9 2013-01-16 1 1 0 1
#10 2013-01-17 0 1 0 1
# 11 2013-01-18 1 1 1 1
Or possibly, this could be faster than the above (contributed by @alexis_laz)
setNames(data.frame(df$date, do.call(rbind,lapply(lst, function(x) as.integer(lvl %in% x)) )), c("date", lvl))
Or
library(devtools)
library(data.table)
source_gist("11380733")
library(reshape2) #In case it is needed
res1 <- dcast.data.table(cSplit(df, "Events", "-", "long"), date~Events)
res2 <- merge(subset(df, select=1), res1, by="date", all=TRUE)
res2 <- as.data.frame(res2)
res2[,-1] <- (!is.na(res2[,-1]))+0
res2[,c(1,3,2,4,5)]
# date Rain Fog Snow Thunderstorm
#1 2013-01-08 1 0 0 0
#2 2013-01-09 0 1 0 0
#3 2013-01-10 0 0 0 0
#4 2013-01-11 1 1 0 0
#5 2013-01-12 0 0 1 0
#6 2013-01-13 1 0 1 0
#7 2013-01-14 1 0 0 1
#8 2013-01-15 0 0 0 1
#9 2013-01-16 1 1 0 1
#10 2013-01-17 0 1 0 1
#11 2013-01-18 1 1 1 1
Or
library(qdap)
with(df, termco(Events, date, c("Rain", "Fog", "Snow", "Thunderstorm")))[[1]][,-2]
# date Rain Fog Snow Thunderstorm
#1 2013-01-08 1 0 0 0
#2 2013-01-09 0 1 0 0
#3 2013-01-10 0 0 0 0
#4 2013-01-11 1 1 0 0
#5 2013-01-12 0 0 1 0
#6 2013-01-13 1 0 1 0
#7 2013-01-14 1 0 0 1
#8 2013-01-15 0 0 0 1
#9 2013-01-16 1 1 0 1
#10 2013-01-17 0 1 0 1
#11 2013-01-18 1 1 1 1
The easiest thing I can think of is concat.split.expanded
from my "splitstackshape" package (devel version 1.3.0, from GitHub).
## Get the right version of the package
library(devtools)
install_github("splitstackshape", "mrdwab", ref = "devel")
packageVersion("splitstackshape")
# [1] ‘1.3.0’
## Split up the relevant column
concat.split.expanded(df, "Events", "-", type = "character",
fill = 0, drop = TRUE)
# date Events_Fog Events_Rain Events_Snow Events_Thunderstorm
# 1 2013-01-08 0 1 0 0
# 2 2013-01-09 1 0 0 0
# 3 2013-01-10 0 0 0 0
# 4 2013-01-11 1 1 0 0
# 5 2013-01-12 0 0 1 0
# 6 2013-01-13 0 1 1 0
# 7 2013-01-14 0 1 0 1
# 8 2013-01-15 0 0 0 1
# 9 2013-01-16 1 1 0 1
# 10 2013-01-17 1 0 0 1
# 11 2013-01-18 1 1 1 1
Answering this question, I realize that I've somewhat foolishly hard-coded a "trim" feature in concat.split.expanded
that could slow things down a lot. If you want a much faster approach, use charMat
(the function called by concat.split.expanded
) directly on the split up version of your "Events" column, like this:
splitstackshape:::charMat(
strsplit(as.character(indf[, "Events"]), "-", fixed = TRUE), fill = 0)
For some benchmarks, check out this Gist.
Can be done with base R using 'grep':
ddf = data.frame(df$date, df$Events, "Rain"=rep(0), "Fog"=rep(0), "Snow"=rep(0), "Thunderstorm"=rep(0))
for(i in 3:6) ddf[grep(names(ddf)[i],ddf[,2]),i]=1
ddf
df.date df.Events Rain Fog Snow Thunderstorm
1 2013-01-08 Rain 1 0 0 0
2 2013-01-09 Fog 0 1 0 0
3 2013-01-10 0 0 0 0
4 2013-01-11 Fog-Rain 1 1 0 0
5 2013-01-12 Snow 0 0 1 0
6 2013-01-13 Rain-Snow 1 0 1 0
7 2013-01-14 Rain-Thunderstorm 1 0 0 1
8 2013-01-15 Thunderstorm 0 0 0 1
9 2013-01-16 Fog-Rain-Thunderstorm 1 1 0 1
10 2013-01-17 Fog-Thunderstorm 0 1 0 1
11 2013-01-18 Fog-Rain-Thunderstorm-Snow 1 1 1 1
Here's an approach with qdapTools
:
library(qdapTools)
matrix2df(mtabulate(lapply(split(as.character(df$Events), df$date),
function(x) strsplit(x, "-")[[1]])), "Date")
## Date Fog Rain Snow Thunderstorm
## 1 2013-01-08 0 1 0 0
## 2 2013-01-09 1 0 0 0
## 3 2013-01-10 0 0 0 0
## 4 2013-01-11 1 1 0 0
## 5 2013-01-12 0 0 1 0
## 6 2013-01-13 0 1 1 0
## 7 2013-01-14 0 1 0 1
## 8 2013-01-15 0 0 0 1
## 9 2013-01-16 1 1 0 1
## 10 2013-01-17 1 0 0 1
## 11 2013-01-18 1 1 1 1
Here is the same answer with magrittr
as it makes the chain clearer:
split(as.character(df$Events), df$date) %>%
lapply(function(x) strsplit(x, "-")[[1]]) %>%
mtabulate() %>%
matrix2df("Date")
Create a vector with factors
set.seed(1)
n <- c("Rain", "Fog", "Snow", "Thunderstorm")
v <- sapply(sample(0:3,100,T), function(i) paste0(sample(n,i), collapse = "-"))
v <- as.factor(v)
Function which returns matrix with desired output that shoulb be cbind
'ed to the initial data.frame
mSplit <- function(vec) {
if (!is.character(vec))
vec <- as.character(vec)
L <- strsplit(vec, "-")
ids <- unlist(lapply(seq_along(L), function(i) rep(i, length(L[[i]])) ))
U <- sort(unique(unlist(L)))
M <- matrix(0, nrow = length(vec),
ncol = length(U),
dimnames = list(NULL, U))
M[cbind(ids, match(unlist(L), U))] <- 1L
M
}
Solution is based on the answer of Ananda Mahto to that SO question. It should be pretty fast.
res <- mSplit(v)
I think what you need in this case is a simple call for the function dummy
. Let's call the target column. target_cat
.
df_target_bin <- data.frame(dummy(target_cat, "<prefix>"))
This will create a new data frame with a column with 0s and 1s values for each value of target_cat
.
To convert the columns into logical, and with logical I mean the values be TRUE
and FALSE
, then use the function as.logical
.
df_target_logical <- apply(df_target_bin, as.logical)
Building on the answer by @rnso
The following will identify all the unique elements and then dynamically generate new columns with the relevant data in them.
options = unique(unlist(strsplit(df$Events, '-'), recursive=FALSE))
for(o in options){
df$newcol = rep(0)
df <- rename(df, !!o := newcol)
df[grep(o, df$Events), o] = 1
}
Results:
date Events Rain Fog Snow Thunderstorm
1 2013-01-08 Rain 1 0 0 0
2 2013-01-09 Fog 0 1 0 0
3 2013-01-10 0 0 0 0
4 2013-01-11 Fog-Rain 1 1 0 0
5 2013-01-12 Snow 0 0 1 0
6 2013-01-13 Rain-Snow 1 0 1 0
7 2013-01-14 Rain-Thunderstorm 1 0 0 1
8 2013-01-15 Thunderstorm 0 0 0 1
9 2013-01-16 Fog-Rain-Thunderstorm 1 1 0 1
10 2013-01-17 Fog-Thunderstorm 0 1 0 1
11 2013-01-18 Fog-Rain-Thunderstorm-Snow 1 1 1 1
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With