I have a dataframe df
:
userID Score Task_Alpha Task_Beta Task_Charlie Task_Delta
3108 -8.00 Easy Easy Easy Easy
3207 3.00 Hard Easy Match Match
3350 5.78 Hard Easy Hard Hard
3961 10.00 Easy NA Hard Hard
4021 10.00 Easy Easy NA Hard
1. userID is factor variable
2. Score is numeric
3. All the 'Task_' features are factor variables with possible values 'Hard', 'Easy', 'Match' or NA
I want to count the possible transitions between the Task_
features. For reference, the possible transitions are:
EE transition from Easy -> Easy
EM transition from Easy -> Match
EH transition from Easy -> Hard
ME transition from Match-> Easy
MM transition from Match-> Match
MH transition from Match-> Hard
HE transition from Hard -> Easy
HM transition from Hard -> Match
HH transition from Hard -> Hard
Since there are three possible values (excluding the NA case), the output columns would be as below:
userID EE EM EH MM ME MH HH HE HM
3108 3 0 0 0 0 0 0 0 0
3207 0 1 0 1 0 0 0 1 0
3350 0 0 1 0 0 0 1 1 0
3961 0 0 0 0 0 0 1 0 0
4021 1 0 0 0 0 0 0 0 0
1) In this example each userID can have at most 3 state transitions.
2) Note that for users 3961 and 4021, NA has reduced the possible state transitions.
Any advice on these questions would be greatly appreciated.
The data dput()
is :
df <- structure(list(
userID = c(3108L, 3207L, 3350L, 3961L, 4021L),
Score = c(-8, 3, 5.78, 10, 10),
Task_Alpha = structure(c(1L, 2L, 2L, 1L, 1L), .Label = c("Easy", "Hard"), class = "factor"),
Task_Beta = structure(c(1L, 1L, 1L, NA, 1L), .Label = "Easy", class = "factor"),
Task_Charlie = structure(c(1L, 3L, 2L, 2L, NA), .Label = c("Easy", "Hard", "Match"), class = "factor"),
Task_Delta = structure(c(1L, 3L, 2L, 2L, 2L), .Label = c("Easy", "Hard", "Match"), class = "factor")),
class = "data.frame", row.names = c(NA, -5L))
Another idea via base R can be to paste the values to their previous value (rowwise), convert to factor to get all 9 levels (using expand.grid
with only the levels you want - which also takes care of NAs), and finally count the values via table
. The last step is to bind the IDs with the results, i.e.
cbind.data.frame(df$userID, t(apply(df[-c(1:2)], 1, function(i) {
i1 <- paste(i[-length(i)], i[-1]);
i1 <- factor(i1, levels = do.call(paste, expand.grid(c('Easy', 'Match', 'Hard'),
c('Easy', 'Match', 'Hard'))));
table(i1) })))
which gives,
df$userID Easy Easy Match Easy Hard Easy Easy Match Match Match Hard Match Easy Hard Match Hard Hard Hard 1 3108 3 0 0 0 0 0 0 0 0 2 3207 0 0 1 1 1 0 0 0 0 3 3350 0 0 1 0 0 0 1 0 1 4 3961 0 0 0 0 0 0 0 0 1 5 4021 1 0 0 0 0 0 0 0 0
Another option similar to Sotos' approach but 1) using data.table
, 2) not using factor
and 3) replacing table
with Rfast::rowTabulate
:
v <- c('Hard', 'Match', 'Easy')
vv <- do.call(paste, expand.grid(v, v))
DT[, (vv) := {
mat <- mapply(paste, .SD[, -ncol(.SD), with=FALSE], .SD[, -1L])
as.data.table(Rfast::rowTabulate(matrix(match(mat, vv, 0L), nrow=.N)))
}, .SDcols=Task_Alpha:Task_Delta]
output:
userID Score Task_Alpha Task_Beta Task_Charlie Task_Delta Hard Hard Match Hard Easy Hard Hard Match Match Match Easy Match Hard Easy Match Easy Easy Easy
1: 3108 -8.00 Easy Easy Easy Easy 0 0 0 0 0 0 0 0 3
2: 3207 3.00 Hard Easy Match Match 0 0 0 0 1 1 1 0 0
3: 3350 5.78 Hard Easy Hard Hard 1 0 1 0 0 0 1 0 0
4: 3961 10.00 Easy <NA> Hard Hard 1 0 0 0 0 0 0 0 0
5: 4021 10.00 Easy Easy <NA> Hard 0 0 0 0 0 0 0 0 1
data:
library(data.table)
library(Rfast)
DT <- structure(list(
userID = c(3108L, 3207L, 3350L, 3961L, 4021L),
Score = c(-8, 3, 5.78, 10, 10),
Task_Alpha = structure(c(1L, 2L, 2L, 1L, 1L), .Label = c("Easy", "Hard"), class = "factor"),
Task_Beta = structure(c(1L, 1L, 1L, NA, 1L), .Label = "Easy", class = "factor"),
Task_Charlie = structure(c(1L, 3L, 2L, 2L, NA), .Label = c("Easy", "Hard", "Match"), class = "factor"),
Task_Delta = structure(c(1L, 3L, 2L, 2L, 2L), .Label = c("Easy", "Hard", "Match"), class = "factor")),
class = "data.frame", row.names = c(NA, -5L))
setDT(DT)
Would be interesting to know how fast this approach works on actual dataset and if actual dataset is large.
edit: added some timings
library(data.table)
nr <- 1e6
vec <- c('Hard', 'Match', 'Easy', NA)
DT <- data.table(userID=1:nr, Task_Alpha=sample(vec, nr, TRUE), Task_Beta=sample(vec, nr, TRUE),
Task_Charlie=sample(vec, nr, TRUE), Task_Delta=sample(vec, nr, TRUE))
df <- as.data.frame(DT)
DT0 <- copy(DT)
DT1 <- copy(DT)
DT2 <- copy(DT)
mtd0 <- function() {
t(apply(df[-1L], 1, function(i) {
i1 <- paste(i[-length(i)], i[-1L]);
i1 <- factor(i1, levels = do.call(paste, expand.grid(c('Easy', 'Match', 'Hard'),
c('Easy', 'Match', 'Hard'))));
table(i1)
}))
}
mtd1 <- function() {
f_cols <- names(DT0)[ sapply( DT0, is.factor ) ]
DT0[, (f_cols) := lapply(.SD, as.character), .SDcols = f_cols ]
#melt to long format
DT.melt <- melt( DT0, id.vars = "userID", measure.vars = patterns( task = "^Task_"))
#set order of Aplha-Beta-etc...
DT.melt[ grepl( "Alpha", variable ), order := 1 ]
DT.melt[ grepl( "Beta", variable ), order := 2 ]
DT.melt[ grepl( "Charlie", variable ), order := 3 ]
DT.melt[ grepl( "Delta", variable ), order := 4 ]
#order DT.melt
setorder( DT.melt, userID, order )
#fill in codes EE, etc...
DT.melt[, `:=`( code1 = gsub( "(^.).*", "\\1", value ),
code2 = gsub( "(^.).*", "\\1", shift( value, type = "lead" ) ) ),
by = userID ]
#filter only rows without NA
DT.melt <- DT.melt[ complete.cases( DT.melt ) ]
#cast to wide output
dcast( DT.melt, userID ~ paste0( code2, code1 ), fun.aggregate = length )
}
mtd2 <- function() {
v <- c('Hard', 'Match', 'Easy')
vv <- do.call(paste, expand.grid(v, v))
DT2[, (vv) := {
mat <- mapply(paste, .SD[, -ncol(.SD), with=FALSE], .SD[, -1L])
as.data.table(Rfast::rowTabulate(matrix(match(mat, vv, 0L), nrow=.N)))
}, .SDcols=Task_Alpha:Task_Delta]
}
bench::mark(mtd0(), mtd1(), mtd2(), check=FALSE)
timings:
# A tibble: 3 x 13
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
<bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
1 mtd0() 2.19m 2.19m 0.00760 252MB 2.26 1 297 2.19m <int[,9] [1,000,000 x 9]> <df[,3] [171,481 x 3]> <bch:tm> <tibble [1 x 3]>
2 mtd1() 33.16s 33.16s 0.0302 856MB 0.754 1 25 33.16s <df[,10] [843,688 x 10]> <df[,3] [8,454 x 3]> <bch:tm> <tibble [1 x 3]>
3 mtd2() 844.95ms 844.95ms 1.18 298MB 1.18 1 1 844.95ms <df[,14] [1,000,000 x 14]> <df[,3] [8,912 x 3]> <bch:tm> <tibble [1 x 3]>
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With