What are some ways to transform session path data such as this:
df
# Session Link1 Link2 Link3 Link4 Link5
# 1 1 A B
# 2 2 C
# 3 3 D A B
# 4 4 C F G H J
# 5 5 A B C
Into a data set that looks like this:
desired
# Session From To
# 1 1 A B
# 2 2 C <NA>
# 3 3 D A
# 4 3 A B
# 5 4 C F
# 6 4 F G
# 7 4 G H
# 8 4 H J
# 9 5 A B
# 10 5 B C
Data for reproducibility:
df <- structure(list(Session = 1:5, Link1 = structure(c(1L, 2L, 3L, 2L, 1L), .Label = c("A", "C", "D"), class = "factor"), Link2 = structure(c(3L, 1L, 2L, 4L, 3L), .Label = c("", "A", "B", "F"), class = "factor"), Link3 = structure(c(1L, 1L, 2L, 4L, 3L), .Label = c("", "B", "C", "G"), class = "factor"), Link4 = structure(c(1L, 1L, 1L, 2L, 1L), .Label = c("", "H"), class = "factor"), Link5 = structure(c(1L, 1L, 1L, 2L, 1L), .Label = c("", "J"), class = "factor")), .Names = c("Session", "Link1", "Link2", "Link3", "Link4", "Link5"), class = "data.frame", row.names = c(NA, -5L))
desired <- structure(list(Session = c(1L, 2L, 3L, 3L, 4L, 4L, 4L, 4L, 5L, 5L), From = structure(c(1L, 3L, 4L, 1L, 3L, 5L, 6L, 7L, 1L, 2L), .Label = c("A", "B", "C", "D", "F", "G", "H"), class = "factor"), To = structure(c(2L, NA, 1L, 2L, 4L, 5L, 6L, 7L, 2L, 3L), .Label = c("A", "B", "C", "F", "G", "H", "J"), class = "factor")), .Names = c("Session", "From", "To"), class = "data.frame", row.names = c(NA, -10L))
We could use data.table. Convert the 'data.frame' to 'data.table' (setDT(df)). Reshape from 'wide' to 'long' format with melt specifying the id.var as 'Session'. Remove the 'value' elements that are empty [value!='']. Grouped by 'Session', we insert 'NA' values in the 'value' column for those 'Session' that have only a single row (if...else), create a two columns ('From' and 'To') by removing the last and first element of 'V1' grouped by 'Session'.
library(data.table)#v1.9.5+
melt(setDT(df), id.var='Session')[value!=''][,
if(.N==1L) c(value, NA) else value, by = Session][,
list(From=V1[-.N], To=V1[-1L]), by = Session]
# Session From To
#1: 1 A B
#2: 2 C NA
#3: 3 D A
#4: 3 A B
#5: 4 C F
#6: 4 F G
#7: 4 G H
#8: 4 H J
#9: 5 A B
#10: 5 B C
The above could be simplified to a single block after the melt step. For some reason, tmp[-.N] is not working. So I used tmp[1:(.N-1)].
melt(setDT(df), id.var= 'Session')[value!='', {
tmp <- if(.N==1L) c(value, NA) else value
list(From= tmp[1:(.N-1)], To= tmp[-1L]) }, by = Session]
# Session From To
#1: 1 A B
#2: 2 C NA
#3: 3 D A
#4: 3 A B
#5: 4 C F
#6: 4 F G
#7: 4 G H
#8: 4 H J
#9: 5 A B
#10: 5 B C
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With