Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

R Convert List Into Data.Frame or Table

list1 = list(
  c(4,5,6,7,1,1,1,1,3,1,3,3),
  c(3,4,5,6,2,2,2,2,1,4,2,1),
  c(1,2,3,4,1,1,1,1,3,2,1,1),
  c(5,6,7,8,1,1,1,1,4,4,4,3),
  c(2,3,4,5,2,2,2,2,2,1,2,1)
)

data1=data.frame("ID"=c(1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5),
"Time"=c(1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4),
"Grade"=c(4,5,6,7,3,4,5,6,1,2,3,4,5,6,7,8,2,3,4,5),
"Class"=c(1,1,1,1,2,2,2,2,1,1,1,1,1,1,1,1,2,2,2,2),
"Score"=c(3,1,3,3,1,4,2,1,3,2,1,1,4,4,4,3,2,1,2,1))

I have 'list1' Each item in 'list1' equals to a single individual's Grade, Class, Score for 4 years. So 'list1' has 5 students and 12 records for each student (4 records for every of three variables, Grade and Class and Score). I wish to turn 'list1' into 'data1' that is a long data file where 'ID' equals to the list item number in 'list1'. Time equals to time of the record (every student has 4 time measures), Grade equals to the first 4 data points in ALL elements in list1, Class the next 4, and Score the last 4.

Sample output is shown turning 'list1' into desired output 'data1'.

This data set is HUGE so I am hoping for a efficient approach to doing this conversion.

like image 282
bvowe Avatar asked Mar 03 '23 14:03

bvowe


1 Answers

I'm not sure it'll be efficient but it's concise:

setDT(list1)
# could also do something like paste0('student', 1:5) for clarity,
#   and adjust patterns() below accordingly
setnames(list1, paste0(1:5))
# 4 = # of values of Time
list1[ , colid := rep(c('Grade', 'Class', 'Score'), each = 4L)]
# 3 = # of columns "stacked" in each student's column initially
list1[ , Time := rep(1:4, 3L)]
# first, reshape long
list1[ , melt(.SD, measure.vars = patterns('^[0-9]+'), variable.name = 'ID',
              variable.factor = FALSE)
       # now, reshape to the final format
       ][ , dcast(.SD, ID + Time ~ colid, value.var = 'value')]
#         ID  Time Class Grade Score
#     <char> <int> <num> <num> <num>
#  1:      1     1     1     4     3
#  2:      1     2     1     5     1
#  3:      1     3     1     6     3
#  4:      1     4     1     7     3
#  5:      2     1     2     3     1
#  6:      2     2     2     4     4
#  7:      2     3     2     5     2
#  8:      2     4     2     6     1
#  9:      3     1     1     1     3
# 10:      3     2     1     2     2
# 11:      3     3     1     3     1
# 12:      3     4     1     4     1
# 13:      4     1     1     5     4
# 14:      4     2     1     6     4
# 15:      4     3     1     7     4
# 16:      4     4     1     8     3
# 17:      5     1     2     2     2
# 18:      5     2     2     3     1
# 19:      5     3     2     4     2
# 20:      5     4     2     5     1
#         ID  Time Class Grade Score

The inefficiency would come from having two operations here.

The approach of building the table skeleton first, then populating it may be faster, like this:

# 4 = # of Times per ID&Column (assuming your table is rectangular)
out = CJ(ID = 1:length(list1), Time = 1:4)
# relies on ID being an integer, so that ID = 1 --> list1[[1]]
#   gives ID=1's data
out[ , by = ID, c('Grade', 'Class', 'Score') := {
  as.data.table(matrix(list1[[ .BY$ID ]], ncol = 3L))
}]

It may be that as.data.table is also inefficient but this code is more readable than the alternative:

out = CJ(ID = 1:length(list1), Time = 1:4)
out[ , by = ID, c('Grade', 'Class', 'Score') := {
  student_data = list1[[.BY$ID]]
  lapply(1:3, function(j) student_data[4L*(j-1) + 1:4])
}]
like image 114
MichaelChirico Avatar answered Mar 05 '23 15:03

MichaelChirico