Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

How to convert complex JSON data into a single dataframe?

Tags:

json

r

I don't know how to properly convert my JSON data into a useful dataframe. This is some example data that shows the structure of my data:

{
"data":[
{"track":[
{"time":"2015","midpoint":{"x":6,"y":8},"realworld":{"x":1,"y":3},"coordinate":{"x":16,"y":38}},
{"time":"2015","midpoint":{"x":6,"y":8},"realworld":{"x":1,"y":3},"coordinate":{"x":16,"y":37}},
{"time":"2016","midpoint":{"x":6,"y":9},"realworld":{"x":2,"y":3},"coordinate":{"x":16,"y":38}}
]},
{"track":[
{"time":"2015","midpoint":{"x":5,"y":9},"realworld":{"x":-1,"y":3},"coordinate":{"x":16,"y":38}},
{"time":"2015","midpoint":{"x":5,"y":9},"realworld":{"x":-1,"y":3},"coordinate":{"x":16,"y":38}},
{"time":"2016","midpoint":{"x":5,"y":9},"realworld":{"x":-1,"y":3},"coordinate":{"x":16,"y":38}},
{"time":"2015","midpoint":{"x":3,"y":15},"realworld":{"x":-9,"y":2},"coordinate":{"x":17,"y":38}}
]},
{"track":[
{"time":"2015","midpoint":{"x":6,"y":7},"realworld":{"x":-2,"y":3},"coordinate":{"x":16,"y":39}}
]}]}

I have a lot of tracks and I'd like the dataset to look like this:

track   time   midpoint   realworld   coordinate 
1
1
1
2
2
2
2
3

So far I have this:

json_file <- "testdata.json"
data <- fromJSON(json_file)
data2 <- list.stack(data, fill=TRUE) 

Right now it comes out like this:

data output

How can I get this in the proper format?

like image 452
Romy Avatar asked Sep 28 '16 12:09

Romy


2 Answers

Add the flatten = TRUE parameter when reading with fromJSON. This will give you a nested list with in it's deepest level a list of three dataframes. Using:

library(jsonlite)
# read the json
jsondata <- fromJSON(txt, flatten = TRUE)

# bind the dataframes in the nested 'track' list together    
dat <- do.call(rbind, jsondata$data$track)

# add a track variable
dat$track <- rep(1:length(jsondata$data$track), sapply(jsondata$data$track, nrow))

gives:

> dat
  time midpoint.x midpoint.y realworld.x realworld.y coordinate.x coordinate.y track
1 2015          6          8           1           3           16           38     1
2 2015          6          8           1           3           16           37     1
3 2016          6          9           2           3           16           38     1
4 2015          5          9          -1           3           16           38     2
5 2015          5          9          -1           3           16           38     2
6 2016          5          9          -1           3           16           38     2
7 2015          3         15          -9           2           17           38     2
8 2015          6          7          -2           3           16           39     3

Another, shorter, approach is using jsonlite in combination with rbindlist from the data.table package:

library(jsonlite)
library(data.table)
# read the json
jsondata <- fromJSON(txt, flatten = TRUE)
# bind the dataframes in the nested 'track' list together 
# and include an id-column at the same time   
dat <- rbindlist(jsondata$data$track, idcol = 'track')

or with bind_rows from the dplyr package in a similar way:

library(dplyr)
dat <- bind_rows(jsondata$data$track, .id = 'track')

Used data:

txt <- '{
"data":[
{"track":[
{"time":"2015","midpoint":{"x":6,"y":8},"realworld":{"x":1,"y":3},"coordinate":{"x":16,"y":38}},
{"time":"2015","midpoint":{"x":6,"y":8},"realworld":{"x":1,"y":3},"coordinate":{"x":16,"y":37}},
{"time":"2016","midpoint":{"x":6,"y":9},"realworld":{"x":2,"y":3},"coordinate":{"x":16,"y":38}}
]},
{"track":[
{"time":"2015","midpoint":{"x":5,"y":9},"realworld":{"x":-1,"y":3},"coordinate":{"x":16,"y":38}},
{"time":"2015","midpoint":{"x":5,"y":9},"realworld":{"x":-1,"y":3},"coordinate":{"x":16,"y":38}},
{"time":"2016","midpoint":{"x":5,"y":9},"realworld":{"x":-1,"y":3},"coordinate":{"x":16,"y":38}},
{"time":"2015","midpoint":{"x":3,"y":15},"realworld":{"x":-9,"y":2},"coordinate":{"x":17,"y":38}}
]},
{"track":[
{"time":"2015","midpoint":{"x":6,"y":7},"realworld":{"x":-2,"y":3},"coordinate":{"x":16,"y":39}}
]}]}'
like image 129
Jaap Avatar answered Oct 24 '22 15:10

Jaap


Sahil's answer (if it's not deleted already) is misleading since stream_in is for ndjson and you don't have ndjson. You just need to wrangle the nested list a bit. I think the following can be made even smaller, but it was a quick, direct-attack hack:

library(jsonlite)
library(purrr)
library(readr)

dat <- fromJSON(txt, simplifyVector=FALSE) # read in your JSON
map(dat$data, "track") %>%                 # move past the top-level "data" element and iterate over the "track"s
  map_df(function(track) {                 # iterate over each element of "track"
    map_df(track, ~as.list(unlist(track))) # convert it to a data frame
  }, .id="track") %>%                      # add in the track "id"
  type_convert()                           # convert mangled types
## # A tibble: 8 × 8
##   track  time midpoint.x midpoint.y realworld.x realworld.y coordinate.x coordinate.y
##   <int> <int>      <int>      <int>       <int>       <int>        <int>        <int>
## 1     1  2016          6          9           2           3           16           38
## 2     1  2016          6          9           2           3           16           38
## 3     1  2016          6          9           2           3           16           38
## 4     2  2015          3         15          -9           2           17           38
## 5     2  2015          3         15          -9           2           17           38
## 6     2  2015          3         15          -9           2           17           38
## 7     2  2015          3         15          -9           2           17           38
## 8     3  2015          6          7          -2           3           16           39

This also leaves you with decent column types, though you may want to use the col_types parameter to readr::type_convert to turn time into a character vector.

Alternately:

library(jsonlite)
library(purrr)
library(tibble)

dat <- fromJSON(txt, flatten=TRUE) # read in your JSON
map_df(dat$data$track, as_tibble, .id="track")
like image 40
hrbrmstr Avatar answered Oct 24 '22 16:10

hrbrmstr