I am trying to convert a 321 MB
nested json file to a pandas Dataframe
and it is taking me a very very long time, i am sure that there is a faster way to do this, this is my code:
this is how the data looks like:
js_dict["data"][0:5]
d = [{'datetime': '2013-01-01T00:00:00+02:00', 'channels': [
{'id': 1, 'name': 'Rain', 'alias': None, 'value': 0.0, 'status': 1, 'valid': True, 'description': None},
{'id': 2, 'name': 'WSmax', 'alias': None, 'value': 7.7, 'status': 1, 'valid': True, 'description': None},
{'id': 3, 'name': 'WDmax', 'alias': None, 'value': 52.0, 'status': 1, 'valid': True, 'description': None},
{'id': 4, 'name': 'WS', 'alias': None, 'value': 5.2, 'status': 1, 'valid': True, 'description': None},
{'id': 5, 'name': 'WD', 'alias': None, 'value': 56.0, 'status': 1, 'valid': True, 'description': None},
{'id': 6, 'name': 'STDwd', 'alias': None, 'value': 11.9, 'status': 1, 'valid': True, 'description': None},
{'id': 7, 'name': 'RH', 'alias': None, 'value': 55.0, 'status': 1, 'valid': True, 'description': None},
{'id': 8, 'name': 'TD', 'alias': None, 'value': 13.5, 'status': 1, 'valid': True, 'description': None},
{'id': 10, 'name': 'TDmax', 'alias': None, 'value': 13.6, 'status': 1, 'valid': True, 'description': None},
{'id': 11, 'name': 'TDmin', 'alias': None, 'value': 13.5, 'status': 1, 'valid': True, 'description': None},
{'id': 13, 'name': 'WS1mm', 'alias': None, 'value': 6.2, 'status': 1, 'valid': True, 'description': None},
{'id': 14, 'name': 'Ws10mm', 'alias': None, 'value': 5.3, 'status': 1, 'valid': True, 'description': None},
{'id': 15, 'name': 'Time', 'alias': None, 'value': 2351.0, 'status': 1, 'valid': True, 'description': None}]},
{'datetime': '2013-01-01T00:10:00+02:00', 'channels': [
{'id': 1, 'name': 'Rain', 'alias': None, 'value': 0.0, 'status': 1, 'valid': True, 'description': None},
{'id': 2, 'name': 'WSmax', 'alias': None, 'value': 9.7, 'status': 1, 'valid': True, 'description': None},
{'id': 3, 'name': 'WDmax', 'alias': None, 'value': 42.0, 'status': 1, 'valid': True, 'description': None},
{'id': 4, 'name': 'WS', 'alias': None, 'value': 6.3, 'status': 1, 'valid': True, 'description': None},
{'id': 5, 'name': 'WD', 'alias': None, 'value': 55.0, 'status': 1, 'valid': True, 'description': None},
{'id': 6, 'name': 'STDwd', 'alias': None, 'value': 12.6, 'status': 1, 'valid': True, 'description': None},
{'id': 7, 'name': 'RH', 'alias': None, 'value': 54.0, 'status': 1, 'valid': True, 'description': None},
{'id': 8, 'name': 'TD', 'alias': None, 'value': 13.5, 'status': 1, 'valid': True, 'description': None},
{'id': 10, 'name': 'TDmax', 'alias': None, 'value': 13.5, 'status': 1, 'valid': True, 'description': None},
{'id': 11, 'name': 'TDmin', 'alias': None, 'value': 13.5, 'status': 1, 'valid': True, 'description': None},
{'id': 13, 'name': 'WS1mm', 'alias': None, 'value': 7.7, 'status': 1, 'valid': True, 'description': None},
{'id': 14, 'name': 'Ws10mm', 'alias': None, 'value': 6.3, 'status': 1, 'valid': True, 'description': None},
{'id': 15, 'name': 'Time', 'alias': None, 'value': 10.0, 'status': 1, 'valid': True, 'description': None}]},
{'datetime': '2013-01-01T00:20:00+02:00', 'channels': [
{'id': 1, 'name': 'Rain', 'alias': None, 'value': 0.0, 'status': 1, 'valid': True, 'description': None},
{'id': 2, 'name': 'WSmax', 'alias': None, 'value': 8.8, 'status': 1, 'valid': True, 'description': None},
{'id': 3, 'name': 'WDmax', 'alias': None, 'value': 42.0, 'status': 1, 'valid': True, 'description': None},
{'id': 4, 'name': 'WS', 'alias': None, 'value': 5.6, 'status': 1, 'valid': True, 'description': None},
{'id': 5, 'name': 'WD', 'alias': None, 'value': 55.0, 'status': 1, 'valid': True, 'description': None},
{'id': 6, 'name': 'STDwd', 'alias': None, 'value': 12.8, 'status': 1, 'valid': True, 'description': None},
{'id': 7, 'name': 'RH', 'alias': None, 'value': 55.0, 'status': 1, 'valid': True, 'description': None},
{'id': 8, 'name': 'TD', 'alias': None, 'value': 13.5, 'status': 1, 'valid': True, 'description': None},
{'id': 10, 'name': 'TDmax', 'alias': None, 'value': 13.5, 'status': 1, 'valid': True, 'description': None},
{'id': 11, 'name': 'TDmin', 'alias': None, 'value': 13.5, 'status': 1, 'valid': True, 'description': None},
{'id': 13, 'name': 'WS1mm', 'alias': None, 'value': 6.8, 'status': 1, 'valid': True, 'description': None},
{'id': 14, 'name': 'Ws10mm', 'alias': None, 'value': 6.3, 'status': 1, 'valid': True, 'description': None},
{'id': 15, 'name': 'Time', 'alias': None, 'value': 12.0, 'status': 1, 'valid': True, 'description': None}]},
{'datetime': '2013-01-01T00:30:00+02:00', 'channels': [
{'id': 1, 'name': 'Rain', 'alias': None, 'value': 0.0, 'status': 1, 'valid': True, 'description': None},
{'id': 2, 'name': 'WSmax', 'alias': None, 'value': 10.4, 'status': 1, 'valid': True, 'description': None},
{'id': 3, 'name': 'WDmax', 'alias': None, 'value': 60.0, 'status': 1, 'valid': True, 'description': None},
{'id': 4, 'name': 'WS', 'alias': None, 'value': 5.5, 'status': 1, 'valid': True, 'description': None},
{'id': 5, 'name': 'WD', 'alias': None, 'value': 54.0, 'status': 1, 'valid': True, 'description': None},
{'id': 6, 'name': 'STDwd', 'alias': None, 'value': 11.9, 'status': 1, 'valid': True, 'description': None},
{'id': 7, 'name': 'RH', 'alias': None, 'value': 55.0, 'status': 1, 'valid': True, 'description': None},
{'id': 8, 'name': 'TD', 'alias': None, 'value': 13.4, 'status': 1, 'valid': True, 'description': None},
{'id': 10, 'name': 'TDmax', 'alias': None, 'value': 13.5, 'status': 1, 'valid': True, 'description': None},
{'id': 11, 'name': 'TDmin', 'alias': None, 'value': 13.3, 'status': 1, 'valid': True, 'description': None},
{'id': 13, 'name': 'WS1mm', 'alias': None, 'value': 7.0, 'status': 1, 'valid': True, 'description': None},
{'id': 14, 'name': 'Ws10mm', 'alias': None, 'value': 5.6, 'status': 1, 'valid': True, 'description': None},
{'id': 15, 'name': 'Time', 'alias': None, 'value': 21.0, 'status': 1, 'valid': True, 'description': None}]},
{'datetime': '2013-01-01T00:40:00+02:00', 'channels': [
{'id': 1, 'name': 'Rain', 'alias': None, 'value': 0.0, 'status': 1, 'valid': True, 'description': None},
{'id': 2, 'name': 'WSmax', 'alias': None, 'value': 9.5, 'status': 1, 'valid': True, 'description': None},
{'id': 3, 'name': 'WDmax', 'alias': None, 'value': 61.0, 'status': 1, 'valid': True, 'description': None},
{'id': 4, 'name': 'WS', 'alias': None, 'value': 5.7, 'status': 1, 'valid': True, 'description': None},
{'id': 5, 'name': 'WD', 'alias': None, 'value': 52.0, 'status': 1, 'valid': True, 'description': None},
{'id': 6, 'name': 'STDwd', 'alias': None, 'value': 11.0, 'status': 1, 'valid': True, 'description': None},
{'id': 7, 'name': 'RH', 'alias': None, 'value': 55.0, 'status': 1, 'valid': True, 'description': None},
{'id': 8, 'name': 'TD', 'alias': None, 'value': 13.2, 'status': 1, 'valid': True, 'description': None},
{'id': 10, 'name': 'TDmax', 'alias': None, 'value': 13.3, 'status': 1, 'valid': True, 'description': None},
{'id': 11, 'name': 'TDmin', 'alias': None, 'value': 13.2, 'status': 1, 'valid': True, 'description': None},
{'id': 13, 'name': 'WS1mm', 'alias': None, 'value': 7.0, 'status': 1, 'valid': True, 'description': None},
{'id': 14, 'name': 'Ws10mm', 'alias': None, 'value': 5.7, 'status': 1, 'valid': True, 'description': None},
{'id': 15, 'name': 'Time', 'alias': None, 'value': 40.0, 'status': 1, 'valid': True, 'description': None}]}]
import json
import pandas as pd
from pandas.io.json import json_normalize
with open(r"Documents\dat.json") as data_file:
js_dict = json.load(data_file)
columns =list(json_normalize(js_dict["data"][0]["channels"])
["name"])
df = pd.DataFrame()
for row in js_dict["data"]:
val = json_normalize(row,record_path="channels").transpose().iloc[6]
new_val = pd.DataFrame(val).transpose().reset_index()
df = df.append(new_val)
print(df)
EDIT
I used another more basic script using less pandas functions and shortened the time considerably, I am wondering why the pandas functions are so slow
from glob import glob
import json
import pandas as pd
from pandas.io.json import json_normalize
import time
import cProfile
def timing(f):
def wrap(*args):
time1 = time.time()
ret = f(*args)
time2 = time.time()
print('{:s} function took {:.3f} ms'.format(f.__name__, (time2-
time1)*1000.0))
return ret
return wrap
@timing
def convert_json_panda_I(js_dict):
columns =list(json_normalize(js_dict[0]["channels"])["name"])
df = pd.DataFrame()
for row in js_dict:
val = json_normalize(row,record_path="channels").transpose().iloc[6]
new_val = pd.DataFrame(val).transpose().reset_index()
df = df.append(new_val)
return df
def decode_dict(dat):
row = []
for k, v in dat.items():
if k in 'datetime':
date = v
row.append(date)
elif k in 'channels':
for chanell in v:
row.append(chanell["value"])
return row
@timing
def convert_json_panda_II(json_dict):
df = pd.DataFrame([decode_dict(line) for line in json_dict])
return df
if __name__ == "__main__":
with open(r"C:\cygwin64\bin\zefat_bp.json") as data_file:
js_dict = json.load(data_file)
print("first version profile")
first_version = convert_json_panda_I(js_dict["data"])
print("second version profile")
second_version = cProfile.run('convert_json_panda_II(js_dict["data"])')
print(type(second_version))
print(second_version.head())
The difference in timing is very big:
Connected to pydev debugger (build 181.4445.76)
convert_json_panda_I function took 2298914.500 ms
convert_json_panda_II function took 1389.219 ms
why is using pandas functions so slow?
The problem is that df.append(new_val)
create a full new Pandas dataframe for each row (it copy the previous one and add just a new row). Thus, with 1,000,000
rows, the first version will generate 1,000,000 dataframes with an average of 500,000 rows each. The second approach generate only one big dataframe directly. The the point is: do not use append
of Pandas, use concat
. Here is an example:
import json
import pandas as pd
from pandas.io.json import json_normalize
with open(r"Documents\dat.json") as data_file:
js_dict = json.load(data_file)
columns =list(json_normalize(js_dict["data"][0]["channels"])
["name"])
rows = []
for row in js_dict["data"]:
val = json_normalize(row,record_path="channels").transpose().iloc[6]
rows.append(pd.DataFrame(val).transpose().reset_index())
df = pd.concat(rows)
print(df)
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With