This question is about the same issue as the one in the link that I posted before.
( Is there a good way to avoid memory deep copy or to reduce time spent in multiprocessing? )
I'm getting nowhere with it because I ran into a problem sharing the 'DataFrame' object.
I simplified the sample code.
If anyone experienced can revise my code to share a 'DataFrame' object between processes without using Manager.list, Manager.dict, or numpy sharedmem, I would greatly appreciate it.
Here's the code.
#-*- coding: UTF-8 -*-'
import pandas as pd
import numpy as np
from multiprocessing import *
import multiprocessing.sharedctypes as sharedctypes
import ctypes
def add_new_derived_column(shared_df_obj):
    """Add ``new_column`` to the DataFrame held in ``shared_df_obj.value``.

    The new column is ``A + B / 2``, computed row by row. The first rows of
    the modified frame are printed so the result can be inspected in the
    process that ran this function.

    Args:
        shared_df_obj: Any object whose ``value`` attribute is a DataFrame
            with columns ``'A'`` and ``'B'``.
    """
    df = shared_df_obj.value
    # Division binds tighter than addition, so this is A + (B / 2).
    df['new_column'] = df['A'] + df['B'] / 2
    # Call form of print works on both Python 2 and Python 3.
    print(df.head())
    # Sample output seen inside this function:
    #
    # "new_column" Generated!!!
    # A B new_column
    # 0 -0.545815 -0.179209 -0.635419
    # 1 0.654273 -2.015285 -0.353370
    # 2 0.865932 -0.943028 0.394418
    # 3 -0.850136 0.464778 -0.617747
    # 4 -1.077967 -1.127802 -1.641868
if __name__ == "__main__":
    dataframe = pd.DataFrame(np.random.randn(100000, 2), columns=['A', 'B'])
    # Attempt to share the DataFrame object through sharedctypes.RawValue.
    # NOTE(review): ctypes.py_object presumably stores only a reference to the
    # Python object rather than the DataFrame's data, which would explain why
    # the child's change is not visible below -- confirm against the
    # multiprocessing.sharedctypes documentation.
    shared_df_obj = sharedctypes.RawValue(ctypes.py_object, dataframe)
    # Pass "shared_df_obj" to a multiprocessing.Process and wait for it.
    process = Process(target=add_new_derived_column, args=(shared_df_obj,))
    process.start()
    process.join()
    # Call form of print works on both Python 2 and Python 3.
    print(shared_df_obj.value.head())
    # Sample output seen here after the child has finished:
    #
    # "new_column" disappeared.
    # the DataFrame object isn't shared.
    # A B
    # 0 -0.545815 -0.179209
    # 1 0.654273 -2.015285
    # 2 0.865932 -0.943028
    # 3 -0.850136 0.464778
    # 4 -1.077967 -1.127802
You can use a Manager Namespace; the following code works as you expect.
#-*- coding: UTF-8 -*-'
import pandas as pd
import numpy as np
from multiprocessing import *
import multiprocessing.sharedctypes as sharedctypes
import ctypes
def add_new_derived_column(ns):
    """Add ``new_column`` (``A + B / 2``) to ``ns.df`` and store it back.

    Args:
        ns: Object exposing a ``df`` attribute that holds a DataFrame with
            columns ``'A'`` and ``'B'`` (a Manager namespace in this script).
    """
    dataframe2 = ns.df
    # Division binds tighter than addition, so this is A + (B / 2).
    dataframe2['new_column'] = dataframe2['A'] + dataframe2['B'] / 2
    print(dataframe2.head())
    # Reassign the attribute so the namespace holds the modified frame.
    # NOTE(review): with a Manager namespace, reading ns.df presumably yields
    # a local copy, so this write-back is what publishes the change -- confirm.
    ns.df = dataframe2
if __name__ == "__main__":
    # A Manager namespace is the object handed to the child process.
    mgr = Manager()
    ns = mgr.Namespace()
    dataframe = pd.DataFrame(np.random.randn(100000, 2), columns=['A', 'B'])
    ns.df = dataframe
    print(dataframe.head())
    # Pass the namespace "ns" to a multiprocessing.Process and wait for it.
    process = Process(target=add_new_derived_column, args=(ns,))
    process.start()
    process.join()
    # Read the frame back from the namespace after the child has finished.
    print(ns.df.head())
If you love us, you can donate to us via PayPal or buy us a coffee so we can maintain and grow! Thank you!
Donate Us With