I have a dataframe with 10 million rows. It takes too much time for iteration and calculation of each rows. I would like to get results in a shorter period. I tried different multiprocessing codes, but each time I faced different errors. Can someone helps me about this. Thank you advance.
d3 is the dataframe with more than 10 millions rows and 29 columns. The last columns' name is "Calculation", all values in this column is zero. By using the values of other columns, I calculated new values and I set new values to the "Calculation" column's each row.
First code
def fun():
for i in d3.index:
if (d3.iloc[i, 3:4])[0] == (d3.iloc[i + 1, 3:4])[0]:
d3.loc[d3.index[i], 'Calculation']==(d3.iloc[i, 22:23] - d3.iloc[i + 1, 22:23])
else:
d3.loc[d3.index[i],'Calculation']=d3.iloc[i, 13:14][0]
if __name__=="__main__":
p1 = mp.Process(target=fun, args=())
p2 = mp.Process(target=fun, args=())
p3 = mp.Process(target=fun, args=())
p4 = mp.Process(target=fun, args=())
p1.start()
p2.start()
p3.start()
p4.start()
Second code
def fun(i):
if (d3.iloc[i, 3:4])[0] == (d3.iloc[i + 1, 3:4])[0]:
d3.set_value(i, "Calculation", (d3.iloc[i, 22:23] - d3.iloc[i + 1, 22:23]))
else:
d3.set_value(i, "Calculation", d3.iloc[i, 13:14])
if __name__=="__main__":
p1 = mp.Process(target=fun, args=(d3.index, ))
p2 = mp.Process(target=fun, args=(d3.index, ))
p3 = mp.Process(target=fun, args=(d3.index, ))
p4 = mp.Process(target=fun, args=(d3.index, ))
p1.start()
p2.start()
p3.start()
p4.start()
p1.join()
p2.join()
p3.join()
p4.join()
Third Code
def fun(q):
for i in d3.index:
if (d3.iloc[i, 3:4])[0] == (d3.iloc[i + 1, 3:4])[0]:
d3.loc[d3.index[i], 'Calculation']==(d3.iloc[i, 22:23] - d3.iloc[i + 1, 22:23])
else:
d3.loc[d3.index[i], 'Calculation']==(d3.iloc[i, 22:23] - d3.iloc[i + 1, 22:23])
q.put(d3)``
if __name__ == "__main__":
q = mp.Queue()
processes = [mp.Process(target=fun, args=(q,)) for x in range(4)]
for p in processes:
p.start()
for p in processes:
p.join()
results = [q.get() for p in processes]
Error1:
File "pandas\_libs\ops.pyx", line 103, in pandas._libs.ops.vec_compare
ValueError: Buffer has wrong number of dimensions (expected 1, got 0)
Error2:
File "pandas\_libs\hashtable_class_helper.pxi", line 1614, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 0
Error3:
File "pandas\_libs\ops.pyx", line 103, in pandas._libs.ops.vec_compare
ValueError: Buffer has wrong number of dimensions (expected 1, got 0)