I am trying to split a large CSV file into Excel files of 50k rows each. I tried using pandas, but unfortunately it crashed. So I switched to Polars. It works, but the problem is that it only creates a single file of 100k rows. How can I generate 50k-row Excel files from a 100 million row (18.5 GB) CSV file?
At first I passed next_batches(50000), but it still gave me only one Excel file. So I printed the len of the result, and it was 1.
import os
import polars as pl

def split_csv_to_excel(csv_file_path, output_dir, batch_size=50000):
    reader = pl.read_csv_batched(csv_file_path, batch_size=batch_size)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    batches = reader.next_batches(1)
    for i, df in enumerate(batches):
        output_file_path = os.path.join(output_dir, f"batch_{i}.xlsx")
        df.write_excel(output_file_path)

split_csv_to_excel("inputpath", "outputpath")
I think perhaps the docs could be made clearer. next_batches(n) returns up to n batches per call (the argument is a batch count, not a row count), and it returns None once the reader is exhausted. That is why next_batches(1) gives you a single DataFrame.
If you look at the final while example in the docs, you basically need to keep calling .next_batches() in a loop:
count = 0
batches = reader.next_batches(5)
while batches:
    for df in batches:
        count += 1
        filename = f"batch_{count}.xlsx"
        print("[WRITE]:", filename)
        output_file_path = os.path.join(output_dir, filename)
        df.write_excel(output_file_path)
    batches = reader.next_batches(5)
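Putting that pattern back into the function from the question, a minimal corrected version could look like this (same file naming as the question; note that df.write_excel() requires the xlsxwriter package to be installed):

import os
import polars as pl

def split_csv_to_excel(csv_file_path, output_dir, batch_size=50000):
    reader = pl.read_csv_batched(csv_file_path, batch_size=batch_size)
    os.makedirs(output_dir, exist_ok=True)
    count = 0
    batches = reader.next_batches(5)
    while batches:  # next_batches() returns None once the CSV is exhausted
        for df in batches:
            count += 1
            df.write_excel(os.path.join(output_dir, f"batch_{count}.xlsx"))
        batches = reader.next_batches(5)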
As this while-pattern is quite common, iter() exists as a way to turn it into a for loop:
count = 0
batches = iter(lambda: reader.next_batches(5), None)
for batch in batches:
    for df in batch:
        count += 1
        ...
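Fully written out, the iter() version of the same loop could look like this (a sketch under the same assumptions as above: reader and output_dir already exist, and os is imported):

count = 0
# iter(callable, sentinel) keeps calling reader.next_batches(5)
# and stops as soon as it returns the sentinel (None).
for batch in iter(lambda: reader.next_batches(5), None):
    for df in batch:
        count += 1
        df.write_excel(os.path.join(output_dir, f"batch_{count}.xlsx"))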
Here is a full example that parallelizes the Excel writing with multiprocessing and then reads the files back to check the row counts:

import polars as pl
from multiprocessing import get_context
from pathlib import Path

def create_sample_input(filename="input.csv", num_lines=100_000_000):
    with open(filename, "w") as f:
        f.write("COLUMN0\n")
        f.write("\n".join(map(str, range(num_lines))))
    return filename

def split_csv():
    reader = pl.read_csv_batched(create_sample_input())
    count = 0
    batches = reader.next_batches(5)
    while batches:
        for df in batches:
            count += 1
            filename = f"batch_{count}.xlsx"
            output_file_path = output_dir / filename  # output_dir is a global set in __main__
            yield df, output_file_path
        batches = reader.next_batches(5)

def write_excel(args):
    df, filename = args
    df.write_excel(filename)

def read_excel(filename):
    return dict(filename=filename.name, height=pl.read_excel(filename).height)

if __name__ == "__main__":
    output_dir = Path(".")
    with get_context("spawn").Pool() as pool:
        # The generator reads batches in the parent process;
        # the workers only do the (slow) Excel writing in parallel.
        pool.map(write_excel, (args for args in split_csv()))
        df = pl.DataFrame(
            pool.map(read_excel, output_dir.glob("batch_*.xlsx"))
        )
        print(df)
shape: (2_008, 2)
┌─────────────────┬────────┐
│ filename ┆ height │
│ --- ┆ --- │
│ str ┆ i64 │
╞═════════════════╪════════╡
│ batch_709.xlsx ┆ 50001 │
│ batch_359.xlsx ┆ 50001 │
│ batch_1380.xlsx ┆ 50001 │
│ batch_1695.xlsx ┆ 50001 │
│ … ┆ … │
│ batch_1194.xlsx ┆ 50001 │
│ batch_1481.xlsx ┆ 50001 │
│ batch_1878.xlsx ┆ 50001 │
│ batch_458.xlsx ┆ 50001 │
└─────────────────┴────────┘
I'm not sure why the batch sizes vary:
>>> df['height'].value_counts()
shape: (9, 2)
┌────────┬────────┐
│ height ┆ counts │
│ --- ┆ --- │
│ i64 ┆ u32 │
╞════════╪════════╡
│ 28572 ┆ 13 │
│ 50001 ┆ 1984 │
│ 43751 ┆ 5 │
│ 35186 ┆ 1 │
│ 13457 ┆ 1 │
│ 45658 ┆ 1 │
│ 33334 ┆ 1 │
│ 47121 ┆ 1 │
│ 33069 ┆ 1 │
└────────┴────────┘
But no input is lost.
>>> df["height"].sum()
100000000
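As a footnote: if every output file must contain exactly 50,000 rows (except possibly the last one), one option is to buffer the batches and re-slice them yourself. The rechunk helper below is a hypothetical sketch, not part of the answer above; it assumes a handful of batches fits in memory at a time:

import polars as pl

def rechunk(reader, rows_per_file=50_000):
    # Accumulate incoming batches in a buffer and emit exact-size
    # slices; only the final chunk may be smaller than rows_per_file.
    buffer = None
    batches = reader.next_batches(5)
    while batches:
        for df in batches:
            buffer = df if buffer is None else pl.concat([buffer, df])
            while buffer.height >= rows_per_file:
                yield buffer.head(rows_per_file)
                buffer = buffer.slice(rows_per_file)
        batches = reader.next_batches(5)
    if buffer is not None and buffer.height > 0:
        yield buffer

for count, df in enumerate(rechunk(pl.read_csv_batched("input.csv")), start=1):
    df.write_excel(f"batch_{count}.xlsx")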