test.csv
27-Mar-12,8.25,8.35,8.17,8.19,9801989
26-Mar-12,8.16,8.25,8.12,8.24,8694416
23-Mar-12,8.05,8.12,7.95,8.09,8149170
pandas
?
datetime
date formatted stringpandas
this can be accomplished with a single (long) line of code, not including the import.
parse_date
can be very slow, if date_parser
isn't used.import pandas as pd
(pd.read_csv('test.csv', header=None, parse_dates=[0], date_parser=lambda t: pd.to_datetime(t, format='%d-%b-%y'))
.rename(columns={0: 'date'})
.sort_values('date')
.to_csv('test.csv', index=False))
date,1,2,3,4,5
2012-03-23,8.05,8.12,7.95,8.09,8149170
2012-03-26,8.16,8.25,8.12,8.24,8694416
2012-03-27,8.25,8.35,8.17,8.19,9801989
pandas
for this task.pandas
.pandas
.numpy
, but the primary point of the question, is to accomplish this task with only packages from the standard library.pandas to CSV with no Index pandas DataFrame to CSV with no index can be done by using index=False param of to_csv() method. With this, you can specify ignore index while writing/exporting DataFrame to CSV file.
To sort CSV by multiple columns, use the sort_values() method. Sorting by multiple columns means if one of the columns has repeated values, then the sort order depends on the 2nd column mentioned under sort_values() method.
Read A CSV File Using Python There are two common ways to read a . csv file when using Python. The first by using the csv library, and the second by using the pandas library.
Using as few imports as I can:
from datetime import datetime
def format_date(date: str) -> str:
formatted_date = datetime.strptime(date, "%d-%b-%y").date().isoformat()
return formatted_date
# read in the CSV
with open("test.csv", "r") as file:
lines = file.readlines()
records = [[value for value in line.split(",")] for line in lines]
# reformat the first field in each record
for record in records:
record[0] = format_date(record[0])
# having formatted the dates, sort records by first (date) field:
sorted_records = sorted(records, key = lambda r: r[0])
# join values with commas once more, removing newline characters
prepared_lines = [",".join(record).strip("\n") for record in sorted_records]
# create a header row
field_names = "date,1,2,3,4,5"
# prepend the header row
prepared_lines.insert(0, field_names)
prepared_data = "\n".join(prepared_lines)
# write out the CSV
with open("test.csv", "w") as file:
file.write(prepared_data)
pandas
is, by far, the easier tool for parsing and cleaning files.pandas
, took 11 lines of code, and requires a for-loop
.csv
& datetime
.seek
& .truncate
list()
was used to unpack the csv.reader
object, but that was removed, to update the date value, while iterating through the reader
.sorted
to customize the sort order, but I do not see a way to return a value from the lambda
expression.
key=lambda row: datetime.strptime(row[0], '%Y-%m-%d')
was used, but has been removed, since the updated date column doesn't contain month names.import csv
from datetime import datetime
# open the file for reading and writing
with open('test1.csv', mode='r+', newline='') as f:
# create a reader and writer opbject
reader, writer = csv.reader(f), csv.writer(f)
data = list()
# iterate through the reader and update column 0 to a datetime date string
for row in reader:
# update column 0 to a datetime date string
row[0] = datetime.strptime(row[0], "%d-%b-%y").date().isoformat()
# append the row to data
data.append(row)
# sort all of the rows, based on date, with a lambda expression
data = sorted(data, key=lambda row: row[0])
# change the stream position to the given byte offset
f.seek(0)
# truncate the file size
f.truncate()
# add a header to data
data.insert(0, ['date', 1, 2, 3, 4, 5])
# write data to the file
writer.writerows(data)
test.csv
date,1,2,3,4,5
2012-03-23,8.05,8.12,7.95,8.09,8149170
2012-03-26,8.16,8.25,8.12,8.24,8694416
2012-03-27,8.25,8.35,8.17,8.19,9801989
%time
testimport pandas
import pandas_datareader as web
# test data with 1M rows
df = web.DataReader(ticker, data_source='yahoo', start='1980-01-01', end='2020-09-27').drop(columns=['Adj Close']).reset_index().sort_values('High', ascending=False)
df.Date = df.Date.dt.strftime('%d-%b-%y')
df = pd.concat([df]*100)
df.to_csv('test.csv', index=False, header=False)
# pandas test with date_parser
%time pandas_test('test.csv')
[out]:
Wall time: 17.9 s
# pandas test without the date_parser parameter
%time pandas_test('test.csv')
[out]:
Wall time: 1min 17s
# from Paddy Alton
%time paddy('test.csv')
[out]:
Wall time: 15.9 s
# from Trenton
%time trenton('test.csv')
[out]:
Wall time: 17.7 s
# from sammywemmy with functions updated to return the correct date format
%time sammy('test.csv')
[out]:
Wall time: 22.2 s
%time sammy2('test.csv')
[out]:
Wall time: 22.2 s
from operator import itemgetter
import csv
import pandas as pd
from datetime import datetime
def pandas_test(file):
(pd.read_csv(file, header=None, parse_dates=[0], date_parser=lambda t: pd.to_datetime(t, format='%d-%b-%y'))
.rename(columns={0: 'date'})
.sort_values('date')
.to_csv(file, index=False))
def trenton(file):
with open(file, mode='r+', newline='') as f:
reader, writer = csv.reader(f), csv.writer(f)
data = list()
for row in reader:
row[0] = datetime.strptime(row[0], "%d-%b-%y").date().isoformat()
data.append(row)
data = sorted(data, key=lambda row: row[0])
f.seek(0)
f.truncate()
data.insert(0, ['date', 1, 2, 3, 4, 5])
writer.writerows(data)
def paddy(file):
def format_date(date: str) -> str:
formatted_date = datetime.strptime(date, "%d-%b-%y").date().isoformat()
return formatted_date
with open(file, "r") as f:
lines = f.readlines()
records = [[value for value in line.split(",")] for line in lines]
for record in records:
record[0] = format_date(record[0])
sorted_records = sorted(records, key = lambda r: r[0])
prepared_lines = [",".join(record).strip("\n") for record in sorted_records]
field_names = "date,1,2,3,4,5"
prepared_lines.insert(0, field_names)
prepared_data = "\n".join(prepared_lines)
with open(file, "w") as f:
f.write(prepared_data)
def sammy(file):
# updated with .date().isoformat() to return the correct format
with open(file) as csvfile:
fieldnames = ["date", 1, 2, 3, 4, 5]
reader = csv.DictReader(csvfile, fieldnames=fieldnames)
mapping = list(reader)
mapping = [
{
key: datetime.strptime(value, ("%d-%b-%y")).date().isoformat()
if key == "date" else value
for key, value in entry.items()
}
for entry in mapping
]
mapping = sorted(mapping, key=itemgetter("date"))
with open(file, mode="w", newline="") as csvfile:
fieldnames = mapping[0].keys()
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()
for row in mapping:
writer.writerow(row)
def sammy2(file):
# updated with .date().isoformat() to return the correct format
with open(file) as csvfile:
reader = csv.reader(csvfile, delimiter=",")
mapping = dict(enumerate(reader))
num_of_cols = len(mapping[0])
fieldnames = ["date" if n == 0 else n
for n in range(num_of_cols)]
mapping = [
[ datetime.strptime(val, "%d-%b-%y").date().isoformat()
if ind == 0 else val
for ind, val in enumerate(value)
]
for key, value in mapping.items()
]
mapping = sorted(mapping, key=itemgetter(0))
with open(file, mode="w", newline="") as csvfile:
csvwriter = csv.writer(csvfile, delimiter=",")
csvwriter.writerow(fieldnames)
for row in mapping:
csvwriter.writerow(row)
As the OP states, Pandas makes this easy; an alternative is to use the DictReader and DictWriter options; it still is more verbose than using Pandas (beauty of abstraction here, Pandas does the heavy lifting for us).
import csv
from datetime import datetime
from operator import itemgetter
with open("test.csv") as csvfile:
fieldnames = ["date", 1, 2, 3, 4, 5]
reader = csv.DictReader(csvfile, fieldnames=fieldnames)
mapping = list(reader)
mapping = [
{
key: datetime.strptime(value, ("%d-%b-%y"))
if key == "date" else value
for key, value in entry.items()
}
for entry in mapping
]
mapping = sorted(mapping, key=itemgetter("date"))
with open("test.csv", mode="w", newline="") as csvfile:
fieldnames = mapping[0].keys()
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()
for row in mapping:
writer.writerow(row)
Since the fieldnames are not known ahead of time, we could use the csvreader and csvwriter options:
with open("test.csv") as csvfile:
reader = csv.reader(csvfile, delimiter=",")
mapping = dict(enumerate(reader))
num_of_cols = len(mapping[0])
fieldnames = ["date" if n == 0 else n
for n in range(num_of_cols)]
mapping = [
[ datetime.strptime(val, "%d-%b-%y")
if ind == 0 else val
for ind, val in enumerate(value)
]
for key, value in mapping.items()
]
mapping = sorted(mapping, key=itemgetter(0))
with open("test.csv", mode="w", newline="") as csvfile:
csvwriter = csv.writer(csvfile, delimiter=",")
csvwriter.writerow(fieldnames)
for row in mapping:
csvwriter.writerow(row)
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With