I have to process some xlsx
received from external source. Is there a more straightforward way to load a xlsx
in pandas
while also skipping rows with strikethrough?
Currently I have to do something like this:
import pandas as pd, openpyxl
working_file = r"something.xlsx"
working_wb = openpyxl.load_workbook(working_file, data_only=True)
working_sheet = working_wb.active
empty = []
for row in working_sheet.iter_rows("B", row_offset=3):
for cell in row:
if cell.font.strike is True:
p_id = working_sheet.cell(row=cell.row, column=37).value
empty.append(p_id)
df = pd.read_excel(working_file, skiprows=3)
df = df[~df["ID"].isin(empty)]
...
Which works but only by going through every excel sheet twice.
Ended up subclassing pd.ExcelFile
and _OpenpyxlReader
. It was easier than I thought :)
import pandas as pd
from pandas.io.excel._openpyxl import _OpenpyxlReader
from pandas._typing import Scalar
from typing import List
from pandas.io.excel._odfreader import _ODFReader
from pandas.io.excel._xlrd import _XlrdReader
class CustomReader(_OpenpyxlReader):
def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]:
data = []
for row in sheet.rows:
first = row[1] # I need the strikethrough check on this cell only
if first.value is not None and first.font.strike: continue
else:
data.append([self._convert_cell(cell, convert_float) for cell in row])
return data
class CustomExcelFile(pd.ExcelFile):
_engines = {"xlrd": _XlrdReader, "openpyxl": CustomReader, "odf": _ODFReader}
With the custom classes set, now just pass the files like a normal ExcelFile
, specify the engine to openpyxl
and voila! Rows with strikethrough cells are gone.
excel = CustomExcelFile(r"excel_file_name.xlsx", engine="openpyxl")
df = excel.parse()
print (df)
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With