This is my first time trying to parse XML to CSV using Python. I need help handling the case where I have multiple customers that do not all have the same child elements. When a customer does not have a child element, I want the corresponding CSV column to be populated with 'Empty'. I want 'Empty' to act as a placeholder so that the values that do exist end up in the correct columns.
Example of what's happening: notice how the data in the second row, which is supposed to be in the zipcode, street, and number fields, is being squeezed into the previous columns where no values were found. !https://i.sstatic.net/Ik5J1.jpg!
Here's an example of what I'm trying to do. As you will see, 'Empty' is just a placeholder: !https://i.sstatic.net/uByjT.jpg!
Here's my python code:
import xml.etree.ElementTree as ET
import csv

tree = ET.parse(r'C:\Documents\cat.xml')
root = tree.getroot()

# Create header row object
header_row = []
# Set count to 0; the header is only built while count == 0, i.e. from the
# first <Customer> element encountered.
count = 0

# Open the file for writing. The with-block closes the file even if writing a
# row raises; newline='' is what the csv module asks for on Python 3 so that
# no blank lines appear between rows on Windows.
with open(r'C:\Users\Kris\Documents\customerdata.csv', 'w', newline='') as CustomerData:
    # Create the csv writer object
    csvwriter = csv.writer(CustomerData)

    # Find tags and text
    for node in tree.iter('Customer'):
        data = []

        if count == 0:
            # Header: one column per tag found under the first customer.
            for customerid in node.iter('Id_Customer'):
                customer = customerid.tag
                header_row.append(customer)
            for segmentid in node.iter('Segment'):
                segment = segmentid.tag
                header_row.append(segment)
            for event in node.iter('Event'):
                for natureid in event.iter('Nature'):
                    nature = natureid.tag
                    header_row.append(nature)
            for event2 in node.iter('Event'):
                for Extrainfoid in event2.iter('Extrainfo'):
                    extrainfo = Extrainfoid.tag
                    header_row.append(extrainfo)
            # NOTE(review): the sample XML spells this tag 'Adress'; the name
            # used here must match the real document exactly.
            for address in node.iter('Address'):
                for zipcode in address.iter('zipcode'):
                    zipcd = zipcode.tag
                    header_row.append(zipcd)
            for address in node.iter('Address'):
                for streetname in address.iter('street'):
                    street = streetname.tag
                    header_row.append(street)
            for address in node.iter('Address'):
                for number in address.iter('number'):
                    num = number.tag
                    # Fix: the 'number' tag was computed but never added to
                    # the header row.
                    header_row.append(num)
            csvwriter.writerow(header_row)
            count = count + 1

        # Data: a cell is appended only for elements that exist, so a customer
        # that lacks an element gets a shorter row and its remaining values
        # shift left (the behaviour the question is asking about).
        for customerid in node.iter('Id_Customer'):
            customertxt = customerid.text
            data.append(customertxt)
        for segmentid in node.iter('Segment'):
            segmenttxt = segmentid.text
            data.append(segmenttxt)
        for event in node.iter('Event'):
            for natureid in event.iter('Nature'):
                naturetxt = natureid.text
                data.append(naturetxt)
        for event2 in node.iter('Event'):
            for Extrainfoid in event2.iter('Extrainfo'):
                extrainfotxt = Extrainfoid.text
                data.append(extrainfotxt)
        for address in node.iter('Address'):
            for zipcode in address.iter('zipcode'):
                zipcdtxt = zipcode.text
                data.append(zipcdtxt)
        for address in node.iter('Address'):
            for streetname in address.iter('street'):
                streettxt = streetname.text
                # Fix: the street text belongs in this customer's data row,
                # not in header_row (which has already been written).
                data.append(streettxt)
        for address in node.iter('Address'):
            for number in address.iter('number'):
                numtxt = number.text
                data.append(numtxt)

        csvwriter.writerow(data)
Here is an example of XML code that is similar to mine with different elements. It's not the real xml code that I'm using, just an example of how a customer can have multiple elements that another customer does not. Please note in my actual process with my xml files the headers and everything are displaying properly in my csv file, I just need to create an 'Empty' when the element does not actually have a value for that particular customer.
<CAT>
<Header>...</Header>
<Add>...</Add>
<Customer>
<Id_Customer>xyz1</Id_Customer>
<Segment>abc1</Segment>
<Event>
<Nature>info1</Nature>
<Extrainfo>info2</Extrainfo>
</Event>
</Customer>
<Customer>
<Id_Customer>zzwy</Id_Customer>
<Segment>c2</Segment>
<Adress>
<zipcode>77098</zipcode>
<street>belaire drive</street>
<number>5</number>
</Adress>
</Customer>
...
You could create a list containing all the mappings you want. Try and search for each, and if it is not present, catch the AttributeError and store an empty value for it:
import xml.etree.ElementTree as ET
import csv

# (CSV column name, element path relative to each <Customer>) in output order.
fields = [
    ('Id_Customer', 'Id_Customer'),
    ('Segment', 'Segment'),
    ('Nature', 'Event/Nature'),
    ('Extrainfo', 'Event/Extrainfo'),
    ('zipcode', 'Adress/zipcode'),
    ('street', 'Adress/street'),
    ('number', 'Adress/number'),
]

tree = ET.parse('cat.xml')

# Text mode with newline='' is what the csv module requires on Python 3;
# the original binary mode ('wb') makes the writer fail with a TypeError there.
with open(r'customerdata.csv', 'w', newline='') as f_customerdata:
    csv_customerdata = csv.DictWriter(
        f_customerdata, fieldnames=[field for field, match in fields])
    csv_customerdata.writeheader()

    for node in tree.iter('Customer'):
        row = {}

        for field_name, match in fields:
            try:
                row[field_name] = node.find(match).text
            except AttributeError:
                # find() returned None: this customer has no such element,
                # so keep the column in place with an empty value.
                row[field_name] = ''

        csv_customerdata.writerow(row)
Giving you an output CSV file containing:
Id_Customer,Segment,Nature,Extrainfo,zipcode,street,number
xyz1,abc1,info1,info2,,,
zzwy,c2,,,77098,belaire drive,5
This approach also uses a DictWriter() instead of the standard csv writer. This makes it easier to assign values by name.
To cope with multiple address entries per customer, you first need to autocreate the maximum number of extra columns per entry. Then when accessing the elements, use findall() to get each one:
import xml.etree.ElementTree as ET
import csv

# How many additional columns to reserve for a field that can occur more than
# once per customer (2 extra columns -> zipcode, zipcode2, zipcode3).
extra_columns = 2

# (CSV column name, element path relative to each <Customer>, number of extra
# columns). 0 extra columns marks a field that occurs at most once.
fields = [
    ('Id_Customer', 'Id_Customer', 0),
    ('Segment', 'Segment', 0),
    ('Nature', 'Event/Nature', 0),
    ('Extrainfo', 'Event/Extrainfo', 0),
    ('zipcode', 'Adress/zipcode', extra_columns),
    ('street', 'Adress/street', extra_columns),
    ('number', 'Adress/number', extra_columns),
]

tree = ET.parse('cat.xml')

# Auto create the header from fields: the base name first, then name2, name3,
# ... for each extra column that field reserves.
fieldnames = []
for field, match, extras in fields:
    fieldnames.append(field)
    fieldnames.extend("{}{}".format(field, n) for n in range(2, extras + 2))

# Text mode with newline='' is what the csv module requires on Python 3;
# the original binary mode ('wb') makes the writer fail with a TypeError there.
with open(r'customerdata.csv', 'w', newline='') as f_customerdata:
    csv_customerdata = csv.DictWriter(f_customerdata, fieldnames=fieldnames)
    csv_customerdata.writeheader()

    for node in tree.iter('Customer'):
        row = {}

        for field_name, match, extras in fields:
            if extras:
                # First match goes in the base column, later ones in name2,
                # name3, ... Columns with no match are left out of the row and
                # DictWriter fills them with its default empty restval.
                # NOTE: more matches than reserved columns produce a key that
                # is not in fieldnames, which DictWriter rejects with a
                # ValueError - raise extra_columns if that happens.
                for index, el in enumerate(node.findall(match)):
                    if index:
                        row["{}{}".format(field_name, index + 1)] = el.text
                    else:
                        row[field_name] = el.text
            else:
                try:
                    row[field_name] = node.find(match).text
                except AttributeError:
                    # find() returned None: the element is absent for this
                    # customer, so write an empty cell.
                    row[field_name] = ''

        csv_customerdata.writerow(row)
So your header would now look like:
Id_Customer,Segment,Nature,Extrainfo,zipcode,zipcode2,zipcode3,street,street2,street3,number,number2,number3
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With