Python Parsing XML to CSV with missing Elements

Question

This is my first time trying to parse XML to CSV using Python. I need help handling multiple customers that do not all have the same child elements. When a customer does not have a child element, I want the corresponding CSV column to be populated with 'Empty'. 'Empty' should act as a placeholder so that the values that do exist end up in the correct columns.

Example of what's happening: notice how the data in the second row, which is supposed to be in the zipcode, street, and number fields, is being squeezed into earlier columns where no values were found. !https://i.sstatic.net/Ik5J1.jpg!

Here's an example of what I'm trying to do; as you can see, 'Empty' is just a placeholder: !https://i.sstatic.net/uByjT.jpg!

Here's my python code:

import xml.etree.ElementTree as ET
import csv

# Script purpose: read every <Customer> element from an XML file and write one
# CSV row per customer, preceded by a header row built from the first customer.
# NOTE(review): in this listing the body of the `for node in ...` loop below
# appears at column 0 (and the `if count == 0:` body at one level), so the
# original indentation of the loop body looks to have been lost when pasted.

# Parse the XML document; `root` is not referenced again in this listing
# (iteration below goes through `tree`).
tree = ET.parse(r'C:\Documents\cat.xml')
root = tree.getroot()

#Open the file for writing
# The handle is closed explicitly by CustomerData.close() at the end.

CustomerData = open(r'C:\Users\Kris\Documents\customerdata.csv', 'w')

#Create header row object
# Collects the tag names that become the CSV column headings.

header_row = []

#Create the csv writer object

csvwriter = csv.writer(CustomerData)

#Set count to 0
# Acts as a "header already written" flag: the header is built only while
# count is still 0 and count is incremented right after writing it.

count = 0

#Find tags and text

for node in tree.iter('Customer'):
# Values for the current customer's CSV row.
data = []
if count == 0:
    # Header pass (first customer only). Each loop appends a tag name only if
    # a matching element exists under this customer, so the header contains
    # only the fields the first customer happens to have.
    for customerid in node.iter('Id_Customer'):
        customer = customerid.tag
        header_row.append(customer)
    for segmentid in node.iter('Segment'):
        segment = segmentid.tag
        header_row.append(segment)
    for event in node.iter('Event'):
        for natureid in event.iter('Nature'):
            nature = natureid.tag
            header_row.append(nature)
    for event2 in node.iter('Event'):
        for Extrainfoid in event2.iter('Extrainfo'):
            extrainfo = Extrainfoid.tag
            header_row.append(extrainfo)
    for address in node.iter('Address'):
        for zipcode in address.iter('zipcode'):
            zipcd = zipcode.tag
            header_row.append(zipcd)
    for address in node.iter('Address'):
        for streetname in address.iter('street'):
            street = streetname.tag
            header_row.append(street)
    for address in node.iter('Address'):
        for number in address.iter('number'):
            # NOTE(review): `num` is assigned but never appended to
            # header_row, so no 'number' heading is written.
            num = number.tag
    csvwriter.writerow(header_row)
    count = count + 1

# Data pass (every customer). A value is appended only when its element
# exists; nothing is appended for a missing element, so the row comes out
# shorter and later values land in earlier columns.
for customerid in node.iter('Id_Customer'):
    customertxt = customerid.text
    data.append(customertxt)
for segmentid in node.iter('Segment'):
    segmenttxt = segmentid.text
    data.append(segmenttxt)
for event in node.iter('Event'):
    for natureid in event.iter('Nature'):
        naturetxt = natureid.text
        data.append(naturetxt)
for event2 in node.iter('Event'):
    for Extrainfoid in event2.iter('Extrainfo'):
        extrainfotxt = Extrainfoid.text
        data.append(extrainfotxt)
for address in node.iter('Address'):
    for zipcode in address.iter('zipcode'):
        zipcdtxt = zipcode.text
        data.append(zipcdtxt)
for address in node.iter('Address'):
    for streetname in address.iter('street'):
        streettxt = streetname.text
        # NOTE(review): this appends the street text to header_row, not to
        # data, so the street value never reaches the row written below.
        header_row.append(streettxt)
for address in node.iter('Address'):
    for number in address.iter('number'):
        numtxt = number.text
        data.append(numtxt)
csvwriter.writerow(data)

CustomerData.close()

Here is an example of XML that is similar to mine but with different elements. It's not the real XML I'm using, just an example of how one customer can have elements that another customer does not. Please note that in my actual process with my XML files the headers and everything else display properly in my CSV file; I just need to write 'Empty' when an element does not actually have a value for that particular customer.

<CAT>
 <Header>...</Header>
 <Add>...</Add>
 <Customer>
  <Id_Customer>xyz1</Id_Customer>
  <Segment>abc1</Segment>
  <Event>
   <Nature>info1</Nature>
   <Extrainfo>info2</Extrainfo>
  </Event>
</Customer>
<Customer>
 <Id_Customer>zzwy</Id_Customer>
 <Segment>c2</Segment>
 <Adress>
  <zipcode>77098</zipcode>
  <street>belaire drive</street>
  <number>5</number>
 </Adress>
</Customer>

...

Martin Evans · Accepted Answer

You could create a list containing all the mappings you want. Try to find each one, and if it is not present, catch the resulting AttributeError and store an empty value for it:

import xml.etree.ElementTree as ET
import csv

# Each entry maps a CSV column name to the ElementTree path (relative to a
# <Customer> element) whose text fills that column.
fields = [
    ('Id_Customer', 'Id_Customer'),
    ('Segment', 'Segment'),
    ('Nature', 'Event/Nature'),
    ('Extrainfo', 'Event/Extrainfo'),
    ('zipcode', 'Adress/zipcode'),
    ('street', 'Adress/street'),
    ('number', 'Adress/number'),
]

tree = ET.parse('cat.xml')
root = tree.getroot()

# On Python 3 the csv module writes str, so the file must be opened in text
# mode (not 'wb'). newline='' hands line-ending control to the csv writer;
# without it, platforms that use \r\n get an extra \r on every row.
with open('customerdata.csv', 'w', newline='') as f_customerdata:
    csv_customerdata = csv.DictWriter(
        f_customerdata, fieldnames=[name for name, _ in fields])
    csv_customerdata.writeheader()

    # One CSV row per <Customer> element, in document order.
    for node in root.iter('Customer'):
        row = {}

        for field_name, match in fields:
            try:
                row[field_name] = node.find(match).text
            except AttributeError:
                # find() returned None: this customer has no such element,
                # so keep the column and leave it empty.
                row[field_name] = ''

        csv_customerdata.writerow(row)

Giving you an output CSV file containing:

Id_Customer,Segment,Nature,Extrainfo,zipcode,street,number
xyz1,abc1,info1,info2,,,
zzwy,c2,,,77098,belaire drive,5

This approach also uses a DictWriter() instead of the standard csv writer. This makes it easier to assign values by name.

To cope with multiple address entries per customer, you first need to automatically create the maximum number of extra columns per entry. Then, when accessing the elements, use findall() to get each one:

import xml.etree.ElementTree as ET
import csv

# Number of additional columns generated for each repeatable field
# (so a repeatable field occupies 1 + extra_columns columns in total).
extra_columns = 2

# (CSV column name, ElementTree path relative to <Customer>, column count).
# A count of 1 marks a single-valued field; any count greater than 1 marks a
# repeatable field. Because the test below is `cols > 1`, extra_columns must
# be at least 2 for the extra columns to be generated.
fields = [
    ('Id_Customer', 'Id_Customer', 1),
    ('Segment', 'Segment', 1),
    ('Nature', 'Event/Nature', 1),
    ('Extrainfo', 'Event/Extrainfo', 1),
    ('zipcode', 'Adress/zipcode', extra_columns),
    ('street', 'Adress/street', extra_columns),
    ('number', 'Adress/number', extra_columns),
]

tree = ET.parse('cat.xml')
root = tree.getroot()

# Auto create the header from fields: the base name first, then numbered
# variants starting at 2 (e.g. zipcode, zipcode2, zipcode3).
fieldnames = []

for field, match, cols in fields:
    fieldnames.append(field)

    if cols > 1:
        fieldnames.extend(
            "{}{}".format(field, x + 2) for x in range(extra_columns))

# On Python 3 the csv module writes str, so the file must be opened in text
# mode (not 'wb'). newline='' hands line-ending control to the csv writer;
# without it, platforms that use \r\n get an extra \r on every row.
with open('customerdata.csv', 'w', newline='') as f_customerdata:
    csv_customerdata = csv.DictWriter(f_customerdata, fieldnames=fieldnames)
    csv_customerdata.writeheader()

    # One CSV row per <Customer> element, in document order.
    for node in root.iter('Customer'):
        row = {}

        for field_name, match, cols in fields:
            if cols > 1:
                # Repeatable field: the first match goes in the base column,
                # later matches in the numbered ones. Columns that get no
                # match are simply left out of `row`; DictWriter writes them
                # as empty (its default restval is ''). More matches than
                # generated columns produce a key that is not in fieldnames,
                # which DictWriter rejects with ValueError.
                for position, el in enumerate(node.findall(match), start=1):
                    if position == 1:
                        row[field_name] = el.text
                    else:
                        row["{}{}".format(field_name, position)] = el.text
            else:
                try:
                    row[field_name] = node.find(match).text
                except AttributeError:
                    # find() returned None: this customer has no such
                    # element, so keep the column and leave it empty.
                    row[field_name] = ''

        csv_customerdata.writerow(row)

So your header would now look like:

Id_Customer,Segment,Nature,Extrainfo,zipcode,zipcode2,zipcode3,street,street2,street3,number,number2,number3

Python Parsing XML to CSV with missing Elements

Tags:

python

parsing

xml

csv

User214122114

1 Answer

Martin Evans

Recent Activity

Donate For Us

Python Parsing XML to CSV with missing Elements

Tags:

python

parsing

xml

csv

User214122114

1 Answer

Martin Evans

Related questions

Recent Activity

Donate For Us