I have been trying to build a nested tree-like structure for two days and decided to ask here for help. Suppose I have data like this:
rows = [
{'Year': None, 'Region': None, 'Country': None, 'Manufacturer': None, 'Brand': None, 'Sales': 25}, # row 1 => SUM of (row 2 and row 14) = 15+25 = 40; this row represents, for example, all of the sales made so far (the ultimate total, if you will call it as such)
{'Year': 2013, 'Region': None, 'Country': None, 'Manufacturer': None, 'Brand': None, 'Sales': 15}, # row 2 => SUM of sales from (row 3) = 15; this row represents, for example, the total of sales in 2013 from all regions, all countries, all manufacturers and all brands
{'Year': 2013, 'Region': 'LTM', 'Country': None, 'Manufacturer': None, 'Brand': None, 'Sales': 15}, #row 3 => SUM of sales from (row 4) = 15; this row represents, for example, the total of sales in LTM region for 2013
{'Year': 2013, 'Region': 'LTM', 'Country': 'Colombia', 'Manufacturer': None, 'Brand': None, 'Sales': 15}, # row 4 => SUM of sales from (row 5+row 7+row 10+row12) = 1+5+4+5 = 15; this row represents, for example, the total of Sales in Colombia for 2013
{'Year': 2013, 'Region': 'LTM', 'Country': 'Colombia', 'Manufacturer': 'M1', 'Brand': None, 'Sales': 1}, # row 5 => SUM of sales from (row 6) = 1
{'Year': 2013, 'Region': 'LTM', 'Country': 'Colombia', 'Manufacturer': 'M1', 'Brand': 'B1', 'Sales': 1}, # row 6 => Nothing to sum here because this is the lowest hierarchy
{'Year': 2013, 'Region': 'LTM', 'Country': 'Colombia', 'Manufacturer': 'M2', 'Brand': None, 'Sales': 5}, # row 7 => SUM of sales from (row 8 and row 9) = 2+3 = 5
{'Year': 2013, 'Region': 'LTM', 'Country': 'Colombia', 'Manufacturer': 'M2', 'Brand': 'B2', 'Sales': 2}, # row 8 => Nothing to sum here
{'Year': 2013, 'Region': 'LTM', 'Country': 'Colombia', 'Manufacturer': 'M2', 'Brand': 'B3', 'Sales': 3}, # row 9 => Nothing to sum here
{'Year': 2013, 'Region': 'LTM', 'Country': 'Colombia', 'Manufacturer': 'M3', 'Brand': None, 'Sales': 4}, # row 10 => SUM of sales from (row 11) = 4
{'Year': 2013, 'Region': 'LTM', 'Country': 'Colombia', 'Manufacturer': 'M3', 'Brand': 'B4', 'Sales': 4}, # row 11 => Nothing to sum here
{'Year': 2013, 'Region': 'LTM', 'Country': 'Colombia', 'Manufacturer': 'M4', 'Brand': None, 'Sales': 5}, # row 12 => SUM of sales from (row 13) = 5
{'Year': 2013, 'Region': 'LTM', 'Country': 'Colombia', 'Manufacturer': 'M4', 'Brand': 'B5', 'Sales': 5}, # row 13 => Nothing to sum here
{'Year': 2014, 'Region': None, 'Country': None, 'Manufacturer': None, 'Brand': None, 'Sales': 25}, # row 14 => SUM of sales from (row 15) = 25; represents total sales in 2014 from all regions, all countries, all manufacturers and all brands
{'Year': 2014, 'Region': 'LTM', 'Country': None, 'Manufacturer': None, 'Brand': None, 'Sales': 25}, # row 15 => SUM of sales from (row 16+row 18) = 15+10 = 25; represents total sales in 2014 from Chile and Colombia combined
{'Year': 2014, 'Region': 'LTM', 'Country': 'Chile', 'Manufacturer': None, 'Brand': None, 'Sales': 15}, # ** TRICKY: row 16 => SUM of sales from (row 17+row 20+row 21) = 0+5+10 = 15; total sales in 2014 for Chile
{'Year': 2014, 'Region': 'LTM', 'Country': 'Chile', 'Manufacturer': 'M1', 'Brand': None, 'Sales': 15}, # row 17
{'Year': 2014, 'Region': 'LTM', 'Country': 'Colombia', 'Manufacturer': None, 'Brand': None, 'Sales': 10}, # row 18 => SUM of sales from (row 19) = 10; total sales in 2014 for Colombia
{'Year': 2014, 'Region': 'LTM', 'Country': 'Colombia', 'Manufacturer': 'M1', 'Brand': None, 'Sales': 10}, # row 19
{'Year': 2014, 'Region': 'LTM', 'Country': 'Chile', 'Manufacturer': 'M1', 'Brand': 'B1', 'Sales': 5}, # row 20
{'Year': 2014, 'Region': 'LTM', 'Country': 'Chile', 'Manufacturer': 'M1', 'Brand': 'B6', 'Sales': 10}, # row 21
# more data...
]
I am trying to write a function/method that has signature like this:
def build_tree(rows, hierarchy):
pass # still can't get it right after ~2 days of trying
In the above signature, hierarchy
is defined as: any combination of ['Year']+[any from the 'Region','Country','Manufacturer' and 'Brand']
. So for example, these are all legitimate hierarchy of the desired tree: ['Year','Region','Country']
or ['Year','Country','Manufacturer']
or ['Year','Country','Brand']
.
Suppose, hierarchy=['Year','Country','Manufacturer']
and the input rows
are the 21 visible ones that I've described above, the output of the function should look like this:
output = [
{
"name": 2013,
"sales": 15, # total sales of 2013, which corresponds to 'Values: 15' of row #2 in input; alternatively, this "sales" can be calculated as the SUM(all "sales" of its IMMEDIATE children, which is the node with "name"="Colombia". We do NOT need to sum up the sales from children that are further down the hierarchy level such as that of 'children' from the 'Manufacturer' level)
"children": [
{
"name": "Colombia",
"sales": 15, # total sales in Colombia in 2013 which corresponds to 'Sales' of row #4 in input (please note that our input 'hierarchy' skipped 'Region', so we are not showing the aggregate value of 'Region' (row #3) here); alternatively, this "sales" can be calculated as the SUM(all "sales" in its immediate children, "name"=M1, M2, M3 and M4)
"children": [
{
"name": "M1", # total sales for Manufacturer 'M1' in 2013 which corresponds to 'Sales' of row #5 in input
"sales": 1,
"children": []
},
{
"name": "M2",
"sales": 5, # total sales for Manufacturer 'M2' in 2013 which corresponds to 'Sales' of row #7 in input
"children": []
},
{
"name": "M3",
"sales": 4, # total sales for Manufacturer 'M3' in 2013 which corresponds to 'Sales' of row #10 in input
"children": []
},
{
"name": "M4",
"sales": 5, # total sales for Manufacturer 'M4' in 2013 which corresponds to 'Sales' of row #12 in input
"children": []
}
]
}
]
},
{
"name": 2014,
"sales": 25, # sum of total sales in 2014; same as 'Sales' in row #14. Alternatively, we can just get the sum of its IMMEDIATE children, row#16 for 'Chile' and row#18 for Colombia, here
"children": [
{
"name": "Chile",
"sales": 15, # sum of total sales in 2014 for Chile, which is row #16; alternatively, we can derive this value by adding up the sales of row #17 (that is, its immediate children listed ONE hierarchy below, which is 'Manufacturer')
"children": [
{
"name": "M1",
"sales": 15, # corresponds to 'Sales' from row #17
"children": []
}
]
},
{
"name": "Colombia",
"sales": 10, # corresponds to 'Sales' from row #18, which is equivalent to the sum of total sales from all manufacturers in 'Colombia' in 2014
"children": [
{
"name": "M1",
"sales": 10, # corresponds to row #19; there's only one manufacturer reported for Colombia in 2014 in the input data
"children": []
}
]
}
]
}
]
Thanks very much in advance if you could share some tips/suggestions/answers!
This is how I see the algorithm. I hope the code is easily readable.
This assignment x0, *x = x
is Python3 syntax for detaching the first item of a list. In Python2: x0 = x[0]; x = x[1:]
There are two details you did not mention, see #comments
from collections import defaultdict
def build_tree(rows, hierarchy):
if not hierarchy:
return []
h0, *hierarchy = hierarchy
node = defaultdict(list)
for row in rows:
v0 = row[h0]
if v0 is not None: # filter out null values??
node[v0].append(row)
return [{
'name': key,
'value': None, # what is value??
'children': build_tree(subrows, hierarchy)} for key, subrows in node.items()]
You can use itertools.groupby
with recursion:
import itertools
rows = [{'Year': None, 'Region': None, 'Country': None, 'Manufacturer': None, 'Brand': None, 'Value': 1}, {'Year': 2013, 'Region': None, 'Country': None, 'Manufacturer': None, 'Brand': None, 'Value': 2}, {'Year': 2013, 'Region': 'LTM', 'Country': None, 'Manufacturer': None, 'Brand': None, 'Value': 3}, {'Year': 2013, 'Region': 'LTM', 'Country': 'Colombia', 'Manufacturer': None, 'Brand': None, 'Value': 4}, {'Year': 2013, 'Region': 'LTM', 'Country': 'Colombia', 'Manufacturer': 'M1', 'Brand': None, 'Value': 5}, {'Year': 2013, 'Region': 'LTM', 'Country': 'Colombia', 'Manufacturer': 'M1', 'Brand': 'B1', 'Value': 6}, {'Year': 2013, 'Region': 'LTM', 'Country': 'Colombia', 'Manufacturer': 'M2', 'Brand': None, 'Value': 7}, {'Year': 2013, 'Region': 'LTM', 'Country': 'Colombia', 'Manufacturer': 'M2', 'Brand': 'B2', 'Value': 8}, {'Year': 2013, 'Region': 'LTM', 'Country': 'Colombia', 'Manufacturer': 'M2', 'Brand': 'B3', 'Value': 9}, {'Year': 2013, 'Region': 'LTM', 'Country': 'Colombia', 'Manufacturer': 'M3', 'Brand': None, 'Value': 10}, {'Year': 2013, 'Region': 'LTM', 'Country': 'Colombia', 'Manufacturer': 'M3', 'Brand': 'B4', 'Value': 11}, {'Year': 2013, 'Region': 'LTM', 'Country': 'Colombia', 'Manufacturer': 'M4', 'Brand': None, 'Value': 12}, {'Year': 2013, 'Region': 'LTM', 'Country': 'Colombia', 'Manufacturer': 'M4', 'Brand': 'B5', 'Value': 13}, {'Year': 2014, 'Region': None, 'Country': None, 'Manufacturer': None, 'Brand': None, 'Value': 14}, {'Year': 2014, 'Region': 'LTM', 'Country': None, 'Manufacturer': None, 'Brand': None, 'Value': 15}, {'Year': 2014, 'Region': 'LTM', 'Country': 'Chile', 'Manufacturer': None, 'Brand': None, 'Value': 16}, {'Year': 2014, 'Region': 'LTM', 'Country': 'Chile', 'Manufacturer': 'M1', 'Brand': None, 'Value': 17}, {'Year': 2014, 'Region': 'LTM', 'Country': 'Colombia', 'Manufacturer': 'M1', 'Brand': None, 'Value': 18}, {'Year': 2014, 'Region': 'LTM', 'Country': 'Colombia', 'Manufacturer': None, 'Brand': None, 'Value': 19}, {'Year': 2014, 'Region': 'LTM', 'Country': 'Chile', 'Manufacturer': 'M1', 'Brand': 'B1', 'Value': 20}, {'Year': 2014, 'Region': 'LTM', 'Country': 'Chile', 'Manufacturer': 'M1', 'Brand': 'B6', 'Value': 21}]
def __lt__(_rows, key, current):
new_rows = list(filter(None, [i[current] for i in _rows]))
return {'int':0, 'str':''}.get(type(new_rows[0]).__name__) if key is None else key
def group_data(d, hierarchy=['Year','Country','Manufacturer']):
start, *_h = hierarchy
first = [[a, list(b)] for a, b in itertools.groupby(sorted(d, key=lambda x:__lt__(rows, x[start], start)), key=lambda x:__lt__(rows, x[start], start))]
return [{'name':a, 'value':min(b, key=lambda x:x['Value'])['Value'], 'children':[] if not _h else group_data(b, _h)} for a, b in first if a]
import json
print(json.dumps(group_data(rows), indent = 4))
Output:
[
{
"name": 2013,
"value": 2,
"children": [
{
"name": "Colombia",
"value": 4,
"children": [
{
"name": "M1",
"value": 5,
"children": []
},
{
"name": "M2",
"value": 7,
"children": []
},
{
"name": "M3",
"value": 10,
"children": []
},
{
"name": "M4",
"value": 12,
"children": []
}
]
}
]
},
{
"name": 2014,
"value": 14,
"children": [
{
"name": "Chile",
"value": 16,
"children": [
{
"name": "M1",
"value": 17,
"children": []
}
]
},
{
"name": "Colombia",
"value": 18,
"children": [
{
"name": "M1",
"value": 18,
"children": []
}
]
}
]
}
]
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With