Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Python how to process complex nested dictionaries efficiently

I have a complex nested dictionary structured like this:

example = {
    ('rem', 125): {
        ('emo', 35): {
            ('mon', 133): {
                ('ony', 33): 0
            },
            ('mor', 62): {
                ('ore', 23): 0
            },
            ('mot', 35): {
                ('ote', 22): 0
            },
            ('mos', 29): {
                ('ose', 29): 0
            }
        },
        ('emi', 32): {
            ('min', 109): {
                ('ine', 69): 0
            },
            ('mit', 58): {
                ('ite', 64): 0,
                ('ity', 45): 0
            }
        },
        ('eme', 26): {
            ('mer', 68): {
                ('ere', 24): 0,
                ('ery', 20): 0
            }
        }
    },
    ('iro', 30): {
        ('ron', 27): {
            ('oni', 94): {
                ('nic', 205): 0
            },
            ('one', 47): {
                ('ned', 86): 0,
                ('ner', 85): 0,
                ('nes', 26): 0,
                ('net', 20): 0
            },
            ('ona', 44): {
                ('nal', 246): 0
            }
        }
    },
    ('det', 122): {
        ('ete', 53): {
            ('ter', 212): {
                ('ery', 72): 0,
                ('era', 35): 0
            },
            ('ten', 65): {
                ('ene', 21): 0
            }
        }
    },
    ('uni', 217): {
        ('nin', 62): {
            ('ine', 32): {
                ('ned', 88): 0,
                ('ner', 74): 0,
                ('net', 25): 0
            }
        },
        ('nim', 31): {
            ('ima', 50): {
                ('mal', 23): 0
            }
        },
        ('niv', 25): {
            ('ive', 28): {
                ('ver', 48): 0
            }
        }
    },
    ('nat', 66): {
        ('ati', 21): {
            ('tiv', 517): {
                ('ive', 821): 0
            },
            ('tin', 230): {
                ('ine', 134): 0,
                ('ina', 22): 0
            },
            ('tic', 187): {
                ('ice', 23): 0
            },
            ('tis', 129): {
                ('ise', 25): 0
            },
            ('tiz', 59): {
                ('ize', 100): 0
            },
            ('tit', 52): {
                ('ite', 60): 0
            },
            ('tif', 43): {
                ('ify', 30): 0
            },
            ('til', 25): {
                ('ile', 79): 0,
                ('ily', 37): 0
            }
        }
    },
    ('mar', 286): {
        ('ari', 30): {
            ('rin', 156): {
                ('ine', 168): 0,
                ('ina', 24): 0
            },
            ('ris', 146): {
                ('ise', 31): 0
            },
            ('rit', 119): {
                ('ite', 174): 0,
                ('ity', 118): 0
            },
            ('ril', 63): {
                ('ily', 88): 0
            },
            ('riz', 56): {
                ('ize', 134): 0
            },
            ('ric', 30): {
                ('ice', 25): 0
            },
            ('rid', 25): {
                ('ide', 49): 0
            },
            ('rif', 24): {
                ('ify', 32): 0
            }
        },
        ('ara', 21): {
            ('rac', 84): {
                ('acy', 60): 0,
                ('ace', 33): 0
            },
            ('rat', 76): {
                ('ate', 451): 0
            },
            ('rag', 56): {
                ('age', 90): 0
            },
            ('rad', 47): {
                ('ade', 36): 0
            }
        }
    },
    ('ref', 153): {
        ('efo', 27): dict()
    },
    ('gen', 150): {
        ('ene', 56): {
            ('nes', 644): {
                ('ese', 33): 0
            },
            ('ner', 118): {
                ('ery', 37): 0
            }
        },
        ('eni', 25): {
            ('nit', 112): {
                ('ite', 200): 0,
                ('ity', 93): 0
            },
            ('nin', 59): {
                ('ine', 54): 0
            },
            ('niz', 33): {
                ('ize', 178): 0
            }
        }
    },
    ('pol', 384): {
        ('oly', 255): dict(),
        ('oli', 35): {
            ('lit', 234): {
                ('ity', 698): 0,
                ('ite', 209): 0
            },
            ('lis', 79): {
                ('ise', 28): 0
            },
            ('lin', 72): {
                ('ine', 206): 0
            },
            ('lid', 36): {
                ('ide', 21): 0
            },
            ('lif', 24): {
                ('ify', 21): 0
            },
            ('liz', 22): {
                ('ize', 209): 0
            }
        },
        ('ola', 22): {
            ('lat', 165): {
                ('ate', 422): 0
            },
            ('lar', 48): {
                ('ary', 106): 0
            },
            ('lan', 27): {
                ('ane', 24): 0
            }
        },
        ('ole', 22): {
            ('len', 60): {
                ('ene', 58): 0
            },
            ('ler', 39): {
                ('ery', 36): 0
            }
        }
    },
    ('lam', 117): {
        ('ame', 33): {
            ('mer', 46): {
                ('ere', 24): 0,
                ('ery', 20): 0
            }
        }
    },
    ('mal', 213): {
        ('ala', 64): {
            ('lac', 67): {
                ('ace', 32): 0
            },
            ('lan', 65): {
                ('ane', 24): 0
            },
            ('lat', 58): {
                ('ate', 422): 0
            },
            ('lar', 31): {
                ('ary', 106): 0
            }
        },
        ('ale', 37): {
            ('len', 90): {
                ('ene', 58): 0
            },
            ('ler', 26): {
                ('ery', 36): 0
            }
        }
    },
    ('rev', 156): {
        ('eve', 76): {
            ('ver', 139): {
                ('ery', 27): 0
            },
            ('vel', 36): {
                ('ely', 168): 0
            }
        },
        ('evi', 44): {
            ('vit', 27): {
                ('ity', 64): 0
            }
        },
        ('evo', 28): dict()
    },
    ('ura', 42): {
        ('ran', 25): {
            ('ani', 91): {
                ('nic', 76): 0
            }
        }
    },
    ('val', 78): {
        ('ale', 28): {
            ('len', 90): {
                ('ene', 58): 0
            },
            ('ler', 26): {
                ('ery', 36): 0
            }
        }
    },
    ('ven', 111): {
        ('ene', 26): {
            ('nes', 644): {
                ('ese', 33): 0
            },
            ('ner', 118): {
                ('ery', 37): 0
            }
        }
    },
    ('lat', 106): {
        ('ati', 39): {
            ('tiv', 517): {
                ('ive', 821): 0
            },
            ('tin', 230): {
                ('ine', 134): 0,
                ('ina', 22): 0
            },
            ('tic', 187): {
                ('ice', 23): 0
            },
            ('tis', 129): {
                ('ise', 25): 0
            },
            ('tiz', 59): {
                ('ize', 100): 0
            },
            ('tit', 52): {
                ('ite', 60): 0
            },
            ('tif', 43): {
                ('ify', 30): 0
            },
            ('til', 25): {
                ('ile', 79): 0,
                ('ily', 37): 0
            }
        },
        ('ate', 27): {
            ('ter', 153): {
                ('ery', 72): 0,
                ('era', 35): 0
            },
            ('tel', 117): {
                ('ely', 112): 0
            },
            ('ten', 74): {
                ('ene', 21): 0
            }
        }
    },
    ('aci', 42): {
        ('cid', 21): {
            ('idi', 49): {
                ('dic', 20): 0
            },
            ('ida', 43): {
                ('dal', 118): 0
            },
            ('ide', 43): {
                ('der', 40): 0,
                ('des', 38): 0,
                ('ded', 20): 0
            }
        }
    }
}

It is just a small fraction of the complex object, I posted such a complex example to signify how complex the object really is, the complete object can have as many as 15 nested levels from the first level of the example, and has many more first level keys.

Let me explain what this is, this is to be used in Markov chain based pseudo-word generation, each key is a tuple composed of a three letter string and a positive integer, the string is either composed of a vowel letter, a consonant letter and a vowel letter or CVC.

The string is a three-letter state, the integer represents the weight/likelihood/frequency of the state, each key of the successive level is a state that can follow the state of the previous level, a state can only follow another state iff the last two letters of the other state is the start of the first state, the number is the likelihood of transformation from the previous state to the next state.

The first level states are the states that can be found at start of words, the second level states are states that follow the previous first state as second state in words. Each last level value should be 0, it means end of nesting, in the example, if the first nesting level has a nesting value of 0 and each successive nesting level adds nesting value by 1, the last nesting level's nesting value plus three (the length of the first states) should be six (the length of the word generated by walking through the tree should be).

A word is generated by weighted random choosing a state, and weighted random choosing a state from its value, recursively, until the value is zero, then, take all letters of the first state, and the letters from the third letter on from the second state on, and join the letters.

For example, if the choices are ('uni', 'nin', 'ine', 'ned'), the chunks to be taken are 'uni', 'n', 'e', 'd' and the generated word is 'unined'.

Now I have three problems, the first problem is that the dictionary isn't sorted, this is the easiest to solve. The second is that the dictionary has empty nested dictionaries, these dictionaries mean the branch terminated prematurely, because the branches can't grow "legally", these branches are invalid and I would like to cut off the invalid branches recursively from last level to first level (from last level to first level, recursively pop keys backwards if the value dictionary is empty, until the values are no longer empty), this is a little more difficult.

And the most difficult part, there are many branches with only one subbranch, and I need to merge such subbranches with their parent branches recursively, from last level to first level backwards.

In short I need it to become this:

{
    ('acid', 42): {
        ('idal', 43): 0,
        ('ide', 43): {
            ('ded', 20): 0,
            ('der', 40): 0,
            ('des', 38): 0
        },
        ('idic', 49): 0
    },
    ('dete', 122): {
        ('tene', 65): 0,
        ('ter', 212): {
            ('era', 35): 0,
            ('ery', 72): 0
        }
    },
    ('gen', 150): {
        ('ene', 56): {
            ('nery', 118): 0,
            ('nese', 644): 0
        },
        ('eni', 25): {
            ('nine', 59): 0,
            ('nit', 112): {
                ('ite', 200): 0,
                ('ity', 93): 0
            },
            ('nize', 33): 0
        }
    },
    ('iron', 30): {
        ('onal', 44): 0,
        ('one', 47): {
            ('ned', 86): 0,
            ('ner', 85): 0,
            ('nes', 26): 0,
            ('net', 20): 0
        },
        ('onic', 94): 0
    },
    ('lamer', 117): {
        ('ere', 24): 0,
        ('ery', 20): 0
    },
    ('lat', 106): {
        ('ate', 27): {
            ('tely', 117): 0,
            ('tene', 74): 0,
            ('ter', 153): {
                ('era', 35): 0,
                ('ery', 72): 0
            }
        },
        ('ati', 39): {
            ('tice', 187): 0,
            ('tify', 43): 0,
            ('til', 25): {
                ('ile', 79): 0,
                ('ily', 37): 0
            },
            ('tin', 230): {
                ('ina', 22): 0,
                ('ine', 134): 0
            },
            ('tise', 129): 0,
            ('tite', 52): 0,
            ('tive', 517): 0,
            ('tize', 59): 0
        }
    },
    ('mal', 213): {
        ('ala', 64): {
            ('lace', 67): 0,
            ('lane', 65): 0,
            ('lary', 31): 0,
            ('late', 58): 0
        },
        ('ale', 37): {
            ('lene', 90): 0,
            ('lery', 26): 0
        }
    },
    ('mar', 286): {
        ('ara', 21): {
            ('rac', 84): {
                ('ace', 33): 0,
                ('acy', 60): 0
            },
            ('rade', 47): 0,
            ('rage', 56): 0,
            ('rate', 76): 0
        },
        ('ari', 30): {
            ('rice', 30): 0,
            ('ride', 25): 0,
            ('rify', 24): 0,
            ('rily', 63): 0,
            ('rin', 156): {
                ('ina', 24): 0,
                ('ine', 168): 0
            },
            ('rise', 146): 0,
            ('rit', 119): {
                ('ite', 174): 0,
                ('ity', 118): 0
            },
            ('rize', 56): 0
        }
    },
    ('nati', 66): {
        ('tice', 187): 0,
        ('tify', 43): 0,
        ('til', 25): {
            ('ile', 79): 0,
            ('ily', 37): 0
        },
        ('tin', 230): {
            ('ina', 22): 0,
            ('ine', 134): 0
        },
        ('tise', 129): 0,
        ('tite', 52): 0,
        ('tive', 517): 0,
        ('tize', 59): 0
    },
    ('pol', 384): {
        ('ola', 22): {
            ('lane', 27): 0,
            ('lary', 48): 0,
            ('late', 165): 0
        },
        ('ole', 22): {
            ('lene', 60): 0,
            ('lery', 39): 0
        },
        ('oli', 35): {
            ('lide', 36): 0,
            ('lify', 24): 0,
            ('line', 72): 0,
            ('lise', 79): 0,
            ('lit', 234): {
                ('ite', 209): 0,
                ('ity', 698): 0
            },
            ('lize', 22): 0
        }
    },
    ('rem', 125): {
        ('emer', 26): {
            ('ere', 24): 0,
            ('ery', 20): 0
        },
        ('emi', 32): {
            ('mine', 109): 0,
            ('mit', 58): {
                ('ite', 64): 0,
                ('ity', 45): 0
            }
        },
        ('emo', 35): {
            ('mony', 133): 0,
            ('more', 62): 0,
            ('mose', 29): 0,
            ('mote', 35): 0
        }
    },
    ('rev', 156): {
        ('eve', 76): {
            ('vely', 36): 0,
            ('very', 139): 0
        },
        ('evity', 44): 0
    },
    ('uni', 217): {
        ('nimal', 31): 0,
        ('nine', 62): {
            ('ned', 88): 0,
            ('ner', 74): 0,
            ('net', 25): 0
        },
        ('niver', 25): 0
    },
    ('uranic', 42): 0,
    ('vale', 78): {
        ('lene', 90): 0,
        ('lery', 26): 0
    },
    ('vene', 111): {
        ('nery', 118): 0,
        ('nese', 644): 0
    }
}

And here is how I do it:

def merge(obj):
    for k, v in list(obj.items()):
        if isinstance(v, dict):
            merge(v)
            if len(v) == 1:
                a, b = k
                k1, v1 = list(obj.pop(k).items())[0]
                key = (a + k1[0][2:], b)
                obj[key] = v1

def pop_empty(obj):
    for k, v in list(obj.items()):
        if isinstance(v, dict):
            pop_empty(v)
            if not v:
                obj.pop(k)


def nested_sort(dic):
    d = dict()
    for k, v in sorted(dic.items()):
        if isinstance(v, dict):
            d[k] = nested_sort(v)
        else:
            d[k] = v
    return d

pop_empty(example)
merge(example)
nested_sort(example)

I need to process an extremely complex object much more complex than this, what are more efficient ways to do these?

The object has many repeated elements and repeated nested dictionaries and in general repetition is very heavy and memoization is a huge help, but two of the functions don't return any value and the one returns a value, trying to wrap it using lru_cache will raise unhashable type dict, can any one re-implement the same ideas using memoization?

like image 989
Thyebri Avatar asked Mar 01 '23 09:03

Thyebri


1 Answers

I was able to get about 25 % faster by combining the three processes.

def merge(obj, /):
    result = {}
    for key, val in sorted(obj.items()):
        if isinstance(val, dict):
            val = merge(val)
            if not val:
                continue
            if len(val) == 1:
                k1, val = next(iter(val.items()))
                key = (key[0] + k1[0][2:], key[1])
        result[key] = val
    return result

I don't know how much more is possible.

like image 175
LeopardShark Avatar answered Apr 06 '23 11:04

LeopardShark