Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Build a tree / run recursive query in Kusto Azure Data Explorer

I have a Kusto table that has the following structure:

Name File IngestType
A F1 output
B F1 input
B F2 output
C F2 input
D F2 input

I want to start with a given Name, say A and run a query where I can build a flat tree up to a certain level. So far my solution is to write a query for each level and union the data in the end:

    let parent = table | where Name == 'A';
    let child_level_1 = table | where type == 'input' 
    | join kind=inner parent on File
    | ... // project own output logic for child_level_2 to consume
    let child_level_2 = table | where type == 'input'
    | join kind=inner child_level_1 on File
    ...
    let child_level_10 = ...

Could the above be built dynamically in Kusto? Something on the lines of "while children have outputs consumed by others, keep query-ing"

like image 463
Alexandru Antochi Avatar asked Sep 19 '25 00:09

Alexandru Antochi


2 Answers

Could the above be built dynamically in Kusto? Something on the lines of "while children have outputs consumed by others, keep query-ing"

No. What you wrote above (using child_level_i) is the only way.

like image 128
Slavik N Avatar answered Sep 23 '25 07:09

Slavik N


Edit

Kusto supports now directed graph, which would allow you to achieve what you want by building a process tree then filtering out branches https://learn.microsoft.com/en-us/azure/data-explorer/kusto/query/graph-operators

Have a look there: https://github.com/microsoft/Kusto-Query-Language/issues/133#issuecomment-1868939619

 

 // working dummy data
 let Data = datatable (Computer:string,  process_parent_command_line:string, process_command_line:string, process_parent_id:string, process_id :string)[
        "ComputerA", "root cmd", "cmd lvl1", 0, 1,
        "ComputerA", "cmd lvl1", "cmd lvl2", 1, 2,
        "ComputerA", "cmd lvl1", "cmd lvl1.2",1, 1.2,
        "ComputerA",  "cmd lvl2", "cmd lvl3", 2, 3,
        "ComputerA", "cmd lvl3", "cmd /C reg load lvl4", 3, 4,
        "ComputerA", "cmd /C reg load lvl4", "cmd reg load lvl5", 4, 5,
        "ComputerA", "cmd reg load lvl5", "cmd lvl6",5, 6,
        // Computer B, branch 000, reg load 1
        "ComputerB", "root cmd - branch 00000", "cmd lvl1", 10, 11,
        "ComputerB", "cmd lvl1", "cmd /C reg load lvl2", 11, 12,
        "ComputerB",  "cmd /C reg load lvl2", "cmd reg load lvl3", 12, 13,
        // Computer B bracnh 000 noise
        "ComputerB", "cmd lvl3", "cmd lvl4", 13, 14,
        "ComputerB", "cmd lvl4", "cmd lvl5", 14, 15,
        "ComputerB", "cmd lvl5", "cmd lvl6",15, 16,
        // Computer B Branch AAAA - reg load - longuest
        "ComputerB", "root cmd - branch AAA", "cmd lvl1 - branch AAA", 20, 21,
        "ComputerB", "cmd lvl1 - branch AAA", "cmd lvl2 - branch AAA", 21, 22,
        "ComputerB", "cmd lvl2 - branch AAA", "cmd lvl3 - branch AAA",22, 23,
        "ComputerB", "cmd lvl3 - branch AAA", "cmd reg load", 23,24,
        // Computer B Branch BBBB - reg load - shortest
        "ComputerB", "root cmd - branch BBB", "cmd lvl1 - branch BBB", 30,31,
        "ComputerB", "cmd lvl1 - branch BBB", "cmd reg load", 31,32,
        ]
// create requiered column for graph
// adapt values if required
| project parent_id = strcat_delim("_",Computer, process_parent_id, process_parent_command_line),
        child_id = strcat_delim("_", Computer, process_id, process_command_line),
        Computer, process_command_line,
        process_parent_command_line,
       user_name = "bob",
       RenderedDescription = "process launched",
       EventID = 1,
       file_directory = "C:/",
       process_id, process_parent_id,
       process_path = replace_string(process_command_line, "cmd","path"),
       process_parent_path = replace_string(process_parent_command_line, "cmd","path")
;
// build node info table for graph
// needs to list all unique cmd/parent cmd in a one column
let process_info =// materialize(
                Data
                // reduce data amount
                // create a row pre process child with its info
                | distinct parent_id, child_id, 
                        user_name, process_command_line, process_parent_command_line, process_path, process_parent_path, Computer,
                        RenderedDescription, EventID, file_directory, process_id, process_parent_id
                | as hint.materialized=false unique_cmd_child_parent // false more efficient here
                | project node_id = child_id,
                        user_name, process_command_line, process_path, Computer, RenderedDescription, file_directory, process_id
                // create a row pre process parent with its info
                | union (unique_cmd_child_parent
                        | project node_id = parent_id,
                                user_name, process_command_line=process_parent_command_line, process_path = process_parent_path,
                                Computer, RenderedDescription, file_directory, process_id = process_parent_id
                )
                | distinct node_id, user_name, process_command_line, process_path, process_id, Computer, RenderedDescription, file_directory
      //  )
;
Data
| project parent_id, child_id,
         process_parent_command_line, process_command_line // parent/child ids are built on cmd and computer
| make-graph parent_id --> child_id with process_info on node_id
//| make-graph process_parent_command_line --> process_command_line with process_info on process_command_line
| graph-match (parent) -[edge*1..10]-> (child)
                where child.process_command_line has "reg"
                        and child.process_command_line has_any("load","unload")
                        //
                        //and not( // keeps only A -> /c reg load  and not A -> /C reg load -> reg load
                        //        parent.process_command_line has "reg"
                        //        and parent.process_command_line has_any("load","unload")
                        //        )
                project root_parent_user_name = parent.user_name,
                        root_cmd = parent.process_command_line,
                        root_parent_path = parent.process_path,
                        root_nodeId = parent.node_id,
                        process_branch = todynamic(edge.process_parent_command_line),
                        node_id_list = todynamic(edge.child_id),
                        last_child_nodeId = child.node_id,
                        last_child_cmd = child.process_command_line,
                        last_child_pid = child.process_id,
                        Computer = parent.Computer
// back to tabular mode
| extend process_branch_length = array_length(process_branch)
| summarize process_branch_length = arg_max(process_branch_length, *) by last_child_cmd, Computer, last_child_pid // keep only the longest branch for a child_cmd by computer
// deals with case where branch is 'root cmd' --> 'parent cmd /c reg load' -> 'cmd /c reg load' => 2 results at previous step
// Note that some similar cmd might looks duplicated (ex: %%systemDrive%% vs %systemDrive%)
| order by Computer, process_branch_length desc
| where not(prev(Computer) == Computer
            //and prev(last_child_cmd) ==  process_branch[-1]
            and prev(process_branch_length) > process_branch_length
            and prev(root_cmd) ==  root_cmd
            )
| project-reorder Computer, root_parent_user_name, root_cmd, last_child_cmd, process_branch

Old way

if you are looking for a particular process branch, you could target it by defining the process tree as a time sequence of processes launched or command-lines with evaluate sequence_detect plugin.

datatable (Time: datetime, parent:string, child: string)[
    '2023-01-13T14:01:06.3120799Z', "Grand parent", "parent",
    '2023-01-13T14:01:07.3120799Z', "parent", "child",
    '2023-01-13T14:01:08.3120799Z', "child", "grand child",
    '2023-01-13T14:01:08.3120799Z', "noise parent", "noise child"
    '2023-01-13T14:01:08.3120799Z', "noise child", "noise grandchild"
    ]
| evaluate sequence_detect(Time, // time column to look for sequence
                            3s, // timespan  allowed for the sequence completion
                            2s, // max timespan between each step
                           // process tree as condition
                           GrandParentFound =  parent == "Grand parent" and child == "parent",
                            ParentFound = parent == "parent" and child == "child",
                            ChildFound = parent == "child" and child == "grand child"
                            )
like image 33
nonayme Avatar answered Sep 23 '25 07:09

nonayme