how to robustly parse a document for any headings and build a
tree of just those headings

Question

So I parse through a document in order to grab all the headings with stackHeadings(). I do this in order to build a Microsoft Word style document map with buildNav(). This currently works OK but its not very robust and breaks anytime the headings do not follow a strict order... e.g. (If you start with an H2 it breaks, if you nest a H3 under and H1 it breaks, etc...)

I can't quite figure out the best way to fix this (make it more robust). I'm taking advantage of jQuery's `nextUntil' function to find all the h2s between two h1s.

One possibility is replacing:

elem.nextUntil( 'h' + cur, 'h' + next )

with

elem.nextUntil( 'h' + cur, 'h' + next + ',h' + (next + 1) + ',h' + (next + 2) ... )

to find ALL subheadings between two headings of the same level. But now h3 children of h1s would only be nested one level rather than two.

So then you'd have to compare the current heading level with the parent heading level, and if there's a jump of more than one (h1 -> h3), you'd have to create an empty child between them as a nesting placeholder for the missing h2.

Any ideas or solutions would be greatly appreciated!

stackHeadings = (items, cur, counter) ->

    cur = 1 if cur == undefined
    counter ?= 1
    next = cur + 1
    for elem, index in items
      elem = $(elem)
      children  =  filterHeadlines( elem.nextUntil( 'h' + cur, 'h' + next ) )
      d.children = stackHeadings( children, next, counter ) if children.length > 0
      d


filterHeadlines = ( $hs ) ->
    _.filter( $hs, ( h ) -> $(h).text().match(/[^\s]/) )

buildNav = ( ul, items ) ->
    for child, index in items
        li = $( "<li>" )
        $( ul ).append( li )
        $a = $("<a/>")
        $a.attr( "id", "nav-title-" + child.id )

        li.append( $a )

        if child.children
            subUl = document.createElement( 'ul' )
            li.append( subUl )
            buildNav( subUl, child.children )

items = stackHeadings( filterHeadlines( source.find( 'h1' ) ) )
ul = $('<ul>')
buildNav( ul, items)

Louis Ricci · Accepted Answer

I threw together some JavaScript that will do what you want http://jsfiddle.net/fA4EW/

It's a fairly straightforward recursive function that consumes an array of elements (nodes) and builds the UL structure accordingly. To be consistent with the question I add the placeholder (empty) list elements when you from an H1 to an H3 etc.

function buildRec(nodes, elm, lv) {
    var node;
    // filter
    do {
        node = nodes.shift();
    } while(node && !(/^h[123456]$/i.test(node.tagName)));
    // process the next node
    if(node) {
        var ul, li, cnt;
        var curLv = parseInt(node.tagName.substring(1));
        if(curLv == lv) { // same level append an il
            cnt = 0;
        } else if(curLv < lv) { // walk up then append il
            cnt = 0;
            do {
                elm = elm.parentNode.parentNode;
                cnt--;
            } while(cnt > (curLv - lv));
        } else if(curLv > lv) { // create children then append il
            cnt = 0;
            do {
                li = elm.lastChild;
                if(li == null)
                    li = elm.appendChild(document.createElement("li"));
                elm = li.appendChild(document.createElement("ul"));
                cnt++;
            } while(cnt < (curLv - lv));
        }
        li = elm.appendChild(document.createElement("li"));
        // replace the next line with archor tags or whatever you want
        li.innerHTML = node.innerHTML;
        // recursive call
        buildRec(nodes, elm, lv + cnt);
    }
}
// example usage
var all = document.getElementById("content").getElementsByTagName("*");
var nodes = []; 
for(var i = all.length; i--; nodes.unshift(all[i]));
var result = document.createElement("ul");
buildRec(nodes, result, 1);
document.getElementById("outp").appendChild(result);

mb21 · Answer

You could simply use the jQuery TOC plugin, it appears they're doing it like this:

$('h1,h2,h3').each(function(i, heading) {
    ...
}

Of course, this simply treats all h1, h2 and h3 equally and creates the TOC by considering the nesting and the document order of the elements only. But isn't that the desired behaviour?

It would look strange to have a h3, which is directly inside a h1, double indented in the TOC. If you cannot live with that inconsistency, instead of inserting the missing h2 in between, I would consider cleaning up the HTML and converting the h3 into a h2.

how to robustly parse a document for any headings and build a <ul> tree of just those headings

Tags:

javascript

jquery

dom

coffeescript

funkyeah

2 Answers

Louis Ricci

mb21

Recent Activity

Donate For Us