Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Best way to parse HTML in Javascript

I am having a lot of trouble learning RegExp and coming up with a good algorithm to do this. I have this string of HTML that I need to parse. Note that when I am parsing it, it is still a string object and not yet HTML on the browser as I need to parse it before it gets there. The HTML looks like this:

<html>
  <head>
    <title>Geoserver GetFeatureInfo output</title>
  </head>
  <style type="text/css">
    table.featureInfo, table.featureInfo td, table.featureInfo th {
        border:1px solid #ddd;
        border-collapse:collapse;
        margin:0;
        padding:0;
        font-size: 90%;
        padding:.2em .1em;
    }
    table.featureInfo th {
        padding:.2em .2em;
        font-weight:bold;
        background:#eee;
    }
    table.featureInfo td{
        background:#fff;
    }
    table.featureInfo tr.odd td{
        background:#eee;
    }
    table.featureInfo caption{
        text-align:left;
        font-size:100%;
        font-weight:bold;
        text-transform:uppercase;
        padding:.2em .2em;
    }
  </style>

  <body>
    <table class="featureInfo2">
    <tr>
        <th class="dataLayer" colspan="5">Tibetan Villages</th>
    </tr>
    <!-- EOF Data Layer -->
    <tr class="dataHeaders">
        <th>ID</th>
        <th>Latitude</th>
        <th>Longitude</th>
        <th>Place Name</th>
        <th>English Translation</th>
    </tr>
    <!-- EOF Data Headers -->
    <!-- Data -->
    <tr>
    <!-- Feature Info Data -->
        <td>3394</td>
        <td>29.1</td>
        <td>93.15</td>
        <td>བསྡམས་གྲོང་ཚོ།</td>
        <td>Dam Drongtso </td>
    </tr>
    <!-- EOF Feature Info Data -->
    <!-- End Data -->
    </table>
    <br/>
  </body>
</html>

and I need to get it like this:

3394,
29.1,
93.15,
བསྡམས་གྲོང་ཚོ།,
Dam Drongtso

Basically an array...even better if it matches according to its field headers and from which table they are somehow, which look like this:

Tibetan Villages

ID
Latitude
Longitude
Place Name
English Translation

Finding out JavaScript does not support wonderful mapping was a bummer and I have what I want working already. However it is VERY VERY hard coded and I'm thinking I should probably use RegExp to handle this better. Unfortunately I am having a real tough time :(. Here is my function to parse my string (very ugly IMO):

    function parseHTML(html){

    //Getting the layer name
    alert(html);
    //Lousy attempt at RegExp
    var somestring = html.replace('/m//\<html\>+\<body\>//m/',' ');
    alert(somestring);
    var startPos = html.indexOf('<th class="dataLayer" colspan="5">');
    var length = ('<th class="dataLayer" colspan="5">').length;
    var endPos = html.indexOf('</th></tr><!-- EOF Data Layer -->');
    var dataLayer = html.substring(startPos + length, endPos);

    //Getting the data headers
    startPos = html.indexOf('<tr class="dataHeaders">');
    length = ('<tr class="dataHeaders">').length;
    endPos = html.indexOf('</tr><!-- EOF Data Headers -->');
    var newString = html.substring(startPos + length, endPos);
    newString = newString.replace(/<th>/g, '');
    newString = newString.substring(0, newString.lastIndexOf('</th>'));
    var featureInfoHeaders = new Array();
    featureInfoHeaders = newString.split('</th>');

    //Getting the data
    startPos = html.indexOf('<!-- Data -->');
    length = ('<!-- Data -->').length;
    endPos = html.indexOf('<!-- End Data -->');
    newString = html.substring(startPos + length, endPos);
    newString = newString.substring(0, newString.lastIndexOf('</tr><!-- EOF Feature Info Data -->'));
    var featureInfoData = new Array();
    featureInfoData = newString.split('</tr><!-- EOF Feature Info Data -->');

    for(var s = 0; s < featureInfoData.length; s++){
        startPos = featureInfoData[s].indexOf('<!-- Feature Info Data -->');
        length = ('<!-- Feature Info Data -->').length;
        endPos = featureInfoData[s].lastIndexOf('</td>');
        featureInfoData[s] = featureInfoData[s].substring(startPos + length, endPos);
        featureInfoData[s] = featureInfoData[s].replace(/<td>/g, '');
        featureInfoData[s] = featureInfoData[s].split('</td>');
    }//end for

    alert(featureInfoData);

    //Put all the feature info in one array
    var featureInfo = new Array();
    var len = featureInfoData.length;
    for(var j = 0; j < len; j++){
        featureInfo[j] = new Object();
        featureInfo[j].id = featureInfoData[j][0];
        featureInfo[j].latitude = featureInfoData[j][1];
        featureInfo[j].longitude = featureInfoData[j][2];
        featureInfo[j].placeName = featureInfoData[j][3];
        featureInfo[j].translation = featureInfoData[j][4];
        }//end for 

    //This can be ignored for now...
        var string = redesignHTML(featureInfoHeaders, featureInfo);
        return string;

    }//end parseHTML

So as you can see if the content in that string ever changes, my code will be horribly broken. I want to avoid that as much as possible and try to write better code. I appreciate all the help and advice you can give me.

like image 789
elshae Avatar asked Nov 22 '10 16:11

elshae


People also ask

Can JavaScript parse HTML?

The native DOM manipulation capabilities of JavaScript and jQuery are great for simple parsing of HTML fragments. However, if you actually need to parse a complete HTML or XML source in a DOM document programmatically, there is a better solution: DOMParser. It is available in all modern browsers.

Can XML parsers parse HTML?

XML parsers will fail to parse any HTML document that uses any of those features. HTML parsers, on the other hand, will basically never fail no matter what a document contains.

How HTML code is parsed?

HTML parsing involves tokenization and tree construction. HTML tokens include start and end tags, as well as attribute names and values. If the document is well-formed, parsing it is straightforward and faster. The parser parses tokenized input into the document, building up the document tree.


3 Answers

Do the following steps:

  1. Create a new documentFragment
  2. Put your HTML string in it
  3. Use selectors to get what you want

Why do all the parsing work - which won't work anyways, since HTML is not parsable via RegExp - when you have the best HTML parser available? (the Browser)

like image 107
Ivo Wetzel Avatar answered Oct 19 '22 21:10

Ivo Wetzel


You can use jQuery to easily traverse the DOM and create an object with the structure automatically.

var $dom = $('<html>').html(the_html_string_variable_goes_here);
var featureInfo = {};

$('table:has(.dataLayer)', $dom).each(function(){
    var $tbl = $(this);
    var section = $tbl.find('.dataLayer').text();
    var obj = [];
    var $structure = $tbl.find('.dataHeaders');
    var structure = $structure.find('th').map(function(){return $(this).text().toLowerCase();});
    var $datarows= $structure.nextAll('tr');
    $datarows.each(function(i){
        obj[i] = {};
        $(this).find('td').each(function(index,element){
            obj[i][structure[index]] = $(element).text();
        });
    });
    featureInfo[section] = obj;
});

Working Demo

The code can work with multiple tables with different structures inside.. and also multiple data rows inside each table..

The featureInfo will hold the final structure and data, and can be accessed like

alert( featureInfo['Tibetan Villages'][0]['English Translation'] );

or

alert( featureInfo['Tibetan Villages'][0].id );
like image 37
Gabriele Petrioli Avatar answered Oct 19 '22 20:10

Gabriele Petrioli


The "correct" way to do it is with DOMParser. Do it like this:

var parsed=new DOMParser.parseFromString(htmlString,'text/html');

Or, if you're worried about browser compatibility, use the polyfill on the MDN documentation:

/*
 * DOMParser HTML extension
 * 2012-09-04
 * 
 * By Eli Grey, http://eligrey.com
 * Public domain.
 * NO WARRANTY EXPRESSED OR IMPLIED. USE AT YOUR OWN RISK.
 */

/*! @source https://gist.github.com/1129031 */
/*global document, DOMParser*/

(function(DOMParser) {
    "use strict";

    var
      DOMParser_proto = DOMParser.prototype
    , real_parseFromString = DOMParser_proto.parseFromString
    ;

    // Firefox/Opera/IE throw errors on unsupported types
    try {
        // WebKit returns null on unsupported types
        if ((new DOMParser).parseFromString("", "text/html")) {
            // text/html parsing is natively supported
            return;
        }
    } catch (ex) {}

    DOMParser_proto.parseFromString = function(markup, type) {
        if (/^\s*text\/html\s*(?:;|$)/i.test(type)) {
            var
              doc = document.implementation.createHTMLDocument("")
            ;
                if (markup.toLowerCase().indexOf('<!doctype') > -1) {
                    doc.documentElement.innerHTML = markup;
                }
                else {
                    doc.body.innerHTML = markup;
                }
            return doc;
        } else {
            return real_parseFromString.apply(this, arguments);
        }
    };
}(DOMParser));
like image 23
markasoftware Avatar answered Oct 19 '22 19:10

markasoftware