Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Parse entire HTML document from string into jQuery

Tags:

jquery

I have a document fetched by a $.get call, it's a big bloated HTML document. I need to use jQuery to grab an element from it.

I'm trying this (in coffeescript):

# Fetch `url` and try to read the text of the <title> element from the response body.
# NOTE(review): per the surrounding question, this lookup does not work as written.
$.get url, (data) ->
  title = $(data).find('title').text()

This doesn't work. In browser console I've whittled this down to $(document.documentElement.outerHTML).find('title') where document.documentElement.outerHTML gives a string of the document.

I've tried jQuery.parseHTML, with the same result.

like image 888
wjdp Avatar asked Jun 05 '15 22:06

wjdp


2 Answers

The reason it does not work is that jQuery needs a DOM node in which to find the 'title' tags. As you noted, you need to parse the HTML text first.

From here and here, the solution is to parse the string and append it to a temporary div (or other element):

// Build a detached <div> first, then attach the nodes parsed from `str` to it,
// so that `tempDom` can be queried for elements afterwards.
var tempDom = $('<div></div>');
tempDom.append($.parseHTML(str));

Then, you can manipulate tempDom to find elements.

Working demo: http://codepen.io/anon/pen/wKwLMP

like image 127
Miguel Jiménez Avatar answered Sep 21 '22 14:09

Miguel Jiménez


TL;DR ... use the DOMParser API

var htmlString = "<html><head><title>Name</title></head><body><div class='content'>Hello</div></body></html>";
// Use the "text/html" MIME type so the string goes through the HTML parser.
// "text/xml" selects the strict XML parser, which yields a parsererror
// document for markup that is not well-formed XML.
var htmlDoc = (new DOMParser()).parseFromString(htmlString, "text/html");

Unfortunately, the current answers miss a lot of edge cases

You should not use $.parseHTML(htmlString), as it is immediately lossy. If we check the source code of $.parseHTML, it calls buildFragment, which creates a temporary DOM element and sets its innerHTML property.

innerHTML Parsing

Element.innerHTML provides an API for:

  • Parsing (string -> DOM) in the write operation
  • Serializing (DOM -> string) in the read operation

And here's the spec for the HTML fragment parsing algorithm

Taking a sample string, here's the result of trying various HTML Parsing approaches:

var htmlString = "<html><head><title>Name</title></head><body><div class='content'>Hello</div></body></html>";

/**
 * Runs the sample `htmlString` through five different HTML-parsing approaches
 * and reports each one via `console.LogOutput`.
 *
 * For every approach the following are logged, in order: a label, the parsed
 * DOM value, its serialized HTML, the text looked up for `title`, and the
 * text looked up for `.content`.
 *
 * Relies on the globals `$` (jQuery), `document`, `DOMParser` and
 * `console.LogOutput`. Takes no arguments and returns nothing.
 */
function ParseHtmlTests() {

  /*** $.parseHTML ***/
  var $parseHtml = $.parseHTML(htmlString);

  console.LogOutput(
    '1. $.parseHTML',
    $parseHtml,
    $parseHtml.map(function(el) { return el.outerHTML; }),
    $($parseHtml).find("title").text(),
    $($parseHtml).find(".content").text()
  );


  /*** tempDiv.innerHTML ***/
  var tempDiv = document.createElement("div");
  tempDiv.innerHTML = htmlString;

  console.LogOutput(
    '2. tempDiv.innerHTML',
    tempDiv,
    tempDiv.outerHTML,
    $(tempDiv).find("title").text(),
    $(tempDiv).find(".content").text()
  );


  /*** divAppendContents ***/
  var $divAppendContents = $('<div></div>').append(htmlString);

  console.LogOutput(
    '3. divAppendContents',
    $divAppendContents,
    $divAppendContents.html(),
    $divAppendContents.find("title").text(),
    $divAppendContents.find(".content").text()
  );


  /*** tempHtml.innerHTML ***/
  var tmpHtml = document.createElement('html');
  tmpHtml.innerHTML = htmlString;

  console.LogOutput(
    '4. tempHtml.innerHTML',
    tmpHtml,
    tmpHtml.outerHTML,
    tmpHtml.getElementsByTagName('title')[0].innerText,
    tmpHtml.getElementsByClassName('content')[0].innerText
  );


  /*** DOMParser.parseFromString ***/
  // "text/html" selects the HTML parser. "text/xml" would use the strict XML
  // parser, which produces a parsererror document for markup that is not
  // well-formed XML.
  var htmlDoc = (new DOMParser()).parseFromString(htmlString, "text/html");

  console.LogOutput(
    '5. DOMParser.parseFromString',
    htmlDoc,
    htmlDoc.documentElement.outerHTML,
    htmlDoc.documentElement.getElementsByTagName('title')[0].innerHTML,
    htmlDoc.documentElement.getElementsByClassName('content')[0].innerHTML
  );
}

/*** Create Console Log Methods ***/
// Fallback for environments without console.group: just print the group label.
if (!console.group) {
  console.group = function(msg) {
    console.log(msg);
  };
}
// Fallback for environments without console.groupEnd: print a separator line.
if (!console.groupEnd) {
  console.groupEnd = function(msg) {
    console.log("----------------------------");
  };
}
/**
 * Logs one parsing result as a console group.
 *
 * @param {*} method  - label passed to console.group as the group heading
 * @param {*} dom     - value logged after "DOM:"
 * @param {*} html    - value logged after "HTML:"
 * @param {*} title   - value logged after "Title:"
 * @param {*} content - value logged after "Content:"
 */
console.LogOutput = function(method, dom, html, title, content) {
  var rows = [
    ["DOM:", dom],
    ["HTML:", html],
    ["Title:", title],
    ["Content:", content]
  ];

  console.group(method);
  rows.forEach(function(row) {
    console.log(row[0], row[1]);
  });
  console.groupEnd();
};

/*** Execute Script: run every parsing comparison and log the results ***/
ParseHtmlTests()
<script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.3.1/jquery.js"></script>

And here's the output from the above script in chrome:

Output

The best approach seems to be creating an HTML root element by setting the innerHTML of a temporary html element, or using the DOMParser API

Further Reading:

  • Parse an HTML string with JS
  • Parsing of html string using jquery
  • jQuery not finding elements in jQuery.parseHTML()
like image 38
KyleMit Avatar answered Sep 21 '22 14:09

KyleMit