Scraping links generated by javascript

Question

I'm using Scrapy to crawl a website, and one of the links I need to scrape appears to be generated by a small snippet of Javascript code in the page, like so:

 <!--
 var prefix = 'm&#97;&#105;lt&#111;:';
 var suffix = '';
 var attribs = '';
 var path = 'hr' + 'ef' + '=';
 var addy59933 = 'HR-C&#111;l&#111;gn&#101;' + '&#64;';
 addy59933 = addy59933 + 'sc&#111;r' + '&#46;' + 'c&#111;m';
 var addy_text59933 = 'Submit your application';
 document.write( '<a ' + path + '\'' + prefix + addy59933 + suffix + '\'' + attribs + '>' );
 document.write( addy_text59933 );
 document.write( '<\/a>' );
 //-->

The link won't show up unless you view the page from a browser, but I need my spider to be able to scrape it anyway. Since the code is embedded within the page, I had the idea to grab the script's text and then re-assemble the link URL from there, but the text is in a format I'm not familiar with.

Is there a better way to do it?

edit: Just figured out that those are HTML Character Entities. I'd still like to know if there are better ways to overcome this sort of obfuscation.

paul trmbrth · Accepted Answer

Here's a solution using js2xml:

>>> import js2xml
>>> import pprint
>>> jscode = r"""
... var prefix = 'm&#97;&#105;lt&#111;:';
... var suffix = '';
... var attribs = '';
... var path = 'hr' + 'ef' + '=';
... var addy59933 = 'HR-C&#111;l&#111;gn&#101;' + '&#64;';
... addy59933 = addy59933 + 'sc&#111;r' + '&#46;' + 'c&#111;m';
... var addy_text59933 = 'Submit your application';
... document.write( '<a ' + path + '\'' + prefix + addy59933 + suffix + '\'' + attribs + '>' );
... document.write( addy_text59933 );
... document.write( '<\/a>' );
>>> js = js2xml.parse(jscode)

Variable declarations are represented by var_decl elements: each variable's name is in an identifier node, and its value here is one or more strings joined with the + operator. So let's make a dict out of them, using "".join() on the string/text() nodes:

>>> # variables
... variables = dict([(var.xpath('string(./identifier)'), u"".join(var.xpath('.//string/text()')))
...                   for var in js.xpath('.//var_decl')])
>>> pprint.pprint(variables)
{'addy59933': u'HR-C&#111;l&#111;gn&#101;&#64;',
 'addy_text59933': u'Submit your application',
 'attribs': u'',
 'path': u'href=',
 'prefix': u'm&#97;&#105;lt&#111;:',
 'suffix': u''}

Then assignments change the values of some variables, using a mix of strings and variables. Concatenate %(identifiername)s placeholders for variable identifiers and the literal string values for strings:

>>> # identifiers are assigned other string values
... assigns = {}
>>> for assign in js.xpath('.//assign'):
...     value = u"".join(['%%(%s)s' % el.text if el.tag=='identifier' else el.text
...                       for el in assign.xpath('./right//*[self::string or self::identifier]')])
...     key = assign.xpath('string(left/identifier)')
...     assigns[key] = value
... 
>>> pprint.pprint(assigns)
{'addy59933': u'%(addy59933)ssc&#111;r&#46;c&#111;m'}

Update the variables dict by "applying" the assignments:

>>> # update variables dict with new values
... for key, val in assigns.items():
...    variables[key] = val % variables
... 
>>> pprint.pprint(variables)
{'addy59933': u'HR-C&#111;l&#111;gn&#101;&#64;sc&#111;r&#46;c&#111;m',
 'addy_text59933': u'Submit your application',
 'attribs': u'',
 'path': u'href=',
 'prefix': u'm&#97;&#105;lt&#111;:',
 'suffix': u''}
>>>

Function arguments are under the arguments node (XPath .//arguments/*):

>>> # interpret arguments of document.write()
... arguments = [u"".join(['%%(%s)s' % el.text if el.tag=='identifier' else el.text
...                        for el in arg.xpath('./descendant-or-self::*[self::string or self::identifier]')])
...              for arg in js.xpath('.//arguments/*')]
>>> 
>>> pprint.pprint(arguments)
[u"<a %(path)s'%(prefix)s%(addy59933)s%(suffix)s'%(attribs)s>",
 u'%(addy_text59933)s',
 u'</a>']
>>>

If you replace the identifiers in there, you get

>>> # apply string formatting replacing identifiers
... arguments = [arg % variables for arg in arguments]
>>> 
>>> pprint.pprint(arguments)
[u"<a href='m&#97;&#105;lt&#111;:HR-C&#111;l&#111;gn&#101;&#64;sc&#111;r&#46;c&#111;m'>",
 u'Submit your application',
 u'</a>']
>>>

Now that looks interesting, but let's run it through lxml.html to get rid of the numeric character references:

>>> import lxml.html
>>> import lxml.etree
>>> 
>>> doc = lxml.html.fromstring("".join(arguments))
>>> print lxml.etree.tostring(doc)
<a href="mailto:HR-Cologne@scor.com">Submit your application</a>
>>>

Using Scrapy Selector:

>>> from scrapy.selector import Selector
>>> selector = Selector(text="".join(arguments), type="html")
>>> selector.xpath('.//a/@href').extract()
[u'mailto:HR-Cologne@scor.com']
>>>

Scraping links generated by javascript

Tags:

python-2.7

web-scraping

scrapy

Enrico Tuvera Jr

1 Answers

paul trmbrth

Recent Activity

Donate For Us

Scraping links generated by javascript

Tags:

python-2.7

web-scraping

scrapy

Enrico Tuvera Jr

1 Answers

paul trmbrth

Related questions

Recent Activity

Donate For Us