Scraping table from website, with javascript:subOpen href link

For each link on this page, I would like to scrape the details page behind it.

I can get all the information on this page: PAGE

However, I would like to get all the info on the details pages, but the href link looks like this, for example:

href="javascript:subOpen('9ca8ed0fae15d43dc1257e7300345b99')"

Here is my sample spreadsheet using the ImportHTML function to get the general overview.

Google Spreadsheet

Any suggestions how to get the details pages?

UPDATE

I implemented the method as follows:

function doGet(e){
  var base = 'http://www.ediktsdatei.justiz.gv.at/edikte/ex/exedi3.nsf/'
  var feed =  UrlFetchApp.fetch(base + 'suche?OpenForm&subf=e&query=%28%5BVKat%5D%3DEH%20%7C%20%5BVKat%5D%3DZH%20%7C%20%5BVKat%5D%3DMH%20%7C%20%5BVKat%5D%3DMW%20%7C%20%5BVKat%5D%3DMSH%20%7C%20%5BVKat%5D%3DGGH%20%7C%20%5BVKat%5D%3DRH%20%7C%20%5BVKat%5D%3DHAN%20%7C%20%5BVKat%5D%3DWE%20%7C%20%5BVKat%5D%3DEW%20%7C%20%5BVKat%5D%3DMAI%20%7C%20%5BVKat%5D%3DDTW%20%7C%20%5BVKat%5D%3DDGW%20%7C%20%5BVKat%5D%3DGA%20%7C%20%5BVKat%5D%3DGW%20%7C%20%5BVKat%5D%3DUL%20%7C%20%5BVKat%5D%3DBBL%20%7C%20%5BVKat%5D%3DLF%20%7C%20%5BVKat%5D%3DGL%20%7C%20%5BVKat%5D%3DSE%20%7C%20%5BVKat%5D%3DSO%29%20AND%20%5BBL%5D%3D0').getContentText();

  var d = document.createElement('div'); //assuming you can do this
  d.innerHTML = feed; //make the text a dom structure
  var arr = d.getElementsByTagName('a'); //iterate over the page links
  var response = "";
  for(var i = 0; i < arr.length; i++){
    var atr = arr[i].getAttribute('onclick');
    if(atr) atr = atr.match(/subOpen\((.*?)\)/); //if onclick calls subOpen
    if(atr && atr.length > 1){ //get the id
      var detail = UrlFetchApp.fetch(base + '0/' + atr[1]).getContentText();
      response += detail; //process the relevant part of the content and append to the response text
    }
  }
  return ContentService.createTextOutput(response);
}

However, I get an error when running the method:

ReferenceError: "document" is not defined. (line 6, file "")

What object does document belong to?

I have updated the Google Spreadsheet with a web app.

Carol.Kar asked Jul 22 '15 08:07


1 Answer

You can use Firebug to inspect the page contents and JavaScript. For instance, you can find that subOpen is actually an alias for subOpenXML, declared in xmlhttp01.js.

function subOpenXML(unid) {/*open found doc from search view*/
 if (waiting) return alert(bittewar);
 var wState = dynDoc.getElementById('windowState');
 wState.value = 'H';/*httpreq pending*/
 var last = '';
 if (unid==docLinks[0]) {last += '&f=1'; thisdocnum = 1;}
 if (unid==docLinks[docLinks.length-1]) {
  last += '&l=1';
  thisdocnum = docLinks.length;
 } else {
  for (var i=1;i<docLinks.length-1;i++)
   if (unid==docLinks[i]) {thisdocnum = i+1; break;}
 }
 var url = unid + html_delim + 'OpenDocument'+last + '&bm=2';
 httpreq.open('GET',    // &rand=' + Math.random();
  /*'/edikte/test/ex/exedi31.nsf/0/'+*/ '0/'+url, true);
 httpreq.onreadystatechange=onreadystatechange;
// httpreq.setRequestHeader('Accept','text/xml');
 httpreq.send(null);
 waiting = true;
 title2src = firstTextChild(dynDoc.getElementById('title2')).nodeValue;
}

So, copy the function source and modify it in Firebug's Console tab to add a console.log(url) before the HTTP call, like this:

 var url = unid + html_delim + 'OpenDocument'+last + '&bm=2';
 console.log(url)
 httpreq.open('GET',    // &rand=' + Math.random();
  /*'/edikte/test/ex/exedi31.nsf/0/'+*/ '0/'+url, true);

You can execute the modified function declaration in Firebug's Console tab to overwrite subOpen. Clicking the link will then show that the invoked URL is composed of the id passed as a parameter to subOpen, prefixed by '0/', so in the example you posted it would be a GET to:

http://www.ediktsdatei.justiz.gv.at/edikte/ex/exedi3.nsf/0/1fd2313c2e0095bfc1257e49004170ca?OpenDocument&f=1&bm=2
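
In other words, the details URL is just the base address, then '0/', then the id that was passed to subOpen, with '?OpenDocument' and the optional flags appended (html_delim is '?' here, and '&f=1'/'&l=1' only mark the first and last document of the result list). A minimal sketch of the composition, using the id from the question:

var base = 'http://www.ediktsdatei.justiz.gv.at/edikte/ex/exedi3.nsf/';
var unid = '9ca8ed0fae15d43dc1257e7300345b99'; // id taken from subOpen('...')
var detailUrl = base + '0/' + unid + '?OpenDocument&bm=2'; // '&bm=2' is always appended by subOpenXML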

You could also verify this by opening the Network tab in Firebug and clicking the link.

Therefore, in order to scrape a details page you'd need to:

  1. Parse the id passed to subOpen
  2. Make a GET call to base + '0/' + id
  3. Parse the request response

Looking at the request response in Firebug's Network tab shows that you'll probably need to do similar parsing to actually get the displayed contents, but I haven't looked deeply into it.
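
For that last parsing step, here is a minimal sketch, assuming a plain regex over the returned HTML is enough (Apps Script has no DOM, so document is not available; the <td> pattern is an assumption, so inspect the actual response and adapt it):

// Hypothetical helper: pull the text of all table cells out of a detail page.
function extractCells(detailHtml) {
  var cells = [];
  var re = /<td[^>]*>([\s\S]*?)<\/td>/gi; // assumed cell pattern; adjust to the real markup
  var m;
  while ((m = re.exec(detailHtml)) !== null) {
    // strip nested tags and collapse whitespace in each cell
    cells.push(m[1].replace(/<[^>]+>/g, '').replace(/\s+/g, ' ').trim());
  }
  return cells;
}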

UPDATE

The importHTML function is not suitable for the kind of scraping you want. Google Apps Script's HtmlService or ContentService is better suited for this. You'll need to create a web app and implement the doGet function:

function doGet(e){
  var base = 'http://www.ediktsdatei.justiz.gv.at/edikte/ex/exedi3.nsf/'
  var feed =  UrlFetchApp.fetch(base + 'suche?OpenForm&subf=e&query=%28%5BVKat%5D%3DEH%20%7C%20%5BVKat%5D%3DZH%20%7C%20%5BVKat%5D%3DMH%20%7C%20%5BVKat%5D%3DMW%20%7C%20%5BVKat%5D%3DMSH%20%7C%20%5BVKat%5D%3DGGH%20%7C%20%5BVKat%5D%3DRH%20%7C%20%5BVKat%5D%3DHAN%20%7C%20%5BVKat%5D%3DWE%20%7C%20%5BVKat%5D%3DEW%20%7C%20%5BVKat%5D%3DMAI%20%7C%20%5BVKat%5D%3DDTW%20%7C%20%5BVKat%5D%3DDGW%20%7C%20%5BVKat%5D%3DGA%20%7C%20%5BVKat%5D%3DGW%20%7C%20%5BVKat%5D%3DUL%20%7C%20%5BVKat%5D%3DBBL%20%7C%20%5BVKat%5D%3DLF%20%7C%20%5BVKat%5D%3DGL%20%7C%20%5BVKat%5D%3DSE%20%7C%20%5BVKat%5D%3DSO%29%20AND%20%5BBL%5D%3D0').getContentText();
       var response = "";
       var match = feed.match(/subOpen\('.*?'\)/g)
       if(match){
         for(var i = 0; i < match.length;i++){
              var m = match[i].match(/\('(.*)'\)/);
              if(m && m.length > 1){
                var detailText = UrlFetchApp.fetch(base + '0/'+m[1]);
                response += //dosomething with detail text 
                            //and concatenate in the response
              }
         }
       }
       return ContentService.createTextOutput(response);


}
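
Once the script is deployed as a web app, one possible way to pull the output back into the spreadsheet is to have doGet return CSV text and fetch it with IMPORTDATA; the URL below is only a placeholder for your own deployment URL:

=IMPORTDATA("https://script.google.com/macros/s/<your-deployment-id>/exec")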
Grasshopper answered Nov 15 '22 04:11