For each link on this page, I would like to scrape the details page behind it.
I can get all the information on the overview page: PAGE
However, I would also like to get the information on the details pages, but the href of each link looks like this, for example:
href="javascript:subOpen('9ca8ed0fae15d43dc1257e7300345b99')"
Here is my sample spreadsheet, which uses the ImportHTML function to get the general overview.
Google Spreadsheet
Any suggestions on how to get the details pages?
UPDATE
I implemented the method as follows:
function doGet(e){
  var base = 'http://www.ediktsdatei.justiz.gv.at/edikte/ex/exedi3.nsf/';
  var feed = UrlFetchApp.fetch(base + 'suche?OpenForm&subf=e&query=%28%5BVKat%5D%3DEH%20%7C%20%5BVKat%5D%3DZH%20%7C%20%5BVKat%5D%3DMH%20%7C%20%5BVKat%5D%3DMW%20%7C%20%5BVKat%5D%3DMSH%20%7C%20%5BVKat%5D%3DGGH%20%7C%20%5BVKat%5D%3DRH%20%7C%20%5BVKat%5D%3DHAN%20%7C%20%5BVKat%5D%3DWE%20%7C%20%5BVKat%5D%3DEW%20%7C%20%5BVKat%5D%3DMAI%20%7C%20%5BVKat%5D%3DDTW%20%7C%20%5BVKat%5D%3DDGW%20%7C%20%5BVKat%5D%3DGA%20%7C%20%5BVKat%5D%3DGW%20%7C%20%5BVKat%5D%3DUL%20%7C%20%5BVKat%5D%3DBBL%20%7C%20%5BVKat%5D%3DLF%20%7C%20%5BVKat%5D%3DGL%20%7C%20%5BVKat%5D%3DSE%20%7C%20%5BVKat%5D%3DSO%29%20AND%20%5BBL%5D%3D0').getContentText();
  var d = document.createElement('div'); //assuming you can do this
  d.innerHTML = feed; //make the text a DOM structure
  var arr = d.getElementsByTagName('a'); //iterate over the page links
  var response = "";
  for (var i = 0; i < arr.length; i++) {
    var atr = arr[i].getAttribute('onclick');
    if (atr) atr = atr.match(/subOpen\((.*?)\)/); //if onclick calls subOpen
    if (atr && atr.length > 1) { //get the id
      var detail = UrlFetchApp.fetch(base + '0/' + atr[1]).getContentText();
      response += detail; //process the relevant part of the content and append it to the response text
    }
  }
  return ContentService.createTextOutput(response);
}
However, I get an error when running the method:
ReferenceError: "document" is not defined. (line 6, file "")
What is document an object of?
I have updated the Google Spreadsheet with a web app.
You can use Firebug to inspect the page contents and JavaScript. For instance, you can find that subOpen is actually an alias for subOpenXML, declared in xmlhttp01.js.
function subOpenXML(unid) {/*open found doc from search view*/
  if (waiting) return alert(bittewar);
  var wState = dynDoc.getElementById('windowState');
  wState.value = 'H';/*httpreq pending*/
  var last = '';
  if (unid==docLinks[0]) {last += '&f=1'; thisdocnum = 1;}
  if (unid==docLinks[docLinks.length-1]) {
    last += '&l=1';
    thisdocnum = docLinks.length;
  } else {
    for (var i=1;i<docLinks.length-1;i++)
      if (unid==docLinks[i]) {thisdocnum = i+1; break;}
  }
  var url = unid + html_delim + 'OpenDocument'+last + '&bm=2';
  httpreq.open('GET', // &rand=' + Math.random();
    /*'/edikte/test/ex/exedi31.nsf/0/'+*/ '0/'+url, true);
  httpreq.onreadystatechange=onreadystatechange;
  // httpreq.setRequestHeader('Accept','text/xml');
  httpreq.send(null);
  waiting = true;
  title2src = firstTextChild(dynDoc.getElementById('title2')).nodeValue;
}
So, after copying the function source and modifying it in Firebug's Console tab to add a console.log(url) before the HTTP call, like this:
var url = unid + html_delim + 'OpenDocument'+last + '&bm=2';
console.log(url)
httpreq.open('GET', // &rand=' + Math.random();
/*'/edikte/test/ex/exedi31.nsf/0/'+*/ '0/'+url, true);
You can then execute the function declaration in Firebug's Console tab to overwrite subOpen with the modified source. Clicking the link will then show that the invoked URL is composed of the id passed as a parameter to subOpen, prefixed by '0/', so in the example you posted it would be a GET to:
http://www.ediktsdatei.justiz.gv.at/edikte/ex/exedi3.nsf/0/1fd2313c2e0095bfc1257e49004170ca?OpenDocument&f=1&bm=2
You could also verify this by opening the Network tab in Firebug and clicking the link.
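As a minimal sketch of that URL composition in Apps Script (fetchDetail is a hypothetical helper, the id is the one from your example, and the optional &f=1/&l=1 flags are omitted), assuming the details page can be fetched server-side without a browser session:
function fetchDetail() {
  var base = 'http://www.ediktsdatei.justiz.gv.at/edikte/ex/exedi3.nsf/';
  var unid = '9ca8ed0fae15d43dc1257e7300345b99'; //id taken from the subOpen(...) href
  //'0/' + id plus the OpenDocument parameters, mirroring what subOpenXML builds
  var url = base + '0/' + unid + '?OpenDocument&bm=2';
  var html = UrlFetchApp.fetch(url).getContentText();
  Logger.log(html.length); //inspect the raw HTML of the details page
}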
Therefore, in order to scrape the details pages you would need to extract the id passed to subOpen for each link and issue a GET request to the base URL plus '0/' plus that id yourself.
Looking at the response in Firebug's Network tab shows that you will probably need to do some further parsing to actually get the displayed contents, but I haven't looked deeply into it.
UPDATE
The importHTML function is not suitable for the kind of scraping you want. Google Apps Script's HTML Service or Content Service is better suited for this. You'll need to create a web app and implement the doGet function:
function doGet(e){
  var base = 'http://www.ediktsdatei.justiz.gv.at/edikte/ex/exedi3.nsf/';
  var feed = UrlFetchApp.fetch(base + 'suche?OpenForm&subf=e&query=%28%5BVKat%5D%3DEH%20%7C%20%5BVKat%5D%3DZH%20%7C%20%5BVKat%5D%3DMH%20%7C%20%5BVKat%5D%3DMW%20%7C%20%5BVKat%5D%3DMSH%20%7C%20%5BVKat%5D%3DGGH%20%7C%20%5BVKat%5D%3DRH%20%7C%20%5BVKat%5D%3DHAN%20%7C%20%5BVKat%5D%3DWE%20%7C%20%5BVKat%5D%3DEW%20%7C%20%5BVKat%5D%3DMAI%20%7C%20%5BVKat%5D%3DDTW%20%7C%20%5BVKat%5D%3DDGW%20%7C%20%5BVKat%5D%3DGA%20%7C%20%5BVKat%5D%3DGW%20%7C%20%5BVKat%5D%3DUL%20%7C%20%5BVKat%5D%3DBBL%20%7C%20%5BVKat%5D%3DLF%20%7C%20%5BVKat%5D%3DGL%20%7C%20%5BVKat%5D%3DSE%20%7C%20%5BVKat%5D%3DSO%29%20AND%20%5BBL%5D%3D0').getContentText();
  var response = "";
  var match = feed.match(/subOpen\('.*?'\)/g); //find every subOpen('<id>') call in the search results
  if (match) {
    for (var i = 0; i < match.length; i++) {
      var m = match[i].match(/\('(.*)'\)/); //extract the id between the quotes
      if (m && m.length > 1) {
        var detailText = UrlFetchApp.fetch(base + '0/' + m[1]).getContentText();
        response += detailText; //do something with the detail text
                                //and concatenate it into the response
      }
    }
  }
  return ContentService.createTextOutput(response);
}
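What to do with detailText depends on what you want to show in the spreadsheet. As a minimal sketch (extractPlainText is a hypothetical helper, and the crude tag stripping is only an assumption about how much cleanup the page needs), you could reduce each details page to plain text before appending it:
function extractPlainText(html) {
  //strip scripts, styles and remaining tags, then collapse whitespace
  return html
    .replace(/<script[\s\S]*?<\/script>/gi, ' ')
    .replace(/<style[\s\S]*?<\/style>/gi, ' ')
    .replace(/<[^>]+>/g, ' ')
    .replace(/\s+/g, ' ')
    .trim();
}
In doGet you would then append extractPlainText(detailText) plus a newline per document; the published web app URL can then be read back into the spreadsheet, for example with IMPORTDATA.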