Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

How to retrieve the "Initiator" field from the devtools Network panel with Selenium?

I'm trying to extract all the URL network requests from a website and establish a relationship of hierarchy between them i.e. if one URL request is generating another request. Something like a chain of requests.

As you know, in the Network panel, there is a field called "Initiator" in the Requests table which tells you the origin or parent of a specific request (if there's any). Manually, I can use the browser, go to the Network panel in the developer tools, load the website and download the resulting HAR file. For example:

{
        "startedDateTime": "2019-11-05T17:38:46.775Z",
        "time": 15.676000155508518,
        "request": {
          "method": "POST",
          "url": "https://www.google.com/gen_204?oq=&gs_l=psy-ab.22...0.0..847450...0.0..0.0.0.......0......gws-wiz.",
          "httpVersion": "http/2.0",
          "headers": [
            {
              "name": ":path",
              "value": "/gen_204?oq=&gs_l=psy-ab.22...0.0..847450...0.0..0.0.0.......0......gws-wiz."
            },
            {
              "name": "sec-fetch-mode",
              "value": "no-cors"
            },
            {
              "name": "origin",
              "value": "https://www.google.com"
            },
            {
              "name": "accept-encoding",
              "value": "gzip, deflate, br"
            },
            {
              "name": "accept-language",
              "value": "en-GB,en;q=0.9,en-US;q=0.8,es-US;q=0.7,es;q=0.6"
            },
            {
              "name": "user-agent",
              "value": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/76.0.3809.100 Chrome/76.0.3809.100 Safari/537.36"
            },
            {
              "name": "content-type",
              "value": "text/plain;charset=UTF-8"
            },
            {
              "name": "accept",
              "value": "*/*"
            },
            {
              "name": "referer",
              "value": "https://www.google.com/"
            },
            {
              "name": ":authority",
              "value": "www.google.com"
            },
            {
              "name": "cookie",
              "value": "CONSENT=YES+GB.en+20160414-00-0; SEARCH_SAMESITE=CgQIg44B; ANID=AHWqTUlE3OPRfM5R1dtW0XvyIu2NOdLWoSHEgFemsFslXQTIzFKFCL-7kTtDZAr_; NID=190=Ezp7tXRaU_Rs2BS9RprlsS9QN9-PcwpYNSLwaOVGVFFp6pWepIjDqsYlgyLqb2eATn6HwUNs-SmgzAmtEm63fgX-YWVgbOyX7GU1esPamrN-GWXfwmXyrsqsTBOOQTzsHB3Q89tATDNQE_OKGd0YgCxMp9m9QXke2BJANdKdBYujl-g5tS8ZXcq0pw; 1P_JAR=2019-11-05-17; DV=o32RqCcqMlgsAJonGalrPPWlv0DK4xZ24gV5ztaaewMAAAA"
            },
            {
              "name": ":scheme",
              "value": "https"
            },
            {
              "name": "sec-fetch-site",
              "value": "same-origin"
            },
            {
              "name": "content-length",
              "value": "0"
            },
            {
              "name": ":method",
              "value": "POST"
            }
          ],
          "queryString": [
            {
              "name": "oq",
              "value": ""
            },
            {
              "name": "gs_l",
              "value": "psy-ab.22...0.0..847450...0.0..0.0.0.......0......gws-wiz."
            }
          ],
          "cookies": [
            {
              "name": "CONSENT",
              "value": "YES+GB.en+20160414-00-0",
              "expires": null,
              "httpOnly": false,
              "secure": false
            },
            {
              "name": "SEARCH_SAMESITE",
              "value": "CgQIg44B",
              "expires": null,
              "httpOnly": false,
              "secure": false
            },
            {
              "name": "ANID",
              "value": "AHWqTUlE3OPRfM5R1dtW0XvyIu2NOdLWoSHEgFemsFslXQTIzFKFCL-7kTtDZAr_",
              "expires": null,
              "httpOnly": false,
              "secure": false
            },
            {
              "name": "NID",
              "value": "190=Ezp7tXRaU_Rs2BS9RprlsS9QN9-PcwpYNSLwaOVGVFFp6pWepIjDqsYlgyLqb2eATn6HwUNs-SmgzAmtEm63fgX-YWVgbOyX7GU1esPamrN-GWXfwmXyrsqsTBOOQTzsHB3Q89tATDNQE_OKGd0YgCxMp9m9QXke2BJANdKdBYujl-g5tS8ZXcq0pw",
              "expires": null,
              "httpOnly": false,
              "secure": false
            },
            {
              "name": "1P_JAR",
              "value": "2019-11-05-17",
              "expires": null,
              "httpOnly": false,
              "secure": false
            },
            {
              "name": "DV",
              "value": "o32RqCcqMlgsAJonGalrPPWlv0DK4xZ24gV5ztaaewMAAAA",
              "expires": null,
              "httpOnly": false,
              "secure": false
            }
          ],
          "headersSize": -1,
          "bodySize": 0
        },
        "response": {
          "status": 204,
          "statusText": "",
          "httpVersion": "http/2.0",
          "headers": [
            {
              "name": "date",
              "value": "Tue, 05 Nov 2019 17:38:46 GMT"
            },
            {
              "name": "server",
              "value": "gws"
            },
            {
              "name": "x-frame-options",
              "value": "SAMEORIGIN"
            },
            {
              "name": "content-type",
              "value": "text/html; charset=UTF-8"
            },
            {
              "name": "status",
              "value": "204"
            },
            {
              "name": "alt-svc",
              "value": "quic=\":443\"; ma=2592000; v=\"46,43\",h3-Q050=\":443\"; ma=2592000,h3-Q049=\":443\"; ma=2592000,h3-Q048=\":443\"; ma=2592000,h3-Q046=\":443\"; ma=2592000,h3-Q043=\":443\"; ma=2592000"
            },
            {
              "name": "content-length",
              "value": "0"
            },
            {
              "name": "x-xss-protection",
              "value": "0"
            }
          ],
          "cookies": [],
          "content": {
            "size": 0,
            "mimeType": "text/html"
          },
          "redirectURL": "",
          "headersSize": -1,
          "bodySize": -1,
          "_transferSize": 54
        },
        "cache": {},
        "timings": {
          "blocked": 1.1320006029605865,
          "dns": -1,
          "ssl": -1,
          "connect": -1,
          "send": 0.16199999999999992,
          "wait": 14.122000366747379,
          "receive": 0.25999918580055237,
          "_blocked_queueing": 0.5990006029605865
        },
        "serverIPAddress": "216.58.204.68",
        "_initiator": {
          "type": "script",
          "stack": {
            "callFrames": [
              {
                "functionName": "s_1pb",
                "scriptId": "129",
                "url": "https://www.google.com/xjs/_/js/k=xjs.s.en_GB.UISl_YucLj8.O/ck=xjs.s.or8k_ixGu54.L.W.O/m=Fkg7bd,HcFEGb,IvlUe,MC8mtf,OF7gzc,RMhBfe,T4BAC,TJw5qb,TbaHGc,Y33vzc,cdos,hsm,iDPoPb,jsa,mvYTse,tg8oTe,uz938c,vWNDde,ws9Tlc,yQ43ff,d,csi/am=BAAAsAjYuwOC_L8VAAQAfAYAAAFuwQYLhCGhYqwOEAE/d=1/dg=2/br=1/ct=zgms/rs=ACT90oGdwE1ooFdbHyz-Vk2BhYjwAv-QDQ",
                "lineNumber": 2323,
                "columnNumber": 376
              },

In this case, the request to https://www.google.com/gen_204?oq=&gs_l=psy-ab.22... is initiated by the script at https://www.google.com/xjs/_/js/k=xjs.s.en_GB.UISl.... You can see this information under the key "_initiator" -> "stack" -> "callFrames" -> "url".

The idea is to obtain this information (URLs that are calling others) or download the HAR file automatically using Selenium. I have tried this:

1. Selenium + browsermob proxy Problem: The resulting HAR files don't have the "Initiator" field, so there's no way to connect initiator requests and their dependencies.

2. Selenium Performance Logs I am using this code to get the performance logs from Selenium:

# Ask Chrome for its 'performance' log (level ALL) through the capabilities dict.
caps = DesiredCapabilities.CHROME
caps['loggingPrefs'] = {'performance': 'ALL'}

# NOTE(review): chrome_options is created here but never passed to the driver below.
chrome_options = webdriver.ChromeOptions()
# NOTE(review): driver_path is not defined in this snippet; it must be set beforehand.
driver = webdriver.Chrome(driver_path,desired_capabilities=caps)
driver.get("http://google.com")

# Retrieve the 'performance' log entries collected by the driver for this session.
browser_log = driver.get_log('performance')

Here, the only method where the "initiator" field can be found is in the "Network.requestWillBeSent" method which has a URL but it's not related to another one. Seems that each "message" field is independent and again, there's no way to connect initiator requests and their dependencies.

3. driver.execute_script I saw this code in this question: How to access Network panel on google chrome developer tools with selenium?
# Load the page, then ask the browser for its Performance API entries via JavaScript.
# (The original snippet was missing the closing quote on the chromedriver path,
# which made it a SyntaxError.)
driver = webdriver.Chrome('/path/to/chromedriver')
driver.get('https://www.google.com')
log = driver.execute_script("return window.performance.getEntries();")
#ANOTHER WAY
#log = driver.execute_script("var performance = window.performance || window.mozPerformance || window.msPerformance || window.webkitPerformance || {}; var network = performance.getEntries() || {}; return network;")

The resulting log is completely different to a HAR file or a performance log and it doesn't have any information that can be used to relate requests. Example:

{
    "startTime": 0,
    "initiatorType": "navigation",
    "unloadEventStart": 0,
    "fetchStart": 69.29999962449074,
    "duration": 1311.3000001758337,
    "responseStart": 172.89999965578318,
    "nextHopProtocol": "h2",
    "transferSize": 68052,
    "connectStart": 70.19999995827675,
    "domainLookupStart": 70.19999995827675,
    "redirectStart": 0,
    "domContentLoadedEventEnd": 504.90000005811453,
    "responseEnd": 190.7999999821186,
    "requestStart": 100.49999970942736,
    "type": "navigate",
    "secureConnectionStart": 80.40000032633543,
    "connectEnd": 99.99999962747097,
    "redirectCount": 0,
    "workerStart": 0,
    "decodedBodySize": 233300,
    "loadEventStart": 1304.8000000417233,
    "encodedBodySize": 67329,
    "serverTiming": [],
    "entryType": "navigation",
    "domInteractive": 487.699999473989,
    "domContentLoadedEventStart": 487.80000023543835,
    "redirectEnd": 0,
    "name": "https://www.google.com/?gws_rd=ssl",
    "domainLookupEnd": 70.19999995827675,
    "unloadEventEnd": 0,
    "loadEventEnd": 1311.3000001758337,
    "domComplete": 1303.3999996259809,
    "toJSON": {}
  },
  {
    "initiatorType": "img",
    "fetchStart": 298.20000007748604,
    "duration": 14.100000262260437,
    "responseStart": 310.70000026375055,
    "responseEnd": 312.3000003397465,
    "transferSize": 6146,
    "connectStart": 298.20000007748604,
    "domainLookupStart": 298.20000007748604,
    "redirectStart": 0,
    "toJSON": {},
    "requestStart": 299.80000015348196,
    "secureConnectionStart": 0,
    "connectEnd": 298.20000007748604,
    "workerStart": 0,
    "decodedBodySize": 5969,
    "startTime": 298.20000007748604,
    "encodedBodySize": 5969,
    "serverTiming": [],
    "entryType": "resource",
    "redirectEnd": 0,
    "name": "https://www.google.com/images/branding/googlelogo/1x/googlelogo_color_272x92dp.png",
    "domainLookupEnd": 298.20000007748604,
    "nextHopProtocol": "h2"
  },

Therefore, I think the HAR file downloaded from the developer tools Network panel is the only source of information that makes it possible to establish relationships between URL requests. Since I'm crawling many websites, I need to automatise the process but I can't find a way to do it.

Any ideas about downloading the HAR files automatically or extracting the info I need using Selenium directly would be appreciated.

like image 312
Adrian Avatar asked Nov 06 '22 12:11

Adrian


1 Answers

You can fetch the _initiator data from the page's raw_entry.

# Build a mapping "initiator URL -> list of request URLs it triggered" from a
# HAR file saved from the DevTools Network panel.
# NOTE(review): HarParser is presumably the class from the third-party
# `haralyzer` package; `json` and `collections.defaultdict` must also be
# imported before this snippet runs -- confirm.
with open('at.har', 'r') as f:
    har_parser = HarParser(json.load(f))
pages_root = har_parser.pages[0]
initiator_dict = {}
# NOTE(review): iterating the page object is assumed to yield its entries -- confirm.
for page in pages_root:
    # The original snippet read `initiator` without ever assigning it
    # (NameError). Per the answer text, the data lives in the entry's raw_entry
    # under "_initiator"; entries without that key are skipped.
    # NOTE(review): assumes raw_entry is the entry's plain dict -- confirm.
    initiator = page.raw_entry.get('_initiator', {})
    # Only initiators that carry a top-level "url" are recorded. In the HAR
    # sample above, a "script" initiator keeps its URLs under
    # stack -> callFrames instead, so such entries are not captured here.
    if "url" in initiator:
        initiator_dict[page.request.url] = initiator['url']
# Invert the mapping: group requested URLs (in sorted order) under their initiator.
res = defaultdict(list)
for key, val in sorted(initiator_dict.items()):
    res[val].append(key)
like image 54
feedthemachine Avatar answered Nov 14 '22 21:11

feedthemachine