Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Headless endless scroll selenium

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import urllib,requests,unidecode,lxml,pdb
from pyvirtualdisplay import Display
from xvfbwrapper import Xvfb
class wait_for_more_than_n_elements_to_be_present(object):
    """Custom Selenium expected-condition.

    Becomes truthy once the number of elements matched by *locator*
    exceeds *count*; usable with WebDriverWait.until().
    """

    def __init__(self, locator, count):
        self.locator = locator
        self.count = count

    def __call__(self, driver):
        try:
            found = EC._find_elements(driver, self.locator)
        except StaleElementReferenceException:
            # The DOM is being re-rendered; report "not yet" and let
            # WebDriverWait poll again.
            return False
        return len(found) > self.count

def return_html_code(url):
    """Open *url* in Firefox under a virtual display (Xvfb), scroll until
    no more tweets are loaded, and return the fully-expanded page source.

    Args:
        url: Twitter search URL to load.

    Returns:
        The page HTML after all dynamically loaded tweets are present.
    """
    print(url)  # progress trace (added in edit 1)
    vdisplay = Xvfb()
    vdisplay.start()
    driver = webdriver.Firefox()
    try:
        driver.maximize_window()
        driver.get(url)
        # Initial wait for the first batch of tweets to render.
        wait = WebDriverWait(driver, 240)
        wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "li[data-item-id]")))
        # Scroll down to the last tweet until no more tweets are loaded.
        while True:
            tweets = driver.find_elements_by_css_selector("li[data-item-id]")
            # BUG FIX: `number_of_tweets` was used below but never assigned,
            # which raised a NameError on the first wait.until() call.
            number_of_tweets = len(tweets)
            print(number_of_tweets)  # progress trace (added in edit 1)
            driver.execute_script("arguments[0].scrollIntoView(true);", tweets[-1])
            try:
                wait.until(wait_for_more_than_n_elements_to_be_present(
                    (By.CSS_SELECTOR, "li[data-item-id]"), number_of_tweets))
            except TimeoutException:
                # No additional tweets appeared within the timeout: done.
                break
        # BUG FIX: the tail of the original had broken (one-space)
        # indentation, never returned the captured source, and recursively
        # called return_html_code(url) instead of returning.
        return driver.page_source
    finally:
        # Always release the browser and the virtual display, even if the
        # initial wait times out (see the TimeoutException in edit 3).
        driver.quit()
        vdisplay.stop()

Output:

https://twitter.com/search?q=Error%20Check&src=typd&lang=en
20
39
56
74

I have the above code for endlessly scrolling a page in headless mode. But somehow it seems to stop before reaching the end. Reference: https://stackoverflow.com/a/31058403/3646408

Edit 1:

$ phantomjs --version
2.1.1

On running @alecxe's code it showed different output in 2 runs; the date check makes it clear that there are more tweets:

https://twitter.com/search?q=Error%20Check&src=typd&lang=en
20
40
59
76
95
114
133
152
171
191
211
231
249
267
Date of most old tweet: 12 Jan 2016


https://twitter.com/search?q=Error%20Check&src=typd&lang=en
20
40
59
76
95
114
133
152
171
191
211
231
249
267
287
303
317
337
356
373
388
400
418
437
457
476
492
Date of most old tweet: 8 Jan 2016

Edit2:

On running the updated version of @alecxe's code, it showed the below error after ~7000 tweets.

    Traceback (most recent call last):
      File "twitter_script.py", line 82, in <module>
        search_twitter('Alcoholics Anonymous')
      File "twitter_script.py", line 76, in search_twitter
        db_name=write_data_to_db(*get_twitter_data(query))
      File "twitter_script.py", line 24, in get_twitter_data
        html_full=return_html_code(url)
      File "c:\Users\sony\Desktop\social_network_extract_old\social_network_extract\scrollDownHtmlCode.py", line 48, in return_html_code
        html_full_source=driver.page_source
      File "c:\Anaconda\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 464, in page_source
        return self.execute(Command.GET_PAGE_SOURCE)['value']
      File "c:\Anaconda\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 199, in execute
        response = self.command_executor.execute(driver_command, params)
      File "c:\Anaconda\lib\site-packages\selenium\webdriver\remote\remote_connection.py", line 395, in execute
        return self._request(command_info[0], url, body=data)
      File "c:\Anaconda\lib\site-packages\selenium\webdriver\remote\remote_connection.py", line 463, in _request
        resp = opener.open(request, timeout=self._timeout)
      File "c:\Anaconda\lib\urllib2.py", line 431, in open
        response = self._open(req, data)
      File "c:\Anaconda\lib\urllib2.py", line 449, in _open
        '_open', req)
      File "c:\Anaconda\lib\urllib2.py", line 409, in _call_chain
        result = func(*args)
      File "c:\Anaconda\lib\urllib2.py", line 1227, in http_open
        return self.do_open(httplib.HTTPConnection, req)
      File "c:\Anaconda\lib\urllib2.py", line 1200, in do_open
        r = h.getresponse(buffering=True)
      File "c:\Anaconda\lib\httplib.py", line 1136, in getresponse
        response.begin()
      File "c:\Anaconda\lib\httplib.py", line 453, in begin
        version, status, reason = self._read_status()
      File "c:\Anaconda\lib\httplib.py", line 409, in _read_status
        line = self.fp.readline(_MAXLINE + 1)
      File "c:\Anaconda\lib\socket.py", line 480, in readline
        data = self._sock.recv(self._rbufsize)
    socket.error: [Errno 10054] An existing connection was forcibly closed by the remote host

Edit 3: Trying the same code for different url.

https://twitter.com/search?q=Alcoholics%20Anonymous%20Drunk%20since%3A2006-03-24%20until%3A2006-04-23&src=typd&lang=en
Traceback (most recent call last):
  File "twitter_script.py", line 64, in <module>
    search_twitter('Alcoholics Anonymous Drunk')
  File "twitter_script.py", line 58, in search_twitter
    db_name=write_data_to_db(*get_twitter_data(query))
  File "twitter_script.py", line 31, in get_twitter_data
    html_full=return_html_code(url)
  File "c:\Users\sony\Desktop\social_network_extract_old\social_network_extract\scrollDownHtmlCode.py", line 30, in return_html_code
    wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "li[data-item-id]")))
  File "c:\Anaconda\lib\site-packages\selenium\webdriver\support\wait.py", line 80, in until
    raise TimeoutException(message, screen, stacktrace)
selenium.common.exceptions.TimeoutException: Message:
Screenshot: available via screen

Edit 4:

ubuntu@ip-172-31-38-123:~/social_network_extract_proxy$ cat error.txt 
Traceback (most recent call last):
  File "twitter_script.py", line 70, in <module>
    search_twitter('alcoholics anonymous')
  File "twitter_script.py", line 64, in search_twitter
    db_name=write_data_to_db(*get_twitter_data(query))
  File "twitter_script.py", line 37, in get_twitter_data
    html_full=return_html_code(url)
  File "/home/ubuntu/social_network_extract_proxy/firefox_driver_code.py", line 35, in return_html_code
    driver=webdriver.Firefox(firefox_profile=profile)
  File "/home/ubuntu/anaconda2/lib/python2.7/site-packages/selenium/webdriver/firefox/webdriver.py", line 79, in __init__
    self.binary, timeout),
  File "/home/ubuntu/anaconda2/lib/python2.7/site-packages/selenium/webdriver/firefox/extension_connection.py", line 49, in __init__
    self.binary.launch_browser(self.profile)
  File "/home/ubuntu/anaconda2/lib/python2.7/site-packages/selenium/webdriver/firefox/firefox_binary.py", line 68, in launch_browser
    self._wait_until_connectable()
  File "/home/ubuntu/anaconda2/lib/python2.7/site-packages/selenium/webdriver/firefox/firefox_binary.py", line 106, in _wait_until_connectable
    % (self.profile.path))
selenium.common.exceptions.WebDriverException: Message: Can't load the profile. Profile Dir: /tmp/tmpvFoPrE If you specified a log_file in the FirefoxBinary constructor, check it for details.

Got the above error after a while.

like image 476
Abhishek Bhatia Avatar asked Jan 22 '16 08:01

Abhishek Bhatia


1 Answer

Here is a set of things that made it work for me in headless mode:

  • switch to PhantomJS
  • pretend to be a different browser by setting a custom User-Agent string
  • before scrolling into view of the last tweet, scroll to the top of the page (several times to increase reliability)

The code:

import time

def return_html_code(url):
    """Headless (PhantomJS) variant: load *url*, keep scrolling until no
    more tweets appear, and return the final page source.

    Args:
        url: Twitter search URL to load.

    Returns:
        The page HTML after all dynamically loaded tweets are present.
    """
    dcap = dict(webdriver.DesiredCapabilities.PHANTOMJS)
    # Pretend to be a desktop browser: Twitter serves the full page only
    # to a recognized User-Agent.
    dcap["phantomjs.page.settings.userAgent"] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.57 Safari/537.36"

    driver = webdriver.PhantomJS(desired_capabilities=dcap)
    try:
        driver.maximize_window()
        driver.get(url)

        # Initial wait for the tweets to load.
        wait = WebDriverWait(driver, 30)
        wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "li[data-item-id]")))
        # Scroll down to the last tweet until there is no more tweets loaded.
        while True:
            tweets = driver.find_elements_by_css_selector("li[data-item-id]")
            number_of_tweets = len(tweets)
            print(number_of_tweets)

            # Jump to the top and back to the last tweet several times in a
            # row: this makes the infinite-scroll trigger fire reliably in
            # headless mode.
            for _ in range(5):
                driver.execute_script("window.scrollTo(0, 0)")
                driver.execute_script("arguments[0].scrollIntoView(true);", tweets[-1])
                time.sleep(0.5)

            try:
                wait.until(wait_for_more_than_n_elements_to_be_present(
                    (By.CSS_SELECTOR, "li[data-item-id]"), number_of_tweets))
            except TimeoutException:
                break
        # BUG FIX: the original version never returned the collected HTML
        # and never shut the browser down, leaking a PhantomJS process per
        # call (the likely cause of the dropped-connection error in edit 2).
        return driver.page_source
    finally:
        driver.quit()
like image 162
alecxe Avatar answered Oct 23 '22 10:10

alecxe