I'm using Selenium/python to automatically scroll down a social media website and scrape posts. I'm currently extracting all the text in one "hit" after scrolling a certain number of times (code below), but instead I want to extract just the newly-loaded text after each scroll.
For example, if the page initially contained the text "A, B, C", then after the first scroll it displayed "D, E, F", I'd want to store "A, B, C", then scroll, then store "D, E, F" etc.
The specific items I want to extract are the dates of the posts and the message text, which can be obtained with the CSS selectors '.message-date' and '.message-body', respectively (e.g., dates = driver.find_elements_by_css_selector('.message-date')).
Can anyone advise on how to extract just the newly-loaded text after each scroll?
Here's my current code (which extracts all the dates/messages after I finish scrolling):
from selenium import webdriver
import sys
import time
from selenium.webdriver.common.keys import Keys

# load website to scrape
driver = webdriver.PhantomJS()
driver.get("https://stocktwits.com/symbol/USDJPY?q=%24USDjpy")

# Scroll the webpage
ScrollNumber = 3  # max scrolls
print(str(ScrollNumber) + " scrolldown will be done.")

for i in range(1, ScrollNumber):  # scroll down X times
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(3)  # Delay between 2 scrolls down to be sure the page loaded

    ## I WANT TO SAVE/STORE ANY NEWLY LOADED POSTS HERE RATHER
    ## THAN EXTRACTING IT ALL IN ONE GO AT THE END OF THE LOOP

# Extract messages and dates.
## I WANT TO EXTRACT THIS DATA ON THE FLY IN THE ABOVE
## LOOP RATHER THAN EXTRACTING IT HERE
dates = driver.find_elements_by_css_selector('.message-date')
messages = driver.find_elements_by_css_selector('.message-body')
You can store the number of posts you have already collected in a variable and use XPath with position() to get only the newly added posts:
dates = []
messages = []
num_of_posts = 0

for i in range(1, ScrollNumber):
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(3)
    # Only grab the elements past the ones already collected
    dates.extend(driver.find_elements_by_xpath(
        '(//div[@class="message-date"])[position() > ' + str(num_of_posts) + ']'))
    messages.extend(driver.find_elements_by_xpath(
        '(//div[contains(@class, "message-body")])[position() > ' + str(num_of_posts) + ']'))
    num_of_posts = len(dates)
I had the same issue with Facebook posts. For that, I save the post ID (or any value that's unique to the post, even a hash) in a list, and then when you make the query again, you check whether that ID is already in your list. You can also remove the DOM nodes that have already been parsed, so only the new ones exist.
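A minimal sketch of that approach, reusing driver, time, and ScrollNumber from the question's code (the 'li.messageli' selector is an assumption; any value unique to each post works as the key):

import hashlib

seen = set()
for _ in range(ScrollNumber):
    for post in driver.find_elements_by_css_selector('li.messageli'):
        # use any value unique to the post; here, a hash of its text
        key = hashlib.md5(post.text.encode('utf-8')).hexdigest()
        if key in seen:
            continue
        seen.add(key)
        # ... store this post's date/message here ...
        # optionally drop the parsed node from the DOM so only new ones remain:
        # driver.execute_script("arguments[0].remove();", post)
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(3)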
As others have said, if you can do what you need to do by hitting the API directly, that's your best bet. If you absolutely must use Selenium, see my solution below.
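For reference, the direct-API route might look like the sketch below; the endpoint URL and the response field names are assumptions based on StockTwits' public JSON API and may have changed or require authentication, so verify them before relying on this:

import requests  # assumes the requests package is installed

# Assumed endpoint; StockTwits has historically served public symbol streams as JSON
resp = requests.get("https://api.stocktwits.com/api/2/streams/symbol/USDJPY.json")
resp.raise_for_status()
for msg in resp.json().get("messages", []):
    # 'created_at' and 'body' are assumed field names
    print(msg["created_at"], msg["body"])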
I do something similar to the below for my needs. It uses the :nth-child() aspect of CSS selectors to find each element individually as it loads, and it uses explicit waits (via the explicit package: pip install explicit) to wait for elements efficiently. The script is quite fast (no calls to sleep()); however, the webpage itself has so much junk going on in the background that it often takes a while for selenium to return control to the script.
from __future__ import print_function

from itertools import count

from explicit import waiter, CSS
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait as Wait

# The CSS selectors we will use
POSTS_BASE_CSS = 'ol.stream-list > li'               # All li elements
POST_BASE_CSS = POSTS_BASE_CSS + ":nth-child({0})"   # li child element at index {0}
POST_DATE_CSS = POST_BASE_CSS + ' div.message-date'  # li child element at {0} with div.message-date
POST_BODY_CSS = POST_BASE_CSS + ' div.message-body'  # li child element at {0} with div.message-body


class Post(object):
    def __init__(self, driver, post_index):
        self.driver = driver
        self.date_css = POST_DATE_CSS.format(post_index)
        self.text_css = POST_BODY_CSS.format(post_index)

    @property
    def date(self):
        return waiter.find_element(self.driver, self.date_css, CSS).text

    @property
    def text(self):
        return waiter.find_element(self.driver, self.text_css, CSS).text


def get_posts(driver, url, max_screen_scrolls):
    """ Post object generator """
    driver.get(url)
    screen_scroll_count = 0

    # Wait for the initial posts to load:
    waiter.find_elements(driver, POSTS_BASE_CSS, CSS)

    for index in count(1):
        # Evaluate if we need to scroll the screen, or exit the generator.
        # If there is no element at this index, we need to scroll the screen.
        if len(driver.find_elements_by_css_selector('ol.stream-list > :nth-child({0})'.format(index))) == 0:
            if screen_scroll_count >= max_screen_scrolls:
                # Break if we have already done the max scrolls
                break

            # Get count of total posts on page
            post_count = len(waiter.find_elements(driver, POSTS_BASE_CSS, CSS))

            # Scroll down
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            screen_scroll_count += 1

            def posts_load(driver):
                """ Custom explicit wait function; waits for more posts to load in """
                return len(waiter.find_elements(driver, POSTS_BASE_CSS, CSS)) > post_count

            # Wait until new posts load in
            Wait(driver, 20).until(posts_load)

        # The list elements have sponsored ads and scripts mixed in with the
        # posts we want to scrape. Check if they have a div.message-date
        # element and continue on if not.
        includes_date_css = POST_DATE_CSS.format(index)
        if len(driver.find_elements_by_css_selector(includes_date_css)) == 0:
            continue

        yield Post(driver, index)


def main():
    url = "https://stocktwits.com/symbol/USDJPY?q=%24USDjpy"
    max_screen_scrolls = 4
    driver = webdriver.Chrome()
    try:
        for post_num, post in enumerate(get_posts(driver, url, max_screen_scrolls), 1):
            print("*" * 40)
            print("Post #{0}".format(post_num))
            print("\nDate: {0}".format(post.date))
            print("Text: {0}\n".format(post.text[:34]))
    finally:
        driver.quit()  # Use try/finally to make sure the driver is closed


if __name__ == "__main__":
    main()
Full disclosure: I'm the creator of the explicit package. You could easily rewrite the above using explicit waits directly, at the expense of readability.
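For example, each waiter.find_element call above corresponds roughly to this vanilla explicit wait, a sketch using only standard Selenium:

from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

def find_element(driver, css_selector, timeout=30):
    """Wait until the element is present in the DOM, then return it."""
    return WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, css_selector)))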
This does exactly what you want. But I wouldn't scrape the site this way: it'll just get slower and slower the longer it runs, and RAM usage will spiral out of control too.
import time
from hashlib import md5

import selenium.webdriver.support.ui as ui
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

URL = 'https://stocktwits.com/symbol/USDJPY?q=%24USDjpy'
CSS = By.CSS_SELECTOR

driver = webdriver.PhantomJS()  # the question used PhantomJS; any WebDriver works
driver.get(URL)


def scrape_for(max_seconds=300):
    found = set()
    end_at = time.time() + max_seconds
    wait = ui.WebDriverWait(driver, 5, 0.5)

    while True:
        # find elements
        elms = driver.find_elements(CSS, 'li.messageli')
        for li in elms:
            # get the information we need about each post
            text = li.find_element(CSS, 'div.message-content')
            key = md5(text.text.encode('ascii', 'ignore')).hexdigest()
            if key in found:
                continue
            found.add(key)
            try:
                date = li.find_element(CSS, 'div.message-date').text
            except NoSuchElementException:
                date = None
            yield text.text, date

        if time.time() > end_at:
            return  # raising StopIteration inside a generator is an error in Python 3.7+

        driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
        wait.until(EC.invisibility_of_element_located(
            (CSS, 'div#more-button-loading')))


for twit in scrape_for(60):
    print(twit)

driver.quit()