Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Data scraping from forexfactory.com

I am a beginner in Python. In this question they extracted data from Forex Factory. At that time the solution worked with their logic of finding the table via soup.find('table', class_="calendar__table"). But the site's structure has since changed: the HTML table was removed and is now rendered by JavaScript, so this solution no longer finds anything.

 import requests
from bs4 import BeautifulSoup

r = requests.get('http://www.forexfactory.com/calendar.php?day=nov18.2016')
soup = BeautifulSoup(r.text, 'lxml')

calendar_table = soup.find('table', class_="calendar__table")

print(calendar_table)


# for row in calendar_table.find_all('tr', class_=['calendar__row calendar_row','newday']):
#     row_data = [td.get_text(strip=True) for td in row.find_all('td')]
#     print(row_data)

enter image description here

As I am a beginner, I have no idea how to do that. So, how can I scrape the data? Any hints would be helpful. Thanks a lot for reading my post.

like image 471
Phi Avatar asked May 08 '26 02:05

Phi


2 Answers

Currently they have implemented some Cloudflare protection, so BeautifulSoup alone can't collect the data. We have to use Selenium for that.

Example Working Code:

import random
import selenium
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

def create_driver():
    """Build a headless Chrome WebDriver with a randomized user agent.

    Returns:
        selenium.webdriver.Chrome: a ready-to-use headless driver.
    """
    # Rotate through a handful of realistic desktop user agents so
    # repeated runs do not all present the same fingerprint.
    user_agent_list = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 11.5; rv:90.0) Gecko/20100101 Firefox/90.0',
        'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_5_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36',
        'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:90.0) Gecko/20100101 Firefox/90.0',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36'
    ]
    user_agent = random.choice(user_agent_list)

    browser_options = webdriver.ChromeOptions()
    browser_options.add_argument("--no-sandbox")
    browser_options.add_argument("--headless")
    browser_options.add_argument("start-maximized")
    browser_options.add_argument("window-size=1900,1080")
    browser_options.add_argument("disable-gpu")
    browser_options.add_argument("--disable-software-rasterizer")
    browser_options.add_argument("--disable-dev-shm-usage")
    browser_options.add_argument(f'user-agent={user_agent}')

    # Selenium 4 removed the `service_args` keyword on webdriver.Chrome;
    # chromedriver flags must be passed through a Service object instead.
    from selenium.webdriver.chrome.service import Service
    service = Service(service_args=["--verbose", "--log-path=test.log"])

    driver = webdriver.Chrome(options=browser_options, service=service)

    return driver

def parse_data(driver, url):
    """Load *url* in *driver* and scrape the calendar table.

    Returns a list of rows, where each row is the list of its non-empty
    cell texts; rows with no text at all are skipped.
    """
    driver.get(url)

    table = driver.find_element(By.CLASS_NAME, "calendar__table")

    rows = []
    for tr in table.find_elements(By.TAG_NAME, "tr"):
        # Keep only cells that actually contain text.
        cells = [td.text for td in tr.find_elements(By.TAG_NAME, "td")]
        cells = [text for text in cells if text]
        if cells:
            rows.append(cells)
    return rows

driver = create_driver()
url = 'https://www.forexfactory.com/calendar?day=aug26.2021'

value_list = parse_data(driver=driver, url=url)

for value in value_list:
    # Rows that start a new day carry the date in their first cell, with a
    # newline separating weekday and date; print it as a header line.
    first = value[0]
    if '\n' in first:
        value.pop(0)
        date_str = first.replace('\n', ' - ')
        print(f'Date: {date_str}')
    print(value)

Output:

Date: Thu - Aug 26
['2:00am', 'EUR', 'German GfK Consumer Climate', '-1.2', '-0.5', '-0.4']
['4:00am', 'EUR', 'M3 Money Supply y/y', '7.6%', '7.6%', '8.3%']
['EUR', 'Private Loans y/y', '4.2%', '4.1%', '4.0%']
['7:30am', 'EUR', 'ECB Monetary Policy Meeting Accounts']
['8:30am', 'USD', 'Prelim GDP q/q', '6.6%', '6.7%', '6.5%']
['USD', 'Unemployment Claims', '353K', '345K', '349K']
['USD', 'Prelim GDP Price Index q/q', '6.1%', '6.0%', '6.0%']
['10:30am', 'USD', 'Natural Gas Storage', '29B', '40B', '46B']
['Day 1', 'All', 'Jackson Hole Symposium']
['5:00pm', 'USD', 'President Biden Speaks']
['7:30pm', 'JPY', 'Tokyo Core CPI y/y', '0.0%', '-0.1%', '0.1%']
['9:30pm', 'AUD', 'Retail Sales m/m', '-2.7%', '-2.6%', '-1.8%']
like image 117
Sabil Avatar answered May 10 '26 14:05

Sabil


As you've tagged this question with selenium, this answer relies on Selenium. I am using webdriver manager for ease.

from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

# Selenium 4 removed the positional `executable_path` argument on
# webdriver.Chrome; the driver path from webdriver-manager must be
# wrapped in a Service object instead.
from selenium.webdriver.chrome.service import Service

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

try:
    driver.get("http://www.forexfactory.com/calendar.php?day=nov18.2016")
    # Get the table
    table = driver.find_element(By.CLASS_NAME, "calendar__table")
    # Iterate over each table row
    for row in table.find_elements(By.TAG_NAME, "tr"):
        # list comprehension to get each cell's data and filter out empty cells
        row_data = list(filter(None, [td.text for td in row.find_elements(By.TAG_NAME, "td")]))
        if row_data == []:
            continue
        print(row_data)
except Exception as e:
    print(e)
finally:
    # Always release the browser, even if scraping failed.
    driver.quit()

This currently prints out:

['Fri\nNov 18', '2:00am', 'EUR', 'German PPI m/m', '0.7%', '0.3%', '-0.2%']
['3:30am', 'EUR', 'ECB President Draghi Speaks']
['4:00am', 'EUR', 'Current Account', '25.3B', '31.3B', '29.1B']
['4:10am', 'GBP', 'MPC Member Broadbent Speaks']
['5:30am', 'CHF', 'Gov Board Member Maechler Speaks']
['EUR', 'German Buba President Weidmann Speaks']
['USD', 'FOMC Member Bullard Speaks']
['8:30am', 'CAD', 'Core CPI m/m', '0.2%', '0.3%', '0.2%']
['CAD', 'CPI m/m', '0.2%', '0.2%', '0.1%']
['9:30am', 'USD', 'FOMC Member Dudley Speaks']
['USD', 'FOMC Member George Speaks']
['10:00am', 'USD', 'CB Leading Index m/m', '0.1%', '0.1%', '0.2%']
['9:45pm', 'USD', 'FOMC Member Powell Speaks']

The data it's printing is just to show that it can extract the data, you will need to change and format it as you see fit.

like image 33
Lucan Avatar answered May 10 '26 14:05

Lucan



Donate For Us

If you love us, you can donate to us via PayPal or buy us a coffee so we can maintain and grow. Thank you!