I've been unable to run Selenium from within Databricks. I have followed the steps suggested in several other threads:
https://forums.databricks.com/questions/15480/how-to-add-webdriver-for-selenium-in-databricks.html
How to use Selenium in Databricks and accessing and moving downloaded files to mounted storage
cannot get selenium webdriver to work in azure databricks
My code currently looks like this:
%sh
# Add the canonical-chromium-builds/stage PPA as an apt source.
sudo add-apt-repository ppa:canonical-chromium-builds/stage
# Pipe `yes` into apt so any confirmation prompt is answered automatically.
/usr/bin/yes | sudo apt update
/usr/bin/yes | sudo apt install chromium-browser
import os
from webdrivermanager import ChromeDriverManager
from selenium import webdriver
# Download and install ChromeDriver; element 0 of the result is passed below
# as the driver's executable path.
cdd = ChromeDriverManager().download_and_install()
# Configure Chrome: no sandbox, headless, no /dev/shm usage, and a fixed
# remote debugging port.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument('--remote-debugging-port=9009')
# Start the browser session (this is the call that raises in the traceback below).
driver = webdriver.Chrome(executable_path=cdd[0], options=chrome_options)
# Test driver connection
driver.get("https://www.google.com")
driver.find_element_by_css_selector("img").get_attribute("alt")
Exception:
---------------------------------------------------------------------------
WebDriverException Traceback (most recent call last)
<command-2232618947863762> in <module>
12 chrome_driver = "/usr/bin/chromedriver"
13
---> 14 driver = webdriver.Chrome(executable_path=cdd[0], options=chrome_options)
15
16 # Test driver connection
/databricks/python/lib/python3.8/site-packages/selenium/webdriver/chrome/webdriver.py in __init__(self, executable_path, port, options, service_args, desired_capabilities, service_log_path, chrome_options, keep_alive)
74
75 try:
---> 76 RemoteWebDriver.__init__(
77 self,
78 command_executor=ChromeRemoteConnection(
/databricks/python/lib/python3.8/site-packages/selenium/webdriver/remote/webdriver.py in __init__(self, command_executor, desired_capabilities, browser_profile, proxy, keep_alive, file_detector, options)
155 warnings.warn("Please use FirefoxOptions to set browser profile",
156 DeprecationWarning, stacklevel=2)
--> 157 self.start_session(capabilities, browser_profile)
158 self._switch_to = SwitchTo(self)
159 self._mobile = Mobile(self)
/databricks/python/lib/python3.8/site-packages/selenium/webdriver/remote/webdriver.py in start_session(self, capabilities, browser_profile)
250 parameters = {"capabilities": w3c_caps,
251 "desiredCapabilities": capabilities}
--> 252 response = self.execute(Command.NEW_SESSION, parameters)
253 if 'sessionId' not in response:
254 response = response['value']
/databricks/python/lib/python3.8/site-packages/selenium/webdriver/remote/webdriver.py in execute(self, driver_command, params)
319 response = self.command_executor.execute(driver_command, params)
320 if response:
--> 321 self.error_handler.check_response(response)
322 response['value'] = self._unwrap_value(
323 response.get('value', None))
/databricks/python/lib/python3.8/site-packages/selenium/webdriver/remote/errorhandler.py in check_response(self, response)
240 alert_text = value['alert'].get('text')
241 raise exception_class(message, screen, stacktrace, alert_text)
--> 242 raise exception_class(message, screen, stacktrace)
243
244 def _value_or_default(self, obj, key, default):
WebDriverException: Message: unknown error: Chrome failed to start: exited abnormally.
(chrome not reachable)
(The process started from chrome location /usr/bin/chromium-browser is no longer running, so ChromeDriver is assuming that Chrome has crashed.)
What worked for me was this method which is from the source you linked in the question by kindofhungry.
I was not able to use the web driver manager successfully in Databricks.
%sh
# Install the Selenium Python package with pip.
pip install selenium
# imports needed for notebook
from datetime import datetime
import dateutil.relativedelta
import os
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
import urllib.request, json
Download and unzip the Chrome driver, making sure it's the latest version. https://chromedriver.storage.googleapis.com stopped giving me the latest version, so I now use Python to find the latest version of the Chrome driver and then use that URL to download it. I learned how to pass the variable to the shell script from this post.
# Look up the current Stable ChromeDriver build in the Chrome for Testing
# "last known good versions" feed and expose its download URL to %sh cells.
with urllib.request.urlopen("https://googlechromelabs.github.io/chrome-for-testing/last-known-good-versions-with-downloads.json") as response:
    data = json.load(response)
stable = data['channels']['Stable']
print(stable['version'])
# Pick the Linux x64 build by its platform tag rather than by list position,
# so a reordering of the feed cannot silently hand back a macOS/Windows driver.
url = next(
    (
        entry['url']
        for entry in stable['downloads']['chromedriver']
        if entry['platform'] == 'linux64'
    ),
    None,
)
if url is None:
    raise RuntimeError("No linux64 chromedriver download listed for the Stable channel")
print(url)
# set the url as environment variable to use in scripting
os.environ['url'] = url
%sh
# Fetch the ChromeDriver archive whose URL was exported from Python as $url.
# -N is dropped: wget ignores timestamping when -O is given and only warns.
# The variable is quoted so the URL is always passed as a single argument.
wget "$url" -O /tmp/chromedriver_linux64.zip
# -o overwrites files left by a previous run instead of prompting, which
# would stall a non-interactive notebook cell.
unzip -o /tmp/chromedriver_linux64.zip -d /tmp/chromedriver/
I needed to get Ubuntu updates; see this post.
%sh
# Remove the cached apt package lists, clear the package cache, and then
# rebuild the package index.
sudo rm -r /var/lib/apt/lists/*
sudo apt clean && sudo apt update --fix-missing -y
add chrome and necessary packages
%sh
# Trust Google's package signing key. sudo belongs on apt-key (the command
# that needs root), and "-" makes it read the key from the pipe explicitly.
curl -sS https://dl-ssl.google.com/linux/linux_signing_key.pub | sudo apt-key add -
# Register the Chrome repository. tee runs under sudo (a shell redirect after
# "sudo echo" does not) and overwrites the file, so re-running the cell does
# not append duplicate entries.
echo "deb https://dl.google.com/linux/chrome/deb/ stable main" | sudo tee /etc/apt/sources.list.d/google-chrome.list > /dev/null
sudo apt-get -y update
sudo apt-get -y install google-chrome-stable
set up the browser
def init_chrome_browser(download_path, chrome_driver_path, url):
    """Start a headless Chrome session and open *url*.

    Args:
        download_path: Directory Chrome should save downloads into.
        chrome_driver_path: Path to the ChromeDriver executable.
        url: Page to load once the browser has started.

    Returns:
        The ``webdriver.Chrome`` instance, already navigated to *url*.

    Raises:
        Whatever Selenium raises if Chrome cannot start or the page cannot
        be loaded. If the initial navigation fails, the browser is shut
        down before the exception propagates.
    """
    options = Options()
    prefs = {
        'download.default_directory': download_path,
        'profile.default_content_setting_values.automatic_downloads': 1,
        "download.prompt_for_download": False,
        "download.directory_upgrade": True,
        "safebrowsing.enabled": True,
        "translate_whitelists": {"vi": "en"},
        "translate": {"enabled": "true"},
    }
    options.add_experimental_option('prefs', prefs)
    options.add_argument('--no-sandbox')
    options.add_argument('--headless')  # wont work without this feature in databricks can't display browser
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--start-maximized')
    options.add_argument('window-size=2560,1440')
    options.add_argument('--ignore-certificate-errors')
    options.add_argument('--ignore-ssl-errors')
    options.add_argument('--lang=en')
    options.add_experimental_option('excludeSwitches', ['enable-logging'])
    print(f"{datetime.now()} Launching Chrome...")
    browser = webdriver.Chrome(service=Service(chrome_driver_path), options=options)
    print(f"{datetime.now()} Chrome launched.")
    try:
        browser.get(url)
    except Exception:
        # The caller never receives the handle on failure, so quit here to
        # avoid leaving orphaned Chrome/ChromeDriver processes on the driver node.
        browser.quit()
        raise
    print(f"{datetime.now()} Browser ready to use.")
    return browser
Now you can test it. Change the URL as needed. You can also change the download path: I was able to point to folders in my data lake in blob storage and download items there by clicking on download links. Just wait a few seconds before you close the browser to give the download time to finish.
# Smoke test: start the browser with the driver unpacked under /tmp and load
# a known page.
driver = init_chrome_browser(
    download_path="/tmp/downloads",
    chrome_driver_path="/tmp/chromedriver/chromedriver-linux64/chromedriver",
    url="https://www.google.com",
)
Now you should be able to run your code
# Selenium 4.3+ removed the find_element_by_* helpers; locate elements with
# find_element and a By strategy instead.
from selenium.webdriver.common.by import By
driver.find_element(By.CSS_SELECTOR, "img").get_attribute("alt")
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With