How to scrape a website that requires login first with Python

First of all, I know there are a bunch of similar questions, but NONE of their answers works for me...

I'm a newbie to Python, HTML and web scraping. I'm trying to scrape user information from a website that requires logging in first. In my tests I scrape my own email settings from GitHub as an example. The login page is 'https://github.com/login' and the target page is 'https://github.com/settings/emails'.

Here is a list of the methods I've tried:

# The six login attempts from the question, one per section. Each section is
# meant to be run as its own standalone script (Python 2: cookielib, urllib2).
# The logic of every attempt is kept as originally written; only the line
# structure has been restored and review notes added.

#####################################
# Method 1: mechanize, submit the login form, then open the target page
#####################################
import mechanize
import cookielib
from BeautifulSoup import BeautifulSoup
import html2text

br = mechanize.Browser()
cj = cookielib.LWPCookieJar()
br.set_cookiejar(cj)

# Browser options
br.set_handle_equiv(True)
br.set_handle_gzip(True)
br.set_handle_redirect(True)
br.set_handle_referer(True)
br.set_handle_robots(False)
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)

br.addheaders = [('User-agent', 'Chrome')]

# The site we will navigate into, handling its session
br.open('https://github.com/login')

# Print every form found on the login page
for f in br.forms():
    print(f)

# NOTE(review): nr=0 selects the first form on the page -- confirm against the
# printed list that this is the login form and not some other form.
br.select_form(nr=0)

# User credentials
br.form['login'] = 'myusername'
br.form['password'] = 'mypwd'

# Login
br.submit()

# NOTE(review): this URL has no 'https://' scheme, unlike the login URL above,
# and the page content returned by read() is discarded.
br.open('github.com/settings/emails').read()


################
# Method 2: urllib2 opener with a cookie jar, POSTing the credentials
################
import urllib
import urllib2
import cookielib

username = 'myusername'
password = 'mypwd'

cj = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
# NOTE(review): the field names here ('username', 'j_password') differ from
# the 'login' / 'password' names used in Method 1 -- verify against the form.
login_data = urllib.urlencode({'username': username, 'j_password': password})
opener.open('https://github.com/login', login_data)
resp = opener.open('https://github.com/settings/emails')
print(resp.read())


#############
# Method 3: credentials embedded directly in the URL
#############
import urllib

opener = urllib.FancyURLopener()
print(opener.open('http://myusername:mypwd@github.com/settings/emails').read())


##########
# Method 4: mechanize with add_password on the target URL (no form submit)
##########
import mechanize
import cookielib

br = mechanize.Browser()
cj = cookielib.LWPCookieJar()
br.set_cookiejar(cj)

br.set_handle_equiv(True)
br.set_handle_gzip(True)
br.set_handle_redirect(True)
br.set_handle_referer(True)
br.set_handle_robots(False)
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
#br.set_debug_http(True)
#br.set_debug_redirects(True)
#br.set_debug_responses(True)

br.addheaders = [('User-agent', 'Chrome')]

br.add_password('https://github.com/settings/emails', 'myusername', 'mypwd')
br.open('https://github.com/settings/emails')
print(br.response().read())


############
# Method 5: requests session, POSTing a payload to the login URL
############
from requests import session

payload = {
    'action': 'login',
    'username': 'myusername',
    'password': 'mypwd'
}

with session() as c:
    c.post('https://github.com/login', data=payload)
    request = c.get('https://github.com/settings/emails')
    print(request.headers)
    print(request.text)


###########
# Method 6: requests.Session with a custom User-Agent and a 401 check
###########
import requests
from requests.packages.urllib3 import add_stderr_logger
import sys
from bs4 import BeautifulSoup as bs

add_stderr_logger()
s = requests.Session()

s.headers['User-Agent'] = 'Chrome'

username = 'myusername'
password = 'mypwd'
url = 'https://github.com/login'

# after examining the HTML of the website you're trying to log into
# set name_form to the name of the form element that contains the name and
# set password_form to the name of the form element that will contain the password
login = {'login': username, 'password': password}
login_response = s.post(url, data=login)
# Only responses in the redirect history are inspected here.
for r in login_response.history:
    if r.status_code == 401:  # 401 means authentication failed
        print('error!')
        sys.exit(1)  # abort

pdf_response = s.get('https://github.com/settings/emails')  # Your cookies and headers are automatically included
soup = bs(pdf_response.content)

I've also read some discussions about the differences between HTTP authentication and cookie-based login, but none of the approaches suggested there worked either.

Please help; any help would be appreciated. Thank you very much.

user2830451 Avatar asked Nov 18 '13 03:11


1 Answer

This works for me:

#####################################
# Method 1: log in through the login form with mechanize, then print the
# email settings page (Python 2: uses cookielib).
#####################################
import mechanize
import cookielib
from BeautifulSoup import BeautifulSoup
import html2text

# Browser
br = mechanize.Browser()

# Cookie Jar
cj = cookielib.LWPCookieJar()
br.set_cookiejar(cj)

# Browser options
br.set_handle_equiv(True)
br.set_handle_gzip(True)
br.set_handle_redirect(True)
br.set_handle_referer(True)
br.set_handle_robots(False)
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)

br.addheaders = [('User-agent', 'Chrome')]

# The site we will navigate into, handling its session
br.open('https://github.com/login')

# View available forms
for f in br.forms():
    print(f)

# Select the second (index one) form (the first form is a search query box)
br.select_form(nr=1)

# User credentials
br.form['login'] = 'mylogin'
br.form['password'] = 'mypass'

# Login
br.submit()

# Fetch the page that requires login, reusing the browser's cookie jar
print(br.open('https://github.com/settings/emails').read())

You were not far off at all!

Holy Mackerel Avatar answered Oct 05 '22 23:10

Holy Mackerel

Holy Mackerel