note: ipums international and ipums usa probably use the same system. ipums usa allows quicker signup. if you would like to test out your code, try https://usa.ipums.org/usa-action/users/request_access to sign up!
i am trying to programmatically download a file from https://international.ipums.org/ with the R language and httr. i need to use httr and not RCurl because, after authenticating, i need to download large files directly to disk rather than into RAM. as far as i know, this is currently only possible with httr.
the reproducible code below documents my best effort at getting from the login page (https://international.ipums.org/international-action/users/login) to the main post-authentication page. any tips or hints would be appreciated! thanks!
# Reproducible attempt at logging in to IPUMS International with httr.
# Each step's response body is written to a temp file and read back so the
# returned page can be inspected.

# account credentials (placeholders)
my_email <- "[email protected]"
my_password <- "password"
# scratch file that receives each response body below
tf <- tempfile()
# use httr, because i need to download a large file after authentication
# and only httr supports that with its `write_disk()` option
library(httr)
# turn off ssl verify, otherwise the subsequent GET command will fail
# NOTE(review): set_config() applies this to every later httr request in the
# session, so certificate verification stays off from here on.
set_config( config( ssl_verifypeer = 0L ) )
# request the Shibboleth login entry point, with the post-login menu page as
# the target; the response is not stored
GET( "https://international.ipums.org/Shibboleth.sso/Login?target=https%3A%2F%2Finternational.ipums.org%2Finternational-action%2Fmenu" )
# connect to the starting login page of the website
# (the outer parentheses print the response as well as storing it in `a`)
( a <- GET( "https://international.ipums.org/international-action/users/login" , verbose( info = TRUE ) ) )
# which takes me through to a lot of websites, but ultimately (in my browser) lands at
shibboleth_url <- "https://live.identity.popdata.org:443/idp/Authn/UserPassword"
# construct authentication information?
# base_values: just the two login form fields
base_values <- list( "j_username" = my_email , "j_password" = my_password )
# idp_values / ipums_values: the login fields plus cookie values taken from the
# first response (`a$cookies`), selected by cookie domain. these lists are
# later passed as request parameters (query string or body).
idp_values <- list( "j_username" = my_email , "j_password" = my_password , "_idp_authn_lc_key"=subset( a$cookies , domain == "live.identity.popdata.org" )$value , "JSESSIONID" = subset( a$cookies , domain == "#HttpOnly_live.identity.popdata.org" )$value )
ipums_values <- list( "j_username" = my_email , "j_password" = my_password , "_idp_authn_lc_key"=subset( a$cookies , domain == "live.identity.popdata.org" )$value , "JSESSIONID" = subset( a$cookies , domain == "international.ipums.org" )$value)
# i believe this is where the main login should happen, but it looks like it's failing
# try the identity provider page three ways: GET with everything in the query
# string, POST with only the credentials, then GET again saving the body to `tf`
GET( shibboleth_url , query = idp_values )
POST( shibboleth_url , body = base_values )
writeBin( GET( shibboleth_url , query = idp_values )$content , tf )
readLines( tf )
# the page that comes back reads:
# "The MPC account authentication system has encountered an error
# This error can sometimes occur if you did not close your browser after logging out of an application previously. It may also occur for other reasons. Please close your browser and try your action again."
# same idea against the SAML2 redirect endpoint: GET (body saved to `tf`), then POST
writeBin( GET( "https://live.identity.popdata.org/idp/profile/SAML2/Redirect/SSO" , query = idp_values )$content , tf )
POST( "https://live.identity.popdata.org/idp/profile/SAML2/Redirect/SSO" , body = idp_values )
# note: this shows the body saved by the GET above, not the POST's response
readLines( tf )
# same error as above
# return to the main login page..
writeBin( GET( "https://international.ipums.org/international-action/menu" , query = ipums_values )$content , tf )
readLines( tf )
# ..not logged in
You have to use set_cookies()
to send your cookies to the server:
# Log in to IPUMS International by replaying the Shibboleth hand-off manually:
#   1. GET the login page (collects the identity-provider cookies),
#   2. POST the credentials together with those cookies,
#   3. POST the hidden RelayState / SAMLResponse fields back to the form target,
#   4. GET the post-login menu page with the resulting cookies.
# Expects `my_email` and `my_password` to be defined already.
library(httr)
library(rvest)
# my_email <- "xxx"
# my_password <- "yyy"
tf <- tempfile()
# NOTE(review): disables SSL peer verification for every later httr request.
set_config(config(ssl_verifypeer = 0L))

# Get first page
p1 <- GET(
  "https://international.ipums.org/international-action/users/login",
  verbose(info = TRUE)
)

# Post Login credentials
b2 <- list("j_username" = my_email, "j_password" = my_password)
c2 <- c(
  JSESSIONID = p1$cookies[p1$cookies$domain == "#HttpOnly_live.identity.popdata.org", ]$value,
  `_idp_authn_lc_key` = p1$cookies[p1$cookies$domain == "live.identity.popdata.org", ]$value
)
p2 <- POST(p1$url, body = b2, set_cookies(.cookies = c2), encode = "form")

# Parse hidden fields
h2 <- read_html(p2$content)
form <- html_form(h2)

# Post hidden fields (the first two fields of the first form on the page)
b3 <- list(
  "RelayState" = form[[1]]$fields[[1]]$value,
  "SAMLResponse" = form[[1]]$fields[[2]]$value
)
c3 <- c(
  JSESSIONID = p1$cookies[p1$cookies$domain == "#HttpOnly_live.identity.popdata.org", ]$value,
  `_idp_session` = p2$cookies[p2$cookies$name == "_idp_session", ]$value,
  `_idp_authn_lc_key` = p2$cookies[p2$cookies$name == "_idp_authn_lc_key", ]$value
)
p3 <- POST(form[[1]]$url, body = b3, set_cookies(.cookies = c3), encode = "form")

# Get interesting page
# The row mask must be elementwise (`&`, not `&&`) and built entirely from
# p3$cookies, the table being indexed: `&&` only accepts length-one operands
# (an error in R >= 4.3) and mixing in p1$cookies can misalign the rows.
is_site_session <- p3$cookies$domain == "international.ipums.org" &
  p3$cookies$name == "JSESSIONID"
c4 <- c(
  JSESSIONID = p3$cookies[is_site_session, ]$value,
  `_idp_session` = p3$cookies[p3$cookies$name == "_idp_session", ]$value,
  `_idp_authn_lc_key` = p3$cookies[p3$cookies$name == "_idp_authn_lc_key", ]$value
)
p4 <- GET(
  "https://international.ipums.org/international-action/menu",
  set_cookies(.cookies = c4)
)

# Dump the page to disk and look at the line that holds the logout link
writeBin(p4$content, tf)
readLines(tf)[55]
Since the result is
[1] " <li class=\"lastItem\"><a href=\"/international-action/users/logout\">Logout</a></li>"
I think you're logged in...
@HubertL has taken many steps in the right direction; however, I think his answer is not complete.
First of all, the main thing to look at when you're implementing automatic web authorization is the cookies being used during 'normal' manual workflow. You can easily spy on them with dev tools in any modern browser:
Here, we see JSESSIONID
and _shibsession*
cookies. The first one holds the JSP session id of the website; the second is most likely used solely for Shibboleth authorization. The server probably has them bound somehow, but JSESSIONID
doesn't require authorization and you get it right away after opening the website. So, we must get _shibsession*
cookie for our JSESSIONID
to be authorized. That's what the Shibboleth's authorization process with many redirects is about. See the comments in code.
# Log in to IPUMS USA through its Shibboleth single sign-on flow.
#
# Arguments:
#   user     - account login, sent as the `j_username` form field.
#   password - account password, sent as the `j_password` form field.
#
# Returns a named character vector built from the cookies of the final
# response (names = cookie names, values = cookie values), ready to be passed
# to `set_cookies(.cookies = ...)` on later requests.
#
# Side effects: attaches httr and rvest, changes the global httr config via
# set_config(), and resets the httr handle for https://usa.ipums.org/.
#
# Stops with an error when the response to the credentials POST does not
# contain the expected hidden SAML form (e.g. the login was rejected).
login_ipums <- function(user, password) {
  # library() instead of require(): a missing package fails right here rather
  # than later with a confusing "could not find function" error. The packages
  # are still attached, as before, so callers can keep using GET() and friends.
  library(httr)
  library(rvest)

  # NOTE(review): this switches off SSL peer verification for every later
  # httr request in the session, not just the ones made here.
  set_config(config(ssl_verifypeer = 0L))

  # important - httr preserves cookies on subsequent requests to the same
  # host, we don't need that because of sessions expiration
  handle_reset("https://usa.ipums.org/")

  # Turn a response's cookie table into the named vector set_cookies() takes.
  cookie_vector <- function(response) {
    setNames(response$cookies$value, response$cookies$name)
  }

  # open the login page; following the redirects leaves us at the identity
  # provider's form (login1$url) with its cookies collected
  login1 <- GET("https://usa.ipums.org/usa-action/users/login")

  # set login and password
  form_auth <- list("j_username" = user, "j_password" = password)

  # receive auth tokens as html hidden fields in a form
  login2 <- POST(
    login1$url,
    body = form_auth,
    set_cookies(.cookies = cookie_vector(login1)),
    encode = "form"
  )
  login2_forms <- html_form(read_html(login2$content))
  if (length(login2_forms) == 0L) {
    stop("IPUMS login failed: no form found in the authentication response.",
         call. = FALSE)
  }
  saml_form <- login2_forms[[1]]
  relay_state <- saml_form$fields$RelayState$value
  saml_response <- saml_form$fields$SAMLResponse$value
  if (is.null(relay_state) || is.null(saml_response)) {
    stop("IPUMS login failed: RelayState/SAMLResponse fields not found ",
         "(check the user name and password).", call. = FALSE)
  }

  # The form target was read from `$url` originally. NOTE(review): newer rvest
  # releases appear to expose it as `$action` instead - fall back to that when
  # `$url` is absent; confirm against the installed rvest version.
  form_target <- saml_form[["url"]]
  if (is.null(form_target)) {
    form_target <- saml_form[["action"]]
  }
  if (is.null(form_target)) {
    stop("IPUMS login failed: could not determine where to submit the ",
         "SAML form.", call. = FALSE)
  }

  # submit the form back (browser submits it back automatically with JS)
  login3 <- POST(
    form_target,
    body = list(RelayState = relay_state, SAMLResponse = saml_response),
    set_cookies(.cookies = cookie_vector(login2)),
    encode = "form"
  )

  # now we have what we came for - _shibsession_* and JSESSION id cookie
  # (returned as the function value; the original `return=login_cookies`
  # merely assigned to a local variable called `return`)
  cookie_vector(login3)
}
After the call to login_ipums
we'll have the following cookies:
> cookies=login_ipums(my_email, my_password)
> names(cookies)
[1] "JSESSIONID"
[2] "_idp_authn_lc_key"
[3] "_shibsession_7573612e69..."
Here, we have both JSESSIONID
and _shibsession_*
used for site-wide authorization. _idp_authn_lc_key
is, probably, not needed, but leaving it won't hurt.
Now, you can easily download files like that:
# Authenticate, then request the extract with the session cookies attached.
# write_disk() sends the response body to "file.bin", replacing any existing
# file of that name.
cookies <- login_ipums(my_email, my_password)
target <- GET(
  "https://usa.ipums.org/usa-action/downloads/extract_files/usa_00001.dat.gz",
  set_cookies(.cookies = cookies),
  write_disk("file.bin", overwrite = TRUE)
)
IMPORTANT NOTE: As you can see, I used IPUMS USA, not International. To check that code with your account, replace usa
with international
everywhere, including *-action
in URLs.
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With