Below is Haskell code that downloads, over HTTP, the files that are missing from a given directory:
module Main where

import Control.Monad ( filterM
                     , liftM
                     )
import Data.Maybe ( fromJust )
import Network.HTTP ( RequestMethod(GET)
                    , rspBody
                    , simpleHTTP
                    )
import Network.HTTP.Base ( Request(..) )
import Network.URI ( parseURI )
import System.Directory ( doesFileExist )
import System.Environment ( getArgs )
import System.IO ( hClose
                 , hPutStr
                 , hPutStrLn
                 , IOMode(WriteMode)
                 , openFile
                 , stderr
                 )
import Text.Printf ( printf )

indices :: [String]
indices =
  map format1 [0..9] ++ map format2 [0..14] ++ ["40001-41284" :: String]
  where
    format1 index =
      printf "%d-%d" ((index * 1000 + 1) :: Int)
                     (((index + 1) * 1000) :: Int)
    format2 index =
      printf "%d-%d" ((10000 + 2 * index * 1000 + 1) :: Int)
                     ((10000 + (2 * index + 2) * 1000) :: Int)

main :: IO ()
main = do
  [dir] <- getArgs
  updateDownloads dir

updateDownloads :: FilePath -> IO ()
updateDownloads path = do
  let
    fileNames = map (\index ->
      (index, path ++ "/tv_and_movie_freqlist" ++ index ++ ".html")) indices
  missing <-
    filterM (\(_, fileName) -> liftM not $ doesFileExist fileName) fileNames
  mapM_ (\(index, fileName) -> do
    let
      url =
        "http://en.wiktionary.org/wiki/Wiktionary:Frequency_lists/TV/2006/" ++
          index
      request =
        Request
          { rqURI     = fromJust $ parseURI url
          , rqMethod  = GET
          , rqHeaders = []
          , rqBody    = ""
          }
    hPutStrLn stderr $ "Downloading " ++ show url
    resp <- simpleHTTP request
    case resp of
      Left _ -> hPutStrLn stderr $ "Error connecting to " ++ show url
      Right response -> do
        let
          html = rspBody response
        file <- openFile fileName WriteMode
        hPutStr file html
        hClose file
        return ()) missing
I would like to run the downloads in parallel. I know about par, but am not sure whether it can be used in the IO monad and, if so, how?
UPDATE: Here is my code reimplemented using Control.Concurrent.Async and mapConcurrently:
module Main where

import Control.Concurrent.Async ( mapConcurrently )
import Control.Monad ( filterM
                     , liftM
                     )
import Data.Maybe ( fromJust )
import Network.HTTP ( RequestMethod(GET)
                    , rspBody
                    , simpleHTTP
                    )
import Network.HTTP.Base ( Request(..) )
import Network.URI ( parseURI )
import System.Directory ( doesFileExist )
import System.Environment ( getArgs )
import System.IO ( hClose
                 , hPutStr
                 , hPutStrLn
                 , IOMode(WriteMode)
                 , openFile
                 , stderr
                 )
import Text.Printf ( printf )

indices :: [String]
indices =
  map format1 [0..9] ++ map format2 [0..14] ++ ["40001-41284" :: String]
  where
    format1 index =
      printf "%d-%d" ((index * 1000 + 1) :: Int)
                     (((index + 1) * 1000) :: Int)
    format2 index =
      printf "%d-%d" ((10000 + 2 * index * 1000 + 1) :: Int)
                     ((10000 + (2 * index + 2) * 1000) :: Int)

main :: IO ()
main = do
  [dir] <- getArgs
  updateDownloads dir

updateDownloads :: FilePath -> IO ()
updateDownloads path = do
  let
    fileNames = map (\index ->
      (index, path ++ "/tv_and_movie_freqlist" ++ index ++ ".html")) indices
  missing <-
    filterM (\(_, fileName) -> liftM not $ doesFileExist fileName) fileNames
  pages <-
    mapConcurrently (\(index, fileName) -> getUrl index fileName) missing
  mapM_ (\(fileName, html) -> do
    handle <- openFile fileName WriteMode
    hPutStr handle html
    hClose handle) pages
  where
    getUrl :: String -> FilePath -> IO (FilePath, String)
    getUrl index fileName = do
      let
        url =
          "http://en.wiktionary.org/wiki/Wiktionary:Frequency_lists/TV/2006/" ++
            index
        request =
          Request
            { rqURI     = fromJust $ parseURI url
            , rqMethod  = GET
            , rqHeaders = []
            , rqBody    = ""
            }
      resp <- simpleHTTP request
      case resp of
        Left _ -> do
          hPutStrLn stderr $ "Error connecting to " ++ show url
          return ("", "")
        Right response ->
          return (fileName, rspBody response)
This looks like exactly what async is designed for; in fact, its example is parallel downloads. There is a presentation on this too, http://skillsmatter.com/podcast/home/high-performance-concurrency, which is well worth checking out.
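Stripped of the application details above, the core of that pattern is small. The following is only a sketch; fetchPage is a hypothetical stand-in for whatever single-download action you already have (for example, a wrapper around simpleHTTP):

import Control.Concurrent.Async ( mapConcurrently )

-- Run one download per URL concurrently and wait for all the results.
-- 'fetchPage' is a placeholder for a single-download IO action.
downloadAll :: (String -> IO String) -> [String] -> IO [(String, String)]
downloadAll fetchPage urls =
  mapConcurrently (\url -> do
    body <- fetchPage url
    return (url, body)) urls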
Since the operations involve IO, you typically would not use par for this, as it doesn't do anything for IO actions. You will need an explicit concurrency model to hide the latency of downloading.
I'd recommend MVars or TVars, combined with forkIO.
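As a minimal sketch of the forkIO-plus-MVar route (fetchPage is again a hypothetical single-download action), each download runs in its own thread and hands its result back through its own MVar:

import Control.Concurrent ( forkIO )
import Control.Concurrent.MVar ( newEmptyMVar, putMVar, takeMVar )

-- Fork one thread per URL; each thread fills its own MVar with the result,
-- and the main thread blocks on the MVars to collect everything.
downloadAllMVar :: (String -> IO String) -> [String] -> IO [(String, String)]
downloadAllMVar fetchPage urls = do
  vars <- mapM (\url -> do
    var <- newEmptyMVar
    _ <- forkIO $ do
      body <- fetchPage url
      putMVar var (url, body)
    return var) urls
  mapM takeMVar vars

Note that if a forked thread dies with an exception, its MVar is never filled and the main thread blocks forever; handling failures and cancellation cleanly is exactly the bookkeeping that async packages up for you.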
A work-queue abstraction is often useful for this style of problem: push all the URLs into a queue, and have a fixed set of worker threads (say, N * k threads for N cores and some small k) take jobs until the queue is empty. Completed work is then appended to a communication channel handed back to the main thread.
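A minimal sketch of that work-queue shape, assuming the same hypothetical fetchPage action and using plain Chans for both the job queue and the result channel:

import Control.Concurrent ( forkIO )
import Control.Concurrent.Chan ( newChan, readChan, writeChan )
import Control.Monad ( forM_, replicateM, replicateM_ )

-- A fixed pool of worker threads drains a job channel of URLs and writes
-- (url, body) pairs to a result channel; the main thread reads exactly
-- one result per URL. Idle workers simply block once the queue is empty.
downloadQueue :: Int -> (String -> IO String) -> [String] -> IO [(String, String)]
downloadQueue nWorkers fetchPage urls = do
  jobs    <- newChan
  results <- newChan
  forM_ urls (writeChan jobs)
  replicateM_ nWorkers $ forkIO $
    let loop = do
          url  <- readChan jobs
          body <- fetchPage url
          writeChan results (url, body)
          loop
    in loop
  replicateM (length urls) (readChan results)

In a real program you would also want to shut the workers down once the queue is drained (for example, by sending one sentinel job per worker) and to catch exceptions inside loop, since a worker that dies mid-download costs you one result and leaves the main thread waiting.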
Here's an example of this style from a parallel URL checker that uses channels: http://code.haskell.org/~dons/code/urlcheck/Check.hs
Here is another version that uses async's mapConcurrently together with http-conduit's keep-alive connection manager:
{-# LANGUAGE PackageImports, FlexibleContexts #-}

import System.Environment (getArgs)
import "http-conduit" Network.HTTP.Conduit
import qualified "conduit" Data.Conduit as C
import "http-types" Network.HTTP.Types.Status (ok200)
import "async" Control.Concurrent.Async (mapConcurrently)
import qualified "bytestring" Data.ByteString.Lazy as LBS
import qualified "bytestring" Data.ByteString as BS
import "transformers" Control.Monad.Trans.Class (lift)
import "transformers" Control.Monad.IO.Class (liftIO)
import "url" Network.URL (encString)
import "failure" Control.Failure (Failure(..))
import Control.Monad
import System.IO

taggedRequest :: Failure HttpException m => String -> m (String, Request m')
taggedRequest url = do
  req <- parseUrl url
  return (url, req)

taggedResult :: (C.MonadBaseControl IO m, C.MonadResource m)
             => Manager -> (String, Request m) -> m (String, Response LBS.ByteString)
taggedResult manager (url, req) = do
  res <- httpLbs req manager
  return (url, res)

main = do
  args <- getArgs
  case args of
    []   -> putStrLn "usage: program url1 url2 ... urlN"
    args -> do
      requests <- mapM taggedRequest args
      withManager $ \manager -> liftIO $ do
        results <- mapConcurrently (C.runResourceT . taggedResult manager) requests
        forM_ results $ \(url, Response status _ _ bsBody) -> do
          putStrLn $ url ++ " ; " ++ show status
          let fileName = encString True (`notElem` ":/") url ++ ".html"
          when (status == ok200) $ LBS.writeFile fileName bsBody