10

下面是 (HTTP) 下载给定目录中缺少的文件的 Haskell 代码:

module Main where

import Control.Monad ( filterM
                     , liftM
                     )
import Data.Maybe ( fromJust )
import Network.HTTP ( RequestMethod(GET)
                    , rspBody
                    , simpleHTTP
                    )
import Network.HTTP.Base ( Request(..) )
import Network.URI ( parseURI )
import System.Directory ( doesFileExist )
import System.Environment ( getArgs )
import System.IO ( hClose
                 , hPutStr
                 , hPutStrLn
                 , IOMode(WriteMode)
                 , openFile
                 , stderr
                 )
import Text.Printf ( printf )

indices :: [String]
indices =
  map format1 [0..9] ++ map format2 [0..14] ++ ["40001-41284" :: String]
  where
    format1 index =
      printf "%d-%d" ((index * 1000 + 1) :: Int)
                     (((index + 1) * 1000) :: Int)
    format2 index =
      printf "%d-%d" ((10000 + 2 * index * 1000 + 1) :: Int)
                     ((10000 + (2 * index + 2) * 1000) :: Int)

main :: IO ()
main = do
  [dir] <- getArgs
  updateDownloads dir

updateDownloads :: FilePath -> IO ()
updateDownloads path = do
  let
    fileNames = map (\index ->
      (index, path ++ "/tv_and_movie_freqlist" ++ index ++ ".html")) indices
  missing <-
    filterM (\(_, fileName) -> liftM not $ doesFileExist fileName) fileNames
  mapM_ (\(index, fileName) -> do
    let
      url =
        "http://en.wiktionary.org/wiki/Wiktionary:Frequency_lists/TV/2006/" ++
        index
      request =
        Request
          { rqURI = fromJust $ parseURI url
          , rqMethod = GET
          , rqHeaders = []
          , rqBody = ""
          }
    hPutStrLn stderr $ "Downloading " ++ show url
    resp <- simpleHTTP request
    case resp of
      Left _ -> hPutStrLn stderr $ "Error connecting to " ++ show url
      Right response -> do
        let
          html = rspBody response
        file <- openFile fileName WriteMode
        hPutStr file html
        hClose file
    return ()) missing

我想并行运行下载。我知道par,但不确定它是否可以在IOmonad 中使用,如果可以,如何使用?

更新:这是我使用Control.Concurrent.Asyncand重新实现的代码mapConcurrently

module Main where

import Control.Concurrent.Async ( mapConcurrently )
import Control.Monad ( filterM
                     , liftM
                     )
import Data.Maybe ( fromJust )
import Network.HTTP ( RequestMethod(GET)
                    , rspBody
                    , simpleHTTP
                    )
import Network.HTTP.Base ( Request(..) )
import Network.URI ( parseURI )
import System.Directory ( doesFileExist )
import System.Environment ( getArgs )
import System.IO ( hClose
                 , hPutStr
                 , hPutStrLn
                 , IOMode(WriteMode)
                 , openFile
                 , stderr
                 )
import Text.Printf ( printf )

indices :: [String]
indices =
  map format1 [0..9] ++ map format2 [0..14] ++ ["40001-41284" :: String]
  where
    format1 index =
      printf "%d-%d" ((index * 1000 + 1) :: Int)
                     (((index + 1) * 1000) :: Int)
    format2 index =
      printf "%d-%d" ((10000 + 2 * index * 1000 + 1) :: Int)
                     ((10000 + (2 * index + 2) * 1000) :: Int)

main :: IO ()
main = do
  [dir] <- getArgs
  updateDownloads dir

updateDownloads :: FilePath -> IO ()
updateDownloads path = do
  let
    fileNames = map (\index ->
      (index, path ++ "/tv_and_movie_freqlist" ++ index ++ ".html")) indices
  missing <-
    filterM (\(_, fileName) -> liftM not $ doesFileExist fileName) fileNames
  pages <-
    mapConcurrently (\(index, fileName) -> getUrl index fileName) missing
  mapM_ (\(fileName, html) -> do
    handle <- openFile fileName WriteMode
    hPutStr handle html
    hClose handle) pages
  where
    getUrl :: String -> FilePath -> IO (FilePath, String)
    getUrl index fileName = do
      let
        url =
          "http://en.wiktionary.org/wiki/Wiktionary:Frequency_lists/TV/2006/" ++
          index
        request =
          Request
          { rqURI = fromJust $ parseURI url
          , rqMethod = GET
          , rqHeaders = []
          , rqBody = ""
          }
      resp <- simpleHTTP request
      case resp of
        Left _ -> do
          hPutStrLn stderr $ "Error connecting to " ++ show url
          return ("", "")
        Right response ->
          return (fileName, rspBody response)
4

4 回答 4

13

这看起来正是async设计的目的,实际上该示例用于并行下载。也有关于此的演示 - http://skillsmatter.com/podcast/home/high-performance-concurrency - 非常值得一试。

于 2013-01-14T20:22:44.440 回答
12

由于操作涉及 IO,因此您通常 /not/ 使用par它,因为它对 IO 操作没有任何作用。

您将需要一个显式的并发模型来隐藏下载的延迟。

我推荐 MVars 或 TVars,结合 forkIO。

工作队列抽象通常对这种类型的问题很有用:将所有 URL 推入一个队列,并为 N 个内核提供一组固定的工作线程(例如 N * k),直到完成为止。然后将完成的工作附加到返回给主线程的通信通道。

这是来自并行 URL 检查器的示例,使用通道。

http://code.haskell.org/~dons/code/urlcheck/Check.hs

于 2013-01-14T17:31:19.830 回答
8

看看mapConcurrently Simon Marlow 的“异步”库。

IO它以并行和异步的方式将一个动作映射到Traversable容器的元素并等待所有动作。

例子:

{-# LANGUAGE PackageImports #-}

import System.Environment (getArgs)

import "async" Control.Concurrent.Async (mapConcurrently)

import "HTTP" Network.HTTP
import "HTTP" Network.Stream (Result)
import "HTTP" Network.HTTP.Base (Response(..))
import System.IO
import "url" Network.URL (encString)

import Control.Monad


getURL :: String -> IO (String, Result (Response String))
getURL url = do
        res <- (simpleHTTP . getRequest) url
        return (url, res)

main = do
     args <- getArgs
     case args of
          [] -> putStrLn "usage: program url1 url2 ... urlN"
          args -> do
                results <- mapConcurrently getURL args
                forM_ results $ \(url, res) -> do
                        case res of
                                Left connError -> putStrLn $ url ++ "; " ++ show connError
                                Right response -> do
                                        putStrLn $ url ++ "; OK"
                                        let content = rspBody response

                                            -- make name from url
                                            fname = encString True (`notElem` ":/") url ++ ".html"
                                        writeFile fname content    
于 2013-01-15T10:26:44.593 回答
2

另一个使用async 的 mapConcurrentlyhttp-conduit keep-alive manager的版本

{-# LANGUAGE PackageImports, FlexibleContexts #-}

import System.Environment (getArgs)

import "http-conduit" Network.HTTP.Conduit
import qualified "conduit" Data.Conduit as C
import "http-types" Network.HTTP.Types.Status (ok200)

import "async" Control.Concurrent.Async (mapConcurrently)
import qualified "bytestring" Data.ByteString.Lazy as LBS
import qualified "bytestring" Data.ByteString as BS
import "transformers" Control.Monad.Trans.Class (lift)
import "transformers" Control.Monad.IO.Class (liftIO)
import "url" Network.URL (encString)
import "failure" Control.Failure (Failure(..))

import Control.Monad
import System.IO

taggedRequest :: Failure HttpException m => String -> m (String, Request m')
taggedRequest url = do
        req <- parseUrl url
        return (url, req)

taggedResult :: (C.MonadBaseControl IO m, C.MonadResource m) => Manager -> (String, Request m) -> m (String, Response LBS.ByteString)
taggedResult manager (url, req) = do
        res <- httpLbs req manager
        return (url, res)

main = do
     args <- getArgs
     case args of
          [] -> putStrLn "usage: program url1 url2 ... urlN"
          args -> do
                requests <- mapM taggedRequest args
                withManager $ \manager -> liftIO $ do

                        results <- mapConcurrently (C.runResourceT . taggedResult manager) requests

                        forM_ results $ \(url, Response status _ _ bsBody) -> do
                             putStrLn $ url ++ " ; " ++ show status   
                             let fileName = encString True (`notElem` ":/") url ++ ".html"
                             when (status == ok200) $ LBS.writeFile fileName bsBody
于 2013-01-15T16:21:50.750 回答