Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

How to refactor this Haskell random bytes outputter?

I'm trying to generate random data at high speed in Haskell, but whenever I try an idiomatic approach I get low speed and high GC overhead.

Here is the short code:

import qualified System.Random.Mersenne as RM
import qualified Data.ByteString.Lazy as BL
import qualified System.IO as SI
import Data.Word

-- Idiomatic but slow: pack a lazily generated list of random bytes into
-- a lazy ByteString and stream it to stdout until killed.
main = do
    gen <- RM.newMTGen Nothing :: IO RM.MTGen
    bytes <- RM.randoms gen :: IO [Word8]
    BL.hPutStr SI.stdout (BL.pack bytes)

Here is the fast code:

import Control.Monad (forever, replicateM, void)
import qualified Data.Binary.Put as DBP
import qualified Data.ByteString as B
import qualified Data.ByteString.Lazy as BL
import Data.List
import Data.Word
import qualified System.IO as SI
import qualified System.Random.Mersenne as RM

-- Fast but verbose: each iteration draws 64 random Word32s (256 bytes)
-- into named variables, then serializes them with a single runPut and
-- writes the chunk to stdout.  NOTE(review): the speedup over the lazy
-- list version presumably comes from avoiding the per-element lazy list
-- and Word8 boxing — confirm with profiling before relying on this.
main = do
    r <- RM.newMTGen  Nothing :: IO RM.MTGen
    forever $ do
        -- Draw 64 fresh random words per iteration (manually unrolled).
        x0 <- RM.random r :: IO Word32
        x1 <- RM.random r :: IO Word32
        x2 <- RM.random r :: IO Word32
        x3 <- RM.random r :: IO Word32
        x4 <- RM.random r :: IO Word32
        x5 <- RM.random r :: IO Word32
        x6 <- RM.random r :: IO Word32
        x7 <- RM.random r :: IO Word32
        x8 <- RM.random r :: IO Word32
        x9 <- RM.random r :: IO Word32
        xA <- RM.random r :: IO Word32
        xB <- RM.random r :: IO Word32
        xC <- RM.random r :: IO Word32
        xD <- RM.random r :: IO Word32
        xE <- RM.random r :: IO Word32
        xF <- RM.random r :: IO Word32
        c0 <- RM.random r :: IO Word32
        c1 <- RM.random r :: IO Word32
        c2 <- RM.random r :: IO Word32
        c3 <- RM.random r :: IO Word32
        c4 <- RM.random r :: IO Word32
        c5 <- RM.random r :: IO Word32
        c6 <- RM.random r :: IO Word32
        c7 <- RM.random r :: IO Word32
        c8 <- RM.random r :: IO Word32
        c9 <- RM.random r :: IO Word32
        cA <- RM.random r :: IO Word32
        cB <- RM.random r :: IO Word32
        cC <- RM.random r :: IO Word32
        cD <- RM.random r :: IO Word32
        cE <- RM.random r :: IO Word32
        cF <- RM.random r :: IO Word32
        v0 <- RM.random r :: IO Word32
        v1 <- RM.random r :: IO Word32
        v2 <- RM.random r :: IO Word32
        v3 <- RM.random r :: IO Word32
        v4 <- RM.random r :: IO Word32
        v5 <- RM.random r :: IO Word32
        v6 <- RM.random r :: IO Word32
        v7 <- RM.random r :: IO Word32
        v8 <- RM.random r :: IO Word32
        v9 <- RM.random r :: IO Word32
        vA <- RM.random r :: IO Word32
        vB <- RM.random r :: IO Word32
        vC <- RM.random r :: IO Word32
        vD <- RM.random r :: IO Word32
        vE <- RM.random r :: IO Word32
        vF <- RM.random r :: IO Word32
        b0 <- RM.random r :: IO Word32
        b1 <- RM.random r :: IO Word32
        b2 <- RM.random r :: IO Word32
        b3 <- RM.random r :: IO Word32
        b4 <- RM.random r :: IO Word32
        b5 <- RM.random r :: IO Word32
        b6 <- RM.random r :: IO Word32
        b7 <- RM.random r :: IO Word32
        b8 <- RM.random r :: IO Word32
        b9 <- RM.random r :: IO Word32
        bA <- RM.random r :: IO Word32
        bB <- RM.random r :: IO Word32
        bC <- RM.random r :: IO Word32
        bD <- RM.random r :: IO Word32
        bE <- RM.random r :: IO Word32
        bF <- RM.random r :: IO Word32
        -- Serialize all 64 words big-endian in one 256-byte Put batch.
        BL.hPutStr SI.stdout  $ DBP.runPut $ do
            DBP.putWord32be x0
            DBP.putWord32be x1
            DBP.putWord32be x2
            DBP.putWord32be x3
            DBP.putWord32be x4
            DBP.putWord32be x5
            DBP.putWord32be x6
            DBP.putWord32be x7
            DBP.putWord32be x8
            DBP.putWord32be x9
            DBP.putWord32be xA
            DBP.putWord32be xB
            DBP.putWord32be xC
            DBP.putWord32be xD
            DBP.putWord32be xE
            DBP.putWord32be xF
            DBP.putWord32be c0
            DBP.putWord32be c1
            DBP.putWord32be c2
            DBP.putWord32be c3
            DBP.putWord32be c4
            DBP.putWord32be c5
            DBP.putWord32be c6
            DBP.putWord32be c7
            DBP.putWord32be c8
            DBP.putWord32be c9
            DBP.putWord32be cA
            DBP.putWord32be cB
            DBP.putWord32be cC
            DBP.putWord32be cD
            DBP.putWord32be cE
            DBP.putWord32be cF
            DBP.putWord32be v0
            DBP.putWord32be v1
            DBP.putWord32be v2
            DBP.putWord32be v3
            DBP.putWord32be v4
            DBP.putWord32be v5
            DBP.putWord32be v6
            DBP.putWord32be v7
            DBP.putWord32be v8
            DBP.putWord32be v9
            DBP.putWord32be vA
            DBP.putWord32be vB
            DBP.putWord32be vC
            DBP.putWord32be vD
            DBP.putWord32be vE
            DBP.putWord32be vF
            DBP.putWord32be b0
            DBP.putWord32be b1
            DBP.putWord32be b2
            DBP.putWord32be b3
            DBP.putWord32be b4
            DBP.putWord32be b5
            DBP.putWord32be b6
            DBP.putWord32be b7
            DBP.putWord32be b8
            DBP.putWord32be b9
            DBP.putWord32be bA
            DBP.putWord32be bB
            DBP.putWord32be bC
            DBP.putWord32be bD
            DBP.putWord32be bE
            DBP.putWord32be bF

The short code outputs about 6 megabytes of random bytes per second on my computer. The fast code - about 150 megabytes per second.

If I reduce the number of variables from 64 to 16 in the fast code, the speed drops to about 78 megabytes per second.

How to make this code compact and idiomatic without slowing it down?

like image 674
Vi. Avatar asked Sep 15 '13 23:09

Vi.


2 Answers

I don't think lazy IO is considered very idiomatic today in Haskell. It may work for one-liners, but for the large IO-intensive programs haskellers use iteratees/conduits/pipes/Oleg-knows-what.

First, to make a reference point, some stats from running your original versions on my computer, compiled with GHC 7.6.3 (-O2 --make), on Linux x86-64. The slow lazy bytestring version:

$ ./rnd +RTS -s | pv | head -c 100M > /dev/null
 100MB 0:00:09 [10,4MB/s] [         <=>                                       ]
   6,843,934,360 bytes allocated in the heap
       2,065,144 bytes copied during GC
          68,000 bytes maximum residency (2 sample(s))
          18,016 bytes maximum slop
               1 MB total memory in use (0 MB lost due to fragmentation)
  ...
  Productivity  99.2% of total user, 97.7% of total elapsed

It's not blazingly fast, but there is no GC and memory overhead to speak of. It's interesting how and where you got 37% GC time with this code.

The fast version with unrolled loops:

$ ./rndfast +RTS -s | pv | head -c 500M > /dev/null
 500MB 0:00:04 [ 110MB/s] [    <=>                                            ]
  69,434,953,224 bytes allocated in the heap
       9,225,128 bytes copied during GC
          68,000 bytes maximum residency (2 sample(s))
          18,016 bytes maximum slop
               2 MB total memory in use (0 MB lost due to fragmentation)
  ...
  Productivity  85.0% of total user, 72.7% of total elapsed

That's much faster, but, interestingly enough, now we have 15% GC overhead.

And, finally, my version using conduits and blaze-builder. It generates 512 random Word64s at a time to produce 4 KB data chunks to be consumed downstream. The performance increased steadily as I increased list "buffer" size from 32 to 512, but improvements are small above 128.

import Blaze.ByteString.Builder (Builder)
import Blaze.ByteString.Builder.Word
import Control.Monad (forever)
import Control.Monad.IO.Class (liftIO)
import Data.ByteString (ByteString)
import Data.Conduit
import qualified Data.Conduit.Binary as CB
import Data.Conduit.Blaze (builderToByteString)
import Data.Word
import System.IO (stdout)
import qualified System.Random.Mersenne as RM

-- | An endless conduit source of random data: each yielded Builder
-- holds 512 host-endian Word64s (4 KB) taken from the generator's
-- lazy stream.
randomStream :: RM.MTGen -> Source IO Builder
randomStream gen =
    forever $
        liftIO (RM.randoms gen) >>= yield . fromWord64shost . take 512

-- Wire the random Builder source through blaze's builder-to-bytestring
-- conduit and sink the chunks to stdout.
main :: IO ()
main =
    RM.newMTGen Nothing >>= \gen ->
        randomStream gen $= builderToByteString $$ CB.sinkHandle stdout

I noticed that unlike the two programs above it is slightly (3-4%) faster when compiled with -fllvm, so the output below is from binary produced by LLVM 3.3.

$ ./rndconduit +RTS -s | pv | head -c 500M > /dev/null
 500MB 0:00:09 [53,2MB/s] [         <=>                                       ]
   8,889,236,736 bytes allocated in the heap
      10,912,024 bytes copied during GC
          36,376 bytes maximum residency (2 sample(s))
          19,024 bytes maximum slop
               1 MB total memory in use (0 MB lost due to fragmentation)
  ...
  Productivity  99.0% of total user, 91.9% of total elapsed

So, it is twice as slow as the manually unrolled version, but is almost as short and readable as the lazy IO version, has almost no GC overhead and predictable memory behavior. Maybe there is room for improvement here: comments are welcome.

UPDATE:

Combining a bit of unsafe byte fiddling with conduits I was able to make a program that generates 300+ MB/s of random data. It looks like simple type-specialized tail-recursive functions work better than both lazy lists and manual unrolling.

import Control.Monad (forever)
import Control.Monad.IO.Class (liftIO)
import Data.ByteString (ByteString)
import qualified Data.ByteString as B
import Data.Conduit
import qualified Data.Conduit.Binary as CB
import Data.Word
import Foreign.Marshal.Array
import Foreign.Ptr
import Foreign.Storable
import System.IO (stdout)
import qualified System.Random.Mersenne as RM


-- | Fill a temporary buffer with @bufsize@ random 'Word64's and copy it
-- into a strict 'ByteString'.
--
-- Fixes an off-by-one in the original loop: it wrote elements at
-- indices @bufsize@ down to 1, i.e. one Word64 PAST the end of the
-- allocated array (heap corruption) while leaving element 0
-- uninitialized.  Valid indices are 0 .. bufsize - 1.
randomChunk :: RM.MTGen -> Int -> IO ByteString
randomChunk gen bufsize = allocaArray bufsize $ \ptr -> do
    fill ptr bufsize
    -- The array holds bufsize Word64s, i.e. bufsize * 8 bytes.
    B.packCStringLen (castPtr ptr, bufsize * sizeOf (undefined :: Word64))
    where
    -- Write slots bufsize-1 .. 0, drawing one random Word64 per slot.
    fill :: Ptr Word64 -> Int -> IO ()
    fill _   0 = return ()
    fill ptr n = do
        x <- RM.random gen
        pokeElemOff ptr (n - 1) x  -- n-1 keeps every write in bounds
        fill ptr (n - 1)


-- | Endless conduit source of 4 KB strict ByteString chunks of random
-- bytes (512 Word64s per chunk).
chunkStream :: RM.MTGen -> Source IO ByteString
chunkStream gen = forever $ do
    chunk <- liftIO (randomChunk gen 512)
    yield chunk


-- Connect the raw chunk source directly to a stdout sink.
main :: IO ()
main =
    RM.newMTGen Nothing >>= \gen -> chunkStream gen $$ CB.sinkHandle stdout

At this speed IO overhead actually becomes noticeable: program spends more than quarter of its run time in system calls, and adding head to the pipeline like in the examples above slows it down considerably.

$ ./rndcond +RTS -s | pv > /dev/null
^C27GB 0:00:10 [ 338MB/s] [         <=>                                       ]
   8,708,628,512 bytes allocated in the heap
       1,646,536 bytes copied during GC
          36,168 bytes maximum residency (2 sample(s))
          17,080 bytes maximum slop
               2 MB total memory in use (0 MB lost due to fragmentation)
  ...
  Productivity  98.7% of total user, 73.6% of total elapsed
like image 97
rkhayrov Avatar answered Dec 04 '22 08:12

rkhayrov


I can confirm that the second version is slower than the first, but not to the same extent. In 10 seconds, the short code generated 111M of data, while the large code generated 833M of data. This was done on Mac OSX Lion, compiled with 7.6.3 with -O3.

While I don't know why the first solution is so slow, the second can be simplified by using replicateM and mapM to remove the duplication:

-- Compact equivalent of the unrolled loop: draw 64 random Word32s per
-- iteration and serialize them big-endian in one runPut batch.
-- Uses replicateM directly (the prose above already describes it that
-- way); `sequence $ replicate n m` is exactly `replicateM n m`.
main3 = do
    r <- RM.newMTGen Nothing :: IO RM.MTGen
    forever $ do
        -- putWord32be pins the element type of vals to Word32.
        vals <- replicateM 64 (RM.random r)
        BL.hPutStr SI.stdout $ DBP.runPut $ mapM_ DBP.putWord32be vals

This solution is still slower though, generating 492M of data in 10 seconds. A final last ditch solution is to use template haskell, to generate the code to unroll the loops:

-- Template Haskell version: generates the same unrolled code as the
-- hand-written fast version, but with the variable count as a single
-- compile-time constant.  The splice builds a chain of
-- `RM.random r >>= \x -> ...` binds followed by one runPut that
-- serializes every bound variable.
-- NOTE(review): allVarExps/allVarOutput have no case for the empty
-- list, so varCount must be >= 1 — confirm before generalizing.
-- NOTE(review): the splice captures `r` from the surrounding do block
-- by name.
main4 = do
  r <- RM.newMTGen Nothing :: IO RM.MTGen
  forever $ do
    $(let varCount = 64
          -- | replaces every instance of oldName with newName in the exp
          replaceNames :: (Typeable t, Data t) => String -> Name -> t -> t
          replaceNames oldName replacementName expr = everywhere (mkT changeName) expr where
              changeName name | nameBase name == oldName = replacementName
                              | otherwise       = name
          -- Quote one `RM.random r >>= \var -> <next>` step, renaming
          -- the placeholder "patternvar" to the supplied fresh name.
          singleVarExp :: Name -> ExpQ -> ExpQ
          singleVarExp varName next = replaceNames "patternvar" varName <$> [| RM.random r >>= \patternvar -> $(next) |]
          -- Chain one bind step per generated name around `next`.
          allVarExps :: [Name] -> ExpQ -> ExpQ
          allVarExps (n:ns) next = foldr (\var result -> singleVarExp var result)
                                         (singleVarExp n next) ns

          -- Quote one `DBP.putWord32be var >> <next>` output step.
          singleOutputter :: Name -> ExpQ -> ExpQ
          singleOutputter varName next = [| DBP.putWord32be $(varE varName) >> $(next) |]
          -- Chain the output steps for every generated name.
          allVarOutput :: [Name] -> ExpQ
          allVarOutput (n:ns) = foldr (\var result -> singleOutputter var result)
                                      (singleOutputter n [| return () |]) ns
          -- Wrap the chained Put in runPut and write it to stdout.
          printResultExp :: [Name] -> ExpQ
          printResultExp names = [| BL.hPutStr SI.stdout $ DBP.runPut ($(allVarOutput names)) |]

          result = do
            vars <- replicateM varCount $ newName "x"
            allVarExps vars (printResultExp vars)
      in result)

This runs at about the same speed as your original fast version. It isn't very neat (your fast solution is easier to read), but you can now change the number of variables easily, and still have the loop unrolled. I tried 512, but apart from making the compile time huge, it didn't seem to have much effect on performance.

like image 24
David Miani Avatar answered Dec 04 '22 08:12

David Miani