Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Speed in Matlab vs. Julia vs. Fortran

I am playing around with different languages to solve a simple value function iteration problem where I loop over a state-space grid. I am trying to understand the performance differences and how I could tweak each code. For posterity I have posted full length working examples for each language below. However, I believe that most of the tweaking is to be done in the while loop. I am a bit confused what I am doing wrong in Fortran as the speed seems subpar.

Matlab ~2.7secs : I am avoiding a more efficient solution using the repmat function for now to keep the codes comparable. Code seems to be automatically multithreaded onto 4 threads

beta = 0.98;
sigma = 0.5;
R = 1/beta;

a_grid = linspace(0,100,1001);
tic
[V_mat, next_mat] = valfun(beta, sigma, R ,a_grid);
toc

where valfun()

function [V_mat, next_mat] = valfun(beta, sigma, R, a_grid)
    zeta = 1-1/sigma;
    len = length(a_grid);
    V_mat = zeros(2,len);
    next_mat = zeros(2,len);

    u = zeros(2,len,len);
    c = zeros(2,len,len);

    for i = 1:len
        c(1,:,i) = a_grid(i) - a_grid/R + 20.0;
        c(2,:,i) = a_grid(i) - a_grid/R;
    end

    u = c.^zeta * zeta^(-1);
    u(c<=0) = -1e8;

    tol = 1e-4;
    outeriter = 0;
    diff = 1000.0;

     while (diff>tol)  %&& (outeriter<20000)
        outeriter = outeriter + 1;
        V_last = V_mat;

        for i = 1:len 
            [V_mat(1,i), next_mat(1,i)] = max( u(1,:,i) + beta*V_last(2,:));
            [V_mat(2,i), next_mat(2,i)] = max( u(2,:,i) + beta*V_last(1,:));
        end
        diff = max(abs(V_mat - V_last));
    end

    fprintf("\n Value Function converged in %i steps. \n", outeriter)  
end

Julia (after compilation) ~5.4secs (4 threads (9425469 allocations: 22.43 GiB)), ~7.8secs (1 thread (2912564 allocations: 22.29 GiB))

[EDIT: after adding correct broadcasting and @views its only 1.8-2.1seconds now, see below!]

using LinearAlgebra, UnPack, BenchmarkTools

struct paramsnew
    β::Float64
    σ::Float64
    R::Float64
end

function valfun(params, a_grid)
    @unpack β,σ, R = params
    ζ = 1-1/σ

    len = length(a_grid)
    V_mat = zeros(2,len)
    next_mat = zeros(2,len)
    u = zeros(2,len,len)
    c = zeros(2,len,len)

    @inbounds for i in 1:len
        c[1,:,i] = @. a_grid[i] - a_grid/R .+ 20.0
        c[2,:,i] = @. a_grid[i] - a_grid/R
    end

    u = c.^ζ * ζ^(-1)
    u[c.<=0] .= typemin(Float64)

    tol = 1e-4
    outeriter = 0
    test = 1000.0

    while test>tol 
        outeriter += 1
        V_last = deepcopy(V_mat)

        @inbounds Threads.@threads for i in 1:len # loop over grid points
            V_mat[1,i], next_mat[1,i] = findmax( u[1,:,i] .+ β*V_last[2,:])
            V_mat[2,i], next_mat[2,i] = findmax( u[2,:,i] .+ β*V_last[1,:])
        end

        test = maximum( abs.(V_mat - V_last)[.!isnan.( V_mat - V_last )])
    end

    print("\n Value Function converged in ", outeriter, " steps.")
    return V_mat, next_mat
end

a_grid = collect(0:0.1:100)
p1 = paramsnew(0.98, 1/2, 1/0.98);

@time valfun(p1,a_grid)
print("\n should be compiled now \n")

@btime valfun(p1,a_grid)

Fortran (O3, mkl, qopenmp) ~9.2secs: I also must be doing something wrong when declaring the openmp variables as the compilation will crash for some grid sizes when using openmp (SIGSEGV error).

module mod_calc
    use omp_lib
    implicit none
    integer, parameter :: dp = selected_real_kind(33,4931), len = 1001
    public :: dp, len

    contains

    subroutine linspace(from, to, array)
        real(dp), intent(in) :: from, to
        real(dp), intent(out) :: array(:)
        real(dp) :: range
        integer :: n, i
        n = size(array)
        range = to - from

        if (n == 0) return

        if (n == 1) then
            array(1) = from
            return
        end if

        do i=1, n
            array(i) = from + range * (i - 1) / (n - 1)
        end do
    end subroutine

    subroutine calc_val() 
        real(dp)::  bbeta, sigma, R, zeta, tol, test
        real(dp)::  a_grid(len), V_mat(2,len), V_last(2,len), &
                    u(len,len,2), c(len,len,2)
        
        integer :: outeriter, i, sss, next_mat(2,len), fu
        character(len=*), parameter :: FILE_NAME = 'data.txt'   ! File name.

        call linspace(from=0._dp, to=100._dp, array=a_grid)

        bbeta = 0.98
        sigma = 0.5
        R = 1.0/0.98
        zeta = 1.0 - 1.0/sigma

        tol = 1e-4
        test = 1000.0
        outeriter = 0

        do i = 1,len
            c(:,i,1) = a_grid(i) - a_grid/R + 20.0
            c(:,i,2) = a_grid(i) - a_grid/R
        end do

        u = c**zeta * 1.0/zeta

        where (c<=0)
            u = -1e6
        end where

        V_mat = 0.0
        next_mat = 0.0

        do while (test>tol .and. outeriter<20000)
            outeriter = outeriter+1
            V_last = V_mat

            !$OMP PARALLEL DEFAULT(NONE)          &
            !$OMP SHARED(V_mat, next_mat,V_last, u, bbeta)  &
            !$OMP PRIVATE(i)                      
            !$OMP DO SCHEDULE(static)
            do i=1,len 
                V_mat(1,i)    = maxval(u(:,i,1) + bbeta*V_last(2,:))
                next_mat(1,i) = maxloc(u(:,i,1) + bbeta*V_last(2,:),1)
                V_mat(2,i)    = maxval(u(:,i,2) + bbeta*V_last(1,:))
                next_mat(2,i) = maxloc(u(:,i,2) + bbeta*V_last(1,:),1)
            end do
            !$OMP END DO
            !$OMP END PARALLEL

            test = maxval(abs(log(V_last/V_mat)))
        end do
    end subroutine
end module mod_calc

program main
    use mod_calc

    implicit none
    integer:: clck_counts_beg,clck_rate,clck_counts_end
    
    call omp_set_num_threads(4)

    call system_clock ( clck_counts_beg, clck_rate )
    call calc_val()
    call system_clock ( clck_counts_end, clck_rate )
    write (*, '("Time = ",f6.3," seconds.")')  (clck_counts_end - clck_counts_beg) / real(clck_rate)
end program main

There should be ways to reduce the amount of allocations (Julia reports 32-45% gc time!) but for now I am too novice to see them, so any comments and tipps are welcome.

Edit:

Adding @views and correct broadcasting to the while loop improved the Julia speed considerably (as expected, I guess) and hence beats the Matlab loop now. With 4 threads the code now takes only 1.97secs. Specifically,

    @inbounds for i in 1:len
        c[1,:,i] = @views @. a_grid[i] - a_grid/R .+ 20.0
        c[2,:,i] = @views @. a_grid[i] - a_grid/R
    end

    u = @. c^ζ * ζ^(-1)
    @. u[c<=0] = typemin(Float64)

    while test>tol && outeriter<20000
        outeriter += 1
        V_last = deepcopy(V_mat)

        @inbounds Threads.@threads for i in 1:len # loop over grid points
            V_mat[1,i], next_mat[1,i] = @views findmax( @. u[1,:,i] + β*V_last[2,:])
            V_mat[2,i], next_mat[2,i] = @views findmax( @. u[2,:,i] + β*V_last[1,:])
        end
        test = @views maximum( @. abs(V_mat - V_last)[!isnan( V_mat - V_last )])
    end
like image 298
cgx Avatar asked Oct 11 '20 03:10

cgx


People also ask

Is Julia faster than MATLAB?

Finaly, the highly optimized Julia code is 2x faster than Matlab vectorized code.

Is Julia faster than Fortran?

Pure Julia erfinv(x) [ = erf–1(x) ] 3–4× faster than Matlab's and 2–3× faster than SciPy's (Fortran Cephes). Julia code can actually be faster than typical “oplmized” C/Fortran code, by using techniques [metaprogramming/ code generalon] that are hard in a low-level language.

Is Julia programming language fast?

Julia prides itself on being very fast. Julia, unlike Python which is interpreted, is a compiled language that is primarily written in its own base. However, unlike other compiled languages like C, Julia is compiled at run-time, whereas traditional languages are compiled prior to execution.

Why is Julia faster?

Julia gives you the language features and a JIT compiler which makes it possible for you to optimize Julia code and get it as fast or faster than C/C++ or Fortran. With a language like Python you hit the ceiling of what is possible to do much sooner.


Video Answer


1 Answers

The reason the fortran is so slow is that it is using quadruple precision - I don't know Julia or Matlab but it looks as though double precision is being used in that case. Further as noted in the comments some of the loop orders are incorrect for Fortran, and also you are not consistent in your use of precision in the Fortran code, most of your constants are single precision. Correcting all these leads to the following:

  • Original: test = 9.83440674663232047922921588613472439E-0005 Time = 31.413 seconds.
  • Optimised: test = 9.8343643237979391E-005 Time = 0.912 seconds.

Note I have turned off parallelisation for these, all results are single threaded. Code is below:

module mod_calc
!!$    use omp_lib
    implicit none
!!$    integer, parameter :: dp = selected_real_kind(33,4931), len = 1001
    integer, parameter :: dp = selected_real_kind(15), len = 1001
    public :: dp, len

    contains

    subroutine linspace(from, to, array)
        real(dp), intent(in) :: from, to
        real(dp), intent(out) :: array(:)
        real(dp) :: range
        integer :: n, i
        n = size(array)
        range = to - from

        if (n == 0) return

        if (n == 1) then
            array(1) = from
            return
        end if

        do i=1, n
            array(i) = from + range * (i - 1) / (n - 1)
        end do
    end subroutine

    subroutine calc_val() 
        real(dp)::  bbeta, sigma, R, zeta, tol, test
        real(dp)::  a_grid(len), V_mat(len,2), V_last(len,2), &
                    u(len,len,2), c(len,len,2)
        
        integer :: outeriter, i, sss, next_mat(2,len), fu
        character(len=*), parameter :: FILE_NAME = 'data.txt'   ! File name.

        call linspace(from=0._dp, to=100._dp, array=a_grid)

        bbeta = 0.98_dp
        sigma = 0.5_dp
        R = 1.0_dp/0.98_dp
        zeta = 1.0_dp - 1.0_dp/sigma

        tol = 1e-4_dp
        test = 1000.0_dp
        outeriter = 0

        do i = 1,len
            c(:,i,1) = a_grid(i) - a_grid/R + 20.0_dp
            c(:,i,2) = a_grid(i) - a_grid/R
        end do

        u = c**zeta * 1.0_dp/zeta

        where (c<=0)
            u = -1e6_dp
        end where

        V_mat = 0.0_dp
        next_mat = 0.0_dp

        do while (test>tol .and. outeriter<20000)
            outeriter = outeriter+1
            V_last = V_mat

            !$OMP PARALLEL DEFAULT(NONE)          &
            !$OMP SHARED(V_mat, next_mat,V_last, u, bbeta)  &
            !$OMP PRIVATE(i)                      
            !$OMP DO SCHEDULE(static)
            do i=1,len 
                V_mat(i,1)    = maxval(u(:,i,1) + bbeta*V_last(:, 2))
                next_mat(i,1) = maxloc(u(:,i,1) + bbeta*V_last(:, 2),1)
                V_mat(i,2)    = maxval(u(:,i,2) + bbeta*V_last(:, 1))
                next_mat(i,2) = maxloc(u(:,i,2) + bbeta*V_last(:, 1),1)
            end do
            !$OMP END DO
            !$OMP END PARALLEL

            test = maxval(abs(log(V_last/V_mat)))
         end do
         Write( *, * ) test
    end subroutine
end module mod_calc

program main
    use mod_calc

    implicit none
    integer:: clck_counts_beg,clck_rate,clck_counts_end
    
!!$    call omp_set_num_threads(2)

    call system_clock ( clck_counts_beg, clck_rate )
    call calc_val()
    call system_clock ( clck_counts_end, clck_rate )
    write (*, '("Time = ",f6.3," seconds.")')  (clck_counts_end - clck_counts_beg) / real(clck_rate)
end program main

Compilation / linking:

ian@eris:~/work/stack$ gfortran --version
GNU Fortran (Ubuntu 7.4.0-1ubuntu1~18.04.1) 7.4.0
Copyright (C) 2017 Free Software Foundation, Inc.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

ian@eris:~/work/stack$ gfortran -Wall -Wextra -O3   jul.f90 
jul.f90:36:48:

         character(len=*), parameter :: FILE_NAME = 'data.txt'   ! File name.
                                                1
Warning: Unused parameter ‘file_name’ declared at (1) [-Wunused-parameter]
jul.f90:35:57:

         integer :: outeriter, i, sss, next_mat(2,len), fu
                                                         1
Warning: Unused variable ‘fu’ declared at (1) [-Wunused-variable]
jul.f90:35:36:

         integer :: outeriter, i, sss, next_mat(2,len), fu
                                    1
Warning: Unused variable ‘sss’ declared at (1) [-Wunused-variable]

Running:

ian@eris:~/work/stack$ ./a.out
   9.8343643237979391E-005
Time =  0.908 seconds.
like image 86
Ian Bush Avatar answered Oct 18 '22 06:10

Ian Bush