I had a hard time using Julia to read a large text file (968MB, 8.7 million rows). Each line is like:
0.3295747E+01 0.3045123E+01 0.3325542E+01 0.1185458E+01 -0.4827727E-05 -0.1033694E-04 0.3306459E-03
I used parse.(Float64, split(line))
to convert every line to numbers.
# Read every line of `path` into a Vector{String}.
#
# Uses `open(readlines, path, "r")` (the function-first form of `open`)
# so the file handle is guaranteed to be closed even if `readlines`
# throws — the original explicit open/close pair would leak the handle
# on error.  `path` defaults to "data.dat" for backward compatibility.
function openfile(path::AbstractString="data.dat")
    return open(readlines, path, "r")
end
# Parse each whitespace-delimited line into a Vector{Float64} and return
# all rows as a Vector{Vector{Float64}}.
#
# The original discarded every parsed row (`zzz` was a loop-local that
# was never used) and returned `nothing`; returning the data makes the
# function useful while remaining compatible with callers that ignore
# the return value.  The signature is widened from `Array{String}` to
# any vector of strings (e.g. SubString from `split`/`eachline`).
function parseline(lines::AbstractVector{<:AbstractString})
    return [parse.(Float64, split(line)) for line in lines]
end
# Parse every `T`-valued token out of `str`, where tokens are separated
# by `dlm` and/or `eol` characters.  Returns a Vector{T}.
#
# Rewritten to use only public API:
#   * `SubString` + `parse` replaces the internal `Base.tryparse_internal`
#     (an unexported function whose signature can change between Julia
#     versions).  `SubString` avoids copying, so this stays allocation-light.
#   * `eachindex(str)` replaces `enumerate(str)`: enumerate yields a
#     character *count*, not a byte index, so the original sliced at the
#     wrong positions for any non-ASCII input.
#   * `parse` throws an informative ArgumentError on a malformed token,
#     instead of pushing `nothing` into a Vector{T} (a MethodError).
function myreadfile(str::String, T::Type, dlm=' ', eol='\n')
    data = T[]
    tok = 0                 # byte index where the current token starts; 0 = not inside a token
    for idx in eachindex(str)
        chr = str[idx]
        if chr == dlm || chr == eol
            if tok != 0
                # Token ended just before this separator.
                push!(data, parse(T, SubString(str, tok, prevind(str, idx))))
                tok = 0
            end
        elseif tok == 0
            tok = idx       # first character of a new token
        end
    end
    # Trailing token with no terminating separator.
    tok != 0 && push!(data, parse(T, SubString(str, tok)))
    return data
end
# Benchmark the four approaches against the same file.
# NOTE(review): `@time` includes compilation on the first call of each
# function in a fresh session; run each line twice (or use
# BenchmarkTools.@btime) for a fair comparison.
@time lines = openfile()
@time parseline(lines)
using DelimitedFiles
# Stdlib baseline: parses the whole file into a Matrix{Float64}.
@time readdlm("data.dat")
# Hand-rolled tokenizer over the file slurped as one String.
@time myreadfile(read("data.dat",String), Float64)
and got
3.584656 seconds (17.59 M allocations: 1.240 GiB, 28.44% gc time)
78.099010 seconds (276.14 M allocations: 6.080 GiB, 1.50% gc time)
52.504199 seconds (185.93 M allocations: 3.960 GiB, 0.53% gc time)
46.085581 seconds (61.70 M allocations: 2.311 GiB, 0.28% gc time)
Compare with fortran code
! Fortran baseline: time a list-directed read of the same 8.7M-row file,
! reading the seven reals of each row into a..g.
call cpu_time(start)
open(10, file="data.dat",status="old")
do i=1, 8773632
read(10,*) a, b, c, d, e, f, g
end do
call cpu_time(finish)
print '("Time = ",f6.3," seconds.")',finish-start
Which is Time = 14.812 seconds.
It seems Julia spends much longer time doing the same thing. Is there a better way to convert string to float? split and parse are so slow.
Creating a file: to work with a file in Julia, first create a new empty file with `touch`, check the present working directory with `pwd`, and change into the directory where the file should live with `cd`.
As the comment above says, the fastest option is most likely the `readdlm`
function. It returns a `Matrix`, which is most likely what you want.
If you do want to do it by hand it's usually better to read through the file and process it line by line, instead of storing everything in big intermediary objects. Memory reads and writes are slow. Something like
# Stream the file line by line, parsing each row into a Vector{Float64};
# the do-block form of `open` closes the handle automatically.
ret = open("data.dat", "r") do datafile
    map(eachline(datafile)) do line
        parse.(Float64, split(line))
    end
end
It's probably not faster than your last line anyway though.
If you found this helpful, you can donate to us via PayPal or buy me a coffee so we can keep maintaining and growing. Thank you!
Donate to us with