I am processing something with lots of repeated lines:
puts xs
# => [ [1, "A", 23626], [1, "A", 31314], [2, "B", 2143], [2, "B", 5247] ]
puts xs.uniq{ |x| x[0] }.map{ |x| [x[0], x[1]] }
# => [ [1, "A"], [2, "B"] ]
But xs is huge. I'm trying to load it lazily, but Enumerator::Lazy does not have a uniq method.
How do I achieve this lazily?
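(For illustration, the lazy pipeline over the raw input looks roughly like this; the file name and column layout here are made up, only the .lazy chaining matters.)
xs = File.foreach("data.tsv").lazy.map do |line|
  # Parse each tab-separated line into [id, label, value]
  id, label, value = line.chomp.split("\t")
  [Integer(id), label, Integer(value)]
end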
I decided to run a benchmark on the two methods I suggested and on @Amadan's method. The results speak for themselves.
Benchmark code
require 'benchmark'
module EnumeratorLazyUniq
  refine Enumerator::Lazy do
    require 'set'
    def uniq
      set = Set.new
      select { |e|
        val = block_given? ? yield(e) : e
        !set.include?(val).tap { |exists|
          set << val unless exists
        }
      }
    end
  end
end
using EnumeratorLazyUniq
def amadan(xs)
  xs.uniq{ |x| x[0] }.map{ |x| [x[0], x[1]] }
end
require 'set'
def cary_set(arr)
  first = Set.new
  arr.each_with_object([]) do |(a0, a1, *_), b|
    unless first.include?(a0)
      first << a0
      b << [a0, a1]
    end
  end
end
def cary_hash(arr)
  arr.each_with_object({}) { |(a0, a1, *_), h|
    h[a0] = [a0, a1] unless h.key?(a0) }.values
end
Test data
n_uniq = 10_000
n_copies = 100
tot = n_uniq * n_copies
xs = tot.times.map { |i| [i % n_uniq, 0, 1] }
Run Benchmark
Benchmark.bm do |x|
  x.report("cary_set ") { cary_set(xs) }
  x.report("cary_hash") { cary_hash(xs) }
  x.report("amadan   ") { amadan(xs) }
end
Results
Unique elements: 200,000
Number of copies of each unique element: 5
Array size: 1,000,000
user system total real
cary_set 0.980000 0.030000 1.010000 ( 1.018618)
cary_hash 0.980000 0.010000 0.990000 ( 0.982508)
amadan 0.590000 0.010000 0.600000 ( 0.597249)
Unique elements: 100,000
Number of copies of each unique element: 10
Array size: 1,000,000
user system total real
cary_set 0.920000 0.030000 0.950000 ( 0.942539)
cary_hash 0.630000 0.020000 0.650000 ( 0.642367)
amadan 0.470000 0.000000 0.470000 ( 0.478658)
Unique elements: 50,000
Number of copies of each unique element: 20
Array size: 1,000,000
user system total real
cary_set 0.910000 0.020000 0.930000 ( 0.932277)
cary_hash 0.570000 0.000000 0.570000 ( 0.575439)
amadan 0.410000 0.010000 0.420000 ( 0.417695)
Unique elements: 1,000,000
Number of copies of each unique element: 10
Array size: 10,000,000
user system total real
cary_set 12.660000 0.270000 12.930000 ( 12.962183)
cary_hash 7.730000 0.060000 7.790000 ( 7.797486)
amadan 6.640000 0.060000 6.700000 ( 6.707706)
module EnumeratorLazyUniq
  refine Enumerator::Lazy do
    require 'set'
    def uniq
      set = Set.new
      select { |e|
        # Compare by the block's result if one is given, otherwise by the element itself.
        val = block_given? ? yield(e) : e
        # Keep the element only on first sight; record its key as a side effect of tap.
        !set.include?(val).tap { |exists|
          set << val unless exists
        }
      }
    end
  end
end
using EnumeratorLazyUniq
xs = [ [1, "A", 23626], [1, "A", 31314], [2, "B", 2143], [2, "B", 5247] ].to_enum.lazy
us = xs.uniq{ |x| x[0] }.map{ |x| [x[0], x[1]] }
puts us.to_a.inspect
# => [[1, "A"], [2, "B"]]
# Works with a block
puts us.class
# => Enumerator::Lazy
# Yep, still lazy.
ns = [1, 4, 6, 1, 2].to_enum.lazy
puts ns.uniq.to_a.inspect
# => [1, 4, 6, 2]
# Works without a block
This is a straightforward implementation using Set; this means the uniq'd keys (i.e. the values being compared, such as x[0] here, or whole elements when no block is given, but not the stream elements themselves such as [1, "A", 23626]) will take up memory.
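To check that laziness survives the whole chain, here is a small sketch (assuming the refinement above is in effect) over an artificial infinite stream; first(3) forces only the first three unique rows, and the rest of the stream is never evaluated.
big = (1..Float::INFINITY).lazy.map { |i| [i % 3, "row-#{i}", i] }
puts big.uniq { |x| x[0] }.map { |x| [x[0], x[1]] }.first(3).inspect
# => [[1, "row-1"], [2, "row-2"], [0, "row-3"]]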