Memory efficient creation of sparse matrix

Tags:

I have a list of 50000 string vectors, consisting of various combinations of 6000 unique strings.

Goal: I want to transform them in "relative frequencies" (table(x)/length(x)) and store them in a sparse matrix. Low memory consumption is more important than speed. Currently memory is the bottleneck. (Even though source data has about ~50 mb and data in target format ~10mb --> Transformation seems to be inefficient,...)

Generate sample data

dims <- c(50000, 6000)
nms <- paste0("A", 1:dims[2])
lengths <- sample(5:30, dims[1], replace = T)
data <- lapply(lengths, sample, x = nms, replace = T)

Possible attempts:

1) sapply() with simplify to sparse matrix?

library(Matrix)  
sparseRow <- function(stringVec){
  relFreq <- c(table(factor(stringVec, levels = nms)) / length(stringVec))
  Matrix(relFreq, 1, dims[2], sparse = TRUE)
}
sparseRows <- sapply(data[1:5], sparseRow)
sparseMat <- do.call(rbind, sparseRows)

Problem: My bottleneck seems to be the sparseRows as the rows are not directly combined to a sparse matrix. (If i run the code above on the full sample, i get an Error: cannot allocate vector of size 194 Kb Error during wrapup: memory exhausted (limit reached?) - my hardware has 8 GB RAM.)

Obviously there is more memory consumption for creating a list of rows, before combining them instead of filling the sparse matrix directly. --> so using (s/l)apply is not memory friendly in my case?

object.size(sparseRows)
object.size(sparseMat)

2) Dirty workaround(?)

My goal seems to be to create an empty sparse matrix and fill it row wise. Below is a dirty way to do it (which works on my hardware).

indxs <- lapply(data, function(data) sapply(data, function(x) which(x == nms), 
   USE.NAMES = FALSE))
relFreq <- lapply(indxs, function(idx) table(idx)/length(idx))

mm <- Matrix(0, nrow = dims[1], ncol = dims[2])
for(idx in 1:dims[1]){
  mm[idx, as.numeric(names(relFreq[[idx]]))] <- as.numeric(relFreq[[idx]])
}
#sapply(1:dims[1], function(idx) mm[idx, 
#     as.numeric(names(relFreq[[idx]]))] <<- as.numeric(relFreq[[idx]]))

I would like to ask if there is a more elegant/efficient way to achieve that with lowest amount of RAM possible.

219

asked Jul 16 '18 19:07

Tlatwork

2 Answers

I would convert to data.table and then do the necessary calculations:

ld <- lengths(data)
D <- data.table(val = unlist(data),
                id = rep(1:length(data), times = ld),
                Ntotal = rep(ld, times = ld))
D <- D[, .N, keyby = .(id, val, Ntotal)]
D[, freq := N/Ntotal]
ii <- data.table(val = nms, ind = seq_along(nms))
D <- ii[D, on = 'val']
sp <- with(D, sparseMatrix(i = id, j = ind, x = freq,
                           dims = c(max(id), length(nms))))

Benchmarks for n = 100

data2 <- data[1:100]
Unit: milliseconds
      expr        min         lq       mean    median        uq        max neval cld
        OP 102.150200 106.235148 113.117848 109.98310 116.79734 142.859832    10  b 
  F. Privé 122.314496 123.804442 149.999595 126.76936 164.97166 233.034447    10   c
     minem   5.617658   5.827209   6.307891   6.10946   6.15137   9.199257    10 a  
 user20650  11.012509  11.752350  13.580099  12.59034  14.31870  21.961725    10 a

Benchmarks on all data

Lets benchmark 3 of the fastest functions, because rest of them (OP's, user20650_v1 and F.Privé's) would be to slow on all of the data.

user20650_v2 <- function(x) {
  dt2 = data.table(lst = rep(1:length(x), lengths(x)),
                   V1 = unlist(x))
  dt2[, V1 := factor(V1, levels = nms)]
  x3 = xtabs(~ lst + V1, data = dt2, sparse = TRUE)
  x3/rowSums(x3)
}
user20650_v3 <- function(x) {
  x3 = xtabs(~ rep(1:length(x), lengths(x)) + factor(unlist(x), levels = nms),
             sparse = TRUE)
  x3/rowSums(x3)
}
minem <- function(x) {
  ld <- lengths(x)
  D <- data.table(val = unlist(x), id = rep(1:length(x), times = ld),
                  Ntotal = rep(ld, times = ld))
  D <- D[, .N, keyby = .(id, val, Ntotal)]
  D[, freq := N/Ntotal]
  ii <- data.table(val = nms, ind = seq_along(nms))
  D <- ii[D, on = 'val']
  sparseMatrix(i = D$id, j = D$ind, x = D$freq,
               dims = c(max(D$id), length(nms)))
}

Compare the results of minem and user20650_v3:

x1 <- minem(data)
x2 <- user20650_v3(data)
all.equal(x1, x2)
# [1] "Component “Dimnames”: names for current but not for target"             
# [2] "Component “Dimnames”: Component 1: target is NULL, current is character"
# [3] "Component “Dimnames”: Component 2: target is NULL, current is character"
# [4] "names for target but not for current"

x2 has additional names. remove them:

dimnames(x2) <- names(x2@x) <- NULL
all.equal(x1, x2)
# [1] TRUE # all equal

Timings:

x <- bench::mark(minem(data),
            user20650_v2(data),
            user20650_v3(data),
            iterations = 5, check = F)
as.data.table(x)[, 1:10]

#            expression   min  mean median   max  itr/sec mem_alloc n_gc n_itr total_time
# 1:        minem(data) 324ms 345ms  352ms 371ms 2.896187     141MB    7     5      1.73s
# 2: user20650_v2(data) 604ms 648ms  624ms 759ms 1.544380     222MB   10     5      3.24s
# 3: user20650_v3(data) 587ms 607ms  605ms 633ms 1.646977     209MB   10     5      3.04s

relating memory:

OPdirty <- function(x) {
  indxs <- lapply(x, function(x) sapply(x, function(x) which(x == nms), 
                                        USE.NAMES = FALSE))
  relFreq <- lapply(indxs, function(idx) table(idx)/length(idx))
  dims <- c(length(indxs), length(nms))
  mm <- Matrix(0, nrow = dims[1], ncol = dims[2])
  for (idx in 1:dims[1]) {
    mm[idx, as.numeric(names(relFreq[[idx]]))] <- as.numeric(relFreq[[idx]])
  }
  mm
}


xx <- data[1:1000]
all.equal(OPdirty(xx), minem(xx))
# true
x <- bench::mark(minem(xx),
                 FPrive(xx),
                 OPdirty(xx),
                 iterations = 3, check = T)
as.data.table(x)[, 1:10]
    expression     min    mean  median     max    itr/sec mem_alloc n_gc n_itr total_time
1:   minem(xx) 12.69ms 14.11ms 12.71ms 16.93ms 70.8788647    3.04MB    0     3    42.33ms
2:  FPrive(xx)   1.46s   1.48s   1.47s   1.52s  0.6740317  214.95MB    4     3      4.45s
3: OPdirty(xx)   2.12s   2.14s   2.15s   2.16s  0.4666106  914.91MB    9     3      6.43s

See column mem_alloc...

143

answered Oct 05 '22 23:10

minem

Use a loop to fill a pre-allocated sparse matrix column-wise (and then transpose it):

res <- Matrix(0, dims[2], length(data), sparse = TRUE)
for (i in seq_along(data)) {
  ind.match <- match(data[[i]], nms)
  tab.match <- table(ind.match)
  res[as.integer(names(tab.match)), i] <- as.vector(tab.match) / length(data[[i]])
}
# Verif
stopifnot(identical(t(res), sparseMat))

Benchmark:

data2 <- data[1:50]
microbenchmark::microbenchmark(
  OP = {
    sparseMat <- do.call(rbind, sapply(data2, sparseRow))
  },
  ME = {
    res <- Matrix(0, dims[2], length(data2), sparse = TRUE)
    for (i in seq_along(data2)) {
      ind.match <- match(data2[[i]], nms)
      tab.match <- table(ind.match)
      res[as.integer(names(tab.match)), i] <- as.vector(tab.match) / length(data2[[i]])
    }
    res2 <- t(res)
  }
)
stopifnot(identical(res2, sparseMat))

Unit: milliseconds
 expr      min       lq     mean   median       uq       max neval cld
   OP 56.28020 59.61689 63.24816 61.16986 62.80294 206.18689   100   b
   ME 46.60318 48.27268 49.77190 49.50714 50.92287  55.23727   100  a

So, it's memory-efficient and not that slow.

answered Oct 06 '22 00:10

F. Privé

Related questions
                            
                                Mystified by qr.Q(): what is an orthonormal matrix in "compact" form?
                            
                                R ggplot2 plotting hourly data
                            
                                setkey and the := operator, data.table, R
                            
                                ggplot geom_bar vs geom_histogram
                            
                                How to place the intercept of x and y axes at (0 , 0) and extend the x and y axes to the edge of the plot [duplicate]
                            
                                What/Where are the attributes of a function object?
                            
                                Creating same random number sequence in Python, NumPy and R
                            
                                Performance of R stats::sd() vs. arma::stddev() vs. Rcpp implementation
                            
                                Printing nice cross-tabulations in knitr
                            
                                How to exit a Shiny app and return a value
                            
                                Negative look-ahead assertion in list.files in R
                            
                                doParallel "foreach" inconsistently inherits objects from parent environment: "Error in { : task 1 failed - "could not find function..."
                            
                                Referencing a 'hand-made' table using bookdown package
                            
                                dplyr . and _no visible binding for global variable '.'_ Note in package check
                            
                                Converting unix seconds in milliseconds to POSIXct/POSIXlt
                            
                                Using a date field in a ts?
                            
                                How to extract the correct timezones from POSIXct and POSIXlt objects?
                            
                                Alternate, interweave or interlace two vectors
                            
                                How to generate a URL to restore the user input values in Shiny
                            
                                configure error installing R-3.3.2 on Ubuntu: checking whether bzip2 support suffices... configure: error: bzip2 library and headers are required

Donate For Us

If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!

Donate Us With