Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Segment vector according to whether or not values are above a threshold in R

Tags:

r

vector

I have a long vector and I need to divide it into segments according to a threshold. A segment is consecutive values over the threshold. When values drop below the threshold, the segment ends and the next segment begins where the values once again cross above the threshold. I need to record the start and end indices of each segment.

Below is an inefficient implementation. What's the fastest and most appropriate way to write this? This is pretty ugly, I have to assume that there's a cleaner implementation.

set.seed(10)
test.vec <- rnorm(100, 8, 10)
threshold <- 0
segments <- list()
in.segment <- FALSE
for(i in 1:length(test.vec)){

    # If we're in a segment
    if(in.segment){
        if(test.vec[i] > threshold){
            next
        }else{
            end.ind <- i - 1
            in.segment <- FALSE
            segments[[length(segments) + 1]] <- c(start.ind, end.ind)
        }
    }

    # if not in segment
    else{
        if(test.vec[i] > threshold){        
            start.ind <- i
            in.segment <- TRUE
        }
    }
}

EDIT: Runtime of all solutions

Thanks for all the replies, this has been helpful and very instructive. A small test of all five solutions is below (the four provided plus the original example). As you can see, all four are a huge improvement over the original solution, but Khashaa's solution is by far the fastest.

set.seed(1)
test.vec <- rnorm(1e6, 8, 10);threshold <- 0

originalFunction <- function(x, threshold){
    segments <- list()
    in.segment <- FALSE
    for(i in 1:length(test.vec)){

    # If we're in a segment
        if(in.segment){
            if(test.vec[i] > threshold){
                next
            }else{
                end.ind <- i - 1
                in.segment <- FALSE
                segments[[length(segments) + 1]] <- c(start.ind, end.ind)
            }
        }

    # if not in segment
        else{
            if(test.vec[i] > threshold){        
                start.ind <- i
                in.segment <- TRUE
            }
        }
    }
    segments
}

SimonG <- function(x, threshold){

  hit <- which(x > threshold)
  n <- length(hit)

  ind <- which(hit[-1] - hit[-n] > 1)

  starts <- c(hit[1], hit[ ind+1 ])
  ends <- c(hit[ ind ], hit[n])

  cbind(starts,ends)
}

Rcpp::cppFunction('DataFrame Khashaa(NumericVector x, double threshold) {
  x.push_back(-1);
  int n = x.size(), startind, endind; 
  std::vector<int> startinds, endinds;
  bool insegment = false;
  for(int i=0; i<n; i++){
    if(!insegment){
      if(x[i] > threshold){        
        startind = i + 1;
        insegment = true;          }
    }else{
      if(x[i] < threshold){
        endind = i;
        insegment = false;
        startinds.push_back(startind); 
        endinds.push_back(endind);
      }
    }
  }
  return DataFrame::create(_["start"]= startinds, _["end"]= endinds);
}')

bgoldst <- function(x, threshold){
    with(rle(x>threshold),
         t(matrix(c(0L,rep(cumsum(lengths),2L)[-length(lengths)]),2L,byrow=T)+1:0)[values,])   
}

ClausWilke <- function(x, threshold){
    suppressMessages(require(dplyr, quietly = TRUE))
    in.segment <- (x > threshold)
    start <- which(c(FALSE, in.segment) == TRUE & lag(c(FALSE, in.segment) == FALSE)) - 1
    end <- which(c(in.segment, FALSE) == TRUE & lead(c(in.segment, FALSE) == FALSE))
    data.frame(start, end)    
}

system.time({ originalFunction(test.vec, threshold); })
 ## user  system elapsed 
 ## 66.539   1.232  67.770 
system.time({ SimonG(test.vec, threshold); })
 ## user  system elapsed 
 ## 0.028   0.008   0.036 
system.time({ Khashaa(test.vec, threshold); })
 ## user  system elapsed 
 ## 0.008   0.000   0.008 
system.time({ bgoldst(test.vec, threshold); })
 ## user  system elapsed 
 ## 0.065   0.000   0.065 
system.time({ ClausWilke(test.vec, threshold); })
 ## user  system elapsed 
 ## 0.274   0.012   0.285 
like image 765
Henry David Thorough Avatar asked Jul 05 '15 21:07

Henry David Thorough


1 Answers

Here's another option, mostly using which. The start and end points are determined by finding the non-consecutive elements of the hit sequence.

test.vec <- rnorm(100, 8, 10)
threshold <- 0


findSegments <- function(x, threshold){

  hit <- which(x > threshold)
  n <- length(hit)

  ind <- which(hit[-1] - hit[-n] > 1)

  starts <- c(hit[1], hit[ ind+1 ])
  ends <- c(hit[ ind ], hit[n])

  cbind(starts,ends)

}

findSegments(test.vec, threshold=0)

This gives something like:

> findSegments(test.vec, threshold=0)
      starts ends
 [1,]      1    3
 [2,]      5    7
 [3,]      9   11
 [4,]     13   28
 [5,]     30   30
 [6,]     32   32
 [7,]     34   36
 [8,]     38   39
 [9,]     41   41
[10,]     43   43
[11,]     46   51
[12,]     54   54
[13,]     56   61
[14,]     63   67
[15,]     69   72
[16,]     76   77
[17,]     80   81
[18,]     83   84
[19,]     86   88
[20,]     90   92
[21,]     94   95
[22,]     97   97
[23,]    100  100

Compare that to the original sequence:

> round(test.vec,1)
  [1]  20.7  15.7   4.3 -15.1  24.6   9.4  23.2  -4.5  16.9  20.9  13.2  -1.2
 [13]  22.6   7.7   6.0   6.6   4.1  21.3   5.3  16.7  11.4  16.7  19.6  16.7
 [25]  11.6   7.3   3.7   8.4  -4.5  11.7  -7.1   8.4 -18.5  12.8  22.5  11.0
 [37]  -3.3  11.1   6.9  -7.9  22.9  -3.7   3.5  -7.1  -5.9   3.5  13.2  20.0
 [49]  13.2  23.4  15.9  -5.0  -6.3  10.0  -6.2   4.7   2.1  26.4   5.9  27.3
 [61]  14.3 -12.4  28.4  30.9  18.2  11.4   5.7  -4.5   6.2  12.0  10.9  11.1
 [73]  -2.0  -9.0  -1.4  15.4  19.1  -1.6  -5.4   5.4   7.8  -5.6  15.2  13.8
 [85] -18.8   7.1  17.1   9.3  -3.9  22.6   1.7  28.9 -21.3  21.2   8.2 -15.4
 [97]   3.2 -10.2  -6.2  14.1
like image 77
SimonG Avatar answered Sep 29 '22 12:09

SimonG