Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Bin Granges with Gaps

I am try to split Granges to specific n of bins, usually, GenomicRanges::tile could work for this. However, my Granges has some gaps, for example:

# if (!require("BiocManager", quietly = TRUE))
#   install.packages("BiocManager")
#
# BiocManager::install("GenomicRanges")
# BiocManager::install("GenomicFeatures")

library(GenomicRanges)
library(GenomicFeatures)

gr <- GRanges(
  seqnames = "chr1",
  ranges = IRanges(start = c(100, 200, 300), end = c(148, 249, 349)),
  strand = "+"
)
GRanges object with 3 ranges and 0 metadata columns:
      seqnames    ranges strand
         <Rle> <IRanges>  <Rle>
  [1]     chr1   100-148      +
  [2]     chr1   200-249      +
  [3]     chr1   300-349      +
  -------
  seqinfo: 1 sequence from an unspecified genome; no seqlengths

expected output:

      seqnames    ranges strand |       bin
         <Rle> <IRanges>  <Rle> | <integer>
  [1]     chr1   100-109      + |         1
  [2]     chr1   110-119      + |         2
  [3]     chr1   120-129      + |         3
  [4]     chr1   130-139      + |         4
  [5]     chr1   140-148      + |         5 ## cross gap
  [6]     chr1   200-200      + |         5
  [7]     chr1   201-210      + |         6
  ...

A diagram for this is: enter image description here (The middle l3 is l2, Its a typo)

I have a customize function to do this, but It runs very slowly. I'm looking forward to seeing if there are any efficient methods.

tile_granges_with_intron <- function(exons, n) {
  exon_lengths <- width(exons)
  total_length <- sum(exon_lengths)
  
  bin_size <- total_length / n
  bin_starts <- floor(seq(1, total_length, by = bin_size))
  bin_ends <- c(bin_starts[-1] - 1, total_length)
  
  cum_lengths <- cumsum(exon_lengths)
  transcript_starts <- c(1, cum_lengths[-length(cum_lengths)] + 1)
  transcript_ends <- cum_lengths
  
  bins <- GRangesList()
  
  for (i in 1:n) {
    bin_start <- bin_starts[i]
    bin_end <- bin_ends[i]

    overlaps <- which(
      bin_start <= transcript_ends & bin_end >= transcript_starts
    )
    
    bin_ranges <- GRanges()
    for (j in overlaps) {
      overlap_start <- max(bin_start, transcript_starts[j])
      overlap_end <- min(bin_end, transcript_ends[j])
      
      genomic_start <- start(exons[j]) + (overlap_start - transcript_starts[j])
      genomic_end <- start(exons[j]) + (overlap_end - transcript_starts[j])
      
      bin_ranges <- c(bin_ranges, GRanges(
        seqnames = seqnames(exons[j]),
        ranges = IRanges(start = genomic_start, end = genomic_end),
        strand = strand(exons[j])
      ))
    }
    
    mcols(bin_ranges)$bin <- i
    bins[[i]] <- bin_ranges
  }
  
  names(bins) <- paste0("bin", 1:n)
  bins <- unlist(bins, use.names = FALSE)
  return(bins)
}

If this cannot be accomplished in Grangs, can it be accomplished in data.table? The column strands here can be ignored because they are always consistent

# we can convert it to data.table
df <- as.data.frame(gr) %>% 
      data.table::as.data.table()
df
seqnames start   end width strand
     <fctr> <int> <int> <int> <fctr>
1:     chr1   100   148    50      +
2:     chr1   200   249    50      +
3:     chr1   300   349    50      +
like image 758
zhang Avatar asked Sep 01 '25 03:09

zhang


2 Answers

library(GenomicRanges)
library(GenomicFeatures)

gr <- GRanges(
  seqnames = "chr1",
  ranges = IRanges(start = c(100, 200, 300), end = c(148, 249, 349)),
  strand = "+"
)
tile_granges_vectorized <- function(gr, n_bins) {
  
  starts <- start(gr)
  widths <- width(gr)
  total_length <- sum(widths)
  bin_size <- total_length / n_bins
  
  cum_ends <- cumsum(widths)
  cum_starts <- c(1, cum_ends[-length(cum_ends)] + 1)
  
  max_results <- n_bins * length(gr)
  result_seqnames <- character(max_results)
  result_starts <- integer(max_results)
  result_ends <- integer(max_results)
  result_strands <- character(max_results)
  result_bins <- integer(max_results)
  result_count <- 0
  
  for(i in 1:n_bins) {
    bin_start_cum <- (i - 1) * bin_size + 1
    bin_end_cum <- min(i * bin_size, total_length)
    
    overlaps <- which(bin_start_cum <= cum_ends & bin_end_cum >= cum_starts)
    
    if(length(overlaps) > 0) {
      overlap_starts_cum <- pmax(bin_start_cum, cum_starts[overlaps])
      overlap_ends_cum <- pmin(bin_end_cum, cum_ends[overlaps])
      
      genomic_starts <- starts[overlaps] + (overlap_starts_cum - cum_starts[overlaps])
      genomic_ends <- starts[overlaps] + (overlap_ends_cum - cum_starts[overlaps])
      
      n_overlaps <- length(overlaps)
      idx_range <- (result_count + 1):(result_count + n_overlaps)
      
      result_seqnames[idx_range] <- as.character(seqnames(gr)[overlaps])
      result_starts[idx_range] <- genomic_starts
      result_ends[idx_range] <- genomic_ends
      result_strands[idx_range] <- as.character(strand(gr)[overlaps])
      result_bins[idx_range] <- i
      
      result_count <- result_count + n_overlaps
    }
  }
  
  if(result_count > 0) {
    idx <- 1:result_count
    GRanges(
      seqnames = result_seqnames[idx],
      ranges = IRanges(start = result_starts[idx], end = result_ends[idx]),
      strand = result_strands[idx],
      bin = result_bins[idx]
    )
  } else {
    GRanges()
  }
}
microbenchmark::microbenchmark(
  intron = tile_granges_with_intron(gr, 10),
  vectorized = tile_granges_vectorized(gr, 10),
  times = 10,
  check='identical'
)
#> Unit: milliseconds
#>        expr      min       lq      mean    median       uq      max neval cld
#>      intron 322.7702 328.6691 341.78269 341.35505 345.6044 377.0622    10  a 
#>  vectorized  10.0457  10.4513  12.07787  10.97265  13.5882  18.1502    10   b

Created on 2025-06-30 with reprex v2.1.1

like image 78
M-- Avatar answered Sep 02 '25 17:09

M--


Although there is an accepted answer, I post the solution below since it uses a different logic, achieves comparable results in terms of speed, and is more stable (in terms of speed) when the number of bins grows.

library(GenomicRanges)
library(GenomicFeatures)

gr <- GRanges(
  seqnames = "chr1",
  ranges = IRanges(start = c(100, 200, 300), end = c(148, 249, 349)),
  strand = "+"
)
tile_granges_rle <- function(exons, n) {
  
  exon_lengths <- width(exons)
  total_length <- sum(exon_lengths)
  
  # get cutting points
  bin_size <- total_length / n
  bin_cuts <- seq(from = bin_size, to = total_length, by = bin_size)
  
  # assign each gene to a bin
  bin_idx <- findInterval(
    x = 1:total_length, 
    vec = bin_cuts, 
    left.open = T, 
    checkSorted = F
  ) + 1
  
  # identify new ranges & get summary using running length encoding
  range_idx <- rep(start(exons), times = width(exons))
  new_ranges <- paste(range_idx, bin_idx)
  rle_ranges <- rle(new_ranges)
  id_split <- strsplit(rle_ranges$values, split = " ")
  rle_ranges$bins <- as.integer(sapply(id_split, "[", 2))
  rle_ranges$values <- as.integer(sapply(id_split, "[", 1))
  
  # find start and end points of each new range (+ strand & seqnames)
  for (val in unique(rle_ranges$values)) {
    
    idx_val <- which(rle_ranges$values == val)
    range_width <- rle_ranges$lengths[idx_val]
    
    rle_ranges$starts <- c(rle_ranges$starts, cumsum(range_width) - range_width + val)
    rle_ranges$ends <- c(rle_ranges$ends, cumsum(range_width) - 1 + val)
    
    strand_val <- as.character(strand(exons[start(exons) == val]))
    rle_ranges$strands <- c(rle_ranges$strands, 
                            rep(strand_val, times = length(idx_val)))
    
    seqnames_val <- as.character(seqnames(exons[start(exons) == val]))
    rle_ranges$seqnames <- c(rle_ranges$seqnames, 
                             rep(seqnames_val, times = length(idx_val)))
    
  }
  
  res <- GRanges(
    seqnames = rle_ranges$seqnames,
    ranges = IRanges(start = rle_ranges$starts, end = rle_ranges$ends),
    strand = rle_ranges$strands,
    bin = rle_ranges$bins
  )
  
  return(res)
  
}
microbenchmark::microbenchmark(
  intron = tile_granges_with_intron(gr, 10),
  vectorized = tile_granges_vectorized(gr, 10),
  rle = tile_granges_rle(gr, 10),
  times = 10,
  check='identical'
)
# Unit: milliseconds
#        expr      min       lq      mean    median       uq      max neval cld
#      intron 337.5580 350.1868 416.33718 362.63930 382.9547 902.6799    10   a 
#  vectorized  10.2361  10.5405  11.08353  11.01365  11.7656  11.8167    10   b
#         rle   8.9927   9.1254  10.29057  10.18255  10.8431  12.2104    10   b

microbenchmark::microbenchmark(
  intron = tile_granges_with_intron(gr, 60),
  vectorized = tile_granges_vectorized(gr, 60),
  rle = tile_granges_rle(gr, 60),
  times = 10,
  check='identical'
)
# Unit: milliseconds
#        expr       min        lq       mean     median        uq       max neval cld
#      intron 1946.3718 1974.2415 1995.67697 1981.59585 1999.4995 2091.2665    10   a  
#  vectorized   42.6344   43.0850   47.24929   44.35430   54.7038   56.2842    10   b 
#         rle    9.0754    9.3832    9.78063    9.52365   10.2477   11.2114    10   c
like image 37
ju-henry Avatar answered Sep 02 '25 17:09

ju-henry