Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

merge data frame - column prefix

Tags:

r

The merge data frame inbuilt function contains an option to change the suffix applied when the two data frames you merge have a shared column name:

## S3 method for class 'data.frame'
merge(x, y, by = intersect(names(x), names(y)),
      by.x = by, by.y = by, all = FALSE, all.x = all, all.y = all,
      sort = TRUE, suffixes = c(".x",".y"),
      incomparables = NULL, ...) 

Is there an option to instead have a prefix (e.g. = c("x.","y.") ) applied?

like image 893
Daniel Hoare Avatar asked Sep 28 '15 13:09

Daniel Hoare


2 Answers

Since merge doesn't have this option (as of 22-Feb 2016), we can just let merge do its thing and then clean up ex-post by running:

names(df_new) <- gsub("(.*).([xy])$", "\\2.\\1", names(df_new))

I always hate writing these sort-of distracting lines of code in my scripts, but such is life.

like image 127
MichaelChirico Avatar answered Sep 20 '22 22:09

MichaelChirico


Make a custom merge function.

#dummy data
df1 <- head(mtcars[,1:4])
df2 <- tail(mtcars[,1:6])

#base merge
merge(df1,df2, by="cyl")
#   cyl mpg.x disp.x hp.x mpg.y disp.y hp.y drat    wt
# 1   4  22.8    108   93  26.0  120.3   91 4.43 2.140
# 2   4  22.8    108   93  30.4   95.1  113 3.77 1.513
# 3   4  22.8    108   93  21.4  121.0  109 4.11 2.780
# 4   6  21.0    160  110  19.7  145.0  175 3.62 2.770
# 5   6  21.0    160  110  19.7  145.0  175 3.62 2.770
# 6   6  21.4    258  110  19.7  145.0  175 3.62 2.770
# 7   6  18.1    225  105  19.7  145.0  175 3.62 2.770
# 8   8  18.7    360  175  15.8  351.0  264 4.22 3.170
# 9   8  18.7    360  175  15.0  301.0  335 3.54 3.570

#custom merge
myMerge(df1,df2, by="cyl")
#   cyl x.mpg x.disp x.hp y.mpg y.disp y.hp drat    wt
# 1   4  22.8    108   93  26.0  120.3   91 4.43 2.140
# 2   4  22.8    108   93  30.4   95.1  113 3.77 1.513
# 3   4  22.8    108   93  21.4  121.0  109 4.11 2.780
# 4   6  21.0    160  110  19.7  145.0  175 3.62 2.770
# 5   6  21.0    160  110  19.7  145.0  175 3.62 2.770
# 6   6  21.4    258  110  19.7  145.0  175 3.62 2.770
# 7   6  18.1    225  105  19.7  145.0  175 3.62 2.770
# 8   8  18.7    360  175  15.8  351.0  264 4.22 3.170
# 9   8  18.7    360  175  15.0  301.0  335 3.54 3.570

Custom myMerge function:

#custom myMerge function - modified from "base::merge.data.frame"
myMerge <- function (x, y, by = intersect(names(x), names(y)), by.x = by, 
                     by.y = by, all = FALSE, all.x = all, all.y = all, sort = TRUE, 
                     prefix = c("x.", "y."), incomparables = NULL, ...) 
{
  fix.by <- function(by, df) {
    if (is.null(by)) 
      by <- numeric()
    by <- as.vector(by)
    nc <- ncol(df)
    if (is.character(by)) {
      poss <- c("row.names", names(df))
      if (any(bad <- !charmatch(by, poss, 0L))) 
        stop(ngettext(sum(bad), "'by' must specify a uniquely valid column", 
                      "'by' must specify uniquely valid columns"), 
             domain = NA)
      by <- match(by, poss) - 1L
    }
    else if (is.numeric(by)) {
      if (any(by < 0L) || any(by > nc)) 
        stop("'by' must match numbers of columns")
    }
    else if (is.logical(by)) {
      if (length(by) != nc) 
        stop("'by' must match number of columns")
      by <- seq_along(by)[by]
    }
    else stop("'by' must specify one or more columns as numbers, names or logical")
    if (any(bad <- is.na(by))) 
      stop(ngettext(sum(bad), "'by' must specify a uniquely valid column", 
                    "'by' must specify uniquely valid columns"), 
           domain = NA)
    unique(by)
  }
  nx <- nrow(x <- as.data.frame(x))
  ny <- nrow(y <- as.data.frame(y))
  if (nx >= 2^31 || ny >= 2^31) 
    stop("long vectors are not supported")
  by.x <- fix.by(by.x, x)
  by.y <- fix.by(by.y, y)
  if ((l.b <- length(by.x)) != length(by.y)) 
    stop("'by.x' and 'by.y' specify different numbers of columns")
  if (l.b == 0L) {
    nm <- nm.x <- names(x)
    nm.y <- names(y)
    has.common.nms <- any(cnm <- nm.x %in% nm.y)
    if (has.common.nms) {
      names(x)[cnm] <- paste0(prefix[1L], nm.x[cnm])
      cnm <- nm.y %in% nm
      names(y)[cnm] <- paste0(prefix[1L], nm.y[cnm])
    }
    if (nx == 0L || ny == 0L) {
      res <- cbind(x[FALSE, ], y[FALSE, ])
    }
    else {
      ij <- expand.grid(seq_len(nx), seq_len(ny))
      res <- cbind(x[ij[, 1L], , drop = FALSE], y[ij[, 
                                                     2L], , drop = FALSE])
    }
  }
  else {
    if (any(by.x == 0L)) {
      x <- cbind(Row.names = I(row.names(x)), x)
      by.x <- by.x + 1L
    }
    if (any(by.y == 0L)) {
      y <- cbind(Row.names = I(row.names(y)), y)
      by.y <- by.y + 1L
    }
    row.names(x) <- NULL
    row.names(y) <- NULL
    if (l.b == 1L) {
      bx <- x[, by.x]
      if (is.factor(bx)) 
        bx <- as.character(bx)
      by <- y[, by.y]
      if (is.factor(by)) 
        by <- as.character(by)
    }
    else {
      if (!is.null(incomparables)) 
        stop("'incomparables' is supported only for merging on a single column")
      bx <- x[, by.x, drop = FALSE]
      by <- y[, by.y, drop = FALSE]
      names(bx) <- names(by) <- paste0("V", seq_len(ncol(bx)))
      bz <- do.call("paste", c(rbind(bx, by), sep = "\r"))
      bx <- bz[seq_len(nx)]
      by <- bz[nx + seq_len(ny)]
    }
    comm <- match(bx, by, 0L)
    bxy <- bx[comm > 0L]
    xinds <- match(bx, bxy, 0L, incomparables)
    yinds <- match(by, bxy, 0L, incomparables)
    if (nx > 0L && ny > 0L) 
      m <- .Internal(merge(xinds, yinds, all.x, all.y))
    else m <- list(xi = integer(), yi = integer(), x.alone = seq_len(nx), 
                   y.alone = seq_len(ny))
    nm <- nm.x <- names(x)[-by.x]
    nm.by <- names(x)[by.x]
    nm.y <- names(y)[-by.y]
    ncx <- ncol(x)
    if (all.x) 
      all.x <- (nxx <- length(m$x.alone)) > 0L
    if (all.y) 
      all.y <- (nyy <- length(m$y.alone)) > 0L
    lxy <- length(m$xi)
    has.common.nms <- any(cnm <- nm.x %in% nm.y)
    if (has.common.nms && nzchar(prefix[1L])) 
      nm.x[cnm] <- paste0(prefix[1L], nm.x[cnm])
    x <- x[c(m$xi, if (all.x) m$x.alone), c(by.x, seq_len(ncx)[-by.x]), 
           drop = FALSE]
    names(x) <- c(nm.by, nm.x)
    if (all.y) {
      ya <- y[m$y.alone, by.y, drop = FALSE]
      names(ya) <- nm.by
      xa <- x[rep.int(NA_integer_, nyy), nm.x, drop = FALSE]
      names(xa) <- nm.x
      x <- rbind(x, cbind(ya, xa))
    }
    if (has.common.nms && nzchar(prefix[2L])) {
      cnm <- nm.y %in% nm
      nm.y[cnm] <- paste0(prefix[2L], nm.y[cnm])
    }
    y <- y[c(m$yi, if (all.x) rep.int(1L, nxx), if (all.y) m$y.alone), 
           -by.y, drop = FALSE]
    if (all.x) {
      zap <- (lxy + 1L):(lxy + nxx)
      for (i in seq_along(y)) {
        if (is.matrix(y[[1]])) 
          y[[1]][zap, ] <- NA
        else is.na(y[[i]]) <- zap
      }
    }
    if (has.common.nms) 
      names(y) <- nm.y
    nm <- c(names(x), names(y))
    if (any(d <- duplicated(nm))) 
      if (sum(d) > 1L) 
        warning("column names ", paste(sQuote(nm[d]), 
                                       collapse = ", "), " are duplicated in the result", 
                domain = NA)
    else warning("column name ", sQuote(nm[d]), " is duplicated in the result", 
                 domain = NA)
    res <- cbind(x, y)
    if (sort) 
      res <- res[if (all.x || all.y) 
        do.call("order", x[, seq_len(l.b), drop = FALSE])
        else sort.list(bx[m$xi]), , drop = FALSE]
  }
  attr(res, "row.names") <- .set_row_names(nrow(res))
  res
}
like image 31
zx8754 Avatar answered Sep 17 '22 22:09

zx8754