I have unknown data coming to me that I'd like to examine programmatically for correlations, grouping together any variables that are perfectly correlated (ignoring direction). In the data set below I can look at the correlations manually and say that a, f, g, h go together, as do b, d, e. How can I do this efficiently programmatically?
library(dplyr)
dat <- tibble(      # data_frame() is deprecated in current dplyr
  a = 1:100,
  b = rnorm(100),
  c = sample(1:100),
  d = b * 3,        # perfectly correlated with b
  e = b + 100,      # perfectly correlated with b
  f = 1001:1100,    # perfectly correlated with a
  g = a - 100,      # perfectly correlated with a
  h = 100:1         # perfectly negatively correlated with a
)
round(cor(dat), 3)
## a b c d e f g h
## a 1.000 0.053 -0.042 0.053 0.053 1.000 1.000 -1.000
## b 0.053 1.000 0.092 1.000 1.000 0.053 0.053 -0.053
## c -0.042 0.092 1.000 0.092 0.092 -0.042 -0.042 0.042
## d 0.053 1.000 0.092 1.000 1.000 0.053 0.053 -0.053
## e 0.053 1.000 0.092 1.000 1.000 0.053 0.053 -0.053
## f 1.000 0.053 -0.042 0.053 0.053 1.000 1.000 -1.000
## g 1.000 0.053 -0.042 0.053 0.053 1.000 1.000 -1.000
## h -1.000 -0.053 0.042 -0.053 -0.053 -1.000 -1.000 1.000
Desired result:
list(
c('a', 'f', 'g', 'h'),
c('b', 'd', 'e')
)
How about this:
# Absolute correlation matrix, rounded so floating-point noise
# doesn't break the exact == 1 comparison below
cmat <- abs(round(cor(dat), 4))
# For each row, collect the column names whose correlation is 1
groups <- lapply(rownames(cmat), function(rname) colnames(cmat)[cmat[rname, ] == 1])
# Members of the same group produce identical vectors, so deduplicate them
groups <- unique(groups)
groups
## [[1]]
## [1] "a" "f" "g" "h"
## [[2]]
## [1] "b" "d" "e"
## [[3]]
## [1] "c"
EDIT by Tyler Rinker: a benchmark of the three approaches:
library(dplyr)
dat <- tibble(       # again using tibble() in place of the deprecated data_frame()
  a = 1:100000,
  b = rnorm(100000),
  c = sample(1:100000),
  d = b * 3,
  e = b + 100000,
  f = 1001:101000,
  g = a - 100,
  h = 100000:1,
  i = runif(100000),
  j = rev(i),        # a permutation of i, so essentially uncorrelated with it
  k = i * 3          # perfectly correlated with i
)
cor_group_dplyr <- function(dat){
  grps <- data.frame(abs(round(cor(dat), 3))) %>%
    dplyr::add_rownames() %>%                   # matrix rownames -> "rowname" column
    tidyr::gather(key, value, -rowname) %>%     # long form: (rowname, key, value)
    dplyr::filter(value == 1) %>%               # keep only perfect correlations
    dplyr::distinct(rowname) %>%                # one representative key per variable
    dplyr::group_by(key) %>%
    dplyr::summarise(pairs = list(rowname)) %>% # collect each group as a vector
    {.[["pairs"]]} %>%
    {.[sapply(., length) > 1]}                  # drop single-variable groups
  if (length(grps) == 0) return(NA)
  grps
}
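Side note: `add_rownames()` is deprecated in current dplyr (in favour of `tibble::rownames_to_column()`) and `gather()` has been superseded by `tidyr::pivot_longer()`. A sketch of an equivalent pipeline for current dplyr/tidyr/tibble versions (the timings below were measured against the version above, not this one):
cor_group_tidyverse <- function(dat){
  grps <- as.data.frame(abs(round(cor(dat), 3))) %>%
    tibble::rownames_to_column("rowname") %>%
    tidyr::pivot_longer(-rowname, names_to = "key", values_to = "value") %>%
    dplyr::filter(value == 1) %>%
    dplyr::distinct(rowname, .keep_all = TRUE) %>%  # keep the first key per variable
    dplyr::group_by(key) %>%
    dplyr::summarise(pairs = list(rowname)) %>%
    dplyr::pull(pairs)
  grps <- grps[lengths(grps) > 1]                   # drop single-variable groups
  if (length(grps) == 0) return(NA)
  grps
}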
cor_group_data.table <- function(dat){
  # Paste each column's rounded absolute correlation row into one string:
  # variables in the same group share an identical signature
  res <- data.table::data.table(do.call(paste, data.table::as.data.table(abs(round(cor(dat), 3)))), colnames(dat))
  groups <- res[, .(res = list(V2)), by = V1][["res"]]  # group column names by signature
  m <- groups[sapply(groups, length) > 1]               # drop single-variable groups
  if (length(m) == 0) return(NA)
  m
}
cor_group_base <- function(dat){
  cmat <- abs(round(cor(dat), 4))
  # For each row, the column names with |correlation| == 1
  groups <- lapply(rownames(cmat), function(rname) colnames(cmat)[cmat[rname, ] == 1])
  groups <- unique(groups)
  m <- groups[sapply(groups, length) > 1]  # drop single-variable groups
  if (length(m) == 0) return(NA)
  m
}
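A quick sanity check (not part of the original post) that the three functions agree before timing them; `norm_groups` is just a hypothetical helper that puts the groupings into a canonical form so they can be compared regardless of group or member order:
# Canonicalise: sort members within each group, then sort the groups
norm_groups <- function(g) sort(sapply(lapply(g, sort), paste, collapse = ","))
stopifnot(
  identical(norm_groups(cor_group_base(dat)), norm_groups(cor_group_dplyr(dat))),
  identical(norm_groups(cor_group_base(dat)), norm_groups(cor_group_data.table(dat)))
)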
library(microbenchmark)
(op <- microbenchmark(
  cor_group_base(dat),
  cor_group_dplyr(dat),
  cor_group_data.table(dat),
  times = 100L))
Results
## Unit: milliseconds
## expr min lq mean median uq max neval
## cor_group_base(dat) 50.83729 52.53670 60.93529 56.65787 58.27536 143.1478 100
## cor_group_dplyr(dat) 54.25574 55.67910 69.32940 60.76432 64.94523 182.8525 100
## cor_group_data.table(dat) 53.10673 56.36881 62.42772 58.94608 60.06950 158.2749 100