Any workaround to find optimal threshold for filtering raw features based on correlation matrix in R?

I intend to extract features that are highly correlated with my target by computing Pearson correlations, and I have obtained a correlation matrix that way. However, when filtering the highly correlated features I picked the correlation-coefficient cut-off arbitrarily; I don't know what the optimal threshold is. I am thinking of first quantifying the positively and negatively correlated features, then using those figures to set a credible threshold. Can anyone show me how to quantify the positively and negatively correlated features from the correlation matrix? Is there an efficient way to choose an optimal threshold for filtering highly correlated features?

Reproducible data

Here are the reproducible data I used; the rows are samples and the columns are the raw features:

> dput(my_df)
structure(list(SampleID = c("Tarca_001_P1A01", "Tarca_013_P1B01", 
"Tarca_025_P1C01", "Tarca_037_P1D01", "Tarca_049_P1E01", "Tarca_061_P1F01", 
"Tarca_051_P1E03", "Tarca_063_P1F03", "Tarca_075_P1G03", "Tarca_087_P1H03"
), GA = c(11, 15.3, 21.7, 26.7, 31.3, 32.1, 19.7, 23.6, 27.6, 
30.6), `1_at` = c(6.06221469449721, 5.8755020052495, 6.12613148162098, 
6.1345548976595, 6.28953417729806, 6.08561779473768, 6.25857984382111, 
6.22016811759586, 6.22269236303877, 6.11986885253451), `10_at` = c(3.79648446367096, 
3.45024474095539, 3.62841140410044, 3.51232455992681, 3.56819306931016, 
3.54911765491621, 3.59024881523945, 3.69553021972333, 3.61860245801661, 
3.74019994293802), `100_at` = c(5.84933778267459, 6.55052475296263, 
6.42187743053935, 6.15489279092855, 6.34807354206396, 6.11780116002087, 
6.24635169763079, 6.25479583503303, 6.16095987926232, 6.26979789563404
), `1000_at` = c(3.5677794435745, 3.31613364795286, 3.43245075704917, 
3.63813996294905, 3.39904385276621, 3.54214650423219, 3.51532853598111, 
3.50451431462302, 3.38965905673286, 3.54646930636612), `10000_at` = c(6.16681461038468, 
6.18505928400759, 5.6337568741831, 5.14814946571171, 5.64064316609978, 
6.25755205471611, 5.68110995701518, 5.14171528059565, 5.48711836247815, 
5.69671814694115), `100009613_at` = c(4.44302662142323, 4.3934877055859, 
4.6237834519809, 4.66743523288194, 4.97483476597509, 4.78673497541689, 
4.77791032146269, 4.64089637146557, 4.4415876428342, 4.57411708287226
), `100009676_at` = c(5.83652223195279, 5.89836406552412, 6.01979203584278, 
5.98400432133011, 6.1149144301085, 5.74573650612351, 6.04564052289621, 
6.10594091413241, 5.89779877157418, 6.08906323192048), `10001_at` = c(6.33001755606083, 
6.13798360106589, 5.78750241567476, 5.5920698678248, 5.84077907831575, 
6.19490161026853, 5.80941714030283, 5.80320733931781, 6.05345724391988, 
5.84386016796266), `10002_at` = c(4.92233877299356, 4.62812370798939, 
4.79628294150335, 4.79729686531453, 4.91913790102029, 4.79997095951811, 
4.90838062744781, 4.73415922096939, 4.77466915267328, 4.78437458350139
), `10003_at` = c(2.68934375273141, 2.55675627493564, 2.61341541015611, 
2.69430042092269, 2.73207812554522, 2.65268941561582, 2.66697993437978, 
2.59784138580729, 2.74247110877575, 2.511875309739), `100033411_at` = c(2.74561888109989, 
2.70765553292035, 2.80774129091983, 2.8653583834812, 3.00137677271996, 
2.83262780533507, 2.85563184073152, 2.9364732038239, 3.04291003006152, 
2.87464057209658), `100033413_at` = c(2.76060893169324, 3.03645581534102, 
2.64583376265592, 3.24800269901788, 2.62090678070501, 3.40648642432304, 
2.3166708613396, 2.62819739311836, 2.97367900843303, 2.62634568261552
), `100033414_at` = c(3.79468365910661, 4.29971184424969, 3.81085169542991, 
3.81895258294878, 4.03594900960396, 3.82989979044012, 3.29585327836005, 
3.27434364943932, 3.10419531747282, 4.48509833313903), `100033418_at` = c(2.84818282222582, 
2.48325694938049, 3.2386968734862, 2.72080210986981, 2.58058159047299, 
2.53965338068817, 2.1940368933459, 2.39335155022896, 2.59875871802789, 
2.1053634999615), `100033420_at` = c(2.81277398177906, 3.51308266658033, 
2.78489562992621, 2.63705084722617, 2.63479468288161, 2.7893378666207, 
2.57252259415358, 3.6809929352922, 3.33486815632383, 3.26518578675427
), `100033422_at` = c(2.14058977019523, 2.26880029802564, 2.3315210232915, 
2.33064119419619, 2.24052626899434, 2.33982101586472, 2.18436254317561, 
2.45046620859257, 2.56645806945223, 2.3405394322417), `100033423_at` = c(2.6928480064245, 
3.03461160119094, 2.75618624035735, 2.77388400895015, 3.2286586324064, 
2.93047368426024, 2.8187821442941, 3.056923038096, 2.90637516892824, 
2.70751558441428), `100033424_at` = c(2.35292391447048, 2.3853610213164, 
2.36292219228603, 2.46939314182722, 2.30413560438815, 2.61148325229634, 
2.34045470681792, 2.48995835642741, 2.32083529534773, 2.40632218044949
), `100033425_at` = c(2.48476830655452, 2.28880889278209, 2.31409329648109, 
2.28927162732448, 2.38147147362554, 2.33334530852942, 2.44322869233962, 
2.34064030240538, 2.67362452592881, 2.33750820349888), `100033426_at` = c(6.53876010917445, 
7.38935014141236, 6.89661896623484, 6.93808821971072, 6.58149585137493, 
7.76996534217549, 6.08470562892749, 7.07455266815876, 6.94555867772862, 
6.96998299746459)), class = "data.frame", row.names = c("Tarca_001_P1A01", 
"Tarca_013_P1B01", "Tarca_025_P1C01", "Tarca_037_P1D01", "Tarca_049_P1E01", 
"Tarca_061_P1F01", "Tarca_051_P1E03", "Tarca_063_P1F03", "Tarca_075_P1G03", 
"Tarca_087_P1H03"))

My attempt

Here is my attempt: I computed the Pearson correlation of each raw feature with the target and then tried to filter the highly correlated features (the correlation-coefficient cut-off was chosen arbitrarily):

target <- my_df$GA             # gestational age is the target
raw_feats <- my_df[, -c(1:2)]  # drop SampleID and GA, keep the raw features

# Pearson correlation of each feature with the target
corr_df <- do.call(rbind,
                   apply(raw_feats, 2, function(x){
                       temp <- cor.test(target, as.numeric(x),
                                        alternative = "two.sided", method = "pearson")
                       data.frame(t = temp$statistic, p = temp$p.value,
                                  cor_coef = temp$estimate)
                   }))

Then I selected a correlation coefficient arbitrarily as the threshold for filtering highly correlated features:

indx <- which(corr_df$cor_coef > 0.0785 | corr_df$cor_coef <= -0.01)
mydf_new <- my_df[, c(1:2, indx + 2)]  # indx points at feature columns, offset by SampleID and GA

I don't think this approach is sound. Any ideas?

I am curious how to quantify the positively and negatively correlated features and then work out an optimal threshold for filtering. Is there an efficient way to do that quantification, and how should I choose the correlation-coefficient threshold? Any thoughts are appreciated; thanks in advance.
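To make concrete what I mean by "quantify", here is a rough sketch of the kind of positive/negative tally I have in mind (it only assumes the corr_df and raw_feats objects defined above):

# sketch only: count and rank features by the sign and size of their correlation with GA
feat_names <- colnames(raw_feats)
pos_feats  <- feat_names[corr_df$cor_coef > 0]   # positively correlated features
neg_feats  <- feat_names[corr_df$cor_coef < 0]   # negatively correlated features
c(positive = length(pos_feats), negative = length(neg_feats))   # simple tally
head(corr_df[order(-abs(corr_df$cor_coef)), ])   # strongest |r| first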

asked Jul 20 '19 by Jerry07


1 Answer

I doubt you want to select on correlations -- features that are highly correlated with the target may also be highly correlated with each other, and so offer little new information. Regularized regression with cross-validation is a pretty typical way of handling this sort of thing. The following fits the data (artificially extended so the example can run) using the glmnet package for regularized, cross-validated regression, and ends with an index of the features that are likely to be useful in a linear model.

 ### using regularized regression

my_df2 <- my_df[,-1] #drop id

for(i in 1:2){ #add a bit more data for this example
  my_df2 <- rbind(my_df2,my_df2+rnorm(length(my_df2),0,.1))
}

# install.packages('glmnet')
library(glmnet)
res <- glmnet(y = my_df2$GA,
              x = sapply(my_df2[,-1], function(x) x), #convert data to matrix
              standardize = TRUE,
              alpha = 1) #reduce alpha for less lasso-like, more ridge-like behavior
plot(res,label=TRUE) #variable importance as penalty decreases

### selecting ideal regularization level using cross validation
cvres <- cv.glmnet(y = my_df2$GA,
                   x = sapply(my_df2[,-1], function(x) x), #convert data to matrix
                   standardize = TRUE,
                   alpha = 1)
plot(cvres) #out of sample prediction error as penalty decreases

# install.packages('coefplot')
library(coefplot)
nonzero <- extract.coef(cvres) #useful features
indx <- which(colnames(my_df) %in% nonzero$Coefficient) #indexed
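### a sketch of an alternative that skips coefplot: pull the non-zero set straight from cv.glmnet
### ("lambda.min" is the CV-chosen penalty; "lambda.1se" would be the more conservative pick)
cf <- as.matrix(coef(cvres, s = "lambda.min")) #coefficients at the selected penalty
cf[cf[, 1] != 0 & rownames(cf) != "(Intercept)", 1] #drop zeros and the intercept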


ranked <- nonzero[order(abs(nonzero$Value),decreasing = TRUE),] #ranked list at best penalty
plot(res,xvar='lambda',label=TRUE) #variable importance as penalty increases

nonzeromatrix <- apply(res$beta, 2, function(x) x != 0) #TRUE where a coefficient is non-zero
nonzerocount <- apply(nonzeromatrix, 2, sum) #number of non-zero coefficients per lambda
nonzeroindices <- match(0:max(nonzerocount), nonzerocount) #which lambdas does n vars change at
names(nonzeroindices) <- 0:(length(nonzeroindices)-1)
nonzeroindices <- nonzeroindices[!is.na(nonzeroindices)] #dropping NA's in case > 1 increase in n vars
incrementalbeta <-res$beta[,nonzeroindices] #beta weights as number of variables changes
matplot(names(nonzeroindices),t(incrementalbeta),type='l',xlab='N vars',ylab='beta')

rankpernvars <- lapply(nonzeroindices, function(x) {
  ret <- res$beta[nonzeromatrix[,x],x]
  ret <- ret[order(abs(ret))]
}) 
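As a quick follow-up (a sketch only, reusing the indx and the jittered my_df2 built above; with this little data it is purely illustrative), the retained features can be dropped into an ordinary linear model against GA:

selected <- colnames(my_df)[indx] #names of the features kept by the penalized fit
fit <- lm(my_df2$GA ~ as.matrix(my_df2[, selected])) #simple linear model on the reduced set
summary(fit)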
answered Nov 15 '22 by Charlie