I'm trying to create a subset of data that contains only the rows with missing data in one of my columns.
The data:
data<-structure(list(ID = c(1, 2, 3, 4, 7, 9, 10, 12, 13, 14, 15, 16,
17, 18, 20, 21, 22, 23, 24, 25, 27, 28, 29, 31, 34, 37, 38, 39,
40, 41), QnSinV1 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L), QnSinV2 = c(1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L), QnSinV3 = c(0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), QnSize = c(0.032140423, 0.017620319,
NA, -0.093448167, -0.051090375, 0.001188913, NA, -0.144868599,
-0.000260992, 0.008502255, -0.00346349, 0.017208373, 0.004301855,
0.004420431, -0.007564124, NA, 0.174388101, -0.142412328, 0.064935852,
-0.052174354, NA, 0.005180317, 0.05728222, 0.041215822, -0.002449455,
-0.040942923, -0.082284946, -0.173656321, 0.022723036, -0.061326436
), QnWt = c(15.8, 16.5, 11.9, 13.7, 15, 15.3, 13.7, 15.8, 16.3,
15.9, 15.1, 14.5, 14.4, 15.7, 14.4, 13.3, 14.8, 15.1, 15.1, 14.7,
15.8, 17.8, 16.4, 13.4, 15.1, 14.8, 14.2, 12.7, 17.9, 16.2),
QnWtLsCL = c(NA, 0.503030303, 0.596638655, NA, 0.446666667,
0.509803922, 0.408759124, 0.462025316, 0.552147239, 0.509433962,
0.456953642, 0.455172414, 0.506944444, NA, 0.486111111, 0.473684211,
0.513513514, 0.516556291, 0.582781457, 0.537414966, 0.474683544,
0.43258427, 0.432926829, NA, 0.569536424, 0.445945946, 0.485915493,
0.543307087, NA, 0.543209877), ClaustPer = c(NA, 1L, 2L,
NA, 3L, 0L, 2L, 0L, 1L, 0L, 0L, 0L, 1L, NA, 0L, 7L, 1L, 0L,
1L, 0L, 1L, 2L, 2L, NA, 2L, 3L, 2L, 2L, NA, 0L), QnSurvCL = c(0L,
1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 0L, 1L),
ColWtCL = c(NA, 11.7, 7.3, NA, 9.1, 11.1, 9.6, 11.2, 9, 11.2,
12, 11, 10.9, NA, 9.9, 8.6, 10.8, 10.9, 8.7, 10.8, 11.6,
13.7, 10.8, NA, 9.3, 9.6, 9.8, 8.7, NA, 11.1), ColWtCL_6 = c(NA,
57.1, 45, NA, 73.6, NA, NA, NA, 43.8, NA, NA, 71.1, NA, NA,
53.7, NA, 84.4, NA, NA, NA, 56, 56.1, NA, NA, 59.4, NA, 45.7,
NA, NA, NA), ColGrowthCL_6 = c(NA, 4.88034188, 6.164383562,
NA, 8.087912088, NA, NA, NA, 4.866666667, NA, NA, 6.463636364,
NA, NA, 5.424242424, NA, 7.814814815, NA, NA, NA, 4.827586207,
4.094890511, NA, NA, 6.387096774, NA, 4.663265306, NA, NA,
NA), QnSurvCL_6 = c(NA, 1L, NA, NA, 1L, NA, NA, NA, 1L, NA,
NA, 1L, NA, NA, 1L, 0L, 1L, NA, NA, NA, 1L, 1L, NA, NA, 1L,
NA, 1L, NA, NA, NA), IR = c(-0.1919695, 0.0214441, NA, 0.0886954,
0.4221713, 0.0869788, 0.2716466, 0.0289674, -0.0291414, -0.1739616,
-0.0215773, -0.1473209, 0.0370336, 0.254584, 0.0332632, -0.0203844,
0.1524175, -0.051451, -0.0612144, 0.1617955, 0.0354173, 0.0904954,
0.3344705, 0.0990583, 0.1985931, 0.0419539, -0.0159598, 0.1159526,
-0.0057495, -0.1811458), SH = c(1.2064, 1.1093, NA, 0.922,
0.643, 0.9284, 0.7225, 0.9866, 1.0804, 1.2226, 1.0315, 1.1953,
1.007, 0.6991, 1.0264, 1.0265, 0.8865, 1.1184, 1.094, 0.829,
1.0142, 0.9824, 0.6793, 0.9188, 0.7853, 1.0352, 1.0648, 0.9654,
1.0366, 1.2044), HL = c(0.3774, 0.4349, NA, 0.5091, 0.6187,
0.5168, 0.6405, 0.4691, 0.4555, 0.3444, 0.4908, 0.3819, 0.4846,
0.6256, 0.4638, 0.4778, 0.5219, 0.433, 0.447, 0.564, 0.4899,
0.4612, 0.6542, 0.5162, 0.5549, 0.4928, 0.4471, 0.4959, 0.4523,
0.3511), MLH = c(0.534090909090909, 0.5, NA, 0.40506329113924,
0.298507462686567, 0.410958904109589, 0.293103448275862,
0.442105263157895, 0.48, 0.554347826086957, 0.453488372093023,
0.535353535353535, 0.443298969072165, 0.304878048780488,
0.457446808510638, 0.455555555555556, 0.397849462365591,
0.494252873563218, 0.48314606741573, 0.377777777777778, 0.457446808510638,
0.445652173913043, 0.3, 0.412371134020619, 0.354838709677419,
0.464646464646465, 0.474226804123711, 0.43010752688172, 0.46078431372549,
0.541666666666667)), .Names = c("ID", "QnSinV1", "QnSinV2",
"QnSinV3", "QnSize", "QnWt", "QnWtLsCL", "ClaustPer", "QnSurvCL",
"ColWtCL", "ColWtCL_6", "ColGrowthCL_6", "QnSurvCL_6", "IR",
"SH", "HL", "MLH"), row.names = c(1L, 2L, 3L, 4L, 7L, 9L, 10L,
12L, 13L, 14L, 15L, 16L, 17L, 18L, 20L, 21L, 22L, 23L, 24L, 25L,
27L, 28L, 29L, 31L, 34L, 37L, 38L, 39L, 40L, 41L), class = "data.frame")
My guess (which doesn't work):
test<-subset(data, data$ColWtCL_6=='NA')
test
By using bracket notation on R DataFrame (data.name) we can select rows by column value, by index, by name, by condition e.t.c. You can also use the R base function subset() to get the same results. Besides these, R also provides another function dplyr::filter() to get the rows from the DataFrame.
There are two easy methods to select columns of an R data frame without missing values, first one results in a vector and other returns a matrix. For example, if we have a data frame called df then the first method can be used as df[,colSums(is.na(df))==0] and the second method will be used as t(na.
To select rows of an R data frame that are non-Na, we can use complete. cases function with single square brackets. For example, if we have a data frame called that contains some missing values (NA) then the selection of rows that are non-NA can be done by using the command df[complete. cases(df),].
You can do it also without subset()
. To select NA values you should use function is.na()
.
data[is.na(data$ColWtCL_6),]
Or with subset()
subset(data,is.na(ColWtCL_6))
A tidyverse approach (package dplyr
):
test <-
data %>%
filter(is.na(ColWtCL_6))
If you want to filter based on NAs in multiple columns, please consider using function filter_at()
in combinations with a valid function to select the columns to apply the filtering condition and the filtering condition itself.
Example 1: select rows of data
with NA in all columns starting with Col
:
test <-
data %>%
filter_at(vars(starts_with("Col")), all_vars(is.na(.)))
Example 2: select rows of data
with NA in one of the columns starting with Col
:
test <-
data %>%
filter_at(vars(starts_with("Col")), any_vars(is.na(.)))
This link from tidyverse documentation is very inspiring: https://dplyr.tidyverse.org/reference/filter_all.html
Here's another solution to find ǸA
's across all columns in a dataframe using dplyr
:
library(dplyr)
# get column names
colnms <- colnames(df)
# filter
df %>%
filter_at(vars(all_of(colnms)), any_vars(is.na(.)))
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With