I have the following process which uses group_split of dplyr:
library(tidyverse)

# Fix the RNG seed so the five sampled rows are reproducible
set.seed(1)

# Sample five rows of iris and split them into one tibble per Species
iris %>%
  sample_n(size = 5) %>%
  group_split(Species)
The result is:
[[1]]
# A tibble: 2 x 5
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
<dbl> <dbl> <dbl> <dbl> <fct>
1 5 3.5 1.6 0.6 setosa
2 5.1 3.8 1.5 0.3 setosa
[[2]]
# A tibble: 2 x 5
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
<dbl> <dbl> <dbl> <dbl> <fct>
1 5.9 3 4.2 1.5 versicolor
2 6.2 2.2 4.5 1.5 versicolor
[[3]]
# A tibble: 1 x 5
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
<dbl> <dbl> <dbl> <dbl> <fct>
1 6.2 3.4 5.4 2.3 virginica
What I want to achieve is to name this list by grouped name (i.e. Species). Yielding this (done by hand):
$setosa
# A tibble: 2 x 5
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
<dbl> <dbl> <dbl> <dbl> <fct>
1 5 3.5 1.6 0.6 setosa
2 5.1 3.8 1.5 0.3 setosa
$versicolor
# A tibble: 2 x 5
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
<dbl> <dbl> <dbl> <dbl> <fct>
1 5.9 3 4.2 1.5 versicolor
2 6.2 2.2 4.5 1.5 versicolor
$virginica
# A tibble: 1 x 5
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
<dbl> <dbl> <dbl> <dbl> <fct>
1 6.2 3.4 5.4 2.3 virginica
How can I achieve that?
Update
I tried this with new data, where the grouping column is now called Cluster:
# Example data: a 10-row tibble with a character Cluster column (the grouping
# variable), a gene_name column, and four numeric statistics columns.
df <- structure(list(Cluster = c("Cluster9", "Cluster11", "Cluster1",
"Cluster9", "Cluster6", "Cluster12", "Cluster9", "Cluster11",
"Cluster8", "Cluster8"), gene_name = c("Tbc1d8", "Vimp", "Grhpr",
"H1f0", "Zfp398", "Pikfyve", "Ankrd13a", "Fgfr1op2", "Golga7",
"Lars2"), p_value = c(3.46629097620496e-47, 3.16837338947245e-62,
1.55108439059684e-06, 9.46078511685542e-131, 0.000354049720507017,
0.0146807415917158, 1.42799750295289e-38, 2.0697825959399e-08,
4.13777221466668e-06, 3.92889640704683e-184), morans_test_statistic = c(14.3797687352223,
16.6057085487911, 4.66393667525872, 24.301453902967, 3.38642377758137,
2.17859882998961, 12.9350063459509, 5.48479186018979, 4.4579286289179,
28.9144540271157), morans_I = c(0.0814728893885783, 0.0947505609609695,
0.0260671534007409, 0.138921824574569, 0.018764800166045, 0.0119813199210325,
0.0736554862590782, 0.0309849638728409, 0.0250591347318986, 0.165310420808725
), q_value = c(1.57917584337356e-46, 1.62106594498462e-61, 3.43312171446844e-06,
6.99503520654745e-130, 0.000683559649593623, 0.0245476826213791,
5.96116678335584e-38, 4.97603701391971e-08, 8.9649490080526e-06,
3.48152096326702e-183)), row.names = c(NA, -10L), class = c("tbl_df",
"tbl", "data.frame"))
With Ronak Shah's approach I get an inconsistent result:
# Names are taken from unique(df$Cluster), i.e. in order of first appearance.
# NOTE: in the printed result below the names do not match the group contents.
df %>% group_split(Cluster) %>% setNames(unique(df$Cluster))
$Cluster9
# A tibble: 1 x 6
Cluster gene_name p_value morans_test_statistic morans_I q_value
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 Cluster1 Grhpr 0.00000155 4.66 0.0261 0.00000343
$Cluster11
# A tibble: 2 x 6
Cluster gene_name p_value morans_test_statistic morans_I q_value
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 Cluster11 Vimp 3.17e-62 16.6 0.0948 1.62e-61
2 Cluster11 Fgfr1op2 2.07e- 8 5.48 0.0310 4.98e- 8
$Cluster1
# A tibble: 1 x 6
Cluster gene_name p_value morans_test_statistic morans_I q_value
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 Cluster12 Pikfyve 0.0147 2.18 0.0120 0.0245
$Cluster6
# A tibble: 1 x 6
Cluster gene_name p_value morans_test_statistic morans_I q_value
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 Cluster6 Zfp398 0.000354 3.39 0.0188 0.000684
$Cluster12
# A tibble: 2 x 6
Cluster gene_name p_value morans_test_statistic morans_I q_value
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 Cluster8 Golga7 4.14e- 6 4.46 0.0251 8.96e- 6
2 Cluster8 Lars2 3.93e-184 28.9 0.165 3.48e-183
$Cluster8
# A tibble: 3 x 6
Cluster gene_name p_value morans_test_statistic morans_I q_value
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 Cluster9 Tbc1d8 3.47e- 47 14.4 0.0815 1.58e- 46
2 Cluster9 H1f0 9.46e-131 24.3 0.139 7.00e-130
3 Cluster9 Ankrd13a 1.43e- 38 12.9 0.0737 5.96e- 38
Note that $Cluster9 has Cluster1 in it.
Please advise on how to go about this.
Lots of good answers. You can also just do:
# Draw five random rows, then let base::split() build a list keyed by Species
iris %>%
  sample_n(size = 5) %>%
  split(f = as.factor(.[["Species"]]))
Which will give you:
$setosa
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
4 5.5 3.5 1.3 0.2 setosa
5 5.3 3.7 1.5 0.2 setosa
$versicolor
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
3 5 2.3 3.3 1 versicolor
$virginica
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1 7.7 2.6 6.9 2.3 virginica
2 7.2 3.0 5.8 1.6 virginica
Also works with your dataframe above:
# Split df into a list with one element per distinct Cluster value
split(df, f = as.factor(df[["Cluster"]]))
Gives you:
$Cluster1
# A tibble: 1 x 6
Cluster gene_name p_value morans_test_statistic morans_I q_value
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 Cluster1 Grhpr 0.00000155 4.66 0.0261 0.00000343
$Cluster11
# A tibble: 2 x 6
Cluster gene_name p_value morans_test_statistic morans_I q_value
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 Cluster11 Vimp 3.17e-62 16.6 0.0948 1.62e-61
2 Cluster11 Fgfr1op2 2.07e- 8 5.48 0.0310 4.98e- 8
$Cluster12
# A tibble: 1 x 6
Cluster gene_name p_value morans_test_statistic morans_I q_value
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 Cluster12 Pikfyve 0.0147 2.18 0.0120 0.0245
$Cluster6
# A tibble: 1 x 6
Cluster gene_name p_value morans_test_statistic morans_I q_value
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 Cluster6 Zfp398 0.000354 3.39 0.0188 0.000684
$Cluster8
# A tibble: 2 x 6
Cluster gene_name p_value morans_test_statistic morans_I q_value
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 Cluster8 Golga7 4.14e- 6 4.46 0.0251 8.96e- 6
2 Cluster8 Lars2 3.93e-184 28.9 0.165 3.48e-183
$Cluster9
# A tibble: 3 x 6
Cluster gene_name p_value morans_test_statistic morans_I q_value
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 Cluster9 Tbc1d8 3.47e- 47 14.4 0.0815 1.58e- 46
2 Cluster9 H1f0 9.46e-131 24.3 0.139 7.00e-130
3 Cluster9 Ankrd13a 1.43e- 38 12.9 0.0737 5.96e- 38
I'm not sure if this can be done directly. One way is to sample the data frame first and then use its unique group values with setNames.
library(dplyr)

df <- iris %>% sample_n(size = 5)

# group_split() returns the groups in factor-level order, while unique()
# returns values in order of first appearance. Sorting the unique factor
# values puts them in level order, so each name lines up with its group.
df %>%
  group_split(Species) %>%
  setNames(sort(unique(df$Species)))
#$setosa
# A tibble: 1 x 5
# Sepal.Length Sepal.Width Petal.Length Petal.Width Species
# <dbl> <dbl> <dbl> <dbl> <fct>
#1 5 3.4 1.5 0.2 setosa
#$versicolor
# A tibble: 1 x 5
# Sepal.Length Sepal.Width Petal.Length Petal.Width Species
# <dbl> <dbl> <dbl> <dbl> <fct>
#1 6 3.4 4.5 1.6 versicolor
#$virginica
# A tibble: 3 x 5
# Sepal.Length Sepal.Width Petal.Length Petal.Width Species
# <dbl> <dbl> <dbl> <dbl> <fct>
#1 7.3 2.9 6.3 1.8 virginica
#2 6.9 3.1 5.1 2.3 virginica
#3 7.7 3 6.1 2.3 virginica
It is weird that group_split doesn't directly name the lists because it is supposed to be an alternative to base::split which does name it.
# base::split() names each list element after its Species value
split(df, df$Species)
The documentation says:
group_split() works like base::split() but
It doesn't work for the updated dataset because, when naming, we use unique(), which returns the values in the order in which they first appear, whereas group_split() splits the data in increasing order of the grouping values (so the order of the groups is Cluster1, Cluster11, Cluster2, ...). One way to overcome that is to convert Cluster to a factor and specify its levels in order of appearance using unique().
# Turn Cluster into a factor whose levels follow order of first appearance
appearance_order <- unique(df$Cluster)
df <- mutate(df, Cluster = factor(Cluster, levels = appearance_order))

# Split by Cluster and name the pieces with the unique Cluster values
setNames(group_split(df, Cluster), unique(df$Cluster))
Or, if you don't want them as factors, do:
# Take the names from group_keys(), which lists the groups in the same order
# that group_split() returns them, rather than from sort(), whose ordering of
# character values depends on the session locale.
cluster_keys <- df %>%
  group_by(Cluster) %>%
  group_keys()
df %>%
  group_split(Cluster) %>%
  setNames(as.character(cluster_keys$Cluster))
I came across the same problem and am using this two-step solution:
# Step 1: group by Cluster so the group keys can be read off afterwards
df <- group_by(df, Cluster)
# Step 2: split into a list of tibbles and name each one by its group key
df <- set_names(group_split(df), unlist(group_keys(df)))
df$Cluster1
# A tibble: 1 x 6
Cluster gene_name p_value morans_test_statistic morans_I q_value
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 Cluster1 Grhpr 0.00000155 4.66 0.0261 0.00000343
df$Cluster9
# A tibble: 3 x 6
Cluster gene_name p_value morans_test_statistic morans_I q_value
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 Cluster9 Tbc1d8 3.47e- 47 14.4 0.0815 1.58e- 46
2 Cluster9 H1f0 9.46e-131 24.3 0.139 7.00e-130
3 Cluster9 Ankrd13a 1.43e- 38 12.9 0.0737 5.96e- 38
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With