I have a dataset of pairs of cities V1 and V2. Each cities has a population v1_pop2015 and v2_pop2015.
I would like to create a new dataset with only the cityCode of the biggest city and its populated added of the population of the smallest.
I was able to create the output I want with a for loop. For educationnal purpose, I tried to do it using tidyverse tools without success.
This is a working sample
library(tidyverse)
## Sample dataset
pairs_pop <- structure(list(cityCodeV1 = c(20073, 20888, 20222, 22974, 23792,
20779), cityCodeV2 = c(20063, 204024, 20183, 20406, 23586, 23595
), v1_pop2015 = c(414, 682, 497, 3639, 384, 596), v2_pop2015 = c(384,
757, 5716, 315, 367, 1303)), row.names = c(NA, 6L), class = c("tbl_df",
"tbl", "data.frame"))
pairs_pop
#> # A tibble: 6 x 4
#> cityCodeV1 cityCodeV2 v1_pop2015 v2_pop2015
#> * <dbl> <dbl> <dbl> <dbl>
#> 1 20073 20063 414 384
#> 2 20888 204024 682 757
#> 3 20222 20183 497 5716
#> 4 22974 20406 3639 315
#> 5 23792 23586 384 367
#> 6 20779 23595 596 1303
#### This is working !!!
clean_df <- setNames(data.frame(matrix(ncol = 2, nrow = dim(pairs_pop)[1])),c("to_keep", "to_keep_pop"))
# For each row, determine which city is the biggest and adds the two cities population
for (i in 1:dim(pairs_pop)[1]) {
if(pairs_pop$v1_pop2015[i] > pairs_pop$v2_pop2015[i])
{
clean_df$to_keep[i] = pairs_pop$cityCodeV1[i]
clean_df$to_keep_pop[i] = pairs_pop$v1_pop2015[i] + pairs_pop$v2_pop2015[i]
}
else
{
clean_df$to_keep[i] = pairs_pop$cityCodeV2[i]
clean_df$to_keep_pop[i] = pairs_pop$v1_pop2015[i] + pairs_pop$v2_pop2015[i]
}
}
clean_df
#> to_keep to_keep_pop
#> 1 20073 798
#> 2 204024 1439
#> 3 20183 6213
#> 4 22974 3954
#> 5 23792 751
#> 6 23595 1899
This is where I'm stucked
### trying to tidy it with rowwise, mutate and a function
v1_sup_tov2 <- function(x){
print(x)
if(x$v1_pop2015 > x$v2_pop2015){
return (TRUE)
}
return(FALSE)
}
to_clean_df2 <- pairs_pop %>%
rowwise() %>%
mutate_if(v1_sup_tov2,
to_keep = cityCodeV1,
to_delete= cityCodeV2,
to_keep_pop = v1_pop2015 + v2_pop2015)
The expected output is a dataframe with 2 colums like this: to_keep: cityCode of the city I want to keep to_keep_pop: population of that city
clean_df
#> to_keep to_keep_pop
#> 1 20073 798
#> 2 204024 1439
#> 3 20183 6213
#> 4 22974 3954
#> 5 23792 751
#> 6 23595 1899
What about this?
library(dplyr)
## Sample dataset
pairs_pop <- structure(
list(cityCodeV1 = c(20073, 20888, 20222, 22974, 23792, 20779),
cityCodeV2 = c(20063, 204024, 20183, 20406, 23586, 23595),
v1_pop2015 = c(414, 682, 497, 3639, 384, 596),
v2_pop2015 = c(384, 757, 5716, 315, 367, 1303)),
row.names = c(NA, 6L), class = c("tbl_df", "tbl", "data.frame"))
clean_df <- transmute(pairs_pop,
to_keep = if_else(v1_pop2015 > v2_pop2015, cityCodeV1, cityCodeV2),
to_keep_pop = v1_pop2015 + v2_pop2015)
Just in case one day you get multiple cities with v1, v2, v3, ... Do not forget to keep all information in your dataframe so that you know what value is related to what. A tidy dataframe.
library(dplyr)
## Sample dataset
pairs_pop <- structure(
list(cityCodeV1 = c(20073, 20888, 20222, 22974, 23792, 20779),
cityCodeV2 = c(20063, 204024, 20183, 20406, 23586, 23595),
v1_pop2015 = c(414, 682, 497, 3639, 384, 596),
v2_pop2015 = c(384, 757, 5716, 315, 367, 1303)),
row.names = c(NA, 6L), class = c("tbl_df", "tbl", "data.frame"))
# Tidy dataset with all information that was in columns
library(dplyr)
library(tidyr)
library(stringr)
tidy_pairs <- pairs_pop %>%
mutate(city = 1:n()) %>%
gather("key", "value", -city) %>%
mutate(ville = str_extract(key, "([[:digit:]])"),
key = case_when(
grepl("cityCode", key) ~ "cityCode",
grepl("pop", key) ~ "pop",
TRUE ~ "other"
)) %>%
spread(key, value)
And then you can apply the test you want
tidy_pairs %>%
group_by(city) %>%
summarise(to_keep = cityCode[pop == max(pop)],
to_keep_pop = sum(pop))
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With