Some tidyverse functions return data frames, but some return tibbles. I can't find any resources on which functions return which data type, and I can't see any real predictable pattern to it. Here are a few examples for illustration:
library(tidyverse)
# Each of these verbs hands back a plain data.frame when given one
df1 <- mutate(iris, newcol = mean(Sepal.Length))
df2 <- filter(iris, Sepal.Length > 5)
df3 <- select(iris, Sepal.Length, Species)
class(df1)
class(df2)
class(df3)
[1] "data.frame"
[1] "data.frame"
[1] "data.frame"
# These calls produce tibbles, even though the input is a plain data.frame
df4 <- add_count(iris, Species)
df5 <- pivot_longer(iris, cols = -Species)
df6 <- mutate(group_by(iris, Species), newcol = mean(Sepal.Length))
class(df4)
class(df5)
class(df6)
[1] "tbl_df" "tbl" "data.frame"
[1] "tbl_df" "tbl" "data.frame"
[1] "grouped_df" "tbl_df" "tbl" "data.frame"
Is there a way to tell what the return value type will be from the function? Or perhaps there is a resource on which functions return which data type in the tidyverse? A lot of R code will run the same on tibbles and data frames, but there are important differences, such as the default behavior for extracting a column, where drop = FALSE is the default behavior for tibbles. I know I can always change the types manually, but when working with large bodies of legacy code it would be incredibly useful to know in advance which type each function returns.
The Tidyverse suite of integrated packages is designed to work together to make common data science operations more user friendly. The packages have functions for data wrangling, tidying, reading/writing, parsing, and visualizing, among others.
You can use the function is_tibble() to check whether a data frame is a tibble or not. The mtcars data frame is not a tibble. But the diamonds and flights data are tibbles. More generally, you can use the class() function to find out the class of an object.
Tibbles are data.frames that are lazy and surly: they do less (i.e. they don't change variable names or types, and don't do partial matching) and complain more (e.g. when a variable does not exist). This forces you to confront problems earlier, typically leading to cleaner, more expressive code.
tibble() is a nice way to create data frames. It encapsulates best practices for data frames: It never changes an input's type (i.e., no more stringsAsFactors = FALSE!). List-columns are often created by tidyr::nest(), but they can be useful to create by hand.
I called each of these tidyverse functions with dummy arguments to find the resulting class.
# Call each candidate function with dummy arguments and record the class
# vector of whatever it returns, keyed by the function's qualified name.
lst(
  "dplyr::filter" = dplyr::filter(iris, TRUE) %>% class(),
  "dplyr::filter_all" = dplyr::filter_all(mtcars, all_vars(. > 1)) %>% class(),
  "dplyr::group_by" = dplyr::group_by(iris, Species) %>% class(),
  "dplyr::group_by_all" = dplyr::group_by_all(iris, function(...) TRUE) %>% class(),
  "dplyr::group_keys" = dplyr::group_keys(iris) %>% class(),
  "dplyr::group_modify" = dplyr::group_modify(iris, function(x, ...) identity(x)) %>% class(),
  "dplyr::mutate" = dplyr::mutate(iris) %>% class(),
  "dplyr::mutate_all" = dplyr::mutate_all(iris, identity) %>% class(),
  "dplyr::rowwise" = dplyr::rowwise(iris) %>% class(),
  "dplyr::select" = dplyr::select(iris) %>% class(),
  "dplyr::select_all" = dplyr::select_all(iris) %>% class(),
  "dplyr::slice" = dplyr::slice(iris) %>% class(),
  "dplyr::summarise" = dplyr::summarise(iris) %>% class(),
  "dplyr::summarise_all" = dplyr::summarise_all(iris, ~.) %>% class(),
  "modelr::add_predictions" = modelr::add_predictions(mtcars, lm(cyl~hp, data = mtcars)) %>% class(),
  "modelr::add_residuals" = modelr::add_residuals(mtcars, lm(cyl~hp, data = mtcars)) %>% class(),
  "modelr::bootstrap" = modelr::bootstrap(mtcars, 1) %>% class(),
  "modelr::crossv_mc" = modelr::crossv_mc(iris, 1) %>% class(),
  "modelr::data_grid" = modelr::data_grid(mtcars, vs, am) %>% class(),
  "modelr::model_matrix" = modelr::model_matrix(mtcars, am ~ disp) %>% class(),
  "modelr::permute" = modelr::permute(iris, 1) %>% class(),
  "tibble::add_column" = tibble::add_column(iris) %>% class(),
  "tibble::add_row" = tibble::add_row(iris) %>% class(),
  "tidyr::complete" = tidyr::complete(iris) %>% class(),
  "tidyr::drop_na" = tidyr::drop_na(iris) %>% class(),
  "tidyr::expand" = tidyr::expand(iris) %>% class(),
  "tidyr::extract" = tidyr::extract(data.frame(x = c(NA, "a-b", "a-d", "b-c", "d-e")), x, "A") %>% class(),
  "tidyr::fill" = tidyr::fill(iris) %>% class(),
  "tidyr::gather" = tidyr::gather(iris) %>% class(),
  "tidyr::nest" = tidyr::nest(iris) %>% class(),
  "tidyr::replace_na" = tidyr::replace_na(iris) %>% class(),
  "tidyr::separate" = tidyr::separate(data.frame(x = c(NA, "a.b", "a.d", "b.c")), x, c("A", "B")) %>% class(),
  "tidyr::separate_rows" = tidyr::separate_rows(iris) %>% class(),
  "tidyr::spread" = tidyr::spread(data.frame(x = c("a", "b"), y = c(3, 4), z = c(5, 6)), x, y) %>% class(),
  "tidyr::uncount" = tidyr::uncount(data.frame(x = c("a", "b"), n = c(1, 2)), n) %>% class(),
  "tidyr::unite" = tidyr::unite(iris, "z", 1:2) %>% class(),
  "tidyr::unnest" = tidyr::unnest(iris) %>% class()
) %>%
  # One row per function: `name` is the label, `value` the class vector.
  enframe() %>%
  # Collapse each class vector to a single label. Anything carrying "tbl_df"
  # (plain, grouped, or rowwise tibbles) is reported as "tbl_df"; everything
  # else is reported as "data.frame". This is an explicit membership test
  # rather than max() on the class vector, whose result depends on the
  # locale's string collation order.
  mutate(
    value = map_chr(
      value,
      function(cls) if ("tbl_df" %in% cls) "tbl_df" else "data.frame"
    )
  ) %>%
  # Convert to a base data.frame so the full table prints with row numbers.
  data.frame()
#> name value
#> 1 dplyr::filter data.frame
#> 2 dplyr::filter_all data.frame
#> 3 dplyr::group_by tbl_df
#> 4 dplyr::group_by_all tbl_df
#> 5 dplyr::group_keys data.frame
#> 6 dplyr::group_modify data.frame
#> 7 dplyr::mutate data.frame
#> 8 dplyr::mutate_all data.frame
#> 9 dplyr::rowwise tbl_df
#> 10 dplyr::select data.frame
#> 11 dplyr::select_all data.frame
#> 12 dplyr::slice data.frame
#> 13 dplyr::summarise data.frame
#> 14 dplyr::summarise_all data.frame
#> 15 modelr::add_predictions data.frame
#> 16 modelr::add_residuals data.frame
#> 17 modelr::bootstrap tbl_df
#> 18 modelr::crossv_mc tbl_df
#> 19 modelr::data_grid tbl_df
#> 20 modelr::model_matrix tbl_df
#> 21 modelr::permute tbl_df
#> 22 tibble::add_column data.frame
#> 23 tibble::add_row data.frame
#> 24 tidyr::complete data.frame
#> 25 tidyr::drop_na data.frame
#> 26 tidyr::expand tbl_df
#> 27 tidyr::extract data.frame
#> 28 tidyr::fill data.frame
#> 29 tidyr::gather data.frame
#> 30 tidyr::nest tbl_df
#> 31 tidyr::replace_na data.frame
#> 32 tidyr::separate data.frame
#> 33 tidyr::separate_rows tbl_df
#> 34 tidyr::spread data.frame
#> 35 tidyr::uncount data.frame
#> 36 tidyr::unite data.frame
#> 37 tidyr::unnest tbl_df
Candidate functions were identified by finding tidyverse functions whose first argument was ".data", ".tbl", or "data".
# Find tidyverse functions whose first formal argument is named ".data",
# ".tbl", or "data". The result has one row per matching function with its
# package, name, and first argument name.
df <-
  collidr::CRANdf %>%
  # Keep only functions that belong to packages in the tidyverse.
  filter(package_names %in% tidyverse::tidyverse_packages()) %>%
  # Look each function up in its namespace; possibly() substitutes NA when
  # the lookup fails instead of aborting the whole pipeline.
  mutate(f = map2(function_names, package_names, possibly(getFromNamespace, otherwise = NA))) %>%
  # Keep only entries that resolved to a non-primitive function.
  filter(map_lgl(f, is_function), !map_lgl(f, rlang::is_primitive)) %>%
  # Store the first formal's name as a character column (NA when a function
  # has no formals) so the `%in%` filter below compares strings directly
  # instead of relying on coercion of a list column.
  mutate(
    first_arg = map_chr(f, function(fn) {
      arg_names <- names(rlang::fn_fmls(fn))
      if (length(arg_names) == 0) NA_character_ else arg_names[[1]]
    })
  ) %>%
  filter(first_arg %in% c(".data", ".tbl", "data")) %>%
  select(package_names, function_names, first_arg)
df
#> package_names function_names first_arg
#> 1 cli tree data
#> 2 dbplyr arrange.tbl_lazy .data
#> 3 dbplyr do.tbl_sql .data
#> 4 dbplyr window_order .data
#> 5 dplyr arrange .data
#> 6 dplyr arrange_all .tbl
#> 7 dplyr distinct .data
#> 8 dplyr distinct_all .tbl
#> 9 dplyr do .data
#> 10 dplyr filter .data
#> 11 dplyr filter_all .tbl
#> 12 dplyr group_by .data
#> 13 dplyr group_by_all .tbl
#> 14 dplyr group_keys .tbl
#> 15 dplyr group_map .data
#> 16 dplyr group_rows .data
#> 17 dplyr group_trim .tbl
#> 18 dplyr mutate .data
#> 19 dplyr mutate_all .tbl
#> 20 dplyr pull .data
#> 21 dplyr rowwise data
#> 22 dplyr select .data
#> 23 dplyr select_all .tbl
#> 24 dplyr slice .data
#> 25 dplyr summarise .data
#> 26 dplyr summarise_all .tbl
#> 27 ggplot2 ggplot data
#> 28 modelr add_predictions data
#> 29 modelr add_residuals data
#> 30 modelr bootstrap data
#> 31 modelr crossv_mc data
#> 32 modelr data_grid data
#> 33 modelr fit_with data
#> 34 modelr model_matrix data
#> 35 modelr permute data
#> 36 modelr resample data
#> 37 modelr resample_bootstrap data
#> 38 modelr resample_partition data
#> 39 modelr resample_permutation data
#> 40 rlang as_data_mask data
#> 41 tibble add_column .data
#> 42 tibble add_row .data
#> 43 tidyr complete data
#> 44 tidyr drop_na data
#> 45 tidyr expand data
#> 46 tidyr extract data
#> 47 tidyr fill data
#> 48 tidyr gather data
#> 49 tidyr nest .data
#> 50 tidyr replace_na data
#> 51 tidyr separate data
#> 52 tidyr separate_rows data
#> 53 tidyr spread data
#> 54 tidyr uncount data
#> 55 tidyr unite data
#> 56 tidyr unnest data
If you love us, you can donate to us via PayPal or buy us a coffee so we can maintain and grow. Thank you!
Donate Us With