How can I find a dataset that has some specific attributes? [duplicate]

Tags:

r

The package datasets and various packages come with a fair amount of useful datasets, however there seems to be no easy way to find your perfect dataset when you need it for your package examples, for teaching purposes, or to ask/answer a question here on SO.

Say for instance I want a dataset that is a data.frame, has at least 2 character columns, and is less than 100 rows long.

How can I explore EVERY dataset available and see a maximum of relevant information to make my choice ?

My past tries were messy, taking time, and crashed with some packages which have an unusual object structure like caret.

783

asked Sep 14 '17 16:09

Moody_Mudskipper

1 Answers

I've packaged a solution in a one function github package.

I'm copying the whole code at the bottom but the simplest is :

remotes::install_github("moodymudskipper/datasearch")
library(datasearch)

All data sets from package "dplyr"

dplyr_all <-
  datasearch("dplyr")

View(dplyr_all)

dplyr

Datasets from package "datasets" restricted by condition

datasets_ncol5 <-
  datasearch("datasets", filter =  ~is.data.frame(.) && ncol(.) == 5)

View(datasets_ncol5)

datasets

All datasets from all installed packages, no restriction


# might take more or less time, depends what you have installed
all_datasets <- datasearch()

View(all_datasets)

# subsetting the output
my_subset <- subset(
  all_datasets, 
  class1 == "data.frame" &
    grepl("treatment", names_collapsed) &
    nrow < 100
)

View(my_subset)

all

datasearch <- function(pkgs = NULL, filter = NULL){
  # make function silent
  w <- options()$warn
  options(warn = -1)
  search_ <- search()
  file_ <- tempfile()
  file_ <- file(file_, "w")
  on.exit({
    options(warn = w)
    to_detach <- setdiff(search(), search_)
    for(pkg in to_detach) eval(bquote(detach(.(pkg))))
    # note : we still have loaded namespaces, we could unload those that we ddn't
    # have in the beginning but i'm worried about surprising effects, I think
    # the S3 method tables should be cleaned too, and maybe other things

    # note2 : tracing library and require didn't work
    })

  # convert formula to function
  if(inherits(filter, "formula")) {
    filter <- as.function(c(alist(.=), filter[[length(filter)]]))
  }

  ## by default fetch all available packages in .libPaths()
  if(is.null(pkgs)) pkgs <- .packages(all.available = TRUE)
  ## fetch all data sets description
  df <- as.data.frame(data(package = pkgs, verbose = FALSE)$results)
  names(df) <- tolower(names(df))
  item <- NULL # for cmd check note
  df <- transform(
    df,
    data_name = sub('.*\\((.*)\\)', '\\1', item),
    dataset   = sub(' \\(.*', '', item),
    libpath = NULL,
    item = NULL
    )
  df <- df[order(df$package, df$data_name),]
  pkg_data_names <- aggregate(dataset ~ package + data_name, df, c)
  pkg_data_names <- pkg_data_names[order(pkg_data_names$package, pkg_data_names$data_name),]

  env <- new.env()
  n <-  nrow(pkg_data_names)
  pb <- progress::progress_bar$new(
    format = "[:bar] :percent :pkg",
    total = n)
  row_dfs <- vector("list", n)
  for(i in seq(nrow(pkg_data_names))) {
    pkg    <- pkg_data_names$package[i]
    data_name <- pkg_data_names$data_name[i]
    datasets  <- pkg_data_names$dataset[[i]]
    pb$tick(tokens = list(pkg = format(pkg, width = 12)))

    sink(file_, type = "message")
    data(list=data_name, package = pkg, envir = env)
    row_dfs_i <- lapply(datasets, function(dataset) {
      dat <- get(dataset, envir = env)
      if(!is.null(filter) && !filter(dat)) return(NULL)
      cl <- class(dat)
      nms <- names(dat)
      nc <- ncol(dat)
      if (is.null(nc)) nc <- NA
      nr <- nrow(dat)
      if (is.null(nr)) nr <- NA

      out <- data.frame(
        package = pkg,
        data_name = data_name,
        dataset = dataset,
        class = I(list(cl)),
        class1 = cl[1],
        type = typeof(dat),
        names = I(list(nms)),
        names_collapsed = paste(nms, collapse = "/"),
        nrow       = nr,
        ncol       = nc,
        length     = length(dat))

      if("data.frame" %in% cl) {
        classes <- lapply(dat, class)
        cl_flat <- unlist(classes)
        out <- transform(
          out,
          classes    = I(list(classes)),
          types      = I(list(vapply(dat, typeof, character(1)))),
          logical    = sum(cl_flat == 'logical'),
          integer    = sum(cl_flat == 'integer'),
          numeric    = sum(cl_flat == 'numeric'),
          complex    = sum(cl_flat == 'complex'),
          character  = sum(cl_flat == 'character'),
          raw        = sum(cl_flat == 'raw'),
          list       = sum(cl_flat == 'list'),
          data.frame = sum(cl_flat == 'data.frame'),
          factor     = sum(cl_flat == 'factor'),
          ordered    = sum(cl_flat == 'ordered'),
          Date       = sum(cl_flat == 'Date'),
          POSIXt     = sum(cl_flat == 'POSIXt'),
          POSIXct    = sum(cl_flat == 'POSIXct'),
          POSIXlt    = sum(cl_flat == 'POSIXlt'))
      } else {
        out <- transform(
          out,
          nrow       = NA,
          ncol       = NA,
          classes    = NA,
          types      = NA,
          logical    = NA,
          integer    = NA,
          numeric    = NA,
          complex    = NA,
          character  = NA,
          raw        = NA,
          list       = NA,
          data.frame = NA,
          factor     = NA,
          ordered    = NA,
          Date       = NA,
          POSIXt     = NA,
          POSIXct    = NA,
          POSIXlt    = NA)
      }
      if(is.matrix(dat)) {
        out$names <- list(colnames(dat))
        out$names_collapsed = paste(out$names, collapse = "/")
      }
      out
    })
    row_dfs_i <- do.call(rbind, row_dfs_i)
    if(!is.null(row_dfs_i)) row_dfs[[i]] <- row_dfs_i
    sink(type = "message")
  }
  df2 <- do.call(rbind, row_dfs)
  df <- merge(df, df2)
  df
}

answered Nov 10 '22 06:11

Moody_Mudskipper

Related questions
                            
                                Reduce lists by summing element-wise in purrr
                            
                                Negative lookahead in R not behaving as expected
                            
                                Why is the apply function of R confusing rows with columns?
                            
                                Missing values error in train() function Caret for trees
                            
                                Best practice for defining S3 methods with different arguments [closed]
                            
                                Commenting in rmarkdown/knitr to prevent R evaluation
                            
                                Find rows where one column string is in another column using dplyr in R
                            
                                How do I edit bootstrap theme in rmarkdown?
                            
                                Duplicate and customize secondary y axis
                            
                                how to render DT::datatables in a pdf using rmarkdown?
                            
                                How does the use.cache feature of Packrat work?
                            
                                Reorder factor levels based on another factor
                            
                                R error : "attempt to select less than one element in get1index"
                            
                                R wordcloud2 letterCloud showing only the background
                            
                                plm Package in R - empty model when including only variables without variation over time per individual
                            
                                Tidy evaluation when column names are stored in strings
                            
                                extra variables in legend when wrapping ggplot2 in plotly R
                            
                                Can I run a Shiny app from within R Tools for Visual Studio
                            
                                Plotting sfc_POLYGON in leaflet
                            
                                Handling of closures in data.table

Donate For Us

If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!

Donate Us With

How can I find a dataset that has some specific attributes? [duplicate]

Tags:

r

Moody_Mudskipper

People also ask

1 Answers

Moody_Mudskipper

Recent Activity

Donate For Us