I am using the html_table from rvest to read a two-column concordance table from the website below. Both columns contain instances of leading zeros which I would want to preserve. As such, I would want the columns to be of class character. I use the following code:
library(rvest)
library(data.table)
df <- list()
for (j in 1:25) {
url <- paste('http://unstats.un.org/unsd/cr/registry/regso.asp?Ci=70&Lg=1&Co=&T=0&p=',
j, '&prn=yes', sep='')
webpage <- read_html(url)
table <- html_nodes(webpage, 'table')
df[[j]] <- html_table(table, header=TRUE)[[1]]
df[[j]] <- df[[j]][,c(1:2) ]
}
ISIC4.NACE2 <- rbindlist(df)
However a str(df[[1]]) returns
'data.frame': 40 obs. of 2 variables:
$ ISIC Rev.4: chr "01" "011" "0111" "0112" ...
$ NACE Rev.2: num 1 1.1 1.11 1.12 1.13 1.14 1.15 1.16 1.19 1.2 ...
It seems the html_table function interprets the first column as character and the second column as numeric, thereby truncating the leading zeros in the latter. Is there a way to specify column class using html_table?
col_classes
should either be NULL
or a list
. if a list
then it should be in the form:
list(`COL#`=`class`, ...)
for example:
list(`1`='character', `3`='integer', `7`='logical')
You have to source everything below into the session you're using rvest
from since it's replacing the rvest
S3 definitions of these functions:
I changed the html_table
line in your code to:
df[[j]] <- html_table(table, header=TRUE, col_classes=list(`2`='character'))[[1]]
and now get the following as str
output:
'data.frame': 40 obs. of 2 variables:
$ ISIC Rev.4: int 1 11 111 112 113 114 115 116 119 12 ...
$ NACE Rev.2: chr "01" "01.1" "01.11" "01.12" ...
------ source everything below -------
html_table <- function(x, header = NA, trim = TRUE, fill = FALSE, dec = ".", col_classes = NULL) {
UseMethod("html_table")
}
' @export
html_table.xml_document <- function(x, header = NA, trim = TRUE, fill = FALSE,
dec = ".", col_classes = NULL) {
tables <- xml2::xml_find_all(x, ".//table")
lapply(tables, html_table, header = header, trim = trim, fill = fill, dec = dec, col_classes)
}
html_table.xml_nodeset <- function(x, header = NA, trim = TRUE, fill = FALSE,
dec = ".", col_classes = NULL) {
# FIXME: guess useful names
lapply(x, html_table, header = header, trim = trim, fill = fill, dec = dec, col_classes)
}
html_table.xml_node <- function(x, header = NA, trim = TRUE,
fill = FALSE, dec = ".",
col_classes=NULL) {
stopifnot(html_name(x) == "table")
# Throw error if any rowspan/colspan present
rows <- html_nodes(x, "tr")
n <- length(rows)
cells <- lapply(rows, "html_nodes", xpath = ".//td|.//th")
ncols <- lapply(cells, html_attr, "colspan", default = "1")
ncols <- lapply(ncols, as.integer)
nrows <- lapply(cells, html_attr, "rowspan", default = "1")
nrows <- lapply(nrows, as.integer)
p <- unique(vapply(ncols, sum, integer(1)))
maxp <- max(p)
if (length(p) > 1 & maxp * n != sum(unlist(nrows)) &
maxp * n != sum(unlist(ncols))) {
# then malformed table is not parsable by smart filling solution
if (!fill) { # fill must then be specified to allow filling with NAs
stop("Table has inconsistent number of columns. ",
"Do you want fill = TRUE?", call. = FALSE)
}
}
values <- lapply(cells, html_text, trim = trim)
out <- matrix(NA_character_, nrow = n, ncol = maxp)
# fill colspans right with repetition
for (i in seq_len(n)) {
row <- values[[i]]
ncol <- ncols[[i]]
col <- 1
for (j in seq_len(length(ncol))) {
out[i, col:(col+ncol[j]-1)] <- row[[j]]
col <- col + ncol[j]
}
}
# fill rowspans down with repetition
for (i in seq_len(maxp)) {
for (j in seq_len(n)) {
rowspan <- nrows[[j]][i]; colspan <- ncols[[j]][i]
if (!is.na(rowspan) & (rowspan > 1)) {
if (!is.na(colspan) & (colspan > 1)) {
# special case of colspan and rowspan in same cell
nrows[[j]] <- c(head(nrows[[j]], i),
rep(rowspan, colspan-1),
tail(nrows[[j]], length(rowspan)-(i+1)))
rowspan <- nrows[[j]][i]
}
for (k in seq_len(rowspan - 1)) {
l <- head(out[j+k, ], i-1)
r <- tail(out[j+k, ], maxp-i+1)
out[j + k, ] <- head(c(l, out[j, i], r), maxp)
}
}
}
}
if (is.na(header)) {
header <- all(html_name(cells[[1]]) == "th")
}
if (header) {
col_names <- out[1, , drop = FALSE]
out <- out[-1, , drop = FALSE]
} else {
col_names <- paste0("X", seq_len(ncol(out)))
}
# Convert matrix to list to data frame
df <- lapply(seq_len(maxp), function(i) {
if (!is.null(col_classes) & (i %in% names(col_classes))) {
as(out[, i], col_classes[[as.character(i)]])
} else {
utils::type.convert(out[, i], as.is = TRUE, dec = dec)
}
})
names(df) <- col_names
class(df) <- "data.frame"
attr(df, "row.names") <- .set_row_names(length(df[[1]]))
if (length(unique(col_names)) < length(col_names)) {
warning('At least two columns have the same name')
}
df
}
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With