I have prepared an R script and am trying to apply the same code to multiple files. I am currently working with 5 sample files in one directory (but eventually want to handle more than 100) in order to learn how to work with multiple files. Sorry for the quality of the code I have written: I am currently learning R, and I am sure there are much better and more organised ways to write it than what I have prepared (please see below for sample data and my code).
What I am trying to achieve is to run my code on all files and write the results back to the same directory, or to a different directory, with a slightly altered name.
I have tried to read each file using the following, adding all my code within the {} brackets:
# List the CSV files in the working directory. `pattern` is a regular
# expression, so the dot is escaped and the match is anchored to the end of
# the name; a bare ".csv" would also match names such as "xcsv_notes.txt".
filenames <- dir(pattern = "\\.csv$")
# seq_along() gives an empty sequence when no file matches, whereas
# 1:length(filenames) would iterate over 1 and 0.
for (i in seq_along(filenames)) {
  # per-file code goes here; filenames[i] is the file for this iteration
}
but it's not working. I think I am doing this step all wrong. Could you give me some guidance about how I should approach working with multiple files?
I have prepared a sample dataset so that I can show you the code I have got; the following two pictures show how the dataset looks after I read it and after I run all of my code:
My sample datafile:
> dput (df)
structure(list(X = structure(c(3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L), .Label = c("", "w", "wo"), class = "factor"),
X.1 = structure(c(1L, 11L, 18L, 9L, 26L, 30L, 22L, 5L, 14L,
15L, 6L, 23L, 27L, 19L, 2L, 10L, 16L, 7L, 24L, 28L, 20L,
3L, 12L, 17L, 8L, 25L, 29L, 21L, 4L, 13L), .Label = c("Fri 1 Jan",
"Fri 15 Jan", "Fri 22 Jan", "Fri 29 Jan", "Fri 8 Jan", "Mon 11 Jan",
"Mon 18 Jan", "Mon 25 Jan", "Mon 4 Jan", "Sat 16 Jan", "Sat 2 Jan",
"Sat 23 Jan", "Sat 30 Jan", "Sat 9 Jan", "Sun 10 Jan", "Sun 17 Jan",
"Sun 24 Jan", "Sun 3 Jan", "Thu 14 Jan", "Thu 21 Jan", "Thu 28 Jan",
"Thu 7 Jan", "Tue 12 Jan", "Tue 19 Jan", "Tue 26 Jan", "Tue 5 Jan",
"Wed 13 Jan", "Wed 20 Jan", "Wed 27 Jan", "Wed 6 Jan"), class = "factor"),
X1 = c(322L, 89L, 242L, NA, 136L, 113L, 70L, 134L, 232L,
NA, 127L, 124L, 120L, 162L, 179L, 374L, 477L, NA, 147L, 136L,
152L, 196L, 384L, 491L, 136L, NA, 143L, 172L, 226L, 509L),
X2 = c(409L, 71L, 206L, NA, 104L, 57L, NA, 98L, 201L, NA,
74L, 94L, 69L, 98L, 117L, 277L, 445L, NA, 131L, 90L, 83L,
NA, 329L, 473L, 73L, NA, 104L, 113L, 136L, 427L), X3 = c(367L,
54L, 211L, NA, 107L, 69L, 51L, 63L, 157L, NA, 56L, 115L,
96L, 100L, 118L, 250L, 388L, NA, 124L, 85L, 96L, 118L, 313L,
431L, 79L, NA, 93L, 135L, 134L, 290L), X4 = c(343L, 60L,
183L, NA, 110L, 53L, 32L, 77L, 123L, NA, 37L, 100L, 64L,
68L, 99L, 199L, 333L, NA, 107L, 71L, 81L, 89L, 219L, 393L,
43L, NA, 72L, 96L, 127L, NA), X5 = c(231L, 42L, 79L, NA,
74L, 58L, 48L, 59L, 78L, NA, 62L, 74L, 63L, 59L, 74L, 110L,
134L, NA, 74L, 82L, 59L, 73L, 135L, 170L, 53L, NA, 61L, 72L,
67L, 186L), X6 = c(140L, 41L, 57L, NA, 104L, 92L, 89L, 94L,
68L, NA, 116L, 131L, NA, 110L, 125L, 89L, 89L, NA, 113L,
124L, 115L, 116L, 95L, 77L, 126L, NA, 110L, 122L, 119L, 122L
), X7 = c(90L, 104L, 82L, NA, 368L, 258L, NA, 289L, 117L,
NA, 395L, 416L, 397L, 391L, 400L, 132L, 101L, NA, 397L, 426L,
418L, 411L, 151L, 66L, 396L, NA, 457L, 437L, 428L, 128L)), .Names = c("X",
"X.1", "X1", "X2", "X3", "X4", "X5", "X6", "X7"), class = "data.frame", row.names = c(NA,
-30L))
The code I have prepared:
# Example code for the sample data: read one count file, tag every row as
# weekday/weekend, then append six summary rows (column sums and NA counts
# for all rows, weekday rows and weekend rows).

## step 1 - read file
filename <- "101_E45_N66.csv"
# Skip the first 5 lines and read 12 fewer rows than the number of lines
# reported by count.fields() -- presumably to drop non-data header/footer
# lines; confirm against the real files.
# `nrows` is spelled out in full rather than relying on partial matching.
df <- read.csv(
  filename,
  header = TRUE,
  skip = 5,
  nrows = length(count.fields(filename)) - 12
)

## step 2 - change column names
colnames(df) <- c("type", "date", "v1", "v2", "v3", "v4", "v5", "v6", "v7")

## step 3 - split the name "101_E45_N66.csv" to create 3 new columns
s <- strsplit(filename, "_", fixed = TRUE)[[1]]
# NOTE(review): s[3] still carries the ".csv" extension ("N66.csv");
# strip it here if that is not intended.
df1 <- cbind(
  df[, c("type", "date")],
  ID = s[1],
  name1 = s[2],
  name2 = s[3],
  df[, 3:ncol(df)]
)

## step 4 - fill the type column with weekday ("wd") / weekend ("we")
# The 7-day pattern is repeated to the actual number of rows instead of a
# hard-coded 30, so files of other lengths work too.
# NOTE(review): the pattern assumes the first row is a Friday, as in the
# sample data; confirm this for other files.
f <- c("wd", "we", "we", "wd", "wd", "wd", "wd")
df1$type <- rep(f, length.out = nrow(df1))

## keep df1 untouched; the summary rows are appended to the copy df2
df2 <- df1

## step 5 - subset weekday and weekend rows
df3 <- df2[df2$type == "wd", ] ## weekday
df4 <- df2[df2$type == "we", ] ## weekend

## step 6 - append rows with total, weekday and weekend sums and the
## number of missing values
value_cols <- 6:12 # the v1..v7 columns of df1/df2
n_obs <- nrow(df1) # summary rows go directly after the data rows
df2[n_obs + 1, value_cols] <- colSums(df1[, value_cols], na.rm = TRUE) ## all
df2[n_obs + 2, value_cols] <- colSums(df3[, value_cols], na.rm = TRUE) ## weekday
df2[n_obs + 3, value_cols] <- colSums(df4[, value_cols], na.rm = TRUE) ## weekend
df2[n_obs + 4, value_cols] <- colSums(is.na(df1[, value_cols])) ## all missing
df2[n_obs + 5, value_cols] <- colSums(is.na(df3[, value_cols])) ## weekday missing
df2[n_obs + 6, value_cols] <- colSums(is.na(df4[, value_cols])) ## weekend missing
df2
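In order to read multiple CSV files, or all files from a folder, in R, you can use the data.table package. data.table is a third-party library; hence, in order to use it, you first need to install it with install.packages("data.table") and load it with library(data.table).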
When R calls load(), all of the R objects saved in the file are loaded into R. The names given to these objects when they were originally saved will be given to them when they are loaded. The command > ls() can be used to print out all of the objects currently loaded into R.
To read a set of csv files into a data.frame, I often use ldply from the plyr package:
library(plyr)

# Read every CSV file in the working directory and stack the results into a
# single data frame, tagging each row with the file it was read from.
# The pattern is an anchored regular expression so only names ending in
# ".csv" are picked up; a bare "csv" matches any name containing those letters.
all_data <- ldply(list.files(pattern = "\\.csv$"), function(fname) {
  dum <- read.csv(fname)
  # adds the filename it was read from as a column; rep() keeps this valid
  # for a file that yields zero rows
  dum$fname <- rep(fname, nrow(dum))
  dum
})
If you need more specific things, you can extend the function you call with ldply.
Does this toy example do what you want?
# Self-contained demo: write two CSV files, read all of them into a list,
# rename the columns of every data frame, then save each one back to the
# file it came from.

# Work in a dedicated scratch directory instead of wiping the workspace
# with rm(list = ls()), changing the working directory, or deleting the
# contents of tempdir().
demo_dir <- tempfile("csv_demo_")
dir.create(demo_dir)

# create some files in the scratch directory:
a <- data.frame(x = 1:3, y = 4:6)
b <- data.frame(x = 10:13, y = 14:15) # y is recycled to 4 rows
write.csv(a, file.path(demo_dir, "file1.csv"), row.names = FALSE)
write.csv(b, file.path(demo_dir, "file2.csv"), row.names = FALSE)

# now read all files to list:
# (`pattern` is a regular expression, hence the escaped dot and the anchor)
mycsv <- list.files(demo_dir, pattern = "\\.csv$", full.names = TRUE)
n <- length(mycsv)
mylist <- lapply(mycsv, read.csv)

# now change something in all dfs in list:
mylist <- lapply(mylist, function(x) {
  names(x) <- c("a", "b")
  x
})

# then save back dfs:
# [[i]] extracts the data frame itself (mylist[i] is a one-element list),
# and each data frame is written to the path it was read from.
for (i in seq_len(n)) {
  write.csv(mylist[[i]], file = mycsv[i], row.names = FALSE)
}
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With