I have data in the following format for people punching their work times in:
# Sample punch records: one row per shift, with the punch date and the
# clock-in / clock-out times stored as 12-hour strings.
# The surrounding parentheses make R print the data frame as it is assigned.
(dat <- data.frame(
  Date     = c("1/1/19", "1/2/19", "1/4/19", "1/2/19"),
  Person   = c("John Doe", "Brian Smith", "Jane Doe", "Alexandra Wakes"),
  Time_In  = c("1:15pm", "1:45am", "11:00pm", "1:00am"),
  Time_Out = c("2:30pm", "3:33pm", "3:00am", "1:00am")
))
Date Person Time_In Time_Out
1 1/1/19 John Doe 1:15pm 2:30pm
2 1/2/19 Brian Smith 1:45am 3:33pm
3 1/4/19 Jane Doe 11:00pm 3:00am
4 1/2/19 Alexandra Wakes 1:00am 1:00am
I am looking to write a function in R or Python that will extract the total number of hours each person worked into 24 different buckets with each bucket as its own column. It would look something like this:
So in the first case, the person worked from 1:15pm to 2:30 pm, so they worked .75 hours from 1pm to 2pm (13-14), and .5 hours from 2pm to 3pm (14-15).
Some things I think may work are...
Attempts from #1 and #2 (?) from above were utter failures. Not sure what the workflow is but any advice is much appreciated.
Note that the columns in the resulting table need not be numbers (they could be hour 1, hour 2, etc., or any factor in general, as long as it represents a 24-hour period of time).
My past attempts have included nested for loops like the following:
# Earlier attempt: for every row of `data`, test each of the 24 hourly
# intervals against that row's work interval and store a 1/0 flag in a
# per-bucket vector (`0-1`, `1-2`, ..., `23-24`).
# NOTE(review): `createinterval()` and `workinterval` are not defined in this
# snippet; presumably createinterval(date, a, b) builds the interval from hour
# a to hour b on that date and `int_overlaps()` is lubridate's -- confirm.
# NOTE(review): the bucket vectors are only ever assigned by index here, so
# they appear to need to exist before the loop runs -- confirm.
# The flags record only whether an hour was touched (1) or not (0); they do
# not hold the fraction of the hour that was worked.
for (i in 1:nrow(data)){
if((int_overlaps(createinterval(data$PunchDate[i],0,1), workinterval[i]))){ `0-1`[i]=1} else{ `0-1`[i]=0}
if((int_overlaps(createinterval(data$PunchDate[i],1,2), workinterval[i]))){ `1-2`[i]=1} else{ `1-2`[i]=0}
if((int_overlaps(createinterval(data$PunchDate[i],2,3), workinterval[i]))){ `2-3`[i]=1} else{ `2-3`[i]=0}
if((int_overlaps(createinterval(data$PunchDate[i],3,4), workinterval[i]))){ `3-4`[i]=1} else{ `3-4`[i]=0}
if((int_overlaps(createinterval(data$PunchDate[i],4,5), workinterval[i]))){ `4-5`[i]=1} else{ `4-5`[i]=0}
if((int_overlaps(createinterval(data$PunchDate[i],5,6), workinterval[i]))){ `5-6`[i]=1} else{ `5-6`[i]=0}
if((int_overlaps(createinterval(data$PunchDate[i],6,7), workinterval[i]))){ `6-7`[i]=1} else{ `6-7`[i]=0}
if((int_overlaps(createinterval(data$PunchDate[i],7,8), workinterval[i]))){ `7-8`[i]=1} else{ `7-8`[i]=0}
if((int_overlaps(createinterval(data$PunchDate[i],8,9), workinterval[i]))){ `8-9`[i]=1} else{ `8-9`[i]=0}
if((int_overlaps(createinterval(data$PunchDate[i],9,10), workinterval[i]))){ `9-10`[i]=1} else{ `9-10`[i]=0}
if((int_overlaps(createinterval(data$PunchDate[i],10,11), workinterval[i]))){ `10-11`[i]=1} else{ `10-11`[i]=0}
if((int_overlaps(createinterval(data$PunchDate[i],11,12), workinterval[i]))){ `11-12`[i]=1} else{ `11-12`[i]=0}
if((int_overlaps(createinterval(data$PunchDate[i],12,13), workinterval[i]))){ `12-13`[i]=1} else{ `12-13`[i]=0}
if((int_overlaps(createinterval(data$PunchDate[i],13,14), workinterval[i]))){ `13-14`[i]=1} else{ `13-14`[i]=0}
if((int_overlaps(createinterval(data$PunchDate[i],14,15), workinterval[i]))){ `14-15`[i]=1} else{ `14-15`[i]=0}
if((int_overlaps(createinterval(data$PunchDate[i],15,16), workinterval[i]))){ `15-16`[i]=1} else{ `15-16`[i]=0}
if((int_overlaps(createinterval(data$PunchDate[i],16,17), workinterval[i]))){ `16-17`[i]=1} else{ `16-17`[i]=0}
if((int_overlaps(createinterval(data$PunchDate[i],17,18), workinterval[i]))){ `17-18`[i]=1} else{ `17-18`[i]=0}
if((int_overlaps(createinterval(data$PunchDate[i],18,19), workinterval[i]))){ `18-19`[i]=1} else{ `18-19`[i]=0}
if((int_overlaps(createinterval(data$PunchDate[i],19,20), workinterval[i]))){ `19-20`[i]=1} else{ `19-20`[i]=0}
if((int_overlaps(createinterval(data$PunchDate[i],20,21), workinterval[i]))){ `20-21`[i]=1} else{ `20-21`[i]=0}
if((int_overlaps(createinterval(data$PunchDate[i],21,22), workinterval[i]))){ `21-22`[i]=1} else{ `21-22`[i]=0}
if((int_overlaps(createinterval(data$PunchDate[i],22,23), workinterval[i]))){ `22-23`[i]=1} else{ `22-23`[i]=0}
if((int_overlaps(createinterval(data$PunchDate[i],23,24), workinterval[i]))){ `23-24`[i]=1} else{ `23-24`[i]=0}
}
# Attach the 24 flag vectors to the original data as extra columns.
cbind(data, `0-1`, `1-2`, `2-3`, `3-4`, `4-5`, `5-6`,
`6-7`, `7-8`, `8-9`, `9-10`, `10-11`, `11-12`,
`12-13`, `13-14`, `14-15`, `15-16`, `16-17`, `17-18`, `18-19`,
`19-20`, `20-21`, `21-22`, `22-23`, `23-24`
)
This involves a bit of fiddling with the dates and times, but it seems to work using dcast.
library(lubridate)
library(data.table)
# Data
dat <- data.frame(
  Date = c("1/1/19", "1/2/19", "1/4/19", "1/2/19"),
  Person = c("John Doe", "Brian Smith", "Jane Doe", "Alexandra Wakes"),
  Time_In = c("1:15pm", "1:45am", "11:00pm", "1:00am"),
  Time_Out = c("2:30pm", "3:33pm", "3:00am", "1:00am")
)

# Parse the punch date
dat$Date <- as.Date(dat$Date, format = "%m/%d/%y")

# Build clock-in / clock-out timestamps on the punch date. UTC is used so that
# every day is exactly 24 hours long (no daylight-saving jumps).
dat$time_in1 <- as.POSIXct(
  paste(dat$Date, dat$Time_In),
  format = "%Y-%m-%d %I:%M%p", tz = "UTC"
)
dat$time_out1 <- as.POSIXct(
  paste(dat$Date, dat$Time_Out),
  format = "%Y-%m-%d %I:%M%p", tz = "UTC"
)

# A shift ends on the following day only when the clock-out time is not later
# than the clock-in time (e.g. 11:00pm -> 3:00am). Testing for "am" in
# Time_Out is not enough: it would turn 8:00am -> 11:00am into a 27 hour shift.
# Identical in/out times are treated as a full 24 hour shift.
dat$time_out1 <- dat$time_out1 + days(as.integer(dat$time_out1 <= dat$time_in1))
dat$Date_out <- as.Date(dat$time_out1, tz = "UTC")

# Shift start and end as seconds since midnight of the punch date.
# The end can run past 86400 when the shift crosses midnight.
day_start <- as.numeric(dat$Date) * 86400
dt <- data.table(dat[, c("Person", "Date", "Time_In", "Time_Out")])
dt[, sec_in := as.numeric(dat$time_in1) - day_start]
dt[, sec_out := as.numeric(dat$time_out1) - day_start]

# One row per shift per hourly slot. 48 slots cover the punch date and the
# following day, which is as far as a shift can reach.
dt <- dt[rep(seq_len(nrow(dt)), each = 48)]
dt[, slot := rep(0:47, times = nrow(dat))]

# Hours worked in a slot = length of the overlap between the shift and the slot
dt[, rank2 := pmax(0, pmin(sec_out, (slot + 1) * 3600) - pmax(sec_in, slot * 3600)) / 3600]

# Fold the second day back onto the 24 hour-of-day buckets (1 = 0-1, ..., 24 = 23-24)
dt[, start_hr2 := factor(slot %% 24 + 1, levels = 1:24)]

# Wide format: one column per bucket; the two days are summed per bucket
dt_dcast <- dcast(
  dt, Person + Date + Time_In + Time_Out ~ start_hr2,
  value.var = "rank2", fun.aggregate = sum, fill = 0, drop = c(TRUE, FALSE)
)
setnames(dt_dcast, names(dt_dcast), c("Person", "Date", "Time In", "Time Out", paste0(1:24 - 1, "-", 1:24)))
dt_dcast
Person Date Time In Time Out 0-1 1-2 2-3 3-4 4-5 5-6 6-7 7-8 8-9 9-10 10-11 11-12 12-13 13-14 14-15 15-16 16-17 17-18 18-19 19-20 20-21 21-22 22-23 23-24
1: Alexandra Wakes 2019-01-02 1:00am 1:00am 1 1.00 1 1 1 1 1 1 1 1 1 1 1 1.00 1.0 1.00 1 1 1 1 1 1 1 1
2: Brian Smith 2019-01-02 1:45am 3:33pm 0 0.25 1 1 1 1 1 1 1 1 1 1 1 1.00 1.0 0.55 0 0 0 0 0 0 0 0
3: Jane Doe 2019-01-04 11:00pm 3:00am 1 1.00 1 0 0 0 0 0 0 0 0 0 0 0.00 0.0 0.00 0 0 0 0 0 0 0 1
4: John Doe 2019-01-01 1:15pm 2:30pm 0 0.00 0 0 0 0 0 0 0 0 0 0 0 0.75 0.5 0.00 0 0 0 0 0 0 0 0
Using a for loop:
library(lubridate)
library(data.table)
# Data
dat <- data.frame(
  Date = c("1/1/19", "1/2/19", "1/4/19", "1/2/19"),
  Person = c("John Doe", "Brian Smith", "Jane Doe", "Alexandra Wakes"),
  Time_In = c("1:15pm", "1:45am", "11:00pm", "1:00am"),
  Time_Out = c("2:30pm", "3:33pm", "3:00am", "1:00am")
)

# Parse the punch date
dat$Date <- as.Date(dat$Date, format = "%m/%d/%y")

# Split each shift into hours worked per hour-of-day bucket.
#
# time_in, time_out: vectors of 12-hour clock strings such as "1:15pm",
#   one element per shift and both of the same length.
# Returns a data frame with one row per shift and 24 numeric columns named
#   "0_1", "1_2", ..., "23_24", holding the hours worked in each bucket.
# A clock-out that is not later than the clock-in is taken to fall on the next
#   day, so identical in/out times count as a full 24 hour shift.
# Stops with an error if a time cannot be parsed.
bucket_hours <- function(time_in, time_out) {
  # Convert a clock string to seconds since midnight
  to_seconds <- function(clock) {
    parsed <- strptime(as.character(clock), format = "%I:%M%p", tz = "UTC")
    if (any(is.na(parsed))) {
      stop("punch times must be 12-hour clock strings such as \"1:15pm\"",
           call. = FALSE)
    }
    parsed$hour * 3600 + parsed$min * 60
  }

  sec_in <- to_seconds(time_in)
  sec_out <- to_seconds(time_out)

  # Shifts that run past midnight end on the following day
  overnight <- sec_out <= sec_in
  sec_out[overnight] <- sec_out[overnight] + 86400

  buckets <- data.frame(matrix(0, nrow = length(sec_in), ncol = 24))
  names(buckets) <- paste0(1:24 - 1, "_", 1:24)

  # 48 hourly slots cover the punch date and the following day; the second
  # day is folded back onto the same 24 columns.
  for (slot in 0:47) {
    slot_start <- slot * 3600
    # Overlap between the shift and this slot, in hours (never negative)
    worked <- pmax(0, pmin(sec_out, slot_start + 3600) - pmax(sec_in, slot_start)) / 3600
    col <- slot %% 24 + 1
    buckets[[col]] <- buckets[[col]] + worked
  }
  buckets
}

hr_bucket <- bucket_hours(dat$Time_In, dat$Time_Out)
dat2 <- cbind(dat[, c("Person", "Date", "Time_In", "Time_Out")], hr_bucket)
dat2
Person Date Time_In Time_Out 0_1 1_2 2_3 3_4 4_5 5_6 6_7 7_8 8_9 9_10 10_11 11_12 12_13 13_14 14_15 15_16 16_17 17_18 18_19 19_20 20_21 21_22 22_23 23_24
1 John Doe 2019-01-01 1:15pm 2:30pm 0 0.00 0 0 0 0 0 0 0 0 0 0 0 0.75 0.5 0.00 0 0 0 0 0 0 0 0
2 Brian Smith 2019-01-02 1:45am 3:33pm 0 0.25 1 1 1 1 1 1 1 1 1 1 1 1.00 1.0 0.55 0 0 0 0 0 0 0 0
3 Jane Doe 2019-01-04 11:00pm 3:00am 1 1.00 1 0 0 0 0 0 0 0 0 0 0 0.00 0.0 0.00 0 0 0 0 0 0 0 1
4 Alexandra Wakes 2019-01-02 1:00am 1:00am 1 1.00 1 1 1 1 1 1 1 1 1 1 1 1.00 1.0 1.00 1 1 1 1 1 1 1 1
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With