Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Function that will extract hour values from one table and populate "buckets" of one hour increments in another table

I have data in the following format for people punching their work times in:

(dat<-data.frame(Date = c("1/1/19", "1/2/19", "1/4/19", "1/2/19"), 
                Person = c("John Doe", "Brian Smith", "Jane Doe", "Alexandra Wakes"), 
                Time_In = c("1:15pm", "1:45am", "11:00pm", "1:00am"), 
                Time_Out = c("2:30pm","3:33pm","3:00am","1:00am")))

    Date          Person Time_In Time_Out
1 1/1/19        John Doe  1:15pm   2:30pm
2 1/2/19     Brian Smith  1:45am   3:33pm
3 1/4/19        Jane Doe  3:00pm   3:00am
4 1/2/19 Alexandra Wakes  1:00am   1:00am

I am looking to write a function in R or Python that will extract the total number of hours each person worked into 24 different buckets with each bucket as its own column. It would look something like this:

enter image description here

So in the first case, the person worked from 1:15pm to 2:30 pm, so they worked .75 hours from 1pm to 2pm (13-14), and .5 hours from 2pm to 3pm (14-15).

Some things I think may work are...

  1. A series of nested loops
  2. A long series of if/then statements
  3. Some function in Tidyverse or Pandas that I have not thought of yet.

Attempts from #1 and #2 (?) from above were utter failures. Not sure what the workflow is but any advice is much appreciated.

Note that the columns in the resulting table need notbe numbers (could be hour 1, hour 2, etc. or just any factor in general -as long as it represents a 24 hour period of time).

My past attempts have included nested for loops like the following:

for (i in 1:nrow(data)){

  if((int_overlaps(createinterval(data$PunchDate[i],0,1), workinterval[i]))){ `0-1`[i]=1} else{ `0-1`[i]=0}
  if((int_overlaps(createinterval(data$PunchDate[i],1,2), workinterval[i]))){ `1-2`[i]=1} else{ `1-2`[i]=0}
  if((int_overlaps(createinterval(data$PunchDate[i],2,3), workinterval[i]))){ `2-3`[i]=1} else{ `2-3`[i]=0}
  if((int_overlaps(createinterval(data$PunchDate[i],3,4), workinterval[i]))){ `3-4`[i]=1} else{ `3-4`[i]=0}
  if((int_overlaps(createinterval(data$PunchDate[i],4,5), workinterval[i]))){ `4-5`[i]=1} else{ `4-5`[i]=0}
  if((int_overlaps(createinterval(data$PunchDate[i],5,6), workinterval[i]))){ `5-6`[i]=1} else{ `5-6`[i]=0}
  if((int_overlaps(createinterval(data$PunchDate[i],6,7), workinterval[i]))){ `6-7`[i]=1} else{ `6-7`[i]=0}
  if((int_overlaps(createinterval(data$PunchDate[i],7,8), workinterval[i]))){ `7-8`[i]=1} else{ `7-8`[i]=0}
  if((int_overlaps(createinterval(data$PunchDate[i],8,9), workinterval[i]))){ `8-9`[i]=1} else{ `8-9`[i]=0}
  if((int_overlaps(createinterval(data$PunchDate[i],9,10), workinterval[i]))){ `9-10`[i]=1} else{ `9-10`[i]=0}
  if((int_overlaps(createinterval(data$PunchDate[i],10,11), workinterval[i]))){ `10-11`[i]=1} else{ `10-11`[i]=0}
  if((int_overlaps(createinterval(data$PunchDate[i],11,12), workinterval[i]))){ `11-12`[i]=1} else{ `11-12`[i]=0}
  if((int_overlaps(createinterval(data$PunchDate[i],12,13), workinterval[i]))){ `12-13`[i]=1} else{ `12-13`[i]=0}
  if((int_overlaps(createinterval(data$PunchDate[i],13,14), workinterval[i]))){ `13-14`[i]=1} else{ `13-14`[i]=0}
  if((int_overlaps(createinterval(data$PunchDate[i],14,15), workinterval[i]))){ `14-15`[i]=1} else{ `14-15`[i]=0}
  if((int_overlaps(createinterval(data$PunchDate[i],15,16), workinterval[i]))){ `15-16`[i]=1} else{ `15-16`[i]=0}
  if((int_overlaps(createinterval(data$PunchDate[i],16,17), workinterval[i]))){ `16-17`[i]=1} else{ `16-17`[i]=0}
  if((int_overlaps(createinterval(data$PunchDate[i],17,18), workinterval[i]))){ `17-18`[i]=1} else{ `17-18`[i]=0}
  if((int_overlaps(createinterval(data$PunchDate[i],18,19), workinterval[i]))){ `18-19`[i]=1} else{ `18-19`[i]=0}
  if((int_overlaps(createinterval(data$PunchDate[i],19,20), workinterval[i]))){ `19-20`[i]=1} else{ `19-20`[i]=0}
  if((int_overlaps(createinterval(data$PunchDate[i],20,21), workinterval[i]))){ `20-21`[i]=1} else{ `20-21`[i]=0}
  if((int_overlaps(createinterval(data$PunchDate[i],21,22), workinterval[i]))){ `21-22`[i]=1} else{ `21-22`[i]=0}
  if((int_overlaps(createinterval(data$PunchDate[i],22,23), workinterval[i]))){ `22-23`[i]=1} else{ `22-23`[i]=0}
  if((int_overlaps(createinterval(data$PunchDate[i],23,24), workinterval[i]))){ `23-24`[i]=1} else{ `23-24`[i]=0}


}
cbind(data, `0-1`, `1-2`, `2-3`, `3-4`, `4-5`, `5-6`,
            `6-7`, `7-8`, `8-9`, `9-10`, `10-11`, `11-12`,
            `12-13`, `13-14`, `14-15`, `15-16`, `16-17`, `17-18`, `18-19`,
            `19-20`, `20-21`, `21-22`, `22-23`, `23-24`
      )
like image 838
Cyrus Mohammadian Avatar asked Apr 12 '19 19:04

Cyrus Mohammadian


Video Answer


1 Answers

This involves bit of fiddling with the dates and time but it seems to work using dcast.

library(lubridate)
library(data.table)

# Data
dat<-data.frame(Date = c("1/1/19", "1/2/19", "1/4/19", "1/2/19"), 
                Person = c("John Doe", "Brian Smith", "Jane Doe", "Alexandra Wakes"), 
                Time_In = c("1:15pm", "1:45am", "11:00pm", "1:00am"), 
                Time_Out = c("2:30pm","3:33pm","3:00am","1:00am"))

# Create Date Out field, if out the next day then need to add extra day to the Date in
dat$Date <- as.Date(dat$Date, format = "%m/%d/%y")
dat$Date_out <- as.Date(ifelse(grepl("am", dat$Time_Out), dat$Date + days(1), dat$Date), origin = "1970-01-01")

# Create date time in and out variable in format yyyy-dd-dd hh:mm:ss
dat$time_in1 <- strptime(paste(dat$Date, " ", dat$Time_In, sep = ""), format = "%Y-%m-%d %I:%M%p")
dat$time_out1 <- strptime(paste(dat$Date_out, " ", dat$Time_Out, sep = ""), format = "%Y-%m-%d %I:%M%p")

# Fiddling with dates and time
# This will be used to duplicate data frame x times for dcast below
dat$diff_time <- ceiling(as.numeric(difftime(dat$time_out1, dat$time_in1, units = "hours")))

dat$time_in_min <- format(dat$time_in1, format = "%M")
dat$time_out_min <- format(dat$time_out1, format = "%M")
dat$diff_time <- ifelse(dat$time_out_min < dat$time_in_min, dat$diff_time + 1, dat$diff_time)

# For Time in add extra hour to minus time in, i.e. if time in is 2:35pm then time in will show 3:00pm to calculate 25 minutes
dat$time_in2 <- strptime(dat$time_in1 + hours(1), format = "%Y-%m-%d %H")
dat$time_out2 <- strptime(dat$time_out1, format = "%Y-%m-%d %H")

# Calculate fraction of hours for the Time in/Out
dat$diff_in <- as.numeric(difftime(dat$time_in2, dat$time_in1, units = "hours"))
dat$diff_out <- as.numeric(difftime(dat$time_out1, dat$time_out2, units = "hours"))

# For the 24 hour bucket for each person
dat$start_hr <- format(dat$time_in1, format = "%H")

# Append Data multiple times based on number of hours in between Out and In
dt <- dat[rep(seq_len(nrow(dat)), dat$diff_time), c("Date", "Person", "Time_In", "Time_Out", "start_hr", "diff_in", "diff_out", "diff_time")]
dt <- data.table(dt)

# For the 24 hour bucket for each person
dt[, rank := 1:.N, by = c("Person", "Date", "Time_In", "Time_Out")]
dt[, start_hr2 := as.numeric(start_hr) + rank]

# Combine with Time in and Out to allow fraction of hour start and end
dt[, rank2 := 1]
dt[rank == 1, rank2:= diff_in]
dt[diff_time == rank & diff_out > 0, rank2 := diff_out]

# 24 hours in a day
dt[start_hr2 > 24, start_hr2 := start_hr2 - 24]

# For the data provided it works without this line because Alexander worked 24 hours
# Need this line to include all 24 hour bucket
dt$start_hr2 <- factor(dt$start_hr2, levels = 1:24)

dt_dcast <- dcast(dt, Person + Date + Time_In + Time_Out ~ start_hr2, value.var = "rank2", fill = 0, drop = c(TRUE, FALSE))
setnames(dt_dcast, names(dt_dcast), c("Person", "Date", "Time In", "Time Out", paste0(1:24 - 1, "-", 1:24)))
dt_dcast
            Person       Date Time In Time Out 0-1  1-2 2-3 3-4 4-5 5-6 6-7 7-8 8-9 9-10 10-11 11-12 12-13 13-14 14-15 15-16 16-17 17-18 18-19 19-20 20-21 21-22 22-23 23-24
1: Alexandra Wakes 2019-01-02  1:00am   1:00am   1 1.00   1   1   1   1   1   1   1    1     1     1     1  1.00   1.0  1.00     1     1     1     1     1     1     1     1
2:     Brian Smith 2019-01-02  1:45am   3:33pm   0 0.25   1   1   1   1   1   1   1    1     1     1     1  1.00   1.0  0.55     0     0     0     0     0     0     0     0
3:        Jane Doe 2019-01-04 11:00pm   3:00am   1 1.00   1   0   0   0   0   0   0    0     0     0     0  0.00   0.0  0.00     0     0     0     0     0     0     0     1
4:        John Doe 2019-01-01  1:15pm   2:30pm   0 0.00   0   0   0   0   0   0   0    0     0     0     0  0.75   0.5  0.00     0     0     0     0     0     0     0     0

Using for loop:

library(lubridate)
library(data.table)

# Data
dat<-data.frame(Date = c("1/1/19", "1/2/19", "1/4/19", "1/2/19"), 
                Person = c("John Doe", "Brian Smith", "Jane Doe", "Alexandra Wakes"), 
                Time_In = c("1:15pm", "1:45am", "11:00pm", "1:00am"), 
                Time_Out = c("2:30pm","3:33pm","3:00am","1:00am"))

# Create Date Out field, if out the next day then need to add extra day to the Date in
dat$Date <- as.Date(dat$Date, format = "%m/%d/%y")
dat$Date_out <- as.Date(ifelse(grepl("am", dat$Time_Out), dat$Date + days(1), dat$Date), origin = "1970-01-01")

# Create date time in and out variable in format yyyy-dd-dd hh:mm:ss
dat$time_in <- strptime(paste(dat$Date, " ", dat$Time_In, sep = ""), format = "%Y-%m-%d %I:%M%p")
dat$time_out <- strptime(paste(dat$Date_out, " ", dat$Time_Out, sep = ""), format = "%Y-%m-%d %I:%M%p")

# Create 'hour' gap
dat$diff_time <- ceiling(as.numeric(difftime(dat$time_out, dat$time_in, units = "hours")))
dat$time_in_min <- format(dat$time_in, format = "%M")
dat$time_out_min <- format(dat$time_out, format = "%M")
dat$diff_time <- ifelse(dat$time_out_min < dat$time_in_min, dat$diff_time + 1, dat$diff_time)

# For the 24 hour bucket for each person
dat$start_hr <- as.numeric(format(dat$time_in + hours(1), format = "%H"))
dat$start_hr <- ifelse(dat$start_hr == 0, 24, dat$start_hr)

# For Time in add extra hour to minus time in, i.e. if time in is 2:35pm then time in will show 3:00pm to calculate 25 minutes
dat$hour_in <- strptime(dat$time_in + hours(1), format = "%Y-%m-%d %H")
dat$hour_out <- strptime(dat$time_out, format = "%Y-%m-%d %H")

# Calculate fraction of hours for the Time in/Out
dat$diff_in <- as.numeric(difftime(dat$hour_in, dat$time_in, units = "hours"))
dat$diff_out <- as.numeric(difftime(dat$time_out, dat$hour_out, units = "hours"))

hr_bucket <- data.frame(matrix(0, ncol = 24, nrow = nrow(dat)))
names(hr_bucket) <- paste0(1:24-1, "_", 1:24)

stg_data <- dat[, c("start_hr", "diff_time", "diff_in", "diff_out")]
stg_calc <- cbind(stg_data, hr_bucket)

col_index <- ncol(stg_data)
for (i in 1:nrow(stg_calc)) {

  ref_start_hr <- stg_calc[i ,]$start_hr 
  ref_diff_time <- stg_calc[i ,]$diff_time

  ref_diff_in <- stg_calc[i ,]$diff_in
  ref_diff_out <- stg_calc[i ,]$diff_out

  # if a person works till the next morning
  if ((ref_start_hr + ref_diff_time) > 24) {

    offset_col_used <- 24 - ref_start_hr + 1
    offset_col_rem <- ref_diff_time - offset_col_used

    stg_calc[i, (col_index + ref_start_hr):(col_index + 24)] <- 1
    stg_calc[i, (col_index + 1):(col_index + offset_col_rem)] <- 1

  } else {

    stg_calc[i, (col_index + ref_start_hr):(col_index + ref_start_hr + ref_diff_time - 1)] <- 1

  } 

  # To adjust for fraction of hour worked at start and end
  if (stg_calc[i, ]$diff_in %% 1 > 0) stg_calc[i, col_index + ref_start_hr] <- ref_diff_in
  if (stg_calc[i, ]$diff_out %% 1 > 0) stg_calc[i, col_index + ref_start_hr + ref_diff_time - 1] <- ref_diff_out

}

dat2 <- cbind(dat[, c("Person", "Date", "Time_In", "Time_Out")], stg_calc[, names(hr_bucket)])
dat2
           Person       Date Time_In Time_Out 0_1  1_2 2_3 3_4 4_5 5_6 6_7 7_8 8_9 9_10 10_11 11_12 12_13 13_14 14_15 15_16 16_17 17_18 18_19 19_20 20_21 21_22 22_23 23_24
1        John Doe 2019-01-01  1:15pm   2:30pm   0 0.00   0   0   0   0   0   0   0    0     0     0     0  0.75   0.5  0.00     0     0     0     0     0     0     0     0
2     Brian Smith 2019-01-02  1:45am   3:33pm   0 0.25   1   1   1   1   1   1   1    1     1     1     1  1.00   1.0  0.55     0     0     0     0     0     0     0     0
3        Jane Doe 2019-01-04 11:00pm   3:00am   1 1.00   1   0   0   0   0   0   0    0     0     0     0  0.00   0.0  0.00     0     0     0     0     0     0     0     1
4 Alexandra Wakes 2019-01-02  1:00am   1:00am   1 1.00   1   1   1   1   1   1   1    1     1     1     1  1.00   1.0  1.00     1     1     1     1     1     1     1     1
like image 69
MKa Avatar answered Sep 17 '22 19:09

MKa