lets first take some random data
A <- c(1:5)
score_one <- c(123.5, 223.1, 242.2, 351.8, 123.1)
score_two <- c(324.2, 568.2, 124.9, 323.1, 213.4)
score_three <- c(553.1, 412.3, 435.7, 523.1, 365.4)
score_four <- c(123.2, 225.1, 243.6, 741.1, 951.2)
df1 <- data.frame(A, score_one, score_two, score_three, score_four)
library(dplyr)
library(tidyr)
df2 <- df1 %>%
group_by(A) %>%
mutate_each(funs(substr(.,1,1))) %>%
ungroup %>%
gather(variable, type, -c(A)) %>%
select(-variable) %>%
mutate(type = paste0("type_",type),
value = 1) %>%
group_by(A,type) %>%
summarise(value = sum(value)) %>%
ungroup %>%
spread(type, value, fill=0) %>%
inner_join(df1, by=c("A")) %>%
select(A, starts_with("score_"), starts_with("type_"))
This introduces a summary variable for each score_
and counts the frequency of each unique first-digit number
Hence we see in row one, type_1 == 2. Because in the corresponding score_ columns we have 2 occurrences where the number 1 is the first number
Problem Statement
now we want introduce a variable that calls upon the type_n columns.
score_ column/stype_n == 0, we want to assign a 0$type_n_G2such that the desired output should look like1

Taking for example, type_1_G2
type_1 == 2score_one and score_fourtype_1_G2==1There is no need for the complicated construction of df2 in my opinion. A reshaped df1 into long format is a better starting point to get to the desired end result in fewer steps.
An approach using the data.table package:
library(data.table)
# melting the original dataframe 'df1' to a long format datatable
dt <- melt(setDT(df1), "A")
# creating two type variables & a logical vector indicating whether
# the decimal for a specific type is equal or above .2
dt[, `:=` (type1=paste0("type_",substr(value,1,1)),
type2=paste0("type_",substr(value,1,1),"_g2"))
][, g2 := +(+(value - floor(value) >= 0.2)==1), .(A,type1)]
# creating separate wide datatables for the variable & two type columns
dt1 <- dcast(dt, A ~ variable)
dt2 <- dcast(dt, A ~ type1)
dt3 <- dcast(dt, A ~ type2, fun=sum, value.var="g2")[, lapply(.SD, function(x) +(x>=1)), A]
# two options for merging the wide datatables together into one
dtres <- dt1[dt2[dt3, on = "A"], on = "A"]
dtres <- Reduce(function(...) merge(..., all = TRUE, by = "A"), list(dt1, dt2, dt3))
# or in one go without creating intermediate datatables
dtres <- dcast(dt, A ~ variable)[dcast(dt, A ~ type1)[dcast(dt, A ~ type2, fun=sum, value.var = "g2")[, lapply(.SD, function(x) +(x>=1)) , A], on = "A"], on = "A"]
this results in:
> dtres
A score_one score_two score_three score_four type_1 type_2 type_3 type_4 type_5 type_7 type_9 type_1_g2 type_2_g2 type_3_g2 type_4_g2 type_5_g2 type_7_g2 type_9_g2
1: 1 123.5 324.2 553.1 123.2 2 0 1 0 1 0 0 1 0 0 0 0 0 0
2: 2 223.1 568.2 412.3 225.1 0 2 0 1 1 0 0 0 0 0 1 1 0 0
3: 3 242.2 124.9 435.7 243.6 1 2 0 1 0 0 0 1 1 0 1 0 0 0
4: 4 351.8 323.1 523.1 741.1 0 0 2 0 1 1 0 0 0 1 0 0 0 0
5: 5 123.1 213.4 365.4 951.2 1 1 1 0 0 0 1 0 1 1 0 0 0 1
This approach can be translated into a dplyr/tidyr implementation as follows:
library(dplyr)
library(tidyr)
df <- df1 %>% gather(variable, value,-A) %>%
mutate(type1 = paste0("type_",substr(value,1,1)),
type2 = paste0("type_",substr(value,1,1),"_g2")) %>%
group_by(A,type1) %>%
mutate(g2 = +(+(value - floor(value) >= 0.2)==1),
type1n = n()) %>%
ungroup()
d1 <- df %>% select(1:3) %>% spread(variable, value)
d2 <- df %>% group_by(A, type1) %>% tally() %>% spread(type1, n, fill=0)
d3 <- df %>% group_by(A, type2) %>% summarise(g = any(g2==1)) %>% spread(type2, g, fill=0)
dfres <- left_join(d1, d2, by = "A") %>% left_join(., d3, by = "A")
which gives the same result:
> dfres
A score_one score_two score_three score_four type_1 type_2 type_3 type_4 type_5 type_7 type_9 type_1_g2 type_2_g2 type_3_g2 type_4_g2 type_5_g2 type_7_g2 type_9_g2
1 1 123.5 324.2 553.1 123.2 2 0 1 0 1 0 0 1 0 0 0 0 0 0
2 2 223.1 568.2 412.3 225.1 0 2 0 1 1 0 0 0 0 0 1 1 0 0
3 3 242.2 124.9 435.7 243.6 1 2 0 1 0 0 0 1 1 0 1 0 0 0
4 4 351.8 323.1 523.1 741.1 0 0 2 0 1 1 0 0 0 1 0 0 0 0
5 5 123.1 213.4 365.4 951.2 1 1 1 0 0 0 1 0 1 1 0 0 0 1
Here's a vectorized attempt to first melt and then dcast the data using the data.table package. It needs some polish but I don't have time right now
library(data.table) # v >= 1.9.6
# melt and order by "A"
temp <- setorder(melt(df2, id = 1:5), A)
# Create the "type_n_G2" column names
temp$Var <- paste0(temp$variable, "_G2")
# Selecting only the "score_one", "score_two", "score_three" and "score_four"
indx1 <- indx2 <- temp[2:5]
# Finding the first integer within each number
indx2[] <- sub("(^.{1}).*", "\\1", as.matrix(indx2))
# The works horse: simultaneously compare `indx2` against `type_n` and extract decimals
indx3 <- indx1 * (indx2 == as.numeric(sub(".*_", "", temp$variable))) - floor(indx1)
# Compare the result against 0.2, sum the rows and see if any is greater than 0
temp$res<- +(rowSums(indx3 >= 0.2) > 0)
# Convert back to wide format
dcast(temp, A ~ Var, value.var = "res")
# A type_1_G2 type_2_G2 type_3_G2 type_4_G2 type_5_G2 type_7_G2 type_9_G2
# 1 1 1 0 0 0 0 0 0
# 2 2 0 1 0 1 1 0 0
# 3 3 1 1 0 1 0 0 0
# 4 4 0 0 1 0 0 0 0
# 5 5 0 1 1 0 0 0 1
Now you can just cbind the result to df2 (This doesn't match your result exactly cause your provided data doesn't match it too)
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With