Character-location identity to create a new variable

Question

lets first take some random data

A <- c(1:5)
score_one <- c(123.5, 223.1, 242.2, 351.8, 123.1)
score_two <- c(324.2, 568.2, 124.9, 323.1, 213.4)
score_three <- c(553.1, 412.3, 435.7, 523.1, 365.4)
score_four <- c(123.2, 225.1, 243.6, 741.1, 951.2)


df1 <- data.frame(A, score_one, score_two, score_three, score_four)

library(dplyr)
library(tidyr)

df2 <- df1 %>% 
  group_by(A) %>% 
  mutate_each(funs(substr(.,1,1))) %>%                
  ungroup %>%
  gather(variable, type, -c(A)) %>%                     
  select(-variable) %>%
  mutate(type = paste0("type_",type),
         value = 1) %>%
  group_by(A,type) %>%                                     
  summarise(value = sum(value)) %>% 
  ungroup %>%
  spread(type, value, fill=0) %>%                       
  inner_join(df1, by=c("A")) %>%                            
  select(A, starts_with("score_"), starts_with("type_"))

This introduces a summary variable for each score_ and counts the frequency of each unique first-digit number

Hence we see in row one, type_1 == 2. Because in the corresponding score_ columns we have 2 occurrences where the number 1 is the first number

Problem Statement
now we want introduce a variable that calls upon the type_n columns.

It checks if the value > 0.
when this is the case, we want to inspect the corresponding score_ column/s
here we analyse if the digit after the decimal place is >= than the value 2
now if one or all of the corresponding rows' value after the decimal place are >= 2, we want to assign a value of 1
if all of the corresponding rows' value after the decimal place are < 2, we want to assign a value of 0
and consequnetly, if type_n == 0, we want to assign a 0
say we name this variable $type_n_G2

such that the desired output should look like1

enter image description here

Taking for example, type_1_G2

we have type_1 == 2
we have the corresponding identities at score_one and score_four
both values after the decimal place are >= 2, so we assign type_1_G2==1

Jaap · Accepted Answer

There is no need for the complicated construction of df2 in my opinion. A reshaped df1 into long format is a better starting point to get to the desired end result in fewer steps.

An approach using the data.table package:

library(data.table)
# melting the original dataframe 'df1' to a long format datatable
dt <- melt(setDT(df1), "A")

# creating two type variables & a logical vector indicating whether
# the decimal for a specific type is equal or above .2
dt[, `:=` (type1=paste0("type_",substr(value,1,1)),
           type2=paste0("type_",substr(value,1,1),"_g2"))
   ][, g2 := +(+(value - floor(value) >= 0.2)==1), .(A,type1)]

# creating separate wide datatables for the variable & two type columns
dt1 <- dcast(dt, A ~ variable)
dt2 <- dcast(dt, A ~ type1)
dt3 <- dcast(dt, A ~ type2, fun=sum, value.var="g2")[, lapply(.SD, function(x) +(x>=1)), A]

# two options for merging the wide datatables together into one
dtres <- dt1[dt2[dt3, on = "A"], on = "A"]
dtres <- Reduce(function(...) merge(..., all = TRUE, by = "A"), list(dt1, dt2, dt3))

# or in one go without creating intermediate datatables
dtres <- dcast(dt, A ~ variable)[dcast(dt, A ~ type1)[dcast(dt, A ~ type2, fun=sum, value.var = "g2")[, lapply(.SD, function(x) +(x>=1)) , A], on = "A"], on = "A"]

this results in:

> dtres
   A score_one score_two score_three score_four type_1 type_2 type_3 type_4 type_5 type_7 type_9 type_1_g2 type_2_g2 type_3_g2 type_4_g2 type_5_g2 type_7_g2 type_9_g2
1: 1     123.5     324.2       553.1      123.2      2      0      1      0      1      0      0         1         0         0         0         0         0         0
2: 2     223.1     568.2       412.3      225.1      0      2      0      1      1      0      0         0         0         0         1         1         0         0
3: 3     242.2     124.9       435.7      243.6      1      2      0      1      0      0      0         1         1         0         1         0         0         0
4: 4     351.8     323.1       523.1      741.1      0      0      2      0      1      1      0         0         0         1         0         0         0         0
5: 5     123.1     213.4       365.4      951.2      1      1      1      0      0      0      1         0         1         1         0         0         0         1

This approach can be translated into a dplyr/tidyr implementation as follows:

library(dplyr)
library(tidyr)

df <- df1 %>% gather(variable, value,-A) %>%
  mutate(type1 = paste0("type_",substr(value,1,1)),
         type2 = paste0("type_",substr(value,1,1),"_g2")) %>%
  group_by(A,type1) %>%
  mutate(g2 = +(+(value - floor(value) >= 0.2)==1),
         type1n = n()) %>%
  ungroup()

d1 <- df %>% select(1:3) %>% spread(variable, value)
d2 <- df %>% group_by(A, type1) %>% tally() %>% spread(type1, n, fill=0)
d3 <- df %>% group_by(A, type2) %>% summarise(g = any(g2==1)) %>% spread(type2, g, fill=0)

dfres <- left_join(d1, d2, by = "A") %>% left_join(., d3, by = "A")

which gives the same result:

> dfres
  A score_one score_two score_three score_four type_1 type_2 type_3 type_4 type_5 type_7 type_9 type_1_g2 type_2_g2 type_3_g2 type_4_g2 type_5_g2 type_7_g2 type_9_g2
1 1     123.5     324.2       553.1      123.2      2      0      1      0      1      0      0         1         0         0         0         0         0         0
2 2     223.1     568.2       412.3      225.1      0      2      0      1      1      0      0         0         0         0         1         1         0         0
3 3     242.2     124.9       435.7      243.6      1      2      0      1      0      0      0         1         1         0         1         0         0         0
4 4     351.8     323.1       523.1      741.1      0      0      2      0      1      1      0         0         0         1         0         0         0         0
5 5     123.1     213.4       365.4      951.2      1      1      1      0      0      0      1         0         1         1         0         0         0         1

David Arenburg · Answer

Here's a vectorized attempt to first melt and then dcast the data using the data.table package. It needs some polish but I don't have time right now

library(data.table) # v >= 1.9.6
# melt and order by "A" 
temp <- setorder(melt(df2, id = 1:5), A)

# Create the "type_n_G2" column names
temp$Var <- paste0(temp$variable, "_G2")

# Selecting only the "score_one", "score_two", "score_three" and "score_four"
indx1 <- indx2 <- temp[2:5]

# Finding the first integer within each number
indx2[] <- sub("(^.{1}).*", "\1", as.matrix(indx2))

# The works horse: simultaneously compare `indx2` against `type_n` and extract decimals
indx3 <- indx1 * (indx2 == as.numeric(sub(".*_", "", temp$variable))) - floor(indx1)

# Compare the result against 0.2, sum the rows and see if any is greater than 0
temp$res<- +(rowSums(indx3 >= 0.2) > 0)

# Convert back to wide format
dcast(temp, A ~ Var, value.var = "res")
#   A type_1_G2 type_2_G2 type_3_G2 type_4_G2 type_5_G2 type_7_G2 type_9_G2
# 1 1         1         0         0         0         0         0         0
# 2 2         0         1         0         1         1         0         0
# 3 3         1         1         0         1         0         0         0
# 4 4         0         0         1         0         0         0         0
# 5 5         0         1         1         0         0         0         1

Now you can just cbind the result to df2 (This doesn't match your result exactly cause your provided data doesn't match it too)

Character-location identity to create a new variable

Tags:

r

dplyr

tidyr

lukeg

2 Answers

Jaap

David Arenburg

Recent Activity

Donate For Us

Character-location identity to create a new variable

Tags:

r

dplyr

tidyr

lukeg

2 Answers

Jaap

David Arenburg

Related questions

Recent Activity

Donate For Us