Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Data Smoothing in R

Tags:

r

dplyr

This question is related to this one that I asked before. But referring to that question is not necessary to answer this one.

Data

I have a data set containing velocities of 2169 vehicles recorded at intervals of 0.1 seconds. So, there are many rows for an individual vehicle. Here I am reproducing the data only for the vehicle # 2:

> dput(uma)
structure(list(Frame.ID = 13:445, Vehicle.velocity = c(40, 40, 
40, 40, 40, 40, 40, 40.02, 40.03, 39.93, 39.61, 39.14, 38.61, 
38.28, 38.42, 38.78, 38.92, 38.54, 37.51, 36.34, 35.5, 35.08, 
34.96, 34.98, 35, 34.99, 34.98, 35.1, 35.49, 36.2, 37.15, 38.12, 
38.76, 38.95, 38.95, 38.99, 39.18, 39.34, 39.2, 38.89, 38.73, 
38.88, 39.28, 39.68, 39.94, 40.02, 40, 39.99, 39.99, 39.65, 38.92, 
38.52, 38.8, 39.72, 40.76, 41.07, 40.8, 40.59, 40.75, 41.38, 
42.37, 43.37, 44.06, 44.29, 44.13, 43.9, 43.92, 44.21, 44.59, 
44.87, 44.99, 45.01, 45.01, 45, 45, 45, 44.79, 44.32, 43.98, 
43.97, 44.29, 44.76, 45.06, 45.36, 45.92, 46.6, 47.05, 47.05, 
46.6, 45.92, 45.36, 45.06, 44.96, 44.97, 44.99, 44.99, 44.99, 
44.99, 45.01, 45.02, 44.9, 44.46, 43.62, 42.47, 41.41, 40.72, 
40.49, 40.6, 40.76, 40.72, 40.5, 40.38, 40.43, 40.38, 39.83, 
38.59, 37.02, 35.73, 35.04, 34.85, 34.91, 34.99, 34.99, 34.97, 
34.96, 34.98, 35.07, 35.29, 35.54, 35.67, 35.63, 35.53, 35.53, 
35.63, 35.68, 35.55, 35.28, 35.06, 35.09, 35.49, 36.22, 37.08, 
37.8, 38.3, 38.73, 39.18, 39.62, 39.83, 39.73, 39.58, 39.57, 
39.71, 39.91, 40, 39.98, 39.97, 40.08, 40.38, 40.81, 41.27, 41.69, 
42.2, 42.92, 43.77, 44.49, 44.9, 45.03, 45.01, 45, 45, 45, 45, 
45, 45, 45, 45, 45, 45, 45, 44.99, 45.03, 45.26, 45.83, 46.83, 
48.2, 49.68, 50.95, 51.83, 52.19, 52, 51.35, 50.38, 49.38, 48.63, 
48.15, 47.87, 47.78, 48.01, 48.63, 49.52, 50.39, 50.9, 50.96, 
50.68, 50.3, 50.05, 49.94, 49.87, 49.82, 49.82, 49.88, 49.96, 
50, 50, 49.98, 49.98, 50.16, 50.64, 51.43, 52.33, 53.01, 53.27, 
53.22, 53.25, 53.75, 54.86, 56.36, 57.64, 58.28, 58.29, 57.94, 
57.51, 57.07, 56.64, 56.43, 56.73, 57.5, 58.27, 58.55, 58.32, 
57.99, 57.89, 57.92, 57.74, 57.12, 56.24, 55.51, 55.1, 54.97, 
54.98, 55.02, 55.03, 54.86, 54.3, 53.25, 51.8, 50.36, 49.41, 
49.06, 49.17, 49.4, 49.51, 49.52, 49.51, 49.45, 49.24, 48.84, 
48.29, 47.74, 47.33, 47.12, 47.06, 47.07, 47.08, 47.05, 47.04, 
47.25, 47.68, 47.93, 47.56, 46.31, 44.43, 42.7, 41.56, 41.03, 
40.92, 40.92, 40.98, 41.19, 41.45, 41.54, 41.32, 40.85, 40.37, 
40.09, 39.99, 39.99, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 
39.98, 39.97, 40.1, 40.53, 41.36, 42.52, 43.71, 44.57, 45.01, 
45.1, 45.04, 45, 45, 45, 45, 45, 45, 44.98, 44.97, 45.08, 45.39, 
45.85, 46.2, 46.28, 46.21, 46.29, 46.74, 47.49, 48.35, 49.11, 
49.63, 49.89, 49.94, 49.97, 50.14, 50.44, 50.78, 51.03, 51.12, 
51.05, 50.85, 50.56, 50.26, 50.06, 50.1, 50.52, 51.36, 52.5, 
53.63, 54.46, 54.9, 55.03, 55.09, 55.23, 55.35, 55.35, 55.23, 
55.07, 54.99, 54.98, 54.97, 55.06, 55.37, 55.91, 56.66, 57.42, 
58.07, 58.7, 59.24, 59.67, 59.95, 60.02, 60, 60, 60, 60, 60, 
60.01, 60.06, 60.23, 60.65, 61.34, 62.17, 62.93, 63.53, 64, 64.41, 
64.75, 65.04, 65.3, 65.57, 65.75, 65.74, 65.66, 65.62, 65.71, 
65.91, 66.1, 66.26, 66.44, 66.61, 66.78, 66.91, 66.99, 66.91, 
66.7, 66.56, 66.6, 66.83, 67.17, 67.45, 67.75, 68.15, 68.64, 
69.15, 69.57, 69.79, 69.79, 69.72, 69.72, 69.81, 69.94, 70, 70.01, 
70.02, 70.03)), .Names = c("Frame.ID", "Vehicle.velocity"), class = "data.frame", row.names = c(NA, 
433L))

Frame.ID is the time frame in which the Vehicle.velocity was observed. There is some noise in the velocity variable and I want to smooth it.

Methodology

To smooth the velocity I am using following equation:

equation

where,
Delta = 10
Nalpha = number of data points (rows)
i = 1, ... ,Nalpha (i.e. the row number)
D = minimum of {i-1, Nalpha - i, 3*delta=30}
xalpha = velocity

Question

I have gone through the documentation of filter and convolution in R. It seems that I have to know about convolution to do this. However, I have tried my best and can't understand how convolution works! The linked question has an answer which helped me in understanding some of the inner workings in the function but I am still not sure.
Could anyone here on SO please explain how this thing works? Or guide me to an alternative methodology to achieve the same purpose i.e. apply the equation?

My current code which works but is lengthy

Here is what uma looks like:

> head(uma)
  Frame.ID Vehicle.velocity
1       13               40
2       14               40
3       15               40
4       16               40
5       17               40
6       18               40

uma$i <- 1:nrow(uma)             # this is i
uma$im1 <- uma$i - 1
uma$Nai <- nrow(uma) - uma$i     # this is Nalpha 
uma$delta3 <- 30                 # this is 3 times delta
uma$D <- pmin(uma$im1, uma$Nai, uma$delta3)  # selecting the minimum of {i-1, Nalpha - i, 3*delta=15}
uma$imD <- uma$i - uma$D         # i-D
uma$ipD <- uma$i + uma$D         # i+D

uma <- ddply(uma, .(Frame.ID), transform, k = imD:ipD)  # to include all k in the data frame
umai <- uma
umai$imk <- umai$i - umai$k      # i-k
umai$aimk <- (-1) * abs(umai$imk)  # -|i-k|
umai$delta <- 10                  
umai$kernel <- exp(umai$aimk/umai$delta)   # The kernel in the equation i.e. EXP^-|i-k|/delta
umai$p <- umai$Vehicle.velocity[match(umai$k,umai$i)]   #observed velocity in kth row as described in equation as t(k)
umai$kernelp <- umai$p * umai$kernel       # the product of kernel and observed velocity in kth row as described in equation as t(k)
umair <- ddply(umai, .(Frame.ID), summarize, Z = sum(kernel), prod = sum(kernelp))  # summing the kernel to get Z and summing the product to get the numerator of the equation
umair$new.Y <- umair$prod/umair$Z   # the final step to get the smoothed velocity

Plot

Just for reference, if I plot the observed and smoothed velocities against time frames we can see the result of smoothing:

ggplot() + 
       geom_point(data=uma,aes(y=Vehicle.velocity, x= Frame.ID)) + 
  geom_point(data=umair,aes(y=new.Y, x= Frame.ID), color="red") 

smooth

Please help me making my code short and applicable to all vehicles (represented by Vehicle.ID in the data set) by guiding me about use of convolution.

dplyr

Alright, so I used following code and it works but takes 3 hours on 32 GB RAM. Can anyone suggest improvements to speed it up (1 hour each is taken by umal, umav and umaa)?

uma <- tbl_df(uma)
uma <- uma %>%     # take data frame 
  group_by(Vehicle.ID)  %>%  # group by Vehicle ID
  mutate(i = 1:length(Frame.ID), im1 = i-1, Nai = length(Frame.ID) - i,
         Dv = pmin(im1, Nai, 30),
         Da = pmin(im1, Nai, 120),
         Dl = pmin(im1, Nai, 15),

         imDv = i - Dv,
         ipDv = i + Dv,
         imDa = i - Da,
         ipDa = i + Da,
         imDl = i - Dl,
         ipDl = i + Dl) %>%  # finding i, i-1 and Nalpha-i, D, i-D and i+D for location, velocity and acceleration
  ungroup()



umav <- uma %>%
  group_by(Vehicle.ID, Frame.ID) %>%
  do(data.frame(kv = .$imDv:.$ipDv)) %>%
  left_join(x=., y=uma) %>%
  mutate(imk = i - kv, aimk = (-1) * abs(imk), delta = 10, kernel = exp(aimk/delta)) %>%
  ungroup() %>%
  group_by(Vehicle.ID) %>%
  mutate(p = Vehicle.velocity2[match(kv,i)], kernelp = p * kernel) %>%
  ungroup() %>%
  group_by(Vehicle.ID, Frame.ID) %>%
  summarise(Z = sum(kernel), prod = sum(kernelp)) %>%
  mutate(svel = prod/Z) %>%
  ungroup()



umaa <- uma %>%
  group_by(Vehicle.ID, Frame.ID) %>%
  do(data.frame(ka = .$imDa:.$ipDa)) %>%
  left_join(x=., y=uma) %>%
  mutate(imk = i - ka, aimk = (-1) * abs(imk), delta = 10, kernel = exp(aimk/delta)) %>%
  ungroup() %>%
  group_by(Vehicle.ID) %>%
  mutate(p = Vehicle.acceleration2[match(ka,i)], kernelp = p * kernel) %>%
  ungroup() %>%
  group_by(Vehicle.ID, Frame.ID) %>%
  summarise(Z = sum(kernel), prod = sum(kernelp)) %>%
  mutate(sacc = prod/Z) %>%
  ungroup()




umal <- uma %>%
  group_by(Vehicle.ID, Frame.ID) %>%
  do(data.frame(kl = .$imDl:.$ipDl)) %>%
  left_join(x=., y=uma) %>%
  mutate(imk = i - kl, aimk = (-1) * abs(imk), delta = 10, kernel = exp(aimk/delta)) %>%
  ungroup() %>%
  group_by(Vehicle.ID) %>%
  mutate(p = Local.Y[match(kl,i)], kernelp = p * kernel) %>%
  ungroup() %>%
  group_by(Vehicle.ID, Frame.ID) %>%
  summarise(Z = sum(kernel), prod = sum(kernelp)) %>%
  mutate(ycoord = prod/Z) %>%
  ungroup()

umal <- select(umal,c("Vehicle.ID", "Frame.ID", "ycoord"))
umav <- select(umav, c("Vehicle.ID", "Frame.ID", "svel"))
umaa <- select(umaa, c("Vehicle.ID", "Frame.ID", "sacc"))

umair <- left_join(uma, umal) %>% left_join(x=., y=umav) %>% left_join(x=., y=umaa)
like image 656
umair durrani Avatar asked Oct 17 '14 23:10

umair durrani


People also ask

How do I smooth data in R?

Use of Loess() function: Loess() function is used on a numerical vector to smoothen it. It is also used to predict the Y locally.

What is a smooth in R?

Smoothing attempts to progressively remove the higher frequency behavior to make it easier to describe the lower frequency behavior. Ideally, a small amount of smoothing removes noise, more smoothing removes the seasonal component, and then finally the cyclical component is removed to isolate trend.

What is meant by data smoothing?

What Is Data Smoothing? Data smoothing is done by using an algorithm to remove noise from a data set. This allows important patterns to more clearly stand out. Data smoothing can be used to help predict trends, such as those found in securities prices, as well as in economic analysis.

How do I smooth a time series in R?

You can then use the “SMA()” function to smooth time series data. To use the SMA() function, you need to specify the order (span) of the simple moving average, using the parameter “n”. For example, to calculate a simple moving average of order 5, we set n=5 in the SMA() function.


1 Answers

A good first step would be to take a for loop (which I'll hide with sapply) and perform the exponential smoothing for each index:

josilber1 <- function(uma) {
  delta <- 10
  sapply(1:nrow(uma), function(i) {
    D <- min(i-1, nrow(uma)-i, 30)
    rng <- (i-D):(i+D)
    rng <- rng[rng >= 1 & rng <= nrow(uma)]
    expabs <- exp(-abs(i-rng)/delta)
    return(sum(uma$Vehicle.velocity[rng] * expabs) / sum(expabs))
  })
}

A more involved approach would be to only compute the incremental change in the exponential smoothing function for each index (as opposed to re-summing at each index). The exponential smoothing function has a lower part (data before the current index; I include the current index in low in the code below) and an upper part (data after the current index; high in the code below). As we loop through the vector, all the data in the lower part gets weighted less (we divide by mult) and all the data in the upper part gets weighted more (we multiply by mult). The leftmost element is dropped from low, the leftmost element in high moves to low, and one element is added to the right side of high.

The actual code is a bit messier to deal with the beginning and ending of the vector and to deal with numerical stability issues (errors in high are multiplied by mult each iteration):

josilber2 <- function(uma) {
  delta <- 10
  x <- uma$Vehicle.velocity
  ret <- c(x[1], rep(NA, nrow(uma)-1))
  low <- x[1]
  high <- 0
  norm <- 1
  old.D <- 0
  mult <- exp(1/delta)
  for (i in 2:nrow(uma)) {
    D <- min(i-1, nrow(uma)-i, 30)
    if (D == old.D + 1) {
      low <- low / mult + x[i]
      high <- high * mult - x[i] + x[i+D-1]/mult^(D-1) + x[i+D]/mult^D
      norm <- norm + 2 / mult^D
    } else if (D == old.D) {
      low <- low / mult - x[i-(D+1)]/mult^(D+1) + x[i]
      high <- high * mult - x[i] + x[i+D]/mult^D
    } else {
      low <- low / mult - x[i-(D+2)]/mult^(D+2) - x[i-(D+1)]/mult^(D+1) + x[i]
      high <- high * mult - x[i]
      norm <- norm - 2 / mult^(D+1)
    }

    # For numerical stability, recompute high every so often
    if (i %% 50 == 0) {
      rng <- (i+1):(i+D)
      expabs <- exp(-abs(i-rng)/delta)
      high <- sum(x[rng] * expabs)
    }

    ret[i] <- (low+high)/norm
    old.D <- D
  }
  return(ret)
}

R code like josilber2 can often be sped up considerably using the Rcpp package:

library(Rcpp)
josilber3 <- cppFunction(
"
NumericVector josilber3(NumericVector x) {
  double delta = 10.0;
  NumericVector ret(x.size(), 0.0);
  ret[0] = x[0];
  double low = x[0];
  double high = 0.0;
  double norm = 1.0;
  int oldD = 0;
  double mult = exp(1/delta);
  for (int i=1; i < x.size(); ++i) {
    int D = i;
    if (x.size()-i-1 < D)  D = x.size()-i-1;
    if (30 < D)  D = 30;
    if (D == oldD + 1) {
      low = low / mult + x[i];
      high = high * mult - x[i] + x[i+D-1]/pow(mult, D-1) + x[i+D]/pow(mult, D);
      norm = norm + 2 / pow(mult, D);
    } else if (D == oldD) {
      low = low / mult - x[i-(D+1)]/pow(mult, D+1) + x[i];
      high = high * mult - x[i] + x[i+D]/pow(mult, D);
    } else {
      low = low / mult - x[i-(D+2)]/pow(mult, D+2) - x[i-(D+1)]/pow(mult, D+1) + x[i];
      high = high * mult - x[i];
      norm = norm - 2 / pow(mult, D+1);
    }

    if (i % 50 == 0) {
      high = 0.0;
      for (int j=i+1; j <= i+D; ++j) {
        high += x[j] * exp((i-j)/delta);
      }
    }

    ret[i] = (low+high)/norm;
    oldD = D;
  }
  return ret;
}")

We can now benchmark the improvements from these three new approaches:

all.equal(umair.fxn(uma), josilber1(uma))
# [1] TRUE
all.equal(umair.fxn(uma), josilber2(uma))
# [1] TRUE
all.equal(umair.fxn(uma), josilber3(uma$Vehicle.velocity))
# [1] TRUE
library(microbenchmark)
microbenchmark(umair.fxn(uma), josilber1(uma), josilber2(uma), josilber3(uma$Vehicle.velocity))
# Unit: microseconds
#                             expr        min          lq         mean     median         uq        max neval
#                   umair.fxn(uma) 370006.728 382327.4115 398554.71080 393495.052 404186.153 572801.355   100
#                   josilber1(uma)  12879.268  13640.1310  15981.82099  14265.610  14805.419  28959.230   100
#                   josilber2(uma)   4324.724   4502.8125   5753.47088   4918.835   5244.309  17328.797   100
#  josilber3(uma$Vehicle.velocity)     41.582     54.5235     57.76919     57.435     60.099     90.998   100

We got a lot of improvement (25x) with the simpler josilber1 and a 70x total speedup with josilber2 (the advantage would be more with a larger delta value). With josilber3 we achieve a 6800x speedup, getting the runtime all the way down to 54 microseconds to process a single vehicle!

like image 55
josliber Avatar answered Sep 29 '22 02:09

josliber