Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Remove duplicated words from data frame

Tags:

r

Let's use an example for that:

> dput(data)
structure(list(mpg = c(15.2, 10.4, 13.3, 14.7, 22.8, 15.5, 14.3, 
19.7, 32.4, 27.3, 15.8, 30.4, 21.4, 18.7, 10.4, 30.4, 15, 21, 
21, 22.8, 24.4, 19.2, 17.8, 16.4, 17.3, 15.2, 19.2, 26, 33.9, 
21.5, 18.1, 21.4), cyl = c(8, 8, 8, 8, 4, 8, 8, 6, 4, 4, 8, 4, 
6, 8, 8, 4, 8, 6, 6, 4, 4, 6, 6, 8, 8, 8, 8, 4, 4, 4, 6, 4), 
    disp = c(304, 472, 350, 440, 108, 318, 360, 145, 78.7, 79, 
    351, 75.7, 258, 360, 460, 95.1, 301, 160, 160, 140.8, 146.7, 
    167.6, 167.6, 275.8, 275.8, 275.8, 400, 120.3, 71.1, 120.1, 
    225, 121), hp = c(150, 205, 245, 230, 93, 150, 245, 175, 
    66, 66, 264, 52, 110, 175, 215, 113, 335, 110, 110, 95, 62, 
    123, 123, 180, 180, 180, 175, 91, 65, 97, 105, 109), drat = c(3.15, 
    2.93, 3.73, 3.23, 3.85, 2.76, 3.21, 3.62, 4.08, 4.08, 4.22, 
    4.93, 3.08, 3.15, 3, 3.77, 3.54, 3.9, 3.9, 3.92, 3.69, 3.92, 
    3.92, 3.07, 3.07, 3.07, 3.08, 4.43, 4.22, 3.7, 2.76, 4.11
    ), wt = c(3.435, 5.25, 3.84, 5.345, 2.32, 3.52, 3.57, 2.77, 
    2.2, 1.935, 3.17, 1.615, 3.215, 3.44, 5.424, 1.513, 3.57, 
    2.62, 2.875, 3.15, 3.19, 3.44, 3.44, 4.07, 3.73, 3.78, 3.845, 
    2.14, 1.835, 2.465, 3.46, 2.78), qsec = c(17.3, 17.98, 15.41, 
    17.42, 18.61, 16.87, 15.84, 15.5, 19.47, 18.9, 14.5, 18.52, 
    19.44, 17.02, 17.82, 16.9, 14.6, 16.46, 17.02, 22.9, 20, 
    18.3, 18.9, 17.4, 17.6, 18, 17.05, 16.7, 19.9, 20.01, 20.22, 
    18.6), vs = c(0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 
    1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1), am = c(0, 
    0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 
    0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1), gear = c(3, 3, 3, 3, 
    4, 3, 3, 5, 4, 4, 5, 4, 3, 3, 3, 5, 5, 4, 4, 4, 4, 4, 4, 
    3, 3, 3, 3, 5, 4, 3, 3, 4), carb = c(2, 4, 4, 4, 1, 2, 4, 
    6, 1, 1, 4, 2, 1, 2, 4, 2, 8, 4, 4, 2, 2, 4, 4, 3, 3, 3, 
    2, 2, 1, 1, 1, 2), car = structure(c(18L, 19L, 5L, 13L, 14L, 
    31L, 7L, 21L, 20L, 22L, 23L, 24L, 25L, 26L, 2L, 15L, 4L, 
    9L, 12L, 29L, 30L, 6L, 1L, 3L, 27L, 10L, 28L, 16L, 11L, 8L, 
    17L, 32L), .Label = c("AMC Javelin, AMC Javelin, AMC Javelin, AMC Javelin, AMC Javelin", 
    "Cadillac Fleetwood", "Camaro Z28", "Chrysler Imperial", 
    "Datsun 710, Datsun 710, Datsun 710, Datsun 710", "Dodge Challenger", 
    "Duster 360", "Ferrari Dino, Ferrari Dino, Ferrari Dino, Ferrari Dino, Ferrari Dino", 
    "Fiat 128, Fiat 128, Fiat 128, Fiat 128, Fiat 128", "Fiat X1-9", 
    "Ford Pantera L", "Honda Civic, Honda Civic, Honda Civic, Honda Civic, Honda Civic", 
    "Hornet 4 Drive, Hornet 4 Drive, Hornet 4 Drive, Hornet 4 Drive, Hornet 4 Drive", 
    "Hornet Sportabout", "Lincoln Continental", "Lotus Europa", 
    "Maserati Bora", "Mazda RX4", "Mazda RX4 Wag, Mazda RX4 Wag, Mazda RX4 Wag, Mazda RX4 Wag, Mazda RX4 Wag", 
    "Merc 230", "Merc 240D", "Merc 280", "Merc 280C", "Merc 450SE", 
    "Merc 450SL", "Merc 450SLC", "Pontiac Firebird", "Porsche 914-2", 
    "Toyota Corolla", "Toyota Corona", "Valiant", "Volvo 142E"
    ), class = "factor")), .Names = c("mpg", "cyl", "disp", "hp", 
"drat", "wt", "qsec", "vs", "am", "gear", "carb", "car"), row.names = c("Mazda RX4", 
"Mazda RX4 Wag", "Datsun 710", "Hornet 4 Drive", "Hornet Sportabout", 
"Valiant", "Duster 360", "Merc 240D", "Merc 230", "Merc 280", 
"Merc 280C", "Merc 450SE", "Merc 450SL", "Merc 450SLC", "Cadillac Fleetwood", 
"Lincoln Continental", "Chrysler Imperial", "Fiat 128", "Honda Civic", 
"Toyota Corolla", "Toyota Corona", "Dodge Challenger", "AMC Javelin", 
"Camaro Z28", "Pontiac Firebird", "Fiat X1-9", "Porsche 914-2", 
"Lotus Europa", "Ford Pantera L", "Ferrari Dino", "Maserati Bora", 
"Volvo 142E"), class = "data.frame")

As you can see, we have some duplicated words in the column car. I would like to keep just one word/phrase in each cell. In my original data the words are separated by commas as well. It's possible that in my original data the same word also appears in a different column, and I would like to leave those columns as they are. The function should be applied only to this specific column.

like image 838
Shaxi Liver Avatar asked Jul 03 '15 11:07

Shaxi Liver


2 Answers

One option is to use a regular expression: if a cell contains a comma, keep only the text that comes before the first comma.

# Strip everything from the first comma onward so each cell keeps a single
# name, and write the result back to the `car` column only.
# Note: sub() returns a character vector, so `car` is no longer a factor.
data$car <- sub(",.*", "", data$car)
like image 97
David Arenburg Avatar answered Nov 07 '22 16:11

David Arenburg


Another option is to use cSplit from the splitstackshape package:

library(splitstackshape)
# Split `car` on commas into one row per entry (long format), then drop the
# resulting identical rows with unique(). The example data frame is named
# `data`; a bare `df` would resolve to the stats::df function and fail.
unique(cSplit(data, "car", sep = ',', direction = "long"))

#     mpg cyl  disp  hp drat    wt  qsec vs am gear carb                 car
# 1: 15.2   8 304.0 150 3.15 3.435 17.30  0  0    3    2           Mazda RX4
# 2: 10.4   8 472.0 205 2.93 5.250 17.98  0  0    3    4       Mazda RX4 Wag
# 3: 13.3   8 350.0 245 3.73 3.840 15.41  0  0    3    4          Datsun 710
# 4: 14.7   8 440.0 230 3.23 5.345 17.42  0  0    3    4      Hornet 4 Drive
# 5: 22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1   Hornet Sportabout
# 6: 15.5   8 318.0 150 2.76 3.520 16.87  0  0    3    2             Valiant
# 7: 14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4          Duster 360
# 8: 19.7   6 145.0 175 3.62 2.770 15.50  0  1    5    6           Merc 240D
# 9: 32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1            Merc 230
#10: 27.3   4  79.0  66 4.08 1.935 18.90  1  1    4    1            Merc 280
#11: 15.8   8 351.0 264 4.22 3.170 14.50  0  1    5    4           Merc 280C
#12: 30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2          Merc 450SE
#13: 21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1          Merc 450SL
#14: 18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2         Merc 450SLC
#15: 10.4   8 460.0 215 3.00 5.424 17.82  0  0    3    4  Cadillac Fleetwood
#...
like image 20
Veerendra Gadekar Avatar answered Nov 07 '22 16:11

Veerendra Gadekar