library(ggplot2)
data(diamonds)
str(diamonds)
## 'data.frame': 53940 obs. of 10 variables:
## $ carat : num 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
## $ cut : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
## $ color : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
## $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
## $ depth : num 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
## $ table : num 55 61 65 58 58 57 57 55 61 61 ...
## $ price : int 326 326 327 334 335 336 336 337 337 338 ...
## $ x : num 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
## $ y : num 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
## $ z : num 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
This is my crosstab
table(diamonds$cut,diamonds$color)
##
## D E F G H I J
## Fair 163 224 312 314 303 175 119
## Good 662 933 909 871 702 522 307
## Very Good 1513 2400 2164 2299 1824 1204 678
## Premium 1603 2337 2331 2924 2360 1428 808
## Ideal 2834 3903 3826 4884 3115 2093 896
But instead of getting the counts I want the average price (or mean(price) or even max(price)
I tried the Hmisc package but it gives me data in the longer format I needed it to the in the table format above
summarize(diamonds$price,llist(diamonds$color,diamonds$clarity),max)
## diamonds$color diamonds$clarity diamonds$price
## 1 D I1 15964
## 4 D SI2 18693
## 3 D SI1 18468
## 6 D VS2 18318
## 5 D VS1 17936
## 8 D VVS2 17545
## 7 D VVS1 17932
## 2 D IF 18542
## 9 E I1 11548
## 12 E SI2 18477
## 11 E SI1 18731
## 14 E VS2 18557
## 13 E VS1 18729
## 16 E VVS2 18188
## 15 E VVS1 16256
## 10 E IF 18700
## 17 F I1 10685
## 20 F SI2 18784
## 19 F SI1 18759
## 22 F VS2 18791
## 21 F VS1 18780
## 24 F VVS2 18614
## 23 F VVS1 18777
## 18 F IF 18552
## 25 G I1 13203
## 28 G SI2 18804
## 27 G SI1 18818
## 30 G VS2 18700
## 29 G VS1 18419
## 32 G VVS2 18768
## 31 G VVS1 18445
## 26 G IF 18806
## 33 H I1 17329
## 36 H SI2 18745
## 35 H SI1 18803
## 38 H VS2 18659
## 37 H VS1 18522
## 40 H VVS2 17267
## 39 H VVS1 14603
## 34 H IF 16300
## 41 I I1 16193
## 44 I SI2 18756
## 43 I SI1 18797
## 46 I VS2 18823
## 45 I VS1 18795
## 48 I VVS2 15952
## 47 I VVS1 15654
## 42 I IF 12725
## 49 J I1 18531
## 52 J SI2 18710
## 51 J SI1 18508
## 54 J VS2 18701
## 53 J VS1 18706
## 56 J VVS2 17214
## 55 J VVS1 17891
## 50 J IF 18594
Data concerning two categorical (i.e., nominal- or ordinal-level) variables can be displayed in a two-way contingency table, clustered bar chart, or stacked bar chart.
Cross tabulation is used to quantitatively analyze the relationship between multiple variables. Cross tabulations — also referred to as contingency tables or crosstabs — group variables together and enable researchers to understand the correlation between the different variables.
This test is used to determine if two categorical variables are independent or if they are in fact related to one another. If two categorical variables are independent, then the value of one variable does not change the probability distribution of the other.
Try
library(reshape2)
acast(diamonds, cut~color, value.var='price', mean)
# D E F G H I J
#Fair 4291.061 3682.312 3827.003 4239.255 5135.683 4685.446 4975.655
#Good 3405.382 3423.644 3495.750 4123.482 4276.255 5078.533 4574.173
#Very Good 3470.467 3214.652 3778.820 3872.754 4535.390 5255.880 5103.513
#Premium 3631.293 3538.914 4324.890 4500.742 5216.707 5946.181 6294.592
#Ideal 2629.095 2597.550 3374.939 3720.706 3889.335 4451.970 4918.186
Or using base R
with(diamonds, tapply(price, list(cut,color), FUN= mean))
# D E F G H I J
#Fair 4291.061 3682.312 3827.003 4239.255 5135.683 4685.446 4975.655
#Good 3405.382 3423.644 3495.750 4123.482 4276.255 5078.533 4574.173
#Very Good 3470.467 3214.652 3778.820 3872.754 4535.390 5255.880 5103.513
#Premium 3631.293 3538.914 4324.890 4500.742 5216.707 5946.181 6294.592
#Ideal 2629.095 2597.550 3374.939 3720.706 3889.335 4451.970 4918.186
Or as @DavidArenburg suggested
xtabs(price ~ cut + color, diamonds)/table(diamonds[c('cut', 'color')])
# color
#cut D E F G H I J
#Fair 4291.061 3682.312 3827.003 4239.255 5135.683 4685.446 4975.655
#Good 3405.382 3423.644 3495.750 4123.482 4276.255 5078.533 4574.173
#Very Good 3470.467 3214.652 3778.820 3872.754 4535.390 5255.880 5103.513
#Premium 3631.293 3538.914 4324.890 4500.742 5216.707 5946.181 6294.592
#Ideal 2629.095 2597.550 3374.939 3720.706 3889.335 4451.970 4918.186
You may also try dcast
from the devel version of data.table
i.e. v1.9.5.
library(data.table)
dcast(as.data.table(diamonds), cut~color, value.var='price', mean)
If the grouping variables are 'clarity' and 'color'
with(diamonds, tapply(price, list(clarity,color), FUN = mean))
For other functions, change the FUN
in tapply
or fun.aggregate
in acast/dcast
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With