First of all we will load all the libraries used in this section.
library(readr)
library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1 ✔ purrr 0.3.2
## ✔ tibble 2.1.3 ✔ dplyr 0.8.2
## ✔ tidyr 0.8.3 ✔ stringr 1.4.0
## ✔ ggplot2 3.2.1 ✔ forcats 0.4.0
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
library(dplyr)
The data are data on wine prices and age and origins of wines.
We will use read_delim() to import the dataset:
vindata <- read_delim("vindata.csv", ";", trim_ws = TRUE)
## Parsed with column specification:
## cols(
## pris = col_number(),
## dnkval = col_double(),
## distrikt = col_double(),
## alder = col_double(),
## bestlist = col_double(),
## Bordeaux = col_double(),
## LanguedocRoussilion = col_double(),
## Andre = col_double(),
## Burgund = col_double()
## )
head(vindata)
## # A tibble: 6 x 9
## pris dnkval distrikt alder bestlist Bordeaux LanguedocRoussi… Andre
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 153 88 5 0 1 0 0 1
## 2 169 85 5 0 1 0 0 1
## 3 169 85 5 0 1 0 0 1
## 4 109 81 5 0 0 0 0 1
## 5 1595 78 5 2 1 0 0 1
## 6 1259 76 5 1 1 0 0 1
## # … with 1 more variable: Burgund <dbl>
We will plot data using head() and a random sample
head(vindata)
## # A tibble: 6 x 9
## pris dnkval distrikt alder bestlist Bordeaux LanguedocRoussi… Andre
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 153 88 5 0 1 0 0 1
## 2 169 85 5 0 1 0 0 1
## 3 169 85 5 0 1 0 0 1
## 4 109 81 5 0 0 0 0 1
## 5 1595 78 5 2 1 0 0 1
## 6 1259 76 5 1 1 0 0 1
## # … with 1 more variable: Burgund <dbl>
dplyr::sample_n(vindata, 10)
## # A tibble: 10 x 9
## pris dnkval distrikt alder bestlist Bordeaux LanguedocRoussi… Andre
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 169 83 1 1 1 0 0 0
## 2 5898 91 1 4 1 0 0 0
## 3 456 94 4 4 1 0 0 0
## 4 199 85 4 4 1 0 0 0
## 5 2699 91 4 3 1 0 0 0
## 6 1699 87 4 4 1 0 0 0
## 7 136 77 4 1 1 0 0 0
## 8 126 82 4 1 1 0 0 0
## 9 457 94 4 3 1 0 0 0
## 10 1799 79 2 1 1 1 0 0
## # … with 1 more variable: Burgund <dbl>
And now we will visualize using ggplot
ggplot(vindata, aes(x=factor(alder), y=dnkval)) + geom_boxplot()
We can also add a jitter plot or dot plot:
ggplot(vindata, aes(x=factor(alder), y=dnkval)) + geom_boxplot() + geom_jitter(shape=16, position=position_jitter(0.2))
ggplot(vindata, aes(x=factor(alder), y=dnkval)) + geom_boxplot() + geom_dotplot(binaxis='y', stackdir='center', dotsize=1)
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(vindata, aes(x=factor(alder), y=dnkval)) + geom_boxplot(position=position_dodge(1)) + geom_boxplot()
vindata <- vindata %>% mutate(alder1 = ifelse(alder %in% 0:1, "0",
ifelse(alder %in% 2:3, "1",
ifelse(alder %in% 4:5, "2", "3") )))
group_by(vindata, alder1, Andre) %>%
summarise(
count = n(),
mean = mean(dnkval, na.rm = TRUE),
sd = sd(dnkval, na.rm = TRUE)
)
## # A tibble: 8 x 5
## # Groups: alder1 [4]
## alder1 Andre count mean sd
## <chr> <dbl> <int> <dbl> <dbl>
## 1 0 0 38 81.8 5.70
## 2 0 1 12 79.4 5.00
## 3 1 0 67 85.7 5.58
## 4 1 1 8 81.1 4.29
## 5 2 0 58 85.3 5.96
## 6 2 1 1 73 NA
## 7 3 0 32 85.7 5.47
## 8 3 1 2 89.5 0.707
ggplot(vindata, aes(x=factor(alder1), y=dnkval)) + geom_boxplot(position=position_dodge(1)) + geom_boxplot()
We will now see how we can perform an anova measurement:
# Compute the analysis of variance
res.aov <- aov(dnkval ~ factor(alder1) , data = vindata)
summary(res.aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## factor(alder1) 3 671 223.82 6.85 0.000199 ***
## Residuals 214 6993 32.68
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The p-value is less than 0.05, and hence we can conclude that there are significant differences between different age groups and the the quality of the wine (highlighted with “*" in the model summary).
A significant p-value indicates that some of the group means are different, however we don’t know which pairs of groups are different. To find out we need to perform a post-hoc analysis.
TukeyHSD(res.aov)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = dnkval ~ factor(alder1), data = vindata)
##
## $`factor(alder1)`
## diff lwr upr p adj
## 1-0 4.0333333 1.330954 6.735712 0.0008446
## 2-0 3.8986441 1.053468 6.743820 0.0026688
## 3-0 4.7211765 1.430978 8.011375 0.0014678
## 2-1 -0.1346893 -2.710433 2.441054 0.9991115
## 3-1 0.6878431 -2.372359 3.748045 0.9374140
## 3-2 0.8225324 -2.364474 4.009539 0.9089343
Here we see that only the young wines 0-1 year are rated lower than the rest of the wines.
Lets just do the same with the wine prices:
# Compute the analysis of variance
res.aov <- aov(pris ~ factor(alder1) , data = vindata)
summary(res.aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## factor(alder1) 3 1.288e+08 42940976 3.698 0.0126 *
## Residuals 214 2.485e+09 11611217
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
TukeyHSD(res.aov)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = pris ~ factor(alder1), data = vindata)
##
## $`factor(alder1)`
## diff lwr upr p adj
## 1-0 1124.7400 -486.1500 2735.630 0.2724331
## 2-0 1162.0261 -533.9853 2858.037 0.2886764
## 3-0 2514.8247 553.5352 4476.114 0.0057905
## 2-1 37.2861 -1498.1165 1572.689 0.9999105
## 3-1 1390.0847 -434.1037 3214.273 0.2014154
## 3-2 1352.7986 -546.9780 3252.575 0.2558044
It looks like there might be a difference between young wines (0-1 years) and aged wines (7-8 years). However, we jumped ahead and did not look at the price data. Let us just have a look:
ggplot(vindata, aes(x=factor(alder1), y=pris)) + geom_boxplot()
It does not look normally distributed. Furthermore we can test the homogenecity of variances using a plot and levene’s test:
plot(res.aov, 2)
leveneTest(pris ~ alder1, data = vindata)
## Warning in leveneTest.default(y = y, group = group, ...): group coerced to
## factor.
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 3 4.9927 0.002286 **
## 214
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Here we see that the plot of residuals is skewed and p-value is less than 0.05, which means that there is evidence that the varriance among groups is statistically different.
Therefore we have to use a non-parametric alternative (or we might be able to log-transform the data).
kruskal.test(pris ~ alder1, data = vindata)
##
## Kruskal-Wallis rank sum test
##
## data: pris by alder1
## Kruskal-Wallis chi-squared = 7.0452, df = 3, p-value = 0.07047
Here the data does not seem to be significantly different.
Just to show you the two-way anova. It is quit simple in the aov() function and require the same as when you create another function, i.e. the lm() function:
res.aov <- aov(dnkval ~ alder1 + Andre , data = vindata)
summary(res.aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## alder1 3 671 223.82 7.004 0.000163 ***
## Andre 1 186 186.09 5.823 0.016663 *
## Residuals 213 6807 31.96
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
From the ANOVA table we can conclude that both factors are statistically significant. Note that we assume normal distribution, equal varriances and independent factors.
res.aov <- aov(dnkval ~ alder1 + Andre + alder1:Andre , data = vindata)
summary(res.aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## alder1 3 671 223.82 7.108 0.000143 ***
## Andre 1 186 186.09 5.910 0.015897 *
## alder1:Andre 3 194 64.73 2.056 0.107185
## Residuals 210 6613 31.49
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1