First of all we will load all the libraries used in this section.

library(readr)
library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1     ✔ purrr   0.3.2
## ✔ tibble  2.1.3     ✔ dplyr   0.8.2
## ✔ tidyr   0.8.3     ✔ stringr 1.4.0
## ✔ ggplot2 3.2.1     ✔ forcats 0.4.0
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
library(dplyr)

Importing data

The data are data on wine prices and age and origins of wines.

We will use read_delim() to import the dataset:

vindata <- read_delim("vindata.csv", ";", trim_ws = TRUE)
## Parsed with column specification:
## cols(
##   pris = col_number(),
##   dnkval = col_double(),
##   distrikt = col_double(),
##   alder = col_double(),
##   bestlist = col_double(),
##   Bordeaux = col_double(),
##   LanguedocRoussilion = col_double(),
##   Andre = col_double(),
##   Burgund = col_double()
## )
head(vindata)
## # A tibble: 6 x 9
##    pris dnkval distrikt alder bestlist Bordeaux LanguedocRoussi… Andre
##   <dbl>  <dbl>    <dbl> <dbl>    <dbl>    <dbl>            <dbl> <dbl>
## 1   153     88        5     0        1        0                0     1
## 2   169     85        5     0        1        0                0     1
## 3   169     85        5     0        1        0                0     1
## 4   109     81        5     0        0        0                0     1
## 5  1595     78        5     2        1        0                0     1
## 6  1259     76        5     1        1        0                0     1
## # … with 1 more variable: Burgund <dbl>

Cleaning and visualising data

We will plot data using head() and a random sample

head(vindata)
## # A tibble: 6 x 9
##    pris dnkval distrikt alder bestlist Bordeaux LanguedocRoussi… Andre
##   <dbl>  <dbl>    <dbl> <dbl>    <dbl>    <dbl>            <dbl> <dbl>
## 1   153     88        5     0        1        0                0     1
## 2   169     85        5     0        1        0                0     1
## 3   169     85        5     0        1        0                0     1
## 4   109     81        5     0        0        0                0     1
## 5  1595     78        5     2        1        0                0     1
## 6  1259     76        5     1        1        0                0     1
## # … with 1 more variable: Burgund <dbl>
dplyr::sample_n(vindata, 10)
## # A tibble: 10 x 9
##     pris dnkval distrikt alder bestlist Bordeaux LanguedocRoussi… Andre
##    <dbl>  <dbl>    <dbl> <dbl>    <dbl>    <dbl>            <dbl> <dbl>
##  1   169     83        1     1        1        0                0     0
##  2  5898     91        1     4        1        0                0     0
##  3   456     94        4     4        1        0                0     0
##  4   199     85        4     4        1        0                0     0
##  5  2699     91        4     3        1        0                0     0
##  6  1699     87        4     4        1        0                0     0
##  7   136     77        4     1        1        0                0     0
##  8   126     82        4     1        1        0                0     0
##  9   457     94        4     3        1        0                0     0
## 10  1799     79        2     1        1        1                0     0
## # … with 1 more variable: Burgund <dbl>

And now we will visualize using ggplot

ggplot(vindata, aes(x=factor(alder), y=dnkval)) + geom_boxplot()

We can also add a jitter plot or dot plot:

ggplot(vindata, aes(x=factor(alder), y=dnkval)) + geom_boxplot() + geom_jitter(shape=16, position=position_jitter(0.2))

ggplot(vindata, aes(x=factor(alder), y=dnkval)) + geom_boxplot() + geom_dotplot(binaxis='y', stackdir='center', dotsize=1)
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(vindata, aes(x=factor(alder), y=dnkval)) + geom_boxplot(position=position_dodge(1)) + geom_boxplot()

vindata <- vindata %>% mutate(alder1 = ifelse(alder %in%  0:1, "0", 
                                              ifelse(alder %in%  2:3, "1",
                                              ifelse(alder %in% 4:5, "2", "3") )))

Compute mean and SD by groups using dplyr R package:

group_by(vindata, alder1, Andre) %>%
  summarise(
    count = n(),
    mean = mean(dnkval, na.rm = TRUE),
    sd = sd(dnkval, na.rm = TRUE)
  )
## # A tibble: 8 x 5
## # Groups:   alder1 [4]
##   alder1 Andre count  mean     sd
##   <chr>  <dbl> <int> <dbl>  <dbl>
## 1 0          0    38  81.8  5.70 
## 2 0          1    12  79.4  5.00 
## 3 1          0    67  85.7  5.58 
## 4 1          1     8  81.1  4.29 
## 5 2          0    58  85.3  5.96 
## 6 2          1     1  73   NA    
## 7 3          0    32  85.7  5.47 
## 8 3          1     2  89.5  0.707
ggplot(vindata, aes(x=factor(alder1), y=dnkval)) + geom_boxplot(position=position_dodge(1)) + geom_boxplot()

We will now see how we can perform an anova measurement:

# Compute the analysis of variance
res.aov <- aov(dnkval ~ factor(alder1) , data = vindata)
summary(res.aov)
##                 Df Sum Sq Mean Sq F value   Pr(>F)    
## factor(alder1)   3    671  223.82    6.85 0.000199 ***
## Residuals      214   6993   32.68                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The p-value is less than 0.05, and hence we can conclude that there are significant differences between different age groups and the the quality of the wine (highlighted with “*" in the model summary).

A significant p-value indicates that some of the group means are different, however we don’t know which pairs of groups are different. To find out we need to perform a post-hoc analysis.

TukeyHSD(res.aov)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = dnkval ~ factor(alder1), data = vindata)
## 
## $`factor(alder1)`
##           diff       lwr      upr     p adj
## 1-0  4.0333333  1.330954 6.735712 0.0008446
## 2-0  3.8986441  1.053468 6.743820 0.0026688
## 3-0  4.7211765  1.430978 8.011375 0.0014678
## 2-1 -0.1346893 -2.710433 2.441054 0.9991115
## 3-1  0.6878431 -2.372359 3.748045 0.9374140
## 3-2  0.8225324 -2.364474 4.009539 0.9089343

Here we see that only the young wines 0-1 year are rated lower than the rest of the wines.

Lets just do the same with the wine prices:

# Compute the analysis of variance
res.aov <- aov(pris ~ factor(alder1) , data = vindata)
summary(res.aov)
##                 Df    Sum Sq  Mean Sq F value Pr(>F)  
## factor(alder1)   3 1.288e+08 42940976   3.698 0.0126 *
## Residuals      214 2.485e+09 11611217                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
TukeyHSD(res.aov)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = pris ~ factor(alder1), data = vindata)
## 
## $`factor(alder1)`
##          diff        lwr      upr     p adj
## 1-0 1124.7400  -486.1500 2735.630 0.2724331
## 2-0 1162.0261  -533.9853 2858.037 0.2886764
## 3-0 2514.8247   553.5352 4476.114 0.0057905
## 2-1   37.2861 -1498.1165 1572.689 0.9999105
## 3-1 1390.0847  -434.1037 3214.273 0.2014154
## 3-2 1352.7986  -546.9780 3252.575 0.2558044

It looks like there might be a difference between young wines (0-1 years) and aged wines (7-8 years). However, we jumped ahead and did not look at the price data. Let us just have a look:

ggplot(vindata, aes(x=factor(alder1), y=pris)) + geom_boxplot()

It does not look normally distributed. Furthermore we can test the homogenecity of variances using a plot and levene’s test:

plot(res.aov, 2)

leveneTest(pris ~ alder1, data = vindata)
## Warning in leveneTest.default(y = y, group = group, ...): group coerced to
## factor.
## Levene's Test for Homogeneity of Variance (center = median)
##        Df F value   Pr(>F)   
## group   3  4.9927 0.002286 **
##       214                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Here we see that the plot of residuals is skewed and p-value is less than 0.05, which means that there is evidence that the varriance among groups is statistically different.

Therefore we have to use a non-parametric alternative (or we might be able to log-transform the data).

kruskal.test(pris ~ alder1, data = vindata)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  pris by alder1
## Kruskal-Wallis chi-squared = 7.0452, df = 3, p-value = 0.07047

Here the data does not seem to be significantly different.

Just to show you the two-way anova. It is quit simple in the aov() function and require the same as when you create another function, i.e. the lm() function:

res.aov <- aov(dnkval ~ alder1 + Andre , data = vindata)
summary(res.aov)
##              Df Sum Sq Mean Sq F value   Pr(>F)    
## alder1        3    671  223.82   7.004 0.000163 ***
## Andre         1    186  186.09   5.823 0.016663 *  
## Residuals   213   6807   31.96                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

From the ANOVA table we can conclude that both factors are statistically significant. Note that we assume normal distribution, equal varriances and independent factors.

res.aov <- aov(dnkval ~ alder1 + Andre + alder1:Andre , data = vindata)
summary(res.aov)
##               Df Sum Sq Mean Sq F value   Pr(>F)    
## alder1         3    671  223.82   7.108 0.000143 ***
## Andre          1    186  186.09   5.910 0.015897 *  
## alder1:Andre   3    194   64.73   2.056 0.107185    
## Residuals    210   6613   31.49                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1