Add and activate packages. NOTE THE NEW FUNCTION to install all packages ONLY IF Needed…..

Visualize the distribution of the data “Histogram”

Los datos son de un festival de Rock y Heavy Metal en UK La categoria son 0 = You smell like a rotting corpse 4 = You smell like of sweet roses

library(readr)
DownloadFestival <- read_csv("Data/DownloadFestival.csv")
## Rows: 810 Columns: 5
## ── Column specification ─────────────────────────────────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): gender
## dbl (4): ticknumb, day1, day2, day3
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dlf <- DownloadFestival

dlf=subset(DownloadFestival, day1<5)
head(dlf)
## # A tibble: 6 × 5
##   ticknumb gender  day1  day2  day3
##      <dbl> <chr>  <dbl> <dbl> <dbl>
## 1     2111 Male    2.64  1.35  1.61
## 2     2229 Female  0.97  1.41  0.29
## 3     2338 Male    0.84 NA    NA   
## 4     2384 Female  3.03 NA    NA   
## 5     2401 Female  0.88  0.08 NA   
## 6     2405 Male    0.85 NA    NA
tail(dlf)
## # A tibble: 6 × 5
##   ticknumb gender  day1  day2  day3
##      <dbl> <chr>  <dbl> <dbl> <dbl>
## 1     4749 Female  0.52 NA       NA
## 2     4756 Female  2.91  0.94    NA
## 3     4758 Female  2.61  1.44    NA
## 4     4759 Female  1.47 NA       NA
## 5     4760 Male    1.28 NA       NA
## 6     4765 Female  1.26 NA       NA
hist.day1 <- ggplot(dlf, aes(day1)) + 
  geom_histogram(aes(y=..density..), colour="black", fill="white") 

hist.day1+ 
  labs(x="Hygiene score on day 1", y = "Density")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

hist.day1
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggsave("histogram_festival_hygiene.pdf") # Can be either be a device function (e.g. png()), or one of "eps", "ps", "tex" (pictex), "pdf", "jpeg", "tiff", "png", "bmp", "svg" or "wmf" (windows only)
## Saving 7 x 5 in image
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#Remove the outlier from the day1 hygiene score dlf\(day1 <- ifelse(dlf\)day1 > 5, NA, dlf\(day1) df\)Column = ifelse(df$column_to_be evaluated, replace_with_NA, otherwise_leave_as_before)

dlf$day1 <- ifelse(dlf$day1 > 5, NA, dlf$day1)


# Note here that we use ..density..  # CUAL ES LA DIFERENCIA entre densidad y frecuencia?  

hist.day1 <- ggplot(dlf, aes(day1)) + 
  theme(legend.position = "none") + 
  geom_histogram(aes(y=..density..), colour="black", fill="white") + 
  labs(x="Hygiene score on day 1", y = "Density")

hist.day1
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

The Normal Distribution

https://en.wikipedia.org/wiki/Normal_distribution

  1. visualization if the observed distribution follows a theoretical normal distribution
  2. Test to determine if the observed distribution follows a theoretical distribution

\[P(x)=\frac{1}{{\sigma\sqrt{ 2\pi}}}{e}^{\frac{{(x-µ)}^{2}}{{2\sigma}^{2}}}\]

# Ahora añadir la linea de distribución normal

hist.day1 + 
  stat_function(fun = dnorm, 
                args = list(mean = mean(dlf$day1,na.rm = TRUE), 
                sd = sd(dlf$day1 , na.rm = TRUE)), 
                colour = "blue", size = 1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

shapiro.test(dlf$day1)  # don't use with more than 40 a 200 data point
## 
##  Shapiro-Wilk normality test
## 
## data:  dlf$day1
## W = 0.99591, p-value = 0.03184
length(dlf$day1)
## [1] 809

Add a straight line on the qqplot

# This function is to add a straight line through the qqplot
qqplot.data <- function (vec) # argument: vector of numbers
{
  # following four lines from base R's qqline()
  y <- quantile(vec[!is.na(vec)], c(0.25, 0.75))
  x <- qnorm(c(0.25, 0.75))
  slope <- diff(y)/diff(x)
  int <- y[1L] - slope * x[1L]
  
  d <- data.frame(resids = vec)
  
  ggplot(d, aes(sample = resids)) + 
    stat_qq() + 
    geom_abline(slope = slope, intercept = int, color="red")
  
}

qqplot.data(dlf$day3)
## Warning: Removed 686 rows containing non-finite values (`stat_qq()`).

ggsave("qqplot.png")
## Saving 7 x 5 in image
## Warning: Removed 686 rows containing non-finite values (`stat_qq()`).
dlf=DownloadFestival
#Quantifying normality with numbers
library(psych)      #load the "psych" library, if you haven't already, for the describe() function.

#Using the describe() function for a single variable.
psych::describe(dlf$day2)
##    vars   n mean   sd median trimmed  mad min  max range skew kurtosis   se
## X1    1 264 0.96 0.72   0.79    0.87 0.61   0 3.44  3.44 1.08     0.76 0.04

Que es la varianza? The variance \[s^{ 2 }=\frac { \sum _{ i=1 }^{ n }{ (x_{ i }-\bar { x } ) } ^{ 2 } }{ n-1 } \]

Que es la desviación estandard

The standard deviation of the mean \[s=\sqrt { s^{ 2 } } \]

Multiple variables

#Two alternative ways to describe multiple variables.
psych::describe(cbind(dlf$day1, dlf$day2, dlf$day3))
##    vars   n mean   sd median trimmed  mad  min   max range skew kurtosis   se
## X1    1 810 1.79 0.94   1.79    1.77 0.70 0.02 20.02 20.00 8.83   168.97 0.03
## X2    2 264 0.96 0.72   0.79    0.87 0.61 0.00  3.44  3.44 1.08     0.76 0.04
## X3    3 123 0.98 0.71   0.76    0.90 0.61 0.02  3.41  3.39 1.01     0.59 0.06
psych::describe(dlf[,c("day1", "day2", "day3")])
##      vars   n mean   sd median trimmed  mad  min   max range skew kurtosis   se
## day1    1 810 1.79 0.94   1.79    1.77 0.70 0.02 20.02 20.00 8.83   168.97 0.03
## day2    2 264 0.96 0.72   0.79    0.87 0.61 0.00  3.44  3.44 1.08     0.76 0.04
## day3    3 123 0.98 0.71   0.76    0.90 0.61 0.02  3.41  3.39 1.01     0.59 0.06

Dicover the Mode. La moda en R

# the mode
library(modeest)
## Registered S3 method overwritten by 'rmutil':
##   method         from 
##   plot.residuals psych
mfv(dlf$day1, method="mfv")
## [1] 2
mfv(dlf$day2, method="mfv")
## [1] NA
mfv(dlf$day3, method="mfv")
## [1] NA

Test of normality, Shapiro Wilks Test

library(pastecs)
stat.desc(dlf$day3, basic = FALSE, norm = TRUE) # "norm=TRUE"" is to calculate the Shapiro Wilk Test
##       median         mean      SE.mean CI.mean.0.95          var      std.dev 
## 7.600000e-01 9.765041e-01 6.404352e-02 1.267805e-01 5.044934e-01 7.102770e-01 
##     coef.var     skewness     skew.2SE     kurtosis     kurt.2SE   normtest.W 
## 7.273672e-01 1.007813e+00 2.309035e+00 5.945454e-01 6.862946e-01 9.077516e-01 
##   normtest.p 
## 3.804486e-07
stat.desc(cbind(dlf$day1, dlf$day2, dlf$day3), basic = FALSE, norm = TRUE)
##                        V1           V2           V3
## median       1.790000e+00 7.900000e-01 7.600000e-01
## mean         1.793358e+00 9.609091e-01 9.765041e-01
## SE.mean      3.318617e-02 4.436095e-02 6.404352e-02
## CI.mean.0.95 6.514115e-02 8.734781e-02 1.267805e-01
## var          8.920705e-01 5.195239e-01 5.044934e-01
## std.dev      9.444949e-01 7.207801e-01 7.102770e-01
## coef.var     5.266627e-01 7.501022e-01 7.273672e-01
## skewness     8.832504e+00 1.082811e+00 1.007813e+00
## skew.2SE     5.140707e+01 3.611574e+00 2.309035e+00
## kurtosis     1.689671e+02 7.554615e-01 5.945454e-01
## kurt.2SE     4.923139e+02 1.264508e+00 6.862946e-01
## normtest.W   6.539142e-01 9.083191e-01 9.077516e-01
## normtest.p   1.545986e-37 1.281630e-11 3.804486e-07
round(stat.desc(dlf[, c("day1", "day2", "day3")], basic = FALSE, norm = TRUE), digits = 3)
##                 day1  day2  day3
## median         1.790 0.790 0.760
## mean           1.793 0.961 0.977
## SE.mean        0.033 0.044 0.064
## CI.mean.0.95   0.065 0.087 0.127
## var            0.892 0.520 0.504
## std.dev        0.944 0.721 0.710
## coef.var       0.527 0.750 0.727
## skewness       8.833 1.083 1.008
## skew.2SE      51.407 3.612 2.309
## kurtosis     168.967 0.755 0.595
## kurt.2SE     492.314 1.265 0.686
## normtest.W     0.654 0.908 0.908
## normtest.p     0.000 0.000 0.000

Levene’s Test: testing for assumption of normality among groups

#Levene's test for comparison of variances of exam scores in the two universities.
library(ggplot2)
library(car)
library(readr)
RExam <- read_csv("Data/RExam.csv")
## Rows: 100 Columns: 5
## ── Column specification ─────────────────────────────────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (5): exam, computer, lectures, numeracy, uni
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
rexam=RExam
head(rexam)
## # A tibble: 6 × 5
##    exam computer lectures numeracy   uni
##   <dbl>    <dbl>    <dbl>    <dbl> <dbl>
## 1    18       54     75          7     0
## 2    30       47      8.5        1     0
## 3    40       58     69.5        6     0
## 4    30       37     67          6     0
## 5    40       53     44.5        2     0
## 6    15       48     76.5        8     0
ggplot(rexam, aes(numeracy, fill=as.factor(uni)))+
  geom_histogram()+
  facet_wrap(~uni)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 leveneTest(rexam$lectures, rexam$uni)
## Warning in leveneTest.default(rexam$lectures, rexam$uni): rexam$uni coerced to
## factor.
## Levene's Test for Homogeneity of Variance (center = median)
##       Df F value Pr(>F)
## group  1  1.4222 0.2359
##       98
leveneTest(rexam$lectures, rexam$uni, center = mean)
## Warning in leveneTest.default(rexam$lectures, rexam$uni, center = mean):
## rexam$uni coerced to factor.
## Levene's Test for Homogeneity of Variance (center = mean)
##       Df F value Pr(>F)
## group  1  1.7306 0.1914
##       98
leveneTest(rexam$computer, rexam$uni)
## Warning in leveneTest.default(rexam$computer, rexam$uni): rexam$uni coerced to
## factor.
## Levene's Test for Homogeneity of Variance (center = median)
##       Df F value Pr(>F)
## group  1  0.1078 0.7434
##       98
leveneTest(rexam$numeracy, rexam$uni)
## Warning in leveneTest.default(rexam$numeracy, rexam$uni): rexam$uni coerced to
## factor.
## Levene's Test for Homogeneity of Variance (center = median)
##       Df F value  Pr(>F)  
## group  1   5.366 0.02262 *
##       98                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
x=c(0,2,3,4,5,6,100,1000)
x
## [1]    0    2    3    4    5    6  100 1000
log(x+1)
## [1] 0.000000 1.098612 1.386294 1.609438 1.791759 1.945910 4.615121 6.908755
sqrt(x)
## [1]  0.000000  1.414214  1.732051  2.000000  2.236068  2.449490 10.000000
## [8] 31.622777
head(log(dlf$day1))
## [1]  0.97077892 -0.03045921 -0.17435339  1.10856262 -0.12783337 -0.16251893
head(dlf)
## # A tibble: 6 × 5
##   ticknumb gender  day1  day2  day3
##      <dbl> <chr>  <dbl> <dbl> <dbl>
## 1     2111 Male    2.64  1.35  1.61
## 2     2229 Female  0.97  1.41  0.29
## 3     2338 Male    0.84 NA    NA   
## 4     2384 Female  3.03 NA    NA   
## 5     2401 Female  0.88  0.08 NA   
## 6     2405 Male    0.85 NA    NA
dlf$lgday1=log(dlf$day1)
head(dlf)
## # A tibble: 6 × 6
##   ticknumb gender  day1  day2  day3  lgday1
##      <dbl> <chr>  <dbl> <dbl> <dbl>   <dbl>
## 1     2111 Male    2.64  1.35  1.61  0.971 
## 2     2229 Female  0.97  1.41  0.29 -0.0305
## 3     2338 Male    0.84 NA    NA    -0.174 
## 4     2384 Female  3.03 NA    NA     1.11  
## 5     2401 Female  0.88  0.08 NA    -0.128 
## 6     2405 Male    0.85 NA    NA    -0.163