library(ggplot2)
library(Hmisc)
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:plyr':
## 
##     is.discrete, summarize
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units

La información utilizada en este documento proviene en parte del libro Discovering Statistics using R por Andy Field, Jeremy Miles y Zoë Field.

Como crear una gráfica de regresión lineal usando ggplot

  1. Subir los datos “ExamAnxiety” Los archivos de datos que se usan en el libro se encuentran en este enlace, https://studysites.sagepub.com/dsur/study/articles.htm

see page 136 in book

DATA FILE = Exam Anxiety

library(readr)
Exam_Anxiety <- read_csv("Data/Exam Anxiety.csv")
## Rows: 103 Columns: 5
## ── Column specification ─────────────────────────────────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Gender
## dbl (4): Code, Revise, Exam, Anxiety
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(Exam_Anxiety, n=1)
## # A tibble: 1 × 5
##    Code Revise  Exam Anxiety Gender
##   <dbl>  <dbl> <dbl>   <dbl> <chr> 
## 1     1      4    40    86.3 Male
tail(Exam_Anxiety, n=2)
## # A tibble: 2 × 5
##    Code Revise  Exam Anxiety Gender
##   <dbl>  <dbl> <dbl>   <dbl> <chr> 
## 1   102      9    40    79.0 Male  
## 2   103     20    50    91.1 Female
Anxiety = Exam_Anxiety
Exam_Anxiety = Anxiety
examData=Anxiety
head(Exam_Anxiety)  #cuando no pone n=, se ve los primepors 6
## # A tibble: 6 × 5
##    Code Revise  Exam Anxiety Gender
##   <dbl>  <dbl> <dbl>   <dbl> <chr> 
## 1     1      4    40    86.3 Male  
## 2     2     11    65    88.7 Female
## 3     3     27    80    70.2 Male  
## 4     4     53    80    61.3 Male  
## 5     5      4    40    89.5 Male  
## 6     6     22    70    60.5 Female
tail(Exam_Anxiety, n=3)
## # A tibble: 3 × 5
##    Code Revise  Exam Anxiety Gender
##   <dbl>  <dbl> <dbl>   <dbl> <chr> 
## 1   101      1     2    82.3 Male  
## 2   102      9    40    79.0 Male  
## 3   103     20    50    91.1 Female

Medida de tendencia central

Calculate the mean

x=c(1,2,3,4,5,6,7,8,9,10, 1100)
d=c(2:100)
mean(x)
## [1] 105
mean(d)
## [1] 51
d
##  [1]   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20
## [20]  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39
## [39]  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58
## [58]  59  60  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77
## [77]  78  79  80  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96
## [96]  97  98  99 100

range

range(d)
## [1]   2 100
median(x)
## [1] 6

#Interquartile Range

quantile(x)
##     0%    25%    50%    75%   100% 
##    1.0    3.5    6.0    8.5 1100.0
quantile(x, probs = c(0.015, 0.25, 0.50,.75, .975)) #Selecionar los quantiles.  
##   1.5%    25%    50%    75%  97.5% 
##   1.15   3.50   6.00   8.50 827.50
mean(Exam_Anxiety$Revise)
## [1] 19.85437
range(Exam_Anxiety$Revise)
## [1]  0 98
y=rnorm(1001)
head(y)
## [1] -1.825470 -1.116996 -2.264853 -1.220137  1.848894  1.521978
quantile(y)
##          0%         25%         50%         75%        100% 
## -2.83903093 -0.71694099 -0.06473675  0.62185744  3.06750082
probs = c(0.01, 0.10, 0.50,.90)

How to do a simple linear point graph and add text to graph, change the axis name

library(ggplot2)
#names(Exam_Anxiety) # para seber el nombre de las columnas

ggplot(Exam_Anxiety, aes(y=Exam, x=Anxiety))+ 
  geom_line(colour="#308A8C")

tail(iris)
##     Sepal.Length Sepal.Width Petal.Length Petal.Width   Species
## 145          6.7         3.3          5.7         2.5 virginica
## 146          6.7         3.0          5.2         2.3 virginica
## 147          6.3         2.5          5.0         1.9 virginica
## 148          6.5         3.0          5.2         2.0 virginica
## 149          6.2         3.4          5.4         2.3 virginica
## 150          5.9         3.0          5.1         1.8 virginica
ggplot(iris, aes(Sepal.Length, Petal.Length)) +
      geom_point(aes(col = Species)) +
  geom_smooth(method="lm", colour="aquamarine") +  # lm is for linear model 
  labs(x = "El largo del Sepalo", y = "El largo del Petalo")+
  ggtitle("Basic geom_point and linear regression")
## `geom_smooth()` using formula = 'y ~ x'

# How to add a title, "ggtitle"

# How to save a Graph

ggsave("Iris_size.png") # .jpeg/.jpg/.tiff
## Saving 7 x 5 in image
## `geom_smooth()` using formula = 'y ~ x'
#Simple scatter with regression line
scatter <- ggplot(examData, aes(Anxiety, Exam))
scatter + geom_point(shape=20, colour="red") + 
          geom_smooth(method = "lm", colour = "blue", se = F) +  # se = F, remueve el intervalo de confianza
  labs(x = "Exam Anxiety", y = "Exam Performance %")
## `geom_smooth()` using formula = 'y ~ x'

#Simple scatter with regression line + CI

scatter <- ggplot(examData, aes(Anxiety, Exam))
scatter + geom_point() + 
          geom_smooth(method = "lm", colour = "Red")+ 
  labs(x = "Exam Anxiety", y = "Exam Performance %") 
## `geom_smooth()` using formula = 'y ~ x'

#Regresión lineal más el intervalo de confianza.

scatter <- ggplot(examData, aes(Anxiety, Exam))
scatter + geom_point() + 
  geom_smooth(method = "lm", colour = "Red", alpha = 0.2, fill = "orange") + 
  labs(x = "Exam Anxiety", y = "Exam Performance %") 
## `geom_smooth()` using formula = 'y ~ x'

#Regresión lineal más el intervalo de confianza, usar “colour” por multiples grupos.

names(Exam_Anxiety)
## [1] "Code"    "Revise"  "Exam"    "Anxiety" "Gender"
ggplot(Exam_Anxiety, aes(y=Exam, x=Anxiety, colour=Gender))+ 
  geom_point(colour="coral")+
  geom_smooth(method="lm") +  # lm is for linear model 
  labs(x = "Exam Anxiety Score", y = "Exam Performance %")+
  ggtitle("Basic geom_point and linear regression")
## `geom_smooth()` using formula = 'y ~ x'

Constructing histograms

Data needed are from festivalData

A biologist was worried about the potential health effects of music festivals. So, one year she went to the Download Festival in UK (Download Festival UK). She measured the higiene of 810 concert goes over the 3 day festival. Each day she attempted find all the individuals she census on the first day. + 0 = you smell like a corpse, that is left to rot + 4 = you smell of sweet roses on fresh spring day

The hypothesis is that the personal hygiene of the concert goers would go down dramatically over the 3 days of the festival.

library(readr)
DownloadFestival <- read_csv("Data/DownloadFestival.csv")
## Rows: 810 Columns: 5
## ── Column specification ─────────────────────────────────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): gender
## dbl (4): ticknumb, day1, day2, day3
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
FD=DownloadFestival
head(FD)  # ver las 6 primeras filas
## # A tibble: 6 × 5
##   ticknumb gender  day1  day2  day3
##      <dbl> <chr>  <dbl> <dbl> <dbl>
## 1     2111 Male    2.64  1.35  1.61
## 2     2229 Female  0.97  1.41  0.29
## 3     2338 Male    0.84 NA    NA   
## 4     2384 Female  3.03 NA    NA   
## 5     2401 Female  0.88  0.08 NA   
## 6     2405 Male    0.85 NA    NA
length(FD$ticknumb)  # cuantas filas
## [1] 810

Constructing histograms and detecting outliers

head(FD, n=2)
## # A tibble: 2 × 5
##   ticknumb gender  day1  day2  day3
##      <dbl> <chr>  <dbl> <dbl> <dbl>
## 1     2111 Male    2.64  1.35  1.61
## 2     2229 Female  0.97  1.41  0.29
tail(FD)
## # A tibble: 6 × 5
##   ticknumb gender  day1  day2  day3
##      <dbl> <chr>  <dbl> <dbl> <dbl>
## 1     4749 Female  0.52 NA       NA
## 2     4756 Female  2.91  0.94    NA
## 3     4758 Female  2.61  1.44    NA
## 4     4759 Female  1.47 NA       NA
## 5     4760 Male    1.28 NA       NA
## 6     4765 Female  1.26 NA       NA
head(FD)
## # A tibble: 6 × 5
##   ticknumb gender  day1  day2  day3
##      <dbl> <chr>  <dbl> <dbl> <dbl>
## 1     2111 Male    2.64  1.35  1.61
## 2     2229 Female  0.97  1.41  0.29
## 3     2338 Male    0.84 NA    NA   
## 4     2384 Female  3.03 NA    NA   
## 5     2401 Female  0.88  0.08 NA   
## 6     2405 Male    0.85 NA    NA
 ggplot(FD, aes(day1))+ 
  geom_histogram(colour="white", fill="steelblue") + 
  labs(x = "Hygiene (Day 1 of Festival)", y = "Frequency")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

  summary(DownloadFestival)
##     ticknumb       gender               day1             day2       
##  Min.   :2111   Length:810         Min.   : 0.020   Min.   :0.0000  
##  1st Qu.:3096   Class :character   1st Qu.: 1.312   1st Qu.:0.4100  
##  Median :3620   Mode  :character   Median : 1.790   Median :0.7900  
##  Mean   :3616                      Mean   : 1.793   Mean   :0.9609  
##  3rd Qu.:4155                      3rd Qu.: 2.230   3rd Qu.:1.3500  
##  Max.   :4765                      Max.   :20.020   Max.   :3.4400  
##                                                     NA's   :546     
##       day3       
##  Min.   :0.0200  
##  1st Qu.:0.4400  
##  Median :0.7600  
##  Mean   :0.9765  
##  3rd Qu.:1.5250  
##  Max.   :3.4100  
##  NA's   :687
#binwidth = 0.6, fill="orange", colour="white"

Subsetting data = removing values larger than x

Festivalday1=subset(DownloadFestival, day1<5)
summary(Festivalday1)
##     ticknumb       gender               day1            day2       
##  Min.   :2111   Length:809         Min.   :0.020   Min.   :0.0000  
##  1st Qu.:3096   Class :character   1st Qu.:1.310   1st Qu.:0.4100  
##  Median :3620   Mode  :character   Median :1.790   Median :0.7900  
##  Mean   :3616                      Mean   :1.771   Mean   :0.9553  
##  3rd Qu.:4154                      3rd Qu.:2.230   3rd Qu.:1.3350  
##  Max.   :4765                      Max.   :3.690   Max.   :3.4400  
##                                                    NA's   :546     
##       day3       
##  Min.   :0.0200  
##  1st Qu.:0.4400  
##  Median :0.7600  
##  Mean   :0.9765  
##  3rd Qu.:1.5250  
##  Max.   :3.4100  
##  NA's   :686
festivalHistogram <- ggplot(Festivalday1, aes(day1))
festivalHistogram + 
  geom_histogram(fill="orange", colour="white") + 
  labs(x = "Hygiene (Day 1 of Festival)", y = "Frequency")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

How to create boxplots

festivalBoxplot <- ggplot(Festivalday1, aes(x=1,y=day1))
festivalBoxplot + geom_boxplot() + labs(x = "Gender", y = "Hygiene (Day 1 of Festival)")

Boxplot by Gender

names(Festivalday1)
## [1] "ticknumb" "gender"   "day1"     "day2"     "day3"
festivalBoxplot <- ggplot(Festivalday1, aes(x=gender,y=day2, colour=gender))
festivalBoxplot + 
  geom_point()+
  geom_boxplot(fill="yellow", alpha=0.1) + 
  labs(x = "Gender", y = "Hygiene (Day 2 of Festival)")
## Warning: Removed 546 rows containing non-finite values (`stat_boxplot()`).
## Warning: Removed 546 rows containing missing values (`geom_point()`).

How to detect outliers. Determine the % of data which are outliers (IN Spanish, sesgado) NOTE: that I’m constructing a function, which does not exist

#--------OUTLIERS----------


outlierSummary<-function(variable, digits = 2){
  
  zvariable<-(variable-mean(variable, na.rm = TRUE))/sd(variable, na.rm = TRUE)
  
  outlier95<-abs(zvariable) >= 1.96  # error de 95%
  outlier99<-abs(zvariable) >= 2.58   # error de 99%
  outlier999<-abs(zvariable) >= 3.29  # error de 99.9%
  
  ncases<-length(na.omit(zvariable))
  
  percent95<-round(100*length(subset(outlier95, outlier95 == TRUE))/ncases, digits)
  percent99<-round(100*length(subset(outlier99, outlier99 == TRUE))/ncases, digits)
  percent999<-round(100*length(subset(outlier999, outlier999 == TRUE))/ncases, digits)
  
  cat("Absolute z-score greater than 1.96 = ", percent95, "%", "\n")
  cat("Absolute z-score greater than 2.58 = ",  percent99, "%", "\n")
  cat("Absolute z-score greater than 3.29 = ",  percent999, "%", "\n")
}

outlierSummary(FD$day3)
## Absolute z-score greater than 1.96 =  4.07 % 
## Absolute z-score greater than 2.58 =  2.44 % 
## Absolute z-score greater than 3.29 =  0.81 %

How to do a BOXPLOT of the distribution of the data, no assuming normal distribution.

In this figure we show the median and the percentiles and the outliers

library(ggplot2)
ggplot(FD, aes(gender, day1))+ geom_boxplot()

Now to remove outliers

Festivalday1
## # A tibble: 809 × 5
##    ticknumb gender  day1  day2  day3
##       <dbl> <chr>  <dbl> <dbl> <dbl>
##  1     2111 Male    2.64  1.35  1.61
##  2     2229 Female  0.97  1.41  0.29
##  3     2338 Male    0.84 NA    NA   
##  4     2384 Female  3.03 NA    NA   
##  5     2401 Female  0.88  0.08 NA   
##  6     2405 Male    0.85 NA    NA   
##  7     2467 Female  1.56 NA    NA   
##  8     2478 Female  3.02 NA    NA   
##  9     2490 Male    2.29 NA    NA   
## 10     2504 Female  1.11  0.44  0.55
## # ℹ 799 more rows
ggplot(Festivalday1, aes(gender, day1))+ geom_boxplot()

#Line and error bars

  1. Learn how to reformat data in the correct type of data frame (as the original data set is not in the correct format) with the function “stack”
  2. How to add the mean of the variable with “stat_summary”
  3. How to connect the mean with a line and change color
  4. how to add the error bars and color (The 95% confidence intervals, created with the stat_summary() function and the “mean_cl_boot” argument are bootstrap confidence intervals using the smean.cl.boot() function in Hmisc)

Install library(Hmisc)

library(Hmisc) # you need this package to add the confidence interval

library(readr)
Hiccups <- read_csv("Data/Hiccups.csv")
## Rows: 15 Columns: 4
## ── Column specification ─────────────────────────────────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (4): Baseline, Tongue, Carotid, Rectum
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#hiccupsData=Hiccups

hiccups<-stack(Hiccups) # to reorganize the data in two columns 
head(hiccups)
##   values      ind
## 1     15 Baseline
## 2     13 Baseline
## 3      9 Baseline
## 4      7 Baseline
## 5     11 Baseline
## 6     14 Baseline
tail(hiccups)
##    values    ind
## 55      4 Rectum
## 56      4 Rectum
## 57      4 Rectum
## 58      4 Rectum
## 59      2 Rectum
## 60      3 Rectum
names(hiccups)=c("Num_Hiccups","Intervention") # change the names of columns
#hiccups$Intervention_Factor <- factor(hiccups$Intervention, levels = hiccups$Intervention)
head(hiccups)
##   Num_Hiccups Intervention
## 1          15     Baseline
## 2          13     Baseline
## 3           9     Baseline
## 4           7     Baseline
## 5          11     Baseline
## 6          14     Baseline
ggplot(hiccups, aes(y=Num_Hiccups,x=Intervention))+ 
  stat_summary(fun.y = "mean", geom = "point") + 
  stat_summary(fun.y = "mean", geom = "line", aes(group = 1),colour = "Red", linetype = "dashed") +      
  stat_summary(fun.data = mean_cl_boot, geom = "errorbar", width = 0.2, colour="blue") + 
  labs(x = "Intervention", y = "Mean Number of Hiccups")
## Warning: The `fun.y` argument of `stat_summary()` is deprecated as of ggplot2 3.3.0.
## ℹ Please use the `fun` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.

ggplot(hiccups, aes(y=Num_Hiccups,x=Intervention))+ 
  stat_summary(fun.y = mean, geom = "point") + 
  stat_summary(fun.y = mean, geom = "line", aes(group = 1),colour = "Red", linetype = "dashed") +      
  stat_summary(fun.data = mean_cl_boot, geom = "errorbar", width = 0.2, colour="blue") + 
  labs(x = "Intervention", y = "Mean Number of Hiccups")