Statistical Programming with R
library(magrittr) #for pipes library(MASS) #for the bacteria data library(Ecdat) #for the Benefits data
Base R function curve() to plot the pdf and cdf of a normal distribution:
par(mfrow=c(1,2), bg=NA) curve(dnorm(x,0,1),xlim=c(-3,3),ylab="f(x)",main='') curve(pnorm(x,0,1),xlim=c(-3,3),ylab="F(X < x)",main='')
or exponential distribution:
In R we can use:
rnorm() to obtain samples from a normal distributiondnorm() to sample from a normal density functionpnorm() to sample from a normal distribution functionqnorm() to sample from a normal quantile functionThis works for other distributions as well, e.g. rexp(), dexp(), pexp() and qexp().
set.seed (101) rnorm(5, 2, 5)
## [1] 0.3698175 4.7623093 -1.3747192 3.0717973 3.5538461
rnorm(5, 2, 5)
## [1] 7.8698314 5.0939493 1.4363284 6.5851414 0.8837032
set.seed(101) rnorm(5, 2, 5)
## [1] 0.3698175 4.7623093 -1.3747192 3.0717973 3.5538461
hist(rnorm(1000, 3, 0.5),
ylab = "P(Y=y)", xlab = "y",
ylim = c(0,1), prob = TRUE)
lines(density(rnorm(1000, 3, 0.5)))
Barplots for a poisson distribution:
par(mfrow = c(1, 3), bg = NA)
y <- factor(0:5)
x <- 0:5
barplot(dpois(x, .5), ylim = c(0, .6), names.arg = y,
ylab = "P(Y=y)", xlab = "y")
text(5, .5, expression(paste(lambda, " = 0.5")))
barplot(dpois(x, 1),
ylim = c(0, .6),
names.arg = y,
ylab = "P(Y=y)",
xlab = "y")
text(5, .5, expression(paste(lambda, " = 1")))
barplot(dpois(x, 2), ylim = c(0, .6), names.arg = y,
ylab = "P(Y = y)",xlab="y")
text(5,.5,expression(paste(lambda," = 2")))
bacteria %>% head(n=8)
## y ap hilo week ID trt ## 1 y p hi 0 X01 placebo ## 2 y p hi 2 X01 placebo ## 3 y p hi 4 X01 placebo ## 4 y p hi 11 X01 placebo ## 5 y a hi 0 X02 drug+ ## 6 y a hi 2 X02 drug+ ## 7 n a hi 6 X02 drug+ ## 8 y a hi 11 X02 drug+
Column y:
n: Disease absent.y: Disease present.Column ap:
a: Active medicine.p: Placebo.x2.table <- bacteria %$% table(ap, y) #do not need data argument x2.table
## y ## ap n y ## a 31 93 ## p 12 84
prop.table(table(bacteria$ap, bacteria$y), margin = 1)
## ## n y ## a 0.250 0.750 ## p 0.125 0.875
bacteria %$% chisq.test(ap, y)
## ## Pearson's Chi-squared test with Yates' continuity correction ## ## data: ap and y ## X-squared = 4.6109, df = 1, p-value = 0.03177
bacteria %$% table(ap, y) %>% chisq.test
## ## Pearson's Chi-squared test with Yates' continuity correction ## ## data: . ## X-squared = 4.6109, df = 1, p-value = 0.03177
bacteria %$% chisq.test(ap, y)$expected
## y ## ap n y ## a 24.23636 99.76364 ## p 18.76364 77.23636
bacteria %$% table(y, ap) %>% plot
barplot(table(bacteria$y, bacteria$ap),
beside = TRUE,
legend = c("No infection", "Infection"),
col = c("blue", "orange"),
args.legend = list(x=4.5, y=80))
head(Benefits, n = 5)
## stateur statemb state age tenure joblost nwhite school12 sex ## 1 4.5 167 42 49 21 other no no male ## 2 10.5 251 55 26 2 slack_work no no male ## 3 7.2 260 21 40 19 other no yes female ## 4 5.8 245 56 51 17 slack_work yes no female ## 5 6.5 125 58 33 1 slack_work no yes male ## bluecol smsa married dkids dykids yrdispl rr head ui ## 1 yes yes no no no 7 0.2906310 yes yes ## 2 yes yes no yes yes 10 0.5202020 yes no ## 3 yes yes yes no no 10 0.4324895 yes yes ## 4 yes yes yes no no 10 0.5000000 no yes ## 5 yes yes yes yes yes 4 0.3906250 yes no
Unemployment of blue collar workers:
Column stateur:
Column sex:
malefemalehist(Benefits$stateur, xlim=c(2,18),
main = "State Unemployment Rates in 1972",
xlab = "State Unemployment Rate (in %)")
Benefits %$% t.test(stateur, mu = 5, alternative = "two.sided")
## ## One Sample t-test ## ## data: stateur ## t = 70.127, df = 4876, p-value < 2.2e-16 ## alternative hypothesis: true mean is not equal to 5 ## 95 percent confidence interval: ## 7.440834 7.581229 ## sample estimates: ## mean of x ## 7.511031
Benefits %$% t.test(stateur, mu = 5, alternative = "greater")
## ## One Sample t-test ## ## data: stateur ## t = 70.127, df = 4876, p-value < 2.2e-16 ## alternative hypothesis: true mean is greater than 5 ## 95 percent confidence interval: ## 7.452123 Inf ## sample estimates: ## mean of x ## 7.511031
Benefits %$% t.test(stateur~sex)
## ## Welch Two Sample t-test ## ## data: stateur by sex ## t = -2.8681, df = 2023.4, p-value = 0.004172 ## alternative hypothesis: true difference in means is not equal to 0 ## 95 percent confidence interval: ## -0.39311976 -0.07383349 ## sample estimates: ## mean in group female mean in group male ## 7.332609 7.566085
fit <- Benefits %$% aov(stateur~sex) summary(fit)
## Df Sum Sq Mean Sq F value Pr(>F) ## sex 1 48 47.91 7.672 0.00563 ** ## Residuals 4875 30441 6.24 ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
With small expected cell frequencies (roughly if any are below 5), we should use Fishers exact test.
short <- bacteria %>% subset(week <= 2) %$% table(y, ap) short
## ap ## y a p ## n 6 3 ## y 47 38
short %>% chisq.test
## Warning in chisq.test(.): Chi-squared approximation may be incorrect
## ## Pearson's Chi-squared test with Yates' continuity correction ## ## data: . ## X-squared = 0.090475, df = 1, p-value = 0.7636
#fisher.test(table(short$y, short$ap)) short %>% fisher.test
## ## Fisher's Exact Test for Count Data ## ## data: . ## p-value = 0.7268 ## alternative hypothesis: true odds ratio is not equal to 1 ## 95 percent confidence interval: ## 0.3182193 10.5996850 ## sample estimates: ## odds ratio ## 1.609049
fit <- Benefits %$% aov(stateur~sex) plot(fit, 2)
Check whether two population distributions (independent samples) are identical without assuming normality.
Benefits %$% wilcox.test(stateur~sex, paired = FALSE)
## ## Wilcoxon rank sum test with continuity correction ## ## data: stateur by sex ## W = 2038781, p-value = 0.0125 ## alternative hypothesis: true location shift is not equal to 0
set paired = TRUE for Wilcoxon signed-rank test for dependent samples.
Check whether a collection of population distributions (independent samples) are identical without assuming normality.
Benefits %$% kruskal.test(stateur~sex)
## ## Kruskal-Wallis rank sum test ## ## data: stateur by sex ## Kruskal-Wallis chi-squared = 6.239, df = 1, p-value = 0.0125
Rfriedman.test: Friedman rank sum testRfit package for nonparametric regressioncor(x, y, method = "kendall") for Kendalls tau-b correlationcor(x, y, method = "spearman") for Spearman correlationsurvival package for semi- and nonparametric methods (Kaplan-Meier/Cox regression) in survival analysis