---
title: "Lesson 4"
runtime: shiny
output:
  html_document:
    theme: united
    highlight: tango
---
Lesson 4
========================================================
***
### Working directory and libraries
```{r setup}
# Make the project folder the working directory for the chunks that follow.
# knitr restores the previous directory after a chunk that calls setwd(), so
# a bare setwd() here would not help later chunks resolve relative paths;
# the root.dir knit option is the persistent equivalent.
# NOTE(review): the path is machine-specific -- adjust when knitting elsewhere.
knitr::opts_knit$set(
  root.dir = '/Users/olgabelitskaya/version-control/reflections-ud651'
)
```
```{r Libraries1}
library(ggplot2)
library(lubridate)
library(ggthemes)
```
```{r Libraries3}
library(gridExtra)
library(plyr)
```
```{r Libraries4}
library(scales)
library(reshape2)
```
```{r Libraries5}
library(dplyr)
library(tidyr)
```
```{r Libraries6}
library(xlsx)
```
## Useful links
```{r Links}
# http://docs.ggplot2.org/current/
# http://docs.ggplot2.org/current/coord_trans.html
# http://sape.inf.usi.ch/quick-reference/ggplot2/themes
# http://personality-project.org/r/html/corr.test.html
# https://rpubs.com/hadley/ggplot2-layers
# http://rmarkdown.rstudio.com/articles_integration.html
```
```{r Pseudo-Facebook User Data}
# Read the tab-separated pseudo-Facebook data set and list its columns.
pf <- read.csv(file = "pseudo_facebook.tsv", sep = "\t")
names(pf)
```
***
### Scatterplots
```{r Scatterplots}
# Quick scatterplot of friend count against age, with dark blue points
# added on top of the qplot() layer.
qplot(x = age, y = friend_count, data = pf) +
  geom_point(colour = "darkblue")
```
***
### ggplot Syntax
```{r ggplot Syntax}
# The same scatterplot written with ggplot(); x is limited to ages 13-90.
ggplot(pf, aes(x = age, y = friend_count)) +
  geom_point(colour = "darkblue") +
  xlim(13, 90)
```
***
### Overplotting
```{r Overplotting}
# Jittered, mostly transparent points to make dense regions readable.
ggplot(pf, aes(x = age, y = friend_count)) +
  geom_jitter(shape = 7, alpha = 0.1, colour = "darkred") +
  xlim(13, 90)
```
***
### Coord_trans()
```{r Coord_trans()1}
# Diamond price against carat with both axes transformed by log10.
ggplot(diamonds, aes(x = carat, y = price)) +
  geom_point(shape = 5, alpha = 0.1, colour = "purple") +
  coord_trans(x = "log10", y = "log10")
```
#### Look up the documentation and add a layer to the plot that transforms friend_count using the square root function. Create your plot!
```{r Coord_trans()2}
# Friend count by age on a square-root y axis. Jitter is horizontal only
# (height = 0), so the plotted friend counts are not perturbed.
ggplot(pf, aes(x = age, y = friend_count)) +
  geom_point(
    position = position_jitter(height = 0),
    shape = 2,
    alpha = 0.1,
    colour = "navy"
  ) +
  coord_trans(y = "sqrt") +
  theme_bw()
```
***
### Alpha and Jitter
```{r Alpha and Jitter}
# Friendships initiated by age (13-90) on a square-root y axis, using
# horizontal-only jitter and very transparent points.
ggplot(pf, aes(x = age, y = friendships_initiated)) +
  geom_point(
    position = position_jitter(height = 0),
    shape = 10,
    alpha = 0.05,
    colour = "darkgreen"
  ) +
  xlim(13, 90) +
  coord_trans(y = "sqrt") +
  theme_bw()
```
***
### Conditional Means
```{r Conditional Means}
# Per-age summary of friend_count: mean, median and number of users,
# ordered by age. age_groups is kept as its own object, as before.
age_groups <- pf %>% group_by(age)
pf.fc_by_age <- age_groups %>%
  summarise(
    fc_mean = mean(friend_count),
    fc_median = median(friend_count),
    n = n()
  ) %>%
  arrange(age)
head(pf.fc_by_age)
```
Create your plot!
```{r Conditional Means Plot}
# Line plot of mean friend count for each age between 13 and 90.
ggplot(pf.fc_by_age, aes(x = age, y = fc_mean)) +
  geom_line(colour = "forestgreen") +
  xlim(13, 90) +
  theme_grey()
```
***
### Overlaying Summaries with Raw Data
```{r Overlaying Summaries with Raw Data}
# Jittered raw friend counts overlaid with per-age summary lines: the mean
# (solid) and the 90th, 10th and 50th percentiles (linetype 5). The view is
# zoomed to ages 13-70 and counts 0-1000 without dropping data.
# NOTE(review): recent ggplot2 releases deprecate fun.y in favour of fun --
# confirm against the installed version before switching.
ggplot(pf, aes(x = age, y = friend_count)) +
  geom_point(
    position = position_jitter(height = 0),
    shape = 2,
    alpha = 0.1,
    colour = "orange"
  ) +
  geom_line(stat = "summary", fun.y = mean, colour = "green") +
  geom_line(
    stat = "summary", fun.y = quantile, fun.args = list(probs = 0.9),
    colour = "darkgreen", linetype = 5
  ) +
  geom_line(
    stat = "summary", fun.y = quantile, fun.args = list(probs = 0.1),
    colour = "darkblue", linetype = 5
  ) +
  geom_line(
    stat = "summary", fun.y = quantile, fun.args = list(probs = 0.5),
    colour = "blue", linetype = 5
  ) +
  coord_cartesian(xlim = c(13, 70), ylim = c(0, 1000)) +
  theme_bw()
# Write the plot built above to disk.
ggsave("pf01.jpg", plot = last_plot())
```
***
### Moira: Histogram Summary and Scatterplot
See the Instructor Notes of this video to download Moira's paper on perceived audience size and to see the final plot.
***
### Correlation
```{r Correlation}
# Open the help page, then test the Pearson correlation between age and
# friend count over all users.
help("cor.test.formula")
cor.test(x = pf$age, y = pf$friend_count, method = "pearson")
```
What's the correlation between age and friend count? Round to three decimal places.

```
Pearson's product-moment correlation

data: pf$age and pf$friend_count
t = -8.6268, df = 99001, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.03363072 -0.02118189
sample estimates:
cor
-0.02740737
```

***
```{r Correlation2}
# Same Pearson test, evaluated inside pf so columns can be named directly.
with(
  pf,
  cor.test(x = age, y = friend_count, method = "pearson")
)
```
### Correlation on Subsets
```{r Correlation on Subsets}
# Correlation of age and friend count for users aged 70 or younger,
# using cor.test()'s default method.
with(
  subset(pf, subset = age <= 70),
  cor.test(x = age, y = friend_count)
)
```
***
### Correlation Methods
```{r Correlation on Subsets2}
# Spearman rank correlation for the same subset (age 70 or younger).
with(
  subset(pf, subset = age <= 70),
  cor.test(x = age, y = friend_count, method = "spearman")
)
```
***
## Create Scatterplots
```{r Create Scatterplots1}
# Likes received overall against likes received on the web, with summary
# lines for the mean (solid) and the 90th, 10th and 50th percentiles
# (linetype 5). Both axes are zoomed to 0-1000.
# NOTE(review): recent ggplot2 releases deprecate fun.y in favour of fun --
# confirm against the installed version before switching.
ggplot(pf, aes(x = www_likes_received, y = likes_received)) +
  geom_point(
    position = position_jitter(height = 0),
    shape = 2,
    alpha = 0.1,
    colour = "blue"
  ) +
  geom_line(stat = "summary", fun.y = mean, colour = "darkorchid1") +
  geom_line(
    stat = "summary", fun.y = quantile, fun.args = list(probs = 0.9),
    colour = "red", linetype = 5
  ) +
  geom_line(
    stat = "summary", fun.y = quantile, fun.args = list(probs = 0.1),
    colour = "darkred", linetype = 5
  ) +
  geom_line(
    stat = "summary", fun.y = quantile, fun.args = list(probs = 0.5),
    colour = "orange", linetype = 5
  ) +
  coord_cartesian(xlim = c(0, 1000), ylim = c(0, 1000)) +
  theme_bw()
# Write the plot built above to disk.
ggsave("pf02.jpg", plot = last_plot())
```
```{r Create Scatterplots2}
# Closer view (0-500 on both axes) of likes received against web likes,
# with the conditional mean drawn as a line.
ggplot(pf, aes(x = www_likes_received, y = likes_received)) +
  geom_point(
    position = position_jitter(height = 0),
    shape = 2,
    alpha = 0.1,
    colour = "blue"
  ) +
  coord_cartesian(xlim = c(0, 500), ylim = c(0, 500)) +
  geom_line(stat = "summary", fun.y = mean, colour = "darkorchid1") +
  theme_bw()
# Write the plot built above to disk.
ggsave("pf03.jpg", plot = last_plot())
```
***
### Strong Correlations
```{r Strong Correlations}
# Likes received against web likes, with both axes limited to the range
# from 0 to the 95th percentile of each variable, plus a linear fit.
ggplot(pf, aes(x = www_likes_received, y = likes_received)) +
  geom_point(
    position = position_jitter(height = 0),
    shape = 5,
    alpha = 0.1,
    colour = "darkgreen"
  ) +
  xlim(0, quantile(pf$www_likes_received, probs = 0.95)) +
  ylim(0, quantile(pf$likes_received, probs = 0.95)) +
  geom_smooth(method = "lm", colour = "darkorange") +
  theme_bw()
```
What's the correlation between the two variables? Include the top 5% of values for the variable in the calculation and round to 3 decimal places.

```
Pearson's product-moment correlation

data: www_likes_received and likes_received
t = 937.1, df = 99001, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.9473553 0.9486176
sample estimates:
cor
0.9479902
```

```{r Correlation Calcuation}
# Pearson correlation between web likes and total likes over all users.
with(
  pf,
  cor.test(x = www_likes_received, y = likes_received, method = "pearson")
)
```
***
### More Caution with Correlation
```{r More Caution With Correlation}
#install.packages('alr3')
library(alr3)
```
```{r, echo=FALSE}
# Load the Mitchell data set and open its help page.
data("Mitchell")
help("Mitchell")
```
Create your plot!
```{r Temp vs Month1}
# Temperature against month for the Mitchell data, jittered horizontally.
ggplot(Mitchell, aes(x = Month, y = Temp)) +
  geom_point(
    position = position_jitter(height = 0),
    shape = 2,
    alpha = 0.5,
    colour = "darkblue"
  ) +
  theme_bw()
```
***
### Noisy Scatterplots
a. Take a guess for the correlation coefficient for the scatterplot.
0.01
b. What is the actual correlation of the two variables?
(Round to the thousandths place)

```
Pearson's product-moment correlation

data: Temp and Month
t = 0.81816, df = 202, p-value = 0.4142
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.08053637 0.19331562
sample estimates:
cor
0.05747063
```

```{r Noisy Scatterplots}
# Correlation between month and temperature (cor.test() default method).
with(
  Mitchell,
  cor.test(x = Month, y = Temp)
)
```
```{r Temp vs Month2}
# Temperature against month with an x-axis break every 12 months, so the
# yearly pattern lines up with the tick marks.
# Month is treated as numeric elsewhere in this file (correlation test,
# modulo 12), so the breaks go on a continuous scale; scale_x_discrete()
# is meant for categorical data and does not reliably apply numeric breaks.
ggplot(Mitchell, aes(x = Month, y = Temp)) +
  geom_point(
    position = position_jitter(height = 0),
    shape = 6,
    alpha = 0.5,
    colour = "forestgreen"
  ) +
  theme_bw() +
  scale_x_continuous(breaks = seq(0, 203, 12))
```
***
### Making Sense of Data
```{r Making Sense of Data}
# Overlay all years by plotting temperature against month modulo 12, with
# a smoother drawn without its confidence band.
ggplot(Mitchell, aes(x = Month %% 12, y = Temp)) +
  geom_point(
    position = position_jitter(height = 0),
    shape = 10,
    alpha = 0.5,
    colour = "steelblue"
  ) +
  theme_bw() +
  geom_smooth(se = FALSE)
# Write the plot built above to disk.
ggsave("mitchell01.jpg", plot = last_plot())
```
***
### Understanding Noise: Age to Age Months
```{r Understanding Noise: Age to Age Months}
# Fractional age: whole years plus the share of a year given by
# 1 - dob_month / 12.
pf[["age_with_months"]] <- with(pf, age + (1 - dob_month / 12))
```
***
### Age with Months Means
```{r Age with Months Means}
# Group users by their fractional age for the summaries that follow.
age_with_months_groups <- pf %>% group_by(age_with_months)
```
Programming Assignment
```{r Programming Assignment}
# Mean, median and user count of friend_count for each fractional age,
# ordered by age_with_months.
pf.fc_by_age_months <- age_with_months_groups %>%
  summarise(
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ) %>%
  arrange(age_with_months)
head(pf.fc_by_age_months)
```
***
### Noise in Conditional Means
```{r Noise in Conditional Means}
# Mean friend count by fractional age, with the x axis limited to 13-71.
ggplot(
  pf.fc_by_age_months,
  aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line(colour = "darkgreen") +
  xlim(13, 71) +
  theme_bw()
```
```{r Noise in Conditional Means2}
# Same line plot, but the rows are filtered to fractional ages below 71
# before plotting instead of relying on the axis limits alone.
ggplot(
  subset(pf.fc_by_age_months, subset = age_with_months < 71),
  aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line(colour = "darkred") +
  xlim(13, 71) +
  theme_bw()
```
***
### Smoothing Conditional Means
```{r Smoothing Conditional Means}
# Three views of mean friend count for users younger than 71, stacked in
# one column: p1 uses age in whole years, p2 uses the fractional
# age_with_months, and p3 uses age rounded to the nearest multiple of five.
# p1 and p2 add a smoother on top of the conditional-mean line.
p1 <- ggplot(subset(pf.fc_by_age, age < 71), aes(x = age, y = fc_mean)) +
  geom_line(color = "darkred") +
  geom_smooth(color = "firebrick1")
p2 <- ggplot(
  subset(pf.fc_by_age_months, age_with_months < 71),
  aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line(color = "darkblue") +
  geom_smooth(color = "steelblue")
p3 <- ggplot(
  subset(pf, age < 71),
  aes(x = round(age / 5) * 5, y = friend_count)
) +
  geom_line(stat = "summary", fun.y = mean, color = "darkgreen")

# Build the stacked layout once so the same object is drawn and saved.
# ggsave() without a plot argument falls back to last_plot(), which only
# tracks individual ggplot objects, so it would write a single panel and
# not the arranged figure.
stacked <- arrangeGrob(p2, p1, p3, ncol = 1)
grid::grid.newpage()
grid::grid.draw(stacked)
ggsave("pf04.jpg", plot = stacked)
```
title: "Lesson 4"
runtime: shiny
output:
html_document:
theme: united
highlight: tango
---
Lesson 4
========================================================
***
### Working directory and libraries
```{r setup}
# Make the project folder the working directory for the chunks that follow.
# knitr restores the previous directory after a chunk that calls setwd(), so
# a bare setwd() here would not help later chunks resolve relative paths;
# the root.dir knit option is the persistent equivalent.
# NOTE(review): the path is machine-specific -- adjust when knitting elsewhere.
knitr::opts_knit$set(
  root.dir = '/Users/olgabelitskaya/version-control/reflections-ud651'
)
```
```{r Libraries1}
library(ggplot2)
library(lubridate)
library(ggthemes)
```
```{r Libraries3}
library(gridExtra)
library(plyr)
```
```{r Libraries4}
library(scales)
library(reshape2)
```
```{r Libraries5}
library(dplyr)
library(tidyr)
```
```{r Libraries6}
library(xlsx)
```
## Useful links
```{r Links}
# http://docs.ggplot2.org/current/
# http://docs.ggplot2.org/current/coord_trans.html
# http://sape.inf.usi.ch/quick-reference/ggplot2/themes
# http://personality-project.org/r/html/corr.test.html
# https://rpubs.com/hadley/ggplot2-layers
# http://rmarkdown.rstudio.com/articles_integration.html
```
```{r Pseudo-Facebook User Data}
# Read the tab-separated pseudo-Facebook data set and list its columns.
pf <- read.csv(file = "pseudo_facebook.tsv", sep = "\t")
names(pf)
```
***
### Scatterplots
```{r Scatterplots}
# Quick scatterplot of friend count against age, with dark blue points
# added on top of the qplot() layer.
qplot(x = age, y = friend_count, data = pf) +
  geom_point(colour = "darkblue")
```
***
### ggplot Syntax
```{r ggplot Syntax}
# The same scatterplot written with ggplot(); x is limited to ages 13-90.
ggplot(pf, aes(x = age, y = friend_count)) +
  geom_point(colour = "darkblue") +
  xlim(13, 90)
```
***
### Overplotting
```{r Overplotting}
# Jittered, mostly transparent points to make dense regions readable.
ggplot(pf, aes(x = age, y = friend_count)) +
  geom_jitter(shape = 7, alpha = 0.1, colour = "darkred") +
  xlim(13, 90)
```
***
### Coord_trans()
```{r Coord_trans()1}
# Diamond price against carat with both axes transformed by log10.
ggplot(diamonds, aes(x = carat, y = price)) +
  geom_point(shape = 5, alpha = 0.1, colour = "purple") +
  coord_trans(x = "log10", y = "log10")
```
#### Look up the documentation and add a layer to the plot that transforms friend_count using the square root function. Create your plot!
```{r Coord_trans()2}
# Friend count by age on a square-root y axis. Jitter is horizontal only
# (height = 0), so the plotted friend counts are not perturbed.
ggplot(pf, aes(x = age, y = friend_count)) +
  geom_point(
    position = position_jitter(height = 0),
    shape = 2,
    alpha = 0.1,
    colour = "navy"
  ) +
  coord_trans(y = "sqrt") +
  theme_bw()
```
***
### Alpha and Jitter
```{r Alpha and Jitter}
# Friendships initiated by age (13-90) on a square-root y axis, using
# horizontal-only jitter and very transparent points.
ggplot(pf, aes(x = age, y = friendships_initiated)) +
  geom_point(
    position = position_jitter(height = 0),
    shape = 10,
    alpha = 0.05,
    colour = "darkgreen"
  ) +
  xlim(13, 90) +
  coord_trans(y = "sqrt") +
  theme_bw()
```
***
### Conditional Means
```{r Conditional Means}
# Per-age summary of friend_count: mean, median and number of users,
# ordered by age. age_groups is kept as its own object, as before.
age_groups <- pf %>% group_by(age)
pf.fc_by_age <- age_groups %>%
  summarise(
    fc_mean = mean(friend_count),
    fc_median = median(friend_count),
    n = n()
  ) %>%
  arrange(age)
head(pf.fc_by_age)
```
Create your plot!
```{r Conditional Means Plot}
# Line plot of mean friend count for each age between 13 and 90.
ggplot(pf.fc_by_age, aes(x = age, y = fc_mean)) +
  geom_line(colour = "forestgreen") +
  xlim(13, 90) +
  theme_grey()
```
***
### Overlaying Summaries with Raw Data
```{r Overlaying Summaries with Raw Data}
# Jittered raw friend counts overlaid with per-age summary lines: the mean
# (solid) and the 90th, 10th and 50th percentiles (linetype 5). The view is
# zoomed to ages 13-70 and counts 0-1000 without dropping data.
# NOTE(review): recent ggplot2 releases deprecate fun.y in favour of fun --
# confirm against the installed version before switching.
ggplot(pf, aes(x = age, y = friend_count)) +
  geom_point(
    position = position_jitter(height = 0),
    shape = 2,
    alpha = 0.1,
    colour = "orange"
  ) +
  geom_line(stat = "summary", fun.y = mean, colour = "green") +
  geom_line(
    stat = "summary", fun.y = quantile, fun.args = list(probs = 0.9),
    colour = "darkgreen", linetype = 5
  ) +
  geom_line(
    stat = "summary", fun.y = quantile, fun.args = list(probs = 0.1),
    colour = "darkblue", linetype = 5
  ) +
  geom_line(
    stat = "summary", fun.y = quantile, fun.args = list(probs = 0.5),
    colour = "blue", linetype = 5
  ) +
  coord_cartesian(xlim = c(13, 70), ylim = c(0, 1000)) +
  theme_bw()
# Write the plot built above to disk.
ggsave("pf01.jpg", plot = last_plot())
```
***
### Moira: Histogram Summary and Scatterplot
See the Instructor Notes of this video to download Moira's paper on perceived audience size and to see the final plot.
***
### Correlation
```{r Correlation}
# Open the help page, then test the Pearson correlation between age and
# friend count over all users.
help("cor.test.formula")
cor.test(x = pf$age, y = pf$friend_count, method = "pearson")
```
What's the correlation between age and friend count? Round to three decimal places.

```
Pearson's product-moment correlation

data: pf$age and pf$friend_count
t = -8.6268, df = 99001, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.03363072 -0.02118189
sample estimates:
cor
-0.02740737
```

***
```{r Correlation2}
# Same Pearson test, evaluated inside pf so columns can be named directly.
with(
  pf,
  cor.test(x = age, y = friend_count, method = "pearson")
)
```
### Correlation on Subsets
```{r Correlation on Subsets}
# Correlation of age and friend count for users aged 70 or younger,
# using cor.test()'s default method.
with(
  subset(pf, subset = age <= 70),
  cor.test(x = age, y = friend_count)
)
```
***
### Correlation Methods
```{r Correlation on Subsets2}
# Spearman rank correlation for the same subset (age 70 or younger).
with(
  subset(pf, subset = age <= 70),
  cor.test(x = age, y = friend_count, method = "spearman")
)
```
***
## Create Scatterplots
```{r Create Scatterplots1}
# Likes received overall against likes received on the web, with summary
# lines for the mean (solid) and the 90th, 10th and 50th percentiles
# (linetype 5). Both axes are zoomed to 0-1000.
# NOTE(review): recent ggplot2 releases deprecate fun.y in favour of fun --
# confirm against the installed version before switching.
ggplot(pf, aes(x = www_likes_received, y = likes_received)) +
  geom_point(
    position = position_jitter(height = 0),
    shape = 2,
    alpha = 0.1,
    colour = "blue"
  ) +
  geom_line(stat = "summary", fun.y = mean, colour = "darkorchid1") +
  geom_line(
    stat = "summary", fun.y = quantile, fun.args = list(probs = 0.9),
    colour = "red", linetype = 5
  ) +
  geom_line(
    stat = "summary", fun.y = quantile, fun.args = list(probs = 0.1),
    colour = "darkred", linetype = 5
  ) +
  geom_line(
    stat = "summary", fun.y = quantile, fun.args = list(probs = 0.5),
    colour = "orange", linetype = 5
  ) +
  coord_cartesian(xlim = c(0, 1000), ylim = c(0, 1000)) +
  theme_bw()
# Write the plot built above to disk.
ggsave("pf02.jpg", plot = last_plot())
```
```{r Create Scatterplots2}
# Closer view (0-500 on both axes) of likes received against web likes,
# with the conditional mean drawn as a line.
ggplot(pf, aes(x = www_likes_received, y = likes_received)) +
  geom_point(
    position = position_jitter(height = 0),
    shape = 2,
    alpha = 0.1,
    colour = "blue"
  ) +
  coord_cartesian(xlim = c(0, 500), ylim = c(0, 500)) +
  geom_line(stat = "summary", fun.y = mean, colour = "darkorchid1") +
  theme_bw()
# Write the plot built above to disk.
ggsave("pf03.jpg", plot = last_plot())
```
***
### Strong Correlations
```{r Strong Correlations}
# Likes received against web likes, with both axes limited to the range
# from 0 to the 95th percentile of each variable, plus a linear fit.
ggplot(pf, aes(x = www_likes_received, y = likes_received)) +
  geom_point(
    position = position_jitter(height = 0),
    shape = 5,
    alpha = 0.1,
    colour = "darkgreen"
  ) +
  xlim(0, quantile(pf$www_likes_received, probs = 0.95)) +
  ylim(0, quantile(pf$likes_received, probs = 0.95)) +
  geom_smooth(method = "lm", colour = "darkorange") +
  theme_bw()
```
What's the correlation between the two variables? Include the top 5% of values for the variable in the calculation and round to 3 decimal places.

```
Pearson's product-moment correlation

data: www_likes_received and likes_received
t = 937.1, df = 99001, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.9473553 0.9486176
sample estimates:
cor
0.9479902
```

```{r Correlation Calcuation}
# Pearson correlation between web likes and total likes over all users.
with(
  pf,
  cor.test(x = www_likes_received, y = likes_received, method = "pearson")
)
```
***
### More Caution with Correlation
```{r More Caution With Correlation}
#install.packages('alr3')
library(alr3)
```
```{r, echo=FALSE}
# Load the Mitchell data set and open its help page.
data("Mitchell")
help("Mitchell")
```
Create your plot!
```{r Temp vs Month1}
# Temperature against month for the Mitchell data, jittered horizontally.
ggplot(Mitchell, aes(x = Month, y = Temp)) +
  geom_point(
    position = position_jitter(height = 0),
    shape = 2,
    alpha = 0.5,
    colour = "darkblue"
  ) +
  theme_bw()
```
***
### Noisy Scatterplots
a. Take a guess for the correlation coefficient for the scatterplot.
0.01
b. What is the actual correlation of the two variables?
(Round to the thousandths place)

```
Pearson's product-moment correlation

data: Temp and Month
t = 0.81816, df = 202, p-value = 0.4142
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.08053637 0.19331562
sample estimates:
cor
0.05747063
```

```{r Noisy Scatterplots}
# Correlation between month and temperature (cor.test() default method).
with(
  Mitchell,
  cor.test(x = Month, y = Temp)
)
```
```{r Temp vs Month2}
# Temperature against month with an x-axis break every 12 months, so the
# yearly pattern lines up with the tick marks.
# Month is treated as numeric elsewhere in this file (correlation test,
# modulo 12), so the breaks go on a continuous scale; scale_x_discrete()
# is meant for categorical data and does not reliably apply numeric breaks.
ggplot(Mitchell, aes(x = Month, y = Temp)) +
  geom_point(
    position = position_jitter(height = 0),
    shape = 6,
    alpha = 0.5,
    colour = "forestgreen"
  ) +
  theme_bw() +
  scale_x_continuous(breaks = seq(0, 203, 12))
```
***
### Making Sense of Data
```{r Making Sense of Data}
# Overlay all years by plotting temperature against month modulo 12, with
# a smoother drawn without its confidence band.
ggplot(Mitchell, aes(x = Month %% 12, y = Temp)) +
  geom_point(
    position = position_jitter(height = 0),
    shape = 10,
    alpha = 0.5,
    colour = "steelblue"
  ) +
  theme_bw() +
  geom_smooth(se = FALSE)
# Write the plot built above to disk.
ggsave("mitchell01.jpg", plot = last_plot())
```
***
### Understanding Noise: Age to Age Months
```{r Understanding Noise: Age to Age Months}
# Fractional age: whole years plus the share of a year given by
# 1 - dob_month / 12.
pf[["age_with_months"]] <- with(pf, age + (1 - dob_month / 12))
```
***
### Age with Months Means
```{r Age with Months Means}
# Group users by their fractional age for the summaries that follow.
age_with_months_groups <- pf %>% group_by(age_with_months)
```
Programming Assignment
```{r Programming Assignment}
# Mean, median and user count of friend_count for each fractional age,
# ordered by age_with_months.
pf.fc_by_age_months <- age_with_months_groups %>%
  summarise(
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ) %>%
  arrange(age_with_months)
head(pf.fc_by_age_months)
```
***
### Noise in Conditional Means
```{r Noise in Conditional Means}
# Mean friend count by fractional age, with the x axis limited to 13-71.
ggplot(
  pf.fc_by_age_months,
  aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line(colour = "darkgreen") +
  xlim(13, 71) +
  theme_bw()
```
```{r Noise in Conditional Means2}
# Same line plot, but the rows are filtered to fractional ages below 71
# before plotting instead of relying on the axis limits alone.
ggplot(
  subset(pf.fc_by_age_months, subset = age_with_months < 71),
  aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line(colour = "darkred") +
  xlim(13, 71) +
  theme_bw()
```
***
### Smoothing Conditional Means
```{r Smoothing Conditional Means}
# Three views of mean friend count for users younger than 71, stacked in
# one column: p1 uses age in whole years, p2 uses the fractional
# age_with_months, and p3 uses age rounded to the nearest multiple of five.
# p1 and p2 add a smoother on top of the conditional-mean line.
p1 <- ggplot(subset(pf.fc_by_age, age < 71), aes(x = age, y = fc_mean)) +
  geom_line(color = "darkred") +
  geom_smooth(color = "firebrick1")
p2 <- ggplot(
  subset(pf.fc_by_age_months, age_with_months < 71),
  aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line(color = "darkblue") +
  geom_smooth(color = "steelblue")
p3 <- ggplot(
  subset(pf, age < 71),
  aes(x = round(age / 5) * 5, y = friend_count)
) +
  geom_line(stat = "summary", fun.y = mean, color = "darkgreen")

# Build the stacked layout once so the same object is drawn and saved.
# ggsave() without a plot argument falls back to last_plot(), which only
# tracks individual ggplot objects, so it would write a single panel and
# not the arranged figure.
stacked <- arrangeGrob(p2, p1, p3, ncol = 1)
grid::grid.newpage()
grid::grid.draw(stacked)
ggsave("pf04.jpg", plot = stacked)
```
Комментариев нет:
Отправить комментарий