Sunday, August 21, 2016
Data Analyst ND Project 4
Someone, please help the Udacity.com team find my project :))))
They don't seem to be able to find it on their own :)))
I have sent it to them again, pointing to a specially organized folder and to the link online:
https://github.com/OlgaBelitskaya/data-analyst-nd002/tree/master/Data_Analyst_ND_Project4
I wonder how many attempts it will take them to find it.
Monday, August 15, 2016
DATA ANALYSIS WITH R; lesson6.Rmd
---
title: "Lesson 6"
runtime: shiny
output: html_document
---
```{r setup 1, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
Lesson 6
========================================================
```{r setup 2}
setwd('/Users/olgabelitskaya/version-control/reflections-ud651')
```
## Useful links
```{r Links}
# http://www.theanalysisfactor.com/interpreting-regression-coefficients/
# http://docs.ggplot2.org/current/coord_trans.html
# https://www.r-bloggers.com/interpreting-regression-coefficient-in-r/?utm_source=feedburner&utm_medium=email&utm_campaign=Feed%3A+RBloggers+%28R+bloggers%29
# http://personality-project.org/r/html/corr.test.html
# https://rpubs.com/hadley/ggplot2-layers
# http://rmarkdown.rstudio.com/articles_integration.html
# http://blog.yhat.com/posts/r-lm-summary.html
# http://www.cookbook-r.com/Graphs/Colors_(ggplot2)/
# http://data.princeton.edu/R/linearModels.html
# https://www.datacamp.com/community/tutorials/importing-data-r-part-two#gs.K2RGNF0
# https://cran.r-project.org/doc/manuals/r-devel/R-data.html
```
```{r Libraries 1}
library(devtools)
library(markdown)
library(ggplot2)
library(xtable)
library(ggthemes)
library(xlsx)
library(RColorBrewer)
library(bitops)
library(RCurl)
```
```{r Libraries 2}
library(grid)
library(gridExtra)
library(scales)
library(reshape)
library(plyr)
library(dplyr)
library(tidyr)
library(GGally)
library(memisc)
```
### Scatterplot Review
```{r Scatterplot Review 1}
data(diamonds)
summary(diamonds)
centrobank <- tbl_df(read.xlsx("c_bank.xlsx", sheetName="Data", header=TRUE))
head(centrobank)
```
```{r Scatterplot Review 2}
ggplot(data = diamonds, aes(x = carat, y = price, color = price), xlim = c(0, quantile(diamonds$carat,0.99)), ylim = c(0, quantile(diamonds$price, 0.99))) + geom_point(shape=12, alpha=0.7, size = 2) + scale_colour_gradientn(colours=rainbow(15)) + theme_bw()
ggsave('0601.jpg', width = 16, height = 8)
```
```{r Scatterplot Review 3}
ggplot(data = centrobank, aes(x = date)) + geom_point(aes(y = gold), color = 'darkred', shape = 10, alpha = 0.7, size = 3) + geom_smooth(aes(y = gold), color = 'red') + geom_point(aes(y = platinum), color = 'darkblue', shape = 12, alpha = 0.7, size = 3) + geom_smooth(aes(y = platinum), color = 'blue') + theme_bw() + ylab('platinum and gold') + ggtitle('Centrobank 2016')
ggsave('0602.jpg', width = 16, height = 8)
```
***
### ggpairs Function
#### install.packages('memisc')
```{r ggpairs Function}
# sample 10,000 diamonds from the data set
set.seed(20022012)
diamond_samp <- diamonds[sample(1:length(diamonds$price), 10000), ]
ggpairs(diamond_samp, lower = list(continuous = wrap("points", shape = I('.'))), upper = list(combo = wrap("box", outlier.shape = I('.'))))
ggsave('0603.jpg', width = 24, height = 16)
```
### The Demand of Diamonds
```{r The Demand of Diamonds}
plot1 <- qplot(x = price, data = diamonds, binwidth = 200, fill = ..count..) + ggtitle('Price') + scale_fill_continuous(low="darkblue", high="red")
plot2 <- qplot(x=price, data = diamonds, binwidth = 0.02, fill = ..count..) + scale_x_log10() + ggtitle('Price (log10)') + scale_fill_continuous(low="darkblue", high="red")
g1 <- grid.arrange(plot1, plot2, ncol = 2)
ggsave("0604.jpg", g1, width = 12, height = 8)
```
***
### Connecting Demand and Price Distributions
Notes:
***
### Scatterplot Transformation
```{r Scatterplot Transformation}
qplot (carat, price, data = diamonds, color = price) + scale_y_continuous(trans = log10_trans()) + ggtitle('Price (log10) by Carat') + scale_colour_gradientn(colours=rainbow(4))
ggsave("0605.jpg",width = 12, height = 8)
```
### Create a new function to transform the carat variable
```{r cuberoot transformation}
cuberoot_trans = function() trans_new('cuberoot', transform = function(x) x^(1/3), inverse = function(x) x^3)
```
#### Use the cuberoot_trans function
```{r Use cuberoot_trans}
ggplot(aes(carat, price), data = diamonds) +
geom_point(alpha = 0.5, size = 0.75, position = 'jitter', shape = 14, color = 'darkgreen') +
scale_x_continuous(trans = cuberoot_trans(), limits = c(0.2, 3),
breaks = c(0.2, 0.5, 1, 2, 3)) +
scale_y_continuous(trans = log10_trans(), limits = c(350, 15000),
breaks = c(350, 1000, 5000, 10000, 15000)) +
ggtitle('Price (log10) by Cube-Root of Carat')
```
***
### Overplotting Revisited
```{r Sort and Head Tables}
head(sort(table(diamonds$price), decreasing = T))
```
```{r Overplotting Revisited}
ggplot(aes(carat, price, color=price), data = diamonds) +
geom_point(alpha = 0.5, size = 0.5, position = 'jitter', shape = 7) +
scale_x_continuous(trans = cuberoot_trans(), limits = c(0.2, 3),
breaks = c(0.2, 0.5, 1, 2, 3)) +
scale_y_continuous(trans = log10_trans(), limits = c(350, 15000),
breaks = c(350, 1000, 5000, 10000, 15000)) +
scale_colour_continuous(low="firebrick1", high="black") +
ggtitle('Price (log10) by Cube-Root of Carat')
```
***
### Price vs. Carat and Clarity
Alter the code below.
```{r Price vs. Carat and Clarity 1}
display.brewer.all()
```
```{r Price vs. Carat and Clarity 2}
ggplot(aes(x = carat, y = price, color=clarity), data = diamonds) +
geom_point(alpha = 0.5, size = 1, position = 'jitter') +
scale_color_brewer(type = 'div',
guide = guide_legend(title = 'Clarity', reverse = T,
override.aes = list(alpha = 1, size = 2))) +
scale_x_continuous(trans = cuberoot_trans(), limits = c(0.2, 3),
breaks = c(0.2, 0.5, 1, 2, 3)) +
scale_y_continuous(trans = log10_trans(), limits = c(350, 15000),
breaks = c(350, 1000, 5000, 10000, 15000)) +
ggtitle('Price (log10) by Cube-Root of Carat and Clarity')
```
***
### Price vs. Carat and Cut
Alter the code below.
```{r Price vs. Carat and Cut}
ggplot(aes(x = carat, y = price, color = cut), data = diamonds) +
geom_point(alpha = 0.5, size = 1, position = 'jitter') +
scale_x_continuous(trans = cuberoot_trans(), limits = c(0.2, 3), breaks = c(0.2, 0.5, 1, 2, 3)) +
scale_y_continuous(trans = log10_trans(), limits = c(350, 15000), breaks = c(350, 1000, 5000, 10000, 15000)) +
scale_color_brewer(palette= 'Spectral', type = 'div', guide = guide_legend(title = 'Cut', reverse = T, override.aes = list(alpha = 1, size = 2))) +
ggtitle('Price (log10) by Cube-Root of Carat and Cut') + theme_bw()
```
***
### Price vs. Carat and Color
Alter the code below.
```{r Price vs. Carat and Color}
ggplot(aes(x = carat, y = price, color = color), data = diamonds) +
geom_point(alpha = 0.5, size = 0.5, position = 'jitter') +
scale_color_brewer(palette = 'Set1', type = 'div', guide = guide_legend(title = 'Color', reverse = F, override.aes = list(alpha = 1, size = 2))) +
scale_x_continuous(trans = cuberoot_trans(), limits = c(0.2, 3), breaks = c(0.2, 0.5, 1, 2, 3)) +
scale_y_continuous(trans = log10_trans(), limits = c(350, 15000), breaks = c(350, 1000, 5000, 10000, 15000)) +
ggtitle('Price (log10) by Cube-Root of Carat and Color') + theme_bw()
```
***
### Building the Linear Model
```{r Building the Linear Model}
m1 <- lm(I(log(price)) ~ I(carat^(1/3)), data = diamonds)
m2 <- update(m1, ~ . + carat)
m3 <- update(m2, ~ . + cut)
m4 <- update(m3, ~ . + color)
m5 <- update(m4, ~ . + clarity)
mtable(m1, m2, m3, m4, m5)
```
Notice how adding cut to our model does not help explain much of the variance
in the price of diamonds. This fits with our exploration earlier.
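As a quick numeric check (a minimal sketch, assuming m2 and m3 from the chunk above are still in the workspace), the adjusted R-squared barely changes when cut enters the model:
```{r Adjusted R-squared check}
# Compare adjusted R-squared before and after adding cut;
# the increase is expected to be small.
summary(m2)$adj.r.squared
summary(m3)$adj.r.squared
summary(m3)$adj.r.squared - summary(m2)$adj.r.squared
```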
### A Bigger, Better Data Set
Notes:
```{r A Bigger, Better Data Set}
load('BigDiamonds.Rda')
```
The code used to obtain the data is available here:
https://github.com/solomonm/diamonds-data
## Building a Model Using the Big Diamonds Data Set
```{r Building a Model Using the Big Diamonds Data Set 1}
diamondsbig$logprice = log(diamondsbig$price)
m1 <- lm(logprice ~ I(carat^(1/3)), data = diamondsbig[diamondsbig$price < 10000 & diamondsbig$cert == "GIA",])
m2 <- update(m1, ~ . + carat)
m3 <- update(m2, ~ . + cut)
m4 <- update(m3, ~ . + color)
m5 <- update(m4, ~ . + clarity)
mtable(m1, m2, m3, m4, m5)
```
```{r Building a Model Using the Big Diamonds Data Set 2}
diamondsBigSample <- diamond_samp
diamondsBigSample$logprice = log(diamondsBigSample$price)
m1 <- lm(logprice ~ I(carat^(1/3)), data = diamondsBigSample)
m2 <- update(m1, ~ . + carat)
m3 <- update(m2, ~ . + cut)
m4 <- update(m3, ~ . + color)
m5 <- update(m4, ~ . + clarity)
models <- mtable(m1, m2, m3, m4, m5)
models
```
***
## Predictions
Example Diamond from BlueNile:
Round 1.00 Very Good I VS1 $5,601
```{r Prediction 1}
# Be sure you've loaded the library memisc and have m5 saved as an object in your workspace.
thisDiamond = data.frame(carat = 1.00, cut = "Very Good",
color = "I", clarity="VS1")
modelEstimate = predict(m5, newdata = thisDiamond,
interval="prediction", level = .95)
exp(modelEstimate)
```
```{r Prediction 2}
dat = data.frame(m4$model, m4$residuals)
with(dat, sd(m4.residuals))
with(subset(dat, carat > .9 & carat < 1.1), sd(m4.residuals))
dat$resid <- as.numeric(dat$m4.residuals)
ggplot(aes(y = resid, x = round(carat, 2)), data = dat) + geom_line(stat = "summary", fun.y = sd,color = 'darkgreen') + theme_bw()
```
Evaluate how well the model predicts the BlueNile diamond's price. Think about the fitted point estimate as well as the 95% CI.
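One way to frame that evaluation (a minimal sketch, assuming modelEstimate from the prediction chunk above is still in the workspace; $5,601 is the BlueNile list price quoted above):
```{r Prediction 3}
# Back-transform the log-scale prediction and compare it with the quoted price.
listed_price <- 5601
estimate <- exp(modelEstimate)
estimate[, "fit"] - listed_price  # error of the fitted point estimate
listed_price >= estimate[, "lwr"] & listed_price <= estimate[, "upr"]  # inside the 95% prediction interval?
```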
Sunday, August 14, 2016
DATA ANALYSIS WITH R; problemset5.Rmd
---
title: "Problem Set 5"
runtime: shiny
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
Problem Set 5
========================================================
### Working directory and libraries
```{r setup 2}
setwd('/Users/olgabelitskaya/version-control/reflections-ud651')
```
```{r Libraries 1}
library(ggplot2)
library(lubridate)
```
```{r Libraries 2}
library(gridExtra)
library(plyr)
```
```{r Libraries 3}
library(scales)
library(reshape2)
```
```{r Libraries 4}
library(dplyr)
library(tidyr)
```
```{r Libraries 5}
library(xlsx)
library(ggthemes)
```
## Useful links
```{r Links}
# http://docs.ggplot2.org/current/
# http://docs.ggplot2.org/current/coord_trans.html
# http://sape.inf.usi.ch/quick-reference/ggplot2/themes
# http://personality-project.org/r/html/corr.test.html
# https://rpubs.com/hadley/ggplot2-layers
# http://rmarkdown.rstudio.com/articles_integration.html
# https://cran.r-project.org/web/packages/ggthemes/vignettes/ggthemes.html
#http://www.cookbook-r.com/Graphs/Colors_(ggplot2)/
```
## 5.1
#### Create a histogram of diamond prices. Facet the histogram by diamond color and use cut to color the histogram bars.
#### The plot should look something like this: http://i.imgur.com/b5xyrOu.jpg.
#### Note: In the link, a color palette of type 'qual' was used to color the histogram using scale_fill_brewer(type = 'qual')
```{r 5.1.1}
p1 <- ggplot(diamonds, aes(x = price, fill = cut)) + geom_histogram() + facet_wrap(~ color) + scale_fill_brewer(type = 'qual', palette = 'Spectral') + xlab("Price") + ylab("Count") + theme_gray()
```
```{r 5.1.2}
p2 <- ggplot(diamonds, aes(x = price, fill = cut)) + geom_histogram() + facet_wrap(~ color) + scale_x_log10(expression(paste(Log[10], " of Price"))) + ylab("Count") + scale_fill_brewer(type = 'qual', palette = 'Spectral') + theme_gray()
g1 <- grid.arrange(p1, p2, ncol=1)
ggsave("05s01.jpg", g1, width = 8, height = 12)
```
## 5.2
#### Create a scatterplot of diamond price vs. table and color the points by the cut of the diamond.
#### The plot should look something like this: http://i.imgur.com/rQF9jQr.jpg.
#### Note: In the link, a color palette of type 'qual' was used to color the scatterplot using scale_color_brewer(type = 'qual')
```{r 5.2}
ggplot(diamonds, aes(x = table, y = price, color = cut)) + geom_jitter(size = 3, alpha=0.8, shape = 17) + scale_x_continuous(breaks = seq(42, 80, 1), limits = c(42, 80)) + scale_color_brewer(type = 'seq', palette = 'Set1') + theme_bw()
ggsave("05s02.jpg", width = 12, height = 8)
```
## 5.3
#### What is the typical table range for the majority of diamonds of ideal cut?
## (54; 57)
#### What is the typical table range for the majority of diamonds of premium cut?
## (58; 60)
#### Use the graph that you created from the previous exercise to see the answer. You do not need to run summaries.
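#### The answers can also be sanity-checked numerically (a minimal sketch, not required by the exercise), using the interquartile range of table within each cut.
```{r 5.3 check}
# Middle 50% of the table values for ideal and premium cuts (a rough "typical" range).
with(subset(diamonds, cut == 'Ideal'), quantile(table, c(0.25, 0.75)))
with(subset(diamonds, cut == 'Premium'), quantile(table, c(0.25, 0.75)))
```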
## 5.4
#### Create a scatterplot of diamond price vs. volume (x * y * z) and color the points by the clarity of diamonds. Use scale on the y-axis to take the log10 of price. You should also omit the top 1% of diamond volumes from the plot.
#### Note: Volume is a very rough approximation of a diamond's actual volume.
#### The plot should look something like this: http://i.imgur.com/excUpea.jpg.
#### Note: In the link, a color palette of type 'div' was used to color the scatterplot using scale_color_brewer(type = 'div').
```{r 5.4.1}
diamonds <- diamonds %>%
mutate(volume = x * y *z)
p3 <-ggplot(subset(diamonds, volume <= quantile(volume, 0.99) & volume > 0 ), aes(x = volume, y = price, color = clarity)) + geom_jitter(size = 3, alpha=0.7, shape = 18) + scale_color_brewer(type = 'div', palette = 'Spectral') + theme_solarized()
p4 <-ggplot(subset(diamonds, volume <= quantile(volume, 0.99) & volume > 0 ), aes(x = volume, y = price, color = clarity)) + scale_y_log10() + geom_jitter(size = 3, alpha=0.7, shape = 18) + scale_color_brewer(type = 'div', palette = 'Spectral') + ylab("log10 of price")+ theme_solarized()
g2 <- grid.arrange(p3, p4, ncol=2)
ggsave("05s03.jpg", g2, width = 16, height = 8)
```
```{r 5.4.2}
diamonds <- diamonds %>%
mutate(volume = x * y *z)
p5 <-ggplot(subset(diamonds, volume <= quantile(volume, 0.99) & volume > 0 ), aes(x = volume, y = price, color = color)) + geom_jitter(size = 3, alpha=0.7, shape = 18) + scale_color_brewer(type = 'div', palette = 'Set1') + theme_solarized()
p6 <-ggplot(subset(diamonds, volume <= quantile(volume, 0.99) & volume > 0 ), aes(x = volume, y = price, color = color)) + scale_y_log10() + geom_jitter(size = 3, alpha=0.7, shape = 18) + scale_color_brewer(type = 'div', palette = 'Set1') + ylab("log10 of price")+ theme_solarized()
g3 <- grid.arrange(p5, p6, ncol=1)
ggsave("05s04.jpg", g3, width = 8, height = 16)
```
## 5.5
#### Many interesting variables are derived from two or more others. For example, we might wonder how much of a person's network on a service like Facebook the user actively initiated. Two users with the same degree (or number of friends) might be very different if one initiated most of those connections on the service, while the other initiated very few. So it could be useful to consider this proportion of existing friendships that the user initiated. This might be a good predictor of how active a user is compared with their peers, or other traits, such as personality (i.e., is this person an extrovert?).
#### Your task is to create a new variable called 'prop_initiated' in the Pseudo-Facebook data set. The variable should contain the proportion of friendships that the user initiated.
```{r 5.5}
pf <- read.delim('pseudo_facebook.tsv')
pf$prop_initiated <- ifelse(pf$friend_count > 0, pf$friendships_initiated/pf$friend_count, 0)
# variant 2
# pf <- pf %>%
# mutate(prop_initiated = ifelse(friend_count > 0, friendships_initiated/friend_count, 0))
```
## 5.6
#### Create a line graph of the median proportion of friendships initiated ('prop_initiated') vs. tenure and color the line segment by year_joined.bucket.
#### Recall, we created year_joined.bucket in Lesson 5 by first creating year_joined from the variable tenure. Then, we used the cut function on year_joined to create four bins or cohorts of users.
#### (2004, 2009]
#### (2009, 2011]
#### (2011, 2012]
#### (2012, 2014]
#### The plot should look something like this: http://i.imgur.com/vNjPtDh.jpg or this: http://i.imgur.com/IBN1ufQ.jpg
```{r 5.6.1}
pf_yj <- pf %>%
mutate(year_joined = floor(2014 - tenure/365), year_joined_bucket = cut(year_joined, breaks=c(2004, 2009, 2011, 2012, 2014)))
```
```{r 5.6.2}
ggplot(subset(pf_yj, tenure > 0), aes(x=tenure, y=prop_initiated)) + geom_line(aes(color=year_joined_bucket), stat='summary', fun.y=median) + scale_color_brewer(type = 'seq', palette = 'Spectral') + theme_economist()
ggsave("05s05.jpg", width = 12, height = 8)
```
## 5.7
#### Smooth the last plot you created of prop_initiated vs. tenure colored by year_joined.bucket. You can bin together ranges of tenure or add a smoother to the plot.
```{r 5.7}
ggplot(subset(pf_yj, tenure > 0), aes(x=tenure, y=prop_initiated)) + geom_line(aes(color=year_joined_bucket), stat='summary', fun.y=median) + geom_smooth(color=rainbow(80)) + scale_color_brewer(type = 'seq', palette = 'Pastel1') + theme_bw()
ggsave("05s06.jpg", width = 12, height = 8)
```
## 5.8
#### On average, which group initiated the greatest proportion of its Facebook friendships? The plot with the smoother that you created in the last exercise can help you answer this question.
## (2012, 2014]
## 5.9
#### For the group with the largest proportion of friendships initiated, what is the group's average (mean) proportion of friendships initiated?
## 0.64
```{r 5.9}
pf_yj %>%
filter(year_joined_bucket == "(2012,2014]") %>%
summarise(avg = mean(prop_initiated, na.rm=TRUE))
```
## 5.10
#### Create a scatter plot of the price/carat ratio of diamonds. The variable x should be assigned to cut. The points should be colored by diamond color, and the plot should be faceted by clarity.
#### The plot should look something like this: http://i.imgur.com/YzbWkHT.jpg.
#### Note: In the link, a color palette of type 'div' was used to color the histogram using scale_color_brewer(type = 'div').
```{r 5.10}
ggplot(diamonds, aes(x = cut, y = price/carat, color = color)) + geom_jitter(size = 2, alpha=0.7, shape = 18) + facet_wrap(~clarity) + scale_color_brewer(type = 'div', palette = "Set1") + theme_gdocs()
ggsave("05s07.jpg", width = 16, height = 8)
```
## 5.11
#### The Gapminder website contains over 500 data sets with information about the world's population. Your task is to continue the investigation you did at the end of Problem Set 4, or you can start fresh and choose a different data set from Gapminder.
#### If you’re feeling adventurous or want to try some data munging see if you can find a data set or scrape one from the web.
#### In your investigation, examine 3 or more variables and create 2-5 plots that make use of the techniques from Lesson 5.
#### You can find a link to the Gapminder website in the Instructor Notes.
#### Once you've completed your investigation, create a post in the discussions that includes:
#### 1. the variable(s) you investigated, your observations, and any summary statistics
#### 2. snippets of code that created the plots
#### 3. links to the images of your plots
```{r Data 5.11.1}
fact <- tbl_df(read.csv2("factbook.csv", header=TRUE))
names(fact)
```
```{r Data 5.11.2}
row.with.na <- apply(fact, 1, function(x){any(is.na(x))})
sum(row.with.na)
fact <- fact[!row.with.na,]
```
```{r Data 5.11.3}
names(fact)[1] <- "country"
names(fact)[2] <- "area"
names(fact)[3] <- "birth_rate"
names(fact)[4] <- "current_account_balance"
names(fact)[5] <- "death_rate"
names(fact)[6] <- "debt_external"
names(fact)[7] <- "electricity_consumption"
names(fact)[8] <- "electricity_production"
names(fact)[9] <- "exports"
names(fact)[10] <- "gdp"
```
```{r Data 5.11.4}
names(fact)[11] <- "gdp_per_cap"
names(fact)[12] <- "gdp_real"
names(fact)[13] <- "aids_adults"
names(fact)[14] <- "aids_deaths"
names(fact)[15] <- "aids_liv"
names(fact)[16] <- "highways"
names(fact)[17] <- "imports"
names(fact)[18] <- "industrial_production_growth_rate"
names(fact)[19] <- "infant_mortality_rate"
names(fact)[20] <- "inflation_rate"
```
```{r Data 5.11.5}
names(fact)[21] <- "internet_hosts"
names(fact)[22] <- "internet_users"
names(fact)[23] <- "investment_gross"
names(fact)[24] <- "labor_force"
names(fact)[25] <- "life_expectancy"
names(fact)[26] <- "military_expenditures"
names(fact)[27] <- "military_expenditures_percent"
names(fact)[28] <- "natural_gas_consumption"
names(fact)[29] <- "natural_gas_exports"
names(fact)[30] <- "natural_gas_imports"
```
```{r Data 5.11.6}
names(fact)[31] <- "natural_gas_production"
names(fact)[32] <- "natural_gas_reserves"
names(fact)[33] <- "oil_consumption"
names(fact)[34] <- "oil_exports"
names(fact)[35] <- "oil_imports"
names(fact)[36] <- "oil_production"
names(fact)[37] <- "oil_reserves"
names(fact)[38] <- "population"
names(fact)[39] <- "public_debt"
names(fact)[40] <- "railways"
```
```{r Data 5.11.7}
names(fact)[41] <- "reserves_foreign_exchange"
names(fact)[42] <- "phone_main_lines"
names(fact)[43] <- "mobile_phones"
names(fact)[44] <- "total_fertility_rate"
names(fact)[45] <- "unemployment_rate"
names(fact)
```
```{r Data 5.11.8}
country_set = c("Czech Republic", "United Kingdom", "Spain", "Austria", "Italy", 'Denmark', 'Hungary', 'Ireland', "Greece", "Poland")
fact1 <- fact[which(fact$country %in% country_set),]
p7 <-ggplot(subset(fact1, internet_hosts <= quantile(internet_hosts, 0.99) & internet_hosts > 0 ), aes(x = internet_hosts, y = internet_users, color = country)) + geom_jitter(size = 5, alpha=0.7, shape = 10) + scale_color_brewer(type = 'div', palette = 'Set1') + theme_solarized()
p8 <-ggplot(subset(fact1, internet_hosts <= quantile(internet_hosts, 0.99) & internet_hosts > 0 ), aes(x = internet_hosts, y = internet_users, color = country)) + scale_y_log10() + geom_jitter(size = 5, alpha=0.7, shape = 10) + scale_color_brewer(type = 'div', palette = 'Set1') + ylab("log10 of internet_users")+ theme_solarized()
g4 <- grid.arrange(p7, p8, ncol=1)
ggsave("05s08.jpg", g4, width = 8, height = 16)
```
```{r Data 5.11.9}
p9 <- ggplot(fact, aes(country)) + geom_point(aes(y = oil_exports), color = 'red', size = 5, shape = 2) + geom_text(data = fact, mapping = aes(x = country, y = oil_exports), label = 'e', size = 4, color = 'red') + geom_point(aes(y = oil_imports), color = 'green', size = 5, shape = 6) + ylab('oil exports and imports') + geom_text(data = fact, mapping = aes(x = country, y = oil_imports), label = 'i', size = 4, color = 'green') + theme_bw() + theme(axis.text.x = element_text(size = 10, angle = 20), axis.title = element_text(size = 12), legend.position = "bottom")
p10 <- ggplot(fact, aes(country)) + geom_point(aes(y = natural_gas_exports), color = 'red', size = 5, shape = 2) + geom_text(data = fact, mapping = aes(x = country, y = natural_gas_exports), label = 'e', size = 4, color = 'red') + geom_point(aes(y = natural_gas_imports), color = 'green', size = 5, shape = 6) + geom_text(data = fact, mapping = aes(x = country, y = natural_gas_imports), label = 'i', size = 4, color = 'green') + ylab('natural gas exports and imports') + theme_bw() + theme(axis.text.x = element_text(size = 10, angle = 20), axis.title = element_text(size = 12))
g5 <- grid.arrange(p9, p10, ncol=1)
ggsave("05s09.jpg", g5, width = 16, height = 8)
```
DATA ANALYSIS WITH R; lesson5.Rmd
---
title: "Lesson 5"
runtime: shiny
output: html_document
---
```{r setup 1, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
Lesson 5
========================================================
### Working directory and libraries
```{r setup 2}
setwd('/Users/olgabelitskaya/version-control/reflections-ud651')
```
```{r Libraries 1}
library(ggplot2)
library(lubridate)
```
```{r Libraries 2}
library(gridExtra)
library(plyr)
```
```{r Libraries 3}
library(scales)
library(reshape2)
```
```{r Libraries 4}
library(dplyr)
library(tidyr)
```
```{r Libraries 5}
library(xlsx)
library(ggthemes)
```
## Useful links
```{r Links}
# http://docs.ggplot2.org/current/
# http://docs.ggplot2.org/current/coord_trans.html
# http://sape.inf.usi.ch/quick-reference/ggplot2/themes
# http://personality-project.org/r/html/corr.test.html
# https://rpubs.com/hadley/ggplot2-layers
# http://rmarkdown.rstudio.com/articles_integration.html
# https://cran.r-project.org/web/packages/ggthemes/vignettes/ggthemes.html
```
```{r Pseudo-Facebook User Data}
pf <- read.csv('pseudo_facebook.tsv', sep='\t')
names(pf)
```
### Third Qualitative Variable
#### Write code to create a new data frame, called 'pf.fc_by_age_gender', that contains information on each age AND gender group.
#### The data frame should contain the following variables:
#### mean_friend_count,
#### median_friend_count,
#### n (the number of users in each age and gender grouping)
#### Here is an example of the structure of your data frame. Your data values will be different. Note that if you are grouping by more than one variable, you will probably need to call the ungroup() function.
#### age gender mean_friend_count median_friend_count n
#### 1 13 female 247.2953 150 207
#### 2 13 male 184.2342 61 265
#### 3 14 female 329.1938 245 834
#### 4 14 male 157.1204 88 1201
```{r Third Qualitative Variable 1}
ggplot(aes(x = gender, y = age, color = gender), data = subset(pf, !is.na(gender))) + geom_boxplot() + stat_summary(fun.y = mean, geom = 'point', shape=15)
```
```{r Third Qualitative Variable 2}
ggplot(aes(x = age, y = friend_count), data = subset(pf, !is.na(gender))) + geom_line(aes(color=gender), stat = 'summary', fun.y = median)
```
```{r Third Qualitative Variable 3}
detach("package:plyr", unload=TRUE)
pf.fc_by_age_gender <- pf %>%
filter(!is.na(gender)) %>%
group_by(age, gender) %>%
summarise(mean_friend_count = mean(friend_count),
median_friend_count = median(friend_count),
n = n() ) %>%
ungroup() %>%
arrange(age)
```
```{r Third Qualitative Variable 4}
pf.fc_by_age_gender2 <- subset(pf, !is.na(gender)) %>%
group_by(age, gender) %>%
summarise(mean_friend_count = mean(friend_count),
median_friend_count = median(friend_count),
n = n() ) %>%
arrange(age)
```
***
### Plotting Conditional Summaries
```{r Plotting Conditional Summaries}
ggplot(pf.fc_by_age_gender, aes(x = age, y = median_friend_count, color =gender)) + geom_line()
```
***
### Reshaping Data
```{r Reshaping Data}
pf.fc_by_age_gender.wide <- dcast(pf.fc_by_age_gender, age ~ gender, value.var = 'median_friend_count')
```
***
### Ratio Plot
```{r Ratio Plot}
ggplot(pf.fc_by_age_gender.wide, aes(x = age, y = female/male)) + geom_line(color = "darkred", linetype = 5) + geom_hline(yintercept=1, color= 'darkblue', linetype = 2, alpha=0.5) + theme_bw()
```
***
### Third Quantitative Variable
```{r Third Quantitative Variable 5}
pf$year_joined <- floor(2014 - pf$tenure / 365)
```
***
### Cut a Variable
#### Create a new variable in the data frame called year_joined.bucket by using the cut function on the variable year_joined.
#### You need to create the following buckets for the new variable, year_joined.bucket
#### (2004, 2009]
#### (2009, 2011]
#### (2011, 2012]
#### (2012, 2014]
#### Note that a parenthesis means exclude the year and a bracket means include the year.
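#### A tiny check of that boundary rule (a minimal sketch, not part of the exercise): with these breaks, 2009 falls into the first bucket and 2010 into the second.
```{r Cut boundary check}
# A parenthesis excludes the left break value, a bracket includes the right one:
# 2009 -> (2004,2009], 2010 -> (2009,2011]
cut(c(2009, 2010), breaks = c(2004, 2009, 2011, 2012, 2014))
```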
```{r Cut a Variable 1}
summary(pf$year_joined)
table(pf$year_joined)
```
```{r Cut a Variable 2}
pf$year_joined.bucket <- cut(pf$year_joined, c(2004,2009,2011,2012,2014))
table(pf$year_joined.bucket, useNA = 'ifany')
```
***
### Plotting it All Together
#### Create a line graph of friend_count vs. age so that each year_joined.bucket is a line tracking the median user friend_count across age. This means you should have four different lines on your plot.
#### You should subset the data to exclude the users whose year_joined.bucket is NA.
```{r Plotting it All Together}
ggplot(data = subset(pf,!is.na(year_joined.bucket)), aes(x = age, y = friend_count)) + geom_line(aes(color = year_joined.bucket), stat = 'summary', fun.y = median) + theme_grey()
```
***
### Plot the Grand Mean
```{r Plot the Grand Mean}
ggplot(aes(x = age, y = friend_count), data = subset(pf,!is.na(year_joined.bucket))) + geom_line(aes(color = year_joined.bucket), stat = 'summary', fun.y = mean) + geom_line(stat = 'summary', fun.y = mean, linetype = 5, color = "darkblue") + theme_grey()
```
***
### Friending Rate
```{r Friending Rate 1}
with(subset(pf,tenure >0), summary(friend_count / tenure))
```
```{r Friending Rate 2}
pf.tenure_morethanzero <- subset(pf,tenure >0)
pf.tenure_morethanzero$friend_rate = pf.tenure_morethanzero$friend_count / pf.tenure_morethanzero$tenure
summary(pf.tenure_morethanzero$friend_rate)
```
***
### Friendships Initiated
```{r Friendships Initiated}
ggplot(aes(x=tenure, y = friend_rate, color = year_joined.bucket), data = pf.tenure_morethanzero) + geom_line(stat = 'summary', fun.y = mean) + scale_colour_brewer(type="seq", palette='Spectral')
```
***
### Bias-Variance Tradeoff Revisited
```{r Bias-Variance Tradeoff Revisited}
p1 <- ggplot(aes(x = tenure, y = friendships_initiated / tenure),
data = subset(pf, tenure >= 1)) +
geom_line(aes(color = year_joined.bucket),
stat = 'summary',
fun.y = mean)
p2 <- ggplot(aes(x = 7 * round(tenure / 7), y = friendships_initiated / tenure),
data = subset(pf, tenure > 0)) +
geom_line(aes(color = year_joined.bucket),
stat = "summary",
fun.y = mean)
p3 <- ggplot(aes(x = 30 * round(tenure / 30), y = friendships_initiated / tenure),
data = subset(pf, tenure > 0)) +
geom_line(aes(color = year_joined.bucket),
stat = "summary",
fun.y = mean)
p4 <- ggplot(aes(x = 90 * round(tenure / 90), y = friendships_initiated / tenure),
data = subset(pf, tenure > 0)) +
geom_line(aes(color = year_joined.bucket),
stat = "summary",
fun.y = mean)
g1 <- grid.arrange(p1, p2, p3, p4, ncol=1)
ggsave("0505.jpg", g1, width = 8, height = 12)
```
```{r Geom_smooth}
p5 <- ggplot(aes(x = 7 * round(tenure / 7), y = friendships_initiated/tenure), data = subset(pf, tenure >= 1)) + geom_line(aes(color = year_joined.bucket), stat = "summary", fun.y = mean) + scale_colour_brewer(type="seq", palette='Spectral')
p6 <- ggplot(aes(x=tenure, y = friendships_initiated / tenure), data = subset(pf, tenure >= 1)) + geom_smooth(aes(color = year_joined.bucket)) + scale_colour_brewer(type="seq", palette='Spectral')
g2 <- grid.arrange(p5, p6, ncol=1)
ggsave("0503.jpg", g2, width = 12, height = 8)
```
***
### Yo Dataset
```{r Yo Dataset 1}
yo <- read.csv('yogurt.csv')
str(yo)
```
```{r Yo Dataset 2}
yo$id <- factor(yo$id)
str(yo)
```
### Histograms Revisited
```{r Histograms Revisited 1}
qplot(data = yo, x = price, fill = I('#F79420'), binwidth = 10)
```
```{r Histograms Revisited 2}
ggplot(aes(x = price), data=yo) + geom_histogram(binwidth = 3, fill = "forestgreen")
```
***
### Number of Purchases
```{r Number of Purchases 1}
summary(yo)
length(unique(yo$price))
table(yo$price)
```
```{r Number of Purchases 2}
yo$all.purchases <- yo$strawberry + yo$blueberry + yo$pina.colada + yo$plain + yo$mixed.berry
summary(yo$all.purchases)
```
```{r Number of Purchases 3}
yo <- transform(yo, all.purchases = strawberry + blueberry + pina.colada + plain + mixed.berry)
```
```{r Number of Purchases 4}
qplot(x = all.purchases, data = yo, fill=I('blue'), binwidth = 1)
```
***
### Prices over Time
```{r Prices over Time}
ggplot(yo, aes( x = time, y = price)) + geom_point(position = position_jitter(h=0), shape=9, alpha=0.5, color = 'darkblue') + theme_foundation()
ggsave("0504.jpg", width = 8, height = 12)
```
***
### Looking at Samples of Households
```{r Looking at Sample of Households 1}
set.seed(4230)
sample.ids <- sample(levels(yo$id), 16)
ggplot(aes(x = time, y = price), data = subset(yo, id %in% sample.ids)) +
facet_wrap(~id) +
geom_line() +
geom_point(aes(size = all.purchases), pch = 1)
```
```{r Looking at Sample of Households 2}
set.seed(2000)
sample.ids <- sample(levels(yo$id), 16)
ggplot(aes(x = time, y = price), data = subset(yo, id %in% sample.ids)) + facet_wrap(~id) + geom_line(color = 'darkgreen') + geom_point(aes(size = all.purchases), pch = 10, color = 'darkblue') + geom_smooth(method = 'gam')
ggsave("0501.jpg", width = 16, height = 8)
```
***
### Scatterplot Matrix
```{r Scatterplot Matrix 1}
ggplot(diamonds, aes(x = cut, y = price/carat, color = color)) + geom_jitter() + facet_wrap(~clarity) + scale_color_brewer(type = 'qual', palette = "Spectral") + theme_gray()
ggsave("0502.jpg", width = 8, height = 12)
```
```{r Libraries 6}
# install.packages('GGally')
library(GGally)
```
```{r Scatterplot Matrix 2}
set.seed(2000)
pf_subset <- pf[, c(2:15)]
names(pf_subset)
```
```{r Scatterplot Matrix 3}
ggpairs(pf_subset[sample.int(nrow(pf_subset), 1000), ])
```
***
### Heat Maps
```{r Dataset nci 1}
nci <- read.table("nci.tsv")
colnames(nci) <- c(1:64)
```
```{r Dataset nci 2}
nci.long.samp <- melt(as.matrix(nci[1:200,]))
names(nci.long.samp) <- c("gene", "case", "value")
head(nci.long.samp)
ggplot(aes(y = gene, x = case, fill = value),
data = nci.long.samp) +
geom_tile() +
scale_fill_gradientn(colours = colorRampPalette(c("blue", "red"))(100))
```
title: "Lesson 5"
runtime: shiny
output: html_document
---
```{r setup 1, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
========================================================
### Working directory and libraries
```{r setup 2}
setwd('/Users/olgabelitskaya/version-control/reflections-ud651')
```
```{r Libraries 1}
library(ggplot2)
library(lubridate)
```
```{r Libraries 2}
library(gridExtra)
library(plyr)
```
```{r Libraries 3}
library(scales)
library(reshape2)
```
```{r Libraries 4}
library(dplyr)
library(tidyr)
```
```{r Libraries 5}
library(xlsx)
library(ggthemes)
```
## Useful links
```{r Links}
# http://docs.ggplot2.org/current/
# http://docs.ggplot2.org/current/coord_trans.html
# http://sape.inf.usi.ch/quick-reference/ggplot2/themes
# http://personality-project.org/r/html/corr.test.html
# https://rpubs.com/hadley/ggplot2-layers
# http://rmarkdown.rstudio.com/articles_integration.html
# https://cran.r-project.org/web/packages/ggthemes/vignettes/ggthemes.html
```
```{r Pseudo-Facebook User Data}
pf <- read.csv('pseudo_facebook.tsv', sep='\t')
names(pf)
```
### Third Qualitative Variable
#### Write code to create a new data frame, called 'pf.fc_by_age_gender', that contains information on each age AND gender group.
#### The data frame should contain the following variables:
#### mean_friend_count,
#### median_friend_count,
#### n (the number of users in each age and gender grouping)
#### Here is an example of the structure of your data frame. Your data values will be different. Note that if you are grouping by more than one variable, you will probably need to call the ungroup() function.
#### age gender mean_friend_count median_friend_count n
#### 1 13 female 247.2953 150 207
#### 2 13 male 184.2342 61 265
#### 3 14 female 329.1938 245 834
#### 4 14 male 157.1204 88 1201
```{r Third Qualitative Variable 1}
ggplot(aes(x = gender, y = age, color = gender), data = subset(pf, !is.na(gender))) + geom_boxplot() + stat_summary(fun.y = mean, geom = 'point', shape=15)
```
```{r Third Qualitative Variable 2}
ggplot(aes(x = age, y = friend_count), data = subset(pf, !is.na(gender))) + geom_line(aes(color=gender), stat = 'summary', fun.y = median)
```
```{r Third Qualitative Variable 3}
detach("package:plyr", unload=TRUE)
pf.fc_by_age_gender <- pf %>%
filter(!is.na(gender)) %>%
group_by(age, gender) %>%
summarise(mean_friend_count = mean(friend_count),
median_friend_count = median(friend_count),
n = n() ) %>%
ungroup() %>%
arrange(age)
```
```{r Third Qualitative Variable 4}
pf.fc_by_age_gender2 <- subset(pf, !is.na(gender)) %>%
group_by(age, gender) %>%
summarise(mean_friend_count = mean(friend_count),
median_friend_count = median(friend_count),
n = n() ) %>%
arrange(age)
```
***
### Plotting Conditional Summaries
```{r Plotting Conditional Summaries}
ggplot(pf.fc_by_age_gender, aes(x = age, y = median_friend_count, color =gender)) + geom_line()
```
***
### Reshaping Data
```{r Reshaping Data}
pf.fc_by_age_gender.wide <- dcast(pf.fc_by_age_gender, age ~ gender, value.var = 'median_friend_count')
```
***
### Ratio Plot
```{r Ratio Plot}
ggplot(pf.fc_by_age_gender.wide, aes(x = age, y = female/male)) + geom_line(color = "darkred", linetype = 5) + geom_hline(yintercept=1, color= 'darkblue', linetype = 2, alpha=0.5) + theme_bw()
```
***
### Third Quantitative Variable
```{r Third Quantitative Variable 5}
pf$year_joined <- floor(2014 - pf$tenure / 365)
```
***
### Cut a Variable
#### Create a new variable in the data frame called year_joined.bucket by using the cut function on the variable year_joined.
#### You need to create the following buckets for the new variable, year_joined.bucket
#### (2004, 2009]
#### (2009, 2011]
#### (2011, 2012]
#### (2012, 2014]
#### Note that a parenthesis means exclude the year and a bracket means include the year.
```{r Cut a Variable 1}
summary(pf$year_joined)
table(pf$year_joined)
```
```{r Cut a Variable 2}
pf$year_joined.bucket <- cut(pf$year_joined, c(2004,2009,2011,2012,2014))
table(pf$year_joined.bucket, useNA = 'ifany')
```
***
### Plotting it All Together
#### Create a line graph of friend_count vs. age so that each year_joined.bucket is a line tracking the median user friend_count across age. This means you should have four different lines on your plot.
#### You should subset the data to exclude the users whose year_joined.bucket is NA.
```{r Plotting it All Together}
ggplot(data = subset(pf,!is.na(year_joined.bucket)), aes(x = age, y = friend_count)) + geom_line(aes(color = year_joined.bucket), stat = 'summary', fun.y = median) + theme_grey()
```
***
### Plot the Grand Mean
```{r Plot the Grand Mean}
ggplot(aes(x = age, y = friend_count), data = subset(pf,!is.na(year_joined.bucket))) + geom_line(aes(color = year_joined.bucket), stat = 'summary', fun.y = mean) + geom_line(stat = 'summary', fun.y = mean, linetype = 5, color = "darkblue") + theme_grey()
```
***
### Friending Rate
```{r Friending Rate 1}
with(subset(pf,tenure >0), summary(friend_count / tenure))
```
```{r Friending Rate 2}
pf.tenure_morethanzero <- subset(pf,tenure >0)
pf.tenure_morethanzero$friend_rate = pf.tenure_morethanzero$friend_count / pf.tenure_morethanzero$tenure
summary(pf.tenure_morethanzero$friend_rate)
```
***
### Friendships Initiated
```{r Friendships Initiated}
ggplot(aes(x=tenure, y = friend_rate, color = year_joined.bucket), data = pf.tenure_morethanzero) + geom_line(stat = 'summary', fun.y = mean) + scale_colour_brewer(type="seq", palette='Spectral')
```
***
### Bias-Variance Tradeoff Revisited
```{r Bias-Variance Tradeoff Revisited}
p1 <- ggplot(aes(x = tenure, y = friendships_initiated / tenure),
data = subset(pf, tenure >= 1)) +
geom_line(aes(color = year_joined.bucket),
stat = 'summary',
fun.y = mean)
p2 <- ggplot(aes(x = 7 * round(tenure / 7), y = friendships_initiated / tenure),
data = subset(pf, tenure > 0)) +
geom_line(aes(color = year_joined.bucket),
stat = "summary",
fun.y = mean)
p3 <- ggplot(aes(x = 30 * round(tenure / 30), y = friendships_initiated / tenure),
data = subset(pf, tenure > 0)) +
geom_line(aes(color = year_joined.bucket),
stat = "summary",
fun.y = mean)
p4 <- ggplot(aes(x = 90 * round(tenure / 90), y = friendships_initiated / tenure),
data = subset(pf, tenure > 0)) +
geom_line(aes(color = year_joined.bucket),
stat = "summary",
fun.y = mean)
g1 <- grid.arrange(p1, p2, p3, p4, ncol=1)
ggsave("0505.jpg", g1, width = 8, height = 12)
```
```{r Geom_smooth}
p5 <- ggplot(aes(x = 7 * round(tenure / 7), y = friendships_initiated/tenure), data = subset(pf, tenure >= 1)) + geom_line(aes(color = year_joined.bucket), stat = "summary", fun.y = mean) + scale_colour_brewer(type="seq", palette='Spectral')
p6 <- ggplot(aes(x=tenure, y = friendships_initiated / tenure), data = subset(pf, tenure >= 1)) + geom_smooth(aes(color = year_joined.bucket)) + scale_colour_brewer(type="seq", palette='Spectral')
g2 <- grid.arrange(p5, p6, ncol=1)
ggsave("0503.jpg", g2, width = 12, height = 8)
```
***
Saturday, 13 August 2016
DATA ANALYSIS WITH R; problemset3.Rmd
---
title: "Problem Set 3"
runtime: shiny
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
```{r setup2}
setwd('/Users/olgabelitskaya/version-control/reflections-ud651')
```
## Useful links
```{r Links}
# http://www.stat.columbia.edu/~tzheng/files/Rcolor.pdf
# http://www.cookbook-r.com/Graphs/Shapes_and_line_types/
```
## Libraries
```{r Libraries1}
library(ggplot2)
library(lubridate)
```
```{r Libraries2}
library(ggthemes)
theme_set(theme_minimal(7))
```
```{r Libraries3}
library(gridExtra)
library(plyr)
```
```{r Libraries}
library(scales)
library(reshape2)
```
```{r Libraries4}
library(dplyr)
library(tidyr)
```
```{r Libraries5}
library(xlsx)
```
## Dataset
## 3.1
#### a) Load the 'diamonds' data set in R Studio.
#### How many observations are in the data set? 53940
#### b) How many variables are in the data set? 10
#### c) How many ordered factors are in the set? 3
#### d) What letter represents the best color for a diamond? D
```{r Data set}
data(diamonds)
summary(diamonds)
str(diamonds)
nrow(diamonds)
ncol(diamonds)
levels(diamonds$color)
```
## 3.2
#### Create a histogram of the price of all the diamonds in the diamond data set.
```{r Histograms1}
qplot(x = price, data = diamonds, binwidth = 400, color = I("blue"), fill = I("red"))
```
## 3.3
#### Describe the shape and center of the price distribution. Include summary statistics like the mean and median.
```{r Histograms2}
summary(diamonds$price)
```
#### The distribution is widely spread, right-skewed and therefore non-symmetric, and unimodal (a single peak), without many extreme outliers.
#### Median = 2401, mean = 3933; the median is below the mean, as expected for a right-skewed distribution.
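#### A quick sketch confirming the right skew with the data already loaded above (just two base-R checks, no new data):
```{r Price skew check}
# right skew implies mean > median; this should return TRUE
with(diamonds, mean(price) > median(price))
# share of diamonds priced above the mean (well below 50% for a right-skewed distribution)
with(diamonds, mean(price > mean(price)))
```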
## 3.4
#### a) How many diamonds cost less than $500? 1729
#### b) How many diamonds cost less than $250? 0
#### c) How many diamonds cost more than $15,000? 1656
```{r Prices}
sum(diamonds$price < 500)
sum(diamonds$price < 250)
sum(diamonds$price >= 15000)
```
## 3.5
#### Explore the largest peak in the price histogram you created earlier.
#### Try limiting the x-axis, altering the bin width, and setting different breaks on the x-axis.
```{r Histograms3}
ggplot(aes(x = price), data = diamonds) + geom_histogram(binwidth = 500, color = 'purple', fill = '#099DD9') + scale_x_continuous(limits = c(500, 10000), breaks = seq(500, 10000, 500))
ggsave("pricehist1.jpg")
```
```{r Histograms4}
ggplot(aes(x = price), data = diamonds) +
geom_histogram(binwidth = 50, color = 'orange',
aes(fill=..count..), alpha = .8) +
scale_x_continuous(limits = c(500, 2000), breaks = seq(500, 2000, 100)) +
scale_fill_gradient("Count", low = "blue", high = "red")
ggsave("pricehist2.jpg")
```
```{r Histograms5}
ggplot(data=diamonds, aes(x = price)) +
geom_histogram(aes(y =..density..), col="red", fill="green", alpha = .7) +
scale_x_continuous(limits = c(500, 2000), breaks = seq(500, 2000, 100)) +
geom_density(col=2) +
labs(title="Histogram for Price") + labs(x="Price", y="Count")
ggsave("pricehist3.jpg")
```
```{r Histograms6}
ggplot(data = diamonds, aes(x=price, fill=cut(..x.., seq(500, 2000, 100)))) +
scale_x_continuous(limits = c(500, 2000), breaks = seq(500, 2000, 100)) +
geom_histogram(binwidth=50, color="steelblue")
ggsave("pricehist4.jpg")
```
## 3.6
#### Break out the histogram of diamond prices by cut.
#### You should have five histograms in separate panels on your resulting plot.
```{r Histograms7}
ggplot(aes(x = price), data = diamonds) +
geom_histogram(binwidth = 500, color = 'red', fill = '#099DD9') +
scale_x_continuous(limits = c(500, 15000), breaks = seq(500, 15000, 500)) +
facet_grid(cut ~ .)
ggsave("pricehist5.jpg")
```
## 3.7
#### a) Which cut has the highest priced diamond? Premium
#### b) Which cut has the lowest priced diamond? Premium & Ideal
#### c) Which cut has the lowest median price? Ideal
```{r Cut1}
diamonds %>%
group_by(cut) %>%
summarise(max_price = max(price),
min_price = min(price),
median_price = median(price))
```
## 3.8
#### In the two last exercises, we looked at the distribution for diamonds by cut.
#### Run the code below in R Studio to generate the histogram as a reminder.
#### In the last exercise, we looked at the summary statistics for diamond price by cut. If we look at the output table, the median and quartiles are reasonably close to each other.
#### diamonds$cut: Fair
#### Min. 1st Qu. Median Mean 3rd Qu. Max.
#### 337 2050 3282 4359 5206 18570
#### ------------------------------------------------------------------------
#### diamonds$cut: Good
#### Min. 1st Qu. Median Mean 3rd Qu. Max.
#### 327 1145 3050 3929 5028 18790
#### ------------------------------------------------------------------------
#### diamonds$cut: Very Good
#### Min. 1st Qu. Median Mean 3rd Qu. Max.
#### 336 912 2648 3982 5373 18820
#### ------------------------------------------------------------------------
#### diamonds$cut: Premium
#### Min. 1st Qu. Median Mean 3rd Qu. Max.
#### 326 1046 3185 4584 6296 18820
#### ------------------------------------------------------------------------
#### diamonds$cut: Ideal
#### Min. 1st Qu. Median Mean 3rd Qu. Max.
#### 326 878 1810 3458 4678 18810
#### This means the distributions should be somewhat similar, but the histograms we created don't show that.
#### The 'Fair' and 'Good' diamonds appear to have different distributions compared to the better cut diamonds. They seem somewhat uniform on the left with long tails on the right.
#### Let's look in to this more.
#### Look up the documentation for facet_wrap in R Studio. Then, scroll back up and add a parameter to facet_wrap so that the y-axis in the histograms is not fixed. You want the y-axis to be different for each histogram.
```{r Cut2}
qplot(x = price, data = diamonds) + facet_wrap(~cut, scales = "free") + geom_histogram(color = 'orange', aes(fill=..count..), alpha = .8) + scale_fill_gradient("Count", low = "green", high = "red")
```
## 3.9
#### Create a histogram of price per carat and facet it by cut. You can make adjustments to the code from the previous exercise to get started.
#### Adjust the bin width and transform the scale of the x-axis using log10.
```{r Cut3}
ggplot(diamonds, aes(x = price/carat)) +
geom_histogram(color = "purple", aes(fill=..count..), binwidth = .01) +
scale_x_log10(expression(paste(Log[10], " of Price per Carat")),
breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x))) +
facet_grid(cut~., scale = "free") + ylab("Count") +
scale_fill_gradient("Count", low = "green", high = "blue")
```
## 3.10
#### Investigate the price of diamonds using box plots, numerical summaries, and one of the following categorical variables: cut, clarity, or color.
#### There won’t be a solution video for this exercise, so go to the discussion thread for either BOXPLOTS BY CLARITY, BOXPLOT BY COLOR, or BOXPLOTS BY CUT to share your thoughts and to see what other people found.
#### You can save images by using the ggsave() command. ggsave() will save the last plot created. For example...
#### qplot(x = price, data = diamonds)
#### ggsave('priceHistogram.png')
#### ggsave currently recognises the extensions eps/ps, tex (pictex), pdf, jpeg, tiff, png, bmp, svg and wmf (windows only).
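#### For reference, the ggsave() pattern from the note above as a runnable chunk (a minimal sketch; the width and height values are just illustrative additions):
```{r ggsave example}
# ggsave() writes the most recently created plot; sizes are in inches by default
qplot(x = price, data = diamonds)
ggsave('priceHistogram.png', width = 8, height = 6)
```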
```{r Cut4}
diamonds %>%
group_by(cut) %>%
summarise(count = n(), avg_price = mean(price))
```
```{r Cut5}
ggplot(data=diamonds, aes(x = clarity, y = price, color = cut)) +
geom_boxplot() + facet_grid(color~.)
ggsave("pricehist6.jpg")
```
## 3.11
#### a) What is the price range for the middle 50% of the diamonds with color D? 911; 4213.5
#### b) What is the price range for the middle 50% of diamonds with color J? 1860.5; 7695
#### c) What is the IQR for diamonds with the best color? 3302.5
#### d) What is the IQR for the diamonds with the worst color? 5834.5
```{r Cut6}
by(diamonds$price, diamonds$color, summary)
```
```{r Cut7}
diamonds %>%
group_by(color) %>%
filter(color == "D") %>%
summarise(Quartile.25 = quantile(price, 0.25),
Quartile.75 = quantile(price, 0.75),
IQR = Quartile.75 - Quartile.25)
```
```{r Cut8}
diamonds %>%
group_by(color) %>%
filter(color == "J") %>%
summarise(Quartile.25 = quantile(price, 0.25),
Quartile.75 = quantile(price, 0.75),
IQR = Quartile.75 - Quartile.25)
```
## 3.12
#### Investigate the price per carat of diamonds across the different colors of diamonds using boxplots.
```{r Carat1}
ggplot(data = diamonds, aes(x = color, y = price/carat, fill = color)) + geom_boxplot() + scale_y_continuous(limits = c(2000, 6000), labels=dollar) + xlab("Color") + ylab("Price per Carat") + stat_summary(fun.y=mean, geom="point", shape=5, size=4) + coord_flip()
ggsave("pricehist8.jpg")
```
## 3.13
#### Investigate the weight of the diamonds (carat) using a frequency polygon. Use different bin widths to see how the frequency polygon changes. What carat size has a count greater than 2000? Check all that apply.
#### 0.3; 1.01
```{r Carat2}
summary(diamonds$carat)
```
```{r Carat3}
ggplot(diamonds, aes(x=carat)) + geom_freqpoly(binwidth=0.1, color = 'blueviolet', alpha = 0.7) +
scale_x_continuous(breaks = seq(0, 5, 0.1), expand = c(0,0)) +
geom_vline(xintercept = c(0.1, 0.8, 1.6, 2.0, 3.0, 5.0), color = "firebrick1", linetype="dotted") +
geom_vline(xintercept = c(0.3, 1.01), color = "green", linetype = "longdash") +
geom_hline(yintercept = 2000, color = "dodgerblue", linetype="solid", alpha = 0.5) +
xlab("Carat") + ylab("Count")
ggsave("pricehist7.jpg")
```
## 3.14
#### The Gapminder website contains over 500 data sets with information about the world's population. Your task is to download a data set of your choice and create 2-5 plots that make use of the techniques from Lesson 3.
#### You might use a simple histogram, a boxplot split over a categorical variable, or a frequency polygon. The choice is yours!
#### You can find a link to the Gapminder website in the Instructor Notes.
#### Once you've completed your investigation, create a post in the discussions that includes:
#### 1. any questions you answered, your observations, and summary statistics
#### 2. snippets of code that created the plots
#### 3. links to the images of your plots
```{r Urban1}
urban <- tbl_df(read.xlsx("indicator.xlsx", sheetName="Data", header=TRUE))
```
```{r Urban2}
row.with.na <- apply(urban, 1, function(x){any(is.na(x))})
sum(row.with.na)
```
```{r Urban3}
filtered <- urban[!row.with.na,]
names(filtered)[1] <- "country"
```
```{r Urban4}
population <- melt(filtered, id=c("country"), value.name="population", variable.name="year")
population <- tbl_df(population)
```
```{r Urban5}
population <- population %>%
mutate(year = as.character(year), year = substr(year, 2, 5), year = as.numeric(year))
```
```{r Urban6}
color_pallete_function <- colorRampPalette(colors = c("red", "green", "purple"), space = "Lab")
```
```{r Urban7}
country_set1 = c("China", "India", "United States", "Brazil", 'Indonesia')
country_set2 = c("Germany", "United Kingdom", "France", "Iran", "Turkey", "Italy", 'Pakistan', 'Philippines', 'Bangladesh')
```
```{r Urban8}
num_colors <- length(country_set2)
```
```{r Urban9}
color_colors <- color_pallete_function(num_colors)
reorder_size <- function(x) {factor(x, levels = names(sort(table(x))))}
```
```{r Urban10}
population3 <- population[which(population$country == 'Russia'),]
population2011 <- population[which(population$year == 2011 & population$country %in% country_set2),]
population2 <- population[which(population$country %in% country_set1),]
```
```{r Urban11}
ggplot(data=population2, aes(x = year, y = population, color = country)) + geom_line() + scale_x_continuous(limits = c(1961, 2011), breaks = seq(1961, 2011, 5)) + labs(title="Urban population's dynamics", x="Year", y="Urban population")
ggsave("urban01.jpg")
```
```{r Urban12}
ggplot(data=population2011, aes(x=country, y=reorder_size(population), fill=population)) + geom_bar(stat="identity", color="darkblue") + scale_fill_gradient("Urban population", low = "purple", high = "firebrick1") + labs(title="Urban population in 2011", x="Country", y="Urban population")
ggsave("urban02.jpg")
```
```{r, echo=FALSE}
fit.lm <-lm(population3$population ~ population3$year)
plot(population3$year, population3$population)
abline(fit.lm)
```
```{r Urban13}
ggplot(data=population3, aes(x=year, y=population, color = cut(..y.., seq(64000000, 122000000, 10000000)))) + geom_point() + geom_line(color = 'red') + scale_x_continuous(limits = c(1961, 2011), breaks = seq(1961, 2011, 5)) + geom_hline(yintercept = 100000000, color = "dodgerblue", linetype="solid", alpha = 0.8) + geom_hline(yintercept = 110000000, color = "firebrick2", linetype="solid", alpha = 0.8)+ labs(title="Urban population in Russia", x="Year", y="Urban population")
ggsave("urban03.jpg")
```
## 3.15
#### How many birthdays are in each month? Which day of the year has the most birthdays? Do you have at least 365 friends that have birthdays on every day of the year?
# **********************************************************************
#### You will need to do some data munging and additional research to complete this task. This task won't be easy, and you may encounter some unexpected challenges along the way. We hope you learn a lot from it though.
#### You can expect to spend 30 minutes or more on this task, depending on whether you use the provided data or obtain your personal data. We also encourage you to use the lubridate package for working with dates. Read over the documentation in RStudio and search for examples online if you need help.
#### You'll need to export your Facebook friends' birthdays to a csv file. You may need to create a calendar of your Facebook friends' birthdays in a program like Outlook or Gmail and then export the calendar as a csv file.
#### Once you load the data into R Studio, you can use the strptime() function to extract the birth months and birth days. We recommend looking up the documentation for the function and finding examples online.
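#### A minimal sketch of the strptime() route mentioned above (the date string is illustrative, not taken from birthdaysExample.csv; the chunks below use lubridate instead):
```{r strptime example}
# parse a month/day/year string with base R
d <- strptime("1/22/1984", format = "%m/%d/%Y")
d$mon + 1  # birth month (POSIXlt months are 0-based)
d$mday     # birth day
```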
```{r BD1}
bd_examples <- tbl_df(read.csv("birthdaysExample.csv"))
bdates <- mdy(bd_examples$dates)
```
```{r BD2}
bd_examples <- bd_examples %>%
mutate(birthday = bdates, weekday = weekdays(bdates, abbr=TRUE),
year = year(bdates), month = month(bdates, label=TRUE, abbr=FALSE),
day = day(bdates))
```
```{r BD3}
days_by_weekday <- ddply(bd_examples, .(weekday), summarize, dates=length(dates))
days_by_weekday
```
```{r BD4}
# the weekday abbreviations are Russian (Mon–Sun) because weekdays() follows the system locale
days_by_weekday$weekday <- factor(days_by_weekday$weekday, levels= c("пн", "вт", "ср", "чт", "пт", "сб", "вс"))
days_by_weekday[order(days_by_weekday$weekday), ]
```
```{r BD5}
ggplot(data=days_by_weekday, aes(x = weekday, y = dates, fill = weekday)) +
geom_bar(stat="identity", color="darkblue")
ggsave("birthdays01.jpg")
```
```{r BD6}
days_by_day <- ddply(bd_examples, .(day), summarize, dates=length(dates))
days_by_day
```
```{r BD7}
ggplot(data=days_by_day, aes(x = day, y = dates, fill = day)) +
geom_bar(stat="identity", color="darkblue")
ggsave("birthdays02.jpg")
```
```{r BD8}
days_by_month <- ddply(bd_examples, .(month), summarize, dates=length(dates))
days_by_month
```
```{r BD9}
ggplot(data=days_by_month, aes(x = month, y = dates, color = month)) +
geom_point(stat="identity", shape=8, size=5)
ggsave("birthdays03.jpg")
```
title: "Problem Set 3"
runtime: shiny
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
```{r setup2}
setwd('/Users/olgabelitskaya/version-control/reflections-ud651')
```
## Useful links
```{r Links}
# http://www.stat.columbia.edu/~tzheng/files/Rcolor.pdf
# http://www.cookbook-r.com/Graphs/Shapes_and_line_types/
```
## Libraries
```{r Libraries1}
library(ggplot2)
library(lubridate)
```
```{r Libraries2}
library(ggthemes)
theme_set(theme_minimal(7))
```
```{r Libraries3}
library(gridExtra)
library(plyr)
```
```{r Libraries}
library(scales)
library(reshape2)
```
```{r Libraries4}
library(dplyr)
library(tidyr)
```
```{r Libraries5}
library(xlsx)
```
## Dataset
## 3.1
#### a) Load the 'diamonds' data set in R Studio.
#### How many observations are in the data set? 53940
#### b) How many variables are in the data set? 10
#### c) How many ordered factors are in the set? 3
#### d) What letter represents the best color for a diamonds? D
```{r Data set}
data(diamonds)
summary(diamonds)
str(diamonds)
nrow(diamonds)
ncol(diamonds)
levels(diamonds$color)
```
## 3.2
#### Create a histogram of the price of all the diamonds in the diamond data set.
```{r Histograms1}
qplot(x = price, data = diamonds, binwidth = 400, color = I("blue"), fill = I("red"))
```
## 3.3
#### Describe the shape and center of the price distribution. Include summary statistics like the mean and median.
```{r Histograms2}
summary(diamonds$price)
```
#### Wide-spread, skewed right and, of course, non-symmetric shape; unimodal (with one peak), without many outliers.
#### Median = 2401, mean = 3933; median < mean for this type of distribution.
## 3.4
#### a) How many diamonds cost less than $500? 1729
#### b) How many diamonds cost less than $250? 0
#### c) How many diamonds cost more than $15,000? 1656
```{r Prices}
sum(diamonds$price < 500)
sum(diamonds$price < 250)
sum(diamonds$price >= 15000)
```
## 3.5
#### Explore the largest peak in the price histogram you created earlier.
#### Try limiting the x-axis, altering the bin width, and setting different breaks on the x-axis.
```{r Histograms3}
ggplot(aes(x = price), data = diamonds) + geom_histogram(binwidth = 500, color = 'purple', fill = '#099DD9') + scale_x_continuous(limits = c(500, 10000), breaks = seq(500, 10000, 500))
ggsave("pricehist1.jpg")
```
```{r Histograms4}
ggplot(aes(x = price), data = diamonds) +
geom_histogram(binwidth = 50, color = 'orange',
aes(fill=..count..), alpha = .8) +
scale_x_continuous(limits = c(500, 2000), breaks = seq(500, 2000, 100)) +
scale_fill_gradient("Count", low = "blue", high = "red")
ggsave("pricehist2.jpg")
```
```{r Histograms5}
ggplot(data=diamonds, aes(x = price)) +
geom_histogram(aes(y =..density..), col="red", fill="green", alpha = .7) +
scale_x_continuous(limits = c(500, 2000), breaks = seq(500, 2000, 100)) +
geom_density(col=2) +
labs(title="Histogram for Price") + labs(x="Price", y="Count")
ggsave("pricehist3.jpg")
```
```{r Histograms6}
ggplot(data = diamonds, aes(x=price, fill=cut(..x.., seq(500, 2000, 100)))) +
scale_x_continuous(limits = c(500, 2000), breaks = seq(500, 2000, 100)) +
geom_histogram(binwidth=50, color="steelblue")
ggsave("pricehist4.jpg")
```
## 3.6
#### Break out the histogram of diamond prices by cut.
#### You should have five histograms in separate panels on your resulting plot.
```{r Histograms7}
ggplot(aes(x = price), data = diamonds) +
geom_histogram(binwidth = 500, color = 'red', fill = '#099DD9') +
scale_x_continuous(limits = c(500, 15000), breaks = seq(500, 15000, 500)) +
facet_grid(cut ~ .)
ggsave("pricehist5.jpg")
```
## 3.7
#### a) Which cut has the highest priced diamond? Premium
#### b) Which cut has the lowest priced diamond? Premium & Ideal
#### c) Which cut has the lowest median price? Ideal
```{r Cut1}
diamonds %>%
group_by(cut) %>%
summarise(max_price = max(price),
min_price = min(price),
median_price = median(price))
```
## 3.8
#### In the two last exercises, we looked at the distribution for diamonds by cut.
#### Run the code below in R Studio to generate the histogram as a reminder.
#### In the last exercise, we looked at the summary statistics for diamond price by cut. If we look at the output table, the the median and quartiles are reasonably close to each other.
#### diamonds$cut: Fair
#### Min. 1st Qu. Median Mean 3rd Qu. Max.
#### 337 2050 3282 4359 5206 18570
#### ------------------------------------------------------------------------
#### diamonds$cut: Good
#### Min. 1st Qu. Median Mean 3rd Qu. Max.
#### 327 1145 3050 3929 5028 18790
#### ------------------------------------------------------------------------
#### diamonds$cut: Very Good
#### Min. 1st Qu. Median Mean 3rd Qu. Max.
#### 336 912 2648 3982 5373 18820
#### ------------------------------------------------------------------------
#### diamonds$cut: Premium
#### Min. 1st Qu. Median Mean 3rd Qu. Max.
#### 326 1046 3185 4584 6296 18820
#### ------------------------------------------------------------------------
#### diamonds$cut: Ideal
#### Min. 1st Qu. Median Mean 3rd Qu. Max.
#### 326 878 1810 3458 4678 18810
#### This means the distributions should be somewhat similar, but the histograms we created don't show that.
#### The 'Fair' and 'Good' diamonds appear to have different distributions compared to the better cut diamonds. They seem somewhat uniform on the left with long tails on the right.
#### Let's look in to this more.
#### Look up the documentation for facet-wrap in R Studio. Then, scroll back up and add a parameter to facet_wrap so that the y-axis in the histograms is not fixed. You want the y-axis to be different for each histogram.
```{r Cut2}
qplot(x = price, data = diamonds) + facet_wrap(~cut, scales = "free") + geom_histogram(color = 'orange', aes(fill=..count..), alpha = .8) + scale_fill_gradient("Count", low = "green", high = "red")
```
## 3.9
#### Create a histogram of price per carat and facet it by cut. You can make adjustments to the code from the previous exercise to get started.
#### Adjust the bin width and transform the scale of the x-axis using log10.
```{r Cut3}
ggplot(diamonds, aes(x = price/carat)) +
geom_histogram(color = "purple", aes(fill=..count..), binwidth = .01) +
scale_x_log10(expression(paste(Log[10], " of Price")),
breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x))) +
facet_grid(cut~., scale = "free") + ylab("Count") +
scale_fill_gradient("Count", low = "green", high = "blue")
```
## 3.10
##### Investigate the price of diamonds using box plots, numerical summaries, and one of the following categorical variables: cut, clarity, or color.
#### There won’t be a solution video for this exercise so go to the discussion thread for either BOXPLOTS BY CLARITY, BOXPLOT BY COLOR, or BOXPLOTS BY CUT to share you thoughts and to see what other people found.
#### You can save images by using the ggsave() command. ggsave() will save the last plot created. For example...
#### qplot(x = price, data = diamonds)
#### ggsave('priceHistogram.png')
#### ggsave currently recognises the extensions eps/ps, tex (pictex), pdf, jpeg, tiff, png, bmp, svg and wmf (windows only).
```{r Cut4}
diamonds %>%
group_by(cut) %>%
summarise(count = n(), avg_price = mean(price))
```
```{r Cut5}
ggplot(data=diamonds, aes(x = clarity, y = price, color = cut)) +
geom_boxplot() + facet_grid(color~.)
ggsave("pricehist6.jpg")
```
## 3.11
#### a) What is the price range for the middle 50% of the diamonds with color D? 911; 4213.5
#### b) What is the price range for the middle 50% of diamonds with color J? 1860.5; 7695
#### c) What is the IQR for diamonds with the best color? 3302.5
#### d) What is the IQR for the diamonds with the worst color? 5834.5
```{r Cut6}
by(diamonds$price, diamonds$color, summary)
```
```{r Cut7}
diamonds %>%
group_by(color) %>%
filter(color == "D") %>%
summarise(Quartile.25 = quantile(price, 0.25),
Quartile.75 = quantile(price, 0.75),
IQR = Quartile.75 - Quartile.25)
```
```{r Cut8}
diamonds %>%
group_by(color) %>%
filter(color == "J") %>%
summarise(Quartile.25 = quantile(price, 0.25),
Quartile.75 = quantile(price, 0.75),
IQR = Quartile.75 - Quartile.25)
```
## 3.12
#### Investigate the price per carat of diamonds across the different colors of diamonds using boxplots.
```{r Carat1}
ggplot(data = diamonds, aes(x = color, y = price/carat, fill = color)) + geom_boxplot() + scale_y_continuous(limits = c(2000, 6000), labels=dollar) + xlab("Color") + ylab("Price per Carat") + stat_summary(fun.y=mean, geom="point", shape=5, size=4) + coord_flip()
ggsave("pricehist8.jpg")
```
## 3.13
#### Investigate the weight of the diamonds (carat) using a frequency polygon. Use different bin widths to see how the frequency polygon changes. What carat size has a count greater than 2000? Check all that apply.
#### 0.3; 1.01
```{r Carat2}
summary(diamonds$carat)
```
```{r Carat3}
ggplot(diamonds, aes(x=carat)) + geom_freqpoly(binwidth=0.1, color = 'blueviolet', alpha = 0.7) +
scale_x_continuous(breaks = seq(0, 5, 0.1), expand = c(0,0)) +
geom_vline(xintercept = c(0.1, 0.8, 1.6, 2.0, 3.0, 5.0), color = "firebrick1", linetype="dotted") +
geom_vline(xintercept = c(0.3, 1.01), color = "green", linetype = "longdash") +
geom_hline(yintercept = 2000, color = "dodgerblue", linetype="solid", alpha = 0.5) +
xlab("Carat") + ylab("Count")
ggsave("pricehist7.jpg")
```
## 3.14
#### The Gapminder website contains over 500 data sets with information about the world's population. Your task is to download a data set of your choice and create 2-5 plots that make use of the techniques from Lesson 3.
#### You might use a simple histogram, a boxplot split over a categorical variable, or a frequency polygon. The choice is yours!
#### You can find a link to the Gapminder website in the Instructor Notes.
#### Once you've completed your investigation, create a post in the discussions that includes:
#### 1. any questions you answered, your observations, and summary statistics
#### 2. snippets of code that created the plots
#### 3. links to the images of your plots
```{r Urban1}
urban <- tbl_df(read.xlsx("indicator.xlsx", sheetName="Data", header=TRUE))
```
```{r Urban2}
row.with.na <- apply(urban, 1, function(x){any(is.na(x))})
sum(row.with.na)
```
```{r Urban3}
filtered <- urban[!row.with.na,]
names(filtered)[1] <- "country"
```
```{r Urban4}
population <- melt(filtered, id=c("country"), value.name="population", variable.name="year")
population <- tbl_df(population)
```
```{r Urban5}
population <- population %>%
mutate(year = as.character(year), year = substr(year, 2, 5), year = as.numeric(year))
```
```{r Urban6}
color_pallete_function <- colorRampPalette(colors = c("red", "green", "purple"), space = "Lab")
```
```{r Urban7}
country_set1 = c("China", "India", "United States", "Brazil", 'Indonesia')
country_set2 = c("Germany", "United Kingdom", "France", "Iran", "Turkey", "Italy", 'Pakistan', 'Philippines', 'Bangladesh')
```
```{r Urban8}
num_colors <- length(country_set2)
```
```{r Urban9}
color_colors <- color_pallete_function(num_colors)
reorder_size <- function(x) {factor(x, levels = names(sort(table(x))))}
```
```{r Urban10}
population3 <- population[which(population$country == 'Russia'),]
population2011 <- population[which(population$year == 2011 & population$country %in% country_set2),]
population2 <- population[which(population$country %in% country_set1),]
```
```{r Urban11}
ggplot(data=population2, aes(x = year, y = population, color = country)) + geom_line() + scale_x_continuous(limits = c(1961, 2011), breaks = seq(1961, 2011, 5)) + labs(title="Urban population's dynamics", x="Year", y="Urban population")
ggsave("urban01.jpg")
```
```{r Urban12}
ggplot(data=population2011, aes(x=country, y=reorder_size(population), fill=population)) + geom_bar(stat="identity", color="darkblue") + scale_fill_gradient("Urban population", low = "purple", high = "firebrick1") + labs(title="Urban population in 2011", x="Country", y="Urban population")
ggsave("urban02.jpg")
```
```{r, echo=FALSE}
fit.lm <-lm(population3$population ~ population3$year)
plot(population3$year, population3$population)
abline(fit.lm)
```
```{r Urban13}
ggplot(data=population3, aes(x=year, y=population, color = cut(..y.., seq(64000000, 122000000, 10000000)))) + geom_point() + geom_line(color = 'red') + scale_x_continuous(limits = c(1961, 2011), breaks = seq(1961, 2011, 5)) + geom_hline(yintercept = 100000000, color = "dodgerblue", linetype="solid", alpha = 0.8) + geom_hline(yintercept = 110000000, color = "firebrick2", linetype="solid", alpha = 0.8)+ labs(title="Urban population in Russia", x="Year", y="Urban population")
ggsave("urban03.jpg")
```
## 3.15
#### How many birthdays are in each month? Which day of the year has the most number of birthdays? Do you have at least 365 friends that have birthdays on everyday of the year?
# **********************************************************************
#### You will need to do some data munging and additional research to complete this task. This task won't be easy, and you may encounter some unexpected challenges along the way. We hope you learn a lot from it though.
#### You can expect to spend 30 min or more on this task depending on if use the provided data or obtain your personal data. We also encourage you to use the lubridate package for working with dates. Read over the documentation in RStudio and search for examples online if you need help.
#### You'll need to export your Facebooks friends' birthdays to a csv file. You may need to create a calendar of your Facebook friends birthdays in a program like Outlook or Gmail and then export the calendar as a csv file.
#### Once you load the data into R Studio, you can use the strptime() function to extract the birth months and birth days. We recommend looking up the documentation for the function and finding examples online.
```{r BD1}
bd_examples <- tbl_df(read.csv("birthdaysExample.csv"))
bdates <- mdy(bd_examples$dates)
```
```{r BD2}
bd_examples <- bd_examples %>%
mutate(birthday = bdates, weekday = weekdays(bdates, abbr=TRUE),
year = year(bdates), month = month(bdates, label=TRUE, abbr=FALSE),
day = day(bdates))
```
```{r BD3}
days_by_weekday <- ddply(bd_examples, .(weekday), summarize, dates=length(dates))
days_by_weekday
```
```{r BD4}
days_by_weekday$weekday <- factor(days_by_weekday$weekday, levels= c("пн", "вт", "ср", "чт", "пт", "сб", "вс"))
days_by_weekday[order(days_by_weekday$weekday), ]
```
```{r BD5}
ggplot(data=days_by_weekday, aes(x = weekday, y = dates, fill = weekday)) +
geom_bar(stat="identity", color="darkblue")
ggsave("birthdays01.jpg")
```
```{r BD6}
days_by_day <- ddply(bd_examples, .(day), summarize, dates=length(dates))
days_by_day
```
```{r BD7}
ggplot(data=days_by_day, aes(x = day, y = dates, fill = day)) +
geom_bar(stat="identity", color="darkblue")
ggsave("birthdays02.jpg")
```
```{r BD8}
days_by_month <- ddply(bd_examples, .(month), summarize, dates=length(dates))
days_by_month
```
```{r BD9}
ggplot(data=days_by_month, aes(x = month, y = dates, color = month)) +
geom_point(stat="identity", shape=8, size=5)
ggsave("birthdays03.jpg")
```
DATA ANALYSIS WITH R; problemset4.Rmd
---
title: "Problem Set 4"
runtime: shiny
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, results = 'markup')
```
```{r setup2}
setwd('/Users/olgabelitskaya/version-control/reflections-ud651')
```
## Libraries
```{r Libraries 1}
library(ggplot2)
library(lubridate)
```
```{r Libraries 2}
library(ggthemes)
```
```{r Libraries 3}
library(grid)
library(gridExtra)
```
```{r Libraries 4}
library(scales)
library(reshape2)
```
```{r Libraries 5}
library(plyr)
library(dplyr)
library(tidyr)
```
```{r Libraries 6}
library(xlsx)
```
## Useful links
```{r Links}
# http://www.stat.columbia.edu/~tzheng/files/Rcolor.pdf
# http://www.cookbook-r.com/Graphs/Shapes_and_line_types/
# http://www.ats.ucla.edu/stat/r/faq/smooths.htm
# http://docs.ggplot2.org/current/scale_brewer.html
# http://www.cookbook-r.com/Graphs/Colors_(ggplot2)/
# https://www.r-bloggers.com/from-continuous-to-categorical/
```
## 4.1
#### In this problem set, you'll continue to explore the diamonds data set.
#### Your first task is to create a scatterplot of price vs x, using the ggplot syntax.
```{r Data set}
data(diamonds)
summary(diamonds)
```
```{r Price x}
ggplot(diamonds, aes(x = x, y = price)) + geom_point(position = position_jitter(h=0), shape=15, alpha=1/10, color = 'darkgreen') + coord_cartesian(xlim=c(3, 11)) + scale_y_continuous(breaks=seq(1000, 19000, 1000), labels=dollar) + theme_bw()
ggsave("energy01.jpg")
```
## 4.2
#### What are your observations about the scatterplot of price vs x?
#### The point cloud starts at about x = 3, and price grows roughly exponentially with x up to about x = 9. The top of the price range is about $19k, and the spread of prices widens considerably near that level.
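#### One way to check the "roughly exponential" impression is to put price on a log10 scale; if the growth were exponential in x, the cloud should look closer to linear. A minimal sketch reusing the data loaded above:
```{r Price x log scale}
ggplot(diamonds, aes(x = x, y = price)) +
  geom_point(alpha = 1/20, color = 'darkgreen') +
  coord_cartesian(xlim = c(3, 11)) +
  scale_y_log10(labels = dollar) +
  theme_bw()
```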
## 4.3
#### What is the correlation between price and x (and y, and z)?
```{r Cor.test}
with(diamonds, cor.test(price, x))
with(diamonds, cor.test(price, y))
with(diamonds, cor.test(price, z))
```
#### Pearson's product-moment correlation with price (df = 53938, p-value < 2.2e-16 in all three tests):
#### x: cor = 0.8844 (95% CI 0.8826–0.8863)
#### y: cor = 0.8654 (95% CI 0.8633–0.8675)
#### z: cor = 0.8612 (95% CI 0.8591–0.8634)
## 4.4
#### Create a simple scatter plot of price vs depth.
```{r Price depth 1}
ggplot(diamonds, aes(x = depth, y = price)) + geom_point(position = position_jitter(h=0), shape=6, alpha=1/10, color = 'darkred') + theme_grey()
ggsave("energy02.jpg")
```
## 4.5
#### Change the code to make the transparency of the points to be 1/100 of what they are now and mark the x-axis every 2 units. See the instructor notes for two hints.
```{r Price depth 2}
ggplot(diamonds, aes(x = depth, y = price)) + geom_point(position = position_jitter(h=0), shape=2, alpha=1/100, color = 'darkblue') + scale_x_continuous(breaks = seq(min(diamonds$depth), max(diamonds$depth), 2), labels = seq(min(diamonds$depth), max(diamonds$depth), 2)) + theme_bw()
ggsave("energy03.jpg")
```
## 4.6
#### Based on the scatterplot of depth vs. price, most diamonds are between what values of depth?
#### (58;64)
## 4.7
#### What's the correlation of depth vs. price?
#### Based on the correlation coefficient, would you use depth to predict the price of a diamond? Why?
```{r Price depth 3}
with(diamonds, cor.test(depth, price))
```
#### Pearson's product-moment correlation of depth and price: cor = -0.0106 (95% CI -0.0191 to -0.0022), df = 53938, p-value = 0.0134.
#### No, the correlation is too weak; depth alone is not enough to predict price.
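#### As a supporting check (a minimal sketch, not part of the exercise): for a one-predictor linear model the R-squared equals the squared correlation, so depth explains almost none of the variation in price.
```{r Depth R squared}
# R^2 of price ~ depth is roughly (-0.0106)^2, i.e. about 0.0001
summary(lm(price ~ depth, data = diamonds))$r.squared
```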
## 4.8
#### Create a scatterplot of price vs carat and omit the top 1% of price and carat values.
```{r Price carat}
ggplot(diamonds, aes(x = carat, y = price)) + geom_point(position = position_jitter(h=0), shape=9, alpha=0.1, color = 'firebrick1') + scale_x_continuous(breaks=seq(0, 2.5, 0.1), limits=c(0, quantile(diamonds$carat, 0.99))) + scale_y_continuous(breaks=seq(0, 18000, 1000), limits=c(0 , quantile(diamonds$price, 0.99)), labels=dollar) + theme_bw()
ggsave("energy04.jpg")
```
## 4.9
#### Create a scatterplot of price vs. volume (x * y * z). This is a very rough approximation for a diamond's volume.
#### Create a new variable for volume in the diamonds data frame. This will be useful in a later exercise.
#### Don't make any adjustments to the plot just yet.
```{r Price volume 1}
diamonds_v <- diamonds %>%
mutate(volume=x*y*z)
```
```{r Price volume 2}
ggplot(diamonds_v, aes(x = volume, y = price)) + geom_point() + theme_economist_white()
ggsave("energy05.jpg")
```
## 4.10
#### What are your observations from the price vs volume scatterplot?
#### Price rises roughly exponentially with volume, so transforming the x-scale is one possible way to improve this visualization. There are many diamonds with volume near 0, and three extreme outliers are visible.
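#### The extreme cases mentioned above can be listed directly (a minimal sketch using the diamonds_v data frame created earlier):
```{r Volume outliers}
# diamonds with a zero volume (at least one missing dimension) or an implausibly large one
subset(diamonds_v, volume == 0 | volume >= 800)[, c('carat', 'price', 'x', 'y', 'z', 'volume')]
```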
## 4.11
#### What's the correlation of price and volume? Exclude diamonds that have a volume of 0 or that are greater than or equal to 800.
```{r Detach plyr}
detach("package:plyr", unload=TRUE)
```
```{r Price volume 3}
with(subset(diamonds_v, !(volume == 0 | volume >= 800) ), cor.test(price, volume))
```
#### Pearson's product-moment correlation of price and volume on the filtered data: cor = 0.9235 (95% CI 0.9223–0.9248), df = 53915, p-value < 2.2e-16.
## 4.12
#### Subset the data to exclude diamonds with a volume greater than or equal to 800. Also, exclude diamonds with a volume of 0. Adjust the transparency of the points and add a linear model to the plot. (See the Instructor Notes or look up the documentation of geom_smooth() for more details about smoothers.)
#### We encourage you to think about this next question and to post your thoughts in the discussion section.
#### Do you think this would be a useful model to estimate the price of diamonds? Why or why not?
```{r Price volume 4}
sub_diamonds_v <- diamonds_v %>%
filter(volume != 0, volume <= 800)
```
```{r Price volume 5}
ggplot(sub_diamonds_v, aes( x = volume, y = price)) + geom_point(position = position_jitter(h=0), shape=2, alpha=0.1, color = 'darkgreen') + geom_smooth(method = "lm", se = TRUE) + theme_bw()
ggsave("energy06.jpg")
```
```{r Price volume 6}
ggplot(sub_diamonds_v, aes( x = volume, y = price)) + geom_point(position = position_jitter(h=0), shape=1, alpha=0.1, color = 'darkviolet') + geom_smooth(method = "gam", se = TRUE) + scale_y_continuous(breaks=seq(0, 18000, 1000), limits=c(0 ,quantile(diamonds$price, 0.99))) + theme_bw()
ggsave("energy07.jpg")
```
## 4.13
#### Use the dplyr package to create a new data frame containing info on diamonds by clarity.
#### Name the data frame diamondsByClarity. The data frame should contain the following variables in this order.
#### (1) mean_price
#### (2) median_price
#### (3) min_price
#### (4) max_price
#### (5) n
#### where n is the number of diamonds in each level of clarity.
```{r Price clarity}
diamondsByClarity<- diamonds %>%
group_by(clarity) %>%
summarise(mean_price = mean(price),
median_price = median(price),
min_price = min(price),
max_price = max(price),
n = n() ) %>%
arrange(clarity)
```
## 4.14
#### We’ve created summary data frames with the mean price by clarity and color. You can run the code in R to verify what data is in the variables diamonds_mp_by_clarity and diamonds_mp_by_color.
#### Your task is to write additional code to create two bar plots on one output image using the grid.arrange() function from the package gridExtra.
```{r Price group 1}
diamonds_by_clarity <- group_by(diamonds, clarity)
diamonds_mp_by_clarity <- summarise(diamonds_by_clarity, mean_price = mean(price))
diamonds_by_color <- group_by(diamonds, color)
diamonds_mp_by_color <- summarise(diamonds_by_color, mean_price = mean(price))
```
```{r Price group 2}
p1 <- ggplot(diamonds_mp_by_clarity, aes(x=clarity, y=mean_price, fill= clarity)) + geom_bar(stat = "identity", color = "darkblue") + scale_fill_hue(l=50, c=200) + guides(fill = guide_legend(ncol=1, title.hjust=0.2)) + theme_foundation()
ggsave("energy14.jpg")
p2 <- ggplot(diamonds_mp_by_color, aes(x=color, y=mean_price, fill=color)) + geom_bar(stat = "identity", color = "darkviolet") + scale_fill_brewer(palette="Spectral") + guides(fill = guide_legend(ncol=1, title.hjust=0.2)) + theme_foundation()
ggsave("energy08.jpg")
grid.arrange(p1, p2, ncol=1)
g1 <- grid.arrange(p1, p2, ncol=1)
ggsave(file = "energy15.jpg", g1)
```
## 4.15
#### What do you notice in each of the bar charts for mean price by clarity and mean price by color?
#### In general, mean price decreases as clarity improves from I1 to IF and increases as color worsens from D to J.
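#### A plausible explanation worth checking (an assumption, not stated in the exercise) is that carat weight confounds these comparisons, since heavier stones tend to sit in the lower clarity grades and worse color grades. A minimal sketch:
```{r Mean carat by clarity and color}
# mean carat by clarity and by color; if carat rises toward I1 and J, it can drive the price pattern above
with(diamonds, tapply(carat, clarity, mean))
with(diamonds, tapply(carat, color, mean))
```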
## 4.16
#### The Gapminder website contains over 500 data sets with information about the world's population. Your task is to continue the investigation you did at the end of Problem Set 3 or you can start fresh and choose a different data set from Gapminder.
#### If you are feeling adventurous or want to try some data munging see if you can find a data set or scrape one from the web.
#### In your investigation, examine pairs of variable and create 2-5 plots that make use of the techniques from Lesson 4.
#### Once you've completed your investigation, create a post in the discussions that includes:
#### 1. the variable(s) you investigated, your observations, and any summary statistics
#### 2. snippets of code that created the plots
#### 3. links to the images of your plots
#### Copy and paste all of the code that you used for your investigation, and submit it when you are ready.
# ====================================================================
```{r Gapminder data 1}
energy <- tbl_df(read.xlsx("energy_use_per_person.xlsx", sheetName="Data", header=TRUE))
```
```{r Gapminder data 2}
row.with.na <- apply(energy, 1, function(x){any(is.na(x))})
sum(row.with.na)
```
```{r Gapminder data 3}
filtered <- energy[!row.with.na,]
names(filtered)[1] <- "country"
```
```{r Gapminder data 4}
energy_db <- melt(filtered, id=c("country"), value.name="energy", variable.name="year")
energy_db <- tbl_df(energy_db)
```
```{r Gapminder data 5}
energy_db <- energy_db %>%
mutate(year = as.character(year), year = substr(year, 2, 5), year = as.numeric(year))
head(energy_db)
```
```{r Gapminder data 6}
energy_db$year_d <- cut(energy_db$year, seq(1959,2012,1), right=FALSE, labels=c(1959:2011))
```
```{r Gapminder data 7}
energy1 <- energy_db[which(energy_db$country == 'Japan'),]
energy2 <- energy_db[which(energy_db$country == 'United States'),]
energy3 <- energy_db[which(energy_db$country == 'Canada'),]
country_set = c("Germany", "United Kingdom", "France", "Austria", "Belgium", "Italy", 'Denmark', 'Netherlands', 'Norway', "Finland", "Greece", "Poland", "Portugal")
energy4 <- energy_db[which(energy_db$country %in% country_set),]
```
```{r Gapminder data 8}
ggplot(energy4, aes(x = year, y = energy, color=country)) + geom_point(position = position_jitter(h=0), shape=9, alpha=0.8) + geom_line() + labs(title="Energy use per person", x="Year", y="Energy")
ggsave("energy09.jpg")
```
```{r Gapminder data 9}
# Arrange several ggplots in one column with a single shared legend:
# pull the legend grob out of the first plot, drop the legends from all
# plots, then stack the plots above the extracted legend.
grid_arrange_shared_legend <- function(...) {
  plots <- list(...)
  g <- ggplotGrob(plots[[1]] + theme(legend.position = "bottom"))$grobs
  legend <- g[[which(sapply(g, function(x) x$name) == "guide-box")]]
  lheight <- sum(legend$height)
  grid.arrange(
    do.call(arrangeGrob, lapply(plots, function(x)
      x + theme(legend.position = "none"))),
    legend,
    ncol = 1,
    heights = unit.c(unit(1, "npc") - lheight, lheight))
}
```
```{r Gapminder data 10}
p3 <- ggplot(energy1, aes(x=year_d, y=energy, fill= year_d)) + geom_bar(stat = "identity", color = "darkblue") + scale_fill_hue(l=50, c=200) + labs(title="Energy per person in Japan", x="Year", y="Energy") + scale_x_discrete(breaks=seq(1960,2011,5), labels=seq(1960,2011,5))
ggsave("energy10.jpg")
```
```{r Gapminder data 11}
p4 <- ggplot(energy2, aes(x=year_d, y=energy, fill= year_d)) + geom_bar(stat = "identity", color = "darkblue") + scale_fill_hue(l=50, c=200) + labs(title="Energy per person in United States", x="Year", y="Energy") + scale_x_discrete(breaks=seq(1960,2011,5), labels=seq(1960,2011,5))
ggsave("energy11.jpg")
```
```{r Gapminder data 12}
p5 <- ggplot(energy3, aes(x=year_d, y=energy, fill= year_d)) + geom_bar(stat = "identity", color = "darkblue") + scale_fill_hue(l=50, c=200) + labs(title="Energy per person in Canada", x="Year", y="Energy") + scale_x_discrete(breaks=seq(1960,2011,5), labels=seq(1960,2011,5))
ggsave("energy12.jpg")
```
```{r Gapminder data 13}
grid_arrange_shared_legend(p3, p4, p5)
g2 <- grid_arrange_shared_legend(p3, p4, p5)
ggsave(file="energy13.jpg", g2, width = 8, height = 12)
```
title: "Problem Set 4"
runtime: shiny
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, results = 'markup')
```
```{r setup2}
setwd('/Users/olgabelitskaya/version-control/reflections-ud651')
```
## Libraries
```{r Libraries 1}
library(ggplot2)
library(lubridate)
```
```{r Libraries 2}
library(ggthemes)
```
```{r Libraries 3}
library(grid)
library(gridExtra)
```
```{r Libraries 4}
library(scales)
library(reshape2)
```
```{r Libraries 5}
library(plyr)
library(dplyr)
library(tidyr)
```
```{r Libraries 6}
library(xlsx)
```
## Useful links
```{r Links}
# http://www.stat.columbia.edu/~tzheng/files/Rcolor.pdf
# http://www.cookbook-r.com/Graphs/Shapes_and_line_types/
# http://www.ats.ucla.edu/stat/r/faq/smooths.htm
# http://docs.ggplot2.org/current/scale_brewer.html
# http://www.cookbook-r.com/Graphs/Colors_(ggplot2)/
# https://www.r-bloggers.com/from-continuous-to-categorical/
```
## 4.1
#### In this problem set, you'll continue to explore the diamonds data set.
#### Your first task is to create a scatterplot of price vs x, using the ggplot syntax.
```{r Data set}
data(diamonds)
summary(diamonds)
```
```{r Price x}
ggplot(diamonds, aes(x = x, y = price)) + geom_point(position = position_jitter(h=0), shape=15, alpha=1/10, color = 'darkgreen') + coord_cartesian(xlim=c(3, 11)) + scale_y_continuous(breaks=seq(1000, 19000, 1000),label=dollar) + theme_bw()
ggsave("energy01.jpg")
```
## 4.2
#### What are your observations about the scatterplot of price vs x?
#### The data set starts at about x = 3 and increases exponentially till the level 9. The top level of price is about $19k, it spreads much more widely near this point.
## 4.3
#### What is the correlation between price and x (and y, and z)?
```{r Cor.test}
with(diamonds, cor.test(price, x))
with(diamonds, cor.test(price, y))
with(diamonds, cor.test(price, z))
```
Pearson's product-moment correlation
data: price and x
t = 440.16, df = 53938, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.8825835 0.8862594
sample estimates:
cor
0.8844352
Pearson's product-moment correlation
data: price and y
t = 401.14, df = 53938, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.8632867 0.8675241
sample estimates:
cor
0.8654209
Pearson's product-moment correlation
data: price and z
t = 393.6, df = 53938, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.8590541 0.8634131
sample estimates:
cor
0.8612494
## 4.4
#### Create a simple scatter plot of price vs depth.
```{r Price depth 1}
ggplot(diamonds, aes(x = depth, y = price)) + geom_point(position = position_jitter(h=0), shape=6, alpha=1/10, color = 'darkred') + theme_grey()
ggsave("energy02.jpg")
```
## 4.5
#### Change the code to make the transparency of the points to be 1/100 of what they are now and mark the x-axis every 2 units. See the instructor notes for two hints.
```{r Price depth 2}
ggplot(diamonds, aes(x = depth, y = price)) + geom_point(position = position_jitter(h=0), shape=2, alpha=1/100, color = 'darkblue') + scale_x_continuous(breaks = seq(min(diamonds$depth), max(diamonds$depth), 2), labels = seq(min(diamonds$depth), max(diamonds$depth), 2)) + theme_bw()
ggsave("energy03.jpg")
```
## 4.6
#### Based on the scatterplot of depth vs. price, most diamonds are between what values of depth?
#### (58;64)
## 4.7
#### What's the correlation of depth vs. price?
#### Based on the correlation coefficient, would you use depth to predict the price of a diamond? Why?
```{r Price depth 3}
with(diamonds, cor.test(depth, price))
```
Pearson's product-moment correlation
data: depth and price
t = -2.473, df = 53938, p-value = 0.0134
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.019084756 -0.002208537
sample estimates:
cor
-0.0106474
No, the correlation is too weak. Only depth it not enough for price prediction.
## 4.8
#### Create a scatterplot of price vs carat and omit the top 1% of price and carat values.
```{r Price carat}
ggplot(diamonds, aes(x = carat, y = price)) + geom_point(position = position_jitter(h=0), shape=9, alpha=0.1, color = 'firebrick1') + scale_x_continuous(breaks=seq(0, 2.5, 0.1), limits=c(0, quantile(diamonds$carat, 0.99))) + scale_y_continuous(breaks=seq(0, 18000, 1000), limits=c(0 , quantile(diamonds$price, 0.99)), labels=dollar) + theme_bw()
ggsave("energy04.jpg")
```
## 4.9
#### Create a scatterplot of price vs. volume (x * y * z). This is a very rough approximation for a diamond's volume.
#### Create a new variable for volume in the diamonds data frame. This will be useful in a later exercise.
#### Don't make any adjustments to the plot just yet.
```{r Price volume 1}
diamonds_v <- diamonds %>%
mutate(volume=x*y*z)
```
```{r Price volume 2}
ggplot(diamonds_v, aes(x = volume, y = price)) + geom_point() + theme_economist_white()
ggsave("energy05.jpg")
```
## 4.10
#### What are your observations from the price vs volume scatterplot?
#### Prices rise roughly exponentially with volume, so transforming the x-scale is one possible way to improve this visualization. There are many diamonds with volumes near 0, and three clear outliers are visible.
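#### A minimal sketch of that transformation idea, putting volume on a log10 scale and dropping the zero volumes and extreme outliers (an illustration, not the required answer for the next exercises).
```{r Volume log scale sketch}
# On a log10 x-scale the relationship straightens out and the near-zero
# volumes no longer compress the rest of the plot.
ggplot(subset(diamonds_v, volume > 0 & volume < 800),
       aes(x = volume, y = price)) +
  geom_point(alpha = 0.05, color = 'darkblue') +
  scale_x_log10() +
  theme_bw()
```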
## 4.11
#### What's the correlation of price and volume? Exclude diamonds that have a volume of 0 or that are greater than or equal to 800.
```{r Detach plyr}
detach("package:plyr", unload=TRUE)
```
```{r Price volume 3}
with(subset(diamonds_v, !(volume == 0 | volume >= 800) ), cor.test(price, volume))
```
Pearson's product-moment correlation
data: price and volume
t = 559.19, df = 53915, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.9222944 0.9247772
sample estimates:
cor
0.9235455
## 4.12
#### Subset the data to exclude diamonds with a volume greater than or equal to 800. Also, exclude diamonds with a volume of 0. Adjust the transparency of the points and add a linear model to the plot. (See the Instructor Notes or look up the documentation of geom_smooth() for more details about smoothers.)
#### We encourage you to think about this next question and to post your thoughts in the discussion section.
#### Do you think this would be a useful model to estimate the price of diamonds? Why or why not?
```{r Price volume 4}
sub_diamonds_v <- diamonds_v %>%
filter(volume != 0, volume < 800)
```
```{r Price volume 5}
ggplot(sub_diamonds_v, aes( x = volume, y = price)) + geom_point(position = position_jitter(h=0), shape=2, alpha=0.1, color = 'darkgreen') + geom_smooth(method = "lm", se = TRUE) + theme_bw()
ggsave("energy06.jpg")
```
```{r Price volume 6}
ggplot(sub_diamonds_v, aes( x = volume, y = price)) + geom_point(position = position_jitter(h=0), shape=1, alpha=0.1, color = 'darkviolet') + geom_smooth(method = "gam", se = TRUE) + scale_y_continuous(breaks=seq(0, 18000, 1000), limits=c(0 ,quantile(diamonds$price, 0.99))) + theme_bw()
ggsave("energy07.jpg")
```
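#### To judge whether the linear fit above would be a useful model, one can inspect the fitted coefficients and R-squared directly; this is a sketch on the same filtered data, not part of the original exercise.
```{r Volume lm summary sketch}
# R-squared should be roughly 0.85 (the squared correlation reported above),
# which looks high, but the funnel-shaped spread and the curvature suggest
# a transformed model, e.g. log(price) ~ log(volume), would fit better.
m_volume <- lm(price ~ volume, data = sub_diamonds_v)
summary(m_volume)
```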
## 4.13
#### Use functions from the dplyr package to create a new data frame containing info on diamonds by clarity.
#### Name the data frame diamondsByClarity. The data frame should contain the following variables in this order.
#### (1) mean_price
#### (2) median_price
#### (3) min_price
#### (4) max_price
#### (5) n
#### where n is the number of diamonds in each level of clarity.
```{r Price clarity}
diamondsByClarity<- diamonds %>%
group_by(clarity) %>%
summarise(mean_price = mean(price),
median_price = median(price),
min_price = min(price),
max_price = max(price),
n = n() ) %>%
arrange(clarity)
```
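#### A quick check of the result (a sketch) to confirm the columns and their order.
```{r Price clarity check}
# Expect one row per clarity level with mean, median, min, max and n.
head(diamondsByClarity, 8)
```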
## 4.14
#### We’ve created summary data frames with the mean price by clarity and color. You can run the code in R to verify what data is in the variables diamonds_mp_by_clarity and diamonds_mp_by_color.
#### Your task is to write additional code to create two bar plots on one output image using the grid.arrange() function from the package gridExtra.
```{r Price group 1}
diamonds_by_clarity <- group_by(diamonds, clarity)
diamonds_mp_by_clarity <- summarise(diamonds_by_clarity, mean_price = mean(price))
diamonds_by_color <- group_by(diamonds, color)
diamonds_mp_by_color <- summarise(diamonds_by_color, mean_price = mean(price))
```
```{r Price group 2}
p1 <- ggplot(diamonds_mp_by_clarity, aes(x=clarity, y=mean_price, fill= clarity)) + geom_bar(stat = "identity", color = "darkblue") + scale_fill_hue(l=50, c=200) + guides(fill = guide_legend(ncol=1, title.hjust=0.2)) + theme_foundation()
ggsave("energy14.jpg")
p2 <- ggplot(diamonds_mp_by_color, aes(x=color, y=mean_price, fill=color)) + geom_bar(stat = "identity", color = "darkviolet") + scale_fill_brewer(palette="Spectral") + guides(fill = guide_legend(ncol=1, title.hjust=0.2)) + theme_foundation()
ggsave("energy08.jpg")
grid.arrange(p1, p2, ncol=1)
g1 <- grid.arrange(p1, p2, ncol=1)
ggsave(file = "energy15.jpg", g1)
```
## 4.15
#### What do you notice in each of the bar charts for mean price by clarity and mean price by color?
#### In general, the mean price decreases as clarity changes from I1 to IF and increases as color changes from D to J.
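#### One plausible explanation, worth verifying, is that carat size confounds these patterns: lower-clarity and poorer-color stones tend to be larger. The sketch below is an extra check, not part of the exercise.
```{r Carat by clarity and color sketch}
# If mean carat falls from I1 to IF and rises from D to J, then stone size,
# not quality, is likely driving the mean-price pattern above.
summarise(group_by(diamonds, clarity), mean_carat = mean(carat))
summarise(group_by(diamonds, color), mean_carat = mean(carat))
```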
## 4.16
#### The Gapminder website contains over 500 data sets with information about the world's population. Your task is to continue the investigation you did at the end of Problem Set 3 or you can start fresh and choose a different data set from Gapminder.
#### If you are feeling adventurous or want to try some data munging see if you can find a data set or scrape one from the web.
#### In your investigation, examine pairs of variables and create 2-5 plots that make use of the techniques from Lesson 4.
#### Once you've completed your investigation, create a post in the discussions that includes:
#### 1. the variable(s) you investigated, your observations, and any summary statistics
#### 2. snippets of code that created the plots
#### 3. links to the images of your plots
#### Copy and paste all of the code that you used for your investigation, and submit it when you are ready.
# ====================================================================
```{r Gapminder data 1}
energy <- tbl_df(read.xlsx("energy_use_per_person.xlsx", sheetName="Data", header=TRUE))
```
```{r Gapminder data 2}
row.with.na <- apply(energy, 1, function(x){any(is.na(x))})
sum(row.with.na)
```
```{r Gapminder data 3}
filtered <- energy[!row.with.na,]
names(filtered)[1] <- "country"
```
```{r Gapminder data 4}
energy_db <- melt(filtered, id=c("country"), value.name="energy", variable.name="year")
energy_db <- tbl_df(energy_db)
```
```{r Gapminder data 5}
energy_db <- energy_db %>%
mutate(year = as.character(year), year = substr(year, 2, 5), year = as.numeric(year))
head(energy_db)
```
```{r Gapminder data 6}
energy_db$year_d <- cut(energy_db$year, seq(1959,2012,1), right=FALSE, labels=c(1959:2011))
```
```{r Gapminder data 7}
energy1 <- energy_db[which(energy_db$country == 'Japan'),]
energy2 <- energy_db[which(energy_db$country == 'United States'),]
energy3 <- energy_db[which(energy_db$country == 'Canada'),]
country_set = c("Germany", "United Kingdom", "France", "Austria", "Belgium", "Italy", 'Denmark', 'Netherlands', 'Norway', "Finland", "Greece", "Poland", "Portugal")
energy4 <- energy_db[which(energy_db$country %in% country_set),]
```
```{r Gapminder data 8}
ggplot(energy4, aes(x = year, y = energy, color=country)) + geom_point(position = position_jitter(h=0), shape=9, alpha=0.8) + geom_line() + labs(title="Energy use per person", x="Year", y="Energy")
ggsave("energy09.jpg")
```
```{r Gapminder data 9}
grid_arrange_shared_legend <- function(...) {
  # Collect the plots and extract the legend grob from the first plot.
  plots <- list(...)
  g <- ggplotGrob(plots[[1]] + theme(legend.position="bottom"))$grobs
  legend <- g[[which(sapply(g, function(x) x$name) == "guide-box")]]
  lheight <- sum(legend$height)
  # Stack the plots without their individual legends and append
  # the single shared legend below them.
  grid.arrange(
    do.call(arrangeGrob, lapply(plots, function(x)
      x + theme(legend.position="none"))),
    legend,
    ncol = 1,
    heights = unit.c(unit(1, "npc") - lheight, lheight))
}
```
```{r Gapminder data 10}
p3 <- ggplot(energy1, aes(x=year_d, y=energy, fill= year_d)) + geom_bar(stat = "identity", color = "darkblue") + scale_fill_hue(l=50, c=200) + labs(title="Energy per person in Japan", x="Year", y="Energy") + scale_x_discrete(breaks=seq(1960,2011,5), labels=seq(1960,2011,5))
ggsave("energy10.jpg")
```
```{r Gapminder data 11}
p4 <- ggplot(energy2, aes(x=year_d, y=energy, fill= year_d)) + geom_bar(stat = "identity", color = "darkblue") + scale_fill_hue(l=50, c=200) + labs(title="Energy per person in United States", x="Year", y="Energy") + scale_x_discrete(breaks=seq(1960,2011,5), labels=seq(1960,2011,5))
ggsave("energy11.jpg")
```
```{r Gapminder data 12}
p5 <- ggplot(energy3, aes(x=year_d, y=energy, fill= year_d)) + geom_bar(stat = "identity", color = "darkblue") + scale_fill_hue(l=50, c=200) + labs(title="Energy per person in Canada", x="Year", y="Energy") + scale_x_discrete(breaks=seq(1960,2011,5), labels=seq(1960,2011,5))
ggsave("energy12.jpg")
```
```{r Gapminder data 13}
grid_arrange_shared_legend(p3, p4, p5)
g2 <- grid_arrange_shared_legend(p3, p4, p5)
ggsave(file="energy13.jpg", g2, width = 8, height = 12)
```
DATA ANALYSIS WITH R; lesson4.Rmd
---
title: "Lesson 4"
runtime: shiny
output:
html_document:
theme: united
highlight: tango
---
Lesson 4
========================================================
***
### Working directory and libraries
```{r setup}
setwd('/Users/olgabelitskaya/version-control/reflections-ud651')
```
```{r Libraries1}
library(ggplot2)
library(lubridate)
library(ggthemes)
```
```{r Libraries3}
library(gridExtra)
library(plyr)
```
```{r Libraries4}
library(scales)
library(reshape2)
```
```{r Libraries5}
library(dplyr)
library(tidyr)
```
```{r Libraries6}
library(xlsx)
```
## Useful links
```{r Links}
# http://docs.ggplot2.org/current/
# http://docs.ggplot2.org/current/coord_trans.html
# http://sape.inf.usi.ch/quick-reference/ggplot2/themes
# http://personality-project.org/r/html/corr.test.html
# https://rpubs.com/hadley/ggplot2-layers
# http://rmarkdown.rstudio.com/articles_integration.html
```
```{r Pseudo-Facebook User Data}
pf <- read.csv('pseudo_facebook.tsv', sep='\t')
names(pf)
```
***
### Scatterplots
```{r Scatterplots}
qplot(age, friend_count, data = pf) + geom_point(color = 'darkblue')
```
***
### ggplot Syntax
```{r ggplot Syntax}
ggplot(aes(x = age, y = friend_count), data = pf) + geom_point(color = 'darkblue') + xlim(13,90)
```
***
### Overplotting
```{r Overplotting}
ggplot(aes(x = age, y = friend_count), data = pf) + geom_jitter(shape=7, alpha=1/10, color = 'darkred') + xlim(13,90)
```
***
### Coord_trans()
```{r Coord_trans()1}
ggplot(data = diamonds, aes(x = carat, y = price)) + geom_point(shape=5, alpha=1/10, color = 'purple') + coord_trans(x="log10", y="log10")
```
#### Look up the documentation and add a layer to the plot that transforms friend_count using the square root function. Create your plot!
```{r Coord_trans()2}
ggplot(data = pf, aes(x = age, y = friend_count)) + geom_point(position = position_jitter(h=0), shape=2, alpha=1/10, color = 'navy') + coord_trans(y="sqrt") + theme_bw()
```
***
### Alpha and Jitter
```{r Alpha and Jitter}
ggplot(data = pf, aes(x = age, y = friendships_initiated)) + geom_point(position = position_jitter(h=0), shape=10, alpha=1/20, color = 'darkgreen') + xlim(13,90) + coord_trans(y="sqrt") + theme_bw()
```
***
### Conditional Means
```{r Conditional Means}
age_groups <- group_by(pf, age)
pf.fc_by_age <- summarise(age_groups, fc_mean= mean(friend_count), fc_median = median(friend_count), n = n())
pf.fc_by_age <- arrange(pf.fc_by_age, age)
head(pf.fc_by_age)
```
Create your plot!
```{r Conditional Means Plot}
ggplot(data = pf.fc_by_age, aes(x = age, y = fc_mean)) +
geom_line(color = 'forestgreen') + xlim(13,90) + theme_grey()
```
***
### Overlaying Summaries with Raw Data
```{r Overlaying Summaries with Raw Data}
ggplot(data = pf, aes(x = age, y = friend_count)) + geom_point(position = position_jitter(h=0), shape=2, alpha=1/10, color = 'orange') + geom_line(stat = 'summary', fun.y = mean, color = "green") + geom_line(stat = 'summary', fun.y = quantile, fun.args = list(probs = .9), color = "darkgreen", linetype = 5) + geom_line(stat = 'summary', fun.y = quantile, fun.args = list(probs = .1), color = "darkblue", linetype = 5) + geom_line(stat = 'summary', fun.y = quantile, fun.args = list(probs = .5), color = "blue", linetype = 5) + coord_cartesian(xlim = c(13,70), ylim = c(0,1000)) + theme_bw()
ggsave("pf01.jpg")
```
***
### Moira: Histogram Summary and Scatterplot
See the Instructor Notes of this video to download Moira's paper on perceived audience size and to see the final plot.
***
### Correlation
```{r Correlation}
?cor.test.formula
cor.test(pf$age, pf$friend_count, method = 'pearson')
```
What's the correlation between age and friend count? Round to three decimal places.
Pearson's product-moment correlation
data: pf$age and pf$friend_count
t = -8.6268, df = 99001, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.03363072 -0.02118189
sample estimates:
cor
-0.02740737
***
```{r Correlation2}
with(pf, cor.test(age, friend_count, method = 'pearson'))
```
### Correlation on Subsets
```{r Correlation on Subsets}
with(subset(pf, age <= 70), cor.test(age, friend_count))
```
***
### Correlation Methods
```{r Correlation on Subsets2}
with(subset(pf, age <= 70), cor.test(age, friend_count, method = 'spearman'))
```
***
## Create Scatterplots
```{r Create Scatterplots1}
ggplot(data = pf, aes(x = www_likes_received, y = likes_received)) + geom_point(position = position_jitter(h=0), shape=2, alpha=1/10, color = 'blue') + geom_line(stat = 'summary', fun.y = mean, color = "darkorchid1") + geom_line(stat = 'summary', fun.y = quantile, fun.args = list(probs = .9), color = "red", linetype = 5) + geom_line(stat = 'summary', fun.y = quantile, fun.args = list(probs = .1), color = "darkred", linetype = 5) + geom_line(stat = 'summary', fun.y = quantile, fun.args = list(probs = .5), color = "orange", linetype = 5) + coord_cartesian(xlim = c(0,1000), ylim = c(0,1000)) + theme_bw()
ggsave("pf02.jpg")
```
```{r Create Scatterplots2}
ggplot(data = pf, aes(x = www_likes_received, y = likes_received)) + geom_point(position = position_jitter(h=0), shape=2, alpha=1/10, color = 'blue') + coord_cartesian(xlim = c(0,500), ylim = c(0,500)) + geom_line(stat = 'summary', fun.y = mean, color = "darkorchid1") + theme_bw()
ggsave("pf03.jpg")
```
***
### Strong Correlations
```{r Strong Correlations}
ggplot(data = pf, aes(x = www_likes_received, y = likes_received)) + geom_point(position = position_jitter(h=0), shape=5, alpha=1/10, color = 'darkgreen') + xlim(0, quantile(pf$www_likes_received, 0.95)) + ylim(0, quantile(pf$likes_received, 0.95)) + geom_smooth(method='lm', color ='darkorange') + theme_bw()
```
What's the correlation between the two variables? Include the top 5% of values for the variable in the calculation and round to 3 decimal places.
Pearson's product-moment correlation
data: www_likes_received and likes_received
t = 937.1, df = 99001, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.9473553 0.9486176
sample estimates:
cor
0.9479902
```{r Correlation Calculation}
with(pf, cor.test(www_likes_received, likes_received, method = 'pearson'))
```
***
### More Caution with Correlation
```{r More Caution With Correlation}
#install.packages('alr3')
library(alr3)
```
```{r, echo=FALSE}
data(Mitchell)
?Mitchell
```
Create your plot!
```{r Temp vs Month1}
ggplot(data = Mitchell, aes(x = Month, y = Temp)) + geom_point(position = position_jitter(h=0), shape=2, alpha=1/2, color = 'darkblue') + theme_bw()
```
***
### Noisy Scatterplots
a. Take a guess for the correlation coefficient for the scatterplot.
0.01
b. What is the actual correlation of the two variables?
(Round to the thousandths place)
Pearson's product-moment correlation
data: Temp and Month
t = 0.81816, df = 202, p-value = 0.4142
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.08053637 0.19331562
sample estimates:
cor
0.05747063
```{r Noisy Scatterplots}
with(Mitchell, cor.test(Month, Temp))
```
```{r Temp vs Month2}
ggplot(data = Mitchell, aes(x = Month, y = Temp)) + geom_point(position = position_jitter(h=0), shape=6, alpha=1/2, color = 'forestgreen') + theme_bw() + scale_x_discrete(breaks = seq(0, 203, 12))
```
***
### Making Sense of Data
```{r Making Sense of Data}
# Month %% 12 folds the running month index onto a single calendar year,
# exposing the seasonal cycle hidden in the previous plot.
ggplot(data = Mitchell, aes(x = Month%%12, y = Temp)) + geom_point(position = position_jitter(h=0), shape=10, alpha=1/2, color = 'steelblue') + theme_bw() + geom_smooth(se = FALSE)
ggsave("mitchell01.jpg")
```
***
### Understanding Noise: Age to Age Months
```{r Understanding Noise: Age to Age Months}
# Estimate age with monthly resolution: a birth month of 12 adds ~0 years,
# a birth month of 1 adds ~11/12 of a year to the integer age.
pf$age_with_months <- pf$age + (1 - pf$dob_month / 12)
```
***
### Age with Months Means
```{r Age with Months Means}
age_with_months_groups <- group_by(pf, age_with_months)
```
Programming Assignment
```{r Programming Assignment}
pf.fc_by_age_months <- summarise(age_with_months_groups,
friend_count_mean = mean(friend_count),
friend_count_median = median(friend_count),
n = n())
pf.fc_by_age_months <- arrange(pf.fc_by_age_months, age_with_months)
head(pf.fc_by_age_months)
```
***
### Noise in Conditional Means
```{r Noise in Conditional Means}
ggplot(data = pf.fc_by_age_months, aes(x = age_with_months, y = friend_count_mean)) + geom_line(color = 'darkgreen') + xlim(13, 71) + theme_bw()
```
```{r Noise in Conditional Means2}
ggplot(data = subset(pf.fc_by_age_months, age_with_months < 71), aes(x = age_with_months, y = friend_count_mean)) + geom_line(color = 'darkred') + xlim(13, 71) + theme_bw()
```
***
### Smoothing Conditional Means
```{r Smoothing Conditional Means}
p1 <- ggplot(aes(x=age, y=fc_mean), data=subset(pf.fc_by_age, age < 71)) + geom_line(color = 'darkred') + geom_smooth(color = 'firebrick1')
p2 <- ggplot(aes(x=age_with_months, y=friend_count_mean), data=subset(pf.fc_by_age_months, age_with_months < 71)) + geom_line(color = 'darkblue') + geom_smooth(color = 'steelblue')
p3 <- ggplot(aes(x=round(age/5)*5, y=friend_count), data=subset(pf, age < 71)) + geom_line(stat = 'summary', fun.y = mean, color = 'darkgreen')
grid.arrange(p2, p1, p3, ncol=1)
ggsave("pf04.jpg")
```
title: "Lesson 4"
runtime: shiny
output:
html_document:
theme: united
highlight: tango
---
Lesson 4
========================================================
***
### Working directory and libraries
```{r setup}
setwd('/Users/olgabelitskaya/version-control/reflections-ud651')
```
```{r Libraries1}
library(ggplot2)
library(lubridate)
library(ggthemes)
```
```{r Libraries3}
library(gridExtra)
library(plyr)
```
```{r Libraries4}
library(scales)
library(reshape2)
```
```{r Libraries5}
library(dplyr)
library(tidyr)
```
```{r Libraries6}
library(xlsx)
```
## Useful links
```{r Links}
# http://docs.ggplot2.org/current/
# http://docs.ggplot2.org/current/coord_trans.html
# http://sape.inf.usi.ch/quick-reference/ggplot2/themes
# http://personality-project.org/r/html/corr.test.html
# https://rpubs.com/hadley/ggplot2-layers
# http://rmarkdown.rstudio.com/articles_integration.html
```
```{r Pseudo-Facebook User Data}
pf <- read.csv('pseudo_facebook.tsv', sep='\t')
names(pf)
```
***
### Scatterplots
```{r Scatterplots}
qplot(age, friend_count, data = pf) + geom_point(color = 'darkblue')
```
***
### ggplot Syntax
```{r ggplot Syntax}
ggplot(aes(x = age, y = friend_count), data = pf) + geom_point(color = 'darkblue') + xlim(13,90)
```
***
### Overplotting
```{r Overplotting}
ggplot(aes(x = age, y = friend_count), data = pf) + geom_jitter(shape=7, alpha=1/10, color = 'darkred') + xlim(13,90)
```
***
### Coord_trans()
```{r Coord_trans()1}
ggplot(data = diamonds, aes(x = carat, y = price)) + geom_point(shape=5, alpha=1/10, color = 'purple') + coord_trans(x="log10", y="log10")
```
#### Look up the documentation and add a layer to the plot that transforms friend_count using the square root function. Create your plot!
```{r Coord_trans()2}
ggplot(data = pf, aes(x = age, y = friend_count)) + geom_point(position = position_jitter(h=0), shape=2, alpha=1/10, color = 'navy') + coord_trans(y="sqrt") + theme_bw()
```
***
### Alpha and Jitter
```{r Alpha and Jitter}
ggplot(data = pf, aes(x = age, y = friendships_initiated)) + geom_point(position = position_jitter(h=0), shape=10, alpha=1/20, color = 'darkgreen') + xlim(13,90) + coord_trans(y="sqrt") + theme_bw()
```
***
### Conditional Means
```{r Conditional Means}
age_groups <- group_by(pf, age)
pf.fc_by_age <- summarise(age_groups, fc_mean= mean(friend_count), fc_median = median(friend_count), n = n())
pf.fc_by_age <- arrange(pf.fc_by_age, age)
head(pf.fc_by_age)
```
Create your plot!
```{r Conditional Means Plot}
ggplot(data = pf.fc_by_age, aes(x = age, y = fc_mean)) +
geom_line(color = 'forestgreen') + xlim(13,90) + theme_grey()
```
***
### Overlaying Summaries with Raw Data
```{r Overlaying Summaries with Raw Data}
ggplot(data = pf, aes(x = age, y = friend_count)) + geom_point(position = position_jitter(h=0), shape=2, alpha=1/10, color = 'orange') + geom_line(stat = 'summary', fun.y = mean, color = "green") + geom_line(stat = 'summary', fun.y = quantile, fun.args = list(probs = .9), color = "darkgreen", linetype = 5) + geom_line(stat = 'summary', fun.y = quantile, fun.args = list(probs = .1), color = "darkblue", linetype = 5) + geom_line(stat = 'summary', fun.y = quantile, fun.args = list(probs = .5), color = "blue", linetype = 5) + coord_cartesian(xlim = c(13,70), ylim = c(0,1000)) + theme_bw()
ggsave("pf01.jpg")
```
***
### Moira: Histogram Summary and Scatterplot
See the Instructor Notes of this video to download Moira's paper on perceived audience size and to see the final plot.
***
### Correlation
```{r Correlation}
?cor.test.formula
cor.test(pf$age, pf$friend_count, method = 'pearson')
```
What's the correlation between age and friend count? Round to three decimal places.
Pearson's product-moment correlation
data: pf$age and pf$friend_count
t = -8.6268, df = 99001, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.03363072 -0.02118189
sample estimates:
cor
-0.02740737
***
```{r Correlation2}
with(pf, cor.test(age, friend_count, method = 'pearson'))
```
### Correlation on Subsets
```{r Correlation on Subsets}
with(subset(pf, age <= 70), cor.test(age, friend_count))
```
***
### Correlation Methods
```{r Correlation on Subsets2}
with(subset(pf, age <= 70), cor.test(age, friend_count, method = 'spearman'))
```
***
## Create Scatterplots
```{r Create Scatterplots1}
ggplot(data = pf, aes(x = www_likes_received, y = likes_received)) + geom_point(position = position_jitter(h=0), shape=2, alpha=1/10, color = 'blue') + geom_line(stat = 'summary', fun.y = mean, color = "darkorchid1") + geom_line(stat = 'summary', fun.y = quantile, fun.args = list(probs = .9), color = "red", linetype = 5) + geom_line(stat = 'summary', fun.y = quantile, fun.args = list(probs = .1), color = "darkred", linetype = 5) + geom_line(stat = 'summary', fun.y = quantile, fun.args = list(probs = .5), color = "orange", linetype = 5) + coord_cartesian(xlim = c(0,1000), ylim = c(0,1000)) + theme_bw()
ggsave("pf02.jpg")
```
```{r Create Scatterplots2}
ggplot(data = pf, aes(x = www_likes_received, y = likes_received)) + geom_point(position = position_jitter(h=0), shape=2, alpha=1/10, color = 'blue') + coord_cartesian(xlim = c(0,500), ylim = c(0,500)) + geom_line(stat = 'summary', fun.y = mean, color = "darkorchid1") + theme_bw()
ggsave("pf03.jpg")
```
***
### Strong Correlations
```{r Strong Correlations}
ggplot(data = pf, aes(x = www_likes_received, y = likes_received)) + geom_point(position = position_jitter(h=0), shape=5, alpha=1/10, color = 'darkgreen') + xlim(0, quantile(pf$www_likes_received, 0.95)) + ylim(0, quantile(pf$likes_received, 0.95)) + geom_smooth(method='lm', color ='darkorange') + theme_bw()
```
What's the correlation betwen the two variables? Include the top 5% of values for the variable in the calculation and round to 3 decimal places.
Pearson's product-moment correlation
data: www_likes_received and likes_received
t = 937.1, df = 99001, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.9473553 0.9486176
sample estimates:
cor
0.9479902
```{r Correlation Calcuation}
with(pf, cor.test(www_likes_received, likes_received, method = 'pearson'))
```
***
### More Caution with Correlation
```{r More Caution With Correlation}
#install.packages('alr3')
library(alr3)
```
```{r, echo=FALSE}
data(Mitchell)
?Mitchell
```
Create your plot!
```{r Temp vs Month1}
ggplot(data = Mitchell, aes(x = Month, y = Temp)) + geom_point(position = position_jitter(h=0), shape=2, alpha=1/2, color = 'darkblue') + theme_bw()
```
***
### Noisy Scatterplots
a. Take a guess for the correlation coefficient for the scatterplot.
0.01
b. What is the actual correlation of the two variables?
(Round to the thousandths place)
Pearson's product-moment correlation
data: Temp and Month
t = 0.81816, df = 202, p-value = 0.4142
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.08053637 0.19331562
sample estimates:
cor
0.05747063
```{r Noisy Scatterplots}
with(Mitchell, cor.test(Month, Temp))
```
```{r Temp vs Month2}
ggplot(data = Mitchell, aes(x = Month, y = Temp)) + geom_point(position = position_jitter(h=0), shape=6, alpha=1/2, color = 'forestgreen') + theme_bw() + scale_x_discrete(breaks = seq(0, 203, 12))
```
***
### Making Sense of Data
```{r Making Sense of Data}
ggplot(data = Mitchell, aes(x = Month%%12, y = Temp)) + geom_point(position = position_jitter(h=0), shape=10, alpha=1/2, color = 'steelblue') + theme_bw() + geom_smooth(se = FALSE)
ggsave("mitchell01.jpg")
```
***
### Understanding Noise: Age to Age Months
```{r Understanding Noise: Age to Age Months}
pf$age_with_months <- pf$age + (1 - pf$dob_month / 12)
```
***
### Age with Months Means
```{r Age with Months Means}
age_with_months_groups <- group_by(pf, age_with_months)
```
Programming Assignment
```{r Programming Assignment}
pf.fc_by_age_months <- summarise(age_with_months_groups,
friend_count_mean = mean(friend_count),
friend_count_median = median(friend_count),
n = n())
pf.fc_by_age_months <- arrange(pf.fc_by_age_months, age_with_months)
head(pf.fc_by_age_months)
```
***
### Noise in Conditional Means
```{r Noise in Conditional Means}
ggplot(data = pf.fc_by_age_months, aes(x = age_with_months, y = friend_count_mean)) + geom_line(color = 'darkgreen') + xlim(13, 71) + theme_bw()
```
```{r Noise in Conditional Means2}
ggplot(data = subset(pf.fc_by_age_months, age_with_months < 71), aes(x = age_with_months, y = friend_count_mean)) + geom_line(color = 'darkred') + xlim(13, 71) + theme_bw()
```
***
### Smoothing Conditional Means
```{r Smoothing Conditional Means}
p1 <- ggplot(aes(x=age, y=fc_mean), data=subset(pf.fc_by_age, age < 71)) + geom_line(color = 'darkred') + geom_smooth(color = 'firebrick1')
p2 <- ggplot(aes(x=age_with_months, y=friend_count_mean), data=subset(pf.fc_by_age_months, age_with_months < 71)) + geom_line(color = 'darkblue') + geom_smooth(color = 'steelblue')
p3 <- ggplot(aes(x=round(age/5)*5, y=friend_count), data=subset(pf, age < 71)) + geom_line(stat = 'summary', fun.y = mean, color = 'darkgreen')
grid.arrange(p2, p1, p3, ncol=1)
ggsave("pf04.jpg")
```