воскресенье, 14 августа 2016 г.

DATA ANALYSIS WITH R; lesson5.Rmd

---
title: "Lesson 5"
runtime: shiny
output: html_document
---

```{r setup 1, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

========================================================

### Working directory and libraries

```{r setup 2}
setwd('/Users/olgabelitskaya/version-control/reflections-ud651')
```

```{r Libraries 1}
library(ggplot2)
library(lubridate)
```

```{r Libraries 2}
library(gridExtra)
library(plyr)
```

```{r Libraries 3}
library(scales)
library(reshape2)
```

```{r Libraries 4}
library(dplyr)
library(tidyr)
```

```{r Libraries 5}
library(xlsx)
library(ggthemes)
```

## Useful links

```{r Links}
# http://docs.ggplot2.org/current/
# http://docs.ggplot2.org/current/coord_trans.html
# http://sape.inf.usi.ch/quick-reference/ggplot2/themes
# http://personality-project.org/r/html/corr.test.html
# https://rpubs.com/hadley/ggplot2-layers
# http://rmarkdown.rstudio.com/articles_integration.html
# https://cran.r-project.org/web/packages/ggthemes/vignettes/ggthemes.html
```

```{r Pseudo-Facebook User Data}
pf <- read.csv('pseudo_facebook.tsv', sep='\t')
names(pf)
```

### Third Qualitative Variable

#### Write code to create a new data frame, called 'pf.fc_by_age_gender', that contains information on each age AND gender group.
#### The data frame should contain the following variables:
####    mean_friend_count,
####    median_friend_count,
####    n (the number of users in each age and gender grouping)

#### Here is an example of the structure of your data frame. Your data values will be different. Note that if you are grouping by more than one variable, you will probably need to call the ungroup() function.

####   age gender mean_friend_count median_friend_count    n
#### 1  13 female          247.2953                 150  207
#### 2  13   male          184.2342                  61  265
#### 3  14 female          329.1938                 245  834
#### 4  14   male          157.1204                  88 1201


```{r Third Qualitative Variable 1}
ggplot(aes(x = gender, y = age, color = gender), data = subset(pf, !is.na(gender))) + geom_boxplot() + stat_summary(fun.y = mean, geom = 'point', shape=15)
```

```{r Third Qualitative Variable 2}
ggplot(aes(x = age, y = friend_count), data = subset(pf, !is.na(gender))) + geom_line(aes(color=gender), stat = 'summary', fun.y = median)
```

```{r Third Qualitative Variable 3}
detach("package:plyr", unload=TRUE)
pf.fc_by_age_gender <- pf %>%
  filter(!is.na(gender)) %>%
  group_by(age, gender) %>%
  summarise(mean_friend_count = mean(friend_count),
            median_friend_count = median(friend_count),
            n = n() ) %>%
  ungroup() %>%
  arrange(age)
```

```{r Third Qualitative Variable 4}
pf.fc_by_age_gender2 <- subset(pf, !is.na(gender)) %>%
  group_by(age, gender) %>%
  summarise(mean_friend_count = mean(friend_count),
            median_friend_count = median(friend_count),
            n = n() ) %>%
  arrange(age)
```

***

### Plotting Conditional Summaries

```{r Plotting Conditional Summaries}
ggplot(pf.fc_by_age_gender, aes(x = age, y = median_friend_count, color =gender)) +  geom_line()
```

***

### Reshaping Data

```{r Reshaping Data}
pf.fc_by_age_gender.wide <- dcast(pf.fc_by_age_gender, age ~ gender, value.var = 'median_friend_count')
```

***

### Ratio Plot

```{r Ratio Plot}
ggplot(pf.fc_by_age_gender.wide, aes(x = age, y = female/male)) +  geom_line(color = "darkred", linetype = 5) + geom_hline(yintercept=1, color= 'darkblue', linetype = 2, alpha=0.5) + theme_bw()
```

***

### Third Quantitative Variable

```{r Third Quantitative Variable 5}
pf$year_joined <- floor(2014 - pf$tenure / 365)
```

***

### Cut a Variable
#### Create a new variable in the data frame called year_joined.bucket by using the cut function on the variable year_joined.

#### You need to create the following buckets for the new variable, year_joined.bucket

####        (2004, 2009]
####        (2009, 2011]
####        (2011, 2012]
####        (2012, 2014]

#### Note that a parenthesis means exclude the year and a bracket means include the year.

```{r Cut a Variable 1}
summary(pf$year_joined)
table(pf$year_joined)
```

```{r Cut a Variable 2}
pf$year_joined.bucket <- cut(pf$year_joined, c(2004,2009,2011,2012,2014))
table(pf$year_joined.bucket, useNA = 'ifany')
```

***

### Plotting it All Together
#### Create a line graph of friend_count vs. age so that each year_joined.bucket is a line tracking the median user friend_count across age. This means you should have four different lines on your plot.

#### You should subset the data to exclude the users whose year_joined.bucket is NA.

```{r Plotting it All Together}
ggplot(data = subset(pf,!is.na(year_joined.bucket)), aes(x = age, y = friend_count)) + geom_line(aes(color = year_joined.bucket), stat = 'summary', fun.y = median) + theme_grey()
```

***

### Plot the Grand Mean

```{r Plot the Grand Mean}
ggplot(aes(x = age, y = friend_count), data = subset(pf,!is.na(year_joined.bucket))) + geom_line(aes(color = year_joined.bucket), stat = 'summary', fun.y = mean) + geom_line(stat = 'summary', fun.y = mean, linetype = 5, color = "darkblue") + theme_grey()
```

***

### Friending Rate

```{r Friending Rate 1}
with(subset(pf,tenure >0), summary(friend_count / tenure))
```

```{r Friending Rate 2}
pf.tenure_morethanzero <- subset(pf,tenure >0)
pf.tenure_morethanzero$friend_rate = pf.tenure_morethanzero$friend_count / pf.tenure_morethanzero$tenure
summary(pf.tenure_morethanzero$friend_rate)
```

***

### Friendships Initiated

```{r Friendships Initiated}
ggplot(aes(x=tenure, y = friend_rate, color = year_joined.bucket), data = pf.tenure_morethanzero) + geom_line(stat = 'summary', fun.y = mean) + scale_colour_brewer(type="seq", palette='Spectral')
```

***

### Bias-Variance Tradeoff Revisited

```{r Bias-Variance Tradeoff Revisited}

p1 <- ggplot(aes(x = tenure, y = friendships_initiated / tenure),
       data = subset(pf, tenure >= 1)) +
  geom_line(aes(color = year_joined.bucket),
            stat = 'summary',
            fun.y = mean)

p2 <- ggplot(aes(x = 7 * round(tenure / 7), y = friendships_initiated / tenure),
       data = subset(pf, tenure > 0)) +
  geom_line(aes(color = year_joined.bucket),
            stat = "summary",
            fun.y = mean)

p3 <- ggplot(aes(x = 30 * round(tenure / 30), y = friendships_initiated / tenure),
       data = subset(pf, tenure > 0)) +
  geom_line(aes(color = year_joined.bucket),
            stat = "summary",
            fun.y = mean)

p4 <- ggplot(aes(x = 90 * round(tenure / 90), y = friendships_initiated / tenure),
       data = subset(pf, tenure > 0)) +
  geom_line(aes(color = year_joined.bucket),
            stat = "summary",
            fun.y = mean)

g1 <- grid.arrange(p1, p2, p3, p4, ncol=1)

ggsave("0505.jpg", g1, width = 8, height = 12)
```

```{r Geom_smooth}
p5 <- ggplot(aes(x = 7 * round(tenure / 7), y = friendships_initiated/tenure), data = subset(pf, tenure >= 1)) + geom_line(aes(color = year_joined.bucket), stat = "summary", fun.y = mean) + scale_colour_brewer(type="seq", palette='Spectral')

p6 <- ggplot(aes(x=tenure, y = friendships_initiated / tenure), data = subset(pf, tenure >= 1)) + geom_smooth(aes(color = year_joined.bucket)) + scale_colour_brewer(type="seq", palette='Spectral')

g2 <- grid.arrange(p5, p6, ncol=1)

ggsave("0503.jpg", g2, width = 12, height = 8)
```
***

### Yo Dataset

```{r Yo Dataset 1}
yo <- read.csv('yogurt.csv')
str(yo)
```

```{r Yo Dataset 2}
yo$id <- factor(yo$id)
str(yo)
```

### Histograms Revisited

```{r Histograms Revisited 1}
qplot(data = yo, x = price, fill = I('#F79420'), binwidth = 10)
```

```{r Histograms Revisited 2}
ggplot(aes(x = price), data=yo) + geom_histogram(binwidth = 3, fill = "forestgreen")
```

***

### Number of Purchases

```{r Number of Purchases 1}
summary(yo)
length(unique(yo$price))
table(yo$price)
```


```{r Number of Purchases 2}
yo$all.purchases <- yo$strawberry + yo$blueberry + yo$pina.colada + yo$plain + yo$mixed.berry
summary(yo$all.purchases)
```

```{r Number of Purchases 3}
yo <- transform(yo, all.purchases = strawberry + blueberry + pina.colada + plain + mixed.berry)
```

```{r Number of Purchases 4}
qplot(x = all.purchases, data = yo, fill=I('blue'), binwidth = 1)
```
***

### Prices over Time

```{r Prices over Time}
ggplot(yo, aes( x = time, y = price)) + geom_point(position = position_jitter(h=0), shape=9, alpha=0.5, color = 'darkblue') + theme_foundation()

ggsave("0504.jpg", width = 8, height = 12)
```

***

### Looking at Samples of Households

```{r Looking at Sample of Households 1}
set.seed(4230)
sample.ids <- sample(levels(yo$id), 16)

ggplot(aes(x = time, y = price), data = subset(yo, id %in% sample.ids)) +
  facet_wrap(~id) +
  geom_line() +
  geom_point(aes(size = all.purchases), pch = 1)
```

```{r Looking at Sample of Households 2}
set.seed(2000)
sample.ids <- sample(levels(yo$id), 16)

ggplot(aes(x = time, y = price), data = subset(yo, id %in% sample.ids)) + facet_wrap(~id) + geom_line(color = 'darkgreen') + geom_point(aes(size = all.purchases), pch = 10, color = 'darkblue') + geom_smooth(method = 'gam')

ggsave("0501.jpg", width = 16, height = 8)
```

***

### Scatterplot Matrix

```{r  Scatterplot Matrix 1}
ggplot(diamonds, aes(x = cut, y = price/carat, color = color)) + geom_jitter() + facet_wrap(~clarity) + scale_color_brewer(type = 'qual', palette = "Spectral") + theme_gray()

ggsave("0502.jpg", width = 8, height = 12)
```

```{r Libraries 6}
# install.packages('GGally')
library(GGally)
```

```{r  Scatterplot Matrix 2}
set.seed(2000)

pf_subset <- pf[, c(2:15)]
names(pf_subset)

```

```{r  Scatterplot Matrix 3}

ggpairs(pf_subset[sample.int(nrow(pf_subset), 1000), ])

```
***

### Heat Maps

```{r Dataset nci 1}
nci <- read.table("nci.tsv")
colnames(nci) <- c(1:64)
```

```{r Dataset nci 2}
nci.long.samp <- melt(as.matrix(nci[1:200,]))
names(nci.long.samp) <- c("gene", "case", "value")
head(nci.long.samp)

ggplot(aes(y = gene, x = case, fill = value),
  data = nci.long.samp) +
  geom_tile() +
  scale_fill_gradientn(colours = colorRampPalette(c("blue", "red"))(100))

```

Комментариев нет:

Отправить комментарий