---
title: "Precept 5 Solutions"
output:
  html_document:
    df_print: paged
editor_options: 
  chunk_output_type: console
---

### Small training sets and overfitting

```{r echo = F}
# Plotting and data-manipulation packages used by the chunks below.
library(ggplot2)
library(dplyr)

# Passenger data; the relative path is resolved against the working directory.
titanic <- read.csv(file = "titanic.csv")
```

We'll create one training set of size 100 and a validation set of size 500. We'll only use subsets of the training set throughout.

```{r}
# The split below needs 100 training rows plus 500 validation rows; with fewer
# rows the index vectors would silently contain NA.
stopifnot(nrow(titanic) >= 600)

# Fixed seed so the same split is produced on every run.
set.seed(0)

# Shuffle all row indices once, then take disjoint slices of the permutation
# so that no row is in both the training and the validation set.
idx <- sample(seq_len(nrow(titanic)))
train.idx <- idx[1:100]
valid.idx <- idx[101:600]
```


We will now compute the training and validation performance for each training-set size:

```{r warning = F}
# Fit a logistic regression on the training set and report classification
# accuracy on both the training set and the validation set.
#
# Args:
#   titanic.train: data frame used to fit the model; the columns Survived,
#     Age, Sex and Pclass are read.
#   titanic.valid: data frame with the same columns, used only for evaluation.
#
# Returns:
#   Unnamed numeric vector of length 2: c(training accuracy, validation
#   accuracy). A row counts as correct when "fitted probability > 0.5" equals
#   its Survived value (assumes Survived is coded 0/1 -- confirm against the
#   data file).
GetTrainValidPerformance <- function(titanic.train, titanic.valid){
  fit <- glm(Survived ~ Age + Sex + Pclass, family = binomial, data = titanic.train)

  # Accuracy of the thresholded predictions on one data set. predict() yields
  # NA for rows with a missing predictor; na.rm = TRUE scores only the rows
  # that could be predicted instead of letting one NA make the result NA.
  accuracy <- function(data) {
    pred <- predict(fit, newdata = data, type = "response") > 0.5
    mean(pred == data$Survived, na.rm = TRUE)
  }

  c(accuracy(titanic.train), accuracy(titanic.valid))
}

# Measure performance when only the first `train.size` training rows are used.
#
# Args:
#   train.size: number of leading entries of `train.idx` to train on.
#   titanic: full data frame that both index vectors refer to.
#   train.idx: row indices available for training.
#   valid.idx: row indices of the validation set (always used in full).
#
# Returns:
#   Whatever GetTrainValidPerformance() returns for the two subsets.
GetTrainValidPerformanceTrSize <- function(train.size, titanic, train.idx, valid.idx){
  train.rows <- train.idx[1:train.size]
  GetTrainValidPerformance(titanic[train.rows, ], titanic[valid.idx, ])
}

# Training-set sizes to evaluate; each one selects that many leading entries
# of train.idx.
sizes <- c(3, 6, 9, 15, 20, 25, 30, 40, 50, 70, 100)

# One column per entry of `sizes`. vapply() (rather than sapply()) guarantees
# a numeric matrix with exactly two rows, which the later perf[1, ] and
# perf[2, ] indexing relies on, and fails loudly if a call returns anything
# else.
perf <- vapply(
  sizes,
  FUN = GetTrainValidPerformanceTrSize,
  FUN.VALUE = numeric(2),
  titanic = titanic,
  train.idx = train.idx,
  valid.idx = valid.idx
)
```

First, let's simply add two layers to display the two curves (this is the non-challenge version):

```{r}
# Wide format: one row per training-set size, one column per accuracy curve.
perf.data <- data.frame(size = sizes, perf.train = perf[1, ], perf.valid = perf[2, ])

# x is mapped to the `size` column of perf.data (not the global `sizes`
# vector), so the plot depends only on the data frame it is given.
# Red = training accuracy, blue = validation accuracy. The y label is set
# explicitly because the default would be "perf.train" for both curves.
ggplot(data = perf.data, mapping = aes(x = size)) +
  geom_smooth(mapping = aes(y = perf.train), method = "loess", color = "red") +
  geom_smooth(mapping = aes(y = perf.valid), method = "loess", color = "blue") +
  labs(y = "performance")
```

Now, let's do things the tidy data way:

```{r}
library(reshape2)

# Long ("tidy") format: one row per (size, set) pair, with the accuracy in
# `performance`. The output column names are set directly in melt().
perf.data <- melt(
  data.frame(size = sizes, perf.train = perf[1, ], perf.valid = perf[2, ]),
  id.vars = "size",
  variable.name = "set",
  value.name = "performance"
)

# Replace the original column names with short labels, stored as plain
# character strings rather than a factor.
perf.data$set <- unname(
  c(perf.train = "train", perf.valid = "valid")[as.character(perf.data$set)]
)

# One smoothed curve per value of `set`; ggplot assigns the colours and
# builds the legend from that column.
ggplot(perf.data, aes(size, performance, color = set)) +
  geom_smooth(method = "loess")

```