Small training sets and overfitting

titanic <- read.csv("http://guerzhoy.princeton.edu/201s20/titanic.csv")
library(tidyverse)

Problems 1, 2

Learning goals

  • Practice using R markdown
  • Understand that overfitting is good performance on the training set at the expense of good performance on new data
  • Understand that overfitting occurs more easily when the data set size is smaller
  • Programmatically split data frames
  • Practice simulating sampling datasets
  • Practice pivot_longer to produce long form data frames, use that in combination with ggplot

We’ll create one training set of size 100, and a validation set of size 500. We’ll only use subsets of the training set throughout

set.seed(0)
idx <- sample(1:nrow(titanic))
train.idx <- idx[1:100]
valid.idx <- idx[101:600]

Some functions for computing the performance:

GetTrainValidPerformance <- function(titanic.train, titanic.valid){
  fit <- glm(Survived ~ Age + Sex + Pclass, family=binomial, data = titanic.train)
  pred.train <- predict(fit, newdata = titanic.train, type = "response") > 0.5
  pred.valid <- predict(fit, newdata = titanic.valid, type = "response") > 0.5
  c(mean(pred.train == titanic.train$Survived), mean(pred.valid == titanic.valid$Survived))
}

GetTrainValidPerformanceTrSize <- function(train.size, titanic, train.idx, valid.idx){
  titanic.valid <- titanic[valid.idx, ]
  titanic.train <- titanic[train.idx[1:train.size], ]
  GetTrainValidPerformance(titanic.train, titanic.valid)
}

sizes <- c(3, 6, 9, 15, 20, 25, 30, 40, 50, 70, 100)
perf <- sapply(sizes, FUN = GetTrainValidPerformanceTrSize, titanic, train.idx, valid.idx)

First, let’s simply add two layers to display the two curves (that’s the non-challenge version)

perf.data <- data.frame(size = sizes, perf.train = perf[1, ], perf.valid = perf[2, ])
ggplot(data = perf.data, mapping = aes(x = size)) + 
  geom_line(mapping = aes(y = perf.train), color = "red") + 
  geom_line(mapping = aes(y = perf.valid), color = "blue") 

Let’s now add legends, using this technique:

colors <- c("Train" = "red", "Valid" = "blue")
ggplot(data = perf.data, mapping = aes(x = size)) + 
  geom_line(mapping = aes(y = perf.train, color = "Train")) + 
  geom_line(mapping = aes(y = perf.valid, color = "Valid"))  + 
  labs(x = "Train set size", y = "peformance", color = "Legend") +
  scale_color_manual(values = colors)

Now, let’s do things the tidy data way:

perf.data <- data.frame(size = sizes, perf.train = perf[1, ], perf.valid = perf[2, ])
perf.data <- perf.data %>% pivot_longer(c(perf.train, perf.valid), names_to = "set", values_to = "performance")
perf.data$set <- as.character(perf.data$set)
perf.data$set[perf.data$set == "perf.train"] <- "train"
perf.data$set[perf.data$set == "perf.valid"] <- "valid"

ggplot(data = perf.data, mapping = aes(x = size, y = performance, color = set)) +
   geom_line()  

Problem 3

Learning goals

  • Use replicate to repeat simulations
  • Understand that performance can vary due to sampling
  • Practice producing multiple histograms from multiple columns (probably using pivot_longer)
set.seed(0)
idx <- sample(1:nrow(titanic))
all.train.idx <- idx[1:400]
valid.idx <- idx[401:800]
GetTrainValidPerformanceTrSize.shuffle <- function(train.size, all.train.idx, valid.idx){
  train.idx <- sample(all.train.idx)[1:train.size]
  titanic.valid <- titanic[valid.idx, ]
  titanic.train <- titanic[train.idx, ]
  GetTrainValidPerformance(titanic.train, titanic.valid)
}
train.size <-  15
perfs <- replicate(n = 500, GetTrainValidPerformanceTrSize.shuffle(train.size, all.train.idx, valid.idx))
perfs.data <- data.frame(perf.train = perfs[1, ], perf.valid = perfs[2, ])
perfs.data.long <- perfs.data %>% pivot_longer(c(perf.train, perf.valid), names_to = "set", values_to = "performance")
ggplot(perfs.data.long) + 
  geom_histogram(mapping = aes(x = performance, fill = set), alpha = 0.5, position = "dodge")

train.size <-  25
perfs <- replicate(n = 500, GetTrainValidPerformanceTrSize.shuffle(train.size, all.train.idx, valid.idx))
perfs.data <- data.frame(perf.train = perfs[1, ], perf.valid = perfs[2, ])
perfs.data.long <- perfs.data %>% pivot_longer(c(perf.train, perf.valid), names_to = "set", values_to = "performance")
ggplot(perfs.data.long) + 
  geom_histogram(mapping = aes(x = performance, fill = set), alpha = 0.5, position = "dodge")