SML201 Precept 6 Solutions, Spring 2020

Problems 1, 2

Learning goals

Practice using R markdown
Understand that overfitting is good performance on the training set at the expense of good performance on new data
Understand that overfitting occurs more easily when the data set size is smaller
Programmatically split data frames
Practice simulating sampling datasets
Practice pivot_longer to produce long form data frames, use that in combination with ggplot

We’ll create one training set of size 100, and a validation set of size 500. We’ll only use subsets of the training set throughout

set.seed(0)
idx <- sample(1:nrow(titanic))
train.idx <- idx[1:100]
valid.idx <- idx[101:600]

Some functions for computing the performance:

GetTrainValidPerformance <- function(titanic.train, titanic.valid){
  fit <- glm(Survived ~ Age + Sex + Pclass, family=binomial, data = titanic.train)
  pred.train <- predict(fit, newdata = titanic.train, type = "response") > 0.5
  pred.valid <- predict(fit, newdata = titanic.valid, type = "response") > 0.5
  c(mean(pred.train == titanic.train$Survived), mean(pred.valid == titanic.valid$Survived))
}

GetTrainValidPerformanceTrSize <- function(train.size, titanic, train.idx, valid.idx){
  titanic.valid <- titanic[valid.idx, ]
  titanic.train <- titanic[train.idx[1:train.size], ]
  GetTrainValidPerformance(titanic.train, titanic.valid)
}

sizes <- c(3, 6, 9, 15, 20, 25, 30, 40, 50, 70, 100)
perf <- sapply(sizes, FUN = GetTrainValidPerformanceTrSize, titanic, train.idx, valid.idx)

First, let’s simply add two layers to display the two curves (that’s the non-challenge version)

perf.data <- data.frame(size = sizes, perf.train = perf[1, ], perf.valid = perf[2, ])
ggplot(data = perf.data, mapping = aes(x = size)) + 
  geom_line(mapping = aes(y = perf.train), color = "red") + 
  geom_line(mapping = aes(y = perf.valid), color = "blue")

Let’s now add legends, using this technique:

colors <- c("Train" = "red", "Valid" = "blue")
ggplot(data = perf.data, mapping = aes(x = size)) + 
  geom_line(mapping = aes(y = perf.train, color = "Train")) + 
  geom_line(mapping = aes(y = perf.valid, color = "Valid"))  + 
  labs(x = "Train set size", y = "peformance", color = "Legend") +
  scale_color_manual(values = colors)

Now, let’s do things the tidy data way:

perf.data <- data.frame(size = sizes, perf.train = perf[1, ], perf.valid = perf[2, ])
perf.data <- perf.data %>% pivot_longer(c(perf.train, perf.valid), names_to = "set", values_to = "performance")
perf.data$set <- as.character(perf.data$set)
perf.data$set[perf.data$set == "perf.train"] <- "train"
perf.data$set[perf.data$set == "perf.valid"] <- "valid"

ggplot(data = perf.data, mapping = aes(x = size, y = performance, color = set)) +
   geom_line()

Problem 3

Learning goals

Use replicate to repeat simulations
Understand that performance can vary due to sampling
Practice producing multiple histograms from multiple columns (probably using pivot_longer)

set.seed(0)
idx <- sample(1:nrow(titanic))
all.train.idx <- idx[1:400]
valid.idx <- idx[401:800]

GetTrainValidPerformanceTrSize.shuffle <- function(train.size, all.train.idx, valid.idx){
  train.idx <- sample(all.train.idx)[1:train.size]
  titanic.valid <- titanic[valid.idx, ]
  titanic.train <- titanic[train.idx, ]
  GetTrainValidPerformance(titanic.train, titanic.valid)
}

train.size <-  15
perfs <- replicate(n = 500, GetTrainValidPerformanceTrSize.shuffle(train.size, all.train.idx, valid.idx))
perfs.data <- data.frame(perf.train = perfs[1, ], perf.valid = perfs[2, ])
perfs.data.long <- perfs.data %>% pivot_longer(c(perf.train, perf.valid), names_to = "set", values_to = "performance")
ggplot(perfs.data.long) + 
  geom_histogram(mapping = aes(x = performance, fill = set), alpha = 0.5, position = "dodge")

train.size <-  25
perfs <- replicate(n = 500, GetTrainValidPerformanceTrSize.shuffle(train.size, all.train.idx, valid.idx))
perfs.data <- data.frame(perf.train = perfs[1, ], perf.valid = perfs[2, ])
perfs.data.long <- perfs.data %>% pivot_longer(c(perf.train, perf.valid), names_to = "set", values_to = "performance")
ggplot(perfs.data.long) + 
  geom_histogram(mapping = aes(x = performance, fill = set), alpha = 0.5, position = "dodge")

SML201 Precept 6 Solutions, Spring 2020

Small training sets and overfitting

Problems 1, 2

Learning goals

Problem 3

Learning goals