Logistic regression in 2D

First, let’s generate some data

set.seed(0)
N <- 1000
x1 <- runif(n = N, min = -5, max = 5) 
x2 <- runif(n = N, min = -5, max = 5) 

get.y <- function(i, x1, x2){
  y.i <- x1[i]**2 + x2[i]**2 < 1.5**2
  if(rbinom(n = 1, size = 1, prob = 0.15) == 1){
    y.i <- !y.i
  }
  y.i
}

y <- sapply(1:N, FUN = get.y, x1, x2)


plot(x1[y], x2[y], col = "blue")
points(x1[!y], x2[!y], col = "orange")

Now, we’ll fit a logistic regression to the data, using quadratic terms as well:

x1.2 <- x1**2
x2.2 <- x2**2

fit <- glm(y ~ x1 + x2 + x1.2 + x2.2, family = "binomial")
coefs <- fit$coefficients
pred.prob <- function(x1, x2, coefs){
  v <- c(1, x1, x2, x1**2, x2**2)
  plogis(sum(v * coefs))
}

Now, let’s compute the probabilities on a grid:

pred.prob.i <- function(i, grid, coefs){
  x1 <- grid[i, 1]
  x2 <- grid[i, 2]
  pred.prob(x1, x2, coefs)
}

my.seq <- seq(-5, 4.9, 0.1)

grid <- expand.grid(list(x1 = my.seq, x2 = my.seq))
grid[, "prob"] <- sapply(1:dim(grid)[1], FUN = pred.prob.i, grid, coefs)  

levelplot(prob ~ x1 * x2, data = grid)

Now let’s try the same thing, but with dataset that’s approximately linearly separable:

set.seed(0)
N <- 1000
x1 <- runif(n = N, min = -5, max = 5) 
x2 <- runif(n = N, min = -5, max = 5) 

get.y.lin <- function(i, x1, x2){
  y.i <- (.2*x1[i] - 0.5*x2[i]) > 0
  if(rbinom(n = 1, size = 1, prob = 0.15) == 1){
    y.i <- !y.i
  }
  y.i
}

y <- as.vector(sapply(1:N, FUN = get.y.lin, x1, x2))


plot(x1[y], x2[y], col = "blue")
points(x1[!y], x2[!y], col = "orange")

fit <- glm(y ~ x1 + x2, family = "binomial")
coefs <- fit$coefficients

Let’s now plot the probabilities:

pred.prob <- function(x1, x2, coefs){
  v <- c(1, x1, x2)
  plogis(sum(v * coefs))
}
pred.prob.i <- function(i, grid, coefs){
  x1 <- grid[i, 1]
  x2 <- grid[i, 2]
  pred.prob(x1, x2, coefs)
}

my.seq <- seq(-5, 4.9, 0.1)

grid <- expand.grid(list(x1 = my.seq, x2 = my.seq))
grid[, "prob"] <- sapply(1:dim(grid)[1], FUN = pred.prob.i, grid, coefs)  

levelplot(prob ~ x1 * x2, data = grid)