First, let’s generate some data
set.seed(0)
N <- 1000
x1 <- runif(n = N, min = -5, max = 5)
x2 <- runif(n = N, min = -5, max = 5)
get.y <- function(i, x1, x2){
y.i <- x1[i]**2 + x2[i]**2 < 1.5**2
if(rbinom(n = 1, size = 1, prob = 0.15) == 1){
y.i <- !y.i
}
y.i
}
y <- sapply(1:N, FUN = get.y, x1, x2)
plot(x1[y], x2[y], col = "blue")
points(x1[!y], x2[!y], col = "orange")
Now, we’ll fit a logistic regression to the data, using quadratic terms as well:
x1.2 <- x1**2
x2.2 <- x2**2
fit <- glm(y ~ x1 + x2 + x1.2 + x2.2, family = "binomial")
coefs <- fit$coefficients
pred.prob <- function(x1, x2, coefs){
v <- c(1, x1, x2, x1**2, x2**2)
plogis(sum(v * coefs))
}
Now, let’s compute the probabilities on a grid:
pred.prob.i <- function(i, grid, coefs){
x1 <- grid[i, 1]
x2 <- grid[i, 2]
pred.prob(x1, x2, coefs)
}
my.seq <- seq(-5, 4.9, 0.1)
grid <- expand.grid(list(x1 = my.seq, x2 = my.seq))
grid[, "prob"] <- sapply(1:dim(grid)[1], FUN = pred.prob.i, grid, coefs)
levelplot(prob ~ x1 * x2, data = grid)
Now let’s try the same thing, but with dataset that’s approximately linearly separable:
set.seed(0)
N <- 1000
x1 <- runif(n = N, min = -5, max = 5)
x2 <- runif(n = N, min = -5, max = 5)
get.y.lin <- function(i, x1, x2){
y.i <- (.2*x1[i] - 0.5*x2[i]) > 0
if(rbinom(n = 1, size = 1, prob = 0.15) == 1){
y.i <- !y.i
}
y.i
}
y <- as.vector(sapply(1:N, FUN = get.y.lin, x1, x2))
plot(x1[y], x2[y], col = "blue")
points(x1[!y], x2[!y], col = "orange")
fit <- glm(y ~ x1 + x2, family = "binomial")
coefs <- fit$coefficients
Let’s now plot the probabilities:
pred.prob <- function(x1, x2, coefs){
v <- c(1, x1, x2)
plogis(sum(v * coefs))
}
pred.prob.i <- function(i, grid, coefs){
x1 <- grid[i, 1]
x2 <- grid[i, 2]
pred.prob(x1, x2, coefs)
}
my.seq <- seq(-5, 4.9, 0.1)
grid <- expand.grid(list(x1 = my.seq, x2 = my.seq))
grid[, "prob"] <- sapply(1:dim(grid)[1], FUN = pred.prob.i, grid, coefs)
levelplot(prob ~ x1 * x2, data = grid)