/* knn-cv-1.c - k nearest neighbors for regression.
 *
 * Reads training cases from "train.n", test inputs from "test.n" and test
 * targets from "targets.n". Writes point predictions to "cguess.S.n" (squared
 * error loss) and "cguess.A.n" (absolute error loss), and log densities of
 * the test targets under a predictive distribution to "clptarg.L.n". Here "n"
 * is the instance number supplied as a command argument. For each loss type,
 * "k" is selected by leave one out cross validation.
 *
 * Copyright (c) 1996 by Carl Edward Rasmussen. */

#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <values.h>
#include "util.h"

#define tolerance 1.0e-6    /* distances closer than this are treated as ties */
#define two_pi 6.28318530717959

/* Exchange "a" and "b" through a variable named "temp", which must be declared
 * with a compatible type in the scope where the macro is used. The expansion
 * is wrapped in do { } while (0) so that it is a single statement and stays
 * correct when used as the body of an unbraced if/else or loop. */
#define SWAP(a, b) do { temp = (a); (a) = (b); (b) = temp; } while (0)

/* Helper routines. "median" is defined in another translation unit; the
 * static functions are defined further down in this file. */
extern real median(real *a, int k);  /* find median of first k elements in a */
static int  comp(const void *x, const void *y);    /* function used by qsort */
static void sort_dist(int k, real *inp);      /* sort k examples by distance */
static void find_neighbors(int k);            /* find targets of k neighbors */
static void var_est_1nn(real *glob_var);       /* estimate variance from 1nn */
       
/* Number of input and target values per case; used as loop bounds over the
 * columns of the example sets. Both are passed by address to loadExamples(),
 * which presumably fills them in - confirm against util.h. */
int no_inp, no_tar;

/* Work matrices shared by main() and the helpers:
 *   targets[l][j]   target l of the j'th nearest neighbor (ties averaged),
 *   loo_est[k][t]   accumulated leave one out loss for k neighbors, where
 *                   t = 0 is squared error, 1 absolute error, 2 log density,
 *   dist_tar[i][0]  squared distance of case i from the query case, followed
 *                   by its targets in dist_tar[i][1..no_tar]. */
real **targets,    /* matrix containing the targets of the nearest neighbors */
     **loo_est,       /* estimates for 3 loss functions for every value of k */
     **dist_tar;              /* array of distances and targets of neighbors */

/* Training and test cases; this file uses the fields "num", "inp" and "tar". */
struct exampleset train, test;

/* Select "k" for each of the three loss types by leave one out cross
 * validation on the training cases, then write predictions for every test
 * case to the three prediction files. Expects exactly one command argument:
 * the instance number used as the suffix of all file names. Returns 0 on
 * success and exits with status -1 on bad usage or when setup fails. */
int main(int argc, char **argv)
{
  int  i, j, k, l, top, k0, k1, k2, len;
  char df2[20], df[20];                     /* strings containing file names */
  real *m, *v, *glob_var, tmp, *temp;          /* "temp" is needed by SWAP() */
  FILE *fp0, *fp1, *fp2;                     /* pointers to prediction files */

  if (argc != 2) {
    fprintf(stderr, "Usage: %s instance-number\n", argv[0]); exit(-1);
  }

/* "targets.<n>" is the longest name built here and both buffers have the same
 * size, so if it fits then the shorter "test.<n>" and "train.<n>" fit too. */

  len = snprintf(df2, sizeof df2, "targets.%s", argv[1]);
  if (len < 0 || (size_t) len >= sizeof df2) {
    fprintf(stderr, "%s: instance number too long: %s\n", argv[0], argv[1]);
    exit(-1);
  }

  train.num = test.num = no_inp = no_tar = -1;      /* default for "unknown" */
  snprintf(df, sizeof df, "test.%s", argv[1]);          /* name of test file */
  loadExamples(&test, &no_inp, &no_tar, df, df2);
  snprintf(df, sizeof df, "train.%s", argv[1]);     /* name of training file */
  loadExamples(&train, &no_inp, &no_tar, df, NULL);
  if (train.num < 2) {          /* leave one out needs a neighbor to look at */
    fprintf(stderr, "%s: need at least two training cases\n", argv[0]);
    exit(-1);
  }
  top = train.num-1;

  dist_tar = createMatrix(train.num+2,1+no_tar);
  dist_tar++; dist_tar[-1][0] = -MAXFLOAT;  /* place a sentinel before array */
  dist_tar[top][0] = MAXFLOAT;  /* and two after; useful in find_neighbors() */
  dist_tar[train.num][0] = MAXFLOAT;  /* avoiding check for array boundaries */
                                                    /* when looking for ties */
  loo_est = createMatrix(train.num, 3);
  for (k=0; k<3; k++) for (i=0; i<train.num; i++) loo_est[i][k] = 0.0;

/* "targets" is indexed as targets[target][neighbor] and its rows are handed to
 * median() as arrays of k values, so it needs one row per target and one
 * column per training case. */

  targets = createMatrix(no_tar, train.num);
  m = (real *) malloc((size_t) no_tar*sizeof(real));
  v = (real *) malloc((size_t) no_tar*sizeof(real));
  glob_var = (real *) malloc((size_t) no_tar*sizeof(real));
  if (m == NULL || v == NULL || glob_var == NULL) {
    fprintf(stderr, "%s: out of memory\n", argv[0]); exit(-1);
  }

/* Do "leave one out" by swapping cases with the last example, and doing knn on
 * the first "top" cases, for each value of k. Accumulate the "leave one out"
 * estimates for each loss type in the columns of "loo_est". */

  var_est_1nn(glob_var);                   /* estimate variance based on 1nn */
  for (i=top; i>=0; i--) {                            /* leave out example i */
    SWAP(train.inp[i], train.inp[top]); SWAP(train.tar[i], train.tar[top]);
    sort_dist(top, train.inp[top]);
    for (k=1; k<train.num; k++) {      /* now do "leave one out" for every k */
      find_neighbors(k);
      for (l=0; l<no_tar; l++) {   /* find mean and variance for each target */
        for (m[l]=0.0, v[l]=glob_var[l], j=0; j<k; j++)
          { m[l] += targets[l][j]; v[l] += sq(targets[l][j]); }
        /* after this line m[l] holds the squared error of the mean */
        m[l] /= k; v[l] = v[l]/k-sq(m[l]); m[l] = sq(m[l]-train.tar[top][l]);
      }
      for (tmp=0.0, l=0; l<no_tar; l++) tmp += m[l];        /* squared error */
      loo_est[k][0] += tmp;
      for (tmp=0.0, l=0; l<no_tar; l++)                    /* absolute error */
        tmp += fabs(median(targets[l], k)-train.tar[top][l]);
      loo_est[k][1] += tmp;
      for (tmp=0.0, l=0; l<no_tar; l++) tmp += log(two_pi*v[l])+m[l]/v[l];
      loo_est[k][2] += tmp;                  /* -2 times the log density */
    }
  }
  for (k0=k1=k2=i=1; i<train.num; i++) {       /* find k's with minimum loss */
    if (loo_est[i][0] < loo_est[k0][0]) k0 = i;
    if (loo_est[i][1] < loo_est[k1][1]) k1 = i;
    if (loo_est[i][2] < loo_est[k2][2]) k2 = i;
  }
  fprintf(stderr, "Loss types and loo k-values: S: %d, A: %d and L: %d\n",
                   k0, k1, k2);

/* Use the estimated k values to make predictions, and write them to the
 * appropriate files. */

  fp0 = openPredFile("cguess.S.%s", argv[1]);
  fp1 = openPredFile("cguess.A.%s", argv[1]);
  fp2 = openPredFile("clptarg.L.%s", argv[1]);

  for (i=0; i<test.num; i++) {        /* make predictions for all test cases */
    sort_dist(train.num, test.inp[i]);
    find_neighbors(k0);                      /* first for squared error loss */
    for (l=0; l<no_tar; l++) {
      for (tmp=0.0, j=0; j<k0; j++) tmp += targets[l][j];
      fprintf(fp0, "%f ", tmp/k0);
    }
    fprintf(fp0, "\n");
    find_neighbors(k1);                          /* then absolute error loss */
    for (l=0; l<no_tar; l++)
      fprintf(fp1, "%f ", median(targets[l], k1));
    fprintf(fp1, "\n");
    find_neighbors(k2);              /* and lastly negative log density loss */
    for (tmp=0.0, l=0; l<no_tar; l++) {
      for (m[l]=0.0, v[l]=glob_var[l], j=0; j<k2; j++)
        { m[l] += targets[l][j]; v[l] += sq(targets[l][j]); }
      m[l] /= k2; v[l] = v[l]/k2-sq(m[l]); m[l] = sq(m[l]-test.tar[i][l]);
      tmp += log(two_pi*v[l])+m[l]/v[l];
    }
    fprintf(fp2, "%f\n", -0.5*tmp);
  }

  fclose(fp0); fclose(fp1); fclose(fp2);
  /* dist_tar was advanced by one row above; step back before freeing */
  free(dist_tar[-1]); free(--dist_tar); free(loo_est[0]); free(loo_est);
  free(targets[0]); free(targets); free(m); free(v); free(glob_var);
  return 0;
}

/* Fill in the first "k" rows of the dist_tar array and sort them by their
 * first column. The first column receives the squared Euclidean distance in
 * input space between the query case "inp" and each of the first "k" training
 * cases; the remaining "no_tar" columns receive the targets of those cases.
 * Only the row pointers are rearranged by the sort. */

static void sort_dist(int k, real *inp)
{
  int row, col;

  for (row = 0; row < k; row++) {
    real *entry = dist_tar[row];
    real  acc = 0.0;

    for (col = 0; col < no_inp; col++)       /* squared distance to the query */
      acc += sq(train.inp[row][col]-inp[col]);
    entry[0] = acc;

    for (col = 0; col < no_tar; col++)        /* targets follow the distance */
      entry[col+1] = train.tar[row][col];
  }

  qsort(dist_tar, k, sizeof(real *), comp);
}

/* Find neighbors and write their targets into the "targets" array, indexed as
 * targets[target][neighbor]. Mostly this can be done by copying from the
 * sorted "dist_tar" array, but we need to take care when there are ties at the
 * k'th neighbor; in this case the tied cases among the first k all get the
 * average of the targets of every tied case, including tied cases beyond the
 * k'th. The scans for ties have no bounds checks: they rely on the sentinel
 * distances stored before and after the sorted rows of "dist_tar". */

static void find_neighbors(int k)
{
  int  i, j, first, last;
  int  k1 = 2, k2 = 0;    /* scan offsets below and above the k'th neighbor */
  real tmp;

  for (j=0; j<no_tar; j++) for (i=0; i<k; i++)               /* copy targets */
    targets[j][i] = dist_tar[i][j+1];

  /* On exit row k-k1 is the nearest row below that is NOT tied with row k-1,
   * and row k+k2 is the nearest row above that is NOT tied with it. */
  while (dist_tar[k-1][0]-dist_tar[k-k1][0] < tolerance) k1++;
  while (dist_tar[k+k2][0]-dist_tar[k-1][0] < tolerance) k2++;

  if (k1+k2 > 2) {                  /* if there were any ties, then fix them */
    first = k-k1+1; last = k+k2;           /* tied rows are [first, last) */
    for (j=0; j<no_tar; j++) {
      for (tmp=0.0, i=first; i<last; i++) tmp += dist_tar[i][j+1];
      tmp /= last-first;           /* k1+k2-1 tied cases went into the sum */
      for (i=first; i<k; i++) targets[j][i] = tmp;
    }
  }
}

/* Estimate variance based on 1nn; this number is needed for evaluation of "log
 * probability" losses. For every target, each training case is left out in
 * turn and compared with the target of its nearest remaining neighbor; the
 * mean squared difference is stored in "var", one entry per target. The
 * training cases are left in a permuted order. */

static void var_est_1nn(real *var)
{
  int   t, held;
  int   last = train.num-1;
  real *hold;

  for (t = 0; t < no_tar; t++) {
    var[t] = 0.0;
    held = last;
    while (held >= 0) {                            /* leave out example held */
      /* move the left-out case to the end, out of reach of sort_dist() */
      hold = train.inp[held];
      train.inp[held] = train.inp[last];
      train.inp[last] = hold;
      hold = train.tar[held];
      train.tar[held] = train.tar[last];
      train.tar[last] = hold;

      sort_dist(last, train.inp[last]);
      find_neighbors(1);
      var[t] += sq(targets[t][0]-train.tar[last][t]);
      held--;
    }
    var[t] /= train.num;
  }
}

/* Comparison function used by qsort on the rows of "dist_tar". "x" and "y"
 * point at row pointers; rows are ordered by their first element, which holds
 * the distance. Returns a negative, zero or positive value as required by
 * qsort, so that equal keys compare equal whichever way round they are
 * passed. */
static int comp(const void *x, const void *y)
{
  const real a = **(const real *const *)x;
  const real b = **(const real *const *)y;

  return (a > b) - (a < b);
}
