% No 'submit' option for the problems by themselves.
\documentclass{harvardml}
\usepackage[margin=0.5in]{geometry}
% Use the 'submit' option when you submit your solutions.
% \documentclass[submit]{harvardml}

% Put in your full name and email address.
\name{Your Name}
\email{email@fas.harvard.edu}

% List any people you worked with.
\collaborators{%
  John Doe,
  Fred Doe
}

% You don't need to change these.
\course{CS281-F15}
\assignment{CSC412/2056 Assignment \#1}
\duedate{Thursday February 9, 11:59pm}

\usepackage{url, enumitem}
\usepackage{amsfonts, amsmath}
\usepackage{listings}
\usepackage[colorlinks=true,linkcolor=blue]{hyperref}

% Some useful macros.
\newcommand{\given}{\,|\,}
\newcommand{\R}{\mathbb{R}}
\newcommand{\E}{\mathbb{E}}
\newcommand{\var}{\text{var}}
\newcommand{\cov}{\text{cov}}
\newcommand{\trans}{\mathsf{T}}
\newcommand{\bx}{\mathbf{x}}
\newcommand{\by}{\mathbf{y}}
\newcommand{\bw}{\mathbf{w}}
\newcommand{\distNorm}{\mathcal{N}}
\newcommand{\bzero}{\mathbf{0}}
\newcommand{\ident}{\mathbb{I}}
\newcommand{\N}{\mathcal{N}}
\newcommand{\ep}{\varepsilon}
\newcommand{\Dir}{\text{Dirichlet}}

\theoremstyle{plain}
\newtheorem{lemma}{Lemma}


\begin{document}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{problem}[Variance and covariance, 6 points]
Let $X$ and~$Y$ be two continuous independent random variables.

\begin{enumerate}[label=(\alph*)]
\item Starting from the definition of independence, show that the independence of~$X$ and~$Y$ implies that their covariance is zero.

\item For a scalar constant~$a$, show the following two properties, starting from the definition of expectation:
\begin{align*}
  \E(X + aY) &= \E(X) + a\E(Y)\\
  \var(X + aY) &= \var(X) + a^2\var(Y)
\end{align*}
\end{enumerate}
\end{problem}

% You can write your solution here.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{problem}[Densities, 5 points]

Answer the following questions:

\begin{enumerate}[label=(\alph*)]
  \item Can a probability density function (pdf) ever take values greater than 1?
  \item Let $X$ be a univariate normally distributed random variable with mean 0 and variance $1/100$. What is the pdf of $X$?
  \item What is the value of this pdf at 0?
  \item What is the probability that $X = 0$?
  % \item Explain the discrepancy.
\end{enumerate}
\end{problem}

% You can write your solution here.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{problem}[Calculus, 4 points]
Let $x, y \in \R^m$ and $A \in \R^{m \times m}$.
Please answer the following questions, writing your answers in vector notation.
\begin{enumerate}[label=(\alph*)]
\item What is the gradient with respect to $x$ of $x^T y$?
\item What is the gradient with respect to $x$ of $x^T x$?
\item What is the gradient with respect to $x$ of $x^T A$?
\item What is the gradient with respect to $x$ of $x^T A x$?
\end{enumerate}
\end{problem}
% You can write your solution here.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{problem}[Linear Regression, 10pts]
Suppose that $X \in \R^{n \times m}$ with $n \geq m$ and $Y \in \R^n$, and that $Y \sim \N(X\beta, \sigma^2 I)$.
In this question you will derive the result that the maximum likelihood estimate $\hat\beta$ of $\beta$ is given by
\[
\hat\beta = (X^TX)^{-1}X^TY
\]

\begin{enumerate}[label=(\alph*)]
% \item Why do we need to assume that $n \geq m$?
\item What are the expectation and covariance matrix of $\hat\beta$, for a given true value of $\beta$?
\item Show that maximizing the likelihood is equivalent to minimizing the squared error \linebreak $\sum_{i=1}^n (y_i - x_i\beta)^2$. [Hint: Use $\sum_{i=1}^n a_i^2 = a^Ta $]
\item Write the squared error in vector notation, (see above hint), expand the expression, and collect like terms. [Hint: Use $\beta^Tx^Ty = y^Tx\beta$ (why?) and $x^Tx$ is symmetric ]
\item Take the derivative of this expanded expression with respect to $\beta$ to show the maximum likelihood estimate $\hat\beta$ as above. [Hint: Use results 3.c and 3.d for derivatives in vector notation.]
\end{enumerate}
\end{problem}

% You can write your solution here.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{problem}[Ridge Regression, 10pts]
\begin{enumerate}[label=(\alph*)]

    Suppose we place a normal prior on $\beta$. That is, we assume that $\beta \sim \N(0, \tau^2 I)$.

    \item   Show that the MAP estimate of $\beta$ given $Y$ in this context is
\[
\hat \beta_{MAP} = (X^TX + \lambda I)^{-1}X^T Y
\]
where $\lambda = \sigma^2 / \tau^2$.

Estimating $\beta$ in this way is called {\em ridge regression} because the matrix $\lambda I$ looks like a ``ridge". Ridge regression is a common form of {\em regularization} that is used to avoid the overfitting that happens when the sample size is close to the output dimension in linear regression.

% \item Do we need $n \geq m$ to do ridge regression? Why or why not?
\item Show that ridge regression is equivalent to adding $m$ additional rows to $X$ where the $j$-th additional row has its $j$-th entry equal to $\sqrt{\lambda}$ and all other entries equal to zero, adding $m$ corresponding additional entries to $Y$ that are all 0, and and then computing the maximum likelihood estimate of $\beta$ using the modified $X$ and $Y$.
\end{enumerate}
\end{problem}


\begin{problem}[Gaussians in high dimensions, 10pts]

    In this question we will investigate how our intuition for samples from a Gaussian may break down in higher dimensions. Consider samples from a $D$-dimensional unit Gaussian
  $\bx \sim \distNorm(\bzero_D, \ident_D)$
where~$\bzero_D$ indicates a column vector of~$D$ zeros and~$\ident_D$
is a~${D\times D}$ identity matrix.

\begin{enumerate}
    \item Starting with the definition of Euclidean norm, quickly show that the distance of $\bx$ from the origin is $\sqrt{\bx^\trans\bx}$
    \item In low-dimensions our intuition tells us that samples from the unit Gaussian will be near the origin. Draw 10000 samples from a $D=1$ Gaussian and plot a normalized histogram for the distance of those samples from the origin. Does this confirm your intuition that the samples will be near the origin?
    \item Draw 10000 samples from $D=\{1,2,3,10,100\}$ Gaussians and, on a single plot, show the normalized histograms for the distance of those samples from the origin. As the dimensionality of the Gaussian increases, what can you say about the expected distance of the samples from the Gaussian's mean (in this case, origin).
    \item From Wikipedia, if $\bx_i$ are $k$ independent, normally distributed random variables
        with means $\mu_i$ and standard deviations $\sigma_i$ then the statistic $Y =
        \sqrt{\sum_{i=1}^k(\frac{x_i-\mu_i}{\sigma_i})^2}$ is distributed according to the
        \href{https://en.wikipedia.org/wiki/Chi_distribution}{$\chi $-distribution}. 
        On the previous normalized histogram, plot the probability density function (pdf) 
        of the $\chi$-distribution for $k=\{1,2,3,10,100\}$.

    \item Taking two samples from the $D$-dimensional unit Gaussian,
        $\bx_a,\bx_b \sim \distNorm(\bzero_D,\ident_D)$ how is $\bx_a - \bx_b$ distributed?
        Using the above result about $\chi$-distribution, how is $||\bx_a - \bx_b||_2$ distributed?
        (Hint: start with a $\mathcal{X}$-distributed random variable and use the \href{https://en.wikipedia.org/wiki/Probability_density_function#Dependent_variables_and_change_of_variables}{change of variables formula}.)
        Plot the pdfs of this distribution for $k=\{1,2,3,10,100\}$. 
        How does the distance between samples from a Gaussian behave as dimensionality increases?
        Confirm this by drawing two sets of $1000$ samples from the $D$-dimensional unit Gaussian.
        On the plot of the $\chi$-distribution pdfs, plot the normalized histogram of the distance between samples from the first and second set.

    \item In lecture we saw examples of interpolating between latent points to generate convincing data.
        Given two samples from a gaussian $\bx_a,\bx_b \sim \distNorm(\bzero_D,\ident_D)$ the
        linear interpolation between them $x_\alpha$ is defined as a function of  $\alpha\in[0,1]$ 
        \begin{align*}
            \text{lin\_interp}(\alpha,\bx_a,\bx_b) = \alpha \bx_a + (1-\alpha)\bx_b
        \end{align*}
        For two sets of 1000 samples from the unit gaussian in $D$-dimensions, plot the average 
        log-likelihood along the linear interpolations between the pairs of samples as a function of $\alpha$.
        (i.e. for each pair of samples compute the log-likelihood along a linear space of interpolated points between them, $\mathcal{N}(x_\alpha|0,I)$ for $\alpha \in [0,1]$. Plot the average log-likelihood over all the interpolations.)
        Do this for $D=\{1,2,3,10,100\}$, one plot per dimensionality.
        Comment on the log-likelihood under the unit Gaussian of points along the linear interpolation.
        Is a higher log-likelihood for the interpolated points necessarily better?
        Given this, is it a good idea to linearly interpolate between samples from a high dimensional Gaussian?
        
    \item  Instead we can interpolate in polar coordinates: For $\alpha\in[0,1]$ the polar interpolation is\begin{align*}
            \text{polar\_interp}(\alpha,\bx_a,\bx_b)=\sqrt{\alpha }\bx_a + \sqrt{(1-\alpha)}\bx_b
        \end{align*}
        This interpolates between two points while maintaining Euclidean norm.
        On the same plot from the previous question, plot the probabilitiy density of the polar interpolation between pairs of samples from two sets of 1000 samples from $D$-dimensional unit Gaussians for $D=\{1,2,3,10,100\}$. 
        Comment on the log-likelihood under the unit Gaussian of points along the polar interpolation.
        Give an intuative explanation for why polar interpolation is more suitable than linear interpolation for high dimensional Gaussians. \textbf{For 6. and 7. you should have one plot for each $D$ with two curves on each}.

    \item (\textbf{Bonus 5pts}) In the previous two questions we compute the average loglikelihood of the linear and polar interpolations under the unit gaussian. 
        Instead, consider the norm along the interpolation, $\sqrt{\bx_\alpha^\trans\bx_\alpha}$.
        As we saw previously, this is distributed according to the $\mathcal{X}$-distribution.
        Compute and plot the average log-likelihood of the norm along the two interpolations under the the $\mathcal{X}$-distribution for $D=\{1,2,3,10,100\}$, 
        i.e. $\mathcal{X}_D(\sqrt{\bx_\alpha^\trans\bx_\alpha})$. 
        There should be one plot for each $D$, each with two curves corresponding to log-likelihood of linear and polar interpolations.
        How does the log-likelihood along the linear interpolation compare to the log-likelihood of the true samples (endpoints)?
        Using your answer for questions 3 and 4, provide geometric intuition for the log-likelihood along the linear and polar interpolations.
        Use this to further justify your explanation for the suitability of polar v.s. linear interpolation.
\end{enumerate}
\end{problem}
% You can write your solution here.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


\end{document}