% Peto2: master's thesis, also issued as technical report CSRI-304 (see `note`).
% The `abstract` value is HTML markup and `download` is a non-standard field;
% standard BibTeX styles ignore both (presumably consumed by a web-page
% generator -- confirm before changing the markup conventions).
@mastersthesis{Peto2,
  author = "Linda Bauman Peto",
  title = "A comparison of two smoothing methods for word bigram models",
  school = "Department of Computer Science, University of Toronto",
  month = apr,
  year = "1994",
  note = "Published as technical report CSRI-304",
  abstract = "<P>Word bigram models estimated from text corpora require smoothing
              methods to estimate the probabilities of unseen bigrams.  The deleted
              estimation method uses the formula:</P>
              <BLOCKQUOTE>
                 Pr(<I>i</I>|<I>j</I>) &#61; <I>lambda</I> <I>f<SUB>i</SUB></I> +
                 (1 - <I>lambda</I>) <I>f<SUB>i|j</SUB></I>,
              </BLOCKQUOTE>
              <P>where <I>f<SUB>i</SUB></I> and <I>f<SUB>i|j</SUB></I> are the relative
              frequency of <I>i</I> and the conditional relative frequency of
              <I>i</I> given <I>j</I>, respectively, and <I>lambda</I> is an
              optimized parameter.  MacKay (1994) proposes a Bayesian approach using
              Dirichlet priors, which yields a different formula:</P>
              <BLOCKQUOTE>
                 Pr(<I>i</I>|<I>j</I>) &#61;
                 (<I>alpha</I>/(<I>F<SUB>j</SUB></I> + <I>alpha</I>)) <I>m<SUB>i</SUB></I> +
                 (1 - <I>alpha</I>/(<I>F<SUB>j</SUB></I> + <I>alpha</I>)) <I>f<SUB>i|j</SUB></I>
              </BLOCKQUOTE>
              <P>where <I>F<SUB>j</SUB></I> is the count of <I>j</I> and <I>alpha</I>
              and <I>m<SUB>i</SUB></I> are optimized parameters.  This thesis
              describes an experiment in which the two methods were trained on a
              two-million-word corpus taken from the Canadian <I>Hansard</I> and
              compared on the basis of the experimental perplexity that they
              assigned to a shared test corpus.  The methods proved to be about
              equally accurate, with MacKay's method using fewer resources.</P>",
  download = "http://ftp.cs.toronto.edu/csri-technical-reports/304/CSRI-304.ps"
}


