% Saif Mohammad's PhD thesis (University of Toronto, February 2008).
% The abstract field holds HTML markup (<p>, <b>, <i>) rather than LaTeX, and
% download is a non-standard field; standard bibliography styles ignore both,
% so they are presumably read by other tooling -- confirm before renaming.
@phdthesis{Mohammad6,
  author   = {Mohammad, Saif},
  title    = {Measuring Semantic Distance using Distributional Profiles of Concepts},
  school   = {Department of Computer Science, University of Toronto},
  year     = {2008},
  month    = feb,
  abstract = {<p><b>Semantic distance</b> is a measure of how close or
              distant in meaning two units of language are.
              A large number of important natural language problems, including machine
              translation and word sense disambiguation,
              can be viewed as semantic distance problems.
              The two dominant approaches to estimating semantic distance are
              the <b>WordNet-based semantic measures</b> and the <b>corpus-based
              distributional measures</b>. In this thesis, I compare them, both qualitatively and
              quantitatively, and identify the limitations of each.</p>
              <p>This thesis argues that estimating semantic distance is essentially a
              property of concepts (rather than words) and that two concepts are semantically
              close if they occur in similar contexts.
              Instead of identifying the co-occurrence (distributional) profiles of
              <i>words</i> (<b>distributional hypothesis</b>),
              I argue that <b>distributional profiles of concepts (DPCs)</b> can be used
              to infer the semantic properties of concepts and indeed to estimate
              semantic distance more accurately. I propose a new hybrid approach to calculating semantic distance
              that combines corpus statistics and a published thesaurus (<i>Macquarie Thesaurus</i>).
              The algorithm determines estimates of the DPCs
              using the categories in the thesaurus as very coarse concepts and, notably,
              without requiring any sense-annotated data.
              Even though the use of only about 1000 concepts to represent the
              vocabulary of a language seems drastic, I show that the method
              achieves results better than
              the state-of-the-art in a number of natural language tasks.</p>
              <p>I show how <b>cross-lingual DPCs</b> can be
              created by combining text in one language with a thesaurus from another.
              Using these cross-lingual DPCs, we can solve problems
              in one, possibly resource-poor, language using a knowledge source from another,
              possibly resource-rich, language. I show that the approach is
              also useful in tasks that inherently involve two or more languages, such as
              machine translation and multilingual text summarization.</p>
              <p>
              The proposed approach is computationally inexpensive, it can estimate
              both semantic
              relatedness and semantic similarity, and it can be applied to all
              parts of speech.
              Extensive experiments on ranking word pairs as per semantic distance,
              real-word spelling correction, solving <i>Reader's Digest</i> word
              choice problems,
              determining word sense dominance, word sense disambiguation, and
              word translation show that the new approach is markedly
              superior to previous ones.</p>},
  download = {http://ftp.cs.toronto.edu/pub/gh/Mohammad-PhD-thesis.pdf},
}


