import os
import math

# Path to the folder that holds the unzipped wikipages. Redefine this
# constant so that it points to where you stored them on your machine.
# Windows uses \ as the directory separator, and inside a Python string
# literal each backslash has to be doubled (written as two backslashes),
# for example: "C:\\user\\folder\\subfolder\\"
HOMEFOLDER = "/media/guerzhoy/Windows/cs_for_docs/workshops/diagnosis_tfidf/wikipages/"    
# Example of the same setting for a different machine (kept disabled):
#HOMEFOLDER = "/Users/mcraig/admin/Activity2016/cs_for_docs/workshops/diagnosis_tfidf/wikipages/"
# Characters that clean_up replaces with a single space each.
# NOTE(review): characters such as ? - / \ _ + = ~ are not in this set and
# are therefore left untouched -- confirm that this is intended.
PUNCTUATION = """.,<>;'":{}[]|!@#$%^&*()"""

def clean_up(text):
    """ (string) -> (string)

    Return a version of the string text where all the letters have been
    converted to lower case, and every character listed in PUNCTUATION has
    been replaced with a single space.

    The result always has the same length as text: punctuation is replaced
    one-for-one rather than removed, so runs of punctuation become runs of
    spaces.

    >>> clean_up('Influenza, commonly known as "the flu", is ...')
    'influenza  commonly known as  the flu   is    '
    """
    # Map every punctuation character to a space; translate() then applies
    # the whole mapping in a single pass over the string.
    punctuation_to_space = str.maketrans(PUNCTUATION, " " * len(PUNCTUATION))
    return text.lower().translate(punctuation_to_space)


def get_all_texts(datapath):
    """ (string) -> dict of {string: string}

    Return a dictionary where the keys are disease names
    and the values are the contents of the file key.html
    from the directory datapath.

    datapath may be given with or without a trailing directory separator.
    Only directory entries whose name ends in ".html" and has at least one
    character before that extension are read; everything else is ignored.
    Files are decoded with the platform's default text encoding.
    """
    extension = ".html"

    # dictionary of all texts, keys are disease names
    disease_to_text = {}

    for filename in os.listdir(datapath):
        # Skip non-HTML entries, and a file named just ".html", which would
        # otherwise produce an empty disease name.
        if len(filename) <= len(extension) or not filename.endswith(extension):
            continue

        # os.path.join adds a separator only when datapath lacks one, and the
        # with-block closes the file as soon as its contents have been read.
        with open(os.path.join(datapath, filename)) as html_file:
            # the disease name is the filename without its extension
            disease_to_text[filename[:-len(extension)]] = html_file.read()

    return disease_to_text
    

def keyword_found(keyword, doc_name, disease_to_text):
    """ (str, str, dict of {str:str}) -> bool

    Return True iff keyword is found in the text stored under doc_name in
    disease_to_text as a full token separated by whitespace.

    The comparison is exact: keyword must equal a whole whitespace-delimited
    token, so a keyword that only occurs as part of a longer token does not
    count, and neither does a match that differs in letter case.
    Raises KeyError if doc_name is not a key of disease_to_text.

    >>> keyword_found('cough', 'flu', {'flu': 'fever and cough'})
    True
    >>> keyword_found('cough', 'cold', {'cold': 'coughing sneezing'})
    False
    """
    # split() with no argument breaks on any run of whitespace, which gives
    # exactly the whitespace-separated tokens of the document
    return keyword in disease_to_text[doc_name].split()

    
def idf(keyword, disease_to_text):
    """ (str, dict of {str: str}) -> float

    Return the inverse document frequency of keyword over the documents in
    disease_to_text, computed as

        log(number of documents / number of documents containing keyword)

    using the natural logarithm. A document contains keyword when keyword
    appears in it as a full token separated by whitespace (the same rule
    that keyword_found documents).

    Return 0.0 when no document contains keyword (this includes an empty
    disease_to_text), so that such a keyword carries no weight instead of
    causing a division by zero.

    >>> idf('rash', {'a': 'fever rash', 'b': 'fever'}) == math.log(2)
    True
    """
    total_docs = len(disease_to_text)

    # count the documents in which keyword occurs as a whole token
    docs_with_keyword = sum(
        1 for text in disease_to_text.values() if keyword in text.split()
    )

    if docs_with_keyword == 0:
        return 0.0

    return math.log(total_docs / docs_with_keyword)


def build_empty_scores_dict(disease_to_text):
    """ (dict of {str:str}) -> dict of {str:number}

    Build and return a new scores dictionary whose keys are the same as the
    keys in disease_to_text and whose values are all 0.

    disease_to_text itself is not modified.

    >>> build_empty_scores_dict({'flu': 'fever'})
    {'flu': 0}
    """
    return {disease: 0 for disease in disease_to_text}
        

def update_scores(current_scores, keyword, all_texts):
    """ (dict of {str: number}, str, dict of {str: str}) -> None

    Update current_scores in place by adding, to the value of each entry,
    the individual TF-IDF score of keyword for the matching document in
    all_texts.

    NOTE(review): this function has no body yet, so calling it currently
    leaves current_scores unchanged and returns None.
    NOTE(review): presumably current_scores has the same keys as all_texts
    (as produced by build_empty_scores_dict) -- confirm against the caller.
    TODO: the term-frequency weighting (for example binary presence of the
    keyword versus its raw count in the document) is not specified here;
    confirm it before implementing.

    """