###
# author_classification_getting_started.py
#
# author: Kristina Striegnitz
# version: 3/1/2010
#
# Code to get you started on the 3rd programming project.
###

import math

TRAINING_AUTHOR_A = "Clinton_training.txt"
TRAINING_AUTHOR_B = "Bush_training.txt"
TESTING_AUTHOR_A = "Clinton_testing.txt"
TESTING_AUTHOR_B = "Bush_testing.txt"


def make_unigram_dict(filename):
    """ Fill in the code. This function takes a filename (string) as
    its parameter and should return a dictionary mapping each word in
    the file to its frequency. (Step 1)
    """
    return unigram_d


def make_bigram_dict(filename):
    """ Fill in the code. This function takes a filename (string) as
    its parameter and should return a dictionary mapping each bigram
    in the file to its frequency. (Step 2)
    """
    return bigram_d


def count_words(unigram_dict):
    """ Fill in the code. This function takes a dictionary mapping
    words to their frequency as its parameter. It should return the
    total number of words, that is, it should add up all the
    frequencies. (Step 3)
    """
    return total


def count_unique_words(list_of_filenames):
    """ Fill in the code. This function takes a list of filenames as
    its parameter. It should return the size of the vocabulary used
    in all of the files. That is, it should count how many different
    words are used in the text files. Words that are used multiple
    times are only counted once. (Step 4)
    """
    return vocab_size


def unigram_prob(word, unigram_dict, total_words, vocab_size):
    """ You are given this function. It calculates the probability of
    a unigram (word) based on the frequency of this word in a corpus.
    It takes four parameters: the word, a dictionary that maps words
    to their frequency, the total number of words in the corpus from
    which the dictionary was built, and the size of the relevant
    vocabulary (that is, how many different words are used in the
    corpus and the test passages). The probability is smoothed using
    add-x smoothing, where x is 0.1. The logarithm of the probability
    is returned.
    """
    if word in unigram_dict:
        word_count = unigram_dict[word] + 0.1
    else:
        word_count = 0.1
    unigram_prob = word_count / (total_words + 0.1 * vocab_size)
    unigram_log_prob = math.log(unigram_prob, 2)
    return unigram_log_prob


def bigram_prob(word1, word2, unigram_dict, bigram_dict, total_words, vocab_size):
    """ You are given this function. It calculates the probability of
    a bigram (a sequence of two words) based on the frequency of this
    sequence in a corpus. It takes six parameters: the two words, a
    dictionary that maps words to their frequency, a dictionary that
    maps bigrams to their frequency, the total number of words in the
    corpus from which the dictionaries were built, and the size of
    the relevant vocabulary (that is, how many different words are
    used in the corpus and the test passages). The probability is
    smoothed using add-x smoothing, where x is 0.1. The logarithm of
    the probability is returned.
    """
    bigram = word1 + "+++" + word2
    if bigram in bigram_dict:
        bigram_count = bigram_dict[bigram] + 0.1
    else:
        bigram_count = 0.1
    if word1 in unigram_dict:
        unigram_count = unigram_dict[word1]
    else:
        unigram_count = 0
    bigram_prob = bigram_count / (unigram_count + 0.1 * vocab_size)
    bigram_log_prob = math.log(bigram_prob, 2)
    return bigram_log_prob
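
# Worked example of the add-0.1 smoothing in unigram_prob, using the
# counts quoted in test_step1, test_step3, and test_step4 below: in
# Clinton's speeches 'colleges' occurs 2 times, the corpus contains
# 67035 words, and the shared vocabulary has 7310 words, so
#     P('colleges') = (2 + 0.1) / (67035 + 0.1 * 7310)
#                   = 2.1 / 67766.0
#                   which is approximately 3.1e-05,
# and unigram_prob returns log2(3.1e-05), approximately -14.98.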
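
# --- Editor's sketches -------------------------------------------------
# The example_* functions below are NOT part of the assignment skeleton.
# Each is a minimal sketch of one possible solution to the corresponding
# step; the names and the tokenization they assume are illustrative only.

# Step 1 sketch, assuming words are whitespace-separated tokens:
def example_make_unigram_dict(filename):
    unigram_d = {}
    with open(filename, "r") as f:
        for line in f:
            for word in line.split():
                # Start unseen words at 0, then count this occurrence.
                unigram_d[word] = unigram_d.get(word, 0) + 1
    return unigram_d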
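
# Step 2 sketch. Bigram keys join the two words with "+++", matching
# bigram_prob and test_step2. Pairing consecutive words only within
# each line is an assumption; check it against the expected counts.
def example_make_bigram_dict(filename):
    bigram_d = {}
    with open(filename, "r") as f:
        for line in f:
            words = line.split()
            for w1, w2 in zip(words, words[1:]):
                bigram = w1 + "+++" + w2
                bigram_d[bigram] = bigram_d.get(bigram, 0) + 1
    return bigram_d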
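
# Step 3 sketch: the total number of words is the sum of all the
# frequencies stored in the unigram dictionary.
def example_count_words(unigram_dict):
    return sum(unigram_dict.values())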
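
# Step 4 sketch, reusing the Step 1 sketch above: the vocabulary size
# is the number of distinct words across all of the files.
def example_count_unique_words(list_of_filenames):
    vocab = set()
    for filename in list_of_filenames:
        vocab.update(example_make_unigram_dict(filename).keys())
    return len(vocab)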
def string_prob_unigrams(string, unigram_dict, total_words, vocab_size):
    """ Fill in the code. This function takes a string of words as
    its first parameter. It should calculate the probability of this
    string based on the unigram probabilities of the words. To do
    that, you need to use the function unigram_prob to calculate the
    (logarithm of the) unigram probability of each word and sum them
    up. That is, if your string consists of the words w1 w2 ... wn:

    string_prob = unigram_prob(w1) + unigram_prob(w2) + ... + unigram_prob(wn)

    One possible approach is sketched at the end of this file.
    """
    words = string.split()
    return string_prob


def string_prob_bigrams(string, unigram_dict, bigram_dict, total_words, vocab_size):
    """ Fill in the code. This function takes a string of words as
    its first parameter. It should calculate the probability of this
    string based on the bigram probabilities of the pairs of words
    composing the string. To do that, you need to use the function
    bigram_prob to calculate the (logarithm of the) bigram
    probabilities for each word and its preceding word and sum them
    up. The first word in the string does not have a preceding word;
    instead, you use the unigram probability for the first word. That
    is, if your string consists of the words w1 w2 ... wn:

    string_prob = unigram_prob(w1) + bigram_prob(w1,w2) + ... + bigram_prob(wn-1,wn)

    One possible approach is sketched at the end of this file.
    """
    words = string.split()
    return string_prob


###################
# What follows are a number of functions for testing your code.
###################

def test_step1():
    """ To test the function for building the unigram dictionary. The
    unigram dictionary based on Clinton's speeches should contain
    5168 entries and the frequency of 'colleges' should be 2; the one
    based on Bush's speeches should contain 4895 entries and the
    frequency of 'compassion' should be 20.
    """
    ud_clinton = make_unigram_dict(TRAINING_AUTHOR_A)
    ud_bush = make_unigram_dict(TRAINING_AUTHOR_B)
    for (k, v) in list(ud_clinton.items())[:20]:
        print(k, v)
    print()
    for (k, v) in list(ud_bush.items())[:20]:
        print(k, v)
    print("Author Clinton:", len(ud_clinton))
    print("frequency of 'colleges':", ud_clinton["colleges"])
    print("Author Bush:", len(ud_bush))
    print("frequency of 'compassion':", ud_bush["compassion"])


def test_step2():
    """ To test the function for building the bigram dictionary. The
    bigram dictionary based on Clinton's speeches should contain
    30332 entries and the frequency of the bigram 'the environment'
    should be 15; the one based on Bush's speeches should contain
    25168 entries and the frequency of 'and free' should be 7.
    """
    bd_clinton = make_bigram_dict(TRAINING_AUTHOR_A)
    bd_bush = make_bigram_dict(TRAINING_AUTHOR_B)
    for (k, v) in list(bd_clinton.items())[:20]:
        print(k, v)
    print()
    for (k, v) in list(bd_bush.items())[:20]:
        print(k, v)
    print("Author Clinton:", len(bd_clinton))
    print("frequency of 'the+++environment':", bd_clinton["the+++environment"])
    print("Author Bush:", len(bd_bush))
    print("frequency of 'and+++free':", bd_bush["and+++free"])


def test_step3():
    """ To test the function for counting the total number of words
    in the speeches. For Clinton's speeches that number is 67035; for
    Bush's it is 50717.
    """
    ud_clinton = make_unigram_dict(TRAINING_AUTHOR_A)
    ud_bush = make_unigram_dict(TRAINING_AUTHOR_B)
    total_words_clinton = count_words(ud_clinton)
    total_words_bush = count_words(ud_bush)
    print("Author Clinton:", total_words_clinton)
    print("Author Bush:", total_words_bush)


def test_step4():
    """ To test the function that calculates how many *different*
    words are used overall in both Clinton's and Bush's speeches,
    including the passages that were taken out for testing. That is,
    how many different words were used in the four files
    TRAINING_AUTHOR_A, TRAINING_AUTHOR_B, TESTING_AUTHOR_A, and
    TESTING_AUTHOR_B taken together. The number should be 7310.
    """
    vocab_size = count_unique_words([TRAINING_AUTHOR_A, TRAINING_AUTHOR_B,
                                     TESTING_AUTHOR_A, TESTING_AUTHOR_B])
    print(vocab_size)


def test_step5():
    """ To test the function that calculates the probability of a
    string based just on unigrams. The logarithm of the probability
    of the test sentence should be -433 when calculated based on
    Clinton's speeches and -437 when calculated based on Bush's
    speeches. That means it is more likely that Clinton is the author
    of this sentence.
    """
    ud_clinton = make_unigram_dict(TRAINING_AUTHOR_A)
    ud_bush = make_unigram_dict(TRAINING_AUTHOR_B)
    total_words_clinton = count_words(ud_clinton)
    total_words_bush = count_words(ud_bush)
    vocab_size = count_unique_words([TRAINING_AUTHOR_A, TRAINING_AUTHOR_B,
                                     TESTING_AUTHOR_A, TESTING_AUTHOR_B])
    test_sentence = ("i challenge congress not to cut our support for drug - "
                     "free schools . people like the d . a . r . e . officers "
                     "are making a real impression on grade schoolchildren "
                     "that will give them the strength to say no when the "
                     "time comes .")
    print(string_prob_unigrams(test_sentence, ud_clinton, total_words_clinton, vocab_size))
    print(string_prob_unigrams(test_sentence, ud_bush, total_words_bush, vocab_size))


def test_step6():
    """ To test the function that calculates the probability of a
    string based on bigrams. The logarithm of the probability of the
    test sentence should be -469 when calculated based on Clinton's
    speeches and -513 when calculated based on Bush's speeches. That
    means it is more likely that Clinton is the author of this
    sentence.
    """
    ud_clinton = make_unigram_dict(TRAINING_AUTHOR_A)
    ud_bush = make_unigram_dict(TRAINING_AUTHOR_B)
    total_words_clinton = count_words(ud_clinton)
    total_words_bush = count_words(ud_bush)
    bd_clinton = make_bigram_dict(TRAINING_AUTHOR_A)
    bd_bush = make_bigram_dict(TRAINING_AUTHOR_B)
    vocab_size = count_unique_words([TRAINING_AUTHOR_A, TRAINING_AUTHOR_B,
                                     TESTING_AUTHOR_A, TESTING_AUTHOR_B])
    test_sentence = ("i challenge congress not to cut our support for drug - "
                     "free schools . people like the d . a . r . e . officers "
                     "are making a real impression on grade schoolchildren "
                     "that will give them the strength to say no when the "
                     "time comes .")
    print(string_prob_bigrams(test_sentence, ud_clinton, bd_clinton, total_words_clinton, vocab_size))
    print(string_prob_bigrams(test_sentence, ud_bush, bd_bush, total_words_bush, vocab_size))


def test_step7():
    """ This function lets you test how well your program can
    distinguish between texts written by two authors. It reads in a
    file with passages written by Clinton and a second file with
    passages written by Bush. For all passages, it calculates who is
    more likely to be the author.
    """
    ud_a = make_unigram_dict(TRAINING_AUTHOR_A)
    ud_b = make_unigram_dict(TRAINING_AUTHOR_B)
    total_words_a = count_words(ud_a)
    total_words_b = count_words(ud_b)
    bd_a = make_bigram_dict(TRAINING_AUTHOR_A)
    bd_b = make_bigram_dict(TRAINING_AUTHOR_B)
    vocab_size = count_unique_words([TRAINING_AUTHOR_A, TRAINING_AUTHOR_B,
                                     TESTING_AUTHOR_A, TESTING_AUTHOR_B])
    print()
    print("Testing passages written by author A (Clinton):")
    classify_all(TESTING_AUTHOR_A, ud_a, bd_a, total_words_a,
                 ud_b, bd_b, total_words_b, vocab_size)
    print()
    print("Testing passages written by author B (Bush):")
    classify_all(TESTING_AUTHOR_B, ud_a, bd_a, total_words_a,
                 ud_b, bd_b, total_words_b, vocab_size)
def classify_all(filename, ud_A, bd_A, total_words_A, ud_B, bd_B, total_words_B, vocab_size):
    """ This function reads in a file of test passages (one test
    passage per line). It calculates how likely it is that author A
    wrote each passage and how likely it is that author B wrote it,
    and then compares these two results.
    """
    test_file = open(filename, "r")
    for line in test_file:
        print(line)
        author_a_prob = string_prob_bigrams(line, ud_A, bd_A, total_words_A, vocab_size)
        author_b_prob = string_prob_bigrams(line, ud_B, bd_B, total_words_B, vocab_size)
        print(author_a_prob)
        print(author_b_prob)
        if author_a_prob > author_b_prob:
            print("It is more likely that this text was written by author A.")
        else:
            print("It is more likely that this text was written by author B.")
        print()
    test_file.close()
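
# --- Editor's sketches, continued (see the note after bigram_prob) -----

# Sketch for string_prob_unigrams (tested by test_step5): sum the log
# unigram probabilities of the whitespace-separated words. Summing
# logarithms corresponds to multiplying the underlying probabilities.
def example_string_prob_unigrams(string, unigram_dict, total_words, vocab_size):
    words = string.split()
    string_prob = 0
    for word in words:
        string_prob += unigram_prob(word, unigram_dict, total_words, vocab_size)
    return string_prob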
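
# Sketch for string_prob_bigrams (tested by test_step6): the first
# word contributes its unigram probability; each later word
# contributes the bigram probability of its predecessor followed by
# it. Assumes the string contains at least one word.
def example_string_prob_bigrams(string, unigram_dict, bigram_dict, total_words, vocab_size):
    words = string.split()
    string_prob = unigram_prob(words[0], unigram_dict, total_words, vocab_size)
    for i in range(1, len(words)):
        string_prob += bigram_prob(words[i - 1], words[i], unigram_dict,
                                   bigram_dict, total_words, vocab_size)
    return string_prob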
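
# Hypothetical entry point, not part of the original skeleton: call
# the test for the step you are currently working on and run this
# file; test_step1() is shown as an example.
if __name__ == "__main__":
    test_step1()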