###
# author_classification_getting_started.py
#
# author: Kristina Striegnitz
# version: 3/1/2010
#
# Code to get you started on the 3rd programming project.
###

import math

TRAINING_AUTHOR_A = "Clinton_training.txt"
TRAINING_AUTHOR_B = "Bush_training.txt"
TESTING_AUTHOR_A = "Clinton_testing.txt"
TESTING_AUTHOR_B = "Bush_testing.txt"


def make_unigram_dict(filename):
    """ Fill in the code. This function takes a filename (string) as
    its parameter and should return a dictionary mapping each word in
    the file to its frequency. (Step 1)
    """
    return unigram_d


def make_bigram_dict(filename):
    """ Fill in the code. This function takes a filename (string) as
    its parameter and should return a dictionary mapping each bigram
    in the file to its frequency. (Step 2)
    """
    return bigram_d


def count_words(unigram_dict):
    """ Fill in the code. This function takes a dictionary mapping
    words to their frequency as its parameter. It should return the
    total number of words, that is, it should add up all the
    frequencies. (Step 3)
    """
    return total


def count_unique_words(list_of_filenames):
    """ Fill in the code. This function takes a list of filenames as
    its parameter. It should return the size of the vocabulary used
    in all of the files. That is, it should count how many different
    words are used in the text files. Words that are used multiple
    times are only counted once. (Step 4)
    """
    return vocab_size


def unigram_prob(word, unigram_dict, total_words, vocab_size):
    """ You are given this function. It calculates the probability of
    a unigram (word) based on the frequency of this word in a corpus.
    It takes four parameters: the word, a dictionary that maps words
    to their frequency, the total number of words in the corpus from
    which the dictionary was built, and the size of the relevant
    vocabulary (that is, how many different words are used in the
    corpus and the test passages). The probability is smoothed using
    add-x smoothing, where x is 0.1. The logarithm of the probability
    is returned.
    """
    if word in unigram_dict:
        word_count = unigram_dict[word] + 0.1
    else:
        word_count = 0.1
    unigram_prob = word_count / (total_words + 0.1 * vocab_size)
    unigram_log_prob = math.log(unigram_prob, 2)
    return unigram_log_prob


def bigram_prob(word1, word2, unigram_dict, bigram_dict, total_words, vocab_size):
    """ You are given this function. It calculates the probability of
    a bigram (a sequence of two words) based on the frequency of this
    sequence in a corpus. It takes six parameters: the two words, a
    dictionary that maps words to their frequency, a dictionary that
    maps bigrams to their frequency, the total number of words in the
    corpus from which the dictionaries were built, and the size of
    the relevant vocabulary (that is, how many different words are
    used in the corpus and the test passages). The probability is
    smoothed using add-x smoothing, where x is 0.1. The logarithm of
    the probability is returned.
    """
    bigram = word1 + "+++" + word2
    if bigram in bigram_dict:
        bigram_count = bigram_dict[bigram] + 0.1
    else:
        bigram_count = 0.1
    if word1 in unigram_dict:
        unigram_count = unigram_dict[word1]
    else:
        unigram_count = 0
    bigram_prob = bigram_count / (unigram_count + 0.1 * vocab_size)
    bigram_log_prob = math.log(bigram_prob, 2)
    return bigram_log_prob
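
# Worked example of the add-0.1 smoothing in unigram_prob, using the
# counts quoted in test_step1, test_step3, and test_step4 below: in
# Clinton's speeches 'colleges' occurs 2 times, the corpus contains
# 67035 words, and the shared vocabulary has 7310 words, so
#     P('colleges') = (2 + 0.1) / (67035 + 0.1 * 7310)
#                   = 2.1 / 67766.0
#                   which is approximately 3.1e-05,
# and unigram_prob returns log2(3.1e-05), approximately -14.98.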
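
# --- Editor's sketches -------------------------------------------------
# The example_* functions below are NOT part of the assignment skeleton.
# Each is a minimal sketch of one possible solution to the corresponding
# step; the names and the tokenization they assume are illustrative only.

# Step 1 sketch, assuming words are whitespace-separated tokens:
def example_make_unigram_dict(filename):
    unigram_d = {}
    with open(filename, "r") as f:
        for line in f:
            for word in line.split():
                # Start unseen words at 0, then count this occurrence.
                unigram_d[word] = unigram_d.get(word, 0) + 1
    return unigram_d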
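
# Step 2 sketch. Bigram keys join the two words with "+++", matching
# bigram_prob and test_step2. Pairing consecutive words only within
# each line is an assumption; check it against the expected counts.
def example_make_bigram_dict(filename):
    bigram_d = {}
    with open(filename, "r") as f:
        for line in f:
            words = line.split()
            for w1, w2 in zip(words, words[1:]):
                bigram = w1 + "+++" + w2
                bigram_d[bigram] = bigram_d.get(bigram, 0) + 1
    return bigram_d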
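
# Step 3 sketch: the total number of words is the sum of all the
# frequencies stored in the unigram dictionary.
def example_count_words(unigram_dict):
    return sum(unigram_dict.values())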
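
# Step 4 sketch, reusing the Step 1 sketch above: the vocabulary size
# is the number of distinct words across all of the files.
def example_count_unique_words(list_of_filenames):
    vocab = set()
    for filename in list_of_filenames:
        vocab.update(example_make_unigram_dict(filename).keys())
    return len(vocab)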
def string_prob_unigrams(string, unigram_dict, total_words, vocab_size):
    """ Fill in the code. This function takes a string of words as
    its first parameter. It should calculate the probability of this
    string based on the unigram probabilities of the words. To do
    that, you need to use the function unigram_prob to calculate the
    (logarithm of the) unigram probability of each word and sum them
    up. That is, if your string consists of the words w1 w2 ... wn:

    string_prob = unigram_prob(w1) + unigram_prob(w2) + ... + unigram_prob(wn)

    One possible approach is sketched at the end of this file.
    """
    words = string.split()
    return string_prob


def string_prob_bigrams(string, unigram_dict, bigram_dict, total_words, vocab_size):
    """ Fill in the code. This function takes a string of words as
    its first parameter. It should calculate the probability of this
    string based on the bigram probabilities of the pairs of words
    composing the string. To do that, you need to use the function
    bigram_prob to calculate the (logarithm of the) bigram
    probabilities for each word and its preceding word and sum them
    up. The first word in the string does not have a preceding word;
    instead, you use the unigram probability for the first word. That
    is, if your string consists of the words w1 w2 ... wn:

    string_prob = unigram_prob(w1) + bigram_prob(w1,w2) + ... + bigram_prob(wn-1,wn)

    One possible approach is sketched at the end of this file.
    """
    words = string.split()
    return string_prob


###################
# What follows are a number of functions for testing your code.
###################

def test_step1():
    """ To test the function for building the unigram dictionary. The
    unigram dictionary based on Clinton's speeches should contain
    5168 entries and the frequency of 'colleges' should be 2; the one
    based on Bush's speeches should contain 4895 entries and the
    frequency of 'compassion' should be 20.
    """
    ud_clinton = make_unigram_dict(TRAINING_AUTHOR_A)
    ud_bush = make_unigram_dict(TRAINING_AUTHOR_B)
    for (k, v) in list(ud_clinton.items())[:20]:
        print(k, v)
    print()
    for (k, v) in list(ud_bush.items())[:20]:
        print(k, v)
    print("Author Clinton:", len(ud_clinton))
    print("frequency of 'colleges':", ud_clinton["colleges"])
    print("Author Bush:", len(ud_bush))
    print("frequency of 'compassion':", ud_bush["compassion"])


def test_step2():
    """ To test the function for building the bigram dictionary. The
    bigram dictionary based on Clinton's speeches should contain
    30332 entries and the frequency of the bigram 'the environment'
    should be 15; the one based on Bush's speeches should contain
    25168 entries and the frequency of 'and free' should be 7.
    """
    bd_clinton = make_bigram_dict(TRAINING_AUTHOR_A)
    bd_bush = make_bigram_dict(TRAINING_AUTHOR_B)
    for (k, v) in list(bd_clinton.items())[:20]:
        print(k, v)
    print()
    for (k, v) in list(bd_bush.items())[:20]:
        print(k, v)
    print("Author Clinton:", len(bd_clinton))
    print("frequency of 'the+++environment':", bd_clinton["the+++environment"])
    print("Author Bush:", len(bd_bush))
    print("frequency of 'and+++free':", bd_bush["and+++free"])


def test_step3():
    """ To test the function for counting the total number of words
    in the speeches. For Clinton's speeches that number is 67035; for
    Bush's it is 50717.
    """
    ud_clinton = make_unigram_dict(TRAINING_AUTHOR_A)
    ud_bush = make_unigram_dict(TRAINING_AUTHOR_B)
    total_words_clinton = count_words(ud_clinton)
    total_words_bush = count_words(ud_bush)
    print("Author Clinton:", total_words_clinton)
    print("Author Bush:", total_words_bush)


def test_step4():
    """ To test the function that calculates how many *different*
    words are used overall in both Clinton's and Bush's speeches,
    including the passages that were taken out for testing. That is,
    how many different words were used in the four files
    TRAINING_AUTHOR_A, TRAINING_AUTHOR_B, TESTING_AUTHOR_A, and
    TESTING_AUTHOR_B taken together. The number should be 7310.
    """
    vocab_size = count_unique_words([TRAINING_AUTHOR_A, TRAINING_AUTHOR_B,
                                     TESTING_AUTHOR_A, TESTING_AUTHOR_B])
    print(vocab_size)


def test_step5():
    """ To test the function that calculates the probability of a
    string based just on unigrams. The logarithm of the probability
    of the test sentence should be -433 when calculated based on
    Clinton's speeches and -437 when calculated based on Bush's
    speeches. That means it is more likely that Clinton is the author
    of this sentence.
    """
    ud_clinton = make_unigram_dict(TRAINING_AUTHOR_A)
    ud_bush = make_unigram_dict(TRAINING_AUTHOR_B)
    total_words_clinton = count_words(ud_clinton)
    total_words_bush = count_words(ud_bush)
    vocab_size = count_unique_words([TRAINING_AUTHOR_A, TRAINING_AUTHOR_B,
                                     TESTING_AUTHOR_A, TESTING_AUTHOR_B])
    test_sentence = ("i challenge congress not to cut our support for drug - "
                     "free schools . people like the d . a . r . e . officers "
                     "are making a real impression on grade schoolchildren "
                     "that will give them the strength to say no when the "
                     "time comes .")
    print(string_prob_unigrams(test_sentence, ud_clinton, total_words_clinton, vocab_size))
    print(string_prob_unigrams(test_sentence, ud_bush, total_words_bush, vocab_size))


def test_step6():
    """ To test the function that calculates the probability of a
    string based on bigrams. The logarithm of the probability of the
    test sentence should be -469 when calculated based on Clinton's
    speeches and -513 when calculated based on Bush's speeches. That
    means it is more likely that Clinton is the author of this
    sentence.
    """
    ud_clinton = make_unigram_dict(TRAINING_AUTHOR_A)
    ud_bush = make_unigram_dict(TRAINING_AUTHOR_B)
    total_words_clinton = count_words(ud_clinton)
    total_words_bush = count_words(ud_bush)
    bd_clinton = make_bigram_dict(TRAINING_AUTHOR_A)
    bd_bush = make_bigram_dict(TRAINING_AUTHOR_B)
    vocab_size = count_unique_words([TRAINING_AUTHOR_A, TRAINING_AUTHOR_B,
                                     TESTING_AUTHOR_A, TESTING_AUTHOR_B])
    test_sentence = ("i challenge congress not to cut our support for drug - "
                     "free schools . people like the d . a . r . e . officers "
                     "are making a real impression on grade schoolchildren "
                     "that will give them the strength to say no when the "
                     "time comes .")
    print(string_prob_bigrams(test_sentence, ud_clinton, bd_clinton, total_words_clinton, vocab_size))
    print(string_prob_bigrams(test_sentence, ud_bush, bd_bush, total_words_bush, vocab_size))


def test_step7():
    """ This function lets you test how well your program can
    distinguish between texts written by two authors. It reads in a
    file with passages written by Clinton and a second file with
    passages written by Bush. For all passages, it calculates who is
    more likely to be the author.
    """
    ud_a = make_unigram_dict(TRAINING_AUTHOR_A)
    ud_b = make_unigram_dict(TRAINING_AUTHOR_B)
    total_words_a = count_words(ud_a)
    total_words_b = count_words(ud_b)
    bd_a = make_bigram_dict(TRAINING_AUTHOR_A)
    bd_b = make_bigram_dict(TRAINING_AUTHOR_B)
    vocab_size = count_unique_words([TRAINING_AUTHOR_A, TRAINING_AUTHOR_B,
                                     TESTING_AUTHOR_A, TESTING_AUTHOR_B])
    print()
    print("Testing passages written by author A (Clinton):")
    classify_all(TESTING_AUTHOR_A, ud_a, bd_a, total_words_a,
                 ud_b, bd_b, total_words_b, vocab_size)
    print()
    print("Testing passages written by author B (Bush):")
    classify_all(TESTING_AUTHOR_B, ud_a, bd_a, total_words_a,
                 ud_b, bd_b, total_words_b, vocab_size)
def classify_all(filename, ud_A, bd_A, total_words_A, ud_B, bd_B, total_words_B, vocab_size):
    """ This function reads in a file of test passages (one test
    passage per line). It calculates how likely it is that author A
    wrote each passage and how likely it is that author B wrote it,
    and then compares these two results.
    """
    test_file = open(filename, "r")
    for line in test_file:
        print(line)
        author_a_prob = string_prob_bigrams(line, ud_A, bd_A, total_words_A, vocab_size)
        author_b_prob = string_prob_bigrams(line, ud_B, bd_B, total_words_B, vocab_size)
        print(author_a_prob)
        print(author_b_prob)
        if author_a_prob > author_b_prob:
            print("It is more likely that this text was written by author A.")
        else:
            print("It is more likely that this text was written by author B.")
        print()
    test_file.close()
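
# --- Editor's sketches, continued (see the note after bigram_prob) -----

# Sketch for string_prob_unigrams (tested by test_step5): sum the log
# unigram probabilities of the whitespace-separated words. Summing
# logarithms corresponds to multiplying the underlying probabilities.
def example_string_prob_unigrams(string, unigram_dict, total_words, vocab_size):
    words = string.split()
    string_prob = 0
    for word in words:
        string_prob += unigram_prob(word, unigram_dict, total_words, vocab_size)
    return string_prob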
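
# Sketch for string_prob_bigrams (tested by test_step6): the first
# word contributes its unigram probability; each later word
# contributes the bigram probability of its predecessor followed by
# it. Assumes the string contains at least one word.
def example_string_prob_bigrams(string, unigram_dict, bigram_dict, total_words, vocab_size):
    words = string.split()
    string_prob = unigram_prob(words[0], unigram_dict, total_words, vocab_size)
    for i in range(1, len(words)):
        string_prob += bigram_prob(words[i - 1], words[i], unigram_dict,
                                   bigram_dict, total_words, vocab_size)
    return string_prob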
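
# Hypothetical entry point, not part of the original skeleton: call
# the test for the step you are currently working on and run this
# file; test_step1() is shown as an example.
if __name__ == "__main__":
    test_step1()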