To read a file, we use a context manager. The most common way to use a context manager is the
with statement, which closes the file automatically. Use the following code to open the attached file "shakespeare.txt":
with open(filename, "r") as file: text = file.read()
# Load the entire Shakespeare corpus into memory as one string; the
# with statement closes the file as soon as the read completes.
with open("shakespeare.txt", "r") as file:
    text = file.read()
Write a function remove_punctuation which accepts a string of text and returns the string with all punctuation and newline characters removed.
def remove_punctuation(txt):
    """Return ``txt`` lower-cased, with newlines and punctuation stripped.

    Each newline character is replaced by a single space, and every
    character listed in ``punc`` is deleted outright.

    Args:
        txt: The text to normalize.

    Returns:
        The normalized string.
    """
    punc = [".", ":", ",", ";", "'", '"', "!", "?"]
    # Newlines become spaces (not empty strings) so that the last word of
    # one line is not fused with the first word of the next.
    cleaned = txt.lower().replace("\n", " ")
    for mark in punc:
        cleaned = cleaned.replace(mark, "")
    return cleaned
def frequency(text):
    """Count how often each word occurs in ``text``.

    Words are the whitespace-separated tokens produced by ``str.split()``;
    no case folding or punctuation stripping is performed here.

    Args:
        text: The text whose words are counted.

    Returns:
        A dictionary mapping each distinct word to its number of occurrences.
    """
    counts = {}
    for word in text.split():
        # dict.get supplies 0 the first time a word is seen.
        counts[word] = counts.get(word, 0) + 1
    return counts
def text_stats(text):
    """Return the total and unique word counts of ``text``.

    The text is first passed through ``remove_punctuation`` and the result
    is tallied with ``frequency``; this function relies on the documented
    contracts of those two functions.

    Args:
        text: The raw text to analyze.

    Returns:
        A tuple ``(total_words, unique_words)``.
    """
    counts = frequency(remove_punctuation(text))
    # The counts sum to the number of word occurrences; the number of keys
    # is the number of distinct words.
    total_words = sum(counts.values())
    unique_words = len(counts)
    return total_words, unique_words
from collections import Counter

# Normalize the corpus, then tally every whitespace-separated word.
text = remove_punctuation(text)
counter = Counter(text.split())
from collections import Counter
# The frequency function we implemented above is already provided
# by the Counter class. Counter also has a handy method, most_common,
# that returns the most commonly occurring words in a text.
Counter has a method most_common(n) which accepts an integer n and returns the n most commonly occurring words. Call most_common to see the top 20 most frequently occurring words in all of Shakespeare's works.
# Build the list of stop words from stopwords.txt. Each line of the file
# holds several words separated by tab characters.
with open("stopwords.txt", 'r') as file:
    stop_words = []
    for line in file:
        for word in line.split("\t"):
            # strip() drops the trailing newline and any surrounding spaces.
            word = word.strip()
            # Blank lines and trailing tabs yield empty tokens; skip them so
            # the list contains only real words.
            if word:
                stop_words.append(word)
File "<ipython-input-2-ee1e2fcf1efb>", line 11 # at the end of each line as well as any leading/trailing spaces ^ SyntaxError: unexpected EOF while parsing