import sys
import string
import re

# A sentence is a run of non-terminator characters followed by zero or more
# terminators ('.', '?', '!').  Capturing body and terminators together keeps
# each sentence paired with its own punctuation and still matches a final
# sentence that has no terminator at all.
_SENTENCE_RE = re.compile(r'([^.?!]+)([.?!]*)')

# Words are delimited by any run of whitespace or ASCII punctuation.  The
# characters are escaped so that ']', '\\', '^' and '-' cannot change the
# meaning of the character class.
_WORD_DELIMITER_RE = re.compile(
    "[" + re.escape(string.whitespace + string.punctuation) + "]+")


def wordcount(text, _debug=0):
    """Count the sentences in *text* and the occurrences of each word.

    Whitespace runs are collapsed to a single space first.  A sentence is a
    non-blank stretch of text ending at '.', '?' or '!' (or at the end of the
    input, so a trailing sentence without a terminator is still counted).
    A word is a token delimited by whitespace or ASCII punctuation that
    consists solely of alphabetic characters; words are lower-cased before
    counting, and tokens containing digits are ignored.

    Note that the sentence splitter is purely punctuation based, so a
    decimal such as "3.14" is treated as a sentence boundary.

    Args:
        text: The string to analyse.
        _debug: When true, print each sentence, the total word count and
            the per-word counts (sorted by word) to standard output.

    Returns:
        A tuple ``(sentencecount, words)`` where ``sentencecount`` is the
        number of sentences found and ``words`` is a dict mapping each
        lower-cased word to its number of occurrences.
    """
    total_words = 0
    words = {}
    sentencecount = 0

    # Normalize white space
    text = re.sub(r'\s+', ' ', text)

    for match in _SENTENCE_RE.finditer(text):
        body = match.group(1).strip()
        if not body:
            # Only whitespace between terminators (e.g. ". ."): not a sentence.
            continue
        sentence = body + match.group(2)
        sentencecount += 1
        if _debug:
            print("%d : %r" % (sentencecount, sentence))

        for token in _WORD_DELIMITER_RE.split(sentence):
            word = token.lower()
            # isalpha() also rejects the empty strings that split() yields
            # at the edges of the sentence.
            if word.isalpha():
                total_words += 1
                words[word] = words.get(word, 0) + 1

    if _debug:
        print("Total word count: %d" % total_words)
        # Every "word : count" pair goes on one line, in sorted word order.
        listing = " ".join(
            "%s : %d" % (word, words[word]) for word in sorted(words))
        if listing:
            print(listing)

    return sentencecount, words


if __name__ == '__main__':
    wordcount(sys.stdin.read(), 1)