Random Text Generator:
Generates "meaningful" random English text based on one or more input sample files.
The generator is built on the Markov chain algorithm.
Simple Version:
#!/usr/bin/env python
'''
This module offers functions which parse an English text file and
generate a statistical model of the text.
'''

# Names exported by "from parse import *"; the self-test helper is left out.
__all__ = ['parse_simple', 'dict_add', 'print_dict']


# simple parse function that parses a file and returns a dictionary
# containing the statistical model
def parse_simple(filepath, prefixnum=2):
    '''
    Parse the file specified in filepath and return a dict which has the
    form of {(p1, p2, .., pn): [s1, s2, ..., sm], ...}

    Each key is a tuple of prefixnum consecutive words and each value
    lists the distinct words seen right after that prefix, in order of
    first appearance.  The start of the text is marked by None entries
    in the prefix and the end of the text by a None suffix.

    filepath  -- path of the text file to read
    prefixnum -- number of words in every prefix (defaults to 2)

    Returns the model dict, or None when the file cannot be read.
    '''
    dict_stat_model = {}
    prefix = (None,) * prefixnum
    try:
        with open(filepath) as f:
            # parse file line by line, word by word
            for line in f:
                for word in line.split():
                    _add_suffix(dict_stat_model, prefix, word)
                    # slide the window: drop the oldest word, append the
                    # newest (this form also keeps a zero-length prefix
                    # at zero length)
                    prefix = (prefix + (word,))[1:]
    except IOError:
        return None
    # the last prefix of the text is followed by the end marker
    _add_suffix(dict_stat_model, prefix, None)
    return dict_stat_model


# helper functions
def _add_suffix(d, prefix, suffix):
    ''' record suffix under prefix in d, ignoring duplicates '''
    suffixes = d.setdefault(prefix, [])
    if suffix not in suffixes:
        suffixes.append(suffix)


def dict_add(d, w1, w2, w3):
    '''
    Record that word w3 follows the two-word prefix (w1, w2) in model d.

    A suffix already listed for that prefix is not added again.
    '''
    _add_suffix(d, (w1, w2), w3)


def print_dict(d):
    ''' print every "prefix suffixes" pair of d, then an empty line '''
    for key in d:
        print('%s %s' % (key, d[key]))
    print('')


# self-contained test functions
# return the number of failed test cases
def test_parse_simple():
    '''
    Run parse_simple against the sample files under test/ and against a
    path that does not exist.

    Returns the number of failed test cases.
    '''
    failed_test_cases = 0
    readable_cases = (
        ('[empty file]', 'test/empty.txt'),
        ('[short file]', 'test/short.txt'),
        ('[long file]', 'test/long.txt'),
    )
    invalid_file_path = "INVALID_FILE_PATH"

    for label, path in readable_cases:
        dict_stat = parse_simple(path)
        print(label)
        if dict_stat is None:
            # count the failure; there is no model to print
            failed_test_cases += 1
            continue
        print_dict(dict_stat)

    # test with invalid input file (not exist)
    if parse_simple(invalid_file_path) is not None:
        failed_test_cases += 1

    return failed_test_cases


# run tests
if __name__ == '__main__':
    print('========== parse.py: self-contained tests start =====')
    failed_test_cases = test_parse_simple()
    print('test_parse_simple: %s test cases failed' % failed_test_cases)
    print('========== parse.py: self-contained tests end =======')
#!/usr/bin/env python
'''
This program generates random English text based on the input file.

Usage: gen.py <inputfile>
'''

import sys
import parse
from random import choice


def gen_rand_text(filepath):
    '''
    Print random text generated from the model of the file at filepath.

    The model comes from parse.parse_simple.  Starting from the
    (None, None) prefix, a random suffix of the current two-word prefix
    is chosen repeatedly until the None end marker is drawn.  The chosen
    words are printed on one line, followed by an empty line.

    Returns 0 on success and 1 when the file could not be parsed.
    '''
    dict_stat = parse.parse_simple(filepath)
    if dict_stat is None:
        # "%s" is required here: "% f" would be read as a float conversion
        print('parse %s failed, maybe invalid file path' % filepath)
        return 1

    # walk the chain and collect every generated word, so that texts
    # shorter than the prefix length are not lost
    words = []
    w1 = w2 = None
    while True:
        w3 = choice(dict_stat[(w1, w2)])
        if w3 is None:
            break
        words.append(w3)
        w1, w2 = w2, w3

    if words:
        print(' '.join(words))
    print('')
    return 0


def usage():
    ''' print the command line synopsis '''
    print("Usage: gen.py <inputfile>")


if __name__ == "__main__":
    # check command arguments
    if len(sys.argv) != 2:
        usage()
        sys.exit(1)
    # propagate the result as the process exit status
    sys.exit(gen_rand_text(sys.argv[1]))
More complete version:
#!/usr/bin/env python
'''
This module offers a class that represents the text model of some input
text files -- TextModel.
'''

from random import choice
import sys

# Names exported by "from TextModel import *"; the self-test helpers are
# left out.
__all__ = ['TextModel']


class TextModel(object):
    '''
    Markov-chain model of one or more texts.

    state_dict  maps a prefix (tuple of prefix_len words) to the list of
                words that followed it; repeated followers are kept, so
                a random pick from the list is weighted by frequency
    prefix_list holds every distinct prefix in order of first appearance
    prefix_len  is the number of words in a prefix
    '''

    def __init__(self, prefix_len=2):
        '''
        Create an empty model.

        prefix_len should be within the range of [1,5], otherwise it's
        meaningless; a value outside that range trips an assertion.
        '''
        assert 1 <= prefix_len <= 5, 'prefix_len should be within [1,5]'
        self.state_dict = {}
        self.prefix_list = []
        self.prefix_len = prefix_len

    def __str__(self):
        return str(self.state_dict)

    def add(self, prefix, suffix):
        ''' add prefix,suffix to state_dict '''
        if prefix in self.state_dict:
            self.state_dict[prefix].append(suffix)
        else:
            self.state_dict[prefix] = [suffix]
            self.prefix_list.append(prefix)

    def getSuffixes(self, prefix):
        ''' return the list of suffixes of prefix (KeyError if unknown) '''
        return self.state_dict[prefix]

    def getRandSuffix(self, prefix):
        ''' return a random suffix recorded for prefix '''
        return choice(self.getSuffixes(prefix))

    def getRandPrefix(self):
        ''' return a random prefix among the distinct known prefixes '''
        return choice(self.prefix_list)

    def parseFile(self, f):
        '''
        parse file to form internal data structures which stores the
        statistical model of this text file

        f should be a file object, or any other iterable that yields the
        lines of the text.  The beginning of the text is marked by None
        entries in the prefix and its end by a None suffix.
        '''
        window = (None,) * self.prefix_len
        for line in f:
            for w in line.split():
                # add to dict
                self.add(window, w)
                # shift by one
                window = window[1:] + (w,)
        self.add(window, None)

    def parseFiles(self, fList):
        ''' parse every file object of fList, one after another '''
        for f in fList:
            self.parseFile(f)


# self-contained tests for this module
def testFileParsing(tm, argv):
    '''
    Feed tm with the files named in argv, or with stdin when argv is
    empty.  Every file opened here is closed again, even on failure.
    '''
    if len(argv) == 0:
        tm.parseFile(sys.stdin)
        sys.stdin.close()
        return
    fList = []
    try:
        for arg in argv:
            fList.append(open(arg))
        tm.parseFiles(fList)
    finally:
        for f in fList:
            f.close()


def testRandPrefixAndSuffix(tm):
    ''' print ten random prefixes, each with one of its random suffixes '''
    for _ in range(0, 10):
        prefix = tm.getRandPrefix()
        suffix = tm.getRandSuffix(prefix)
        print('%s %s' % (prefix, suffix))


if __name__ == "__main__":
    tm = TextModel()
    testFileParsing(tm, sys.argv[1:])
    print(tm)
    testRandPrefixAndSuffix(tm)
#!/usr/bin/env python
'''
This module provides a class -- TextGenerator.
This class reads a given text model and generates output according to
the model.
'''

import sys
from TextModel import TextModel
import re

# compiled once at import time instead of on every call
_SENTENCE_START_RE = re.compile("^[A-Z].*")
_SENTENCE_END_RE = re.compile('.*[.?!]{1}["]{0,1}$')


# class TextGenerator
class TextGenerator(object):
    '''
    Generates random text from a text model according to some settings.

    tm       -- the text model the words are drawn from
    out_file -- file object the generated text is written to
    slen     -- number of sentences to generate
    '''

    def __init__(self, text_model, out_file=sys.stdout, slen=1):
        self.tm = text_model
        self.out_file = out_file
        self.slen = slen

    def setSentenceNum(self, n):
        ''' set the number of sentences genRandText produces '''
        self.slen = n

    def sentence_start(self, word):
        ''' whether word is a start of a sentence '''
        return bool(_SENTENCE_START_RE.match(word))

    def sentence_end(self, word):
        ''' whether word is an end of a sentence '''
        return bool(_SENTENCE_END_RE.match(word))

    def _write_word(self, word):
        '''
        write word and a space; when word ends a sentence also write a
        newline.  Returns True when the sentence was ended.
        '''
        self.out_file.write(word)
        self.out_file.write(" ")
        if self.sentence_end(word):
            self.out_file.write("\n")
            return True
        return False

    def _next_prefix(self, prefix):
        '''
        shift prefix by one word using a random suffix of it; fall back
        to a random prefix when the model has no suffix for it
        '''
        try:
            suffix = self.tm.getRandSuffix(prefix)
        except (KeyError, IndexError):
            return self.tm.getRandPrefix()
        return tuple(prefix)[1:] + (suffix,)

    def genRandText(self):
        '''
        generate random text

        start: when prefix[0] is the start of a sentence
        end:   when slen (num of sentences) are output

        The output is written to the output file, one sentence per line.
        Nothing is written when the model is non-empty but none of its
        prefixes begins with a sentence-starting word, since no sentence
        could ever be started.
        '''
        # A sentence only ever starts on prefix[0]; without a single
        # candidate the loop below would never terminate.
        known = getattr(self.tm, 'prefix_list', None)
        if known and not any(
                p[0] is not None and self.sentence_start(p[0])
                for p in known):
            return

        num = 0
        while num < self.slen:
            prefix = self.tm.getRandPrefix()
            # output one sentence in each inner loop
            in_sentence = False
            while True:
                head = prefix[0]
                # if prefix[0] is None, break out, and choose a new prefix
                # otherwise, the None-headed prefix would have an impact
                # on the statistical model of the input text file
                if head is None:
                    break
                # output the word once a sentence has been started
                if in_sentence or self.sentence_start(head):
                    in_sentence = True
                    if self._write_word(head):
                        in_sentence = False
                        num += 1
                        break
                # get a new prefix
                prefix = self._next_prefix(prefix)
                # check whether the sentence ends
                if prefix[-1] is None or self.sentence_end(prefix[-1]):
                    break
            # output remaining words of a sentence
            if in_sentence:
                terminated = False
                for w in prefix:
                    if w and self._write_word(w):
                        terminated = True
                        break
                if not terminated:
                    # the input ended in the middle of a sentence; still
                    # keep one sentence per output line
                    self.out_file.write("\n")
                num += 1


# self-contained tests
def testTextGenerator(argv):
    '''
    Build a TextModel from the files named in argv (or from stdin when
    argv is empty) and write five random sentences to stdout.
    '''
    tm = TextModel()
    if len(argv) == 0:
        tm.parseFile(sys.stdin)
        sys.stdin.close()
    else:
        fList = []
        try:
            for arg in argv:
                fList.append(open(arg))
            tm.parseFiles(fList)
        finally:
            for f in fList:
                f.close()
    tg = TextGenerator(tm)
    tg.setSentenceNum(5)
    tg.genRandText()


if __name__ == "__main__":
    testTextGenerator(sys.argv[1:])