Python读书笔记009:文本统计

文本文件的统计数据:

>>> s = ' A long time ago, in a galaxy far, far away...'
>>> len(s)
46
>>> s.split()
['A', 'long', 'time', 'ago,', 'in', 'a', 'galaxy', 'far,', 'far', 'away...']
>>> t = ' a long time ago in a galaxy far far away'
>>> t.split()
['a', 'long', 'time', 'ago', 'in', 'a', 'galaxy', 'far', 'far', 'away']
>>> len(t.split())
10
>>> set(t.split())
{'in', 'away', 'ago', 'far', 'a', 'galaxy', 'time', 'long'}
>>> len(set(t.split()))
8

保留想要的字母

将字符串转换成小写:

>>> s = "I'd like a copy!"
>>> s.lower()
"i'd like a copy!"

删除不想要的字符:


>>> s = "I'd like a copy!"
>>> s.replace('!','')
"I'd like a copy"
>>> s.replace("'",'')
'Id like a copy!'
>>> s.replace("'",' ')
'I d like a copy!'

# Characters that normalize() lets through: the 26 lowercase ASCII
# letters plus the space, the hyphen and the apostrophe.
keep = set("abcdefghijklmnopqrstuvwxyz -'")

def normalize(s):
    '''
    Convert s to a normalized string.

    The result is s converted to lowercase, with every character
    that is not a member of the module-level set keep removed.
    '''
    # str.join assembles the result in a single pass instead of
    # growing a string with += once per kept character.
    return ''.join(c for c in s.lower() if c in keep)
>>> s = "I'd like a copy!"
>>> normalize(s)
"i'd like a copy"

文本统计:

# Characters that normalize() lets through: the 26 lowercase ASCII
# letters plus the space, the hyphen and the apostrophe.
keep = set("abcdefghijklmnopqrstuvwxyz -'")

def normalize(s):
    '''
    Convert s to a normalized string.

    The result is s converted to lowercase, with every character
    that is not a member of the module-level set keep removed.
    '''
    # str.join assembles the result in a single pass instead of
    # growing a string with += once per kept character.
    return ''.join(c for c in s.lower() if c in keep)

def make_freq_dict(s):
    '''
    Count how often each word occurs in s.

    s is first passed through normalize() and then split on
    whitespace. Returns a dictionary mapping each resulting
    word to the number of times it appears.
    '''
    counts = {}
    for word in normalize(s).split():
        # A word seen for the first time starts from 0.
        counts[word] = counts.get(word, 0) + 1
    return counts

def print_file_stats(fname):
    '''
    Print statistics for the given file.

    Reads the whole file at path fname and prints its number of
    characters, its number of lines (counted as newline characters),
    its number of words as counted by make_freq_dict(), and up to
    ten of its most frequent words.
    '''
    # The with-block closes the file even if read() raises.
    with open(fname, 'r') as f:
        s = f.read()
    num_chars = len(s)
    num_lines = s.count('\n')
    d = make_freq_dict(s)
    num_words = sum(d.values())

    # (count, word) pairs ordered from the highest count down; equal
    # counts are ordered by word, descending.
    lst = sorted(((d[w], w) for w in d), reverse=True)

    # Report the fname parameter itself rather than any global name.
    print("The file '%s' has" % fname)
    print("    %s characters" % num_chars)
    print("    %s lines"      % num_lines)
    print("    %s words"      % num_words)
    print("\nThe top 10 most frequent words are:")

    for i, (count, word) in enumerate(lst[:10], start=1):
        print('%2s. %2s %s' % (i, count, word))





>>> frame="e://Python//The Babes.txt"
>>> print_file_stats(frame)
The file 'e://Python//The Babes.txt' has
    148319 characters
    3118 lines
    23817 words

The top 10 most frequent words are:
 1. 1253 the
 2. 746 and
 3. 675 to
 4. 657 of
 5. 496 her
 6. 436 a
 7. 383 in
 8. 352 she
 9. 261 you
10. 259 daph












































































你可能感兴趣的:(Python读书笔记009:文本统计)