第八章 自然语言处理-概括数据-数据清洗加去掉常规词语

#!/usr/bin/env python
# _*_ coding:utf-8 _*_
import operator
import re
import string
from collections import OrderedDict
from urllib.request import urlopen

from bs4 import BeautifulSoup


def cleanInput(input):
    """Normalize raw text and split it into a list of cleaned words.

    Runs of newlines become a single space, bracketed numeric citation
    markers such as ``[12]`` are removed, repeated spaces are collapsed and
    every non-ASCII character is dropped. The text is then split on spaces
    and each token has leading/trailing punctuation stripped.

    Args:
        input: The text to clean.

    Returns:
        A list of tokens in their original order. Tokens that are empty
        after stripping are discarded, as are single-character tokens other
        than "a" and "i" (compared case-insensitively).
    """
    # Raw strings keep regex escapes such as \[ from being treated as
    # (invalid) string escape sequences.
    text = re.sub(r'\n+', " ", input)
    text = re.sub(r'\[[0-9]*\]', "", text)
    text = re.sub(r' +', " ", text)
    # Drop every character that has no ASCII representation.
    text = text.encode("ascii", "ignore").decode("ascii")
    words = []
    for token in text.split(' '):
        word = token.strip(string.punctuation)
        # One-character tokens are noise unless they are the English words
        # "a" or "I"; longer tokens are always kept.
        if len(word) > 1 or word.lower() in ('a', 'i'):
            words.append(word)
    return words
def ngrams(input, n):
    """Count how often each n-word phrase occurs in the given text.

    The text is first tokenized with cleanInput(); every window of ``n``
    consecutive tokens is then joined with single spaces and tallied.

    Args:
        input: The raw text to analyse.
        n: Number of consecutive tokens per phrase.

    Returns:
        A dict mapping each phrase to its number of occurrences, keyed in
        the order the phrases were first encountered.
    """
    tokens = cleanInput(input)
    counts = {}
    windowCount = len(tokens) - n + 1
    for start in range(windowCount):
        phrase = " ".join(tokens[start:start + n])
        counts[phrase] = counts.get(phrase, 0) + 1
    return counts
# Common English words used to filter out uninformative n-grams.
# Built once at import time as a frozenset so each lookup is a hash probe
# instead of rebuilding and scanning a list on every call.
_COMMON_WORDS = frozenset([
    "the", "be", "and", "of", "a", "in", "to", "have", "it",
    "i", "that", "for", "you", "he", "with", "on", "do", "say", "this",
    "they", "is", "an", "at", "but", "we", "his", "from", "that", "not",
    "by", "she", "or", "as", "what", "go", "their", "can", "who", "get",
    "if", "would", "her", "all", "my", "make", "about", "know", "will",
    "as", "up", "one", "time", "has", "been", "there", "year", "so",
    "think", "when", "which", "them", "some", "me", "people", "take",
    "out", "into", "just", "see", "him", "your", "come", "could", "now",
    "than", "like", "other", "how", "then", "its", "our", "two", "more",
    "these", "want", "way", "look", "first", "also", "new", "because",
    "day", "more", "use", "no", "man", "find", "here", "thing", "give", "many",
    "well",
])


def isCommon(ngram):
    """Return True if the given word is in the common-word list.

    Args:
        ngram: A single word; it is lower-cased before the lookup, so the
            comparison is case-insensitive.

    Returns:
        True when the lower-cased word is one of the common words,
        otherwise False.
    """
    return ngram.lower() in _COMMON_WORDS
# Download the speech text; the context manager closes the HTTP response.
with urlopen("https://pythonscraping.com/files/inaugurationSpeech.txt") as response:
    content = str(response.read(), 'utf-8')

# Count every 2-gram. The result gets its own name so that the ngrams()
# function is not overwritten by its return value.
bigramCounts = ngrams(content, 2)

# Keep only the 2-grams in which no word is a common word.
nagramsPicked = {
    phrase: count
    for phrase, count in bigramCounts.items()
    if not any(isCommon(word) for word in phrase.split(" "))
}

# Sort the *filtered* 2-grams by descending frequency; sorting the raw
# counts here would discard the common-word filtering done above.
sortedNagrams = sorted(nagramsPicked.items(), key=operator.itemgetter(1), reverse=True)
print(sortedNagrams)

 

你可能感兴趣的:(python网络数据采集)