第七章7.1 数据清洗--将从网站上爬去的数据进行清洗然后转为2-grams序列输出

#!/usr/bin/env python
# _*_ coding:utf-8 _*_
import re
import string
from collections import OrderedDict
from urllib.request import urlopen

from bs4 import BeautifulSoup


def cleanInput(input):
    input= re.sub('\n+'," ",input)
    input=re.sub('\[[0-9]*\]',"",input)
    input=re.sub(' +'," ",input)
    input=bytes(input,"UTF-8")
    input=input.decode("ascii","ignore")
    cleanInput=[]
    input=input.split(' ')
    for item in input:
        item=item.strip(string.punctuation)
        if len(item)>0 or (item.lower()=='a' or item.lower()=='t'):
            cleanInput.append(item)
    return cleanInput
def ngrams(input ,n):
    input=cleanInput(input)
    output=[]
    outputNew=[]
    for i in range(len(input)-n+1):
        output.append(str(input[i:i+n]))
    setout=set(output)
    for item in setout:
        outputNew.append((item,output.count(item)))
    return outputNew
html=urlopen("https://en.wikipedia.org/wiki/Python_(programming_language)")
bsObj=BeautifulSoup(html,"html.parser")
content=bsObj.find("div",{"id":"mw-content-text"}).get_text()
ngrams=ngrams(content,2)
ngrams=OrderedDict(sorted(ngrams,key=lambda t: t[1],reverse=True))
print(ngrams)
print("2-ngrams count is "+str(len(ngrams)))

 

你可能感兴趣的:(python网络数据采集)