Crawling News with a Web Scraper and Generating a Word Cloud

The functions written to crawl Douban can be reused to crawl news. The core of the work is extracting content with regular expressions and then crawling and parsing every link that the first pass turns up; different positions on the page are crawled repeatedly in the same way.

import re  # regular expressions
from bs4 import BeautifulSoup  # HTML parsing
import urllib.request, urllib.error  # build the request for a URL and fetch the page




def main():
    baseurl = "       "  # address of the news site chosen for crawling (fill in)
    getData(baseurl)  # run the crawler
    # then generate the word cloud



findlink = re.compile(r'<a href="(.*?)"')  # link to each article page (typical pattern; check the real markup with F12)
findword = re.compile(r'<p>(.*?)</p>')  # article body paragraphs (typical pattern; check the real markup with F12)


def getData(baseurl):  # crawl the index page and every article it links to
    links = []
    url = baseurl
    html = askURL(url)
    # print(html)
    # parse the index page
    soup = BeautifulSoup(html, "html.parser")
    # print(soup)
    try:
        for item in soup.find_all('li', class_="clearfix"):  # class names differ from site to site
            data = []
            item = str(item)
            link = str(re.findall(findlink, item)[0])
            html = askURL(link)
            Soup = BeautifulSoup(html, "html.parser")
            Soup = str(Soup)
            word = str(re.findall(findword, Soup))
            word = str(re.findall(r'[\u4e00-\u9fa5]', word))  # keep only the Chinese characters
            word = word.replace("'", '')
            word = word.replace(",", '')
            word = str(word.replace(" ", ''))
            with open("新闻.txt", "a+") as f:  # append to the txt file
                f.write(word)
            links.append(data)

        for item1 in soup.find_all('div', class_="list-focus"):  # second section of the page
            data1 = []
            item1 = str(item1)
            link = str(re.findall(findlink, item1)[0])
            html = askURL(link)
            Soup1 = BeautifulSoup(html, "html.parser")  # parse the featured items
            Soup1 = str(Soup1)
            word = str(re.findall(findword, Soup1))
            word = str(re.findall(r'[\u4e00-\u9fa5]', word))  # keep only the Chinese characters
            word = word.replace("'", '')
            word = word.replace(",", '')
            word = str(word.replace(" ", ''))
            with open("新闻.txt", "a+") as f:  # append to the txt file
                f.write(word)
            links.append(data1)

        for i in range(0, 300):  # repeatedly re-request the index page
            url = baseurl
            html = askURL(url)

        for item2 in soup.find_all('li', class_="active"):  # third section of the page
            data2 = []
            item2 = str(item2)
            link = str(re.findall(findlink, item2)[0])
            html = askURL(link)
            Soup2 = BeautifulSoup(html, "html.parser")  # parse the article page
            Soup2 = str(Soup2)
            word = str(re.findall(findword, Soup2))
            word = str(re.findall(r'[\u4e00-\u9fa5]', word))  # keep only the Chinese characters
            word = word.replace("'", '')
            word = word.replace(",", '')
            word = str(word.replace(" ", ''))
            with open("新闻.txt", "a+") as f:  # append to the txt file
                f.write(word)
            links.append(data2)
    except Exception as e:
        print(e)
    return


def askURL(url):
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.42"
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html


if __name__ == "__main__":
    main()
    print("Crawling finished")

In this script I crawl the news elements at several different positions on the page, calling the parsing routine once for each block; everything else follows the usual pattern.

The head dictionary first disguises the request as an ordinary browser via the User-Agent header; then the urllib functions fetch the page, with the failure cases reported in the except branch. The regular expressions are worked out by inspecting the page with F12 (the browser developer tools) and trimming away the parts that are not needed.
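As a minimal, self-contained illustration of how such F12-derived patterns behave, the snippet below runs the two regexes over invented HTML fragments; the real tags and class names must be read off the target site.

import re

# hypothetical fragment of an index page, shaped like the <li class="clearfix"> items above
sample_item = '<li class="clearfix"><a href="https://example.com/news/123.html">标题</a></li>'
sample_article = '<p>这是正文的第一段。</p><p>这是第二段。</p>'

findlink = re.compile(r'<a href="(.*?)"')   # pulls the article URL out of the list item
findword = re.compile(r'<p>(.*?)</p>')      # pulls each paragraph out of the article page

print(re.findall(findlink, sample_item))     # ['https://example.com/news/123.html']
print(re.findall(findword, sample_article))  # ['这是正文的第一段。', '这是第二段。']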

The word-cloud generation comes next.

import re
import jieba
from wordcloud import WordCloud
import cv2 as cv  # OpenCV, used only to read the mask image


def analysis(savepath):
    f = open(savepath, 'r', encoding='gbk')
    result = f.read()
    result = re.sub('[a-zA-Z0-9"#$%&\'()*+,-./::;""()<=>?@,。?、…【】《》?![\\]^_`{|}~\s]+', '', result)  # strip letters, digits, punctuation and line breaks
    
    words = jieba.lcut(result)
    string = []
    for word in words:
        if len(word) > 1:  # drop single-character tokens
            string.append(word)

    strings = ' '.join(string)
    mk = cv.imread(' ')  # path of the mask image that gives the cloud its shape (fill in)
    w = WordCloud(font_path=r"C:\Windows\Fonts\simhei.ttf", background_color="Red", width=1000, height=600, mask=mk,
                  max_words=500, colormap="autumn",
                  stopwords={' '})  # in order: the font to use (any file from the Fonts folder works), background colour, canvas width and height, mask shape, maximum number of words, colormap for the word colours, and stopwords
    w.generate(strings)
    w.to_file('wordcloud.jpg')  # write the word-cloud image to disk
    image = w.to_image()
    image.show()


def main():
    savepath = "新闻2.txt"
    analysis(savepath)
    print("finish")


if __name__ == "__main__":
    main()

jieba performs the word segmentation, the loop filters the tokens and joins them back into one string, and the WordCloud call (commented in the code above) turns that string into the word-cloud image. That is roughly the whole flow, although the crawling could be simplified: all of the news items share a common pattern on the index page, so they could be collected in a single pass instead of section by section.
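A rough sketch of that simplification, reusing the askURL helper and findword pattern from the crawler above and assuming the shared pattern is simply that every news entry is an <a> tag pointing at an article URL (the /news/...html filter here is invented and would need adjusting to the real site):

import re
from bs4 import BeautifulSoup

def getDataSimple(baseurl):
    """Single-pass variant: collect every article link on the index page at once."""
    soup = BeautifulSoup(askURL(baseurl), "html.parser")
    seen = set()
    for a in soup.find_all('a', href=True):
        link = a['href']
        # hypothetical filter: keep only links that look like article pages
        if link in seen or not re.search(r'/news/.*\.html$', link):
            continue
        seen.add(link)
        article = askURL(link)
        paragraphs = re.findall(findword, article)  # findword as defined in the script above
        word = ''.join(re.findall(r'[\u4e00-\u9fa5]', ''.join(paragraphs)))  # keep only Chinese characters
        with open("新闻.txt", "a+") as f:
            f.write(word)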

stopwords={' '} chooses the words that should not appear in the word cloud.
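For example, filler words that would otherwise dominate the cloud can be dropped by listing them there; the words below are purely illustrative.

from wordcloud import WordCloud

# illustrative stopword set: these tokens will never be drawn in the cloud
w = WordCloud(font_path=r"C:\Windows\Fonts\simhei.ttf",
              stopwords={'记者', '报道', '表示'})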
