# Casual jottings (闲来随笔)
from urllib import request
from bs4 import BeautifulSoup
from wordcloud import WordCloud
import jieba
import matplotlib.pyplot as plotShow
def replace(content):
    """Return *content* with every space character (``' '``) removed.

    Only the plain space is dropped; tabs, newlines and any other
    whitespace are kept as they are.

    Args:
        content: A string (or any iterable of strings) to filter.

    Returns:
        A new ``str`` made of the items of *content* that are not ``' '``.
    """
    # A single join builds the result in one pass and avoids a local
    # accumulator named ``str`` that would shadow the builtin.
    return "".join(ch for ch in content if ch != ' ')
def pythonpScrap1Camouflage():
    """Scrape the Douban Top 250 page and dump its text into ``wordCount.txt``.

    The page is requested with a browser-like ``User-Agent`` header, decoded
    as UTF-8 and parsed with BeautifulSoup.  For every ``div.item`` entry the
    following pieces are written back-to-back (no separators) to
    ``wordCount.txt`` (UTF-8, truncated on each run):

    * the ``href`` of the link and the ``src`` of the image inside ``div.pic``;
    * the text of the ``<em>`` tag inside ``div.pic``;
    * the first child of the first three ``<span>`` tags inside ``div.hd``;
    * the first child of the first ``<p>`` inside ``div.bd``;
    * the first child of the second ``<span>`` inside ``div.star``;
    * the text of ``p.quote`` when the entry has one.

    The last three are passed through ``replace`` to strip spaces.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
    """
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:83.0) Gecko/20100101 Firefox/83.0"
    }
    url = "https://movie.douban.com/top250?format=text"
    req = request.Request(url, headers=head)

    # Release the HTTP connection as soon as the body has been read.
    with request.urlopen(req) as res:
        html = res.read().decode("utf-8")
    soup = BeautifulSoup(html, "html.parser")

    # The context manager closes the file even if parsing an entry raises.
    with open('wordCount.txt', 'w+', encoding='utf-8') as fileWrite:
        for tag in soup.find_all("div", class_="item"):
            pic = tag.find("div", class_="pic")
            link = pic.find('a')
            fileWrite.write(link.get('href'))
            fileWrite.write(link.find("img")["src"])
            fileWrite.write(pic.find('em').get_text())

            info = tag.find("div", class_="info")

            # Header: first span, then the second and third spans concatenated.
            spans = info.find("div", class_="hd").find_all("span")
            fileWrite.write(spans[0].contents[0])
            fileWrite.write(spans[1].contents[0] + spans[2].contents[0])

            body = info.find("div", class_="bd")
            fileWrite.write(replace(body.find_all("p")[0].contents[0]))

            star_spans = body.find("div", class_="star").find_all("span")
            fileWrite.write(replace(star_spans[1].contents[0]))

            # ``find`` returns None when an entry has no quote paragraph;
            # skip it instead of failing on ``None.get_text()``.
            quote = body.find("p", class_="quote")
            if quote is not None:
                fileWrite.write(replace(quote.get_text()))
def pythonpScrap2wordcloud1():
    """Render a word cloud from the text stored in ``wordCount.txt``.

    The file is read as UTF-8, segmented with ``jieba.cut`` and the tokens
    are joined with single spaces before being handed to ``WordCloud``.
    The resulting image is saved as ``wordClound.png`` and then displayed
    in a matplotlib figure with the axes hidden.
    """
    # Read through a context manager so the file handle is always closed.
    with open(r'wordCount.txt', "r", encoding='utf-8') as source:
        text = source.read()

    result = " ".join(jieba.cut(text))

    wc = WordCloud(
        # Windows-specific font location; NOTE(review): presumably chosen so
        # that Chinese glyphs can be rendered - confirm on other platforms.
        font_path="C:/Windows/Fonts/STXINGKA.TTF",
        background_color='white',
        width=500,
        height=350,
        max_font_size=50,
        min_font_size=10,
        mode='RGBA',
        colormap='pink',
    )
    wc.generate(result)
    wc.to_file(r'wordClound.png')

    plotShow.figure("jay")
    plotShow.imshow(wc)
    plotShow.axis("off")
    plotShow.show()
if __name__ == "__main__":
    # Run the pipeline in order: the scraper writes wordCount.txt, which the
    # word-cloud step then reads.
    for step in (pythonpScrap1Camouflage, pythonpScrap2wordcloud1):
        step()