import requests
from bs4 import BeautifulSoup
url = 'https://movie.douban.com/cinema/nowplaying/xian/'
# 1) Fetch the page
response = requests.get(url)
content = response.text
# print(content)
# 2) Parse the page and pull out each movie's id and title
soup = BeautifulSoup(content, 'lxml')
# Every movie on the page sits in an <li> tag with class "list-item"
nowplaying_movie_list = soup.find_all('li', class_='list-item')
# Collect the movies as [{'title': '...', 'id': '...'}, ...]
movies_info = []
# Walk the <li> tags one by one and extract the attributes we need
for item in nowplaying_movie_list:
    nowplaying_movie_dict = {}
    # item['data-title'] reads the value of the data-title attribute on the <li> tag
    nowplaying_movie_dict['title'] = item['data-title']
    nowplaying_movie_dict['id'] = item['id']
    nowplaying_movie_dict['actors'] = item['data-actors']
    nowplaying_movie_dict['director'] = item['data-director']
    # Append the {'title': ..., 'id': ...} dict to the list
    movies_info.append(nowplaying_movie_dict)
print(movies_info)
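
# Hedged variant of the fetch above (assumptions, not from the original: Douban may
# reject the default requests User-Agent, and the currently-showing list is assumed
# to sit inside a div with id "nowplaying", which keeps the "upcoming" list out).
def fetch_nowplaying(city='xian'):
    headers = {'User-Agent': 'Mozilla/5.0'}
    resp = requests.get('https://movie.douban.com/cinema/nowplaying/%s/' % city,
                        headers=headers)
    soup = BeautifulSoup(resp.text, 'lxml')
    section = soup.find('div', id='nowplaying') or soup  # fall back to the whole page
    return [{'title': li['data-title'], 'id': li['id']}
            for li in section.find_all('li', class_='list-item')]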
import threading
import requests
from bs4 import BeautifulSoup
# 1) Crawl one page of a movie's comment listing
def getOnePageComment(id, pageNum):
    # Douban paginates comments 20 per page via the start query parameter
    start = (pageNum - 1) * 20
    url = 'https://movie.douban.com/subject/%s/comments?start=%s&limit=20' % (id, start)
    # 2) Download the page that holds the comments
    content = requests.get(url).text
    # 3) Parse it with bs4
    soup = BeautifulSoup(content, 'lxml')
    # Inspecting the page shows every comment lives in a <span class="short"> tag
    commentsList = soup.find_all('span', class_='short')
    pageComments = ""
    # Concatenate the text of every comment span into pageComments
    for commentTag in commentsList:
        pageComments += commentTag.text
    print('%s page' % (pageNum))
    global comments
    comments += pageComments
# 2) Crawl the first 10 pages of comments for one movie
id = '30163509'  # the movie 飞驰人生 (Pegasus)
comments = ''
threads = []
# Loop once per page: fetch as many pages as you want comments for
for pageNum in range(1, 11):  # pages 1, 2, ..., 10
    # Start one thread per page of comments
    t = threading.Thread(target=getOnePageComment, args=(id, pageNum))
    threads.append(t)
    t.start()
# Wait for every worker thread to finish before the main thread carries on
_ = [thread.join() for thread in threads]
print("done")
with open("%s.txt" % (id), 'w', encoding='utf-8') as f:
    f.write(comments)
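
# A thread-safe sketch (not the original approach): collect each page's text in a
# list guarded by a threading.Lock instead of appending to a global string; the
# comments URL and 20-comments-per-page pagination are assumptions.
lock = threading.Lock()
page_texts = []

def getOnePageCommentSafe(movie_id, pageNum):
    start = (pageNum - 1) * 20
    url = 'https://movie.douban.com/subject/%s/comments?start=%s&limit=20' % (movie_id, start)
    soup = BeautifulSoup(requests.get(url).text, 'lxml')
    text = "".join(tag.text for tag in soup.find_all('span', class_='short'))
    with lock:
        page_texts.append(text)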
import re
import wordcloud
import jieba

# 1) Clean the crawled comments: strip commas, periods, emoji and the like,
#    keeping only Chinese characters and English letters
with open('./30163509.txt', encoding='utf-8') as f:
    comments = f.read()
# Done with a regular expression
pattern = re.compile(r'([\u4e00-\u9fa5]+|[a-zA-Z]+)')
deal_comments = re.findall(pattern, comments)
newComments = ""
for item in deal_comments:
    newComments += item
print(newComments)
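
# The same cleanup as above in one line: join every matched Chinese/English run directly.
newComments = "".join(re.findall(r'[\u4e00-\u9fa5]+|[a-zA-Z]+', comments))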
# Built with the jieba module (segments Chinese text against a word dictionary)
# and the wordcloud module (draws the word cloud).
import jieba
import wordcloud

text = '马云曾公开表态称对钱没兴趣称其从来没碰过钱上了微博热搜但近来又说他对花钱很有兴趣'
# Load a user dictionary so the words listed in the file are kept as whole tokens
jieba.load_userdict('./doc/newWord')
# 1) Segment the Chinese text: lcut returns a list, cut returns a generator
result = jieba.lcut(text)
print("Segmentation result:", result)
# 2) Draw the word cloud
wc = wordcloud.WordCloud(
    background_color='snow',
    font_path='./font/msyh.ttf',  # a font with Chinese glyphs is needed for Chinese text
    min_font_size=5,              # smallest font size in the image
    max_font_size=15,             # largest font size in the image
    width=200,                    # width of the generated image
)
wc.generate(",".join(result))
wc.to_file('./doc/douban.png')
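
# Alternative sketch: pass explicit word frequencies instead of a joined string,
# which bypasses WordCloud's own tokenisation of the input text (douban_freq.png
# is a hypothetical output path).
from collections import Counter
wc.generate_from_frequencies(Counter(result))
wc.to_file('./doc/douban_freq.png')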
import jieba
import wordcloud
import numpy as np
# In Python 3 install pillow for image handling (the old Python 2 "Image" package does not apply)
from PIL import Image

# 1) Segment the Chinese text: lcut returns a list, cut returns a generator
result = jieba.lcut(open('./30163509.txt', encoding='utf-8').read())
# 2) Open the mask image: the cloud is drawn in the shape of this picture
imageObj = Image.open('./doc/mao.jpg')
cloud_mask = np.array(imageObj)
# 3) Draw the word cloud
wc = wordcloud.WordCloud(
    mask=cloud_mask,
    background_color='red',
    font_path='./font/msyh.ttf',  # a font with Chinese glyphs is needed for Chinese text
    min_font_size=5,              # smallest font size in the image
    max_font_size=50,             # largest font size in the image
    width=500,                    # width of the generated image
)
wc.generate(",".join(result))
wc.to_file('./doc/douban.png')
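
# Optional cleanup sketch: dropping one-character tokens before generating usually
# removes most particles and stray punctuation from the Chinese cloud
# (douban_filtered.png is a hypothetical output path).
filtered = [w for w in result if len(w.strip()) > 1]
wc.generate(" ".join(filtered))
wc.to_file('./doc/douban_filtered.png')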
from bs4 import BeautifulSoup
from urllib.request import urlopen

def get_html(url):
    # The Baidu Top (top.baidu.com) page is served with gb2312 encoding
    a = urlopen(url).read().decode('gb2312')
    return a

def get_info(text):
    soup = BeautifulSoup(text, 'html5lib')
    # Each hot-search entry is an <a> tag with class "list-title"
    info_li = soup.find_all('a', class_='list-title')
    # Keep the titles of the first ten entries
    news_li = [info_li[i].string for i in range(10)]
    return news_li

def main():
    url = 'http://top.baidu.com/buzz?b=341'
    new_li = get_info(get_html(url))
    for i in new_li:
        print(i)

main()
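
# A more tolerant fetch (sketch): gb2312 is a subset of gbk, so decoding as gbk with
# errors='replace' avoids a UnicodeDecodeError on the occasional stray byte.
def get_html_tolerant(url):
    return urlopen(url).read().decode('gbk', errors='replace')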