以爬取豆瓣电影排行榜页面上各部电影的详细信息为例进行说明:
代码如下:
import requests
from bs4 import BeautifulSoup
import re
from lxml import etree
import codecs
import csv
class FirstCrawl():
    """Crawl every movie on the Douban movie chart page and export the details to a CSV file.

    Workflow: ``GetContent`` downloads the chart page, ``GetMovieProfile`` follows
    each movie's detail link, scrapes name/credits/ratings, and the collected rows
    are written to ``self.CsvFileName``.
    """

    def __init__(self):
        # Browser-like User-Agent so Douban does not reject the request outright.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',
        }
        self.url = 'https://movie.douban.com/chart'
        # NOTE(review): this hard-coded free proxy is almost certainly dead;
        # replace it with a working proxy or drop the `proxies=` argument.
        self.proxies = {'http': '171.14.90.249:30733'}
        # Output CSV file name.
        self.CsvFileName = "movie_data1.csv"
        # Accumulated CSV rows, one list per movie.
        self.CsvData = []
        # Counter of movies crawled so far (for progress messages).
        self.num = 0

    def GetMovieProfile(self, content):
        """Parse the chart page HTML in *content*, fetch each movie's detail page
        and append one row of details per movie to ``self.CsvData``.
        """
        chart_soup = BeautifulSoup(content, 'lxml')
        # Poster anchors on the chart page; their href is the movie detail URL.
        img_list = chart_soup.select('.item > td > .nbg')
        for img in img_list:
            self.num += 1
            print('现在开始爬取第%d个电影的详细信息...' % self.num)
            movie_href = img['href']
            # timeout keeps a dead host from hanging the whole crawl.
            r = requests.get(movie_href, headers=self.headers, timeout=10)
            # Use a distinct name so the chart-page soup is not shadowed.
            soup = BeautifulSoup(r.text, 'lxml')
            tree = etree.HTML(r.text)
            # Movie title (whitespace/newlines squeezed out).
            movie_name = tree.xpath('//div[@id="content"]/h1[1]//text()')
            movie_name = ''.join(movie_name).strip('\n ').replace('\n', '').replace(' ', '')
            # The #info block holds director / writer / cast / genre spans.
            movie_all_info = soup.select('#info')
            movie_base_info = movie_all_info[0].select('span')
            # Director.
            movie_direction = movie_base_info[0].select('.attrs')[0].text
            # Screenwriter(s): text nodes after the "编剧" label span.
            movie_scriptwrite = tree.xpath('//div[@id="info"]/span[2]//text()')[2:]
            movie_scriptwrite = ''.join(movie_scriptwrite).strip('\n ').replace('\n', '').replace(' ', '')
            # Leading actors.
            movie_actor = movie_base_info[2].text
            # Genres, joined with trailing '/' separators.
            genre_spans = movie_all_info[0].find_all('span', property="v:genre")
            movie_type = ''.join(span.text + '/' for span in genre_spans)
            # Bare text nodes of #info: country, language, aka, ... (order-dependent).
            section_list = tree.xpath('//div[@id="info"]/text()')
            movie_section_info = [
                sec for sec in (s.strip(' \n /') for s in section_list) if sec
            ]
            # Country/region of origin.
            birth_palace = movie_section_info[0]
            # Spoken language.
            movie_language = movie_section_info[1]
            # Initial release date.
            show_date = movie_all_info[0].find('span', property='v:initialReleaseDate').text
            # Runtime.
            running_time = movie_all_info[0].find('span', property='v:runtime').text
            # Alternative title; some movies have none, so fall back to '无'.
            try:
                other_name = movie_section_info[2]
            except IndexError:
                other_name = '无'
            # Overall rating (out of 10).
            sum_rating = soup.select('.clearfix > .rating_num')[0].text
            # Number of people who rated the movie.
            sum_person = soup.select('.rating_people > span')[0].text + '人评价过'
            # Percentage of voters per star level, 5 stars down to 1 star.
            stars_list = soup.select('.ratings-on-weight > .item')
            stars5, stars4, stars3, stars2, stars1 = (
                item.find('span', class_='rating_per').text for item in stars_list[:5]
            )
            # Assemble one CSV row for this movie.
            self.CsvData.append([
                movie_name,
                movie_direction,
                movie_scriptwrite,
                movie_actor,
                movie_type,
                birth_palace,
                movie_language,
                show_date,
                running_time,
                other_name,
                sum_rating,
                sum_person,
                stars5,
                stars4,
                stars3,
                stars2,
                stars1,
            ])
            print('结束爬取第%d个电影的详细信息...' % self.num)

    def GetContent(self):
        """Download the chart page, crawl every listed movie, and write the CSV file."""
        try:
            r = requests.get(self.url, headers=self.headers,
                             proxies=self.proxies, timeout=10)
            self.GetMovieProfile(r.text)
            # newline='' is required by the csv module; without it Windows
            # output gets a blank line between every record.
            with open(self.CsvFileName, 'w', encoding='utf8', newline='') as csvfile:
                spamwriter = csv.writer(csvfile, dialect='excel')
                # Header row.
                spamwriter.writerow(["电影名称","导演","编剧","主演","类型","产出地","语种","上映日期","放映时长","又名","总评分(十分制)","总评分人数","力荐这部电影人数的百分比(五星)","推荐这部电影人数的百分比(4星)","觉得这部电影还行的人数的百分比(3星)","觉得这部电影较差的人数的百分比(2星)","觉得这部电影很差的人数的百分比(1星)"])
                # Data rows collected by GetMovieProfile.
                spamwriter.writerows(self.CsvData)
            print("成功导出CSV文件!")
        except Exception as e:
            # Top-level boundary: report the actual error instead of a
            # misleading hard-coded "404" message.
            print("Error: %s" % e)
#实例化类
fc = FirstCrawl()
#执行获取内容的方法
fc.GetContent()