Straight to the code:
import xlrd
import requests
from lxml import etree
import random
import jsonpath
import json
import xlwt
import time
import datetime
import csv
import pandas as pd
Get the list of movie titles:
def get_movie_list(parse):
    file = '电影票房.xls'
    wb = xlrd.open_workbook(filename=file)  # open the workbook
    sheet1 = wb.sheet_by_index(0)  # get the sheet by index
    # print(sheet1.name, sheet1.nrows, sheet1.ncols)
    # rows = sheet1.row_values()  # read a whole row
    movie_list = sheet1.col_values(0, 1, parse)  # read the first column, starting from the second row
    return movie_list
Build the search URL for each movie (used to look up its ID):
def get_url_list(parse):
    file = '电影票房.xls'
    wb = xlrd.open_workbook(filename=file)  # open the workbook
    sheet1 = wb.sheet_by_index(0)  # get the sheet by index
    # print(sheet1.name, sheet1.nrows, sheet1.ncols)
    # rows = sheet1.row_values()  # read a whole row
    movie_list = sheet1.col_values(0, 1, parse)  # read the first column, starting from the second row
    url = 'https://maoyan.com/query?kw='
    url_list = []
    for i in movie_list:
        url_list.append(url + i)
    # print(url_list)
    return url_list
Extract each movie's ID:
def get_id(url_list):
    id_list = []
    for url in url_list:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36', }
        response = requests.get(url=url, headers=headers)
        # print(response)
        content = response.text
        # print(content)
        html = etree.HTML(content)
        node = html.xpath('//div[@class="movie-item"]/a[@href]')[0]  # first search hit
        movie_id = node.attrib['href'].replace('/films/', '')
        print(movie_id)
        id_list.append(movie_id)
    return id_list
Build the comment-API URL for each movie:
def grt_URL_list(id_list):
    URL = 'http://m.maoyan.com/mmdb/comments/movie/{}.json?v=yes&offset='
    URL_list = []
    for id in id_list:
        URL_list.append(URL.format(id))
    return URL_list
def get_ip():
    # load proxy IPs from a spreadsheet
    file = '代理.xls'
    data = xlrd.open_workbook(filename=file)
    sheet1 = data.sheet_by_index(0)  # get the sheet by index
    # print(sheet1.name, sheet1.nrows, sheet1.ncols)
    ip_list = sheet1.col_values(0, 0)  # read the first column, starting from the first row
    proxies_list = []
    for i in ip_list:
        P = {'http': 'http://' + i}
        proxies_list.append(P)
    return proxies_list
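get_ip() is never actually called below (the random.choice(proxies_list) lines are commented out). A minimal sketch of how its return value would plug into requests, assuming 代理.xls exists and holds one ip:port string per row:

proxies_list = get_ip()                # e.g. [{'http': 'http://1.2.3.4:8080'}, ...]
proxies = random.choice(proxies_list)  # pick a different proxy for each request
r = requests.get('https://maoyan.com', proxies=proxies, timeout=5)
print(r.status_code)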
Example comment-API URLs (the startTime query parameter is a URL-encoded timestamp):
http://m.maoyan.com/mmdb/comments/movie/344264.json?v=yes&offset=&startTime=2019-10-1%2023%3A59%3A59
http://m.maoyan.com/mmdb/comments/movie/1211270.json?v=yes&offset=0&startTime=2019-11-24 20:25:01
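The %20 and %3A sequences are just URL-encoded spaces and colons. The code below does this by hand with .replace(' ', '%20'); urllib.parse.quote gives the same result, as in this small sketch (the movie id 344264 is taken from the example URL above):

from urllib.parse import quote

movie_id = '344264'
start_time = '2019-10-1 23:59:59'
Url = 'http://m.maoyan.com/mmdb/comments/movie/{}.json?v=yes&offset=0&startTime={}'.format(
    movie_id, quote(start_time))
print(Url)  # ...startTime=2019-10-1%2023%3A59%3A59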
Fetch the comments and write them to Excel:
def get_comment(URL_list, movie_list):
    now_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S').replace(' ', '%20')
    a = 0
    for url in URL_list:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36', }
        comment = [None] * 7
        file = movie_list[a] + '.xls'
        a += 1
        # create an Excel workbook for this movie
        book = xlwt.Workbook()
        sheet1 = book.add_sheet(file, cell_overwrite_ok=True)
        # the columns to save
        row0 = ['nickName', 'startTime', 'gender', 'userLevel', 'cityName', 'score', 'content']
        # write the header row
        for l in range(len(row0)):
            sheet1.write(0, l, row0[l])
        # page through the comments
        for i in range(1, 10):
            print('downloading comment page {}'.format(i))
            # proxies = random.choice(proxies_list)
            time.sleep(0.5)
            try:
                Url = url + str(i * 15) + '&startTime={}'.format(now_time)
                print(Url)
                r = requests.get(url=Url, headers=headers, timeout=5)
                # print(r.text)
            except:
                print('request failed')
                continue
            # check the status code: extract the data if OK, otherwise skip this page
            if r.status_code == 200:
                # print(r.text)
                try:
                    soup = json.loads(r.text)['cmts']
                    # take the startTime of the last comment on this page; the next request
                    # starts from that timestamp, which is how the offset limit is worked around
                    now_time = soup[-1]['startTime'].replace(' ', '%20')
                    # print(soup)
                    j = 0
                    # save the data
                    for cmt in soup:
                        j += 1
                        try:
                            comment[0] = cmt['nickName']
                            comment[1] = cmt['startTime']
                            if cmt.get('gender'):
                                comment[2] = cmt['gender']
                            else:
                                comment[2] = None
                            comment[3] = cmt['userLevel']
                            comment[4] = cmt['cityName']
                            comment[5] = cmt['score']
                            comment[6] = cmt['content']
                            print(comment[0])
                        except:
                            break
                        # write this comment; the header occupies row 0
                        print('writing row')
                        for k in range(len(comment)):
                            sheet1.write((i - 1) * 15 + j, k, comment[k])
                except:
                    break
            else:
                print('***********************************')
                continue
        # save the workbook
        book.save(file)
def get_Comment(URL_list, movie_list):
    now_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S').replace(' ', '%20')
    a = 0
    for url in URL_list:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36', }
        filename = movie_list[a] + '.csv'
        a = a + 1
        row0 = ['nickName', 'startTime', 'gender', 'userLevel', 'cityName', 'score', 'content']
        fp = open(filename, 'a', encoding='utf-8', newline='')
        # write the header row
        writer = csv.writer(fp)
        writer.writerow(row0)
        for i in range(2000):
            print('downloading comment page {}'.format(i))
            # proxies = random.choice(proxies_list)
            session = requests.Session()
            proxies = {
                "http": "http://HY9543C28642965D:[email protected]:9020",
            }
            timeslep = 20 * random.random()
            time.sleep(timeslep)
            try:
                Url = url + str(i * 15) + '&startTime={}'.format(now_time)
                print(Url)
                r = requests.get(url=Url, headers=headers, proxies=proxies, timeout=5)
                # print(r.text)
            except:
                print('request failed')
                continue
            # check the status code: extract the data if OK, otherwise move on
            try:
                if r.status_code == 200:
                    # print(r.text)
                    response = r.json()
                    # take the startTime of the last comment on this page; the next request
                    # starts from that timestamp
                    now_time = response['cmts'][-1]['startTime'].replace(' ', '%20')
                    # print(response)
                    # save the data
                    for cmt in response['cmts']:
                        comment = [None] * 7
                        comment[0] = cmt['nickName']
                        comment[1] = cmt['startTime']
                        if cmt.get('gender'):
                            comment[2] = cmt['gender']
                        else:
                            comment[2] = None
                        comment[3] = cmt['userLevel']
                        comment[4] = cmt['cityName']
                        comment[5] = cmt['score']
                        comment[6] = cmt['content']
                        # print(comment)
                        writer.writerow(comment)
            except:
                print('parse failed **********')
                continue
            else:
                continue
        # close the CSV file before moving on to the next movie
        fp.close()
if __name__ == '__main__':
    parse = int(input('How many movies to crawl (fewer than 1975): ')) + 1
    movie_list = get_movie_list(parse)
    url_list = get_url_list(parse)
    # movie_list = ['中国机长']
    # url_list = ['https://maoyan.com/query?kw=中国机长']
    id_list = get_id(url_list)
    URL_list = grt_URL_list(id_list)
    # proxies_list = get_ip()
    get_Comment(URL_list, movie_list)
    print('done crawling')
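pandas is imported at the top but never used. As a quick sanity check on the output, a sketch along these lines could load one of the generated CSV files ('中国机长.csv' is simply the movie from the commented-out test list above and is assumed to exist from a previous run):

df = pd.read_csv('中国机长.csv')
print(df.shape)                                # how many comments were saved
print(df['cityName'].value_counts().head(10))  # most active cities
print(df['score'].mean())                      # average rating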