目标网站: https://v.qq.com/x/hotlist/search/?channel=1
对网站发送请求
import requests
# Page to scrape.
url = 'https://v.qq.com/x/hotlist/search/?channel=1'
# Browser-like User-Agent header sent with the request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/18.17763',
}
# Bound the wait: without a timeout, requests.get() can block indefinitely
# when the server stops responding.
r = requests.get(url, headers=headers, timeout=10)
导入三个解析库
解析时,要加上 r.encoding = 'utf-8',否则报错。
import re
import time

import requests
from bs4 import BeautifulSoup
from lxml import etree
# Page to scrape.
url = 'https://v.qq.com/x/hotlist/search/?channel=1'
# Browser-like User-Agent header sent with the request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/18.17763',
}
# Bound the wait: without a timeout, requests.get() can block indefinitely
# when the server stops responding.
r = requests.get(url, headers=headers, timeout=10)
# Force r.text to be decoded as UTF-8 instead of the encoding requests
# guesses from the response headers.
r.encoding = 'utf-8'
对电影名字解析
# Parse the decoded response body into an lxml element tree.
tree = etree.HTML(r.text)
# Collect the title attribute of each <a> directly under a div whose class
# attribute is exactly "item item_a"; the result is a list of strings.
title = tree.xpath('//div[@class="item item_a"]/a/@title')
对每个电影的地址解析
# Collect the href attribute of the same links that `title` is read from.
# NOTE(review): the two lists line up index-for-index only if every link
# carries both a title and an href attribute - confirm against the page.
href = tree.xpath('//div[@class="item item_a"]/a/@href')
href只是每个电影详情页的网址,还需要进行二次爬取才能得到电影地址。
接下来遍历每个电影的网页地址,分别发送请求
获取相应的电影地址,并保存到csv文件中
导入csv
import csv
创建csv文件并写入两列标题:['名字', '地址']
# Open the output file in 'w' mode, which truncates an existing file (it does
# not append). newline='' is what the csv module expects for files it writes.
# The raw string keeps the Windows backslashes literal instead of relying on
# unrecognised escape sequences such as "\腾". utf-8-sig lets titles outside
# the locale code page be written and emits a BOM so spreadsheet tools can
# detect the encoding.
# NOTE: the target directory must already exist; open() does not create it.
out = open(r'F:\腾讯电影热搜榜\腾讯视频top50名字 + 地址.csv', 'w', newline='', encoding='utf-8-sig')
# Writer with an explicit comma delimiter and CRLF line terminator.
# NOTE(review): eWriter is not referenced anywhere else in the visible
# script - confirm nothing depends on it before removing.
eWriter = csv.writer(out, delimiter=',', lineterminator='\r\n')
# Writer actually used for all rows below.
csv_write = csv.writer(out, dialect='excel')
# Header row: name, address.
csv_write.writerow(['名字', '地址'])
开始遍历,对每个电影名字以及地址进行保存
# Visit each movie page in turn, extract its address and write one CSV row
# per movie. Pairing title/href with zip() and slicing to 50 keeps the
# original top-50 limit but no longer raises IndexError when the ranking
# page yields fewer than 50 entries.
for a, (movie_title, url_1) in enumerate(zip(title[:50], href[:50])):
    print('==============================================================================')
    time.sleep(0.5)
    print('开始解析第{}个电影'.format(a + 1))
    # Bound the wait so one stalled page cannot hang the whole run.
    r_1 = requests.get(url_1, headers=headers, timeout=10)
    r_1.encoding = 'utf-8'
    tree_1 = etree.HTML(r_1.text)
    time.sleep(0.5)
    print('开始获取第{}个电影地址'.format(a + 1))
    time.sleep(0.5)
    print('开始用第一个方法解析')
    links = tree_1.xpath('//div[@class="_infos"]/div/a/@href')
    if links:
        href_1 = links[0]
    else:
        # The previous fallback re-ran the very same XPath under a bare
        # except and wrote the text "[]" into the address column. No
        # alternative selector exists, so record an empty address instead.
        print('第一种方法失败')
        href_1 = ''
    time.sleep(0.5)
    print('{}-{}开始写入'.format(a + 1, movie_title))
    csv_write.writerow([movie_title, str(href_1)])
    time.sleep(0.5)
    print('{}-{}写入完成'.format(a + 1, movie_title))
    time.sleep(0.5)
完成后程序退出
# Close (and thereby flush) the CSV before announcing completion, so the file
# is complete on disk even if the process is interrupted during the pause.
out.close()
print("=============================================================================")
print('top50写入完成')
print('程序3秒后退出')
# Pause for the 3 seconds announced in the message above.
time.sleep(3)