ssr1 (movie data site, no anti-scraping, T)
Summary (requests implementation):
'''
1. /text() returns only the text sitting directly inside the selected tag, while //text() also returns
   the text of every descendant tag. This matters when the number of tags is not fixed: for example,
   each movie has a different number of category tags, each wrapped in its own element, so you can
   select the enclosing container and read all of its text, descendants included (see the sketch
   after this block).
2. Removing whitespace and newline entries from a list:
   data_list = [x.strip() for x in temp_list if x.strip() != ''], where temp_list is the source list.
   e.g.
   ['\n ', ' 剧情 ', '\n ', '\n爱情 ', '\n ', '\n ']
   ['剧情', '爱情']
3. Guard against empty results (indexing the result list raises an IndexError whenever the XPath
   matches nothing).
   Wrong:
   data_dict['score'] = node.xpath('./div[3]/p[1]/text()')[0]
   Recommended:
   data_dict['score'] = node.xpath('./div[3]/p[1]/text()')[0] if len(node.xpath('./div[3]/p[1]/text()')) > 0 else None
'''
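A minimal sketch of point 1; the HTML snippet here is made up, but it mirrors the site's category markup:

from lxml import etree

html = etree.HTML('<div class="categories"><button><span>剧情</span></button>'
                  '<button><span>爱情</span></button></div>')
node = html.xpath('//div[@class="categories"]')[0]
print(node.xpath('./text()'))   # [] -- the div has no text of its own
print(node.xpath('.//text()'))  # ['剧情', '爱情'] -- descendant text is included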
requests implementation:
# -*- coding: utf-8 -*-
# @Time : 2021/12/19
# url : 'https://ssr1.scrape.center/'
import requests
from lxml import etree
import time


class DIANying(object):
    def __init__(self):
        self.url = 'https://ssr1.scrape.center/page/'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36 Edg/96.0.1054.57'
        }

    def get_data(self, page):
        url = self.url + str(page)
        response = requests.get(url=url, headers=self.headers)
        time.sleep(3)  # pause between requests
        return response.content.decode()

    def parse_data(self, res, data_list):
        # build the etree object
        html = etree.HTML(res)
        # locate the movie card nodes
        node_list = html.xpath('//*[@id="index"]/div[1]/div[1]/div/div/div')
        # walk the nodes and extract each field
        for node in node_list:
            data_dict = {}
            data_dict['name'] = node.xpath('./div[2]/a/h2/text()')[0] if len(node.xpath('./div[2]/a/h2/text()')) > 0 else None
            # //text() also captures category text inside child tags; fall back to an
            # empty list (not None) so the comprehension below never crashes
            type_set = node.xpath('./div[2]/div[1][@class="categories"]//text()') if len(node.xpath('./div[2]/div[1][@class="categories"]//text()')) > 0 else []
            # drop the whitespace-only entries
            film_type = [x.strip() for x in type_set if x.strip() != '']
            type_str = '、'.join(film_type)
            data_dict['type'] = type_str
            data_dict['area'] = node.xpath('./div[2]/div[2]/span[1]/text()')[0] if len(node.xpath('./div[2]/div[2]/span[1]/text()')) > 0 else None
            data_dict['long'] = node.xpath('./div[2]/div[2]/span[3]/text()')[0] if len(node.xpath('./div[2]/div[2]/span[3]/text()')) > 0 else None
            data_dict['begin_time'] = node.xpath('./div[2]/div[3]/span/text()')[0] if len(node.xpath('./div[2]/div[3]/span/text()')) > 0 else None
            data_dict['score'] = node.xpath('./div[3]/p[1]/text()')[0].strip() if len(node.xpath('./div[3]/p[1]/text()')) > 0 else None
            data_list.append(data_dict)
        return data_list

    def run(self):
        # empty list to collect the results
        data_list = []
        # visit each page in turn
        for page in range(1, 11):
            print('crawling page {}...'.format(page))
            # request the url and get the response
            res = self.get_data(page)
            # parse the data
            data_list = self.parse_data(res, data_list)
        print(data_list)
        print(len(data_list))


if __name__ == '__main__':
    dianying = DIANying()
    dianying.run()
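Note that the extraction pattern above runs every XPath twice: once for the length check and once for the index. A small helper avoids the repetition; first_or_none is my own name, not an lxml API:

def first_or_none(node, path):
    # evaluate the XPath once; return the first match, or None if nothing matched
    result = node.xpath(path)
    return result[0] if result else None

# usage inside parse_data:
# data_dict['score'] = first_or_none(node, './div[3]/p[1]/text()')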
Summary (selenium implementation):
'''
1. When scraping with selenium and locating by XPath, a missing tag makes find_element raise instead
   of returning an empty result, so you cannot simply add a condition and assign None the way you can
   with requests + lxml.
   For now I only know how to handle this with try/except:
   if the error is definitely caused by a missing tag, just assign None.
2. When paging, how do you stop once you reach the last page? Turning the page anyway fails because
   the "next page" tag can no longer be located.
   For now I also handle this with try/except:
   if the error is definitely caused by reaching the last page and not finding the "next page" tag,
   just stop.
   (A find_elements-based alternative to both workarounds is sketched after this block.)
'''
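Both workarounds can also be written without try/except: find_elements (plural) returns an empty list instead of raising when nothing matches. A sketch under that assumption; safe_text is my own helper name:

from selenium.webdriver.common.by import By

def safe_text(node, path):
    # find_elements returns [] for a missing tag, so no exception is thrown
    found = node.find_elements(By.XPATH, path)
    return found[0].text if found else None

# missing tag -> None, just like the requests version:
# data_dict['begin_time'] = safe_text(node, './div[2]/div[3]/span')

# last page -> the "next" button simply is not found, so stop paging:
# next_btn = driver.find_elements(By.XPATH, '//*[@id="index"]/div[2]/div/div/div/a[@class="next"]/button')
# if next_btn:
#     next_btn[0].click()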
selenium implementation:
# -*- coding: utf-8 -*-
# @Time : 2021/12/
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException


class DIANying(object):
    def __init__(self):
        self.url = 'https://ssr1.scrape.center/'
        self.driver = webdriver.Edge()

    def parse_data(self, data_list):
        node_list = self.driver.find_elements(By.XPATH, '//*[@id="index"]/div[1]/div[1]/div/div/div')
        for i in range(len(node_list)):
            try:
                data_dict = {}
                data_dict['name'] = node_list[i].find_element(By.XPATH, './div[2]/a/h2').text if len(node_list[i].find_element(By.XPATH, './div[2]/a/h2').text) > 0 else None
                type_set = node_list[i].find_element(By.XPATH, './div[2]/div[1][@class="categories"]').text if len(node_list[i].find_element(By.XPATH, './div[2]/div[1][@class="categories"]').text) > 0 else None
                data_dict['type'] = type_set
                data_dict['area'] = node_list[i].find_element(By.XPATH, './div[2]/div[2]/span[1]').text if len(node_list[i].find_element(By.XPATH, './div[2]/div[2]/span[1]').text) > 0 else None
                data_dict['long'] = node_list[i].find_element(By.XPATH, './div[2]/div[2]/span[3]').text if len(node_list[i].find_element(By.XPATH, './div[2]/div[2]/span[3]').text) > 0 else None
                data_dict['begin_time'] = node_list[i].find_element(By.XPATH, './div[2]/div[3]/span').text if len(node_list[i].find_element(By.XPATH, './div[2]/div[3]/span').text) > 0 else None
                data_dict['score'] = node_list[i].find_element(By.XPATH, './div[3]/p[1]').text.strip() if len(node_list[i].find_element(By.XPATH, './div[3]/p[1]').text) > 0 else None
            except NoSuchElementException:
                # a tag is missing from this card (usually the release date):
                # re-locate the nodes and extract again, guarding begin_time individually
                node_list = self.driver.find_elements(By.XPATH, '//*[@id="index"]/div[1]/div[1]/div/div/div')
                data_dict = {}
                data_dict['name'] = node_list[i].find_element(By.XPATH, './div[2]/a/h2').text if len(node_list[i].find_element(By.XPATH, './div[2]/a/h2').text) > 0 else None
                type_set = node_list[i].find_element(By.XPATH, './div[2]/div[1][@class="categories"]').text if len(node_list[i].find_element(By.XPATH, './div[2]/div[1][@class="categories"]').text) > 0 else None
                data_dict['type'] = type_set
                data_dict['area'] = node_list[i].find_element(By.XPATH, './div[2]/div[2]/span[1]').text if len(node_list[i].find_element(By.XPATH, './div[2]/div[2]/span[1]').text) > 0 else None
                data_dict['long'] = node_list[i].find_element(By.XPATH, './div[2]/div[2]/span[3]').text if len(node_list[i].find_element(By.XPATH, './div[2]/div[2]/span[3]').text) > 0 else None
                try:
                    data_dict['begin_time'] = node_list[i].find_element(By.XPATH, './div[2]/div[3]/span').text if len(node_list[i].find_element(By.XPATH, './div[2]/div[3]/span').text) > 0 else None
                except NoSuchElementException:
                    data_dict['begin_time'] = None
                data_dict['score'] = node_list[i].find_element(By.XPATH, './div[3]/p[1]').text.strip() if len(node_list[i].find_element(By.XPATH, './div[3]/p[1]').text) > 0 else None
            data_list.append(data_dict)
        # print(data_list)
        # turn to the next page
        try:
            self.driver.find_element(By.XPATH, '//*[@id="index"]/div[2]/div/div/div/a[@class="next"]/button').click()
        except NoSuchElementException:
            # last page: there is no "next" button, so stop here
            return data_list
        return data_list

    def run(self):
        data_list = []
        # open the first page
        self.driver.get(self.url)
        # parse page by page (the site has 10 pages)
        for i in range(10):
            data_list = self.parse_data(data_list)
        self.driver.quit()
        print(data_list)
        print(len(data_list))


if __name__ == '__main__':
    dianying = DIANying()
    dianying.run()