Scraping https://ssr1.scrape.center/, a simple practice site, two ways: with requests and with selenium

ssr1 (a movie data site with no anti-scraping measures)

Summary (requests implementation):

'''
1. /text() returns the text directly inside the specified tag; //text() returns the text inside the tag and all of its child tags.
This is most useful when the number of tags is not fixed:
e.g. each movie has a different number of category tags, each sitting in its own tag of the HTML document, so you can grab the enclosing tag and read all the text under it, child tags included (a sketch of the difference follows this summary).

2. Stripping whitespace and newlines out of a list:
data_list = [x.strip() for x in temp_list if x.strip() != '']  # where temp_list is the source list
e.g.
['\n        ', '    剧情   ', '\n        ', '\n爱情        ', '\n        ', '\n        ']
['剧情', '爱情']

3. Guard against empty results (an empty result otherwise raises an IndexError, because xpath found nothing to index into):
Wrong:
data_dict['score'] = node.xpath('./div[3]/p[1]/text()')[0]
Recommended:
data_dict['score'] = node.xpath('./div[3]/p[1]/text()')[0] if len(node.xpath('./div[3]/p[1]/text()')) > 0 else None
'''
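
A minimal sketch of the /text() vs //text() difference from point 1, run against a made-up fragment shaped like the categories block (the fragment is illustrative, not the site's exact markup):

from lxml import etree

html = etree.HTML('<div class="categories"><button><span>剧情</span></button><button><span>爱情</span></button></div>')
div = html.xpath('//div[@class="categories"]')[0]
print(div.xpath('./text()'))     # [] -- the div holds no text of its own
print(div.xpath('.//text()'))    # ['剧情', '爱情'] -- text inside child tags included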

requests implementation:

# coding: utf-8
# @Time : 2021/12/19
# url : 'https://ssr1.scrape.center/'

import requests
from lxml import etree
import time


class DIANying(object):

    def __init__(self):
        self.url = 'https://ssr1.scrape.center/page/'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36 Edg/96.0.1054.57'
        }

    def get_data(self, page):
        url = self.url + str(page)
        response = requests.get(url=url, headers=self.headers)
        time.sleep(3)    # pause between page requests
        return response.content.decode()

    def parse_data(self, res, data_list):
        # Build the etree object
        html = etree.HTML(res)
        # Locate the movie card nodes
        node_list = html.xpath('//*[@id="index"]/div[1]/div[1]/div/div/div')
        # Walk the nodes and extract each field
        for node in node_list:
            data_dict = {}
            data_dict['name'] = node.xpath('./div[2]/a/h2/text()')[0] if len(node.xpath('./div[2]/a/h2/text()')) > 0 else None
            # //text() also collects the text inside child tags; xpath always
            # returns a list, so an empty result is safe to iterate (replacing
            # it with None here would break the comprehension below)
            type_set = node.xpath('./div[2]/div[1][@class="categories"]//text()')
            # Drop the whitespace-only entries and strip the rest
            film_type = [x.strip() for x in type_set if x.strip() != '']
            type_str = '、'.join(film_type)

            data_dict['type'] = type_str
            data_dict['area'] = node.xpath('./div[2]/div[2]/span[1]/text()')[0] if len(node.xpath('./div[2]/div[2]/span[1]/text()')) > 0 else None
            data_dict['long'] = node.xpath('./div[2]/div[2]/span[3]/text()')[0] if len(node.xpath('./div[2]/div[2]/span[3]/text()')) > 0 else None
            data_dict['begin_time'] = node.xpath('./div[2]/div[3]/span/text()')[0] if len(node.xpath('./div[2]/div[3]/span/text()')) > 0 else None
            data_dict['score'] = node.xpath('./div[3]/p[1]/text()')[0].strip() if len(node.xpath('./div[3]/p[1]/text()')) > 0 else None

            data_list.append(data_dict)

        return data_list

    def run(self):
        # Empty list to collect the results
        data_list = []
        # Visit each page in turn
        for page in range(1, 11):
            print('Crawling page {}...'.format(page))
            # Request the url and get the response
            res = self.get_data(page)
            # Parse the data
            data_list = self.parse_data(res, data_list)

        print(data_list)
        print(len(data_list))


if __name__ == '__main__':
    dianying = DIANying()
    dianying.run()

Summary (selenium implementation):

'''
1. When scraping with selenium and locating with xpath, a missing tag makes the locate call raise instead of returning nothing. How do you get the same effect as the xpath guard in the requests version, i.e. assign None for the missing tag?
So far I only know how to do it with try/except:
if the error is definitely caused by a missing tag, assign None directly.

2. Paging: how do you stop turning pages once the last page is reached? Turning blindly raises, because the "next" button can no longer be located.
So far I also handle this with try/except:
if the error is definitely caused by reaching the last page and the "next" button being absent, just stop.

(find_elements offers a try/except-free alternative to both; see the sketches right after this summary.)
''' 
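
For point 1, a try/except-free alternative is find_elements (plural), which returns an empty list instead of raising when nothing matches. A minimal sketch; the helper name is mine, not part of the code below:

from selenium.webdriver.common.by import By


def first_text_or_none(node, path):
    # find_elements returns [] for a missing tag instead of raising
    # NoSuchElementException, so a missing tag simply becomes None --
    # the same guard an empty xpath result gives in the requests version
    matches = node.find_elements(By.XPATH, path)
    if not matches:
        return None
    text = matches[0].text.strip()
    return text if text else None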
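
For point 2, the same trick detects the last page: look the "next" button up with find_elements and only click if it exists. The xpath is the one used in the code below; the function name is mine:

from selenium.webdriver.common.by import By


def click_next_if_any(driver):
    # On the last page the "next" button is absent; find_elements lets us
    # test for it instead of clicking blindly and catching the error
    buttons = driver.find_elements(By.XPATH, '//*[@id="index"]/div[2]/div/div/div/a[@class="next"]/button')
    if buttons:
        buttons[0].click()
        return True
    return False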

selenium implementation:

# coding: utf-8
# @Time : 2021/12/
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException


class DIANying(object):
    def __init__(self):
        self.url = 'https://ssr1.scrape.center/'
        self.driver = webdriver.Edge()

    def extract_node(self, node):
        # Pull the fields out of one movie card. find_element raises on a
        # missing tag, so begin_time (absent for some movies) gets its own
        # try/except and falls back to None
        data_dict = {}
        data_dict['name'] = node.find_element(By.XPATH, './div[2]/a/h2').text or None
        data_dict['type'] = node.find_element(By.XPATH, './div[2]/div[1][@class="categories"]').text or None
        data_dict['area'] = node.find_element(By.XPATH, './div[2]/div[2]/span[1]').text or None
        data_dict['long'] = node.find_element(By.XPATH, './div[2]/div[2]/span[3]').text or None
        try:
            data_dict['begin_time'] = node.find_element(By.XPATH, './div[2]/div[3]/span').text or None
        except NoSuchElementException:
            data_dict['begin_time'] = None
        data_dict['score'] = node.find_element(By.XPATH, './div[3]/p[1]').text.strip() or None
        return data_dict

    def parse_data(self, data_list):
        node_list = self.driver.find_elements(By.XPATH, '//*[@id="index"]/div[1]/div[1]/div/div/div')
        for i in range(len(node_list)):
            try:
                data_dict = self.extract_node(node_list[i])
            except StaleElementReferenceException:
                # The DOM was refreshed under us; re-locate the cards and retry once
                node_list = self.driver.find_elements(By.XPATH, '//*[@id="index"]/div[1]/div[1]/div/div/div')
                data_dict = self.extract_node(node_list[i])

            data_list.append(data_dict)

        # Next page: on the last page the "next" button is absent, so swallow
        # the error and stop paging instead of crashing
        try:
            self.driver.find_element(By.XPATH, '//*[@id="index"]/div[2]/div/div/div/a[@class="next"]/button').click()
        except NoSuchElementException:
            pass
        return data_list

    def run(self):
        data_list = []
        # Open the first page
        self.driver.get(self.url)
        # Parse page by page; parse_data clicks "next" at the end of each pass
        for i in range(10):
            data_list = self.parse_data(data_list)

        self.driver.quit()
        print(data_list)
        print(len(data_list))


if __name__ == '__main__':
    dianying = DIANying()
    dianying.run()

