selenium爬取网页版小红书

小红书很难爬:要抓取 App 版的数据,需要先具备安卓逆向的知识储备。所以今天我只看了网页版。网页版的数据不全,如果想要完整的数据,还得想办法获取 App 端的数据。今天写的这个爬虫只是简单地用 selenium 获取小红书网页版的数据,这些数据自己拿来玩玩是可以的。
直接上代码,代码都有注释:

# -*- coding: utf-8 -*-
from urllib.parse import urljoin

from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver import ActionChains
from selenium.webdriver.firefox.options import Options
from lxml import etree
import time
class Text(object):
    """Selenium-driven scraper for keyword search results on the Xiaohongshu web site.

    A single headless Chrome session is opened on construction. Call ``run``
    to perform the search and collect the result cards, and ``close`` (or use
    the instance as a context manager) to shut the browser down.
    """

    def __init__(self, key):
        """Start a headless Chrome session and load the explore page.

        Args:
            key: Search keyword that ``run`` types into the search box.
        """
        self.key = key
        self.url = 'https://www.xiaohongshu.com/explore'
        # Defined before the browser starts so close()/__del__ stay safe even
        # if start-up raises.
        self.driver = None
        chrome_opt = webdriver.ChromeOptions()
        chrome_opt.add_argument("--headless")
        # Content-setting value 2 asks Chrome to block images (faster loads).
        prefs = {"profile.managed_default_content_settings.images": 2}
        chrome_opt.add_experimental_option("prefs", prefs)
        # Exactly one browser is launched; starting an extra un-configured
        # Chrome first would leave a window that nothing ever closes.
        self.driver = webdriver.Chrome(options=chrome_opt)
        try:
            self.driver.get(self.url)
        except Exception:
            # Do not leave an orphaned browser behind if the first load fails.
            self.close()
            raise

    def __enter__(self):
        """Return the scraper itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Quit the browser when the ``with`` block ends; never suppress errors."""
        self.close()
        return False

    def __del__(self):
        # Last-resort cleanup; prefer calling close() explicitly because the
        # interpreter does not guarantee when (or whether) __del__ runs.
        self.close()

    def close(self):
        """Quit the browser session. Safe to call more than once."""
        # getattr guards against instances whose __init__ never assigned driver.
        driver = getattr(self, 'driver', None)
        self.driver = None
        if driver is not None:
            driver.quit()

    @staticmethod
    def _first(values, default=''):
        """Return the first element of ``values`` or ``default`` if it is empty."""
        return values[0] if values else default

    def get_input(self, xpath):
        """Wait up to 5 seconds for the element at ``xpath`` to become visible.

        Args:
            xpath: XPath expression locating the element.

        Returns:
            The located element.

        Raises:
            selenium.common.exceptions.TimeoutException: if the element does
                not become visible in time.
        """
        locator = (By.XPATH, xpath)
        return WebDriverWait(self.driver, 5).until(
            EC.visibility_of_element_located(locator)
        )

    def get_data(self):
        """Parse the current page and extract every note card.

        Each card yields a dict with the keys ``title``, ``url``, ``author``
        and ``like_number``. A field that is missing from a card is reported
        as an empty string instead of aborting the whole scrape.

        Returns:
            list[dict]: the extracted items, in page order. Each item is also
            printed as it is parsed.
        """
        items = []
        tree = etree.HTML(self.driver.page_source)
        if tree is None:
            # NOTE(review): lxml may hand back None for an empty document.
            return items

        for node in tree.xpath('//div[@class="note-list-row"]/div'):
            # Title: XPath string() over a node-set takes the first node and
            # yields '' when the set is empty.
            title = node.xpath('string(.//h3)')
            # URL: hrefs on the card are joined against the site root.
            href = self._first(node.xpath('./a/@href'))
            url = urljoin('https://www.xiaohongshu.com', href) if href else ''
            # Author
            author = self._first(
                node.xpath('.//li[@class="user-info"]/span/text()')
            )
            # Like count
            like_number = self._first(
                node.xpath('.//li[@class="like"]/span/text()')
            )
            item = {
                'title': title,
                'url': url,
                'author': author,
                'like_number': like_number,
            }
            print(item)
            items.append(item)
        return items

    def run(self):
        """Search for ``self.key`` and scrape the resulting page.

        Returns:
            list[dict]: the items produced by ``get_data``.

        Raises:
            selenium.common.exceptions.TimeoutException: if the search box
                never becomes visible.
        """
        # Local import keeps this method self-contained with respect to the
        # file's import block.
        from selenium.common.exceptions import TimeoutException

        # Locate the search box and type the keyword.
        search_box = self.get_input('//input[@class="input"]')
        search_box.send_keys(self.key)
        # Trigger the search through JavaScript instead of a native click.
        js = 'document.getElementsByClassName("search-icon")[0].click();'
        self.driver.execute_script(js)

        # The results are expected in a new window; give it time to open
        # rather than indexing window_handles immediately.
        try:
            WebDriverWait(self.driver, 5).until(
                lambda drv: len(drv.window_handles) > 1
            )
        except TimeoutException:
            # No new window appeared; the newest handle is then the current one.
            pass
        self.driver.switch_to.window(self.driver.window_handles[-1])

        # Best-effort wait for the result list so page_source is not read
        # before the cards exist; on timeout get_data simply returns [].
        try:
            WebDriverWait(self.driver, 5).until(
                EC.presence_of_element_located(
                    (By.XPATH, '//div[@class="note-list-row"]')
                )
            )
        except TimeoutException:
            pass

        # Parse and return the data.
        return self.get_data()



if __name__ == '__main__':
    # Script entry point: ask for a keyword on stdin, then run one search.
    keyword = input('请输入关键字:')
    crawler = Text(keyword)
    crawler.run()


运行效果:
selenium爬取网页版小红书_第1张图片
selenium爬取网页版小红书_第2张图片

你可能感兴趣的:(python爬虫)