Selenium for JD
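
A small Selenium crawler for JD.com: it pages through the search results for 笔记本 (laptops), collects each item's link, price, name, and comment count, and saves the results as txt, json, or csv.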

from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import selenium.common.exceptions
import json
import csv
import time


class Spider(object):
    def open_file(self):
        """
        设置结果存储格式
        :return:
        """
        self.fm = input('请输入文件保存格式(txt、json、csv):')
        while self.fm != 'txt' and self.fm != 'json' and self.fm != 'csv':
            self.fm = input('输入错误,请重新输入文件保存格式(txt、json、csv):')
        if self.fm == 'txt':
            self.fd = open('Jd.txt', 'w', encoding='utf-8')
        elif self.fm == 'json':
            self.fd = open('Jd.json', 'w', encoding='utf-8')
        elif self.fm == 'csv':
            self.fd = open('Jd.csv', 'w', encoding='utf-8', newline='')

    def getOptions(self):
        """
        对webdriver进行设置
        :return:
        """
        options = webdriver.ChromeOptions()
        options.add_argument(
            'User-Agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.63 Safari/537.36')
        options.add_argument('--no-sandbox')  # 停用沙箱
        # 禁用GPU实现加速
        options.add_argument('--disable-gpu')
        # 开启无痕模式
        # options.add_argument('--incognito')
        # 去掉显示“浏览器正在受自动化控制”
        options.add_experimental_option('excludeSwitches', ['enable-automation'])
        # 忽略证书错误
        options.add_argument('--ignore-certificate-errors')
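        # For machines without a display (e.g. a server), headless mode is a
        # common extra switch -- a sketch, not part of the original setup:
        # options.add_argument('--headless')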
        return options

    def open_browser(self):
        """
        设置driver
        :return:
        """
        options = self.getOptions()
        self.browser = webdriver.Chrome(options=options)
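        # Caveat: the Selenium docs advise against mixing an implicit wait with
        # WebDriverWait, since the two timeouts can compound.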
        self.browser.implicitly_wait(10)
        self.wait = WebDriverWait(self.browser, 10)

    def init_variable(self):
        """
        Initialize crawl state.
        :return:
        """
        self.data = zip()  # empty placeholder; parse_page refills it on every page
        self.isLast = False

    def parse_page(self):
        """
        解析网页内容
        :return:
        """
        try:
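            # Each <li class="gl-item"> carries its product id in the data-sku attribute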
            skus = self.wait.until(EC.presence_of_all_elements_located((By.XPATH, '//li[@class="gl-item"]')))
            skus = [item.get_attribute('data-sku') for item in skus]
            links = ['https://item.jd.com/{sku}.html'.format(sku=item) for item in skus]
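            # Price, name, and comment count sit at fixed positions inside each gl-i-wrap div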
            prices = self.wait.until(
                EC.presence_of_all_elements_located((By.XPATH, '//div[@class="gl-i-wrap"]/div[2]/strong/i')))
            prices = [item.text for item in prices]
            names = self.wait.until(
                EC.presence_of_all_elements_located((By.XPATH, '//div[@class="gl-i-wrap"]/div[3]/a/em')))
            names = [item.text for item in names]
            comments = self.wait.until(
                EC.presence_of_all_elements_located((By.XPATH, '//div[@class="gl-i-wrap"]/div[4]/strong')))
            comments = [item.text for item in comments]
            self.data = zip(links, prices, names, comments)
        except selenium.common.exceptions.TimeoutException:
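            # Retry on timeout; note this recursion is unbounded if the page never loads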
            print('parse_page: TimeoutException')
            self.parse_page()
        except selenium.common.exceptions.StaleElementReferenceException:
            print('parse_page: StaleElementReferenceException')
            self.browser.refresh()

    def turn_page(self):
        """
        网页翻页
        :return:
        """
        try:
            self.wait.until(EC.element_to_be_clickable((By.XPATH, '//a[@class="pn-next"]'))).click()
            time.sleep(1)
            self.browser.execute_script("window.scrollTo(0,document.body.scrollHeight)")
            time.sleep(2)
        except selenium.common.exceptions.TimeoutException:
            # element_to_be_clickable raises TimeoutException (not NoSuchElementException)
            # when the "next" link is gone, so the last page has to be detected here.
            # Assumption: JD marks the last page's next link with a "disabled" class.
            if self.browser.find_elements(By.XPATH,
                                          '//a[contains(@class, "pn-next") and contains(@class, "disabled")]'):
                self.isLast = True
            else:
                print('turn_page: TimeoutException')
                self.turn_page()
        except selenium.common.exceptions.StaleElementReferenceException:
            print('turn_page: StaleElementReferenceException')
            self.browser.refresh()

    def write_to_file(self):
        """
        将结果写入文件
        :return:
        """
        if self.fm == 'txt':
            for item in self.data:
                self.fd.write('----------------------------------------\n')
                self.fd.write('link:' + str(item[0]) + '\n')
                self.fd.write('price:' + str(item[1]) + '\n')
                self.fd.write('name:' + str(item[2]) + '\n')
                self.fd.write('comment:' + str(item[3]) + '\n')
        if self.fm == 'json':
            temp = ('link', 'price', 'name', 'comment')
            for item in self.data:
                json.dump(dict(zip(temp, item)), self.fd, ensure_ascii=False)
                self.fd.write('\n')  # one JSON object per line, so the file stays parseable
        if self.fm == 'csv':
            writer = csv.writer(self.fd)
            for item in self.data:
                writer.writerow(item)

    def close_file(self):
        self.fd.close()

    def close_browser(self):
        self.browser.quit()

    def crawl(self):
        """
        爬取主程序
        :return:
        """
        self.open_file()
        self.open_browser()
        self.init_variable()
        print('Crawl started')
        # Search results for the keyword "笔记本" (laptop); the query string is its URL encoding
        self.browser.get('https://search.jd.com/Search?keyword=%E7%AC%94%E8%AE%B0%E6%9C%AC&enc=utf-8')
        time.sleep(1)
        self.browser.execute_script("window.scrollTo(0,document.body.scrollHeight)")
        time.sleep(2)
        count = 0
        while not self.isLast:
            count += 1
            print('Crawling page ' + str(count) + '......')
            self.parse_page()
            self.write_to_file()
            self.turn_page()
        self.close_file()
        self.close_browser()
        print('Crawl finished')


if __name__ == '__main__':
    spider = Spider()
    spider.crawl()
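
The csv output has no header row. A minimal sketch of adding one (an assumption, reusing the four field names from write_to_file) is to extend the csv branch of open_file:

        elif self.fm == 'csv':
            self.fd = open('Jd.csv', 'w', encoding='utf-8', newline='')
            # Hypothetical addition: header row matching the fields written per item
            csv.writer(self.fd).writerow(('link', 'price', 'name', 'comment'))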
