Scraping JD Crowdfunding with Python [project name, number of backers, number of followers, days remaining, topic count, category]

Python web scraper

  1. Fetch the list pages with requests (a minimal sketch follows this list)
  2. Fetch each detail page with Selenium
  3. Parse the HTML with lxml
  4. Save the results as a CSV file

Scraping scope: the first 100 pages (sorted by time), 16 projects per page, 1,600 crowdfunding projects in total
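The list endpoint is a plain POST request; here is a minimal sketch, with the endpoint and form parameters taken from the full spider below:

import requests

resp = requests.post(
    'https://z.jd.com/bigger/search.html',
    data={'productEnd': '-28', 'sort': 'zxsx', 'page': 1},
    headers={'User-Agent': 'Mozilla/5.0'},
)
print(resp.status_code, len(resp.text))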

The full code:

'''
2020/3/11
JD Crowdfunding (京东众筹)
https://z.jd.com/bigger/search.html
------------------------------
Fields: project name, backers, followers, days remaining, topic count, category
------------------------------
'''
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import requests
import csv
import re
import time
from lxml import etree


class LoopOver(Exception):
    '''Raised to stop crawling once a list page contains no more projects.'''


class Spider:
    def __init__(self):
        self.path = '.'
        self.csvfilename = 'datas.csv'

        self.host = 'https://z.jd.com'
        self.listurl = 'https://z.jd.com/bigger/search.html'

        self.browser = webdriver.Chrome()
        self.wait = WebDriverWait(self.browser, 20)

    def run(self):
        start = time.time()
        try:
            for text in self.get_list(self.listurl):
                for page_url in self.parse_list(text):
                    item = self.parse_page(self.get_page(page_url))
                    self.save_data(item)
        except LoopOver:
            pass  # ran out of projects before page 100
        finally:
            self.browser.quit()
        self.runtime = time.time() - start

    def get_list(self, url):
        for index in range(1, 101):  # the first 100 list pages
            # Form parameters taken from the site's search request;
            # 'sort': 'zxsx' orders the projects by time.
            data = {
                'productEnd': '-28',
                'sort': 'zxsx',
                'page': index,
            }
            print('Fetching list page {}'.format(index))

            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36',
            }
            response = requests.post(url, data=data, headers=headers)

            yield response.text

    def parse_list(self, text):
        html = etree.HTML(text)

        # Each <li> in the result list links to a project detail page;
        # an empty result means we have run past the last page.
        urls = html.xpath('//ul[@class="infos clearfix"]/li/a/@href')
        urls = [self.host + i for i in urls]
        if len(urls) == 0:
            self.save_html(text)
            raise LoopOver
        for url in urls:
            yield url

    def get_page(self, url):
        self.browser.get(url)
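        # Wait until the topic-count badge (#topicBtn) exists before
        # reading the rendered page source.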
        self.wait.until(EC.presence_of_element_located(
            (By.XPATH, '//*[@id="topicBtn"]/span')))
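        # Extra pause to let late-loading fragments finish rendering.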
        time.sleep(3)
        return self.browser.page_source

    def parse_page(self, text):
        # Fields: project name, backers, followers, days remaining, topic count, category
        html = etree.HTML(text)
        title = html.xpath('//h1[@class="p-title"]/text()')[0].strip()
        # The category sits in the page <title>: take the second-to-last
        # '-' segment and strip the '众筹' suffix.
        typee = html.xpath('//title/text()')[0].strip().split('-')[-2].replace('众筹', '')
        try:
            p_num = re.findall(
                r"\d+\.?\d*", html.xpath('//p[@class="p-progress"]/span[@class="fr"]/text()')[0].strip())[0]
        except IndexError:
            p_num = '0'
        like_num = re.findall(
            r"\d+\.?\d*", html.xpath('//*[@id="focusCount"]/text()')[0].strip())[0]
        try:
            l_day = html.xpath(
                '//*[@id="projectMessage"]/span[last()]/text()')[0].strip()
        except IndexError:
            l_day = '0'
        try:
            talk_num = html.xpath('//*[@id="topicBtn"]/span/text()')[0].strip()
        except IndexError:
            talk_num = '0'

        return [title, typee, p_num, like_num, l_day, talk_num]

    def save_data(self, item):
        '''
        Append one scraped row to the CSV file
        '''
        print('>>>', item)
        with open('{}/{}'.format(self.path, self.csvfilename), 'a', encoding='utf-8', newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(item)

    def save_html(self, text):
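        '''Dump the fetched HTML to test.html for debugging (called when a list page yields no projects).'''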
        with open('test.html', 'w', encoding='utf-8') as f:
            f.write(text)

    @property
    def time(self):
        return 'Total time: {} seconds'.format(self.runtime)


if __name__ == '__main__':
    spider = Spider()
    spider.run()
    print(spider.time)  # total runtime
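
Since save_data writes rows without a header, here is a minimal sketch for loading the output, assuming pandas is installed; the column names are hypothetical labels for the six scraped fields:

import pandas as pd

columns = ['title', 'category', 'backers', 'followers', 'days_left', 'topics']
df = pd.read_csv('datas.csv', names=columns, encoding='utf-8')
print(df.head())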
