Scraping JD product listings in Python with requests + lxml, writing the results directly to an .xls file with xlwt.

This is an optimized version of the crawler I wrote earlier, with a slightly better user experience: it now reports progress while running.
Three things need customizing: start_url, length, and the cookie. The cookie can be captured with Chrome's DevTools (Inspect); it is different for every user, so never leak yours. It is replaced with an ellipsis here.
Only the code is posted below; for the detailed walkthrough see my other post: https://blog.csdn.net/weixin_44521703/article/details/96447206
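
A side note on customizing start_url: the keyword parameter is just the URL-encoded UTF-8 of the search term. A minimal sketch of building it yourself with the standard library (this helper snippet is an illustration, not part of the script below):

import urllib.parse

keyword = '电蚊香'  # the search term used in this post
start_url = ('https://search.jd.com/Search?keyword={}&enc=utf-8'
             .format(urllib.parse.quote(keyword)))
# -> https://search.jd.com/Search?keyword=%E7%94%B5%E8%9A%8A%E9%A6%99&enc=utf-8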

import requests
import time
import random
import urllib3
import xlwt
from lxml import etree


def get_text(url, code):
    # The cookie is user-specific; capture your own with Chrome DevTools and
    # never share it (it is replaced with an ellipsis here).
    hds = {
        'cookie': '.....',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'}
    try:
        r = requests.get(url, headers=hds, timeout=60)
        r.raise_for_status()
        r.encoding = code
        return r.text
    except (TimeoutError, urllib3.exceptions.NewConnectionError, requests.exceptions.ConnectionError):
        print("Failed to connect to the server!")
        return ''  # let the caller skip this page instead of crashing on None


def get_list(start, length):
    # JD search results paginate with page=1,3,5,... and an item offset
    # s=1,51,101,..., so result page i maps to page=2*i+1 and s=50*i+1.
    return [start + '&page={}&s={}'.format(2 * i + 1, 50 * i + 1) for i in range(length)]

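# For reference (a quick sanity check): with the start_url defined under
# __main__ below, get_list(start_url, 2) returns
#   ['https://search.jd.com/Search?keyword=%E7%94%B5%E8%9A%8A%E9%A6%99&enc=utf-8&page=1&s=1',
#    'https://search.jd.com/Search?keyword=%E7%94%B5%E8%9A%8A%E9%A6%99&enc=utf-8&page=3&s=51']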

def get_info(html, info_list):
    response = etree.HTML(html)
    try:
        # JD wraps the search keyword in its own tag inside the <em> title,
        # which splits the title text into the parts before and after it.
        titles_1 = response.xpath(
            '//*[@id="J_goodsList"]/ul/li/div/div[3]/a/em/text()[1]')
        titles_2 = response.xpath(
            '//*[@id="J_goodsList"]/ul/li/div/div[3]/a/em/text()[2]')
        prices = response.xpath(
            '//*[@id="J_goodsList"]/ul/li/div/div[2]/strong/i/text()')
        urls = response.xpath(
            '//*[@id="J_goodsList"]/ul/li/div/div[3]/a/@href')

        # Most product links are protocol-relative, so prepend the scheme.
        for i in range(len(urls)):
            if not urls[i].startswith('https'):
                urls[i] = 'https:' + urls[i]

        # Re-insert the keyword between the two title fragments.
        for title_1, title_2, price, url in zip(
                titles_1, titles_2, prices, urls):
            info_list.append([title_1 + "电蚊香" + title_2, price, url])
    except Exception:
        print("Bad data on this page!")


def write_excel(info_list, filename):
    try:
        print("\nWriting the Excel file...")
        workbook = xlwt.Workbook()
        sheet1 = workbook.add_sheet('information', cell_overwrite_ok=True)
        # header row
        row_0 = ['Description', 'Price', 'URL']
        for i in range(len(row_0)):
            sheet1.write(0, i, row_0[i])
        # one row per scraped item (the legacy .xls format caps out at 65536 rows)
        for i in range(len(info_list)):
            for j in range(3):
                sheet1.write(i + 1, j, info_list[i][j])
        workbook.save(filename)
        print("Scraping finished!")
    except PermissionError:
        # saving fails while the target file is open in Excel
        print("Please close the Excel file!")


if __name__ == '__main__':
    start_url = 'https://search.jd.com/Search?keyword=%E7%94%B5%E8%9A%8A%E9%A6%99&enc=utf-8'
    length = 8
    code = 'utf-8'
    info_list = []
    d = time.strftime("%Y-%m-%d", time.gmtime())

    count = 1
    url_lists = get_list(start_url, length)
    for url in url_lists:
        html = get_text(url, code)
        # short random pause between requests to go easy on the server
        time.sleep(random.random() * 2)
        if not html:
            continue
        get_info(html, info_list)
        print('\rProgress: {:.0f}%'.format(count / len(url_lists) * 100), end='')
        count += 1

    write_excel(info_list, filename=d + '.xls')

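If you want to sanity-check the generated file afterwards, here is a minimal read-back sketch using xlrd, the read-side companion of xlwt (not part of the original script; the date-named filename is hypothetical):

import xlrd

workbook = xlrd.open_workbook('2019-07-18.xls')  # hypothetical date-named output file
sheet = workbook.sheet_by_index(0)
print('rows written (including header):', sheet.nrows)
for i in range(min(sheet.nrows, 3)):  # header plus the first two records
    print(sheet.row_values(i))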
