Python爬虫,根据搜索关键字爬取京东商品信息

因为有在双十一配主机的计划,所以就产生了采集京东上指定商品信息(主要是价格)的想法。花闲余时间简单学习了下Python,参考了一些其他人的爬虫代码,最终完成一个比较简单的Python爬虫。可以根据商品ID或者搜索关键字爬取商品信息。

主要代码

import requests
from lxml.html import etree
import csv
import time
import datetime

def getHeader(referer):
    """Return the HTTP header dict sent with every request.

    The headers imitate an XHR issued by desktop Chrome; only the
    ``referer`` entry varies between calls.

    Args:
        referer: Value to send as the ``referer`` header.

    Returns:
        A new ``dict`` mapping header names to header values.
    """
    # Split across lines for readability; the pieces concatenate to the
    # exact same user-agent string.
    browser_signature = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/84.0.4147.105 Safari/537.36'
    )
    # Built from ordered pairs so the header order stays stable.
    header_pairs = [
        ('authority', 'search.jd.com'),
        ('accept', '*/*'),
        ('method', 'GET'),
        ('user-agent', browser_signature),
        ('x-requested-with', 'XMLHttpRequest'),
        ('sec-fetch-site', 'same-origin'),
        ('sec-fetch-mode', 'cors'),
        ('sec-fetch-dest', 'empty'),
        ('referer', referer),
        ('accept-language', 'zh-CN,zh;q=0.9'),
    ]
    return dict(header_pairs)


def _fetch_goods_row(good_id, headers):
    """Look up one SKU's price and detail page and return it as a CSV row.

    Args:
        good_id: SKU id string taken from the search result item.
        headers: Header dict to send with both HTTP requests.

    Returns:
        ``[brand, name, good_id, current_price, previous_price, item_url,
        today]``. Price fields and brand/name fall back to ``''`` when the
        corresponding lookup fails.
    """
    price_url = f'http://p.3.cn/prices/mgets?skuIds={good_id}'
    try:
        price_json = requests.get(price_url, headers=headers).json()
        goods_now_price = price_json[0].get('p')  # current price
        goods_old_price = price_json[0].get('m')  # previous price
    except Exception:
        # Best effort: a failed price lookup must not abort the crawl.
        # ``Exception`` (not a bare except) keeps Ctrl-C / SystemExit working.
        goods_now_price = ''
        goods_old_price = ''

    item_url = 'https://item.jd.com/{}.html'.format(good_id)
    res_item = etree.HTML(requests.get(item_url, headers=headers).text)
    try:
        goods_brand = res_item.xpath('//ul[@id="parameter-brand"]/li/@title')[0]  # brand
        good_name = res_item.xpath('//ul[@class="parameter2 p-parameter-list"]/li[1]/@title')[0]
    except (IndexError, AttributeError):
        # IndexError: the expected node is absent from the page.
        # AttributeError: the page could not be parsed (res_item is None).
        goods_brand = ''
        good_name = ''

    today = datetime.date.today()
    return [goods_brand, good_name, good_id, goods_now_price, goods_old_price, item_url, today]


def collectData(url,keyword):
    """Crawl all JD search result pages for a keyword and append rows to a CSV.

    The list page at ``url`` is fetched first to read the total page count
    from the ``fp-text`` element; then ``s_new.php`` is requested for page
    numbers ``1 .. total_page * 2``. For every ``gl-item`` found, the price
    and detail page are fetched and one row is appended to ``备战双十一.csv``.

    Args:
        url: Search list-page URL; also used as the ``referer`` header.
        keyword: Search keyword sent as the ``keyword`` and ``wq`` parameters.

    Raises:
        IndexError: If the total-page element is not found on the list page.
    """
    headers = getHeader(url)

    response = requests.get(url, headers=headers)  # fetch the list page
    res_html = etree.HTML(response.text)  # parse it

    total_page = res_html.xpath('.//span[@class="fp-text"]/i/text()')[0].strip()

    for i in range(1, int(total_page) * 2 + 1):
        params = {
            'keyword': keyword,
            'suggest': '1.his.0.0',
            'wq': keyword,
            'page': str(i),
            's': str(1 + (i - 1) * 25),
            'scrolling': 'y',
            'log_id': '%.5f' % time.time(),
            'tpl': '1_M',
            'isList': '0'
        }

        response = requests.get('https://search.jd.com/s_new.php', headers=headers, params=params)  # fetch page i
        res_html = etree.HTML(response.text)  # parse it

        goods_list_items = res_html.xpath('//li[@class="gl-item"]')
        num_goods = len(goods_list_items)
        print('第{}页,这页总共{}件商品'.format(i, num_goods))
        if num_goods == 0:
            print('None')
            continue

        for x, goods_list_item in enumerate(goods_list_items, start=1):
            good_id = goods_list_item.xpath('@data-sku')[0].strip()
            row = _fetch_goods_row(good_id, headers)
            print(x, row)
            # Open, append and close per row: no file handle is leaked, and
            # rows already collected are on disk if the crawl is interrupted.
            with open('备战双十一.csv', 'a', newline='', encoding='utf-8-sig') as csv_file:
                csv.writer(csv_file).writerow(row)
        print('\n')
        time.sleep(1)  # pause between non-empty pages


def collectSingle(good_id):
    """Fetch one product by SKU id and append its data to ``单件商品.csv``.

    The price endpoint and the item detail page are both requested with the
    item URL as referer. The resulting row
    ``[brand, name, good_id, current_price, previous_price, item_url, today]``
    is printed and appended to the CSV file.

    Args:
        good_id: SKU id of the product.
    """
    item_url = 'https://item.jd.com/{}.html'.format(good_id)
    headers = getHeader(item_url)

    price_url = f'http://p.3.cn/prices/mgets?skuIds={good_id}'
    try:
        price_json = requests.get(price_url, headers=headers).json()
        goods_now_price = price_json[0].get('p')  # current price
        goods_old_price = price_json[0].get('m')  # previous price
    except Exception:
        # Best effort: a failed price lookup leaves the price columns empty.
        # ``Exception`` (not a bare except) keeps Ctrl-C / SystemExit working.
        goods_now_price = ''
        goods_old_price = ''

    res_item = etree.HTML(requests.get(item_url, headers=headers).text)
    try:
        goods_brand = res_item.xpath('//ul[@id="parameter-brand"]/li/@title')[0]  # brand
        good_name = res_item.xpath('//ul[@class="parameter2 p-parameter-list"]/li[1]/@title')[0]
    except (IndexError, AttributeError):
        # IndexError: the expected node is absent from the page.
        # AttributeError: the page could not be parsed (res_item is None).
        goods_brand = ''
        good_name = ''
    today = datetime.date.today()

    row = [goods_brand, good_name, good_id, goods_now_price, goods_old_price, item_url, today]
    print(row)
    # The context manager closes the file even if writing the row fails.
    with open('单件商品.csv', 'a', newline='', encoding='utf-8-sig') as csv_file:
        csv.writer(csv_file).writerow(row)

简单介绍

主要功能就两个方法
collectSingle(good_id): 根据单件商品的商品ID取数据存入csv文件,其实就是从collectData方法剥离出的一部分简单功能
collectData(url,keyword): 根据列表页url和关键字取数据存入csv文件,url主要用于添加限制条件,要不有些关键字的检索结果实在太多了,关键字要转化成16位unicode。这里贴一个范例数据。

['https://search.jd.com/search?keyword=%E5%9B%BA%E6%80%81%E7%A1%AC%E7%9B%98m.2%E8%87%AA%E8%90%A5&wq=%E5%9B%BA%E6%80%81%E7%A1%AC%E7%9B%98m.2%E8%87%AA%E8%90%A5&ev=1948_82059%7C%7C124285%5E539_91928%5E',
     '\u56fa\u6001\u786c\u76d8m.2\u81ea\u8425']

总结

在有学习过其他编程语言的基础上,python的学习成本并不是很高。这里并没有用python的爬虫框架,只是简单的request请求然后解析html页面取数据。在摸索实践的过程中对HTTP协议、chrome调试、学习新知识等方面还是有很大的提升的。总之是一次挺有意义的学习过程。

主要参考

https://blog.csdn.net/sinat_20019511/article/details/104354417
https://blog.csdn.net/weixin_48615832/article/details/107174331

你可能感兴趣的:(学习提升)