Scraping Gree air conditioner information from JD

I wrote yet another crawler for JD product information, but this time it collects far more, and more complete, data. The code itself is not hard to write; most of the time goes into finding the URLs that serve the relevant information. The detailed code is below.

As a small precaution the URLs are obfuscated with Base64; they can be decoded with the following function:

import base64


def dec(string):
    return base64.b64decode(string.encode()).decode()
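
As a quick sanity check, here is how the decoder recovers one of the obfuscated endpoints used further down (the variable name tag_template is just for illustration; the decoded template matches the example URL noted in get_func):

tag_template = dec('aHR0cHM6Ly9jLjMuY24vcHJvZHVjdC90YWc/c2t1SWRzPXt9')
print(tag_template)                  # https://c.3.cn/product/tag?skuIds={}
print(tag_template.format(1361956))  # https://c.3.cn/product/tag?skuIds=1361956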

The complete code is as follows:

import base64
import requests
import re
import time
import json
import os
import traceback
from lxml import etree
from collections import OrderedDict


def dec(string):
    """Decode the Base64-obfuscated URLs used in this script."""
    return base64.b64decode(string.encode()).decode()


def get_text(href):
    """Fetch href and return the response body, or '' if the request fails."""
    try:
        hds = {
            'Referer': 'https://item.jd.com/1361956.html',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'}
        resp = requests.get(href, headers=hds, timeout=10)
        resp.encoding = resp.apparent_encoding
        resp.raise_for_status()
        return resp.text
    except requests.RequestException:
        return ''


def uniform_url(text):
    """Prefix protocol-relative links from the search results with the https scheme."""
    if not text.startswith('https'):
        text = 'https:' + text
    return text


def get_urls(href):
    """Collect the item links from one page of JD search results."""
    text = get_text(href)
    html = etree.HTML(text)
    url_list = html.xpath('//div[@id="J_goodsList"]/ul/li//div[3]/a/@href')
    url_list = map(uniform_url, url_list)
    return url_list


def get_func(href):
    """
    :param href: the tag API url, e.g. 'https://c.3.cn/product/tag?skuIds=1361956'
    :return: an OrderedDict mapping each function tag name to its rounded value/avg ratio.
    """
    text = json.loads(get_text(href))
    details = OrderedDict()
    try:
        for item in text['items'][0]['detail']:
            details[item['name']] = round(item['value'] / item['avg'], 2)
        return details
    except (IndexError, KeyError):
        return None


def get_comment(href):
    """
    :param href: the comment API url
    :return: the review statistics of the air conditioner.
    """
    text = json.loads(get_text(href))
    try:
        comment_count = text['productCommentSummary']['commentCount']  # total number of reviews
        good_rate_show = text['productCommentSummary']['goodRateShow']  # rating shown on the page
        hot_comment_tag_statistics = text['hotCommentTagStatistics']  # hot review tags
        details = OrderedDict()
        for item in hot_comment_tag_statistics:
            details[item['name']] = item['count']
        return comment_count, good_rate_show, details
    except (KeyError, IndexError):
        return None


def get_price(href):
    """
    :param href: the price API url
    :return: the price of the air conditioner
    """
    try:
        text = json.loads(get_text(href))
        return text[0]['p']
    except (IndexError, KeyError):
        return ''


def parse_url(href):
    """
    :param href: the item subpage url
    :return: None; the detailed information (title, price, specs, function scores, and comments) is appended to infos.txt
    """
    html = etree.HTML(get_text(href))
    title = html.xpath('//div[@class="sku-name"]/text()')[0].strip()
    if title == '':
        title = html.xpath('//div[@class="sku-name"]//text()')[-1].strip()
    print(f"正在获取{title}")
    description = dict()
    dl_list = html.xpath('//div[@class="Ptable"]//dl[@class="clearfix"]')
    for dl in dl_list:
        try:
            key = dl.xpath('./dt/text()')[0].strip()
            value = dl.xpath('./dd/text()')[0].strip()
            if len(value) == 0:
                value = dl.xpath('./dd[2]/text()')[0].strip()
            description[key] = value
        except IndexError:
            traceback.print_exc()
    # print(description)

    try:
        index = re.findall(r'\d+', href)[0]  # the sku id taken from the item url
        functions = dec('aHR0cHM6Ly9jLjMuY24vcHJvZHVjdC90YWc/c2t1SWRzPXt9').format(index)
        function = get_func(functions)
        prices = dec('aHR0cHM6Ly9wLjMuY24vcHJpY2VzL21nZXRzPyZza3VJZHM9Sl97fQ==').format(index)
        price = get_price(prices)
        comments = dec('aHR0cHM6Ly9zY2x1Yi5qZC5jb20vY29tbWVudC9wcm9kdWN0UGFnZUNvbW1lbnRzLmFjdGlvbj9wcm9kdWN0SWQ9e30mc2NvcmU9MCZzb3J0VHlwZT01JnBhZ2U9MCZwYWdlU2l6ZT0xMCZpc1NoYWRvd1NrdT0wJmZvbGQ9MQ==').format(index)
        comment_count, good_rate_show, details = get_comment(comments)

        with open('infos.txt', 'a', encoding='utf-8') as file:
            file.write("标题:{}\n".format(title))
            file.write("链接:{}\n".format(href))
            file.write("价格:{}\n".format(price))
            file.write("评价总数:{}\n".format(comment_count))
            file.write("总体评分:{}\n".format(good_rate_show))
            file.write('\n功能:\n')
            for key, value in description.items():
                file.write("{}: {}\n".format(key, value))
            file.write("功能评分:{}\n".format(function))
            file.write("热评词频:{}\n".format(details))
            file.write('=' * 120 + '\n')
    except (IndexError, TypeError):
        traceback.print_exc()
        print('wrong url: {}'.format(href))


if __name__ == '__main__':
    if os.path.exists('infos.txt'):
        os.remove('infos.txt')

    url = dec('aHR0cHM6Ly9zZWFyY2guamQuY29tL3NlYXJjaD9rZXl3b3JkPSVFNiVBMCVCQyVFNSU4QSU5QiVFNyVBOSVCQSVFOCVCMCU4MyZlbmM9dXRmLTgmZXY9ZXhicmFuZF8lRTYlQTAlQkMlRTUlOEElOUIlRUYlQkMlODhHUkVFJUVGJUJDJTg5JTVFNTM0MV8xMjMxMTElNUUzNjgwXzIxMyU1RQ==')

    for item_url in get_urls(url):
        parse_url(item_url)
        time.sleep(1)

The code is looking better and better, isn't it~~ hehe

The generated txt file is shown in the screenshot below; the information should be fairly complete:
[Figure: screenshot of the generated infos.txt]
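
For reference, each record that parse_url writes to infos.txt has roughly the following layout (the field labels come from the file.write calls above; the angle-bracket values are placeholders, not real scraped data):

标题:<product title>
链接:<item url>
价格:<price>
评价总数:<number of reviews>
总体评分:<good-rate score>

功能:
<spec name>: <spec value>
...
功能评分:OrderedDict([('<function name>', <value/avg score>), ...])
热评词频:OrderedDict([('<hot tag>', <count>), ...])
============================================================ (a 120-character separator)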
