爬取京东图书商品信息

关键之处在于页面上的评论数是动态加载的:页面先用每个商品标签上唯一的id号拼接出一个url,请求该url得到一个json文件,再把其中的评论数显示出来。
爬取京东图书商品信息_第1张图片
抓包寻找了许久之后发现了一个奇特的文件。
爬取京东图书商品信息_第2张图片
爬取京东图书商品信息_第3张图片
然后在源代码里面搜索,发现这个是商品的id号,于是提取这个id号并构造url,再请求该url并从返回的json中提取信息即可。
爬取京东图书商品信息_第4张图片
写了那么多天scrapy,换下口味~~滑稽

import csv
import json
import re
import time
import requests
from lxml import etree
from requests.exceptions import RequestException

# HTTP headers sent with every request: a desktop Chrome user-agent string.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
}

# Output CSV, opened at import time and shared with the functions below
# through the module-level ``writer``. newline='' lets the csv module control
# line endings itself.
fp = open('D:/京东图书.csv', mode='wt', encoding='utf-8', newline='')
writer = csv.writer(fp)
# Header row: title, price, comment count, shop, author, publisher,
# release date.
writer.writerow(['书名', '价格', '评论数', '经营店', '作者', '出版社', '发行时间'])


def get_html(url, timeout=10):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the crawl indefinitely.

    Returns:
        The body decoded with the default codec of ``bytes.decode`` (UTF-8),
        or ``None`` when the request fails, the status code is not 200, or
        the body cannot be decoded.
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        return None
    if response.status_code != 200:
        return None
    try:
        return response.content.decode()
    except UnicodeDecodeError:
        # Keep the "None means failure" contract instead of crashing the crawl.
        return None


def get_info(html):
    """Parse one search-result page and append every book found to the CSV.

    For each ``div.gl-i-wrap`` element the title, price, shop, author,
    publisher and release date are read from the markup; the comment count is
    fetched separately through ``get_comment_number``. Each complete record is
    written with the module-level ``writer`` and echoed to stdout.

    Args:
        html: Page source as text. ``None`` or an empty string (what
            ``get_html`` yields on failure) is accepted and ignored.
    """
    if not html:
        return
    selector = etree.HTML(html)
    if selector is None:
        return

    for good in selector.xpath("//div[@class='gl-i-wrap']"):
        try:
            price = good.xpath("div[@class='p-price']/strong/i/text()")[0] + '元'
            name = good.xpath("div[@class='p-name']/a/em/text()")[0]
            # The third "_"-separated piece of the comment link's id attribute
            # is the product id that the comment-count endpoint expects.
            comment_link = good.xpath("div[@class='p-commit']/strong/a/@id")[0]
            jquery_id = comment_link.split('_')[2]
            comment_number = get_comment_number(jquery_id)
            manufacturer = good.xpath("div[@class='p-shopnum']/*[@class='curr-shop']/text()")[0]
            author = good.xpath("div[@class='p-bookdetails']/span[@class='p-bi-name']/a/@title")[0]
            maker = good.xpath("div[@class='p-bookdetails']/span[@class='p-bi-store']/a/@title")[0]
            release_time = good.xpath("div[@class='p-bookdetails']/span[@class='p-bi-date']/text()")[0]
        except (IndexError, KeyError, TypeError, ValueError,
                RequestException) as exc:
            # Entries missing a field, or whose comment lookup failed, are
            # skipped. Only the expected failure types are caught, so that
            # programming errors are no longer hidden.
            print('skipped one item: {!r}'.format(exc))
            continue

        row = (name, price, comment_number, manufacturer, author, maker, release_time)
        writer.writerow(row)
        print(row)


def get_comment_number(jquery_id, timeout=10):
    """Fetch the comment-count string for one product.

    The response is expected to be a JSON document wrapped in a callback
    call, i.e. ``callback({...})``; the text between the first "(" and the
    last ")" is parsed as JSON.

    Args:
        jquery_id: Product id inserted into the ``referenceIds`` query
            parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The value of ``CommentsCount[0]['CommentCountStr']`` from the payload.

    Raises:
        requests.exceptions.RequestException: The HTTP request failed.
        ValueError: The body is not valid GBK, contains no parenthesised
            payload, or the payload is not valid JSON.
        KeyError, IndexError: The payload lacks the expected fields.
    """
    url = 'https://sclub.jd.com/comment/productCommentSummaries.action?referenceIds={}&callback=jQuery4085889&_=1550374407814'.format(
        jquery_id)
    data = requests.get(url, headers=headers, timeout=timeout).content.decode('gbk')
    # Raw string avoids the invalid "\(" escape; the greedy match runs to the
    # LAST ")" so a ")" inside a JSON string value cannot truncate the payload.
    match = re.search(r'\((.*)\)', data, re.S)
    if match is None:
        raise ValueError('no callback payload found in comment response')
    json_data = json.loads(match.group(1))
    return json_data['CommentsCount'][0]['CommentCountStr']


if __name__ == '__main__':
    # Crawl the search results for "machine learning", using the odd page
    # values 1, 3, ..., 33.
    # NOTE(review): presumably the site numbers full result pages with odd
    # values only — confirm.
    urls = ['https://search.jd.com/Search?keyword=machine%20learning&page={}'.format(i) for i in range(1, 34, 2)]
    try:
        for url in urls:
            html = get_html(url)
            if html is None:
                # get_html() signals failure with None; skip the page rather
                # than handing None to the parser.
                print('failed to fetch {}'.format(url))
            else:
                get_info(html)
            # Pause between pages to keep the request rate low.
            time.sleep(2)
    finally:
        # Flush and release the CSV file even if the crawl is interrupted.
        fp.close()
        

最好控制好爬取频率,反正我是爬了一遍想爬第二遍的时候不行了,估计ip被封了。

你可能感兴趣的:(爬虫,动态网页爬虫)