淘宝评论的爬取

文章禁止转载,违者必究

淘宝网页的评论数据是动态加载的,首先要关闭 adblock 等广告屏蔽插件。另外我登录了账号。

 
 宝贝详情 
 
  • 累计评论 6050
  •   
     
     
    li id="J_ServiceTab">   
    专享服务  
    
    淘宝评论的爬取_第1张图片
    image.png

    通过firefox浏览器,终于找到了评论的url,如下。

    淘宝评论的爬取_第2张图片
    image.png

    评论的url

    这几个url在html源代码中可以找到。

       
        描述加载中
        
        

    找到了评论的url其内容为

    jsonp_tbcrate_reviews_list({
        "qnaDisabled": true,
        "watershed": 100,
        "search": "http://11.178.153.195:30051/bin/sp?app=pinglun&outfmt=json&seek_timeout=400&gmt_create=1521907200~&risk_status=0|-1&rate_risk_limit=0&item_id=556129616183&user_id=2970747408&risk_time_now=1537497213&layer_quota=500000&rate_risk_search=1&fold_flag=0&no_risk_status=0|-1&rate_risk_own=0&order=algo_sort:des&s=0&n=20&is_wireless=0&user_id=2970747408&utd_id=2970747408&is_click_sku=0",
        "total": 467,
        "comments": [{
            "date": "2018年09月16日 12:32",
            "shareInfo": {
                "lastReplyTime": "",
                "share": false,
                "pic": 0,
                "reply": 0,
                "userNumIdBase64": ""
            },
            "showDepositIcon": false,
            "o2oRate": null,
            "mainTradeId": 0,
            "raterType": 0,
            "validscore": 1,
            "video": null,
            "photos": [],
            "content": "小巧精致,挺好意思,孩子再大点就可以玩了。带上壳风扇,温度在40多度,装好系统直接在电脑上开ssh加wifi,不用再加显示器键盘鼠标,通电直接连上,爽撒",
            "rateId": 1017227150661,
            "spuRatting": [],
            "auction": {
                "thumbnail": "",
                "link": "//item.taobao.com/item.htm?id=556129616183",
                "auctionPic": "//img.alicdn.com/bao/uploaded/null_40x40.jpg",
                "sku": "套餐:单独主板  树莓派:3B+ E14 新款",
                "title": "",
                "aucNumId": "556129616183"
            },
            "award": "",
            "rate": "1",
            "creditFraudRule": 0,
            "appendCanExplainable": false,
            "from": "",
            "tag": "",
            "propertiesAvg": "0.0",
            "reply": null,
            "dayAfterConfirm": 0,
            "lastModifyFrom": 0,
            "bidPriceMoney": {
                "amount": 243,
                "centFactor": 100,
                "cent": 24300,
                "displayUnit": "元",
                "currency": {
                    "symbol": "¥",
                    "displayName": "人民币",
                    "currencyCode": "CNY",
                    "defaultFractionDigits": 2,
                    "numericCode": 156
                },
                "currencyCode": "CNY"
            },
            "noQna": true,
            "promotionType": "活动促销 ",
            "vicious": "",
            "enableSNS": false,
            "appendList": [],
            "buyAmount": 1,
            "showCuIcon": true,
            "serviceRate": null,
            "useful": 0,
            "user": {
                "nick": "l***3",
                "vipLevel": 0,
                "displayRatePic": "b_blue_2.gif",
                "nickUrl": "",
                "anony": false,
                "rank": 874,
                "avatar": "//wwc.alicdn.com/avatar/getAvatar.do?userIdStr=Xmhzv8xLX8cbP0lhOmcSPmcLOH8LPG80XFR-P8c4PGMhMmgSPFx4vCIuvGgyOFvW&width=40&height=40&type=sns",
                "vip": "",
                "userId": "",
                "rankUrl": ""
            },
            "append": null,
            "status": 0
        }, 
    

    ...................

    其返回的东西是json传输的。

    json.loads
    json.loads 用于解码 JSON 数据,该函数返回对应的 Python 对象(如 dict、list)。
    json.loads(s[, encoding[, cls[, object_hook[, parse_float[, parse_int[, parse_constant[, object_pairs_hook[, **kw]]]]]]]])

    根据上面得到comments后面的[]即列表类型,[]里有多个{}信息,每个{}代表一个用户的评论信息

    通过json.loads()后变成字典,取出需要的东西。

    下面就是构造所有的url
    下面是第 1、2 页评论的 URL,主要是 currentPageNum 参数在变化,据此即可构造出所有需要的网址。

    https://rate.taobao.com/feedRateList.htm?auctionNumId=556129616183&userNumId=143813255&currentPageNum=1&pageSize=20&rateType=&orderType
    
    https://rate.taobao.com/feedRateList.htm?auctionNumId=556129616183&userNumId=143813255&currentPageNum=2&pageSize=20&rateType=&orderType
    

    相关的code如下

    # -*- coding: utf-8 -*-
    """
    Created on Fri Sep 21 11:36:31 2018

    @author: dflx
    """

    # Standard library (duplicate `import os` removed).
    import json
    import os
    import re
    import time

    # Third-party.
    import requests
    import xlrd
    import xlwt
    import pandas as pd
    from bs4 import BeautifulSoup
    
    
    # Fetch the body of a URL while impersonating a desktop browser.
    def get_html(url, timeout=10):
        """Return the text body of *url*.

        A browser-like User-Agent / Referer is sent because Taobao serves
        different content to obvious bots.

        Parameters
        ----------
        url : str
            Address to fetch.
        timeout : float, optional
            Seconds to wait before giving up (new, defaulted parameter --
            the original call had no timeout and could block forever).

        Returns
        -------
        str
            Decoded response body (requests' guessed encoding).
        """
        header = {
            'Accept':'application/json, text/plain, */*',
            'Accept-Language':'zh-CN,zh;q=0.3',
            'Referer':'https://item.taobao.com/item.htm',
            'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
            'Connection':'keep-alive',
        }

        # NOTE(review): verify=False disables TLS certificate checking; kept
        # for backward compatibility, but prefer verify=True when possible.
        response = requests.get(url, headers=header, verify=False, timeout=timeout)
        return response.text
    
    def getJson(url):
        """Download one page of comments and flatten it into a list.

        The endpoint returns JSON wrapped in parentheses (JSONP-style), so
        surrounding whitespace and '(' / ')' are stripped before json.loads().

        Returns
        -------
        list
            Flat list with 5 consecutive items per comment:
            nick, date, sku, price amount, comment text -- i.e.
            len(result) == 5 * number_of_comments.  Empty list when the
            page has no comments (or the 'comments' key is missing, which
            previously raised KeyError).
        """
        data = get_html(url).strip().strip('()')
        comments = json.loads(data).get('comments') or []

        elem_list = []
        for elem in comments:
            # Keep only the fields the spreadsheet needs, in column order.
            elem_list.extend([
                elem["user"]['nick'],
                elem['date'],
                elem['auction']["sku"],
                elem["bidPriceMoney"]['amount'],
                elem['content'],
            ])
        return elem_list
    
    
    # Write a single page of comments to an Excel file.
    #
    # CAVEAT (the pitfall mentioned at the end of the article): every call
    # creates a brand-new xlwt.Workbook() and saves it to the same path, so
    # repeated calls OVERWRITE the previous pages even though `row` keeps
    # counting up.  writeAll() below keeps one workbook open instead.
    row = 0  # next free row index, shared across calls via `global row`
    def write(information):
        """Write *information* (flat list, 5 fields per comment) to E:\\informa.xls.

        Returns early without saving when the list is empty.
        """
        path="E:\\informa.xls"
        global row
        column = 0  # current column (0..4)
        count = 0   # cells written so far, used to detect a full 5-cell row
        file = xlwt.Workbook()
        sheet1 = file.add_sheet('1')
        t = len(information)
        if t == 0:
            return
        print(information)
        # BUGFIX: t // 5 -- the original t / 5 printed a float (e.g. 4.0)
        # under Python 3.
        print(" 一共有", 5, '列 ', t // 5, '行')

        for j in range(t):
            sheet1.write(row, column, information[j])
            count = count + 1
            column = column + 1
            if count % 5 == 0:  # finished one comment -> start a new row
                row = row + 1
                column = 0
        file.save(path)
        print('successful')
           
    
    
    
    def writeAll(num):
        """Scrape comment pages 1..num and write them all into one workbook.

        A single xlwt workbook stays open for the whole run and is saved
        after every page, so earlier pages are preserved (unlike write(),
        which recreates the file on each call).

        Parameters
        ----------
        num : int
            Number of comment pages to fetch.
        """
        # BUGFIX: the URL previously contained the mojibake '¤tPageNum' --
        # the HTML entity '&curren;' had swallowed '&current'.  Restored.
        first = "https://rate.taobao.com/feedRateList.htm?auctionNumId=556129616183&userNumId=143813255&currentPageNum="
        path = "E:\\informa.xls"
        column = 0  # current column (0..4)
        count = 0   # total cells written; multiple of 5 => row complete
        file = xlwt.Workbook()
        sheet1 = file.add_sheet('1')

        # Header row (column titles must match getJson()'s field order).
        elem_list = ["user", "date", "型号", "money", "评论"]
        for index in range(len(elem_list)):
            sheet1.write(0, index, elem_list[index])
        row = 1  # data starts below the header

        n = 0  # BUGFIX: pre-bind so the final print cannot raise NameError when num < 1
        for n in range(1, num + 1):
            try:
                url = first + str(n) + "&pageSize=20"
                print(url)
                infor = getJson(url)
                t = len(infor)
                if t == 0:
                    # Empty page: assume we ran past the last page of comments.
                    return
                for j in range(t):
                    sheet1.write(row, column, infor[j])
                    count = count + 1
                    column = column + 1
                    if count % 5 == 0:  # finished one comment -> next row
                        row = row + 1
                        column = 0
                # Save after every page so a crash loses at most one page.
                file.save(path)

            except Exception as err:
                # BUGFIX: narrowed from a bare `except:` (which also swallowed
                # KeyboardInterrupt/SystemExit) and report the actual cause.
                print("has error", err)
                continue
        print(str(n) + "-------------页写入成功了,")
    
    
    def main():
        """Entry point: visit the item page, then scrape 30 pages of comments."""
        url0 = "https://item.taobao.com/item.htm?spm=a230r.1.14.20.6e20434b4CzHZ3&id=556129616183&ns=1&abbucket=9#detail"
        # Result unused; the fetch is kept so the request sequence mimics a
        # real browser visit (item page first, then the rate endpoint).
        html = get_html(url0)

        # BUGFIX: '&currentPageNum' restored -- the pasted URL had the
        # mojibake '¤tPageNum' (HTML entity '&curren;' ate '&current').
        url1 = "https://rate.taobao.com/feedRateList.htm?auctionNumId=556129616183&userNumId=143813255&currentPageNum=1&pageSize=20"
        html1 = get_html(url1)

        writeAll(30)
    

    遇到了一个坑:写 Excel 时如果每页单独写一次,后一次写入会覆盖前一次的文件。最后改为在一个工作簿里整体写入。

    淘宝评论的爬取_第3张图片
    image.png

    参考文章如下
    Python爬虫 获得淘宝商品评论
    Python爬取淘宝商品详情页数据
    通过Python抓取天猫评论数据
    Python xlrd、xlwt 用法说明

    你可能感兴趣的:(淘宝评论的爬取)