Python 爬取汽车之家口碑数据

环境:

Win10, CentOS 7.4
python3.6.1
pycharm2017
retrying=1.3.3
requests=2.22.0
fake_useragent

抓包获取口碑数据接口:

  • 车系口碑数据列表
    # 口碑数据接口 ss:车系ID, p:页数, s:一页返回数据个数最多50
    https://*****.com.cn/autov9.1.0/alibi/seriinos-ss3170-st0-p112-s50-isstruct0.json
    
  • 口碑详细数据接口
    # 口碑详细数据接口 eid=3052096 口碑详情页ID 
    https://*****.com.cn/autov9.1.0/alibi/NeEaltionInfo.ashx?eid=
    

第一步 获取所有车型数据:

    def get_model(self, url):
        """Yield the ID of every car series listed at ``url``.

        The endpoint is expected to answer with a GBK-encoded JavaScript
        assignment of the form ``var listCompare$100= [...];``. The wrapper
        is removed and the remaining JSON is parsed.

        Args:
            url: Address of the model listing, passed to ``self._parse_url``.

        Yields:
            The ``'I'`` value (series ID) of each entry in every group's
            ``'List'``.
        """
        response = self._parse_url(url)
        content = response.content.decode('GBK')  # payload is GBK-encoded
        # Strip only the JavaScript wrapper: the leading assignment and the
        # trailing statement terminator. Semicolons that occur inside the
        # JSON payload itself must be left untouched.
        content = content.replace('var listCompare$100= ', '', 1)
        content = content.strip().rstrip(';')
        for group in json.loads(content):
            for series in group['List']:
                yield series['I']

第二步 获取车型口碑ID列表:

    def get_eid(self, url, car):
        """Yield the review (koubei) IDs of one car series, page by page.

        Args:
            url: Review-list URL. Its ``-p<N>-`` segment (page number) is
                rewritten to request successive pages, starting at page 1.
                If the URL has no such segment, a single request is made.
            car: Series identifier; used only in log messages.

        Yields:
            The ``'Koubeiid'`` of every entry in each page's
            ``result.list``. Iteration stops at the first empty page or
            when a request / JSON decode fails.
        """
        import re

        log_init().info(f'车系:{car} 口碑数据获取中...')
        page_segment = re.compile(r'-p\d+-')
        # Without a page segment every request would return the same page,
        # so pagination is only attempted when the segment is present.
        paginated = page_segment.search(url) is not None
        p = 1
        while True:
            page_url = page_segment.sub(f'-p{p}-', url, count=1)
            try:
                response = self._parse_url(page_url).json()
            except Exception as exc:
                # Best effort: stop this series, but leave a trace of why.
                log_init().info(f'车系:{car} 口碑列表获取失败: {exc!r}')
                return
            # 'result' may be absent or null; treat that as an empty page.
            koubeis = (response.get('result') or {}).get('list')
            if not koubeis:
                log_init().info(f'车系:{car}车型口碑ID列表获取完成。')
                return
            yield from [item['Koubeiid'] for item in koubeis]
            if not paginated:
                log_init().info(f'车系:{car}车型口碑ID列表获取完成。')
                return
            p += 1

第三步 解析口碑详情数据:

    def get_content(self, cars, eid):
        """Fetch one review's detail record and flatten it into a row.

        Args:
            cars: Value stored as the third column of the row (the series
                being crawled, as supplied by the caller).
            eid: Review ID appended to ``self.NewEvaluationUrl``.

        Returns:
            A one-element list containing the row (a list of 23 fields), or
            ``None`` when the response carries no ``result``. Fields that
            are missing from the response are ``None``; a missing purpose
            list yields an empty string.
        """
        url = f'{self.NewEvaluationUrl}{eid}'
        log_init().info(f'{url} 数据获取中...')
        response = self._parse_url(url).json()
        result = response.get('result')
        if not result:
            log_init().info(f'{eid}无数据!')
            return

        def scene_score(key):
            # A missing or null scene block yields None instead of raising
            # AttributeError on the chained lookup.
            return (result.get(key) or {}).get('score')

        specid = result.get('specid')  # model (spec) ID
        userId = result.get('userId')  # user ID
        userName = result.get('userName')  # user name
        specname = result.get('specname')  # purchased model
        boughtprovincename = result.get('boughtprovincename')  # purchase province
        dealername = result.get('dealername')  # dealer
        boughtdate = result.get('boughtdate')  # purchase date
        boughtPrice = result.get('boughtPrice')  # bare-car purchase price
        actualOilConsumption = result.get('actualOilConsumption')  # fuel consumption
        drivekilometer = result.get('drivekilometer')  # distance driven so far
        spaceScene = scene_score('spaceScene')  # space
        powerScene = scene_score('powerScene')  # power
        maneuverabilityScene = scene_score('maneuverabilityScene')  # handling
        oilScene = scene_score('oilScene')  # fuel economy
        comfortablenessScene = scene_score('comfortablenessScene')  # comfort
        apperanceScene = scene_score('apperanceScene')  # exterior
        internalScene = scene_score('internalScene')  # interior
        costefficientScene = scene_score('costefficientScene')  # value for money
        # Purchase purposes, comma-joined; an absent list becomes ''.
        purpose = ','.join(i['purposename'] for i in result.get('purpose') or [])

        brandname = result.get('brandname')  # brand name
        seriesname = result.get('seriesname')  # series name
        boughtcityname = result.get('boughtcityname')  # purchase city

        data = [[userId, userName, cars, specid, brandname, seriesname, specname, boughtprovincename, boughtcityname, dealername,
                 boughtdate, boughtPrice, actualOilConsumption, drivekilometer, spaceScene, powerScene,
                 maneuverabilityScene, oilScene, comfortablenessScene, apperanceScene, internalScene,
                 costefficientScene, purpose]]
        return data

第四步 多线程启动:

    @run_time
    def main(self, num):
        """Program entry point: crawl every not-yet-recorded car series.

        Each series is handed to ``self.run`` through a worker pool. Series
        for which ``self.keep_records(..., vali=True)`` is truthy are
        skipped. The call blocks until all submitted tasks have finished.

        Args:
            num: Pool size passed to ``Pool``.
        """
        pool = Pool(num)
        # NOTE(review): get_model as shown in this file takes a ``url``
        # argument, but none is passed here — confirm against the full class.
        for car in self.get_model():
            # Skip series that were already fetched.
            if self.keep_records(str(car), vali=True):
                log_init().info(f'{car} 已获取跳过!')
                continue
            # Without an error callback, an exception raised inside
            # self.run is stored on the unread AsyncResult and lost.
            pool.apply_async(
                self.run,
                (car,),
                error_callback=lambda exc, car=car: log_init().info(
                    f'{car} 获取失败: {exc!r}'),
            )

        pool.close()
        pool.join()

运行结果:

在这里插入图片描述

在这里插入图片描述

本文仅供学习交流使用,如侵立删!
企鹅 、WX: 1033383881


你可能感兴趣的:(Python 爬取汽车之家口碑数据)