aiohttp与asyncio库爬取汽车销量排行榜

本次爬取采用异步请求(aiohttp + asyncio)的方式,抓取 http://db.auto.sohu.com/cxdata/ 的数据,目标是提取每个汽车品牌下每个车型在每个月份的销量。数据是通过 ajax 加载的,需要抓包才能拿到真实的接口地址;难点在于各级数据(品牌、车型、月销量)之间的对应关系,以及整体爬取思路的设计。代码放在 https://github.com/dongxun1/The-Cars-Sales-Nums ,其中包含自定义的 MySQL 存储模块、一个配置文件以及采集函数所在的 cars.py;详细的提取步骤和思路写在 README.md 中。

主代码:

from Test.Homework.configs import *
import aiohttp
import asyncio
import datetime
import re
import pymongo
from Test.Homework.configs_mysql import MYSQL
print(datetime.datetime.today(), '开始采集')


class AioCar(object):
    """Asynchronous scraper for the Sohu Auto sales XML endpoints.

    The crawl is a three-level walk: the brand list yields brand ids, each
    brand document yields the brand name and the ids of its car models, and
    each model document yields that model's sales figures per date.
    """

    # Compiled once; raw strings avoid the invalid "\d" escape warning.
    _ID_RE = re.compile(r'id="(\d+)"', re.S)
    _BRAND_NAME_RE = re.compile(r'brand name="(.*?)"', re.S)
    _NAME_RE = re.compile(r'name="(.*?)"', re.S)
    _SALES_RE = re.compile(r'date="(.*?)" salesNum="(\d+)"', re.S)

    def __init__(self):
        # Endpoint listing every brand id. A brand id is only the top level:
        # each brand id maps to several car models.
        self.url_1 = 'http://db.auto.sohu.com/cxdata/xml/basic/brandList.xml'

    @staticmethod
    async def _fetch_text(url, *encodings):
        """Download ``url`` and return the decoded response body.

        :param url: address to request with HTTP GET.
        :param encodings: encodings to try in order; a ``UnicodeError`` moves
            on to the next one, and the error of the last one propagates.
            With no encodings given, aiohttp picks the encoding itself.
        :return: the response body as ``str``.
        """
        candidates = encodings or (None,)
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as resp:
                for encoding in candidates[:-1]:
                    try:
                        return await resp.text(encoding=encoding)
                    except UnicodeError:
                        continue
                return await resp.text(encoding=candidates[-1])

    @classmethod
    def _parse_ids(cls, text):
        """Return every numeric ``id="..."`` attribute value found in ``text``."""
        return cls._ID_RE.findall(text)

    @classmethod
    def _parse_brand_name(cls, text):
        """Return the first ``brand name="..."`` value; IndexError if absent."""
        return cls._BRAND_NAME_RE.findall(text)[0]

    @classmethod
    def _parse_sales(cls, text):
        """Return ``[name, date, sales]`` rows, or ``[]`` if nothing usable."""
        rows = cls._SALES_RE.findall(text)
        if not rows:
            return []
        name_match = cls._NAME_RE.search(text)
        if name_match is None:
            return []
        model_name = name_match.group(1)
        return [[model_name, sale_date, sale_nums] for sale_date, sale_nums in rows]

    async def have_brand_id(self, url):
        """
        Fetch the list of brand ids.

        :param url: brand-list address; ``self.url_1`` is used when it is empty.
        :return: list of brand id strings (digits only).
        """
        # The ids must stay separate list items: joining them into one string
        # would make the caller iterate over single characters.
        return self._parse_ids(await self._fetch_text(url or self.url_1))

    async def have_brand_name(self, url):
        """
        Fetch the brand name from a brand document such as
        http://db.auto.sohu.com/cxdata/xml/basic/brand145ModelListWithCorp.xml

        :param url: brand document address (decoded as GBK).
        :return: the first ``brand name`` attribute value in the document.
        :raises IndexError: when the document contains no brand name.
        """
        return self._parse_brand_name(await self._fetch_text(url, "GBK"))

    async def have_ids(self, url):
        """
        Fetch the model ids from the same brand document. These are the ids
        that identify individual car models, so they must be kept paired with
        the brand they were read from.

        :param url: brand document address (decoded as gbk).
        :return: list of id strings (digits only).
        """
        return self._parse_ids(await self._fetch_text(url, "gbk"))

    async def have_leardboard_message(self, url):
        """
        Fetch the sales rows of one car model.

        :param url: model sales document address; decoded as GBK, falling back
            to utf-8 when GBK decoding fails.
        :return: list of ``[name, date, sales_num]`` string triples; an empty
            list when the document has no sales entries or no name attribute.
        """
        return self._parse_sales(await self._fetch_text(url, "GBK", "utf-8"))

    @staticmethod
    def save_message_to_mongodb(data):
        """
        Best-effort insert of ``data`` into the configured MongoDB collection.

        Relies on the module-level names ``db`` and ``MONGO_COLLECTION``.
        Any failure is printed rather than raised, so one bad record does not
        stop the crawl.

        :param data: a single document (dict) or a list/tuple of documents.
        """
        try:
            collection = db[MONGO_COLLECTION]
            # Collection.insert() was deprecated in PyMongo 3.0 and removed in
            # 4.0; insert_one / insert_many are its replacements.
            if isinstance(data, (list, tuple)):
                collection.insert_many(data)
            else:
                collection.insert_one(data)
        except Exception as e:
            print(e.args)

    async def main(self):
        """
        Entry coroutine: fetch the brand ids.

        :return: a one-element list whose only item is the list of brand ids.
        """
        return await asyncio.gather(self.have_brand_id(self.url_1))


if __name__ == '__main__':
    # Crawl driver: brand ids -> (brand name, model ids) -> per-model sales,
    # storing every sales row in both MongoDB and MySQL.
    aio_car = AioCar()
    mysql = MYSQL()
    # asyncio.get_event_loop() is deprecated when no loop is running, so an
    # explicit loop is created here and closed when the crawl ends.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    # Brand document: carries the brand name and the ids of its models.
    url_2 = 'http://db.auto.sohu.com/cxdata/xml/basic/brand{}ModelListWithCorp.xml'
    # Model document: carries the sales figures of one model.
    url_3 = 'http://db.auto.sohu.com/cxdata/xml/sales/model/model{}sales.xml'
    client = pymongo.MongoClient(MONGO_URL)
    # Module-level name read by AioCar.save_message_to_mongodb.
    db = client[MONGO_DB]
    try:
        results = loop.run_until_complete(aio_car.main())
        for brand_id in results[0]:
            brand_url = url_2.format(brand_id)
            # The brand name is the first field of every stored record.
            brand_name = loop.run_until_complete(aio_car.have_brand_name(brand_url))
            model_ids = loop.run_until_complete(aio_car.have_ids(brand_url))
            for model_id in model_ids:
                sales_url = url_3.format(model_id)
                datas = loop.run_until_complete(
                    aio_car.have_leardboard_message(sales_url))
                for model_name, sale_date, sale_nums in datas:
                    item = {
                        'result': " ".join(
                            [brand_name, model_name, sale_date, sale_nums])
                    }
                    # NOTE(review): PyMongo adds an "_id" key to the inserted
                    # dict, so MySQL receives it too - confirm MYSQL.insert
                    # only reads "result".
                    aio_car.save_message_to_mongodb(item)  # save to MongoDB
                    mysql.insert(item)  # save to MySQL
    finally:
        # Release the database connection and the event loop even when the
        # crawl aborts with an exception.
        client.close()
        loop.close()


print('采集结束', datetime.datetime.today())





你可能感兴趣的:(爬虫异步,python高级)