scrapy多进程爬取数据

好长时间没有更新了,今天更新一次!

因为工作原因,一直在使用pyspider框架,有半年没有用过scrapy框架了,知识点也忘记了很多。今天写了一个多进程爬取APP图片及时间信息的脚本,供大家参考!

import re, random, time, json, requests, datetime, os
from pyquery import PyQuery as pq
from multiprocessing import Pool
# detail_page fetches the content of one detail page. Other parsing packages
# can be substituted here, depending on the payload being scraped.
def detail_page(page_url, timeout=30):
    """Fetch one detail page and upload one record per image it lists.

    The page is expected to be a JSON document with an ``info`` object whose
    ``image_list`` field is itself a JSON-encoded list of image URLs. For
    every image a record is POSTed to the collection endpoint; records whose
    upload fails at the network level are appended, one JSON document per
    line, to ``wanka_error`` in the current working directory.

    Args:
        page_url: URL of the detail-page API endpoint.
        timeout: Seconds to wait on each HTTP request before giving up.
            Without it a stalled connection would block the worker forever.

    Raises:
        Any error raised while fetching or decoding the detail page itself
        (request failure, invalid JSON, missing keys) propagates to the
        caller. Upload failures are recorded in the error file instead.
    """
    res = requests.get(page_url, timeout=timeout)
    info = res.json()['info']
    # 'image_list' is a JSON string embedded inside the JSON response.
    images = json.loads(info['image_list'])
    if not images:
        return

    # These values depend only on page-level fields, so build them once
    # instead of re-parsing the article HTML for every image.
    client_date = get_date2(info['mtime'])
    title = info['title']
    article_text = pq(info['content'])('p').text()

    for pic_url in images:
        data = {
            # Unix timestamp (whole seconds) followed by five random digits.
            "pid": str(int(time.time())) + str(random.randint(10000, 99999)),
            "task_id": 257609,
            "clue_id": 437389,
            "clue_name": '玩咖',
            "company_id": 230433,
            "url": page_url,
            "pic_url": pic_url,
            "client_date": client_date,
            "url_article_title": title,
            "url_article": article_text,
            "is_cover": 0,
        }
        payload = json.dumps({'resource': data})
        try:
            requests.post('http://shijue.qingapi.cn/task_python/start',
                          data={"data": payload}, timeout=timeout)
        except requests.RequestException:
            # Keep the record so a failed upload can be replayed later.
            error_path = os.path.join(os.path.abspath('.'), 'wanka_error')
            with open(error_path, 'a') as f:
                f.write(payload)
                f.write('\n')
        print(data)
# list_page1 pages through the listing of every app channel and passes each
# detail-page URL it finds to detail_page.
_API_ROOT = 'http://data.gm825.com/api/'

# Device/client query strings appended verbatim to every request URL.
_HUAWEI_QUERY = ('&cuid=FA2B688F603E1C48EE93CB8291D5A0D5&svr=2.0.0.5&vcode=20'
                 '&ovr=8.0.0&device=HUAWEI_FRD-AL00&app_id=h9999j'
                 '&channel_id=90001b&client_id=UKiUuZkYs%2BYRgWJPphEM7w%3D%3D')
_MEIZU_QUERY = ('&cuid=E8F10259E26C5509EBCFDC46459812DB&svr=2.0.0.5&vcode=20'
                '&ovr=7.1.2&device=Meizu_M6+Note&app_id=h9999j'
                '&channel_id=90001a&client_id=%2BbVBZZb5aMV2hD%2FXTvehaQ%3D%3D')

# (channel name, detail endpoint, id key in listing entries, query string,
#  whether to print the page number while crawling)
_CHANNELS = (
    ('recommendation', 'article', 'module_id', _HUAWEI_QUERY, True),
    ('mha', 'article', 'id', _MEIZU_QUERY, False),
    ('gallery', 'gallery', 'id', _MEIZU_QUERY, False),
    ('mixture', 'article', 'id', _MEIZU_QUERY, False),
)


def _crawl_listing_page(channel, detail_kind, id_key, query, page, timeout):
    """Fetch one listing page of *channel* and process each entry on it."""
    list_url = _API_ROOT + 'channel/' + channel + '?pn=' + str(page) + query
    try:
        entries = requests.get(list_url, timeout=timeout).json()['list']
    except (requests.RequestException, ValueError, KeyError, TypeError) as exc:
        # One bad listing page must not abort the remaining pages/channels.
        print('listing page skipped: %s (%r)' % (list_url, exc))
        return

    for entry in entries or ():
        try:
            page_url = (_API_ROOT + detail_kind + '/getbyid?id='
                        + str(entry[id_key]) + query)
            detail_page(page_url)
        except Exception as exc:
            # Best effort per entry: report the failure and move on rather
            # than silently discarding it.
            print('detail page skipped: %r (%r)' % (entry, exc))


def list_page1(pid, page_count=100, timeout=30):
    """Crawl a span of listing pages in every channel.

    Channels are processed one after another (recommendation, mha, gallery,
    mixture); within each channel pages ``pid`` .. ``pid + page_count - 1``
    are fetched in order and every entry's detail URL is handed to
    ``detail_page``. Failures on a listing page or a single entry are
    printed and skipped so the rest of the span is still crawled.

    Args:
        pid: First page number of the span.
        page_count: Number of consecutive pages to crawl per channel.
        timeout: Seconds to wait on each listing request.
    """
    for channel, detail_kind, id_key, query, echo_page in _CHANNELS:
        for page in range(pid, pid + page_count):
            if echo_page:
                print(page)
            _crawl_listing_page(channel, detail_kind, id_key, query, page,
                                timeout)
def _report_worker_error(exc):
    """Print an exception raised by a pool task (invoked in the parent)."""
    print('list_page1 task failed: %r' % (exc,))


if __name__ == '__main__':
    # Pool of 10 worker processes. Each task crawls a 100-page span starting
    # at i * 100, so the 60 tasks cover pages 1000 through 6999.
    with Pool(10) as pool:
        for i in range(10, 70):
            # Results are never read, so without error_callback an exception
            # in a worker would disappear silently.
            pool.apply_async(list_page1, args=(i * 100,),
                             error_callback=_report_worker_error)
        # Leaving the with-block terminates the pool, so wait for all tasks
        # to finish first.
        pool.close()
        pool.join()

代码就这些,如果有不懂的朋友可以加我Q353061949,我会给你讲解哦!

你可能感兴趣的:(python)