好长时间没有更新了,今天更新一次!
因为工作原因,我一直在使用pyspider框架,已经有半年没有用过scrapy框架了,很多知识点也忘记了。今天写了一个多进程爬取APP图片时间的脚本,供大家参考!
import datetime
import json
import os
import random
import re
import time
from multiprocessing import Pool

import requests
from pyquery import PyQuery as pq

# Root of the app's data API; every list and detail URL is built from it.
API_ROOT = 'http://data.gm825.com/api'
# Endpoint that receives one JSON record per scraped image.
UPLOAD_URL = 'http://shijue.qingapi.cn/task_python/start'
# File (in the current working directory) that collects records whose upload failed.
ERROR_LOG_NAME = 'wanka_error'
# Seconds to wait on any single HTTP request; without a timeout a stalled
# connection would block a pool worker forever.
REQUEST_TIMEOUT = 30
# Number of consecutive list pages crawled per channel by one list_page1() call.
PAGES_PER_TASK = 100

# Client identification query strings sent with every API request.
_HUAWEI_QUERY = (
    'cuid=FA2B688F603E1C48EE93CB8291D5A0D5&svr=2.0.0.5&vcode=20&ovr=8.0.0'
    '&device=HUAWEI_FRD-AL00&app_id=h9999j&channel_id=90001b'
    '&client_id=UKiUuZkYs%2BYRgWJPphEM7w%3D%3D'
)
_MEIZU_QUERY = (
    'cuid=E8F10259E26C5509EBCFDC46459812DB&svr=2.0.0.5&vcode=20&ovr=7.1.2'
    '&device=Meizu_M6+Note&app_id=h9999j&channel_id=90001a'
    '&client_id=%2BbVBZZb5aMV2hD%2FXTvehaQ%3D%3D'
)

# One entry per app section, crawled in this order:
# (channel name, detail resource, key of the item id in a list entry, query string)
CHANNELS = (
    ('recommendation', 'article', 'module_id', _HUAWEI_QUERY),
    ('mha', 'article', 'id', _MEIZU_QUERY),
    ('gallery', 'gallery', 'id', _MEIZU_QUERY),
    ('mixture', 'article', 'id', _MEIZU_QUERY),
)


def _log_failed_upload(payload):
    """Append a JSON record that could not be uploaded to the error file."""
    filename = os.path.join(os.path.abspath('.'), ERROR_LOG_NAME)
    with open(filename, 'a') as f:
        f.write(payload)
        f.write('\n')


def detail_page(page_url):
    """Fetch one detail page and upload a record for each image it lists.

    The detail response is parsed as JSON; its ``info.image_list`` field is
    itself a JSON-encoded list of image URLs. One record per image is posted
    to ``UPLOAD_URL``. If the POST raises a network error, the serialized
    record is appended to the ``wanka_error`` file instead of being lost.
    Every record is printed after the upload attempt.

    Args:
        page_url: Full URL of the detail API endpoint for one article/gallery.

    Raises:
        requests.RequestException: If the detail page cannot be fetched.
        ValueError, KeyError: If the response is not the expected JSON shape.
    """
    res = requests.get(page_url, timeout=REQUEST_TIMEOUT)
    info = res.json()['info']
    image_urls = json.loads(info['image_list'])
    if not image_urls:
        return

    # These values are identical for every image on the page, so compute them once.
    # NOTE(review): get_date2 is not defined in the visible source; it must be
    # provided elsewhere, otherwise this line raises NameError.
    client_date = get_date2(info['mtime'])
    title = info['title']
    article_text = pq(info['content'])('p').text()

    for pic_url in image_urls:
        data = {
            # Record id: current Unix second followed by 5 random digits.
            "pid": str(time.time()).split('.')[0] + str(random.randint(10000, 99999)),
            "task_id": 257609,
            "clue_id": 437389,
            "clue_name": '玩咖',
            "company_id": 230433,
            "url": page_url,
            "pic_url": pic_url,
            "client_date": client_date,
            "url_article_title": title,
            "url_article": article_text,
            "is_cover": 0,
        }
        payload = json.dumps({'resource': data})
        try:
            requests.post(UPLOAD_URL, data={"data": payload}, timeout=REQUEST_TIMEOUT)
        except requests.RequestException:
            # Keep the record so it can be re-submitted later.
            _log_failed_upload(payload)
        print(data)


def _crawl_channel_page(channel, resource, id_key, query, page):
    """Fetch one list page of a channel and process every entry's detail page."""
    list_url = '{root}/channel/{channel}?pn={page}&{query}'.format(
        root=API_ROOT, channel=channel, page=page, query=query)
    try:
        listing = requests.get(list_url, timeout=REQUEST_TIMEOUT).json()
    except (requests.RequestException, ValueError) as exc:
        # One unreachable or malformed list page must not abort the remaining pages.
        print('list page failed: {} ({!r})'.format(list_url, exc))
        return

    items = listing.get('list') if isinstance(listing, dict) else None
    for item in items or []:
        try:
            page_url = '{root}/{resource}/getbyid?id={mid}&{query}'.format(
                root=API_ROOT, resource=resource, mid=str(item[id_key]), query=query)
            detail_page(page_url)
        except Exception as exc:
            # Best-effort crawl: report the failure and continue with the next entry.
            print('detail page failed: {!r} ({!r})'.format(item, exc))


def list_page1(pid):
    """Crawl list pages ``pid`` .. ``pid + 99`` of every channel in ``CHANNELS``.

    Channels are processed one after another; for each list page, every entry
    is turned into a detail URL and handed to ``detail_page``. Failures of a
    single list page or detail page are printed and skipped.

    Args:
        pid: First page number (the ``pn`` query parameter) to crawl.
    """
    for channel, resource, id_key, query in CHANNELS:
        for page in range(pid, pid + PAGES_PER_TASK):
            print('{} page {}'.format(channel, page))
            _crawl_channel_page(channel, resource, id_key, query, page)


def _report_worker_error(exc):
    """Print an exception that escaped a pool task instead of dropping it silently."""
    print('worker task failed: {!r}'.format(exc))


if __name__ == '__main__':
    p = Pool(10)  # pool of 10 worker processes
    for i in range(10, 70):
        p.apply_async(list_page1, args=(i * 100,),
                      error_callback=_report_worker_error)
    p.close()
    p.join()
代码就这些。如果有不懂的朋友,可以加我QQ:353061949,我会给你讲解哦!