多线程爬取豆瓣用户

多线程爬豆瓣用户

import threading
import time
import requests
from pymongo import MongoClient
import json


class myThread(threading.Thread):
    """Worker thread that crawls a list of URLs and stores the results.

    Each instance creates its own ``MongoClient()`` (no arguments) and
    writes to the ``users`` collection of the ``douban`` database.
    """

    def __init__(self, url):
        """Remember the URLs to crawl and prepare the target collection.

        Args:
            url: The list of URLs this thread passes to ``spider``.
        """
        super().__init__()
        self.url = url
        mongo = MongoClient()
        database = mongo['douban']
        self.col = database['users']

    def run(self):
        """Thread entry point, executed once ``start()`` is called."""
        spider(self.url, self.col)


def _extract_author(item):
    """Return the author dict of one gallery item, or None if it has none.

    The author is looked up at ``item['target']['author']`` first and then
    at ``item['target']['status']['author']``; only a dict carrying a
    ``name`` key is accepted, because the name becomes the document id.
    """
    target = item.get('target') if isinstance(item, dict) else None
    if not isinstance(target, dict):
        return None
    for holder in (target, target.get('status')):
        if isinstance(holder, dict):
            author = holder.get('author')
            if isinstance(author, dict) and 'name' in author:
                return author
    return None


def spider(urls, col):
    """Fetch each URL and store the author of every returned item.

    Args:
        urls: Iterable of API URLs. Each response body is parsed as JSON
            and its ``items`` list is walked.
        col: Collection-like object exposing ``insert_one`` (for example a
            pymongo ``Collection``). The author's ``name`` is used as
            ``_id``, so storing the same name twice is rejected by the
            collection and reported as a duplicate.

    Returns:
        None. Progress and errors are printed.

    A page that cannot be fetched or parsed is printed and skipped, so a
    single bad response does not end the crawl for the remaining URLs.
    """
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Origin': 'https://www.douban.com',
        'Referer': 'https://www.douban.com/gallery/topic/1348/?from=gallery_new_post',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
    }

    for url in urls:
        try:
            # Without a timeout a stalled connection would block this
            # thread forever.
            resp = requests.get(url, headers=headers, timeout=10)
            ret = json.loads(resp.text)
        except (requests.RequestException, ValueError) as e:
            # ValueError covers a body that is not valid JSON.
            print(e)
            continue

        items = ret.get('items', []) if isinstance(ret, dict) else []
        for i in items:
            users = _extract_author(i)
            if users is None:
                print('no author found in item, skipped')
                continue

            # Use the name as primary key so each user is stored once.
            users['_id'] = users['name']
            try:
                # insert_one replaces Collection.insert, which was
                # deprecated in PyMongo 3 and removed in PyMongo 4.
                col.insert_one(users)
            except Exception as e:
                # Best effort: the usual cause is a duplicate _id, so keep
                # going with the next item.
                print(e)
                print('重复')
            else:
                print("======>>>>>{}".format(users['name']))
                print('插入成功')


if __name__ == '__main__':
    # Build one URL per result page; the formatted value is the `start`
    # paging offset (20, 40, ... 3980), 199 URLs in total.
    url_template = 'https://m.douban.com/rexxar/api/v2/gallery/topic/84292/items?sort=hot&start={}&count=20&status_full_text=1&guest_only=0&ck=yzY7'
    urls = [url_template.format(start) for start in range(20, 4000, 20)]

    # Create the worker threads, splitting the list at its midpoint.
    # Slicing at index 2000 (an offset value, not a list position) gave the
    # first thread every URL and the second thread an empty list.
    mid = len(urls) // 2
    thread1 = myThread(urls[:mid])
    thread2 = myThread(urls[mid:])

    # Start the threads
    thread1.start()
    thread2.start()

你可能感兴趣的:(python爬虫)