应用多线程抓取小米应用商店的例子

多线程需要用到threading模块,
通过这种方法可以创建线程:

from threading import Thread
t_list = []
# 创建多个线程,括号里面3表示3个线程
for i in range(3):
t = Thread(target=func)  # target 传入要执行的函数对象(不加括号), 而不是字符串
t_list.append(t)
t.start()

程序结束之后不要忘了回收线程

for t in t_list:
t.join()

多线程需要考虑两个问题,
第一个:对目标网页的访问不能有重复的
第二个:抓取出来的数据在保存文件或数据库时,必须保证不能有数据丢失
先来解决第一个问题: 对url来讲, 可以用队列来保证每次不同线程获取到的都是不同的url
导入模块(代码都是小米应用商店的抓取为例)

from queue import Queue
# 创建队列对象
q = Queue()
# URL入队列
q.put(url)
# 获取队列里的url
q.get()

通过队列, 现在保证了每次抓取到的 URL 都不会重复, 也保证了所有的 URL 都能被处理到。然后是数据存入本地文件或者数据库的时候, 不能有数据丢失: 必须保证每次进行写入操作时, 只有一个线程在执行写入。这里使用 threading 模块的 Lock, 给写入操作加上一把锁——只有获取到锁的线程才能写入, 线程操作完之后再释放锁。

from threading import Lock
lock = Lock()
# 上锁
lock.acquire()
# 释放锁
lock.release()

下面是完整的代码

import math
import random
import time
from queue import Empty, Queue
from threading import Lock, Thread

import pymysql
import requests
from fake_useragent import UserAgent
from lxml import etree


class XiaoMiSpider(object):
    """Multi-threaded scraper for the Xiaomi app store (app.mi.com).

    Category-page URLs are dispatched through a thread-safe ``Queue`` so no
    page is fetched twice, and a ``Lock`` serializes MySQL writes so batches
    from different worker threads never interleave.
    """

    def __init__(self):
        self.base_url = 'http://app.mi.com/'
        self.type_url = 'http://app.mi.com/categotyAllListApi?page={}&categoryId={}&pageSize=30'
        self.q = Queue()  # work queue of category-page URLs
        self.ua = UserAgent()
        self.i = 0  # running total of scraped app entries
        self.app_type_list_info = []  # list of (category_id, app_count) pairs
        # pymysql >= 1.0 no longer accepts positional connection arguments,
        # so pass them by keyword.
        self.conn = pymysql.connect(host='127.0.0.1', user='用户名',
                                    password='密码', database='xm_db',
                                    charset='utf8mb4')
        self.cursor = self.conn.cursor()
        self.lock = Lock()  # serializes DB writes across worker threads

    def get_type_info(self):
        """Scrape the sidebar for every category id, record its app count,
        then enqueue all category-page URLs."""
        headers = {'User-Agent': self.ua.random}
        html = requests.get(self.base_url, headers=headers).text
        parse_html = etree.HTML(html)
        base_info = parse_html.xpath('//div[@class="sidebar"]/div[2]/ul/li')
        for info in base_info:
            # href looks like ".../category/<id>" — the last path segment is the id
            category_id = info.xpath('./a/@href')[0].split('/')[-1]
            app_count = self.get_app_num(category_id)
            self.app_type_list_info.append((category_id, app_count))
        print('app类型列表抓取完毕!')
        self.url_in()

    def get_app_num(self, category_id):
        """Return the total number of apps in *category_id* (from page 0's
        JSON ``count`` field)."""
        url = self.type_url.format(0, category_id)
        headers = {'User-Agent': self.ua.random}
        html = requests.get(url, headers=headers).json()
        return html['count']

    def url_in(self):
        """Enqueue every category-page URL exactly once."""
        for category_id, app_count in self.app_type_list_info:
            # ceil(count / 30) pages of 30 apps each. The original applied
            # math.ceil to a floor division (`// 30`), which is a no-op on an
            # int and silently dropped the final partial page of every category.
            for page in range(math.ceil(int(app_count) / 30)):
                self.q.put(self.type_url.format(page, category_id))

    def get_data(self):
        """Thread worker: drain the URL queue, fetching and parsing each page."""
        while True:
            # get_nowait()/Empty avoids the check-then-act race of
            # `if not q.empty(): q.get()` — with several workers another
            # thread can drain the queue between the check and the get,
            # leaving this thread blocked forever.
            try:
                url = self.q.get_nowait()
            except Empty:
                break
            headers = {'User-Agent': self.ua.random}
            html = requests.get(url, headers=headers).json()
            self.parse_html(html)

    def parse_html(self, html):
        """Extract (type, name, link) for each app in one JSON page and
        hand the batch to :meth:`save_to_file`."""
        app_list = []
        time.sleep(2)  # throttle before parsing the next page
        for info in html['data']:
            app_name = info['displayName']
            app_link = 'http://app.mi.com/details?id=' + info['packageName']
            app_type = info['level1CategoryName']
            app_list.append([app_type, app_name, app_link])
            self.i += 1
            print(f'{app_name}抓取完毕')
        self.save_to_file(app_list)
        time.sleep(random.randint(1, 3))  # random delay to look less bot-like

    def save_to_file(self, app_list):
        """Insert one batch of rows; the lock ensures a single writer at a time."""
        ins = 'insert into app_info (app_type, app_name, app_link) values (%s,%s,%s)'
        # `with` releases the lock even if executemany raises — the original
        # acquire()/release() pair leaked the lock on exception, deadlocking
        # every other worker.
        with self.lock:
            try:
                self.cursor.executemany(ins, app_list)
                # Commit per batch so a crash mid-run loses at most one batch
                # instead of every row scraped so far.
                self.conn.commit()
            except Exception as e:
                print('error', e)

    def main(self):
        """Build the URL queue, fan out 3 worker threads, then join them
        and close the DB connection."""
        self.get_type_info()
        t_list = []
        for _ in range(3):  # 3 worker threads
            t = Thread(target=self.get_data)
            t_list.append(t)
            t.start()

        # Reclaim all worker threads before reporting and closing the DB.
        for t in t_list:
            t.join()
        print(f'共{self.i}条数据')
        self.conn.commit()
        self.cursor.close()
        self.conn.close()


if __name__ == '__main__':
    # Time the complete scrape run end to end.
    begin = time.time()
    XiaoMiSpider().main()
    finish = time.time()
    print(f'执行时间{finish - begin}s')
    

你可能感兴趣的:(应用多线程抓取小米应用商店的例子)