单线程串行(不加改造的程序)>>>多线程并发(threading)>>>多CPU并行(multiprocessing)>>>多机器并行(hadoop/hive/spark)
多线程:threading,利用CPU和IO可以同时执行的原理,让CPU不会干巴巴等待IO完成
多进程:multiprocessing,利用多核CPU的能力,真正的并行执行任务
异步IO:asyncio,在单线程利用CPU和IO同时执行的原理,实现函数异步执行
使用Lock对资源加锁,防止冲突访问(多线程和多进程同时访问一个文件会冲突,如果锁起来就可以有序访问)
使用Queue实现不同线程/进程之间的数据通信,实现生产者-消费者模式
使用线程池Pool/进程池Pool,简化线程/进程的任务提交、等待结束、获取结果
使用subprocess启动外部程序的进程,并进行输入输出交互
CPU密集型也叫计算密集型,是指I/O在很短的时间就可以完成,CPU需要大量的计算和处理,特点是CPU占用率相当高。
例如:解压缩、加密解密、正则表达式搜索
IO密集型指的是系统运作的大部分时间里CPU都在等待I/O(硬盘/内存)的读/写操作,此时CPU占用率较低。
例如:文件处理程序、网络爬虫程序、读写数据库程序
多线程Thread、多进程Process、多协程Coroutine
关系:一个进程中,可以启动N个线程;一个线程中,可以启动N个协程
全局解释器锁(Global Interpreter Lock,缩写GIL)是计算机程序设计语言解释器用于同步线程的一种机制,它使得任何时刻仅有一个线程在执行。即便在多核心处理器上,使用GIL的解释器也只允许同一时间执行一个线程。
1. 准备一个函数
def my_func(a, b):
    """Thread target used in the walkthrough: pass both arguments to ``dp_craw``.

    NOTE(review): ``dp_craw`` is not defined anywhere in these notes; it is
    presumably a placeholder for the real work function -- confirm.
    """
    dp_craw(a, b)
2. 怎样创建一个线程
import threading

# target is the callable the thread runs; args is the tuple of positional
# arguments passed to it, so the thread will execute my_func(100, 200).
t = threading.Thread(target=my_func, args=(100, 200))
3. 启动线程
t.start()  # launch the thread; returns immediately while the target runs
4. 等待结束
t.join()  # block the calling thread until t has finished
具体案例
import requests
# The 50 cnblogs listing pages to crawl: fragment "#p1" through "#p50".
urls = [f"https://www.cnblogs.com/#p{page}" for page in range(1, 51)]
def craw(url):
    """Fetch ``url`` with ``requests`` and print the URL with the body length."""
    response = requests.get(url)
    print(url, len(response.text))
import blog_spider
import threading
def single_thread():
    """Crawl every URL in ``blog_spider.urls`` one after another on this thread."""
    print("single_thread begin")
    for page_url in blog_spider.urls:
        blog_spider.craw(page_url)
    print("single_thread end")
def multi_thread():
    """Crawl every URL in ``blog_spider.urls`` using one thread per URL."""
    print("multi_thread begin")
    # args must be a tuple: the trailing comma makes (page_url,) a one-element
    # tuple; without it the parentheses would just wrap a plain string.
    workers = [
        threading.Thread(target=blog_spider.craw, args=(page_url,))
        for page_url in blog_spider.urls
    ]
    # Start all threads first so the downloads overlap ...
    for worker in workers:
        worker.start()
    # ... then wait for every one of them before reporting the end.
    for worker in workers:
        worker.join()
    print("multi_thread end")
复杂的事情一般都不会一下子做完,而是会分很多中间步骤一步步完成
import requests
from bs4 import BeautifulSoup
# The 50 cnblogs listing pages to crawl: fragment "#p1" through "#p50".
urls = [f"https://www.cnblogs.com/#p{page}" for page in range(1, 51)]
def craw(url):
    """Fetch ``url`` with ``requests`` and return the response body as text."""
    response = requests.get(url)
    return response.text
def parse(html):
    """Return the ``href`` of every ``<a class="post-item-title">`` in ``html``."""
    document = BeautifulSoup(html, "html.parser")
    # Post titles are the anchors carrying class="post-item-title".
    title_anchors = document.find_all("a", class_="post-item-title")
    return [anchor["href"] for anchor in title_anchors]
import queue
import blog_spider
import time
import random
import threading
def do_craw(url_queue: queue.Queue, html_queue: queue.Queue):
    """Producer loop: take URLs from ``url_queue``, download them, queue the HTML.

    Runs forever; each iteration blocks on ``url_queue.get()`` until a URL is
    available, puts the downloaded page on ``html_queue``, logs progress and
    then sleeps 1-2 seconds.

    Args:
        url_queue: queue of page URLs still to be downloaded.
        html_queue: queue that receives the downloaded HTML for the parsers.
    """
    while True:
        # The dequeued item must be bound to ``url``: it is what gets crawled
        # and logged below. Binding it to ``html`` would leave ``url`` to
        # resolve to an unrelated global (or raise NameError).
        url = url_queue.get()
        html = blog_spider.craw(url)
        html_queue.put(html)
        print(threading.current_thread().name, f"craw{url}",
              "url_queue.size=", url_queue.qsize())
        time.sleep(random.randint(1, 2))
def do_parse(html_queue: queue.Queue, fout):
    """Consumer loop: take HTML from ``html_queue``, parse it, write the results.

    Runs forever; each iteration blocks on ``html_queue.get()``, writes one
    line per parsed result to ``fout``, logs progress and sleeps 1-2 seconds.

    Args:
        html_queue: queue of downloaded HTML pages waiting to be parsed.
        fout: writable text file object that receives one result per line.
    """
    while True:
        page_html = html_queue.get()
        parsed = blog_spider.parse(page_html)
        for item in parsed:
            fout.write(str(item) + "\n")
        print(
            threading.current_thread().name,
            "results.size",
            len(parsed),
            "html_queue.size=",
            html_queue.qsize(),
        )
        pause_seconds = random.randint(1, 2)
        time.sleep(pause_seconds)
if __name__ == "__main__":
    url_queue = queue.Queue()
    html_queue = queue.Queue()
    # Load every page URL into the work queue before any worker starts.
    for url in blog_spider.urls:
        url_queue.put(url)

    # Three producer threads: url_queue -> download -> html_queue.
    for idx in range(3):
        t = threading.Thread(
            target=do_craw,
            name=f"craw{idx}",
            args=(url_queue, html_queue),
        )
        t.start()

    # Two consumer threads share one output file: html_queue -> parse -> file.
    # The file is not closed here; the worker loops never return.
    fout = open("o2.data.txt", "w")
    for idx in range(2):
        t = threading.Thread(
            target=do_parse,
            name=f"parse{idx}",
            args=(html_queue, fout),
        )
        t.start()
def draw(account, amount):
    """Deduct ``amount`` from ``account.balance`` when the balance covers it.

    The balance check and the deduction are two separate steps and nothing
    here holds a lock between them.
    """
    if account.balance < amount:
        return
    account.balance -= amount
为解决上述问题,需要对代码进行加锁,有以下两种方法:
(先获取锁,然后执行代码)
import threading
lock = threading.Lock()
lock.acquire()
try:
    # do something (critical section goes here)
    pass  # placeholder: a suite containing only a comment is a syntax error
finally:
    # Always release, even if the critical section raises.
    lock.release()
import threading
lock = threading.Lock()
# The with-statement acquires the lock on entry and releases it on exit,
# including when the body raises.
with lock:
    # do something (critical section goes here)
    pass  # placeholder: a suite containing only a comment is a syntax error
import threading
import time
# CHANGED (1/2): one module-level lock shared by every thread that calls draw().
lock = threading.Lock()


class Account:
    """Minimal bank account holding only a balance."""

    def __init__(self, balance):
        self.balance = balance


def draw(account, amount):
    """Withdraw ``amount`` from ``account`` if the balance covers it.

    The balance check and the deduction run while holding ``lock``, so two
    threads cannot both pass the check before either one deducts.

    Args:
        account: object with a mutable ``balance`` attribute.
        amount: amount to withdraw.
    """
    # CHANGED (2/2): the whole check-then-deduct sequence is the critical section.
    with lock:
        if account.balance >= amount:
            # The pause between check and deduction is where an unlocked
            # version would let a second thread slip through.
            time.sleep(0.1)
            print(threading.current_thread().name, "取钱成功")
            account.balance -= amount
            print(threading.current_thread().name,
                  "余额", account.balance)
        else:
            print(threading.current_thread().name,
                  "取钱失败,余额不足")
if __name__ == "__main__":
    # Two threads each try to withdraw 800 from the same 1000 balance.
    account = Account(1000)
    ta = threading.Thread(target=draw, name="ta", args=(account, 800))
    tb = threading.Thread(target=draw, name="tb", args=(account, 800))
    for withdrawal in (ta, tb):
        withdrawal.start()
需要对上面代码进行简要修改,修改部分标粗(即新增的 lock = threading.Lock() 和 with lock: 两处)
新建线程系统需要分配资源、终止线程系统需要回收资源;如果可以重用线程,则可以省去新建/终止的开销。这样就引出了线程池。
两种用法:
from concurrent.futures import ThreadPoolExecutor, as_completed
with ThreadPoolExecutor() as pool:
    # map() takes the function and an iterable of arguments; results are
    # yielded in the same order as the inputs.
    results = pool.map(craw, urls)
    for page_result in results:
        print(page_result)
遇到多个参数的函数需要线程池支持时,依次并列往后就行,不用生成元组:
with ThreadPoolExecutor() as pool:
    # Single-argument form would be: pool.map(run_noerror, arg)
    # For a multi-argument function, pass one iterable per parameter, in
    # order; no need to zip them into tuples first.
    results = pool.map(
        run_noerror,
        titles,
        start_list,
        end_list,
    )
from concurrent.futures import ThreadPoolExecutor, as_completed
with ThreadPoolExecutor() as pool:
    # submit() schedules one call and returns its Future immediately.
    futures = [pool.submit(craw, url) for url in urls]

    # Option 1: walk the list -- results come back in the order of urls.
    for future in futures:
        print(future.result())

    # Option 2: as_completed() yields each Future as soon as it finishes,
    # so the order is not tied to urls.
    for future in as_completed(futures):
        print(future.result())
创建python文件04.thread_pool.py如下:
import concurrent.futures
import blog_spider
# craw, approach 1: pool.map() returns results in the order of the input URLs.
with concurrent.futures.ThreadPoolExecutor() as pool:
    htmls = pool.map(blog_spider.craw, blog_spider.urls)
    # Pair each URL with its HTML and materialise the pairs as a list so they
    # can be iterated again after this pool has shut down.
    htmls = list(zip(blog_spider.urls, htmls))
    for url, html in htmls:
        print(url, len(html))
print("craw over")
# parse, approach 2: submit() one task per page and keep a Future -> URL map.
with concurrent.futures.ThreadPoolExecutor() as pool:
    futures = {}
    for url, html in htmls:
        future = pool.submit(blog_spider.parse, html)
        futures[future] = url

    # Ordered: dict iteration follows submission order.
    for future, url in futures.items():
        print(url, future.result())

    # Unordered: as_completed() yields each Future as soon as it finishes.
    for future in concurrent.futures.as_completed(futures):
        url = futures[future]
        print(url, future.result())
Web后台服务的特点:
使用线程池ThreadPoolExecutor的好处:
创建05.flask_thread_pool.py的python文件
import flask
import json
import time
from concurrent.futures import ThreadPoolExecutor
# Flask application object; the route below is registered on it.
app = flask.Flask(__name__)
# One module-level thread pool that the request handler submits its reads to.
pool = ThreadPoolExecutor()
def read_file():
    """Stand-in for a file read: wait 0.3 s, then return a fixed string."""
    simulated_latency = 0.3
    time.sleep(simulated_latency)
    return "file result"
def read_db():
    """Stand-in for a database query: wait 0.1 s, then return a fixed string."""
    simulated_latency = 0.1
    time.sleep(simulated_latency)
    return "db result"
def read_api():
    """Stand-in for a remote API call: wait 0.2 s, then return a fixed string.

    Returns "api result", following the "<source> result" pattern of
    read_file() and read_db().
    """
    time.sleep(0.2)
    return "api result"
@app.route("/")
def index():
    """Serve "/": run the three simulated reads concurrently and return JSON.

    All three tasks are submitted to the shared pool before any result is
    awaited, so their waits overlap instead of adding up.

    Returns:
        A JSON string with keys "result_file", "result_db" and "result_api".
    """
    result_file = pool.submit(read_file)
    result_db = pool.submit(read_db)
    result_api = pool.submit(read_api)
    # .result() blocks until the corresponding task has finished.
    return json.dumps({
        "result_file": result_file.result(),
        "result_db": result_db.result(),
        "result_api": result_api.result(),
    })
# Start serving only when this file is executed directly, not when imported.
if __name__ == "__main__":
    app.run()
multiprocessing模块就是python为了解决GIL缺陷引入的一个模块,原理是用多进程在多CPU上并行执行。
import math
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
import time

# CPU-bound workload: the same 15-digit number checked 100 times.
PRIMES = [112272535095293] * 100
def is_prime(n):
    """Return True if ``n`` is prime, using trial division by odd numbers.

    Args:
        n: integer to test.

    Returns:
        False for n < 2 and for even n > 2; otherwise True exactly when no
        odd number from 3 up to floor(sqrt(n)) divides n.
    """
    if n < 2:
        return False
    if n == 2:
        return True
    if n % 2 == 0:
        return False
    # A composite n has a divisor no larger than its square root.
    sqrt_n = int(math.floor(math.sqrt(n)))
    for i in range(3, sqrt_n + 1, 2):
        if n % i == 0:
            return False
    return True
# Single-threaded baseline
def single_thread():
    """Check every number in ``PRIMES`` sequentially on the calling thread."""
    for candidate in PRIMES:
        is_prime(candidate)
def multi_thread():
    """Check every number in ``PRIMES`` using a thread pool.

    The executor must be instantiated (``ThreadPoolExecutor()``); the class
    object itself is not a context manager.
    """
    with ThreadPoolExecutor() as pool:
        pool.map(is_prime, PRIMES)
def multi_process():
    """Check every number in ``PRIMES`` using a process pool."""
    with ProcessPoolExecutor() as pool:
        pool.map(is_prime, PRIMES)
if __name__ == "__main__":
    # Time the same workload under each execution strategy and print the
    # elapsed wall-clock seconds for each.
    for label, runner in (
        ("single_thread", single_thread),
        ("multi_thread", multi_thread),
        ("multi_process", multi_process),
    ):
        start = time.time()
        runner()
        end = time.time()
        print(f"{label},cost:", end - start, "seconds")
创建python文件07.process_pool.py如下:
import flask
from concurrent.futures import ProcessPoolExecutor
import json
import math
app = flask.Flask(__name__)  # Flask application object the route below registers on
def is_prime(n):
    """Return True if ``n`` is prime, using trial division by odd numbers.

    Args:
        n: integer to test.

    Returns:
        False for n < 2 and for even n > 2; otherwise True exactly when no
        odd number from 3 up to floor(sqrt(n)) divides n.
    """
    if n < 2:
        return False
    if n == 2:
        return True
    if n % 2 == 0:
        return False
    # A composite n has a divisor no larger than its square root.
    sqrt_n = int(math.floor(math.sqrt(n)))
    for i in range(3, sqrt_n + 1, 2):
        if n % i == 0:
            return False
    return True
@app.route("/is_prime/<numbers>")
def api_is_prime(numbers):
    """Report, as JSON, whether each number in the URL path is prime.

    Args:
        numbers: comma-separated integers captured from the URL path by the
            ``<numbers>`` route variable, e.g. ``/is_prime/1,2,3``.

    Returns:
        A JSON object string mapping each number to true/false.
    """
    number_list = [int(x) for x in numbers.split(",")]
    # Pass the function itself (not a call) so the pool applies it to each number.
    results = process_pool.map(is_prime, number_list)
    return json.dumps(dict(zip(number_list, results)))
if __name__ == "__main__":
    # The process pool is created under the main guard, before the app
    # starts serving; api_is_prime reads it as a module-level global.
    process_pool = ProcessPoolExecutor()
    app.run()
import asyncio
# Get the event loop.
loop = asyncio.get_event_loop()


# Define the coroutine.
async def myfunc(url):
    await get_url(url)


# Create one task per URL.
tasks = [
    loop.create_task(myfunc(url))
    for url in urls
]

# Run the loop until every crawl task has completed.
loop.run_until_complete(asyncio.wait(tasks))
import asyncio
import aiohttp
import blog_spider
async def async_craw(url):
    """Fetch ``url`` with aiohttp and print the URL and the body length."""
    # One statement opens the client session and then the GET response;
    # both are closed in reverse order on exit.
    async with aiohttp.ClientSession() as session, session.get(url) as resp:
        body = await resp.text()
        print(f"craw url: {url},{len(body)}")
loop = asyncio.get_event_loop()
# One task per URL, all scheduled on the same loop.
tasks = [
    loop.create_task(async_craw(url))
    for url in blog_spider.urls
]

import time

start = time.time()
loop.run_until_complete(asyncio.wait(tasks))
end = time.time()
# Elapsed wall-clock seconds: end - start, so the printed value is positive.
print(end - start)
注意:
import asyncio
import aiohttp
import blog_spider
semaphore = asyncio.Semaphore(10)  # the semaphore caps concurrency: at most 10 holders at a time
async def async_craw(url):
    """Fetch ``url`` with aiohttp while holding the module-level semaphore.

    The semaphore is held for the whole request plus a 5-second pause.
    """
    async with semaphore:
        print("craw url:", url)
        # One statement opens the client session and then the GET response.
        async with aiohttp.ClientSession() as session, session.get(url) as resp:
            body = await resp.text()
            # Pause while still holding the semaphore.
            await asyncio.sleep(5)
            print(f"craw url: {url},{len(body)}")
loop = asyncio.get_event_loop()
# One task per URL, all scheduled on the same loop.
tasks = [
    loop.create_task(async_craw(url))
    for url in blog_spider.urls
]

import time

start = time.time()
loop.run_until_complete(asyncio.wait(tasks))
end = time.time()
# Elapsed wall-clock seconds: end - start, so the printed value is positive.
print(end - start)
视频来源:https://www.bilibili.com/video/BV1bK411A7tV?p=1&vd_source=63b8ded929e53ceb23c48c6ca09fa194