Link: bilibili
Ways to speed up a program:
How to implement them:
Two main reasons Python is slow

Compared with C/C++ and Java, Python really is slow; in some specific scenarios the gap can reach 100-200x.
- Reason 1: Python is dynamically typed and interpreted as it executes;
- Reason 2: the GIL keeps it from using multiple CPU cores concurrently.

What is the GIL

The Global Interpreter Lock (GIL) is a mechanism that a language interpreter uses to synchronize threads: it allows only one thread to execute at any given moment. Even on a multi-core processor, an interpreter with a GIL runs only one thread at a time.
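A minimal sketch to make the GIL visible (the function name and workload size are illustrative): a CPU-bound countdown gains nothing from a second thread, because only one thread can execute bytecode at any moment.

import threading
import time

def countdown(n):
    while n > 0:
        n -= 1

start = time.time()
countdown(50_000_000)
countdown(50_000_000)
print("serial:", time.time() - start, "sec")

start = time.time()
t1 = threading.Thread(target=countdown, args=(50_000_000,))
t2 = threading.Thread(target=countdown, args=(50_000_000,))
t1.start()
t2.start()
t1.join()
t2.join()
# roughly the same as the serial run (often slightly worse), because the
# GIL lets only one thread run bytecode at a time
print("two threads:", time.time() - start, "sec")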
# 1. Prepare a function (do_craw stands in for any callable)
def my_func(a, b):
    do_craw(a, b)

# 2. Create a thread
import threading
t = threading.Thread(target=my_func, args=(100, 200))

# 3. Start the thread
t.start()

# 4. Wait for it to finish
t.join()
# blog_spider.py
import requests
from bs4 import BeautifulSoup

# List of cnblogs page URLs
urls = [
    f"https://www.cnblogs.com/#p{page}"
    for page in range(1, 50 + 1)
]

# Fetch a page, print url + content length, return the page content
def craw(url):
    r = requests.get(url)
    print(url, len(r.text))
    return r.text

# Parser used by the producer-consumer example below
def parse(html):
    soup = BeautifulSoup(html, "html.parser")
    links = soup.find_all("a", class_="post-item-title")
    return [(link["href"], link.get_text()) for link in links]

if __name__ == "__main__":
    for result in parse(craw(urls[2])):
        print(result)
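One detail worth knowing: the "#p{page}" part of these URLs is a fragment, and HTTP clients do not send fragments to the server, so all 50 requests actually fetch the same front page. For this tutorial that is fine; the list simply provides 50 uniform IO-bound requests.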
# 01.multi_thread_craw.py
import blog_spider
import threading
import time

# Single-threaded crawler
def single_thread():
    print("single_thread begin")
    for url in blog_spider.urls:
        blog_spider.craw(url)
    print("single_thread end")

# Multi-threaded crawler
def multi_thread():
    print("multi_thread begin")
    threads = []
    for url in blog_spider.urls:
        threads.append(
            threading.Thread(target=blog_spider.craw, args=(url,))
            # no parentheses after craw: pass the function, don't call it
        )
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()
    print("multi_thread end")

if __name__ == "__main__":
    start = time.time()
    single_thread()
    end = time.time()
    print("single thread cost:", end - start, "sec")

    start = time.time()
    multi_thread()
    end = time.time()
    print("multi thread cost:", end - start, "sec")
single_thread begin
single_thread end
single thread cost: 7.926880836486816 sec
multi_thread begin
multi_thread end
multi thread cost: 0.5494334697723389 sec
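The crawl is IO-bound: a thread that is blocked waiting on the network releases the GIL, so the 50 threads overlap their waits and the total time drops by roughly an order of magnitude. The GIL only serializes the execution of Python bytecode, not blocking IO.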
import queue
import blog_spider
import time
import random
import threading

# Producer: take a url from url_queue, crawl it, put the html into html_queue
def do_craw(url_queue: queue.Queue, html_queue: queue.Queue):
    while True:
        url = url_queue.get()
        html = blog_spider.craw(url)
        html_queue.put(html)
        print(threading.current_thread().name, f"craw {url}",
              "url_queue.size=", url_queue.qsize())
        time.sleep(random.randint(1, 2))

# Consumer: take html from html_queue, parse it, write the results to a file
def do_parse(html_queue: queue.Queue, fout):
    while True:
        html = html_queue.get()
        results = blog_spider.parse(html)
        for result in results:
            fout.write(str(result) + "\n")
        print(threading.current_thread().name, "results.size",
              len(results), "html_queue.size=", html_queue.qsize())
        time.sleep(random.randint(1, 2))

if __name__ == "__main__":
    url_queue = queue.Queue()
    html_queue = queue.Queue()
    for url in blog_spider.urls:
        url_queue.put(url)

    # 3 crawler (producer) threads
    for idx in range(3):
        t = threading.Thread(target=do_craw, args=(url_queue, html_queue),
                             name=f"craw{idx}")
        t.start()

    # 2 parser (consumer) threads
    fout = open("02.data.txt", "w", encoding="utf-8")
    for idx in range(2):
        t = threading.Thread(target=do_parse, args=(html_queue, fout),
                             name=f"parse{idx}")
        t.start()
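As written, both worker loops spin forever, so the program never exits on its own and fout is never closed. A common fix is to shut workers down with sentinel values; a sketch (SENTINEL and the _v2 names are illustrative, not part of the original code):

SENTINEL = None

def do_craw_v2(url_queue, html_queue):
    while True:
        url = url_queue.get()
        if url is SENTINEL:
            html_queue.put(SENTINEL)  # forward the stop signal to a parser
            break
        html_queue.put(blog_spider.craw(url))

def do_parse_v2(html_queue, fout):
    while True:
        html = html_queue.get()
        if html is SENTINEL:
            break
        for result in blog_spider.parse(html):
            fout.write(str(result) + "\n")

# in __main__, after the real urls: one sentinel per crawler thread
# for _ in range(3):
#     url_queue.put(SENTINEL)

With 3 crawlers forwarding 3 sentinels and only 2 parsers, every thread still sees a sentinel and exits. Alternatively, the worker threads can simply be created with daemon=True so they die when the main thread does.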
# Lock_concurrent.py
import threading
import time

lock = threading.Lock()

class Account:  # a bank account with a balance
    def __init__(self, balance):
        self.balance = balance

# Withdraw amount from account; the lock makes check-then-deduct atomic
def draw(account, amount):
    with lock:
        if account.balance >= amount:
            time.sleep(0.1)  # force a thread switch to expose the race
            print(threading.current_thread().name, "successful")
            account.balance -= amount
            print(threading.current_thread().name, "balance", account.balance)
        else:
            print(threading.current_thread().name, "failed")

if __name__ == "__main__":
    account = Account(1000)
    ta = threading.Thread(name="ta", target=draw, args=(account, 800))
    tb = threading.Thread(name="tb", target=draw, args=(account, 800))
    ta.start()
    tb.start()
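The time.sleep(0.1) sits between the balance check and the deduction on purpose: it forces a thread switch at the worst possible moment. Without the lock, both threads can pass the balance >= amount check before either subtracts, and the balance ends at -600. The unsafe version, for contrast (draw_unsafe is an illustrative name):

def draw_unsafe(account, amount):
    if account.balance >= amount:
        time.sleep(0.1)            # both threads reach this point first...
        account.balance -= amount  # ...then both deduct: balance goes negative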
# thread_pool.py
import concurrent.futures
import blog_spider

# craw
with concurrent.futures.ThreadPoolExecutor() as pool:
    htmls = pool.map(blog_spider.craw, blog_spider.urls)
    htmls = list(zip(blog_spider.urls, htmls))
    for url, html in htmls:
        print(url, len(html))
    # futures = [pool.submit(blog_spider.craw, url) for url in blog_spider.urls]
    # for future in futures:
    #     print(future.result())
    # for future in concurrent.futures.as_completed(futures):
    #     print(future.result())
print("craw over")

# parse
with concurrent.futures.ThreadPoolExecutor() as pool:
    futures = {}
    for url, html in htmls:
        future = pool.submit(blog_spider.parse, html)
        futures[future] = url

    for future, url in futures.items():
        print(url, future.result())
    # for future in concurrent.futures.as_completed(futures):
    #     url = futures[future]
    #     print(url, future.result())
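pool.map is the simpler API: it takes the callable plus an iterable and yields results in input order. pool.submit returns one Future per task; iterating the futures directly (as above) also preserves submission order, while concurrent.futures.as_completed (the commented alternative) yields each future as soon as it finishes, so fast pages can be processed without waiting for slow ones.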
# flask_thread_pool.py
import json
import flask
import time
from concurrent.futures import ThreadPoolExecutor

pool = ThreadPoolExecutor()
app = flask.Flask(__name__)

def read_file():
    time.sleep(0.1)
    print("file")
    return "file result"

def read_db():
    time.sleep(0.2)
    print("db")
    return "db result"

def read_api():
    time.sleep(0.3)
    print("api")
    return "api result"

@app.route("/")
def index():
    start = time.time()
    # concurrent version: submit all three IO tasks, then collect the results
    result_file = pool.submit(read_file)
    result_db = pool.submit(read_db)
    result_api = pool.submit(read_api)
    # serial version for comparison:
    # result_file = read_file()
    # result_db = read_db()
    # result_api = read_api()
    results = {
        "result_file": result_file.result(),  # .result() blocks until done
        "result_db": result_db.result(),
        "result_api": result_api.result(),
        # with the serial version these would be used directly, without .result()
    }
    results["time"] = time.time() - start  # measured after all results arrived
    return json.dumps(results)

if __name__ == "__main__":
    app.run()
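With the pool, the three reads run concurrently, so the reported time is roughly that of the slowest task (about 0.3 s); the serial version would take about 0.6 s. Assuming Flask's default port, a quick check (the response shown is approximate):

# curl http://127.0.0.1:5000/
# {"result_file": "file result", "result_db": "db result", "result_api": "api result", "time": 0.3...}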
Results of the CPU-bound benchmark below:
single_thread, cost: 36.88734722137451 sec
multi_thread, cost: 37.69435501098633 sec
multi_process, cost: 6.157953977584839 sec
# Thread_process_cpu_bound.py
import math
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
import time

# A list of 100 large primes to test against
PRIMES = [112272535095293] * 100

# Trial division: check whether n is prime
def is_prime(n):
    if n < 2:
        return False
    if n == 2:
        return True
    if n % 2 == 0:
        return False
    sqrt_n = int(math.floor(math.sqrt(n)))
    for i in range(3, sqrt_n + 1, 2):
        if n % i == 0:
            return False
    return True

def single_thread():
    for number in PRIMES:
        is_prime(number)

def multi_thread():
    with ThreadPoolExecutor() as pool:
        pool.map(is_prime, PRIMES)

def multi_process():
    with ProcessPoolExecutor() as pool:
        pool.map(is_prime, PRIMES)

if __name__ == "__main__":
    start = time.time()
    single_thread()
    end = time.time()
    print("single_thread, cost:", end - start, "sec")

    start = time.time()
    multi_thread()
    end = time.time()
    print("multi_thread, cost:", end - start, "sec")

    start = time.time()
    multi_process()
    end = time.time()
    print("multi_process, cost:", end - start, "sec")
import flask
import math
import json
from concurrent.futures import ProcessPoolExecutor

app = flask.Flask(__name__)

def is_prime(n):
    if n < 2:
        return False
    if n == 2:
        return True
    if n % 2 == 0:
        return False
    sqrt_n = int(math.floor(math.sqrt(n)))
    for i in range(3, sqrt_n + 1, 2):
        if n % i == 0:
            return False
    return True

# API endpoint: accepts a comma-separated list of numbers in the URL
@app.route("/is_prime/<numbers>")
def api_is_prime(numbers):
    number_list = [int(x) for x in numbers.split(",")]
    results = process_pool.map(is_prime, number_list)
    return json.dumps(dict(zip(number_list, results)))

if __name__ == "__main__":
    # the process pool must be created after is_prime is defined, because the
    # worker processes re-import this module; so it is built here, not at import time
    process_pool = ProcessPoolExecutor()
    app.run()
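Assuming Flask's default port, the endpoint can be exercised like this (the numbers are arbitrary; the keys come out as strings because json.dumps stringifies dict keys):

# curl http://127.0.0.1:5000/is_prime/1,2,3,33
# {"1": false, "2": true, "3": true, "33": false}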
import asyncio
import aiohttp
import blog_spider
import time

async def async_craw(url):
    print("craw url:", url)
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            result = await resp.text()
            print(f"craw url:{url}, {len(result)}")

loop = asyncio.get_event_loop()
tasks = [loop.create_task(async_craw(url))
         for url in blog_spider.urls]

start = time.time()
loop.run_until_complete(asyncio.wait(tasks))
end = time.time()
print("spend time:", end - start, "sec")
import asyncio
import aiohttp
import blog_spider
import time

# allow at most 1 crawl in flight at a time
semaphore = asyncio.Semaphore(1)

async def async_craw(url):
    async with semaphore:
        print("craw url:", url)
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as resp:
                result = await resp.text()
                print(f"craw url:{url}, {len(result)}")

loop = asyncio.get_event_loop()
tasks = [loop.create_task(async_craw(url))
         for url in blog_spider.urls]

start = time.time()
loop.run_until_complete(asyncio.wait(tasks))
end = time.time()
print("spend time:", end - start, "sec")