This section covers multiprocessing and multithreading in Python.
Some references:
Python threads and processes
Python asyncio
Asynchronous programming in Python
Process
Thread
Differences between processes, threads, and coroutines (see the comparison sketch below)
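In short, a process has its own memory space and is scheduled by the operating system, threads share their parent process's memory, and coroutines all run inside a single thread and switch cooperatively at await points. As a quick orientation before the examples below, here is a minimal sketch (my own, not from the original notes) that runs the same toy task with a thread pool, a process pool, and asyncio; the worker counts and the 0.5-second sleep are arbitrary placeholders for real I/O.

# A minimal sketch: the same toy task run three ways -- threads, processes,
# and coroutines. time.sleep stands in for blocking I/O; asyncio.sleep
# stands in for non-blocking I/O.
import time
import asyncio
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor

def blocking_task(n):
    time.sleep(0.5)           # pretend this is network or disk I/O
    return n * n

async def async_task(n):
    await asyncio.sleep(0.5)  # yields control instead of blocking the thread
    return n * n

if __name__ == '__main__':
    # Threads: share one process's memory, good for I/O-bound work.
    with ThreadPoolExecutor(max_workers=4) as pool:
        print(list(pool.map(blocking_task, range(4))))

    # Processes: separate memory spaces, good for CPU-bound work.
    with ProcessPoolExecutor(max_workers=4) as pool:
        print(list(pool.map(blocking_task, range(4))))

    # Coroutines: one thread, cooperative switching at await points.
    async def main():
        return await asyncio.gather(*(async_task(n) for n in range(4)))
    print(asyncio.run(main()))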
from multiprocessing import Pool

def f(x):
    return x * x

if __name__ == '__main__':
    p = Pool(5)
    nums = [1, 2, 3, 4, 5, 6, 7, 8, 9]   # avoid shadowing the built-in name "list"
    print(p.map(f, nums))
Example

import time
import requests
from multiprocessing import Pool

task_list = [
    'https://www.jianshu.com/p/91b702f4f24a',
    'https://www.jianshu.com/p/8e9e0b1b3a11',
    'https://www.jianshu.com/p/7ef0f606c10b',
    'https://www.jianshu.com/p/b117993f5008',
    'https://www.jianshu.com/p/583d83f1ff81',
    'https://www.jianshu.com/p/91b702f4f24a',
    'https://www.jianshu.com/p/8e9e0b1b3a11',
    'https://www.jianshu.com/p/7ef0f606c10b',
    'https://www.jianshu.com/p/b117993f5008',
    'https://www.jianshu.com/p/583d83f1ff81'
]

header = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}

def download(url):
    response = requests.get(url, headers=header, timeout=30)
    return response.status_code

if __name__ == '__main__':
    p = Pool(10)
    time_old = time.time()
    for item in p.map(download, task_list):
        print(item)
    time_new = time.time()
    time_cost = time_new - time_old
    print(time_cost)
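For comparison, a sequential baseline (an assumed addition, not part of the original example) can be timed the same way; since the requests are network-bound, the Pool version above should take roughly the time of the slowest single request rather than the sum of all ten.

# A minimal sequential baseline, assuming task_list and header are defined
# as in the Pool example above.
import time
import requests

if __name__ == '__main__':
    time_old = time.time()
    for url in task_list:
        print(requests.get(url, headers=header, timeout=30).status_code)
    print(time.time() - time_old)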
from multiprocessing import Process

def f(name):
    print('hello', name)

if __name__ == '__main__':
    # join() blocks until the process finishes, so these two run one after the other.
    p_1 = Process(target=f, args=('bob',))
    p_1.start()
    p_1.join()
    p_2 = Process(target=f, args=('alice',))
    p_2.start()
    p_2.join()
Example

import time
import requests
from multiprocessing import Process

task_list = [
    'https://www.jianshu.com/p/91b702f4f24a',
    'https://www.jianshu.com/p/8e9e0b1b3a11',
    'https://www.jianshu.com/p/7ef0f606c10b',
    'https://www.jianshu.com/p/b117993f5008',
    'https://www.jianshu.com/p/583d83f1ff81'
]

header = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}

def download(url):
    response = requests.get(url, headers=header, timeout=30)
    print(response.status_code)

if __name__ == '__main__':
    for item in task_list:
        p = Process(target=download, args=(item,))
        p.start()
        # Note: join() inside the loop waits for each process before starting
        # the next, so these downloads actually run one at a time
        # (see the variant sketch below).
        p.join()
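To let the downloads actually overlap, a variant (an assumption, not in the original notes) starts every process first and only joins them afterwards:

# A minimal sketch: start all processes first, then wait for all of them,
# so the downloads overlap instead of running sequentially.
# download and task_list refer to the example above.
if __name__ == '__main__':
    processes = [Process(target=download, args=(item,)) for item in task_list]
    for p in processes:
        p.start()
    for p in processes:
        p.join()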
Threading: worth knowing about, but not recommended here.
All threads live inside one process, so if one of them crashes it can take the whole program down with it.
import threading
import time

class myThread(threading.Thread):
    def __init__(self, threadID, name, counter):
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.name = name
        self.counter = counter

    def run(self):
        print("Starting " + self.name)
        # Acquire the lock; returns True once the lock is obtained.
        # If the optional timeout argument is omitted, this blocks until
        # the lock is acquired; otherwise it returns False on timeout.
        threadLock.acquire()
        print_time(self.name, self.counter, 3)
        # Release the lock
        threadLock.release()

def print_time(threadName, delay, counter):
    while counter:
        time.sleep(delay)
        print("%s: %s" % (threadName, time.ctime(time.time())))
        counter -= 1

threadLock = threading.Lock()
threads = []

# Create new threads
thread1 = myThread(1, "Thread-1", 1)
thread2 = myThread(2, "Thread-2", 2)

# Start the new threads
thread1.start()
thread2.start()

# Add the threads to the thread list
threads.append(thread1)
threads.append(thread2)

# Wait for all threads to finish
for t in threads:
    t.join()
print("Exiting Main Thread")
ssl and the event loop

import aiohttp
import asyncio
import ssl

async def fetch(session, url):
    # A bare ssl.SSLContext() here skips certificate verification.
    async with session.get(url, ssl=ssl.SSLContext()) as response:
        return await response.text()

async def main():
    async with aiohttp.ClientSession() as session:
        html = await fetch(session, 'http://www.baidu.com')
        print(html)

loop = asyncio.get_event_loop()
loop.run_until_complete(main())
The gather method

import asyncio

async def a(t):
    print('-->', t)
    await asyncio.sleep(0.5)  # hand control to other coroutines while sleeping
    print('<--', t)
    return t * 10

def main():
    futs = [a(t) for t in range(6)]
    print(futs)
    ret = asyncio.gather(*futs)   # bundle the coroutines into one awaitable future
    print(ret)
    loop = asyncio.get_event_loop()
    ret1 = loop.run_until_complete(ret)
    print(ret1)

main()
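The gather example drives the loop by hand with get_event_loop and run_until_complete. On Python 3.7 and later the same thing is usually written with asyncio.run and an async main; a minimal equivalent sketch (not from the original notes):

# Equivalent using asyncio.run: gather is awaited inside a coroutine,
# so no manual loop management is needed.
import asyncio

async def a(t):
    print('-->', t)
    await asyncio.sleep(0.5)
    print('<--', t)
    return t * 10

async def main():
    results = await asyncio.gather(*(a(t) for t in range(6)))
    print(results)  # [0, 10, 20, 30, 40, 50]

asyncio.run(main())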
The create_task() method

import asyncio

async def a(t):
    print('-->', t)
    await asyncio.sleep(0.5)
    print('<--', t)
    return t * 10

async def b():
    # loop = asyncio.get_event_loop()
    cnt = 0
    # Runs indefinitely, scheduling a new task every 0.1 s (stop with Ctrl+C).
    while 1:
        cnt += 1
        cor = a(cnt)                   # a coroutine object, not yet running
        resp = loop.create_task(cor)   # schedule it on the event loop as a Task
        await asyncio.sleep(0.1)
        print(resp)

loop = asyncio.get_event_loop()
loop.run_until_complete(b())
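Because b() above never awaits the tasks it creates, their return values are never collected and run_until_complete never finishes. A bounded sketch (an assumption, not part of the original notes) that keeps references to the tasks and awaits them looks like this:

# A minimal sketch: create a fixed number of tasks, then await them all
# so their results can be collected.
import asyncio

async def a(t):
    await asyncio.sleep(0.5)
    return t * 10

async def main():
    tasks = [asyncio.create_task(a(t)) for t in range(6)]  # tasks start running immediately
    results = [await task for task in tasks]
    print(results)  # [0, 10, 20, 30, 40, 50]

asyncio.run(main())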
Use asyncio and aiohttp to fetch the blog's total read count (hint: first find the link to every article).
https://www.jianshu.com/u/130f76596b02
import re
import asyncio
import aiohttp
import requests
import ssl
from lxml import etree
from asyncio.queues import Queue
from aiosocksy import Socks5Auth
from aiosocksy.connector import ProxyConnector, ProxyClientRequest

class Common():
    task_queue = Queue()      # article URLs waiting to be downloaded
    result_queue = Queue()
    result_queue_1 = []       # collected per-article read counts

async def session_get(session, url, socks):
    auth = Socks5Auth(login='...', password='...')
    headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    timeout = aiohttp.ClientTimeout(total=20)
    response = await session.get(
        url,
        proxy=socks,
        proxy_auth=auth,
        timeout=timeout,
        headers=headers,
        ssl=ssl.SSLContext()
    )
    return await response.text(), response.status

async def download(url):
    connector = ProxyConnector()
    socks = None
    async with aiohttp.ClientSession(
        connector=connector,
        request_class=ProxyClientRequest
    ) as session:
        ret, status = await session_get(session, url, socks)
        # Jianshu sometimes answers with a tiny JS redirect page; follow it once.
        if 'window.location.href' in ret and len(ret) < 1000:
            url = ret.split("window.location.href='")[1].split("'")[0]
            ret, status = await session_get(session, url, socks)
        return ret, status

async def parse_html(content):
    # The read count is embedded in the page as "views_count":<number>
    read_num_pattern = re.compile(r'"views_count":\d+')
    read_num = int(read_num_pattern.findall(content)[0].split(':')[-1])
    return read_num

def get_all_article_links():
    links_list = []
    for i in range(1, 21):
        url = 'https://www.jianshu.com/u/130f76596b02?order_by=shared_at&page={}'.format(i)
        header = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
        }
        response = requests.get(url,
                                headers=header,
                                timeout=5
                                )
        tree = etree.HTML(response.text)
        article_links = tree.xpath('//div[@class="content"]/a[@class="title"]/@href')
        for item in article_links:
            article_link = 'https://www.jianshu.com' + item
            links_list.append(article_link)
            print(article_link)
    return links_list

async def down_and_parse_task(queue):
    while 1:
        try:
            url = queue.get_nowait()
        except:
            # Queue is empty: this worker is done.
            return
        error = None
        for retry_cnt in range(3):
            try:
                html, status = await download(url)
                if status != 200:
                    html, status = await download(url)
                read_num = await parse_html(html)
                print(read_num)
                # await Common.result_queue.put(read_num)
                Common.result_queue_1.append(read_num)
                break
            except Exception as e:
                error = e
                await asyncio.sleep(0.2)
                continue
        else:
            raise error

async def count_sum():
    # Periodically print the running total of all collected read counts.
    while 1:
        try:
            print(Common.result_queue_1)
            print('Total reads = ', sum(Common.result_queue_1))
            await asyncio.sleep(3)
        except:
            pass

async def main():
    all_links = get_all_article_links()
    for item in set(all_links):
        await Common.task_queue.put(item)
    # Ten concurrent download workers plus one reporter task.
    for _ in range(10):
        loop.create_task(down_and_parse_task(Common.task_queue))
    loop.create_task(count_sum())

if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    loop.create_task(main())
    loop.run_forever()
That covers multiprocess and multithreaded crawlers.
I have not fully worked through the final example yet.
Noting it down here for now.