Python多线程因为GIL的存在,它们并不能利用CPU多核优势,一个Python进程中,只允许有一个线程处于运行状态.但是在做阻塞的系统调用时,当前线程会释放GIL,让别的线程有执行机会.
#对于io操作来说,多线程和多进程性能差别不大
#1.通过Thread类实例化
import time
import threading
def get_detail_html(url):
print("get detail html started")
time.sleep(2)
print("get detail html end")
def get_detail_url(url):
print("get detail url started")
time.sleep(4)
print("get detail url end")
if __name__ == "__main__":
thread1 = threading.Thread(target=get_detail_url)
thread2 = threading.Thread(target=get_detail_html)
start_time = time.time()
thread1.start()
thread2.start()
thread1.join()
thread2.join()
print("last time: {}".format(time.time() - start_time))
get detail url started
get detail html started
get detail html end
get detail url end
last time: 4.001551389694214
#2. 通过集成Thread来实现多线程
class GetDetailHtml(threading.Thread):
def __init__(self, name):
super().__init__(name=name)
def run(self):
print("get detail html started")
time.sleep(2)
print("get detail html end")
class GetDetailUrl(threading.Thread):
def __init__(self, name):
super().__init__(name=name)
def run(self):
print("get detail url started")
time.sleep(4)
print("get detail url end")
if __name__ == "__main__":
thread1 = GetDetailHtml("get_detail_html")
thread2 = GetDetailUrl("get_detail_url")
start_time = time.time()
thread1.start()
thread2.start()
thread1.join()
thread2.join()
print ("last time: {}".format(time.time()-start_time))
get detail url started
get detail html started
get detail html end
get detail url end
last time: 4.0013439655303955
from threading import Lock, RLock #可重入的锁
# Lock在同一个线程里面, 只能调用一次, 否则会出现死锁
# RLock在同一个线程里面,可以连续调用多次acquire, 一定要注意acquire的次数要和release的次数相等
total = 0
lock = RLock()
def add():
global lock
global total
for i in range(1000000):
lock.acquire()
total += 1
lock.release()
def desc():
global total
global lock
for i in range(1000000):
lock.acquire()
total -= 1
lock.release()
import threading
thread1 = threading.Thread(target=add)
thread2 = threading.Thread(target=desc)
thread1.start()
thread2.start()
thread1.join()
thread2.join()
print(total)
#1. 用锁会影响性能
#2. 锁会引起死锁
0
#通过queue的方式进行线程间同步
from queue import Queue
import time
import threading
def get_detail_html(queue):
#爬取文章详情页
url = queue.get()
print("get detail html started")
time.sleep(2)
print(url)
print("get detail html end")
def get_detail_url(queue):
# 爬取文章列表页
print("get detail url started")
time.sleep(4)
queue.put("https://www.baidu.com/")
print("get detail url end")
if __name__ == "__main__":
detail_url_queue = Queue(maxsize=1000)
thread_detail_url = threading.Thread(target=get_detail_url, args=(detail_url_queue,))
html_thread = threading.Thread(target=get_detail_html, args=(detail_url_queue,))
thread_detail_url.start()
html_thread.start()
detail_url_queue.join()
html_thread.join()
get detail url started
get detail url end
get detail html started
https://www.baidu.com/
get detail html end
import threading
from threading import Condition
#通过condition完成协同读诗
class XiaoAi(threading.Thread):
def __init__(self, cond):
super().__init__(name="小爱")
self.cond = cond
def run(self):
with self.cond:
self.cond.wait()
print("{} : 在 ".format(self.name))
self.cond.notify()
self.cond.wait()
print("{} : 好啊 ".format(self.name))
self.cond.notify()
self.cond.wait()
print("{} : 君住长江尾 ".format(self.name))
self.cond.notify()
self.cond.wait()
print("{} : 共饮长江水 ".format(self.name))
self.cond.notify()
self.cond.wait()
print("{} : 此恨何时已 ".format(self.name))
self.cond.notify()
self.cond.wait()
print("{} : 定不负相思意 ".format(self.name))
self.cond.notify()
class TianMao(threading.Thread):
def __init__(self, cond):
super().__init__(name="天猫精灵")
self.cond = cond
def run(self):
with self.cond:
print("{} : 小爱同学 ".format(self.name))
self.cond.notify()
self.cond.wait()
print("{} : 我们来对古诗吧 ".format(self.name))
self.cond.notify()
self.cond.wait()
print("{} : 我住长江头 ".format(self.name))
self.cond.notify()
self.cond.wait()
print("{} : 日日思君不见君 ".format(self.name))
self.cond.notify()
self.cond.wait()
print("{} : 此水几时休 ".format(self.name))
self.cond.notify()
self.cond.wait()
print("{} : 只愿君心似我心 ".format(self.name))
self.cond.notify()
self.cond.wait()
if __name__ == "__main__":
cond = threading.Condition()
xiaoai = XiaoAi(cond)
tianmao = TianMao(cond)
#启动顺序很重要
#在调用with cond之后才能调用wait或者notify方法
#condition有两层锁, 一把底层锁会在线程调用了wait方法的时候释放, 上面的锁会在每次调用wait的时候分配一把并放入到cond的等待队列中,等到notify方法的唤醒
xiaoai.start()
tianmao.start()
天猫精灵 : 小爱同学
小爱 : 在
天猫精灵 : 我们来对古诗吧
小爱 : 好啊
天猫精灵 : 我住长江头
小爱 : 君住长江尾
天猫精灵 : 日日思君不见君
小爱 : 共饮长江水
天猫精灵 : 此水几时休
小爱 : 此恨何时已
天猫精灵 : 只愿君心似我心
小爱 : 定不负相思意
#Semaphore 是用于控制进入数量的锁
#文件, 读、写, 写一般只是用于一个线程写,读可以允许有多个
#做爬虫
import threading
import time
class HtmlSpider(threading.Thread):
def __init__(self, url, sem):
super().__init__()
self.url = url
self.sem = sem
def run(self):
time.sleep(2)
print("got html text success")
self.sem.release()
class UrlProducer(threading.Thread):
def __init__(self, sem):
super().__init__()
self.sem = sem
def run(self):
for i in range(20):
self.sem.acquire()
html_thread = HtmlSpider("https://baidu.com/{}".format(i), self.sem)
html_thread.start()
if __name__ == "__main__":
sem = threading.Semaphore(3)
url_producer = UrlProducer(sem)
url_producer.start()
got html text success
......
got html text success
got html text success
from concurrent.futures import ThreadPoolExecutor, as_completed, wait, FIRST_COMPLETED
#未来对象,task的返回容器
#线程池, 为什么要线程池
#主线程中可以获取某一个线程的状态或者某一个任务的状态,以及返回值
#当一个线程完成的时候我们主线程能立即知道
#futures可以让多线程和多进程编码接口一致
import time
def get_html(times):
time.sleep(times)
print("get page {} success".format(times))
return times
executor = ThreadPoolExecutor(max_workers=2)
#要获取已经成功的task的返回
urls = [3,2,4]
all_task = [executor.submit(get_html, (url)) for url in urls]
# wait(all_task, return_when=FIRST_COMPLETED)
# print("main")
for future in as_completed(all_task):
data = future.result()
print("get {} page".format(data))
#通过executor的map获取已经完成的task的值
# for data in executor.map(get_html, urls):
# print("get {} page".format(data))
get page 2 success
get 2 page
get page 3 success
get 3 page
get page 4 success
get 4 page