爬虫学习 异步爬虫(五)

多线程 多进程 协程

进程 运行中的程序
线程 被CPU调度的执行单位,是操作系统进行运算调度的最小单位
包含在进程之中,是进程中的实际运作单位

from threading import Thread


#创建任务
def func(name):
    """Print the given label next to each counter value from 0 to 99."""
    for counter in range(100):
        print(name, counter)


if __name__ == '__main__':
    # Create the worker threads. args must be a tuple, hence the
    # trailing comma in ("一一一",).
    t1 = Thread(target=func, args=("一一一",))
    t2 = Thread(target=func, args=("二二二",))
    # Fix: the original used tab indentation on the two start() lines,
    # which raises TabError (the file would not even parse).
    t1.start()
    t2.start()
    print("我是主线程")
# 1 main thread + 2 worker threads

面向对象方法

from threading import Thread

class MyThread(Thread):
    """Thread subclass whose run() prints its label with a counter."""

    def __init__(self, name):
        # Initialize the base Thread first, then attach our own state.
        # Thread.name is a property with a setter, so assigning is fine.
        super().__init__()
        self.name = name

    def run(self):
        # Body executed in the new thread once start() is called.
        for counter in range(100):
            print(self.name, counter)

if __name__ == '__main__':
    # Spin up two worker threads, each printing its own label.
    for label in ("一一一", "二二二"):
        MyThread(label).start()

线程池

from concurrent.futures import ThreadPoolExecutor

def func(name):
    """Print the label together with each counter value 0..9."""
    counter = 0
    while counter < 10:
        print(name, counter)
        counter += 1

if __name__ == '__main__':
    # A pool of 10 workers drains the 100 submitted tasks.
    with ThreadPoolExecutor(10) as pool:
        for task_no in range(100):
            pool.submit(func, f"num{task_no}")

返回值使用

from concurrent.futures import ThreadPoolExecutor

def func(name):
    """Print the label ten times, then return it for the callback demo.

    Bug fixed: the original ``return name`` sat *inside* the loop, so
    only one line was ever printed before the function returned.
    """
    for counter in range(10):
        print(name, counter)
    return name
    
def fn(res):
    """Completion callback: unwrap and print the finished future's result."""
    print(res.result())

if __name__ == '__main__':
    with ThreadPoolExecutor(10) as pool:
        for task_no in range(100):
            future = pool.submit(func, f"num{task_no}")
            # The callback fires as soon as each future completes;
            # completion order is not deterministic.
            future.add_done_callback(fn)

map映射
map返回值 有顺序

from concurrent.futures import ThreadPoolExecutor

def func(name):
    """Print the label ten times, then return it.

    Bug fixed: the original ``return name`` was inside the loop body,
    so the loop stopped after a single iteration.
    """
    for counter in range(10):
        print(name, counter)
    return name
    
def fn(res):
    """Done-callback: print whatever result the future resolved to."""
    print(res.result())

if __name__ == '__main__':
    with ThreadPoolExecutor(10) as pool:
        for task_no in range(100):
            # submit() returns a future; fn runs when it completes.
            # Completion (and therefore print) order is not guaranteed.
            pool.submit(func, f"num{task_no}").add_done_callback(fn)
if __name__ == '__main__':
    with ThreadPoolExecutor(10) as t:
        # Fix: the original indentation was broken (the assignment sat
        # outside the with-block while the loop was over-indented),
        # which is a SyntaxError/IndentationError.
        # map() returns a generator; results come back in submission order.
        result = t.map(func, ["111", "222", "333"])
        for r in result:
            print(r)

线程池案例

#北京新发地
import  requests
from lxml import etree
from concurrent.futures import ThreadPoolExecutor#线程池

# Output file shared by every pool worker in download(); it is never
# explicitly closed and relies on interpreter exit to flush.
# NOTE(review): concurrent writes from multiple threads may interleave —
# confirm this is acceptable for the CSV output.
f = open("线程池案例.csv","w",encoding="utf-8")

def download(url):
    """Fetch one listing page and append each table row to the shared CSV.

    Fixes: add a request timeout so a stalled server cannot hang a pool
    worker forever, and close the response to release the connection.
    NOTE(review): `f` is a module-level handle shared across pool threads;
    interleaved writes are possible — confirm acceptable.
    """
    resp = requests.get(url, timeout=10)
    try:
        tree = etree.HTML(resp.text)
        # Each <tr> of the quote table becomes one comma-joined CSV line.
        for tr in tree.xpath("//table[@class='hq_table']/tr"):
            td_texts = tr.xpath("./td/text()")
            f.write(",".join(td_texts))
            f.write('\n')
    finally:
        resp.close()

if __name__ == '__main__':
    # Fan the page downloads out over a 10-thread pool.
    with ThreadPoolExecutor(10) as pool:
        for page in range(1, 16964):
            # Calling download(url) directly here would serialize the
            # work — submit to the pool instead.
            pool.submit(download, f"http...{page}.shtml")

你可能感兴趣的:(爬虫,学习,python)