When a site's anti-scraping measures aren't very strict, you can raise your crawl throughput with multithreading or coroutines. Plenty of articles explain the theory better than I can, so what follows is how I personally use them; you could call these the templates I write my code from.
import requests
from bs4 import BeautifulSoup
import time
from loguru import logger

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"
}

def f1(url):
    res_list = []
    logger.debug('Requesting %s' % url)
    response = requests.get(url=url, headers=headers)
    response.encoding = 'gbk'  # the page is GBK-encoded
    page_text = response.text
    soup = BeautifulSoup(page_text, 'lxml')
    dls = soup.find_all("dl")  # each <dl> is one brand
    for dl in dls:
        brandId = dl.get("id")
        brandName = dl.find("dt").text
        cxzs = dl.find_all(class_="rank-list-ul")  # the model groups under a brand
        for cxz in cxzs:
            zm = cxz.find_previous().text  # the heading just before each group
            cxs = cxz.find_all("li")
            for cx in cxs:
                try:
                    cxId = cx.get("id").replace("s", "")
                    cxName = cx.find("a").text
                    res_list.append(dict(brandId=brandId, brandName=brandName, cxId=cxId, cxName=cxName))
                except AttributeError:
                    pass  # some <li> items carry no id or link
    logger.info('Finished %s' % url)
    return dict(url=url, res_list=res_list, code='success')

if __name__ == '__main__':
    a = time.time()
    urls = []
    for i in range(ord("A"), ord("Z") + 1):
        U = chr(i)
        urls.append("https://www.autohome.com.cn/grade/carhtml/%s.html" % U)
    for url in urls:
        f1(url)
    logger.error(time.time() - a)
If you're interested, run it yourself. The example fetches Autohome's brand pages from A to Z (the URL pattern is in the code above), and the multithreading and coroutine versions below all reuse this same example.
First, multithreading. This is the template I habitually wrap around a task:
import threading
from loguru import logger
import time
import pandas as pd
from sqlalchemy import create_engine

class ApiDataThread(object):
    def __init__(self, shop_info_list):
        self.shop_info_list = shop_info_list
        self.thread_lock = threading.Lock()

    def get_info(self):
        task = None
        self.thread_lock.acquire()
        if self.shop_info_list:
            task = self.shop_info_list.pop()
            """
            take a task off the shared list
            """
        self.thread_lock.release()
        if task is not None:
            """
            business logic
            """
            self.thread_lock.acquire()
            """
            store the result
            """
            self.thread_lock.release()

    def main(self, n):
        pool = []
        for i in range(n):
            t = threading.Thread(target=self.get_info)
            t.start()
            pool.append(t)
        for j in pool:
            j.join()

if __name__ == '__main__':
    shop_info_list = [...]  # fill in your task list (don't shadow the built-in name list)
    obj = ApiDataThread(shop_info_list)
    obj.main(len(shop_info_list))
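The pandas and sqlalchemy imports at the top of the template are there for the "store the result" step. A minimal sketch of how that placeholder might be filled in, assuming a purely hypothetical MySQL connection string and table name car_models:

import pandas as pd
from sqlalchemy import create_engine

def save_results(res_list):
    """Persist a list of result dicts; a sketch, not production code."""
    # Hypothetical connection string and table name, adjust to your setup.
    engine = create_engine("mysql+pymysql://user:password@localhost:3306/spider")
    df = pd.DataFrame(res_list)
    # append so repeated runs don't clobber earlier batches
    df.to_sql("car_models", con=engine, if_exists="append", index=False)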
Here is the template applied to the example above, with one URL per thread:
import threading
from loguru import logger
import time
import requests
from bs4 import BeautifulSoup

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"
}

class ApiDataThread(object):
    def __init__(self, shop_info_list):
        self.shop_info_list = shop_info_list
        self.thread_lock = threading.Lock()
        self.res_list = []

    def get_info(self):
        task = None
        self.thread_lock.acquire()
        if self.shop_info_list:
            task = self.shop_info_list.pop()
        self.thread_lock.release()
        if task is not None:
            logger.debug('Requesting %s' % task)
            response = requests.get(url=task, headers=headers)
            response.encoding = 'gbk'
            page_text = response.text
            soup = BeautifulSoup(page_text, 'lxml')
            dls = soup.find_all("dl")  # each <dl> is one brand
            for dl in dls:
                brandId = dl.get("id")
                brandName = dl.find("dt").text
                cxzs = dl.find_all(class_="rank-list-ul")
                for cxz in cxzs:
                    zm = cxz.find_previous().text  # the heading just before each group
                    cxs = cxz.find_all("li")
                    for cx in cxs:
                        try:
                            cxId = cx.get("id").replace("s", "")
                            cxName = cx.find("a").text
                            # the result list is shared across threads, so append under the lock
                            self.thread_lock.acquire()
                            self.res_list.append(dict(brandId=brandId, brandName=brandName, cxId=cxId, cxName=cxName))
                            self.thread_lock.release()
                        except AttributeError:
                            pass  # some <li> items carry no id or link

    def main(self, n):
        pool = []
        for i in range(n):
            t = threading.Thread(target=self.get_info)
            t.start()
            pool.append(t)
        for j in pool:
            j.join()
        return self.res_list

if __name__ == '__main__':
    a = time.time()
    urls = []
    for i in range(ord("A"), ord("Z") + 1):
        U = chr(i)
        urls.append("https://www.autohome.com.cn/grade/carhtml/%s.html" % U)
    obj = ApiDataThread(urls)
    print(obj.main(len(urls)))
    logger.error(time.time() - a)
Run it and you should see it finish a bit faster than the sequential version.
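Incidentally, the lock-and-pop pattern above is something the standard library can also handle for you. A minimal sketch of the same fan-out written with concurrent.futures.ThreadPoolExecutor, reusing the f1 function from the first example (max_workers=26 just mirrors the one-thread-per-URL setup and is an arbitrary choice):

from concurrent.futures import ThreadPoolExecutor

urls = ["https://www.autohome.com.cn/grade/carhtml/%s.html" % chr(i)
        for i in range(ord("A"), ord("Z") + 1)]

# map() hands URLs out to the pool and collects the return values in order
with ThreadPoolExecutor(max_workers=26) as pool:
    results = list(pool.map(f1, urls))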
Next, coroutines. This part needs aiohttp and asyncio; asyncio ships with Python 3, while aiohttp is third-party (pip install aiohttp).
import aiohttp
from loguru import logger
import asyncio
from bs4 import BeautifulSoup
import time

class AioHttps(object):
    def __init__(self, urls):
        self.urls = urls
        self.result = []

    async def fetch(self, session, url, headers=None, timeout=9):
        _headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"
        }
        if headers:
            _headers = headers
        logger.debug('Requesting %s' % url)
        try:
            # note: pass _headers here, not the (possibly None) headers argument
            async with session.get(url, headers=_headers, timeout=timeout) as response:
                self.bs4_html(await response.read(), url)
        except Exception as e:
            self.result.append(dict(url=url, res_list='', exceptions=e))
        logger.info('Finished %s' % url)

    async def main(self):
        async with aiohttp.ClientSession() as client:
            tasks = []
            for url in self.urls:
                tasks.append(asyncio.create_task(self.fetch(client, url)))
            await asyncio.wait(tasks)

    def bs4_html(self, response, url):
        """
        Parse the HTML.
        :param response: raw page bytes
        :param url: the page URL
        :return:
        """
        res_list = []
        soup = BeautifulSoup(response, 'lxml')
        dls = soup.find_all("dl")  # each <dl> is one brand
        for dl in dls:
            brandId = dl.get("id")
            brandName = dl.find("dt").text
            cxzs = dl.find_all(class_="rank-list-ul")
            for cxz in cxzs:
                zm = cxz.find_previous().text  # the heading just before each group
                cxs = cxz.find_all("li")
                for cx in cxs:
                    try:
                        cxId = cx.get("id").replace("s", "")
                        cxName = cx.find("a").text
                        res_list.append(dict(brandId=brandId, brandName=brandName, cxId=cxId, cxName=cxName))
                    except AttributeError:
                        pass  # some <li> items carry no id or link
        self.result.append(dict(url=url, res_list=res_list, code='success'))

    def run(self):
        asyncio.get_event_loop().run_until_complete(self.main())
        return self.result

if __name__ == '__main__':
    a = time.time()
    urls = []
    for i in range(ord("A"), ord("Z") + 1):
        U = chr(i)
        urls.append("https://www.autohome.com.cn/grade/carhtml/%s.html" % U)
    obj = AioHttps(urls)
    obj.run()
    logger.error(time.time() - a)
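One refinement worth adding for larger URL lists: since the whole premise is that the site tolerates us, you usually want to cap how many requests are in flight at once. A minimal sketch of a drop-in replacement for main() using asyncio.Semaphore (the cap of 10 is an arbitrary assumption):

import asyncio
import aiohttp

# Drop-in replacement for AioHttps.main that caps concurrency.
async def main(self):
    sem = asyncio.Semaphore(10)  # at most 10 requests in flight; tune to taste

    async def limited(coro):
        async with sem:  # wait for a free slot before firing the request
            return await coro

    async with aiohttp.ClientSession() as client:
        tasks = [asyncio.create_task(limited(self.fetch(client, url)))
                 for url in self.urls]
        await asyncio.wait(tasks)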
Running it, this feels even a bit faster than the multithreaded version; it works nicely. Finally, we can combine the two: split the URL list into chunks, and give each thread its own event loop running a batch of coroutines.
import aiohttp
from loguru import logger
import asyncio
from bs4 import BeautifulSoup
import time
import threading

class AioHttps(object):
    """
    Coroutine worker
    """
    def __init__(self, urls):
        self.urls = urls
        self.result = []

    async def fetch(self, session, url, headers=None, timeout=9):
        _headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"
        }
        if headers:
            _headers = headers
        logger.debug('Requesting %s' % url)
        try:
            # note: pass _headers here, not the (possibly None) headers argument
            async with session.get(url, headers=_headers, timeout=timeout) as response:
                self.bs4_html(await response.read(), url)
        except Exception as e:
            self.result.append(dict(url=url, res_list='', exceptions=e))
        logger.info('Finished %s' % url)

    async def main(self):
        async with aiohttp.ClientSession() as client:
            tasks = []
            for url in self.urls:
                tasks.append(asyncio.create_task(self.fetch(client, url)))
            await asyncio.wait(tasks)

    def bs4_html(self, response, url):
        """
        Parse the HTML.
        :param response: raw page bytes
        :param url: the page URL
        :return:
        """
        res_list = []
        soup = BeautifulSoup(response, 'lxml')
        dls = soup.find_all("dl")  # each <dl> is one brand
        for dl in dls:
            brandId = dl.get("id")
            brandName = dl.find("dt").text
            cxzs = dl.find_all(class_="rank-list-ul")
            for cxz in cxzs:
                zm = cxz.find_previous().text  # the heading just before each group
                cxs = cxz.find_all("li")
                for cx in cxs:
                    try:
                        cxId = cx.get("id").replace("s", "")
                        cxName = cx.find("a").text
                        res_list.append(dict(brandId=brandId, brandName=brandName, cxId=cxId, cxName=cxName))
                    except AttributeError:
                        pass  # some <li> items carry no id or link
        self.result.append(dict(url=url, res_list=res_list, code='success'))

    def run(self):
        # use new_event_loop here: get_event_loop() raises outside the main thread
        asyncio.new_event_loop().run_until_complete(self.main())
        return self.result

class ApiDataThread(object):
    """
    Thread dispatcher
    """
    def __init__(self, shop_info_list):
        self.shop_info_list = shop_info_list
        self.thread_lock = threading.Lock()
        self.res_list = []

    def get_info(self):
        task = None
        self.thread_lock.acquire()
        if self.shop_info_list:
            task = self.shop_info_list.pop()
        self.thread_lock.release()
        if task is not None:
            # run the coroutine batch outside the lock; holding the lock here
            # would make the threads execute one at a time
            obj = AioHttps(task)
            batch = obj.run()
            self.thread_lock.acquire()  # lock only while touching the shared result list
            self.res_list += batch
            self.thread_lock.release()

    def main(self, n):
        pool = []
        for i in range(n):
            t = threading.Thread(target=self.get_info)
            t.start()
            pool.append(t)
        for j in pool:
            j.join()
        return self.res_list

if __name__ == '__main__':
    a = time.time()
    urls = []
    for i in range(ord("A"), ord("Z") + 1):
        U = chr(i)
        urls.append("https://www.autohome.com.cn/grade/carhtml/%s.html" % U)
    lists = [urls[i:i + 10] for i in range(0, len(urls), 10)]  # chunks of 10 URLs per thread
    obj = ApiDataThread(lists)
    print(obj.main(len(lists)))
    logger.error(time.time() - a)
Run it and it's also fast, though the gap is small; with this little data the difference probably just doesn't show.
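A note on the new_event_loop() comment in run(): asyncio.get_event_loop() only hands back a loop on the main thread, so each worker thread has to build its own. On Python 3.7+ the same idea can be written more simply with asyncio.run(), which creates and closes a fresh loop on every call; a minimal sketch of an equivalent run():

import asyncio

def run(self):
    # asyncio.run() spins up a fresh event loop, runs the coroutine to
    # completion, and closes the loop; safe to call once per worker thread
    asyncio.run(self.main())
    return self.result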
These are the boilerplate templates I reach for most often. If you have other approaches or suggestions, I'm always happy to discuss!