Python: asynchronously crawling Bing search results

A simple asynchronous crawler for Bing search results; the throughput is quite respectable.
It uses the two async modules aiohttp and asyncio to issue the requests, then extracts the result links with an XPath expression.
Adding a Cookie header to the request helps keep the crawler from being blocked.

import aiohttp
import asyncio
from lxml import etree


headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:81.0) Gecko/20100101 Firefox/81.0',
				'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
				'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
				'Accept-Encoding': 'gzip, deflate',
				'Cookie': 'BAIDUID=1A6EF88EE4929836C761FB37A1303522:FG=1; BIDUPSID=1A6EF88EE4929836C761FB37A1303522; PSTM=1603199415; H_PS_PSSID=32755_1459_32877_7567_31253_32706_32231_7517_32117_32845_32761_26350; BD_UPN=13314752; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; delPer=0; BD_CK_SAM=1; PSINO=5; H_PS_645EC=e4bcE4275G3zWcvH2pxYG6R32rBxb5yuey8xcioaej8V7IaJRfEq4xp4iCo; COOKIE_SESSION=45294_0_2_5_0_2_0_1_0_2_3_0_0_0_0_0_0_0_1603244844%7C5%230_0_1603244844%7C1; BA_HECTOR=2gal2h2ga58025f1vs1fov5vf0k'}

async def url():
	async with aiohttp.ClientSession() as session:
		for page in range(1, 100):  # flip through the result pages with a for loop
			url = 'https://cn.bing.com/search?q=site%3aedu.cn&go=%e6%90%9c%e7%b4%a2&qs=ds&first=' + str((page * 10) - 1) + '&FORM=PERE'
			try:
				async with session.get(url, headers=headers) as resp:
					r = await resp.text()
					a = etree.HTML(r)
					links = a.xpath('//*[@id="b_results"]/li/h2/a/@href')  # extract the result links
					for link in links:
						print(link)
			except Exception:
				print('connection failed')

asyncio.run(url())
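
As written, the loop awaits each page before requesting the next one, so the requests are effectively sequential. Below is a minimal sketch of firing the page requests concurrently with asyncio.gather, reusing the headers dict defined above; the fetch_one/fetch_pages names and the concurrency limit of 10 are illustrative assumptions, not part of the original post.

async def fetch_one(session, sem, first):
	# first is Bing's result offset for the page
	url = 'https://cn.bing.com/search?q=site%3aedu.cn&qs=ds&first=' + str(first) + '&FORM=PERE'
	async with sem:  # cap how many requests are in flight at once
		async with session.get(url, headers=headers) as resp:
			page = etree.HTML(await resp.text())
			return page.xpath('//*[@id="b_results"]/li/h2/a/@href')

async def fetch_pages():
	sem = asyncio.Semaphore(10)  # illustrative concurrency limit
	async with aiohttp.ClientSession() as session:
		tasks = [fetch_one(session, sem, (i * 10) - 1) for i in range(1, 100)]
		results = await asyncio.gather(*tasks, return_exceptions=True)
	for links in results:
		if isinstance(links, Exception):
			print('connection failed')
		else:
			for link in links:
				print(link)

asyncio.run(fetch_pages())

With gather, all the page requests are scheduled up front and the semaphore keeps only a handful running at a time, which is where the real speedup from async comes from.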



