Using requests.Session() in Depth
import requests

# Create a Session instance
session = requests.Session()

# Send multiple requests to the same host
response_one = session.get('https://httpbin.org/get')
response_two = session.get('https://httpbin.org/get')

# Demonstrate the persistent-connection behaviour: both requests go out over the same connection
print(id(response_one.raw._connection))
print(id(response_two.raw._connection))
# The same ID is printed for both, meaning the same underlying connection was reused

# Always clean up afterwards: closing the session is routine practice
session.close()
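If you would rather not call close() by hand, Session also supports the context-manager protocol, so the session is closed automatically even when an exception is raised. A minimal sketch of the same two-request example (httpbin.org is used purely as a demonstration endpoint):

import requests

# The with-block closes the session automatically on exit
with requests.Session() as session:
    first = session.get('https://httpbin.org/get')
    second = session.get('https://httpbin.org/get')
    print(first.status_code, second.status_code)
# No explicit session.close() is needed here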
import requests
from requests.auth import HTTPBasicAuth

# Create a Session instance and set some defaults
session = requests.Session()
session.headers.update({'user-agent': 'my-app/0.0.1'})
session.auth = HTTPBasicAuth('username', 'password')

# Every request made from now on carries the preset headers and auth
response = session.get('https://httpbin.org/headers')
print(response.text)  # you should see the "user-agent" header and the auth info set above

# Close the session once the requests are finished, as usual
session.close()
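A related point worth showing: headers passed to an individual request are merged with the session-level defaults, and a key supplied at request time overrides the session value for that request only. A minimal sketch (the header values are just placeholders):

import requests

session = requests.Session()
session.headers.update({'user-agent': 'my-app/0.0.1'})

# The request-level header wins for this single call; other session defaults still apply
response = session.get('https://httpbin.org/headers',
                       headers={'user-agent': 'my-app/0.0.2'})
print(response.text)

session.close()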
import requests

# Create a Session instance
session = requests.Session()

# Log in first so that the server sets a cookie
login_res = session.post('https://example.com/login', data={'username': 'xxx', 'password': 'yyy'})

# The Session stores the cookies the server sets on the client; all subsequent requests carry them automatically
profile_res = session.get('https://example.com/profile')

# Response content from the authenticated request
print(profile_res.text)

# Close the session after all work is done to release the resources
session.close()
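The cookies a session accumulates live in session.cookies, a cookie jar that you can inspect or pre-populate yourself. A small sketch; the cookie name and value below are made up for illustration:

import requests

session = requests.Session()
# 'sessionid' / 'abc123' are hypothetical values
session.cookies.set('sessionid', 'abc123')
print(session.cookies.get_dict())  # {'sessionid': 'abc123'}
session.close()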
import requests
import logging
from requests.exceptions import HTTPError, ConnectionError, Timeout, RequestException

# Configure logging: WARNING level, a short format, output to the console
logging.basicConfig(level=logging.WARNING,
                    format='%(asctime)s - %(levelname)s - %(message)s')

url = "https://example.com"

try:
    response = requests.get(url, timeout=10)
    response.raise_for_status()
except HTTPError as http_err:
    logging.warning(f"HTTP error occurred: {http_err}")
except ConnectionError as conn_err:
    logging.warning(f"Connection error occurred: {conn_err}")
except Timeout as timeout_err:
    logging.warning(f"Request timed out: {timeout_err}")
except RequestException as err:
    logging.warning(f"Request error occurred: {err}")
else:
    print("Request completed successfully.")
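For transient failures (timeouts, 5xx responses) it is often simpler to let the transport layer retry automatically than to handle every exception by hand. A sketch using requests' HTTPAdapter together with urllib3's Retry; the retry counts and status codes below are illustrative, not recommendations:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

retry = Retry(total=3, backoff_factor=0.5,
              status_forcelist=[500, 502, 503, 504])

session = requests.Session()
# Mount the retrying adapter for both schemes
session.mount('https://', HTTPAdapter(max_retries=retry))
session.mount('http://', HTTPAdapter(max_retries=retry))

response = session.get('https://example.com', timeout=10)
print(response.status_code)
session.close()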
from bs4 import BeautifulSoup
import logging

# Configure logging: set the level and output to the console
logging.basicConfig(level=logging.WARNING,
                    format='%(asctime)s - %(levelname)s - %(message)s')
html_doc = """
<html><head><title>This is title</title></head><body></body></html>
"""
try:
    soup = BeautifulSoup(html_doc, "html.parser")
    title_text = soup.title.text
except AttributeError as e:
    # Raised when BeautifulSoup is asked for an attribute that does not exist
    logging.warning(f"Attribute not found. Error: {e}")
except Exception as e:
    # Generic catch for other unexpected errors that may occur while parsing the HTML document
    logging.error(f"An error occurred: {e}")
else:
    print(f"The document title is: {title_text}")
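An alternative to catching AttributeError is to check the lookup result first: find() returns None when the tag is missing, so the absence can be handled with a plain if. A minimal sketch:

from bs4 import BeautifulSoup

soup = BeautifulSoup("<p>no title here</p>", "html.parser")
title_tag = soup.find('title')   # None when the document has no <title> tag
if title_tag is not None:
    print(title_tag.text)
else:
    print("Document has no <title> tag.")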
from concurrent.futures import ThreadPoolExecutor, as_completed
import requests
from bs4 import BeautifulSoup

urls = [
    'https://example.com',
    'https://example.org',
    'https://example.net',
]

def fetch_and_parse(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    title = soup.title.text
    return title

with ThreadPoolExecutor(max_workers=5) as executor:
    # Map each future back to its URL so failures can be reported against the right address
    future_to_url = {executor.submit(fetch_and_parse, url): url for url in urls}
    for future in as_completed(future_to_url):
        url = future_to_url[future]
        try:
            data = future.result()
            print(data)
        except Exception as exc:
            print(f"{url} generated an exception: {exc}")
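When results should come back in the same order as the input URLs, executor.map is a simpler alternative to submit/as_completed; note that an exception raised in a worker re-surfaces while iterating the results. A sketch reusing fetch_and_parse and urls from above:

from concurrent.futures import ThreadPoolExecutor

with ThreadPoolExecutor(max_workers=5) as executor:
    # Results are yielded in input order; worker exceptions propagate here
    for url, title in zip(urls, executor.map(fetch_and_parse, urls)):
        print(url, title)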
from concurrent.futures import ThreadPoolExecutor
import requests
from bs4 import BeautifulSoup

url = "https://example.com/products"

def parse_html(html):
    soup = BeautifulSoup(html, "html.parser")
    products = soup.find_all('li', {'class': 'product'})
    return [product.text for product in products]

def get_html(url):
    response = requests.get(url)
    return response.text

with ThreadPoolExecutor() as executor:
    # Run the download and the parsing as two tasks on the pool
    html = executor.submit(get_html, url).result()
    product_texts = executor.submit(parse_html, html).result()
    print(product_texts)
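The same two-stage idea (download in the pool, then parse) scales to several pages at once; the paginated /products?page=N URLs below are hypothetical. A sketch reusing get_html and parse_html from above:

from concurrent.futures import ThreadPoolExecutor, as_completed

page_urls = [f"https://example.com/products?page={n}" for n in range(1, 4)]

with ThreadPoolExecutor() as executor:
    # Submit all downloads first, then parse each page as its HTML arrives
    html_futures = {executor.submit(get_html, u): u for u in page_urls}
    for future in as_completed(html_futures):
        print(html_futures[future], parse_html(future.result()))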
from multiprocessing.pool import ThreadPool
import requests
from bs4 import BeautifulSoup

urls = ["https://example.com", "https://example.org"]

def fetch_and_parse(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    return soup.title.text

if __name__ == '__main__':
    pool = ThreadPool(processes=2)
    results = pool.map(fetch_and_parse, urls)
    pool.close()
    pool.join()
    for title in results:
        print(title)
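ThreadPool also supports the context-manager protocol, which removes the explicit close()/join() bookkeeping. A minimal sketch reusing fetch_and_parse and urls from above; imap_unordered yields each result as soon as its worker finishes:

from multiprocessing.pool import ThreadPool

if __name__ == '__main__':
    with ThreadPool(processes=2) as pool:
        # Results arrive in completion order rather than input order
        for title in pool.imap_unordered(fetch_and_parse, urls):
            print(title)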