# requests和BeautifulSoup都会自行检测网页编码并据此解码,因此可能出现检测错误;此时需要手动指定编码方式,才能使中文正常显示
from bs4 import BeautifulSoup
import requests
# Fetch one page and print the first child of the <h1> found inside the
# first element matching '.brief .table'.
#
# requests guesses the response encoding on its own and may guess wrong,
# so the encoding is set explicitly before the body is decoded.

# Browser-like User-Agent. The header name must be spelled 'User-Agent';
# a key such as 'user_agent' is sent as an unrelated header.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36',
}

# Pass the module-level dict directly (there is no `self` at module scope).
# The timeout keeps the script from hanging forever on an unresponsive host.
res = requests.get(
    'http://info.2016.163.com/athlete/1280.html',
    headers=headers,
    timeout=10,
)

# Without this line the Chinese text comes out garbled. 'utf-8' was chosen
# to match the encoding declared in the page source; other pages may need a
# different codec, e.g. 'gb18030'.
res.encoding = 'utf-8'

soup = BeautifulSoup(res.text, 'lxml')
body_soup = soup.html.body

# select() returns a list of matches; take the first one.
info = body_soup.select('.brief .table')[0]
print(info.h1.contents[0])  # sample output: 安-弗雷泽