Python requests响应数据乱码问题处理

#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
"""
@des     :Python 爬虫过程中编码乱码问题解决,
			默认情况下,requests会自动从响应头和响应体中解析编码方式,解析失败会赋个默认编码格式 ISO-8859-1,所以导致遇到GBK等编码时出现乱码,以下get_encoding通过相同原理,获取编码方式,成功率显著提高
"""
import chardet
import requests
import re


def get_encoding(response_obj: object) -> str:
    """
    Determine the text encoding of an HTTP response.

    The charset declared in the ``Content-Type`` response header is used
    when present; otherwise the encoding is guessed from the raw response
    body with ``chardet``.

    :param response_obj: response object exposing ``headers`` (a mapping)
        and ``content`` (the raw body bytes), e.g. a ``requests`` response
    :return: encoding name; may be ``None`` when the header carries no
        charset and ``chardet`` cannot identify the body
    """
    # A response may omit Content-Type entirely; treat that as "no charset"
    # instead of raising KeyError.
    content_type = response_obj.headers.get("Content-Type") or ""
    # Capture only the charset token itself: skip an optional opening quote
    # and stop at whitespace, ';' or a quote so that trailing parameters
    # (e.g. '; boundary=x') and quoting are not returned as part of the name.
    match = re.search(r'charset\s*=\s*["\']?([^\s;"\']+)', content_type, re.I)
    if match is not None:
        return match.group(1)
    # No usable charset in the header: fall back to detection on the bytes.
    return chardet.detect(response_obj.content)["encoding"]


# Demo: fetch a page, override the encoding requests picked with the one
# resolved by get_encoding, then print the decoded body.
url = "http://news.inewsweek.cn/society/2022-05-30/15753.shtml"
# Without a timeout requests waits indefinitely on an unresponsive server.
res = requests.get(url, timeout=10)
res.encoding = get_encoding(res)
print(res.text)

你可能感兴趣的:(爬虫,爬虫)