[杞浇]Python HTTP搴搑equests涓枃椤甸潰涔辩爜瑙e喅鏂规锛�

Python涓枃涔辩爜锛屾槸涓�涓緢澶х殑鍧戯紝鑷繁涓嶇煡閬撳湪杩欓噷閬囧埌澶氬皯闂浜嗐�傝繕濂介�氳繃鑷繁涓嶆柇鐨勬�荤粨锛岀幇鍦ㄩ亣鍒颁贡鐮佺殑鎯呭喌瓒婃潵瓒婂皯锛屽氨绠楀嚭鐜帮紝涓�鑸篃鑳藉揩閫熻В鍐抽棶棰樸�傝繖涓棶棰橈紝鎴戜竷鏈堝氨瑙e喅浜嗭紝浠婂ぉ鎬荤粨鍑烘潵锛屽拰鏈嬪弸涓�璧峰垎浜��

最近写过好几个爬虫，熟悉了下Python

requests库的用法，这个库真的比Python的官方api接口好用多了。美中不足的是：这个库好像对中文的支持不是很友好，有些页面会出现乱码，然后换成urllib后，问题就没有了。由于requests库最终使用的是urllib3作为底层传输适配器，requests只是把urllib3库读取的原始数据进行人性化的处理，所以问题出在requests库本身上！于是决定阅读库源码，解决该中文乱码问题；另一方面，也是希望加强自己对HTTP协议、Python的理解。

先是按照api接口，一行行阅读代码，尝试了解问题出在哪里！整个过程进展比较慢，我大概花了5天左右的时间，通读了该库的源代码。阅读代码过程中，有不懂的地方，就自己打印日志信息，以帮助理解。

最后我是这样发现问题所在的！

>>> req = requests.get('http://www.jd.com')
>>> req
<Response [200]>
>>> print req.text[:100]
FILE: /usr/lib/python2.7/dist-packages/requests/models.pyc, LINE: 770 <==> ISO-8859-1
FILE: /usr/lib/python2.7/dist-packages/requests/models.pyc, LINE: 781 <==> ISO-8859-1
¾©¶«(JD.COM)-×ÛºÏÍø¹ºÊ×Ñ¡-ÕýÆ·µÍ¼Û¡¢Æ·ÖÊ

# 这里出现了乱码

>>> dir(req)

['__attrs__', '__bool__', '__class__', '__delattr__', '__dict__', '__doc__', '__format__', '__getattribute__', '__getstate__', '__hash__', '__init__', '__iter__', '__module__', '__new__', '__nonzero__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_content', '_content_consumed', 'apparent_encoding', 'close', 'connection', 'content', 'cookies', 'elapsed', 'encoding', 'headers', 'history', 'is_redirect', 'iter_content', 'iter_lines', 'json', 'links', 'ok', 'raise_for_status', 'raw', 'reason', 'request', 'status_code', 'text', 'url']

req有content属性，还有text属性，我们看看content属性：

>>> print req.content[:100]戮漏露芦(JD.COM)-貨潞袓雼椦�-纸品碌图邸垄品讑

>>>

>>>

>>> print req.content.decode('gbk')[:100]
京东(JD.COM)-综合网购首选-正品低价、品质保障、配送及时、轻松购物！

## 由于该页面是gbk编码的，而Linux是utf-8编码，所以打印肯定是乱码，我们先进行解码，就能正确显示了。


鍙槸锛宼ext灞炴�э紝鎸夌収姝ょ鏂瑰紡锛屽苟涓嶅彲琛岋紒

>>> print req.text[:100]
FILE: /usr/lib/python2.7/dist-packages/requests/models.pyc, LINE: 770 <==> ISO-8859-1
FILE: /usr/lib/python2.7/dist-packages/requests/models.pyc, LINE: 781 <==> ISO-8859-1
¾©¶«(JD.COM)-×ÛºÏÍø¹ºÊ×Ñ¡-ÕýÆ·µÍ¼Û¡¢Æ·ÖÊ

>>> print req.text.decode('gbk')[:100]

FILE: /usr/lib/python2.7/dist-packages/requests/models.pyc, LINE: 770 <==> ISO-8859-1

FILE: /usr/lib/python2.7/dist-packages/requests/models.pyc, LINE: 781 <==> ISO-8859-1

Traceback (most recent call last):

  File "<stdin>", line 1, in <module>

UnicodeEncodeError: 'ascii' codec can't encode characters in position 60-63: ordinal not in range(128)

# 对text属性进行解码，就会出现错误。


璁╂垜浠潵鐪嬬湅锛岃繖涓や釜灞炴�х殑婧愮爜锛�

# /requests/models.py

@property
def content(self):
    """Content of the response, in bytes.

    While ``self._content`` is still ``False`` the body is read through
    ``self.iter_content`` and cached on ``self._content``; otherwise the
    cached value is returned as-is. ``self._content_consumed`` is set to
    ``True`` on every access.

    :raises RuntimeError: if nothing is cached yet and the content was
        already consumed.
    """
    if self._content is False:
        # Read the contents.
        try:
            if self._content_consumed:
                raise RuntimeError(
                    'The content for this response was already consumed')

            if self.status_code == 0:
                self._content = None
            else:
                self._content = bytes().join(self.iter_content(CONTENT_CHUNK_SIZE)) or bytes()

        except AttributeError:
            # A missing attribute on the response is treated as "no content".
            self._content = None

    self._content_consumed = True
    # don't need to release the connection; that's been handled by urllib3
    # since we exhausted the data.
    return self._content


# requests/models.py

@property
def text(self):
    """Content of the response, in unicode.

    If Response.encoding is None, encoding will be guessed using
    ``chardet``.

    The encoding of the response content is determined based solely on HTTP
    headers, following RFC 2616 to the letter. If you can take advantage of
    non-HTTP knowledge to make a better guess at the encoding, you should
    set ``r.encoding`` appropriately before accessing this property.

    Undecodable bytes are substituted (``errors='replace'``) rather than
    raising, so a wrong encoding yields garbled text instead of an error.
    """
    # NOTE(review): ``str`` is presumably requests' compat alias for the
    # unicode text type on Python 2 -- confirm against the module imports.

    # Try charset from content-type
    content = None
    encoding = self.encoding

    if not self.content:
        return str('')

    # Fallback to auto-detected encoding.
    if self.encoding is None:
        encoding = self.apparent_encoding

    # Decode unicode from given encoding.
    try:
        content = str(self.content, encoding, errors='replace')
    except (LookupError, TypeError):
        # A LookupError is raised if the encoding was not found which could
        # indicate a misspelling or similar mistake.
        #
        # A TypeError can be raised if encoding is None
        #
        # So we try blindly encoding.
        content = str(self.content, errors='replace')

    return content


看看注释和源码知道，content是urllib3读取回来的原始字节码，而text不过是尝试对content通过编码方式解码为unicode。jd.com 页面为gbk编码，问题就出在这里。

>>> req.apparent_encoding;req.encoding
'GB2312'
'ISO-8859-1'

这里的两种编码方式和页面编码方式不一致，而text却还尝试用错误的编码方式进行解码。肯定会出现问题！

我们来看看，req的两种编码方式是怎么获取的：

# requests/models.py

@property
def apparent_encoding(self):
    """The apparent encoding, provided by the chardet library.

    Returns the ``'encoding'`` entry of ``chardet.detect(self.content)``.
    """
    return chardet.detect(self.content)['encoding']


顺便说一下：chardet库检测编码不一定是完全对的，只有一定的可信度。比如jd.com页面，编码是gbk，但是检测出来却是GB2312，虽然这两种编码是兼容的，但是用GB2312去解码gbk编码的网页字节串是会有运行时错误的！

获取encoding的代码在这里：

# requests/adapters.py
def build_response(self, req, resp):
    """Builds a :class:`Response` object from a urllib3 response. This
    should not be called from user code, and is only exposed for use
    when subclassing the :class:`HTTPAdapter`

    :param req: The :class:`PreparedRequest` used to generate the response.
    :param resp: The urllib3 response object.
    """
    response = Response()

    # Fallback to None if there's no status_code, for whatever reason.
    response.status_code = getattr(resp, 'status', None)

    # Make headers case-insensitive.
    response.headers = CaseInsensitiveDict(getattr(resp, 'headers', {}))

    # Set encoding.
    response.encoding = get_encoding_from_headers(response.headers)

    # ....... (the rest of the function is elided in this excerpt)

通过get_encoding_from_headers(response.headers)函数获取编码，我们再来看看这个函数！

# requests/utils.py

def get_encoding_from_headers(headers):
    """Returns encodings from given HTTP Header Dict.

    :param headers: dictionary to extract encoding from.
    :return: the ``charset`` parameter of the ``content-type`` header with
        surrounding quote characters stripped; ``'ISO-8859-1'`` when no
        charset is given but the content type contains ``text``; ``None``
        when the header is missing/empty or neither rule applies.
    """
    content_type = headers.get('content-type')

    if not content_type:
        return None

    # NOTE: the ``cgi`` module is deprecated since Python 3.11 and removed
    # in Python 3.13 (PEP 594).
    content_type, params = cgi.parse_header(content_type)

    if 'charset' in params:
        return params['charset'].strip("'\"")

    # No explicit charset: text types default to ISO-8859-1.
    if 'text' in content_type:
        return 'ISO-8859-1'

    return None

发现了吗？程序只通过http响应首部获取编码，假如响应中，没有指定charset, 那么直接返回'ISO-8859-1'。

我们尝试进行抓包，看看http响应内容是什么：

[杞浇]Python HTTP搴搑equests涓枃椤甸潰涔辩爜瑙e喅鏂规锛�_第1张图片
[杞浇]Python HTTP搴搑equests涓枃椤甸潰涔辩爜瑙e喅鏂规锛�_第2张图片

鍙互鐪嬪埌锛宺eqponse header鍙寚瀹氫簡type锛屼絾鏄病鏈夋寚瀹氱紪鐮�(涓�鑸幇鍦ㄩ〉闈㈢紪鐮侀兘鐩存帴鍦╤tml椤甸潰涓�)銆傛墍鏈夎鍑芥暟灏辩洿鎺ヨ繑鍥�'ISO-8859-1'銆�

鍙兘澶у浼氶棶锛氫綔鑰呬负浠�涔堣榛樿杩欐牱澶勭悊鍛紵杩欐槸涓�涓猙ug鍚楋紵鍏跺疄锛屼綔鑰呮槸涓ユ牸http鍗忚鏍囧噯鍐欒繖涓簱鐨勶紝銆奌TTP鏉冨▉鎸囧崡銆嬮噷绗�16绔犲浗闄呭寲閲屾彁鍒帮紝濡傛灉HTTP鍝嶅簲涓瑿ontent-Type瀛楁娌℃湁鎸囧畾charset锛屽垯榛樿椤甸潰鏄�'ISO-8859-1'缂栫爜銆�杩欏鐞嗚嫳鏂囬〉闈㈠綋鐒舵病鏈夐棶棰橈紝浣嗘槸涓枃椤甸潰锛屽氨浼氭湁涔辩爜浜嗭紒

解决方案：

找到了问题所在，我们现在有两种方式解决该问题。

1. 淇敼get_encoding_from_headers鍑芥暟锛岄�氳繃姝e垯鍖归厤锛屾潵妫�娴嬮〉闈㈢紪鐮併�傜敱浜庣幇鍦ㄧ殑椤甸潰閮藉湪HTML浠g爜涓寚瀹氫簡charset锛屾墍浠ラ�氳繃姝e垯寮忓尮閰嶇殑缂栫爜鏂瑰紡鏄畬鍏ㄦ纭殑銆�

2. 由于content是HTTP响应的原始字节串，所以我们可以直接使用它。把content按照页面编码方式解码为unicode！

你可能感兴趣的:([转载]Python HTTP库requests中文页面乱码解决方案！)