Python中文乱码，是一个很大的坑，自己不知道在这里遇到多少问题了。还好通过自己不断的总结，现在遇到乱码的情况越来越少，就算出现，一般也能快速解决问题。这个问题，我七月就解决了，今天总结出来，和朋友一起分享。
鏈�杩戝啓杩囧ソ鍑犱釜鐖櫕锛岀啛鎮変簡涓婸ython
requests搴撶殑鐢ㄦ硶锛岃繖涓簱鐪熺殑Python鐨勫畼鏂筧pi鎺ュ彛濂界敤澶氫簡銆傜編涓笉瓒崇殑鏄細杩欎釜搴撳ソ鍍忓涓枃鐨勬敮鎸佷笉鏄緢鍙嬪ソ锛屾湁浜涢〉闈細鍑虹幇涔辩爜锛岀劧鍚庢崲鎴恥rllib鍚庯紝闂灏辨病鏈変簡銆傜敱浜巖equests搴撴渶缁堜娇鐢ㄧ殑鏄痷rllib3浣滀负搴曞眰浼犺緭閫傞厤鍣紝requests鍙槸鎶妘rllib3搴撹鍙栫殑鍘熷杩涜浜烘�у寲鐨勫鐞嗭紝鎵�浠ラ棶棰榬equests搴撴湰韬笂锛佷簬鏄喅瀹氶槄璇诲簱婧愮爜锛岃В鍐宠涓枃涔辩爜闂锛涗竴鏂归潰锛屼篃鏄笇鏈涘姞寮鸿嚜宸卞HTTP鍗忚銆丳ython鐨勭悊瑙c��
先是按照api接口，一行行阅读代码，尝试了解问题出在哪里！整个过程进展比较慢，我大概花了5天左右的时间，通读了该库的源代码。阅读代码过程中，有不懂的地方，就自己打印日志信息，以帮助理解。
最后我是这样发现问题所在的！
>>> req = requests.get('http://www.jd.com')
>>> req
>>> print req.text[:100]
FILE: /usr/lib/python2.7/dist-packages/requests/models.pyc, LINE: 770 <==> ISO-8859-1
FILE: /usr/lib/python2.7/dist-packages/requests/models.pyc, LINE: 781 <==> ISO-8859-1
¾©¶«(JD.COM)-×ÛºÏÍø¹ºÊ×Ñ¡-ÕýÆ·µÍ¼Û¡¢Æ·ÖÊ
# 这里出现了乱码
>>> dir(req)
['__attrs__', '__bool__', '__class__', '__delattr__', '__dict__', '__doc__', '__format__', '__getattribute__', '__getstate__', '__hash__', '__init__', '__iter__', '__module__', '__new__', '__nonzero__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_content', '_content_consumed', 'apparent_encoding', 'close', 'connection', 'content', 'cookies', 'elapsed', 'encoding', 'headers', 'history', 'is_redirect', 'iter_content', 'iter_lines', 'json', 'links', 'ok', 'raise_for_status', 'raw', 'reason', 'request', 'status_code', 'text', 'url']
req有content属性，还有text属性，我们看看content属性：
>>> print req.content[:100]戮漏露芦(JD.COM)-貨潞袓雼椦�-纸品碌图邸垄品讑
>>>
>>>
>>> print req.content.decode('gbk')[:100]
京东(JD.COM)-综合网购首选-正品低价、品质保障、配送及时、轻松购物！
## 鐢变簬璇ラ〉闈㈡椂gbk缂栫爜鐨勶紝鑰孡inux鏄痷tf-8缂栫爜锛屾墍浠ユ墦鍗拌偗瀹氭槸涔辩爜锛屾垜浠厛杩涜瑙g爜銆傚氨鑳芥纭樉绀轰簡銆�
可是，text属性，按照此种方式，并不可行！
>>> print req.text[:100]
FILE: /usr/lib/python2.7/dist-packages/requests/models.pyc, LINE: 770 <==> ISO-8859-1
FILE: /usr/lib/python2.7/dist-packages/requests/models.pyc, LINE: 781 <==> ISO-8859-1
¾©¶«(JD.COM)-×ÛºÏÍø¹ºÊ×Ñ¡-ÕýÆ·µÍ¼Û¡¢Æ·ÖÊ
>>> print req.text.decode('gbk')[:100]
FILE: /usr/lib/python2.7/dist-packages/requests/models.pyc, LINE: 770 <==> ISO-8859-1
FILE: /usr/lib/python2.7/dist-packages/requests/models.pyc, LINE: 781 <==> ISO-8859-1
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
UnicodeEncodeError: 'ascii' codec can't encode characters in position 60-63: ordinal not in range(128)
# 对text属性进行解码，就会出现错误。
让我们来看看，这两个属性的源码：
# /requests/models.py
@property
def content(self):
    """Content of the response, in bytes."""
    if self._content is False:
        # Read the contents.
        try:
            if self._content_consumed:
                raise RuntimeError(
                    'The content for this response was already consumed')
            if self.status_code == 0:
                self._content = None
            else:
                self._content = bytes().join(self.iter_content(CONTENT_CHUNK_SIZE)) or bytes()
        except AttributeError:
            self._content = None
    self._content_consumed = True
    # don't need to release the connection; that's been handled by urllib3
    # since we exhausted the data.
    return self._content
# requests/models.py
@property
def text(self):
    """Content of the response, in unicode.
    If Response.encoding is None, encoding will be guessed using
    ``chardet``.
    The encoding of the response content is determined based solely on HTTP
    headers, following RFC 2616 to the letter. If you can take advantage of
    non-HTTP knowledge to make a better guess at the encoding, you should
    set ``r.encoding`` appropriately before accessing this property.
    """
    # Try charset from content-type
    content = None
    encoding = self.encoding
    if not self.content:
        return str('')
    # Fallback to auto-detected encoding.
    if self.encoding is None:
        encoding = self.apparent_encoding
    # Decode unicode from given encoding.
    try:
        content = str(self.content, encoding, errors='replace')
    except (LookupError, TypeError):
        # A LookupError is raised if the encoding was not found which could
        # indicate a misspelling or similar mistake.
        #
        # A TypeError can be raised if encoding is None
        #
        # So we try blindly encoding.
        content = str(self.content, errors='replace')
    return content
看看注释和源码可知，content是urllib3读取回来的原始字节码，而text不过是尝试对content通过编码方式解码为unicode。jd.com 页面为gbk编码，问题就出在这里。
>>> req.apparent_encoding;req.encoding
'GB2312'
'ISO-8859-1'
杩欓噷鐨勪袱绉嶇紪鐮佹柟寮忓拰椤甸潰缂栫爜鏂瑰紡涓嶄竴鑷达紝鑰宑ontent鍗磋繕灏濊瘯鐢ㄩ敊璇殑缂栫爜鏂瑰紡杩涜瑙g爜銆傝偗瀹氫細鍑虹幇闂锛�
鎴戜滑鏉ョ湅鐪嬶紝req鐨勪袱绉嶇紪鐮佹柟寮忔槸鎬庝箞鑾峰彇鐨勶細
# requests/models.py
@property
def apparent_encoding(self):
    """The apparent encoding, provided by the chardet library"""
    return chardet.detect(self.content)['encoding']
顺便说一下：chardet库检测编码不一定是完全对的，只有一定的可信度。比如jd.com页面，编码是gbk，但是检测出来却是GB2312，虽然这两种编码是兼容的，但是用GB2312去解码gbk编码的网页字节串是会有运行时错误的！
获取encoding的代码在这里：
# requests/adapters.py
def build_response(self, req, resp):
    """Builds a :class:`Response` object from a urllib3 response. This
    should not be called from user code, and is only exposed for use when
    subclassing the :class:`HTTPAdapter`

    :param req: The :class:`PreparedRequest` used to generate the response.
    :param resp: The urllib3 response object.
    """
    response = Response()
    # Fallback to None if there's no status_code, for whatever reason.
    response.status_code = getattr(resp, 'status', None)
    # Make headers case-insensitive.
    response.headers = CaseInsensitiveDict(getattr(resp, 'headers', {}))
    # Set encoding.
    response.encoding = get_encoding_from_headers(response.headers)
    # .......
通过get_encoding_from_headers(response.headers)函数获取编码，我们再来看看这个函数！
# requests/utils.py
def get_encoding_from_headers(headers):
    """Returns encodings from given HTTP Header Dict.
    :param headers: dictionary to extract encoding from.
    """
    content_type = headers.get('content-type')
    if not content_type:
        return None
    content_type, params = cgi.parse_header(content_type)
    if 'charset' in params:
        return params['charset'].strip("'\"")
    if 'text' in content_type:
        return 'ISO-8859-1'
发现了吗？程序只通过http响应首部获取编码，假如响应中没有指定charset，那么直接返回'ISO-8859-1'。
我们尝试进行抓包，看看http响应内容是什么：
可以看到，response header只指定了type，但是没有指定编码(现在页面编码一般都直接在html页面中指定)。所以该函数就直接返回'ISO-8859-1'。
可能大家会问：作者为什么要默认这样处理呢？这是一个bug吗？其实，作者是严格按照http协议标准写这个库的，《HTTP权威指南》里第16章国际化里提到，如果HTTP响应中Content-Type字段没有指定charset，则默认页面是'ISO-8859-1'编码。这处理英文页面当然没有问题，但是中文页面，就会有乱码了！
解决方案：
鎵惧埌浜嗛棶棰樻墍鍦紝鎴戜滑鐜板湪鏈変袱绉嶆柟寮忚В鍐宠闂銆�
1. 修改get_encoding_from_headers函数，通过正则匹配，来检测页面编码。由于现在的页面都在HTML代码中指定了charset，所以通过正则式匹配的编码方式是完全正确的。
2. 由于content是HTTP响应的原始字节串，所以我们可以直接使用它。把content按照页面编码方式解码为unicode！