1. Basic use of urllib
import urllib.request
url = 'http://www.baidu.com'
response = urllib.request.urlopen(url)
content = response.read().decode('utf-8')   # read() returns bytes; decode to get a str
print(content)
2. urllib: one type and six methods
import urllib.request
url = 'http://www.baidu.com'
response = urllib.request.urlopen(url)
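The "one type" is what urlopen returns, and the "six methods" are the ones most often called on it. A short continuation demonstrating them (the read-style calls share one stream, so read() is left as a comment):
# the one type: urlopen returns an http.client.HTTPResponse object
print(type(response))
# the six methods
print(response.getcode())      # HTTP status code, e.g. 200
print(response.geturl())       # the URL that was actually fetched
print(response.getheaders())   # response headers as (name, value) tuples
line = response.readline()     # one line of the body, as bytes
rest = response.readlines()    # the remaining lines, as a list of bytes
# response.read() would return the whole remaining body as bytes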
3. urllib: downloading files
import urllib.request
# a signed CDN URL: the auth_key makes it expire, so this exact link may no longer resolve
url_video = 'https://vd4.bdstatic.com/mda-kfsm974k6xwx9y8j/v1-cae/sc/mda-kfsm974k6xwx9y8j.mp4?v_from_s=hkapp-haokan-nanjing&auth_key=1652930638-0-0-dce2ef63e5e2c4d4cbafeea2dd68cff1&bcevod_channel=searchbox_feed&pd=1&cd=0&pt=3&logid=3238681392&vid=2292765971694816622&abtest=102148_2-17451_1&klogid=3238681392'
urllib.request.urlretrieve(url_video, 'news.mp4')
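urlretrieve downloads pages and images the same way, and accepts an optional progress callback as a third argument. A minimal sketch (http://www.baidu.com stands in for any downloadable URL):
import urllib.request
def report(block_count, block_size, total_size):
    # called repeatedly while the download runs
    if total_size > 0:
        percent = min(100, block_count * block_size * 100 / total_size)
        print('%.1f%%' % percent)
urllib.request.urlretrieve('http://www.baidu.com', 'baidu.html', report)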
4. urllib: customizing the Request object
import urllib.request
url = 'https://www.baidu.com'
# without a User-Agent header, many sites (Baidu's https endpoint included) return only a stub page
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36'
}
request = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')
print(content)
5. GET requests
(1) The quote method: percent-encode a single parameter
import urllib.request
import urllib.parse
url = 'https://www.baidu.com/s?wd='
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36'
}
# quote percent-encodes a single value so it can be appended to a URL
name = urllib.parse.quote('周杰伦')
url = url + name
request = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')
print(content)
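What quote actually does is percent-encode the UTF-8 bytes of the value; urllib.parse.unquote reverses it:
import urllib.parse
encoded = urllib.parse.quote('周杰伦')
print(encoded)                         # %E5%91%A8%E6%9D%B0%E4%BC%A6
print(urllib.parse.unquote(encoded))   # 周杰伦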
(2) The urlencode method: for multiple parameters
import urllib.request
import urllib.parse
base_url = 'https://www.baidu.com/s?'
data = {
    'wd': '周杰伦',
    'sex': '男',
    'location': '中国台湾省'
}
new_data = urllib.parse.urlencode(data)
url = base_url + new_data
print(url)
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36'
}
request = urllib.request.Request(url=url,headers=headers)
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')
print(content)
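urlencode simply quotes each key and value and joins the pairs with = and &; urllib.parse.parse_qs goes the other way:
import urllib.parse
query = urllib.parse.urlencode({'wd': '周杰伦', 'sex': '男'})
print(query)                          # wd=%E5%91%A8%E6%9D%B0%E4%BC%A6&sex=%E7%94%B7
print(urllib.parse.parse_qs(query))   # {'wd': ['周杰伦'], 'sex': ['男']}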
6. POST requests
(1) Querying Baidu Translate
import urllib.request
import urllib.parse
url = 'https://fanyi.baidu.com/sug'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36'
}
data = {
    'kw': 'spider',
}
data = urllib.parse.urlencode(data).encode('utf-8')
request = urllib.request.Request(url=url,data=data,headers=headers)
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')
print(content)
print(type(content))
import json
obj = json.loads(content)
print(obj)
"""
总结:
1、post请求方式的参数必须编码 data = urllib.parse.urlencode(data)
2、编码之后必须调用encode的方法data=urllib.parse.erlencode(data).encode('utf-8')
3、参数是放在请求对象定制的方法中 :request = urllib.request.Request(url=url,data=data,headers=headers)
"""
(2) Querying Baidu Translate in detail
import urllib.request
import urllib.parse
url = 'https://fanyi.baidu.com/v2transapi?from=en&to=zh'
headers = {
    # for this endpoint the Cookie header is the one that matters; it has to come from your own logged-in session
    'Cookie': 'BIDUPSID=9C33D9AF842C65882B22D090B46042A7; PSTM=1610940896; REALTIME_TRANS_SWITCH=1; FANYI_WORD_SWITCH=1; HISTORY_SWITCH=1; SOUND_SPD_SWITCH=1; SOUND_PREFER_SWITCH=1; __yjs_duid=1_0eb9831c9922fbeed75979b5e0cfd7231619776633784; BAIDUID=9139193BF8529188B5B5F4741D0FD50D:FG=1; APPGUIDE_10_0_2=1; BDSFRCVID_BFESS=tP8OJeC62mn-gd3DoJHSKm8DOg3F4AQTH6aoOpQpj2wA39Ni7uPrEG0PHU8g0KubT5mFogKKy2OTH9DF_2uxOjjg8UtVJeC6EG0Ptf8g0M5; H_BDCLCKID_SF_BFESS=tbuJ_KDyJKP3fP36qR6VMPIHqxby26nL3jn9aJ5nJDoCVnTojTJUXfk_jNoThlvMtDCj2qkaQpP-HJ7yM-5HbfPwbbbTJbcz3CrPKl0MLPOYbb0xynoDLT0B5xnMBMnGamOnanra3fAKftnOM46JehL3346-35543bRTLnLy5KJtMDFljTu2DjQyeU5eetjK2CntsJOOaCvVHlQOy4oWK441Dh7MQt6R36chWqvEfp-WDqvoD-Jc3M04X-o9-hvT-54e2p3FBUQJHC33Qft20b0m3gFq3q8La5bD-R7jWhvBhl72y5rUQlRX5q79atTMfNTJ-qcH0KQpsIJM5-DWbT8IjHCDt5FjtRIeV-35b5rfjJrT5-QSMICShUFsWlOCB2Q-XPoO3KJWeUokQMo-b4CUhH7RqPriWbRM2MbgylRpjM3K0462XjKBKPOpK-DfX2TxoUJ2XM0VMloMqtnWKqLebPRih6j9Qg-8KpQ7tt5W8ncFbT7l5hKpbt-q0x-jLTnhVn0MBCK0hC09j68MjjoM5h5K5-LXJR6BWb_8Kb7VbPbMeMnkbfJBDxcXe4bbt2nyaqnRWbCWsh7TDUDMLq_7yajK2MRjL6RBot3ptKTFjD3G0b5pQT8ryb_OK5OibCrn_R6Iab3vOPI4XpO1ef0zBN5thURB2DkO-4bCWJ5TMl5jDh3Mb6ksD-FtqtJHKbDt_II-JfK; BDUSS=NOUXhyMjAwblBqV2NpfkF0dU9Ca1Z3cGNhOTh0R0lBMHU1TWl-bjU0ck1aNUJpRVFBQUFBJCQAAAAAAAAAAAEAAAAe0iJm0ru49tXm1qqw9AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAMzaaGLM2mhiV3; BDUSS_BFESS=NOUXhyMjAwblBqV2NpfkF0dU9Ca1Z3cGNhOTh0R0lBMHU1TWl-bjU0ck1aNUJpRVFBQUFBJCQAAAAAAAAAAAEAAAAe0iJm0ru49tXm1qqw9AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAMzaaGLM2mhiV3; BAIDUID_BFESS=875209CD6EF1B6A11342D5227CD66391:FG=1; delPer=0; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; Hm_lvt_64ecd82404c51e03dc91cb9e8c025574=1650515716,1650872384,1651827471,1652925735; BDRCVFR[dG2JNJb_ajR]=mk3SLVN4HKm; BDRCVFR[-pGxjrCMryR]=mk3SLVN4HKm; BDRCVFR[tox4WRQ4-Km]=mk3SLVN4HKm; av1_switch_v3=0; PSINO=7; BA_HECTOR=a4210hal8la101ag821h8brmv0r; RT="z=1&dm=baidu.com&si=6304f9glu2&ss=l3cf5iav&sl=0&tt=0&bcn=https%3A%2F%2Ffclog.baidu.com%2Flog%2Fweirwood%3Ftype%3Dperf&ld=46b&ul=c3gs0&hd=c3gvq"; H_PS_PSSID=36425_36367_34812_35914_36166_34584_35978_36055_26350_36315_36447; Hm_lpvt_64ecd82404c51e03dc91cb9e8c025574=1652952606; ab_sr=1.0.1_ZWI2ZWRiZWI1Y2JiODRlYWRjZTlkNThjZjAyNDI5NDFhZDg1MGFmN2VkYzI1ZmVlMGIxYjVmZGNhOTEyOWU5ODJlMDI5ZDUxYjc2ODJhOGRlZWZmMzk1NmZjM2U2NDVjNWE3YWIwNjVkOTQ3Zjc2Mjk3Nzg0ZmQ3MmZmNmJlODkzMTQzMjE4OWZiMWExODJiMzZlOTA2NGJkMTkxZmNhZDZjNWQxMzgwZWE1YmM4ZjI5OGViNzM0Y2EyNmRjMjVl',
}
data = {
    'from': 'en',
    'to': 'zh',
    'query': 'spider',
    'transtype': 'realtime',
    'simple_means_flag': '3',
    # sign and token are generated per session by the page's JavaScript; they expire along with the cookie
    'sign': '63766.268839',
    'token': '92389bc1e4d32b64ec36f56fb41f03db',
    'domain': 'common'
}
data = urllib.parse.urlencode(data).encode('utf-8')
request = urllib.request.Request(url=url,data=data,headers=headers)
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')
import json
obj = json.loads(content)
print(obj)
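For a nested response like this one, json.dumps pretty-prints it, and ensure_ascii=False keeps the Chinese readable:
print(json.dumps(obj, ensure_ascii=False, indent=2))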
7. AJAX GET request: the first page of Douban movies
import urllib.request
url = 'https://movie.douban.com/j/new_search_subjects?sort=U&range=0,10&tags=&start=0&genres=%E5%96%9C%E5%89%A7'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36'
}
request = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')
with open('douban1.json', 'w', encoding='utf-8') as fp:
    fp.write(content)
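Assuming the endpoint still responds the way it did at the time, with a JSON object whose 'data' list holds movie dicts carrying 'title' and 'rate' fields, the content can be parsed directly:
import json
obj = json.loads(content)
for movie in obj.get('data', []):
    print(movie.get('title'), movie.get('rate'))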
8. AJAX GET request: the first 10 pages of Douban movies
import urllib.parse
import urllib.request
def create_request(page):
    base_url = 'https://movie.douban.com/j/new_search_subjects?sort=U&range=0,10&tags=&'
    data = {
        'start': (page - 1) * 20,   # each page holds 20 entries, so page n starts at (n-1)*20
        'limit': 20
    }
    data = urllib.parse.urlencode(data)
    url = base_url + data
    print(url)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36'
    }
    request = urllib.request.Request(url=url, headers=headers)
    return request

def get_content(request):
    response = urllib.request.urlopen(request)
    content = response.read().decode('utf-8')
    return content

def down_load(page, content):
    with open('douban_' + str(page) + '.json', 'w', encoding='utf-8') as fp:
        fp.write(content)

if __name__ == '__main__':
    start_page = int(input('Enter the start page: '))
    end_page = int(input('Enter the end page: '))
    for page in range(start_page, end_page + 1):
        request = create_request(page)
        content = get_content(request)
        down_load(page, content)
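One refinement worth adding to the loop: a short pause between pages so the crawler does not hammer the site. A sketch of the same loop with time.sleep:
import time
for page in range(start_page, end_page + 1):
    request = create_request(page)
    content = get_content(request)
    down_load(page, content)
    time.sleep(1)   # one-second pause between pages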
9. AJAX POST request: the KFC store locator
import urllib.request
import urllib.parse
def create_request(page):
    base_url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'
    data = {
        'cname': '',
        'pid': '',
        'keyword': '北京',
        'pageIndex': page,
        'pageSize': '10'
    }
    # POST body: urlencode, then encode to bytes
    data = urllib.parse.urlencode(data).encode('utf-8')
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36'
    }
    request = urllib.request.Request(url=base_url, data=data, headers=headers)
    return request

def get_content(request):
    response = urllib.request.urlopen(request)
    content = response.read().decode('utf-8')
    return content

def down_load(page, content):
    with open('kfc_' + str(page) + '.json', 'w', encoding='utf-8') as fp:
        fp.write(content)

if __name__ == '__main__':
    start_page = int(input('Enter the start page: '))
    end_page = int(input('Enter the end page: '))
    for page in range(start_page, end_page + 1):
        request = create_request(page)
        content = get_content(request)
        down_load(page, content)
10. urllib exceptions
import urllib.request
import urllib.error
url = 'http://www.goudan111.com'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36'
}
try:
    request = urllib.request.Request(url=url, headers=headers)
    response = urllib.request.urlopen(request)
    content = response.read().decode('utf-8')
    print(content)
except urllib.error.HTTPError:
    print('The system is being upgraded..')
except urllib.error.URLError:
    print('I already told you, the system is being upgraded...')
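HTTPError is a subclass of URLError, which is why it has to be caught first; it also carries the status code and reason, while URLError carries the underlying cause:
try:
    urllib.request.urlopen(url)
except urllib.error.HTTPError as e:
    print('HTTP error:', e.code, e.reason)   # e.g. 404 Not Found
except urllib.error.URLError as e:
    print('URL error:', e.reason)            # e.g. name resolution failure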
11. Basic use of handlers
import urllib.request
url = 'http://www.baidu.com'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36'
}
request = urllib.request.Request(url=url, headers=headers)
# handler -> build_opener -> open: the extensible counterpart of urlopen;
# proxy and cookie support are built from this same handler pattern
handler = urllib.request.HTTPHandler()
opener = urllib.request.build_opener(handler)
response = opener.open(request)
content = response.read().decode('utf-8')
print(content)
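An opener can also be installed globally, so that plain urllib.request.urlopen goes through it from then on:
urllib.request.install_opener(opener)
response = urllib.request.urlopen(request)   # now uses the custom opener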
12. Proxies
import urllib.request
url = 'https://www.baidu.com/s?ie=UTF-8&wd=ip'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36'
}
request = urllib.request.Request(url=url, headers=headers)
# ProxyHandler maps a URL scheme to a proxy address (see the note after this example)
proxies = {
    'http': '118.24.219.151:16817'
}
handler = urllib.request.ProxyHandler(proxies=proxies)
opener = urllib.request.build_opener(handler)
response = opener.open(request)
content = response.read().decode('utf-8')
with open('daili.html', 'w', encoding='utf-8') as fp:
    fp.write(content)
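Note that ProxyHandler applies an entry only to URLs of the matching scheme, so the 'http' key above does nothing for this https:// request. To proxy both schemes, list both (the address is the same placeholder):
proxies = {
    'http': '118.24.219.151:16817',
    'https': '118.24.219.151:16817'
}
handler = urllib.request.ProxyHandler(proxies=proxies)
opener = urllib.request.build_opener(handler)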
13. Proxy pool
import random
import urllib.request
# a real pool would hold different proxies; these identical entries are placeholders
proxies_pool = [
    {'http': '14.215.212.37:9168'},
    {'http': '14.215.212.37:9168'},
    {'http': '14.215.212.37:9168'}
]
proxies = random.choice(proxies_pool)
url = 'https://www.baidu.com/s?ie=UTF-8&wd=ip'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36'
}
request = urllib.request.Request(url=url,headers=headers)
handler = urllib.request.ProxyHandler(proxies=proxies)
opener = urllib.request.build_opener(handler)
response = opener.open(request)
content = response.read().decode('utf-8')
with open('dailichi.html', 'w', encoding='utf-8') as fp:
    fp.write(content)
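A pool pays off with a retry loop: pick a random proxy and fall back to another on failure. A minimal sketch over the same placeholder pool:
import random
import urllib.error
import urllib.request

def fetch_with_pool(request, proxies_pool, attempts=3):
    # try up to `attempts` random proxies before giving up
    for _ in range(attempts):
        handler = urllib.request.ProxyHandler(proxies=random.choice(proxies_pool))
        opener = urllib.request.build_opener(handler)
        try:
            return opener.open(request, timeout=10).read().decode('utf-8')
        except urllib.error.URLError:
            continue   # this proxy failed; try the next pick
    raise RuntimeError('all proxy attempts failed')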