import urllib.request
import urllib.parse
# Example 1: GET request — fetch Baidu search results for a hard-coded keyword.
# NOTE: replace the User-Agent below with your own browser's value.
search_base = 'https://www.baidu.com/s?wd='
search_headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
}
# The keyword is non-ASCII, so it must be percent-encoded for the URL
# (urllib.parse.quote; urllib.parse.unquote reverses the encoding).
keyword = urllib.parse.quote('周杰伦')
search_url = search_base + keyword
# urlopen() cannot take a headers dict directly, so build a Request object.
search_request = urllib.request.Request(url=search_url, headers=search_headers)
search_response = urllib.request.urlopen(search_request)
# read() returns raw bytes; decode them into a str before printing.
search_body = search_response.read().decode('utf-8')
print(search_body)
import urllib.request
import urllib.parse
# Example 2: POST request — query Baidu Translate's suggestion endpoint.
# NOTE: replace the User-Agent below with your own browser's value.
sug_url = 'https://fanyi.baidu.com/sug'
sug_headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
}
# POST parameters must be url-encoded and then encoded to bytes.
form_data = urllib.parse.urlencode({'kw': 'spider'}).encode('utf-8')
sug_request = urllib.request.Request(url=sug_url, data=form_data, headers=sug_headers)
sug_response = urllib.request.urlopen(sug_request)
sug_body = sug_response.read().decode('utf-8')
import json
# The endpoint returns JSON text; parse it into a Python object.
sug_obj = json.loads(sug_body)
print(sug_obj)
import urllib.request
# Example 3: the most basic GET request — fetch the Baidu homepage.
# Define the url, i.e. the address to visit.
url = 'https://www.baidu.com'
# Disguise the crawler as a desktop (PC) browser.
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
}
# NOTE: replace the User-Agent above with your own browser's value.
# urlopen() cannot carry a headers dict directly, so a Request object is needed.
# Because of Request's parameter order, url and headers are passed by keyword.
request = urllib.request.Request(url=url,headers=headers)
# urlopen accepts either a plain URL string or a Request object.
response = urllib.request.urlopen(request)
# read() returns the body as raw bytes; decode bytes -> str.
# Fix: spell the codec 'utf-8' for consistency with the rest of the file
# ('utf8' is an alias with identical behavior).
content = response.read().decode('utf-8')
print(content)
import urllib.request
import urllib.parse
# Example 4: POST request to Baidu Translate's full translation API (zh -> en).
# NOTE(review): the 'sign' value and the Cookie appear to come from a recorded
# browser session — this request presumably only succeeds with a valid cookie.
trans_url = 'https://fanyi.baidu.com/v2transapi?from=zh&to=en'
trans_headers = {
'Cookie': 'BIDUPSID=C6CE5FF2DC19CD38B69AD13B21E47203; PSTM=1658718986; BDRCVFR[hiIP0dwB613]=mk3SLVN4HKm; BAIDUID=C6CE5FF2DC19CD38E2B3B384D4D04767:FG=1; BA_HECTOR=0h818h21018k00al80a4b2ig1hds2ob17; ZFY=JbPtF2sqFmd:AwWHsrXZJWqT8ApOxc5A1OWxeytmRT:BM:C; BDRCVFR[oxw6P5LCmIs]=mk3SLVN4HKm; delPer=0; PSINO=6; H_PS_PSSID=26350; BDORZ=FFFB88E999055A3F8A630C64834BD6D0; Hm_lvt_64ecd82404c51e03dc91cb9e8c025574=1658739277; APPGUIDE_10_0_2=1; REALTIME_TRANS_SWITCH=1; FANYI_WORD_SWITCH=1; HISTORY_SWITCH=1; SOUND_SPD_SWITCH=1; SOUND_PREFER_SWITCH=1; ab_sr=1.0.1_NzBlOTBjYzA2NzZiZmNmZDAzZjhiYTdkNDZiM2QyNWQ4N2JiM2JjNTRhZTJhYTQzNTM0YzJjODhlMDM2ZWIxNWZjNmJlYTc0YzM3MmU2ZWRiNWU5YmFkNDhiZTVlNGIyMzAxMDBmMTkzOTc3NTgwODRjNWYzNmU4ODUxYzliYTQxM2I0ZTJjOTI2ZjA3ZjQ1NzNlYTU5MzNjMDg2YWMyOA==; Hm_lpvt_64ecd82404c51e03dc91cb9e8c025574=1658739306'
}
payload = {
'from': 'zh',
'to': 'en',
'query': '中国',
'simple_means_flag': '3',
'sign': '777849.998728',
'domain': 'common',
}
# POST parameters must be url-encoded and then encoded to bytes.
encoded_payload = urllib.parse.urlencode(payload).encode('utf-8')
trans_request = urllib.request.Request(url=trans_url, data=encoded_payload, headers=trans_headers)
trans_response = urllib.request.urlopen(trans_request)
trans_body = trans_response.read().decode('utf-8')
print(trans_body)
import urllib.parse
import urllib.request
import os
# 1.请求对象的定制
def create_request(page):
    """Build the customized Request for one page of the Douban movie chart API.

    page: 1-based page number; each page covers 20 items, so the API's
    'start' offset is (page - 1) * 20.
    Returns a urllib.request.Request with a browser User-Agent attached.
    """
    base_url = 'https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&'
    # Encode the paging parameters and append them to the query string.
    query = urllib.parse.urlencode({
        'start': (page - 1) * 20,
        'limit': 20,
    })
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36 SLBrowser/8.0.0.7062 SLBChan/105'
    }
    return urllib.request.Request(url=base_url + query, headers=browser_headers)
# 2.获取响应的数据
def get_content(request):
    """Open *request* and return the response body decoded as UTF-8.

    Fix: the original never closed the response object (resource leak);
    a ``with`` block now guarantees it is closed even if decoding fails.
    """
    with urllib.request.urlopen(request) as response:
        return response.read().decode('utf-8')
# 3.下载数据
def downlode(page, content):
    """Save one page of chart data to douban_file/douban<page>.json.

    page: page number used in the output filename.
    content: the JSON text to write.

    Bug fix: the original checked/created '../douban_file' but then wrote
    into 'douban_file/', so the write raised FileNotFoundError whenever
    that directory did not exist.  Create the directory actually written
    to (exist_ok avoids the separate exists() check).
    """
    os.makedirs('douban_file', exist_ok=True)
    with open('douban_file/' + 'douban' + str(page) + '.json', 'w', encoding='utf-8') as fp:
        fp.write(content)
def run():
    """Prompt for a page range and download every page of chart data in it."""
    first_page = int(input('请输入起始的页码:'))
    last_page = int(input('请输入结束的页码:'))
    # Both endpoints are inclusive, hence the +1.
    for page in range(first_page, last_page + 1):
        page_request = create_request(page)
        page_content = get_content(page_request)
        downlode(page, page_content)


# Script entry point.
if __name__ == '__main__':
    run()
# The urllib basics are easy to grasp — study them carefully and they will click!
# This article may have some shortcomings; feel free to send feedback by direct
# message, and leave a comment if anything is unclear.