Use urllib to fetch the source code of the Baidu homepage
import urllib.request
# (1) Define a URL
url = 'http://www.baidu.com'
# (2) Simulate a browser sending a request to the server
response = urllib.request.urlopen(url)
# (3) Get the page source from the response
# content = response.read()  # read() returns the body as raw bytes; note it can only be called once per response
content = response.read().decode('utf-8')  # decode the bytes into a UTF-8 string
One type: the response in the code above is an HTTPResponse object.
Six methods:
content = response.read()        # read the whole body, byte by byte
content = response.read(5)       # read 5 bytes
content = response.readline()    # read one line
content = response.readlines()   # read all lines, returned as a list
response.getcode()               # get the HTTP status code
response.geturl()                # get the URL that was requested
response.getheaders()            # get the response headers
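A minimal sketch, reusing the Baidu URL from above, that exercises the one type and a few of the six methods in a single request:
import urllib.request
response = urllib.request.urlopen('http://www.baidu.com')
print(type(response))         # <class 'http.client.HTTPResponse'>
print(response.getcode())     # e.g. 200 on success
print(response.geturl())      # the URL that was actually fetched
print(response.getheaders())  # list of (name, value) header tuples
content = response.read().decode('utf-8')  # read() consumes the body, so call it last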
# Download a web page
url_page = 'http://www.baidu.com'
urllib.request.urlretrieve(url_page, 'baidu.html')
# Download an image
url_img = 'image URL'
urllib.request.urlretrieve(url_img, filename='image_name.jpg')
# Download a video
url_video = 'video URL'
urllib.request.urlretrieve(url_video, 'video_name.mp4')
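urlretrieve() also accepts an optional reporthook callback that is called as blocks arrive, which can be used for a rough progress display; a small sketch, reusing the placeholder page from above:
import urllib.request

def progress(block_num, block_size, total_size):
    # block_num: blocks transferred so far; block_size: size of each block; total_size: total file size in bytes
    if total_size > 0:
        print('{:.1%} downloaded'.format(min(block_num * block_size / total_size, 1.0)))

urllib.request.urlretrieve('http://www.baidu.com', 'baidu.html', reporthook=progress)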
Customizing a Request object (UA anti-scraping)
# Target: https://www.baidu.com/s?wd=周杰伦
headers = {'User-Agent': 'xxxxxxxxxxxxxxxxxxxxxxxxxxxx'}
# urlopen() cannot take a dict of headers, so they cannot be passed in directly
# Customize a Request object instead
request = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(request)
content = response.read().decode('utf8')
print(content)
# Chinese characters in a URL appear in percent-encoded form (UTF-8 bytes): %E5%91%A8%E6%9D%B0%E4%BC%A6
# url = 'https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=1&tn=baidu&wd=%E5%91%A8%E6%9D%B0%E4%BC%A6...'
# urllib.parse.quote() converts Chinese characters into this percent-encoded form (requires import urllib.parse)
url = 'https://www.baidu.com/s?wd='
name = urllib.parse.quote('周杰伦')
url = url + name
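A quick check of what quote() produces for the name above; the value matches the percent-encoded wd parameter captured from the browser:
import urllib.parse
print(urllib.parse.quote('周杰伦'))   # %E5%91%A8%E6%9D%B0%E4%BC%A6
print('https://www.baidu.com/s?wd=' + urllib.parse.quote('周杰伦'))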
Use case for urlencode(): multiple parameters in the query string
# https://www.baidu.com/s?wd=周杰伦&sex=男
data = {
    'wd': '周杰伦',
    'sex': '男',
    'location': '台湾省'
}
base_url = 'https://www.baidu.com/s?'
new_data = urllib.parse.urlencode(data)
url = base_url + new_data
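For reference, urlencode() percent-encodes each value and joins the key=value pairs with &, so new_data and the final url look roughly like this sketch:
import urllib.parse
data = {'wd': '周杰伦', 'sex': '男', 'location': '台湾省'}
new_data = urllib.parse.urlencode(data)
print(new_data)                                 # wd=%E5%91%A8%E6%9D%B0%E4%BC%A6&sex=%E7%94%B7&location=...
print('https://www.baidu.com/s?' + new_data)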
Example: Baidu Translate (a POST request)
url = 'https://fanyi.baidu.com/sug'
data = {
    'kw': 'spider'
}
# POST parameters must be URL-encoded and then encoded into bytes
data = urllib.parse.urlencode(data).encode('utf-8')
# POST parameters are not appended to the URL; they go into the data argument of the Request object
request = urllib.request.Request(url=url, data=data, headers=headers)
# Simulate a browser sending a request to the server
response = urllib.request.urlopen(request)
# Get the response data
content = response.read().decode('utf-8')
obj = json.loads(content)  # requires: import json
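The sug endpoint returns a JSON string, so json.loads() turns it into a Python object; to print any Chinese text readably instead of as \uXXXX escapes, json.dumps(..., ensure_ascii=False) helps. A small sketch:
import json
obj = json.loads(content)                              # parse the JSON string into a dict
print(json.dumps(obj, ensure_ascii=False, indent=2))   # pretty-print, keeping Chinese characters readable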
Example: the detailed-translation endpoint of Baidu Translate
url = 'https://fanyi.baidu.com/v2transapi?from=srp&to=zh'
# The decisive factor is the cookie: the anti-scraping measure here is requiring a valid Cookie header
headers = {
    'Accept': '*/*',
    # 'Accept-Encoding': 'gzip, deflate, br',  # must be commented out, otherwise the response body is compressed and cannot be decoded as UTF-8
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Acs-Token': 'xxxxxxxxxxxxxxxxxxx',
    'Connection': 'keep-alive',
    'Content-Length': '137',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Cookie': 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx',
    'Host': 'fanyi.baidu.com',
    'Origin': 'https://fanyi.baidu.com',
    'Referer': 'https://fanyi.baidu.com/',
    'sec-ch-ua': 'xxxxxxxxxxxxxxxxxxx',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-origin',
    'User-Agent': 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx',
    'X-Requested-With': 'XMLHttpRequest'
}
data = {
    'from': 'srp',
    'to': 'zh',
    'query': 'spider',
    'transtype': 'realtime',
    'simple_means_flag': '3',
    'sign': 'xxxxx.xxxxxx',
    'token': 'xxxxxxxxxxxxxx',
    'domain': 'common'
}
# POST parameters must be URL-encoded and then encoded into bytes
data = urllib.parse.urlencode(data).encode('utf-8')
# POST parameters are not appended to the URL; they go into the data argument of the Request object
request = urllib.request.Request(url=url, data=data, headers=headers)
# Simulate a browser sending a request to the server
response = urllib.request.urlopen(request)
# Get the response data
content = response.read().decode('utf-8')
obj = json.loads(content)
Top 20 of the Douban action-movie chart (first page, an AJAX GET request)
url = 'https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&start=0&limit=20'
headers = {
'User-Agent' : 'xxxxxxxxxxxxxxxxxxxx'
}
# (1) Customize the Request object
request = urllib.request.Request(url=url, headers=headers)
# (2) Get the response data
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')
# (3) Save the data locally
# open() uses the platform default encoding (GBK on Chinese Windows); to save Chinese text, pass encoding='utf-8'
with open('douban.json', 'w', encoding='utf-8') as fp:
    fp.write(content)
First few pages of the Douban action-movie chart (AJAX, paginated)
# Look for the pattern in the URLs
# 'https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&start=0&limit=20'
# 'https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&start=20&limit=20'
# 'https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&start=40&limit=20'
# ......
# page   1   2   3   4
# start  0  20  40  60
# start = (page - 1) * 20
def create_request(page):
    base_url = 'https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&'
    data = {
        'start': (page - 1) * 20,
        'limit': 20
    }
    data = urllib.parse.urlencode(data)
    url = base_url + data
    headers = {
        'User-Agent': 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
    }
    # Customize the Request object
    request = urllib.request.Request(url=url, headers=headers)
    return request

def get_content(request):
    # Get the response data
    response = urllib.request.urlopen(request)
    content = response.read().decode('utf-8')
    return content

def down_load(page, content):
    # Save the data locally
    with open('douban_' + str(page) + '.json', 'w', encoding='utf-8') as fp:
        fp.write(content)

if __name__ == '__main__':
    start_page = int(input('Enter the start page: '))
    end_page = int(input('Enter the end page: '))
    for page in range(start_page, end_page + 1):
        # Each page gets its own customized Request object
        request = create_request(page)
        content = get_content(request)
        down_load(page, content)
Query KFC restaurant information in Zhuhai
# Page 1
# http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword
# POST parameters:
# cname:
# pid:
# keyword: 珠海
# pageIndex: 1
# pageSize: 10
# Page 2
# http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword
# POST parameters:
# cname:
# pid:
# keyword: 珠海
# pageIndex: 2
# pageSize: 10
def create_request(page):
    base_url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'
    data = {
        'cname': '',
        'pid': '',
        'keyword': '珠海',
        'pageIndex': page,
        'pageSize': '10'
    }
    data = urllib.parse.urlencode(data).encode('utf-8')
    headers = {
        'User-Agent': 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
    }
    # Customize the Request object
    request = urllib.request.Request(url=base_url, headers=headers, data=data)
    return request

def get_content(request):
    # Get the response data
    response = urllib.request.urlopen(request)
    content = response.read().decode('utf-8')
    return content

def download(page, content):
    # Save the data locally
    with open('kfc_' + str(page) + '.json', 'w', encoding='utf-8') as fp:
        fp.write(content)

if __name__ == '__main__':
    start_page = int(input('Enter the start page: '))
    end_page = int(input('Enter the end page: '))
    for page in range(start_page, end_page + 1):
        # Each page gets its own customized Request object
        request = create_request(page)
        content = get_content(request)
        download(page, content)
Exception handling with URLError and HTTPError
# base_url = 'https://blog.csdn.net/qq_41684621/article/details/113851644'
# base_url = 'http://www.doudan1111.com'
headers = {
    'User-Agent': 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
}
try:
    request = urllib.request.Request(url=base_url, headers=headers)
    response = urllib.request.urlopen(request)
    content = response.read().decode('utf-8')
    print(content)
except urllib.error.HTTPError:
    print('The system is being upgraded...')
except urllib.error.URLError:
    print('Please check the URL...')
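Note that HTTPError is a subclass of URLError, so the HTTPError branch has to come first; the exception object also carries details worth printing. A small sketch, using the placeholder URL from the commented lines above:
import urllib.request
import urllib.error

try:
    response = urllib.request.urlopen('http://www.doudan1111.com')
except urllib.error.HTTPError as e:
    # the server responded, but with an error status (404, 500, ...)
    print('HTTP error:', e.code, e.reason)
except urllib.error.URLError as e:
    # the server could not be reached at all (bad domain, no network, ...)
    print('URL error:', e.reason)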
Use case: during data collection, bypass the login page and go straight to a target page (cookie login)
url = 'https://weibo.cn/6451491586/info'
headers = {
'User-Agent' : 'xxxxxxxxxxxxxxxxxxx'
}
request = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')
with open('weibo.html', 'w', encoding='utf-8') as fp:
    fp.write(content)
Running the code above raises an error:
Traceback (most recent call last):
File "F:\Python\spider\spider_test.py", line 17, in <module>
content = response.read().decode('utf-8')
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xb5 in position 672: invalid start byte
The reason is that although the target profile page itself is UTF-8 encoded, this request never reaches the profile page: it is redirected to the login page, which is not UTF-8 encoded. The code is changed accordingly:
url = 'https://weibo.cn/6451491586/info'
headers = {
'User-Agent' : 'xxxxxxxxxxxxxxxxxxx'
}
request = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(request)
content = response.read().decode('gb2312')
with open('weibo.html', 'w', encoding='gb2312') as fp:
    fp.write(content)
This runs, but the saved weibo.html still does not display the login page properly when opened.
Add the contents of the browser's Request Headers to the request. The Cookie carries the personal login information: with a cookie obtained after logging in, you can carry it into any page. The Referer tells the server whether the current request came from the previous path; it is usually used for image hotlink protection.
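A minimal sketch of carrying the Cookie and Referer along with the request; the cookie string and UA are placeholders copied from the browser's Request Headers, and the Referer value here is only an assumed example:
import urllib.request

url = 'https://weibo.cn/6451491586/info'
headers = {
    'User-Agent': 'xxxxxxxxxxxxxxxxxxx',
    'Cookie': 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx',  # copied from the browser after logging in; identifies the session
    'Referer': 'https://weibo.cn/',              # assumed previous page; some sites check this (hotlink protection)
}
request = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')  # with a valid login cookie, the UTF-8 profile page is returned
with open('weibo.html', 'w', encoding='utf-8') as fp:
    fp.write(content)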
urllib.request.urlopen(url): cannot customize request headers, so it cannot get around UA-based anti-scraping
urllib.request.Request(url, data=data, headers=headers): can customize request headers
Handler: customizes more advanced request behavior (as the business logic grows more complex, a customized Request object is no longer enough; dynamic cookies and proxies cannot be handled with Request customization alone -- see the cookie-handler sketch after the example below)
# Use a handler to access Baidu and get the page source
url = 'https://www.baidu.com'
headers = {
    'User-Agent': 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
}
request = urllib.request.Request(url=url, headers=headers)
# handler -> build_opener -> open
# Get a handler object
handler = urllib.request.HTTPHandler()
# Build an opener from the handler
opener = urllib.request.build_opener(handler)
# Call the opener's open() method
response = opener.open(request)
content = response.read().decode('utf-8')
print(content)
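For the dynamic-cookie case mentioned above, the standard library pairs a handler with http.cookiejar: an HTTPCookieProcessor stores cookies set by earlier responses and sends them back on later requests made through the same opener. A minimal sketch:
import urllib.request
import http.cookiejar

cookie_jar = http.cookiejar.CookieJar()                   # holds cookies set by the server
handler = urllib.request.HTTPCookieProcessor(cookie_jar)  # handler that reads/writes the jar
opener = urllib.request.build_opener(handler)

# the first request may receive Set-Cookie headers; the jar stores them
response = opener.open('https://www.baidu.com')
# later requests through the same opener automatically carry those cookies
response = opener.open('https://www.baidu.com')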
Common uses of a proxy (for scraping, mainly hiding the real client IP behind the proxy's IP).
Configuring a proxy:
# Using a proxy server
url = 'http://www.bing.com/search?q=ip'
headers = {
    'User-Agent': 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
}
# Customize the Request object
request = urllib.request.Request(url=url, headers=headers)
# Proxy IP (from Kuaidaili)
proxies = {
    'http': '61.216.185.88:60808'
}
# handler -> build_opener -> open
# Get a handler object
handler = urllib.request.ProxyHandler(proxies=proxies)
# Build an opener from the handler
opener = urllib.request.build_opener(handler)
# Call the opener's open() method
response = opener.open(request)
content = response.read().decode('utf-8')
with open('daili.html', 'w', encoding='utf-8') as fp:
    fp.write(content)
# Using a proxy pool
import random

url = 'http://www.bing.com/search?q=ip'
headers = {
    'User-Agent': 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
}
# Proxy pool (from Kuaidaili)
proxies_pool = [
    {'http': '61.216.185.88:60808'},
    {'http': '112.14.47.6:52024'},
    {'http': '121.13.252.62:41564'},
    {'http': '61.216.156.222:60808'}
]
proxies = random.choice(proxies_pool)  # pick a random proxy from the pool
# Customize the Request object
request = urllib.request.Request(url=url, headers=headers)
# handler -> build_opener -> open
# Get a handler object
handler = urllib.request.ProxyHandler(proxies=proxies)
# Build an opener from the handler
opener = urllib.request.build_opener(handler)
# Call the opener's open() method
response = opener.open(request)
content = response.read().decode('utf-8')
with open('daili.html', 'w', encoding='utf-8') as fp:
    fp.write(content)
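Free proxies fail often, so in practice the random choice is usually wrapped in a retry loop that falls back to another proxy from the pool. A sketch of that idea, reusing the placeholder pool above (fetch_via_pool is just an illustrative helper name):
import random
import socket
import urllib.request
import urllib.error

def fetch_via_pool(url, headers, proxies_pool, attempts=3):
    # try up to `attempts` proxies from the pool before giving up
    for _ in range(attempts):
        proxies = random.choice(proxies_pool)
        handler = urllib.request.ProxyHandler(proxies=proxies)
        opener = urllib.request.build_opener(handler)
        request = urllib.request.Request(url=url, headers=headers)
        try:
            response = opener.open(request, timeout=10)
            return response.read().decode('utf-8')
        except (urllib.error.URLError, socket.timeout):
            continue  # this proxy failed or timed out; try another one
    return None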