python爬虫之requests库实例代码

# 简单实例引入
# import requests
# response = requests.get('https://www.baidu.com')
# print(type(response))
# print(response.status_code)
# print(response.text)
# print(response.cookies)


# 请求方式
# import requests
# # requests.post('http://httpbin.org/post')
# response = requests.get('http://httpbin.org/get')
# # requests.put('http://httpbin.org/put')
# print(response.text)


# # 带参数的get请求
# import requests
# data = {
#     'name':'germey',
#     'age':22
# }
# response = requests.get('http://httpbin.org/get',params=data)
# print(response.text)

# 解析json
# import requests
# response = requests.get('http://httpbin.org/get')
# print(type(response.text))
# print(response.json())
# print(type(response.json()))


# 获取二进制数据
# import requests
# response = requests.get('https://github.com/favicon.ico')
# print(type(response.text),type(response.content),sep='\n')
# # print(response.text)
# # print(response.content)    #获取二进制内容
# with open('favicon.ico','wb') as f:
#     f.write(response.content)   # 保存照片


# 添加headers
# 未加任何东西
# import requests
# response = requests.get('https://www.zhihu.com/explore')
# print(response.text)

# 加上请求头的内容
# import requests
# headers = {
# 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36'
# }
# response = requests.get('https://www.zhihu.com/explore',headers=headers)
# print(response.text)


# 基本post请求
# import requests
# data = {
#     'name':'germey', 'age':22
# }
# header = {
# 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36'
# }
# response = requests.post("https://httpbin.org/post",data=data, headers=header)
# print(response.text)
# print(response.json())  #以json的形式打开
# 简单说一下json
# JSON是一种轻量级的数据交换格式。
# Python中包含了json模块来对JSON进行编解码。
# 主要应用的两个函数为:
# json.dumps(): 对数据进行编码。
# json.loads(): 对数据进行解码。
# 在json的编解码过程中,python的原始类型会与json类型进行相互转换。
#
# 如果你要处理的是文件而不是字符串,你可以使用 json.dump() 和 json.load() 来编码和解码JSON数据。


# response属性
# import requests
# response = requests.get('http://www.jianshu.com')
# print(type(response.status_code),response.status_code,sep='\n')
# print(type(response.headers),response.headers,sep='\n')
# print(type(response.cookies),response.cookies,sep='\n')
# print(response.url)
# print(response.history)



# 状态码判断
# 因查阅状态码表比较繁琐,所以本人更喜欢用数字来判断状态码
# import requests
# response = requests.get('http://www.jianshu.com')
# exit() if not response.status_code == 200 else print('request successful')


# 高级操作
# 文件上传
# import requests
# files = {'file':open('favicon.ico','rb')}
# response = requests.post('http://httpbin.org/post',files=files)
# print(response.text)



# 获取cookie
import requests
# response = requests.get('https://www.baidu.com')
# print(response.cookies)
# for key,value in response.cookies.items():
#     print(key + '=' + value)


# 会话维持:主要作用是用来模拟自动登录
# 简单模拟
# import requests
# requests.get('http://httpbin.org/cookies/set/number/123456789') # 设置cookies模拟登录
# response = requests.get('http://httpbin.org/cookies')
# print(response.text)  # cookies为空,说明模拟失败
# # 错误原因:发起的两次请求,请求相互独立,故无法实现模拟
# 正确模拟登录
# import requests
# s = requests.Session()
# s.get('http://httpbin.org/cookies/set/number/123456789')
# response = s.get('http://httpbin.org/cookies')
# print(response.text)
# #建议使用 Session 对象来进行模拟登录



# 证书验证
# import requests
# response = requests.get('https://www.12306.cn')
# print(response.status_code)
# #证书验证错误
# 解决方法:我们可以将get方法的参数verify设置为False,避开证书验证环节
# import requests
# response = requests.get('https://www.12306.cn', verify=False)
# print(response.status_code)
# 手动指定证书
# import requests
# response = requests.get('https://www.12306.cn', cert=('/path/server.crt','/path/key'))
# print(response.status_code)


# 代理设置
# import requests
# proxy = {
#     'http':'http://127.0.0.1:9743',
#     'https':'https://127.0.0.1:9743',
# }
# response = requests.get('https://www.taobao.com',proxies=proxy)
# print(response.status_code)



# 超时设置
# import requests
# from requests.exceptions import ReadTimeout
# try:
#     response = requests.get('https://taobao.com',timeout=0.05)
#     print(response.status_code)
# except ReadTimeout :
#     print('timeout')




# 认证设置
# 当遇到访问页面需要密码时,我们可以在get方法中的auth参数中传入元组类型的用户名密码
# import requests
# response = requests.get('http://120.27.34.24:9001',auth=('user', '123'))
# print(response.status_code)


# # 异常处理
# import requests
# from requests.exceptions import ReadTimeout,HTTPError,RequestException,ConnectionError
# try:
#     response = requests.get('http://httpbin.org/get',timeout=0.05)
#     print(response.status_code)
# except ReadTimeout:
#     print('timeout')
# except HTTPError:
#     print('httperror')
# except ConnectionError:
#     print('connect error')
# except RequestException:
#     print('error')

你可能感兴趣的:(python,python爬虫程序笔记)