2. Summary of the knowledge points learned
pip install requests
import requests
import json
r = requests.get("https://www.baidu.com/")
r.encoding = r.apparent_encoding # use the encoding detected from the page itself, to avoid garbled Chinese output
# r.encoding = 'gbk' # or force the output encoding to gbk
print(r.url) # print the requested URL
print(r.text) # print the response body, usually the page source, as a str
print(r.content) # print the response body as bytes (binary data) that still needs decoding; response.content.decode('utf8') is the recommended way to get the html page
# response.text returns a decoded (Unicode) string and usually has to be interpreted as utf-8, otherwise you get mojibake. response.content is the raw binary body, useful for downloading things like videos; to read it as text, decode it to utf-8. Either response.content.decode("utf-8") or response.encoding = "utf-8" avoids the mojibake problem.
print(r.cookies) # inspect the cookies
print(r.status_code) # print the status code of the GET request; 200 here means the request succeeded
print(r.headers['content-type']) # check the content type; here it is text/html, i.e. an HTML document
print(r.encoding) # check the encoding; here it is utf-8
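A minimal sketch of the two equivalent ways to avoid mojibake described in the comment above (assuming the page really is utf-8 encoded):
r = requests.get("https://www.baidu.com/")
html_from_bytes = r.content.decode('utf-8')  # decode the raw bytes yourself
r.encoding = 'utf-8'                         # or tell requests which charset to use before reading r.text
html_from_text = r.text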
# Output in JSON format
response = requests.get("http://httpbin.org/get")
print(type(response.text)) # <class 'str'>
print(response.json()) # equivalent to json.loads(response.text)
print(json.loads(response.text))
print(type(response.json())) # <class 'dict'>
With parameters:
import requests
payload = {'key1': 'value1', 'key2': 'value2'}
ret = requests.get("http://httpbin.org/get", params=payload)
print(ret.url) # prints: http://httpbin.org/get?key1=value1&key2=value2
# print(ret.text)
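If a value in the params dict is a list, requests repeats the key in the query string; a small sketch (payload2/ret2 are just illustrative names):
payload2 = {'key1': 'value1', 'key2': ['value2', 'value3']}
ret2 = requests.get("http://httpbin.org/get", params=payload2)
print(ret2.url)  # http://httpbin.org/get?key1=value1&key2=value2&key2=value3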
Adding request headers
Without headers:
import requests
url = 'https://www.zhihu.com/'
response = requests.get(url)
response.encoding = "utf-8"
print(response.text)
The result is a 400 error (in other words, you cannot even download the HTML of the Zhihu login page); the response body is just a bare "400 Bad Request" page served by openresty.
So press F12, look at the headers of the www.zhihu.com request under the Network tab, and copy its User-Agent into the request:
import requests
url = 'https://www.zhihu.com/'
headers = {
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'
}
response = requests.get(url,headers=headers)
print(response.text)
With a login POST like this, the browser dev tools show a Form Data section under the request's Headers; the corresponding requests call looks like the following (r1_cookie here is the cookie dict obtained from an earlier GET; the complete example appears further down):
import requests
response = requests.post(
url='https://dig.chouti.com/login',
data={
'phone': '8615911111111',
'password': '1314520',
'oneMonth': '1'
},
headers={
'user-agent': 'Mozilla/5.0 (iPad; CPU OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1'},
cookies=r1_cookie
)
print(response.text)
Example 2:
import requests
payload = {'key1': 'value1', 'key2': 'value2'}
r = requests.post("http://httpbin.org/post", data=payload)
print(r.text)
# The output is as follows:
{
"args": {},
"data": "",
"files": {},
"form": {
"key1": "value1",
"key2": "value2"
},
"headers": {
"Accept": "*/*",
"Accept-Encoding": "gzip, deflate",
"Content-Length": "23",
"Content-Type": "application/x-www-form-urlencoded",
"Host": "http://httpbin.org",
"User-Agent": "python-requests/2.9.1"
},
"json": null,
"url": "http://httpbin.org/post"
}
import json
import requests
url = 'http://httpbin.org/post'
payload = {'some': 'data'}
r = requests.post(url, data=json.dumps(payload))
print(r.text)
Output:
{
"args": {},
"data": "{\"some\": \"data\"}",
"files": {},
"form": {},
"headers": {
"Accept": "*/*",
"Accept-Encoding": "gzip, deflate",
"Content-Length": "16",
"Host": "http://httpbin.org",
"User-Agent": "python-requests/2.9.1"
},
"json": {
"some": "data"
},
"url": "http://httpbin.org/post"
}
import requests
url = 'http://httpbin.org/post'
files = {'file': open('test.txt', 'rb')}
r = requests.post(url, files=files)
print(r.text)
{
"args": {},
"data": "",
"files": {
"file": "Hello World!"
},
"form": {},
"headers": {
"Accept": "*/*",
"Accept-Encoding": "gzip, deflate",
"Content-Length": "156",
"Content-Type": "multipart/form-data; boundary=7d8eb5ff99a04c11bb3e862ce78d7000",
"Host": "http://httpbin.org",
"User-Agent": "python-requests/2.9.1"
},
"json": null,
"url": "http://httpbin.org/post"
}
With that, we have successfully uploaded a file.
requests can also stream the request body straight from a file object, so a large file does not have to be read into memory first:
with open('massive-body') as f:
    requests.post('http://some.url/streamed', data=f)
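A related sketch (not part of the original notes): if data is a generator, requests sends the body with chunked transfer encoding; the URL below is the same kind of placeholder as above:
def gen():
    yield b'chunk one '
    yield b'chunk two'

requests.post('http://some.url/streamed', data=gen())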
r = requests.get("https://www.baidu.com") # equivalent to requests.request(method='get', url='https://www.baidu.com')
requests.get(url, params=None, **kwargs)
requests.post(url, data=None, json=None, **kwargs)
requests.put(url, data=None, **kwargs)
requests.head(url, **kwargs)
requests.delete(url, **kwargs)
requests.patch(url, data=None, **kwargs)
requests.options(url, **kwargs)
# All of the methods above are built on top of requests.request()
# Example: logging in to chouti.com
import requests
from bs4 import BeautifulSoup
# First request: visit any page to get the initial cookies
r1 = requests.get(
url='https://dig.chouti.com/all/hot/recent/1',
headers={'user-agent':'Mozilla/5.0 (iPad; CPU OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1'}
)
# print(r1.text)
r1_cookie=r1.cookies.get_dict()
# Second request: log in carrying the first request's cookies so the backend authorizes them, then grab the cookies returned after login (the authorized gpsd value from them also works)
r2 = requests.post(
url='https://dig.chouti.com/login',
data={
'phone': '8615918732559',
'password': 'xxxxx',
'oneMonth': '1'
},
headers={
'user-agent': 'Mozilla/5.0 (iPad; CPU OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1'},
cookies=r1_cookie
)
r2_cookies = r2.cookies.get_dict()
# From now on, every request uses the authorized cookies
vote = requests.post(
url='https://dig.chouti.com/link/vote?linksId=%s' % id,
headers={
'user-agent': 'Mozilla/5.0 (iPad; CPU OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1'},
cookies=r2_cookies
)
With Session() handling session persistence:
import requests
session = requests.Session()
### 1. First visit any page to get a cookie
i1 = session.get(url="http://dig.chouti.com/help/service")
### 2. Log in; the previous cookie is carried along and the backend authorizes the gpsd value inside it
i2 = session.post(
url="http://dig.chouti.com/login",
data={
'phone': "8615131255089",
'password': "xxxxxx",
'oneMonth': ""
}
)
i3 = session.post(
url="http://dig.chouti.com/link/vote?linksId=8589623",
)
print(i3.text)
Some basic Session usage:
# use the session object to send a GET request that sets a cookie
session.get('http://httpbin.org/cookies/set/number/12456')
# send another GET request with the same session to read the cookies back
response = session.get('http://httpbin.org/cookies')
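To confirm that the session carried the cookie from the first request into the second, print the body (a sketch; the exact JSON formatting returned by httpbin may differ slightly):
print(response.text)  # expected to contain {"cookies": {"number": "12456"}}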
import requests
response = requests.get('https://www.12306.cn')
# when requesting an https URL, requests verifies the certificate and raises an exception if verification fails
print(response.status_code)
The result is an SSLError exception being raised:
import requests
# turn verification off; a certificate warning is still printed
response = requests.get('https://www.12306.cn',verify=False)
print(response.status_code)
It still prints an InsecureRequestWarning, so suppress the warning:
from requests.packages import urllib3
import requests
urllib3.disable_warnings()
response = requests.get('https://www.12306.cn',verify=False)
print(response.status_code)
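Instead of disabling verification, requests can also be pointed at a trusted CA bundle or a client certificate; a sketch with placeholder paths:
response = requests.get('https://www.12306.cn', verify='/path/to/ca_bundle.pem')  # custom CA bundle (placeholder path)
response = requests.get('https://www.12306.cn', cert=('/path/client.crt', '/path/client.key'))  # client certificate (placeholder paths)
print(response.status_code)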
import requests
proxies = {
"http": "http://127.0.0.1:9743",
"https": "https://127.0.0.1:9743",
}
response = requests.get("https://www.taobao.com", proxies=proxies)
print(response.status_code)
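If the proxy needs authentication, or if it is a SOCKS proxy (which requires pip install requests[socks]), the proxy URLs can be written like this; the hosts, ports and credentials below are placeholders:
proxies = {
    "http": "http://user:password@127.0.0.1:9743",
    "https": "socks5://127.0.0.1:9742",
}
response = requests.get("https://www.taobao.com", proxies=proxies)
print(response.status_code)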
import requests
from requests.auth import HTTPBasicAuth
# method 1
r = requests.get('http://120.27.34.24:9001', auth=HTTPBasicAuth('user', '123'))
# method 2
r = requests.get('http://120.27.34.24:9001', auth=('user', '123'))
print(r.status_code)
import requests
from requests.exceptions import ReadTimeout, ConnectionError, RequestException
try:
    response = requests.get("http://httpbin.org/get", timeout=0.5)
    print(response.status_code)
except ReadTimeout:
    print('Timeout')
except ConnectionError:
    print('Connection error')
except RequestException:
    print('Error')
The exception caught first is the ReadTimeout; if the network is disconnected, a ConnectionError is caught instead; and anything not caught by the earlier clauses can still be caught by RequestException.
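timeout can also be passed as a (connect, read) tuple to limit the two phases separately; a minimal sketch reusing the imports above:
try:
    response = requests.get("http://httpbin.org/get", timeout=(3, 0.5))
    print(response.status_code)
except ReadTimeout:
    print('Read timed out')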
import requests
response = requests.get('http://img.ivsky.com/img/tupian/pre/201708/30/kekeersitao-002.jpg')
b = response.content
with open('F://fengjing.jpg','wb') as f:
    f.write(b)
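For larger files, a streamed download keeps memory usage low by writing the body chunk by chunk; a sketch using the same URL and local path as above:
response = requests.get('http://img.ivsky.com/img/tupian/pre/201708/30/kekeersitao-002.jpg', stream=True)
with open('F://fengjing.jpg', 'wb') as f:
    for chunk in response.iter_content(chunk_size=8192):
        f.write(chunk)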
The status-code name aliases built into requests (the table behind requests.codes):
100: ('continue',),
101: ('switching_protocols',),
102: ('processing',),
103: ('checkpoint',),
122: ('uri_too_long', 'request_uri_too_long'),
200: ('ok', 'okay', 'all_ok', 'all_okay', 'all_good', '\\o/', '✓'),
201: ('created',),
202: ('accepted',),
203: ('non_authoritative_info', 'non_authoritative_information'),
204: ('no_content',),
205: ('reset_content', 'reset'),
206: ('partial_content', 'partial'),
207: ('multi_status', 'multiple_status', 'multi_stati', 'multiple_stati'),
208: ('already_reported',),
226: ('im_used',),
# Redirection.
300: ('multiple_choices',),
301: ('moved_permanently', 'moved', '\\o-'),
302: ('found',),
303: ('see_other', 'other'),
304: ('not_modified',),
305: ('use_proxy',),
306: ('switch_proxy',),
307: ('temporary_redirect', 'temporary_moved', 'temporary'),
308: ('permanent_redirect',
'resume_incomplete', 'resume',), # These 2 to be removed in 3.0
# Client Error.
400: ('bad_request', 'bad'),
401: ('unauthorized',),
402: ('payment_required', 'payment'),
403: ('forbidden',),
404: ('not_found', '-o-'),
405: ('method_not_allowed', 'not_allowed'),
406: ('not_acceptable',),
407: ('proxy_authentication_required', 'proxy_auth', 'proxy_authentication'),
408: ('request_timeout', 'timeout'),
409: ('conflict',),
410: ('gone',),
411: ('length_required',),
412: ('precondition_failed', 'precondition'),
413: ('request_entity_too_large',),
414: ('request_uri_too_large',),
415: ('unsupported_media_type', 'unsupported_media', 'media_type'),
416: ('requested_range_not_satisfiable', 'requested_range', 'range_not_satisfiable'),
417: ('expectation_failed',),
418: ('im_a_teapot', 'teapot', 'i_am_a_teapot'),
421: ('misdirected_request',),
422: ('unprocessable_entity', 'unprocessable'),
423: ('locked',),
424: ('failed_dependency', 'dependency'),
425: ('unordered_collection', 'unordered'),
426: ('upgrade_required', 'upgrade'),
428: ('precondition_required', 'precondition'),
429: ('too_many_requests', 'too_many'),
431: ('header_fields_too_large', 'fields_too_large'),
444: ('no_response', 'none'),
449: ('retry_with', 'retry'),
450: ('blocked_by_windows_parental_controls', 'parental_controls'),
451: ('unavailable_for_legal_reasons', 'legal_reasons'),
499: ('client_closed_request',),
# Server Error.
500: ('internal_server_error', 'server_error', '/o\\', '✗'),
501: ('not_implemented',),
502: ('bad_gateway',),
503: ('service_unavailable', 'unavailable'),
504: ('gateway_timeout',),
505: ('http_version_not_supported', 'http_version'),
506: ('variant_also_negotiates',),
507: ('insufficient_storage',),
509: ('bandwidth_limit_exceeded', 'bandwidth'),
510: ('not_extended',),
511: ('network_authentication_required', 'network_auth', 'network_authentication'),
Using the status codes
import requests
response = requests.get('http://www.jianshu.com/404.html')
# use the status name lookup built into requests to check the status code
# if the response status code is not the normal one, report a 404 error
if response.status_code != requests.codes.ok:
    print('404')
# if the page returns status code 200, print the message below
response = requests.get('http://www.jianshu.com')
if response.status_code == 200:
    print('200')
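Alternatively, response.raise_for_status() raises an HTTPError for any 4xx/5xx response, which avoids comparing codes by hand; a minimal sketch:
import requests
response = requests.get('http://www.jianshu.com/404.html')
try:
    response.raise_for_status()
except requests.exceptions.HTTPError as e:
    print('Bad status:', e)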
pip install beautifulsoup4
from bs4 import BeautifulSoup
html = '''
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
'''
soup = BeautifulSoup(html,'lxml') # instantiate a soup object; html is the page source downloaded earlier, and 'lxml' is the parser (it has to be installed; html.parser is the one built into Python)
print(soup.prettify()) # pretty-print the parsed document
print(soup.title) # get the first title tag: <title>The Dormouse's story</title>
print(soup.title.name) # get the name of the title tag, i.e. title
print(soup.title.string) # get the text of the title tag: The Dormouse's story
print(soup.title.parent.name) # get the name of the title tag's parent: head
print(soup.p) # get the first p tag: <p class="title" name="dromouse">...</p>
print(soup.p["class"]) # get the class attribute of the first p tag: ['title']
print(soup.a) # get the first a tag (the Elsie link)
print(soup.find_all('a')) # find all a tags, returned as a list
print(soup.find(id='link3')) # find the first tag whose id is 'link3'
soup.title.name # also gives the name of the title tag, i.e. title
Tag attributes: attrs
soup.p.attrs['name'] # get the name attribute of the first p tag, here 'dromouse'
soup.p.attrs['id'] # get the id attribute of the p tag (raises KeyError if the attribute is absent)
soup.p.attrs['class'] # get the class of the p tag
Tag text: string
soup.p.string # gets the text of the first p tag: The Dormouse's story
Selections can also be nested:
soup.body.p.string # same result as soup.p.string
Getting child nodes:
print(soup.p.contents) # all of the p node's direct children, returned as a list
Alternatively:
print(soup.p.children) # all of the p node's direct children as an iterator, which has to be consumed in a loop
For example:
for i, child in enumerate(soup.p.children):
    print(i, child)
To get every descendant (each child and grandchild tag is yielded as its own entry):
print(soup.p.descendants) # this is a generator object
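Since descendants is a generator, it is consumed the same way as children; a minimal sketch:
for i, child in enumerate(soup.p.descendants):
    print(i, child)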
Sibling nodes (parent and ancestor nodes are sketched right after):
soup.a.next_siblings # generator over the siblings that follow
soup.a.previous_siblings # generator over the siblings that precede
soup.a.next_sibling # the next sibling node
soup.a.previous_sibling # the previous sibling node
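Parent and ancestor nodes work the same way; a minimal sketch:
print(soup.a.parent)  # the direct parent tag of the first a tag
for i, parent in enumerate(soup.a.parents):  # every ancestor, up to the document itself
    print(i, parent.name)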
html='''
<div class="panel">
<div class="panel-heading">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1" name="elements">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</div>
'''
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
print(soup.find_all('ul')) # find all ul tags, returned as a list
print(type(soup.find_all('ul')[0])) # each element of that list is a bs4.element.Tag
We can also run find_all again on each result to get all the li tags inside:
for ul in soup.find_all('ul'):
    print(ul.find_all('li'))
# search by attributes with attrs
print(soup.find_all(attrs={'id': 'list-1'}))
print(soup.find_all(attrs={'name': 'elements'}))
attrs takes a dict of attribute/value pairs to match. class is a special case because it is a reserved word in Python: search for it either with the keyword argument class_, as in soup.find_all(class_='element'), or with a plain dict, as in soup.find_all('', {"class": "element"}). Ordinary attributes such as id do not need attrs and can be passed directly as keyword arguments.
print(soup.find_all(text='Foo'))
This returns all the text nodes whose content is 'Foo'.
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
print(soup.select('.panel .panel-heading')) # tags with class panel-heading inside a tag with class panel
print(soup.select('ul li')) # all li tags under ul tags
print(soup.select('#list-2 .element')) # tags with class element under the tag whose id is list-2
print(type(soup.select('ul')[0])) # each element of the result list is a bs4.element.Tag
# get the text
for li in soup.select('li'):
    print(li.get_text())
# get attributes
# an attribute can be read either with [attribute name] or with attrs[attribute name]
for ul in soup.select('ul'):
    print(ul['id'])
    print(ul.attrs['id'])
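select_one() is a shortcut that returns only the first match instead of a list; a minimal sketch:
print(soup.select_one('#list-2 .element'))   # the first element under #list-2
print(soup.select_one('ul li').get_text())   # text of the first li under a ul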
1. Logging in to chouti.com and upvoting articles
import requests
from bs4 import BeautifulSoup
# Fetch a page to get its source and cookies; note the headers, which make the request look more like a real browser
r1 = requests.get(
url='https://dig.chouti.com/all/hot/recent/1',
headers={'user-agent':'Mozilla/5.0 (iPad; CPU OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1'}
)
r1_cookie=r1.cookies.get_dict()
# Parse the source and collect each article's link id
soup = BeautifulSoup(r1.text,'html.parser')
div = soup.find('div', 'content-list')
item_list = div.find_all(attrs={'class': 'item'})
id_list = []
for item in item_list:
    tag = item.find(name='div', attrs={'class': 'part2'})
    # link_id = link_div.attrs['share-linkid']
    link_id = tag.get('share-linkid')
    id_list.append(link_id)
    # print(link_id)
# Log in to chouti carrying the headers and the earlier cookies, so the cookies get authorized
r2 = requests.post(
url='https://dig.chouti.com/login',
data={
'phone': '86159xxxxxxxx',
'password': 'xxxxxx',
'oneMonth': '1'
},
headers={
'user-agent': 'Mozilla/5.0 (iPad; CPU OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1'},
cookies=r1_cookie
)
r2_cookies = r2.cookies.get_dict()
# Upvote each article with the authorized cookies
for id in id_list:
    vote = requests.post(
        url='https://dig.chouti.com/link/vote?linksId=%s' % id,
        headers={
            'user-agent': 'Mozilla/5.0 (iPad; CPU OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1'},
        cookies=r2_cookies
    )
    vote_cancel = requests.post(
        url='https://dig.chouti.com/vote/cancel/vote.do',
        data={
            'linksId': '%s' % id
        },
        headers={
            'user-agent': 'Mozilla/5.0 (iPad; CPU OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1'},
        cookies=r1_cookie
    )
    print(vote_cancel.text)
2. Automatically logging in to GitHub and fetching profile information
import requests
from bs4 import BeautifulSoup
# Step 1: get the authenticity token
get_token_html = requests.get('https://github.com/login')
get_token_soup = BeautifulSoup(get_token_html.text, 'html.parser')
# The input tag has no text content, so do not use .text on it (that would be empty); read its value attribute instead
token = get_token_soup.find(name='input', attrs={'name': 'authenticity_token'}).get('value')
# Get the cookies; they are not authorized yet at this point
cookies_list=get_token_html.cookies.get_dict()
# print(token)
# print(ret1_cookies)
# Step 2: log in to GitHub carrying the cookies and token from step 1
sign_in_html = requests.post(
url='https://github.com/session',
data={
'utf8': '✓',
'authenticity_token': '%s' % token,
'login': 'gituser',
'password': 'gitpassword',
'commit': 'Sign in'
},
headers={'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1'},
cookies=cookies_list
)
# Get the cookies authorized after login
sign_in_cookies = sign_in_html.cookies.get_dict()
# print(sign_in_html.text)
# Get the username from the page returned after login
sign_in_soup = BeautifulSoup(sign_in_html.text, 'lxml')
username = sign_in_soup.find('strong', attrs={'class': 'css-truncate-target'}).text
print('Username from the profile info:', username)
# Step 3: open the profile page
profile_html = requests.get(
url='https://github.com/%s'%username,
headers={'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1'},
cookies=sign_in_cookies
)
profile_soup = BeautifulSoup(profile_html.text, 'lxml')
# Download the user's avatar
avatar_url = profile_soup.find('img', attrs={'class': 'avatar width-full rounded-2'}).get('src')
avatar_filename = '%s_avatar.jpg'%username
with open(avatar_filename, 'wb') as f:
    avatar_img = requests.get(avatar_url)
    f.write(avatar_img.content)
print('Avatar downloaded and saved locally!')
# Get the user's repository information
repositories_html = requests.get('https://github.com/%s?tab=repositories'%username)
# print(repositories_html.text)
repositories_soup = BeautifulSoup(repositories_html.text, 'lxml')
repositories_div = repositories_soup.find(name='div', id='user-repositories-list')
repositories_li_list = repositories_div.find_all('li')
for li in repositories_li_list:
    tag = li.find(name='a', attrs={'itemprop': 'name codeRepository'})
    repositories_name = tag.text.strip()
    repositories_href = 'https://github.com%s' % tag.get('href')
    print('Repository name: %s, repository URL: %s' % (repositories_name, repositories_href))