# 有些网站需要填写账户和密码,如果直接爬取网页信息,自然是进不去了,这里采用cookiejar工具来实现这个目的
import urllib.request, urllib.parse, urllib.error
import http.cookiejar
import ppretty
from bs4 import BeautifulSoup
if __name__ == '__main__':
    # Script entry point: log in with a form POST, persist the cookies the
    # server returns, then reuse the same cookie-aware opener to fetch a page
    # that requires an authenticated session.
    LOGIN_URL = 'https://passport.csdn.net/account/login'
    values = {'user': '**********', 'password': '**********'}
    # urlencode() yields a str; the POST body passed to Request must be bytes.
    postdata = urllib.parse.urlencode(values).encode()
    user_agent = r'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36'
    headers = {'User-Agent': user_agent, 'Connection': 'keep-alive'}

    # The cookie jar stores the cookies returned by the first HTTP request;
    # they are later written to cookie.txt.
    cookie_filename = 'cookie.txt'
    cookie = http.cookiejar.MozillaCookieJar(cookie_filename)
    handler = urllib.request.HTTPCookieProcessor(cookie)
    opener = urllib.request.build_opener(handler)

    request = urllib.request.Request(LOGIN_URL, postdata, headers)
    try:
        # The context manager closes the connection once the body is read.
        with opener.open(request) as response:
            # Decoded login page, kept only for inspection while debugging.
            page = response.read().decode()
    except urllib.error.HTTPError as e:
        # HTTPError is a subclass of URLError, so it must be caught first;
        # it is the only one of the two that carries an HTTP status code.
        print(e.code, ':', e.reason)
    except urllib.error.URLError as e:
        # A plain URLError (DNS failure, refused connection, ...) has no
        # `.code` attribute; reading it here would raise AttributeError.
        print(e.reason)

    # Save the cookies to cookie.txt, including session cookies
    # (ignore_discard) and already-expired ones (ignore_expires).
    cookie.save(ignore_discard=True, ignore_expires=True)
    print(cookie)
    for item in cookie:
        print('Name = ' + item.name)
        print('Value = ' + item.value)

    get_url = 'https://i.csdn.net/#/uc/profile'
    get_request = urllib.request.Request(get_url, headers=headers)
    # Reuse the opener so the cookies captured by the first request are sent
    # along with this second request.
    with opener.open(get_request) as get_response:
        # BeautifulSoup reads the file-like response while it is still open.
        soup = BeautifulSoup(get_response, 'lxml')
    print(soup.head)