2019-12-29 Study Notes --- Using Cookies to Log Into a Website and Then Scrape Data

A cookie is where the site keeps your login state (the session tied to your account and password), so we can try using cookies to log into the target website and then crawl its pages. A cookie is usually valid for only about half an hour, while a URL may stay usable for several days. If you try to log in with the approach from the previous lessons, you will be blocked, so here we need to use cookiejar.


# Import cookiejar
import http.cookiejar
import urllib.parse
import urllib.request

# First build a cookie jar object
# (note: the names below are case-sensitive)
cookie = http.cookiejar.CookieJar()
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)

post_url = "http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=20191131847172"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"
}

# Since this is a POST request, the login form data has to be sent along with it
data = {
    "email": "[email protected]",
    "icode": "",
    "origURL": "http://www.renren.com/home",
    "domain": "renren.com",
    "key_id": "1",
    "captcha_type": "web_login",
    "password": "3ef61cfeffe709bcbc0afcce4e752af3a4190d57c1f4918f81c814a694c676e4",
    "rkey": "aed6ec6603b6e6c14f782872bcdf6cc3",
    "f": "http%3A%2F%2Fwww.renren.com%2F973165373%2Fnewsfeed%2Fphoto",
}
data = urllib.parse.urlencode(data).encode("utf-8")

p_req = urllib.request.Request(url=post_url, data=data, headers=headers)
# Opening the login request through the opener stores the login cookies in the jar
login_req = opener.open(p_req)

# Because the same opener (with its stored cookies) is reused, we can now request
# the profile page directly; no login form data is needed, only the URL and headers
owner_url = "http://www.renren.com/317141561/profile"
owner_req = urllib.request.Request(url=owner_url, headers=headers)
res = opener.open(owner_req)
content = res.read().decode("utf-8")

with open("rr20.html", "w", encoding="utf-8") as fp:
    fp.write(content)
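
As a side note (my own sketch, not part of the original walk-through): the CookieJar above lives only in memory and disappears when the script exits. http.cookiejar also provides MozillaCookieJar, which can save the captured cookies to a text file and load them back later, so you do not have to repeat the login on every run while the cookies are still valid. The file name "renren_cookie.txt" below is an assumed example.

# A minimal sketch of persisting cookies with MozillaCookieJar
# (the file name "renren_cookie.txt" is an assumed example)
import http.cookiejar
import urllib.request

cookie = http.cookiejar.MozillaCookieJar("renren_cookie.txt")
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)

# ... perform the login POST with this opener, as in the code above ...

# ignore_discard=True also keeps session cookies that would otherwise be dropped on save
cookie.save(ignore_discard=True, ignore_expires=True)

# In a later run, reload the saved cookies and reuse the opener without logging in again
cookie.load("renren_cookie.txt", ignore_discard=True, ignore_expires=True)

Of course, once the server-side session behind the cookie has expired, you still have to log in again; saving cookies only avoids repeating the login while they remain valid.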
