爬取个人 人人主页 的代码:
首先用最原始的方法进行,可以看出这样写代码,比较麻烦,重复代码很多:
from urllib import request, parse
import json

url = 'http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=2018721913553'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
    # 'Cookie': 'xxx'
}
form = {
    'email': '1550000004',
    'icode': '',
    'origURL': 'http://www.renren.com/home',
    'domain': 'renren.com',
    'key_id': '1',
    'captcha_type': 'web_login',
    'password': '68b1b0e6c6909edxxxxxxxxxxxxxxxxc93122a14b2f7c68',
    'rkey': '15b278axxxxxxxxxx2df83c257aaa1a1',
    'f': 'http%3A%2F%2Fwww.renren.com%2F967453885',
}

# Renren requires a login, so the first request is a POST and must supply
# a request body through ``data``.
# -------------------- POST request: obtain the personal home page URL --------------------
# Convert the form: urlencode() gives a str, Request needs bytes.
form_str = parse.urlencode(form, encoding='utf-8')
form_bytes = form_str.encode('utf_8')
req = request.Request(url, data=form_bytes, headers=headers)
# Use the response as a context manager so the connection is closed even
# if read() raises.
with request.urlopen(req) as response:
    info = response.read()
post_info = info.decode('utf-8')  # {"code":true,"homeUrl":"http://www.renren.com/home"}

# post_info is a str, so homeUrl cannot be looked up directly; parse it
# into a dict with the json module first.
info_dict = json.loads(post_info)

# Read homeUrl from the dict, then issue a GET request for the page content.
# -------------------- GET request: fetch the page content --------------------
get_url = info_dict['homeUrl']

# Note: the Cookie below is the cookie value taken from the logged-in page
# http://www.renren.com/967453885.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
    'Cookie': 'xxxxxx',
}
req = request.Request(get_url, headers=headers)
with request.urlopen(req, timeout=3) as response:
    get_info = response.read()
get_info = get_info.decode('utf-8')
print(get_info)
使用下面这种方式把cookie保存起来:
# Keep cookies between requests
from http import cookiejar

# The CookieJar object is where the cookies are held.
cookie_object = cookiejar.CookieJar()
# A handler corresponds to one processing step; this one is bound to the
# jar created above.
handler = request.HTTPCookieProcessor(cookiejar=cookie_object)
# When the opener meets a response that carries cookies, it calls into the
# handler, which stores them in the cookie object.
opener = request.build_opener(handler)
下面对上面代码进行封装:
code_fengzhuang.py
from urllib import request, parse
from urllib.error import HTTPError, URLError
# 保存cookie
from http import cookiejar
class session:
    """Minimal HTTP session that remembers cookies between requests.

    Every request made through one instance goes through the same opener,
    whose cookie handler writes to ``cookie_jar``.

    Attributes:
        cookie_jar: the ``http.cookiejar.CookieJar`` backing this session.
            Kept on the instance so callers can inspect or persist cookies.
        opener: the ``urllib.request`` opener built around that jar.
    """

    def __init__(self):
        # Keep a reference to the jar instead of discarding it after
        # construction; otherwise stored cookies are hard to reach.
        self.cookie_jar = cookiejar.CookieJar()
        # A handler corresponds to one processing step.
        handler = request.HTTPCookieProcessor(self.cookie_jar)
        # When the opener meets a response carrying cookies it calls into
        # the handler, which stores them in the cookie jar.
        self.opener = request.build_opener(handler)

    def get(self, url, headers=None):
        """Call the module-level ``get`` with this session's opener."""
        return get(url, headers, self.opener)

    def post(self, url, form, headers=None):
        """Call the module-level ``post`` with this session's opener."""
        return post(url, form, headers, self.opener)
# a. get(url, headers=None)
def get(url, headers=None, opener=None):
    """Issue a GET request by calling ``urlrequests`` without a form.

    Args:
        url: address to fetch.
        headers: optional request headers, passed through unchanged.
        opener: optional opener, passed through unchanged.

    Returns:
        Whatever ``urlrequests`` returns.
    """
    # No form is supplied, which is what selects the GET path downstream.
    return urlrequests(url, None, headers, opener)
# b. post(url, form, headers=None)
def post(url, form, headers=None, opener=None):
    """Issue a POST request by calling ``urlrequests`` with a form.

    Args:
        url: address to send the form to.
        form: form fields, passed through unchanged.
        headers: optional request headers, passed through unchanged.
        opener: optional opener, passed through unchanged.

    Returns:
        Whatever ``urlrequests`` returns.
    """
    return urlrequests(url, form=form, headers=headers, opener=opener)
def urlrequests(url, form=None, headers=None, opener=None):
    """Send a GET or POST request and return the response body as bytes.

    Steps: take the url, pick the headers (a default User-Agent unless the
    caller supplies its own), build the Request, open it, return the bytes.

    Args:
        url: address to request.
        form: form fields to url-encode and send as the body. Any value
            other than ``None`` (including an empty dict) makes this a
            POST; ``None`` makes it a GET.
        headers: request headers. When given they replace the default
            headers entirely; when ``None`` only a User-Agent is sent.
        opener: object with an ``open(request)`` method (e.g. the opener
            held by ``session``). When falsy, ``urllib.request.urlopen``
            is used instead.

    Returns:
        The raw response body, or ``b''`` if the request failed with an
        ``HTTPError``/``URLError`` (the error is printed, not raised).
    """
    user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    # Caller-supplied headers override the default ones.
    if headers is None:
        headers = {
            'User-Agent': user_agent,
        }

    if form is not None:
        # POST: url-encode to str, then to bytes. Testing against None
        # (not truthiness) keeps an empty form a POST instead of silently
        # downgrading it to a GET.
        form_bytes = parse.urlencode(form, encoding='utf-8').encode('utf-8')
        req = request.Request(url, data=form_bytes, headers=headers)
    else:
        # GET
        req = request.Request(url, headers=headers)

    open_request = opener.open if opener else request.urlopen
    try:
        # Context manager guarantees the response is closed after reading.
        with open_request(req) as response:
            return response.read()
    except URLError as e:
        # HTTPError is a subclass of URLError, so this one clause covers
        # both. Best-effort behaviour: report and return an empty body.
        print(e)
        return b''
对上面代码进行调用:
from code_fengzhuang import session
import json
# url
url = 'http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=2018721514559'
# form
form = {
    'email': '15sss4',
    'autoLogin': 'true',
    'icode': '',
    'origURL': 'http://www.renren.com/home',
    'domain': 'renren.com',
    'key_id': '1',
    'captcha_type': 'web_login',
    'password': '09axxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx05938607a0',
    'rkey': 'a9c4ac1308aa52xxxxxxxxxxxxxxxxxxxxxxx77add011f71',
    'f': 'http%3A%2F%2Fwww.rxxxxxxxxxxxxxxxxxxxxx7453885',
}

# One session object is used for both requests so both go through the
# same opener.
s = session()
html_bytes = s.post(url, form)
print(html_bytes)

# session.post() returns b'' when the request failed (the error has already
# been printed). Stop here with a clear message instead of letting
# json.loads('') raise an unrelated JSONDecodeError.
if not html_bytes:
    raise SystemExit('login request failed: empty response')

# Parse the JSON response into a dict
res_dict = json.loads(html_bytes.decode('utf-8'))
print(res_dict)
home_url = res_dict['homeUrl']
# Visit the page
html_bytes = s.get(home_url)
print(html_bytes.decode('utf-8'))