网络爬虫笔记(Day2)

爬取个人人人网主页的代码:

首先用最原始的方法实现。可以看出,这样写代码比较麻烦,重复代码很多:

import json
from urllib import request, parse

# Login endpoint; the form below is sent to it as the body of a POST request.
url = 'http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=2018721913553'

# Only a browser-like User-Agent is sent with the login request
# (no Cookie header at this stage).
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
    }

# Form fields submitted with the login POST.
form = {
    'email': '1550000004',
    'icode': '',
    'origURL': 'http://www.renren.com/home',
    'domain': 'renren.com',
    'key_id': '1',
    'captcha_type': 'web_login',
    'password': '68b1b0e6c6909edxxxxxxxxxxxxxxxxc93122a14b2f7c68',
    'rkey': '15b278axxxxxxxxxx2df83c257aaa1a1',
    'f': 'http%3A%2F%2Fwww.renren.com%2F967453885',
    }

# Logging in requires a POST, so the form has to be supplied as request data.
'''由于人人需要登录,故为POST请求,需要额外的提供data'''
# -------------------- POST request: log in ---------------------------------
# url-encode the form into a str, then encode that str to bytes for the body.
form_str = parse.urlencode(form,encoding='utf-8')
form_bytes = form_str.encode('utf_8')


req = request.Request(url,data=form_bytes,headers=headers)

# The context manager closes the HTTP response once the body has been read.
with request.urlopen(req) as response:
    info = response.read()
# Example reply: {"code":true,"homeUrl":"http://www.renren.com/home"}
post_info = info.decode('utf-8')

# The reply is a JSON string, so parse it into a dict to be able to look up
# 'homeUrl'; that URL is then fetched with a GET request below.
info_dict = json.loads(post_info)

# -------------------- GET request: fetch the page --------------------------
get_url = info_dict['homeUrl']

# NOTE: the string below says the Cookie value must be the one from the
# logged-in page http://www.renren.com/967453885.
'''
注意:此时的cookie为登录进去的http://www.renren.com/967453885这个里面的967453885里面的cookie值
'''
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
    'Cookie': 'xxxxxx'  }
req = request.Request(get_url,headers=headers)
# Same as above: make sure the response is closed after reading.
with request.urlopen(req,timeout=3) as response:
    get_info = response.read()
get_info = get_info.decode('utf-8')
print(get_info)

使用下面这种方式把cookie保存起来:

# Keep cookies between requests by routing them through a cookie-aware opener.
from http import cookiejar

# The jar is the object that holds the cookies.
cookie_object = cookiejar.CookieJar()

# The handler is the processing step that is bound to that jar.
handler = request.HTTPCookieProcessor(cookiejar=cookie_object)

# The opener is built with the handler installed; when a response carries
# cookies, the handler stores them in cookie_object.
opener = request.build_opener(handler)

下面对上面代码进行封装:

code_fengzhuang.py
from urllib import request, parse
from urllib.error import HTTPError, URLError
# 保存cookie
from http import cookiejar


class session:
    """Small HTTP helper that sends every request through one cookie-aware opener.

    The opener is created once in ``__init__`` and handed to the module-level
    ``get`` / ``post`` helpers on each call.
    """

    def __init__(self):
        # The jar holds the cookies; the processor wraps it and is installed
        # into the opener stored on the instance.
        jar = cookiejar.CookieJar()
        self.opener = request.build_opener(request.HTTPCookieProcessor(jar))

    def get(self, url, headers=None):
        """Delegate to the module-level ``get`` using this session's opener."""
        return get(url, headers=headers, opener=self.opener)

    def post(self, url, form, headers=None):
        """Delegate to the module-level ``post`` using this session's opener."""
        return post(url, form, headers=headers, opener=self.opener)

# a. get(url, headers=None)

def get(url, headers=None, opener=None):
    """Request *url* without a form by delegating to ``urlrequests``.

    *headers* and *opener* are passed through unchanged; the return value is
    whatever ``urlrequests`` returns.
    """
    return urlrequests(url, form=None, headers=headers, opener=opener)

def post(url, form, headers=None, opener=None):
    """Request *url* with *form* by delegating to ``urlrequests``.

    *headers* and *opener* are passed through unchanged; the return value is
    whatever ``urlrequests`` returns.
    """
    return urlrequests(url, form=form, headers=headers, opener=opener)

#b. post(url, form, headers=None)

#1. 传入url
#2. user_agent
#3. headers
#4. 定义Request
#5. urlopen
#6. 返回byte数组
def urlrequests(url, form=None, headers=None, opener=None):
    """Send an HTTP request and return the raw response body.

    A POST is sent whenever *form* is given (an empty mapping included);
    with ``form=None`` the request is a GET.

    Args:
        url: Target URL.
        form: Form fields to url-encode (UTF-8) as the POST body, or None.
        headers: Request headers. When None, only a default browser
            User-Agent header is sent.
        opener: Object with an ``open(request)`` method, e.g. the result of
            ``urllib.request.build_opener``. When None,
            ``urllib.request.urlopen`` is used.

    Returns:
        The response body as bytes. If the request raises ``URLError``
        (``HTTPError`` is a subclass of it), the error is printed and
        ``b''`` is returned instead.
    """
    user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    # Caller-supplied headers replace the default ones entirely.
    if headers is None:
        headers = {
            'User-Agent': user_agent
        }

    # Test against None rather than truthiness so that an empty form is
    # still sent as a POST instead of silently becoming a GET.
    data = None
    if form is not None:
        data = parse.urlencode(form, encoding='utf-8').encode('utf-8')
    req = request.Request(url, data=data, headers=headers)

    send = request.urlopen if opener is None else opener.open
    try:
        # The context manager closes the response after the body is read.
        with send(req) as response:
            return response.read()
    except URLError as e:
        # Best-effort contract: report the failure and return an empty body.
        print(e)
    return b''



对上面代码进行调用:

from code_fengzhuang import session
import json
# Login endpoint.
url = 'http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=2018721514559'

# Form fields posted to the login endpoint.
form = {
    'email': '15sss4',
    'autoLogin': 'true',
    'icode': '',
    'origURL': 'http://www.renren.com/home',
    'domain': 'renren.com',
    'key_id': '1',
    'captcha_type': 'web_login',
    'password': '09axxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx05938607a0',
    'rkey': 'a9c4ac1308aa52xxxxxxxxxxxxxxxxxxxxxxx77add011f71',
    'f': 'http%3A%2F%2Fwww.rxxxxxxxxxxxxxxxxxxxxx7453885',
}

# Both requests below go through the same session object.
client = session()

# Step 1: POST the login form and show the raw reply bytes.
login_bytes = client.post(url, form)
print(login_bytes)

# Step 2: the reply is JSON; parse it into a dict and read the home page URL.
login_text = login_bytes.decode('utf-8')
login_info = json.loads(login_text)
print(login_info)
home_url = login_info['homeUrl']

# Step 3: fetch the home page with the same session and print it as text.
page_bytes = client.get(home_url)
page_text = page_bytes.decode('utf-8')
print(page_text)

 

你可能感兴趣的:(网络爬虫)