Web scraping: log in manually once, then crawl using the cookie information

1. First, scrape the Baidu Translate site with a crawler script, defining functions that wrap the two request styles, GET and POST.

from urllib import request, parse
from urllib.error import HTTPError, URLError


# GET request helper
def get(url, headers=None):
    return urlrequests(url, headers=headers)

# POST request helper
def post(url, form, headers=None):
    return urlrequests(url, form, headers=headers)


# Shared crawler wrapper used by both helpers
def urlrequests(url, form=None, headers=None):
    # Impersonate a regular browser via the User-Agent header
    user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'

    if headers is None:
        headers = {'User-Agent': user_agent}
    # Default return value: an empty bytes object
    html_bytes = b''
    try:
        if form:
            # POST request:
            # (1) serialize the form dict into a URL-encoded string
            form_str = parse.urlencode(form)
            # (2) encode the string to bytes, since data= must be a bytes object
            form_bytes = form_str.encode('utf-8')
            req = request.Request(url, data=form_bytes, headers=headers)
        else:
            # GET request
            req = request.Request(url, headers=headers)
        response = request.urlopen(req)
        # Read the response body
        html_bytes = response.read()
    except HTTPError as e:
        print(e)
    except URLError as e:
        print(e)

    return html_bytes


if __name__ == '__main__':
    url = 'http://fanyi.baidu.com/sug/'

    form = {'kw': '汽车'}
    html_bytes = post(url, form)
    print(html_bytes)

    # GET request: fetch the Baidu Translate landing page and save it
    url = 'http://fanyi.baidu.com/'
    html_bytes = get(url)
    with open('youdao.html', 'wb') as f:
        f.write(html_bytes)
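The /sug endpoint replies with JSON bytes rather than an HTML page, so the raw print above is hard to read. Below is a minimal sketch of decoding that reply; it assumes the response carries a 'data' list of {'k': term, 'v': translation} entries, field names observed from the endpoint rather than a documented API:

import json

html_bytes = post('http://fanyi.baidu.com/sug/', {'kw': '汽车'})
if html_bytes:
    # The endpoint returns UTF-8 encoded JSON; turn it into a dict
    result = json.loads(html_bytes.decode('utf-8'))
    # Each entry is assumed to pair the query term 'k' with a translation 'v'
    for entry in result.get('data', []):
        print(entry.get('k'), '->', entry.get('v'))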

2. Now build on the helpers wrapped above: log in to Renren through a cookie-aware opener, then reuse the session cookies.

# post and get are the helper functions wrapped in step 1
from tuozhanbao_all import post, get

from urllib import request, parse
from http import cookiejar
import json

# Instantiate a container that stores cookies
cookie_object = cookiejar.CookieJar()

# Instantiate a handler that can both read and write that cookie container
handler = request.HTTPCookieProcessor(cookie_object)

# Build an opener that attaches stored cookies to every request it sends
opener = request.build_opener(handler)

# Login URL captured from the browser console's Ajax (XHR) traffic
url = 'http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=2018722133420'

form = {
    'email': 'XXXXXXXXX',  # username, copied from the Form Data panel in the console
    'icode': '',
    'origURL': 'http://www.renren.com/home',
    'domain': 'renren.com',
    'key_id': '1',
    'captcha_type': 'web_login',
    'password': 'XXXXXXXX',  # password, obtained the same way
    'rkey': '8075568d7f5b3e190002eb191cad586f',
    'f': 'http%3A%2F%2Fwww.renren.com%2F338858221%2Fprofile',
}

form_bytes = parse.urlencode(form).encode('utf-8')

# A plain request.urlopen() would send the POST but would not capture the
# session cookies, which is why the opener built above is used instead:
# response = request.urlopen(url, form_bytes)
# html_bytes = response.read()
response = opener.open(url, form_bytes)
html_bytes = response.read()
print(type(html_bytes))
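# The CookieJar now holds whatever session cookies the login response set.
# A quick sketch for inspecting them (CookieJar is iterable, yielding
# Cookie objects with .name and .value attributes):
for cookie in cookie_object:
    print(cookie.name, cookie.value)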

# Parse the JSON reply into a dict
res_dict = json.loads(html_bytes.decode('utf-8'))
# print(res_dict)
#{'code': True, 'homeUrl': 'http://www.renren.com/home'}

home_url = res_dict['homeUrl']
print(home_url)
#http://www.renren.com/home

# The opener still carries the login cookies, so this fetch is authenticated
response = opener.open(home_url)
html_bytes = response.read()
# Save the page to a file
with open('renren.html', 'wb') as f:
    f.write(html_bytes)
# print(html_bytes.decode('utf-8'))
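To make the title's promise concrete (log in once, then reuse the cookies across runs), the in-memory CookieJar can be swapped for a file-backed one. A minimal sketch using http.cookiejar.MozillaCookieJar; the filename cookies.txt is an arbitrary choice:

from http import cookiejar
from urllib import request

# First run: log in with a file-backed jar, then persist the cookies
cookie_object = cookiejar.MozillaCookieJar('cookies.txt')
opener = request.build_opener(request.HTTPCookieProcessor(cookie_object))
# ... perform the login POST with this opener as shown above ...
cookie_object.save(ignore_discard=True, ignore_expires=True)

# Later runs: load the saved cookies and skip the login step entirely
cookie_object = cookiejar.MozillaCookieJar()
cookie_object.load('cookies.txt', ignore_discard=True, ignore_expires=True)
opener = request.build_opener(request.HTTPCookieProcessor(cookie_object))
response = opener.open('http://www.renren.com/home')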

 
