A simple paginated crawler with requests and urllib (including login authentication)

Introduction

Implements login authentication and cookie management, then fetches the crawled data page by page.
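
Both versions below talk to the same internal server (https://127.0.0.1:5667, accessed with certificate verification disabled, so presumably a self-signed certificate). /system/login expects a ucode form field holding base64("username|password") plus a rememberMe flag and answers with JSON whose code field is 1 on success; /system/gps/jzlist returns one page of device records as {"total": ..., "rows": [...]}. A minimal sketch of building the credential string on Python 3, using the same test account as the scripts (base64.b64encode adds no trailing newline, unlike the removed base64.encodestring):

import base64

ucode = base64.b64encode('user|user@123'.encode('utf8')).decode('utf8')
print(ucode)  # dXNlcnx1c2VyQDEyMw==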

The requests library

import requests
import sys, json, logging, base64, math

base_url = 'https://127.0.0.1:5667'
user_name = 'user'
user_pswd = 'user@123'
opener = requests.Session()
g_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 11.0; Win64; x64)',
}


def conf_log():
    '''
    Configure console logging.
    '''
    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setLevel(logging.DEBUG)
    console_handler.setFormatter(logging.Formatter(
        '[%(asctime)s %(levelname)s] %(message)s', datefmt='%Y-%m-%d %H:%M:%S'
    ))
    logging.basicConfig(handlers=[console_handler], level=logging.DEBUG)


def login():
    '''
    Log in and return True on success.
    '''
    url = base_url+'/system/login'
    # The credentials are lightly obfuscated with base64 so they are not sent
    # as literal plain text. base64.encodestring() has been removed from
    # Python 3; b64encode() adds no trailing newline, so no strip() is needed.
    ucode = base64.b64encode(
        (user_name + '|' + user_pswd).encode('utf8')).decode('utf8')
    data = {
        'ucode': ucode,
        'rememberMe': False
    }
    res = opener.post(url=url, data=data, headers=g_headers, verify=False)
    try:
        result = json.loads(res.text)
        if result['code'] == 1:
            return True
        else:
            return False
    except Exception as e:
        logging.error(e)
        return False


def logout():
    '''
    Log out of the current session.
    '''
    url = base_url+'/system/logout'
    try:
        opener.get(url=url, headers=g_headers, verify=False)
    except Exception as e:
        logging.error(e)


def load_jz_info(offset, limit, page):
    '''
    Fetch one page of device records and recurse until every page has been read.
    '''
    url = base_url + "/system/gps/jzlist"
    data = {
        "devId": "", "dept": "", "description": "", "state": "",
        "offset": offset, "limit": limit, "page": page, "order": "asc"
    }
    # Note: the body is sent as JSON via requests' json= parameter
    res = opener.post(url=url, json=data, headers=g_headers, verify=False)
    try:
        result = json.loads(res.text)
        total = result['total']
        for row in result['rows']:
            jz_info = {
                'devId': row['devId'],
                'description': row['description'],
                'longitude': row['longitude'],
                'latitude': row['latitude'],
                'dept': row['dept'],
                'devType': row['devType']
            }
            logging.debug(jz_info)
        if math.ceil(total / limit) > page:
            # recurse to fetch the remaining pages; the next offset is page * limit
            load_jz_info(page * limit, limit, page + 1)
    except Exception as e:
        logging.error(e)


if __name__ == '__main__':
    conf_log()
    if not login():
        logging.error('login failed')
        sys.exit()
    logging.debug('login ok')
    load_jz_info(0, 10, 1)
    logout()
    logging.debug('logged out')
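
Because every request above is sent with verify=False, urllib3 warns about the unverified HTTPS connection (InsecureRequestWarning). If the log output gets noisy, the warning can be silenced explicitly; this is an optional addition, not part of the original script, and would go right after the imports:

import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)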

The urllib library

from urllib import request, parse, error
from http import cookiejar  # cookie management module
import sys, json, ssl, logging, base64, math

cookie = cookiejar.CookieJar()  # holds the session cookie returned by the login request
cookie_handler = request.HTTPCookieProcessor(cookiejar=cookie)
context = ssl._create_unverified_context()  # skip certificate verification (self-signed cert)
https_handler = request.HTTPSHandler(context=context)
http_handler = request.HTTPHandler()

opener = request.build_opener(http_handler, https_handler, cookie_handler)
opener.addheaders = [
    ('User-Agent', 'Mozilla/5.0 (Windows NT 11.0; Win64; x64)')]

base_url = 'https://127.0.0.1:5667'
user_name = 'user'
user_pswd = 'user@123'


def conf_logging():
    '''
    Configure console logging.
    '''
    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setLevel(logging.DEBUG)
    console_handler.setFormatter(logging.Formatter(
        '[%(asctime)s %(levelname)s]%(message)s', datefmt='%Y-%m-%d %H:%M:%S'))
    logging.basicConfig(level=logging.DEBUG, handlers=[console_handler])


def login():
    '''
    Log in and return True on success.
    '''
    url = base_url+'/system/login'
    data = {
        # base64.encodestring() has been removed from Python 3; b64encode()
        # adds no trailing newline, so no strip() is needed.
        'ucode': base64.b64encode((user_name + '|' + user_pswd).encode('utf8')).decode('utf8'),
        'rememberMe': False
    }
    req = request.Request(url=url, data=parse.urlencode(data).encode('utf8'))
    res = opener.open(req)
    try:
        result = json.loads(res.read().decode('utf8'))
        if result['code'] == 1:
            return True
        else:
            return False
    except Exception as e:
        logging.error(e)
        return False


def logout():
    '''
    Log out of the current session.
    '''
    url = base_url+'/system/logout'
    try:
        req = request.Request(url=url)
        opener.open(req)
    except error.URLError as e:
        logging.error(e)


def load_jz_info(offset, limit, page):
    '''
    Fetch one page of device records and recurse until every page has been read.
    '''
    url = base_url + "/system/gps/jzlist"
    data = {
       "devId": "", "dept": "", "description": "", "state": "",
        "offset": offset, "limit": limit, "page": page, "order": "asc"
    }
    # POST the parameters as a JSON body (with urllib the Content-Type must be set by hand)
    headers = {
        'Content-Type': 'application/json'
    }

    req = request.Request(url=url, data=bytes(
        json.dumps(data), 'utf8'), headers=headers)
    res = opener.open(req)
    try:
        result = json.loads(res.read().decode('utf8'))
        total = result['total']
        for row in result['rows']:
            jz_info = {
                'devId': row['devId'],
                'description': row['description'],
                'longitude': row['longitude'],
                'latitude': row['latitude'],
                'dept': row['dept'],
                'devType': row['devType']
            }
            logging.debug(jz_info)
        if math.ceil(total / limit) > page:
            # recurse to fetch the remaining pages; the next offset is page * limit
            load_jz_info(page * limit, limit, page + 1)
    except Exception as e:
        logging.error(e)


if __name__ == '__main__':
    conf_logging()
    if not login():
        logging.error('login failed')
        sys.exit()
    logging.debug('login ok')
    load_jz_info(0, 10, 1)
    logout()
    logging.debug('logged out')
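
With the HTTPCookieProcessor installed on the opener, the session cookie returned by /system/login is kept in the CookieJar and sent automatically with every later opener.open() call, which is what keeps the paginated /system/gps/jzlist requests authenticated. For debugging, the jar can be inspected directly; a minimal sketch (the actual cookie names depend on the server):

for c in cookie:
    logging.debug('cookie %s=%s for domain %s', c.name, c.value, c.domain)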
    
