模拟登录古诗词网

主要思路:

  1. 对登录界面发送请求,网址:https://so.gushiwen.cn/user/login.aspx?from=http://so.gushiwen.cn/user/collect.aspx
  2. 由于该网址密码输错三次会有验证码,我们就需要第三方平台帮忙破解验证码。小编采用的是斐斐打码平台,当然平台打码是需要收费的,但是比较便宜,还有首充优惠。验证码的类型可以在平台上查看,不同类型的验证码收费不一样。(图:模拟登录古诗词网_第1张图片)
  3. 爬取验证码图片保存,运用打码平台获取验证码。
  4. 我们在登录时打开抓包工具,会发现一个网址:https://so.gushiwen.cn/user/login.aspx?from=http%3a%2f%2fso.gushiwen.cn%2fuser%2fcollect.aspx
  5. 该网址是在登录时向服务器发送的请求,我们发现里面携带了一些参数。(图:模拟登录古诗词网_第2张图片)
    我们发现后面五个参数分别是网址,邮箱/电话,密码,验证码,登录,但是前面两个参数(__VIEWSTATE 和 __VIEWSTATEGENERATOR)就很奇怪了。不过没关系,仔细找找就会发现,它们就在登录界面的网页源码里:https://so.gushiwen.cn/user/login.aspx?from=http://so.gushiwen.cn/user/collect.aspx
  6. 获取全部参数后,还不能马上发送请求,因为在向服务器发送的请求中有可能携带了cookie,所以还得创建一个session会话来保存cookie。
  7. 最后保存源码到本地,就实现了模拟登录。

附上源码

# coding=utf-8
import os, sys
import hashlib
import time
import json
import requests

# Base URL of the Fateadm recognition service; FateadmApi uses it as its
# initial host.
FATEA_PRED_URL = "http://pred.fateadm.com"


def LOG(log):
    """Write *log* to standard output.

    This is the single logging hook of the module: comment out the ``print``
    call below to silence all log output when it is not needed.
    """
    print(log)


class TmpObj:
    """Minimal mutable holder exposing a single ``value`` attribute."""

    def __init__(self):
        # Starts out empty; the owner assigns the real value later.
        self.value = None


class Rsp():
    """Parsed result of one API call.

    Attributes:
        ret_code: Integer taken from the ``RetCode`` field; stays ``-1`` until
            a response has been parsed.
        cust_val: Float taken from ``cust_val`` inside ``RspData`` (``0.0``
            when absent).
        err_msg: ``ErrMsg`` text of the response, or a local failure note.
        request_id: ``RequestId`` of the response; ``""`` until parsed.
        pred_rsp: ``TmpObj`` whose ``value`` receives ``result`` from
            ``RspData``.
    """

    def __init__(self):
        self.ret_code = -1
        self.cust_val = 0.0
        self.err_msg = "succ"
        # Initialised up front so that reading it never raises AttributeError
        # when no response could be parsed.
        self.request_id = ""
        self.pred_rsp = TmpObj()

    def ParseJsonRsp(self, rsp_data):
        """Fill this object from the JSON text of an HTTP response.

        Args:
            rsp_data: JSON document as text. ``None`` or an empty body is
                treated as a failed request: only ``err_msg`` is updated and
                ``ret_code`` keeps its previous value.

        Raises:
            ValueError: If a non-empty body (or its ``RspData``) is not valid
                JSON (raised by ``json.loads``).
            KeyError: If ``RetCode``, ``ErrMsg`` or ``RequestId`` is missing,
                or ``RspData`` is missing from a successful response.
        """
        if not rsp_data:
            self.err_msg = "http request failed, get rsp Nil data"
            return

        jrsp = json.loads(rsp_data)
        self.ret_code = int(jrsp["RetCode"])
        self.err_msg = jrsp["ErrMsg"]
        self.request_id = jrsp["RequestId"]
        if self.ret_code != 0:
            return

        # RspData is itself a JSON document encoded as a string.
        rslt_data = jrsp["RspData"]
        if rslt_data is None or rslt_data == "":
            return
        jrsp_ext = json.loads(rslt_data)
        if "cust_val" in jrsp_ext:
            self.cust_val = float(jrsp_ext["cust_val"])
        if "result" in jrsp_ext:
            self.pred_rsp.value = jrsp_ext["result"]


def CalcSign(pd_id, passwd, timestamp):
    """Return the request signature for an account.

    The signature is a two-stage MD5 hex digest:
    ``md5(pd_id + timestamp + md5(timestamp + passwd))``.

    Args:
        pd_id: Account id (str).
        passwd: Account key (str).
        timestamp: Timestamp string that is also sent with the request.

    Returns:
        The 32-character lowercase hexadecimal digest.
    """
    inner_digest = hashlib.md5((timestamp + passwd).encode()).hexdigest()
    outer_input = (pd_id + timestamp + inner_digest).encode()
    return hashlib.md5(outer_input).hexdigest()


def CalcCardSign(cardid, cardkey, timestamp, passwd):
    """Return the signature used when redeeming a recharge card.

    The signature is the MD5 hex digest of
    ``passwd + timestamp + cardid + cardkey``.

    Args:
        cardid: Recharge card number.
        cardkey: Recharge card key.
        timestamp: Timestamp string that is also sent with the request.
        passwd: Account key.

    Each argument may be ``str`` (encoded as UTF-8) or ``bytes``.

    Returns:
        The 32-character lowercase hexadecimal digest.
    """
    md5 = hashlib.md5()
    for part in (passwd, timestamp, cardid, cardkey):
        # hashlib only accepts bytes-like input on Python 3, so text must be
        # encoded first; bytes are passed through untouched.
        md5.update(part.encode() if isinstance(part, str) else part)
    return md5.hexdigest()


def HttpRequest(url, body_data, img_data="", timeout=30):
    """POST a request to the service and parse the JSON reply.

    Args:
        url: Full endpoint URL.
        body_data: Form fields to send with the request.
        img_data: Content of the ``img_data`` file part (image bytes for
            recognition requests; empty for every other call).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        An ``Rsp`` populated from the response body.

    Raises:
        requests.RequestException: On connection errors or when the timeout
            expires.
    """
    files = {
        'img_data': ('img_data', img_data)
    }
    headers = {
        'User-Agent': 'Mozilla/5.0',
    }
    http_rsp = requests.post(url, body_data, files=files, headers=headers,
                             timeout=timeout)
    rsp = Rsp()
    rsp.ParseJsonRsp(http_rsp.text)
    return rsp


class FateadmApi():
    """Client for the Fateadm captcha-recognition HTTP API.

    Every request is signed with the user (pd) credentials. Recognition and
    latency requests are additionally signed with the developer (app)
    credentials when an app id was supplied.

    Args:
        app_id: Developer account id; ``None`` is stored as ``""``, which
            disables the developer signature.
        app_key: Developer account key.
        pd_id: User account id.
        pd_key: User account key.
    """

    def __init__(self, app_id, app_key, pd_id, pd_key):
        self.app_id = "" if app_id is None else app_id
        self.app_key = app_key
        self.pd_id = pd_id
        self.pd_key = pd_key
        self.host = FATEA_PRED_URL

    def SetHost(self, url):
        """Replace the base URL that the endpoint paths are appended to."""
        self.host = url

    def _signed_params(self, extra=None, with_app=False):
        """Build the signed form fields shared by every request.

        Args:
            extra: Optional mapping of additional fields, inserted after the
                user signature.
            with_app: When true and an app id is configured, also append the
                developer id and signature.

        Returns:
            A new dict holding ``user_id``, ``timestamp``, ``sign`` and any
            extra fields. The same timestamp is used for every signature.
        """
        tm = str(int(time.time()))
        param = {
            "user_id": self.pd_id,
            "timestamp": tm,
            "sign": CalcSign(self.pd_id, self.pd_key, tm),
        }
        if extra:
            param.update(extra)
        if with_app and self.app_id != "":
            param["appid"] = self.app_id
            param["asign"] = CalcSign(self.app_id, self.app_key, tm)
        return param

    def QueryBalc(self):
        """Query the account balance.

        Returns:
            Rsp: ``ret_code`` is 0 on success and ``cust_val`` then holds the
            balance; otherwise ``err_msg`` carries the error details.
        """
        rsp = HttpRequest(self.host + "/api/custval", self._signed_params())
        if rsp.ret_code == 0:
            LOG("query succ ret: {} cust_val: {} rsp: {} pred: {}".format(
                rsp.ret_code, rsp.cust_val, rsp.err_msg, rsp.pred_rsp.value))
        else:
            LOG("query failed ret: {} err: {}".format(rsp.ret_code, rsp.err_msg))
        return rsp

    def QueryTTS(self, pred_type):
        """Query the network latency for a recognition type.

        Args:
            pred_type: Recognition type code.

        Returns:
            Rsp: ``ret_code`` is 0 on success; otherwise ``err_msg`` carries
            the error details.
        """
        param = self._signed_params({"predict_type": pred_type}, with_app=True)
        rsp = HttpRequest(self.host + "/api/qcrtt", param)
        if rsp.ret_code == 0:
            LOG("query rtt succ ret: {} request_id: {} err: {}".format(
                rsp.ret_code, rsp.request_id, rsp.err_msg))
        else:
            LOG("query rtt failed ret: {} err: {}".format(rsp.ret_code, rsp.err_msg))
        return rsp

    def Predict(self, pred_type, img_data, head_info=""):
        """Recognise a captcha image.

        Args:
            pred_type: Recognition type code.
            img_data: Raw bytes of the image.
            head_info: Optional extra information; only sent when it is
                neither ``None`` nor empty.

        Returns:
            Rsp: ``ret_code`` is 0 on success, ``request_id`` is the unique
            order number and ``pred_rsp.value`` the recognised text;
            otherwise ``err_msg`` carries the error details.
        """
        extra = {
            "predict_type": pred_type,
            "up_type": "mt",
        }
        if head_info is not None and head_info != "":
            extra["head_info"] = head_info
        param = self._signed_params(extra, with_app=True)
        rsp = HttpRequest(self.host + "/api/capreg", param, img_data)
        if rsp.ret_code == 0:
            LOG("predict succ ret: {} request_id: {} pred: {} err: {}".format(
                rsp.ret_code, rsp.request_id, rsp.pred_rsp.value, rsp.err_msg))
        else:
            LOG("predict failed ret: {} err: {}".format(rsp.ret_code, rsp.err_msg))
            if rsp.ret_code == 4003:
                # 4003: the account has run out of balance.
                LOG("cust_val <= 0 lack of money, please charge immediately")
        return rsp

    def PredictFromFile(self, pred_type, file_name, head_info=""):
        """Recognise the captcha stored in an image file.

        Args:
            pred_type: Recognition type code.
            file_name: Path of the image file, read in binary mode.
            head_info: Optional extra information, forwarded to ``Predict``.

        Returns:
            Rsp: Same as ``Predict``.
        """
        with open(file_name, "rb") as f:
            data = f.read()
        return self.Predict(pred_type, data, head_info=head_info)

    def Justice(self, request_id):
        """Request a refund for an order whose result was wrong.

        ``Predict`` only charges when its ``ret_code`` is 0, so only such
        orders need a refund. Refunds are meant for results that were
        recognised normally but rejected by the target site; abusing them may
        get the account banned.

        Args:
            request_id: Order number to refund.

        Returns:
            Rsp with ``ret_code`` 0 on success (``err_msg`` carries the error
            details otherwise), or ``None`` when ``request_id`` is empty and
            no request was sent.
        """
        if not request_id:
            return None
        param = self._signed_params({"request_id": request_id})
        rsp = HttpRequest(self.host + "/api/capjust", param)
        if rsp.ret_code == 0:
            LOG("justice succ ret: {} request_id: {} pred: {} err: {}".format(
                rsp.ret_code, rsp.request_id, rsp.pred_rsp.value, rsp.err_msg))
        else:
            LOG("justice failed ret: {} err: {}".format(rsp.ret_code, rsp.err_msg))
        return rsp

    def Charge(self, cardid, cardkey):
        """Top up the account with a recharge card.

        Args:
            cardid: Recharge card number.
            cardkey: Recharge card key; only used to compute the card
                signature, it is not sent itself.

        Returns:
            Rsp: ``ret_code`` is 0 on success; otherwise ``err_msg`` carries
            the error details.
        """
        param = self._signed_params()
        param['cardid'] = cardid
        # The card signature must use the same timestamp as the request.
        param['csign'] = CalcCardSign(cardid, cardkey, param["timestamp"], self.pd_key)
        rsp = HttpRequest(self.host + "/api/charge", param)
        if rsp.ret_code == 0:
            LOG("charge succ ret: {} request_id: {} pred: {} err: {}".format(
                rsp.ret_code, rsp.request_id, rsp.pred_rsp.value, rsp.err_msg))
        else:
            LOG("charge failed ret: {} err: {}".format(rsp.ret_code, rsp.err_msg))
        return rsp

    def ExtendCharge(self, cardid, cardkey):
        """Top up the account and return only the status code (0 on success)."""
        return self.Charge(cardid, cardkey).ret_code

    def JusticeExtend(self, request_id):
        """Request a refund and return only the status code.

        See ``Justice`` for when a refund is appropriate.

        Args:
            request_id: Order number to refund.

        Returns:
            0 on success; ``-1`` when ``request_id`` is empty and no request
            was sent; otherwise the ``ret_code`` of the response.
        """
        rsp = self.Justice(request_id)
        if rsp is None:
            # Justice sent nothing; report a generic failure instead of
            # raising AttributeError on None.
            return -1
        return rsp.ret_code

    def QueryBalcExtend(self):
        """Query the balance and return only ``cust_val``."""
        return self.QueryBalc().cust_val

    def PredictFromFileExtend(self, pred_type, file_name, head_info=""):
        """Recognise a captcha file and return only the recognised text.

        Args:
            pred_type: Recognition type code.
            file_name: Path of the image file.
            head_info: Optional extra information.

        Returns:
            ``pred_rsp.value`` of the response (``None`` when nothing was
            recognised).
        """
        return self.PredictFromFile(pred_type, file_name, head_info).pred_rsp.value

    def PredictExtend(self, pred_type, img_data, head_info=""):
        """Recognise captcha bytes and return only the recognised text.

        Args:
            pred_type: Recognition type code.
            img_data: Raw bytes of the image.
            head_info: Optional extra information.

        Returns:
            ``pred_rsp.value`` of the response (``None`` when nothing was
            recognised).
        """
        return self.Predict(pred_type, img_data, head_info).pred_rsp.value


def TestFunc(filname,pred_type):
    """Recognise the captcha image stored in a file and return its text.

    Args:
        filname: Path of the captcha image file.
        pred_type: Recognition type code. The available types are listed on
            the pricing page of the official site; ask customer service when
            unsure. Multi-site types need an additional ``src_url``
            parameter, see http://docs.fateadm.com/web/#/1?page_id=6

    Returns:
        The recognised text as returned by
        ``FateadmApi.PredictFromFileExtend``.

    When a recognised result is rejected by the target site, the order can be
    refunded with ``api.Justice(rsp.request_id)`` on the detailed response of
    ``PredictFromFile``; refunds must not be abused or the account may be
    banned.
    """
    # Placeholder credentials: the pd values are shown on the user centre
    # page, the app values (developer revenue-share account) in the developer
    # centre.
    account = dict(
        pd_id="用户账号",
        pd_key="用户密钥",
        app_id="开发者账号",
        app_key="开发者密钥",
    )
    client = FateadmApi(**account)
    # The balance is queried only for the log line it produces; the value
    # itself is not used.
    client.QueryBalcExtend()
    # Recognise straight from the file and hand back only the text.
    return client.PredictFromFileExtend(pred_type, filname)
if __name__ == "__main__":
    # Manual check: recognise the local sample image with type 30500 and
    # show the result.
    recognised = TestFunc('pag_Code.jpg','30500')
    print(recognised)

这是斐斐打码平台的源码,我封装成一个包了,在爬虫代码中引入就行了

# coding=utf-8

import requests
from lxml import etree
import fateadm_api

# Seconds to wait for each HTTP request; without a timeout requests may block
# indefinitely on a stalled connection.
REQUEST_TIMEOUT = 10

url = 'https://so.gushiwen.cn/user/login.aspx?from=http://so.gushiwen.cn/user/collect.aspx'
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36"
}

# One Session carries every request below. Requesting the captcha image makes
# the server hand out a cookie, and the login request has to present that same
# cookie, so the image must be fetched through the session as well. A Session
# sends requests the same way the top-level requests functions do, but it
# automatically stores any cookie it receives.
session = requests.Session()

# Step 1: load the login page and pull out the captcha URL and hidden fields.
response = session.get(url=url, headers=headers, timeout=REQUEST_TIMEOUT)
html = etree.HTML(response.text)
pag_src = 'https://so.gushiwen.cn' + html.xpath('//*[@id="imgCode"]/@src')[0]
VIEWSTATE = html.xpath('//*[@id="__VIEWSTATE"]/@value')[0]
VIEWSTATEGENERATOR = html.xpath('//*[@id="__VIEWSTATEGENERATOR"]/@value')[0]

# Step 2: download the captcha image with the same session and save it.
src_response = session.get(url=pag_src, headers=headers, timeout=REQUEST_TIMEOUT).content
with open('./pag_Code.jpg', 'wb') as f:
    f.write(src_response)
# Reported only after the file has been closed.
print("图片保存成功!!!")

# Step 3: let the captcha service recognise the saved image.
code_text = fateadm_api.TestFunc('pag_Code.jpg', pred_type='30500')
num = code_text

# Step 4: submit the login form.
# NOTE(review): the account credentials are hard-coded below; load them from
# the environment or a config file before sharing this script.
data = {
    '__VIEWSTATE': VIEWSTATE,
    '__VIEWSTATEGENERATOR': VIEWSTATEGENERATOR,
    'from': 'http://so.gushiwen.cn/user/collect.aspx',
    'email': '17784757553',
    'pwd': 'jianwei20001112',
    'code': num,
    'denglu': '登录'
}
login_url = 'https://so.gushiwen.cn/user/login.aspx?from=http%3a%2f%2fso.gushiwen.cn%2fuser%2fcollect.aspx'
login_text = session.post(url=login_url, headers=headers, data=data, timeout=REQUEST_TIMEOUT).text

# Step 5: keep the returned page for inspection.
with open('古诗词登录.html', 'w', encoding='utf-8') as f:
    f.write(login_text)
# NOTE(review): the response is not inspected, so this message is printed
# even when the site rejected the login; check the saved page to confirm.
print("登录成功!!")

如果对你有帮助,别忘了点赞加关注,谢谢!!!

你可能感兴趣的:(爬虫)