gecco的模拟登录(仅供学习)

用gecco实现的模拟登录,感觉还有优化的空间,希望有大佬能帮忙优化一下

  • 一、java获取cookie
  • 二、gecco保存cookie值
        • 第一个类 Login
        • 第二个类 Login_In

一、java获取cookie

啥也不说,看代码(这是爬取的一个小说网站的个人书架)

    /**
     * Posts the login form to {@code loginUrl} with Jsoup and hands back the
     * cookies found on the login response.
     *
     * @param loginUrl URL the login form is submitted to
     * @return the cookies of the login response
     * @throws IOException if executing the request fails
     */
    public Map<String, String> loginIn(String loginUrl) throws IOException {
        // Form fields submitted with the login request.
        Map<String, String> formFields = new HashMap<String, String>();
        formFields.put("LoginForm[username]", "你的账号");
        formFields.put("LoginForm[password]", "你的密码");
        formFields.put("usecookie", "1");
        formFields.put("chkLogin", "立即登录");
        formFields.put("action", "login");

        // Configure the request step by step instead of one long call chain.
        Connection connection = Jsoup.connect(loginUrl);
        connection.ignoreContentType(true);  // skip content-type validation
        connection.followRedirects(false);   // do not follow redirects
        connection.postDataCharset("utf-8");
        connection.header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3");
        connection.header("Content-Type", "application/x-www-form-urlencoded");
        connection.header("Origin", "https://www.kenshu.cc");
        connection.header("Host", "www.kenshu.cc");
        connection.header("Referer", "https://www.kenshu.cc/login.php");
        connection.header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.75 Safari/537.36");
        connection.data(formFields);
        connection.method(Connection.Method.POST);

        Connection.Response response = connection.execute();
        response.charset("UTF-8");

        // The response of the login request carries the cookies the caller needs.
        return response.cookies();
    }

以上有点Java基础都能看懂,所以就不讲解了

二、gecco保存cookie值

这一块我觉得做得还不够好,代码还需要大佬帮忙简化
上代码

第一个类 Login

package cn.demo;

import com.geccocrawler.gecco.GeccoEngine;
import com.geccocrawler.gecco.annotation.Gecco;
import com.geccocrawler.gecco.annotation.Request;
import com.geccocrawler.gecco.request.HttpRequest;
import com.geccocrawler.gecco.spider.HtmlBean;
import org.jsoup.Connection;
import org.jsoup.Jsoup;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

/**
 * Gecco bean matched against the login URL; matched pages are handed to the
 * {@code login_in} pipeline.
 *
 * <p>Besides holding the current request, the bean offers {@link #loginIn(String)},
 * which performs the actual form login with Jsoup and returns the resulting
 * cookies.
 */
@Gecco(matchUrl = "https://www.kenshu.cc/login.php?do=submit", pipelines = "login_in")
public class Login implements HtmlBean {
    /** Cookies associated with this bean; not populated by any code in this class. */
    private Map<String, String> cookie;

    /** The request that produced this bean (annotated for injection by Gecco). */
    @Request
    private HttpRequest request;

    public HttpRequest getRequest() {
        return request;
    }

    public void setRequest(HttpRequest request) {
        this.request = request;
    }

    /**
     * @return the stored cookie map, or {@code null} if none has been set
     */
    public Map<String, String> getCookie() {
        return cookie;
    }

    /**
     * @param cookie cookie name/value pairs to store on this bean
     */
    public void setCookie(Map<String, String> cookie) {
        this.cookie = cookie;
    }

    /**
     * Posts the login form to {@code loginUrl} with Jsoup and returns the
     * cookies found on the login response.
     *
     * @param loginUrl URL the login form is submitted to
     * @return the cookies of the login response
     * @throws IOException if executing the request fails
     */
    public Map<String, String> loginIn(String loginUrl) throws IOException {

        // Build the login form parameters.
        Map<String, String> data = new HashMap<String, String>();
        data.put("LoginForm[username]", "你的账号");
        data.put("LoginForm[password]", "你的密码");
        data.put("usecookie", "1");
        data.put("chkLogin", "立即登录");
        data.put("action", "login");
        Connection.Response login = Jsoup.connect(loginUrl)
                .ignoreContentType(true) // skip content-type validation
                .followRedirects(false) // do not follow redirects
                .postDataCharset("utf-8")
                .header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3")
                .header("Content-Type", "application/x-www-form-urlencoded")
                .header("Origin", "https://www.kenshu.cc")
                .header("Host", "www.kenshu.cc")
                .header("Referer", "https://www.kenshu.cc/login.php")
                .header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.75 Safari/537.36")
                .data(data)
                .method(Connection.Method.POST)
                .execute();
        login.charset("UTF-8");
        // The login response carries the cookies the caller needs.
        return login.cookies();
    }

    public static void main(String[] args) {

        GeccoEngine.create()
                // package Gecco scans for annotated classes
                .classpath("cn.demo")
                // URL the crawl starts from
                .start("https://www.kenshu.cc/login.php?do=submit")
                // number of crawler threads
                .thread(1)
                // pause a single crawler takes after finishing each request
                .interval(2000)
                .start();
    }
}

第二个类 Login_In

package cn.demo;

import com.geccocrawler.gecco.annotation.PipelineName;
import com.geccocrawler.gecco.pipeline.Pipeline;
import com.geccocrawler.gecco.request.HttpRequest;
import com.geccocrawler.gecco.scheduler.SchedulerContext;

import java.io.IOException;
import java.util.Map;

/**
 * Pipeline for {@link Login} beans: logs in, copies the login cookies onto the
 * current request and schedules the bookcase page as a sub-request.
 */
@PipelineName("login_in")
public class Login_In implements Pipeline<Login> {
    private static final String LOGIN_URL = "https://www.kenshu.cc/login.php?do=submit";
    private static final String BOOKCASE_URL = "https://www.kenshu.cc/modules/article/bookcase.php";

    /**
     * Performs the login and, on success, schedules the bookcase request with
     * the login cookies attached. If the login request fails, the failure is
     * reported and nothing is scheduled.
     *
     * @param login bean providing the current request and the login call
     */
    @Override
    public void process(Login login) {
        Map<String, String> cookies;
        try {
            cookies = login.loginIn(LOGIN_URL);
        } catch (IOException e) {
            // No cookies were obtained, so there is nothing to attach; stop here
            // instead of continuing with a null map.
            System.err.println("Login request to " + LOGIN_URL + " failed; bookcase page not scheduled");
            e.printStackTrace();
            return;
        }

        HttpRequest currRequest = login.getRequest();
        for (Map.Entry<String, String> entry : cookies.entrySet()) {
            currRequest.addCookie(entry.getKey(), entry.getValue());
        }
        // The sub-request is derived from the request that now holds the cookies.
        SchedulerContext.into(currRequest.subRequest(BOOKCASE_URL));
    }
}

通过这两个类实现cookie的引入,但我总觉得代码麻烦了,希望大佬能帮小弟优化一下,谢谢!!!

搞定了cookie,就搞定了模拟登录,接下来可以看一下我的这一篇文档:《gecco的简单使用》

你可能感兴趣的:(爬虫,Gecco,java)