话不多说,直接看代码(这段代码用于模拟登录一个小说网站,以便爬取登录后的个人书架页面):
/**
 * Submits the login form to {@code loginUrl} with an HTTP POST and returns
 * the cookies carried by the response.
 *
 * @param loginUrl URL the login form is posted to
 * @return the cookies of the login response, keyed by cookie name
 * @throws IOException if executing the HTTP request fails
 */
public Map<String, String> loginIn(String loginUrl) throws IOException {
    // Form fields sent with the login request.
    Map<String, String> formFields = new HashMap<>();
    formFields.put("LoginForm[username]", "你的账号");
    formFields.put("LoginForm[password]", "你的密码");
    formFields.put("usecookie", "1");
    formFields.put("chkLogin", "立即登录");
    formFields.put("action", "login");

    Connection request = Jsoup.connect(loginUrl);
    // Accept any response content type.
    request.ignoreContentType(true);
    // Do not follow redirects; the cookies are read from this response itself.
    request.followRedirects(false);
    request.postDataCharset("utf-8");

    // Browser-like request headers.
    request.header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3");
    request.header("Content-Type", "application/x-www-form-urlencoded");
    request.header("Origin", "https://www.kenshu.cc");
    request.header("Host", "www.kenshu.cc");
    request.header("Referer", "https://www.kenshu.cc/login.php");
    request.header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.75 Safari/537.36");

    request.data(formFields);
    request.method(Connection.Method.POST);

    Connection.Response response = request.execute();
    response.charset("UTF-8");

    // The response now holds the cookies issued by the login request; the
    // caller uses them to build the request for the personal-center page.
    return response.cookies();
}
以上代码有一点 Java 基础就能看懂,所以这里就不逐行讲解了。
下面这一块(结合 Gecco 使用的完整代码)我觉得做得还不够好,希望有大佬能帮忙简化。
上代码:
package cn.demo;
import com.geccocrawler.gecco.GeccoEngine;
import com.geccocrawler.gecco.annotation.Gecco;
import com.geccocrawler.gecco.annotation.Request;
import com.geccocrawler.gecco.request.HttpRequest;
import com.geccocrawler.gecco.spider.HtmlBean;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
/**
 * Gecco bean matched against the login submit URL and handed to the
 * {@code "login_in"} pipeline.
 *
 * <p>It exposes the current {@link HttpRequest} (injected through
 * {@code @Request}) and a helper, {@link #loginIn(String)}, that performs the
 * actual login with Jsoup and returns the response cookies.
 */
@Gecco(matchUrl = "https://www.kenshu.cc/login.php?do=submit", pipelines = "login_in")
public class Login implements HtmlBean {

    /** Cookies associated with this bean, keyed by cookie name. */
    private Map<String, String> cookie;

    /** The request that produced this bean. */
    @Request
    private HttpRequest request;

    public HttpRequest getRequest() {
        return request;
    }

    public void setRequest(HttpRequest request) {
        this.request = request;
    }

    /**
     * @return the stored cookies, or {@code null} if none have been set
     */
    public Map<String, String> getCookie() {
        return cookie;
    }

    /**
     * @param cookie cookies to store, keyed by cookie name
     */
    public void setCookie(Map<String, String> cookie) {
        this.cookie = cookie;
    }

    /**
     * Submits the login form to {@code loginUrl} with an HTTP POST and
     * returns the cookies carried by the response.
     *
     * @param loginUrl URL the login form is posted to
     * @return the cookies of the login response, keyed by cookie name
     * @throws IOException if executing the HTTP request fails
     */
    public Map<String, String> loginIn(String loginUrl) throws IOException {
        // Build the login form parameters.
        Map<String, String> data = new HashMap<>();
        data.put("LoginForm[username]", "你的账号");
        data.put("LoginForm[password]", "你的密码");
        data.put("usecookie", "1");
        data.put("chkLogin", "立即登录");
        data.put("action", "login");

        Connection.Response login = Jsoup.connect(loginUrl)
                .ignoreContentType(true) // accept any response content type
                .followRedirects(false) // do not follow redirects
                .postDataCharset("utf-8")
                .header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3")
                .header("Content-Type", "application/x-www-form-urlencoded")
                .header("Origin", "https://www.kenshu.cc")
                .header("Host", "www.kenshu.cc")
                .header("Referer", "https://www.kenshu.cc/login.php")
                .header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.75 Safari/537.36")
                .data(data)
                .method(Connection.Method.POST)
                .execute();
        login.charset("UTF-8");

        // The response now holds the cookies issued by the login request; the
        // caller uses them to build the request for the personal-center page.
        return login.cookies();
    }

    public static void main(String[] args) {
        GeccoEngine.create()
                // Package that Gecco scans for annotated classes.
                .classpath("cn.demo")
                // URL the crawl starts from.
                .start("https://www.kenshu.cc/login.php?do=submit")
                // Number of crawler threads.
                .thread(1)
                // Pause (2000) between two requests of a single crawler.
                .interval(2000)
                .start();
    }
}
package cn.demo;
import com.geccocrawler.gecco.annotation.PipelineName;
import com.geccocrawler.gecco.pipeline.Pipeline;
import com.geccocrawler.gecco.request.HttpRequest;
import com.geccocrawler.gecco.scheduler.SchedulerContext;
import java.io.IOException;
import java.util.Map;
/**
 * Pipeline registered as {@code "login_in"}: logs in through
 * {@link Login#loginIn(String)}, copies the returned cookies onto the current
 * request, and schedules a sub-request for the bookcase page.
 */
@PipelineName("login_in")
public class Login_In implements Pipeline<Login> {

    /** URL the login form is posted to. */
    private static final String LOGIN_URL = "https://www.kenshu.cc/login.php?do=submit";

    /** Page requested after the login cookies have been attached. */
    private static final String BOOKCASE_URL = "https://www.kenshu.cc/modules/article/bookcase.php";

    /**
     * Performs the login and, on success, queues the bookcase request.
     *
     * <p>If the login request fails with an {@link IOException}, the failure
     * is reported and nothing is scheduled.
     *
     * @param login bean providing the current request and the login helper
     */
    @Override
    public void process(Login login) {
        HttpRequest currRequest = login.getRequest();

        Map<String, String> cookies;
        try {
            cookies = login.loginIn(LOGIN_URL);
        } catch (IOException e) {
            // No cookies were obtained, so there is nothing to attach; stop
            // here instead of dereferencing a null map further down.
            System.err.println("Login request failed; bookcase request not scheduled: " + e);
            e.printStackTrace();
            return;
        }

        for (Map.Entry<String, String> entry : cookies.entrySet()) {
            currRequest.addCookie(entry.getKey(), entry.getValue());
        }

        // NOTE(review): presumably the sub-request inherits the cookies added
        // above — confirm against Gecco's HttpRequest#subRequest.
        SchedulerContext.into(currRequest.subRequest(BOOKCASE_URL));
    }
}
通过这两个类实现cookie的引入,但我总觉得代码麻烦了,希望大佬能帮小弟优化一下,谢谢!!!
搞定了cookie,就搞定了模拟登录。接下来可以看一下我的这一篇文档《gecco的简单使用》。