哨得哨得
第一次在掘金发博客, 感觉爽爽的, 和掘金之间我还是选择掘金了, 因为掘金才是开发者的平台, 大部分还是作者吧!(个人观点, 贤者勿喷)
进入正题
简单介绍:
本次带来的是一个用java写的爬虫, 用来爬取吾爱破解网(大家都懂的, 不是什么不正经的网站哈, 不过也是福利)最新更新的资源, 毕竟此网站一直不定时更新牛×哄哄的资源, 这个爬虫就是专门爬取最新分享的资源的 (什么XX软件啊, 某马教程视频啊....)
意图 (原因):
- 本人刚刚接触java(有半年了吧), 工作用到了html解析, 感觉离爬虫不远了, 就想涉足一下
- 本人资源收藏爱好者, 吾爱XX给了我海量资源, 但是由于大部分资源是百度云链接, 而且深知百毒云有些敏感资源过时太快了, 所以想弄一个爬虫, 自动爬取, 自动保存(这一步下次更新完成吧)
- 毕竟谁也没事运行下这个java程序, 后期会放入服务器开通接口, 再用自己的微信小程序调用
(PS:有木有懂前端(喜欢开发UI)的来指导指导我啊!)
, 这样只要在微信就可以直接看到最新的资源了, (不只是资源哦, 还有链接, 回复, 链接状态等等)
用到的知识点
- java基础
- jsoup 解析html第三方jar
- okhttp 浏览器请求第三方jar
- 正则表达式(正则表达式 + Excel + NotePad++ + 列编辑模式几乎解决所有字符串批处理问题, 下次演示)
代码(两个类):
GetInfo.java
package test;
import com.mtl.pojo.Item;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import okhttp3.Response;
import org.apache.ibatis.io.Resources;
import org.apache.ibatis.session.SqlSession;
import org.apache.ibatis.session.SqlSessionFactory;
import org.apache.ibatis.session.SqlSessionFactoryBuilder;
import org.junit.Test;
import java.io.IOException;
import java.io.InputStream;
import java.util.List;
public class GetInfo {
private static String lastTopic; //上一次查询最终的帖子的标题, 用来判断是否解析到上次解析的位置
private static String thisTopic; // 暂时保存这次解析的第一个标题, 最后加到lastTopic中去
@Test
public void getInfo(){
try {
OkHttpClient client = new OkHttpClient();//创建OkHttpClient对象
for (int i = 1; i <= 8; i++) {
//构建请求对象 通过内部类Request.Builder构建
Request request = new Request.Builder()
.url("https://www.52pojie.cn/forum.php?mod=guide&view=newthread&page=" + i)//请求接口。如果需要传参拼接到接口后面。
.build();
Response response = null;
//发送请求得到response对象
response = client.newCall(request).execute();
//判断返回状态码
if (response.isSuccessful()) {
String string = response.body().string();
// 查看返回的response头信息, 实际上用来设置返回的cookie的, 还没有完成
// Headers header = response.headers();
// for (int j = 0; j < header.size(); j++) {
// System.out.println(header.name(i) + "-----" + header.value(i));
// }
// System.out.println(string);
//调用方法解析html文本
ParseHtml parseHtml = new ParseHtml();
List- items = parseHtml.getCurrentPageItems(string, lastTopic);
testInsert(items);
if (i == 1){
thisTopic = items.get(0).getTitle();
}
if (parseHtml.isFind()){
break;
}
}
}
lastTopic = thisTopic;
} catch (Exception e) {
e.printStackTrace();
}
}
public void testInsert(List
- items){
try {
InputStream resourceAsStream= Resources.getResourceAsStream("mybatis.xml");
SqlSessionFactory build = new SqlSessionFactoryBuilder().build(resourceAsStream);
SqlSession sqlSession = build.openSession();
int insert = sqlSession.insert("com.mtl.mapper.ItemMapper.insertItems", items);
System.out.println("insert = " + insert);
sqlSession.commit();
sqlSession.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
-
ParseHtml.java
用来解析html字符串的工具类吧(不过并没有设置静态方法, 为了以后spring管理哈哈)
package test;
import com.mtl.pojo.Item;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import okhttp3.Response;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.Test;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class ParseHtml {
private boolean isFind = false; //用来判断是否解析到了上次执行的最后一个标题, 结束条件
/**
* 获取当前html页面的所有item对象
* @param html 当前页面的html字符串
* @param lastTitle 停止解析的帖子标题
* @return item集合
* @throws IOException okhttp抛出的异常
*/
public List- getCurrentPageItems(String html, String lastTitle) throws IOException {
ArrayList
- items = new ArrayList<>();
//Jsoup解析html文本获取Document对象
Document parse = Jsoup.parse(html);
Element body = parse.body();
//通过选择器获取到标志的div然后赋值给item
Element element = body.selectFirst("div#forumnew");
// System.out.println("element = " + element);
Element table = element.nextElementSibling();
Elements tbodys = table.select("tbody");
for (int j = 0; j < tbodys.size(); j++) {
element = tbodys.get(j);
String title = element.selectFirst("a.xst").html();
if (title.equals(lastTitle)){ //如果查找到上次的最后的话题就直接结束并通知前台找到了标记
isFind = true;
break;
}
Item item = new Item();
item.setTitle(title);
Element tbody = element.selectFirst("tbody");
Elements tds = tbody.select("td");
for (int i = 0; i < tds.size(); i++) {
Element td = tds.get(i);
switch (i){
case 0:
item.setUrl("https://www.52pojie.cn/" + td.selectFirst("a").attr("href"));
Element span = td.selectFirst("span");
if (span != null)
item.setAuthorityLevel(span.html());
break;
case 1:
item.setPartition(td.selectFirst("a").html());
break;
case 2:
item.setAuther(td.selectFirst("a").html());
item.setPublishTime(td.selectFirst("span").html());
break;
case 3:
item.setReplyNum(td.selectFirst("a").html());
item.setViewNum(td.selectFirst("em").html());
break;
case 4:
item.setLastReplyName(td.selectFirst("a").html());
item.setLastReplyTime(td.selectFirst("em").selectFirst("a").html());
item.setLastReplyUrl("https://www.52pojie.cn/" + td.selectFirst("a").attr("href"));
break;
}
}
parseLink(item);
items.add(item);
}
return items;
}
/**
* 解析item内部的百度云链接
* @param item item对象
*/
private void parseLink(Item item) throws IOException {
if (item.getAuthorityLevel() == null) {
OkHttpClient okHttpClient = new OkHttpClient();
String url = item.getUrl();
Request build = new Request.Builder()
.url(url)
.build();
Response response = okHttpClient.newCall(build).execute();
if (response.isSuccessful()){
String string = response.body().string();
// System.out.println(string);
Matcher matcher = Pattern.compile("[^\"](https://pan.baidu.com/s/[\\w\\-0-9_]+[a-zA-Z_0-9])((?!https).)+密码: ?([a-zA-Z0-9]{4})[^a-zA-Z0-9]").matcher(string);
StringBuilder links = new StringBuilder();
StringBuilder pwds = new StringBuilder();
while (matcher.find()){
if (links.indexOf(matcher.group(1)) == -1){
links.append(matcher.group(1)).append(";");
pwds.append(matcher.group(3)).append(";");
}
// System.out.println("match = " + matcher.group(0));
}
if (!links.toString().equals("")){
item.setLinksAndPwdsStr(links.toString() + "#;#" + pwds.toString());
}
}
}
}
/**
* 测试需要阅读权限的链接返回的报文体 为以后自动登录获取链接做准备
* @throws IOException
*/
@Test
public void testLink() throws IOException {
OkHttpClient okHttpClient = new OkHttpClient();
Request build = new Request.Builder()
.url("https://www.52pojie.cn/thread-719615-1-1.html")
.build();
Response response = okHttpClient.newCall(build).execute();
if (response.isSuccessful()){
String string = response.body().string();
Matcher authLevel = Pattern.compile("抱歉,本帖要求阅读权限高于 \\d+ 才能浏览").matcher(string);
System.out.println(string);
if (authLevel.find()) {
System.out.println("需要权限");
}else {
Matcher matcher = Pattern.compile("[^\"](https://pan.baidu.com/s/[\\w\\-0-9_]+[a-zA-Z_0-9])((?!https).)+密码: ?([a-zA-Z0-9]{4})[^a-zA-Z0-9]").matcher(string);
while (matcher.find()){
System.out.println("match = " + matcher.group(1) + "--" + matcher.group(3));
}
}
}
}
public boolean isFind() {
return isFind;
}
public void setFind(boolean find) {
isFind = find;
}
}
-
Item.java
实体类
package com.mtl.pojo;
/**
 * One forum thread from the listing page plus the Baidu pan links found in it.
 *
 * <p>Links and passwords are kept both as arrays and as a single string of the
 * form {@code link1;link2;#;#pwd1;pwd2;} so that they fit into one database
 * column; {@link #setLinksAndPwdsStr(String)} keeps the two views in sync.
 */
public class Item {
    /** Separator between the link list and the password list. */
    private static final String GROUP_SEPARATOR = "#;#";
    /** Separator between entries inside one list. */
    private static final String ENTRY_SEPARATOR = ";";

    private String title;           // thread title
    private String url;             // thread URL
    private String[] links;         // Baidu pan links
    private String[] pwds;          // passwords matching the links
    private String linksAndPwdsStr; // links and passwords joined into one string
    private String publishTime;     // publish time
    private String authorityLevel;  // required reading permission
    private String partition;       // forum section of the thread
    private String auther;          // thread author
    private String replyNum;        // number of replies
    private String viewNum;         // number of views
    private String lastReplyName;   // account of the last reply
    private String lastReplyTime;   // time of the last reply
    private String lastReplyUrl;    // link to the last reply
    private String firstPageReply;  // replies on the first page
    private boolean isNeedReply;    // whether a reply is needed to see the download link
    private int searchLinkTimes;    // link-search attempts, meant for a later give-up threshold

    public String getLinksAndPwdsStr() {
        return linksAndPwdsStr;
    }

    /**
     * Stores the combined string and derives the link and password arrays from it.
     *
     * <p>{@code null} or an empty string yields two empty arrays. A missing
     * separator or a missing half yields an empty array for that half instead
     * of an {@link ArrayIndexOutOfBoundsException}.
     *
     * @param linksAndPwdsStr string of the form {@code link1;link2;#;#pwd1;pwd2;}
     */
    public void setLinksAndPwdsStr(String linksAndPwdsStr) {
        if (linksAndPwdsStr == null || linksAndPwdsStr.equals("")) {
            links = new String[]{};
            pwds = new String[]{};
        } else {
            // split() drops trailing empty parts, so either half may be absent.
            String[] halves = linksAndPwdsStr.split(GROUP_SEPARATOR);
            links = halves.length > 0 ? splitEntries(halves[0]) : new String[]{};
            pwds = halves.length > 1 ? splitEntries(halves[1]) : new String[]{};
        }
        this.linksAndPwdsStr = linksAndPwdsStr;
    }

    /** Splits one half on ';'; an empty half gives an empty array rather than {@code [""]}. */
    private static String[] splitEntries(String joined) {
        return joined.isEmpty() ? new String[]{} : joined.split(ENTRY_SEPARATOR);
    }

    public String[] getLinks() {
        return links;
    }

    public String[] getPwds() {
        return pwds;
    }

    public String getFirstPageReply() {
        return firstPageReply;
    }

    public void setFirstPageReply(String firstPageReply) {
        this.firstPageReply = firstPageReply;
    }

    public boolean isNeedReply() {
        return isNeedReply;
    }

    public void setNeedReply(boolean needReply) {
        isNeedReply = needReply;
    }

    public int getSearchLinkTimes() {
        return searchLinkTimes;
    }

    public void setSearchLinkTimes(int searchLinkTimes) {
        this.searchLinkTimes = searchLinkTimes;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public String getPublishTime() {
        return publishTime;
    }

    public void setPublishTime(String publishTime) {
        this.publishTime = publishTime;
    }

    public String getAuthorityLevel() {
        return authorityLevel;
    }

    public void setAuthorityLevel(String authorityLevel) {
        this.authorityLevel = authorityLevel;
    }

    public String getPartition() {
        return partition;
    }

    public void setPartition(String partition) {
        this.partition = partition;
    }

    public String getAuther() {
        return auther;
    }

    public void setAuther(String auther) {
        this.auther = auther;
    }

    public String getReplyNum() {
        return replyNum;
    }

    public void setReplyNum(String replyNum) {
        this.replyNum = replyNum;
    }

    public String getViewNum() {
        return viewNum;
    }

    public void setViewNum(String viewNum) {
        this.viewNum = viewNum;
    }

    public String getLastReplyName() {
        return lastReplyName;
    }

    public void setLastReplyName(String lastReplyName) {
        this.lastReplyName = lastReplyName;
    }

    public String getLastReplyTime() {
        return lastReplyTime;
    }

    public void setLastReplyTime(String lastReplyTime) {
        this.lastReplyTime = lastReplyTime;
    }

    public String getLastReplyUrl() {
        return lastReplyUrl;
    }

    public void setLastReplyUrl(String lastReplyUrl) {
        this.lastReplyUrl = lastReplyUrl;
    }
}
由于在数据库里存储数组很麻烦, 所以我想了一个折中的办法, 在实体类上动了手脚, 有兴趣的小伙伴可以看一下
后续打算
- 完成ssm项目,配置好服务, 测试接口
- 完成微信小程序UI, 使用服务器接口
- 上线微信小程序
- 完成自动登录, 防止阅读权限无法获取问题
- 自动判断百度盘是否失效, 自动去除该item
- 自动回复获取需要回复才可以查看隐藏链接的帖子