本文通过一个简单的 demo,介绍如何使用 Java 爬取小说页面,并用 Jsoup 解析返回的文档。
这里为演示网址: book.zmjmall.com 可以初始化爬取小说,定时更新小说章节,搜索后站内没有资源的话从站外采集
https://gitee.com/javazmj/reptile-demo
源码已私有化,要源码的可以联系博主. [email protected]
废话不多说,代码走起.
1.使用IDEA新建一个springboot项目,这里使用的springboot版本为 2.0.4.RELEASE
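<!-- pom.xml 关键片段。原文中的 XML 标签在转载时丢失,以下标签名根据内容还原,请以实际工程为准 -->
<properties>
    <java.version>1.8</java.version>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <elasticsearch.version>5.6.4</elasticsearch.version>
</properties>

<dependencies>
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-web</artifactId>
    </dependency>
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-freemarker</artifactId>
    </dependency>
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-aop</artifactId>
    </dependency>
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-devtools</artifactId>
        <optional>true</optional>
    </dependency>
    <dependency>
        <groupId>org.projectlombok</groupId>
        <artifactId>lombok</artifactId>
        <optional>true</optional>
    </dependency>
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-data-redis</artifactId>
    </dependency>
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-configuration-processor</artifactId>
        <optional>true</optional>
    </dependency>
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-test</artifactId>
        <scope>test</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.commons</groupId>
        <artifactId>commons-lang3</artifactId>
        <version>3.5</version>
    </dependency>
    <dependency>
        <groupId>commons-fileupload</groupId>
        <artifactId>commons-fileupload</artifactId>
        <version>1.3.2</version>
    </dependency>
    <dependency>
        <groupId>redis.clients</groupId>
        <artifactId>jedis</artifactId>
        <version>2.9.0</version>
    </dependency>
    <dependency>
        <groupId>com.alibaba</groupId>
        <artifactId>fastjson</artifactId>
        <version>1.2.47</version>
    </dependency>
    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.11.3</version>
    </dependency>
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpclient</artifactId>
        <version>4.5.5</version>
    </dependency>
    <dependency>
        <groupId>com.google.code.gson</groupId>
        <artifactId>gson</artifactId>
    </dependency>
    <dependency>
        <groupId>com.google.guava</groupId>
        <artifactId>guava</artifactId>
        <version>22.0</version>
    </dependency>
    <dependency>
        <groupId>org.springframework.data</groupId>
        <artifactId>spring-data-elasticsearch</artifactId>
        <version>3.0.6.RELEASE</version>
    </dependency>
    <dependency>
        <groupId>org.elasticsearch</groupId>
        <artifactId>elasticsearch</artifactId>
        <version>${elasticsearch.version}</version>
    </dependency>
    <dependency>
        <groupId>org.elasticsearch.client</groupId>
        <artifactId>transport</artifactId>
        <version>${elasticsearch.version}</version>
        <exclusions>
            <exclusion>
                <groupId>commons-logging</groupId>
                <artifactId>commons-logging</artifactId>
            </exclusion>
        </exclusions>
    </dependency>
    <dependency>
        <groupId>org.elasticsearch.plugin</groupId>
        <artifactId>transport-netty3-client</artifactId>
        <version>5.6.10</version>
    </dependency>
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-mail</artifactId>
    </dependency>
    <dependency>
        <groupId>org.mybatis.spring.boot</groupId>
        <artifactId>mybatis-spring-boot-starter</artifactId>
        <version>2.0.1</version>
    </dependency>
    <dependency>
        <groupId>com.h2database</groupId>
        <artifactId>h2</artifactId>
        <scope>runtime</scope>
    </dependency>
</dependencies>

<build>
    <finalName>reptile</finalName>
    <plugins>
        <plugin>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-maven-plugin</artifactId>
        </plugin>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-compiler-plugin</artifactId>
            <configuration>
                <compilerArgs>
                    <arg>-parameters</arg>
                </compilerArgs>
            </configuration>
        </plugin>
    </plugins>
</build>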
这里我们演示采集 笔趣读里的 元尊 这本小说,其他网站结构都差不多
这里使用 Jsoup 自带的请求,也可以自己封装 HttpClient 请求。
有的 https 网站可以直接用 Jsoup.parse() 请求,有的则会因为证书校验失败而提示这样的错误:
sun.security.validator.ValidatorException: PKIX path building failed: sun.security.provider.certpath.SunCertPathBuilderException: unable to find valid certification path to requested target
我们自己封装一个Https请求的工具类
public class HttpUtils { /** * get * * @param host * @param path * * @param headers * @param querys * @return * @throws Exception */ public static HttpResponse doGet(String host, String path, Map
headers, Map querys) throws Exception { HttpClient httpClient = wrapClient(host,path); HttpGet request = new HttpGet(buildUrl(host, path, querys)); for (Map.Entry e : headers.entrySet()) { request.addHeader(e.getKey(), e.getValue()); } return httpClient.execute(request); } /** * post form * * @param host * @param path * * @param headers * @param querys * @param bodys * @return * @throws Exception */ public static HttpResponse doPost(String host, String path, Map headers, Map querys, Map bodys) throws Exception { HttpClient httpClient = wrapClient(host,path); HttpPost request = new HttpPost(buildUrl(host, path, querys)); for (Map.Entry e : headers.entrySet()) { request.addHeader(e.getKey(), e.getValue()); } if (bodys != null) { List nameValuePairList = new ArrayList (); for (String key : bodys.keySet()) { nameValuePairList.add(new BasicNameValuePair(key, bodys.get(key))); } UrlEncodedFormEntity formEntity = new UrlEncodedFormEntity(nameValuePairList, "utf-8"); formEntity.setContentType("application/x-www-form-urlencoded; charset=UTF-8"); request.setEntity(formEntity); } return httpClient.execute(request); } /** * Post String * * @param host * @param path * * @param headers * @param querys * @param body * @return * @throws Exception */ public static HttpResponse doPost(String host, String path, Map headers, Map querys, String body) throws Exception { HttpClient httpClient = wrapClient(host,path); HttpPost request = new HttpPost(buildUrl(host, path, querys)); for (Map.Entry e : headers.entrySet()) { request.addHeader(e.getKey(), e.getValue()); } if (StringUtils.isNotBlank(body)) { request.setEntity(new StringEntity(body, "utf-8")); } return httpClient.execute(request); } /** * Post stream * * @param host * @param path * * @param headers * @param querys * @param body * @return * @throws Exception */ public static HttpResponse doPost(String host, String path, Map headers, Map querys, byte[] body) throws Exception { HttpClient httpClient = wrapClient(host,path); HttpPost 
request = new HttpPost(buildUrl(host, path, querys)); for (Map.Entry e : headers.entrySet()) { request.addHeader(e.getKey(), e.getValue()); } if (body != null) { request.setEntity(new ByteArrayEntity(body)); } return httpClient.execute(request); } /** * Put String * @param host * @param path * * @param headers * @param querys * @param body * @return * @throws Exception */ public static HttpResponse doPut(String host, String path, Map headers, Map querys, String body) throws Exception { HttpClient httpClient = wrapClient(host,path); HttpPut request = new HttpPut(buildUrl(host, path, querys)); for (Map.Entry e : headers.entrySet()) { request.addHeader(e.getKey(), e.getValue()); } if (StringUtils.isNotBlank(body)) { request.setEntity(new StringEntity(body, "utf-8")); } return httpClient.execute(request); } /** * Put stream * @param host * @param path * * @param headers * @param querys * @param body * @return * @throws Exception */ public static HttpResponse doPut(String host, String path, Map headers, Map querys, byte[] body) throws Exception { HttpClient httpClient = wrapClient(host,path); HttpPut request = new HttpPut(buildUrl(host, path, querys)); for (Map.Entry e : headers.entrySet()) { request.addHeader(e.getKey(), e.getValue()); } if (body != null) { request.setEntity(new ByteArrayEntity(body)); } return httpClient.execute(request); } /** * Delete * * @param host * @param path * * @param headers * @param querys * @return * @throws Exception */ public static HttpResponse doDelete(String host, String path, Map headers, Map querys) throws Exception { HttpClient httpClient = wrapClient(host,path); HttpDelete request = new HttpDelete(buildUrl(host, path, querys)); for (Map.Entry e : headers.entrySet()) { request.addHeader(e.getKey(), e.getValue()); } return httpClient.execute(request); } /** * 构建请求的 url * @param host * @param path * @param querys * @return * @throws UnsupportedEncodingException */ private static String buildUrl(String host, String path, Map querys) 
throws UnsupportedEncodingException { StringBuilder sbUrl = new StringBuilder(); if (!StringUtils.isBlank(host)) { sbUrl.append(host); } if (!StringUtils.isBlank(path)) { sbUrl.append(path); } if (null != querys) { StringBuilder sbQuery = new StringBuilder(); for (Map.Entry query : querys.entrySet()) { if (0 < sbQuery.length()) { sbQuery.append("&"); } if (StringUtils.isBlank(query.getKey()) && !StringUtils.isBlank(query.getValue())) { sbQuery.append(query.getValue()); } if (!StringUtils.isBlank(query.getKey())) { sbQuery.append(query.getKey()); if (!StringUtils.isBlank(query.getValue())) { sbQuery.append("="); sbQuery.append(URLEncoder.encode(query.getValue(), "utf-8")); } } } if (0 < sbQuery.length()) { sbUrl.append("?").append(sbQuery); } } return sbUrl.toString(); } /** * 获取 HttpClient * @param host * @param path * @return */ private static HttpClient wrapClient(String host, String path) { HttpClient httpClient = HttpClientBuilder.create().build(); if (host != null && host.startsWith("https://")) { return sslClient(); }else if (StringUtils.isBlank(host) && path != null && path.startsWith("https://")) { return sslClient(); } return httpClient; } /** * 在调用SSL之前需要重写验证方法,取消检测SSL * 创建ConnectionManager,添加Connection配置信息 * @return HttpClient 支持https */ private static HttpClient sslClient() { try { // 在调用SSL之前需要重写验证方法,取消检测SSL X509TrustManager trustManager = new X509TrustManager() { @Override public X509Certificate[] getAcceptedIssuers() { return null; } @Override public void checkClientTrusted(X509Certificate[] xcs, String str) {} @Override public void checkServerTrusted(X509Certificate[] xcs, String str) {} }; SSLContext ctx = SSLContext.getInstance(SSLConnectionSocketFactory.TLS); ctx.init(null, new TrustManager[] { trustManager }, null); SSLConnectionSocketFactory socketFactory = new SSLConnectionSocketFactory(ctx, NoopHostnameVerifier.INSTANCE); // 创建Registry RequestConfig requestConfig = RequestConfig.custom().setCookieSpec(CookieSpecs.STANDARD_STRICT) 
.setExpectContinueEnabled(Boolean.TRUE).setTargetPreferredAuthSchemes(Arrays.asList(AuthSchemes.NTLM, AuthSchemes.DIGEST)) .setProxyPreferredAuthSchemes(Arrays.asList(AuthSchemes.BASIC)).build(); Registry socketFactoryRegistry = RegistryBuilder. create() .register("http", PlainConnectionSocketFactory.INSTANCE) .register("https",socketFactory).build(); // 创建ConnectionManager,添加Connection配置信息 PoolingHttpClientConnectionManager connectionManager = new PoolingHttpClientConnectionManager(socketFactoryRegistry); CloseableHttpClient closeableHttpClient = HttpClients.custom().setConnectionManager(connectionManager) .setDefaultRequestConfig(requestConfig).build(); return closeableHttpClient; } catch (KeyManagementException ex) { throw new RuntimeException(ex); } catch (NoSuchAlgorithmException ex) { throw new RuntimeException(ex); } } /** * 将结果转换成JSONObject * @param httpResponse * @return * @throws IOException */ public static JSONObject getJson(HttpResponse httpResponse) throws IOException { HttpEntity entity = httpResponse.getEntity(); String resp = EntityUtils.toString(entity, "UTF-8"); EntityUtils.consume(entity); return JSON.parseObject(resp); } public static HttpResponse doGetReptile(String host, String path) throws Exception { HttpClient httpClient = wrapClient(host,path); HttpGet request = new HttpGet(buildUrl(host, path, null)); request.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"); public static Document doGetReptile(String host, String path) throws Exception { HttpClient httpClient = wrapClient(host,path); HttpGet request = new HttpGet(buildUrl(host, path, null)); request.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"); String string = EntityUtils.toString(httpClient.execute(request).getEntity(),"UTF-8"); return Jsoup.parse(string); } }
新建一个Test测试类:
public class Test {
public static void main(String[] args) throws Exception { //元尊主页的网址 String path = "https://www.biqudu.net/31_31729/"; // URL url = new URL(path); // Document docemunt = Jsoup.parse(url, 20000); Document docemunt = HttpUtils.doGetReptile(null, path); System.out.println(docemunt.toString()); } }
这里我们打印看一下返回的Document内容
从这里可以看到返回的小说名称、作者、最后更新时间、最后更新章节等信息。我们来取出这些信息,先新建一个
DocumentBook 常量类,用来存放这些字段对应的 property 名称:
/**
 * Values of the {@code property} attribute under which the index page exposes book metadata.
 * Every constant starts with {@code og:}; the novel-specific ones share the {@code og:novel:}
 * prefix.
 */
public class DocumentBook {

    /** Prefix common to every property name in this class. */
    private static final String OG = "og:";

    /** Prefix common to the novel-specific property names. */
    private static final String NOVEL = OG + "novel:";

    public static final String description = OG + "description";

    public static final String image = OG + "image";

    public static final String category = NOVEL + "category";

    public static final String author = NOVEL + "author";

    public static final String book_name = NOVEL + "book_name";

    public static final String read_url = NOVEL + "read_url";

    public static final String status = NOVEL + "status";

    public static final String update_time = NOVEL + "update_time";

    public static final String latest_chapter_name = NOVEL + "latest_chapter_name";

    public static final String latest_chapter_url = NOVEL + "latest_chapter_url";
}
新建Book BookList BookInfo实体类
再新建一个 Doc2Bean 类,将 Document 里的信息提取为 Bean。
/**
 * Book-level metadata. Apart from {@code id}, the fields are the ones
 * {@code Doc2Bean.Doc2BeanBook} fills from the index page; accessors come from Lombok's
 * {@code @Data}.
 */
@Data
public class Book implements Serializable {

    private static final long serialVersionUID = 8058846183802946244L;

    // Not populated by Doc2Bean; presumably the primary key -- confirm against the persistence code.
    private String id;

    private String description;

    private String image;

    private String category;

    private String author;

    private String bookName;

    // Doc2Bean stores the page's read URL with all '/' characters removed.
    private String readUrl;

    private String status;

    // Parsed by Doc2Bean with the pattern "yyyy-MM-dd HH:mm:ss"; stays unset when parsing fails.
    private Date updateTime;

    // Doc2Bean sets this to the moment of extraction.
    private Date createTime;

    private String latestChapterName;

    // Doc2Bean stores the result of UrlUtils.getSort on the page's latest-chapter URL.
    private String latestChapterUrl;
}
/**
 * Serializable bean with Lombok-generated accessors.
 * NOTE(review): presumably one entry of a book's chapter list -- nothing in the visible code
 * populates it, so confirm against the code that does.
 */
@Data
public class BookList implements Serializable {

    private static final long serialVersionUID = 4252922287795414078L;

    private String bookId;

    private String readUrl;

    private String name;

    // NOTE(review): presumably the number taken from the chapter URL, used for ordering -- confirm.
    private Integer sort;

    private String content;
}
/**
 * Serializable bean with Lombok-generated accessors.
 * NOTE(review): presumably the content of a single chapter page -- nothing in the visible code
 * populates it, so confirm against the code that does.
 */
@Data
public class BookInfo implements Serializable {

    private static final long serialVersionUID = 7211837590337550637L;

    private String bookId;

    private String readUrl;

    // NOTE(review): pre/next look like links to the neighbouring chapters -- confirm.
    private String pre;

    private String next;

    private String content;
}
/**
 * Copies book metadata out of a parsed index page into a {@link Book}.
 */
public class Doc2Bean {

    /** Date format expected for the update-time property. */
    private static final String UPDATE_TIME_PATTERN = "yyyy-MM-dd HH:mm:ss";

    /** Attribute whose value identifies a metadata element. */
    private static final String KEY_ATTRIBUTE = "property";

    /** Attribute that carries the metadata value. */
    private static final String VALUE_ATTRIBUTE = "content";

    /**
     * Populates {@code book} from the elements of {@code document} whose {@code property}
     * attribute matches the names in {@link DocumentBook}.
     *
     * <p>All values are read before any setter runs, so a page that lacks one of the elements
     * fails (the lookup of the first match throws) without touching {@code book}. An update
     * time that does not match {@code yyyy-MM-dd HH:mm:ss} is reported via
     * {@code printStackTrace} and leaves {@code updateTime} unset.
     *
     * @param document parsed index page
     * @param book     bean to fill
     */
    public static void Doc2BeanBook(Document document, Book book) {
        final String authorText = firstContent(document, DocumentBook.author);
        final String nameText = firstContent(document, DocumentBook.book_name);
        final String categoryText = firstContent(document, DocumentBook.category);
        final String descriptionText = firstContent(document, DocumentBook.description);
        final String imageText = firstContent(document, DocumentBook.image);
        final String latestNameText = firstContent(document, DocumentBook.latest_chapter_name);
        final String latestUrlText = firstContent(document, DocumentBook.latest_chapter_url);
        final String readUrlText = firstContent(document, DocumentBook.read_url);
        final String statusText = firstContent(document, DocumentBook.status);
        final String updateTimeText = firstContent(document, DocumentBook.update_time);

        book.setAuthor(authorText);
        book.setBookName(nameText);
        book.setCategory(categoryText);
        book.setDescription(descriptionText);
        book.setImage(imageText);
        book.setLatestChapterName(latestNameText);
        book.setLatestChapterUrl(UrlUtils.getSort(latestUrlText));
        book.setReadUrl(UrlUtils.getTrimId(readUrlText));
        book.setStatus(statusText);
        book.setCreateTime(new Date());

        try {
            book.setUpdateTime(DateUtils.parseDate(updateTimeText, UPDATE_TIME_PATTERN));
        } catch (ParseException ex) {
            ex.printStackTrace();
        }
    }

    /** Returns the {@code content} attribute of the first element whose {@code property} equals the given name. */
    private static String firstContent(Document document, String propertyName) {
        return document.getElementsByAttributeValue(KEY_ATTRIBUTE, propertyName)
                .get(0)
                .attr(VALUE_ATTRIBUTE);
    }
}
因为这里的 id 和 readUrl 等是 /31_31729/ 的形式,为了后期传参方便,入库时我们将斜杠去掉,这里用到下面的 UrlUtils 工具类:
/**
 * String helpers that convert between slash-wrapped site paths such as {@code /0_49/} or
 * {@code /0_49/1045829.html} and the bare ids / chapter numbers contained in them.
 */
public class UrlUtils {

    /**
     * Wraps a value in slashes, e.g. {@code 0_49} becomes {@code /0_49/}.
     *
     * @param url value to wrap; {@code null} is rendered as the text {@code null}
     * @return {@code "/" + url + "/"}
     */
    public static String urlAdd(String url) {
        return "/" + url + "/";
    }

    /**
     * Extracts the file name without extension from the last path segment, e.g.
     * {@code /0_49/1045829.html} yields {@code 1045829}.
     *
     * <p>When the input contains no dot at all it is treated as an id and passed to
     * {@link #getTrimId(String)}. When the only dots sit before the last slash (for example an
     * absolute URL ending in {@code /}), the text after the last slash is returned as is
     * instead of failing with an index error.
     *
     * @param url path or URL; may be {@code null}
     * @return the extracted value, or {@code null} for {@code null} input
     */
    public static String getSort(String url) {
        if (url == null) {
            return null;
        }
        if (!url.contains(".")) {
            return getTrimId(url);
        }
        int segmentStart = url.lastIndexOf('/') + 1;
        int lastDot = url.lastIndexOf('.');
        if (lastDot < segmentStart) {
            // The dot belongs to an earlier segment (e.g. the host name): no extension to strip.
            return url.substring(segmentStart);
        }
        return url.substring(segmentStart, lastDot);
    }

    /**
     * Removes every slash from an id and trims surrounding whitespace, e.g. {@code /0_122/}
     * yields {@code 0_122}.
     *
     * @param id slash-wrapped id; may be {@code null}
     * @return the bare id, or {@code null} for {@code null} input
     */
    public static String getTrimId(String id) {
        if (id == null) {
            return null;
        }
        // Literal replacement; no regular expression is needed for a fixed character.
        return id.replace("/", "").trim();
    }

    /**
     * Builds a chapter path of the form {@code /bookId/sort.html}.
     *
     * @param bookId bare book id
     * @param sort   chapter number
     * @return the assembled path
     */
    public static String urlAdd(String bookId, Integer sort) {
        return "/" + bookId + "/" + sort + ".html";
    }

    /**
     * Builds a path of the form {@code /bookId/readUrl} without adding an extension.
     *
     * @param bookId  bare book id
     * @param readUrl trailing part of the path
     * @return the assembled path
     */
    public static String urlAddNotHtml(String bookId, String readUrl) {
        return "/" + bookId + "/" + readUrl;
    }
}
提取Bean以后就可以存入数据库了,至于用什么数据库请随意.接下来是提取章节列表的url
这里我们可以看到有两个
这里会发现章节的 url 有个共同特点:后面章节的数字要比前面章节的数字大。因此我们可以把 url 中的章节数字提取出来,存入数据库用于排序。
我们直接提取