前面基本部署就不说了(然后需要企业版的idea)
tomcat初步部署博客链接
很多小说网站都只提供在线阅读但不提供下载,这个项目的主要功能是把在线阅读网站的小说内容爬取下来生成一个txt文件以供读者下载
用户在使用前要先复制某本小说第一章的链接,粘贴到网页的输入框内。点击确定后,网页将链接发送给MainServlet(即负责主要处理的Servlet)进行解析:先获得小说名称,然后去数据库中查询。如果数据库中已经存在,就直接从数据库中导出文本,生成txt文件,同时网页跳转到新的download.jsp页面,页面上有一个超链接提供下载;如果数据库中没有,则调用爬取方法,一章章爬取,生成txt文件,同时将名称和内容导入数据库。
1)加入两个jar包 导入lib文件夹;
此处jar文件的版本一定要和你jdk,mysql的版本一致,然后右键选择
如果不这么做,以后会发现idea里能跑,在浏览器上就显示找不到jar包
2)创建两个文件夹一个image放背景图,一个text放文本文档
1)非常简单的html,css和javascript代码
<%--
Created by IntelliJ IDEA.
User: chenzhenghui
Date: 2019/10/8
Time: 6:59 下午
To change this template use File | Settings | File Templates.
--%>
<%@ page contentType="text/html;charset=UTF-8" language="java" %>
<%--
  Landing page of the novel crawler.
  The user pastes the URL of a novel's first chapter into the text box; the form sends it
  as the "url" query parameter to MainServlet via GET.
--%>
<html lang="zh-CN">
<head>
    <meta charset="UTF-8">
    <title>czh的第一个项目</title>
    <style>
        h1 {
            font-size: 54px;
            color: #FFF;
        }
        h2 {
            font-size: 30px;
            color: #FF0000;
        }
        h6 {
            font-size: 24px;
            color: #FFF;
        }
        /* h5 is only used as an invisible vertical spacer. */
        h5 {
            font-size: 55px;
            color: transparent
        }
    </style>
    <script type="text/javascript">
        // Form submit handler: rejects an empty or whitespace-only link, otherwise tells the
        // user that crawling is about to run and lets the browser submit the form.
        // Hooked to the form's onsubmit so that pressing Enter in the text box is validated too.
        function check() {
            var link = document.forms["form1"].url.value.trim();
            if (link === "") {
                alert("请输入链接!");
                return false;
            }
            alert("正在爬取........请勿关闭页面");
            return true;
        }
    </script>
</head>
<body style="background-image: url(image/0f04af422502a40b6c8dc19d53d1f348-2.jpg)">
<div style="text-align: center;">
    <table width="100%" border="0">
        <tr>
            <%-- Spacer row; the heading is wrapped in a cell because a <tr> may only contain cells. --%>
            <td align="center"><h5>占位符</h5></td>
        </tr>
        <tr>
        </tr>
        <tr>
            <td align="center" width="90%"><h1>欢迎使用在线阅读网站小说爬取工具</h1></td>
        </tr>
        <tr>
            <td align="center" width="90%"><h6>本站可提供小说的txt格式下载</h6></td>
        </tr>
        <tr>
            <td width="50%" align="center">
                <h2>
                    在使用前请复制小说网站的第一章的链接,粘贴到下面的文本框中并点击确定
                </h2>
            </td>
        </tr>
        <tr>
            <td width="100%" align="center">
                <form name="form1" action="MainServlet" method="get" onsubmit="return check()">
                    <input name="url" type="text" style="width:600px; height:40px;" /><br>
                    <%-- NOTE(review): height:1000px looks like a typo (100px?) - confirm against the intended layout. --%>
                    <input type="submit" value="确定" style="width:90px;height:1000px">
                </form>
            </td>
        </tr>
        <tr>
            <td height="200" width="100" align="center"><h6>跳转页面后点击下载链接下载</h6></td>
        </tr>
        <tr>
            <td height="200" width="100" align="center"><a href=""></a> <h5></h5></td>
        </tr>
    </table>
</div>
</body>
</html>
效果图如下
主要方法有如下几个:
package ww;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import javax.servlet.ServletContext;
import javax.servlet.ServletException;
import javax.servlet.annotation.WebServlet;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import java.io.*;
import java.sql.*;
@WebServlet(name = "MainServlet")
public class MainServlet extends HttpServlet {
protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
}
protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
String url= request.getParameter("url");//把show.jsp传来的URL作为参数
String name1= null;//调用抓取方法,返回小说名
try {
name1 = catchText(url);
} catch (SQLException e) {
e.printStackTrace();
} catch (ClassNotFoundException e) {
e.printStackTrace();
}
request.setAttribute("name",name1);//将name参数加入request属性
request.getRequestDispatcher("download.jsp").forward(request, response);//转到download.jsp
}
/************************数据库相关的方法开始**************************/
public void setDatabase(String name,String url) throws
ClassNotFoundException, SQLException, FileNotFoundException {//把新的小说加入数据库
Class.forName("com.mysql.jdbc.Driver");
String name1="root";
String password="31284679";
String url1="jdbc:mysql://localhost:3306/Fdb";
Connection connection=DriverManager.getConnection(url1,name1,password);
if (connection!=null) System.out.println("链接成功");//连接数据库
String sql="insert into novelSet(name,url,text)values(?,?,?)";
PreparedStatement pa=connection.prepareStatement(sql);
File file=new File(url);
InputStream inputStream=new FileInputStream(file);
pa.setString(1,name);
pa.setString(2,url);
pa.setAsciiStream(3,inputStream,file.length());
int row=pa.executeUpdate();
if (row>0)
System.out.println("更新了"+row+"行数据");
else System.out.println("失败");
pa.close();
connection.close();
}
public boolean findDatabase(String name) throws ClassNotFoundException, SQLException, IOException {
Class.forName("com.mysql.jdbc.Driver");
String name1="root";
String password="31284679";
String url1="jdbc:mysql://localhost:3306/Fdb";
Connection connection=DriverManager.getConnection(url1,name1,password);
if (connection!=null) System.out.println("链接成功");//连接数据库
String sql="select text from novelSet where name=?";
PreparedStatement pa=connection.prepareStatement(sql);//查询小说名
pa.setString(1,name);
ResultSet resultSet=pa.executeQuery();
if (resultSet.next()){//如果查询的到数据,就创建一个新的txt文件把数据库里的数据导入这个txt
String s=this.getClass().getResource("/").getPath();
System.out.println( System.getProperty("java.class.path"));
String pathname=s+"web/text/"+name+".txt";
System.out.println("查询到已有数据");
File file=new File(pathname);
if (!file.exists())
file.createNewFile();
InputStream inputStream=resultSet.getAsciiStream("text");
byte[] text=inputStream.readAllBytes();
FileOutputStream outputStream=new FileOutputStream(file);
outputStream.write(text);
outputStream.close();
connection.close();
pa.close();
return true;
}
else {//查不到就返回false
connection.close();
pa.close();
return false;
}
}
/************************数据库相关的方法结束**************************/
/************************辅助方法开始**************************/
public static String findname(String starturl, String constent) throws IOException {
Document test = Jsoup.connect(starturl).get();
Elements xx=test.select("a[href="+constent+"]");
String end="";
for( Element element : xx ){//拿到小说名字
end=element.text();
break;
}
return end;
}
public static String findHref(String name,String starturl) throws IOException {
Document test = Jsoup.connect(starturl).get();
Elements xx=test.select("a[href~=^/cbook]");
String end="";
for( Element element : xx ){//拿到章节目录的超链接
if (element.text().equals(name)){
end=element.attr("href");
break;
}
}
return end;
}
/************************辅助方法结束**************************/
/************************爬取主方法开始**************************/
public String firstFindNovle(String name,String url) throws IOException, SQLException, ClassNotFoundException {
//try {
String s=this.getServletContext().getRealPath("");
System.out.println(s+"web/text/"+name+".txt");
//String pathname=s+"web/text/"+name+".txt";
String pathname= "/Users/chenzhenghui/IdeaProjects/test/fristweb/web/text/"+name+".txt";
File file = new File(pathname);
System.out.println(file);
file.createNewFile();
System.out.println("12");
Writer out = new FileWriter(file);
System.out.println("3");
while (true) {
Document document=Jsoup.connect(url).get();
String title=document.title();
Elements elements=document.select("div[id=content]");
for (Element element:elements){
String rawText=element.text()+"\n"+"\n";
String text=rawText.replace(" ","\n");
out.write(title);
out.write(text);
}
String next= "http://www.yuetutu.com"+findHref("下一章",url);
if (next.equals(end)){
out.close();
break;
}
else url=next;
}
out.close();
setDatabase(name,pathname);
return name;
//}
//catch (Exception e){
/*System.out.println("爬取出现了问题");
return "错误";*/
//}
}
/************************爬取主方法结束**************************/
/************************真!主方法开始**************************/
public String catchText(String geturl) throws SQLException, IOException, ClassNotFoundException {
String starturl = geturl;
String constent = findHref("章节目录", starturl);
String end = "http://www.yuetutu.com" + constent;
String name=findname(starturl, constent);
if (findDatabase(name))
return name;
else {
System.out.println("新的小说");
return firstFindNovle(name,starturl);
}
}
/************************真!主方法结束**************************/
}
其中连接数据库的代码Class.forName("com.mysql.jdbc.Driver")用到了Java反射
简陋的一行文字说明和一个下载链接
<%--
Created by IntelliJ IDEA.
User: chenzhenghui
Date: 2019/10/12
Time: 10:33 上午
To change this template use File | Settings | File Templates.
--%>
<%@ page contentType="text/html;charset=UTF-8" language="java" %>
<%--
  Download page. Expects the request attribute "name" (the novel name) and renders a link to
  <context path>/text/<name>.txt. If the attribute is missing a notice is shown instead of a
  link to "null.txt".
--%>
<html>
<head>
    <title>下载地址</title>
    <style>
        h1 {
            font-size: 54px;
            color: #FFF;
        }
        h2 {
            font-size: 45px;
            color: #FFF;
        }
    </style>
</head>
<body style=" background-repeat:no-repeat;background-image: url(image/timg3.jpeg);background-size:cover">
<div style="text-align: center;">
<%
    String name = (String) request.getAttribute("name");
    if (name == null || name.isEmpty()) {
%>
    <h1>未找到可下载的小说,请返回重试</h1>
<%
    } else {
        // The name is scraped from a remote page: percent-encode it so spaces or markup
        // characters cannot break the link. URLEncoder emits '+' for a space, which is only
        // valid in query strings, hence the replacement with %20.
        String fileName = java.net.URLEncoder.encode(name + ".txt", "UTF-8").replace("+", "%20");
        String path = request.getContextPath() + "/text/" + fileName;
%>
    <h1>请点击下面的链接下载</h1>
    <a href="<%=path%>"><h2>下载链接</h2></a>
<%
    }
%>
</div>
</body>
</html>