package novelCrawler;
import org.jsoup.Connection;
import org.jsoup.HttpStatusException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import ui.DownMsgUI;
import ui.crawlerUI;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.Timer;
import java.util.TimerTask;
import javax.sound.sampled.AudioFormat.Encoding;
import javax.swing.JFrame;
import javax.swing.JOptionPane;
import javax.swing.JProgressBar;
public class biquge {
public biquge(String nn,String start,JProgressBar jpbVal,JProgressBar jpb2) {
long t1 = System.currentTimeMillis();
//查找小说
Connection connection1 = Jsoup.connect("http://www.xbiquge.la/xiaoshuodaquan/");
Document document1 = null;
try {
document1 = connection1.get();
} catch (IOException e2) {
// TODO 自动生成的 catch 块
e2.printStackTrace();
}
Elements elementsLis1=null;
try {
Element elementUL1 = document1.select("[class=novellist]").first();
elementsLis1 = elementUL1.select("li");
} catch (Exception e1) {
// TODO 自动生成的 catch 块
e1.printStackTrace();
}
jpbVal.setVisible(true);
int count = 0;
DownMsgUI dmu = new DownMsgUI();//消息框
//巡官遍历获取到的整个elementsLis集合
for(Element elementLi1 : elementsLis1){
try {
Thread.sleep(200);
} catch (InterruptedException e1) {
// TODO 自动生成的 catch 块
e1.printStackTrace();
}
Element elementA1 = elementLi1.select("a").first();
String href = elementA1.attr("href");//获取标签中的属性值(它这里采用的是相对路径的写法
String novelNa = elementA1.text();//小说名
jpbVal.setMaximum(elementsLis1.size());
int result1 = novelNa.indexOf(nn);//匹配小说名
if(result1 != -1){
//跟进小说
jpbVal.setValue(elementsLis1.size());
//JOptionPane msg = new JOptionPane();
//msg.setBounds(100, 100, 100, 100);
//JOptionPane.showMessageDialog(jpb2,"已找到小说:"+novelNa);
dmu.ta.append("找到小说: "+novelNa+"\n");
connectNovel(novelNa,href,start,jpb2,dmu);
break;
}else{
//继续往下寻找
//System.out.println("未找到小说!");
jpbVal.setValue(count);
}
count++;
if(count>=elementsLis1.size()) {
System.out.println("未找到小说!");
}
}
}
/
public void connectNovel(String novelNa,String h,String start,JProgressBar jpb2,DownMsgUI dmu) {
//1.与我们要爬取数据的页面建立连接
Connection connection = Jsoup.connect(h);
jpb2.setVisible(true);
Document document = null;
try {
document = connection.get();
} catch (IOException e2) {
// TODO 自动生成的 catch 块
e2.printStackTrace();
}
Element elementUL = document.select("[id=list]").first();
Elements elementsLis = elementUL.select("dd");
//小说信息
Element el = document.select("[id=info]").first();
Elements ele = el.select("p");
dmu.ta.append(ele.text().substring(0, ele.text().indexOf("动"))+"\n");
String str1=ele.text().substring(0, ele.text().indexOf("部"));
String str2=ele.text().substring(str1.length()+1, ele.text().length());
dmu.ta.append(str2+"\n");
System.out.println(str2);
int midTime = elementsLis.size();//单位s
//所需时间
jpb2.setMaximum(midTime);
//residueTime(midTime);//倒计时
int count2=0;
boolean flag = false;
int count=0;
int count3=0;
jpb2.setMaximum(elementsLis.size());
//巡官遍历获取到的整个elementsLis集合
for(Element elementLi : elementsLis){
try {
Thread.sleep(10);
} catch (InterruptedException e1) {
// TODO 自动生成的 catch 块
e1.printStackTrace();
}
Element elementA = elementLi.select("a").first();
String href = elementA.attr("href");//获取标签中的属性值(它这里采用的是相对路径的写法)
String imgName = elementA.text();
count3++;
int result1 = imgName.indexOf(start);//匹配章节
if(result1 != -1){
//开始下载
try {
download(novelNa,href,imgName, dmu);
count++;
jpb2.setMinimum(count3);
jpb2.setValue(jpb2.getMinimum()+(count2++));
} catch (IOException e) {
// TODO 自动生成的 catch 块
e.printStackTrace();
}
flag = true;
}else{
//继续往下寻找
if(flag == true) {
try {
download(novelNa,href,imgName, dmu);
count++;
jpb2.setValue(jpb2.getMinimum()+(count2++));
} catch (IOException e) {
// TODO 自动生成的 catch 块
e.printStackTrace();
}
}
}
if(count3==elementsLis.size())
{
dmu.ta.append("下载完成 共 "+count+" 章");
jpb2.setValue(jpb2.getMaximum());
}
}
}
public static void download(String novelNa,String href,String imgName,DownMsgUI dmu) throws IOException {
String netPath = "http://www.xbiquge.la"+href;
Connection newConnection = null;
Document newDocument = null;
try {
Thread.sleep(100);
newConnection = Jsoup.connect(netPath);
Thread.sleep(500);
newDocument = newConnection.get();
} catch(HttpStatusException e) {
System.out.println("下载错误,尝试重新连接");
try {
Thread.sleep(500);
newConnection = Jsoup.connect(netPath);
Thread.sleep(1000);
newDocument = newConnection.get();
} catch(HttpStatusException e1) {
System.out.println("下载错误2,尝试重新连接");
} catch
(InterruptedException e3) {
e.printStackTrace();
}
}
catch
(InterruptedException e) {
e.printStackTrace();
}
try {
Thread.sleep(500);
} catch (InterruptedException e1) {
e1.printStackTrace();
}
Element div = newDocument.select("[id=content]").first();
String divStyle = div.attr("style");
String text = div.text().trim();
text = new String(text.getBytes("UTF-8"),"UTF-8");
String l = System.getProperty("line.separator");
text = text.replace("。", "。"+l);
// System.out.print(text);
System.out.print("正在下载:"+imgName+" ");
FileOutputStream fileOutputStream = new FileOutputStream("./"+novelNa+".txt",true);
fileOutputStream.write(l.getBytes("UTF-8"));
fileOutputStream.write(imgName.getBytes("UTF-8"));
fileOutputStream.write(l.getBytes("UTF-8"));
fileOutputStream.write(text.getBytes("UTF-8"));//不指定则空格乱码,iso编码
fileOutputStream.flush();
fileOutputStream.close();
System.out.println("下载完成");
dmu.ta.append("正在下载:"+imgName+" \n");
long t4 = System.currentTimeMillis();
//double time = (double) (t4-t1)/1000;
// System.out.println("恭喜您已完成全部下载,共耗时:"+time+"秒,下载"+"章");
}
}
// ---- next source file: GetText.java ----
package novelCrawler;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Stand-alone downloader that fetches a page repeatedly and appends the regex matches found
 * on each line to a local text file.
 */
public class GetText {
    /**
     * Steps performed:
     * <ol>
     *   <li>Create a {@link File} for the location where the novel is stored.</li>
     *   <li>Build the regular expressions for the page structure and compile them.</li>
     *   <li>Loop, creating the URL for each chapter request.</li>
     *   <li>Open a {@link BufferedReader} on the network stream.</li>
     *   <li>Open the output writer.</li>
     *   <li>Read the response line by line and apply the regular expressions.</li>
     *   <li>Write the matched content to the local file until the loop ends.</li>
     *   <li>Handle exceptions so that one failed chapter does not stop the others.</li>
     * </ol>
     *
     * <p>NOTE(review): the two pattern literals and the empty-string {@code replace} targets
     * below look as if HTML tag delimiters were lost from them; they are kept verbatim and
     * must be restored from the original project before this program produces useful output.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // 1. File the novel is appended to.
        File file = new File("D:\\File\\three_guo.txt");
        // 2. Patterns for chapter content and chapter title.
        String regex_content = "(.*?)" ;
        String regex_title = "(.*?) ";
        Pattern p_content = Pattern.compile(regex_content);
        Pattern p_title = Pattern.compile(regex_title);
        // 3. One request per chapter.
        for (int i = 1; i <= 120; i++) {
            System.out.println("第" + i + "章开始下载。。。");
            try {
                // NOTE(review): this URL does not depend on i, so every iteration requests
                // the same page — confirm the intended per-chapter URL.
                URL url = new URL("http://www.xbiquge.la/13/13959/");
                // 4./5. try-with-resources closes both streams even when reading or writing
                // fails; the writer is pinned to UTF-8 instead of the platform default so
                // the file encoding matches the decoded input on every machine.
                try (BufferedReader reader = new BufferedReader(
                             new InputStreamReader(url.openStream(), "utf8"));
                     BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
                             new FileOutputStream(file, true),
                             java.nio.charset.StandardCharsets.UTF_8))) {
                    String str;
                    while ((str = reader.readLine()) != null) {
                        Matcher m_title = p_title.matcher(str);
                        Matcher m_content = p_content.matcher(str);
                        // Title: at most one match per line.
                        if (m_title.find()) {
                            String title = m_title.group();
                            // Clean the matched text.
                            title = title.replace("" , "").replace("", "");
                            System.out.println(title);
                            writer.write("第" + i + "章:" + title + "\n");
                        }
                        // Content: every match on the line.
                        while (m_content.find()) {
                            String content = m_content.group();
                            // Clean the matched text.
                            content = content.replace(""
                                    , "").replace("", "").replace(" ", "").replace("?", "");
                            writer.write(content + "\n");
                        }
                    }
                    System.out.println("第" + i + "章下载完成.........");
                    writer.write("\n\n");
                }
            } catch (Exception e) {
                System.out.println("下载失败");
                e.printStackTrace();
            }
        }
    }
}
// ---- next source file: bqgThread.java ----
package novelCrawler;
/**
 * Minimal wait/notify gate built on this object's monitor.
 *
 * <p>NOTE(review): {@link #DownWait()} waits without a guarding condition, so it also
 * returns on a spurious wake-up; callers that need a strict pause should re-check their
 * own state after it returns.
 */
public class bqgThread {
    /**
     * Blocks the calling thread on this object's monitor until {@link #DownNotify()} is
     * called or the thread is interrupted. On interruption the method returns normally and
     * the thread's interrupt flag is set again so the caller can detect it.
     */
    public synchronized void DownWait() {
        try {
            this.wait();
        } catch (InterruptedException e) {
            // Restore the flag: swallowing it would hide the cancellation request.
            Thread.currentThread().interrupt();
        }
    }

    /** Wakes one thread that is currently blocked in {@link #DownWait()}, if any. */
    public synchronized void DownNotify() {
        this.notify();
    }
}
package ui;
import java.awt.*;
import java.awt.event.ActionEvent;
import java.awt.event.ActionListener;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.ServerSocket;
import javax.swing.JFrame;
import javax.swing.JList;
import javax.swing.JPanel;
import javax.swing.JProgressBar;
import javax.swing.JTextArea;
import javax.swing.event.ChangeEvent;
import javax.swing.event.ChangeListener;
import novelCrawler.biquge;
public class crawlerUI implements ActionListener {
TextField novelName = new TextField(10);
TextField novelSrc = new TextField("笔趣阁",10);
TextField start = new TextField(10);
JProgressBar jpb = new JProgressBar();
JPanel jp = new JPanel();
JProgressBar jpb2 = new JProgressBar();
JFrame myui = new JFrame("novelCrawler");
Button dl = new Button("开始下载");
public crawlerUI() {
myui.setSize(400,300);
myui.setLayout(null);
Label label1=new Label("书 源");
label1.setBounds(10, 15, 30, 20);
myui.add(label1);
novelSrc.setBounds(70,15, 90, 20);
myui.add(novelSrc);
Label label2=new Label("书 名");
label2.setBounds(10, 45, 30, 20);
myui.add(label2);
novelName.setBounds(70,45, 90, 20);
myui.add(novelName);
Label label3=new Label("开始章节");
label3.setBounds(10, 75, 60, 20);
myui.add(label3);
jpb2.setBounds(70, 150, 200, 30);
myui.add(jpb2);
start.setBounds(70, 75, 90, 20);
myui.add(start);
jpb.setBounds(70, 100, 200, 30);
myui.add(jpb);
jpb.setValue(1);
jpb.setStringPainted(true);
jpb.setVisible(false);
jpb2.setVisible(false);
jpb2.setStringPainted(true);
dl.setBounds(150, 200, 70,40 );
myui.add(dl);
dl.addActionListener(this);
myui.setVisible(true);
// myui.setResizable(true);
}
static int stop = 0;
public void actionPerformed(ActionEvent ev) {
String name = novelName.getText();
System.out.println(name);
String start1 = start.getText();
System.out.println(start1);
Thread bqg = new Thread(new Runnable() {
@Override
public void run() {
biquge bq = new biquge(name,start1,jpb,jpb2);
}
});
if(stop==1) {
dl.setLabel("继续");
stop = 2;
}
else if(stop==2){
dl.setLabel("暂停");
stop = 1;
}
else {
bqg.start();
stop = 1;
dl.setLabel("暂停");
}
}
}
package ui;
import javax.swing.*;
import javax.swing.JTextArea;
/**
 * Window that displays download messages in a scrollable text area.
 */
public class DownMsgUI {
    /** Top-level frame of the message window. */
    public JFrame dm;
    /** Text area that callers append status lines to. */
    public JTextArea ta;
    /** Scroll pane wrapping {@link #ta}. */
    public JScrollPane sp;

    /** Creates the frame, places the text area inside a scroll pane, and shows the window. */
    public DownMsgUI() {
        this.dm = new JFrame("下载信息");
        this.ta = new JTextArea();
        this.sp = new JScrollPane(this.ta);

        this.dm.setBounds(500, 500, 500, 500);
        this.ta.setBounds(70, 200, 200, 100);

        this.dm.getContentPane().add(this.sp);
        this.dm.setVisible(true);
    }
}