java爬虫爬取笔趣阁小说

java爬虫爬取笔趣阁小说

java爬虫爬取笔趣阁小说_第1张图片
java爬虫爬取笔趣阁小说_第2张图片
java爬虫爬取笔趣阁小说_第3张图片
java爬虫爬取笔趣阁小说_第4张图片

package novelCrawler;

import org.jsoup.Connection;
import org.jsoup.HttpStatusException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import ui.DownMsgUI;
import ui.crawlerUI;

import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.Timer;
import java.util.TimerTask;

import javax.sound.sampled.AudioFormat.Encoding;
import javax.swing.JFrame;
import javax.swing.JOptionPane;
import javax.swing.JProgressBar;
public class biquge {
     

	   public biquge(String nn,String start,JProgressBar jpbVal,JProgressBar jpb2) {
     
	        long t1 = System.currentTimeMillis();
	        
	        //查找小说
	        Connection connection1 = Jsoup.connect("http://www.xbiquge.la/xiaoshuodaquan/");
	        
	        Document document1 = null;
			try {
     
				document1 = connection1.get();
			} catch (IOException e2) {
     
				// TODO 自动生成的 catch 块
				e2.printStackTrace();
			}
	        Elements elementsLis1=null;
			try {
     
	        Element elementUL1 = document1.select("[class=novellist]").first();
	        elementsLis1 = elementUL1.select("li");
			} catch (Exception e1) {
     
				// TODO 自动生成的 catch 块
				e1.printStackTrace();
			}
			jpbVal.setVisible(true);
			int count = 0;
			 DownMsgUI dmu = new DownMsgUI();//消息框
	        //巡官遍历获取到的整个elementsLis集合
	        for(Element elementLi1 : elementsLis1){
     
	        	
	        	try {
     
					Thread.sleep(200);
				} catch (InterruptedException e1) {
     
					// TODO 自动生成的 catch 块
					e1.printStackTrace();
				}
	        	
	            Element elementA1 = elementLi1.select("a").first();      
	            String href = elementA1.attr("href");//获取标签中的属性值(它这里采用的是相对路径的写法
	            String novelNa = elementA1.text();//小说名
	            
	            
	            jpbVal.setMaximum(elementsLis1.size());
	            
	          
	            
	            int result1 = novelNa.indexOf(nn);//匹配小说名
	            if(result1 != -1){
     
	            	//跟进小说
	            	jpbVal.setValue(elementsLis1.size());
	            	//JOptionPane msg = new JOptionPane();
	            	//msg.setBounds(100, 100, 100, 100);
	            	//JOptionPane.showMessageDialog(jpb2,"已找到小说:"+novelNa);
	            	dmu.ta.append("找到小说: "+novelNa+"\n");
	            	connectNovel(novelNa,href,start,jpb2,dmu);
	            	break;
	            	
	            }else{
     
	            	//继续往下寻找
	            	//System.out.println("未找到小说!");
	            	jpbVal.setValue(count);
	            }
	            count++;
	            if(count>=elementsLis1.size()) {
     
	            	System.out.println("未找到小说!");
	            }
	           
	    }
	   }    
	        
	        
	    /    
	  public void connectNovel(String novelNa,String h,String start,JProgressBar jpb2,DownMsgUI dmu) {
        
	        //1.与我们要爬取数据的页面建立连接
	       Connection connection = Jsoup.connect(h);
	        
	        jpb2.setVisible(true);

	        Document document = null;
			try {
     
				document = connection.get();
			} catch (IOException e2) {
     
				// TODO 自动生成的 catch 块
				e2.printStackTrace();
			}
	        
	 
	  
	        Element elementUL = document.select("[id=list]").first();
	        Elements elementsLis = elementUL.select("dd");

	     //小说信息
	        Element el = document.select("[id=info]").first();
	        Elements ele = el.select("p");
	        dmu.ta.append(ele.text().substring(0, ele.text().indexOf("动"))+"\n");
	        String str1=ele.text().substring(0, ele.text().indexOf("部"));
	        String str2=ele.text().substring(str1.length()+1, ele.text().length());
	        dmu.ta.append(str2+"\n");
	        System.out.println(str2);
	        
	        int midTime = elementsLis.size();//单位s
	        //所需时间
	        jpb2.setMaximum(midTime);
	        //residueTime(midTime);//倒计时
	        
	        
	       int  count2=0;
	      
	          boolean flag = false;
	          
	          int  count=0;
	          int count3=0;
	          jpb2.setMaximum(elementsLis.size());
	        //巡官遍历获取到的整个elementsLis集合
	        for(Element elementLi : elementsLis){
     
	        	try {
     
					Thread.sleep(10);
				} catch (InterruptedException e1) {
     
					// TODO 自动生成的 catch 块
					e1.printStackTrace();
				}
	        	
	            Element elementA = elementLi.select("a").first();      
	            String href = elementA.attr("href");//获取标签中的属性值(它这里采用的是相对路径的写法)

	            String imgName = elementA.text();
	            	
	            	count3++;
	          
	   	     

	            int result1 = imgName.indexOf(start);//匹配章节
	            if(result1 != -1){
     
	            	//开始下载
	            	
	            	try {
     
						download(novelNa,href,imgName, dmu);
						count++;
						
						jpb2.setMinimum(count3);
						jpb2.setValue(jpb2.getMinimum()+(count2++));
					} catch (IOException e) {
     
						// TODO 自动生成的 catch 块
						e.printStackTrace();
					}
	            	flag = true;
	            }else{
     
	            	//继续往下寻找
	            	if(flag == true) {
     
	            		try {
     
							download(novelNa,href,imgName, dmu);
							count++;
							jpb2.setValue(jpb2.getMinimum()+(count2++));
						} catch (IOException e) {
     
							// TODO 自动生成的 catch 块
							e.printStackTrace();
						}
	            	}
	            }

	           if(count3==elementsLis.size())
	           {
     
	        	   dmu.ta.append("下载完成     共  "+count+" 章");
	        	   jpb2.setValue(jpb2.getMaximum());
	           }
	    }
	        
	        
	   }    

	    public static void download(String novelNa,String href,String imgName,DownMsgUI dmu) throws IOException {
     
	    	 String netPath = "http://www.xbiquge.la"+href;

	            Connection newConnection = null; 
	            Document newDocument  = null; 
	            
	            try {
     
					Thread.sleep(100);
					 
	                 newConnection = Jsoup.connect(netPath);
	                 Thread.sleep(500);
	                  newDocument = newConnection.get();
				} catch(HttpStatusException e) {
     
					System.out.println("下载错误,尝试重新连接");
					 try {
     
							Thread.sleep(500);
							 
			                 newConnection = Jsoup.connect(netPath);
			                 Thread.sleep(1000);
			                  newDocument = newConnection.get();
						} catch(HttpStatusException e1) {
     
							System.out.println("下载错误2,尝试重新连接");
							
						} catch
			            (InterruptedException e3) {
     
							
							e.printStackTrace();
						}
					
				}
	            catch
	            (InterruptedException e) {
     
				
					e.printStackTrace();
				}
	          
	            
	            try {
     
					Thread.sleep(500);
				} catch (InterruptedException e1) {
     
					
					e1.printStackTrace();
				}
	            
	            Element div = newDocument.select("[id=content]").first();
	         
	            String divStyle = div.attr("style");
	            String text = div.text().trim();
	            text = new String(text.getBytes("UTF-8"),"UTF-8");
	            String l = System.getProperty("line.separator");
	            
	        text = text.replace("。", "。"+l);
	       // System.out.print(text);
	            System.out.print("正在下载:"+imgName+"    ");
	            FileOutputStream fileOutputStream = new FileOutputStream("./"+novelNa+".txt",true);
	            fileOutputStream.write(l.getBytes("UTF-8"));
	            fileOutputStream.write(imgName.getBytes("UTF-8"));
	            fileOutputStream.write(l.getBytes("UTF-8"));
	            fileOutputStream.write(text.getBytes("UTF-8"));//不指定则空格乱码,iso编码
	            fileOutputStream.flush();
	            fileOutputStream.close();
	            System.out.println("下载完成");
	           
	        	dmu.ta.append("正在下载:"+imgName+"    \n");
	        	
	        long t4 = System.currentTimeMillis();
	        //double time = (double) (t4-t1)/1000;
	       // System.out.println("恭喜您已完成全部下载,共耗时:"+time+"秒,下载"+"章");
	    }

	    
	    
}

	

```java
package novelCrawler;


import java.io.BufferedReader;

import java.io.BufferedWriter;

import java.io.File;

import java.io.FileOutputStream;

import java.io.InputStreamReader;

import java.io.OutputStreamWriter;

import java.net.URL;

import java.util.regex.Matcher;

import java.util.regex.Pattern;

 

public class GetText {

    /**
     * Downloads chapter pages with plain {@code java.net.URL} streams and appends the extracted
     * title and paragraphs to a local text file.
     *
     * <ol>
     *   <li>Create a {@code File} for the location where the novel is stored.</li>
     *   <li>Build the regular expressions for the page structure and compile them.</li>
     *   <li>Loop over the chapters, creating a URL for each request.</li>
     *   <li>Read the response through a {@code BufferedReader}.</li>
     *   <li>Open the writer for the output file.</li>
     *   <li>Read the response line by line and match it against the patterns.</li>
     *   <li>Write the matched text to the local file until the loop ends.</li>
     *   <li>Failures of one chapter are reported and the loop continues.</li>
     * </ol>
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // 1. File the novel is appended to.
        File file = new File("D:\\File\\three_guo.txt");

        // 2. Patterns for the page structure.
        // NOTE(review): the HTML tags inside these literals were stripped when the article was
        // extracted; <p> and <title> are reconstructions - confirm against the real page markup.
        String regex_content = "<p>(.*?)</p>";
        String regex_title = "<title>(.*?)</title>";
        Pattern p_content = Pattern.compile(regex_content);
        Pattern p_title = Pattern.compile(regex_title);

        // 3. One request per chapter.
        for (int i = 1; i <= 120; i++) {
            System.out.println("第" + i + "章开始下载。。。");
            try {
                // NOTE(review): this URL does not depend on i, so every iteration requests the
                // same page. The per-chapter URL scheme is not visible here - TODO supply it.
                URL url = new URL("http://www.xbiquge.la/13/13959/");

                // 4./5. Reader for the response and writer for the file; try-with-resources
                // closes both even when reading or writing fails. The writer uses the same
                // charset as the reader instead of the platform default.
                try (BufferedReader reader =
                             new BufferedReader(new InputStreamReader(url.openStream(), "utf8"));
                     BufferedWriter writer = new BufferedWriter(
                             new OutputStreamWriter(new FileOutputStream(file, true), "utf8"))) {

                    String str;
                    while ((str = reader.readLine()) != null) {
                        Matcher m_title = p_title.matcher(str);
                        Matcher m_content = p_content.matcher(str);

                        // Title: the capture group already excludes the surrounding tags.
                        if (m_title.find()) {
                            String title = m_title.group(1);
                            System.out.println(title);
                            writer.write("第" + i + "章:" + title + "\n");
                        }

                        // Content: strip spaces and stray '?' characters from each paragraph.
                        // NOTE(review): the " " literal may have been an HTML entity such as
                        // &nbsp; before extraction - confirm.
                        while (m_content.find()) {
                            String content = m_content.group(1).replace(" ", "").replace("?", "");
                            writer.write(content + "\n");
                        }
                    }

                    System.out.println("第" + i + "章下载完成.........");
                    writer.write("\n\n");
                }
            } catch (Exception e) {
                System.out.println("下载失败");
                e.printStackTrace();
            }
        }
    }
}

```

```java
package novelCrawler;

/**
 * Minimal monitor object exposing a blocking wait and a wake-up call.
 *
 * <p>NOTE(review): presumably intended for pausing and resuming a download; nothing in the
 * visible sources calls it - confirm before relying on it.
 */
public class bqgThread {

    /**
     * Blocks the calling thread on this object's monitor until another thread calls
     * {@link #DownNotify()} or the calling thread is interrupted.
     *
     * <p>No condition is re-checked after waking, so callers must tolerate the spurious wake-ups
     * that {@link Object#wait()} permits. On interruption the method returns with the thread's
     * interrupt status restored, so the caller can still observe the interruption.
     */
    public synchronized void DownWait() {
        try {
            this.wait();
        } catch (InterruptedException e) {
            // wait() clears the flag when it throws; set it again instead of losing the signal.
            Thread.currentThread().interrupt();
        }
    }

    /**
     * Wakes one thread currently blocked in {@link #DownWait()}, if any. A call made while no
     * thread is waiting has no effect and is not remembered.
     */
    public synchronized void DownNotify() {
        this.notify();
    }
}

package ui;
import java.awt.*;
import java.awt.event.ActionEvent;
import java.awt.event.ActionListener;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.ServerSocket;

import javax.swing.JFrame;
import javax.swing.JList;
import javax.swing.JPanel;
import javax.swing.JProgressBar;
import javax.swing.JTextArea;
import javax.swing.event.ChangeEvent;
import javax.swing.event.ChangeListener;

import novelCrawler.biquge;
public class crawlerUI implements ActionListener {
     
	TextField novelName = new TextField(10); 
	TextField novelSrc = new TextField("笔趣阁",10); 
	TextField start = new TextField(10); 
	JProgressBar jpb = new JProgressBar(); 
	 JPanel jp = new JPanel();
	 JProgressBar jpb2 = new JProgressBar(); 
	 JFrame myui = new JFrame("novelCrawler");
	 Button dl = new Button("开始下载");
	 
	public crawlerUI() {
      
		
               myui.setSize(400,300);  
               myui.setLayout(null);
               Label label1=new Label("书  源");
               label1.setBounds(10, 15, 30, 20);
               
               myui.add(label1);
               
               novelSrc.setBounds(70,15, 90, 20);
               myui.add(novelSrc);
               
               Label label2=new Label("书  名");
               label2.setBounds(10, 45, 30, 20);
               myui.add(label2);
               
               novelName.setBounds(70,45, 90, 20);
               myui.add(novelName);
             
               Label label3=new Label("开始章节");
               label3.setBounds(10, 75, 60, 20);
               myui.add(label3);
               
               jpb2.setBounds(70, 150, 200, 30);
               myui.add(jpb2);
               start.setBounds(70, 75, 90, 20);
               myui.add(start);
               
               jpb.setBounds(70, 100, 200, 30);
               myui.add(jpb);
               
           
               
               jpb.setValue(1);
               jpb.setStringPainted(true);
              jpb.setVisible(false);
              jpb2.setVisible(false);
              jpb2.setStringPainted(true);
              
               
               dl.setBounds(150, 200, 70,40 );
               myui.add(dl);
               
          
               dl.addActionListener(this);
        myui.setVisible(true);               
       // myui.setResizable(true);
        
	}
	static int stop = 0;
	public void actionPerformed(ActionEvent ev) {
     
		String name = novelName.getText();
		System.out.println(name);
		String start1 = start.getText();
		System.out.println(start1);
		
		Thread bqg = new Thread(new Runnable() {
     
            @Override
            public void run() {
     
            	
                biquge bq = new biquge(name,start1,jpb,jpb2);
                
            }
           
        });
		
		
	
		if(stop==1) {
     
			dl.setLabel("继续");
			stop = 2;
			
			
			}
			else if(stop==2){
     
				dl.setLabel("暂停");
				stop = 1;
			
			}
			else {
     

				
				bqg.start();
				stop = 1;
				dl.setLabel("暂停");
			}
		
		
	}
	
	
}

package ui;
import javax.swing.*;
import javax.swing.JTextArea;

/**
 * Window titled "下载信息" holding a scrollable text area; other classes append their progress
 * messages to {@link #ta}. Constructing an instance shows the window immediately.
 */
public class DownMsgUI {

    /** The top-level window. */
    public JFrame dm;

    /** Text area that receives the messages. */
    public JTextArea ta;

    /** Scroll pane wrapping {@link #ta}; the only component added to the window. */
    public JScrollPane sp;

    /** Builds the window, wires the text area into a scroll pane and makes the window visible. */
    public DownMsgUI() {
        JFrame frame = new JFrame("下载信息");
        JTextArea area = new JTextArea();
        JScrollPane scroller = new JScrollPane(area);

        frame.setBounds(500, 500, 500, 500);
        area.setBounds(70, 200, 200, 100);
        frame.add(scroller);

        // Publish the components before the window is shown.
        this.dm = frame;
        this.ta = area;
        this.sp = scroller;

        frame.setVisible(true);
    }
}

你可能感兴趣的:(笔记)