爬虫

package com.tanzhou.spiders;



import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.List;


import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Crawler test program: downloads one Baidu search result page and saves the
 * page head plus the collected result elements into a static HTML file.
 *
 * @author Administrator
 */
public class Main {

    /** File that every call to {@link #processPage(String, int)} overwrites and returns. */
    private static final String OUTPUT_PATH = "D:/workspace/Spiders/WebContent/jsp/css.html";

    /** Number of consecutive element ids probed on one result page. */
    private static final int RESULTS_PER_PAGE = 10;

    /**
     * Fetches the Baidu result page for {@code word} at offset {@code x}, appends the
     * HTML of the elements whose ids are {@code x+1 .. x+10} (ids {@code 1 .. x+10}
     * when {@code x <= 0}) to the list returned by {@code Ha.getList()}, and writes
     * the page's {@code <head>} followed by every entry of that list to the output file.
     *
     * <p>NOTE(review): {@code Ha} is declared outside this file. If {@code getList()}
     * hands out a shared list, entries from earlier calls are written again on each
     * call -- confirm that this accumulation is intended.
     *
     * @param word search keyword; URL-encoded before being placed in the query string,
     *             {@code null} is treated as an empty keyword
     * @param x    result offset passed to Baidu as the {@code pn} parameter
     * @return the absolute path of the HTML file that was written
     * @throws IOException if the page cannot be fetched or the file cannot be written
     */
    @SuppressWarnings({ "rawtypes", "unchecked" })
    public static String processPage(String word, int x) throws IOException {
        // Kept as a raw List: the element type declared by Ha.getList() is not visible here.
        List list = Ha.getList();

        // Encode the keyword so characters such as '&', '#', '+' or '%' cannot corrupt the query.
        String query = java.net.URLEncoder.encode(word == null ? "" : word, "UTF-8");
        Document doc = Jsoup.connect("https://www.baidu.com/s?wd=" + query + "&pn=" + x).get();

        Elements head = doc.select("head");

        int firstId = x > 0 ? x + 1 : 1;
        int endId = x + RESULTS_PER_PAGE + 1; // exclusive upper bound
        for (int id = firstId; id < endId; id++) {
            Element result = doc.getElementById(String.valueOf(id));
            // A page may contain fewer results than probed; getElementById returns null then.
            if (result != null) {
                list.add(result.toString());
            }
        }

        // FileOutputStream creates the file when it is missing; try-with-resources
        // flushes and closes the writer even if a write fails.
        try (Writer writer = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(OUTPUT_PATH), "UTF-8"))) {
            writer.write(head.toString());
            for (Object item : list) {
                writer.write(String.valueOf(item));
            }
        }
        return OUTPUT_PATH;
    }

    /**
     * Manual smoke test: crawls the results for "haha" starting at offset 10.
     *
     * @param args unused
     * @throws IOException if fetching the page or writing the file fails
     */
    public static void main(String[] args) throws IOException {
        processPage("haha", 10);
    }
}


    

package com.tanzhou.spiders;

import java.io.IOException;

import javax.servlet.ServletException;
import javax.servlet.annotation.WebServlet;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;




/**
 * Servlet mapped to {@code /SpiderServlet}: runs the crawler for the submitted
 * keyword and forwards the request to the HTML file the crawler produced.
 */
@WebServlet(urlPatterns = "/SpiderServlet")
public class SpiderServlet extends HttpServlet {

    private static final long serialVersionUID = 1L;

    /** Offset step between two consecutive result pages. */
    private static final int RESULTS_PER_PAGE = 10;

    /** Result count used when the {@code num} parameter is missing or not a number. */
    private static final int DEFAULT_RESULT_COUNT = 10;

    /** Upper bound on pages crawled per request, so a huge {@code num} cannot trigger unbounded fetching. */
    private static final int MAX_PAGES = 10;

    /**
     * Crawls {@code num / 10} result pages (at least one, at most {@link #MAX_PAGES})
     * for the {@code word} parameter, then forwards to the generated page, addressed
     * by the part of the crawler's output path starting at {@code "jsp"}.
     *
     * @param request  must carry {@code word}; {@code num} is optional
     * @param response receives a 400 error when {@code word} is missing or blank, and a
     *                 500 error when the crawler's output path contains no {@code "jsp"} segment
     * @throws ServletException if forwarding fails
     * @throws IOException      if crawling, forwarding or sending an error fails
     */
    @Override
    protected void doPost(HttpServletRequest request, HttpServletResponse response)
            throws ServletException, IOException {
        request.setCharacterEncoding("utf-8");

        String word = request.getParameter("word");
        if (word == null || word.trim().isEmpty()) {
            response.sendError(HttpServletResponse.SC_BAD_REQUEST, "Missing required parameter: word");
            return;
        }

        int pages = pageCount(request.getParameter("num"));
        String outputPath = null;
        for (int page = 0; page < pages; page++) {
            outputPath = Main.processPage(word, page * RESULTS_PER_PAGE);
        }

        int jspIndex = outputPath == null ? -1 : outputPath.indexOf("jsp");
        if (jspIndex < 0) {
            response.sendError(HttpServletResponse.SC_INTERNAL_SERVER_ERROR,
                    "Crawler output is not located under a jsp directory");
            return;
        }

        String relativePath = outputPath.substring(jspIndex);
        log("Forwarding to /" + relativePath);
        request.getRequestDispatcher("/" + relativePath).forward(request, response);
    }

    /**
     * This override does nothing, so GET requests are not processed by the crawler.
     */
    @Override
    protected void doGet(HttpServletRequest req, HttpServletResponse resp)
            throws ServletException, IOException {
        // Deliberately left without a body, as in the original implementation.
    }

    /** Converts the raw {@code num} parameter into a page count clamped to {@code [1, MAX_PAGES]}. */
    private static int pageCount(String num) {
        int requested = DEFAULT_RESULT_COUNT;
        if (num != null) {
            try {
                requested = Integer.parseInt(num.trim());
            } catch (NumberFormatException ignored) {
                // Not a number: fall back to the default result count instead of failing the request.
            }
        }
        return Math.max(1, Math.min(MAX_PAGES, requested / RESULTS_PER_PAGE));
    }
}

你可能感兴趣的:(爬虫)