Jsoup爬数据+设置代理IP

本文利用Jsoup工具从代理网站抓取代理IP(IP:端口),然后为每次请求动态切换HTTP代理,通过这些代理对目标URL进行远程访问。
主要工作类:

/**
 * Entry point of the demo: scrapes a list of "host:port" proxy addresses with
 * jsoup, reads a list of target URLs from a local file, and starts one
 * {@link MyRun} task per URL that requests the URL through each proxy in turn.
 */
public class Test {

    /** Page the proxy table is scraped from. */
    private static final String PROXY_LIST_URL = "http://www.xicidaili.com/nt";

    /** Local file read by {@link FileUtil#getListFromFile(String)}; one target URL per line. */
    private static final String ARTICLE_FILE = "/Users/tianjia/Documents/article";

    /**
     * Program entry point; delegates to {@link #parse()}.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        parse();
    }

    /**
     * Fetches the proxy list, loads the target URLs and submits one
     * {@link MyRun} per URL to a cached thread pool.
     *
     * <p>If the proxy list cannot be downloaded or is empty, nothing is
     * submitted: the workers would have no proxy to use.
     */
    public static void parse() {
        List<String> proxies;
        try {
            proxies = getHtml();
        } catch (IOException e) {
            e.printStackTrace();
            // Without a proxy list the workers have nothing to iterate over.
            return;
        }
        if (proxies.isEmpty()) {
            System.err.println("No proxies found at " + PROXY_LIST_URL);
            return;
        }

        List<String> articles = FileUtil.getListFromFile(ARTICLE_FILE);
        ExecutorService executorService = Executors.newCachedThreadPool();
        try {
            for (String article : articles) {
                executorService.execute(new MyRun(article, proxies));
            }
        } finally {
            // Already-submitted tasks still run; this only stops new
            // submissions and lets the pool threads terminate afterwards.
            executorService.shutdown();
        }
    }

    /**
     * Downloads the proxy page and extracts one "host:port" string per
     * matching table row.
     *
     * <p>Rows are selected with {@code tr.odd}; the text of the row's child 1
     * is used as the host and the text of child 2 as the port.
     *
     * @return the scraped "host:port" strings, possibly empty, never null
     * @throws IOException if the page cannot be fetched
     */
    private static List<String> getHtml() throws IOException {
        // Let the IOException propagate: the caller handles it, and there is
        // no document to parse if the request failed.
        Document doc = Jsoup.connect(PROXY_LIST_URL)
                .userAgent("Mozilla")
                .get();

        List<String> list = new ArrayList<String>();
        Elements rows = doc.select("tr.odd");
        for (Element row : rows) {
            // Skip rows that do not have both the host and the port cell.
            if (row.children().size() < 3) {
                continue;
            }
            list.add(row.child(1).text() + ":" + row.child(2).text());
        }
        return list;
    }

    /**
     * Requests {@code url} with an HTTP GET through the given proxy and
     * discards the response.
     *
     * <p>The proxy is set on this single connection instead of through the
     * JVM-wide {@code http.proxyHost}/{@code http.proxyPort} system
     * properties, because this method is called from several threads at once
     * and a shared global setting would let them overwrite each other's proxy.
     *
     * @param ip  proxy address in the form "host:port"; malformed values are
     *            reported on stderr and skipped
     * @param url the URL to request
     */
    public static void visit(String ip, String url) {
        if (ip == null || url == null) {
            return;
        }
        int sep = ip.lastIndexOf(':');
        if (sep <= 0 || sep == ip.length() - 1) {
            System.err.println("Skipping malformed proxy address: " + ip);
            return;
        }
        String host = ip.substring(0, sep).trim();
        int port;
        try {
            port = Integer.parseInt(ip.substring(sep + 1).trim());
        } catch (NumberFormatException e) {
            System.err.println("Skipping proxy with non-numeric port: " + ip);
            return;
        }

        try {
            Jsoup.connect(url)
                    .proxy(host, port)
                    .userAgent("Mozilla")
                    .get();
        } catch (IOException e) {
            // A failing proxy must not stop the caller from trying the next one.
            e.printStackTrace();
        }
    }

}

自定义线程类:

/**
 * Task that requests a single URL once through every proxy in a list,
 * pausing one second after each request.
 */
public class MyRun implements Runnable {

    /** Proxy addresses passed to {@code Test.visit}, one request per entry. */
    private final List<String> list;
    /** The URL requested through each proxy. */
    private final String urlString;

    /**
     * @param url  the URL to request
     * @param list proxy addresses to request it through; a null list is
     *             treated as empty
     */
    public MyRun(String url, List<String> list) {
        this.list = list;
        this.urlString = url;
    }

    /**
     * Calls {@code Test.visit} for each proxy in order, sleeping 1000 ms
     * after each call. Stops early if the thread is interrupted.
     */
    @Override
    public void run() {
        if (list == null) {
            return;
        }
        for (String proxy : list) {
            Test.visit(proxy, urlString);
            try {
                Thread.sleep(1000);
            } catch (InterruptedException e) {
                // Restore the interrupt flag and stop, so a pool shutdown
                // is honoured instead of being swallowed.
                Thread.currentThread().interrupt();
                return;
            }
        }
    }

}

文件操作类:

/**
 * File helper used to load the list of target URLs.
 */
public class FileUtil {

    /**
     * Reads a text file line by line, echoing each line to stdout and
     * collecting it into a list.
     *
     * <p>The file is decoded with the platform default charset. If an
     * {@link IOException} occurs (including a missing file), the stack trace
     * is printed and the lines read so far are returned.
     *
     * @param path path of the file to read
     * @return the lines read, in file order; empty if nothing could be read
     */
    public static List<String> getListFromFile(String path) {
        List<String> list = new ArrayList<>();
        // try-with-resources closes the reader (and the underlying stream)
        // on both the normal and the exceptional path.
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(new FileInputStream(path)))) {
            String data;
            while ((data = br.readLine()) != null) {
                System.out.println(data);
                list.add(data);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return list;
    }
}

你可能感兴趣的:(Jsoup)