一段基于Jsoup和dom4j的海报爬取小程序

/**
 * 
 */
package com.pan.tools;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.ResourceBundle;

import org.dom4j.Document;
import org.dom4j.DocumentHelper;
import org.dom4j.Element;
import org.dom4j.io.OutputFormat;
import org.dom4j.io.XMLWriter;
import org.jsoup.Jsoup;
import org.jsoup.select.Elements;

/**
 * Fetches the poster entries from the Douban movie page and writes them to an
 * XML file.
 *
 * <p>All configuration is read from the {@code xmlCN} resource bundle:
 * {@code douban} (page URL), {@code xmlPath} (output directory) and
 * {@code fileName} (output file name without the {@code .xml} extension).
 *
 * @author Javay
 *
 * 2012-9-7 3:13:10 PM
 */
public class MovieRssCNGenerator {

	/** Configuration source for the page URL and the output location. */
	private final ResourceBundle bundle = ResourceBundle.getBundle("xmlCN");

	/** Total number of connection attempts before giving up. */
	private final static int RETRY_TIME = 3;

	/** Pause between two connection attempts, in milliseconds. */
	private final static int RETRY_DELAY_MS = 3000;

	/** Timeout passed to the jsoup connection, in milliseconds. */
	private final static int TIMEOUT_MS = 20000;

	/**
	 * Formats the current time as {@code yyyy-MM-dd HH:mm:ss}.
	 *
	 * @return the formatted current date and time
	 */
	public static String getDateTime() {
		// SimpleDateFormat is not thread-safe, so a fresh instance is used per call.
		return new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(new Date());
	}

	/**
	 * Fetches the page configured under the {@code douban} bundle key and
	 * extracts the {@code li.poster} entries inside the first
	 * {@code div.screening-bd} element.
	 *
	 * <p>The request is attempted up to {@link #RETRY_TIME} times, pausing
	 * between attempts, whenever an {@link IOException} occurs.
	 *
	 * @return the matching poster elements (possibly empty), or {@code null}
	 *         when every attempt failed, the thread was interrupted while
	 *         waiting to retry, or the page has no {@code div.screening-bd}
	 */
	public Elements getDoubanMovieSlidePic() {
		for (int attempt = 1; attempt <= RETRY_TIME; attempt++) {
			try {
				// NOTE(review): the "query"/"auth" values and the POST method look
				// like placeholders - confirm the target page actually needs them.
				org.jsoup.nodes.Document doc = Jsoup
						.connect(bundle.getString("douban")).data("query", "Java")
						.userAgent("Mozilla").cookie("auth", "token")
						.timeout(TIMEOUT_MS).post();

				org.jsoup.nodes.Element screeningbd = doc
						.select("div.screening-bd").first();
				if (screeningbd == null) {
					// first() yields null when nothing matched; retrying would
					// fetch the same markup, so report the failure right away.
					System.out.println("页面中未找到div.screening-bd节点,无法解析海报信息。");
					return null;
				}
				return screeningbd.select("li.poster");

			} catch (IOException e) {
				if (attempt >= RETRY_TIME) {
					// Last attempt: surface the cause instead of dropping it.
					e.printStackTrace();
					break;
				}
				System.out.println("请求超时,进行第" + attempt + "次重连。");
				try {
					Thread.sleep(RETRY_DELAY_MS);
				} catch (InterruptedException e1) {
					// Keep the interrupt visible to the caller and stop retrying.
					Thread.currentThread().interrupt();
					return null;
				}
			}
		}

		return null;
	}

	/**
	 * Builds the {@code <movies>} document from the given poster elements and
	 * writes it to {@code <xmlPath>/<fileName>.xml}.
	 *
	 * <p>When {@code items} is {@code null} a failure message is printed and
	 * nothing is written.
	 *
	 * @param items
	 *            poster elements as returned by
	 *            {@link #getDoubanMovieSlidePic()}; may be {@code null}
	 */
	public void createXMLDoc(Elements items) {
		if (items == null) {
			System.out.println("数据读取失败!程序终止!");
			return;
		}
		writeDocument(buildDocument(items));
	}

	/**
	 * Converts the poster elements into a dom4j document with one
	 * {@code <movie>} (title, link, pic) per element.
	 */
	private Document buildDocument(Elements items) {
		Document doc = DocumentHelper.createDocument();
		doc.addComment("panmay.com" + getDateTime());
		Element root = doc.addElement("movies");

		for (org.jsoup.nodes.Element item : items) {
			Elements images = item.select("img");
			Element movie = root.addElement("movie");

			movie.addElement("title").setText(images.attr("alt").trim());
			movie.addElement("link").setText(item.select("a").attr("href"));

			// Use "data-original" when present, otherwise fall back to "src".
			String img = images.attr("data-original");
			if ("".equals(img)) {
				img = images.attr("src");
			}
			movie.addElement("pic").setText(img.trim());
		}
		return doc;
	}

	/**
	 * Writes the document as pretty-printed UTF-8 XML to the configured
	 * location, creating the output directory if necessary.
	 */
	private void writeDocument(Document doc) {
		String directory = bundle.getString("xmlPath");
		String fileName = bundle.getString("fileName");
		OutputFormat format = OutputFormat.createPrettyPrint();
		format.setEncoding("UTF-8");

		File target;
		if (directory.length() == 0) {
			// No directory configured: write relative to the working directory.
			target = new File(fileName + ".xml");
		} else {
			File dir = new File(directory);
			if (!dir.exists()) {
				System.out.println("目录不存在,创建一个新的文件输出路径: " + dir);
				if (!dir.mkdirs() && !dir.isDirectory()) {
					System.out.println("无法创建输出目录: " + dir);
					return;
				}
			}
			// File(parent, child) inserts the separator when xmlPath lacks one,
			// so the file always ends up inside the directory created above.
			target = new File(dir, fileName + ".xml");
		}

		try {
			FileOutputStream fos = new FileOutputStream(target);
			try {
				XMLWriter writer = new XMLWriter(fos, format);
				writer.write(doc);
				writer.close();
				// Reached only when both the write and the close succeeded.
				System.out.println(fileName + "文件输出完毕!");
			} finally {
				// Releases the stream on every failure path, including a failing
				// XMLWriter constructor; closing it a second time is harmless.
				fos.close();
			}
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

	/**
	 * Fetches the posters and writes the XML file.
	 *
	 * @param args
	 *            unused
	 */
	public static void main(String[] args) {
		MovieRssCNGenerator robot = new MovieRssCNGenerator();
		robot.createXMLDoc(robot.getDoubanMovieSlidePic());
	}

}

你可能感兴趣的:(dom4j,Jsoup)