Java爬虫:爬取豆瓣图片之代码

关于豆瓣相册页面的特征提取和分析,详见我的另一篇博文:

《初涉爬虫:爬取豆瓣图片之分析》http://blog.csdn.net/allhaillouis/article/details/20226127


本帖展示代码。效果:爬取豆瓣相册,每个页面的图片分别保存在各自的子文件夹下。

package douban;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.apache.commons.io.IOUtils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class GetGoodPicByDouban {
	/**********************
	 ******* Setting ******
	 **********************/
	//从第x页抓起
	public static final String URL = "xxxxxxxxxxxxxxxxxxxx";
	// 模仿UA
//	public static final String UA = 
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.47 Safari/536.11";
	public static final String UA = 
"Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Version/3.1 Safari/525.13";
	// 图片节点选择器
	public static final String IMG_DIV_SELECTOR = ".image-show img";
	//帖子节点选择器
	public static final String POST_SELECTOR = ".photo_wrap a[class]";
	// 最高页数
    public static final int MAX_PAGE = 522;
	// 存储路径
	public static final String BASE_PATH = "F:\\豆瓣图片";
	
	

	/**
	 * @Description 主函数
	 */
	public static void main(String[] args) {

		for (int i = 0; i<=MAX_PAGE; i=i+18) {
			String page_url = URL +i;
			// 图片按页面分文件夹
			String pagePath = BASE_PATH+"\\"+i;
			
			System.out.println("\n" + "**************解析URL(第" + i + "页):" + page_url + "**************\n");
			String pageResult = getResultByUrl(page_url);	
			Iterator iterator=getPostUrl(pageResult).iterator();
			while(iterator.hasNext()){
				try {
					Thread.currentThread().sleep(1000);
				} catch (InterruptedException e) {
					e.printStackTrace();
				}
				String postUrl = (String) iterator.next();
				System.out.println("解析图片帖子URL:" + postUrl);
				String postResult = getResultByUrl(postUrl);
				List urls = getImgUrl(postResult);
				for (String str : urls) {
					try {
						Thread.currentThread().sleep(500);
					} catch (InterruptedException e) {
						e.printStackTrace();
					}
					System.out.println("解析图片url:"+str);
					File imgFile = getStoreFile(str, pagePath);
					if (saveImg(str, imgFile))
						System.out.println("存入图片" + imgFile.getName());
				}
			}
			System.out.println("\n" + "**************解析URL完成(第" + i + "页)**************\n");
		}
		System.out.println("\n" + "**************全部URL解析完成**************\n");
			
			
	}

	/**
	 * 获取帖子目录名和对应的url
	 * @param pageResult
	 * @return 返回map,key:图片目录path,value:帖子url
	 */
	public static List getPostUrl(String pageResult){
		Document doc = Jsoup.parse(pageResult);
		List rtn = new ArrayList();
		Elements es = doc.select(POST_SELECTOR);
		for (Iterator i = es.iterator(); i.hasNext();) {
			Element e = i.next();
			rtn.add(e.attr("href"));
			System.out.println("图片帖子链接:" + e.attr("href"));
		}
		return rtn;
	}
	
	/**
	 * 给定url获取整个页面内容
	 * 
	 * @param url
	 * @return
	 */
	public static String getResultByUrl(String url) {
		HttpClient hc = new DefaultHttpClient();
		try {
			HttpGet httpget = new HttpGet(url);
			httpget.setHeader("User-Agent", UA);
			httpget.setHeader("Accept-Encoding", "utf-8");
			HttpResponse response = hc.execute(httpget);
			HttpEntity entity = response.getEntity();
			if (entity != null) {
				InputStream in = entity.getContent();
				BufferedReader br = new BufferedReader(new InputStreamReader(in,"utf-8"));
			    StringBuffer buffer = new StringBuffer();
			    String line = "";
			    while ((line = br.readLine()) != null){
			      buffer.append(line);
			    }
			    in.close();
			    return buffer.toString();
			}
		} catch (Exception e) {
			//再来一遍
				e.printStackTrace();
		}
		return "";
	}
	
	
	/**
	 * 从帖子内容中获取图片url
	 */
	public static List getImgUrl(String str) {
		List img_urls = new ArrayList();
		Document doc = Jsoup.parse(str);

		Elements es = doc.select(IMG_DIV_SELECTOR);
		for (Iterator i = es.iterator(); i.hasNext();) {
			Element e = i.next();
			img_urls.add(e.attr("src"));
		}
		return img_urls;
	}

	/**
	 * 从图片url和帖子名,生成图片的存储路径
	 */
	public static File getStoreFile(String imgUrl, String postPath) {

		String[] tmp = imgUrl.split("/");

		String imgName = tmp[tmp.length - 1];

		File dir = new File(postPath);
		if (!dir.exists())
			dir.mkdirs();
		File imgFile = new File(postPath + "\\" + imgName);
		if (!imgFile.exists()) {
			try {
				imgFile.createNewFile();
			} catch (IOException e) {
				e.printStackTrace();
			}
		}
		return imgFile;
	}
	/**
	 * 将图片写入本地
	 */
	public static boolean saveImg(String img_url, File file) {
		HttpClient hc = new DefaultHttpClient();
		try {
			HttpGet httpget = new HttpGet(img_url);
			httpget.setHeader("User-Agent", UA);
			httpget.setHeader("Accept-Encoding", "utf-8");

			HttpResponse response = hc.execute(httpget);
			HttpEntity entity = response.getEntity();
			if (entity != null) {
				InputStream in = entity.getContent();
				OutputStream os = new FileOutputStream(file);
				int count = IOUtils.copy(in, os);
				IOUtils.closeQuietly(in);
				IOUtils.closeQuietly(os);
				if (0 != count)
					return true;
			}
		} catch (Exception e) {
				e.printStackTrace();
		}
		return false;
	}
}


你可能感兴趣的:(爬虫)