Java 爬取堆糖所有头像(高质量版头像)

从百度贴吧爬下来的用户头像都经过了裁剪,尺寸太小(大多是 110*110 左右);而且很多用户注销之后,头像会变成同一张缩略图,导致大量重复。所以这次改为爬取堆糖里的头像,堆糖里的头像质量还是蛮高的。

堆糖官网:www.duitang.com

package com.yq.spider;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.Closeable;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Crawler utility that downloads avatar images from Duitang.
 * <p>
 * Duitang website: https://www.duitang.com
 * <p>
 * Completed: 2018/12/22 1:51:00. Contact: qq 714588944.
 *
 * @author 习惯~
 */
public class DuiTangDemo {

	/**
	 * Exclusive upper bound for the randomly chosen {@code start} offset.
	 * NOTE(review): presumably the number of items the listing exposed when this
	 * was written; confirm against the live API.
	 */
	private static final int MAX_START_INDEX = 23977;

	/** Listing endpoint; the numeric {@code start} offset is appended to it. */
	private static final String LIST_URL_PREFIX = "https://www.duitang.com/napi/blog/list/by_filter_id/?include_fields=top_comments%2Cis_root%2Csource_link%2Citem%2Cbuyable%2Croot_id%2Cstatus%2Clike_count%2Csender%2Calbum%2Creply_count&filter_id=%E5%A4%B4%E5%83%8F&start=";

	/**
	 * Matches a complete image URL on the Duitang upload host, up to (but not
	 * including) the first quote, whitespace, angle bracket or backslash.
	 * <p>
	 * The previous pattern ended with a lazy {@code ...+?g}, which cut every URL
	 * at the first letter {@code g} after {@code uploads/}; such truncated matches
	 * were then thrown away by the extension filter in {@link #download(String)}.
	 */
	static final String IMAGE_URL_REGEX = "https://b-ssl\\.duitang\\.com/uploads/[^\"'\\s<>\\\\]+";

	/** Directory that downloaded images are written to. */
	private static final String DOWNLOAD_DIR = "d:/堆糖头像";

	/**
	 * Entry point: repeatedly picks a random listing offset and downloads the
	 * images found there. The loop never terminates on its own; stop the process
	 * to end the crawl.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		while (true) {
			int startIndex = (int) (Math.random() * MAX_START_INDEX);
			DuiTangDemo.start(startIndex);
		}
	}

	/**
	 * Fetches one listing page and downloads every image URL found in it.
	 *
	 * @param startIndex value of the {@code start} query parameter of the listing request
	 */
	public static void start(int startIndex) {
		String content = getURLContent(LIST_URL_PREFIX + startIndex, "UTF-8");
		List<String> imageUrls = getMatcherContent(content, IMAGE_URL_REGEX);
		download(imageUrls);
	}

	/**
	 * Reads the resource at the given URL and returns its text.
	 * <p>
	 * Lines are concatenated without their line terminators. I/O errors are
	 * printed and whatever was read before the error is returned, so the result
	 * is never {@code null} but may be empty or partial.
	 *
	 * @param urlStr  URL to read
	 * @param charset name of the charset used to decode the response
	 * @return the decoded content with line terminators removed
	 */
	public static String getURLContent(String urlStr, String charset) {
		StringBuilder sb = new StringBuilder();
		// The raw stream is its own resource so it is closed even when the
		// charset name is rejected by the InputStreamReader constructor.
		try (BufferedInputStream in = new BufferedInputStream(new URL(urlStr).openStream());
				BufferedReader br = new BufferedReader(new InputStreamReader(in, charset))) {
			String line;
			while ((line = br.readLine()) != null) {
				sb.append(line);
			}
		} catch (IOException e) {
			e.printStackTrace();
		}
		return sb.toString();
	}

	/**
	 * Collects every substring of {@code content} that matches {@code regex}.
	 *
	 * @param content text to search; {@code null} is treated as empty
	 * @param regex   regular expression to apply
	 * @return the full matches in order of appearance; empty if there are none
	 */
	public static List<String> getMatcherContent(String content, String regex) {
		List<String> result = new ArrayList<>();
		if (content == null) {
			return result;
		}
		Matcher m = Pattern.compile(regex).matcher(content);
		while (m.find()) {
			result.add(m.group());
		}
		return result;
	}

	/**
	 * Downloads every image URL in the list.
	 *
	 * @param result image URLs; {@code null} is treated as an empty list
	 */
	public static void download(List<String> result) {
		if (result == null) {
			return;
		}
		for (String imgUrl : result) {
			download(imgUrl);
		}
	}

	/**
	 * Downloads a single image into the download directory, naming the file after
	 * the last path segment of the URL. An existing file of the same name is
	 * overwritten. I/O errors are printed and the image is skipped.
	 *
	 * @param imgURL image URL; ignored unless it ends with {@code .jpg} or {@code .jpeg}
	 */
	public static void download(String imgURL) {
		// Only JPEG links are wanted; anything else the pattern picked up is dropped.
		if (imgURL == null || !(imgURL.endsWith(".jpg") || imgURL.endsWith(".jpeg"))) {
			return;
		}
		System.out.println("堆糖--正在下载:" + imgURL);

		File dir = new File(DOWNLOAD_DIR);
		if (!dir.isDirectory() && !dir.mkdirs()) {
			System.err.println("Cannot create download directory: " + dir);
			return;
		}
		// File name is the last path segment of the URL.
		String imgName = imgURL.substring(imgURL.lastIndexOf('/') + 1);
		File dest = new File(dir, imgName);

		// try-with-resources closes both streams on success and on failure;
		// closing the BufferedOutputStream also flushes it.
		try (BufferedInputStream bis = new BufferedInputStream(new URL(imgURL).openStream());
				BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(dest))) {
			byte[] buffer = new byte[1024];
			int len;
			while ((len = bis.read(buffer)) != -1) {
				bos.write(buffer, 0, len);
			}
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

	/**
	 * Closes each given resource, skipping {@code null} entries. A failure to
	 * close one resource is printed and does not prevent closing the others.
	 * <p>
	 * No longer used inside this class (streams are managed with
	 * try-with-resources) but kept because it is part of the public API.
	 *
	 * @param io resources to close; may be {@code null} or contain {@code null}
	 */
	public static void closeAll(Closeable... io) {
		if (io == null) {
			return;
		}
		for (Closeable resource : io) {
			try {
				if (resource != null) {
					resource.close();
				}
			} catch (IOException e) {
				e.printStackTrace();
			}
		}
	}
}

你可能感兴趣的:(爬虫)