[Notes] Java Web Crawler Basics and Two Small Examples of Scraping Website Data

A while ago I was learning about web crawlers and scraped some simple data from the web; this post records what I did.

Scraping breaks down into the following three parts (a minimal sketch combining all three follows this list):

1. Making the network request

2. Parsing the fetched page, and dealing with garbled character encodings or compressed (e.g. gzip) response bodies

3. Extracting the target data and resources
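
Before the full examples, here is a minimal sketch of all three steps in one method. It uses the same commons-httpclient 3.x and Jsoup APIs as the examples below; the URL, the gb2312 charset, and the a[href] selector are placeholders for illustration, not taken from a real crawl:

import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.zip.GZIPInputStream;

import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.methods.GetMethod;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

// Minimal sketch of the three steps; "http://example.com/list" is a placeholder URL.
public static void fetchParseExtract() throws Exception {
	// 1. network request
	GetMethod method = new GetMethod("http://example.com/list");
	method.addRequestHeader("Accept-Encoding", "gzip");
	new HttpClient().executeMethod(method);

	// 2. decompress if the server gzipped the body, then decode with the page's charset
	InputStream in = method.getResponseBodyAsStream();
	Header ce = method.getResponseHeader("Content-Encoding");
	if (ce != null && ce.getValue().contains("gzip")) {
		in = new GZIPInputStream(in);
	}
	BufferedReader reader = new BufferedReader(new InputStreamReader(in, Charset.forName("gb2312")));
	StringBuilder html = new StringBuilder();
	String line;
	while ((line = reader.readLine()) != null) {
		html.append(line);
	}
	reader.close();

	// 3. extract the target data with jQuery-style selectors
	Document doc = Jsoup.parse(html.toString());
	for (Element a : doc.select("a[href]")) {
		System.out.println(a.text() + " -> " + a.attr("href"));
	}
	method.releaseConnection();
}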

The full code follows:

First example:

import java.io.BufferedReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.Map;
import java.util.TreeMap;
import java.util.zip.GZIPInputStream;

import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.methods.GetMethod;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Finds all post titles on a site and
 * saves every title and link to a txt file.
 */
public static Map<String, String> parseClPage(){
	String html = "http://cl.xxxx/thread0806.php"; // domain of the site being parsed
	String currentuserdesktop = System.getProperty("user.home")+"\\Desktop"; // desktop path on Windows
	Map<String, String> resultMap = new TreeMap<String, String>(); // title -> link
	Document doc = null;
	try {
		for (int i = 0; i < 199; i++) { // range of pages to scan
			StringBuffer htmlCode = new StringBuffer("");
			HttpMethod httpMethod = new GetMethod("http://cl.xxxx/thread0806.php?fid=7&search=&page="+(i+1));
			HttpClient client = new HttpClient();
			// Set the request headers below; a browser's headers can be copied directly.
			httpMethod.addRequestHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
			httpMethod.addRequestHeader("Accept-Encoding", "gzip, deflate, sdch");
			httpMethod.addRequestHeader("Accept-Language", "zh-CN,zh;q=0.8");
			httpMethod.addRequestHeader("Referer", "http://cl.xxxx/thread0806.php?fid=7");
			httpMethod.addRequestHeader("HTTPS", "1");
			httpMethod.addRequestHeader("Connection", "keep-alive");
			httpMethod.addRequestHeader("Host", "cl.xxxx");
			httpMethod.addRequestHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.99 Safari/537.36");
			client.setTimeout(3000);
			client.executeMethod(httpMethod);
			InputStream inputStream = httpMethod.getResponseBodyAsStream(); // HTML stream returned by the request
			// Only unwrap with GZIPInputStream if the server actually gzipped the response.
			Header contentEncoding = httpMethod.getResponseHeader("Content-Encoding");
			if (contentEncoding != null && contentEncoding.getValue().contains("gzip")) {
				inputStream = new GZIPInputStream(inputStream);
			}
			InputStreamReader inputStreamReader = new InputStreamReader(inputStream, Charset.forName("gb2312")); // decode
			BufferedReader bin21 = new BufferedReader(inputStreamReader);
			String line;
			while ((line = bin21.readLine()) != null) { // read the page line by line
				htmlCode.append(line);
			}
			bin21.close();
			doc = Jsoup.parse(htmlCode.toString()); // jsoup keeps the HTML parsing simple
			Elements elementsTr = doc.select("table tr"); // jQuery-style selectors pick out the target elements
			for (Element element : elementsTr) {
				String title = element.select("td").eq(1).select("h3 a").text();
				if (null != title && !"".equals(title)) {
					String link = "http://cl.xxxx/"+element.select("td").eq(1).select("h3 a").attr("href");
					resultMap.put(title, link);
					// append this record to the txt file on the desktop
					writefiletotxt(new FileWriter(currentuserdesktop+"\\results.txt", true), "Title: "+title+"\tLink: "+link+"\r\n");
				}
			}
			// release the connection
			httpMethod.abort();
			httpMethod.releaseConnection();
		}
		System.out.println("done--");
	} catch (Exception e) {
		e.printStackTrace();
	}
	return resultMap;
}

public static void writefiletotxt(FileWriter fw, String result){
	try {
		fw.write(result);
		fw.flush();
		fw.close();
	} catch (IOException e) {
		e.printStackTrace();
	}
}
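
Since writefiletotxt opens and closes a new FileWriter for every single record, a single-writer variant is worth sketching. This is a minimal sketch assuming Java 7+ (for try-with-resources); the method name writeResults and its parameters are illustrative, not part of the code above:

import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Map;

// Hypothetical variant: write all collected results with one writer instead of
// reopening the file once per record. Names here are illustrative.
public static void writeResults(String path, Map<String, String> titleToLink) {
	try (BufferedWriter out = new BufferedWriter(new FileWriter(path, true))) {
		for (Map.Entry<String, String> entry : titleToLink.entrySet()) {
			out.write("Title: " + entry.getKey() + "\tLink: " + entry.getValue() + "\r\n");
		}
	} catch (IOException e) {
		e.printStackTrace();
	}
}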

Second example: scraping a website's images. The approach is much the same as the first.

import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.methods.GetMethod;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * @author 高攀
 */
public class CatchImages {

	private static String curdesktop = System.getProperty("user.home")+"\\Desktop\\CatchImages\\";

	public static void main(String[] args) {
		doCatch("http://item.jd.com/716240.html");
	}

	// Makes the request and collects the image links.
	public static Integer doCatch(String site){
		GetMethod method = new GetMethod(site);
		HttpClient client = new HttpClient();

		try {
			method.addRequestHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
//			method.addRequestHeader("Accept-Encoding", "gzip, deflate, sdch");
			method.addRequestHeader("Accept-Language", "zh-CN,zh;q=0.8");
			method.addRequestHeader("Cache-Control", "max-age=0");
			method.addRequestHeader("Connection", "keep-alive");
			method.addRequestHeader("Host", "item.jd.com");
			method.addRequestHeader("Upgrade-Insecure-Requests", "1");
			method.addRequestHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.99 Safari/537.36");

			client.executeMethod(method);
			String htmlCode = method.getResponseBodyAsString();
			// Collect the links of all img tags.
			Document doc = Jsoup.parse(htmlCode);
			Elements elementImg = doc.select("body img");
			for (Element element : elementImg) {
				String src = element.attr("src");
				if (!src.startsWith("http")) { // leave absolute URLs alone; prefix relative ones with the site root
					String rootUrl = HTMLParserHelper.getRootUrl(site);
					src = rootUrl + src;
				}
				System.out.println(src);
				downloadImage(src);
				System.out.println("ok");
			}
			System.out.println(elementImg.size()+" images fetched.");
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			method.abort();
			method.releaseConnection();
		}
		return 0;
	}

	// Downloads a single image.
	public static void downloadImage(String imageUrl){
		GetMethod method = new GetMethod(imageUrl);
		HttpClient client = new HttpClient();
		try {
			client.executeMethod(method);
			InputStream inputStream = method.getResponseBodyAsStream();

			File dir = new File(curdesktop);
			if (!dir.exists()) {
				dir.mkdirs(); // create the output folder, including missing parents
			}

			byte[] buffer = new byte[4096]; // copy buffer
			int size = 0;
			FileOutputStream outputStream = new FileOutputStream(new File(curdesktop + HTMLParserHelper.getImageNameAndHouzui(imageUrl)));
			while ((size = inputStream.read(buffer)) != -1) {
				outputStream.write(buffer, 0, size);
			}
			outputStream.close();
			inputStream.close();
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			// release the connection
			method.abort();
			method.releaseConnection();
		}
	}
}
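
The HTMLParserHelper class referenced above is not included here. A minimal sketch of what its two helpers could look like follows; these bodies are assumptions, not the original implementation:

import java.net.MalformedURLException;
import java.net.URL;

// Hypothetical stand-in for the HTMLParserHelper referenced above; the real
// implementation is not shown, so these bodies are guesses at its behavior.
public class HTMLParserHelper {

	// Site root, e.g. "http://item.jd.com" for "http://item.jd.com/716240.html".
	public static String getRootUrl(String site) throws MalformedURLException {
		URL url = new URL(site);
		return url.getProtocol() + "://" + url.getHost();
	}

	// File name plus extension ("houzui" means suffix), e.g. "pic.jpg" from ".../pic.jpg".
	public static String getImageNameAndHouzui(String imageUrl) {
		return imageUrl.substring(imageUrl.lastIndexOf('/') + 1);
	}
}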


The libraries involved (easy to find via a web search): HttpClient (commons-httpclient 3.x, Maven coordinate commons-httpclient:commons-httpclient) and Jsoup (org.jsoup:jsoup).
