继上一篇文章 爬虫记录(1)——简单爬取一个页面的内容并写入到文本中 的代码,我们在之前的类中增加了一些其他的方法
1、爬虫工具类,用来获取网页内容
package com.dyw.crawler.util;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
/**
 * Crawler utility class: fetches web pages and opens download streams.
 * Created by dyw on 2017/9/1.
 */
public class CrawlerUtils {
    /**
     * Fetches the page at {@code url} and returns its whole body as a String.
     *
     * @param url the URL to fetch
     * @return the page content, line by line, '\n'-separated
     * @throws Exception if connecting to or reading from the URL fails
     */
    public static String getHtml(String url) throws Exception {
        URL url1 = new URL(url);
        URLConnection connection = url1.openConnection();
        // try-with-resources closes all three streams even when a read fails
        // mid-page (the original only closed them on the success path).
        // Decode as UTF-8 explicitly instead of the platform default charset.
        try (InputStream in = connection.getInputStream();
             InputStreamReader isr = new InputStreamReader(in, "UTF-8");
             BufferedReader br = new BufferedReader(isr)) {
            StringBuilder sb = new StringBuilder();
            String line;
            while ((line = br.readLine()) != null) {
                sb.append(line).append('\n');
            }
            return sb.toString();
        }
    }

    /**
     * Opens a raw download stream for {@code urlStr}.
     *
     * @param urlStr the URL to download
     * @return the response body as an InputStream (caller must close it)
     * @throws IOException if the connection fails
     */
    public static InputStream downLoadFromUrl(String urlStr) throws IOException {
        URL url = new URL(urlStr);
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        // Pretend to be a browser so naive anti-crawler checks don't return 403.
        conn.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");
        // 3-second connect timeout.
        conn.setConnectTimeout(3 * 1000);
        conn.setRequestProperty("Accept",
                "image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/x-shockwave-flash, application/vnd.ms-powerpoint, application/vnd.ms-excel, application/msword, */*");
        conn.setRequestProperty("Accept-Language", "zh-cn");
        conn.setRequestProperty("UA-CPU", "x86");
        // NOTE: the original requested "Accept-Encoding: gzip" but returned the
        // raw stream; callers (e.g. image downloads) write that stream straight
        // to disk, so a gzip-encoded body would be saved compressed/corrupted.
        // The header is intentionally NOT sent anymore.
        conn.setRequestProperty("Content-type", "text/html");
        conn.setRequestProperty("Connection", "keep-alive");
        // Hand the open stream to the caller.
        return conn.getInputStream();
    }
}
2、正则工具类,用来匹配需要获取的url地址
package com.dyw.crawler.util;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* 正则表达式工具类
* Created by dyw on 2017/9/1.
*/
public class RegularUtils {
//获取img标签正则
private static final String IMGURL_REG = "]*?>" ;
//获取href正则
private static final String AURL_REG = "href=\"(.*?)\"";
//获取http开头,png|jpg|bmp|gif结尾的 正则
private static final String IMGSRC_REG = "[a-zA-z]+://[^\\s]*(?:png|jpg|bmp|gif)";
/**
* 获取 A 标签的正则表达式
*
* @param html 匹配的内容
* @return List结果集
*/
public static List getAUrl(String html) {
return match(AURL_REG, html);
}
/**
* 获取 IMG 标签的正则表达式
*
* @param html 匹配的内容
* @return List结果集
*/
public static List getIMGUrl(String html) {
List imgUrl = match(IMGURL_REG, html);
return match(IMGSRC_REG, imgUrl);
}
/**
* 获取 A 标签的正则表达式
*
* @param html 匹配的内容
* @return List结果集
*/
public static List getIMGSrc(String html) {
return match(IMGSRC_REG, html);
}
/**
* String匹配正则,封装到list中
*
* @param regular 正则表达式
* @param html 匹配的内容
* @return 匹配到的结果 List
*/
private static List match(String regular, String html) {
Matcher matcher = Pattern.compile(regular).matcher(html);
List list = new ArrayList<>();
while (matcher.find()) {
list.add(matcher.group());
}
return list;
}
/**
* list匹配正则,封装到list中
*
* @param regular 正则表达式
* @param list 匹配的列表
* @return 匹配到的结果 List
*/
private static List match(String regular, List list) {
List result = new ArrayList<>();
list.forEach(string -> {
Matcher matcher = Pattern.compile(regular).matcher(string);
while (matcher.find()) {
result.add(matcher.group());
}
});
return result;
}
}
3、IO工具类,用来把获取的html内容进行写入到文件中
package com.dyw.crawler.util;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
/**
 * IO utility class: file creation and writing helpers for the crawler.
 * Created by dyw on 2017/9/1.
 */
public class IOUtils {
    /**
     * Creates the file if it does not already exist.
     *
     * @param file target file
     * @throws Exception wrapping the underlying cause if creation fails
     */
    public static void createFile(File file) throws Exception {
        try {
            if (!file.exists()) {
                // createNewFile returning false means the file appeared
                // concurrently, which is fine for our purposes.
                file.createNewFile();
            }
        } catch (Exception e) {
            throw new Exception("创建文件的时候错误!", e);
        }
    }

    /**
     * Writes the String content to the file, UTF-8 encoded.
     *
     * @param content  text to write
     * @param fileName destination file
     */
    public static void writeFile(String content, File fileName) throws Exception {
        writeFile(content.getBytes("Utf-8"), fileName);
    }

    /**
     * Writes the byte array to the file, replacing any existing content.
     *
     * @param bytes    data to write
     * @param fileName destination file
     * @throws Exception wrapping the underlying cause if the write fails
     */
    public static void writeFile(byte[] bytes, File fileName) throws Exception {
        // try-with-resources guarantees the stream is closed even when
        // write() throws (the original leaked the descriptor on error).
        try (FileOutputStream out = new FileOutputStream(fileName)) {
            out.write(bytes);
        } catch (Exception e) {
            throw new Exception("写入文件的时候错误!", e);
        }
    }

    /**
     * Drains the input stream and saves its bytes to the given file.
     * The input stream is closed in all cases.
     *
     * @param inputStream source stream
     * @param fileName    destination file
     */
    public static void saveFile(InputStream inputStream, File fileName) throws Exception {
        writeFile(readInputStream(inputStream), fileName);
    }

    /**
     * Reads the entire input stream into a byte array, closing it afterwards
     * even if the read fails part-way (the original leaked it on error).
     *
     * @param inputStream source stream
     * @return all bytes read
     */
    private static byte[] readInputStream(InputStream inputStream) throws IOException {
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        try (InputStream in = inputStream) {
            byte[] buffer = new byte[1024];
            int len;
            while ((len = in.read(buffer)) != -1) {
                bos.write(buffer, 0, len);
            }
        }
        // ByteArrayOutputStream.close() is a no-op; no need to close bos.
        return bos.toByteArray();
    }
}
4、main方法执行
package com.dyw.crawler.project;
import com.dyw.crawler.util.CrawlerUtils;
import com.dyw.crawler.util.IOUtils;
import com.dyw.crawler.util.RegularUtils;
import java.io.File;
import java.io.InputStream;
import java.util.List;
/**
 * Downloads every image referenced by a web page.
 * Created by dyw on 2017/9/4.
 */
public class Project1 {
    public static void main(String[] args) {
        // Directory where downloaded images are stored.
        String path = "C:\\Users\\dyw\\Desktop\\crawler";
        // Page to crawl.
        String url = "http://blog.csdn.net/juewang_love";
        // Fetch the page HTML.
        String htmlContent;
        try {
            htmlContent = CrawlerUtils.getHtml(url);
        } catch (Exception e) {
            throw new RuntimeException("获取内容失败!", e);
        }
        // Extract every image URL from the page. Declared as List<String>
        // (the raw List in the original made the lambda parameter infer as
        // Object, so imgUrl.split(...) did not compile).
        List<String> imgUrls = RegularUtils.getIMGUrl(htmlContent);
        // Download each image; file name is the last path segment of its URL.
        imgUrls.forEach(imgUrl -> {
            String[] split = imgUrl.split("/");
            String imgName = split[split.length - 1];
            try {
                File target = new File(path + "/" + imgName);
                InputStream inputStream = CrawlerUtils.downLoadFromUrl(imgUrl);
                IOUtils.saveFile(inputStream, target);
                System.out.println("success:" + imgName);
            } catch (Exception e) {
                // Report which URL failed and why instead of swallowing the
                // cause (the original also lacked a separator between the
                // URL and the file name).
                System.out.println("fail:" + imgUrl + " " + imgName + " (" + e + ")");
            }
        });
    }
}
5、修改 CrawlerUtils 工具类 用 httpclient 替代 urlConnection
package com.dyw.crawler.util;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
/**
 * Crawler utility class, reimplemented on Apache Commons HttpClient
 * instead of URLConnection.
 * Created by dyw on 2017/9/1.
 */
public class CrawlerUtils {
    /**
     * Applies the standard request headers to an outgoing request.
     *
     * @param httpMethod the request to decorate
     */
    private static void setHead(HttpMethod httpMethod) {
        // Pretend to be a browser so naive anti-crawler checks don't return 403.
        httpMethod.setRequestHeader("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");
        // The original sent "Content-Type: Utf-8", which is not a valid MIME
        // type (and a GET has no body anyway); advertise the charset we
        // accept instead.
        httpMethod.setRequestHeader("Accept-Charset", "utf-8");
        httpMethod.setRequestHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
    }

    /**
     * Fetches the page at {@code url} via HTTP GET and returns the whole
     * body as a String.
     *
     * @param url url to fetch
     * @return the page content, line by line, '\n'-separated
     * @throws Exception if the request fails or returns a non-200 status
     */
    public static String getHtml(String url) throws Exception {
        InputStream inputStream = downLoadFromUrl(url);
        // try-with-resources closes the reader (and the wrapped response
        // stream) even if reading fails; the original never closed it.
        try (BufferedReader br = new BufferedReader(new InputStreamReader(inputStream, "Utf-8"))) {
            StringBuilder stringBuilder = new StringBuilder();
            String str;
            while ((str = br.readLine()) != null) {
                stringBuilder.append(str).append('\n');
            }
            return stringBuilder.toString();
        }
    }

    /**
     * Opens a response-body stream for {@code urlStr} via HTTP GET.
     *
     * @param urlStr url to download
     * @return the response body stream (caller must close it)
     * @throws IOException if the request fails or the status is not 200
     *                     (the original returned null in that case, which
     *                     made getHtml throw a NullPointerException)
     */
    public static InputStream downLoadFromUrl(String urlStr) throws IOException {
        HttpClient httpClient = new HttpClient();
        HttpMethod httpMethod = new GetMethod(urlStr);
        setHead(httpMethod);
        int status = httpClient.executeMethod(httpMethod);
        if (status != HttpStatus.SC_OK) {
            // Release the connection back to the manager before failing.
            httpMethod.releaseConnection();
            throw new IOException("GET " + urlStr + " failed with HTTP status " + status);
        }
        return httpMethod.getResponseBodyAsStream();
    }
}