/**
 * Summary: utility class for crawling web pages.
 *
 * @author wanghao QQ:115308504
 */
- package util;
- import java.util.regex.Matcher;
- import java.util.regex.Pattern;
- import org.apache.http.HttpEntity;
- import org.apache.http.HttpResponse;
- import org.apache.http.client.HttpClient;
- import org.apache.http.client.methods.HttpGet;
- import org.apache.http.impl.client.DefaultHttpClient;
- import org.apache.http.util.EntityUtils;
- import org.htmlparser.Node;
- import org.htmlparser.Parser;
- import org.htmlparser.tags.ScriptTag;
- import org.htmlparser.tags.StyleTag;
- import org.htmlparser.visitors.ObjectFindingVisitor;
- import org.htmlparser.visitors.TextExtractingVisitor;
- public class CrawNewsTools {
- // 使用HttpClient组件读取指定URL的页面HTML源码
- public static String getPage(String url, String encode) {
- String page = "";
- HttpClient httpClient = null;
- try {
- httpClient = new DefaultHttpClient();
- // 创建httpget
- HttpGet httpget = new HttpGet(url);
- System.out.println("请求URI路径:" + httpget.getURI());
- // 执行get请求
- HttpResponse response = httpClient.execute(httpget);
- // 获得响应实体
- HttpEntity httpEntity = response.getEntity();
- String charset = EntityUtils.getContentCharSet(httpEntity);
- System.out.println("###当前页面编码:" + charset);
- // 获取内容时,指定编码
- if (encode != null && !encode.trim().equals("")) {
- System.out.println("###采用指定编码:" + encode);
- } else if (charset != null) {
- System.out.println("###采用网页自身编码:" + charset);
- encode = charset;
- } else {
- System.out.println("###采用默认UTF-8编码:UTF-8");
- encode = "UTF-8";
- }
- page = EntityUtils.toString(httpEntity, encode);
- System.out.println(page);
- page = removeCssTag(page, encode);
- page = removeJsTag(page, encode);
- page = getBody(page);
- //page = formatHtml(page);
- } catch (Exception ex) {
- ex.printStackTrace();
- } finally {
- httpClient.getConnectionManager().shutdown();
- }
- return page;
- }
- // 格式化body标签
- public static String fromatBodyTag(String htmlCode) {
- String result = htmlCode;
- if (htmlCode != null) {
- while (result.indexOf("<BODY") != -1) {
- result.replaceAll("<BODY", "<body");
- }
- while (result.indexOf("<Body") != -1) {
- resultresult = result.replaceAll("<Body", "<body");
- }
- while (result.indexOf("</BODY") != -1) {
- resultresult = result.replaceAll("</BODY", "</body");
- }
- while (result.indexOf("</Body") != -1) {
- resultresult = result.replaceAll("</Body", "</body");
- }
- }
- return result;
- }
- // 使用正则表达式提取body体内容
- public static String getBody(String htmlCode) {
- if (htmlCode == null) {
- return null;
- }
- Pattern pattern = Pattern.compile("<body(.*)>(.*)</body>",
- Pattern.MULTILINE | Pattern.DOTALL);
- Matcher matcher = pattern.matcher(htmlCode);
- if (matcher.find()) {
- return matcher.group();
- } else {
- return null;
- }
- }
- // 过滤css标签
- public static String removeCssTag(String htmlCode, String encode) {
- String htmlEndCode = htmlCode;
- try {
- Parser parser = Parser.createParser(htmlCode, encode);
- ObjectFindingVisitor visitor = new ObjectFindingVisitor(
- StyleTag.class);
- parser.visitAllNodesWith(visitor);
- Node[] nodes = visitor.getTags();
- for (int i = 0; i < nodes.length; i++) {
- // System.out.println(nodes[i].toHtml());
- htmlEndCodehtmlEndCode = htmlEndCode.replace(nodes[i].toHtml(), "");
- }
- // System.out.println("###去除css标签后:" + htmlEndCode);
- } catch (Exception e) {
- e.printStackTrace();
- }
- return htmlEndCode;
- }
- // 过滤js标签
- public static String removeJsTag(String htmlCode, String encode) {
- String htmlEndCode = htmlCode;
- try {
- Parser parser = Parser.createParser(htmlCode, encode);
- ObjectFindingVisitor visitor = new ObjectFindingVisitor(
- ScriptTag.class);
- parser.visitAllNodesWith(visitor);
- Node[] nodes = visitor.getTags();
- for (int i = 0; i < nodes.length; i++) {
- // System.out.println(nodes[i].toHtml());
- htmlEndCodehtmlEndCode = htmlEndCode.replace(nodes[i].toHtml(), "");
- }
- // System.out.println("###去除js标签后:" + htmlEndCode);
- } catch (Exception e) {
- e.printStackTrace();
- }
- return htmlEndCode;
- }
- // 格式化指定的HTML源码
- public static String formatHtml(String htmlcode) {
- String result = htmlcode;
- if (htmlcode != null && htmlcode.trim().length() > 0) {
- // 去除回车符
- while (result.indexOf("\r") != -1) {
- resultresult = result.replaceAll("\r", "");
- }
- // 去除换行符
- while (result.indexOf("\n") != -1) {
- resultresult = result.replaceAll("\n", "");
- }
- // 去除制表符
- while (result.indexOf("\t") != -1) {
- resultresult = result.replaceAll("\t", "");
- }
- // 去除多余空格
- while (result.indexOf(" ") != -1) {
- resultresult = result.replaceAll(" ", " ");
- }
- // 去除全角空格
- while (result.indexOf(" ") != -1) {
- resultresult = result.replaceAll(" ", "");
- }
- return result;
- } else {
- return null;
- }
- }
- // 使用HtmlParser组件去除内容中的HTML标签,得到纯文本内容
- public static String getText(String content, String encode) {
- String result = content;
- try {
- Parser parser = Parser.createParser(content, encode);
- // 创建TextExtractingVisitor对象
- TextExtractingVisitor visitor = new TextExtractingVisitor();
- // 去除网页中的所有标签,提出纯文本内容
- parser.visitAllNodesWith(visitor);
- result = visitor.getExtractedText();
- // System.out.println("###去除HTML标签:" + result);
- } catch (Exception ex) {
- ex.printStackTrace();
- }
- return result;
- }
- // 查询网页包含某种标签的集合数组
- @SuppressWarnings("unchecked")
- public static Node[] getTagList(String htmlCode, String encode, Class t) {
- Node[] nodes = null;
- try {
- Parser parser = Parser.createParser(htmlCode, encode);
- ObjectFindingVisitor visitor = new ObjectFindingVisitor(t);
- parser.visitAllNodesWith(visitor);
- nodes = visitor.getTags();
- } catch (Exception e) {
- e.printStackTrace();
- }
- return nodes;
- }
- // 判断自定字符串是否符号某种正则规则
- public static boolean isRegex(String isRegexString, String regexString) {
- boolean regexStatus = Pattern.matches(regexString, isRegexString);
- return regexStatus;
- }
- }
/**
 * Summary: image crawler for the danhuaer.com site.
 *
 * @author wanghao QQ:115308504
 */
- package craw;
import java.io.Closeable;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import org.htmlparser.Node;
import org.htmlparser.tags.ImageTag;
import util.CrawNewsTools;
- public class CrawPicForDanHuaer {
- // 采集图片
- public void crawImage(String url, String encode) {
- // 采集源码
- String page = CrawNewsTools.getPage(url, encode);
- System.out.println("当前采集页源码:" + page);
- Node[] nodes = CrawNewsTools.getTagList(page, encode, ImageTag.class);
- System.out.println(nodes.length);
- for (Node node : nodes) {
- String picUrl = ((ImageTag) node).getImageURL();
- downloadPic(picUrl);
- }
- }
- public void downloadPic(String picUrl) {
- File d = new File("c:\\danhuaer");
- if (!d.exists()) {
- try {
- d.mkdirs();
- } catch (Exception e) {
- e.printStackTrace();
- }
- }
- String fileName = picUrl.substring(picUrl.lastIndexOf("/") + 1);
- if (!"touchad.gif".equals(fileName)) {
- try {
- URL url = new URL(picUrl);
- int fileSize = url.openConnection().getContentLength();
- if (fileSize >= 1024 * 20) {
- String filepath = "c:\\danhuaer\\danhuaer#" + fileName;
- File f = new File(filepath);
- FileOutputStream fos = new FileOutputStream(f);
- InputStream is = url.openStream();
- byte[] buf = new byte[1024 * 1000];
- int len = 0;
- while ((len = is.read(buf)) > 0) {
- System.out.println("--------------文件大于20K,准备下载:"
- + picUrl);
- fos.write(buf, 0, len);
- }
- System.out.println("!!!!!!!!!!!!!!!###文件大小:" + fileSize);
- if (is != null)
- is.close();
- if (fos != null)
- fos.close();
- }
- } catch (Exception e) {
- e.printStackTrace();
- }
- }
- }
- public static void main(String[] args) {
- String baseUrl = "http://danhuaer.com/ooxx/comment-page-{#page}#comments";
- CrawPicForDanHuaer crawPicForDanHuaer = new CrawPicForDanHuaer();
- for (int i = 1; i < 100; i++) {
- String url = baseUrl.replace("{#page}", Integer.toString(i));
- System.out.println("####开始下载图片的页面:" + url);
- crawPicForDanHuaer.crawImage(url, "UTF-8");
- try {
- Thread.sleep(1000);
- } catch (InterruptedException e) {
- e.printStackTrace();
- }
- }
- }
- }