抓取新闻

package com.htmlparser;

import java.net.URL;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Pattern;

import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.beans.LinkBean;
import org.htmlparser.beans.StringBean;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.Div;
import org.htmlparser.tags.HeadingTag;
import org.htmlparser.tags.Span;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

/*
 * Author sanshang
 *
 * Crawls Sina finance stock news: collects article links from the stock
 * channel front page, then extracts title, body, publish date and author
 * from each article page using org.htmlparser tag/attribute filters.
 */
public class ParseNews {
	// Parser used to analyse each news page; re-created per URL in parser(String).
	private Parser parser = null;
	// Article URLs accepted by accept(); populated by testLinkBean().
	private Set<String> links = new HashSet<String>();

	// Article-URL pattern, compiled once instead of via String.matches()
	// on every link checked in the loop. NOTE(review): the date segment
	// 20090526 is hard-coded, so only articles from that day are accepted.
	private static final Pattern NEWS_URL_PATTERN = Pattern
			.compile("http://finance.sina.com.cn/stock/gujiayidong/20090526/[\\d]+.shtml");

	/*
	 * Collects all links on the stock channel front page via LinkBean and
	 * keeps those matching the article-URL pattern.
	 *
	 * @return the (shared) set of accepted article URLs
	 */
	public Set<String> testLinkBean() {
		LinkBean linkBean = new LinkBean();
		linkBean.setURL("http://finance.sina.com.cn/stock/");
		for (URL url : linkBean.getLinks()) {
			String candidate = url.toString();
			if (accept(candidate)) {
				links.add(candidate);
			}
		}
		return links;
	}

	/*
	 * @return true if the URL looks like a crawlable article page.
	 */
	public boolean accept(String url) {
		return NEWS_URL_PATTERN.matcher(url).matches();
	}

	/*
	 * Extracts the article title (text of the last matching heading tag).
	 *
	 * @return the title text, or "" if none was found or parsing failed
	 */
	private String getTitle(NodeFilter titleFilter, Parser parser) {
		String titleName = "";
		try {
			NodeList titleNodeList = (NodeList) parser.parse(titleFilter);
			for (int i = 0; i < titleNodeList.size(); i++) {
				HeadingTag title = (HeadingTag) titleNodeList.elementAt(i);
				titleName = title.getStringText();
			}
		} catch (ParserException ex) {
			// Best-effort extraction: report the failure instead of
			// silently swallowing it, but still return the default.
			System.err.println("Failed to parse title: " + ex);
		}
		return titleName;
	}

	/*
	 * Extracts the news author/editor (text of the last matching span).
	 *
	 * @return the author text, or "" if none was found or parsing failed
	 */
	private String getNewsAuthor(NodeFilter newsauthorFilter, Parser parser) {
		String newsAuthor = "";
		try {
			NodeList authorList = (NodeList) parser.parse(newsauthorFilter);
			for (int i = 0; i < authorList.size(); i++) {
				Span authorSpan = (Span) authorList.elementAt(i);
				newsAuthor = authorSpan.getStringText();
			}
		} catch (ParserException ex) {
			System.err.println("Failed to parse author: " + ex);
		}
		return newsAuthor;
	}

	/*
	 * Extracts the publish date (text of the last matching span).
	 *
	 * @return the date text, or null if none was found or parsing failed
	 */
	private String getNewsDate(NodeFilter dateFilter, Parser parser) {
		String newsDate = null;
		try {
			NodeList dateList = (NodeList) parser.parse(dateFilter);
			for (int i = 0; i < dateList.size(); i++) {
				Span dateTag = (Span) dateList.elementAt(i);
				newsDate = dateTag.getStringText();
			}
		} catch (ParserException ex) {
			System.err.println("Failed to parse date: " + ex);
		}
		return newsDate;
	}

	/**
	 * Extracts the article body: concatenates the matching div contents,
	 * strips the HTML down to plain text with StringBean, then removes
	 * Sina boilerplate (comment counter, disclaimers, stock-list lead-in).
	 *
	 * @return the cleaned body text, or null if parsing failed
	 */
	private String getNewsContent(NodeFilter newsContentFilter, Parser parser) {
		String content = null;
		StringBuilder builder = new StringBuilder();

		try {
			NodeList newsContentList = (NodeList) parser
					.parse(newsContentFilter);
			for (int i = 0; i < newsContentList.size(); i++) {
				Div newsContenTag = (Div) newsContentList.elementAt(i);
				builder.append(newsContenTag.getStringText());
			}
			content = builder.toString();
			// BUGFIX: StringBuilder.toString() never returns null, so the
			// original "content != null" check was always true and the
			// no-content branch was dead code. Check emptiness instead.
			if (!content.isEmpty()) {
				parser.reset();
				parser = Parser.createParser(content, "gb2312");
				StringBean sb = new StringBean();
				sb.setCollapse(true);
				parser.visitAllNodesWith(sb);
				content = sb.getStrings();

				content = content.replace("已有_COUNT_位网友发表评论 我要评论", "");
				content = content
						.replace(
								"新浪声明:此消息系转载自新浪合作媒体,新浪网登载此文出于传递更多信息之目的,并不意味着赞同其观点或证实其描述。文章内容仅供参考,不构成投资建议。投资者据此操作,风险自担。",
								"");
				content = content.replace("以下是本文可能影响或涉及到的板块个股:", "");
				content = content
						.replace(
								"新浪声明:新浪网登载此文出于传递更多信息之目的,并不意味着赞同其观点或证实其描述。文章内容仅供参考,不构成投资建议。投资者据此操作,风险自担。",
								"");
			} else {
				System.out.println("没有得到新闻内容!");
			}
		} catch (ParserException ex) {
			System.err.println("Failed to parse content: " + ex);
		}

		return content;
	}

	/**
	 * Parses one article page: extracts and prints title, body, date and
	 * author. The parser must be reset between extractions, otherwise the
	 * subsequent parse() calls yield nothing.
	 *
	 * @param url the article URL to fetch and parse
	 */
	public void parser(String url) {
		try {
			parser = new Parser(url);
			// Title: <h1 id="artibodyTitle">
			NodeFilter titleFilter = new AndFilter(new TagNameFilter("h1"),
					new HasAttributeFilter("id", "artibodyTitle"));
			// Body: <div id="artibody">
			NodeFilter contentFilter = new AndFilter(new TagNameFilter("div"),
					new HasAttributeFilter("id", "artibody"));
			// Publish date: <span id="pub_date">
			NodeFilter newsdateFilter = new AndFilter(
					new TagNameFilter("span"), new HasAttributeFilter("id",
							"pub_date"));
			// Author: <span id="media_name">
			NodeFilter newsauthorFilter = new AndFilter(new TagNameFilter(
					"span"), new HasAttributeFilter("id", "media_name"));
			String newsTitle = getTitle(titleFilter, parser);
			System.out.println(newsTitle);
			parser.reset(); // Reset after every extraction or later parses find nothing.
			String newsContent = getNewsContent(contentFilter, parser);
			System.out.println(newsContent);
			parser.reset();
			String newsDate = getNewsDate(newsdateFilter, parser);
			System.out.println(newsDate);
			parser.reset();
			String newsauthor = getNewsAuthor(newsauthorFilter, parser);
			System.out.println(newsauthor);
			System.out
					.println("------------------------------------------------------------");
		} catch (ParserException ex) {
			System.err.println("Failed to parse page " + url + ": " + ex);
		}
	}

	public static void main(String[] args) {
		ParseNews parseNews = new ParseNews();
		// testLinkBean() both populates and returns the links field, so the
		// original "parseNews.links = parseNews.testLinkBean()" was a
		// redundant self-assignment; iterate the return value directly.
		for (String articleUrl : parseNews.testLinkBean()) {
			parseNews.parser(articleUrl);
		}
	}
}

引用
备注:matches方法的参数是正则表达式(String.matches只接受一个参数,即正则本身)。当URL中含有“?”时,记得要用正则表达:[?]

你可能感兴趣的:(.net)