The previous article showed how to extract article bodies from an RSS feed. This one covers scraping multiple RSS sites at once, and downloading the images embedded in each article.
Note that my scraper does not mirror whole pages: it keeps only the article body I actually want, and excludes ads, comments, and everything else.
Part 1: scraping multiple sites at once. Here is my site configuration:
<websites>
    <website>
        <name>IT之家</name>
        <url>http://www.ithome.com/rss/</url>
        <startTag><![CDATA[...]]></startTag>
        <endTag><![CDATA[...]]></endTag>
        <encoding>GB2312</encoding>
        <open>true</open>
    </website>
    <website>
        <name>虎嗅网</name>
        <url>http://www.huxiu.com/rss/0.xml</url>
        <startTag><![CDATA[...]]></startTag>
        <endTag><![CDATA[...]]></endTag>
        <encoding>UTF-8</encoding>
        <open>true</open>
    </website>
</websites>
These are the two sites I want to scrape: url is the RSS feed address; startTag and endTag are the HTML markers where the article body begins and ends (their concrete values are site-specific HTML snippets wrapped in CDATA, omitted here); encoding is the site's character encoding; and open controls whether the site gets scraped at all. If anything is unclear, see http://blog.csdn.net/kissliux/article/details/14227057
With the target sites configured, it is time to parse the file. I use dom4j for the XML parsing; pull in the jar, which I prefer to manage with Maven:
<dependency>
    <groupId>dom4j</groupId>
    <artifactId>dom4j</artifactId>
    <version>1.6.1</version>
</dependency>
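The FeedReader further down also relies on the ROME library for feed parsing. It is not listed in the original build snippet; the classic coordinates on Maven Central are the following (the 1.0 version is my assumption, any release from that era should work):
<dependency>
    <groupId>rome</groupId>
    <artifactId>rome</artifactId>
    <version>1.0</version>
</dependency>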
public class Website {
    private String name;
    private String url;
    private String startTag;
    private String endTag;
    private String encoding;
    private String open;
    private String fid;   // Discuz forum id used in part 3; assumed to come from the same config
    // getters and setters omitted
}
/**
 * @author hongliang.dinghl
 * Uses dom4j to parse the website configuration XML into Website objects.
 */
public class Dom4jUtil {
    public List<Website> parserXml(String fileName) {
        SAXReader saxReader = new SAXReader();
        List<Website> list = new ArrayList<Website>();
        try {
            // locate the file on the classpath; decode %20 so paths containing spaces work
            URL url = getClass().getResource("/");
            String path = url.getFile().replace("%20", " ") + fileName;
            Document document = saxReader.read(new File(path));
            Element websites = document.getRootElement();
            for (Iterator i = websites.elementIterator(); i.hasNext(); ) {
                Element site = (Element) i.next();
                Website website = new Website();
                for (Iterator j = site.elementIterator(); j.hasNext(); ) {
                    Element node = (Element) j.next();
                    String name = node.getName();
                    // element names match Website field names, so "url" maps to setUrl, etc.
                    String methodName = "set" + name.substring(0, 1).toUpperCase() + name.substring(1);
                    Method method = website.getClass().getMethod(methodName, String.class);
                    method.invoke(website, node.getText());
                }
                list.add(website);
            }
        } catch (DocumentException e) {
            e.printStackTrace();
        } catch (NoSuchMethodException e) {
            e.printStackTrace();
        } catch (InvocationTargetException e) {
            e.printStackTrace();
        } catch (IllegalAccessException e) {
            e.printStackTrace();
        }
        return list;
    }
}
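A quick sanity check of the parser. This is a minimal sketch; the file name websites.xml and its location at the classpath root are my assumptions:
import java.util.List;

public class Dom4jUtilDemo {
    public static void main(String[] args) {
        // "websites.xml" is assumed to sit at the classpath root
        List<Website> sites = new Dom4jUtil().parserXml("websites.xml");
        for (Website site : sites) {
            // only list sites whose open flag is "true"
            if ("true".equals(site.getOpen())) {
                System.out.println(site.getName() + " -> " + site.getUrl());
            }
        }
    }
}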
That takes care of parsing the multi-site configuration. Next, iterate over the sites, fetch each feed URL, and extract the article body; the extraction itself is covered in my previous article.
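The FindHtml class used below comes from that previous article. For completeness, here is a minimal sketch of what it is assumed to do: fetch the page in the configured encoding and cut out the substring between startTag and endTag. The real implementation may differ.
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;

public class FindHtml {
    private final String startTag;
    private final String endTag;
    private final String encoding;

    public FindHtml(String startTag, String endTag, String encoding) {
        this.startTag = startTag;
        this.endTag = endTag;
        this.encoding = encoding;
    }

    // download the page and return the HTML between startTag and endTag
    public String getContent(String link) throws Exception {
        StringBuilder html = new StringBuilder();
        BufferedReader reader = new BufferedReader(
                new InputStreamReader(new URL(link).openStream(), encoding));
        String line;
        while ((line = reader.readLine()) != null) {
            html.append(line).append('\n');
        }
        reader.close();
        int begin = html.indexOf(startTag);
        if (begin == -1) {
            return "";   // start marker not found, give up on this article
        }
        int end = html.indexOf(endTag, begin + startTag.length());
        if (end == -1) {
            return "";   // end marker not found
        }
        return html.substring(begin + startTag.length(), end);
    }
}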
public class FeedReader {
    private String CLASS_PATH;      // directory the downloaded images are saved to
    private String relative_path;   // path prefix written back into the article HTML
    public FeedReader() {
        Properties properties = PropertiesUtil.getInstance().getProerties();
        CLASS_PATH = properties.getProperty("image_path");
        relative_path = properties.getProperty("relative_path");
    }
    /**
     * @param url rss feed address, e.g. http://www.ithome.com/rss/
     * @return all article objects in the feed
     * @throws Exception
     */
    public List<RSSItemBean> getRss(String url) throws Exception {
        URL feedUrl = new URL(url);
        URLConnection conn = feedUrl.openConnection();
        // some servers block unknown clients, so pretend to be a browser
        conn.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");
        // ROME parses both RSS and Atom into a SyndFeed (implemented by SyndFeedImpl)
        SyndFeedInput input = new SyndFeedInput();
        SyndFeed feed = input.build(new XmlReader(conn));
        List<SyndEntry> entries = feed.getEntries();
        RSSItemBean item = null;
        List<RSSItemBean> rssItemBeans = new ArrayList<RSSItemBean>();
        for (SyndEntry entry : entries) {
            item = new RSSItemBean();
            item.setTitle(entry.getTitle().trim());
            item.setType(feed.getTitleEx().getValue().trim());
            item.setUri(entry.getUri());
            item.setPubDate(entry.getPublishedDate());
            item.setAuthor(entry.getAuthor());
            rssItemBeans.add(item);
        }
        return rssItemBeans;
    }
    /**
     * Fetches the article body for every entry in a site's feed.
     *
     * @param website site object as defined above
     * @return the list of RSS objects with the article body filled in
     * @throws Exception
     */
    public List<RSSItemBean> getContent(Website website) throws Exception {
        String content;
        List<RSSItemBean> rssList = getRss(website.getUrl());
        FindHtml findHtml = new FindHtml(website.getStartTag(), website.getEndTag(), website.getEncoding());
        for (RSSItemBean rsItem : rssList) {
            String link = rsItem.getUri();
            content = findHtml.getContent(link);    // key call: extract the article body
            content = processImages(content);       // download images and rewrite their src
            rsItem.setContent(content);
            rsItem.setFid(Integer.parseInt(website.getFid()));  // Discuz forum id, see part 3
        }
        return rssList;
    }
    /**
     * Strips <a> tags from the article body, keeping the link text.
     *
     * @param input html fragment
     * @return the fragment without anchor tags
     */
    private String removeLinks(String input) {
        String output = input;
        // regex for opening <a ...> tags
        String regEx = "<a[^>]*>";
        Pattern p = Pattern.compile(regEx, Pattern.CASE_INSENSITIVE);
        Matcher m = p.matcher(input);
        output = m.replaceAll("");
        // regex for closing </a> tags
        regEx = "</a>";
        p = Pattern.compile(regEx, Pattern.CASE_INSENSITIVE);
        m = p.matcher(output);
        output = m.replaceAll("");
        return output;
    }
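    // Note: removeLinks is not invoked anywhere in the code shown here; presumably the
    // original pipeline applies it to the extracted body before setContent, e.g.:
    //     content = removeLinks(content);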
    /**
     * Downloads the images referenced in the article and rewrites each
     * <img> src to point at the local copy.
     *
     * @param input html fragment
     * @return the fragment with rewritten image paths
     */
    private String processImages(String input) {
        String output = input;
        String regEx = "<img[^>]*>";
        Pattern p = Pattern.compile(regEx, Pattern.CASE_INSENSITIVE);
        Matcher m = p.matcher(input);
        List<String> imgs = new ArrayList<String>();
        // collect all <img> tags
        while (m.find()) {
            imgs.add(m.group());
        }
        // save each image locally and replace the src value of its tag
        for (String img : imgs) {
            int begin = -1;
            int end = -1;
            String path = "";
            if (img.indexOf("src=\"") != -1) {          // src in double quotes
                begin = img.indexOf("src=\"");
                path = img.substring(begin + 5);
                end = path.indexOf("\"");
                if (end != -1) {
                    path = path.substring(0, end);
                } else {
                    path = "";
                }
            }
            if (img.indexOf("src='") != -1) {           // src in single quotes
                begin = img.indexOf("src='");
                path = img.substring(begin + 5);
                end = path.indexOf("'");
                if (end != -1) {
                    path = path.substring(0, end);
                } else {
                    path = "";
                }
            }
            if (!path.equals("")) {
                String filepath = writeToFile(path);
                filepath = filepath.replace('\\', '/');   // normalize Windows separators
                // use replace, not replaceAll: the url contains regex metacharacters like '.'
                output = output.replace(path, filepath);
            }
        }
        return output;
    }
    /**
     * Downloads an image and saves it under the configured image directory.
     *
     * @param path original image url
     * @return local (relative) image path, or "" on failure
     */
    public String writeToFile(String path) {
        String dirName = "";
        String fileName = "";
        File directory = null;
        File file = null;
        try {
            // derive the image format from the file extension
            int begin = path.lastIndexOf(".");
            String suffix = path.substring(begin + 1);
            if (suffix.contains("!")) {   // some sites append size info, e.g. jyijaktkyzkk.jpg!292x420
                int index = suffix.indexOf("!");
                suffix = suffix.substring(0, index);
            }
            // download the image
            URL url = new URL(path);
            BufferedImage image = ImageIO.read(url);
            dirName = CLASS_PATH;         // target directory
            directory = new File(dirName);
            if (!directory.exists()) {
                directory.mkdirs();
            }
            if (directory.exists()) {
                // a random UUID avoids file name collisions between articles
                String name = UUID.randomUUID() + "." + suffix;
                fileName = dirName + name;
                file = new File(fileName);
                FileOutputStream fos = new FileOutputStream(file);
                ImageIO.write(image, suffix, fos);
                fos.close();
                return relative_path + name;
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return "";
    }
}
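Putting the pieces together, a driver along these lines scrapes every enabled site. This is a minimal sketch; websites.xml and the getOpen/getName accessors are assumptions consistent with the classes above:
import java.util.List;

public class CrawlerMain {
    public static void main(String[] args) throws Exception {
        FeedReader reader = new FeedReader();
        // parse the site configuration, assumed to be websites.xml on the classpath
        List<Website> sites = new Dom4jUtil().parserXml("websites.xml");
        for (Website site : sites) {
            if (!"true".equals(site.getOpen())) {
                continue;   // skip sites that are switched off in the config
            }
            List<RSSItemBean> articles = reader.getContent(site);
            System.out.println(site.getName() + ": fetched " + articles.size() + " articles");
        }
    }
}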
A third article follows, covering how to save the scraped RSS articles into Discuz.