本程序可以完成的工作:转移csdn上面的文章(限于文本内容)到wordpress;不能完成的工作:1、不支持在wordpress上创建分类,所以需要提前在wordpress上手工创建分类(保持与csdn一致);2、不能以很好的格式转移文章,转移之后文章格式需要调整。
程序由采集、解析、发帖三部分构成。采集负责将指定url的内容下载下来,解析负责从网页内容中解析出正文链接、标题、发布时间、分类信息,发帖部分负责将解析出来的数据通过rpc发送给wordpress,生成博文。
本程序用到的jar包及其版本如下:
-rw-r--r-- 1 mingyuan mingyuan 46725 2011-09-03 23:05 commons-codec-1.3.jar
-rw-r--r-- 1 mingyuan mingyuan 279781 2011-09-03 23:05 commons-httpclient-3.0.1.jar
-rwxrwxrwx 1 mingyuan mingyuan 52915 2010-05-03 03:39 commons-logging-1.1.jar
-rw-r--r-- 1 mingyuan mingyuan 281579 2011-09-04 01:40 jsoup-1.6.1.jar
-rwxrwxrwx 1 mingyuan mingyuan 34407 2010-05-03 03:39 ws-commons-util-1.0.2.jar
-rwxrwxrwx 1 mingyuan mingyuan 58573 2010-05-03 03:39 xmlrpc-client-3.1.3.jar
-rwxrwxrwx 1 mingyuan mingyuan 109131 2010-05-03 03:39 xmlrpc-common-3.1.3.jar
-rwxrwxrwx 1 mingyuan mingyuan 81555 2010-05-03 03:39 xmlrpc-server-3.1.3.jar
代码很简单,就不解释了,大伙看看即可明白。程序的入口函数是Mover.main
下面先给出主要的类Mover.java
- package cn.mingyuan.csdn2wordpress;
- import java.io.IOException;
- import java.net.MalformedURLException;
- import java.net.URL;
- import java.text.ParseException;
- import java.text.SimpleDateFormat;
- import java.util.Date;
- import java.util.HashMap;
- import java.util.LinkedList;
- import java.util.List;
- import java.util.Map;
- import java.util.concurrent.TimeUnit;
- import org.apache.xmlrpc.XmlRpcException;
- import org.apache.xmlrpc.client.XmlRpcClient;
- import org.apache.xmlrpc.client.XmlRpcClientConfigImpl;
- import org.jsoup.Jsoup;
- import org.jsoup.nodes.Document;
- import org.jsoup.nodes.Element;
- import org.jsoup.select.Elements;
- /**
- * 采集、解析、转移
- *
- * @author mingyuan
- *
- */
- public class Mover {
- private int totalPages;
- private XmlRpcClientConfigImpl config;
- private XmlRpcClient client;
- private String baseUrl;
- private Object userName;
- private Object password;
- private String csdnUserName;
- public Mover(int totalPages, String blogRpcUrl, String csdnUrl, String csdnUserName, String userName,
- String password) {
- this.totalPages = totalPages;
- this.baseUrl = csdnUrl;
- this.csdnUserName = csdnUserName;
- this.userName = userName;
- this.password = password;
- config = new XmlRpcClientConfigImpl();
- try {
- config.setServerURL(new URL(blogRpcUrl));
- } catch (MalformedURLException e) {
- System.out.println(“请检查url”);
- }
- client = new XmlRpcClient();
- client.setConfig(config);
- }
- private List<String> getlinks() {
- List<String> list = new LinkedList<String>();
- for (int i = 1; i <= totalPages; i++) {
- System.out.println(“processing page ” + i);
- Downloader downloader = new Downloader();
- String content = downloader.download(baseUrl + “/” + csdnUserName + “/article/list/” + i);
- if (content == null)
- continue;
- Document doc = Jsoup.parse(content);
- Elements first = doc.select(“.link_title”);
- for (int j = 0; j < first.size(); j++) {
- Element first2 = first.get(j).select(“a”).first();
- String link = baseUrl + first2.attr(“href”);
- list.add(link);
- System.out.println(“get link\t” + link);
- }
- System.out.println(“page ” + i + “ extractor done,sleep 2s”);
- try {
- TimeUnit.SECONDS.sleep(1);
- } catch (InterruptedException e) {
- e.printStackTrace();
- }
- }
- return list;
- }
- public List<CSDNPost> getPosts() {
- List<String> links = getlinks();
- List<CSDNPost> posts = new LinkedList<CSDNPost>();
- for (String link : links) {
- CSDNPost post = getPost(link);
- if (post != null) {
- posts.add(post);
- }
- }
- return posts;
- }
- private CSDNPost getPost(String url) {
- System.out.println(“url\t” + url);
- Downloader downloader = new Downloader();
- String html = downloader.download(url);
- if (html == null)
- return null;
- Document doc = Jsoup.parse(html);
- String title = doc.select(“.article_title”).first().text();
- String categroy = “Uncategorized”;
- Elements link_categories = doc.select(“.article_manage .link_categories”);
- if (link_categories != null) {
- Element first = link_categories.first();
- if (first != null) {
- Elements href = first.select(“a”);
- if (href != null) {
- categroy = href.text();
- }
- }
- }
- String postdate = doc.select(“.article_manage .link_postdate”).first().text();
- String content = doc.select(“.details .article_content”).first().text();
- SimpleDateFormat sdf = new SimpleDateFormat(“yyyy-MM-dd HH:mm”);
- CSDNPost post = new CSDNPost();
- post.setCategories(new String[] { categroy });
- post.setTitle(title);
- try {
- post.setDateCreated(sdf.parse(postdate));
- } catch (ParseException e) {
- post.setDateCreated(new Date());
- }
- post.setDescription(content);
- return post;
- }
- public void publish(CSDNPost post) {
- Map<String, Object> struct = new HashMap<String, Object>();
- struct.put(“dateCreated”, post.getDateCreated());
- struct.put(“description”, post.getDescription());
- struct.put(“title”, post.getTitle());
- struct.put(“categories”, post.getCategories());
- Object[] params = new Object[] { userName, userName, password, struct, true };
- String blogid = null;
- try {
- blogid = (String) client.execute(“metaWeblog.newPost”, params);
- } catch (XmlRpcException e) {
- e.printStackTrace();
- System.out.println(“导入出现错误:title=” + post.getTitle());
- }
- System.out.println(post.getTitle() + “>> 导入完毕,生成博文id为>>” + blogid);
- struct.clear();
- }
- public static void main(String[] args) throws IOException {
- Mover extractor = new Mover(19, “http://youthmemo.com/xmlrpc.php”, “http://blog.csdn.net”, “telnetor”, “admin”,
- “xxxx”);
- List<CSDNPost> posts = extractor.getPosts();
- for (CSDNPost post : posts) {
- extractor.publish(post);
- try {
- TimeUnit.SECONDS.sleep(1);
- } catch (InterruptedException e) {
- e.printStackTrace();
- }
- System.out.println(post.getTitle());
- }
- System.out.println(“done!”);
- }
- }
下面给出下载类Downloader.java
- package cn.mingyuan.csdn2wordpress;
- import java.io.BufferedReader;
- import java.io.IOException;
- import java.io.InputStreamReader;
- import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler;
- import org.apache.commons.httpclient.HttpClient;
- import org.apache.commons.httpclient.HttpException;
- import org.apache.commons.httpclient.HttpMethod;
- import org.apache.commons.httpclient.HttpStatus;
- import org.apache.commons.httpclient.cookie.CookiePolicy;
- import org.apache.commons.httpclient.methods.GetMethod;
- import org.apache.commons.httpclient.params.HttpClientParams;
- import org.apache.commons.httpclient.params.HttpMethodParams;
- /**
- * downloader
- *
- * @author mingyuan
- *
- */
- public class Downloader {
- private HttpClientParams params = null;
- private HttpClient client = null;
- /**
- * 默认构造函数,初始化一系列变量
- */
- public Downloader() {
- // 构造HttpClientParams参数
- params = new HttpClientParams();
- params.setParameter(
- HttpClientParams.USER_AGENT,
- “Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.2.3) Gecko/20100401 Firefox/3.6.3 GTBDFff GTB7.0 (.NET CLR 3.5.30729)”);
- params.setParameter(HttpClientParams.ALLOW_CIRCULAR_REDIRECTS, false);
- params.setParameter(HttpClientParams.MAX_REDIRECTS, 4);
- params.setParameter(HttpClientParams.CONNECTION_MANAGER_TIMEOUT, (long) 60 * 1000);
- params.setParameter(HttpClientParams.SO_TIMEOUT, 60 * 1000);
- // 使用系统提供的默认的恢复策略
- params.setParameter(HttpMethodParams.RETRY_HANDLER, new DefaultHttpMethodRetryHandler());
- client = new HttpClient(params);
- }
- /**
- * 下载网页
- *
- * @param url
- * 网页url
- * @return String类型的网页源码
- */
- public String download(String url) {
- HttpMethod method = new GetMethod(url);
- String sourceCode = null;
- method.getParams().setCookiePolicy(CookiePolicy.BROWSER_COMPATIBILITY);
- // 读取内容
- StringBuilder builder = new StringBuilder();
- BufferedReader reader = null;
- try {
- int statusCode = client.executeMethod(method);
- if (statusCode != HttpStatus.SC_OK) {
- return null;
- }
- reader = new BufferedReader(new InputStreamReader(method.getResponseBodyAsStream(), “utf8″));
- String line;
- while ((line = reader.readLine()) != null) {
- builder.append(line + “\r\n”);
- }
- sourceCode = builder.toString();
- } catch (HttpException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- } finally {
- try {
- reader.close();
- } catch (IOException e) {
- e.printStackTrace();
- }
- // 释放连接
- method.releaseConnection();
- client.getHttpConnectionManager().closeIdleConnections(0);
- }
- return sourceCode;
- }
- }
最后给出一个pojo类CSDNPost.java
- package cn.mingyuan.csdn2wordpress;
- import java.util.Date;
/**
 * A blog post collected from CSDN: title, content, categories and creation date.
 * <p>
 * The creation date and the category array are copied on the way in and on the
 * way out, so changes a caller makes to its own {@code Date} or array do not
 * alter the post, and vice versa. {@code null} values are accepted and
 * returned unchanged.
 *
 * @author mingyuan
 */
public class CSDNPost {
    /**
     * Creation date of the post
     */
    private Date dateCreated;
    /**
     * Content of the post
     */
    private String description;
    /**
     * Title
     */
    private String title;
    /**
     * Categories of the post
     */
    private String[] categories;

    /**
     * Creates an empty post; all properties are {@code null} until set.
     */
    public CSDNPost() {
    }

    /**
     * Creates a fully populated post.
     *
     * @param title       title of the post
     * @param description content of the post
     * @param categories  category names, may be {@code null}
     * @param dateCreated creation date, may be {@code null}
     */
    public CSDNPost(String title, String description, String[] categories, Date dateCreated) {
        this.dateCreated = copyOf(dateCreated);
        this.description = description;
        this.title = title;
        this.categories = copyOf(categories);
    }

    /**
     * @return a copy of the creation date, or {@code null} if none is set
     */
    public Date getDateCreated() {
        return copyOf(dateCreated);
    }

    /**
     * @param dateCreated creation date, may be {@code null}
     */
    public void setDateCreated(Date dateCreated) {
        this.dateCreated = copyOf(dateCreated);
    }

    /**
     * @return content of the post
     */
    public String getDescription() {
        return description;
    }

    /**
     * @param description content of the post
     */
    public void setDescription(String description) {
        this.description = description;
    }

    /**
     * @return title of the post
     */
    public String getTitle() {
        return title;
    }

    /**
     * @param title title of the post
     */
    public void setTitle(String title) {
        this.title = title;
    }

    /**
     * @return a copy of the category names, or {@code null} if none are set
     */
    public String[] getCategories() {
        return copyOf(categories);
    }

    /**
     * @param categories category names, may be {@code null}
     */
    public void setCategories(String[] categories) {
        this.categories = copyOf(categories);
    }

    /** Returns an independent copy of the date, or {@code null} for {@code null}. */
    private static Date copyOf(Date source) {
        return source == null ? null : new Date(source.getTime());
    }

    /** Returns an independent copy of the array, or {@code null} for {@code null}. */
    private static String[] copyOf(String[] source) {
        return source == null ? null : source.clone();
    }
}
以上是全部源码。
在文章的结尾,我愿意跟大家分享一下这个小程序的开发心得。
一开始写这个程序的时候,觉得会很快搞定,因为这个程序无非就是三个过程:采集、解析、发帖。其实也真是这样的一个过程。
这个程序耗费精力比较多的地方是在解析网页、提取链接、标题、内容、发布时间、分类方面。
一开始想用xpath解析网页,并且写好的xpath表达式都已经在chrome上通过xpath helper插件验证通过了。但在编码阶段发现现有的工具包(比如dom4j)并不支持对html的解析,网上看到有通过htmlparser将html转换成xml的方法,但觉得太麻烦。最后发现了JSoup这个非常强大的工具,它可以通过类似jquery和css选择器语法的表达式来提取内容。尝试了一下非常方便,于是解析这个问题就解决了(有个小窍门:chrome浏览器开发者工具可以看某节点的css样式,把这个样式直接传递给jsoup就能提取内容)。
wordpress支持MetaWeblog协议,可以通过XML-RPC进行发帖。关于它们的信息可以通过以下链接找到:
http://en.wikipedia.org/wiki/MetaWeblog
http://en.wikipedia.org/wiki/XML-RPC (可以找到各种语言版本的api)
另外JSoup的地址是:
http://jsoup.org/
程序写得太匆忙,肯定有很多不尽如人意的地方,希望各位指出。我的联系方式是:admin#youthmemo.com。