From Baidu Space to CSDN: Blog Migration Source Code

1. CSDN blogs expose the MetaWeblog API, which can be driven over XML-RPC. The code below shows how to publish a post through that API.

package cn.mingyuan.baidu2csdn.core;

import java.io.FileOutputStream;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;

import org.apache.xmlrpc.XmlRpcException;
import org.apache.xmlrpc.client.XmlRpcClient;
import org.apache.xmlrpc.client.XmlRpcClientConfigImpl;

/**
 * A CSDN blog post.
 *
 * @author [email protected]
 */
public class CSDNPost {

    /** Creation date of the post */
    private Date dateCreated;
    /** Post body */
    private String description;
    /** Title */
    private String title;
    /** Categories */
    private String[] categories;

    public CSDNPost() {
    }

    public CSDNPost(String title, String description, String[] categories, Date dateCreated) {
        this.dateCreated = dateCreated;
        this.description = description;
        this.title = title;
        this.categories = categories;
    }

    public Date getDateCreated() {
        return dateCreated;
    }

    public void setDateCreated(Date dateCreated) {
        this.dateCreated = dateCreated;
    }

    public String getDescription() {
        return description;
    }

    public void setDescription(String description) {
        this.description = description;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String[] getCategories() {
        return categories;
    }

    public void setCategories(String[] categories) {
        this.categories = categories;
    }

    /** XML-RPC configuration */
    private static XmlRpcClientConfigImpl config;
    /** XML-RPC client */
    private static XmlRpcClient client;

    static {
        config = new XmlRpcClientConfigImpl();
        try {
            // Replace "telnetor" with your own CSDN username.
            config.setServerURL(new URL("http://blog.csdn.net/telnetor/services/metablogapi.aspx"));
        } catch (MalformedURLException e) {
            System.out.println("Please check the URL");
        }
        client = new XmlRpcClient();
        client.setConfig(config);
    }

    /**
     * Append a line to the log file.
     *
     * @param log the message to log
     */
    private void writelog(String log) {
        FileOutputStream fos = null;
        try {
            fos = new FileOutputStream("post.log", true);
            fos.write((log + "\r\n").getBytes());
            fos.flush();
            fos.close();
        } catch (IOException e) {
            System.out.println("Failed to write log entry: " + log);
        }
    }

    /**
     * Publish the post via metaWeblog.newPost.
     */
    public void publish() {
        Map<String, Object> struct = new HashMap<String, Object>();
        struct.put("dateCreated", dateCreated);
        struct.put("description", description);
        struct.put("title", title);
        struct.put("categories", categories);
        // metaWeblog.newPost(blogid, username, password, struct, publish)
        Object[] params = new Object[] { "your blogid", "replace it with your username",
                "replace it with your password", struct, true };
        String blogid = null;
        try {
            blogid = (String) client.execute("metaWeblog.newPost", params);
        } catch (XmlRpcException e) {
            writelog("Import failed: title=" + title);
            System.out.println("Import failed: title=" + title);
        }
        writelog(title + " >> imported, new post id >> " + blogid);
        System.out.println(title + " >> imported, new post id >> " + blogid);
        struct.clear();
    }

    public static void main(String[] args) {
        CSDNPost post = new CSDNPost();
        post.publish();
    }
}

Note that categories is an array, even when a post has only one category.
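For reference, here is a minimal usage sketch. The title, body and category values are placeholders, and the credentials hard-coded inside CSDNPost still have to be filled in before running it:

package cn.mingyuan.baidu2csdn.core;

import java.util.Date;

public class PublishExample {

    public static void main(String[] args) {
        // Placeholder content; categories is always a String array, even for a single category.
        CSDNPost post = new CSDNPost(
                "Hello from the migration tool",     // title
                "<p>Post body as HTML.</p>",         // description
                new String[] { "Java" },             // categories
                new Date());                         // dateCreated
        post.publish();
    }
}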

2. Now that we know how to publish posts through the API, the next step is reading the post content from Baidu Space. First we define a BaiduHi class to hold the data read from the Baidu blog.

package cn.mingyuan.baidu2csdn.core;

import java.util.Date;

/**
 * A post read from Baidu Space.
 *
 * @author [email protected]
 */
public class BaiduHi {

    /** Title */
    private String title;
    /** Body */
    private String description;
    /** Category */
    private String categories;
    /** Publication date */
    private Date dateCreated;

    public BaiduHi() {
    }

    public BaiduHi(String title, String description, String categories, Date dateCreated) {
        this.title = title;
        this.description = description;
        this.categories = categories;
        this.dateCreated = dateCreated;
    }

    public String getTitle() {
        return title;
    }

    public String getDescription() {
        return description;
    }

    public String getCategories() {
        return categories;
    }

    public Date getDateCreated() {
        return dateCreated;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public void setDescription(String description) {
        this.description = description;
    }

    public void setCategories(String categories) {
        this.categories = categories;
    }

    public void setDateCreated(Date dateCreated) {
        this.dateCreated = dateCreated;
    }
}

3. Reading the Baidu Space posts. Note that the program reads Baidu Space as an anonymous visitor, so any private posts must be made public before they can be read. Also, my Baidu Space uses the "80後青春" template; if yours uses a different template, the HTML source may differ and parsing may fail. To use this program as-is, switch to the same template (under Space Decoration -> Layout, it is the one in the second row, second column).

package cn.mingyuan.baidu2csdn.core;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Stack;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Downloads and parses Baidu Space blog pages.
 *
 * @author [email protected]
 */
public class BaiduHiFetcher {

    /**
     * Download a page.
     *
     * @param url the page URL
     * @return the page source
     */
    private String downloadPage(String url) {
        URLConnection conn;
        InputStream in;
        BufferedReader reader = null;
        StringBuilder sb = new StringBuilder();
        String line = null;
        try {
            conn = new URL(url).openConnection();
            in = conn.getInputStream();
            reader = new BufferedReader(new InputStreamReader(in, "gb2312"));
            while ((line = reader.readLine()) != null) {
                sb.append(line);
            }
            in.close();
            reader.close();
        } catch (MalformedURLException e) {
            System.out.println("Please check that the URL is well-formed");
        } catch (IOException e) {
            System.out.println("Failed to read page source: " + url);
        }
        return sb.toString();
    }

    /**
     * Extract the post links from an index page.
     *
     * @param html the page source
     * @return the post links found on the page
     */
    private List<String> getPostLinks(String html) {
        // Find the title divs, which contain the links to the individual posts.
        String titleDivRegex = "<div[\\s]class=\"tit\"><a[\\s]href=[^<>]+?target=\"_blank\">.+?</div>";
        Pattern titleDivPattern = Pattern.compile(titleDivRegex);
        Matcher titleDivMatcher = titleDivPattern.matcher(html);
        List<String> posts = new ArrayList<String>();
        while (titleDivMatcher.find()) {
            String div = titleDivMatcher.group();
            String titleUrl = div.substring(div.indexOf("/"), div.indexOf("\" target"));
            posts.add("http://hi.baidu.com" + titleUrl);
        }
        return posts;
    }

    /**
     * <p>Get the total number of index pages.
     * <br>My blog has 16 index pages, so markers such as previous page, next page and
     * last page (尾页) are present. If you have few posts these markers may be missing;
     * adjust this method accordingly.
     *
     * @param html the page source (ideally of the first index page)
     * @return the total number of index pages
     */
    private int getTotalPages(String html) {
        // Example marker:
        // <a href="/cnjsp/blog/index/16">[尾页]</a>
        String pageRegex = "<a[\\s]href=\"/cnjsp/blog/index/[\\d][\\d]\">\\[尾页\\]</a>";
        Pattern pagePattern = Pattern.compile(pageRegex);
        Matcher pageMatcher = pagePattern.matcher(html);
        String totalPagesStr = null;
        int pages = 0;
        if (pageMatcher.find()) {
            String pagelink = pageMatcher.group();
            totalPagesStr = pagelink.replaceAll("<a[\\s]href=\"/cnjsp/blog/index/", "")
                    .replaceAll("\">\\[尾页\\]</a>", "");
            pages = Integer.parseInt(totalPagesStr);
        }
        return pages;
    }

    /**
     * <p>Collect the URLs of all posts in the blog.
     * <br>The URLs are not encoded; if the blog address contains Chinese characters,
     * encode them yourself.
     *
     * @param blogUrl the blog address
     * @return all post URLs on a stack; pop them off so that the earliest posts are
     *         processed first
     */
    public Stack<String> getAllPostLink(String blogUrl) {
        Stack<String> posts = new Stack<String>();
        // 1. Download the first index page.
        String firstPageHtml = downloadPage(blogUrl + "/blog/index/0");
        // 2. Determine the total number of index pages.
        int totalPages = getTotalPages(firstPageHtml);
        // 3. Download the remaining index pages and collect the post links.
        posts.addAll(getPostLinks(firstPageHtml));
        if (totalPages < 1) {
            return posts;
        }
        for (int i = 1; i <= totalPages; i++) {
            String page = downloadPage(blogUrl + "/blog/index/" + i);
            posts.addAll(getPostLinks(page));
        }
        return posts;
    }

    /**
     * Parse a post page and extract the title, publication date, body and category.
     *
     * @param postUrl the post URL
     * @return a BaiduHi holding the post data
     */
    public BaiduHi getBaiduHi(String postUrl) {
        String html = downloadPage(postUrl);
        // Title: the <div class="tit"> inside the m_blog container.
        String titleDivRegex = "<div[\\s]id=\"m_blog\"[\\s]class=\"modbox\"[\\s]style=\"overflow-x:hidden;\"><div[\\s]class=\"tit\">.+?</div><div[\\s]class=\"date\">";
        Pattern titleDivPattern = Pattern.compile(titleDivRegex);
        Matcher titleDivMatcher = titleDivPattern.matcher(html);
        String title = null;
        if (titleDivMatcher.find()) {
            title = titleDivMatcher.group()
                    .replaceAll("<div[\\s]id=\"m_blog\"[\\s]class=\"modbox\"[\\s]style=\"overflow-x:hidden;\"><div[\\s]class=\"tit\">", "")
                    .replaceAll("</div><div[\\s]class=\"date\">", "")
                    .trim();
        }
        // Publication date.
        String dateDivRegex = "<div[\\s]class=\"date\">.+?</div>";
        Pattern dateDivPattern = Pattern.compile(dateDivRegex);
        Matcher dateMatcher = dateDivPattern.matcher(html);
        String dateStr = null;
        Date postDate = null;
        if (dateMatcher.find()) {
            dateStr = dateMatcher.group()
                    .replaceAll("<div[\\s]class=\"date\">", "")
                    .replaceAll("</div>", "")
                    .trim();
            postDate = getDate(dateStr);
        }
        // Post body.
        String textDivRegex = "<div[\\s]id=\"blog_text\"[\\s]class=\"cnt\"[\\s]+>.+?</div>";
        Pattern textDivPattern = Pattern.compile(textDivRegex);
        Matcher textMatcher = textDivPattern.matcher(html);
        String text = null;
        if (textMatcher.find()) {
            text = textMatcher.group()
                    .replaceAll("<div[\\s]id=\"blog_text\"[\\s]class=\"cnt\"[\\s]+>", "")
                    .replaceAll("</div>", "")
                    .trim();
        }
        // Category.
        String categoriesRegex = "title=\"查看该分类中所有文章\">类别:.+?</a>";
        Pattern categoriesDivPattern = Pattern.compile(categoriesRegex);
        Matcher categoriesMatcher = categoriesDivPattern.matcher(html);
        String categories = null;
        if (categoriesMatcher.find()) {
            categories = categoriesMatcher.group()
                    .replaceAll("title=\"查看该分类中所有文章\">类别:", "")
                    .replaceAll("</a>", "")
                    .trim();
        }
        BaiduHi hi = new BaiduHi();
        hi.setTitle(title);
        hi.setDescription(text);
        hi.setCategories(categories);
        hi.setDateCreated(postDate);
        return hi;
    }

    /**
     * Parse the date string used on the post page into a Date.
     *
     * @param str the date string from the post page (year, month, day, 上午/下午 and hh:mm)
     * @return the parsed Date
     */
    @SuppressWarnings("deprecation")
    private Date getDate(String str) {
        String yearStr = str.substring(0, str.indexOf("年")).trim();
        String monthStr = str.substring(str.indexOf("年"), str.indexOf("月")).replace("年", "").trim();
        String dayStr = str.substring(str.indexOf("月"), str.indexOf("日")).replace("月", "").trim();
        String timeStr = str.substring(str.indexOf("午")).replace("午", "").trim();
        String hourStr = timeStr.split(":")[0];
        String minutesStr = timeStr.split(":")[1];
        Date date = new Date();
        date.setYear(Integer.parseInt(yearStr) - 1900);
        date.setMonth(Integer.parseInt(monthStr) - 1);
        date.setDate(Integer.parseInt(dayStr));
        if (str.contains("下午")) {
            // "下午" means PM, so shift the hour by 12.
            date.setHours(Integer.parseInt(hourStr) + 12);
        } else {
            date.setHours(Integer.parseInt(hourStr));
        }
        date.setMinutes(Integer.parseInt(minutesStr));
        return date;
    }
}
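To see what the parser expects, here is a small, self-contained check of the post-link regex used in getPostLinks. The sample HTML is an assumption reconstructed from that regex, not copied from a live Baidu page:

package cn.mingyuan.baidu2csdn.core;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class TitleRegexCheck {

    public static void main(String[] args) {
        // Hypothetical fragment of the "80後青春" index page markup.
        String sample = "<div class=\"tit\">"
                + "<a href=\"/cnjsp/blog/item/abc123.html\" target=\"_blank\">Sample post title</a>"
                + "</div>";
        Pattern p = Pattern.compile(
                "<div[\\s]class=\"tit\"><a[\\s]href=[^<>]+?target=\"_blank\">.+?</div>");
        Matcher m = p.matcher(sample);
        if (m.find()) {
            String div = m.group();
            // Same extraction as getPostLinks: everything between the first "/" and '" target'.
            String path = div.substring(div.indexOf("/"), div.indexOf("\" target"));
            System.out.println("matched, extracted path: " + path); // /cnjsp/blog/item/abc123.html
        } else {
            System.out.println("no match - the template markup probably differs");
        }
    }
}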

4. We can now write to the CSDN blog and read and parse Baidu Space posts. The last step is to wire the two together and run the import.

package cn.mingyuan.baidu2csdn.core;

import java.util.Stack;

/**
 * Drives the migration: fetch from Baidu Space, publish to CSDN.
 *
 * @author [email protected]
 */
public class Transfer {

    public static void main(String[] args) {
        String postUrl = "http://hi.baidu.com/cnjsp";
        BaiduHiFetcher fetcher = new BaiduHiFetcher();
        Stack<String> urls = fetcher.getAllPostLink(postUrl);
        while (!urls.isEmpty()) {
            String url = urls.pop();
            BaiduHi hi = fetcher.getBaiduHi(url);
            CSDNPost post = new CSDNPost();
            post.setTitle(hi.getTitle());
            post.setDescription(hi.getDescription());
            post.setCategories(new String[] { hi.getCategories() });
            post.setDateCreated(hi.getDateCreated());
            post.publish();
            try {
                // Sleep for 5 seconds so CSDN does not block us for posting too quickly.
                Thread.sleep(5 * 1000);
            } catch (InterruptedException e) {
                System.out.println("Sleep interrupted");
            }
        }
    }
}

5. At this point, if everything went smoothly, all of your posts should have been imported into the CSDN blog. If something went wrong along the way, such as a network timeout or CSDN temporarily blocking you for posting too frequently, you must either delete the posts that were already imported or skip them before running the import again, otherwise the CSDN blog will end up with duplicates. Below is a utility for deleting existing CSDN posts by id:

package cn.mingyuan.baidu2csdn.core;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;

import org.apache.xmlrpc.XmlRpcException;
import org.apache.xmlrpc.client.XmlRpcClient;
import org.apache.xmlrpc.client.XmlRpcClientConfigImpl;

public class DeletePostById {

    private static XmlRpcClientConfigImpl config;
    private static XmlRpcClient client;

    static {
        config = new XmlRpcClientConfigImpl();
        try {
            // Replace "telnetor" with your own CSDN username.
            config.setServerURL(new URL("http://blog.csdn.net/telnetor/services/metablogapi.aspx"));
        } catch (MalformedURLException e) {
            System.out.println("Please check the URL");
        }
        client = new XmlRpcClient();
        client.setConfig(config);
    }

    /**
     * Delete a post.
     *
     * @param appkey   the app key; any value works, it is ignored by the server
     * @param postid   the post id
     * @param username the username
     * @param password the password
     * @param publish  whether the blog should be republished after the post is deleted
     */
    public static void delete(String appkey, String postid, String username, String password, boolean publish) {
        Object[] params = new Object[] { appkey, postid, username, password, publish };
        try {
            client.execute("blogger.deletePost", params);
        } catch (XmlRpcException e) {
            System.out.println("Delete failed, postid=" + postid);
        }
        System.out.println(postid + " deleted");
    }

    public static void main(String[] args) throws InterruptedException {
        BufferedReader reader = null;
        String line;
        try {
            // The "content" file holds the log lines written during the import,
            // one line per post, in the format produced by CSDNPost.writelog.
            reader = new BufferedReader(new InputStreamReader(new FileInputStream("content")));
            while ((line = reader.readLine()) != null) {
                // Everything after the marker is the post id.
                line = line.split("new post id >> ")[1];
                delete("ignored", line, "your username", "your password", true);
                Thread.sleep(1000 * 10);
            }
        } catch (FileNotFoundException e1) {
            System.out.println("File not found");
        } catch (IOException e) {
            System.out.println("Failed to read the file");
        }
    }
}
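If the import log is no longer available, another option is to ask the server for the ids of recently created posts. The sketch below uses the standard metaWeblog.getRecentPosts method; whether CSDN's endpoint implements it has not been verified here, and the username, password and post count are placeholders:

package cn.mingyuan.baidu2csdn.core;

import java.net.URL;
import java.util.Map;

import org.apache.xmlrpc.client.XmlRpcClient;
import org.apache.xmlrpc.client.XmlRpcClientConfigImpl;

public class ListRecentPosts {

    public static void main(String[] args) throws Exception {
        XmlRpcClientConfigImpl config = new XmlRpcClientConfigImpl();
        // Replace "telnetor" with your own CSDN username.
        config.setServerURL(new URL("http://blog.csdn.net/telnetor/services/metablogapi.aspx"));
        XmlRpcClient client = new XmlRpcClient();
        client.setConfig(config);

        // metaWeblog.getRecentPosts(blogid, username, password, numberOfPosts)
        Object[] params = new Object[] { "your blogid", "your username", "your password", 50 };
        Object[] posts = (Object[]) client.execute("metaWeblog.getRecentPosts", params);
        for (Object o : posts) {
            Map<?, ?> entry = (Map<?, ?>) o;
            // Per the MetaWeblog spec each struct carries at least postid, title and dateCreated.
            System.out.println(entry.get("postid") + "  " + entry.get("title"));
        }
    }
}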

6. With a bit of patience, the import should now be complete.

Known limitations of this program:

  • Images are not imported.
  • The program is tailored to my own blog; cases such as Chinese characters in URLs or other Baidu templates are not handled. Adapt it to your own situation before using it.
  • A few posts fail to import. Out of my 130-odd posts, 4 failed; since the number is small, they can be re-posted by hand.
  • Speed. CSDN temporarily blocks overly frequent requests, so the program sleeps for 5 seconds after importing each post, which slows things down. If you have many posts on Baidu, add a sleep when fetching from Baidu as well to avoid being blocked; a minimal throttling sketch follows this list.
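As a sketch of the throttling mentioned above (not part of the original program), a small helper that enforces a minimum interval between remote calls could look like this:

package cn.mingyuan.baidu2csdn.core;

/**
 * Enforces a minimum interval between consecutive remote calls.
 */
public class Throttle {

    private final long intervalMillis;
    private long lastCall = 0L;

    public Throttle(long intervalMillis) {
        this.intervalMillis = intervalMillis;
    }

    /** Block until at least intervalMillis has passed since the previous call. */
    public synchronized void await() {
        long wait = lastCall + intervalMillis - System.currentTimeMillis();
        if (wait > 0) {
            try {
                Thread.sleep(wait);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
            }
        }
        lastCall = System.currentTimeMillis();
    }
}

Calling throttle.await() before each downloadPage or publish call keeps requests to both Baidu and CSDN spaced out without sprinkling Thread.sleep throughout the code.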

7. The library used is Apache XML-RPC; you can download the latest version here. The CSDN blog API can be found here. The parameters required by the posting and deleting calls used in this program are all described in detail in the API documentation.
