1.CSDN的博客使用的是MetaWeblog Api,可以使用xml-rpc进行操作。下面的代码演示了怎样使用api发布博客
package cn.mingyuan.baidu2csdn.core; import java.io.FileOutputStream; import java.io.IOException; import java.net.MalformedURLException; import java.net.URL; import java.util.Date; import java.util.HashMap; import java.util.Map; import org.apache.xmlrpc.XmlRpcException; import org.apache.xmlrpc.client.XmlRpcClient; import org.apache.xmlrpc.client.XmlRpcClientConfigImpl; /** * csdn博文 * @author [email protected] * */ public class CSDNPost { /** * 博文创建日期 */ private Date dateCreated; /** * 博文内容 */ private String description; /** * 标题 */ private String title; /** * 博文分类 */ private String[] categories; public CSDNPost(){ } public CSDNPost(String title, String description, String[] categories, Date dateCreated) { this.dateCreated = dateCreated; this.description = description; this.title = title; this.categories = categories; } public Date getDateCreated() { return dateCreated; } public void setDateCreated(Date dateCreated) { this.dateCreated = dateCreated; } public String getDescription() { return description; } public void setDescription(String description) { this.description = description; } public String getTitle() { return title; } public void setTitle(String title) { this.title = title; } public String[] getCategories() { return categories; } public void setCategories(String[] categories) { this.categories = categories; } /** * xml-rpc配置 */ private static XmlRpcClientConfigImpl config; /** * xml-rpcClient */ private static XmlRpcClient client; static{ config = new XmlRpcClientConfigImpl(); try { //此处请将telnetor替换为您的用户名 config.setServerURL(new URL("http://blog.csdn.net/telnetor/services/metablogapi.aspx")); } catch (MalformedURLException e) { System.out.println("请检查url"); } client = new XmlRpcClient(); client.setConfig(config); } /** * 日志记录 * @param log log */ private void writelog(String log){ FileOutputStream fos=null; try { fos=new FileOutputStream("post.log",true); fos.write((log+"/r/n").getBytes()); fos.flush(); fos.close(); } catch (IOException e) { 
System.out.println("写入日志错误:"+log); } } /** * 发布 */ public void publish(){ Map<String, Object> struct = new HashMap<String, Object>(); struct.put("dateCreated", dateCreated); struct.put("description", description); struct.put("title", title); struct.put("categories", categories); Object[] params = new Object[] { "your usrname", "replace it with your username", "replace it with your password", struct, true }; String blogid = null; try { blogid = (String) client.execute("metaWeblog.newPost", params); } catch (XmlRpcException e) { writelog("导入出现错误:title="+title); System.out.println("导入出现错误:title="+title); } writelog(title + ">> 导入完毕,生成博文id为>>" + blogid); System.out.println(title + ">> 导入完毕,生成博文id为>>" + blogid); struct.clear(); } public static void main(String[] args){ CSDNPost post=new CSDNPost(); post.publish(); } }
其中需要注意的是categories是一个数组。
2.知道了怎样使用api发布博客之后,我们就该进行下一步:读取百度空间的博文内容了。首先定义一个BaiduHi的class,用来存放从百度博客读取出来的数据。
package cn.mingyuan.baidu2csdn.core; import java.util.Date; /** * 百度博客 * @author [email protected] * */ public class BaiduHi { /** * 标题 */ private String title; /** * 内容 */ private String description; /** * 分类 */ private String categories; /** * 发布日期 */ private Date dateCreated; public String getTitle() { return title; } public String getDescription() { return description; } public String getCategories() { return categories; } public Date getDateCreated() { return dateCreated; } public void setTitle(String title) { this.title = title; } public void setDescription(String description) { this.description = description; } public void setCategories(String categories) { this.categories = categories; } public void setDateCreated(Date dateCreated) { this.dateCreated = dateCreated; } public BaiduHi(String title, String description, String categories, Date dateCreated) { this.title = title; this.description = description; this.categories = categories; this.dateCreated = dateCreated; } public BaiduHi() { // TODO Auto-generated constructor stub } /** * @param args */ public static void main(String[] args) { // TODO Auto-generated method stub } }
3.读取百度空间博文内容。注意:我们使用的是游客权限来读取百度空间的内容,如果有私密信息,需要将其公开之后程序才能读取。另外我使用的百度空间模板为“80後青春”,如果您使用的不是此模板,html源码可能会不同,解析就可能失败。如果使用本程序请保持和我的模板一致。版式为“空间装扮-版式”中显示出来的第二行第二列那个。
package cn.mingyuan.baidu2csdn.core; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.net.MalformedURLException; import java.net.URL; import java.net.URLConnection; import java.util.ArrayList; import java.util.Date; import java.util.List; import java.util.Stack; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * 百度博客数据抓取及解析 * @author [email protected] * */ public class BaiduHiFetcher { /** * 下载页面 * @param url url * @return 网页源码 */ private String downloadPage(String url) { URLConnection conn; InputStream in; BufferedReader reader = null; StringBuilder sb = new StringBuilder(); String line = null; try { conn = new URL(url).openConnection(); in = conn.getInputStream(); reader = new BufferedReader(new InputStreamReader(in, "gb2312")); while ((line = reader.readLine()) != null) { sb.append(line); } in.close(); reader.close(); } catch (MalformedURLException e) { System.out.println("请检查url是否规范"); } catch (IOException e) { System.out.println("读取源码错误:"+url); } return sb.toString(); } /** * 获取页面博文链接 * @param html 网页源码 * @return 页面中的博文链接 */ private List<String> getPostLinks(String html) { // 分析页面内容,取得页面中的文章链接 String titleDivRegex = "<div[//s]class=/"tit/"><a[//s]href=[^<>]+?target=/"_blank/">.+?</div>"; Pattern titleDivPattern = Pattern.compile(titleDivRegex); Matcher titleDivMatcher = titleDivPattern.matcher(html); List<String> posts = new ArrayList<String>(); while (titleDivMatcher.find()) { String div = titleDivMatcher.group(); String titleUrl = div.substring(div.indexOf("/"), div.indexOf("/" target")); posts.add("http://hi.baidu.com" + titleUrl); } return posts; } /** * <p>获取博客总页数 * <br>我的博客内容有16页,有上一页,下一页,尾页等这样的标志,如果博文少的话可能这些标志不会出现,请修改此方法 * @param html 源码(最好是第一页) * @return 博客总页数 */ private int getTotalPages(String html) { // 页码 // <a href="/cnjsp/blog/index/16" mce_href="cnjsp/blog/index/16">[尾页]</a> String pageRegex = 
"<a[//s]href=/"/cnjsp/blog/index/[//d][//d]/">//[尾页//]</a>"; Pattern pagePattern = Pattern.compile(pageRegex); Matcher pageMatcher = pagePattern.matcher(html); String totalPagesStr = null; int pages = 0; if (pageMatcher.find()) { String pagelink = pageMatcher.group(); totalPagesStr = pagelink.replaceAll("<a[//s]href=/"/cnjsp/blog/index/", "").replaceAll("/">//[尾页//]</a>", ""); pages = Integer.parseInt(totalPagesStr); } return pages; } /** * <p>获取博客的所有博文的地址 * <br>没有对url进行编码处理,如果博客地址含中文,请对url进行处理 * @param blogUrl 博客地址 * @return 所有博文地址,存放于栈中,使用的时候请使用pop方法取出元素,这样可以保证按照最先发表的博文最先处理 */ public Stack<String> getAllPostLink(String blogUrl){ Stack<String> posts = new Stack<String>(); // 1.下载第一页 String firstPageHtml = downloadPage(blogUrl + "/blog/index/0"); // 2.获取博文总页数 int totalPages = getTotalPages(firstPageHtml); // 3.下载各摘要页 posts.addAll(getPostLinks(firstPageHtml)); if (totalPages < 1) { return posts; } for (int i = 1; i <= totalPages; i++) { String page = downloadPage(blogUrl + "/blog/index/" + i); posts.addAll(getPostLinks(page)); } return posts; } /** * 解析博文,获取标题,发布时间,内容,分类等信息 * @param postUrl 博文地址 * @return 封装了博文信息的BaiduHi */ public BaiduHi getBaiduHi(String postUrl){ String html = downloadPage(postUrl); // /<div class="tit"> String titleDivRegex = "<div[//s]id=/"m_blog/"[//s]class=/"modbox/"[//s]style=/"overflow-x:hidden;/"><div[//s]class=/"tit/">.+?</div><div[//s]class=/"date/">"; Pattern titleDivPattern = Pattern.compile(titleDivRegex); Matcher titleDivMatcher = titleDivPattern.matcher(html); String title = null; if (titleDivMatcher.find()) { title = titleDivMatcher.group().replaceAll("<div[//s]id=/"m_blog/"[//s]class=/"modbox/"[//s]style=/"overflow-x:hidden;/"><div[//s]class=/"tit/">", "").replaceAll("</div><div[//s]class=/"date/">", "").trim(); } String dateDivRegex = "<div[//s]class=/"date/">.+?</div>"; Pattern dateDivPattern = Pattern.compile(dateDivRegex); Matcher dateMatcher = dateDivPattern.matcher(html); String dateStr = null; Date postDate = null; if 
(dateMatcher.find()) { dateStr = dateMatcher.group().replaceAll("<div[//s]class=/"date/">", "").replaceAll("</div>", "").trim(); postDate = getDate(dateStr); } String textDivRegex = "<div[//s]id=/"blog_text/"[//s]class=/"cnt/"[//s]+>.+?</div>"; Pattern textDivPattern = Pattern.compile(textDivRegex); Matcher textMatcher = textDivPattern.matcher(html); String text = null; if (textMatcher.find()) { text = textMatcher.group().replaceAll("<div[//s]id=/"blog_text/"[//s]class=/"cnt/"[//s]+>", "").replaceAll("</div>", "").trim(); } String categoriesRegex = "title=/"查看该分类中所有文章/">类别:.+?</a>"; Pattern categoriesDivPattern = Pattern.compile(categoriesRegex); Matcher categoriesMatcher = categoriesDivPattern.matcher(html); String categories = null; if (categoriesMatcher.find()) { categories = categoriesMatcher.group().replaceAll("title=/"查看该分类中所有文章/">类别:", "").replaceAll("</a>", "").trim(); } BaiduHi hi = new BaiduHi(); hi.setTitle(title); hi.setDescription(text); hi.setCategories(categories); hi.setDateCreated(postDate); return hi; } /** * 解析博文中的日期格式返回Date类型 * @param str 博文中的日期 * @return Date类型日期 */ @SuppressWarnings("deprecation") private Date getDate(String str) { String yearStr = str.substring(0, str.indexOf("年")).trim(); String monthStr = str.substring(str.indexOf("年"), str.indexOf("月")).replace("年", "").trim(); String dayStr = str.substring(str.indexOf("月"), str.indexOf("日")).replace("月", "").trim(); String timeStr = str.substring(str.indexOf("午")).replace("午", "").trim(); String hourStr = timeStr.split(":")[0]; String minutesStr = timeStr.split(":")[1]; Date date = new Date(); date.setYear(Integer.parseInt(yearStr) - 1900); date.setMonth(Integer.parseInt(monthStr) - 1); date.setDate(Integer.parseInt(dayStr)); if (str.contains("下午")) { date.setHours(Integer.parseInt(hourStr) + 12); } else { date.setHours(Integer.parseInt(hourStr)); } date.setMinutes(Integer.parseInt(minutesStr)); return date; } }
4.我们现在完成了写CSDN博客与读取并解析百度空间博文的工作。接下来需要把它们连起来,完成导入
package cn.mingyuan.baidu2csdn.core; import java.util.Stack; /** * 搬家 * @author [email protected] * */ public class Transfer { /** * @param args */ public static void main(String[] args) { // TODO Auto-generated method stub String postUrl = "http://hi.baidu.com/cnjsp"; BaiduHiFetcher fetcher = new BaiduHiFetcher(); Stack<String> urls = null; urls = fetcher.getAllPostLink(postUrl); while (!urls.isEmpty()) { String url = urls.pop(); BaiduHi hi = null; hi = fetcher.getBaiduHi(url); CSDNPost post = new CSDNPost(); post.setTitle(hi.getTitle()); post.setDescription(hi.getDescription()); post.setCategories(new String[] { hi.getCategories() }); post.setDateCreated(hi.getDateCreated()); post.publish(); try { Thread.sleep(5 * 1000); } catch (InterruptedException e) { System.out.println("休眠出错"); } } } }
5.至此如果一切顺利的话您的博文应该已经全部导入到CSDN博客了。如果出现了一些小问题,如网络超时,或操作太频繁被CSDN暂时封锁的话,在再次执行导入之前就需要将原来导入的数据删除,或者跳过已导入的博文,才能保证CSDN博文不会重复。下面是一个删除CSDN已有博文的方法:
package cn.mingyuan.baidu2csdn.core;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.xmlrpc.XmlRpcException;
import org.apache.xmlrpc.client.XmlRpcClient;
import org.apache.xmlrpc.client.XmlRpcClientConfigImpl;

/**
 * Deletes CSDN posts by id through the {@code blogger.deletePost} XML-RPC
 * method. The ids are read from the log lines of an earlier import.
 */
public class DeletePostById {

    /** Value sent in the appkey slot when the caller passes none. */
    private static final String DEFAULT_APPKEY = "ignored value";

    /** File holding the import log lines to process. */
    private static final String ID_FILE = "content";

    /**
     * Finds the post id in an import log line; group 1 is the id. The marker
     * may be followed by ">>" or by a colon (ASCII or full-width).
     */
    private static final Pattern POST_ID = Pattern.compile("生成博文id为(?:>>|:|:)\\s*(\\S+)");

    /** XML-RPC configuration shared by all calls. */
    private static XmlRpcClientConfigImpl config;

    /** XML-RPC client shared by all calls. */
    private static XmlRpcClient client;

    static {
        config = new XmlRpcClientConfigImpl();
        try {
            // Replace "telnetor" in the URL with your own CSDN user name.
            config.setServerURL(new URL("http://blog.csdn.net/telnetor/services/metablogapi.aspx"));
        } catch (MalformedURLException e) {
            System.out.println("请检查url");
        }
        client = new XmlRpcClient();
        client.setConfig(config);
    }

    /**
     * Deletes one post. The success message is printed only when the call
     * did not fail.
     *
     * @param appkey   application key; the original author notes the server
     *                 ignores it. {@code null} is replaced by a placeholder
     * @param postid   id of the post to delete
     * @param username user name
     * @param password password
     * @param publish  whether the blog is republished after the deletion
     */
    public static void delete(String appkey, String postid, String username, String password, boolean publish) {
        String key = appkey == null ? DEFAULT_APPKEY : appkey;
        Object[] params = new Object[] { key, postid, username, password, publish };
        try {
            client.execute("blogger.deletePost", params);
        } catch (XmlRpcException e) {
            System.out.println("删除出错,postid=" + postid);
            return;
        }
        System.out.println(postid + "删除完毕");
    }

    /**
     * Returns the post id contained in a log line, or null when the line has
     * no id (for example an error entry, or an entry whose id is "null").
     */
    private static String extractPostId(String line) {
        Matcher matcher = POST_ID.matcher(line);
        if (!matcher.find()) {
            return null;
        }
        String postid = matcher.group(1);
        return "null".equals(postid) ? null : postid;
    }

    /**
     * Deletes every post whose id appears in the file {@code content},
     * waiting ten seconds after each deletion.
     *
     * @param args ignored
     * @throws InterruptedException if the wait between deletions is interrupted
     */
    public static void main(String[] args) throws InterruptedException {
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new InputStreamReader(new FileInputStream(ID_FILE)));
            String line;
            while ((line = reader.readLine()) != null) {
                String postid = extractPostId(line);
                if (postid == null) {
                    continue;
                }
                delete("ignored", postid, "your username", "your password", true);
                Thread.sleep(1000 * 10);
            }
        } catch (FileNotFoundException e1) {
            System.out.println("文件没找到");
        } catch (IOException e) {
            System.out.println("读取文件失败");
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing fails.
                }
            }
        }
    }
}
6.好了,经过耐心操作我们的导入工作应该已经完成了。
本程序有不足的地方:
7.使用到的包为xml-rpc,您可以在这里下载到最新版本。CSDN博客API在这里可以找到。程序中涉及的发帖,删帖所需的参数均在api中有详细说明