Backing up a CSDN blog with Python:
Function: back up all of the given user's posts into a folder named after the user in the current directory (in HTML format).
Shortcomings: no multithreading yet, and hotlink protection is not handled (reposted articles usually save copies of images that have no hotlink protection, but it is still a gap); the code could also be tighter, e.g. the filename fix-up should be doable with a single matching pass…………
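For reference, hotlink protection is usually just a check on the Referer header, so passing the hosting page's URL along with the image request is often enough. A minimal sketch under that assumption (fetch_with_referer and its arguments are illustrative names, not part of the script):

import urllib2

def fetch_with_referer(imgurl, pageurl):
    # many CDNs only serve an image when the Referer looks like the hosting page
    req = urllib2.Request(imgurl, headers={'Referer': pageurl})
    return urllib2.urlopen(req).read()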
Thanks to 菜鱼, xiao, and the other veterans in the Python tech chat group for their guidance; the encoding problems had me stuck for quite a while. Feedback and criticism welcome.
————————
Added multithreading; I know about join now, heh.
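For anyone unfamiliar with it, the start-then-join pattern the script now uses looks like this minimal sketch (worker and the thread count of 5 are placeholders):

import threading

def worker():
    pass  # each thread would pull posts from the shared map until it is empty

threads = [threading.Thread(target=worker) for _ in range(5)]
for t in threads:
    t.start()   # all workers run concurrently
for t in threads:
    t.join()    # block here until every worker has finished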
A compiled exe version is available as a free (0-point) download at http://download.csdn.net/detail/betabin/4377512
————————
CSDN deleted the exe resource above. That I can accept, but not even a notice after the fact is a bit………
It seems to have been restored now……well…………
————————
Replaced the filename filtering with sub, cutting the redundant code.
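For illustration, a chain of replace calls collapses into one character-class sub; the pattern below mirrors the one in the script, with the backslash and whitespace cases folded in as well:

import re

BAD = re.compile(r'[\\\s\*\?;|<>&/:]')  # characters Windows rejects in file names, plus whitespace

def clean(name):
    return BAD.sub('', name)

print clean('a: b*c?')  # prints abc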
If you know Python, just run the script yourself.
# -*- coding: cp936 -*-
'''
Author: BetaBin
Date: 2012/06/16
Function: Backup the csdn blog.
'''

import urllib
import urllib2
import os
import re
import traceback
import threading
import datetime

# Global data
# map of post link (relative path) -> post title
blogurl = {}
savednum = 1
threadnum = 5
bloghost = 'http://blog.csdn.net'
bloguser = 'BetaBin'

# Fetch the HTML of infourl, decoded from utf8
def getinfo(infourl, hostsite=''):
    postdata = urllib.urlencode({})
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
    }
    if hostsite != '':
        print "hostsite: ", hostsite
        headers['Referer'] = hostsite
    req = urllib2.Request(url=infourl, data=postdata, headers=headers)
    urlfile = urllib2.urlopen(req)
    result = urlfile.read().decode('utf8')
    urlfile.close()
    return result

# Collect every post link on the page at url, then recurse into later pages
def backup(url):
    result = getinfo(url)

    # post links on the current page
    titlepatt = """class="link_title"><a href="(.*?)">[\\s]+(.*?)[\\s]+</a>"""
    retitle = re.compile(titlepatt)
    blogs = retitle.findall(result)
    for blog in blogs:
        blogurl[blog[0]] = blog[1]

    # link to the next page; "下一页" is the Chinese label of that link
    nextpagepatt = u"""<a href="(.*?)">下一页</a>"""
    renextpage = re.compile(nextpagepatt)
    pagelink = renextpage.search(result)
    if pagelink is not None:
        pagelink = renextpage.findall(result)
        # the non-greedy group may swallow earlier <a> tags,
        # so take what follows the last href=" inside the capture
        truelink = pagelink[0].rfind('href')
        backup(bloghost + pagelink[0][truelink + 6:])

# Print the contents of the blogurl map
def printbloglink():
    for item in blogurl:
        print item, "\t\t", blogurl[item]

# Extract the post body from a page's HTML
def getcontent(inforesult):
    contentpatt = """<div id="article_content" class="article_content"[\\s\\S]*?<div class="share_buttons\""""
    recontent = re.compile(contentpatt)
    result = recontent.search(inforesult)
    if result is not None:
        # the last 26 matched characters are CSDN's share-button div; drop them
        return ('<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /></head><body>'
                + result.group()[:-26] + '</body></html>')
    else:
        print "failed to extract the post body..."
        return None

# Worker: pop post links one by one, download each post and its images
blogurllock = threading.Lock()

def downloadblog():
    global savednum
    while True:
        # take one post link under the lock
        blogurllock.acquire()
        if len(blogurl) > 0:
            item = blogurl.popitem()
            url = bloghost + item[0]
            title = item[1]
            blognum = savednum
            savednum += 1
        else:
            blogurllock.release()
            return
        blogurllock.release()

        # decide the file name, renaming automatically on collision
        if not os.path.exists(bloguser):
            os.makedirs(bloguser)
        title = ReplaceBadCharOfFileName(title)
        filename = title
        file_no = 1
        while os.path.isfile(bloguser + '/' + filename + '.html'):
            filename = title + '(' + str(file_no) + ')'
            file_no += 1

        # fetch the post body HTML
        content = getcontent(getinfo(url))
        if content is None:
            traceback.print_exc()
            continue  # skip this post instead of crashing on None below

        # image links inside the post
        picturepatt = """<img src="(http:.*?)"[\\s\\S]*?alt"""
        repicture = re.compile(picturepatt)
        pictures = repicture.findall(content)

        # download the images into a folder named after the post
        for pictureurl in pictures:
            folder = bloguser + '/' + filename + '/'
            picturename = pictureurl[pictureurl.rfind('/') + 1:]
            if not os.path.exists(folder):
                os.makedirs(folder)
            try:
                path = os.path.join(os.getcwd(), bloguser, filename, picturename)
                urllib.urlretrieve(pictureurl, path)
            except:
                print 'failed to save image, skipping:', pictureurl
                traceback.print_exc()
                # failures are usually hotlink protection on reposted articles,
                # so once one image fails, give up on all images of this post
                break
            else:
                content = content.replace(pictureurl, filename + '/' + picturename, 1)

        bloghtml = open(bloguser + '/' + filename + '.html', 'wb')
        bloghtml.write(content.encode('utf8'))
        bloghtml.close()
        print "post", blognum, ":", title, "saved"

# Strip characters that are illegal in file names
def ReplaceBadCharOfFileName(filename):
    filename = filename.replace(" ", "")
    filename = filename.replace("\\", "")
    # the remaining filtering is folded into a single sub
    badpatt = """[\*\?;|<>&/:]"""
    rebad = re.compile(badpatt)
    return rebad.sub("", filename)

# Check that the account exists
def isvaliduser():
    try:
        getinfo(bloghost + '/' + bloguser)
    except:
        return False
    else:
        return True

# main
bloguser = raw_input('Enter your CSDN blog account (e.g. betabin in http://blog.csdn.net/betabin): ')
if not isvaliduser():
    print "invalid account"
else:
    starttime = datetime.datetime.now()
    backup(bloghost + "/" + bloguser)

    # multithreaded download
    threads = []
    for threadid in range(threadnum):
        downloadthread = threading.Thread(None, downloadblog)
        threads.append(downloadthread)
        downloadthread.start()
    for downloadthread in threads:
        downloadthread.join()

    endtime = datetime.datetime.now()
    print "elapsed:", endtime - starttime
    print "backup finished"