Backing Up a CSDN Blog with Python

A Python script that backs up a CSDN blog:

Function: back up all posts of the entered user into a folder named after the user in the current directory (HTML format).

Shortcomings: no multithreading yet, and things like hotlink protection are not handled (reposted articles usually carry images without hotlink protection, but it is still a gap). The code could also be leaner; the filename fix-up, for instance, could probably be done with a loop of pattern matches...

Thanks to 菜鱼, xiao, and the other veterans in the Python discussion group for their guidance; the encoding issues had me stuck for a long time. Feedback and criticism are welcome.

————————

Update: added multithreading; I know about join now, heh.
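
For reference, the start/join pattern the script now uses looks roughly like this (a minimal sketch; worker stands in for the real download function):

import threading

def worker():
    pass    #the real script pops post links from a shared dict here

threads = []
for i in range(5):
    t = threading.Thread(None, worker)
    threads.append(t)
    t.start()
for t in threads:
    t.join()    #block until every worker has finished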

A compiled .exe version is available on the free (0-point) download page: http://download.csdn.net/detail/betabin/4377512

————————

Update: the .exe resource above was deleted by CSDN. That I can accept, but not even a notification afterwards is a bit...

And now it seems to be back... well...

————————

Update: replaced the filename filtering with re.sub, trimming redundant code.
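
In other words, a chain of per-character str.replace calls collapses into a single character-class substitution; a minimal sketch of the idea:

import re

title = 'bad*file?name|demo'
#before: one replace per forbidden character
clean = title.replace('*', '').replace('?', '').replace('|', '')
#after: one regex substitution covers the whole set
clean = re.sub('[\*\?;|<>&/:]', '', title)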

If you know Python, just run the script yourself.
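
For example, assuming you save it as csdn_backup.py (the filename is up to you):

python csdn_backup.py

It then prompts for the account name and writes the backup folder into the current directory.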

# -*- coding: cp936 -*-
'''
Author: BetaBin
Date: 2012/06/16
Function: Back up a CSDN blog.
'''
import urllib
import urllib2
import os
import re
import traceback
import threading
import datetime

#Global data
#Post links: relative URL -> post title
blogurl = {}
savednum = 1    #running count of saved posts
threadnum = 5   #number of download threads
bloghost = 'http://blog.csdn.net'
bloguser = 'BetaBin'

#Fetch the HTML source of infourl, decoded as UTF-8
def getinfo(infourl, hostsite = ''):
    postdata = urllib.urlencode({})
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
        }
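    #If a referring page is given, send it as the Referer header;
    #some image hosts reject requests that lack one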
    if hostsite != '':
        print "hostsite: ", hostsite
        headers['Referer'] = hostsite
        
    req = urllib2.Request(
        url = infourl,
        data = postdata,
        headers = headers
        )
    urlfile = urllib2.urlopen(req)
    result = urlfile.read().decode('utf8')
    urlfile.close()
    return result

#Back up every post linked from the page at url, then recurse into later pages
def backup(url):
    #Fetch the page content
    result = getinfo(url)
    #Find the post links on this page
    titlepatt = """class="link_title"><a href="(.*?)">[\\s]+(.*?)[\\s]+</a>"""
    retitle = re.compile(titlepatt)
    blogs = retitle.findall(result)
    #print "Got: ", blogs
    for blog in blogs:
        blogurl[blog[0]] = blog[1]
        #print blog[0] + '\t\t' + blog[1]
    #Find the next-page link ("下一页" is CSDN's "next page" link text)
    nextpagepatt = u"""<a href="(.*?)">下一页</a>"""
    renextpage = re.compile(nextpagepatt)
    pagelink = renextpage.search(result)
    if pagelink is not None:
        #The non-greedy capture can span earlier <a> tags, so the group may
        #contain several href attributes; the last one belongs to the
        #next-page anchor itself (rfind + 6 skips past 'href="')
        captured = pagelink.group(1)
        truelink = captured.rfind('href')
        if truelink != -1:
            captured = captured[truelink + 6:]
        backup(bloghost + captured)

#Print the contents of the blogurl map
def printbloglink():
    for item in blogurl:
        print item, "\t\t", blogurl[item]

#Extract the post body from the page HTML
def getcontent(inforesult):
    contentpatt = """<div id="article_content" class="article_content"[\\s\\S]*?<div class="share_buttons\""""
    recontent = re.compile(contentpatt)
    result = recontent.search(inforesult)
    if result is not None:
        #Per the regex above, the match ends with the opening tag of CSDN's
        #share-button div (26 characters), which is dropped here
        return '<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /></head><body>'+result.group()[:-26]+'</body></html>'
    else:
        print "正文提取出错……"
        return None

#Worker thread: pop relative post links from blogurl and download the posts
blogurllock = threading.Lock()
def downloadblog():
    while True:
        #Take one post link, guarded by the lock
        blogurllock.acquire()
        if (len(blogurl) > 0):
            item = blogurl.popitem()
            url = bloghost + item[0]
            title = item[1]
            global savednum
            blognum = savednum
            savednum += 1
        else:
            blogurllock.release()
            return
        blogurllock.release()
        #Decide the output filename; append a counter on collisions
        if not os.path.exists(bloguser):
            os.makedirs(bloguser)
        title = ReplaceBadCharOfFileName(title)
        filename = title
        file_no = 1
        while os.path.isfile(bloguser + '/' + filename + '.html'):
            filename = title + '(' + str(file_no) + ')'
            file_no += 1
        #Fetch the post body HTML
        content = getcontent(getinfo(url))
        if content is None:
            #Extraction failed; skip this post instead of crashing below
            continue
        #Find the image links in the body
        picturepatt = """<img src="(http:.*?)"[\\s\\S]*?alt"""
        repicture = re.compile(picturepatt)
        pictures = repicture.findall(content)
        #Download the images into a folder named after the post
        for pictureurl in pictures:
            #Create the image folder if needed
            folder = bloguser + '/' + filename + '/'
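            #Use everything after the last '/' in the URL as the local image name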
            picturename = pictureurl[pictureurl.rfind('/') + 1:]            
            if not os.path.exists(folder):
                os.makedirs(folder)          
            try:                
                path = os.path.join(os.getcwd(), bloguser, filename, picturename)
                urllib.urlretrieve(pictureurl, path)
            except:
                print 'Failed to save an image, skipping it:', pictureurl
                traceback.print_exc()
                #Failures are usually hotlink protection on reposted images,
                #so after the first failure skip the rest of this post's images
                break
            else:
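                #Point the <img> src at the locally saved copy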
                content = content.replace(pictureurl, filename + '/' + picturename, 1)
        bloghtml = open(bloguser + '/' + filename + '.html', 'wb')
        bloghtml.write(content.encode('utf8'))
        bloghtml.close()
        print "第", blognum, "篇博文", title, "保存完毕"

#Strip characters that are illegal in filenames
def ReplaceBadCharOfFileName(filename):
    filename = filename.replace(" ", "")
    filename = filename.replace("\\", "")
    #The other per-character replaces were collapsed into the sub below;
    #the two above could arguably be folded into the character class too
    badpatt = """[\*\?;|<>&/:]"""
    rebad = re.compile(badpatt)
    return rebad.sub("", filename)
    
#Check whether the account exists
def isvaliduser():
    #forbiddenpatt = """<head><title>403 Forbidden</title></head>
    #reforbidden = re.compile(forbiddenpatt)
    try:
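        #A 403/404 response makes urlopen raise an HTTPError here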
        getinfo(bloghost + '/' + bloguser)
    except:
        return False
    else:
        return True

#main function
bloguser = raw_input('Enter your CSDN blog account (e.g. the "betabin" in http://blog.csdn.net/betabin): ')

if not isvaliduser():
    print "该账号无效"
else:
    starttime = datetime.datetime.now()
    backup(bloghost + "/" + bloguser)
    #downloadblog()    #single-threaded alternative
    #Spread the downloads over several threads
    threads = []
    for threadid in range(threadnum):
        downloadthread = threading.Thread(None, downloadblog)
        threads.append(downloadthread)
        downloadthread.start()
    for downloadthread in threads:
        downloadthread.join()
    endtime = datetime.datetime.now()
    print "共用时:", endtime - starttime
    print "备份完毕"

