

#! encoding=utf-8


import urllib2
import re
import os
import sys
# from HTMLParser import HTMLParser
import html5lib
# from xml.etree.ElementTree import ElementTree
from urlparse import urlparse
import xml
import codecs
import traceback
import time

# class MyHTMLParser(HTMLParser):

#     def handle_starttag(self, tag, attrs):
#         # if tag.lower() == "img":
#             print "Encountered the beginning of a %s tag,attrs size %d" % (tag ,len(attrs))
#             for x in attrs:
#                 print "name %s,value %s" % (x[0],x[1])
#     def handle_endtag(self, tag):
#         print "Encountered the end of a %s tag" % tag

#     def handle_startendtag(self, tag, attrs):
#         print "Encountered the beginning of a %s tag,attrs size %d" % (tag ,len(attrs))
#         for x in attrs:
#             print "name %s,value %s" % (x[0],x[1])

# Number of attempts allowed per resource; the retry loops in this module
# initialise their counters from this value.
gTestTime = 5

# Fetch `url` and store it under `output`, mirroring the URL's host and path
# as the local directory layout.
#
# Returns the tuple (responseText, url, output); responseText stays None
# unless it is assigned inside the retry loop below.
#
# NOTE(review): this block is not valid Python as it stands.  Several
# statements appear to have been lost (the `try:` headers matching the two
# `except` clauses, the bodies of some `if` statements, and the right-hand
# side of `responseText =`).  The comments below describe only what is
# still visible; confirm against the original file before relying on them.
def DownloadFile(url,output):
    responseText = None
    dirssPath = None
        # NOTE(review): a `try:` presumably preceded this indented section.
        res = urlparse(url)
        # Rebuild the URL from scheme, host and path only, which drops any
        # query string or fragment.
        url = res.scheme+"://"+res.netloc+res.path
        path = res.path
        index = path.rfind('/')
        dirss = "/"
        if index != -1:
            # dirss: local directory (host + path up to the last "/").
            # dirssPath: full local file path (host + whole path).
            dirss =  output + "/" + res.netloc.encode("utf-8") + path[0:index].encode("utf-8")
            dirssPath = output + "/" + res.netloc.encode("utf-8") + path.encode("utf-8")
            dirss_ansi = dirss.decode('utf-8')
            # NOTE(review): the body of this `if` is missing; presumably it
            # created the directory -- confirm.
            if not os.path.exists(dirss_ansi):
        global gTestTime
        # Retry counter, initialised from the module-level attempt budget.
        count = gTestTime    
        while True:
            # NOTE(review): the body of this `if` is missing (presumably a
            # `break` once the attempts are exhausted -- confirm).
            if count < 0:
            count = count - 1
            header={"User-Agent": "Mozilla-Firefox5.0"}
            # NOTE(review): the statement guarded by this `if` and the
            # `try:` matching the `except` below are not visible.
            if not url.startswith("http://"):
                # print "url: %s:%d" % (url,count)
                request = urllib2.Request(url,None,header)
                response = urllib2.urlopen(request)
                dirssPath_ansi = dirssPath.decode("utf-8")
                # Only fetch content when the target file does not exist yet.
                if not os.path.exists(dirssPath_ansi):
                    resourceFile = open(dirssPath_ansi,"wb")
                    # NOTE(review): right-hand side missing (presumably the
                    # response body); no write/close of resourceFile is
                    # visible either.
                    responseText =
                    # For JavaScript files the scheme prefixes are removed
                    # from the text.
                    if url.endswith(".js"):
                        responseText = responseText.replace("http://","")
                        responseText = responseText.replace("https://","")
            except Exception,e:
                # Report the failure together with the remaining attempt count.
                print "DownloadFile: %s:%s:%d" % (e,url,count)
                # pass
                # exstr = traceback.format_exc()
                # print exstr

    # NOTE(review): this handler has no executable body in the visible text.
    except Exception,e:
            # exstr = traceback.format_exc()
            # print exstr
    return (responseText,url,output)

# Scan stylesheet text for `url(...)` references and resolve each one
# against the stylesheet's own URL with GetConcatUrl.
#
# `css` is indexed as a sequence: css[0] is used as the stylesheet text and
# css[1] as the base URL (presumably the tuple returned by DownloadFile --
# confirm against the caller).
#
# NOTE(review): this block is not valid Python as it stands.  The `try:`
# matching the `except` below and the body of the `if css[0] == None:`
# check are missing, and nothing visible consumes `newurl` (presumably it
# was passed on for download -- confirm).
def ReadCss(css):
    # print "ReadCss"
    # Captures the text between `url(` and `)`, allowing an optional
    # double quote on either side.
    mode = 'url\(\"?([^)]+)\"?\)'
    pattern = re.compile(mode)
        text = css[0]
        # NOTE(review): body missing (presumably an early return when there
        # is no stylesheet text -- confirm).
        if css[0] == None:
        strMatch = pattern.findall(text)
        size = len(strMatch)
        # print "size: ",size
        for i in range(0,size,1):
            one = strMatch[i]
            # Turn the reference into a URL relative to the stylesheet.
            newurl = GetConcatUrl(css[1],one)
    # NOTE(review): this handler has no executable body in the visible text.
    except Exception,e:
            # exstr = traceback.format_exc()
            # print exstr

# Fetch the HTML page at `url`, walk its <img>, <link> and <script>
# elements, and open `<output>/<GetHtmlName(url)>.htm` for writing.
#
# NOTE(review): this block is not valid Python as it stands.  Visible gaps:
# the right-hand side of `data =`, the `try:` matching the `except` in the
# image loop, the bodies of several `if`/`except` clauses, and whatever was
# written to `resourceFile`.  Several string literals also look blanked
# (`namespace = "{}"`, `endswith("")`, `replace("","")`).  The comments
# below describe only what is still visible.
def Download(url,output):
    # try:
    header={"User-Agent": "Mozilla-Firefox5.0"}
    # Prefix used in the element paths below.
    # NOTE(review): the braces are empty here; presumably they held a
    # namespace URI -- confirm.
    namespace = "{}"
    request = urllib2.Request(url,None,header)
    response = urllib2.urlopen(request)

    # NOTE(review): right-hand side missing (presumably the response body).
    data =
    document = html5lib.parse(data)
    imgElements = document.findall('.//{0}img'.format(namespace))
    # print "imgElements %d" % len(imgElements)
    for img in imgElements:
        src = img.attrib["src"]
        # print "src %s" % src
            # NOTE(review): a `try:` presumably preceded this section.
            res = urlparse(src)
            # Images not hosted on cnblogs are not downloaded.
            # NOTE(review): as written, endswith("") is always true, so this
            # branch never runs; the host suffix looks blanked.
            if not res.netloc.endswith(""):
                print "image not download: %s:%s" % (src,res.netloc)
        # NOTE(review): this handler has no body in the visible text.
        except Exception,e:

    linkElements = document.findall('.//{0}link'.format(namespace))
    # print "linkElements %d" % len(linkElements)
    for link in linkElements:
        href = link.attrib["href"]
        # print "href %s" % href
        # DownloadFile returns (responseText, url, output).
        text = DownloadFile(href,output)
        # NOTE(review): body missing; presumably stylesheets were handed to
        # ReadCss -- confirm.
        if link.attrib.has_key("rel") and link.attrib["rel"].lower() == "stylesheet":

    scriptElements = document.findall('.//{0}script'.format(namespace))
    # print "scriptElements %d" % len(scriptElements)
    for script in scriptElements:
        if script.attrib.has_key("src"):
            src = script.attrib["src"]
            # print "src %s" % src
            # NOTE(review): nothing visible uses `src` after this point.
    # htmlNameIndex and urlLen are not used by any visible statement.
    htmlNameIndex = url.rfind("/");
    urlLen = len(url)
    htmlName = GetHtmlName(url)
    # Target file for the page: <output>/<htmlName>.htm
    output = output.decode("utf-8") + "/"+htmlName+".htm"
    # Remove scheme prefixes from the page text.
    data = data.replace("http://","")
    data = data.replace("https://","")
    # NOTE(review): replacing "" with "" leaves the text unchanged; the
    # arguments look blanked.
    data = data.replace("","")

    # NOTE(review): no write or close of resourceFile is visible.
    resourceFile = open(output,"wb")

def GetConcatUrl(url,png):
    """Resolve a resource reference against the URL of the file it came from.

    The previous implementation counted ".." occurrences by hand, which
    only worked for references that start with "../".  A plain relative
    reference ("images/a.png") lost its "/" separator, and root-relative
    ("/images/a.png") or absolute ("http://host/a.png") references were
    glued onto the base path.  Standard URL resolution handles all of them.

    Args:
        url: URL of the referring document, e.g. the stylesheet's URL.
        png: Reference found in that document, e.g. "../images/f_icon.png".

    Returns:
        The absolute URL that `png` points to.
    """
    # Function-scope import: the file only imports the `urlparse` function
    # at module level.  The fallback covers Python 3, where urljoin lives
    # in urllib.parse.
    try:
        from urlparse import urljoin
    except ImportError:
        from urllib.parse import urljoin

    # A CSS url(...) value may be wrapped in quotes and padded with spaces;
    # neither is part of the reference itself.
    reference = png.strip().strip("\"'")
    return urljoin(url, reference)

# NOTE(review): the original text of this section had several definitions
# collapsed onto a few physical lines, with markup inside comments and
# string literals blanked.  The statements below are re-laid-out from that
# text.  The places where a missing expression had to be restored are
# marked individually; blanked string literals are kept as they appear.


def getAllListUrl(url):
    """Return the URLs of all article-list pages of a blog.

    The page at `url` must contain the pager element with id
    "homepage1_BottomPager"; the last link inside it is read to find the
    highest page number.

    Args:
        url: URL of one article-list page.

    Returns:
        A list of page URLs, built as "<prefix>=<n>" for n from 1 to the
        last page number, where <prefix> is the last pager link's href up
        to its final "=".

    Raises:
        IndexError: if the pager element is not found.
    """
    header = {"User-Agent": "Mozilla-Firefox5.0"}
    request = urllib2.Request(url, None, header)
    response = urllib2.urlopen(request)
    # NOTE(review): the right-hand side was missing in the original text;
    # restored as the response body, which is what html5lib.parse needs.
    data = response.read()
    # By default html5lib returns an xml.etree element tree.
    document = html5lib.parse(data)
    # NOTE(review): the braces are empty in the original text; presumably
    # they held the XHTML namespace URI that html5lib assigns to elements
    # by default -- confirm, otherwise the lookups below match nothing.
    namespace = "{}"

    # Pager container, then the children of its first child.
    pageList = document.findall('.//{0}div[@id=\'homepage1_BottomPager\']'.format(namespace))
    alinks = list(pageList[0])
    alinks1 = list(alinks[0])
    # The last link in the pager points at the last page.
    lastArticleHref = alinks1[-1].attrib["href"]
    lastPageIndex = lastArticleHref.rfind("=")
    lastPageNum = int(lastArticleHref[lastPageIndex+1:])
    urlInfo = lastArticleHref[0:lastPageIndex]
    return [urlInfo + "=" + str(x) for x in range(1, lastPageNum + 1)]


def getArticleList(url):
    """Collect the link and title of every article of a blog.

    Each list page returned by getAllListUrl is fetched with up to
    gTestTime attempts; pages that keep failing are skipped after the
    errors have been printed.

    Args:
        url: URL of one article-list page (passed on to getAllListUrl).

    Returns:
        A list of [href, title] pairs, where title is the UTF-8 encoded
        concatenation of the stripped text pieces inside the link.
    """
    urlList = getAllListUrl(url)
    print("文章页数(number of pages)  %d" % len(urlList))
    header = {"User-Agent": "Mozilla-Firefox5.0"}
    # NOTE(review): empty braces, see getAllListUrl -- confirm.
    namespace = "{}"
    allLists = []
    strPage = "分析 第 {0} 页 ".decode("utf-8").encode("utf-8")
    for pageNum, one in enumerate(urlList, 1):
        print(strPage.format(pageNum))
        # tryCount is the number of attempts left after the current one.
        for tryCount in range(gTestTime - 1, -1, -1):
            try:
                # Requesting too quickly makes the server stop responding.
                time.sleep(0.5)
                request = urllib2.Request(one, None, header)
                response = urllib2.urlopen(request)
                # NOTE(review): right-hand side restored, see getAllListUrl.
                data = response.read()
                document = html5lib.parse(data, encoding="utf-8")
                articleLists = document.findall('.//{0}div[@class=\'postTitle\']'.format(namespace))
                allLists = allLists + articleLists
                break
            except Exception as e:
                print("getArticleList %s:%s:%d" % (e, one, tryCount))

    artices = []
    for article in allLists:
        alink = article.find(".//{0}a".format(namespace))
        href = alink.attrib["href"]
        title = "".encode("utf-8")
        for x in alink.itertext():
            title = title + x.strip().encode("utf-8")
        artices.append([href, title])
    return artices


def GetUserName(url):
    """Return the path segment just before the last "/" in `url`.

    For ".../<user>/default.html?page=1" this is "<user>".
    """
    htmlNameIndex = url.rfind("/")
    htmlNameIndex1 = url.rfind("/", 0, htmlNameIndex)
    return url[htmlNameIndex1+1:htmlNameIndex]


def GetHtmlName(url):
    """Return the last path segment of `url`.

    When `url` ends with "/", the segment before that trailing slash is
    returned; otherwise everything after the last "/" is returned.
    """
    htmlNameIndex = url.rfind("/")
    urlLen = len(url)
    if htmlNameIndex + 1 == urlLen:
        # Trailing slash: step back to the previous "/" and drop the final
        # character.
        htmlNameIndex = url.rfind("/", 0, htmlNameIndex)
        return url[htmlNameIndex+1:urlLen-1]
    return url[htmlNameIndex+1:]


def Start(url,output):
    """Back up every article of the blog whose list page is `url`.

    Writes "<output>/<username>.htm" (index) and
    "<output>/<username>-navigation.htm" (navigation), then downloads each
    article into "<output>/<username>/" via Download.

    Args:
        url: URL of an article-list page; the page must include the link
            to the last list page.
        output: existing directory that receives the backup.
    """
    print("备份开始")
    lists = getArticleList(url)
    username = GetUserName(url)
    # str.replace returns a new string; the original discarded the result,
    # so the backslash normalisation never took effect.
    output_username = (output + "/" + username).replace("\\", "/")
    if not os.path.exists(output_username.decode("utf-8")):
        os.mkdir(output_username.decode("utf-8"))

    totalNum = len(lists)
    print("总文章数(number of articles): %d" % totalNum)

    # NOTE(review): the HTML markup inside the literals written to the two
    # files below is blank in the original text (presumably stripped);
    # restore it from the original file.
    doctype = '\n'
    charset = ''
    navigationHtmlName = username + '-navigation.htm'
    # Name of the first article page; guarded so that an empty article list
    # no longer raises IndexError.  Not used by the blanked literals below.
    firstHtmlName = GetHtmlName(lists[0][0]) if lists else ""

    # Index file.
    indexHtml = output_username + ".htm"
    f = open(indexHtml.decode("utf-8"), "w")
    try:
        for line in (doctype, '', '', charset, '', '', '', '', '', ''):
            f.write(line + "\n")
    finally:
        f.close()

    # Navigation file.
    navigationHtml = output + "/" + navigationHtmlName
    # NOTE(review): the call was truncated to `f ="utf-8"),"w","utf-8-sig")`
    # in the original text; restored as codecs.open on the decoded path.
    f = codecs.open(navigationHtml.decode("utf-8"), "w", "utf-8-sig")
    try:
        for line in (doctype, '', '', charset, '', '', ''):
            f.write(line + "\n")
        for count, x in enumerate(lists, 1):
            # Relative path of the article page; not used by the blanked
            # literal below.
            articleIdHtml = username + "/" + GetHtmlName(x[0]) + ".htm"
            f.write('' + str(count) + '.' + x[1].decode("utf-8") + '\n\n' + "\n")
        f.write('' + "\n")
        f.write('' + "\n")
    finally:
        f.close()

    print("开始下载文章")
    strPage = "{0}:{1}.".decode("utf-8").encode("utf-8")
    for currentNum, x in enumerate(lists, 1):
        # gTestTime + 1 attempts, as in the original counter loop; count is
        # the counter value after the decrement for the current attempt.
        for count in range(gTestTime - 1, -2, -1):
            try:
                # Requesting too quickly makes the server answer with 503.
                time.sleep(1)
                strPageTemp = strPage.format(totalNum, currentNum) + x[1]
                # Printing can itself fail when the console encoding is not
                # UTF-8, which is why it sits inside the try.
                print(strPageTemp)
                print(x[0])
                print("\n")
                Download(x[0], output_username)
                break
            except Exception as e:
                # Previously swallowed silently.  repr() keeps the message
                # ASCII so reporting cannot raise an encoding error.
                print("Start: %r:%r:%d" % (e, x[0], count))


# The URL must be an article-list page of the blog, and that page must
# include the link to the last list page (the example URL is blank in the
# original text).
if __name__=='__main__':
    # NOTE(review): the URL literal is empty in the original text; set it
    # before running.
    url = ""
    output = "f:/temp"
    Start(url,output)
