python爬取博客标题和访问量

<span style="font-size:24px;">#!/usr/bin/python
#coding:utf-8
#爬取CSDN博客
#网络爬虫技术


import requests
import sys,re
'''
reload(sys)
sys.setdefaultencoding('utf-8')
'''

#拼接目标博客地址,请求并获取响应内容
def blog_acount():
        """Prompt for a CSDN account and print every article title with its view count.

        Builds ``http://blog.csdn.net/<account>/article/list/<page>`` URLs
        starting at page 1, fetches each page with a browser-like User-Agent,
        extracts titles and view counts with ``web_crawl_tile`` and
        ``web_crawl_pagev`` and prints one line per article.  Paging continues
        while the fetched page still contains the text u'尾页' (the "last page"
        link) and stops after the first page that does not.

        Returns:
            None.  Results are written to standard output.

        Raises:
            requests.exceptions.RequestException: if a page cannot be fetched,
                including when the server does not answer within the timeout.
        """
        # ``raw_input`` only exists on Python 2; fall back to ``input`` elsewhere.
        try:
                read_line = raw_input
        except NameError:
                read_line = input

        # CSDN account name entered by the user.
        acount = read_line('输入csdn的登录账号:')
        base_url = 'http://blog.csdn.net/' + acount

        # CSDN rejects requests without a browser-like User-Agent, so pose as
        # a browser.  The headers never change, so build them once.
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headers = {'User-Agent': user_agent}

        page_num = 1
        while True:
                # Address of the current article-list page.
                des_url = base_url + '/article/list/' + str(page_num)

                # A timeout keeps the crawler from hanging on a stalled server.
                r = requests.get(des_url, headers=headers, timeout=10)
                string = r.text

                tlist = web_crawl_tile(string)
                rlist = web_crawl_pagev(string)
                # zip() pairs titles with counts and stops at the shorter list,
                # so a page with a missing counter no longer raises IndexError.
                for title, views in zip(tlist, rlist):
                        print('标题为:%s,的文章访问量为:%s' % (title, views))

                # The "last page" link is only present while more pages follow.
                if string.find(u'尾页') == -1:
                        break
                page_num += 1
                

#爬取博客标题
def web_crawl_tile(string):
        """Extract the article titles from a CSDN article-list page.

        A title is the text inside
        ``<span class="link_title"><a href="...">...</a></span>``; the pattern
        requires at least one whitespace character right after the opening
        ``<a>`` tag, so anchors whose text starts immediately are not matched.

        Args:
            string: page HTML as text (``unicode`` on Python 2).

        Returns:
            A list of titles with surrounding whitespace removed, in page
            order.  Items are native ``str`` objects: UTF-8 encoded byte
            strings on Python 2 (as before), text on Python 3.
        """
        stitle = r'<span class="link_title"><a href=".*?">\s(.*?)\s*?</a></span>'
        found = re.findall(stitle, string, re.S)

        if str is bytes:
                # Python 2: encode first, then strip, so callers can mix the
                # titles with byte-string format strings exactly as before.
                return [raw.encode('utf-8').strip() for raw in found]
        return [raw.strip() for raw in found]

#爬取文章的访问量
def web_crawl_pagev(string):
        """Extract the per-article view counts from a CSDN article-list page.

        A view count is the run of digits inside the parentheses of
        ``<span class="link_view" title="阅读次数"><a ...>阅读</a>(N)</span>``.

        Args:
            string: page HTML, either text or UTF-8 encoded bytes.

        Returns:
            A list of digit strings in page order (an empty string where the
            parentheses are empty).  Items are native ``str`` objects: byte
            strings on Python 2 (as before), text on Python 3.
        """
        # Match in text space: a text pattern cannot be applied to bytes on
        # Python 3, and the markup contains non-ASCII characters.
        if isinstance(string, bytes):
                string = string.decode('utf-8')

        srnum = (u'<span class="link_view" title="阅读次数">'
                 u'<a href=".*?" title="阅读次数">阅读</a>\\((\\d*)\\)</span>')
        rnum = re.findall(srnum, string, re.S)

        if str is bytes:
                # Python 2: hand back byte strings so callers can keep using
                # byte-string format strings with the results.
                rnum = [num.encode('utf-8') for num in rnum]
        return rnum


# Script entry point: start the crawl as soon as this module is executed.
# There is no __main__ guard, so importing the module also runs it.
blog_acount()

</span>

你可能感兴趣的:(python,爬虫)