使用calibre获取静觅爬虫学习系列教程

页面的处理好像做得不是很好

#coding='gbk'
from calibre.web.feeds.recipes import BasicNewsRecipe
import re
class lhxy(BasicNewsRecipe):
    """Calibre recipe that collects a crawler tutorial series into one book.

    The article list is scraped from a single table-of-contents post; each
    post linked from that page becomes one article of the generated book.
    """

    title = u"静觅爬虫学习系列教程"
    description = u"..."
    language = 'zh'
    max_articles_per_feed = 1000
    oldest_article = 500
    remove_javascript = True
    cover_url = 'https://xxx.jpg'  # placeholder: replace with your own cover image URL
    no_stylesheets = True
    # Of every downloaded page, keep only the post heading and the post body.
    keep_only_tags = [
        dict(name='h1', attrs={'class': 'article-title'}),
        dict(name='article', attrs={'class': 'article-content'}),
    ]

    def get_title(self, link):
        """Return the text of an index link, used as the article title.

        Args:
            link: The ``<a>`` tag taken from the index page.

        Returns:
            The link text with surrounding whitespace removed (possibly an
            empty string).
        """
        # tag_to_string flattens nested markup, so an anchor that is empty or
        # whose first child is another tag no longer raises as the former
        # ``link.contents[0].strip()`` did.
        return self.tag_to_string(link).strip()

    def parse_index(self):
        """Build the article list from the table-of-contents post.

        Returns:
            A one-element list ``[(feed_title, articles)]`` where ``articles``
            is a list of ``{'title': ..., 'url': ...}`` dicts in page order.

        Raises:
            ValueError: If the index page contains no
                ``<article class="article-content">`` element.
        """
        index_soup = self.index_to_soup('http://cuiqingcai.com/1052.html')
        body = index_soup.find('article', attrs={'class': "article-content"})
        if body is None:
            # Fail with an explicit message instead of an AttributeError on None.
            raise ValueError(
                'index page has no <article class="article-content"> element'
            )

        articles = []
        for paragraph in body.findAll('p'):
            anchor = paragraph.a  # first link inside the paragraph, if any
            if anchor is None:
                continue
            url = anchor.get('href')
            if not url:
                # An anchor without a target cannot be downloaded.
                continue
            # Titles are kept as text: encoding them to UTF-8 bytes yields a
            # bytes object on Python 3. Fall back to the URL when the link
            # has no text at all.
            articles.append({'title': self.get_title(anchor) or url, 'url': url})

        return [(u'静觅爬虫学习系列教程', articles)]

你可能感兴趣的:(使用calibre获取静觅爬虫学习系列教程)