A small Python crawler: scraping articles (Zhihu column) snippets

Scraping a Zhihu column

# Scrape a Zhihu column

import json
import re

import scrapy
from lxml import etree


# Version 1: pull title and body out of the js-initialData JSON embedded in each article page
class XSSpider(scrapy.Spider):
    name = 'xiaoshuo'
    allowed_domains = ['zhihu.com']
    start_urls = ['https://www.zhihu.com/api/v4/columns/c_1059416559054893056/items?limit=10&offset=0']

    def parse(self, response):
        obj = json.loads(response.text)
        icount = 0
        for p_item in obj['data']:
            icount = icount + 1
            p_content = get_html_of_response(p_item['url'])  # fetch the article page (helper, see below)
            p_selector = etree.HTML(p_content.text)
            sid = re.sub(".+/p/", "", p_item['url'])  # article id is the last path segment
            content_json = json.loads(p_selector.xpath("//script[@id='js-initialData']/text()")[0])
            txt = ""
            i_title = content_json['initialState']['entities']['articles'][sid]['title']
            txt = txt + i_title.strip() + "\r\n"
            i_p = content_json['initialState']['entities']['articles'][sid]['content']
            # the content field is HTML; turn <p>...</p> tags into line breaks
            txt = txt + i_p.replace("<p>", "\r\n").replace("</p>", "\r\n")
            fo = open('G:/learn/3.txt', "ab+")  # open the output file in binary append mode
            fo.write(txt.encode('UTF-8'))
            fo.close()
        # auto-pagination: follow the API's next page
        pre_page_item = obj['paging']['next']
        if icount > 0:
            yield scrapy.Request(pre_page_item, callback=self.parse)


# Version 2: parse the rendered article HTML directly and walk the column backwards
class XSSpider(scrapy.Spider):
    name = 'xiaoshuo'
    allowed_domains = ['zhihu.com']
    start_urls = ['https://www.zhihu.com/api/v4/columns/c_1059416559054893056/items?limit=50&offset=490']

    def parse(self, response):
        obj = json.loads(response.text)
        icount = 0
        for i in range(0, len(obj['data'])):
            icount = icount + 1
            # iterate the page's items in reverse so chapters come out in reading order
            p_content = get_html_of_response(obj['data'][len(obj['data']) - 1 - i]['url'])
            p_selector = etree.HTML(p_content.text)
            txt = ""
            i_title = p_selector.xpath("//h1[@class='Post-Title']/text()")
            if len(i_title) > 0:
                txt = txt + i_title[0].strip() + "\r\n"
            i_p = p_selector.xpath("//div[@class='RichText ztext Post-RichText']//p//text()")
            for p in i_p:
                txt = txt + p.strip() + "\r\n"
            fo = open('G:/learn/7.txt', "ab+")  # open the output file in binary append mode
            fo.write(txt.encode('UTF-8'))
            fo.close()
        # auto-pagination: follow the API's previous page until it points back at itself
        pre_page_item = obj['paging']['previous']
        if pre_page_item != response.url:
            yield scrapy.Request(pre_page_item, callback=self.parse)

Scraping an ordinary novel site

import scrapy


class XSSpider(scrapy.Spider):
    name = 'xiaoshuo'
    allowed_domains = ['2mcn.com']
    start_urls = ['https://www.2mcn.com/html/book/73323/73323986/49627483.html']

    def parse(self, response):
        txt = ""
        i_title = response.xpath("//h1/text()").extract()[0]
        txt = txt + i_title.strip() + "\r\n"
        i_p = response.xpath("//div[@id='content']//text()").extract()
        for p in i_p:
            txt = txt + p.strip() + "\r\n"
        fo = open('3.txt', "ab+")  # open the novel output file in binary append mode
        fo.write(txt.encode('UTF-8'))
        fo.close()

        # auto-pagination: follow the "next chapter" (下一章) link
        next_page_item = response.xpath("//a[contains(text(),'下一章')]/@href").extract()
        if len(next_page_item) > 0:
            yield scrapy.Request(response.urljoin(next_page_item[0]), callback=self.parse)
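Any of these spiders runs like a normal Scrapy spider: from the project directory, invoke it by its name attribute (this assumes the class lives in a regular Scrapy project's spiders module):

scrapy crawl xiaoshuo

Because the output file is opened in append mode, rerunning the spider keeps adding chapters to the same text file rather than overwriting it.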
