Crawler exercise: scrape the jokes on a chosen page of Neihan Duanzi (with a switch controlling whether crawling continues)

import urllib.request
import re


# pattern = re.compile('<div class="f18 mb20">(.*?)</div>', re.S)  # matches the full joke body behind each title
#
# content_list = pattern.findall(html)
#
# http://www.neihan8.com/article/index_3.html
#
# User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36
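
The notes above record the listing-URL scheme and the browser User-Agent the script sends. Since neihan8.com has long been offline, here is a minimal, hedged sketch of the download step those notes feed into; the fetch helper, its timeout, and the error handling are illustrative additions rather than part of the original exercise:

import urllib.request
import urllib.error

def fetch(url, user_agent):
    # Send the User-Agent from the notes so the request looks like a browser visit.
    request = urllib.request.Request(url, headers={"User-Agent": user_agent})
    try:
        response = urllib.request.urlopen(request, timeout=10)
        return response.read().decode("utf-8")
    except urllib.error.URLError as err:
        # Covers DNS failures, timeouts, and HTTP error statuses alike.
        print("Download failed:", err)
        return ""

# Example call (hypothetical; the host no longer resolves):
# html = fetch("http://www.neihan8.com/article/index_3.html", "Mozilla/5.0 ...")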


class Spider:
    def __init__(self, page):
        self.page = page
        self.switch = True  # crawl switch: decides whether the user keeps scraping pages


    def loadPage(self):
        '''
            Download one listing page and extract the jokes on it.
        '''
        print("Downloading page......")
        if self.page == "1":
            url = "http://www.neihan8.com/article/index.html"
        else:
            url = "http://www.neihan8.com/article/index_" + self.page + ".html"
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"}
        request = urllib.request.Request(url, headers=headers)
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")
        # print(html)
        # Selector restored from the usual version of this exercise; each joke
        # body on the listing page sits in a <div class="f18 mb20"> block.
        pattern = re.compile('<div class="f18 mb20">(.*?)</div>', re.S)
        content_list = pattern.findall(html)
        self.dealPage(content_list)


    def dealPage(self, content_list):
        '''
            Clean up every joke scraped from the page.
        '''
        for content in content_list:
            # print(content)
            # print("-" * 30)
            # Strip the leftover paragraph tags from the page markup.
            content = content.replace('<p>', '').replace('</p>', '')
            # print(content)
            # print("-" * 30)
            self.writePage(content)

    def writePage(self, content):
        '''
            Append one joke to the output file.
        '''
        print("Writing to file......")
        with open("neihan_jokes_page_" + self.page + ".txt", "a") as f:
            f.write(content)
            f.write("\n" + ("-" * 50))

    def work(self):
        '''
            Drive the crawler: ask the user whether to keep fetching pages.
        '''
        print("File writing finished! Thanks for using!")
        while self.switch:
            command = input("Press Enter to keep crawling (q to quit): ")
            if command == "q":
                self.switch = False
            else:
                page_num = input("Enter the page number to crawl next: ")
                self.page = page_num
                self.loadPage()


if __name__ == '__main__':
    page_num = input("Enter the page number to crawl: ")
    spider = Spider(page_num)  # lowercase instance name so the class isn't shadowed
    spider.loadPage()
    spider.work()
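
Because the site is gone, the scraper above can no longer run end to end. The snippet below demonstrates the same re.S extract-and-clean technique against an inline HTML sample; the sample_html markup is invented for illustration and only imitates the old listing-page structure:

import re

# A made-up fragment imitating the old listing-page markup.
sample_html = '''
<div class="f18 mb20"><p>First joke text.</p></div>
<div class="f18 mb20"><p>Second joke,
spanning two lines.</p></div>
'''

# re.S lets '.' match newlines, so jokes spanning several lines are still captured.
pattern = re.compile('<div class="f18 mb20">(.*?)</div>', re.S)

for raw in pattern.findall(sample_html):
    joke = raw.replace('<p>', '').replace('</p>', '').strip()
    print(joke)
    print("-" * 30)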
