python爬虫系列3-百度贴吧页面爬取

任务需求:

  • 网站地址:http://tieba.baidu.com/
  • 使用的库:urllib、ssl
# -*- coding: utf-8 -*-
# @Time    : 2020/7/29 6:05 下午
# @Author  : livein80
# @Email   : [email protected]
# @File    : ssyer.py
# @Software : PyCharm
import urllib.request
import urllib.parse
import ssl

class BaiduTieba():
    """Minimal Baidu Tieba scraper.

    Prompts the user for a search keyword and a page range, downloads each
    result page over HTTP(S), and saves every page as a local HTML file.
    """

    def __init__(self):
        # Base search endpoint; the query string (kw, pn) is appended per request.
        self.base_url = 'http://tieba.baidu.com/f?'
        # Disable certificate verification so HTTPS fetches never fail on cert errors.
        self.context = ssl._create_unverified_context()
        # Present ourselves as a desktop Chrome browser to avoid bot blocking.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
        }

    def read_html(self, url):
        """Fetch *url* and return the response body decoded as UTF-8 text."""
        request = urllib.request.Request(url, headers=self.headers)
        response = urllib.request.urlopen(request, context=self.context)
        return response.read().decode('utf-8')

    def write_html(self, filename, html):
        """Write the *html* text to *filename*, encoded as UTF-8."""
        with open(filename, 'w', encoding='utf-8') as out:
            out.write(html)

    def main(self):
        """Interactive entry point: ask for keyword + page range, save each page."""
        key = input('请输入关键字 : ')
        start_page = int(input('开始页'))
        end_page = int(input('结束页'))
        # URL-encode the keyword once; it is reused for every page request.
        query = urllib.parse.urlencode({'kw': key})
        for page in range(start_page, end_page + 1):
            offset = (page - 1) * 50  # Tieba paginates with 50 entries per page.
            page_url = self.base_url + query + '&pn=' + str(offset)
            self.write_html('第{}页.html'.format(page), self.read_html(page_url))

if __name__ == '__main__':
    # Run the interactive scraper only when executed as a script.
    BaiduTieba().main()


你可能感兴趣的:(python爬虫系列3-百度贴吧页面爬取)