爬虫第一课

学习爬虫的第一天,课后作业。需求:用户输入要爬取的百度贴吧主题,以及起始页和结束页,然后程序把这几页保存为本地的 HTML 文件。
我先用简单的面向过程方式写了一遍,后来改用面向对象的方式来组织。虽然也能实现目的,但代码总觉得有些别扭,
等熟练一点再来改动。

import urllib.request
import urllib.parse
import os

class Spider(object):
    """Download a range of listing pages from one Baidu Tieba forum.

    Every fetched page is written to the current working directory as
    ``<name><page>.html``.
    """

    def __init__(self, name, num1, num2):
        """Remember the forum name and the inclusive page range.

        Args:
            name: Forum keyword; used in the query string and as the prefix
                of every output file name.
            num1: First page number to fetch.
            num2: Last page number to fetch (inclusive).
        """
        self.name = name
        self.num_start = num1
        self.num_end = num2
        # Scratch state shared between the created_* steps.
        self.url = ''   # set by created_url()
        self.html = ''  # set by created_req_res()

    def created_url(self, i):
        """Build the URL for page ``i`` and store it in ``self.url``.

        Args:
            i: Page number; page 1 maps to ``pn=0`` and each further page
                advances ``pn`` by 50.
        """
        base = "https://tieba.baidu.com/f?kw="
        # Percent-encode the keyword so non-ASCII forum names are URL-safe.
        keyword = urllib.parse.quote(self.name)
        paging = '&ie=utf-8&pn=' + str((i - 1) * 50)
        self.url = base + keyword + paging

    def created_req_res(self):
        """Fetch ``self.url`` and store the UTF-8 decoded body in ``self.html``.

        Raises:
            urllib.error.URLError: If the request fails.
            UnicodeDecodeError: If the response body is not valid UTF-8.
        """
        # Send a browser-like User-Agent instead of urllib's default one.
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64;x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36'}
        req = urllib.request.Request(self.url, headers=headers)
        # The context manager closes the connection even if read/decode fails.
        with urllib.request.urlopen(req) as res:
            self.html = res.read().decode("utf-8")

    def created_html_file(self, i):
        """Fetch the current ``self.url`` and save it as ``<name><i>.html``.

        Args:
            i: Page number, used only to build the output file name.
        """
        self.created_req_res()
        # ``with`` guarantees the file handle is released if write() raises.
        with open(self.name + str(i) + ".html", "w", encoding="utf-8") as file:
            file.write(self.html)

    def start(self):
        """Fetch and save every page from ``num_start`` to ``num_end`` inclusive.

        An empty range (``num_end < num_start``) downloads nothing and only
        prints the completion message.
        """
        for i in range(self.num_start, self.num_end + 1):
            self.created_url(i)
            self.created_html_file(i)
        print(self.name + "爬虫运行完毕!")


if __name__ == '__main__':
    # Interactive driver: keep asking for a forum and page range, running one
    # Spider per round, until the user answers anything other than Y/y.
    while True:
        # Named ``answer`` rather than ``next`` so the builtin next() is not
        # shadowed at module scope.
        answer = input("是继续/退出(Y/N):")
        if answer not in ('Y', 'y'):
            break
        theme = input("请输入你要爬的贴吧主题:")
        # int() raises ValueError if the page number is not numeric.
        num_start = int(input("请输入起始页:"))
        num_end = int(input("请输入结束页:"))
        sp1 = Spider(theme, num_start, num_end)
        sp1.start()

你可能感兴趣的:(爬虫第一课)