# Simple Tieba (Baidu forum) spider — fetch and save HTML pages

# -*- coding: utf-8 -*-
# NOTE: a coding declaration only takes effect on line 1 or 2 of the file;
# Python 3 defaults to UTF-8 anyway, so this is informational.
import requests


class tiebaSpider:
	"""Simple Baidu Tieba crawler: downloads the first 10 pages of a forum
	and saves each one as a local HTML file.

	NOTE(review): class name kept lowercase for backward compatibility with
	existing callers; PEP 8 would prefer ``TiebaSpider``.
	"""

	def __init__(self, tieba_name):
		"""Store the forum name and build the page-URL template.

		:param tieba_name: name of the tieba (forum) to crawl, e.g. "lol".
		"""
		self.name = tieba_name
		# Desktop Chrome User-Agent so the server returns the desktop page.
		self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
		                              "Chrome/80.0.3987.132 Safari/537.36"}
		# The {} placeholder receives the post offset (pn=0, 50, 100, ...).
		self.url = "https://tieba.baidu.com/f?kw="+tieba_name+"&ie=utf-8&pn={}"

	def get_url_list(self):
		"""Return the URLs of the first 10 pages (Tieba paginates by 50 posts)."""
		return [self.url.format(i * 50) for i in range(10)]

	def get_response(self, url):
		"""GET *url* with the crawler headers and return the body as text.

		Uses ``content.decode()`` (default UTF-8) rather than ``response.text``
		to avoid requests' charset guessing.
		"""
		print(url)
		response = requests.get(url, headers=self.headers)
		return response.content.decode()

	def save_html(self, page_num, html):
		"""Write *html* to '<forum>-第<page_num>页.html', UTF-8 encoded."""
		file_path = "{}-第{}页.html".format(self.name, page_num)
		with open(file_path, "w", encoding="utf-8") as f:
			f.write(html)

	def run(self):
		"""Crawl pages 1-10 of the forum and save each page to disk."""
		# enumerate() replaces the original url_list.index(url) lookup,
		# which was O(n) per iteration and would misnumber duplicate URLs.
		for page_num, url in enumerate(self.get_url_list(), start=1):
			html = self.get_response(url)
			self.save_html(page_num, html)


if __name__ == "__main__":
	# Crawl the "lol" forum and save its pages locally.
	spider = tiebaSpider("lol")
	spider.run()

# (blog footer / tag list from the original post: 爬虫, python, python爬虫)