由于内涵段子被封了,换了一个网站www.neihan8.com,利用python爬取内涵吧段子标题和内容,使用python2实现
finditer 方法
finditer 方法的行为跟 findall 的行为类似,也是搜索整个字符串,获得所有匹配的结果。但它返回一个顺序访问每一个匹配结果(Match 对象)的迭代器。
#!/usr/bin/env python
# encoding:utf-8
import re
import requests
class NeihanSpider(object):
    """Scraper for jokes on www.neihan8.com (Python 2).

    Fetches listing pages, extracts (title, content) pairs with a regex,
    and appends them to a local text file.
    """

    def __init__(self):
        # Current listing page number; page 1 uses a different URL format.
        self.page = 1
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.5478.400 QQBrowser/10.1.1550.400 Name"
        }
        # NOTE(review): the original pattern was garbled when the article was
        # published (its HTML tags were stripped, leaving an unterminated
        # string). Reconstructed so group(1) is the title and group(2) the
        # content -- TODO: verify against the site's actual page markup.
        self.pattern = r'<h3>\s+(.*?)</h3>[\s\S]*?<div class="desc">\s+(.*?)</div>'

    def send_request(self, url, params=None):
        """GET `url` and return the raw response body.

        :param url: page URL to fetch
        :param params: optional dict of query parameters (default: none)
        :return: response body (bytes / Python 2 str)
        """
        # Avoid a mutable default argument; treat None as "no params".
        if params is None:
            params = {}
        print("[INFO]:正在请求<{}>".format(url))
        html = requests.get(url, params=params, headers=self.headers).content
        return html

    def save_file(self, match_iter):
        """Append every (title, content) match to ./段子.txt.

        :param match_iter: iterator of re.Match objects with two groups
        """
        # "ab" keeps appending across pages; in Python 2, str is bytes so
        # writing the matched text directly is fine.
        with open("./段子.txt", "ab") as f:
            # A plain for-loop replaces the manual next()/StopIteration dance.
            for ret in match_iter:
                title = ret.group(1)
                content = ret.group(2)
                # Strip spaces out of the content (harmless for Chinese text).
                content = re.sub(r' ', "", content)
                f.write(title + "\n")
                f.write(content + "\n\n")

    def parse_page(self, html):
        """Run the extraction regex over `html`.

        re.finditer() returns an iterator of Match objects, which lets us
        pull both the title and the content groups from each hit lazily.
        """
        result_iter = re.finditer(self.pattern, html)
        return result_iter

    def main(self):
        """Interactive loop: fetch, parse, and save one page per keypress."""
        while True:
            url = "https://www.neihan8.com/article/index_" + str(self.page) + ".html"
            if self.page == 1:
                # The first listing page has no numeric suffix.
                url = "https://www.neihan8.com/article/index.html"
            html = self.send_request(url)
            result_iter = self.parse_page(html)
            self.save_file(result_iter)
            print("[INFO]:第<{}>页处理完成".format(self.page))
            # Python 2's raw_input; any input other than "q" continues.
            if raw_input("回车键继续,按q退出...") == "q":
                break
            self.page += 1
# Script entry point: only run the scraper when executed directly.
if __name__ == '__main__':
    NeihanSpider().main()
运行结果如下(结果输出略):