Crawling specified content from a page
Coding workflow:
specify the URL -> send the request -> get the response data -> parse the data -> persist the results
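A minimal sketch of that workflow, assuming a placeholder URL (example.com) and persisting only the page title to a file:

import requests
from bs4 import BeautifulSoup

url = 'https://example.com'                  # specify the URL (placeholder)
response = requests.get(url)                 # send the request, get the response data
soup = BeautifulSoup(response.text, 'lxml')  # parse the data
with open('result.txt', 'w', encoding='utf-8') as f:
    f.write(soup.title.text)                 # persist the result (here: just the title)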
Data parsing approaches (this example uses bs4)
First, how to instantiate a BeautifulSoup object:
from bs4 import BeautifulSoup
Object instantiation, i.e. load the page source to be parsed into the object:
soup = BeautifulSoup(file, 'lxml')
The object then provides the properties and methods used for parsing.
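As a hedged sketch, the two usual ways to build the object are loading a local HTML document or loading page source fetched over the network (the file name ./test.html and the URL are placeholders for illustration):

from bs4 import BeautifulSoup
import requests

# 1. load the data of a local HTML document into the object
with open('./test.html', 'r', encoding='utf-8') as fp:
    soup = BeautifulSoup(fp, 'lxml')

# 2. load page source fetched from the internet into the object
page_text = requests.get('https://example.com').text
soup = BeautifulSoup(page_text, 'lxml')

# the object then exposes parsing attributes and methods, for example:
print(soup.title)            # first <title> tag
print(soup.find('a'))        # first <a> tag
print(soup.select('div p'))  # all <p> tags inside a <div>, via CSS selector

The example below applies this to a real table-of-contents page.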
from bs4 import BeautifulSoup
import requests
import re
import os
# Crawl every chapter title and chapter body of Romance of the Three Kingdoms
if __name__ == "__main__":
    if not os.path.exists('三国演义'):
        os.mkdir('三国演义')
    ua = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3775.400 QQBrowser/10.6.4209.400"
    url = 'https://www.shicimingju.com/book/sanguoyanyi.html'
    headers = {
        "User-Agent": ua,
    }
    response = requests.get(url, headers=headers)
    response.encoding = response.apparent_encoding  # defensive: make sure the Chinese text decodes correctly
    soup = BeautifulSoup(response.text, 'lxml')
    # every <a> tag in the table of contents carries a chapter title and the link to its page
    mulu = soup.select('.book-mulu > ul > li > a')
    ex = 'href="(.*?)"'  # regex that extracts the href value from the <a> tag
    for i in range(len(mulu)):
        file_name = './三国演义/' + mulu[i].text + ".txt"
        f = open(file_name, 'w', encoding='utf-8')
        a = re.findall(ex, str(mulu[i]))
        content_url = 'https://www.shicimingju.com' + a[0]  # link to this chapter's body
        content_all = requests.get(url=content_url, headers=headers)
        content_all.encoding = content_all.apparent_encoding
        content_soup = BeautifulSoup(content_all.text, 'lxml')
        for content_p in content_soup.select('.chapter_content p'):
            f.write("\n ")
            f.write(content_p.text)
        f.close()
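A side note on the link extraction: the regex step can be skipped entirely, because each entry in mulu is already a bs4 Tag and its href attribute can be read directly, for example:

content_url = 'https://www.shicimingju.com' + mulu[i]['href']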