response = requests.get(href_list[page])
req_parser = BeautifulSoup(response.content.decode('utf-8'), features="html.parser")
div = req_parser.find_all('div', class_="content-body")
# div = req_parser.find_all('div', {"class": "content-body"})  # equivalent to the previous line
From this div we then look for the p tags, which works exactly the same way as before, so I won't repeat the explanation; a minimal sketch follows.
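As a sketch of that step (assuming div is the ResultSet returned by the find_all above; chapter is just an illustrative name):

# re-parse the div's HTML so we can search inside it, then collect each <p>'s text
chapter = BeautifulSoup(str(div), features="html.parser")
paragraphs = [p.string for p in chapter.find_all('p') if p.string is not None]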
Full code:
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup

url = 'http://www.seputu.com'
response = requests.get(url)
req_parser = BeautifulSoup(response.content.decode('utf-8'), features="html.parser")
li = req_parser.find_all('li')
temp = BeautifulSoup(str(li), features="html.parser")  # re-parse the string; this step is needed before searching the <li> results by tag again
a = temp.find_all('a')

name_list = []
href_list = []
for i in a:
    name_list.append(i.string)   # chapter title
    href_list.append(i['href'])  # chapter URL

def download(page):
    response = requests.get(href_list[page])
    req_parser = BeautifulSoup(response.content.decode('utf-8'), features="html.parser")
    div = req_parser.find_all('div', class_="content-body")
    chapter = BeautifulSoup(str(div), features="html.parser")
    text = []
    for p in chapter.find_all('p'):
        line = p.string
        if line is not None:
            # drop any characters the gbk console cannot display before printing
            print(line.encode('gbk', 'ignore').decode('gbk', 'ignore'))
            text.append(line)
    with open('novel.txt', 'a+', encoding='utf-8') as f:
        f.write(name_list[page])
        f.write('\n')
        for line in text:
            f.write(line)
            f.write('\n')

for i in range(len(href_list)):
    try:
        download(i)
    except Exception:  # skip chapters that fail to download
        pass
    print('%d is over' % i)
The txt file scraped at the end came to over 9,000 lines.
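One thing worth noting: the try/except in the loop above drops failed chapters silently. A minimal sketch of a slightly more careful version (the one-second delay and the failed list are my own additions, not part of the original script):

import time

failed = []
for i in range(len(href_list)):
    try:
        download(i)
    except Exception as e:
        failed.append((i, e))  # record which chapters failed instead of discarding the error
    print('%d is over' % i)
    time.sleep(1)  # pause between requests to avoid hammering the server

print('failed pages:', failed)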