# 爬小说案例 (example: scraping a novel):
from urllib import request
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
# Index page of the book; the chapter links are read from this page.
url = "http://www.shicimingju.com/book/rulinwaishi.html"

# Request headers sent with every HTTP request made by this script.
# 'Accept-Encoding' (browser value: 'gzip, deflate, sdch') is left commented
# out in the original, so this dict does not set that header.
headers = {
    'Host': 'www.shicimingju.com',
    'Connection': 'keep-alive',
    'Cache-Control': 'max-age=0',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Referer': 'http://www.shicimingju.com/',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Cookie': 'Hm_lvt_4c1638db937a6ad4a0e6a8bdfa32146f=1543454940; Hm_lpvt_4c1638db937a6ad4a0e6a8bdfa32146f=1543460971',
}
# Seconds to wait on the server before giving up on a request. Without a
# timeout, requests may block indefinitely on an unresponsive host.
REQUEST_TIMEOUT = 10

# Output file. It is opened in append mode, so running the script again adds
# the text after whatever the file already contains.
OUTPUT_PATH = "rulinwaishi.txt"


def _fetch_html(session, page_url):
    """Download ``page_url`` with the module-level ``headers`` and return its text.

    Args:
        session: A ``requests.Session`` used to issue the request, so the
            connection can be reused across pages.
        page_url: Absolute URL of the page to download.

    Returns:
        The response body decoded to ``str`` by requests.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            4xx/5xx response status.
    """
    response = session.get(page_url, headers=headers, timeout=REQUEST_TIMEOUT)
    # Fail loudly instead of parsing an error page as if it were content.
    response.raise_for_status()
    # NOTE(review): ``.text`` decodes with the charset requests infers from
    # the response headers; if the output looks garbled, set
    # ``response.encoding`` explicitly before reading ``.text`` -- confirm
    # against the live site.
    return response.text


with requests.Session() as session:
    # Download and parse the index page first, so no output file is touched
    # if the index itself cannot be fetched.
    index_soup = BeautifulSoup(_fetch_html(session, url), "lxml")
    # One <a> tag per chapter: its text is the title, its href the chapter page.
    chapter_links = index_soup.select(".book-mulu > ul > li > a")

    # Open the output once for the whole run rather than once per write.
    with open(OUTPUT_PATH, mode="a", encoding="utf-8") as out_file:
        for chapter_link in chapter_links:
            # Chapter title, followed by a blank line.
            out_file.write("%s\n\n" % chapter_link.text)

            href = chapter_link.get("href")
            if not href:
                # An anchor without a target has no chapter page to download.
                continue
            # Resolve the link against the index URL; this yields the same
            # result as prefixing the host for root-relative hrefs and also
            # copes with absolute or page-relative ones.
            chapter_url = urljoin(url, href)

            try:
                chapter_html = _fetch_html(session, chapter_url)
            except requests.RequestException as exc:
                # Keep going so one bad chapter does not abort the whole book.
                print("Skipping chapter %s: %s" % (chapter_url, exc))
                continue

            chapter_soup = BeautifulSoup(chapter_html, "lxml")
            # Each <p> under .chapter_content is one paragraph of the chapter;
            # write every paragraph followed by a blank line.
            out_file.writelines(
                "%s\n\n" % paragraph.text
                for paragraph in chapter_soup.select(".chapter_content > p")
            )