爬虫---爬小说案例:

爬小说案例:

from urllib import request
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Table-of-contents page of the book; every chapter link is read from it.
url = "http://www.shicimingju.com/book/rulinwaishi.html"

# Browser-like request headers sent with every request to the site.
# "Accept-Encoding" is deliberately left out (the original entry
# "gzip, deflate, sdch" was disabled by the author).
headers = {
    "Host": "www.shicimingju.com",
    "Connection": "keep-alive",
    "Cache-Control": "max-age=0",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Referer": "http://www.shicimingju.com/",
    "Accept-Language": "zh-CN,zh;q=0.8",
    "Cookie": "Hm_lvt_4c1638db937a6ad4a0e6a8bdfa32146f=1543454940; Hm_lpvt_4c1638db937a6ad4a0e6a8bdfa32146f=1543460971",
}

def _fetch_html(page_url):
    """Download ``page_url`` with the shared ``headers`` and return its HTML.

    Args:
        page_url: Absolute URL of the page to download.

    Returns:
        The decoded response body as ``str``.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-2xx HTTP status.
    """
    # A timeout keeps a stalled server from hanging the whole crawl.
    response = requests.get(url=page_url, headers=headers, timeout=10)
    # Fail loudly instead of parsing an error page as if it were a chapter.
    response.raise_for_status()
    # When the server sends text/html without a charset, requests falls back
    # to ISO-8859-1, which garbles Chinese text; detect the encoding from the
    # body in that case.
    if "charset" not in response.headers.get("Content-Type", "").lower():
        response.encoding = response.apparent_encoding
    return response.text


# Parse the table-of-contents page and collect the <a> tag of every chapter.
index_soup = BeautifulSoup(_fetch_html(url), "lxml")
chapter_links = index_soup.select(".book-mulu > ul > li > a")

# Open the output once (still in append mode, UTF-8) instead of reopening it
# for every title and every paragraph.
with open("rulinwaishi.txt", mode="a", encoding="utf-8") as f:
    for link in chapter_links:
        # Chapter title, followed by a blank line.
        f.write(f"{link.text}\n\n")

        # Resolve the href against the index URL so that both root-relative
        # and absolute links produce a valid chapter URL.
        chapter_url = urljoin(url, link.attrs["href"])
        chapter_soup = BeautifulSoup(_fetch_html(chapter_url), "lxml")

        # Each <p> directly under .chapter_content is one paragraph of text.
        for paragraph in chapter_soup.select(".chapter_content > p"):
            f.write(f"{paragraph.text}\n\n")

你可能感兴趣的:(python,爬虫)