# Python学习(一):爬取一整篇小说


import requests
# Request headers sent with every call made through the session.
# NOTE(review): the original entries are missing from this snippet; fill in as needed.
headers = {}
# Proxy mapping used by every call made through the session.
# NOTE(review): the original entries are missing from this snippet; fill in as needed.
proxies = {}

# One Session object carries the same headers and proxies across requests.
session = requests.Session()
session.headers = headers
session.proxies = proxies


def main():
    """Fetch the target page through the shared module-level session."""
    # NOTE(review): placeholder address - it has no valid scheme, so a real
    # 'http://...' URL must be supplied before this request can succeed.
    url = 'http//'
    response = session.get(url)


if __name__ == '__main__':
    main()
# 2. requests.adapters 中的 HTTPAdapter 和 Retry
from requests.adapters import HTTPAdapter, Retry

session = requests.Session()
session.headers = headers
session.proxies = proxies

# Build a Retry policy describing when and how often a request is retried.
retries = Retry(total=5,              # allow at most 5 retries per request
                backoff_factor=0.1,   # scales the growing delay between successive retries
                # retry when the server answers with one of these status codes
                status_forcelist=[500, 502, 503, 504])
# Wrap the policy in an HTTPAdapter and mount it on the session. It is mounted
# for both schemes so that https:// URLs get the same retry behaviour.
adapter = HTTPAdapter(max_retries=retries)
session.mount('http://', adapter)
session.mount('https://', adapter)

# NOTE(review): placeholder - the target address is missing from this snippet.
url = ''
response = session.get(url)


# 导入模块
# 本来想用lxml解析的,结果无法返回数据
from requests.adapters import HTTPAdapter, Retry
from bs4 import BeautifulSoup
import requests
import asyncio
import aiofiles
import pandas
import random
import os

def novelSearch():  # book search
    """Search for a book title and record every result row.

    Prompts for a title, requests the result page through the module-level
    ``session`` and parses the centred result table. One dict per table row is
    appended to the module-level ``cones`` list, which ``novelAnalysis`` later
    reads. The collected rows are finally printed as a table.
    """
    name = input('输入想搜索的书名:')

    # NOTE(review): the search endpoint prefix is missing here; as written the
    # bare title is used as the request URL.
    url = f'{name}'

    response = session.get(url)

    url_html = BeautifulSoup(response.text, features='lxml')
    # Skip the first <tr>, presumably the table's header row.
    url_find = url_html.find('table', align="center").find_all('tr')[1:]
    for i in url_find:
        book_object = i('td')
        book_name = book_object[0].text
        # The link is taken as the 4th piece of the first cell's markup when
        # split on double quotes.
        book_url_pd = str(book_object[0]).split('"')[3]
        # NOTE(review): the site prefix is an empty string here - confirm the base URL.
        book_url = '' + book_url_pd
        book_chapter = book_object[1].text
        book_author = book_object[2].text
        book_number = book_object[3].text
        book_renew = book_object[4].text
        book_state = book_object[5].text
        dic = {'文章名称': book_name, '链接': book_url, '最新章节': book_chapter, '作者': book_author,
               '字数': book_number, '更新': book_renew, '状态': book_state}
        # Keep the row; without this the search results were silently discarded.
        cones.append(dic)

    # Show every column when the result table is printed.
    pandas.set_option('display.max_columns', None)
    book = pandas.DataFrame(cones)
    print(book)
    # Optional export of the results: book.to_excel('book.xlsx', index=False)

def novelAnalysis():  # resolve the chapter links of the chosen book
    """Collect the chapter links of the book the user wants to download.

    Prompts for a title and an author, looks for matching entries in the
    module-level ``cones`` list, fetches each matching book page through the
    module-level ``session`` and appends one dict per chapter link to the
    module-level ``book_title_download`` list, which ``book_download`` reads.
    """
    download_name = input('输入想下载的书名:')
    download_author = input('输入文章作者名称:')

    for dc in cones:

        if download_name == dc['文章名称'] and download_author == dc['作者']:
            url = f'{dc["链接"]}'

            response = session.get(url)

            url_html = BeautifulSoup(response.text, features='lxml')
            # The first 9 anchors inside div#list are skipped.
            # NOTE(review): presumably non-chapter or duplicated links - confirm against the page.
            url_find = url_html.find('div', id="list").find_all('a')[9:]

            for i in url_find:
                book_name_title = i.text
                # The link is taken as the 2nd piece of the anchor's markup
                # when split on double quotes.
                book_url_name_title = str(i).split('"')[1]
                # NOTE(review): the site prefix is an empty string here - confirm the base URL.
                title_url = '' + book_url_name_title
                book_dic = {'文章名称': dc['文章名称'], '章节名称': book_name_title, '章节链接': title_url}
                # Queue the chapter; without this nothing was ever downloaded.
                book_title_download.append(book_dic)

async def book_download():  # parse chapter content and download it
    """Fetch every queued chapter and append its text to the book's file.

    Iterates over the module-level ``book_title_download`` list, requests each
    chapter page through the module-level ``session``, extracts the children
    of ``div#content`` and appends the chapter title plus text to
    ``小说/<book name>.txt``.
    """
    # Make sure the output directory exists before any file is opened.
    os.makedirs('小说', exist_ok=True)

    for i in book_title_download:
        url = i['章节链接']
        book_title = i['章节名称']

        # NOTE: session.get is a blocking call; chapters are fetched one by one.
        response = session.get(url)  # proxies=random.choice(proxies)
        # Retry once when the body came back empty. The previous check compared
        # the Response object itself to '' and so always re-fetched the page.
        if not response.text:
            response = session.get(url)

        url_html = BeautifulSoup(response.text, features='lxml')
        url_find = url_html.find('div', id="content")
        parts = [f'{book_title}\n']
        # The content div can be missing, so guard before iterating over it.
        if url_find is not None:
            for p in url_find:
                # NOTE(review): 'p' is passed as getText's separator argument - confirm this is intended.
                book_chapter = p.getText('p')
                parts.append(f'{book_chapter}\n')
        else:
            print("Can not find div tag with id content")
        book_read = ''.join(parts)

        async with aiofiles.open(f"小说/{i['文章名称']}.txt", 'a', encoding='utf-8') as download:  # write the chapter
            await download.write(book_read)
            print(f'{book_title}  --下载成功')
            # Short pause between chapters.
            await asyncio.sleep(.5)

if __name__ == '__main__':  # run the program

    # Shared state filled and read by the pipeline functions:
    #   cones               - search result rows (novelSearch -> novelAnalysis)
    #   book_title_download - chapter entries (novelAnalysis -> book_download)
    cones = list()
    book_url_z = list()
    book_title_download = list()

    # Pool of user-agent strings; one is picked at random per run to make the
    # scraper less likely to be blocked.
    headers_list = [
        {
            'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1'
        }, {
            'user-agent': 'Mozilla/5.0 (Linux; Android 8.0.0; SM-G955U Build/R16NW) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Mobile Safari/537.36'
        }, {
            'user-agent': 'Mozilla/5.0 (Linux; Android 10; SM-G981B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.162 Mobile Safari/537.36'
        }, {
            'user-agent': 'Mozilla/5.0 (iPad; CPU OS 13_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/87.0.4280.77 Mobile/15E148 Safari/604.1'
        }, {
            'user-agent': 'Mozilla/5.0 (Linux; Android 8.0; Pixel 2 Build/OPD3.170816.012) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/ Mobile Safari/537.36'
        }, {
            'user-agent': 'Mozilla/5.0 (Linux; Android) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.109 Safari/537.36 CrKey/1.54.248666'
        }, {
            'user-agent': 'Mozilla/5.0 (X11; Linux aarch64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.188 Safari/537.36 CrKey/1.54.250320'
        }, {
            'user-agent': 'Mozilla/5.0 (BB10; Touch) AppleWebKit/537.10+ (KHTML, like Gecko) Version/ Mobile Safari/537.10+'
        }, {
            'user-agent': 'Mozilla/5.0 (PlayBook; U; RIM Tablet OS 2.1.0; en-US) AppleWebKit/536.2+ (KHTML like Gecko) Version/ Safari/536.2+'
        }, {
            'user-agent': 'Mozilla/5.0 (Linux; U; Android 4.3; en-us; SM-N900T Build/JSS15J) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30'
        }, {
            'user-agent': 'Mozilla/5.0 (Linux; U; Android 4.1; en-us; GT-N7100 Build/JRO03C) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30'
        }, {
            'user-agent': 'Mozilla/5.0 (Linux; U; Android 4.0; en-us; GT-I9300 Build/IMM76D) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30'
        }, {
            'user-agent': 'Mozilla/5.0 (Linux; Android 7.0; SM-G950U Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.84 Mobile Safari/537.36'
        }, {
            'user-agent': 'Mozilla/5.0 (Linux; Android 8.0.0; SM-G965U Build/R16NW) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.111 Mobile Safari/537.36'
        }, {
            'user-agent': 'Mozilla/5.0 (Linux; Android 8.1.0; SM-T837A) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.80 Safari/537.36'
        }, {
            'user-agent': 'Mozilla/5.0 (Linux; U; en-us; KFAPWI Build/JDQ39) AppleWebKit/535.19 (KHTML, like Gecko) Silk/3.13 Safari/535.19 Silk-Accelerated=true'
        }, {
            'user-agent': 'Mozilla/5.0 (Linux; U; Android 4.4.2; en-us; LGMS323 Build/KOT49I.MS32310c) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/ Mobile Safari/537.36'
        }, {
            'user-agent': 'Mozilla/5.0 (Windows Phone 10.0; Android 4.2.1; Microsoft; Lumia 550) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2486.0 Mobile Safari/537.36 Edge/14.14263'
        }, {
            'user-agent': 'Mozilla/5.0 (Linux; Android 6.0.1; Moto G (4)) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/ Mobile Safari/537.36'
        }, {
            'user-agent': 'Mozilla/5.0 (Linux; Android 6.0.1; Nexus 10 Build/MOB31T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/ Safari/537.36'
        }, {
            'user-agent': 'Mozilla/5.0 (Linux; Android 4.4.2; Nexus 4 Build/KOT49H) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/ Mobile Safari/537.36'
        }, {
            'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/ Mobile Safari/537.36'
        }, {
            'user-agent': 'Mozilla/5.0 (Linux; Android 8.0.0; Nexus 5X Build/OPR4.170623.006) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/ Mobile Safari/537.36'
        }, {
            'user-agent': 'Mozilla/5.0 (Linux; Android 7.1.1; Nexus 6 Build/N6F26U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/ Mobile Safari/537.36'
        }, {
            'user-agent': 'Mozilla/5.0 (Linux; Android 8.0.0; Nexus 6P Build/OPP3.170518.006) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/ Mobile Safari/537.36'
        }, {
            'user-agent': 'Mozilla/5.0 (Linux; Android 6.0.1; Nexus 7 Build/MOB30X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/ Safari/537.36'
        }, {
            'user-agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows Phone 8.0; Trident/6.0; IEMobile/10.0; ARM; Touch; NOKIA; Lumia 520)'
        }, {
            'user-agent': 'Mozilla/5.0 (MeeGo; NokiaN9) AppleWebKit/534.13 (KHTML, like Gecko) NokiaBrowser/8.5.0 Mobile Safari/534.13'
        }, {
            'user-agent': 'Mozilla/5.0 (Linux; Android 9; Pixel 3 Build/PQ1A.181105.017.A1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.158 Mobile Safari/537.36'
        }, {
            'user-agent': 'Mozilla/5.0 (Linux; Android 10; Pixel 4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Mobile Safari/537.36'
        }, {
            'user-agent': 'Mozilla/5.0 (Linux; Android 11; Pixel 3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.181 Mobile Safari/537.36'
        }, {
            'user-agent': 'Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/ Mobile Safari/537.36'
        }, {
            'user-agent': 'Mozilla/5.0 (Linux; Android 8.0; Pixel 2 Build/OPD3.170816.012) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/ Mobile Safari/537.36'
        }, {
            'user-agent': 'Mozilla/5.0 (Linux; Android 8.0.0; Pixel 2 XL Build/OPD1.170816.004) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/ Mobile Safari/537.36'
        }, {
            'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1'
        }, {
            'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1'
        }, {
            'user-agent': 'Mozilla/5.0 (iPad; CPU OS 11_0 like Mac OS X) AppleWebKit/604.1.34 (KHTML, like Gecko) Version/11.0 Mobile/15A5341f Safari/604.1'
        },
    ]

    # NOTE(review): 'Host' and 'Referer' are empty strings here - fill in the
    # target site's values before running.
    headers = {
        'Host': '',
        'Referer': '',
        'Upgrade-Insecure-Requests': '1',
        # Use the chosen entry's UA string. Formatting the whole dict would
        # send "{'user-agent': '...'}" as the header value.
        'User-Agent': random.choice(headers_list)['user-agent'],
    }

    session = requests.Session()
    session.headers = headers

    # Build a Retry policy: up to 10 retries on the listed server-error codes.
    retries = Retry(total=10,
                    status_forcelist=[500, 502, 503, 504])
    # Wrap the policy in an HTTPAdapter and mount it for both schemes so that
    # https:// URLs get the same retry behaviour.
    adapter = HTTPAdapter(max_retries=retries)
    session.mount('http://', adapter)
    session.mount('https://', adapter)

    # Run the pipeline: search -> pick a book and list its chapters -> download.
    novelSearch()
    novelAnalysis()
    asyncio.run(book_download())


