# Scrape the Biquge novel site (爬取笔趣网)

import html
import os
import re
import time

import requests
from bs4 import BeautifulSoup
def get_html_txt(url, retries=3):
    """Fetch one chapter page and return the text of its ``#content`` element.

    Args:
        url: Absolute URL of the chapter page.
        retries: Maximum number of attempts before giving up (default 3).
            The original code retried forever via unbounded recursion and
            never propagated the successful result back up the call chain.

    Returns:
        The chapter text as a string, or None if every attempt failed.
    """
    for _ in range(retries):
        try:
            resp = Sess.get(url=url)
            # The site does not always declare its charset; let requests
            # sniff the real one so Chinese text decodes correctly.
            resp.encoding = resp.apparent_encoding
            soup = BeautifulSoup(resp.text, 'html.parser')
            return soup.select('#content')[0].get_text()
        except (requests.RequestException, IndexError):
            # Narrow catch: network errors or a page missing #content.
            # Pause briefly so retries don't hammer the server.
            time.sleep(1)
    return None
def wt(txt, tit):
    """Write one chapter to ``./超神机械师/<title>.txt`` (UTF-8).

    Args:
        txt: Chapter body text (coerced to str).
        tit: Chapter title; characters that are illegal or awkward in
            file names are stripped before the title is used as a name.
    """
    txt = str(txt)
    # One C-level pass instead of seven chained .replace() calls:
    # drop characters that break Windows file names or shell globbing.
    tit = str(tit).translate(str.maketrans('', '', '*~/()【】'))
    # Create the output directory up front so open() cannot fail on it.
    os.makedirs('./超神机械师', exist_ok=True)
    with open('./超神机械师/' + tit + '.txt', 'w', encoding='utf-8') as f:
        f.write(txt)
if __name__ == '__main__':
    # Request headers mimicking a desktop Chrome browser.  Cache-Control
    # must stay 'no-cache': otherwise a second run gets a cached 304 with
    # no body and the parser finds nothing.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Cookie': 'clickbids=40160; Hm_lvt_c979821d0eeb958aa7201d31a6991f34=1584758773; Hm_lvt_6dfe3c8f195b43b8e667a2a2e5936122=1584758818; Hm_lpvt_6dfe3c8f195b43b8e667a2a2e5936122=1584758818; Hm_lpvt_c979821d0eeb958aa7201d31a6991f34=1584758818',
        'Host': 'www.biquge.info',
        # 'If-Modified-Since': 'Thu, 19 Dec 2019 16:37:49 GMT',  # left out to disable caching
        # 'If-None-Match': 'W/"5dfba75d-5b43"',  # left out to disable caching
        'Referer': 'http://www.biquge.info/40_40160/',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }
    # NOTE: the name `Sess` is module-level state that get_html_txt() reads.
    Sess = requests.session()
    Sess.headers = headers

    url = 'http://www.biquge.info/40_40160/'
    index_page = Sess.get(url=url)
    index_page.encoding = index_page.apparent_encoding
    index_soup = BeautifulSoup(index_page.text, 'html.parser')

    # Collect (absolute_url, chapter_title) pairs from the table of contents.
    urls = [
        (url + entry.a['href'], entry.a['title'])
        for entry in index_soup.select('#list > dl > dd')
    ]

    for chapter_url, chapter_title in urls:
        chapter_text = get_html_txt(chapter_url)
        wt(chapter_text, chapter_title)
        time.sleep(3)  # throttle: be polite to the server

# 你可能感兴趣的:(爬取笔趣网)  -- blog-platform footer ("you may also like"), kept as a comment