Crawler Assignment 1

from bs4 import BeautifulSoup
import requests
import time
import random
import re
if __name__ == '__main__':

    # Table-of-contents page of the novel
    url = 'http://ajnnan.com/88_88846/'

    response = requests.get(url=url)
    response.encoding = 'utf-8'   # the site serves UTF-8; set it before reading .text
    page_text = response.text
    soup = BeautifulSoup(page_text, 'lxml')
    dd_list = soup.find('div', id='list')   # the chapter index container
    a_list = dd_list.find_all('a')
    del a_list[0:12]   # skip the first 12 links, presumably the "latest chapters" preview block
    print(a_list)
    fp = open('./南明第一狠人.txt', 'w', encoding='utf-8')   # one file collecting the whole novel
    # A raw string cannot end in a single backslash (the original worked around
    # this with a stray trailing space), so escape the final backslash instead.
    path = 'C:\\爬虫实验\\xiaoshuo\\'
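    # Alternative sketch: os.path.join sidesteps the trailing-backslash issue
    # entirely (os is in the standard library; not used further below):
    #   import os
    #   chapter_file = os.path.join(r'C:\爬虫实验\xiaoshuo', title + '.txt')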
    for a in a_list:
        time.sleep(6.66)   # fixed delay; the value is arbitrary, but too short an interval gets you blocked for hitting the site too often
        time.sleep(random.random() * 3.24)   # extra random delay so requests don't land on a fixed rhythm; the approach still isn't perfect
        title_1 = a.string
        # Titles carry two kinds of brackets, half-width () and full-width （）,
        # mostly the author begging for monthly votes; strip both. (The author's
        # in-chapter rant sections are not removed.)
        title_2 = re.sub(r'\(.*?\)', '', title_1)   # remove half-width (...)
        title = re.sub(r'（.*?）', '', title_2)      # remove full-width （...）
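        # e.g. a hypothetical title '第10章 誓师(求月票)（二合一）' becomes '第10章 誓师'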
        print(title)        # chapter title
        detail_url = 'http://ajnnan.com' + a['href']   # hrefs on the index page are relative
        print(detail_url)   # chapter URL
        detail_resp = requests.get(url=detail_url)   # headers=headers could be passed here too
        # requests falls back to ISO-8859-1 when the server sends no charset, so
        # set the encoding explicitly rather than re-encoding the decoded text.
        detail_resp.encoding = 'utf-8'
        detail_page_text = detail_resp.text
        detail_soup = BeautifulSoup(detail_page_text, 'lxml')
        div_tag = detail_soup.find('div', attrs={'id': 'content'})
        content = div_tag.text
        fp.write(title + ':' + content + '\n')
        #print(content)
        with open(path + title + '.txt', 'w', encoding='utf-8') as f:
            f.write(title + ':' + content + '\n')
        print(title, 'crawled successfully!')
    fp.close()   # close the combined file once every chapter is written
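The commented-out headers=headers and the notes about getting blocked point at two easy improvements: send a browser-like User-Agent, and retry with backoff instead of giving up on one bad response. The sketch below shows both, plus a single regex that strips both bracket styles in one pass. The User-Agent string, delays, and retry counts are placeholder choices of mine, not values the site is known to require:

import random
import re
import time

import requests

# Placeholder desktop User-Agent string; any mainstream browser UA should do.
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}

# One character class covers half-width () and full-width （）, so a mixed
# pair like '(求月票）' is caught as well.
BRACKETS = re.compile(r'[（(].*?[）)]')

def clean_title(raw_title):
    """Strip bracketed vote-begging and similar asides from a chapter title."""
    return BRACKETS.sub('', raw_title).strip()

def polite_get(url, retries=3):
    """GET with a UA header, a randomized delay, and simple backoff on failure."""
    for attempt in range(1, retries + 1):
        time.sleep(3 + random.random() * 3)   # stay under the site's rate limit
        resp = requests.get(url, headers=HEADERS, timeout=10)
        if resp.status_code == 200:
            resp.encoding = 'utf-8'
            return resp.text
        time.sleep(5 * attempt)               # back off harder after each failure
    raise RuntimeError('giving up on ' + url)

In the loop above, detail_page_text = polite_get(detail_url) would then replace the request plus the two sleeps, and title = clean_title(a.string) would replace the two re.sub calls.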

Due to a content-review issue, the previous one can no longer be viewed.
