Python 爬取笔趣阁小说

Python 爬取笔趣阁小说

完整代码:

import requests
from lxml import etree
import os
# HTTP headers sent with every request made by this script.
# The User-Agent mimics desktop Chrome on macOS. The tokens are spelled and
# spaced the way real browsers send them ("Intel", "AppleWebKit", "Chrome/"),
# with a space between the "(KHTML, like Gecko)" comment and the browser token.
header = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/65.0.3325.162 Safari/537.36'
    ),
}
def gerbookurls():
    """Prompt for a novel's index page and collect its chapter links.

    Reads the index URL from standard input, downloads that page using the
    module-level ``header`` and inspects every ``<dd>`` element matched by
    the XPath ``//div[@class = "box_con"]/div/dl/dd``.

    Returns:
        list[dict]: one dict per chapter, in page order, with the keys
        ``'chapt_urls'`` (absolute chapter URL) and ``'chapt_names'``
        (text of the chapter link). ``<dd>`` elements that lack a link
        or link text are skipped.
    """
    # Function-scope import so this block does not depend on other edits.
    from urllib.parse import urljoin

    print('笔趣阁的官网是: https://www.biquge.com.cn/  去这里找想看的小说,然后把小说网址复制过来就可以了,文件存在C盘的笔趣阁小说文件夹里!')
    url = input('请输入要获取的笔趣阁小说的网页地址:')
    # A timeout keeps a stalled connection from hanging the script forever.
    response = requests.get(url, headers=header, timeout=30)
    tree = etree.HTML(response.text)
    # Chapter links
    entries = tree.xpath('//div[@class = "box_con"]/div/dl/dd')
    clist = []
    for entry in entries:
        hrefs = entry.xpath('a/@href')
        titles = entry.xpath('a/text()')
        if not hrefs or not titles:
            # No usable link in this <dd>; nothing to download for it.
            continue
        clist.append({
            # Resolve the href against the page it came from, so both
            # absolute-path and relative links yield a valid URL.
            'chapt_urls': urljoin(response.url, str(hrefs[0])),
            'chapt_names': titles[0],
        })
    return clist

    #https://www.biquge.com.cn/book/36681/52529.html




# Runs at module load: gerbookurls() prompts on stdin for the index URL and
# fetches it over the network; the result is the list of chapter dicts
# (keys 'chapt_urls' and 'chapt_names') consumed by the download loop below.
clist = gerbookurls()

def getcontent(url):
    """Download one chapter page and return its text fragments.

    Args:
        url: address of the chapter page to fetch.

    Returns:
        list[str]: the text nodes matched by ``//div[@id="content"]/text()``,
        each with every run of four non-breaking spaces (``\\xa0``) removed.
        The list is empty when the XPath matches nothing.
    """
    # A timeout keeps one stalled chapter from hanging the whole download.
    page = requests.get(url, headers=header, timeout=30).text
    tree = etree.HTML(page)
    fragments = tree.xpath('//div[@id="content"]/text()')
    # The site indents paragraphs with four non-breaking spaces; drop them.
    return [fragment.replace('\xa0\xa0\xa0\xa0', '') for fragment in fragments]



# for i in clist:
#     chapt_urls = i['chapt_urls']
#     chapt_names = i['chapt_names']
#     content = getcontent(chapt_urls)
#     text = ''
#     for j in content:
#         text = text + j
#         print('正在下载:%s'%chapt_names)
    # with open('/Users/baby/Desktop/biquge/斗罗大陆/斗罗大陆%s.doc'%chapt_names,'w')as f:
    #     f.write(text)

def mkdirss(path):
    """Create the directory *path* if it is missing and report the outcome.

    Args:
        path: directory to create. Missing parent directories are created
            as well.

    Returns:
        None. A status message is printed to standard output in either case.
    """
    if os.path.exists(path):
        print("---  There is this folder!  ---")
        return

    # makedirs also creates missing parents; exist_ok tolerates the directory
    # appearing between the existence check above and this call.
    os.makedirs(path, exist_ok=True)
    print("---  new folder...  ---")
    print("---  OK  ---")
# Download every chapter listed in ``clist`` into one text file per chapter.

# Characters Windows forbids in file names; chapter titles are used as file
# names, so each of these is replaced with '_' before the file is opened.
_ILLEGAL_NAME_CHARS = str.maketrans({ch: '_' for ch in '\\/:*?"<>|'})

path = 'C:\\笔趣阁小说'
mkdirss(path)
for i in clist:
    chapt_urls = i['chapt_urls']
    chapt_names = i['chapt_names']
    # Report progress once per chapter rather than once per text fragment.
    print('正在下载:%s'%chapt_names)
    content = getcontent(chapt_urls)
    text = ''.join(content)
    file_name = '%s.doc' % str(chapt_names).translate(_ILLEGAL_NAME_CHARS)
    # Explicit UTF-8: the platform default (GBK on Chinese Windows) cannot
    # encode characters such as a stray '\xa0' and would abort the download.
    with open(os.path.join(path, file_name), 'w', encoding='utf-8') as f:
        f.write(text)

你可能感兴趣的:(IT,python,科技)