import urllib.request
from bs4 import BeautifulSoup
import time
import os
def handle_request(url):
    """Return a ``urllib.request.Request`` for *url* with a browser User-Agent.

    The request is only constructed here; nothing is sent over the network.
    A desktop Chrome User-Agent string is attached so the request does not
    go out with urllib's default agent.
    """
    browser_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'
    prepared = urllib.request.Request(url=url)
    prepared.add_header('User-Agent', browser_agent)
    return prepared
def parse_content(content):
    """Return the ``<a>`` tags that link to each book on the index page.

    *content* is parsed with BeautifulSoup's ``lxml`` parser and the result
    of the CSS selection ``.bookmark-list > ul > li > h2 > a`` is returned
    unchanged.
    """
    book_link_selector = '.bookmark-list > ul > li > h2 > a'
    return BeautifulSoup(content, 'lxml').select(book_link_selector)
def parse_content2(content2):
    """Download every chapter of one book into ``<title>/<title>.txt``.

    The book page is parsed for its title (``.book-header > h1``, with any
    leading/trailing ``《`` / ``》`` stripped) and for its chapter links
    (``.book-mulu > ul > li > a``). A directory named after the title is
    created relative to the current working directory, and each chapter is
    fetched with ``get_text`` and written to the book file as the chapter
    title, a newline, and the chapter text.

    A chapter that fails to download or write is reported and skipped so the
    remaining chapters are still attempted.

    Args:
        content2: HTML of the book page, in any form ``BeautifulSoup``
            accepts (bytes, str, or a readable file-like object).

    Raises:
        IndexError: if the page has no ``.book-header > h1`` element.
        OSError: if the directory or the output file cannot be created.
    """
    soup = BeautifulSoup(content2, 'lxml')
    book_title = soup.select('.book-header > h1')[0].string.strip('《》')
    book_name = book_title + '.txt'
    print(book_name)

    # os.mkdir() returns None and fails when the directory already exists;
    # makedirs(exist_ok=True) lets the scraper be re-run without crashing.
    os.makedirs(book_title, exist_ok=True)
    # Let os.path.join pick the separator instead of hard-coding backslashes,
    # so the path points into the directory created above on every platform.
    filepath = os.path.join(book_title, book_name)
    print(filepath)

    content_urls = soup.select('.book-mulu > ul > li > a')
    # The context manager closes the file even if a chapter link is malformed.
    with open(filepath, 'w', encoding='utf8') as fp:
        for content_url in content_urls:
            href = 'http://www.shicimingju.com' + content_url['href']
            # .string is None when the link wraps nested tags; fall back to
            # the concatenated text so the title can still be written.
            title = content_url.string or content_url.get_text()
            print('正在下载--%s--' % title)
            try:
                text = get_text(href)
                fp.write(title + '\n' + text)
            except Exception as exc:
                # Best-effort per chapter, but no longer a bare ``except``:
                # Ctrl-C / SystemExit propagate and the cause is shown.
                print('这个不能下载', repr(exc))
            print('结束下载--%s--' % title)
def get_text(href):
    """Fetch one chapter page and return the text of its content container.

    Args:
        href: Absolute URL of the chapter page.

    Returns:
        The ``.text`` of the first ``<div class="chapter_content">`` found
        in the page.

    Raises:
        urllib.error.URLError: if the page cannot be fetched.
        UnicodeDecodeError: if the response body is not valid UTF-8.
        AttributeError: if the page contains no ``chapter_content`` div
            (``soup.find`` returns ``None`` in that case).
    """
    request = handle_request(href)
    # Close the HTTP response deterministically instead of leaving the
    # socket open until the object happens to be garbage-collected.
    with urllib.request.urlopen(request) as response:
        content3 = response.read().decode('utf8')
    soup = BeautifulSoup(content3, 'lxml')
    return soup.find('div', class_="chapter_content").text
def main():
    """Crawl the book index page and download every book it lists.

    The index at ``http://www.shicimingju.com/book/`` is fetched and handed
    to ``parse_content`` to obtain the book links; each book page is then
    fetched and passed to ``parse_content2``, with a 3-second pause after
    each book.
    """
    url = 'http://www.shicimingju.com/book/'
    # Read the body inside the ``with`` so the connection is closed as soon
    # as the page has been received.
    with urllib.request.urlopen(handle_request(url)) as response:
        content = response.read()

    for book_url in parse_content(content):
        href = 'http://www.shicimingju.com' + book_url['href']
        # Release the book-page connection before the (long) chapter
        # downloads start, rather than holding it open for the whole book.
        with urllib.request.urlopen(handle_request(href)) as response:
            content2 = response.read()
        parse_content2(content2)
        # Pause between books to avoid hammering the site.
        time.sleep(3)
# Start the crawl only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()