import requests

# Download a single chapter page and print the raw HTML.
target = 'https://www.booktxt.com/20_20244/714050.html'
# timeout so a dead host cannot hang the script indefinitely
req = requests.get(url=target, timeout=10)
req.encoding = 'GBK'  # the site serves GBK-encoded Chinese pages
html = req.text
print(html)
BeautifulSoup 中文文档:https://beautifulsoup.readthedocs.io/zh_CN/v4.4.0/
具体章节下载
from bs4 import BeautifulSoup
import requests

# Download a single chapter and extract the story text
# from the <div id="content"> element.
target = 'https://www.booktxt.com/20_20244/714050.html'
req = requests.get(url=target, timeout=10)  # avoid hanging on a dead host
req.encoding = 'GBK'  # the site serves GBK-encoded Chinese pages
html = req.text
bb = BeautifulSoup(html, "lxml")
texts = bb.find_all('div', id='content')
print(texts)
# Turn the indentation spaces between paragraphs into line breaks.
print(texts[0].text.replace(' ', '\r\n'))
目录下载
def run():
    """Download the book's index page and print every chapter title
    together with its absolute URL.

    The chapter links live inside <div id="list">; their hrefs are
    relative, so they are joined with the base URL before printing.
    """
    target = 'https://www.booktxt.com/20_20244/'
    req = requests.get(url=target, timeout=10)  # avoid hanging on a dead host
    req.encoding = 'GBK'  # the site serves GBK-encoded Chinese pages
    html = req.text
    bb = BeautifulSoup(html, "lxml")
    div = bb.find_all('div', id='list')  # container of the chapter list
    a_bf = BeautifulSoup(str(div[0]), "lxml")
    a = a_bf.find_all('a')  # one <a> per chapter
    for each in a:
        # hrefs are relative; prepend the base URL to get a full link
        print(each.string, target + each.get('href'))
合并下载
def run():
    """Download the index page, then fetch every chapter in order and
    append each one to a single text file via writer()/get_contents().
    """
    target = 'https://www.booktxt.com/20_20244/'
    req = requests.get(url=target, timeout=10)  # avoid hanging on a dead host
    req.encoding = 'GBK'  # the site serves GBK-encoded Chinese pages
    html = req.text
    bb = BeautifulSoup(html, "lxml")
    div = bb.find_all('div', id='list')  # container of the chapter list
    a_bf = BeautifulSoup(str(div[0]), "lxml")
    a = a_bf.find_all('a')  # one <a> per chapter
    for each in a:
        print(each.string, target + each.get('href'))
        # Fetch the chapter body and append it to the output file.
        writer(each.string, '我不想当老大.txt', get_contents(target + each.get('href')))
def get_contents(target):
    """Download one chapter page and return its body text.

    :param target: absolute URL of the chapter page
    :return: the text of <div id="content">, with the indentation
             spaces between paragraphs turned into line breaks
    """
    req = requests.get(url=target, timeout=10)  # avoid hanging on a dead host
    req.encoding = 'GBK'  # the site serves GBK-encoded Chinese pages
    html = req.text
    bb = BeautifulSoup(html, "lxml")
    texts = bb.find_all('div', id='content')
    return texts[0].text.replace(' ', '\r\n')
def writer(name, path, text):
    """Append one chapter to *path*.

    Writes the chapter title on its own line, then the body, then a
    blank line as a separator. The file is opened in append mode so
    successive calls build up the whole book.

    :param name: chapter title
    :param path: output file path
    :param text: chapter body text
    """
    with open(path, 'a', encoding='utf-8') as f:
        f.write(name + '\n')
        f.writelines(text)
        f.write('\n\n')
Selenium + ChromeDriver 可以替代以前的 PhantomJS、NightmareJS 等工具,它们能做的事它几乎都能做。
Python的ChromeDriver
ChromeDriver下载
下载地址:https://chromedriver.chromium.org/downloads
安装selenium库
conda install selenium
测试selenium库是否安装成功
# Smoke-test the selenium install: drive a real Chrome window.
from selenium import webdriver

# Starting Chrome requires a matching ChromeDriver on PATH.
browser = webdriver.Chrome()
target_url = 'http://www.baidu.com/'
browser.get(target_url)
运行这段代码,会自动打开浏览器,然后访问百度。
参考:https://blog.csdn.net/u010591976/article/details/104166095