这只是源码,没什么意思,有意思的在这里:震惊!我用Python分析了天蚕土豆的玄幻三部曲竟然发现…
import re
import threading
from pyquery import PyQuery as pq
# Request settings shared by the fetch helpers below: a single User-Agent
# entry whose value identifies as desktop Chrome 81 on 64-bit Windows 10.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/81.0.4044.122 Safari/537.36"
    ),
}
# Collect chapter links and titles from the index page.
def get_href_and_title(url):
    """Return ``(chapter_url, chapter_title)`` tuples scraped from *url*.

    Entries are read from the ``#list > dl > dd`` elements of the page.
    The first nine entries are skipped (presumably a "latest chapters" box
    that duplicates the real list -- TODO confirm against the site), and
    scanning stops right after the entry whose text equals the hard-coded
    title of the final chapter.

    Args:
        url: Address of the novel's chapter index page.

    Returns:
        A list of ``(absolute_url, title)`` tuples in page order.
    """
    lists = []
    # url/headers must be keywords: pyquery treats a second positional
    # argument after a URL as request *data*, not as HTTP headers.
    html = pq(url=url, headers=headers, encoding="utf-8")
    for position, item in enumerate(html("#list > dl > dd").items(), start=1):
        if position <= 9:
            continue
        title = item.text()
        href = item('a').attr('href')
        # An entry without a link cannot be downloaded; concatenating None
        # to the host prefix would raise TypeError.
        if href is not None:
            lists.append(("http://www.tycqxs.com" + href, title))
        if title == "第一千六百二十三章 结束,也是开始。":
            break
    return lists
# Fetch the text of one chapter page.
def get_one_page(url):
    """Return the chapter text found in the ``#content`` element of *url*.

    Newlines are removed from the extracted text. The text is then cut at
    every ``(`` or ``|`` character: the pieces in front of each such
    character are joined, which drops those characters and everything after
    the last one. If that leaves nothing (for example when the text has no
    such character), the full newline-free text is returned instead.

    Args:
        url: Address of a single chapter page.

    Returns:
        The cleaned chapter text as one string.
    """
    # Download and parse once; the original fetched the same page twice.
    # url/headers must be keywords: pyquery treats a second positional
    # argument after a URL as request *data*, not as HTTP headers.
    page = pq(url=url, headers=headers, encoding="utf-8")
    full_text = page("#content").text().replace("\n", "")
    # Raw literal: same pattern value, without the invalid "\(" escape.
    # NOTE(review): the class lists "(" twice plus "|"; one of the
    # parentheses was presumably meant to be full-width -- confirm.
    trimmed = "".join(re.findall(r"(.*?)[\(|(]", full_text))
    return trimmed if trimmed != "" else full_text
def main():
    """Download every chapter of the novel and save each to its own file.

    Chapters come from ``get_href_and_title``; entries whose title does not
    start with "第" are skipped. Each remaining chapter is fetched with
    ``get_one_page`` and written as UTF-8 to ``<title>.txt`` inside the
    hard-coded output directory, then its URL and title are printed.
    """
    for chapter_url, chapter_title in get_href_and_title("http://www.tycqxs.com/57_57672/"):
        # startswith() is safe on an empty title, unlike indexing [0].
        if not chapter_title.startswith("第"):
            continue
        # Fetch before opening the file so a failed download does not leave
        # an empty, truncated chapter file behind.
        content = get_one_page(chapter_url)
        # Raw string: identical path value, without invalid escape sequences.
        target = r'C:\天蚕土豆\DouPoCangQiong\{}.txt'.format(chapter_title)
        with open(target, "w", encoding="utf-8") as f:
            f.write(content)
        print(chapter_url, chapter_title)
if __name__ == '__main__':
    # Pass the function itself as the target. Writing target=main() would
    # call main() right here and hand the thread its return value (None).
    worker = threading.Thread(target=main)
    worker.start()
    # Wait for the download to finish so execution stays sequential, as it
    # was when main() ran inline.
    worker.join()
import re
import threading
from pyquery import PyQuery as pq
# Request settings shared by the fetch helpers below: a single User-Agent
# entry whose value identifies as Edge 18 on 64-bit Windows 10.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/64.0.3282.140 Safari/537.36 Edge/18.17763"
    ),
}
# Collect chapter links and titles from the index page.
def get_href_and_title(url):
    """Return ``(chapter_url, chapter_title)`` tuples scraped from *url*.

    Entries are read from the ``#list > dl > dd`` elements of the page.
    Only entries whose text starts with "第" are kept, and each link is
    prefixed with the site host to make it absolute.

    Args:
        url: Address of the novel's chapter index page.

    Returns:
        A list of ``(absolute_url, title)`` tuples in page order.
    """
    lists = []
    # url/headers must be keywords: pyquery treats a second positional
    # argument after a URL as request *data*, not as HTTP headers.
    for item in pq(url=url, headers=headers)("#list > dl > dd").items():
        title = item.text()
        # startswith() is safe on an empty entry, unlike indexing [0].
        if not title.startswith("第"):
            continue
        href = item('a').attr('href')
        # Skip entries without a link; concatenating None would raise.
        if href is None:
            continue
        lists.append(("http://www.xbiquge.la" + href, title))
    return lists
# Fetch the text of one chapter page.
def get_one_page(url):
    """Return the chapter text found in the ``#content`` element of *url*.

    Newlines are removed from the extracted text. The text is then cut at
    every occurrence of "微信": the pieces in front of each occurrence are
    joined, which drops the marker and everything after its last
    occurrence. If that leaves nothing (for example when the marker is
    absent), the full newline-free text is returned instead of an empty
    string.

    Args:
        url: Address of a single chapter page.

    Returns:
        The cleaned chapter text as one string.
    """
    # url/headers must be keywords: pyquery treats a second positional
    # argument after a URL as request *data*, not as HTTP headers.
    page = pq(url=url, headers=headers, encoding="utf-8")
    full_text = page("#content").text().replace("\n", "")
    trimmed = "".join(re.findall(r"(.*?)微信.*?", full_text))
    # Fall back to the whole text rather than saving an empty chapter.
    return trimmed if trimmed != "" else full_text
def main():
    """Download every chapter of the novel and save each to its own file.

    Chapters come from ``get_href_and_title``. Each one is fetched with
    ``get_one_page`` and written as UTF-8 to ``<title>.txt`` inside the
    hard-coded output directory, then its URL and title are printed.
    """
    for chapter_url, chapter_title in get_href_and_title("http://www.xbiquge.la/15/15/"):
        # Fetch before opening the file so a failed download does not leave
        # an empty, truncated chapter file behind.
        content = get_one_page(chapter_url)
        # Raw string: identical path value, without invalid escape sequences.
        target = r'C:\天蚕土豆\WuDongQianKun\{}.txt'.format(chapter_title)
        with open(target, "w", encoding="utf-8") as f:
            f.write(content)
        print(chapter_url, chapter_title)
if __name__ == '__main__':
    # Pass the function itself as the target. Writing target=main() would
    # call main() right here and hand the thread its return value (None).
    worker = threading.Thread(target=main)
    worker.start()
    # Wait for the download to finish so execution stays sequential, as it
    # was when main() ran inline.
    worker.join()
import re
import threading
from pyquery import PyQuery as pq
# Request settings shared by the fetch helpers below: a single User-Agent
# entry whose value identifies as desktop Chrome 81 on 64-bit Windows 10.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/81.0.4044.122 Safari/537.36"
    ),
}
# Collect chapter links and titles from the index page.
def get_href_and_title(url):
    """Return ``(chapter_url, chapter_title)`` tuples scraped from *url*.

    Entries are read from the ``#main > div > dl > dd`` elements of the
    page, which is decoded as GBK. Entries without a link are skipped. The
    href values are returned exactly as they appear in the page; no host
    prefix is added.

    Args:
        url: Address of the novel's chapter index page.

    Returns:
        A list of ``(href, title)`` tuples in page order.
    """
    lists = []
    # url/headers must be keywords: pyquery treats a second positional
    # argument after a URL as request *data*, not as HTTP headers.
    html = pq(url=url, headers=headers, encoding="gbk")
    for item in html("#main > div > dl > dd").items():
        # Look the link up once and reuse it.
        href = item('a').attr('href')
        if href is None:
            continue
        lists.append((href, item.text()))
    return lists
# Fetch the text of one chapter page.
def get_one_page(url):
    """Return the chapter text found in the ``#BookText`` element of *url*.

    The page is decoded as GBK and newlines are removed from the extracted
    text. The text is then cut at every ``(`` or ``|`` character: the
    pieces in front of each such character are joined, which drops those
    characters and everything after the last one. If that leaves nothing
    (for example when the text has no such character), the full
    newline-free text is returned instead.

    Args:
        url: Address of a single chapter page.

    Returns:
        The cleaned chapter text as one string.
    """
    # Download and parse once; the original fetched the same page twice.
    # url/headers must be keywords: pyquery treats a second positional
    # argument after a URL as request *data*, not as HTTP headers.
    page = pq(url=url, headers=headers, encoding="gbk")
    full_text = page("#BookText").text().replace("\n", "")
    # NOTE(review): the class lists "(" twice plus "|"; one of the
    # parentheses was presumably meant to be full-width -- confirm.
    trimmed = "".join(re.findall(r"(.*?)[(|(]", full_text))
    return trimmed if trimmed != "" else full_text
def main():
    """Download every chapter of the novel and save each to its own file.

    Chapters come from ``get_href_and_title``; entries whose title does not
    start with "第" are skipped. Each remaining chapter is fetched with
    ``get_one_page`` and written as UTF-8 to ``<title>.txt`` inside the
    hard-coded output directory, with "?" removed from the title because
    it is not allowed in Windows file names. The URL and the unmodified
    title are then printed.
    """
    for chapter_url, chapter_title in get_href_and_title("http://www.32xs.org/html/0/1/index.html"):
        # startswith() is safe on an empty title, unlike indexing [0].
        if not chapter_title.startswith("第"):
            continue
        # Fetch before opening the file so a failed download does not leave
        # an empty, truncated chapter file behind.
        content = get_one_page(chapter_url)
        # NOTE(review): the original chained two identical replace("?", "")
        # calls; one was presumably meant for the full-width question mark.
        safe_title = chapter_title.replace("?", "")
        # Raw string: identical path value, without invalid escape sequences.
        target = r'C:\天蚕土豆\DaZhuZai\{}.txt'.format(safe_title)
        with open(target, "w", encoding="utf-8") as f:
            f.write(content)
        print(chapter_url, chapter_title)
if __name__ == '__main__':
    # Pass the function itself as the target. Writing target=main() would
    # call main() right here and hand the thread its return value (None).
    worker = threading.Thread(target=main)
    worker.start()
    # Wait for the download to finish so execution stays sequential, as it
    # was when main() ran inline.
    worker.join()
都是一个套路,代码可以直接拿来用。