用法:命令行参数依次为小说目录页的网址和任务号。
注意:本脚本是根据测试网站的页面结构编写的,不适用于其他网站。
# _*_coding:utf8_*_
from requests_html import HTMLSession
import sys
import os
# One HTTP session is shared by every request this script makes.
session = HTMLSession()
# Break the catalogue URL (first argument) on "/" so the scheme, host and
# first path segment can be picked out by position.
array = sys.argv[1].split('/')
# Task id (second argument); later code prefixes it to log lines and appends
# it to the output file name.
reId = sys.argv[2]
baseHost = "{}//{}".format(array[0], array[2])
bookUrl = "{}/{}/".format(baseHost, array[3])
# Echo the derived host and catalogue URL so they can be checked by eye.
print(baseHost, bookUrl)
# Helper that appends text to a UTF-8 file.
def create_str_to_txt(fileName, str_data):
    """Append ``str_data`` to the UTF-8 text file at ``fileName``.

    The file is created when it does not exist yet, and so is its parent
    directory, so a fresh checkout without the output folders still works.

    Args:
        fileName: Path of the file to append to.
        str_data: Text to append; written exactly as given.
    """
    parent_dir = os.path.dirname(fileName)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    # Append mode already creates a missing file, so no separate "create" step
    # is needed. newline='' disables newline translation so the explicit
    # "\r\n" terminators passed by callers are stored unchanged on every OS.
    with open(fileName, "a", encoding='utf-8', newline='') as f:
        f.write(str_data)
# Crawl the catalogue page and every chapter it links to. Each chapter is
# appended to the book's text file and a progress line is appended to the log.
# The selectors match the page structure of the one site this was tested on.
from urllib.parse import urljoin  # local import: resolves chapter hrefs against the catalogue URL

log_file = './public/log/log.txt'
try:
    r1 = session.get(bookUrl)
    r1.encoding = 'utf-8'
    # The first div with id "info" carries the book title in its <h1>;
    # the first div with id "list" carries the chapter links.
    topInfo = r1.html.find('div#info', first=True)
    content = r1.html.find('div#list', first=True)
    title_el = topInfo.find('h1', first=True) if topInfo is not None else None
    if title_el is None or content is None:
        raise RuntimeError("catalogue page does not have the expected structure: " + bookUrl)
    fileName = title_el.text
    # Every link inside the chapter list.
    strlink = content.find('a')
    for item in strlink:
        href = item.attrs.get('href')
        if not href:
            # An anchor without a target cannot be fetched.
            continue
        # urljoin leaves absolute links untouched and resolves both
        # root-relative ("/x/1.html") and page-relative ("1.html") links.
        chapter_url = urljoin(bookUrl, href)
        a_page = session.get(chapter_url)
        a_page.encoding = 'utf-8'
        page_title = a_page.html.find('h1', first=True)
        page_content = a_page.html.find('div#content', first=True)
        if page_content is None:
            # No chapter body found: report which URL failed and move on.
            print(chapter_url, page_content)
            continue
        # Fall back to the link text when the chapter page has no <h1>.
        title_text = page_title.text if page_title is not None else item.text
        print(title_text, "--------100% 获取成功")
        # Chapter title, body, then a blank line as separator.
        data = title_text+"\r\n"+page_content.text+"\r\n"+"\r\n"
        create_str_to_txt(log_file, reId+"《"+fileName+"》---"+title_text+"--------100% 获取成功"+"\r\n")
        create_str_to_txt('./public/text/{}.txt'.format(fileName+reId), data)
finally:
    # Always write the end marker, even when the crawl fails, so that anything
    # waiting for this marker in the log is not left waiting forever.
    create_str_to_txt(log_file, "已执行结束")
# Only reached when the crawl completed without raising.
print("获取成功")
后端部分:我这里是用 Express 搭建的 Node 后端项目,下面是路由文件。
var express = require('express');
var router = express.Router();
const fs = require('fs')
const path = require("path")
// const db = require("../utils/db")
const exec = require('child_process').exec;
const execSync = require('child_process').execSync;
const execFile = require('child_process').execFile;
/**
 * Build the failure envelope returned by the API.
 *
 * @param {string|Error} msg - Reason for the failure. When an Error is passed,
 *   its `message` text is used.
 * @returns {{status: string, message: *, code: number}} Envelope with status "error" and code 10001.
 */
function error(msg) {
  // Error#message is not enumerable, so serialising the Error itself to JSON
  // would drop the text; unwrap it here instead.
  const message = msg instanceof Error ? msg.message : msg;
  return { status: "error", message, code: 10001 };
}
/**
 * Build the success envelope returned by the API.
 *
 * @param {*} body - Payload placed under `body`.
 * @param {string} [message="请求成功"] - Text placed under `message`.
 * @returns {{status: string, message: string, code: number, body: *}} Envelope with status "success" and code 10000.
 */
function success(body, message = "请求成功") {
  const envelope = { status: "success" };
  envelope.message = message;
  envelope.code = 10000;
  envelope.body = body;
  return envelope;
}
// Absolute paths of the public output folders, resolved from this file's directory.
const text_path = path.join(__dirname, '..', 'public', 'text/')
const log_path = path.join(__dirname, '..', 'public', 'log/')
// The log file sits directly inside the log folder.
const log_file_path = path.join(log_path, 'log.txt')
/* GET home page: renders the "index" view with a fixed title. */
router.get('/', (req, res, next) => {
  const view_data = { title: 'Express' };
  res.render('index', view_data);
});
/**
 * POST /webapi/python/getText
 *
 * Starts the Python scraper for the catalogue URL given in the request body
 * and answers immediately; the scraper keeps running in the background.
 *
 * Body: { url: string }
 * Responds with the success envelope when the job was started, or with the
 * error envelope when `url` is missing, empty or not a string.
 */
router.post('/webapi/python/getText', function (req, res, next) {
  const { url } = req.body || {};
  // The millisecond timestamp is passed to the script as its task id.
  const reId = new Date().getTime();

  if (typeof url !== 'string' || url === "") {
    res.json(error("url数据类型错误"));
    return;
  }

  // Runs asynchronously. execFile passes the arguments straight to the
  // process without going through a shell, so characters such as ";" or "$("
  // in the user-supplied url cannot be used to run extra commands.
  execFile('python3', ['python/getText.py', url, String(reId)], function (err, stdout, stderr) {
    if (err) {
      console.info('stderr : ' + stderr);
    }
    console.log('exec: ' + reId + ":执行结束");
  });

  res.json(success({}, "操作成功"));
});
/**
 * GET /webapi/public/text
 *
 * Lists the entries of the text output folder.
 * Responds with { files: string[] } inside the success envelope, or with the
 * error envelope when the folder cannot be read.
 */
router.get('/webapi/public/text', function (req, res, next) {
  fs.readdir(text_path, (err, data) => {
    if (err) {
      // Send the message text: an Error object serialised to JSON would reach
      // the client as an object without its message.
      res.json(error(err.message));
      return;
    }
    res.json(success({ files: data }, "操作成功"));
  });
});
/**
 * GET /webapi/public/log
 *
 * Reads the log file and responds with its lines (split on "\r\n") as the
 * body of the success envelope, or with the error envelope when the file
 * cannot be read.
 */
router.get('/webapi/public/log', function (req, res, next) {
  fs.readFile(log_file_path, 'utf8', function (err, data) {
    if (err) {
      console.log(err);
      // Always answer: without a response the request would hang until the
      // client gives up.
      res.json(error(err.message));
      return;
    }
    const lines = data.split('\r\n');
    res.json(success(lines));
  });
});
/**
 * GET /webapi/public/clearLog
 *
 * Replaces the log file content with a single timestamped header line.
 * Responds with the success envelope, or with the error envelope when the
 * file cannot be written.
 */
router.get('/webapi/public/clearLog', function (req, res, next) {
  const time = new Date().toLocaleString();
  // Terminate the header with "\r\n" (the separator the log is split on) so
  // the next appended entry starts on its own line.
  const header = `=================${time}===================\r\n`;
  // The callback parameter must not be called `error`: that would shadow the
  // error() helper and make the failure branch throw "error is not a function".
  fs.writeFile(log_file_path, header, 'utf8', function (err) {
    if (err) {
      console.log(err);
      res.json(error(err.message));
      return;
    }
    res.json(success({ message: "清空成功" }, "清空成功"));
  });
});
module.exports = router;
前端部分:这里我用的是 Vue 3 + Element Plus。
<template>
  <!-- Page root; v-loading masks the whole page while `loading` is true. -->
  <div class="page" v-loading="loading">
    <!-- Toolbar: catalogue URL input and action buttons. -->
    <el-row>
      <el-col :span="8">
        <el-input v-model="url"></el-input>
      </el-col>
      <el-col :span="12" style="text-align: left; padding-left: 40px">
        <el-button @click="sendUrl" type="primary">开始获取</el-button>
        <el-button @click="getBooklogOne">刷新日志</el-button>
        <el-button @click="clearAllLogs">清空日志</el-button>
        <el-button @click="openDownload">打开下载区</el-button>
      </el-col>
    </el-row>
    <!-- Shows the log lines while downloadList is empty, otherwise the download links. -->
    <div class="content" v-loading="logLoading">
      <div ref="content" v-if="downloadList.length === 0" class="content1">
        <p v-for="(item, index) in dataList" :key="index">{{ item }}</p>
      </div>
      <div ref="content" v-else class="content2">
        <el-row v-for="(item, index) in downloadList" :key="index">
          <el-link :href="`./text/${item}`" :download="item">{{ item }} >>>>>点击下载</el-link>
        </el-row>
      </div>
    </div>
  </div>
</template>
<script>
import { getAllFileName, clearLogs, postUrl, getLogs } from "@/api/common";

/**
 * Scraper control page: submits a catalogue URL, polls the log while the job
 * runs, and lists the generated text files for download.
 *
 * NOTE(review): the "@/api/common" helpers are assumed to return promises that
 * resolve with the API payload (log lines array, { files }, { message }) —
 * confirm against that module.
 */
export default {
  name: "HelloWorld",
  props: {
    msg: String,
  },
  data() {
    return {
      url: "", // catalogue URL typed by the user
      dataList: [], // log lines shown in the console-style panel
      loading: false, // full-page mask, set while a job is being submitted
      logLoading: false, // mask over the content panel while the log is fetched
      downloadList: [], // file names shown in the download view; empty shows the log view
    };
  },
  methods: {
    /** Submit the URL, then start polling the log after three seconds. */
    sendUrl() {
      this.loading = true;
      postUrl({ url: this.url })
        .then(() => {
          setTimeout(this.getBooklog, 3000);
        })
        .catch(() => {
          // Without this the full-page mask would stay up after a failed submit.
          this.loading = false;
        });
    },
    /** Scroll the visible panel into view once the DOM reflects the new data. */
    scrollContentIntoView() {
      this.$nextTick(() => {
        const panel = this.$refs["content"];
        if (panel) {
          panel.scrollIntoView(false);
        }
      });
    },
    /** Fetch the log and keep polling every three seconds until the end marker is its last line. */
    getBooklog() {
      this.logLoading = true;
      getLogs()
        .then((res) => {
          this.logLoading = false;
          this.loading = false;
          this.dataList = res;
          this.scrollContentIntoView();
          if (res[res.length - 1] !== "已执行结束") {
            setTimeout(this.getBooklog, 3000);
          } else {
            this.$message("执行结束");
          }
        })
        .catch(() => {
          // Stop polling and drop both masks when the log cannot be fetched.
          this.logLoading = false;
          this.loading = false;
        });
    },
    /** Fetch the log once and switch back from the download view to the log view. */
    getBooklogOne() {
      this.logLoading = true;
      getLogs()
        .then((res) => {
          this.logLoading = false;
          this.loading = false;
          this.dataList = res;
          this.downloadList = [];
          this.scrollContentIntoView();
        })
        .catch(() => {
          this.logLoading = false;
          this.loading = false;
        });
    },
    /** Load the generated file names, which switches the panel to the download view. */
    openDownload() {
      getAllFileName().then((res) => {
        this.downloadList = res.files;
      });
    },
    /** Ask the server to clear the log and show the returned message. */
    clearAllLogs() {
      clearLogs().then((res) => {
        this.$message(res.message);
      });
    },
  },
};
</script>
<style scoped lang="scss">
h3 {
  margin: 40px 0 0;
}
ul {
  list-style-type: none;
  padding: 0;
}
li {
  display: inline-block;
  margin: 0 10px;
}
a {
  color: #42b983;
}
.page {
  padding: 40px;
}
// Fixed-height scrolling panel that hosts either the log or the download list.
.content {
  margin-top: 20px;
  border-top: 1px solid #ccc;
  text-align: left;
  height: 600px;
  overflow: auto;
  // Log view: light text on a black background.
  .content1 {
    background-color: #000;
    color: #ddd;
    p {
      padding-left: 20px;
      font-size: 16px;
      line-height: 20px;
    }
  }
  // Download view.
  .content2 {
    font-size: 18px;
    line-height: 30px;
  }
}
</style>