方案一:
pdf2htmlEX
package com.realize.controller;
import cn.hutool.http.HttpUtil;
import com.alibaba.fastjson2.JSONObject;
import com.realize.util.MsgUtil;
import com.realize.util.OssUtil;
import com.realize.util.PdfConvertUtil;
import com.realize.util.StreamGobbler;
import lombok.extern.slf4j.Slf4j;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.ModelAttribute;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RestController;
import java.io.*;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.List;
@RestController
@Slf4j
public class ParserController {
/**
 * Trivial GET endpoint mapped to {@code /test}.
 *
 * @return the fixed string {@code "test"}
 */
@GetMapping("/test")
public String test() {
    final String reply = "test";
    return reply;
}
// @PostMapping("/parseHtml")
// public JSONObject parseHtml(@ModelAttribute("htmlUrl") String htmlUrl) {
// try (Playwright playwright = Playwright.create()) {
// Browser browser = playwright.chromium().launch(new BrowserType.LaunchOptions().setHeadless(true));
// Page page = browser.newPage();
// String filePath = "/mnt/temp/html/" + RandomUtil.randomString(10) + ".html";
String filePath = "/Users/sunyechen/IdeaProjects/realize-nacos/bin/" + RandomUtil.randomString(10) + ".html";
// HttpUtil.downloadFile(htmlUrl, filePath);
// page.navigate("file:" + filePath);
// page.evaluate("var imgList=document.getElementsByTagName('img');" +
// "for(var i=0;i allLines = Files.readAllLines(Paths.get(htmlFilePath), Charset.forName("UTF-8"));
String content = String.join("\n", allLines);
// File file = new File(htmlFilePath);
// BufferedReader reader = new BufferedReader(new FileReader(file));
// String line = "", oldContent = "";
// while ((line = reader.readLine()) != null) {
// oldContent += line + "\n";
// }
// reader.close();
content = content.replaceAll("github", "zzz").replaceAll("pdf2htmlEX", "tg").replaceAll("", " ");
File file = new File(htmlFilePath);
file.delete();
FileWriter writer = new FileWriter(htmlFilePath);
writer.write(content);
writer.close();
log.info("html文件处理完成{},{}", pdfUrl, ossKey);
result.put("code", 0);
} catch (IOException e) {
e.printStackTrace();
result.put("code", -1);
} finally {
//上传所有文件
String ossPath = ossKey.substring(0, ossKey.lastIndexOf("/") + 1);
OssUtil.batchFileUploadOssUrl(fileFolder, ossPath);
log.info("文件上传成功,完整链接:https://oss.imvib.com/{}", ossKey.replace(".html", ".pdf"));
}
} else {
MsgUtil.sendDingTalkMsg(pdfUrl);
result.put("code", -1);
}
return result;
}
/**
 * Reads the complete content of a file into a byte array.
 *
 * <p>Delegates to {@link Files#readAllBytes(java.nio.file.Path)} instead of issuing a single
 * {@code InputStream.read(byte[])} call: one {@code read} is allowed to return fewer bytes
 * than requested, and sizing the buffer from {@link File#length()} yields an empty or
 * truncated result for files whose reported length does not match their content.
 *
 * @param file the file to read; must not be {@code null}
 * @return every byte of the file, in order (empty array for an empty file)
 * @throws IOException if the file does not exist or cannot be read
 */
private static byte[] readAllBytes(File file) throws IOException {
    return Files.readAllBytes(file.toPath());
}
/**
 * Local scratch entry point used to try out HTML post-processing steps by hand.
 *
 * <p>The only live code reads one hard-coded HTML file twice as UTF-8 (once through
 * {@link Files#readAllLines}, once through a {@link BufferedReader}) and prints both
 * results to standard output. Earlier experiments are kept below as commented-out
 * snippets for reference.
 *
 * @param args unused
 * @throws Exception if the hard-coded file cannot be read
 */
public static void main(String[] args) throws Exception {
    // --- Disabled experiment: append PdfConvertUtil.addScript to the first <script> element with Jsoup ---
    // The three statements working on `script` depend on the `html` document declared in this
    // snippet, so they are kept disabled together with it.
    // String htmlFilePath = "/Users/sunyechen/doc/test/矩阵股份:长江证券承销保荐有限公司关于矩阵纵横设计股份有限公司使用募集资金置换预先投入募投项目及已支付发行费用的自筹资金的核查意见.html";
    // File htmlFile = new File(htmlFilePath);
    // Document html = Jsoup.parse(htmlFile);
    // Element script = html.select("script").first();
    // String sourceScript = script.html();
    // script.html(sourceScript + PdfConvertUtil.addScript);
    // FileOutputStream fos = new FileOutputStream(htmlFilePath.replace(".html", "_.html"), false);
    // OutputStreamWriter osw = new OutputStreamWriter(fos, "UTF-8");
    // osw.write(html.outerHtml());
    // osw.close();

    // --- Disabled experiment: download an HTML page and print its body through Playwright ---
    // try (Playwright playwright = Playwright.create()) {
    // Browser browser = playwright.chromium().launch(new BrowserType.LaunchOptions().setHeadless(true));
    // BrowserContext context = browser.newContext(new Browser.NewContextOptions());
    // Page page = browser.newPage();
    // String htmlUrl = "https://realizedongmi.oss-cn-shanghai.aliyuncs.com/a-filings/test/2023-01-03%201%20%E5%8F%91%E8%A1%8C%E4%BA%BA%E5%8F%8A%E4%BF%9D%E8%8D%90%E6%9C%BA%E6%9E%84%E5%85%B3%E4%BA%8E%E4%BA%8C%E8%BD%AE%E5%AE%A1%E6%A0%B8%E9%97%AE%E8%AF%A2%E5%87%BD%E7%9A%84%E5%9B%9E%E5%A4%8D%EF%BC%88%E4%BF%AE%E8%AE%A2%E7%A8%BF%EF%BC%89_%E6%98%93%E7%91%9E%E7%94%9F%E7%89%A9.htm";
    // HttpUtil.downloadFile(htmlUrl, "/Users/sunyechen/IdeaProjects/realize-nacos/bin/1.html");
    // page.navigate("file:/Users/sunyechen/IdeaProjects/realize-nacos/bin/1.html");
    // System.out.println(page.innerHTML("body"));
    // }

    // --- Disabled experiment: URL-decode an OSS link ---
    // System.out.println(URLDecoder.decode("https://oss.imvib.com/a-filings%252Foriginal%252F000586%252F2023-04-07+%25E5%25B9%25B4%25E5%25BA%25A6%25E5%2585%25B3%25E8%2581%2594%25E6%2596%25B9%25E8%25B5%2584%25E9%2587%2591%25E5%258D%25A0%25E7%2594%25A8%25E4%25B8%2593%25E9%25A1%25B9%25E5%25AE%25A1%25E8%25AE%25A1%25E6%258A%25A5%25E5%2591%258A.PDF", "UTF-8"));

    // --- Disabled experiment: read a file, apply a replaceAll, write the result to a "_1.html" copy ---
    // try {
    // File file = new File("/Users/sunyechen/doc/test/矩阵股份:长江证券承销保荐有限公司关于矩阵纵横设计股份有限公司使用募集资金置换预先投入募投项目及已支付发行费用的自筹资金的核查意见.html");
    // BufferedReader reader = new BufferedReader(new FileReader(file));
    // String line = "", oldContent = "";
    // while ((line = reader.readLine()) != null) {
    // oldContent += line + "\n";
    // }
    // reader.close();
    // String newContent = oldContent.replaceAll("", " ");
    // FileWriter writer = new FileWriter(new File("/Users/sunyechen/doc/test/矩阵股份:长江证券承销保荐有限公司关于矩阵纵横设计股份有限公司使用募集资金置换预先投入募投项目及已支付发行费用的自筹资金的核查意见_1.html"));
    // writer.write(newContent);
    // writer.close();
    // System.out.println("File updated successfully.");
    // } catch (IOException e) {
    // e.printStackTrace();
    // }

    // --- Disabled experiment: same read/replace/write flow with the github / pdf2htmlEX replacements ---
    // log.info("start");
    // String htmlFilePath = "/Users/sunyechen/sfit/7745c98a5ba34525937bce19519c0b1e.html";
    // try {
    // File file = new File(htmlFilePath);
    // BufferedReader reader = new BufferedReader(new FileReader(file));
    // String line = "", oldContent = "";
    // while ((line = reader.readLine()) != null) {
    // oldContent += line + "\n";
    // }
    // reader.close();
    // String newContent = oldContent.replaceAll("github", "zzz").replaceAll("pdf2htmlEX", "tanqiuhuashigou").replaceAll("", " ");
    // String newHtmlFilePath = htmlFilePath.replace(".html", "_.html");
    // FileWriter writer = new FileWriter(newHtmlFilePath);
    // writer.write(newContent);
    // writer.close();
    // log.info("html文件处理完成{},{}");
    //
    // } catch (IOException e) {
    // e.printStackTrace();
    // }

    log.info("start");

    // --- Disabled experiment: upload every file of a local folder to OSS ---
    // String ossKey = "ann/688249/2023/4/688249_20230412_9XYK/688249_20230412_9XYK.html";
    // String ossPath = ossKey.substring(0, ossKey.lastIndexOf("/") + 1);
    // OssUtil.batchFileUploadOssUrl("/Users/sunyechen/sfit/test/", ossPath);
    // File[] fileList = new File("/Users/sunyechen/sfit/test/").listFiles();
    // for (int i = 0; i < fileList.length; i++) {
    // OssUtil.fileUploadOssUrl(fileList[i], ossPath + fileList[i].getName());
    // fileList[i].delete();
    // }

    String htmlFilePath = "/Users/sunyechen/sfit/test/600499_20230415_EVH7.html";
    // Both reads decode with the same explicit charset so their output is comparable.
    Charset utf8 = Charset.forName("UTF-8");

    // First read: whole file as lines, joined with '\n' (no trailing newline).
    List<String> allLines = Files.readAllLines(Paths.get(htmlFilePath), utf8);
    String content = String.join("\n", allLines);
    System.out.println(content);

    log.info("start");

    // Second read: line by line, each line followed by '\n'. try-with-resources closes the
    // reader even if readLine() throws; StringBuilder avoids quadratic string concatenation.
    StringBuilder oldContent = new StringBuilder();
    try (BufferedReader reader = new BufferedReader(
            new InputStreamReader(new FileInputStream(htmlFilePath), utf8))) {
        String line;
        while ((line = reader.readLine()) != null) {
            oldContent.append(line).append('\n');
        }
    }
    System.out.println(oldContent);
    log.info("end");
}
}
方案二:
kkFileView-4.0.0
kkFileView - 在线文件预览
方案三:
wkhtmltox-0.12.6-1.centos7.x86_64.rpm
wkhtmltopdf