import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.rendering.ImageType; import org.apache.pdfbox.rendering.PDFRenderer; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.select.Elements; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.util.StringUtils; import sun.misc.BASE64Encoder; import javax.imageio.ImageIO; import javax.servlet.http.HttpServletRequest; import java.awt.image.BufferedImage; import java.io.*; import java.util.ArrayList; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * PreviewUtil * * @author zhaomeng * @date 2017/4/13 */ public class PreviewUtil { private static final Logger logger = LoggerFactory.getLogger(PreviewUtil.class); /** * word 预览 * @param prefix 文件后缀 * @param savePath 存放路径 * @param fileName 文件名称 * @param projectUrl 项目路径 * @return */ public static String previewWord(String prefix, String savePath, String fileName, String projectUrl){ String html =""; OfficeTool o=new OfficeTool(savePath+fileName,savePath,projectUrl+"/wordFile/"); if(("docx").equals(prefix)||("DOCX").equals(prefix)){ html = o.get07Html(); }else if(("doc").equals(prefix)||("DOC").equals(prefix)){ html = o.get03Html(); } Document doc = Jsoup.parse(html); try{ File f = new File(savePath+"word"+File.separator+"media"+File.separator); File[] files = f.listFiles(); if(!StringUtils.isEmpty(files)){ for(int i=0;i
/**
 * Renders every page of a PDF, stacks the pages vertically into one image and
 * returns that image as a Base64-encoded JPEG.
 *
 * @param filepath directory plus file name of the PDF to render
 * @return the Base64 string (no line breaks), or {@code null} when the PDF
 *         cannot be loaded or rendered, or has no pages
 */
public static String getPdfImgString(String filepath) {
    // try-with-resources closes the document even when rendering fails.
    try (PDDocument document = PDDocument.load(new File(filepath), (String) null)) {
        // One renderer is enough for the whole document.
        PDFRenderer renderer = new PDFRenderer(document);
        int size = document.getNumberOfPages();
        List<BufferedImage> piclist = new ArrayList<BufferedImage>(size);
        for (int i = 0; i < size; i++) {
            piclist.add(renderer.renderImageWithDPI(i, 130, ImageType.RGB));
        }
        return yPic(piclist);
    } catch (Exception e) {
        logger.error("图片转换base64出错" + e.getMessage(), e);
        return null;
    }
}

/**
 * Stacks the given images vertically, in list order, and returns the result
 * as a Base64-encoded JPEG.
 *
 * <p>The images may differ in width and height: the canvas is as wide as the
 * widest image, and the area to the right of narrower images is white.
 *
 * @param piclist images to stack, top to bottom
 * @return the Base64 string (no line breaks), or {@code null} when
 *         {@code piclist} is {@code null} or empty
 * @throws Exception if the stacked image cannot be encoded as JPEG
 */
public static String yPic(List<BufferedImage> piclist) throws Exception {
    if (piclist == null || piclist.isEmpty()) {
        logger.error("图片数组为空");
        return null;
    }

    int totalHeight = 0;
    int maxWidth = 0;
    for (BufferedImage page : piclist) {
        maxWidth = Math.max(maxWidth, page.getWidth());
        totalHeight += page.getHeight();
    }

    BufferedImage imageResult = new BufferedImage(maxWidth, totalHeight, BufferedImage.TYPE_INT_BGR);
    java.awt.Graphics2D canvas = imageResult.createGraphics();
    try {
        canvas.setColor(java.awt.Color.WHITE);
        canvas.fillRect(0, 0, maxWidth, totalHeight);
        // The offset of an image is the summed height of all images above it,
        // so it is advanced only after the image has been drawn.
        int offsetY = 0;
        for (BufferedImage page : piclist) {
            canvas.drawImage(page, 0, offsetY, null);
            offsetY += page.getHeight();
        }
    } finally {
        canvas.dispose();
    }

    ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
    if (!ImageIO.write(imageResult, "jpg", outputStream)) {
        throw new IOException("No ImageIO writer available for format jpg");
    }
    // java.util.Base64 replaces sun.misc.BASE64Encoder, which is an internal
    // API that no longer exists on JDK 9+.
    return java.util.Base64.getEncoder().encodeToString(outputStream.toByteArray());
}

/** Matches URLs in already-escaped text; compiled once because Pattern is immutable. */
private static final Pattern URL_PATTERN = Pattern.compile(
        "(?i)\\b((?:https?://|www\\d{0,3}[.]|[a-z0-9.\\-]+[.][a-z]{2,4}/)(?:[^\\s()<>]+|\\(([^\\s()<>]+|(\\([^\\s()<>]+\\)))*\\))+(?:\\(([^\\s()<>]+|(\\([^\\s()<>]+\\)))*\\)|[^\\s`!()\\[\\]{};:\'\".,<>?«»“”‘’]))");

/**
 * Reads a UTF-8 text file and converts its content to an HTML fragment:
 * HTML special characters are escaped, line ends become {@code <br>}, runs of
 * spaces and tabs are preserved with {@code &nbsp;}, and URLs are wrapped in
 * anchor elements.
 *
 * @param s path of the text file
 * @return the HTML fragment, or {@code null} when the file does not exist or
 *         cannot be read
 */
public static String txtToHtml(String s) {
    if (s == null) {
        logger.error("读取文件内容出错");
        return null;
    }
    File file = new File(s);
    if (!file.isFile()) {
        logger.error("找不到指定的文件");
        return null;
    }

    StringBuilder builder = new StringBuilder();
    try (BufferedReader reader = new BufferedReader(
            new InputStreamReader(new FileInputStream(file), "UTF-8"))) {
        String lineTxt;
        while ((lineTxt = reader.readLine()) != null) {
            appendEscapedLine(builder, lineTxt);
        }
    } catch (IOException e) {
        logger.error("读取文件内容出错", e);
        return null;
    }
    return URL_PATTERN.matcher(builder).replaceAll("<a href=\"$1\">$1</a>");
}

/** Appends one line of text, HTML-escaped and terminated by a line break, to {@code builder}. */
private static void appendEscapedLine(StringBuilder builder, String lineTxt) {
    boolean previousWasASpace = false;
    for (char c : (lineTxt + "\n").toCharArray()) {
        if (c == ' ') {
            if (previousWasASpace) {
                // Every second space of a run becomes non-breaking so that
                // the browser does not collapse the run.
                builder.append("&nbsp;");
                previousWasASpace = false;
                continue;
            }
            previousWasASpace = true;
        } else {
            previousWasASpace = false;
        }
        switch (c) {
            case '<':
                builder.append("&lt;");
                break;
            case '>':
                builder.append("&gt;");
                break;
            case '&':
                builder.append("&amp;");
                break;
            case '"':
                builder.append("&quot;");
                break;
            case '\n':
                builder.append("<br>");
                break;
            // We need Tab support here, because we print StackTraces as HTML
            case '\t':
                builder.append("&nbsp; &nbsp; &nbsp;");
                break;
            default:
                builder.append(c);
        }
    }
}

/**
 * Reads an image file and returns its bytes Base64-encoded.
 *
 * @param filePath path of the image file
 * @return the Base64 string (no line breaks), or {@code null} when the file
 *         cannot be read
 */
public static String GetImageStr(String filePath) {
    // InputStream.available() is only an estimate and a single read() may
    // return fewer bytes than requested, so the file is copied in a loop.
    try (InputStream in = new FileInputStream(filePath)) {
        ByteArrayOutputStream data = new ByteArrayOutputStream();
        byte[] chunk = new byte[8192];
        int count;
        while ((count = in.read(chunk)) != -1) {
            data.write(chunk, 0, count);
        }
        return java.util.Base64.getEncoder().encodeToString(data.toByteArray());
    } catch (IOException e) {
        logger.error("Failed to read image file " + filePath, e);
        return null;
    }
}
} // end of class PreviewUtil
下面是word转html的工具类
import java.awt.Color; import java.awt.Dimension; import java.awt.Graphics2D; import java.awt.RenderingHints; import java.awt.image.BufferedImage; import java.io.BufferedWriter; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.FileWriter; import java.io.IOException; import java.io.OutputStreamWriter; import java.util.List; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import javax.xml.transform.OutputKeys; import javax.xml.transform.Transformer; import javax.xml.transform.TransformerException; import javax.xml.transform.TransformerFactory; import javax.xml.transform.dom.DOMSource; import javax.xml.transform.stream.StreamResult; import org.apache.commons.io.output.ByteArrayOutputStream; import org.apache.commons.lang3.StringUtils; import org.apache.log4j.Logger; import org.apache.poi.hslf.model.TextRun; import org.apache.poi.hslf.usermodel.RichTextRun; import org.apache.poi.hslf.usermodel.SlideShow; import org.apache.poi.hwpf.HWPFDocument; import org.apache.poi.hwpf.converter.HtmlDocumentFacade; import org.apache.poi.hwpf.converter.PicturesManager; import org.apache.poi.hwpf.converter.WordToHtmlConverter; import org.apache.poi.hwpf.usermodel.Picture; import org.apache.poi.hwpf.usermodel.PictureType; import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.xwpf.converter.core.FileImageExtractor; import org.apache.poi.xwpf.converter.core.IURIResolver; import org.apache.poi.xwpf.converter.xhtml.DefaultContentHandlerFactory; import org.apache.poi.xwpf.converter.xhtml.IContentHandlerFactory; import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter; import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions; import org.apache.poi.xwpf.usermodel.XWPFDocument; import org.w3c.dom.Document; import org.w3c.dom.Element; import org.xml.sax.ContentHandler; public class 
OfficeTool { public Logger logger = Logger.getLogger(OfficeTool.class); private String path; //文件路径 private String imageSavePath; //图片保存保存路径 private String imageUrl; //图片 src属性 private String output; private HtmlDocumentFacade htmlDocumentFacade; private Element window; private Element topbar; private Element info; private Element outline; private Element page; private Element ul; public OfficeTool(File file) { this.path = file.getAbsolutePath(); String filename = this.getFileName(path, false); this.imageSavePath = file.getParentFile().getAbsolutePath() + "/" + filename + "_img/"; this.imageUrl = this.imageSavePath.replaceAll("([D][:])*[/|\\\\]+", "/"); System.out.println(imageUrl); this.output = file.getParentFile().getAbsolutePath() + "/" + filename + ".html"; new File(this.imageSavePath).mkdirs(); } public OfficeTool(String path, String imageSavePath, String imageUrl) { this.path = path; this.imageSavePath = imageSavePath; this.imageUrl = imageUrl; File imFile = new File(imageSavePath); boolean mkdirs = imFile.mkdirs(); } public OfficeTool(String path, String imageSavePath, String imageUrl, String output) { this.path = path; this.imageSavePath = imageSavePath; this.imageUrl = imageUrl; this.output = output; File imFile = new File(imageSavePath); imFile.mkdirs(); } public String get03Html() { String html = ""; try { html = convert2Html(path); } catch (TransformerException | IOException | ParserConfigurationException e) { e.printStackTrace(); } return html; } public String get07Html() { String html = ""; try { html = this.doGenerateHTMLFile(); } catch (Exception e) { e.printStackTrace(); } return html; } public String getPPTHtml(){ String html = ""; try { this.convert(this.path, this.output); // html = MyFileUtils.fileToString(output,"UTF-8"); //html = MyFileUtils.fileToString(output,"GB2312"); } catch (IOException | TransformerException e) { e.printStackTrace(); } return html; } public String save03AsHtml(String content) { this.writeFile(content, output); return 
output; } public String save07AsHtml(String content) { this.writeFile(content, output); return output; } private void writeFile(String content, String path) { try (FileOutputStream fos = new FileOutputStream(new File(path)); BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(fos, "GB2312"))){ // bw = new BufferedWriter(new OutputStreamWriter(fos, "UTF-8")); bw.write(content); } catch (IOException ioe) { ioe.printStackTrace(); } } private String convert2Html(String fileName) throws TransformerException, IOException, ParserConfigurationException { POIFSFileSystem fss = new POIFSFileSystem(new FileInputStream(fileName)); HWPFDocument wordDocument = new HWPFDocument(fss); WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter( DocumentBuilderFactory.newInstance().newDocumentBuilder() .newDocument()); wordToHtmlConverter.setPicturesManager(new PicturesManager() { public String savePicture(byte[] content, PictureType pictureType, String suggestedName, float widthInches, float heightInches) { return imageUrl + suggestedName; } }); wordToHtmlConverter.processDocument(wordDocument); // save pictures List pics = wordDocument.getPicturesTable().getAllPictures(); File file = new File(imageSavePath+"word"+File.separator+"media"+File.separator); //判断上传文件的保存目录是否存在 if (!file.exists() && !file.isDirectory()) { logger.info(imageSavePath+"word"+File.separator+"media"+File.separator + "目录不存在,需要创建"); //创建目录 file.mkdirs(); } if (pics != null) { for (int i = 0; i < pics.size(); i++) { Picture pic = (Picture) pics.get(i); try { pic.writeImageContent(new FileOutputStream(imageSavePath+"word"+File.separator+"media"+File.separator + pic.suggestFullFileName())); } catch (FileNotFoundException e) { e.printStackTrace(); } } } Document htmlDocument = wordToHtmlConverter.getDocument(); ByteArrayOutputStream out = new ByteArrayOutputStream(); DOMSource domSource = new DOMSource(htmlDocument); StreamResult streamResult = new StreamResult(out); TransformerFactory tf = 
TransformerFactory.newInstance(); Transformer serializer = tf.newTransformer(); // serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8"); serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8"); serializer.setOutputProperty(OutputKeys.INDENT, "yes"); serializer.setOutputProperty(OutputKeys.METHOD, "html"); serializer.transform(domSource, streamResult); out.close(); return out.toString(); } /** * word07转HTML * * @return * @throws IOException */ private String doGenerateHTMLFile() throws Exception { logger.info("=============进入解析================="); XWPFDocument document = new XWPFDocument(new FileInputStream(new File(path))); XHTMLOptions options = XHTMLOptions.create();// .indent( 4 ); IContentHandlerFactory f = new DefaultContentHandlerFactory(); // Extract image options.setExtractor(new FileImageExtractor(new File(imageSavePath))); // URI resolver options.URIResolver(new IURIResolver() { @Override public String resolve(String uri) { return imageUrl + uri; } }); ByteArrayOutputStream out = new ByteArrayOutputStream(); ContentHandler contentHandler = f.create(out, null, options); XHTMLConverter.getInstance().convert(document, out, options); out.close(); return out.toString(); } /** * PPT to HTML * @param filePath PPT file path * @param output html file path * @throws IOException * @throws TransformerException */ public void convert(String filePath, String output) throws IOException, TransformerException { this.init(); File pptFile = new File(filePath); if (!isPPt(pptFile)) { return; } try { process(pptFile, output); } catch (Exception e) { e.printStackTrace(); } saveAsHtml(output, htmlDocumentFacade.getDocument()); } /** * convert ppt to '.png' file and generate '.html' code. 
* * @param pptFile * @param output * , html save path; * @throws Exception */ private void process(File pptFile, String output) throws Exception { FileInputStream is = new FileInputStream(pptFile); SlideShow ppt = new SlideShow(is); is.close(); Dimension pgsize = ppt.getPageSize(); org.apache.poi.hslf.model.Slide[] slide = ppt.getSlides(); this.info.appendChild(this.htmlDocumentFacade.createText(this .getFileName(pptFile.getPath(), true))); for (int i = 0; i < slide.length; i++) { addSlideTitle(slide[i]); TextRun[] truns = slide[i].getTextRuns(); for (int k = 0; k < truns.length; k++) { RichTextRun[] richTexts = truns[k].getRichTextRuns(); for (int l = 0; l < richTexts.length; l++) { String fontName = richTexts[l].getFontName(); // rtruns[l].setFontIndex(1); // System.out.println(fontName); // POI bug??? if (isTrueType(fontName)) richTexts[l].setFontName("宋体"); else richTexts[l].setFontName(fontName); } } BufferedImage img = new BufferedImage(pgsize.width, pgsize.height, BufferedImage.TYPE_INT_RGB); Graphics2D graphics = img.createGraphics(); graphics.setRenderingHint(RenderingHints.KEY_ANTIALIASING, RenderingHints.VALUE_ANTIALIAS_ON); graphics.setRenderingHint(RenderingHints.KEY_RENDERING, RenderingHints.VALUE_RENDER_QUALITY); graphics.setRenderingHint(RenderingHints.KEY_INTERPOLATION, RenderingHints.VALUE_INTERPOLATION_BICUBIC); graphics.setRenderingHint(RenderingHints.KEY_FRACTIONALMETRICS, RenderingHints.VALUE_FRACTIONALMETRICS_ON); graphics.setColor(Color.white); graphics.clearRect(0, 0, pgsize.width, pgsize.height); slide[i].draw(graphics); // String imgName = StringUtil.getFileName(pptFile.getPath(),false) // + "_" + (i + 1) + ".png"; String imgName = getFileName(pptFile.getPath(), false) + "_" + (i + 1) + ".png"; // String imgPath = StringUtil.getFilePath(output) + "/images/"; String imgPath = this.imageSavePath; if (i == 0) { new File(imgPath).mkdir(); } FileOutputStream out = new FileOutputStream(imgPath + imgName); javax.imageio.ImageIO.write(img, "png", 
out); out.close(); addSlide(imageUrl + imgName, slide[i].getSlideNumber()); } } /** * generate outline; * * @param slide */ private void addSlideTitle(org.apache.poi.hslf.model.Slide slide) { String title = StringUtils.isBlank(slide.getTitle()) ? slide .getSlideNumber() + ": Untitled" : slide.getSlideNumber() + ": " + slide.getTitle(); Element list = htmlDocumentFacade.createListItem(); Element a = htmlDocumentFacade.createHyperlink("#link" + slide.getSlideNumber()); // a.setTextContent(title); a.appendChild(htmlDocumentFacade.createText(title)); // a.setNodeValue(title); list.appendChild(a); this.ul.appendChild(list); } /** * generate slide, block style * * @param imagePath * @param index */ private void addSlide(String imagePath, int index) { Element slideBlock = htmlDocumentFacade.createBlock(); slideBlock.setAttribute("class", "slide"); // anchor pointer Element link = htmlDocumentFacade.createBookmark("link" + index); Element img = htmlDocumentFacade.createImage(imagePath); // img.setAttribute("style", "position:relative;top:15%;"); slideBlock.appendChild(link); slideBlock.appendChild(img); this.page.appendChild(slideBlock); } private void init() { try { // build document DocumentBuilder builder = DocumentBuilderFactory.newInstance() .newDocumentBuilder(); // DOMImplementation domImpl = builder.getDOMImplementation(); // DocumentType doctype = domImpl.createDocumentType("", // "-//W3C//DTD HTML 4.01 Transitional//EN", // "http://www.w3.org/TR/html4/strict.dtd"); // Document document = domImpl.createDocument(null, null, doctype); Document document = builder.newDocument(); htmlDocumentFacade = new HtmlDocumentFacade(document); // glob layout window = htmlDocumentFacade.createBlock(); window.setAttribute("id", "window"); info = htmlDocumentFacade.createBlock(); info.setAttribute("id", "info"); outline = htmlDocumentFacade.createBlock(); outline.setAttribute("id", "outline"); page = htmlDocumentFacade.createBlock(); page.setAttribute("id", "page"); // outline 
layout ul = htmlDocumentFacade.createUnorderedList(); outline.appendChild(ul); window.appendChild(info); window.appendChild(outline); window.appendChild(page); htmlDocumentFacade.getBody().appendChild(window); /* * DocumentType docType = * htmlDocumentFacade.getDocument().getDoctype(); * * System.out.println("docType : ------- " + docType); */ setCommonStyle(htmlDocumentFacade.getDocument()); } catch (ParserConfigurationException e) { e.printStackTrace(); } } private void saveAsHtml(String output, org.w3c.dom.Document document) throws IOException, TransformerException { FileWriter out = new FileWriter(output); DOMSource domSource = new DOMSource(document); StreamResult streamResult = new StreamResult(out); TransformerFactory tf = TransformerFactory.newInstance(); Transformer serializer = tf.newTransformer(); // TODO set encoding from a command argument // serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8"); serializer.setOutputProperty(OutputKeys.ENCODING, "GB2312"); serializer.setOutputProperty(OutputKeys.INDENT, "yes"); serializer.setOutputProperty(OutputKeys.METHOD, "html"); serializer.setOutputProperty(OutputKeys.STANDALONE, "yes"); serializer.setOutputProperty(OutputKeys.DOCTYPE_PUBLIC, "-//W3C//DTD HTML 4.01 Transitional//EN"); // serializer.setOutputProperty(OutputKeys.DOCTYPE_SYSTEM, "http://www.w3.org/TR/html4/strict.dtd"); serializer.transform(domSource, streamResult); out.close(); } private static boolean isPPt(File file) { if (!file.canRead()) return false; int sep = file.getPath().lastIndexOf("."); if ("ppt".equals(file.getPath().substring(sep + 1, file.getPath().length()))) return true; return false; } public static boolean isTrueType(String fontName) { String[] trueType = new String[] { "Tahoma", "Times New Roman", "Calibri", "Arial" }; for (String type : trueType) { if (type.equals(fontName)) return true; } return false; } private void setCommonStyle(Document document) { Element styleSheet = (Element) document.getElementsByTagName("style") 
.item(0); if (styleSheet == null) return; String sep = "\n"; StringBuffer sb = new StringBuffer(); sb.append(sep); sb.append("html{height: 100%;overflow-y:hidden;}"); sb.append(sep); sb.append("body{height:100%;overflow-y: hidden;margin:0; background:#bdc2cd}"); sb.append(sep); sb.append("#window{min-width: 800px;height:100%;}"); sb.append(sep); // sb.append("#topbar{min-height:24px;}"); // sb.append(sep); sb.append("#info{font-size: 24px;font-weight: 800;border-bottom:2px solid gray;height:5%; background:#eee;padding-left:5px;}"); sb.append(sep); sb.append("#outline{float:left;height:95%;width:20%;overflow-y: auto; overflow-x:hidden;background:#fff; " + "margin-right:3%; -moz-box-shadow: 4px 4px 12px #2b2b2b;-webkit-box-shadow: 4px 4px 12px #2b2b2b;" + "box-shadow: 4px 4px 12px #2b2b2b;}"); sb.append(sep); sb.append("#outline ul li {list-style:none;line-height: 25px;}"); sb.append(sep); sb.append("#outline ul li a{text-decoration:none;white-space:nowrap;text-overflow:ellipsis;}"); sb.append(sep); sb.append("#page{float:left;overflow-y: auto;overflow-x:hidden;height:95%;width:77%;margin-right:-20%;background:#bdc2cd; text-align:center;}"); sb.append(sep); sb.append("#page div{width:100%;height:100%;}"); sb.append(sep); // styleSheet.setNodeValue(sb.toString()); // styleSheet.setTextContent(sb.toString()); styleSheet.appendChild(document.createTextNode(sb.toString())); } private String getFileName(String path, boolean issuffix) { String name = ""; int index = path.lastIndexOf("/") == -1 ? path.lastIndexOf("\\") : path .lastIndexOf("/"); if (StringUtils.isNotBlank(path)) { if (issuffix) { name = path.substring(index + 1); } else { name = path.substring(index + 1, path.lastIndexOf(".")); } } return name; } private String getFilePath(String path) { String p = ""; int index = path.lastIndexOf("/") == -1 ? path.lastIndexOf("\\") : path .lastIndexOf("/"); if (StringUtils.isNotBlank(path)) { p = path.substring(0, index); } return p; } }
部分pom依赖
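<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox</artifactId>
    <version>2.0.1</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>3.9</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>3.9</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml-schemas</artifactId>
    <version>3.9</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-scratchpad</artifactId>
    <version>3.9</version>
</dependency>
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.8.3</version>
</dependency>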
首先从文件服务器上把文件下载到本地,然后进行解析。具体下载过程不再多说。
以上来自网友分享。