近期由于项目原因,接触到word转html、pdf转html以及转换之后在线编辑的模块,在网上找了许多资料,经过整理测试,已初具规模。
首先doc(docx)在线编辑
1 推荐使用:zohowriter,无插件的web word编辑器
2 推荐使用:docx4j,可以先把docx文档转换为html,在线编辑之后再把xhtml转换回docx,示例代码如下:
package com.zoma.common;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import org.docx4j.XmlUtils;
import org.docx4j.convert.in.xhtml.XHTMLImporter;
import org.docx4j.convert.out.html.HTMLConversionImageHandler;
import org.docx4j.convert.out.html.HtmlExporterNonXSLT;
import org.docx4j.openpackaging.exceptions.Docx4JException;
import org.docx4j.openpackaging.packages.WordprocessingMLPackage;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.w3c.tidy.Tidy;
/**
 * Static helpers for the docx -> HTML -> XHTML -> docx round trip used by the
 * online editing module.
 *
 * <p>Every HTML/XHTML file handled by this class is read and written as UTF-8,
 * independent of the platform default charset. All public methods report
 * failure through their return value and print the cause to standard error.
 */
public class DocxUtil {

    /** Charset name used for every text file this class reads or writes. */
    private static final String UTF8 = "UTF-8";

    /**
     * Converts a docx document to an HTML file.
     *
     * @param filepath path of the docx file to read (e.g. f:/1.docx)
     * @param outpath  path of the HTML file to create (e.g. f:/1.html); it is only
     *                 written after the conversion itself has succeeded
     * @param imgpath  directory handed to the image handler for saving images (e.g. f:/img)
     * @param imguri   URI prefix handed to the image handler for image references (e.g. img/)
     * @return {@code true} if the HTML file was written, {@code false} on any failure
     */
    public static boolean docToHtml(String filepath, String outpath, String imgpath, String imguri) {
        try {
            WordprocessingMLPackage wordPackage = WordprocessingMLPackage.load(new File(filepath));
            HtmlExporterNonXSLT exporter = new HtmlExporterNonXSLT(
                    wordPackage, new HTMLConversionImageHandler(imgpath, imguri, true));
            String html = XmlUtils.w3CDomNodeToString(exporter.export());
            // Written as UTF-8 because parseToXhtml and getHtmlStyle decode the file as UTF-8.
            writeText(outpath, html);
            return true;
        } catch (Exception e) {
            e.printStackTrace();
            return false;
        }
    }

    /**
     * Cleans an HTML file with JTidy and writes the result as XHTML.
     *
     * @param f_in    path of the HTML file to read (e.g. f:/1.html), expected to be UTF-8
     * @param outfile path of the XHTML file to create (e.g. f:/1.xhtml)
     * @return {@code true} if the XHTML file was written, {@code false} on any failure
     */
    public static boolean parseToXhtml(String f_in, String outfile) {
        try {
            // The raw bytes go straight to Tidy: decoding and re-encoding them through
            // the platform charset would corrupt non-ASCII text on non-UTF-8 systems.
            byte[] source = readBytes(f_in);

            Tidy tidy = new Tidy();
            tidy.setInputEncoding(UTF8);
            tidy.setOutputEncoding(UTF8);
            tidy.setQuiet(true);
            tidy.setShowWarnings(true); // warnings go to the error writer configured below
            tidy.setIndentContent(true);
            tidy.setSmartIndent(true);
            tidy.setIndentAttributes(false);
            tidy.setWraplen(1024); // wrap column
            tidy.setXHTML(true); // emit XHTML
            tidy.setErrout(new PrintWriter(System.out));

            // Buffer the result so the output file is only touched once parsing has finished.
            ByteArrayOutputStream xhtml = new ByteArrayOutputStream();
            tidy.parse(new ByteArrayInputStream(source), xhtml);

            FileOutputStream out = new FileOutputStream(outfile);
            try {
                xhtml.writeTo(out);
            } finally {
                out.close();
            }
            return true;
        } catch (Exception ex) {
            ex.printStackTrace();
            return false;
        }
    }

    /**
     * Extracts markup from an HTML file.
     *
     * @param filepath path of the file to parse as UTF-8 (e.g. f:/1.xhtml)
     * @param exps     the literal {@code "body"} to get all children of the body element,
     *                 otherwise a jsoup selector expression; must not be {@code null}
     * @return the concatenated outer HTML of the matched elements, or an empty string
     *         when nothing matches or the file cannot be read
     */
    public static String getHtmlStyle(String filepath, String exps) {
        try {
            Document doc = Jsoup.parse(new File(filepath), UTF8);
            Elements matches = "body".equals(exps) ? doc.body().children() : doc.select(exps);
            StringBuilder html = new StringBuilder();
            for (Element match : matches) {
                html.append(match);
            }
            return html.toString();
        } catch (IOException e) {
            e.printStackTrace();
            return "";
        }
    }

    /**
     * Parses an HTML file and, when {@code exps} is {@code "body"}, empties the body of
     * the in-memory document.
     *
     * <p>NOTE(review): the parsed document is never written back and {@code htmls} is not
     * used, so the file on disk is left untouched. This mirrors the behaviour this method
     * has always had; use {@link #modifyBody(String, String, String)} to actually replace
     * the body of a file.
     *
     * @param filepath path of the HTML file to parse as UTF-8
     * @param exps     selector; only the literal {@code "body"} has any effect
     * @param htmls    replacement markup (currently unused)
     * @return {@code true} if the file could be parsed, {@code false} otherwise
     */
    public static boolean modifyHtml(String filepath, String exps, String htmls) {
        try {
            Document doc = Jsoup.parse(new File(filepath), UTF8);
            if ("body".equals(exps)) {
                doc.body().html("");
            }
            return true;
        } catch (IOException e) {
            e.printStackTrace();
            return false;
        }
    }

    /**
     * Replaces the body of an HTML file and writes the result to another file.
     *
     * <p>Everything from the first {@code <body>} up to (not including) the following
     * {@code </body>} is replaced. If {@code content} does not bring its own opening
     * {@code <body ...>} tag, a plain {@code <body>} is emitted in front of it so the
     * result keeps a matching open/close pair.
     *
     * @param infile  path of the HTML file to read as UTF-8
     * @param content new body markup; {@code null} is treated as empty
     * @param outfile path of the file to write; an existing file is overwritten
     * @return {@code true} if the file was written, {@code false} if reading or writing
     *         failed or the body tags were not found
     */
    public static boolean modifyBody(String infile, String content, String outfile) {
        String body = content == null ? "" : content;
        String replacement = startsWithBodyTag(body) ? body : "<body>" + body;
        return replaceSection(infile, outfile, "<body>", "</body>", replacement);
    }

    /**
     * Replaces the head of an HTML file and writes the result to another file.
     *
     * <p>The new head always starts with a UTF-8 {@code Content-Type} meta element,
     * followed by {@code content}.
     *
     * @param infile  path of the HTML file to read as UTF-8
     * @param content markup to place inside the head after the meta element;
     *                {@code null} is treated as empty
     * @param outfile path of the file to write; an existing file is overwritten
     * @return {@code true} if the file was written, {@code false} if reading or writing
     *         failed or the head tags were not found
     */
    public static boolean modifyHead(String infile, String content, String outfile) {
        String extra = content == null ? "" : content;
        String replacement = "<head>"
                + "<meta http-equiv='Content-Type' content='text/html; charset=UTF-8' />"
                + extra;
        return replaceSection(infile, outfile, "<head>", "</head>", replacement);
    }

    /**
     * Converts an XHTML file to a docx document.
     *
     * @param infile  path of the XHTML file to import (e.g. f:/1.xhtml)
     * @param outfile path of the docx file to create (e.g. f:/1.docx)
     * @return {@code true} if the docx file was saved, {@code false} if docx4j reported an error
     */
    public static boolean xhtmlToDocx(String infile, String outfile) {
        try {
            WordprocessingMLPackage wordPackage = WordprocessingMLPackage.createPackage();
            wordPackage.getMainDocumentPart().getContent()
                    .addAll(XHTMLImporter.convert(new File(infile), null, wordPackage));
            wordPackage.save(new File(outfile));
            return true;
        } catch (Docx4JException e) {
            e.printStackTrace();
            return false;
        }
    }

    /**
     * Replaces the span from the first openTag up to (excluding) the next closeTag with
     * replacement, then writes the whole document to outfile as UTF-8.
     */
    private static boolean replaceSection(String infile, String outfile,
            String openTag, String closeTag, String replacement) {
        try {
            // The file is read verbatim; line terminators are preserved.
            String html = readText(infile);
            int start = html.indexOf(openTag);
            int end = start < 0 ? -1 : html.indexOf(closeTag, start + openTag.length());
            if (end < 0) {
                System.err.println("DocxUtil: " + openTag + " ... " + closeTag
                        + " not found in " + infile);
                return false;
            }
            writeText(outfile, html.substring(0, start) + replacement + html.substring(end));
            return true;
        } catch (IOException e) {
            e.printStackTrace();
            return false;
        }
    }

    /** Tells whether the markup begins (ignoring leading whitespace and case) with a body tag. */
    private static boolean startsWithBodyTag(String markup) {
        String trimmed = markup.trim();
        if (trimmed.length() < 6 || !trimmed.regionMatches(true, 0, "<body", 0, 5)) {
            return false;
        }
        char next = trimmed.charAt(5);
        return next == '>' || Character.isWhitespace(next);
    }

    /** Reads a whole file into memory. */
    private static byte[] readBytes(String path) throws IOException {
        FileInputStream in = new FileInputStream(path);
        try {
            ByteArrayOutputStream buffer = new ByteArrayOutputStream();
            byte[] chunk = new byte[8192];
            int count;
            while ((count = in.read(chunk)) != -1) {
                buffer.write(chunk, 0, count);
            }
            return buffer.toByteArray();
        } finally {
            in.close();
        }
    }

    /** Reads a whole file as UTF-8 text. */
    private static String readText(String path) throws IOException {
        return new String(readBytes(path), UTF8);
    }

    /** Writes text to a file as UTF-8, replacing any existing content. */
    private static void writeText(String path, String text) throws IOException {
        OutputStreamWriter out = new OutputStreamWriter(new FileOutputStream(path), UTF8);
        try {
            out.write(text);
            out.flush(); // surface write errors here rather than in close()
        } finally {
            out.close();
        }
    }
}
3 pdf 转html 推荐pdf2htmlex 高保真转化
需要Ubuntu 12.04以上版本,并且安装以下软件
3.1 apt-get install python-software-properties
3.2 sudo add-apt-repository ppa:coolwanglu/pdf2htmlex
3.3 sudo apt-get update
3.4 sudo apt-get install fontforge
3.5 sudo aptitude install poppler-utils
3.6 sudo apt-get install pdf2htmlex
测试输入:pdf2htmlEX --zoom 1.3 /home/1.pdf --dest-dir /home/1
会在/home/1文件夹下生成html文件
4 pdf合并 使用pdfbox类库
/**
 * Merges every regular file found directly inside a directory into one PDF and then
 * removes the source files.
 *
 * <p>The merge order is the order returned by {@link File#listFiles()}, which does not
 * guarantee any particular ordering. A file that is the merge destination itself is
 * neither used as a source nor removed.
 *
 * @param savepath directory containing the PDF files to merge
 * @param filePath path of the merged PDF to create
 * @return {@code filePath}
 * @throws COSVisitorException if PDFBox fails while writing the merged document
 * @throws IOException if {@code savepath} cannot be listed or a file cannot be read or written
 */
public static String mergePdfFiles(String savepath, String filePath) throws COSVisitorException, IOException
{
    File dir = new File(savepath);
    System.out.println("------------merge savepath--"+savepath);
    System.out.println("------------merge dir--"+dir.getAbsolutePath());
    System.out.println("------------merge to file--"+filePath);

    // listFiles() returns null when the path is not a directory or cannot be read.
    File[] entries = dir.listFiles();
    if (entries == null) {
        throw new IOException("Cannot list directory: " + dir.getAbsolutePath());
    }

    File destination = new File(filePath).getCanonicalFile();
    List<File> sources = new ArrayList<File>();
    for (File entry : entries) {
        // Skip a stale destination file so it is not merged into itself or deleted below.
        if (entry.isFile() && !destination.equals(entry.getCanonicalFile())) {
            sources.add(entry);
        }
    }
    System.out.println("--------------file--------list----------------------------"+sources.size());

    PDFMergerUtility mergePdf = new PDFMergerUtility();
    List<InputStream> opened = new ArrayList<InputStream>();
    try {
        for (File source : sources) {
            InputStream in = new FileInputStream(source);
            opened.add(in);
            mergePdf.addSource(in);
        }
        mergePdf.setDestinationFileName(filePath);
        mergePdf.mergeDocuments();
    } finally {
        // Release every handle, even when the merge failed part-way through.
        for (InputStream in : opened) {
            try {
                in.close();
            } catch (IOException ignored) {
                // Nothing useful can be done about a failed close of a read-only stream.
            }
        }
    }

    // Only reached after a successful merge. Delete right away so a later call does not
    // merge the same files again; fall back to deletion at JVM exit if that fails.
    for (File source : sources) {
        if (!source.delete()) {
            source.deleteOnExit();
        }
    }
    return filePath;
}
5 jpg图片合并转pdf
首先在Ubuntu上安装convert软件
apt-get install imagemagick
apt-get install graphicsmagick-imagemagick-compat
使用命令convert /usr/*.jpg /usr/1.pdf
6 Java程序调用Linux命令
Process proc = Runtime.getRuntime().exec("");