itext将html转为pdf,图片标签为base64的处理

2019独角兽企业重金招聘Python工程师标准>>>

在 http://my.oschina.net/yifanxiang/blog/678139 一文的基础上，对代码做了一些修改，如下：

/**
	 * Renders a piece of XHTML markup into a single PDF document held in memory.
	 *
	 * <p>The markup is parsed with iText XMLWorker. The {@code img} tag is bound to
	 * the custom {@code Image} tag processor instead of the default
	 * {@code com.itextpdf.tool.xml.html.Image}.
	 *
	 * @param ctx the markup to convert; it is handed to an XML parser, so it
	 *            presumably has to be well-formed XHTML
	 * @return the bytes of the generated PDF
	 * @throws DocumentException if iText cannot create or write the document
	 * @throws IOException if parsing the markup fails
	 */
	public static byte[] buildPdf(String ctx) throws DocumentException, IOException{
		ByteArrayOutputStream baos = new ByteArrayOutputStream(1024);
		Document document = new Document();
		PdfWriter writer = PdfWriter.getInstance(document, baos);
		writer.setInitialLeading(12); // initial leading (line spacing) for text
		document.open();

		HtmlToPdfUtil.MyFontsProvider fontProvider = new HtmlToPdfUtil.MyFontsProvider();
		fontProvider.addFontSubstitute("lowagie", "garamond");
		fontProvider.setUseUnicode(true);
		CssAppliers cssAppliers = new CssAppliersImpl(fontProvider);
		HtmlPipelineContext htmlContext = new HtmlPipelineContext(cssAppliers);

		DefaultTagProcessorFactory tpf = (DefaultTagProcessorFactory) Tags.getHtmlTagProcessorFactory();
		// The default processor is com.itextpdf.tool.xml.html.Image; register our own image processor class.
		tpf.addProcessor(Tag.IMG, Image.class.getName());
		htmlContext.setTagFactory(tpf);

		CSSResolver cssResolver = XMLWorkerHelper.getInstance().getDefaultCssResolver(true);
		Pipeline pipeline = new CssResolverPipeline(cssResolver,
				new HtmlPipeline(htmlContext, new PdfWriterPipeline(document, writer)));
		XMLWorker worker = new XMLWorker(pipeline, true);
		XMLParser p = new XMLParser(worker);

		// Feed the characters straight to the parser. Converting the string to bytes
		// and back with the platform default charset can corrupt non-ASCII text
		// (e.g. Chinese) on systems whose default encoding cannot represent it.
		p.parse(new java.io.StringReader(ctx));
		p.flush();

		document.close();
		// flush()/close() on a ByteArrayOutputStream are no-ops, so the buffer is returned directly.
		return baos.toByteArray();
	}

主要就是以下两句：

DefaultTagProcessorFactory tpf=(DefaultTagProcessorFactory)Tags.getHtmlTagProcessorFactory();
tpf.addProcessor(Tag.IMG, Image.class.getName());// the default is com.itextpdf.tool.xml.html.Image; register a custom image processor class instead (this Image is the author's own class)

自定义的Image类：

package com.junziqian.common.convert;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;

import org.apache.commons.codec.binary.Base64;

import com.itextpdf.text.BadElementException;
import com.itextpdf.text.Chunk;
import com.itextpdf.text.Element;
import com.itextpdf.text.log.Level;
import com.itextpdf.text.log.Logger;
import com.itextpdf.text.log.LoggerFactory;
import com.itextpdf.text.pdf.PdfName;
import com.itextpdf.text.pdf.PdfString;
import com.itextpdf.text.xml.XMLUtil;
import com.itextpdf.tool.xml.NoCustomContextException;
import com.itextpdf.tool.xml.Tag;
import com.itextpdf.tool.xml.WorkerContext;
import com.itextpdf.tool.xml.css.CssUtils;
import com.itextpdf.tool.xml.exceptions.LocaleMessages;
import com.itextpdf.tool.xml.exceptions.RuntimeWorkerException;
import com.itextpdf.tool.xml.html.AbstractTagProcessor;
import com.itextpdf.tool.xml.html.HTML;
import com.itextpdf.tool.xml.net.ImageRetrieve;
import com.itextpdf.tool.xml.net.exc.NoImageException;
import com.itextpdf.tool.xml.pipeline.html.HtmlPipelineContext;

/**
 * Tag processor for {@code <img>} elements that also understands inline
 * {@code data:image/...;base64,...} sources.
 *
 * <p>A base64 data URI is decoded and turned into an iText image directly. Any
 * other {@code src} value is resolved through {@link ImageRetrieve} using the
 * resources root path and image provider of the current
 * {@link HtmlPipelineContext}.
 */
public class Image extends AbstractTagProcessor {

	private static final Logger logger = LoggerFactory.getLogger(Image.class);

	/** Marker that directly precedes the encoded payload inside a data URI. */
	private static final String BASE64_MARKER = "base64,";

	/**
	 * Recognizes base64 image data URIs. DOTALL lets the payload span several
	 * lines, so line-wrapped base64 data is still recognized. The pattern is
	 * compiled once instead of on every tag.
	 */
	private static final Pattern BASE64_IMAGE_URI =
			Pattern.compile("^data:image/.{1,10};base64,.+$", Pattern.DOTALL);

	/**
	 * Builds the content for a closed {@code <img>} tag.
	 *
	 * @param ctx the worker context used to look up the {@link HtmlPipelineContext}
	 * @param tag the {@code <img>} tag being closed
	 * @param currentContent content collected for the tag; not used here
	 * @return a list holding the styled image chunk, or an empty list when the tag
	 *         has no usable {@code src} or the image could not be retrieved
	 * @throws RuntimeException if base64 data cannot be turned into an image
	 * @throws RuntimeWorkerException if no {@link HtmlPipelineContext} is available
	 */
	@Override
	public List<Element> end(final WorkerContext ctx, final Tag tag, final List<Element> currentContent) {
		Map<String, String> attributes = tag.getAttributes();
		List<Element> l = new ArrayList<>(1);

		String src = attributes.get(HTML.Attribute.SRC);
		if (null == src || src.length() == 0) {
			return l;
		}
		src = XMLUtil.unescapeXML(src).trim();

		com.itextpdf.text.Image img = null;
		try {
			if (logger.isLogging(Level.TRACE)) {
				logger.trace(String.format(LocaleMessages.getInstance().getMessage(LocaleMessages.HTML_IMG_USE), src));
			}
			if (BASE64_IMAGE_URI.matcher(src).matches()) {
				// Inline image: everything after "base64," is the encoded payload.
				String payload = src.substring(src.indexOf(BASE64_MARKER) + BASE64_MARKER.length());
				byte[] imgData = Base64.decodeBase64(payload);
				try {
					img = com.itextpdf.text.Image.getInstance(imgData);
				} catch (BadElementException | IOException e) {
					throw new RuntimeException(e);
				}
			} else {
				HtmlPipelineContext context = getHtmlPipelineContext(ctx);
				img = new ImageRetrieve(context.getResourcesRootPath(), context.getImageProvider()).retrieveImage(src);
			}
		} catch (NoImageException e) {
			// A missing image is logged and skipped; the rest of the document is still rendered.
			if (logger.isLogging(Level.ERROR)) {
				logger.error(String.format(LocaleMessages.getInstance().getMessage(LocaleMessages.HTML_IMG_RETRIEVE_FAIL), src), e);
			}
		} catch (NoCustomContextException e) {
			throw new RuntimeWorkerException(LocaleMessages.getInstance().getMessage(LocaleMessages.NO_CUSTOM_CONTEXT), e);
		}

		if (null != img) {
			try {
				String alt = attributes.get(HTML.Attribute.ALT);
				if (alt != null) {
					img.setAccessibleAttribute(PdfName.ALT, new PdfString(alt));
				}
				HtmlPipelineContext htmlPipelineContext = getHtmlPipelineContext(ctx);
				// Apply CSS to the image first, wrap it in a chunk, then apply CSS to the chunk.
				com.itextpdf.text.Image styled =
						(com.itextpdf.text.Image) getCssAppliers().apply(img, tag, htmlPipelineContext);
				l.add(getCssAppliers().apply(new Chunk(styled, 0, 0, true), tag, htmlPipelineContext));
			} catch (NoCustomContextException e) {
				throw new RuntimeWorkerException(e);
			}
		}
		return l;
	}

	/**
	 * Reports that this processor does not own the element stack.
	 *
	 * @return always {@code false}
	 */
	@Override
	public boolean isStackOwner() {
		return false;
	}
}

这样，运行以下代码即可生成带有图片的 PDF：

    public static void main(String[] args) throws IOException, DocumentException {
    	String DEST = "./test2015-11.pdf";
        File file = new File(DEST);
        file.getParentFile().mkdirs();
        ArrayList str=new ArrayList();
        str.add(JsoupUtil.getXhtml("中文hello

"));
        str.add(JsoupUtil.getXhtml("中文hello111

"));
        HtmlToPdfUtil.buildPdf(str, DEST);
    }

PDF 已成功生成，打开检查后图片显示正常，没有问题。

此外还需要修改 jsoup 的源码（org.jsoup.nodes.Element）：

/**
 * Writes the opening part of this element: optional indentation, the tag name,
 * its attributes and the closing of the start tag.
 *
 * A childless self-closing tag is written as " />" unless XHTML output is off
 * ({@code OutputSettings.formatXhtml} is false), the syntax is html and the tag
 * is an empty tag; in that case, and for every other element, it ends with ">".
 */
void outerHtmlHead(Appendable accum, int depth, OutputSettings out) throws IOException {
    if (out.prettyPrint()) {
        boolean parentIsBlock = this.parent() != null && this.parent().tag().formatAsBlock();
        boolean wantsOwnLine = this.tag.formatAsBlock() || parentIsBlock || out.outline();
        // Skip the indent only when writing at the very start of an empty StringBuilder.
        boolean atBufferStart = accum instanceof StringBuilder && ((StringBuilder) accum).length() == 0;
        if (wantsOwnLine && !atBufferStart) {
            this.indent(accum, depth, out);
        }
    }

    accum.append("<").append(this.tagName());
    this.attributes.html(accum, out);

    boolean childlessSelfClosing = this.childNodes.isEmpty() && this.tag.isSelfClosing();
    if (!childlessSelfClosing) {
        accum.append(">");
        return;
    }

    boolean plainHtmlVoid = !OutputSettings.formatXhtml && out.syntax() == Syntax.html && this.tag.isEmpty();
    if (plainHtmlVoid) {
        accum.append('>');
    } else {
        accum.append(" />");
    }
}

以及 org.jsoup.nodes.Document.OutputSettings：

// Added field: switches serialization of childless self-closing tags to the XHTML " />" form.
// It is a static, process-wide flag written by getHtml/getXhtml before each conversion.
// NOTE(review): concurrent conversions would share this value - confirm single-threaded use.
public static boolean formatXhtml = false;// add this line

jsoup使用方法：

/**
 * Parses the given markup with jsoup and returns it serialized as regular HTML.
 *
 * @param strHtml the markup to normalize
 * @return the HTML text of the parsed document
 */
public static String getHtml(String strHtml){
   // Turn the XHTML switch off before serializing.
   Document.OutputSettings.formatXhtml=false;
   Document doc=Jsoup.parse(strHtml);
   return doc.html();
}

/**
 * Parses the given markup with jsoup and returns it serialized as XHTML, with
 * all {@code <script>} elements removed.
 *
 * @param strHtml the markup to normalize
 * @return the XHTML text of the parsed document
 */
public static String getXhtml(String strHtml){
   // Turn the XHTML switch on so childless self-closing tags are written as " />".
   Document.OutputSettings.formatXhtml=true;
   // The second argument of Jsoup.parse(String, String) is a base URI, not a charset,
   // so the single-argument overload is used, as in getHtml.
   Document doc=Jsoup.parse(strHtml);
   Element head=doc.getElementsByTag("head").first();
   // NOTE(review): appending an empty string adds nothing; the original argument was
   // presumably markup (e.g. a <meta> or <style> tag) lost from the listing - confirm.
   head.append("");
   doc.select("script").remove();
   return doc.html();
}

itext将html转为pdf,图片标签为base64的处理

你可能感兴趣的:(itext将html转为pdf,图片标签为base64的处理)