有关pdfbox-1.3.1中Identity-H编码为乱码的解决方法

最近用lucene做一个搜索文档的小程序,其中索引pdf文件时使用pdfbox-1.3.1,提取出的文本出现乱码。

索引pdf的函数如下:(使用pdfbox-1.3.1.jar以及fontbox-1.3.1.jar)

package luceneTest;

import java.io.File;
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;

public class LucenePdf {
  public static Document getDocument(File pdf){
             String pdfpath = pdf.getAbsolutePath();
             PDDocument pdDocument = null;
             Document document = new Document();
             String title = pdf.getName();
             try{

                pdDocument = PDDocument.load(pdf);
                PDFTextStripper stripper = new PDFTextStripper();
                String s1 = stripper.getText(pdDocument);
                System.out.println(s1);
                 Reader contents = new StringReader(s1);
                 document.add(new Field("title", title, Field.Store.YES, Field.Index.ANALYZED));
          document.add(new Field("contents",contents));
          document.add(new Field("path", pdfpath, Field.Store.YES, Field.Index.NO));
   pdDocument.close();
       }catch(Exception e){
   e.printStackTrace();
       }
       return document;
  }
}

结果出现了乱码,调试时发现pdf文档的编码格式为Identity-H。我又用了pdfbox-1.2.1.jar来替换pdfbox-1.3.1.jar,结果该文档可以正常显示。通过比较两个不同版本中的org.apache.pdfbox.pdmodel.font.PDFont源代码,我发现1.2.1中有一段代码专门用来处理Identity-H编码,而在1.3.1中则没有。于是将这段代码放入1.3.1版本中的PDFont中。

下面是pdfbox-1.3.1.jar中的org.apache.pdfbox.pdmodel.font.PDFont中有关编码的函数

   /**
    * Works out how character codes of this font are mapped, filling in the
    * {@code cmap} and/or {@code fontEncoding} fields.
    *
    * <p>Sources are consulted in this order, and later steps may overwrite earlier results:
    * <ol>
    *   <li>the font dictionary's ToUnicode entry (embedded CMap stream or CMap name);</li>
    *   <li>the encoding object returned by {@code getEncodingObject()} (name, dictionary
    *       or stream), including the Identity-H handling added below;</li>
    *   <li>the font's own CIDSystemInfo dictionary;</li>
    *   <li>the AFM metrics returned by {@code getAFM()};</li>
    *   <li>a predefined CMap resource loaded by name, if no CMap has been found yet.</li>
    * </ol>
    * Finally {@code getEncodingFromFont()} is called.
    *
    * @throws IOException declared by the signature; the IOExceptions raised inside the
    *         try blocks below are caught and logged rather than propagated
    */
   private void determineEncoding() throws IOException
    {
        String cmapName = null;
        COSName encodingName = null;
        COSBase toUnicode = font.getDictionaryObject( COSName.TO_UNICODE );
        COSBase encoding = getEncodingObject();
        // Step 1: ToUnicode entry.
        if( toUnicode != null )
        {
            if ( toUnicode instanceof COSStream )
            {
                // Embedded CMap: parse it directly from the unfiltered stream.
                try {
                    parseCmap(null, ((COSStream)toUnicode).getUnfilteredStream(), null);
                }
                catch(IOException exception)
                {
                    log.error("Error: Could not load embedded CMAP" );
                }
            }
            else if ( toUnicode instanceof COSName)
            {
                // Named CMap: try the cmapObjects cache first; on a miss remember the
                // name so the predefined resource can be loaded at the end.
                encodingName = (COSName)toUnicode;
                cmap = cmapObjects.get( encodingName.getName() );
                if (cmap == null)
                {
                    cmapName = encodingName.getName();
                }
            }
        }
        // Step 2: encoding object.
        if (encoding != null)
        {
            if (encoding instanceof COSName)
            {
                // Only consult the encoding name if ToUnicode did not already yield a CMap.
                if (cmap == null)
                {
                    encodingName = (COSName)encoding;
                    cmap = cmapObjects.get( encodingName.getName() );
                    if (cmap == null)
                    {
                        cmapName = encodingName.getName();

                        // Added by the article's author (highlighted in red in the original
                        // post), ported from pdfbox-1.2.1 to handle Identity-H encoding:
                        // take Registry and Ordering from the first descendant font's
                        // CIDSystemInfo and use "<registry>-<ordering>-UCS2" as the CMap name.
                        // NOTE(review): the casts and getObject( 0 ) below are unchecked —
                        // confirm behavior for a DescendantFonts entry that is empty or not an array.
                         if (encodingName.getName().equals( COSName.IDENTITY_H.getName() ))
                            {
                                COSArray descendantFontArray =
                                    (COSArray)font.getDictionaryObject( COSName.DESCENDANT_FONTS );
                                if (descendantFontArray != null)
                                {
                                    COSDictionary descendantFontDictionary =
                                        (COSDictionary)descendantFontArray.getObject( 0 );
                                    PDFont descendentFont = PDFontFactory.createFont( descendantFontDictionary );
                                    COSDictionary cidsysteminfo =
                                        (COSDictionary)descendentFont.font.getDictionaryObject(COSName.CIDSYSTEMINFO);
                                    if (cidsysteminfo != null)
                                    {
                                        String ordering = cidsysteminfo.getString(COSName.ORDERING);
                                        String registry = cidsysteminfo.getString(COSName.REGISTRY);
                                        cmapName = registry + "-" + ordering+"-UCS2";
                                    }
                                }
                            }
                    }
                }
                // No cached CMap but a name is known: also try to resolve the name as a
                // font encoding. Failure is only logged at debug level.
                if (cmap == null && cmapName != null)
                {
                    try
                    {
                        fontEncoding =
                            EncodingManager.INSTANCE.getEncoding(encodingName);
                    }
                    catch(IOException exception)
                    {
                        log.debug("Debug: Could not find encoding for " + encodingName );
                    }
                }
            }
            else if (encoding instanceof COSDictionary)
            {
                // Encoding given as a dictionary: wrap it in a DictionaryEncoding.
                try
                {
                    fontEncoding = new DictionaryEncoding((COSDictionary)encoding);
                }
                catch(IOException exception)
                {
                    log.error("Error: Could not create the DictionaryEncoding" );
                }
            }
            else if(encoding instanceof COSStream )
            {
                // Encoding given as a stream: parse it as an embedded CMap, unless a CMap
                // was already found.
                if (cmap == null)
                {
                    COSStream encodingStream = (COSStream)encoding;
                    try
                    {
                        parseCmap( null, encodingStream.getUnfilteredStream(), null );
                    }
                    catch(IOException exception)
                    {
                        log.error("Error: Could not parse the embedded CMAP" );
                    }
                }
            }
        }
        // Step 3: the font's own CIDSystemInfo. When present, this unconditionally
        // replaces cmapName (after substitution) and cmap (with the cache lookup
        // result, which may be null), whatever the earlier steps found.
        COSDictionary cidsysteminfo = (COSDictionary)font.getDictionaryObject(COSName.CIDSYSTEMINFO);
        if (cidsysteminfo != null)
        {
            String ordering = cidsysteminfo.getString(COSName.ORDERING);
            String registry = cidsysteminfo.getString(COSName.REGISTRY);
            int supplement = cidsysteminfo.getInt(COSName.SUPPLEMENT);
            cmapName = registry + "-" + ordering+ "-" + supplement;
            cmapName = CMapSubstitution.substituteCMap( cmapName );
            cmap = cmapObjects.get( cmapName );
        }
        // Step 4: if AFM metrics are available they replace any fontEncoding set above.
        FontMetric metric = getAFM();
        if( metric != null )
        {
            fontEncoding = new AFMEncoding( metric );
        }
       
        // Step 5: still no CMap but a name is known: load the predefined CMap resource
        // named resourceRootCMAP + cmapName.
        if (cmap == null && cmapName != null)
        {
            String resourceName = resourceRootCMAP + cmapName;
            try {
                parseCmap( resourceRootCMAP, ResourceLoader.loadResource( resourceName ), encodingName );
                if( cmap == null && encodingName == null)
                {
                    log.error("Error: Could not parse predefined CMAP file for '" + cmapName + "'" );
                }
            }
            catch(IOException exception)
            {
                log.error("Error: Could not find predefined CMAP file for '" + cmapName + "'" );
            }
        }
        getEncodingFromFont();
    }

这样还是不行,原因是1.3.1版本jar包的cmap中没有Adobe-GB1-UCS2的转码表,于是我又将pdfbox-1.2.1.jar解压缩,将其中\org\apache\pdfbox\resources\cmap目录下的Adobe-GB1-UCS2表复制到pdfbox-1.3.1.jar中相同目录下的cmap中,这样再将修改后的pdfbox-1.3.1.jar放入工程中,则Identity-H编码问题解决了。

你可能感兴趣的:(apache,Lucene,Adobe)