package luceneTest;
import java.io.File;
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
public class LucenePdf {
public static Document getDocument(File pdf){
String pdfpath = pdf.getAbsolutePath();
PDDocument pdDocument = null;
Document document = new Document();
String title = pdf.getName();
pdDocument = PDDocument.load(pdf);
PDFTextStripper stripper = new PDFTextStripper();
String s1 = stripper.getText(pdDocument);
Reader contents = new StringReader(s1);
document.add(new Field("title", title, Field.Store.YES, Field.Index.ANALYZED));
document.add(new Field("contents",contents));
document.add(new Field("path", pdfpath, Field.Store.YES, Field.Index.NO));
}catch(Exception e){
return document;
/**
 * Decides which CMap and/or font encoding to use for this font, based on the
 * font dictionary's ToUnicode entry and on the object returned by
 * getEncodingObject(). Results are written to the cmap and fontEncoding fields.
 *
 * NOTE(review): this listing has lost its brace-only lines and the right-hand
 * sides of two assignments ("COSDictionary cidsysteminfo =" and
 * "fontEncoding ="), so it does not compile as shown. The block nesting implied
 * by the comments below is inferred from statement order only - confirm it
 * against the upstream source before relying on it.
 *
 * @throws IOException declared by the signature; every catch clause visible in
 *         the body logs the failure rather than rethrowing it
 */
private void determineEncoding() throws IOException
String cmapName = null;
COSName encodingName = null;
// Read the ToUnicode entry from the font dictionary and fetch the encoding object.
COSBase toUnicode = font.getDictionaryObject( COSName.TO_UNICODE );
COSBase encoding = getEncodingObject();
// ToUnicode is examined first: a stream is parsed, a name is looked up in cmapObjects.
if( toUnicode != null )
if ( toUnicode instanceof COSStream )
try {
// presumably parseCmap populates the cmap field - confirm against its definition
parseCmap(null, ((COSStream)toUnicode).getUnfilteredStream(), null);
// A failure to read the embedded stream is logged, not propagated.
catch(IOException exception)
log.error("Error: Could not load embedded CMAP" );
else if ( toUnicode instanceof COSName)
encodingName = (COSName)toUnicode;
cmap = cmapObjects.get( encodingName.getName() );
// Nothing registered under that name: keep the name for the resource lookup further down.
if (cmap == null)
cmapName = encodingName.getName();
// The encoding object is handled per type: COSName, COSDictionary or COSStream.
if (encoding != null)
if (encoding instanceof COSName)
// The encoding name is only consulted when no CMap has been found so far.
if (cmap == null)
encodingName = (COSName)encoding;
cmap = cmapObjects.get( encodingName.getName() );
if (cmap == null)
cmapName = encodingName.getName();
// Identity-H: build the CMap name "<registry>-<ordering>-UCS2" from the first
// descendant font's CIDSystemInfo values.
if (encodingName.getName().equals( COSName.IDENTITY_H.getName() ))
COSArray descendantFontArray =
(COSArray)font.getDictionaryObject( COSName.DESCENDANT_FONTS );
if (descendantFontArray != null)
COSDictionary descendantFontDictionary =
(COSDictionary)descendantFontArray.getObject( 0 );
PDFont descendentFont = PDFontFactory.createFont( descendantFontDictionary );
// NOTE(review): the right-hand side of this assignment is missing from the listing.
COSDictionary cidsysteminfo =
if (cidsysteminfo != null)
String ordering = cidsysteminfo.getString(COSName.ORDERING);
String registry = cidsysteminfo.getString(COSName.REGISTRY);
cmapName = registry + "-" + ordering+"-UCS2";
if (cmap == null && cmapName != null)
// NOTE(review): the right-hand side of this assignment (and presumably the "try"
// that pairs with the catch below) is missing from the listing.
fontEncoding =
catch(IOException exception)
log.debug("Debug: Could not find encoding for " + encodingName );
// A dictionary-valued encoding is wrapped in a DictionaryEncoding.
else if (encoding instanceof COSDictionary)
fontEncoding = new DictionaryEncoding((COSDictionary)encoding);
catch(IOException exception)
log.error("Error: Could not create the DictionaryEncoding" );
// A stream-valued encoding is parsed as an embedded CMap, but only if none was found yet.
else if(encoding instanceof COSStream )
if (cmap == null)
COSStream encodingStream = (COSStream)encoding;
parseCmap( null, encodingStream.getUnfilteredStream(), null );
catch(IOException exception)
log.error("Error: Could not parse the embedded CMAP" );
// Build "<registry>-<ordering>-<supplement>" from this font's own CIDSystemInfo,
// pass it through CMapSubstitution.substituteCMap, then look the result up in cmapObjects.
COSDictionary cidsysteminfo = (COSDictionary)font.getDictionaryObject(COSName.CIDSYSTEMINFO);
if (cidsysteminfo != null)
String ordering = cidsysteminfo.getString(COSName.ORDERING);
String registry = cidsysteminfo.getString(COSName.REGISTRY);
int supplement = cidsysteminfo.getInt(COSName.SUPPLEMENT);
cmapName = registry + "-" + ordering+ "-" + supplement;
cmapName = CMapSubstitution.substituteCMap( cmapName );
cmap = cmapObjects.get( cmapName );
// When getAFM() returns a metric, the font encoding is taken from it via AFMEncoding.
// NOTE(review): which condition guards this section cannot be determined from the listing.
FontMetric metric = getAFM();
if( metric != null )
fontEncoding = new AFMEncoding( metric );
// Still no CMap but a name is known: load it as a resource named resourceRootCMAP + cmapName.
if (cmap == null && cmapName != null)
String resourceName = resourceRootCMAP + cmapName;
try {
parseCmap( resourceRootCMAP, ResourceLoader.loadResource( resourceName ), encodingName );
// Loading did not throw, yet there is still no CMap and no encoding name: log it.
if( cmap == null && encodingName == null)
log.error("Error: Could not parse predefined CMAP file for '" + cmapName + "'" );
catch(IOException exception)
log.error("Error: Could not find predefined CMAP file for '" + cmapName + "'" );