You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-user@lucene.apache.org by Günter Kukies <gu...@heuft.com> on 2003/02/21 09:17:14 UTC
Why is document.get("contents") null?
Hello,
Why does document.get("contents") return null after the method below has added the "contents" field?
Thanks,
Günter
/**
 * Extracts the text of the PDF read from <code>is</code> and adds it to
 * <code>document</code> as the "contents" field.
 *
 * The field is created with <code>Field.Text(String, Reader)</code>. In Lucene 1.x a
 * Reader-valued text field is tokenized and indexed but NOT stored, which is
 * why <code>document.get("contents")</code> yields null even though the field was
 * added. Use the String-valued <code>Field.Text(String, String)</code> overload if
 * the text must be retrievable from the document.
 *
 * @param servlet          servlet whose log receives the diagnostic output
 * @param document         Lucene document that receives the "contents" field
 * @param is               stream positioned at the start of the PDF; always closed
 *                         by this method when non-null
 * @param documentLocation location of the PDF, used only in log and error messages
 * @throws IOException if the PDF cannot be read or parsed, cannot be decrypted,
 *                     or is protected by a non-default password
 */
private static void addContent(PortalServlet servlet, Document document, InputStream is, String documentLocation ) throws IOException {
    try {
        PDFParser parser = new PDFParser( is );
        parser.parse();
        COSDocument pdfDocument = parser.getDocument();
        if( pdfDocument.isEncrypted() ) {
            DecryptDocument decryptor = new DecryptDocument( pdfDocument );
            /* Just try using the default password and move on */
            decryptor.decryptDocument( "" );
        }

        /* Buffer the stripped text in memory. The same explicit charset is used
           for writing and reading so the round trip does not depend on the
           platform default encoding. */
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        OutputStreamWriter writer = new OutputStreamWriter( out, "UTF-8" );
        PDFTextStripper stripper = new PDFTextStripper();
        stripper.writeText( pdfDocument, writer );
        /* OutputStreamWriter buffers encoded bytes internally; without this flush
           the byte array below can be truncated or even empty. */
        writer.flush();
        byte[] contents = out.toByteArray();
        InputStreamReader input = new InputStreamReader( new ByteArrayInputStream( contents ), "UTF-8" );

        // Add the tag-stripped contents as a Reader-valued Text field so it will
        // get tokenized and indexed (but not stored).
        document.add(Field.Text("contents", input ));

        /* Log the text that was already extracted instead of running the
           stripper over the whole PDF a second time. */
        servlet.log("documentstripper: "+new String( contents, "UTF-8" ));
        servlet.log("documentLocation: "+documentLocation);
        /* document.get("contents") is expected to be null here: the field is
           Reader-valued and therefore unstored. */
        servlet.log("contents: "+input+" doc: "+document.get("contents"));
        servlet.log("document: "+document);
    }
    catch( CryptographyException e ) {
        IOException wrapped = new IOException( "Error decrypting document(" + documentLocation + "): " + e );
        wrapped.initCause( e ); /* keep the original stack trace */
        throw wrapped;
    }
    catch( InvalidPasswordException e ) {
        IOException wrapped = new IOException( "Error: The document(" + documentLocation + ") is encrypted and will not be indexed." );
        wrapped.initCause( e ); /* keep the original stack trace */
        throw wrapped;
    }
    finally {
        if( is != null ) {
            is.close();
        }
    }
}