You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by se...@apache.org on 2014/07/11 12:27:15 UTC
svn commit: r1609677 - in /tika/trunk:
tika-core/src/main/java/org/apache/tika/parser/
tika-parsers/src/main/java/org/apache/tika/parser/pdf/
tika-parsers/src/test/java/org/apache/tika/parser/pdf/
Author: sergeyb
Date: Fri Jul 11 10:27:15 2014
New Revision: 1609677
URL: http://svn.apache.org/r1609677
Log:
[TIKA-1351] Updating AutoDetect, Composite and PDF parsers to guard against null content handlers
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java?rev=1609677&r1=1609676&r2=1609677&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java Fri Jul 11 10:27:15 2014
@@ -114,7 +114,8 @@ public class AutoDetectParser extends Co
metadata.set(Metadata.CONTENT_TYPE, type.toString());
// TIKA-216: Zip bomb prevention
- SecureContentHandler sch = new SecureContentHandler(handler, tis);
+ SecureContentHandler sch =
+ handler != null ? new SecureContentHandler(handler, tis) : null;
try {
// Parse the document
super.parse(tis, sch, metadata, context);
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java?rev=1609677&r1=1609676&r2=1609677&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java Fri Jul 11 10:27:15 2014
@@ -237,7 +237,8 @@ public class CompositeParser extends Abs
TemporaryResources tmp = new TemporaryResources();
try {
TikaInputStream taggedStream = TikaInputStream.get(stream, tmp);
- TaggedContentHandler taggedHandler = new TaggedContentHandler(handler);
+ TaggedContentHandler taggedHandler =
+ handler != null ? new TaggedContentHandler(handler) : null;
try {
parser.parse(taggedStream, taggedHandler, metadata, context);
} catch (RuntimeException e) {
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java?rev=1609677&r1=1609676&r2=1609677&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java Fri Jul 11 10:27:15 2014
@@ -154,7 +154,9 @@ public class PDFParser extends AbstractP
}
metadata.set(Metadata.CONTENT_TYPE, "application/pdf");
extractMetadata(pdfDocument, metadata);
- PDF2XHTML.process(pdfDocument, handler, context, metadata, localConfig);
+ if (handler != null) {
+ PDF2XHTML.process(pdfDocument, handler, context, metadata, localConfig);
+ }
} finally {
if (pdfDocument != null) {
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java?rev=1609677&r1=1609676&r2=1609677&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java Fri Jul 11 10:27:15 2014
@@ -16,7 +16,6 @@
*/
package org.apache.tika.parser.pdf;
-import org.junit.Ignore;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNull;
@@ -25,7 +24,6 @@ import static org.junit.Assert.assertTru
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
-import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
@@ -90,6 +88,26 @@ public class PDFParserTest extends TikaT
assertTrue("should have word boundary between paragraphs",
!content.contains("libraries.Apache"));
}
+
+ @Test
+ public void testPdfParsingMetadataOnly() throws Exception {
+ Parser parser = new AutoDetectParser(); // Should auto-detect!
+ Metadata metadata = new Metadata();
+
+ InputStream stream = PDFParserTest.class.getResourceAsStream(
+ "/test-documents/testPDF.pdf");
+
+ try {
+ parser.parse(stream, null, metadata, new ParseContext());
+ } finally {
+ stream.close();
+ }
+
+ assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("Bertrand Delacr\u00e9taz", metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals("Firefox", metadata.get(TikaCoreProperties.CREATOR_TOOL));
+ assertEquals("Apache Tika - Apache Tika", metadata.get(TikaCoreProperties.TITLE));
+ }
@Test
public void testCustomMetadata() throws Exception {