You are viewing a plain-text version of this content. The hyperlink to the canonical version was not preserved in this copy.
Posted to commits@tika.apache.org by ta...@apache.org on 2014/06/24 03:12:46 UTC
svn commit: r1604989 - in /tika/trunk/tika-parsers: pom.xml
src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
src/main/java/org/apache/tika/parser/pdf/PDFParser.java
src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
Author: tallison
Date: Tue Jun 24 01:12:45 2014
New Revision: 1604989
URL: http://svn.apache.org/r1604989
Log:
TIKA-1352 upgrade to PDFBox 1.8.6
Modified:
tika/trunk/tika-parsers/pom.xml
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
Modified: tika/trunk/tika-parsers/pom.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/pom.xml?rev=1604989&r1=1604988&r2=1604989&view=diff
==============================================================================
--- tika/trunk/tika-parsers/pom.xml (original)
+++ tika/trunk/tika-parsers/pom.xml Tue Jun 24 01:12:45 2014
@@ -39,7 +39,7 @@
<codec.version>1.5</codec.version> <!-- NOTE: sync with POI -->
<mime4j.version>0.7.2</mime4j.version>
<vorbis.version>0.6</vorbis.version>
- <pdfbox.version>1.8.5</pdfbox.version>
+ <pdfbox.version>1.8.6</pdfbox.version>
</properties>
<dependencies>
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java?rev=1604989&r1=1604988&r2=1604989&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java Tue Jun 24 01:12:45 2014
@@ -92,7 +92,7 @@ class PDF2XHTML extends PDFTextStripper
private final static int MAX_ACROFORM_RECURSIONS = 10;
- // TODO: remove once PDFBOX-1130 is fixed:
+ // TODO: remove once PDFBOX-2160 is fixed:
private boolean inParagraph = false;
/**
@@ -353,7 +353,7 @@ class PDF2XHTML extends PDFTextStripper
@Override
protected void writeParagraphStart() throws IOException {
- // TODO: remove once PDFBOX-1130 is fixed
+ // TODO: remove once PDFBOX-2160 is fixed
if (inParagraph) {
// Close last paragraph
writeParagraphEnd();
@@ -369,7 +369,7 @@ class PDF2XHTML extends PDFTextStripper
@Override
protected void writeParagraphEnd() throws IOException {
- // TODO: remove once PDFBOX-1130 is fixed
+ // TODO: remove once PDFBOX-2160 is fixed
if (!inParagraph) {
writeParagraphStart();
}
@@ -535,29 +535,22 @@ class PDF2XHTML extends PDFTextStripper
handler.endElement("div");
}
- private void processAcroField(PDField field, XHTMLContentHandler handler, final int recurseDepth)
+ private void processAcroField(PDField field, XHTMLContentHandler handler, final int currentRecursiveDepth)
throws SAXException, IOException {
- if (recurseDepth >= MAX_ACROFORM_RECURSIONS) {
+ if (currentRecursiveDepth >= MAX_ACROFORM_RECURSIONS) {
return;
}
addFieldString(field, handler);
- @SuppressWarnings("rawtypes")
- List kids = field.getKids();
+ List<COSObjectable> kids = field.getKids();
if(kids != null) {
- @SuppressWarnings("rawtypes")
- Iterator kidsIter = kids.iterator();
- if (kidsIter == null) {
- return;
- }
- int r = recurseDepth+1;
+ int r = currentRecursiveDepth+1;
handler.startElement("ol");
//TODO: can generate <ol/>. Rework to avoid that.
- while(kidsIter.hasNext()) {
- Object pdfObj = kidsIter.next();
+ for(COSObjectable pdfObj : kids) {
if(pdfObj != null && pdfObj instanceof PDField) {
PDField kid = (PDField)pdfObj;
//recurse
@@ -596,6 +589,8 @@ class PDF2XHTML extends PDFTextStripper
}
} catch (IOException e) {
//swallow
+ } catch (NullPointerException e) {
+ //TODO: remove once PDFBOX-2161 is fixed
}
if (attrs.getLength() > 0 || sb.length() > 0) {
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java?rev=1604989&r1=1604988&r2=1604989&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java Tue Jun 24 01:12:45 2014
@@ -227,10 +227,11 @@ public class PDFParser extends AbstractP
//Caveats:
// there is currently a fair amount of redundancy
// TikaCoreProperties.FORMAT can be multivalued
- // There are also three potential pdf specific version keys: pdf:PDFVersion, pdfa:PDFVersion, pdf:PDFExtensionVersion
+ // There are also three potential pdf specific version keys: pdf:PDFVersion, pdfa:PDFVersion, pdf:PDFExtensionVersion
metadata.set("pdf:PDFVersion", Float.toString(document.getDocument().getVersion()));
metadata.add(TikaCoreProperties.FORMAT.getName(),
- MEDIA_TYPE.toString()+"; version="+Float.toString(document.getDocument().getVersion()));
+ MEDIA_TYPE.toString()+"; version="+
+ Float.toString(document.getDocument().getVersion()));
try {
if( xmp != null ) {
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java?rev=1604989&r1=1604988&r2=1604989&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java Tue Jun 24 01:12:45 2014
@@ -467,9 +467,6 @@ public class PDFParserTest extends TikaT
}
//TIKA-1124
- //IGNORE until TIKA-1298/PDFBOX 2079 is fixed or we all
- //move to Java 1.7
- @Ignore
@Test
public void testEmbeddedPDFEmbeddingAnotherDocument() throws Exception {
/* format of test doc:
@@ -527,20 +524,26 @@ public class PDFParserTest extends TikaT
*/
@Test
public void testSequentialParser() throws Exception{
- Parser defaultParser = new AutoDetectParser();
+
Parser sequentialParser = new AutoDetectParser();
- ParseContext context = new ParseContext();
- PDFParserConfig config = new PDFParserConfig();
- config.setUseNonSequentialParser(true);
- context.set(PDFParserConfig.class, config);
+ Parser nonSequentialParser = new AutoDetectParser();
+
+ ParseContext seqContext = new ParseContext();
+ PDFParserConfig seqConfig = new PDFParserConfig();
+ seqConfig.setUseNonSequentialParser(false);
+ seqContext.set(PDFParserConfig.class, seqConfig);
+
+ ParseContext nonSeqContext = new ParseContext();
+ PDFParserConfig nonSeqConfig = new PDFParserConfig();
+ nonSeqConfig.setUseNonSequentialParser(true);
+ nonSeqContext.set(PDFParserConfig.class, nonSeqConfig);
File testDocs = new File(this.getClass().getResource("/test-documents").toURI());
int pdfs = 0;
Set<String> knownMetadataDiffs = new HashSet<String>();
//PDFBox-1792/Tika-1203
knownMetadataDiffs.add("testAnnotations.pdf");
- //PDFBox-1792
- knownMetadataDiffs.add("test_acroForm2.pdf");
+
//empty for now
Set<String> knownContentDiffs = new HashSet<String>();
@@ -550,25 +553,25 @@ public class PDFParserTest extends TikaT
}
pdfs++;
- Metadata defaultMetadata = new Metadata();
- String defaultContent = getText(new FileInputStream(f), defaultParser, defaultMetadata);
-
Metadata sequentialMetadata = new Metadata();
- String sequentialContent = getText(new FileInputStream(f), sequentialParser, context, sequentialMetadata);
+ String sequentialContent = getText(new FileInputStream(f),
+ sequentialParser, seqContext, sequentialMetadata);
+
+ Metadata nonSequentialMetadata = new Metadata();
+ String nonSequentialContent = getText(new FileInputStream(f),
+ nonSequentialParser, nonSeqContext, nonSequentialMetadata);
if (knownContentDiffs.contains(f.getName())) {
- assertFalse(f.getName(), defaultContent.equals(sequentialContent));
+ assertFalse(f.getName(), sequentialContent.equals(nonSequentialContent));
} else {
- assertEquals(f.getName(), defaultContent, sequentialContent);
+ assertEquals(f.getName(), sequentialContent, nonSequentialContent);
}
//skip this one file.
if (knownMetadataDiffs.contains(f.getName())) {
- //turn back on once PDFBOX-1922 is fixed
- //assertFalse(f.getName(), defaultMetadata.equals(sequentialMetadata));
+ assertFalse(f.getName(), sequentialMetadata.equals(nonSequentialMetadata));
} else {
- //assertEquals(f.getName(), defaultMetadata, sequentialMetadata);
- testMetadataEquality(f.getName(), defaultMetadata, sequentialMetadata);
+ assertEquals(f.getName(), sequentialMetadata, nonSequentialMetadata);
}
}
//make sure nothing went wrong with getting the resource to test-documents
@@ -816,55 +819,6 @@ public class PDFParserTest extends TikaT
assertEquals("Hello World", m.get("dc:title"));
}
- /**
- * This is a workaround until PDFBox-1922 is fixed.
- * The goal is to test for equality but skip the version issue.
- * TODO: get rid of this asap and revert back to this.Metadata.equals(thatMetadata)!
- * @return equal or not (ignore version differences)
- */
- private void testMetadataEquality(String fName, Metadata thisMetadata,
- Metadata thatMetadata) {
- String[] thisNames = thisMetadata.names();
- String[] thatNames = thatMetadata.names();
-
- assertTrue("metadata null test: "+fName,
- (thisNames == null && thatNames == null) ||
- (thisNames != null && thatNames != null));
-
- assertEquals("metadata length: "+fName, thisNames.length, thatMetadata.names().length);
-
- for (String n : thisNames) {
- //don't pay attention to differences here for now
- if (n.equals("pdf:PDFVersion") || n.equals("dc:format")) {
- continue;
- }
- if (thisMetadata.isMultiValued(n) && thatMetadata.isMultiValued(n)) {
- String[] thisValues = thisMetadata.getValues(n);
- String[] thatValues = thatMetadata.getValues(n);
- testEqualMetadataValue(fName, thisValues, thatValues);
- } else if (! thisMetadata.isMultiValued(n) && ! thatMetadata.isMultiValued(n)) {
- assertEquals("unequal multivalued values: " + fName, thisMetadata.get(n), thatMetadata.get(n));
- } else {
- //one is multivalued and the other isn't
- assertTrue("one multivalued, other isn't: "+fName, false);
- }
- }
- }
-
- private void testEqualMetadataValue(String fName, String[] thisValues, String[] thatValues) {
- assertTrue("null equality of metadata values: "+fName,
- (thisValues == null && thatValues == null) ||
- (thisValues != null && thatValues != null));
-
- assertEquals("metadata values length: "+fName, thisValues.length, thatValues.length);
- List<String> list = Arrays.asList(thatValues);
- for (String v : thisValues) {
- if (! list.contains(v)) {
- assertTrue("metadata value; that doesn't contain" + v, false);
- }
- }
- }
-
@Test
public void testInlineSelector() throws Exception {