You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2019/04/03 16:33:26 UTC
[tika] branch master updated: TIKA-2846 -- store number of
characters per page and number of characters with bad/missing unicode
mapping per page for PDFs.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new f3099c8 TIKA-2846 -- store number of characters per page and number of characters with bad/missing unicode mapping per page for PDFs.
f3099c8 is described below
commit f3099c81e57b5b7f297519a7586b9eb20d89b688
Author: TALLISON <ta...@apache.org>
AuthorDate: Wed Apr 3 12:33:11 2019 -0400
TIKA-2846 -- store number of characters per page and number of characters with bad/missing unicode mapping per page for PDFs.
---
.../java/org/apache/tika/metadata/Metadata.java | 39 ++++++++++++++++++++++
.../main/java/org/apache/tika/metadata/PDF.java | 5 +++
.../src/test/java/org/apache/tika/TikaTest.java | 5 +++
.../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 22 ++++++++++--
.../java/org/apache/tika/parser/pdf/PDF2XHTML.java | 3 +-
.../org/apache/tika/parser/pdf/PDFParserTest.java | 23 +++++++++++++
6 files changed, 94 insertions(+), 3 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java b/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java
index 80666ba..ce69e0a 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java
@@ -386,6 +386,45 @@ public class Metadata implements CreativeCommons, Geographic, HttpHeaders,
}
/**
+ * Adds the integer value of the identified metadata property.
+ * @throws PropertyTypeException if the primary property is not a "seq" property or its value type is not integer
+ * @since Apache Tika 1.21
+ * @param property seq integer property definition
+ * @param value property value; stored in its decimal string form
+ */
+ public void add(Property property, int value) {
+ if(property.getPrimaryProperty().getPropertyType() != PropertyType.SEQ) { // reject anything that is not a "seq" property
+ throw new PropertyTypeException(PropertyType.SEQ, property.getPrimaryProperty().getPropertyType());
+ }
+ if(property.getPrimaryProperty().getValueType() != Property.ValueType.INTEGER) { // reject non-integer value types
+ throw new PropertyTypeException(Property.ValueType.INTEGER, property.getPrimaryProperty().getValueType());
+ }
+ add(property, Integer.toString(value)); // delegate to the string-valued add with the decimal form of the int
+ }
+
+ /**
+ * Gets the array of ints of the identified "seq" integer metadata property.
+ * @throws PropertyTypeException if the primary property is not a "seq" property or its value type is not integer
+ * @since Apache Tika 1.21
+ * @param property seq integer property definition
+ * @return array of ints, one per value returned by getValues(property), in the same order
+ */
+ public int[] getIntValues(Property property) {
+ if(property.getPrimaryProperty().getPropertyType() != PropertyType.SEQ) { // reject anything that is not a "seq" property
+ throw new PropertyTypeException(PropertyType.SEQ, property.getPrimaryProperty().getPropertyType());
+ }
+ if(property.getPrimaryProperty().getValueType() != Property.ValueType.INTEGER) { // reject non-integer value types
+ throw new PropertyTypeException(Property.ValueType.INTEGER, property.getPrimaryProperty().getValueType());
+ }
+ String[] vals = getValues(property); // values are held as strings; parsed back to ints below
+ int[] ret = new int[vals.length];
+ for (int i = 0; i < vals.length; i++) {
+ ret[i] = Integer.parseInt(vals[i]); // a NumberFormatException propagates if a stored value is not a parsable int
+ }
+ return ret;
+ }
+
+ /**
* Sets the real or rational value of the identified metadata property.
*
* @since Apache Tika 0.8
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
index b08643e..054b7e0 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
@@ -70,4 +70,9 @@ public interface PDF {
* in the document: on document open, before close, etc.
*/
Property ACTION_TRIGGER = Property.internalText(PDF_PREFIX+"actionTrigger");
+
+ Property CHARACTERS_PER_PAGE = Property.internalIntegerSequence(PDF_PREFIX+"charsPerPage"); // integer sequence; AbstractPDF2XHTML.endPage adds one total-character count per call
+
+ Property UNMAPPED_UNICODE_CHARS_PER_PAGE =
+ Property.internalIntegerSequence(PDF_PREFIX+"unmappedUnicodeCharsPerPage"); // integer sequence; AbstractPDF2XHTML.endPage adds one count of glyphs with null/empty unicode per call
}
diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java
index 931266c..00d8600 100644
--- a/tika-core/src/test/java/org/apache/tika/TikaTest.java
+++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java
@@ -217,6 +217,11 @@ public abstract class TikaTest {
return getRecursiveMetadata(filePath, new ParseContext(), new Metadata(), suppressException);
}
+ protected List<Metadata> getRecursiveMetadata(String filePath, ParseContext parseContext, boolean suppressException) throws Exception {
+ return getRecursiveMetadata(filePath, parseContext, new Metadata(), suppressException); // convenience overload: delegates with a newly created Metadata
+ }
+
+
protected List<Metadata> getRecursiveMetadata(String filePath) throws Exception {
return getRecursiveMetadata(filePath, new ParseContext());
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index 6bb42f2..c0ccc28 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -53,6 +53,7 @@ import org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecifica
import org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile;
import org.apache.pdfbox.pdmodel.common.filespecification.PDFileSpecification;
import org.apache.pdfbox.pdmodel.common.filespecification.PDSimpleFileSpecification;
+import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.interactive.action.PDAction;
import org.apache.pdfbox.pdmodel.interactive.action.PDActionImportData;
import org.apache.pdfbox.pdmodel.interactive.action.PDActionJavaScript;
@@ -79,6 +80,8 @@ import org.apache.pdfbox.pdmodel.interactive.form.PDXFAResource;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.tools.imageio.ImageIOUtil;
+import org.apache.pdfbox.util.Matrix;
+import org.apache.pdfbox.util.Vector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
@@ -151,6 +154,8 @@ class AbstractPDF2XHTML extends PDFTextStripper {
//zero-based pageIndex
int pageIndex = 0;
int startPage = -1;//private in PDFTextStripper...must have own copy because we override processpages
+ int unmappedUnicodeCharsPerPage = 0;//glyphs seen by showGlyph with a null or empty unicode string; written to metadata and reset in endPage
+ int totalCharsPerPage = 0;//all glyphs seen by showGlyph; written to metadata and reset in endPage
AbstractPDF2XHTML(PDDocument pdDocument, ContentHandler handler, ParseContext context, Metadata metadata,
PDFParserConfig config) throws IOException {
@@ -350,6 +355,11 @@ class AbstractPDF2XHTML extends PDFTextStripper {
@Override
protected void endPage(PDPage page) throws IOException {
+ metadata.add(PDF.CHARACTERS_PER_PAGE, totalCharsPerPage);
+ metadata.add(PDF.UNMAPPED_UNICODE_CHARS_PER_PAGE,
+ unmappedUnicodeCharsPerPage);
+ totalCharsPerPage = 0;
+ unmappedUnicodeCharsPerPage = 0;
try {
for (PDAnnotation annotation : page.getAnnotations()) {
@@ -430,7 +440,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
} catch (SAXException|TikaException e) {
throw new IOExceptionWithCause("Unable to end a page", e);
} catch (IOException e) {
- exceptions.add(e);
+ handleCatchableIOE(e);
}
}
@@ -843,5 +853,13 @@ class AbstractPDF2XHTML extends PDFTextStripper {
return startPage;
}
-
+ @Override
+ protected void showGlyph(Matrix textRenderingMatrix, PDFont font, int code, String unicode, Vector displacement) throws IOException
+ {
+ super.showGlyph(textRenderingMatrix, font, code, unicode, displacement); // keep the superclass handling; the counting below is purely additive
+ if (unicode == null || unicode.isEmpty()) { // no unicode text was supplied for this glyph
+ unmappedUnicodeCharsPerPage++;
+ }
+ totalCharsPerPage++; // every glyph counts toward the total, mapped or not; both counters are reset in endPage
+ }
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
index b136364..8f72429 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
@@ -153,6 +153,7 @@ class PDF2XHTML extends AbstractPDF2XHTML {
super.processPage(page);
} catch (IOException e) {
handleCatchableIOE(e);
+ endPage(page);
}
}
@@ -169,7 +170,7 @@ class PDF2XHTML extends AbstractPDF2XHTML {
} catch (SAXException e) {
throw new IOException("Unable to end a page", e);
} catch (IOException e) {
- exceptions.add(e);
+ handleCatchableIOE(e);
}
}
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index e1a10bc..5eab912 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -24,6 +24,7 @@ import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
+import java.io.File;
import java.io.InputStream;
import java.util.Arrays;
import java.util.HashMap;
@@ -1394,6 +1395,28 @@ public class PDFParserTest extends TikaTest {
assertContains("This is a Excel", contents.get(1).get(RecursiveParserWrapperHandler.TIKA_CONTENT));
}
+ @Test
+ public void testUnmappedUnicodeStats() throws Exception {
+ List<Metadata> metadataList = getRecursiveMetadata("testPDF_bad_page_303226.pdf", true); // true = suppressException
+ Metadata m = metadataList.get(0);
+ int[] totalChars = m.getIntValues(PDF.CHARACTERS_PER_PAGE);
+ int[] unmappedUnicodeChars = m.getIntValues(PDF.UNMAPPED_UNICODE_CHARS_PER_PAGE);
+ assertEquals(3805, totalChars[15]); // zero-based index 15, i.e. the 16th per-page entry
+ assertEquals(120, unmappedUnicodeChars[15]);
+
+ //confirm the same counts are reported when angle detection is enabled
+ PDFParserConfig pdfParserConfig = new PDFParserConfig();
+ pdfParserConfig.setDetectAngles(true);
+ ParseContext parseContext = new ParseContext();
+ parseContext.set(PDFParserConfig.class, pdfParserConfig);
+ metadataList = getRecursiveMetadata("testPDF_bad_page_303226.pdf", parseContext,true);
+ m = metadataList.get(0);
+ totalChars = m.getIntValues(PDF.CHARACTERS_PER_PAGE);
+ unmappedUnicodeChars = m.getIntValues(PDF.UNMAPPED_UNICODE_CHARS_PER_PAGE);
+ assertEquals(3805, totalChars[15]); // expected values are identical to the default-config run above
+ assertEquals(120, unmappedUnicodeChars[15]);
+
+ }
/**
* Simple class to count end of document events. If functionality is useful,
* move to org.apache.tika in src/test