You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2019/04/03 16:33:26 UTC

[tika] branch master updated: TIKA-2846 -- store number of characters per page and number of characters with bad/missing unicode mapping per page for PDFs.

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/master by this push:
     new f3099c8  TIKA-2846 -- store number of characters per page and number of characters with bad/missing unicode mapping per page for PDFs.
f3099c8 is described below

commit f3099c81e57b5b7f297519a7586b9eb20d89b688
Author: TALLISON <ta...@apache.org>
AuthorDate: Wed Apr 3 12:33:11 2019 -0400

    TIKA-2846 -- store number of characters per page and number of characters with bad/missing unicode mapping per page for PDFs.
---
 .../java/org/apache/tika/metadata/Metadata.java    | 39 ++++++++++++++++++++++
 .../main/java/org/apache/tika/metadata/PDF.java    |  5 +++
 .../src/test/java/org/apache/tika/TikaTest.java    |  5 +++
 .../apache/tika/parser/pdf/AbstractPDF2XHTML.java  | 22 ++++++++++--
 .../java/org/apache/tika/parser/pdf/PDF2XHTML.java |  3 +-
 .../org/apache/tika/parser/pdf/PDFParserTest.java  | 23 +++++++++++++
 6 files changed, 94 insertions(+), 3 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java b/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java
index 80666ba..ce69e0a 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java
@@ -386,6 +386,45 @@ public class Metadata implements CreativeCommons, Geographic, HttpHeaders,
     }
 
     /**
+     * Adds an integer value to the identified "seq" integer metadata property.
+     * Throws PropertyTypeException unless the primary property is a SEQ of INTEGER values.
+     * @since Apache Tika 1.21
+     * @param property seq integer property definition
+     * @param value    property value
+     */
+    public void add(Property property, int value) {
+        if(property.getPrimaryProperty().getPropertyType() != PropertyType.SEQ) {
+            throw new PropertyTypeException(PropertyType.SEQ, property.getPrimaryProperty().getPropertyType());
+        }
+        if(property.getPrimaryProperty().getValueType() != Property.ValueType.INTEGER) {
+            throw new PropertyTypeException(Property.ValueType.INTEGER, property.getPrimaryProperty().getValueType());
+        }
+        add(property, Integer.toString(value)); // delegated as the value's decimal string form
+    }
+
+    /**
+     * Gets the array of ints of the identified "seq" integer metadata property.
+     * Throws PropertyTypeException unless the primary property is a SEQ of INTEGER values.
+     * @since Apache Tika 1.21
+     * @param property seq integer property definition
+     * @return array of ints, one per stored value, in the order returned by getValues
+     */
+    public int[] getIntValues(Property property) {
+        if(property.getPrimaryProperty().getPropertyType() != PropertyType.SEQ) {
+            throw new PropertyTypeException(PropertyType.SEQ, property.getPrimaryProperty().getPropertyType());
+        }
+        if(property.getPrimaryProperty().getValueType() != Property.ValueType.INTEGER) {
+            throw new PropertyTypeException(Property.ValueType.INTEGER, property.getPrimaryProperty().getValueType());
+        }
+        String[] vals = getValues(property);
+        int[] ret = new int[vals.length];
+        for (int i = 0; i < vals.length; i++) {
+            ret[i] = Integer.parseInt(vals[i]); // NumberFormatException propagates if a stored value is not an int
+        }
+        return ret;
+    }
+
+    /**
      * Sets the real or rational value of the identified metadata property.
      *
      * @since Apache Tika 0.8
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
index b08643e..054b7e0 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
@@ -70,4 +70,9 @@ public interface PDF {
      * in the document: on document open, before close, etc.
      */
     Property ACTION_TRIGGER = Property.internalText(PDF_PREFIX+"actionTrigger");
+    /** Number of glyphs shown on each page; AbstractPDF2XHTML adds one value per endPage call. */
+    Property CHARACTERS_PER_PAGE = Property.internalIntegerSequence(PDF_PREFIX+"charsPerPage");
+    /** Per page, the number of glyphs whose unicode string was null or empty. */
+    Property UNMAPPED_UNICODE_CHARS_PER_PAGE =
+            Property.internalIntegerSequence(PDF_PREFIX+"unmappedUnicodeCharsPerPage");
 }
diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java
index 931266c..00d8600 100644
--- a/tika-core/src/test/java/org/apache/tika/TikaTest.java
+++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java
@@ -217,6 +217,11 @@ public abstract class TikaTest {
         return getRecursiveMetadata(filePath, new ParseContext(), new Metadata(), suppressException);
     }
 
+    protected List<Metadata> getRecursiveMetadata(String filePath, ParseContext parseContext, boolean suppressException) throws Exception {
+        return getRecursiveMetadata(filePath, parseContext, new Metadata(), suppressException); // convenience overload: supplies a fresh Metadata instance
+    }
+
+
     protected List<Metadata> getRecursiveMetadata(String filePath) throws Exception {
         return getRecursiveMetadata(filePath, new ParseContext());
     }
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index 6bb42f2..c0ccc28 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -53,6 +53,7 @@ import org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecifica
 import org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile;
 import org.apache.pdfbox.pdmodel.common.filespecification.PDFileSpecification;
 import org.apache.pdfbox.pdmodel.common.filespecification.PDSimpleFileSpecification;
+import org.apache.pdfbox.pdmodel.font.PDFont;
 import org.apache.pdfbox.pdmodel.interactive.action.PDAction;
 import org.apache.pdfbox.pdmodel.interactive.action.PDActionImportData;
 import org.apache.pdfbox.pdmodel.interactive.action.PDActionJavaScript;
@@ -79,6 +80,8 @@ import org.apache.pdfbox.pdmodel.interactive.form.PDXFAResource;
 import org.apache.pdfbox.rendering.PDFRenderer;
 import org.apache.pdfbox.text.PDFTextStripper;
 import org.apache.pdfbox.tools.imageio.ImageIOUtil;
+import org.apache.pdfbox.util.Matrix;
+import org.apache.pdfbox.util.Vector;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
 import org.apache.tika.extractor.EmbeddedDocumentUtil;
@@ -151,6 +154,8 @@ class AbstractPDF2XHTML extends PDFTextStripper {
     //zero-based pageIndex
     int pageIndex = 0;
     int startPage = -1;//private in PDFTextStripper...must have own copy because we override processpages
+    int unmappedUnicodeCharsPerPage = 0;
+    int totalCharsPerPage = 0;
 
     AbstractPDF2XHTML(PDDocument pdDocument, ContentHandler handler, ParseContext context, Metadata metadata,
                       PDFParserConfig config) throws IOException {
@@ -350,6 +355,11 @@ class AbstractPDF2XHTML extends PDFTextStripper {
 
     @Override
     protected void endPage(PDPage page) throws IOException {
+        metadata.add(PDF.CHARACTERS_PER_PAGE, totalCharsPerPage);
+        metadata.add(PDF.UNMAPPED_UNICODE_CHARS_PER_PAGE,
+                unmappedUnicodeCharsPerPage);
+        totalCharsPerPage = 0;
+        unmappedUnicodeCharsPerPage = 0;
 
         try {
             for (PDAnnotation annotation : page.getAnnotations()) {
@@ -430,7 +440,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
         } catch (SAXException|TikaException e) {
             throw new IOExceptionWithCause("Unable to end a page", e);
         } catch (IOException e) {
-            exceptions.add(e);
+            handleCatchableIOE(e);
         }
     }
 
@@ -843,5 +853,13 @@ class AbstractPDF2XHTML extends PDFTextStripper {
         return startPage;
     }
 
-
+    @Override
+    protected void showGlyph(Matrix textRenderingMatrix, PDFont font, int code, String unicode, Vector displacement) throws IOException
+    {
+        super.showGlyph(textRenderingMatrix, font, code, unicode, displacement);
+        if (unicode == null || unicode.isEmpty()) { // glyph arrived without a usable unicode string
+            unmappedUnicodeCharsPerPage++;
+        }
+        totalCharsPerPage++; // every glyph counts toward the page total; both counters are reset in endPage
+    }
 }
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
index b136364..8f72429 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
@@ -153,6 +153,7 @@ class PDF2XHTML extends AbstractPDF2XHTML {
             super.processPage(page);
         } catch (IOException e) {
             handleCatchableIOE(e);
+            endPage(page);
         }
     }
 
@@ -169,7 +170,7 @@ class PDF2XHTML extends AbstractPDF2XHTML {
         } catch (SAXException e) {
             throw new IOException("Unable to end a page", e);
         } catch (IOException e) {
-            exceptions.add(e);
+            handleCatchableIOE(e);
         }
     }
 
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index e1a10bc..5eab912 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -24,6 +24,7 @@ import static org.junit.Assert.assertNull;
 import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.fail;
 
+import java.io.File;
 import java.io.InputStream;
 import java.util.Arrays;
 import java.util.HashMap;
@@ -1394,6 +1395,28 @@ public class PDFParserTest extends TikaTest {
         assertContains("This is a Excel", contents.get(1).get(RecursiveParserWrapperHandler.TIKA_CONTENT));
     }
 
+    @Test
+    public void testUnmappedUnicodeStats() throws Exception {
+        List<Metadata> metadataList = getRecursiveMetadata("testPDF_bad_page_303226.pdf", true);
+        Metadata m = metadataList.get(0);
+        int[] totalChars = m.getIntValues(PDF.CHARACTERS_PER_PAGE);
+        int[] unmappedUnicodeChars = m.getIntValues(PDF.UNMAPPED_UNICODE_CHARS_PER_PAGE);
+        assertEquals(3805, totalChars[15]); // index 15 is the 16th entry of the per-page sequence
+        assertEquals(120, unmappedUnicodeChars[15]);
+
+        //confirm the same per-page counts are reported when angle detection is enabled
+        PDFParserConfig pdfParserConfig = new PDFParserConfig();
+        pdfParserConfig.setDetectAngles(true);
+        ParseContext parseContext = new ParseContext();
+        parseContext.set(PDFParserConfig.class, pdfParserConfig);
+        metadataList = getRecursiveMetadata("testPDF_bad_page_303226.pdf", parseContext,true);
+        m = metadataList.get(0);
+        totalChars = m.getIntValues(PDF.CHARACTERS_PER_PAGE);
+        unmappedUnicodeChars = m.getIntValues(PDF.UNMAPPED_UNICODE_CHARS_PER_PAGE);
+        assertEquals(3805, totalChars[15]);
+        assertEquals(120, unmappedUnicodeChars[15]);
+
+    }
     /**
      * Simple class to count end of document events.  If functionality is useful,
      * move to org.apache.tika in src/test