You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ti...@apache.org on 2023/08/13 14:12:42 UTC

[tika] branch main updated (c1796feda -> be4c708c1)

This is an automated email from the ASF dual-hosted git repository.

tilman pushed a change to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


    from c1796feda TIKA-4114: add comment
     new 4f1e8f25e TIKA-4114: add comment; add dummy throw; TIKA-4064: avoid deprecated method
     new be4c708c1 TIKA-4114: add comment; TIKA-4064: avoid outdated method

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 .../apache/tika/parser/pdf/AbstractPDF2XHTML.java  |  5 +++--
 .../java/org/apache/tika/parser/pdf/PDFParser.java | 23 ++++++++++++++++++----
 2 files changed, 22 insertions(+), 6 deletions(-)


[tika] 01/02: TIKA-4114: add comment; add dummy throw; TIKA-4064: avoid deprecated method

Posted by ti...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tilman pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 4f1e8f25efb866d8576b0ee29380a8a68a88fdda
Author: Tilman Hausherr <ti...@apache.org>
AuthorDate: Sun Aug 13 16:10:36 2023 +0200

    TIKA-4114: add comment; add dummy throw
    TIKA-4064: avoid deprecated method
---
 .../java/org/apache/tika/parser/pdf/PDFParser.java | 23 ++++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index 15d0718d1..0be92429a 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -187,6 +187,7 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia
                 memoryUsageSetting = MemoryUsageSetting.setupMainMemoryOnly();
             }
 
+            //TODO PDFBOX30 replace "memoryUsageSetting" with "memoryUsageSetting.streamCache"
             pdfDocument = getPDDocument(stream, tstream, password, memoryUsageSetting, metadata,
                     context);
 
@@ -259,11 +260,12 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia
         //Do we want to also check that this is a portfolio PDF/contains a "collection"?
         for (COSObject obj : fileSpecs) {
             if (obj.getObject() instanceof COSDictionary) {
-                COSBase relationship = obj.getDictionaryObject(AF_RELATIONSHIP);
+                COSDictionary dict = (COSDictionary) obj.getObject();
+                COSBase relationship = dict.getDictionaryObject(AF_RELATIONSHIP);
                 if (relationship != null && relationship.equals(ENCRYPTED_PAYLOAD)) {
                     String name = "";
-                    COSBase uf = obj.getDictionaryObject(COSName.UF);
-                    COSBase f = obj.getDictionaryObject(COSName.F);
+                    COSBase uf = dict.getDictionaryObject(COSName.UF);
+                    COSBase f = dict.getDictionaryObject(COSName.F);
                     if (uf != null && uf instanceof COSString) {
                         name = ((COSString)uf).getString();
                     } else if (f != null && f instanceof COSString) {
@@ -294,6 +296,8 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia
         List<StartXRefOffset> xRefOffsets = new ArrayList<>();
         //TODO -- can we use the PDFBox parser's RandomAccessRead
         //so that we don't have to reopen from file?
+        //TODO PDFBOX30 replace RandomAccessBufferedFileInputStream
+        // with RandomAccessReadBufferedFile
         try (RandomAccessRead ra =
                      new RandomAccessBufferedFileInputStream(tikaInputStream.getFile())) {
             StartXRefScanner xRefScanner = new StartXRefScanner(ra);
@@ -381,6 +385,10 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia
                 PDMetadataExtractor.addNotNull(TikaCoreProperties.SIGNATURE_REASON,
                         signature.getReason(), metadata);
                 hasSignature = true;
+                //TODO PDFBOX30 remove this segment and the exception handling after migration
+                if (false != false) {
+                    throw new IOException();
+                }
             }
         } catch (IOException e) {
             //swallow
@@ -452,6 +460,8 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia
                 tstream, metadata, parseContext, PageRangeRequest.RENDER_ALL);
     }
 
+    //TODO PDFBOX30 replace "MemoryUsageSetting memoryUsageSetting" with
+    // "StreamCacheCreateFunction streamCacheCreateFunction"
     protected PDDocument getPDDocument(InputStream stream, TikaInputStream tstream, String password,
                                        MemoryUsageSetting memoryUsageSetting, Metadata metadata,
                                        ParseContext context)
@@ -480,12 +490,16 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia
         }
     }
 
+    //TODO PDFBOX30 replace "MemoryUsageSetting memoryUsageSetting" with
+    // "StreamCacheCreateFunction streamCacheCreateFunction"
     protected PDDocument getPDDocument(InputStream inputStream, String password,
                                        MemoryUsageSetting memoryUsageSetting, Metadata metadata,
                                        ParseContext parseContext) throws IOException {
         return PDDocument.load(inputStream, password, memoryUsageSetting);
     }
 
+    //TODO PDFBOX30 replace "MemoryUsageSetting memoryUsageSetting" with
+    // "StreamCacheCreateFunction streamCacheCreateFunction"
     protected PDDocument getPDDocument(Path path, String password,
                                        MemoryUsageSetting memoryUsageSetting, Metadata metadata,
                                        ParseContext parseContext) throws IOException {
@@ -573,7 +587,8 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia
         metadata.set(AccessPermissions.CAN_MODIFY_ANNOTATIONS,
                 Boolean.toString(ap.canModifyAnnotations()));
         metadata.set(AccessPermissions.CAN_PRINT, Boolean.toString(ap.canPrint()));
-        metadata.set(AccessPermissions.CAN_PRINT_DEGRADED, Boolean.toString(ap.canPrintDegraded()));
+        //TODO PDFBOX30 replace "CAN_PRINT_DEGRADED" with "CAN_PRINT_FAITHFUL"
+        metadata.set(AccessPermissions.CAN_PRINT_DEGRADED, Boolean.toString(ap.canPrintFaithful()));
         metadata.set(PDF.IS_ENCRYPTED, Boolean.toString(document.isEncrypted()));
 
         if (document.getDocumentCatalog().getLanguage() != null) {


[tika] 02/02: TIKA-4114: add comment; TIKA-4064: avoid outdated method

Posted by ti...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tilman pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git

commit be4c708c13c4eca8588daa3e3ec8536eb36a158a
Author: Tilman Hausherr <ti...@apache.org>
AuthorDate: Sun Aug 13 16:12:26 2023 +0200

    TIKA-4114: add comment
    TIKA-4064: avoid outdated method
---
 .../src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java  | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index 51fd4b63c..e03e14a4f 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -1397,9 +1397,10 @@ class AbstractPDF2XHTML extends PDFTextStripper {
     }
 
     @Override
-    protected void showGlyph(Matrix textRenderingMatrix, PDFont font, int code, String unicode,
+    protected void showGlyph(Matrix textRenderingMatrix, PDFont font, int code,
                              Vector displacement) throws IOException {
-        super.showGlyph(textRenderingMatrix, font, code, unicode, displacement);
+        super.showGlyph(textRenderingMatrix, font, code, displacement);
+        String unicode = font.toUnicode(code);
         if (unicode == null || unicode.isEmpty()) {
             unmappedUnicodeCharsPerPage++;
             totalUnmappedUnicodeCharacters++;