You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2024/03/21 17:48:32 UTC

(tika) branch TIKA-4207 updated (7ca6d1759 -> 9ffc4df4a)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch TIKA-4207
in repository https://gitbox.apache.org/repos/asf/tika.git


    from 7ca6d1759 TIKA-4207 -- small improvements to AsyncResource and WMFParser
     add 36a0dca43 TIKA-4205 -- fix dependencies in tika-eval-app and add a few more columns to the ExtractProfiler (#1629)
     add 2bc0f9bdc TIKA-4202 -- add ocr page count to PDFs -- actually increment counter and move the location of the counter to before OCR is invoked (#1630)
     add 0be76cf28 Bump logback.version from 1.5.0 to 1.5.1
     add 4a5a21ea1 Merge pull request #1632 from apache/dependabot/maven/logback.version-1.5.1
     add 8ab8673ce Bump aws.version from 1.12.668 to 1.12.669
     add 386a5934a Merge pull request #1631 from apache/dependabot/maven/aws.version-1.12.669
     add 215b75b67 TIKA-4166: update puppycrawl
     add b3e4252b2 Bump aws.version from 1.12.669 to 1.12.670
     add 1f9e773e8 Merge pull request #1634 from apache/dependabot/maven/aws.version-1.12.670
     add 6b726fbe5 Bump jakarta.activation:jakarta.activation-api from 2.1.2 to 2.1.3
     add 6a0a59d42 Merge pull request #1635 from apache/dependabot/maven/jakarta.activation-jakarta.activation-api-2.1.3
     add ffc7df20f TIKA-4166: update aws, azure, mockito
     add b5023198b Bump logback.version from 1.5.1 to 1.5.2
     add 86d1e897e Merge pull request #1637 from apache/dependabot/maven/logback.version-1.5.2
     add 1a5f23ff4 Bump aws.version from 1.12.671 to 1.12.672
     add e3bb8cfea Merge pull request #1638 from apache/dependabot/maven/aws.version-1.12.672
     add c8097b6ad Bump logback.version from 1.5.2 to 1.5.3
     add dc612a7b5 Merge pull request #1639 from apache/dependabot/maven/logback.version-1.5.3
     add 32ef34ff4 TIKA-4199: add comment, print to stderr
     add 64c083d12 Bump aws.version from 1.12.672 to 1.12.673
     add 2f6e4cd30 Merge pull request #1640 from apache/dependabot/maven/aws.version-1.12.673
     add 36664ef41 Bump com.google.cloud:google-cloud-storage from 2.34.0 to 2.35.0
     add 26c33d46c Merge pull request #1641 from apache/dependabot/maven/com.google.cloud-google-cloud-storage-2.35.0
     add 6cf215017 Bump org.testcontainers:testcontainers-bom from 1.19.6 to 1.19.7
     add 8b3230dff Merge pull request #1642 from apache/dependabot/maven/org.testcontainers-testcontainers-bom-1.19.7
     add 5221d8874 Bump aws.version from 1.12.673 to 1.12.674
     add 43a4e58cc Merge pull request #1643 from apache/dependabot/maven/aws.version-1.12.674
     add b7c5d48ce Bump aws.version from 1.12.674 to 1.12.675
     add 79b194a69 Merge pull request #1644 from apache/dependabot/maven/aws.version-1.12.675
     add a89e9779f Bump jakarta.xml.bind:jakarta.xml.bind-api from 4.0.1 to 4.0.2
     add 4af4be5be Merge pull request #1645 from apache/dependabot/maven/jakarta.xml.bind-jakarta.xml.bind-api-4.0.2
     add 8b398201a TIKA-4199: revert "complete delegate class", field "in" is a dummy; remove workaround for commons-compress 1.26
     add 5b259d60a TIKA-4199: adjust test results now that commons compress bug has been fixed
     add 4d6acfc10 TIKA-4199: update commons-compress
     add 1dd99bf45 TIKA-4166: update aws
     add 5f4e380ff TIKA-4166: update jaxb
     add d477bfd3b TIKA-4166: revert jaxb update
     add 0f077da2a TIKA-4166: update jaxb and prevent convergence problem
     add f0b76e503 Bump com.googlecode.plist:dd-plist from 1.27 to 1.28
     add da3f8c970 Merge pull request #1649 from apache/dependabot/maven/com.googlecode.plist-dd-plist-1.28
     add 67790a364 Bump org.apache.maven.plugins:maven-assembly-plugin from 3.6.0 to 3.7.0
     add 418258161 Merge pull request #1646 from apache/dependabot/maven/org.apache.maven.plugins-maven-assembly-plugin-3.7.0
     add bc2167a30 Bump log4j2.version from 2.23.0 to 2.23.1
     add 17caf585d Merge pull request #1648 from apache/dependabot/maven/log4j2.version-2.23.1
     add b980d9d86 Bump com.fasterxml.jackson:jackson-bom from 2.16.1 to 2.16.2
     add bdb6a4656 Merge pull request #1647 from apache/dependabot/maven/com.fasterxml.jackson-jackson-bom-2.16.2
     add 84f0a5b7f Bump aws.version from 1.12.676 to 1.12.677
     add 3a7bbc50d Merge pull request #1651 from apache/dependabot/maven/aws.version-1.12.677
     add 3ffadd5a3 Bump aws.version from 1.12.677 to 1.12.678
     add 49064dbe2 Merge pull request #1652 from apache/dependabot/maven/aws.version-1.12.678
     add e65d52cb5 Bump org.xerial:sqlite-jdbc from 3.45.1.0 to 3.45.2.0
     add 846f3a080 Merge pull request #1655 from apache/dependabot/maven/org.xerial-sqlite-jdbc-3.45.2.0
     add be7640d53 Bump com.fasterxml.jackson:jackson-bom from 2.16.2 to 2.17.0
     add 7cd6ee86b Merge pull request #1653 from apache/dependabot/maven/com.fasterxml.jackson-jackson-bom-2.17.0
     add 23d26d770 Bump reactor.netty.version from 1.1.15 to 1.1.17
     add 18d9fd769 Merge pull request #1654 from apache/dependabot/maven/reactor.netty.version-1.1.17
     add 1d666ea04 Bump io.projectreactor:reactor-core from 3.6.2 to 3.6.4
     add 207594f9f Merge pull request #1656 from apache/dependabot/maven/io.projectreactor-reactor-core-3.6.4
     add 533e056bb TIKA-4166: update puppycrawl, cxf
     add 8d5c3578a Bump aws.version from 1.12.678 to 1.12.679
     add ef75d45aa Merge pull request #1658 from apache/dependabot/maven/aws.version-1.12.679
     add df573d07c Bump com.google.guava:guava from 33.0.0-jre to 33.1.0-jre
     add 290742590 Merge pull request #1657 from apache/dependabot/maven/com.google.guava-guava-33.1.0-jre
     add 91820226e TIKA-4166: update mime4j
     add 3ccfcb485 Bump pdfbox.version from 3.0.1 to 3.0.2
     add e9aa16994 Merge pull request #1660 from apache/dependabot/maven/pdfbox.version-3.0.2
     add 3c131e76a Bump org.springframework:spring-context from 5.3.32 to 5.3.33
     add d90a564ad Merge pull request #1662 from apache/dependabot/maven/org.springframework-spring-context-5.3.33
     add 6d02aa2ed Bump aws.version from 1.12.679 to 1.12.680
     add cf2073dda Merge pull request #1661 from apache/dependabot/maven/aws.version-1.12.680
     add 2ec57fb14 Bump aws.version from 1.12.680 to 1.12.681
     add 0a224b32d Merge pull request #1664 from apache/dependabot/maven/aws.version-1.12.681
     add c963c51da Bump com.google.cloud:google-cloud-storage from 2.35.0 to 2.36.0
     add 2e614b438 Merge pull request #1663 from apache/dependabot/maven/com.google.cloud-google-cloud-storage-2.36.0
     add 67d593c27 TIKA-4166: update puppycrawl
     add 7735eeb16 Bump aws.version from 1.12.681 to 1.12.682
     add f1b7f07b7 Merge pull request #1665 from apache/dependabot/maven/aws.version-1.12.682
     add 0a9f17c2d TIKA-4166: update zookeeper
     add fcdff7cf7 Bump org.apache.maven.plugins:maven-assembly-plugin from 3.7.0 to 3.7.1
     add b3c8c3e7e Merge pull request #1666 from apache/dependabot/maven/org.apache.maven.plugins-maven-assembly-plugin-3.7.1
     add 2fa9ab30c Bump org.apache.maven.plugins:maven-compiler-plugin
     add 0e166b0d1 Merge pull request #1667 from apache/dependabot/maven/org.apache.maven.plugins-maven-compiler-plugin-3.13.0
     add 880b34556 Bump aws.version from 1.12.682 to 1.12.683
     add eac6f090b Merge pull request #1668 from apache/dependabot/maven/aws.version-1.12.683
     add 9ea184af5 Bump aws.version from 1.12.683 to 1.12.684
     add 96fd5fd6c Merge pull request #1671 from apache/dependabot/maven/aws.version-1.12.684
     add e63730e12 TIKA-4213 -- improve jdbc pipes reporter (#1669)
     add 7dc3d28a5 TIKA-4211 -- first attempt (#1670)
     add 85d713a9a TIKA-4215 -- avoid loading all the tika resources just to get the version (#1672)
     add 237e73f18 TIKA-4216 (#1673)
     add 08727d522 TIKA-4217 -- require new line or white space as part of bitmap magic (#1674)
     new dae75c632 Merge remote-tracking branch 'origin/main' into TIKA-4207
     new 9ffc4df4a TIKA-4207 -- allow users to configure include/exclude for attachment types and/or mime types

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 .../src/main/java/org/apache/tika/cli/TikaCLI.java |  2 +-
 tika-core/src/main/java/org/apache/tika/Tika.java  |  4 ++
 .../tika/extractor/BasicEmbeddedBytesSelector.java | 77 ++++++++++++++++++++++
 ...ctorFactory.java => EmbeddedBytesSelector.java} | 16 +++--
 .../ParsingEmbeddedDocumentExtractor.java          | 28 +++++++-
 .../ParsingEmbeddedDocumentExtractorFactory.java   | 56 ++++++++++++++--
 .../main/java/org/apache/tika/metadata/PDF.java    |  4 ++
 .../apache/tika/metadata/TikaCoreProperties.java   |  4 ++
 .../org/apache/tika/mime/tika-mimetypes.xml        | 53 ++++++++++++---
 .../tika/parser/AutoDetectParserConfigTest.java    | 72 ++++++++++++++++++++
 .../config/TIKA-4207-embedded-bytes-config.xml     | 11 +++-
 tika-eval/tika-eval-app/pom.xml                    |  2 -
 .../org/apache/tika/eval/app/AbstractProfiler.java | 17 ++++-
 .../org/apache/tika/eval/app/ExtractProfiler.java  |  4 ++
 .../java/org/apache/tika/eval/app/db/Cols.java     |  3 +
 tika-parent/pom.xml                                | 60 +++++++++--------
 .../ooxml/XSLFPowerPointExtractorDecorator.java    |  3 +-
 .../apache/tika/parser/ocr/TesseractOCRParser.java | 20 +++++-
 .../apache/tika/parser/pdf/AbstractPDF2XHTML.java  |  6 ++
 .../org/apache/tika/parser/pdf/OCRPageCounter.java |  4 ++
 .../org/apache/tika/parser/pdf/PDFParserTest.java  |  8 +++
 .../org/apache/tika/parser/pkg/PackageParser.java  | 50 +-------------
 .../parser/microsoft/ooxml/TruncatedOOXMLTest.java |  4 +-
 .../tika/parser/ocr/TesseractOCRParserTest.java    |  9 +++
 .../apache/tika/parser/pkg/Seven7ParserTest.java   |  3 +-
 .../pipes/reporters/jdbc/JDBCPipesReporter.java    | 52 ++++++++-------
 .../apache/tika/server/core/TikaServerProcess.java |  2 +-
 .../tika/server/core/resource/TikaResource.java    |  2 +-
 .../apache/tika/server/core/TikaVersionTest.java   |  2 +-
 .../apache/tika/server/core/TikaWelcomeTest.java   |  4 +-
 30 files changed, 444 insertions(+), 138 deletions(-)
 create mode 100644 tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedBytesSelector.java
 copy tika-core/src/main/java/org/apache/tika/extractor/{EmbeddedDocumentExtractorFactory.java => EmbeddedBytesSelector.java} (74%)
 create mode 100644 tika-core/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java
 copy tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-no-names.xml => tika-core/src/test/resources/org/apache/tika/config/TIKA-4207-embedded-bytes-config.xml (78%)


(tika) 02/02: TIKA-4207 -- allow users to configure include/exclude for attachment types and/or mime types

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4207
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 9ffc4df4a3d059d54e1e1851b8d024b24d2043f9
Author: tallison <ta...@apache.org>
AuthorDate: Thu Mar 21 13:48:16 2024 -0400

    TIKA-4207 -- allow users to configure include/exclude for attachment types and/or mime types
---
 .../tika/extractor/BasicEmbeddedBytesSelector.java | 77 ++++++++++++++++++++++
 ...ctorFactory.java => EmbeddedBytesSelector.java} | 24 +++----
 .../ParsingEmbeddedDocumentExtractor.java          | 28 +++++++-
 .../ParsingEmbeddedDocumentExtractorFactory.java   | 56 ++++++++++++++--
 .../apache/tika/metadata/TikaCoreProperties.java   |  4 ++
 .../tika/parser/AutoDetectParserConfigTest.java    | 72 ++++++++++++++++++++
 .../config/TIKA-4207-embedded-bytes-config.xml     | 38 +++++++++++
 7 files changed, 277 insertions(+), 22 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedBytesSelector.java b/tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedBytesSelector.java
new file mode 100644
index 000000000..1d5a239db
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedBytesSelector.java
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.extractor;
+
+import java.util.Set;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.utils.StringUtils;
+
+/**
+ * Selects embedded files for byte storage via include/exclude sets of mime types and
+ * embedded resource types. Excludes win; an empty include set places no restriction.
+ */
+public class BasicEmbeddedBytesSelector implements EmbeddedBytesSelector {
+    private final Set<String> includeMimes;
+    private final Set<String> excludeMimes;
+    private final Set<String> includeEmbeddedResourceTypes;
+    private final Set<String> excludeEmbeddedResourceTypes;
+
+    public BasicEmbeddedBytesSelector(Set<String> includeMimes, Set<String> excludeMimes,
+                                      Set<String> includeEmbeddedResourceTypes,
+                                      Set<String> excludeEmbeddedResourceTypes) {
+        this.includeMimes = includeMimes;
+        this.excludeMimes = excludeMimes;
+        this.includeEmbeddedResourceTypes = includeEmbeddedResourceTypes;
+        this.excludeEmbeddedResourceTypes = excludeEmbeddedResourceTypes;
+    }
+
+    /** Returns true if the embedded file described by {@code metadata} should be stored. */
+    @Override
+    public boolean select(Metadata metadata) {
+        String mime = metadata.get(Metadata.CONTENT_TYPE);
+        if (mime == null) {
+            mime = "";
+        } else {
+            //if mime matters at all, make sure to get the mime without parameters
+            if (includeMimes.size() > 0 || excludeMimes.size() > 0) {
+                MediaType mt = MediaType.parse(mime);
+                if (mt != null) {
+                    mime = mt.getType() + "/" + mt.getSubtype();
+                }
+            }
+        }
+        if (excludeMimes.contains(mime)) {
+            return false;
+        }
+        if (includeMimes.size() > 0 && ! includeMimes.contains(mime)) {
+            return false;
+        }
+        String embeddedResourceType = metadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE);
+        //if a parser doesn't specify the type, treat it as ATTACHMENT
+        embeddedResourceType = StringUtils.isBlank(embeddedResourceType) ? "ATTACHMENT" :
+                embeddedResourceType;
+        if (excludeEmbeddedResourceTypes.contains(embeddedResourceType)) {
+            return false;
+        }
+        //an empty include set means "no restriction", mirroring the mime include check above
+        return includeEmbeddedResourceTypes.isEmpty() ||
+                includeEmbeddedResourceTypes.contains(embeddedResourceType);
+    }
+}
diff --git a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedBytesSelector.java
similarity index 55%
copy from tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java
copy to tika-core/src/main/java/org/apache/tika/extractor/EmbeddedBytesSelector.java
index 9136228c4..2ec7df667 100644
--- a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java
+++ b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedBytesSelector.java
@@ -16,25 +16,17 @@
  */
 package org.apache.tika.extractor;
 
-import org.apache.tika.config.Field;
 import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.ParseContext;
 
-public class ParsingEmbeddedDocumentExtractorFactory
-        implements EmbeddedDocumentExtractorFactory {
+public interface EmbeddedBytesSelector {
 
-    private boolean writeFileNameToContent = true;
-
-    @Field
-    public void setWriteFileNameToContent(boolean writeFileNameToContent) {
-        this.writeFileNameToContent = writeFileNameToContent;
+    class AcceptAll implements EmbeddedBytesSelector {
+        @Override
+        public boolean select(Metadata metadata) {
+            return true;
+        }
     }
+    EmbeddedBytesSelector ACCEPT_ALL = new AcceptAll();
 
-    @Override
-    public EmbeddedDocumentExtractor newInstance(Metadata metadata, ParseContext parseContext) {
-        ParsingEmbeddedDocumentExtractor ex =
-                new ParsingEmbeddedDocumentExtractor(parseContext);
-        ex.setWriteFileNameToContent(writeFileNameToContent);
-        return ex;
-    }
+    boolean select(Metadata metadata);
 }
diff --git a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java
index 46672838b..ee15c1e22 100644
--- a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java
+++ b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java
@@ -26,6 +26,8 @@ import java.nio.file.Files;
 import java.nio.file.Path;
 
 import org.apache.commons.io.input.CloseShieldInputStream;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.AttributesImpl;
@@ -43,6 +45,7 @@ import org.apache.tika.parser.ParseRecord;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.sax.BodyContentHandler;
 import org.apache.tika.sax.EmbeddedContentHandler;
+import org.apache.tika.utils.ExceptionUtils;
 
 /**
  * Helper class for parsers of package archives or other compound document
@@ -52,6 +55,9 @@ import org.apache.tika.sax.EmbeddedContentHandler;
  */
 public class ParsingEmbeddedDocumentExtractor implements EmbeddedDocumentExtractor {
 
+    private static final Logger LOGGER =
+            LoggerFactory.getLogger(ParsingEmbeddedDocumentExtractor.class);
+
     private static final File ABSTRACT_PATH = new File("");
 
     private static final Parser DELEGATING_PARSER = new DelegatingParser();
@@ -60,6 +66,8 @@ public class ParsingEmbeddedDocumentExtractor implements EmbeddedDocumentExtract
 
     private final ParseContext context;
 
+    private EmbeddedBytesSelector embeddedBytesSelector = EmbeddedBytesSelector.ACCEPT_ALL;
+
     public ParsingEmbeddedDocumentExtractor(ParseContext context) {
         this.context = context;
     }
@@ -147,6 +155,14 @@ public class ParsingEmbeddedDocumentExtractor implements EmbeddedDocumentExtract
     }
 
     private void storeEmbeddedBytes(Path p, Metadata metadata) {
+        if (! embeddedBytesSelector.select(metadata)) {
+            if (LOGGER.isDebugEnabled()) {
+                LOGGER.debug("skipping embedded bytes {} {}",
+                        metadata.get(Metadata.CONTENT_TYPE),
+                        metadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
+            }
+            return;
+        }
         EmbeddedDocumentByteStore embeddedDocumentByteStore =
                 context.get(EmbeddedDocumentByteStore.class);
         int id = metadata.getInt(TikaCoreProperties.EMBEDDED_ID);
@@ -154,8 +170,8 @@ public class ParsingEmbeddedDocumentExtractor implements EmbeddedDocumentExtract
         try {
             embeddedDocumentByteStore.add(id, metadata, Files.readAllBytes(p));
         } catch (IOException e) {
-            e.printStackTrace();
-            //log, or better, store embdocstore exception
+            metadata.set(TikaCoreProperties.EMBEDDED_BYTES_EXCEPTION,
+                    ExceptionUtils.getStackTrace(e));
         }
     }
 
@@ -175,4 +191,12 @@ public class ParsingEmbeddedDocumentExtractor implements EmbeddedDocumentExtract
     public void setWriteFileNameToContent(boolean writeFileNameToContent) {
         this.writeFileNameToContent = writeFileNameToContent;
     }
+
+    public void setEmbeddedBytesSelector(EmbeddedBytesSelector embeddedBytesSelector) {
+        this.embeddedBytesSelector = embeddedBytesSelector;
+    }
+
+    public EmbeddedBytesSelector getEmbeddedBytesSelector() {
+        return embeddedBytesSelector;
+    }
 }
diff --git a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java
index 9136228c4..7632ed49c 100644
--- a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java
+++ b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java
@@ -16,25 +16,73 @@
  */
 package org.apache.tika.extractor;
 
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
 import org.apache.tika.config.Field;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.ParseContext;
 
-public class ParsingEmbeddedDocumentExtractorFactory
-        implements EmbeddedDocumentExtractorFactory {
+public class ParsingEmbeddedDocumentExtractorFactory implements EmbeddedDocumentExtractorFactory {
 
     private boolean writeFileNameToContent = true;
+    private Set<String> embeddedBytesIncludeMimeTypes = Collections.emptySet();
+    private Set<String> embeddedBytesExcludeMimeTypes = Collections.emptySet();
+    private Set<String> embeddedBytesIncludeEmbeddedResourceTypes = Collections.emptySet();
+    private Set<String> embeddedBytesExcludeEmbeddedResourceTypes = Collections.emptySet();
 
     @Field
     public void setWriteFileNameToContent(boolean writeFileNameToContent) {
         this.writeFileNameToContent = writeFileNameToContent;
     }
 
+    /** Sets the mime include list handed to the embedded bytes selector. */
+    @Field
+    public void setEmbeddedBytesIncludeMimeTypes(List<String> includeMimeTypes) {
+        embeddedBytesIncludeMimeTypes = new HashSet<>();
+        embeddedBytesIncludeMimeTypes.addAll(includeMimeTypes);
+    }
+
+    /** Sets the mime exclude list handed to the embedded bytes selector. */
+    @Field
+    public void setEmbeddedBytesExcludeMimeTypes(List<String> excludeMimeTypes) {
+        embeddedBytesExcludeMimeTypes = new HashSet<>();
+        embeddedBytesExcludeMimeTypes.addAll(excludeMimeTypes);
+    }
+
+    /** Sets the embedded resource type include list handed to the embedded bytes selector. */
+    @Field
+    public void setEmbeddedBytesIncludeEmbeddedResourceTypes(List<String> includeAttachmentTypes) {
+        embeddedBytesIncludeEmbeddedResourceTypes = new HashSet<>();
+        embeddedBytesIncludeEmbeddedResourceTypes.addAll(includeAttachmentTypes);
+    }
+
+    /** Sets the embedded resource type exclude list handed to the embedded bytes selector. */
+    @Field
+    public void setEmbeddedBytesExcludeEmbeddedResourceTypes(List<String> excludeAttachmentTypes) {
+        embeddedBytesExcludeEmbeddedResourceTypes = new HashSet<>();
+        embeddedBytesExcludeEmbeddedResourceTypes.addAll(excludeAttachmentTypes);
+    }
+
     @Override
     public EmbeddedDocumentExtractor newInstance(Metadata metadata, ParseContext parseContext) {
-        ParsingEmbeddedDocumentExtractor ex =
-                new ParsingEmbeddedDocumentExtractor(parseContext);
+        ParsingEmbeddedDocumentExtractor ex = new ParsingEmbeddedDocumentExtractor(parseContext);
         ex.setWriteFileNameToContent(writeFileNameToContent);
+        ex.setEmbeddedBytesSelector(createEmbeddedBytesSelector());
         return ex;
     }
+
+    private EmbeddedBytesSelector createEmbeddedBytesSelector() {
+        if (embeddedBytesIncludeMimeTypes.isEmpty() &&
+                embeddedBytesExcludeMimeTypes.isEmpty() &&
+                embeddedBytesIncludeEmbeddedResourceTypes.isEmpty() &&
+                embeddedBytesExcludeEmbeddedResourceTypes.isEmpty()) {
+            return EmbeddedBytesSelector.ACCEPT_ALL;
+        }
+        return new BasicEmbeddedBytesSelector(embeddedBytesIncludeMimeTypes,
+                embeddedBytesExcludeMimeTypes, embeddedBytesIncludeEmbeddedResourceTypes,
+                embeddedBytesExcludeEmbeddedResourceTypes);
+    }
 }
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
index 6ff02c1cf..effa4a667 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
@@ -98,6 +98,10 @@ public interface TikaCoreProperties {
     Property EMBEDDED_EXCEPTION =
             Property.internalTextBag(TIKA_META_EXCEPTION_PREFIX + "embedded_exception");
 
+    //exception handling the raw bytes of an embedded file by an EmbeddedDocumentByteStore
+    Property EMBEDDED_BYTES_EXCEPTION =
+            Property.internalTextBag(TIKA_META_EXCEPTION_PREFIX + "embedded_bytes_exception");
+
     //warning while parsing in an embedded file
     Property EMBEDDED_WARNING =
             Property.internalTextBag(TIKA_META_EXCEPTION_PREFIX + "embedded_warning");
diff --git a/tika-core/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java b/tika-core/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java
new file mode 100644
index 000000000..a0d5d4896
--- /dev/null
+++ b/tika-core/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.io.InputStream;
+
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.extractor.EmbeddedBytesSelector;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractorFactory;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.utils.StringUtils;
+
+public class AutoDetectParserConfigTest {
+    //checks that the include lists in TIKA-4207-embedded-bytes-config.xml reach the selector
+    @Test
+    public void testEmbeddedBytesSelector() throws Exception {
+        TikaConfig config;
+        try (InputStream is = TikaConfig.class.getResourceAsStream(
+                "TIKA-4207-embedded-bytes-config.xml")) {
+            config = new TikaConfig(is);
+        }
+        AutoDetectParserConfig c = config.getAutoDetectParserConfig();
+        ParsingEmbeddedDocumentExtractorFactory f =
+                (ParsingEmbeddedDocumentExtractorFactory) c.getEmbeddedDocumentExtractorFactory();
+        //pull the selector off an extractor built by the configured factory
+        Metadata metadata = new Metadata();
+        ParseContext parseContext = new ParseContext();
+        ParsingEmbeddedDocumentExtractor ex = (ParsingEmbeddedDocumentExtractor) f.newInstance(metadata, parseContext);
+        EmbeddedBytesSelector selector = ex.getEmbeddedBytesSelector();
+        assertFalse(selector.select(getMetadata("", "")));
+        assertTrue(selector.select(getMetadata("application/pdf", "")));
+        assertTrue(selector.select(getMetadata("application/pdf", "ATTACHMENT")));
+        assertTrue(selector.select(getMetadata("application/pdf", "INLINE")));
+        assertTrue(selector.select(getMetadata("text/plain;charset=UTF-7", "INLINE")));
+        //MACRO is not an included resource type; application/docx is not an included mime
+        assertFalse(selector.select(getMetadata("application/pdf", "MACRO")));
+        assertFalse(selector.select(getMetadata("application/docx", "")));
+    }
+
+    //builds a Metadata, leaving content type and/or resource type unset when blank
+    private Metadata getMetadata(String mime, String embeddedResourceType) {
+        Metadata m = new Metadata();
+        if (!StringUtils.isBlank(mime)) {
+            m.set(Metadata.CONTENT_TYPE, mime);
+        }
+        if (!StringUtils.isBlank(embeddedResourceType)) {
+            m.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, embeddedResourceType);
+        }
+        return m;
+    }
+}
diff --git a/tika-core/src/test/resources/org/apache/tika/config/TIKA-4207-embedded-bytes-config.xml b/tika-core/src/test/resources/org/apache/tika/config/TIKA-4207-embedded-bytes-config.xml
new file mode 100644
index 000000000..d60c6b1ca
--- /dev/null
+++ b/tika-core/src/test/resources/org/apache/tika/config/TIKA-4207-embedded-bytes-config.xml
@@ -0,0 +1,38 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+  <parsers>
+    <parser class="org.apache.tika.parser.DefaultParser"/>
+  </parsers>
+  <autoDetectParserConfig>
+    <spoolToDisk>123450</spoolToDisk>
+    <outputThreshold>678900</outputThreshold>
+    <embeddedDocumentExtractorFactory class="org.apache.tika.extractor.ParsingEmbeddedDocumentExtractorFactory">
+      <writeFileNameToContent>false</writeFileNameToContent>
+      <embeddedBytesIncludeMimeTypes>
+        <mime>application/pdf</mime>
+        <mime>application/rtf</mime>
+        <mime>text/plain</mime>
+      </embeddedBytesIncludeMimeTypes>
+      <embeddedBytesIncludeEmbeddedResourceTypes>
+        <type>ATTACHMENT</type>
+        <type>INLINE</type>
+      </embeddedBytesIncludeEmbeddedResourceTypes>
+    </embeddedDocumentExtractorFactory>
+  </autoDetectParserConfig>
+</properties>
\ No newline at end of file


(tika) 01/02: Merge remote-tracking branch 'origin/main' into TIKA-4207

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4207
in repository https://gitbox.apache.org/repos/asf/tika.git

commit dae75c632055d980fdad047fe07dd745359fca3f
Merge: 7ca6d1759 08727d522
Author: tallison <ta...@apache.org>
AuthorDate: Thu Mar 21 12:21:52 2024 -0400

    Merge remote-tracking branch 'origin/main' into TIKA-4207

 .../src/main/java/org/apache/tika/cli/TikaCLI.java |  2 +-
 tika-core/src/main/java/org/apache/tika/Tika.java  |  4 ++
 .../main/java/org/apache/tika/metadata/PDF.java    |  4 ++
 .../org/apache/tika/mime/tika-mimetypes.xml        | 53 +++++++++++++++----
 tika-eval/tika-eval-app/pom.xml                    |  2 -
 .../org/apache/tika/eval/app/AbstractProfiler.java | 17 +++++-
 .../org/apache/tika/eval/app/ExtractProfiler.java  |  4 ++
 .../java/org/apache/tika/eval/app/db/Cols.java     |  3 ++
 tika-parent/pom.xml                                | 60 ++++++++++++----------
 .../ooxml/XSLFPowerPointExtractorDecorator.java    |  3 +-
 .../apache/tika/parser/ocr/TesseractOCRParser.java | 20 ++++++--
 .../apache/tika/parser/pdf/AbstractPDF2XHTML.java  |  6 +++
 .../org/apache/tika/parser/pdf/OCRPageCounter.java |  4 ++
 .../org/apache/tika/parser/pdf/PDFParserTest.java  |  8 +++
 .../org/apache/tika/parser/pkg/PackageParser.java  | 50 +-----------------
 .../parser/microsoft/ooxml/TruncatedOOXMLTest.java |  4 +-
 .../tika/parser/ocr/TesseractOCRParserTest.java    |  9 ++++
 .../apache/tika/parser/pkg/Seven7ParserTest.java   |  3 +-
 .../pipes/reporters/jdbc/JDBCPipesReporter.java    | 52 ++++++++++---------
 .../apache/tika/server/core/TikaServerProcess.java |  2 +-
 .../tika/server/core/resource/TikaResource.java    |  2 +-
 .../apache/tika/server/core/TikaVersionTest.java   |  2 +-
 .../apache/tika/server/core/TikaWelcomeTest.java   |  4 +-
 23 files changed, 193 insertions(+), 125 deletions(-)