You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/09/15 17:57:41 UTC

[tika] branch branch_2x updated: TIKA-4133 -- add a capture group metadatafilter (#1346)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_2x
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/branch_2x by this push:
     new 490038984 TIKA-4133 -- add a capture group metadatafilter (#1346)
     new 688f9d225 Merge remote-tracking branch 'origin/branch_2x' into branch_2x
490038984 is described below

commit 4900389846fa65006392473e611b2afe3efb5acb
Author: Tim Allison <ta...@apache.org>
AuthorDate: Fri Sep 15 13:56:46 2023 -0400

    TIKA-4133 -- add a capture group metadatafilter (#1346)
    
    * TIKA-4133 -- add a capture group metadata filter
---
 .../filter/CaptureGroupMetadataFilter.java         | 110 +++++++++++++++++++++
 .../tika/metadata/filter/TestMetadataFilter.java   |  53 ++++++++++
 .../config/TIKA-4133-capture-group-overwrite.xml   |  26 +++++
 .../apache/tika/config/TIKA-4133-capture-group.xml |  26 +++++
 4 files changed, 215 insertions(+)

diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/CaptureGroupMetadataFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/CaptureGroupMetadataFilter.java
new file mode 100644
index 000000000..ca9b1e6ea
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/CaptureGroupMetadataFilter.java
@@ -0,0 +1,110 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.metadata.filter;
+
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
+
+import org.apache.tika.config.Field;
+import org.apache.tika.config.Initializable;
+import org.apache.tika.config.InitializableProblemHandler;
+import org.apache.tika.config.Param;
+import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.utils.StringUtils;
+
+
+/**
+ * This filter runs a regex against the first value in the "sourceField".
+ * If the pattern matches, it extracts the first group of the first match and
+ * sets the "targetField"'s value to that first group.
+ * <p>
+ * If there is a match, this will overwrite whatever value is in the
+ * "targetField".
+ * <p>
+ * If there is not a match, or if the first group did not take part in the
+ * match, this filter will be a no-op.
+ * <p>
+ * If there are multiple matches, this filter will capture only the first.
+ * Open a ticket if you need different behavior.
+ * <p>
+ * If the source field has multiple values, this will run the regex
+ * against only the first value.
+ * <p>
+ * If the source field does not exist, or its value is blank, this filter
+ * will be a no-op.
+ * <p>
+ * If the target field is the same as the source field, this filter
+ * will overwrite the value in that field. Again, if there are multiple
+ * values in that field, those will all be overwritten.
+ * <p>
+ * The "regex" must be non-blank and must contain at least one capturing
+ * group; otherwise {@link #initialize(Map)} fails with a
+ * {@link TikaConfigException}.
+ */
+public class CaptureGroupMetadataFilter extends MetadataFilter implements Initializable {
+
+    private String regexString;
+    private Pattern regex;
+    private String sourceField;
+    private String targetField;
+
+    /**
+     * Applies the configured regex to the source field and, on a match, stores
+     * the first capture group in the target field.
+     *
+     * @param metadata the metadata to read from and update in place
+     * @throws TikaException not thrown by this implementation
+     */
+    @Override
+    public void filter(Metadata metadata) throws TikaException {
+        String val = metadata.get(sourceField);
+        if (StringUtils.isBlank(val)) {
+            return;
+        }
+        Matcher m = regex.matcher(val);
+        if (!m.find()) {
+            return;
+        }
+        // group(1) is null when the pattern matched but the first group did not
+        // participate (e.g. "(a)?b" against "b"); leave the target untouched then.
+        String captured = m.group(1);
+        if (captured != null) {
+            metadata.set(targetField, captured);
+        }
+    }
+
+    /**
+     * @param regex the pattern to search for; compiled in {@link #initialize(Map)}
+     */
+    @Field
+    public void setRegex(String regex) {
+        this.regexString = regex;
+    }
+
+    /**
+     * @param sourceField name of the metadata field the regex is run against
+     */
+    @Field
+    public void setSourceField(String sourceField) {
+        this.sourceField = sourceField;
+    }
+
+    /**
+     * @param targetField name of the metadata field that receives the captured group
+     */
+    @Field
+    public void setTargetField(String targetField) {
+        this.targetField = targetField;
+    }
+
+    /**
+     * Compiles the configured regex and verifies that it has a capturing group.
+     *
+     * @param params not used by this filter
+     * @throws TikaConfigException if the regex is missing, blank, syntactically
+     *                             invalid, or has no capturing group
+     */
+    @Override
+    public void initialize(Map<String, Param> params) throws TikaConfigException {
+        if (StringUtils.isBlank(regexString)) {
+            // Pattern.compile(null) would otherwise fail with a bare NullPointerException
+            throw new TikaConfigException("Must specify a 'regex'");
+        }
+        try {
+            regex = Pattern.compile(regexString);
+        } catch (PatternSyntaxException e) {
+            throw new TikaConfigException("Couldn't parse regex", e);
+        }
+        // filter() reads group(1); a pattern without any group would make every
+        // match throw IndexOutOfBoundsException at filter time, so fail fast here.
+        if (regex.matcher("").groupCount() < 1) {
+            throw new TikaConfigException(
+                    "'regex' must contain at least one capturing group: " + regexString);
+        }
+    }
+
+    /**
+     * Verifies that both field names were configured.
+     *
+     * @param problemHandler not used by this filter
+     * @throws TikaConfigException if 'sourceField' or 'targetField' is missing or blank
+     */
+    @Override
+    public void checkInitialization(InitializableProblemHandler problemHandler)
+            throws TikaConfigException {
+        if (StringUtils.isBlank(sourceField)) {
+            throw new TikaConfigException("Must specify a 'sourceField'");
+        }
+        if (StringUtils.isBlank(targetField)) {
+            throw new TikaConfigException("Must specify a 'targetField'");
+        }
+    }
+}
diff --git a/tika-core/src/test/java/org/apache/tika/metadata/filter/TestMetadataFilter.java b/tika-core/src/test/java/org/apache/tika/metadata/filter/TestMetadataFilter.java
index 88d510d57..0b071d0be 100644
--- a/tika-core/src/test/java/org/apache/tika/metadata/filter/TestMetadataFilter.java
+++ b/tika-core/src/test/java/org/apache/tika/metadata/filter/TestMetadataFilter.java
@@ -191,4 +191,57 @@ public class TestMetadataFilter extends AbstractTikaConfigTest {
         filter.filter(m);
         assertEquals("2021-07-23T08:02:24Z", m.get(TikaCoreProperties.CREATED));
     }
+
+    @Test
+    public void testCaptureGroupBasic() throws Exception {
+        // The media type before the ';' is copied to "mime"; the content field is untouched.
+        Metadata m = new Metadata();
+        m.set(Metadata.CONTENT_TYPE, "text/html; charset=UTF-8");
+        m.set(TikaCoreProperties.TIKA_CONTENT, "quick brown fox");
+
+        MetadataFilter captureFilter =
+                getConfig("TIKA-4133-capture-group.xml").getMetadataFilter();
+        captureFilter.filter(m);
+        assertEquals("text/html", m.get("mime"));
+        assertEquals("quick brown fox", m.get(TikaCoreProperties.TIKA_CONTENT));
+    }
+
+    @Test
+    public void testCaptureGroupNoSemiColon() throws Exception {
+        // Without a ';' the capture group spans the whole Content-Type value.
+        Metadata m = new Metadata();
+        m.set(Metadata.CONTENT_TYPE, "text/html");
+        m.set(TikaCoreProperties.TIKA_CONTENT, "quick brown fox");
+
+        MetadataFilter captureFilter =
+                getConfig("TIKA-4133-capture-group.xml").getMetadataFilter();
+        captureFilter.filter(m);
+        assertEquals("text/html", m.get("mime"));
+        assertEquals("quick brown fox", m.get(TikaCoreProperties.TIKA_CONTENT));
+    }
+
+    @Test
+    public void testCaptureGroupOverwrite() throws Exception {
+        // this config uses Content-Type as both the source and the target field
+        TikaConfig config = getConfig("TIKA-4133-capture-group-overwrite.xml");
+        Metadata metadata = new Metadata();
+        metadata.set(TikaCoreProperties.TIKA_CONTENT, "quick brown fox");
+        metadata.set(Metadata.CONTENT_TYPE, "text/html; charset=UTF-8");
+
+        MetadataFilter filter = config.getMetadataFilter();
+        filter.filter(metadata);
+        assertEquals("quick brown fox", metadata.get(TikaCoreProperties.TIKA_CONTENT));
+        assertEquals("text/html", metadata.get(Metadata.CONTENT_TYPE));
+
+        // now test that a single match overwrites all the values: the extra values
+        // go into Content-Type itself, the field the filter reads and rewrites
+        metadata.set(Metadata.CONTENT_TYPE, "text/html; charset=UTF-8");
+        metadata.add(Metadata.CONTENT_TYPE, "text/plain; charset=UTF-8");
+        metadata.add(Metadata.CONTENT_TYPE, "application/pdf; charset=UTF-8");
+        assertEquals(3, metadata.getValues(Metadata.CONTENT_TYPE).length);
+        filter.filter(metadata);
+        assertEquals(1, metadata.getValues(Metadata.CONTENT_TYPE).length);
+        assertEquals("text/html", metadata.get(Metadata.CONTENT_TYPE));
+    }
+
 }
diff --git a/tika-core/src/test/resources/org/apache/tika/config/TIKA-4133-capture-group-overwrite.xml b/tika-core/src/test/resources/org/apache/tika/config/TIKA-4133-capture-group-overwrite.xml
new file mode 100644
index 000000000..b43655840
--- /dev/null
+++ b/tika-core/src/test/resources/org/apache/tika/config/TIKA-4133-capture-group-overwrite.xml
@@ -0,0 +1,26 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<!-- Test config: source and target are both Content-Type, so the part of the value
+     before the first ';' replaces the field's existing value(s). -->
+<properties>
+  <metadataFilters>
+    <metadataFilter class="org.apache.tika.metadata.filter.CaptureGroupMetadataFilter">
+      <sourceField>Content-Type</sourceField>
+      <targetField>Content-Type</targetField>
+      <regex>\A([^;]+)</regex>
+    </metadataFilter>
+  </metadataFilters>
+</properties>
diff --git a/tika-core/src/test/resources/org/apache/tika/config/TIKA-4133-capture-group.xml b/tika-core/src/test/resources/org/apache/tika/config/TIKA-4133-capture-group.xml
new file mode 100644
index 000000000..7ad7378e0
--- /dev/null
+++ b/tika-core/src/test/resources/org/apache/tika/config/TIKA-4133-capture-group.xml
@@ -0,0 +1,26 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<!-- Test config: copies the part of Content-Type before the first ';' into "mime". -->
+<properties>
+  <metadataFilters>
+    <metadataFilter class="org.apache.tika.metadata.filter.CaptureGroupMetadataFilter">
+      <sourceField>Content-Type</sourceField>
+      <targetField>mime</targetField>
+      <regex>\A([^;]+)</regex>
+    </metadataFilter>
+  </metadataFilters>
+</properties>