You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/07/23 21:22:32 UTC

[tika] branch main updated: TIKA-3496 -- add a metadatafilter to allow users to ensure that dates emitted to Solr/OpenSearch are all UTC 'Z' formatted.

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 381b36e  TIKA-3496 -- add a metadatafilter to allow users to ensure that dates emitted to Solr/OpenSearch are all UTC 'Z' formatted.
381b36e is described below

commit 381b36e8cc9a83fb34f22a74eaf20b86c78b6274
Author: tallison <ta...@apache.org>
AuthorDate: Fri Jul 23 17:22:02 2021 -0400

    TIKA-3496 -- add a metadatafilter to allow users to ensure that dates emitted to Solr/OpenSearch are all UTC 'Z' formatted.
---
 CHANGES.txt                                        |  5 ++
 .../java/org/apache/tika/metadata/Property.java    | 18 ++---
 .../filter/DateNormalizingMetadataFilter.java      | 90 ++++++++++++++++++++++
 .../tika/metadata/filter/TestMetadataFilter.java   | 12 +++
 .../opensearch/tika-config-opensearch.xml          |  5 ++
 .../src/test/resources/tika-config-solr-urls.xml   |  5 ++
 .../pipes/emitter/solr/SolrEmitterDevTest.java     | 62 +++++++++++++++
 7 files changed, 188 insertions(+), 9 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 1d7818f..fe0ea2a 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,10 @@
 Release 2.0.1 - ???
 
+   * Add DateNormalizingMetadataFilter to let users ensure that all dates
+     emitted to Solr/OpenSearch are in UTC. Users can configure which
+     timezone they'd like to use in cases where the file format does
+     not store a timezone (TIKA-3496).
+
    * Breaking change in the Solr and OpenSearch emitters. To achieve
      the SKIP or CONCATENATE attachment strategy, modify the
      parseMode in the pipesiterators or in the FetchEmitTuple (TIKA-3494).
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Property.java b/tika-core/src/main/java/org/apache/tika/metadata/Property.java
index 40ccb68..3d75ad1 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/Property.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/Property.java
@@ -18,12 +18,12 @@ package org.apache.tika.metadata;
 
 import java.util.Arrays;
 import java.util.Collections;
-import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Map;
 import java.util.Set;
 import java.util.SortedSet;
 import java.util.TreeSet;
+import java.util.concurrent.ConcurrentHashMap;
 
 /**
  * XMP property definition. Each instance of this class defines a single
@@ -36,7 +36,7 @@ import java.util.TreeSet;
  */
 public final class Property implements Comparable<Property> {
 
-    private static final Map<String, Property> properties = new HashMap<>();
+    private static final Map<String, Property> PROPERTIES = new ConcurrentHashMap<>();
     private final String name;
     private final boolean internal;
     private final PropertyType propertyType;
@@ -70,8 +70,8 @@ public final class Property implements Comparable<Property> {
             this.secondaryExtractProperties = null;
 
             // Only store primary properties for lookup, not composites
-            synchronized (properties) {
-                properties.put(name, this);
+            synchronized (PROPERTIES) {
+                PROPERTIES.put(name, this);
             }
         }
     }
@@ -102,7 +102,7 @@ public final class Property implements Comparable<Property> {
      */
     public static PropertyType getPropertyType(String key) {
         PropertyType type = null;
-        Property prop = properties.get(key);
+        Property prop = PROPERTIES.get(key);
         if (prop != null) {
             type = prop.getPropertyType();
         }
@@ -116,16 +116,16 @@ public final class Property implements Comparable<Property> {
      * @return the Property object
      */
     public static Property get(String key) {
-        return properties.get(key);
+        return PROPERTIES.get(key);
     }
 
     public static SortedSet<Property> getProperties(String prefix) {
         SortedSet<Property> set = new TreeSet<>();
         String p = prefix + ":";
-        synchronized (properties) {
-            for (String name : properties.keySet()) {
+        synchronized (PROPERTIES) {
+            for (String name : PROPERTIES.keySet()) {
                 if (name.startsWith(p)) {
-                    set.add(properties.get(name));
+                    set.add(PROPERTIES.get(name));
                 }
             }
         }
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/DateNormalizingMetadataFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/DateNormalizingMetadataFilter.java
new file mode 100644
index 0000000..e093873
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/DateNormalizingMetadataFilter.java
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.metadata.filter;
+
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.time.ZoneId;
+import java.util.Date;
+import java.util.Locale;
+import java.util.TimeZone;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.tika.config.Field;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+
+/**
+ * Some dates in some file formats do not have a timezone.
+ * Tika correctly stores these without a timezone, e.g. 'yyyy-MM-dd'T'HH:mm:ss'
+ * This can be a problem if end points expect a 'Z' timezone.
+ * By default, this filter assumes that dates without timezones are UTC
+ * and always rewrites such dates as: "yyyy-MM-dd'T'HH:mm:ss'Z'"
+ *
+ * Users can specify an alternate defaultTimeZone with
+ * {@link DateNormalizingMetadataFilter#setDefaultTimeZone(String)} to apply
+ * if the file format does not specify a timezone.
+ *
+ */
+public class DateNormalizingMetadataFilter extends MetadataFilter {
+
+    private static TimeZone UTC = TimeZone.getTimeZone("UTC");
+
+    private static final Logger LOGGER = LoggerFactory.getLogger(DateNormalizingMetadataFilter.class);
+
+    private TimeZone defaultTimeZone = UTC;
+
+    @Override
+    public void filter(Metadata metadata) throws TikaException {
+        SimpleDateFormat dateFormatter = null;
+        SimpleDateFormat utcFormatter = null;
+        for (String n : metadata.names()) {
+
+            Property property = Property.get(n);
+            if (property != null) {
+                if (property.getValueType().equals(Property.ValueType.DATE)) {
+                    String dateString = metadata.get(property);
+                    if (dateString.endsWith("Z")) {
+                        continue;
+                    }
+                    if (dateFormatter == null) {
+                        dateFormatter = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss", Locale.US);
+                        dateFormatter.setTimeZone(defaultTimeZone);
+                        utcFormatter = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'", Locale.US);
+                        utcFormatter.setTimeZone(UTC);
+                    }
+                    Date d = null;
+                    try {
+                        d = dateFormatter.parse(dateString);
+                        metadata.set(property, utcFormatter.format(d));
+                    } catch (ParseException e) {
+                        LOGGER.warn("Couldn't convert date to default time zone: >"
+                                + dateString + "<");
+                    }
+                }
+            }
+        }
+    }
+
+    @Field
+    public void setDefaultTimeZone(String timeZoneId) {
+        this.defaultTimeZone = TimeZone.getTimeZone(ZoneId.of(timeZoneId));
+    }
+}
diff --git a/tika-core/src/test/java/org/apache/tika/metadata/filter/TestMetadataFilter.java b/tika-core/src/test/java/org/apache/tika/metadata/filter/TestMetadataFilter.java
index d77e373..c7368df 100644
--- a/tika-core/src/test/java/org/apache/tika/metadata/filter/TestMetadataFilter.java
+++ b/tika-core/src/test/java/org/apache/tika/metadata/filter/TestMetadataFilter.java
@@ -179,4 +179,16 @@ public class TestMetadataFilter extends AbstractTikaConfigTest {
         assertNull(metadata.get("author"));
         assertNull(metadata.get("a"));
     }
+
+    @Test
+    public void testDateNormalizingFilter() throws Exception {
+        //test that a Date lacking a timezone, if interpreted as Los Angeles, for example,
+        //yields a UTC string that is properly +7 hours.
+        Metadata m = new Metadata();
+        m.set(TikaCoreProperties.CREATED, "2021-07-23T01:02:24");
+        DateNormalizingMetadataFilter filter = new DateNormalizingMetadataFilter();
+        filter.setDefaultTimeZone("America/Los_Angeles");
+        filter.filter(m);
+        assertEquals("2021-07-23T08:02:24Z", m.get(TikaCoreProperties.CREATED));
+    }
 }
diff --git a/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/opensearch/tika-config-opensearch.xml b/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/opensearch/tika-config-opensearch.xml
index af0b53b..df9452a 100644
--- a/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/opensearch/tika-config-opensearch.xml
+++ b/tika-integration-tests/tika-pipes-opensearch-integration-tests/src/test/resources/opensearch/tika-config-opensearch.xml
@@ -45,6 +45,11 @@
     </parser>
   </parsers>
   <metadataFilters>
+    <!-- depending on the file format, some dates do not have a timezone. This
+     filter arbitrarily assumes dates have a UTC timezone and will format all
+     dates as yyyy-MM-dd'T'HH:mm:ss'Z' whether or not they actually have a timezone.
+     -->
+    <metadataFilter class="org.apache.tika.metadata.filter.DateNormalizingMetadataFilter"/>
     <metadataFilter class="org.apache.tika.metadata.filter.FieldNameMappingFilter">
       <params>
         <excludeUnmapped>true</excludeUnmapped>
diff --git a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/tika-config-solr-urls.xml b/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/tika-config-solr-urls.xml
index 5f2740f..7517a9c 100644
--- a/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/tika-config-solr-urls.xml
+++ b/tika-integration-tests/tika-pipes-solr-integration-tests/src/test/resources/tika-config-solr-urls.xml
@@ -45,6 +45,11 @@
     </parser>
   </parsers>
   <metadataFilters>
+    <!-- depending on the file format, some dates do not have a timezone. This
+         filter arbitrarily assumes dates have a UTC timezone and will format all
+         dates as yyyy-MM-dd'T'HH:mm:ss'Z' whether or not they actually have a timezone.
+         -->
+    <metadataFilter class="org.apache.tika.metadata.filter.DateNormalizingMetadataFilter"/>
     <metadataFilter class="org.apache.tika.metadata.filter.FieldNameMappingFilter">
       <params>
         <excludeUnmapped>true</excludeUnmapped>
diff --git a/tika-pipes/tika-emitters/tika-emitter-solr/src/test/java/org/apache/tika/pipes/emitter/solr/SolrEmitterDevTest.java b/tika-pipes/tika-emitters/tika-emitter-solr/src/test/java/org/apache/tika/pipes/emitter/solr/SolrEmitterDevTest.java
new file mode 100644
index 0000000..779f8bb
--- /dev/null
+++ b/tika-pipes/tika-emitters/tika-emitter-solr/src/test/java/org/apache/tika/pipes/emitter/solr/SolrEmitterDevTest.java
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.pipes.emitter.solr;
+
+import java.util.Collections;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.junit.Ignore;
+import org.junit.Test;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.filter.FieldNameMappingFilter;
+
+/**
+ * This is meant only for one-off development tests with a locally
+ * running instance of Solr.  Please add unit tests to the
+ * tika-integration-tests/solr-*
+ */
+@Ignore
+public class SolrEmitterDevTest {
+
+    @Test
+    public void oneOff() throws Exception {
+        String core = "tika-example";
+        String url = "http://localhost:8983/solr";
+        String emitKey = "one-off-test-doc";
+        SolrEmitter solrEmitter = new SolrEmitter();
+        solrEmitter.setSolrUrls(Collections.singletonList(url));
+        solrEmitter.setSolrCollection(core);
+        solrEmitter.initialize(Collections.EMPTY_MAP);
+
+        Metadata metadata = new Metadata();
+        metadata.set(TikaCoreProperties.CREATED, new Date());
+        metadata.set(TikaCoreProperties.TIKA_CONTENT, "the quick brown fox");
+
+        Map<String, String> mappings = new HashMap();
+        FieldNameMappingFilter filter = new FieldNameMappingFilter();
+        mappings.put(TikaCoreProperties.CREATED.getName(), "created");
+        mappings.put(TikaCoreProperties.TIKA_CONTENT.getName(), "content");
+        filter.setMappings(mappings);
+        filter.filter(metadata);
+
+        solrEmitter.emit(emitKey, Collections.singletonList(metadata));
+    }
+}