You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ds...@apache.org on 2018/08/15 04:04:23 UTC

lucene-solr:master: SOLR-12591: ParseDateFieldUpdateProcessorFactory: Use "lenient" and strip surrounding quotes. More tests, ported from "extract" contrib stuff.

Repository: lucene-solr
Updated Branches:
  refs/heads/master 0d89ff2e6 -> ec01cc981


SOLR-12591: ParseDateFieldUpdateProcessorFactory: Use "lenient" and strip surrounding quotes.
More tests, ported from "extract" contrib stuff.


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/ec01cc98
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/ec01cc98
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/ec01cc98

Branch: refs/heads/master
Commit: ec01cc981c0ff221c79014f3665fd21c227d5651
Parents: 0d89ff2
Author: Bar Rotstein <ba...@gmail.com>
Authored: Wed Aug 15 00:04:09 2018 -0400
Committer: David Smiley <ds...@apache.org>
Committed: Wed Aug 15 00:04:09 2018 -0400

----------------------------------------------------------------------
 solr/CHANGES.txt                                |   4 +-
 .../ParseDateFieldUpdateProcessorFactory.java   |  20 +++-
 ...lrconfig-parsing-update-processor-chains.xml |  15 +++
 .../ParsingFieldUpdateProcessorsTest.java       | 117 ++++++++++++++++++-
 .../src/update-request-processors.adoc          |   2 +-
 5 files changed, 149 insertions(+), 9 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ec01cc98/solr/CHANGES.txt
----------------------------------------------------------------------
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index d165e21..f515c7e 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -55,8 +55,8 @@ Other Changes
 
 * SOLR-12614: Make "Nodes" view the default in AdminUI "Cloud" tab (janhoy)
 
-* SOLR-12586: Remove Joda Time dependency.  Upgrade ParseDateFieldUpdateProcessorFactory (present in "schemaless mode")
-  to use Java 8's java.time.DateTimeFormatter instead (see upgrade notes).
+* SOLR-12586, SOLR-12591: Upgrade ParseDateFieldUpdateProcessorFactory (present in "schemaless mode") to use Java 8's
+  java.time.DateTimeFormatter instead of Joda time (see upgrade notes).  "Lenient" is enabled.  Removed Joda Time dependency.
   (David Smiley, Bar Rotstein)
 
 ==================  7.5.0 ==================

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ec01cc98/solr/core/src/java/org/apache/solr/update/processor/ParseDateFieldUpdateProcessorFactory.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/update/processor/ParseDateFieldUpdateProcessorFactory.java b/solr/core/src/java/org/apache/solr/update/processor/ParseDateFieldUpdateProcessorFactory.java
index f0ea5d2..2561fdb 100644
--- a/solr/core/src/java/org/apache/solr/update/processor/ParseDateFieldUpdateProcessorFactory.java
+++ b/solr/core/src/java/org/apache/solr/update/processor/ParseDateFieldUpdateProcessorFactory.java
@@ -26,6 +26,7 @@ import java.time.ZoneOffset;
 import java.time.format.DateTimeFormatter;
 import java.time.format.DateTimeFormatterBuilder;
 import java.time.format.DateTimeParseException;
+import java.time.format.ResolverStyle;
 import java.time.temporal.TemporalAccessor;
 import java.time.temporal.TemporalQueries;
 import java.util.Collection;
@@ -51,7 +52,7 @@ import org.slf4j.LoggerFactory;
  * Attempts to mutate selected fields that have only CharSequence-typed values
  * into Date values.  Solr will continue to index date/times in the UTC time
  * zone, but the input date/times may be expressed using other time zones,
- * and will be converted to UTC when they are mutated.
+ * and will be converted to an unambiguous {@link Date} when they are mutated.
  * </p>
  * <p>
  * The default selection behavior is to mutate both those fields that don't match
@@ -67,6 +68,8 @@ import org.slf4j.LoggerFactory;
  * One or more date "format" specifiers must be specified.  See 
  * <a href="https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html"
  * >Java 8's DateTimeFormatter javadocs</a> for a description of format strings.
+ * Note that "lenient" parsing and case insensitivity are enabled.
+ * Furthermore, single quotes surrounding an input value will be stripped if found.
  * </p>
  * <p>
  * A default time zone name or offset may optionally be specified for those dates
@@ -120,6 +123,16 @@ public class ParseDateFieldUpdateProcessorFactory extends FieldMutatingUpdatePro
       protected Object mutateValue(Object srcVal) {
         if (srcVal instanceof CharSequence) {
           String srcStringVal = srcVal.toString();
+          // trim single quotes around date if present
+          // see issue #5279  (Apache HttpClient)
+          int stringValLen = srcStringVal.length();
+          if (stringValLen > 1
+              && srcStringVal.startsWith("'")
+              && srcStringVal.endsWith("'")
+          ) {
+            srcStringVal = srcStringVal.substring(1, stringValLen - 1);
+          }
+
           for (Map.Entry<String,DateTimeFormatter> format : formats.entrySet()) {
             DateTimeFormatter parser = format.getValue();
             try {
@@ -159,8 +172,9 @@ public class ParseDateFieldUpdateProcessorFactory extends FieldMutatingUpdatePro
     Collection<String> formatsParam = args.removeConfigArgs(FORMATS_PARAM);
     if (null != formatsParam) {
       for (String value : formatsParam) {
-        DateTimeFormatter formatter = new DateTimeFormatterBuilder().parseCaseInsensitive()
-            .appendPattern(value).toFormatter(locale).withZone(defaultTimeZone);
+        DateTimeFormatter formatter = new DateTimeFormatterBuilder().parseLenient().parseCaseInsensitive()
+            .appendPattern(value).toFormatter(locale)
+            .withResolverStyle(ResolverStyle.LENIENT).withZone(defaultTimeZone);
         validateFormatter(formatter);
         formats.put(value, formatter);
       }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ec01cc98/solr/core/src/test-files/solr/collection1/conf/solrconfig-parsing-update-processor-chains.xml
----------------------------------------------------------------------
diff --git a/solr/core/src/test-files/solr/collection1/conf/solrconfig-parsing-update-processor-chains.xml b/solr/core/src/test-files/solr/collection1/conf/solrconfig-parsing-update-processor-chains.xml
index 83be4ee..6914ba2 100644
--- a/solr/core/src/test-files/solr/collection1/conf/solrconfig-parsing-update-processor-chains.xml
+++ b/solr/core/src/test-files/solr/collection1/conf/solrconfig-parsing-update-processor-chains.xml
@@ -109,6 +109,21 @@
     </processor>
   </updateRequestProcessorChain>
 
+  <updateRequestProcessorChain name="parse-date-patterns-from-extract-contrib">
+    <processor class="solr.ParseDateFieldUpdateProcessorFactory">
+      <str name="defaultTimeZone">UTC</str>
+      <str name="locale">en</str>
+      <arr name="format">
+        <str>yyyy-MM-dd['T'[HH:mm:ss['.'SSS][z</str>
+        <str>yyyy-MM-dd HH:mm:ss</str>
+        <str>EEE MMM d HH:mm:ss [z ]yyyy</str>
+        <str>EEEE, dd-MMM-yy HH:mm:ss zzz</str>
+        <str>EEE, dd MMM yyyy HH:mm:ss zzz</str>
+      </arr>
+    </processor>
+    <processor class="solr.RunUpdateProcessorFactory" />
+  </updateRequestProcessorChain>
+
   <updateRequestProcessorChain name="parse-int">
     <processor class="solr.ParseIntFieldUpdateProcessorFactory"/>
     <processor class="solr.RunUpdateProcessorFactory"/>

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ec01cc98/solr/core/src/test/org/apache/solr/update/processor/ParsingFieldUpdateProcessorsTest.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/update/processor/ParsingFieldUpdateProcessorsTest.java b/solr/core/src/test/org/apache/solr/update/processor/ParsingFieldUpdateProcessorsTest.java
index e26ca41..334b14a 100644
--- a/solr/core/src/test/org/apache/solr/update/processor/ParsingFieldUpdateProcessorsTest.java
+++ b/solr/core/src/test/org/apache/solr/update/processor/ParsingFieldUpdateProcessorsTest.java
@@ -16,6 +16,7 @@
  */
 package org.apache.solr.update.processor;
 
+import java.io.IOException;
 import java.time.Instant;
 import java.time.LocalDate;
 import java.time.LocalDateTime;
@@ -36,7 +37,6 @@ import java.util.Set;
 import org.apache.solr.common.SolrInputDocument;
 import org.apache.solr.schema.IndexSchema;
 import org.junit.BeforeClass;
-
 /**
  * Tests for the field mutating update processors
  * that parse Dates, Longs, Doubles, and Booleans.
@@ -896,13 +896,124 @@ public class ParsingFieldUpdateProcessorsTest extends UpdateProcessorTestBase {
     assertTrue(mixedDates.isEmpty());
   }
 
-  private Date parse(DateTimeFormatter dateTimeFormatter, String dateString) {
+  // tests that mimic the tests that were in TestExtractionDateUtil
+  public void testISO8601() throws IOException {
+    // dates with atypical years
+    // This test tries to mimic TestExtractionDateUtil#testISO8601
+
+    String[] dateStrings = {
+        "0001-01-01T01:01:01Z", "+12021-12-01T03:03:03Z",
+        "0000-04-04T04:04:04Z", "-0005-05-05T05:05:05Z",
+        "-2021-12-01T04:04:04Z", "-12021-12-01T02:02:02Z"
+    };
+
+    int id = 1;
+
+    // ensure strings are parsed
+    for(String notInFormatDateString: dateStrings) {
+      IndexSchema schema = h.getCore().getLatestSchema();
+      assertNotNull(schema.getFieldOrNull("date_dt")); // should match "*_dt" dynamic field
+      SolrInputDocument d = processAdd("parse-date-patterns-from-extract-contrib", doc(f("id", id), f("date_dt", notInFormatDateString)));
+      assertNotNull(d);
+      assertTrue("Date string: " + notInFormatDateString + " was not parsed as a date", d.getFieldValue("date_dt") instanceof Date);
+      assertEquals(notInFormatDateString, ((Date) d.getField("date_dt").getFirstValue()).toInstant().toString());
+      assertU(commit());
+      assertQ(req("id:" + id), "//date[@name='date_dt'][.='" + notInFormatDateString + "']");
+      ++id;
+    }
+
+    // entries come in pairs: an input date string followed by its expected parsed string
+    String[] lenientDateStrings = {
+        "10995-12-31T23:59:59.990Z", "+10995-12-31T23:59:59.990Z",
+        "995-1-2T3:4:5Z", "0995-01-02T03:04:05Z",
+        "2021-01-01t03:04:05", "2021-01-01T03:04:05Z",
+        "2021-12-01 04:04:04", "2021-12-01T04:04:04Z"
+    };
+
+    // ensure strings that should be parsed using the lenient resolver are properly parsed
+    for(int i = 0; i < lenientDateStrings.length; ++i) {
+      String lenientDateString = lenientDateStrings[i];
+      String expectedString = lenientDateStrings[++i];
+      IndexSchema schema = h.getCore().getLatestSchema();
+      assertNotNull(schema.getFieldOrNull("date_dt")); // should match "*_dt" dynamic field
+      SolrInputDocument d = processAdd("parse-date-patterns-from-extract-contrib", doc(f("id", id), f("date_dt", lenientDateString)));
+      assertNotNull(d);
+      assertTrue("Date string: " + lenientDateString + " was not parsed as a date",
+          d.getFieldValue("date_dt") instanceof Date);
+      assertEquals(expectedString, ((Date) d.getField("date_dt").getFirstValue()).toInstant().toString());
+      ++id;
+    }
+  }
+
+  // this test has had problems when the JDK timezone is America/Metlakatla
+  public void testAKSTZone() throws IOException {
+    final String inputString = "Thu Nov 13 04:35:51 AKST 2008";
+
+    final long expectTs = 1226583351000L;
+    assertEquals(expectTs,
+        DateTimeFormatter.ofPattern("EEE MMM d HH:mm:ss z yyyy", Locale.ENGLISH)
+            .withZone(ZoneId.of("UTC")).parse(inputString, Instant::from).toEpochMilli());
+
+    assertParsedDate(inputString, Date.from(Instant.ofEpochMilli(expectTs)), "parse-date-patterns-from-extract-contrib");
+  }
+
+  public void testNoTime() throws IOException {
+    Instant instant = instant(2005, 10, 7, 0, 0, 0);
+    String inputString = "2005-10-07";
+    assertParsedDate(inputString, Date.from(instant), "parse-date-patterns-from-extract-contrib");
+  }
+
+  public void testRfc1123() throws IOException {
+    assertParsedDate("Fri, 07 Oct 2005 13:14:15 GMT", Date.from(inst20051007131415()), "parse-date-patterns-from-extract-contrib");
+  }
+
+  public void testRfc1036() throws IOException {
+    assertParsedDate("Friday, 07-Oct-05 13:14:15 GMT", Date.from(inst20051007131415()), "parse-date-patterns-from-extract-contrib");
+  }
+
+  public void testAnsiC() throws IOException {
+    assertParsedDate(
+        "Fri Oct 7 13:14:15 2005", Date.from(inst20051007131415()), "parse-date-patterns-from-extract-contrib");
+
+    assertParsedDate("Fri Oct 7 05:14:15 AKDT 2005", Date.from(inst20051007131415()), "parse-date-patterns-from-extract-contrib"); // with timezone (not ANSI C) in DST
+  }
+
+  public void testLenient() throws IOException {
+    // the ANSI C format, but the input here has a longer day of week
+    assertParsedDate("Friday Oct 7 13:14:15 2005", Date.from(inst20051007131415()), "parse-date-patterns-from-extract-contrib");
+  }
+
+  public void testParseQuotedDate() throws IOException {
+    // also using 2 digit day
+    assertParsedDate("'Fri, 14 Oct 2005 13:14:15 GMT'",
+        Date.from(instant(2005, 10, 14, 13, 14, 15)), "parse-date-patterns-from-extract-contrib");
+  }
+
+  private static Instant instant(final int year, final int month, final int day, int hour, int minute, int second) {
+    return LocalDate.of(year, month, day).atTime(hour, minute, second).toInstant(ZoneOffset.UTC);
+  }
+
+  private Instant inst20051007131415() {
+    return instant(2005, 10, 7, 13, 14, 15);
+  }
+
+  private void assertParsedDate(String inputDateString, Date expectedDate, String chain) throws IOException {
+    IndexSchema schema = h.getCore().getLatestSchema();
+    assertNotNull(schema.getFieldOrNull("date_dt")); // should match "*_dt" dynamic field
+    SolrInputDocument d = processAdd(chain, doc(f("id", "1"), f("date_dt", inputDateString)));
+    assertNotNull(d);
+    assertTrue("Date string: " + inputDateString + " was not parsed as a date",
+        d.getFieldValue("date_dt") instanceof Date);
+    assertEquals(expectedDate, d.getField("date_dt").getFirstValue());
+  }
+
+  private static Date parse(DateTimeFormatter dateTimeFormatter, String dateString) {
     final TemporalAccessor temporalAccessor = dateTimeFormatter.parseBest(dateString, OffsetDateTime::from,
         ZonedDateTime::from, LocalDateTime::from, LocalDate::from, Instant::from);
     return temporalToDate(temporalAccessor, dateTimeFormatter.getZone());
   }
 
-  private Date temporalToDate(TemporalAccessor in, ZoneId timeZoneId) {
+  private static Date temporalToDate(TemporalAccessor in, ZoneId timeZoneId) {
     if (in instanceof OffsetDateTime) {
       return Date.from(((OffsetDateTime) in).toInstant());
     } else if (in instanceof ZonedDateTime) {

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ec01cc98/solr/solr-ref-guide/src/update-request-processors.adoc
----------------------------------------------------------------------
diff --git a/solr/solr-ref-guide/src/update-request-processors.adoc b/solr/solr-ref-guide/src/update-request-processors.adoc
index 21e56cf..267ffbd 100644
--- a/solr/solr-ref-guide/src/update-request-processors.adoc
+++ b/solr/solr-ref-guide/src/update-request-processors.adoc
@@ -317,7 +317,7 @@ These factories all provide functionality to _modify_ fields in a document as th
 
 {solr-javadocs}/solr-core/org/apache/solr/update/processor/ParseBooleanFieldUpdateProcessorFactory.html[ParseBooleanFieldUpdateProcessorFactory]:: Attempts to mutate selected fields that have only CharSequence-typed values into Boolean values.
 
-{solr-javadocs}/solr-core/org/apache/solr/update/processor/ParseDateFieldUpdateProcessorFactory.html[ParseDateFieldUpdateProcessorFactory]:: Attempts to mutate selected fields that have only CharSequence-typed values into Solr date values.
+{solr-javadocs}/solr-core/org/apache/solr/update/processor/ParseDateFieldUpdateProcessorFactory.html[ParseDateFieldUpdateProcessorFactory]:: Attempts to mutate selected fields that have only CharSequence-typed values into Date values.
 
 {solr-javadocs}/solr-core/org/apache/solr/update/processor/ParseNumericFieldUpdateProcessorFactory.html[ParseNumericFieldUpdateProcessorFactory] derived classes::