You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2019/01/18 15:24:19 UTC

[nutch] branch master updated: NUTCH-2663 Improve the JEXL syntax for getting values from the metadata/context

This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git


The following commit(s) were added to refs/heads/master by this push:
     new b2ec5c4  NUTCH-2663 Improve the JEXL syntax for getting values from the metadata/context
     new ac2d578  Merge pull request #400 from jorgelbg/jexl-improve-syntax
b2ec5c4 is described below

commit b2ec5c4c1f19a704984f71e9b1c10d489017ae01
Author: Jorge Luis Betancourt Gonzalez <jo...@trivago.com>
AuthorDate: Thu Oct 18 15:13:25 2018 +0200

    NUTCH-2663 Improve the JEXL syntax for getting values from the metadata/context
---
 src/java/org/apache/nutch/util/JexlUtil.java       | 28 ++++++++++++----------
 .../nutch/indexer/jexl/JexlIndexingFilter.java     | 27 ++++++++++++++-------
 .../nutch/indexer/jexl/TestJexlIndexingFilter.java |  6 ++---
 3 files changed, 37 insertions(+), 24 deletions(-)

diff --git a/src/java/org/apache/nutch/util/JexlUtil.java b/src/java/org/apache/nutch/util/JexlUtil.java
index f5e67cc..b480033 100644
--- a/src/java/org/apache/nutch/util/JexlUtil.java
+++ b/src/java/org/apache/nutch/util/JexlUtil.java
@@ -29,45 +29,47 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 /**
- * A collection of Jexl utilit(y|ies).
+ * Utility methods for handling JEXL expressions.
  */
 public class JexlUtil {
 
   private static final Logger LOG = LoggerFactory
       .getLogger(MethodHandles.lookup().lookupClass());
 
-  /**
-   * 
-   */
-  public static Pattern datePattern = Pattern.compile("\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}Z");
+  /** Supported date format: yyyy-MM-dd'T'HH:mm:ss'Z', for example 2016-03-20T00:00:00Z. */
+  private static final Pattern DATE_PATTERN = Pattern.compile("\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}Z");
 
   /**
-   * Parses the given experssion to a Jexl expression. This supports
+   * Parses the given expression to a JEXL expression. This supports
    * date parsing.
    *
-   * @param expr the Jexl expression
-   * @return parsed Jexl expression or null in case of parse error
+   * @param expr string JEXL expression
+   * @return parsed JEXL expression or null in case of parse error
    */
   public static Expression parseExpression(String expr) {
     if (expr == null) return null;
     
     try {
-      // Translate any date object into a long, dates must be specified as 20-03-2016T00:00:00Z
-      Matcher matcher = datePattern.matcher(expr);
+      // Translate any date object into a long. Dates must be in the DATE_PATTERN
+      // format. For example: 2016-03-20T00:00:00Z
+      Matcher matcher = DATE_PATTERN.matcher(expr);
+
       if (matcher.find()) {
         String date = matcher.group();
         
-        // Parse the thing and get epoch!
+        // parse the matched substring and get the epoch
         Date parsedDate = DateUtils.parseDateStrictly(date, new String[] {"yyyy-MM-dd'T'HH:mm:ss'Z'"});
         long time = parsedDate.getTime();
         
-        // Replace in the original expression
+        // replace the original string date with the numeric value
         expr = expr.replace(date, Long.toString(time));
       }
-      
+
       JexlEngine jexl = new JexlEngine();
+
       jexl.setSilent(true);
       jexl.setStrict(true);
+
       return jexl.createExpression(expr);
     } catch (Exception e) {
       LOG.error(e.getMessage());
diff --git a/src/plugin/index-jexl-filter/src/java/org/apache/nutch/indexer/jexl/JexlIndexingFilter.java b/src/plugin/index-jexl-filter/src/java/org/apache/nutch/indexer/jexl/JexlIndexingFilter.java
index 3fa2294..890020a 100644
--- a/src/plugin/index-jexl-filter/src/java/org/apache/nutch/indexer/jexl/JexlIndexingFilter.java
+++ b/src/plugin/index-jexl-filter/src/java/org/apache/nutch/indexer/jexl/JexlIndexingFilter.java
@@ -18,6 +18,7 @@
 package org.apache.nutch.indexer.jexl;
 
 import java.lang.invoke.MethodHandles;
+import java.util.List;
 import java.util.Map.Entry;
 
 import org.apache.commons.jexl2.Expression;
@@ -41,7 +42,6 @@ import org.slf4j.LoggerFactory;
 /**
  * An {@link org.apache.nutch.indexer.IndexingFilter} that allows filtering of
  * documents based on a JEXL expression.
- *
  */
 public class JexlIndexingFilter implements IndexingFilter {
 
@@ -84,9 +84,12 @@ public class JexlIndexingFilter implements IndexingFilter {
         metadataToContext(parse.getData().getParseMeta()));
 
     JexlContext context = new MapContext();
+
     for (Entry<String, NutchField> entry : doc) {
-      context.set(entry.getKey(), entry.getValue().getValues());
+      List<Object> values = entry.getValue().getValues();
+      context.set(entry.getKey(), values.size() > 1 ? values : values.get(0));
     }
+
     jcontext.set("doc", context);
 
     try {
@@ -102,16 +105,21 @@ public class JexlIndexingFilter implements IndexingFilter {
   @Override
   public void setConf(Configuration conf) {
     this.conf = conf;
-    String str = conf.get("index.jexl.filter");
-    if (str == null) {
-      LOG.warn(
+    String strExpr = conf.get("index.jexl.filter");
+
+    if (strExpr == null) {
+      LOG.error(
           "The property index.jexl.filter must have a value when index-jexl-filter is used. You can use 'true' or 'false' to index all/none");
+
       throw new RuntimeException(
           "The property index.jexl.filter must have a value when index-jexl-filter is used. You can use 'true' or 'false' to index all/none");
     }
-    expr = JexlUtil.parseExpression(str);
+
+    expr = JexlUtil.parseExpression(strExpr);
+
     if (expr == null) {
-      LOG.warn("Failed parsing JEXL from index.jexl.filter: {}", str);
+      LOG.error("Failed parsing JEXL from index.jexl.filter: {}", strExpr);
+
       throw new RuntimeException("Failed parsing JEXL from index.jexl.filter");
     }
   }
@@ -123,9 +131,12 @@ public class JexlIndexingFilter implements IndexingFilter {
 
   private JexlContext metadataToContext(Metadata metadata) {
     JexlContext context = new MapContext();
+
     for (String name : metadata.names()) {
-      context.set(name, metadata.getValues(name));
+      String[] values = metadata.getValues(name);
+      context.set(name, values.length > 1 ? values : values[0]);
     }
+
     return context;
   }
 }
diff --git a/src/plugin/index-jexl-filter/src/test/org/apache/nutch/indexer/jexl/TestJexlIndexingFilter.java b/src/plugin/index-jexl-filter/src/test/org/apache/nutch/indexer/jexl/TestJexlIndexingFilter.java
index 0427ad4..f3cc655 100644
--- a/src/plugin/index-jexl-filter/src/test/org/apache/nutch/indexer/jexl/TestJexlIndexingFilter.java
+++ b/src/plugin/index-jexl-filter/src/test/org/apache/nutch/indexer/jexl/TestJexlIndexingFilter.java
@@ -39,7 +39,7 @@ public class TestJexlIndexingFilter {
   @Test
   public void testAllowMatchingDocument() throws Exception {
     Configuration conf = NutchConfiguration.create();
-    conf.set("index.jexl.filter", "doc.lang[0]=='en'");
+    conf.set("index.jexl.filter", "doc.lang=='en'");
 
     JexlIndexingFilter filter = new JexlIndexingFilter();
     filter.setConf(conf);
@@ -73,7 +73,7 @@ public class TestJexlIndexingFilter {
   @Test
   public void testBlockNotMatchingDocuments() throws Exception {
     Configuration conf = NutchConfiguration.create();
-    conf.set("index.jexl.filter", "doc.lang[0]=='en'");
+    conf.set("index.jexl.filter", "doc.lang=='en'");
 
     JexlIndexingFilter filter = new JexlIndexingFilter();
     filter.setConf(conf);
@@ -115,7 +115,7 @@ public class TestJexlIndexingFilter {
   @Test
   public void testInvalidExpression() throws Exception {
     Configuration conf = NutchConfiguration.create();
-    conf.set("index.jexl.filter", "doc.lang[0]=<>:='en'");
+    conf.set("index.jexl.filter", "doc.lang=<>:='en'");
 
     JexlIndexingFilter filter = new JexlIndexingFilter();
     thrown.expect(RuntimeException.class);