You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2017/12/18 15:50:05 UTC

[nutch] 21/23: NUTCH-2322 URL not available for Jexl operations - apply patch contributed by Markus Jelsma

This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git

commit 22fc7f0defb22588c4ade33b5693303f18d96253
Author: Sebastian Nagel <sn...@apache.org>
AuthorDate: Sun Dec 17 15:32:04 2017 +0100

    NUTCH-2322 URL not available for Jexl operations
    - apply patch contributed by Markus Jelsma
---
 src/java/org/apache/nutch/crawl/CrawlDatum.java    | 18 ++++++++++++------
 src/java/org/apache/nutch/crawl/CrawlDbReader.java |  2 +-
 src/java/org/apache/nutch/crawl/Generator.java     |  2 +-
 3 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/src/java/org/apache/nutch/crawl/CrawlDatum.java b/src/java/org/apache/nutch/crawl/CrawlDatum.java
index e54c791..1facf0a 100644
--- a/src/java/org/apache/nutch/crawl/CrawlDatum.java
+++ b/src/java/org/apache/nutch/crawl/CrawlDatum.java
@@ -23,14 +23,15 @@ import java.util.Map.Entry;
 
 import org.apache.commons.jexl2.JexlContext;
 import org.apache.commons.jexl2.Expression;
-import org.apache.commons.jexl2.JexlEngine;
 import org.apache.commons.jexl2.MapContext;
 
 import org.apache.hadoop.io.*;
 import org.apache.nutch.util.*;
+import org.apache.nutch.protocol.ProtocolStatus;
 
 /* The crawl state of a url. */
 public class CrawlDatum implements WritableComparable<CrawlDatum>, Cloneable {
+
   public static final String GENERATE_DIR_NAME = "crawl_generate";
   public static final String FETCH_DIR_NAME = "crawl_fetch";
   public static final String PARSE_DIR_NAME = "crawl_parse";
@@ -525,12 +526,13 @@ public class CrawlDatum implements WritableComparable<CrawlDatum>, Cloneable {
     }
   }
   
-  public boolean evaluate(Expression expr) {
-    if (expr != null) {
+  public boolean evaluate(Expression expr, String url) {
+    if (expr != null && url != null) {
       // Create a context and add data
       JexlContext jcontext = new MapContext();
       
       // https://issues.apache.org/jira/browse/NUTCH-2229
+      jcontext.set("url", url);
       jcontext.set("status", getStatusName(getStatus()));
       jcontext.set("fetchTime", (long)(getFetchTime()));
       jcontext.set("modifiedTime", (long)(getModifiedTime()));
@@ -542,24 +544,28 @@ public class CrawlDatum implements WritableComparable<CrawlDatum>, Cloneable {
       // Set metadata variables
       for (Map.Entry<Writable, Writable> entry : getMetaData().entrySet()) {
         Object value = entry.getValue();
+        Text tkey = (Text)entry.getKey();
         
         if (value instanceof FloatWritable) {
           FloatWritable fvalue = (FloatWritable)value;
-          Text tkey = (Text)entry.getKey();
           jcontext.set(tkey.toString(), fvalue.get());
         }
         
         if (value instanceof IntWritable) {
           IntWritable ivalue = (IntWritable)value;
-          Text tkey = (Text)entry.getKey();
           jcontext.set(tkey.toString(), ivalue.get());
         }
         
         if (value instanceof Text) {
           Text tvalue = (Text)value;
-          Text tkey = (Text)entry.getKey();     
           jcontext.set(tkey.toString().replace("-", "_"), tvalue.toString());
         }
+        
+        if (value instanceof ProtocolStatus) {
+          ProtocolStatus pvalue = (ProtocolStatus)value;
+          jcontext.set(tkey.toString().replace("-", "_"), pvalue.toString());
+        }
+
       }
                   
       try {
diff --git a/src/java/org/apache/nutch/crawl/CrawlDbReader.java b/src/java/org/apache/nutch/crawl/CrawlDbReader.java
index af30664..ddd25ef 100644
--- a/src/java/org/apache/nutch/crawl/CrawlDbReader.java
+++ b/src/java/org/apache/nutch/crawl/CrawlDbReader.java
@@ -700,7 +700,7 @@ public class CrawlDbReader extends Configured implements Closeable, Tool {
       
       // check expr
       if (expr != null) {
-        if (!value.evaluate(expr)) {
+        if (!value.evaluate(expr, key.toString())) {
           return;
         }
       }
diff --git a/src/java/org/apache/nutch/crawl/Generator.java b/src/java/org/apache/nutch/crawl/Generator.java
index e5f4831..d85d578 100644
--- a/src/java/org/apache/nutch/crawl/Generator.java
+++ b/src/java/org/apache/nutch/crawl/Generator.java
@@ -252,7 +252,7 @@ public class Generator extends NutchTool implements Tool {
       
       // check expr
       if (expr != null) {
-        if (!crawlDatum.evaluate(expr)) {
+        if (!crawlDatum.evaluate(expr, key.toString())) {
           return;
         }
       }

-- 
To stop receiving notification emails like this one, please contact
"commits@nutch.apache.org" <co...@nutch.apache.org>.