You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2016/02/24 14:05:03 UTC

svn commit: r1732140 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/crawl/CrawlDatum.java src/java/org/apache/nutch/crawl/CrawlDbReader.java

Author: markus
Date: Wed Feb 24 13:05:02 2016
New Revision: 1732140

URL: http://svn.apache.org/viewvc?rev=1732140&view=rev
Log:
NUTCH-2229 Allow Jexl expressions on CrawlDatum's fixed attributes

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java
    nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1732140&r1=1732139&r2=1732140&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed Feb 24 13:05:02 2016
@@ -10,6 +10,8 @@ in the release announcement and keep it
 
 Nutch Change Log
 
+* NUTCH-2229 Allow Jexl expressions on CrawlDatum's fixed attributes (markus)
+
 * NUTCH-2227 RegexParseFilter (markus)
 
 * NUTCH-2221 Introduce db.ignore.internal.links to FetcherThread (markus)

Modified: nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java?rev=1732140&r1=1732139&r2=1732140&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java Wed Feb 24 13:05:02 2016
@@ -521,30 +521,20 @@ public class CrawlDatum implements Writa
     }
   }
   
-  public boolean evaluate(String expr) {
-    return evaluate(expr, true, true);
-  }
-  
-  public boolean evaluate(String expr, boolean silent, boolean strict) {
-    if (expr != null) {
-      // Create or retrieve a JexlEngine
-      JexlEngine jexl = new JexlEngine();
-      
-      jexl.setSilent(silent);
-      jexl.setStrict(strict);
-      
-      // Create an expression object and evaluate
-      return evaluate(jexl.createExpression(expr));
-    }
-    
-    return false;
-  }
-  
   public boolean evaluate(Expression expr) {
     if (expr != null) {
       // Create a context and add data
       JexlContext jcontext = new MapContext();
-            
+      
+      // https://issues.apache.org/jira/browse/NUTCH-2229
+      jcontext.set("status", getStatusName(getStatus()));
+      jcontext.set("fetchTime", (long)(getFetchTime()));
+      jcontext.set("modifiedTime", (long)(getModifiedTime()));
+      jcontext.set("retries", getRetriesSinceFetch());
+      jcontext.set("interval", new Integer(getFetchInterval()));
+      jcontext.set("score", getScore());
+      jcontext.set("signature", StringUtil.toHexString(getSignature()));
+      
       // Set metadata variables
       for (Map.Entry<Writable, Writable> entry : getMetaData().entrySet()) {
         Object value = entry.getValue();
@@ -571,7 +561,7 @@ public class CrawlDatum implements Writa
           } catch (Exception e) {}
         }
       }
-      
+            
       try {
         if (Boolean.TRUE.equals(expr.evaluate(jcontext))) {
           return true;

Modified: nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java?rev=1732140&r1=1732139&r2=1732140&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java Wed Feb 24 13:05:02 2016
@@ -70,6 +70,7 @@ import org.apache.nutch.util.NutchJob;
 import org.apache.nutch.util.StringUtil;
 import org.apache.commons.jexl2.Expression;
 import org.apache.commons.jexl2.JexlEngine;
+import org.apache.commons.lang.time.DateUtils;
 
 /**
  * Read utility for the CrawlDB.
@@ -522,6 +523,7 @@ public class CrawlDbReader extends Confi
 
   public static class CrawlDbDumpMapper implements
       Mapper<Text, CrawlDatum, Text, CrawlDatum> {
+    Pattern datePattern = Pattern.compile("\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}Z");
     Pattern pattern = null;
     Matcher matcher = null;
     String status = null;
@@ -534,12 +536,30 @@ public class CrawlDbReader extends Confi
       }
       status = job.get("status", null);
       retry = job.getInt("retry", -1);
-
+      String exprStr = job.get("expr", null);
+      
       if (job.get("expr", null) != null) {
-        JexlEngine jexl = new JexlEngine();
-        jexl.setSilent(true);
-        jexl.setStrict(true);
-        expr = jexl.createExpression(job.get("expr", null));
+        try {
+          // Translate any date literal into a long (epoch milliseconds); dates must be specified as yyyy-MM-dd'T'HH:mm:ss'Z', e.g. 2016-03-20T00:00:00Z
+          Matcher matcher = datePattern.matcher(exprStr);
+          if (matcher.find()) {
+            String date = matcher.group();
+            
+            // Parse the thing and get epoch!
+            Date parsedDate = DateUtils.parseDateStrictly(date, new String[] {"yyyy-MM-dd'T'HH:mm:ss'Z'"});
+            long time = parsedDate.getTime();
+            
+            // Replace in the original expression
+            exprStr = exprStr.replace(date, Long.toString(time));
+          }
+          
+          JexlEngine jexl = new JexlEngine();
+          jexl.setSilent(true);
+          jexl.setStrict(true);
+          expr = jexl.createExpression(exprStr);
+        } catch (Exception e) {
+          LOG.error(e.getMessage());
+        }
       }
     }