You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2016/02/24 14:05:03 UTC
svn commit: r1732140 - in /nutch/trunk: CHANGES.txt
src/java/org/apache/nutch/crawl/CrawlDatum.java
src/java/org/apache/nutch/crawl/CrawlDbReader.java
Author: markus
Date: Wed Feb 24 13:05:02 2016
New Revision: 1732140
URL: http://svn.apache.org/viewvc?rev=1732140&view=rev
Log:
NUTCH-2229 Allow Jexl expressions on CrawlDatum's fixed attributes
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java
nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1732140&r1=1732139&r2=1732140&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed Feb 24 13:05:02 2016
@@ -10,6 +10,8 @@ in the release announcement and keep it
Nutch Change Log
+* NUTCH-2229 Allow Jexl expressions on CrawlDatum's fixed attributes (markus)
+
* NUTCH-2227 RegexParseFilter (markus)
* NUTCH-2221 Introduce db.ignore.internal.links to FetcherThread (markus)
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java?rev=1732140&r1=1732139&r2=1732140&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java Wed Feb 24 13:05:02 2016
@@ -521,30 +521,20 @@ public class CrawlDatum implements Writa
}
}
- public boolean evaluate(String expr) {
- return evaluate(expr, true, true);
- }
-
- public boolean evaluate(String expr, boolean silent, boolean strict) {
- if (expr != null) {
- // Create or retrieve a JexlEngine
- JexlEngine jexl = new JexlEngine();
-
- jexl.setSilent(silent);
- jexl.setStrict(strict);
-
- // Create an expression object and evaluate
- return evaluate(jexl.createExpression(expr));
- }
-
- return false;
- }
-
public boolean evaluate(Expression expr) {
if (expr != null) {
// Create a context and add data
JexlContext jcontext = new MapContext();
-
+
+ // https://issues.apache.org/jira/browse/NUTCH-2229
+ jcontext.set("status", getStatusName(getStatus()));
+ jcontext.set("fetchTime", (long)(getFetchTime()));
+ jcontext.set("modifiedTime", (long)(getModifiedTime()));
+ jcontext.set("retries", getRetriesSinceFetch());
+ jcontext.set("interval", new Integer(getFetchInterval()));
+ jcontext.set("score", getScore());
+ jcontext.set("signature", StringUtil.toHexString(getSignature()));
+
// Set metadata variables
for (Map.Entry<Writable, Writable> entry : getMetaData().entrySet()) {
Object value = entry.getValue();
@@ -571,7 +561,7 @@ public class CrawlDatum implements Writa
} catch (Exception e) {}
}
}
-
+
try {
if (Boolean.TRUE.equals(expr.evaluate(jcontext))) {
return true;
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java?rev=1732140&r1=1732139&r2=1732140&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java Wed Feb 24 13:05:02 2016
@@ -70,6 +70,7 @@ import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.StringUtil;
import org.apache.commons.jexl2.Expression;
import org.apache.commons.jexl2.JexlEngine;
+import org.apache.commons.lang.time.DateUtils;
/**
* Read utility for the CrawlDB.
@@ -522,6 +523,7 @@ public class CrawlDbReader extends Confi
public static class CrawlDbDumpMapper implements
Mapper<Text, CrawlDatum, Text, CrawlDatum> {
+ Pattern datePattern = Pattern.compile("\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}Z");
Pattern pattern = null;
Matcher matcher = null;
String status = null;
@@ -534,12 +536,30 @@ public class CrawlDbReader extends Confi
}
status = job.get("status", null);
retry = job.getInt("retry", -1);
-
+ String exprStr = job.get("expr", null);
+
if (job.get("expr", null) != null) {
- JexlEngine jexl = new JexlEngine();
- jexl.setSilent(true);
- jexl.setStrict(true);
- expr = jexl.createExpression(job.get("expr", null));
+ try {
+ // Translate any date object into a long, dates must be specified as 2016-03-20T00:00:00Z (yyyy-MM-dd'T'HH:mm:ss'Z')
+ Matcher matcher = datePattern.matcher(exprStr);
+ if (matcher.find()) {
+ String date = matcher.group();
+
+ // Strictly parse the matched date and take its epoch time in milliseconds
+ Date parsedDate = DateUtils.parseDateStrictly(date, new String[] {"yyyy-MM-dd'T'HH:mm:ss'Z'"});
+ long time = parsedDate.getTime();
+
+ // Replace in the original expression
+ exprStr = exprStr.replace(date, Long.toString(time));
+ }
+
+ JexlEngine jexl = new JexlEngine();
+ jexl.setSilent(true);
+ jexl.setStrict(true);
+ expr = jexl.createExpression(exprStr);
+ } catch (Exception e) {
+ LOG.error(e.getMessage());
+ }
}
}