You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2017/12/18 15:50:05 UTC
[nutch] 21/23: NUTCH-2322 URL not available for Jexl operations -
apply patch contributed by Markus Jelsma
This is an automated email from the ASF dual-hosted git repository.
snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
commit 22fc7f0defb22588c4ade33b5693303f18d96253
Author: Sebastian Nagel <sn...@apache.org>
AuthorDate: Sun Dec 17 15:32:04 2017 +0100
NUTCH-2322 URL not available for Jexl operations
- apply patch contributed by Markus Jelsma
---
src/java/org/apache/nutch/crawl/CrawlDatum.java | 18 ++++++++++++------
src/java/org/apache/nutch/crawl/CrawlDbReader.java | 2 +-
src/java/org/apache/nutch/crawl/Generator.java | 2 +-
3 files changed, 14 insertions(+), 8 deletions(-)
diff --git a/src/java/org/apache/nutch/crawl/CrawlDatum.java b/src/java/org/apache/nutch/crawl/CrawlDatum.java
index e54c791..1facf0a 100644
--- a/src/java/org/apache/nutch/crawl/CrawlDatum.java
+++ b/src/java/org/apache/nutch/crawl/CrawlDatum.java
@@ -23,14 +23,15 @@ import java.util.Map.Entry;
import org.apache.commons.jexl2.JexlContext;
import org.apache.commons.jexl2.Expression;
-import org.apache.commons.jexl2.JexlEngine;
import org.apache.commons.jexl2.MapContext;
import org.apache.hadoop.io.*;
import org.apache.nutch.util.*;
+import org.apache.nutch.protocol.ProtocolStatus;
/* The crawl state of a url. */
public class CrawlDatum implements WritableComparable<CrawlDatum>, Cloneable {
+
public static final String GENERATE_DIR_NAME = "crawl_generate";
public static final String FETCH_DIR_NAME = "crawl_fetch";
public static final String PARSE_DIR_NAME = "crawl_parse";
@@ -525,12 +526,13 @@ public class CrawlDatum implements WritableComparable<CrawlDatum>, Cloneable {
}
}
- public boolean evaluate(Expression expr) {
- if (expr != null) {
+ public boolean evaluate(Expression expr, String url) {
+ if (expr != null && url != null) {
// Create a context and add data
JexlContext jcontext = new MapContext();
// https://issues.apache.org/jira/browse/NUTCH-2229
+ jcontext.set("url", url);
jcontext.set("status", getStatusName(getStatus()));
jcontext.set("fetchTime", (long)(getFetchTime()));
jcontext.set("modifiedTime", (long)(getModifiedTime()));
@@ -542,24 +544,28 @@ public class CrawlDatum implements WritableComparable<CrawlDatum>, Cloneable {
// Set metadata variables
for (Map.Entry<Writable, Writable> entry : getMetaData().entrySet()) {
Object value = entry.getValue();
+ Text tkey = (Text)entry.getKey();
if (value instanceof FloatWritable) {
FloatWritable fvalue = (FloatWritable)value;
- Text tkey = (Text)entry.getKey();
jcontext.set(tkey.toString(), fvalue.get());
}
if (value instanceof IntWritable) {
IntWritable ivalue = (IntWritable)value;
- Text tkey = (Text)entry.getKey();
jcontext.set(tkey.toString(), ivalue.get());
}
if (value instanceof Text) {
Text tvalue = (Text)value;
- Text tkey = (Text)entry.getKey();
jcontext.set(tkey.toString().replace("-", "_"), tvalue.toString());
}
+
+ if (value instanceof ProtocolStatus) {
+ ProtocolStatus pvalue = (ProtocolStatus)value;
+ jcontext.set(tkey.toString().replace("-", "_"), pvalue.toString());
+ }
+
}
try {
diff --git a/src/java/org/apache/nutch/crawl/CrawlDbReader.java b/src/java/org/apache/nutch/crawl/CrawlDbReader.java
index af30664..ddd25ef 100644
--- a/src/java/org/apache/nutch/crawl/CrawlDbReader.java
+++ b/src/java/org/apache/nutch/crawl/CrawlDbReader.java
@@ -700,7 +700,7 @@ public class CrawlDbReader extends Configured implements Closeable, Tool {
// check expr
if (expr != null) {
- if (!value.evaluate(expr)) {
+ if (!value.evaluate(expr, key.toString())) {
return;
}
}
diff --git a/src/java/org/apache/nutch/crawl/Generator.java b/src/java/org/apache/nutch/crawl/Generator.java
index e5f4831..d85d578 100644
--- a/src/java/org/apache/nutch/crawl/Generator.java
+++ b/src/java/org/apache/nutch/crawl/Generator.java
@@ -252,7 +252,7 @@ public class Generator extends NutchTool implements Tool {
// check expr
if (expr != null) {
- if (!crawlDatum.evaluate(expr)) {
+ if (!crawlDatum.evaluate(expr, key.toString())) {
return;
}
}
--
To stop receiving notification emails like this one, please contact
"commits@nutch.apache.org" <co...@nutch.apache.org>.