You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2016/02/24 16:51:21 UTC
svn commit: r1732177 - in /nutch/trunk: CHANGES.txt
src/java/org/apache/nutch/crawl/CrawlDatum.java
src/java/org/apache/nutch/crawl/CrawlDbReader.java
src/java/org/apache/nutch/crawl/Generator.java
src/java/org/apache/nutch/util/JexlUtil.java
Author: markus
Date: Wed Feb 24 15:51:21 2016
New Revision: 1732177
URL: http://svn.apache.org/viewvc?rev=1732177&view=rev
Log:
NUTCH-2231 Jexl support in generator job
Added:
nutch/trunk/src/java/org/apache/nutch/util/JexlUtil.java
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java
nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java
nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1732177&r1=1732176&r2=1732177&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed Feb 24 15:51:21 2016
@@ -10,6 +10,8 @@ in the release announcement and keep it
Nutch Change Log
+* NUTCH-2231 Jexl support in generator job (markus)
+
* NUTCH-2232 DeduplicationJob should decode URL's before length is compared (Ron van der Vegt via markus)
* NUTCH-2229 Allow Jexl expressions on CrawlDatum's fixed attributes (markus)
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java?rev=1732177&r1=1732176&r2=1732177&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java Wed Feb 24 15:51:21 2016
@@ -534,7 +534,7 @@ public class CrawlDatum implements Writa
jcontext.set("interval", new Integer(getFetchInterval()));
jcontext.set("score", getScore());
jcontext.set("signature", StringUtil.toHexString(getSignature()));
-
+
// Set metadata variables
for (Map.Entry<Writable, Writable> entry : getMetaData().entrySet()) {
Object value = entry.getValue();
@@ -553,15 +553,11 @@ public class CrawlDatum implements Writa
if (value instanceof Text) {
Text tvalue = (Text)value;
- Text tkey = (Text)entry.getKey();
-
- try {
- Float number = Float.parseFloat(tvalue.toString());
- jcontext.set(tkey.toString(), number);
- } catch (Exception e) {}
+ Text tkey = (Text)entry.getKey();
+ jcontext.set(tkey.toString().replace("-", "_"), tvalue.toString());
}
}
-
+
try {
if (Boolean.TRUE.equals(expr.evaluate(jcontext))) {
return true;
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java?rev=1732177&r1=1732176&r2=1732177&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java Wed Feb 24 15:51:21 2016
@@ -65,6 +65,7 @@ import org.apache.hadoop.mapred.lib.Iden
import org.apache.hadoop.util.Progressable;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
+import org.apache.nutch.util.JexlUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.StringUtil;
@@ -508,8 +509,10 @@ public class CrawlDbReader extends Confi
job.set("regex", regex);
if (retry != null)
job.setInt("retry", retry);
- if (expr != null)
+ if (expr != null) {
job.set("expr", expr);
+ LOG.info("CrawlDb db: expr: " + expr);
+ }
job.setMapperClass(CrawlDbDumpMapper.class);
job.setOutputKeyClass(Text.class);
@@ -523,7 +526,6 @@ public class CrawlDbReader extends Confi
public static class CrawlDbDumpMapper implements
Mapper<Text, CrawlDatum, Text, CrawlDatum> {
- Pattern datePattern = Pattern.compile("\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}Z");
Pattern pattern = null;
Matcher matcher = null;
String status = null;
@@ -536,30 +538,9 @@ public class CrawlDbReader extends Confi
}
status = job.get("status", null);
retry = job.getInt("retry", -1);
- String exprStr = job.get("expr", null);
if (job.get("expr", null) != null) {
- try {
- // Translate any date object into a long, dates must be specified as 20-03-2016T00:00:00Z
- Matcher matcher = datePattern.matcher(exprStr);
- if (matcher.find()) {
- String date = matcher.group();
-
- // Parse the thing and get epoch!
- Date parsedDate = DateUtils.parseDateStrictly(date, new String[] {"yyyy-MM-dd'T'HH:mm:ss'Z'"});
- long time = parsedDate.getTime();
-
- // Replace in the original expression
- exprStr = exprStr.replace(date, Long.toString(time));
- }
-
- JexlEngine jexl = new JexlEngine();
- jexl.setSilent(true);
- jexl.setStrict(true);
- expr = jexl.createExpression(exprStr);
- } catch (Exception e) {
- LOG.error(e.getMessage());
- }
+ expr = JexlUtil.parseExpression(job.get("expr", null));
}
}
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?rev=1732177&r1=1732176&r2=1732177&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Wed Feb 24 15:51:21 2016
@@ -25,6 +25,7 @@ import java.text.*;
// rLogging imports
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import org.apache.commons.jexl2.Expression;
import org.apache.hadoop.io.*;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.mapred.*;
@@ -39,6 +40,7 @@ import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.scoring.ScoringFilters;
+import org.apache.nutch.util.JexlUtil;
import org.apache.nutch.util.LockUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
@@ -46,6 +48,7 @@ import org.apache.nutch.util.NutchTool;
import org.apache.nutch.util.TimingUtil;
import org.apache.nutch.util.URLUtil;
+
/**
* Generates a subset of a crawl db to fetch. This version allows to generate
* fetchlists for several segments in one go. Unlike in the initial version
@@ -72,6 +75,7 @@ public class Generator extends NutchTool
public static final String GENERATOR_CUR_TIME = "generate.curTime";
public static final String GENERATOR_DELAY = "crawl.gen.delay";
public static final String GENERATOR_MAX_NUM_SEGMENTS = "generate.max.num.segments";
+ public static final String GENERATOR_EXPR = "generate.expr";
public static class SelectorEntry implements Writable {
public Text url;
@@ -129,7 +133,8 @@ public class Generator extends NutchTool
private int intervalThreshold = -1;
private String restrictStatus = null;
private int maxNumSegments = 1;
- int currentsegmentnum = 1;
+ private Expression expr = null;
+ private int currentsegmentnum = 1;
public void configure(JobConf job) {
curTime = job.getLong(GENERATOR_CUR_TIME, System.currentTimeMillis());
@@ -157,6 +162,7 @@ public class Generator extends NutchTool
scoreThreshold = job.getFloat(GENERATOR_MIN_SCORE, Float.NaN);
intervalThreshold = job.getInt(GENERATOR_MIN_INTERVAL, -1);
restrictStatus = job.get(GENERATOR_RESTRICT_STATUS, null);
+ expr = JexlUtil.parseExpression(job.get(GENERATOR_EXPR, null));
maxNumSegments = job.getInt(GENERATOR_MAX_NUM_SEGMENTS, 1);
segCounts = new int[maxNumSegments];
}
@@ -206,6 +212,13 @@ public class Generator extends NutchTool
LOG.warn("Couldn't filter generatorSortValue for " + key + ": " + sfe);
}
}
+
+ // check expr
+ if (expr != null) {
+ if (!crawlDatum.evaluate(expr)) {
+ return;
+ }
+ }
if (restrictStatus != null
&& !restrictStatus.equalsIgnoreCase(CrawlDatum
@@ -476,7 +489,7 @@ public class Generator extends NutchTool
boolean filter = job.getBoolean(GENERATOR_FILTER, true);
boolean normalise = job.getBoolean(GENERATOR_NORMALISE, true);
return generate(dbDir, segments, numLists, topN, curTime, filter,
- normalise, false, 1);
+ normalise, false, 1, null);
}
/**
@@ -486,7 +499,7 @@ public class Generator extends NutchTool
public Path[] generate(Path dbDir, Path segments, int numLists, long topN,
long curTime, boolean filter, boolean force) throws IOException {
return generate(dbDir, segments, numLists, topN, curTime, filter, true,
- force, 1);
+ force, 1, null);
}
/**
@@ -513,7 +526,7 @@ public class Generator extends NutchTool
*/
public Path[] generate(Path dbDir, Path segments, int numLists, long topN,
long curTime, boolean filter, boolean norm, boolean force,
- int maxNumSegments) throws IOException {
+ int maxNumSegments, String expr) throws IOException {
Path tempDir = new Path(getConf().get("mapred.temp.dir", ".")
+ "/generate-temp-" + java.util.UUID.randomUUID().toString());
@@ -521,7 +534,7 @@ public class Generator extends NutchTool
Path lock = new Path(dbDir, CrawlDb.LOCK_NAME);
FileSystem fs = FileSystem.get(getConf());
LockUtil.createLockFile(fs, lock, force);
-
+
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
long start = System.currentTimeMillis();
LOG.info("Generator: starting at " + sdf.format(start));
@@ -531,7 +544,10 @@ public class Generator extends NutchTool
if (topN != Long.MAX_VALUE) {
LOG.info("Generator: topN: " + topN);
}
-
+ if (expr != null) {
+ LOG.info("Generator: expr: " + expr);
+ }
+
// map to inverted subset due for fetch, sort by score
JobConf job = new NutchJob(getConf());
job.setJobName("generate: select from " + dbDir);
@@ -552,7 +568,9 @@ public class Generator extends NutchTool
job.setBoolean(GENERATOR_FILTER, filter);
job.setBoolean(GENERATOR_NORMALISE, norm);
job.setInt(GENERATOR_MAX_NUM_SEGMENTS, maxNumSegments);
-
+ if (expr != null) {
+ job.set(GENERATOR_EXPR, expr);
+ }
FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
job.setInputFormat(SequenceFileInputFormat.class);
@@ -703,7 +721,7 @@ public class Generator extends NutchTool
public int run(String[] args) throws Exception {
if (args.length < 2) {
System.out
- .println("Usage: Generator <crawldb> <segments_dir> [-force] [-topN N] [-numFetchers numFetchers] [-adddays numDays] [-noFilter] [-noNorm][-maxNumSegments num]");
+ .println("Usage: Generator <crawldb> <segments_dir> [-force] [-topN N] [-numFetchers numFetchers] [-expr <expr>] [-adddays <numDays>] [-noFilter] [-noNorm] [-maxNumSegments <num>]");
return -1;
}
@@ -715,6 +733,7 @@ public class Generator extends NutchTool
boolean filter = true;
boolean norm = true;
boolean force = false;
+ String expr = null;
int maxNumSegments = 1;
for (int i = 2; i < args.length; i++) {
@@ -735,13 +754,15 @@ public class Generator extends NutchTool
force = true;
} else if ("-maxNumSegments".equals(args[i])) {
maxNumSegments = Integer.parseInt(args[i + 1]);
+ } else if ("-expr".equals(args[i])) {
+ expr = args[i + 1];
}
}
try {
Path[] segs = generate(dbDir, segmentsDir, numFetchers, topN, curTime,
- filter, norm, force, maxNumSegments);
+ filter, norm, force, maxNumSegments, expr);
if (segs == null)
return 1;
} catch (Exception e) {
@@ -763,6 +784,7 @@ public class Generator extends NutchTool
boolean norm = true;
boolean force = false;
int maxNumSegments = 1;
+ String expr = null;
Path crawlDb;
if(args.containsKey(Nutch.ARG_CRAWLDB)) {
@@ -791,7 +813,10 @@ public class Generator extends NutchTool
else {
segmentsDir = new Path(crawlId+"/segments");
}
-
+
+ if (args.containsKey("expr")) {
+ expr = (String)args.get("expr");
+ }
if (args.containsKey("topN")) {
topN = Long.parseLong((String)args.get("topN"));
}
@@ -817,7 +842,7 @@ public class Generator extends NutchTool
try {
Path[] segs = generate(crawlDb, segmentsDir, numFetchers, topN, curTime,
- filter, norm, force, maxNumSegments);
+ filter, norm, force, maxNumSegments, expr);
if (segs == null){
results.put(Nutch.VAL_RESULT, Integer.toString(1));
return results;
Added: nutch/trunk/src/java/org/apache/nutch/util/JexlUtil.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/JexlUtil.java?rev=1732177&view=auto
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/JexlUtil.java (added)
+++ nutch/trunk/src/java/org/apache/nutch/util/JexlUtil.java Wed Feb 24 15:51:21 2016
@@ -0,0 +1,74 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.util;
+
+import java.util.Date;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.commons.jexl2.Expression;
+import org.apache.commons.jexl2.JexlEngine;
+import org.apache.commons.lang.time.DateUtils;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * A collection of Jexl utilities.
+ */
+public class JexlUtil {
+
+  public static final Logger LOG = LoggerFactory.getLogger(JexlUtil.class);
+
+  /** Matches timestamps written as yyyy-MM-dd'T'HH:mm:ss'Z', e.g. 2016-03-20T00:00:00Z. */
+  public static Pattern datePattern = Pattern.compile("\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}Z");
+
+  /**
+   * Parses the given expression into a Jexl expression. Every timestamp that
+   * matches {@link #datePattern} is first replaced by its epoch time in
+   * milliseconds. NOTE(review): the trailing Z is only a literal here, so the
+   * date is presumably read in the JVM default time zone -- confirm.
+   *
+   * @param expr the Jexl expression, may be null when none is configured
+   * @return parsed Jexl expression, or null if expr is null or cannot be parsed
+   */
+  public static Expression parseExpression(String expr) {
+    // An absent expression is the normal case, not an error worth logging.
+    if (expr == null) {
+      return null;
+    }
+    try {
+      // Translate every date into a long, dates must be specified as 2016-03-20T00:00:00Z
+      Matcher matcher = datePattern.matcher(expr);
+      StringBuffer translated = new StringBuffer();
+      while (matcher.find()) {
+        Date parsedDate = DateUtils.parseDateStrictly(matcher.group(),
+            new String[] {"yyyy-MM-dd'T'HH:mm:ss'Z'"});
+        matcher.appendReplacement(translated, Long.toString(parsedDate.getTime()));
+      }
+      matcher.appendTail(translated);
+      JexlEngine jexl = new JexlEngine();
+      jexl.setSilent(true);
+      jexl.setStrict(true);
+      return jexl.createExpression(translated.toString());
+    } catch (Exception e) {
+      // Keep the best-effort contract (null on failure) but log the cause.
+      LOG.error("Unable to parse Jexl expression: " + expr, e);
+      return null;
+    }
+  }
+}
\ No newline at end of file