You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2016/02/24 16:51:21 UTC

svn commit: r1732177 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/crawl/CrawlDatum.java src/java/org/apache/nutch/crawl/CrawlDbReader.java src/java/org/apache/nutch/crawl/Generator.java src/java/org/apache/nutch/util/JexlUtil.java

Author: markus
Date: Wed Feb 24 15:51:21 2016
New Revision: 1732177

URL: http://svn.apache.org/viewvc?rev=1732177&view=rev
Log:
NUTCH-2231 Jexl support in generator job

Added:
    nutch/trunk/src/java/org/apache/nutch/util/JexlUtil.java
Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java
    nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java
    nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1732177&r1=1732176&r2=1732177&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed Feb 24 15:51:21 2016
@@ -10,6 +10,8 @@ in the release announcement and keep it
 
 Nutch Change Log
 
+* NUTCH-2231 Jexl support in generator job (markus)
+
 * NUTCH-2232 DeduplicationJob should decode URL's before length is compared (Ron van der Vegt via markus)
 
 * NUTCH-2229 Allow Jexl expressions on CrawlDatum's fixed attributes (markus)

Modified: nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java?rev=1732177&r1=1732176&r2=1732177&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java Wed Feb 24 15:51:21 2016
@@ -534,7 +534,7 @@ public class CrawlDatum implements Writa
       jcontext.set("interval", new Integer(getFetchInterval()));
       jcontext.set("score", getScore());
       jcontext.set("signature", StringUtil.toHexString(getSignature()));
-      
+            
       // Set metadata variables
       for (Map.Entry<Writable, Writable> entry : getMetaData().entrySet()) {
         Object value = entry.getValue();
@@ -553,15 +553,11 @@ public class CrawlDatum implements Writa
         
         if (value instanceof Text) {
           Text tvalue = (Text)value;
-          Text tkey = (Text)entry.getKey();
-          
-          try {
-            Float number = Float.parseFloat(tvalue.toString());
-            jcontext.set(tkey.toString(), number);
-          } catch (Exception e) {}
+          Text tkey = (Text)entry.getKey();     
+          jcontext.set(tkey.toString().replace("-", "_"), tvalue.toString());
         }
       }
-            
+                  
       try {
         if (Boolean.TRUE.equals(expr.evaluate(jcontext))) {
           return true;

Modified: nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java?rev=1732177&r1=1732176&r2=1732177&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java Wed Feb 24 15:51:21 2016
@@ -65,6 +65,7 @@ import org.apache.hadoop.mapred.lib.Iden
 import org.apache.hadoop.util.Progressable;
 import org.apache.hadoop.util.Tool;
 import org.apache.hadoop.util.ToolRunner;
+import org.apache.nutch.util.JexlUtil;
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.NutchJob;
 import org.apache.nutch.util.StringUtil;
@@ -508,8 +509,10 @@ public class CrawlDbReader extends Confi
       job.set("regex", regex);
     if (retry != null)
       job.setInt("retry", retry);
-    if (expr != null)
+    if (expr != null) {
       job.set("expr", expr);
+      LOG.info("CrawlDb db: expr: " + expr);
+    }
 
     job.setMapperClass(CrawlDbDumpMapper.class);
     job.setOutputKeyClass(Text.class);
@@ -523,7 +526,6 @@ public class CrawlDbReader extends Confi
 
   public static class CrawlDbDumpMapper implements
       Mapper<Text, CrawlDatum, Text, CrawlDatum> {
-    Pattern datePattern = Pattern.compile("\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}Z");
     Pattern pattern = null;
     Matcher matcher = null;
     String status = null;
@@ -536,30 +538,9 @@ public class CrawlDbReader extends Confi
       }
       status = job.get("status", null);
       retry = job.getInt("retry", -1);
-      String exprStr = job.get("expr", null);
       
       if (job.get("expr", null) != null) {
-        try {
-          // Translate any date object into a long, dates must be specified as 20-03-2016T00:00:00Z
-          Matcher matcher = datePattern.matcher(exprStr);
-          if (matcher.find()) {
-            String date = matcher.group();
-            
-            // Parse the thing and get epoch!
-            Date parsedDate = DateUtils.parseDateStrictly(date, new String[] {"yyyy-MM-dd'T'HH:mm:ss'Z'"});
-            long time = parsedDate.getTime();
-            
-            // Replace in the original expression
-            exprStr = exprStr.replace(date, Long.toString(time));
-          }
-          
-          JexlEngine jexl = new JexlEngine();
-          jexl.setSilent(true);
-          jexl.setStrict(true);
-          expr = jexl.createExpression(exprStr);
-        } catch (Exception e) {
-          LOG.error(e.getMessage());
-        }
+        expr = JexlUtil.parseExpression(job.get("expr", null));
       }
     }
 

Modified: nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?rev=1732177&r1=1732176&r2=1732177&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Wed Feb 24 15:51:21 2016
@@ -25,6 +25,7 @@ import java.text.*;
 // rLogging imports
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
+import org.apache.commons.jexl2.Expression;
 import org.apache.hadoop.io.*;
 import org.apache.hadoop.conf.*;
 import org.apache.hadoop.mapred.*;
@@ -39,6 +40,7 @@ import org.apache.nutch.net.URLFilters;
 import org.apache.nutch.net.URLNormalizers;
 import org.apache.nutch.scoring.ScoringFilterException;
 import org.apache.nutch.scoring.ScoringFilters;
+import org.apache.nutch.util.JexlUtil;
 import org.apache.nutch.util.LockUtil;
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.NutchJob;
@@ -46,6 +48,7 @@ import org.apache.nutch.util.NutchTool;
 import org.apache.nutch.util.TimingUtil;
 import org.apache.nutch.util.URLUtil;
 
+
 /**
  * Generates a subset of a crawl db to fetch. This version allows to generate
  * fetchlists for several segments in one go. Unlike in the initial version
@@ -72,6 +75,7 @@ public class Generator extends NutchTool
   public static final String GENERATOR_CUR_TIME = "generate.curTime";
   public static final String GENERATOR_DELAY = "crawl.gen.delay";
   public static final String GENERATOR_MAX_NUM_SEGMENTS = "generate.max.num.segments";
+  public static final String GENERATOR_EXPR = "generate.expr";
 
   public static class SelectorEntry implements Writable {
     public Text url;
@@ -129,7 +133,8 @@ public class Generator extends NutchTool
     private int intervalThreshold = -1;
     private String restrictStatus = null;
     private int maxNumSegments = 1;
-    int currentsegmentnum = 1;
+    private Expression expr = null;
+    private int currentsegmentnum = 1;
 
     public void configure(JobConf job) {
       curTime = job.getLong(GENERATOR_CUR_TIME, System.currentTimeMillis());
@@ -157,6 +162,7 @@ public class Generator extends NutchTool
       scoreThreshold = job.getFloat(GENERATOR_MIN_SCORE, Float.NaN);
       intervalThreshold = job.getInt(GENERATOR_MIN_INTERVAL, -1);
       restrictStatus = job.get(GENERATOR_RESTRICT_STATUS, null);
+      expr = JexlUtil.parseExpression(job.get(GENERATOR_EXPR, null));
       maxNumSegments = job.getInt(GENERATOR_MAX_NUM_SEGMENTS, 1);
       segCounts = new int[maxNumSegments];
     }
@@ -206,6 +212,13 @@ public class Generator extends NutchTool
           LOG.warn("Couldn't filter generatorSortValue for " + key + ": " + sfe);
         }
       }
+      
+      // check expr
+      if (expr != null) {
+        if (!crawlDatum.evaluate(expr)) {
+          return;
+        }
+      }
 
       if (restrictStatus != null
           && !restrictStatus.equalsIgnoreCase(CrawlDatum
@@ -476,7 +489,7 @@ public class Generator extends NutchTool
     boolean filter = job.getBoolean(GENERATOR_FILTER, true);
     boolean normalise = job.getBoolean(GENERATOR_NORMALISE, true);
     return generate(dbDir, segments, numLists, topN, curTime, filter,
-        normalise, false, 1);
+        normalise, false, 1, null);
   }
 
   /**
@@ -486,7 +499,7 @@ public class Generator extends NutchTool
   public Path[] generate(Path dbDir, Path segments, int numLists, long topN,
       long curTime, boolean filter, boolean force) throws IOException {
     return generate(dbDir, segments, numLists, topN, curTime, filter, true,
-        force, 1);
+        force, 1, null);
   }
 
   /**
@@ -513,7 +526,7 @@ public class Generator extends NutchTool
    */
   public Path[] generate(Path dbDir, Path segments, int numLists, long topN,
       long curTime, boolean filter, boolean norm, boolean force,
-      int maxNumSegments) throws IOException {
+      int maxNumSegments, String expr) throws IOException {
 
     Path tempDir = new Path(getConf().get("mapred.temp.dir", ".")
         + "/generate-temp-" + java.util.UUID.randomUUID().toString());
@@ -521,7 +534,7 @@ public class Generator extends NutchTool
     Path lock = new Path(dbDir, CrawlDb.LOCK_NAME);
     FileSystem fs = FileSystem.get(getConf());
     LockUtil.createLockFile(fs, lock, force);
-
+    
     SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
     long start = System.currentTimeMillis();
     LOG.info("Generator: starting at " + sdf.format(start));
@@ -531,7 +544,10 @@ public class Generator extends NutchTool
     if (topN != Long.MAX_VALUE) {
       LOG.info("Generator: topN: " + topN);
     }
-
+    if (expr != null) {
+      LOG.info("Generator: expr: " + expr);
+    }
+    
     // map to inverted subset due for fetch, sort by score
     JobConf job = new NutchJob(getConf());
     job.setJobName("generate: select from " + dbDir);
@@ -552,7 +568,9 @@ public class Generator extends NutchTool
     job.setBoolean(GENERATOR_FILTER, filter);
     job.setBoolean(GENERATOR_NORMALISE, norm);
     job.setInt(GENERATOR_MAX_NUM_SEGMENTS, maxNumSegments);
-
+    if (expr != null) {
+      job.set(GENERATOR_EXPR, expr);
+    }
     FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
     job.setInputFormat(SequenceFileInputFormat.class);
 
@@ -703,7 +721,7 @@ public class Generator extends NutchTool
   public int run(String[] args) throws Exception {
     if (args.length < 2) {
       System.out
-          .println("Usage: Generator <crawldb> <segments_dir> [-force] [-topN N] [-numFetchers numFetchers] [-adddays numDays] [-noFilter] [-noNorm][-maxNumSegments num]");
+          .println("Usage: Generator <crawldb> <segments_dir> [-force] [-topN N] [-numFetchers numFetchers] [-expr <expr>] [-adddays <numDays>] [-noFilter] [-noNorm] [-maxNumSegments <num>]");
       return -1;
     }
 
@@ -715,6 +733,7 @@ public class Generator extends NutchTool
     boolean filter = true;
     boolean norm = true;
     boolean force = false;
+    String expr = null;
     int maxNumSegments = 1;
 
     for (int i = 2; i < args.length; i++) {
@@ -735,13 +754,15 @@ public class Generator extends NutchTool
         force = true;
       } else if ("-maxNumSegments".equals(args[i])) {
         maxNumSegments = Integer.parseInt(args[i + 1]);
+      } else if ("-expr".equals(args[i])) {
+        expr = args[i + 1];
       }
 
     }
 
     try {
       Path[] segs = generate(dbDir, segmentsDir, numFetchers, topN, curTime,
-          filter, norm, force, maxNumSegments);
+          filter, norm, force, maxNumSegments, expr);
       if (segs == null)
         return 1;
     } catch (Exception e) {
@@ -763,6 +784,7 @@ public class Generator extends NutchTool
     boolean norm = true;
     boolean force = false;
     int maxNumSegments = 1;
+    String expr = null;
 
     Path crawlDb;
     if(args.containsKey(Nutch.ARG_CRAWLDB)) {
@@ -791,7 +813,10 @@ public class Generator extends NutchTool
     else {
       segmentsDir = new Path(crawlId+"/segments");
     }
-
+    
+    if (args.containsKey("expr")) {
+      expr = (String)args.get("expr");
+    }
     if (args.containsKey("topN")) {
       topN = Long.parseLong((String)args.get("topN"));
     }
@@ -817,7 +842,7 @@ public class Generator extends NutchTool
 
     try {
       Path[] segs = generate(crawlDb, segmentsDir, numFetchers, topN, curTime,
-          filter, norm, force, maxNumSegments);
+          filter, norm, force, maxNumSegments, expr);
       if (segs == null){
         results.put(Nutch.VAL_RESULT, Integer.toString(1));
         return results;

Added: nutch/trunk/src/java/org/apache/nutch/util/JexlUtil.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/JexlUtil.java?rev=1732177&view=auto
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/JexlUtil.java (added)
+++ nutch/trunk/src/java/org/apache/nutch/util/JexlUtil.java Wed Feb 24 15:51:21 2016
@@ -0,0 +1,74 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.util;
+
+import java.util.Date;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.commons.jexl2.Expression;
+import org.apache.commons.jexl2.JexlEngine;
+import org.apache.commons.lang.time.DateUtils;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Utility methods for working with Jexl expressions.
+ */
+public class JexlUtil {
+
+  public static final Logger LOG = LoggerFactory.getLogger(JexlUtil.class);
+
+  /**
+   * Matches date literals of the form yyyy-MM-dd'T'HH:mm:ss'Z', e.g. 2016-03-20T00:00:00Z.
+   */
+  public static Pattern datePattern = Pattern.compile("\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}Z");
+
+  /**
+   * Parses the given expression to a Jexl expression. Every date literal
+   * (yyyy-MM-dd'T'HH:mm:ss'Z') is replaced by its epoch milliseconds first.
+   *
+   * @param expr the Jexl expression, may be null if none was configured
+   * @return parsed Jexl expression, or null if expr is null or cannot be parsed
+   */
+  public static Expression parseExpression(String expr) {
+    // Callers pass the raw config value; an unset expression is not an error
+    if (expr == null) {
+      return null;
+    }
+
+    try {
+      // NOTE(review): 'Z' is a quoted literal in the format, so dates are presumably read in the JVM default time zone - confirm
+      Matcher matcher = datePattern.matcher(expr);
+      StringBuffer translated = new StringBuffer();
+      while (matcher.find()) {
+        Date parsedDate = DateUtils.parseDateStrictly(matcher.group(), new String[] {"yyyy-MM-dd'T'HH:mm:ss'Z'"});
+        matcher.appendReplacement(translated, Long.toString(parsedDate.getTime()));
+      }
+      matcher.appendTail(translated);
+
+      JexlEngine jexl = new JexlEngine();
+      jexl.setSilent(true);
+      jexl.setStrict(true);
+      return jexl.createExpression(translated.toString());
+    } catch (Exception e) {
+      // Log the offending expression and keep the cause; e.getMessage() alone may be null
+      LOG.error("Failed to parse Jexl expression: " + expr, e);
+      return null;
+    }
+  }
+}
\ No newline at end of file