You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2012/07/20 16:22:19 UTC
svn commit: r1363793 - in /nutch/trunk: CHANGES.txt
src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java
src/java/org/apache/nutch/crawl/Injector.java
src/java/org/apache/nutch/metadata/Nutch.java
Author: markus
Date: Fri Jul 20 14:22:19 2012
New Revision: 1363793
URL: http://svn.apache.org/viewvc?rev=1363793&view=rev
Log:
NUTCH-1388 Optionally maintain custom fetch interval despite AdaptiveFetchSchedule
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java
nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java
nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1363793&r1=1363792&r2=1363793&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri Jul 20 14:22:19 2012
@@ -2,6 +2,10 @@ Nutch Change Log
(trunk) Current Development:
+* NUTCH-1388 Optionally maintain custom fetch interval despite AdaptiveFetchSchedule (markus)
+
+* NUTCH-1430 Freegenerator records overwrite CrawlDB records with AdaptiveFetchSchedule (markus)
+
* NUTCH-1087 Deprecate crawl command and replace with example script (jnioche)
* NUTCH-1306 Add option to not commit and clarify existing solr.commit.size (ferdy)
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java?rev=1363793&r1=1363792&r2=1363793&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java Fri Jul 20 14:22:19 2012
@@ -94,9 +94,9 @@ public class AdaptiveFetchSchedule exten
float interval = datum.getFetchInterval();
long refTime = fetchTime;
- if (datum.getMetaData().containsKey(Nutch.WRITABLE_CUSTOM_INTERVAL_KEY)) {
+ if (datum.getMetaData().containsKey(Nutch.WRITABLE_FIXED_INTERVAL_KEY)) {
// Is fetch interval preset in CrawlDatum MD? Then use preset interval
- FloatWritable customIntervalWritable= (FloatWritable)(datum.getMetaData().get(Nutch.WRITABLE_CUSTOM_INTERVAL_KEY));
+ FloatWritable customIntervalWritable= (FloatWritable)(datum.getMetaData().get(Nutch.WRITABLE_FIXED_INTERVAL_KEY));
interval = customIntervalWritable.get();
} else {
if (modifiedTime <= 0) modifiedTime = fetchTime;
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java?rev=1363793&r1=1363792&r2=1363793&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java Fri Jul 20 14:22:19 2012
@@ -32,6 +32,7 @@ import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;
import org.apache.nutch.net.*;
+import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.scoring.ScoringFilters;
import org.apache.nutch.util.NutchConfiguration;
@@ -45,6 +46,7 @@ import org.apache.nutch.util.TimingUtil;
* Note that some metadata keys are reserved : <br>
* - <i>nutch.score</i> : allows to set a custom score for a specific URL <br>
* - <i>nutch.fetchInterval</i> : allows to set a custom fetch interval for a specific URL <br>
+ * - <i>nutch.fetchInterval.fixed</i> : allows to set a custom fetch interval for a specific URL that is not changed by AdaptiveFetchSchedule <br>
* e.g. http://www.nutch.org/ \t nutch.score=10 \t nutch.fetchInterval=2592000 \t userType=open_source
**/
public class Injector extends Configured implements Tool {
@@ -54,6 +56,8 @@ public class Injector extends Configured
public static String nutchScoreMDName = "nutch.score";
/** metadata key reserved for setting a custom fetchInterval for a specific URL */
public static String nutchFetchIntervalMDName = "nutch.fetchInterval";
+ /** metadata key reserved for setting a fixed custom fetchInterval for a specific URL */
+ public static String nutchFixedFetchIntervalMDName = "nutch.fetchInterval.fixed";
/** Normalize and filter injected urls. */
public static class InjectMapper implements Mapper<WritableComparable, Text, Text, CrawlDatum> {
@@ -91,6 +95,7 @@ public class Injector extends Configured
// must be name=value and separated by \t
float customScore = -1f;
int customInterval = interval;
+ int fixedInterval = -1;
Map<String,String> metadata = new TreeMap<String,String>();
if (url.indexOf("\t")!=-1){
String[] splits = url.split("\t");
@@ -109,11 +114,16 @@ public class Injector extends Configured
customScore = Float.parseFloat(metavalue);}
catch (NumberFormatException nfe){}
}
- else if (metaname.equals(nutchFetchIntervalMDName)) {
- try {
- customInterval = Integer.parseInt(metavalue);}
- catch (NumberFormatException nfe){}
- }
+ else if (metaname.equals(nutchFetchIntervalMDName)) {
+ try {
+ customInterval = Integer.parseInt(metavalue);}
+ catch (NumberFormatException nfe){}
+ }
+ else if (metaname.equals(nutchFixedFetchIntervalMDName)) {
+ try {
+ fixedInterval = Integer.parseInt(metavalue);}
+ catch (NumberFormatException nfe){}
+ }
else metadata.put(metaname,metavalue);
}
}
@@ -126,7 +136,18 @@ public class Injector extends Configured
}
if (url != null) { // if it passes
value.set(url); // collect it
- CrawlDatum datum = new CrawlDatum(CrawlDatum.STATUS_INJECTED, customInterval);
+ CrawlDatum datum = new CrawlDatum();
+ datum.setStatus(CrawlDatum.STATUS_INJECTED);
+
+ // Is a fixed interval set? Then also store it as meta data
+ if (fixedInterval > -1) {
+ // Set writable using float. Float is used by AdaptiveFetchSchedule
+ datum.getMetaData().put(Nutch.WRITABLE_FIXED_INTERVAL_KEY, new FloatWritable(fixedInterval));
+ datum.setFetchInterval(fixedInterval);
+ } else {
+ datum.setFetchInterval(customInterval);
+ }
+
datum.setFetchTime(curTime);
// now add the metadata
Iterator<String> keysIter = metadata.keySet().iterator();
Modified: nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java?rev=1363793&r1=1363792&r2=1363793&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java Fri Jul 20 14:22:19 2012
@@ -68,7 +68,7 @@ public interface Nutch {
public static final Text WRITABLE_REPR_URL_KEY = new Text(REPR_URL_KEY);
/** Used by AdaptiveFetchSchedule to maintain custom fetch interval */
- public static final String CUSTOM_INTERVAL_KEY = "interval";
+ public static final String FIXED_INTERVAL_KEY = "fixedInterval";
- public static final Text WRITABLE_CUSTOM_INTERVAL_KEY = new Text(CUSTOM_INTERVAL_KEY);
+ public static final Text WRITABLE_FIXED_INTERVAL_KEY = new Text(FIXED_INTERVAL_KEY);
}