You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2012/07/05 18:57:57 UTC
svn commit: r1357739 - in /nutch/trunk: CHANGES.txt conf/nutch-default.xml
src/java/org/apache/nutch/crawl/Injector.java
src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java
src/test/org/apache/nutch/crawl/TestInjector.java
Author: markus
Date: Thu Jul 5 16:57:57 2012
New Revision: 1357739
URL: http://svn.apache.org/viewvc?rev=1357739&view=rev
Log:
NUTCH-1405 Allow to overwrite CrawlDatum's with injected entries
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/conf/nutch-default.xml
nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java
nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java
nutch/trunk/src/test/org/apache/nutch/crawl/TestInjector.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1357739&r1=1357738&r2=1357739&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Thu Jul 5 16:57:57 2012
@@ -2,6 +2,8 @@ Nutch Change Log
(trunk) Current Development:
+* NUTCH-1405 Allow to overwrite CrawlDatum's with injected entries (markus)
+
* NUTCH-1412 Upgrade commons lang (markus)
* NUTCH-1251 SolrDedup to use proper Lucene catch-all query (Arkadi Kosmynin via markus)
Modified: nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1357739&r1=1357738&r2=1357739&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Thu Jul 5 16:57:57 2012
@@ -468,6 +468,23 @@
</description>
</property>
+ <property>
+ <name>db.injector.overwrite</name>
+ <value>false</value>
+ <description>Whether existing records in the CrawlDB will be overwritten
+ by injected records.
+ </description>
+</property>
+
+<property>
+ <name>db.injector.update</name>
+ <value>false</value>
+ <description>If true existing records in the CrawlDB will be updated with
+ injected records. Old metadata is preserved. The db.injector.overwrite
+ parameter takes precedence.
+ </description>
+</property>
+
<property>
<name>db.score.injected</name>
<value>1.0</value>
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java?rev=1357739&r1=1357738&r2=1357739&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java Thu Jul 5 16:57:57 2012
@@ -152,7 +152,18 @@ public class Injector extends Configured
/** Combine multiple new entries for a url. */
public static class InjectReducer implements Reducer<Text, CrawlDatum, Text, CrawlDatum> {
- public void configure(JobConf job) {}
+ private int interval;
+ private float scoreInjected;
+ private boolean overwrite = false;
+ private boolean update = false;
+
+    /**
+     * Caches the job settings the reducer consults when an injected URL
+     * collides with a record that is already present: the two NUTCH-1405
+     * switches plus the default score and fetch interval.
+     */
+    public void configure(JobConf job) {
+      this.overwrite = job.getBoolean("db.injector.overwrite", false);
+      this.update = job.getBoolean("db.injector.update", false);
+      this.scoreInjected = job.getFloat("db.score.injected", 1.0f);
+      this.interval = job.getInt("db.fetch.interval.default", 2592000);
+    }
+
public void close() {}
private CrawlDatum old = new CrawlDatum();
@@ -162,19 +173,48 @@ public class Injector extends Configured
OutputCollector<Text, CrawlDatum> output, Reporter reporter)
throws IOException {
boolean oldSet = false;
+ boolean injectedSet = false;
while (values.hasNext()) {
CrawlDatum val = values.next();
if (val.getStatus() == CrawlDatum.STATUS_INJECTED) {
injected.set(val);
injected.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
+ injectedSet = true;
} else {
old.set(val);
oldSet = true;
}
}
CrawlDatum res = null;
- if (oldSet) res = old; // don't overwrite existing value
- else res = injected;
+
+      // Decide which record to emit for this URL: keep the existing one,
+      // overwrite it with the injected one, or update it in place.
+      // See https://issues.apache.org/jira/browse/NUTCH-1405
+      //
+      // The branches are mutually exclusive on purpose. Previously the
+      // "old default behaviour" if/else ran last and unconditionally reset
+      // res to old whenever an old record existed, which silently cancelled
+      // db.injector.overwrite.
+      if (injectedSet && !oldSet) {
+        // Old default behaviour: URL is new to the CrawlDb
+        res = injected;
+      } else if (injectedSet && oldSet && overwrite) {
+        // Injected record already exists and overwrite is requested;
+        // overwrite has precedence over update
+        res = injected;
+
+        if (update) {
+          LOG.info(key.toString() + " overwritten with injected record but update was specified.");
+        }
+      } else {
+        // Old default behaviour: don't overwrite an existing value
+        res = old;
+
+        // Injected record already exists and update but not overwrite
+        if (injectedSet && oldSet && update) {
+          old.putAllMetaData(injected);
+          // Take score/interval from the injected record only when they differ
+          // from the configured defaults; otherwise keep the existing values
+          old.setScore(injected.getScore() != scoreInjected ? injected.getScore() : old.getScore());
+          old.setFetchInterval(injected.getFetchInterval() != interval ? injected.getFetchInterval() : old.getFetchInterval());
+        }
+      }
output.collect(key, res);
}
Modified: nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java?rev=1357739&r1=1357738&r2=1357739&view=diff
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java (original)
+++ nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java Thu Jul 5 16:57:57 2012
@@ -18,6 +18,7 @@ package org.apache.nutch.crawl;
import java.io.IOException;
import java.net.UnknownHostException;
+import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
@@ -104,22 +105,43 @@ public class CrawlDBTestUtil {
this.datum = datum;
}
}
+
+  /**
+   * Generate a seedlist that carries URLs only.
+   * Delegates to the four-argument overload with an empty metadata list,
+   * so no metadata is appended to any line.
+   * @throws IOException
+   */
+  public static void generateSeedList(FileSystem fs, Path urlPath, List<String> urls) throws IOException{
+    List<String> noMetadata = new ArrayList<String>();
+    generateSeedList(fs, urlPath, urls, noMetadata);
+  }
/**
* Generate seedlist
+ * Creates the urlPath directory and writes a file named urls.txt in it,
+ * one line per entry of urls. The i-th metadata string, when present, is
+ * written verbatim directly after the i-th URL (no separator is inserted
+ * here); URLs beyond the end of the metadata list get no metadata.
* @throws IOException
*/
- public static void generateSeedList(FileSystem fs, Path urlPath, List<String> contents) throws IOException{
+ public static void generateSeedList(FileSystem fs, Path urlPath, List<String> urls, List<String>metadata) throws IOException{
FSDataOutputStream out;
Path file=new Path(urlPath,"urls.txt");
fs.mkdirs(urlPath);
out=fs.create(file);
- Iterator<String> iterator=contents.iterator();
- while(iterator.hasNext()){
- String url=iterator.next();
+
+ // Walk both lists in step; metadata may be shorter than urls
+ Iterator<String> urls_i=urls.iterator();
+ Iterator<String> metadata_i=metadata.iterator();
+
+ String url;
+ String md;
+ while(urls_i.hasNext()){
+ url=urls_i.next();
+
out.writeBytes(url);
+
+ // Append this URL's metadata only while the metadata list has entries left
+ if (metadata_i.hasNext()) {
+ md = metadata_i.next();
+ out.writeBytes(md);
+ }
+
out.writeBytes("\n");
}
+
out.flush();
out.close();
}
Modified: nutch/trunk/src/test/org/apache/nutch/crawl/TestInjector.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/crawl/TestInjector.java?rev=1357739&r1=1357738&r2=1357739&view=diff
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/crawl/TestInjector.java (original)
+++ nutch/trunk/src/test/org/apache/nutch/crawl/TestInjector.java Thu Jul 5 16:57:57 2012
@@ -19,7 +19,9 @@ package org.apache.nutch.crawl;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
+import java.util.HashMap;
import java.util.List;
+import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
@@ -62,10 +64,13 @@ public class TestInjector extends TestCa
public void testInject() throws IOException {
ArrayList<String> urls=new ArrayList<String>();
+ // We'll use a separate list for MD so we can still compare url with containsAll
+ ArrayList<String> metadata=new ArrayList<String>();
for(int i=0;i<100;i++) {
urls.add("http://zzz.com/" + i + ".html");
+ metadata.add("\tnutch.score=2." + i + "\tnutch.fetchInterval=171717\tkey=value");
}
- CrawlDBTestUtil.generateSeedList(fs, urlPath, urls);
+ CrawlDBTestUtil.generateSeedList(fs, urlPath, urls, metadata);
Injector injector=new Injector(conf);
injector.inject(crawldbPath, urlPath);
@@ -85,23 +90,44 @@ public class TestInjector extends TestCa
ArrayList<String> urls2=new ArrayList<String>();
for(int i=0;i<100;i++) {
urls2.add("http://xxx.com/" + i + ".html");
+ // We'll overwrite previously injected records but preserve their original MD
+ urls2.add("http://zzz.com/" + i + ".html");
}
CrawlDBTestUtil.generateSeedList(fs, urlPath, urls2);
+ injector=new Injector(conf);
+ conf.setBoolean("db.injector.update", true);
injector.inject(crawldbPath, urlPath);
urls.addAll(urls2);
// verify results
read=readCrawldb();
-
Collections.sort(read);
Collections.sort(urls);
- assertEquals(urls.size(), read.size());
+ // We should have 100 less records because we've overwritten
+ assertEquals(urls.size() - 100, read.size());
assertTrue(read.containsAll(urls));
assertTrue(urls.containsAll(read));
+ // Check if we correctly preserved MD
+ Map<String, CrawlDatum> records = readCrawldbRecords();
+
+ // Iterate over the urls, we're looking for http://zzz.com/ prefixed URLs
+ // so we can check for MD and score and interval
+ Text writableKey = new Text("key");
+ Text writableValue = new Text("value");
+ for (String url : urls) {
+ if (url.indexOf("http://zzz") == 0) {
+ // Check for fetch interval
+ assertTrue(records.get(url).getFetchInterval() == 171717);
+ // Check for default score
+ assertTrue(records.get(url).getScore() != 1.0);
+ // Check for MD key=value
+ assertEquals(writableValue, records.get(url).getMetaData().get(writableKey));
+ }
+ }
}
private List<String> readCrawldb() throws IOException{
@@ -120,5 +146,21 @@ public class TestInjector extends TestCa
return read;
}
+
+  /**
+   * Reads all records from the part-00000 data file of the current CrawlDb
+   * into a map keyed by the record's URL string.
+   *
+   * @return map from URL to the CrawlDatum stored for it
+   * @throws IOException if the file cannot be opened or read
+   */
+  private HashMap<String,CrawlDatum> readCrawldbRecords() throws IOException{
+    Path dbfile=new Path(crawldbPath,CrawlDb.CURRENT_NAME + "/part-00000/data");
+    System.out.println("reading:" + dbfile);
+    SequenceFile.Reader reader=new SequenceFile.Reader(fs, dbfile, conf);
+    HashMap<String,CrawlDatum> read=new HashMap<String,CrawlDatum>();
+
+    try {
+      while (true) {
+        // A fresh key/value pair per record: the value object is stored in
+        // the map, so it must not be reused for the next read
+        Text key=new Text();
+        CrawlDatum value=new CrawlDatum();
+        if (!reader.next(key, value)) {
+          break;
+        }
+        read.put(key.toString(), value);
+      }
+    } finally {
+      // Release the underlying stream even when a read fails
+      reader.close();
+    }
+    return read;
+  }
}