You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by jn...@apache.org on 2010/03/30 10:30:29 UTC
svn commit: r929038 - in /lucene/nutch/trunk: ./ conf/
src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/indexer/
src/java/org/apache/nutch/parse/
Author: jnioche
Date: Tue Mar 30 08:30:28 2010
New Revision: 929038
URL: http://svn.apache.org/viewvc?rev=929038&view=rev
Log:
NUTCH-779 Mechanism for passing metadata from parse to crawldb
Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/conf/nutch-default.xml
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java
Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=929038&r1=929037&r2=929038&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Tue Mar 30 08:30:28 2010
@@ -2,6 +2,8 @@ Nutch Change Log
Unreleased Changes
+* NUTCH-779 Mechanism for passing metadata from parse to crawldb (jnioche)
+
* NUTCH-784 CrawlDBScanner (jnioche)
* NUTCH-762 Generator can generate several segments in one parse of the crawlDB (jnioche)
Modified: lucene/nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/nutch-default.xml?rev=929038&r1=929037&r2=929038&view=diff
==============================================================================
--- lucene/nutch/trunk/conf/nutch-default.xml (original)
+++ lucene/nutch/trunk/conf/nutch-default.xml Tue Mar 30 08:30:28 2010
@@ -479,6 +479,15 @@
</description>
</property>
+ <property>
+ <name>db.parsemeta.to.crawldb</name>
+ <value></value>
+ <description>Comma-separated list of parse metadata keys to transfer to the crawldb (NUTCH-779).
+ Assuming for instance that the languageidentifier plugin is enabled, setting the value to 'lang'
+ will copy both the key 'lang' and its value to the corresponding entry in the crawldb.
+ </description>
+</property>
+
<property>
<name>db.fetch.retry.max</name>
<value>3</value>
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java?rev=929038&r1=929037&r2=929038&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java Tue Mar 30 08:30:28 2010
@@ -82,6 +82,8 @@ public class CrawlDatum implements Writa
public static final byte STATUS_INJECTED = 0x42;
/** Page discovered through a link. */
public static final byte STATUS_LINKED = 0x43;
+ /** Marker datum carrying metadata produced by a parser, to be merged into the crawldb entry. */
+ public static final byte STATUS_PARSE_META = 0x44;
public static final HashMap<Byte, String> statNames = new HashMap<Byte, String>();
@@ -101,6 +103,7 @@ public class CrawlDatum implements Writa
statNames.put(STATUS_FETCH_REDIR_PERM, "fetch_redir_perm");
statNames.put(STATUS_FETCH_GONE, "fetch_gone");
statNames.put(STATUS_FETCH_NOTMODIFIED, "fetch_notmodified");
+ statNames.put(STATUS_PARSE_META, "parse_metadata");
oldToNew.put(OLD_STATUS_DB_UNFETCHED, STATUS_DB_UNFETCHED);
oldToNew.put(OLD_STATUS_DB_FETCHED, STATUS_DB_FETCHED);
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java?rev=929038&r1=929037&r2=929038&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java Tue Mar 30 08:30:28 2010
@@ -20,6 +20,7 @@ package org.apache.nutch.crawl;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
+import java.util.Map.Entry;
import java.io.IOException;
// Commons Logging imports
@@ -71,7 +72,8 @@ public class CrawlDbReducer implements R
byte[] signature = null;
boolean multiple = false; // avoid deep copy when only single value exists
linked.clear();
-
+ org.apache.hadoop.io.MapWritable metaFromParse = null;
+
while (values.hasNext()) {
CrawlDatum datum = (CrawlDatum)values.next();
if (!multiple && values.hasNext()) multiple = true;
@@ -120,6 +122,9 @@ public class CrawlDbReducer implements R
case CrawlDatum.STATUS_SIGNATURE:
signature = datum.getSignature();
break;
+ case CrawlDatum.STATUS_PARSE_META:
+ metaFromParse = datum.getMetaData();
+ break;
default:
LOG.warn("Unknown status, key: " + key + ", datum: " + datum);
}
@@ -233,6 +238,11 @@ public class CrawlDbReducer implements R
else result.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
}
result.setSignature(signature);
+ if (metaFromParse != null) {
+ for (Entry<Writable, Writable> e : metaFromParse.entrySet()) {
+ result.getMetaData().put(e.getKey(), e.getValue());
+ }
+ }
}
// if fetchInterval is larger than the system-wide maximum, trigger
// an unconditional recrawl. This prevents the page from being stuck at
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java?rev=929038&r1=929037&r2=929038&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java Tue Mar 30 08:30:28 2010
@@ -88,7 +88,8 @@ implements Mapper<Text, Writable, Text,
if (datum.getStatus() != CrawlDatum.STATUS_FETCH_NOTMODIFIED)
fetchDatum = datum;
} else if (CrawlDatum.STATUS_LINKED == datum.getStatus() ||
- CrawlDatum.STATUS_SIGNATURE == datum.getStatus()) {
+ CrawlDatum.STATUS_SIGNATURE == datum.getStatus() ||
+ CrawlDatum.STATUS_PARSE_META == datum.getStatus()) {
continue;
} else {
throw new RuntimeException("Unexpected status: "+datum.getStatus());
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java?rev=929038&r1=929037&r2=929038&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java Tue Mar 30 08:30:28 2010
@@ -98,6 +98,8 @@ public class ParseOutputFormat implement
Path data = new Path(new Path(out, ParseData.DIR_NAME), name);
Path crawl = new Path(new Path(out, CrawlDatum.PARSE_DIR_NAME), name);
+ final String[] parseMDtoCrawlDB = job.get("db.parsemeta.to.crawldb","").split(" *, *");
+
final MapFile.Writer textOut =
new MapFile.Writer(job, fs, text.toString(), Text.class, ParseText.class,
CompressionType.RECORD, progress);
@@ -133,6 +135,20 @@ public class ParseOutputFormat implement
crawlOut.append(key, d);
}
}
+
+ // see if the parse metadata contain things that we'd like
+ // to pass to the metadata of the crawlDB entry
+ CrawlDatum parseMDCrawlDatum = null;
+ for (String mdname : parseMDtoCrawlDB) {
+ String mdvalue = parse.getData().getParseMeta().get(mdname);
+ if (mdvalue != null) {
+ if (parseMDCrawlDatum == null) parseMDCrawlDatum = new CrawlDatum(
+ CrawlDatum.STATUS_PARSE_META, 0);
+ parseMDCrawlDatum.getMetaData().put(new Text(mdname),
+ new Text(mdvalue));
+ }
+ }
+ if (parseMDCrawlDatum != null) crawlOut.append(key, parseMDCrawlDatum);
try {
ParseStatus pstatus = parseData.getStatus();