You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2015/07/01 08:56:32 UTC
svn commit: r1688561 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/crawl/CrawlDbReducer.java
Author: markus
Date: Wed Jul 1 06:56:32 2015
New Revision: 1688561
URL: http://svn.apache.org/r1688561
Log:
NUTCH-1684 ParseMeta to be added before fetch schedulers are run
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1688561&r1=1688560&r2=1688561&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed Jul 1 06:56:32 2015
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Current Development 1.11-SNAPSHOT
+* NUTCH-1684 ParseMeta to be added before fetch schedulers are run (markus)
+
* NUTCH-2038 fix for NUTCH-2038: Naive Bayes classifier based html Parse filter (for filtering outlinks)
(Asitang Mishra, snagel via mattmann)
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java?rev=1688561&r1=1688560&r2=1688561&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java Wed Jul 1 06:56:32 2015
@@ -209,6 +209,13 @@ public class CrawlDbReducer implements
case CrawlDatum.STATUS_FETCH_REDIR_TEMP: // successful fetch, redirected
case CrawlDatum.STATUS_FETCH_REDIR_PERM:
case CrawlDatum.STATUS_FETCH_NOTMODIFIED: // successful fetch, notmodified
+ // https://issues.apache.org/jira/browse/NUTCH-1656
+ if (metaFromParse != null) {
+ for (Entry<Writable, Writable> e : metaFromParse.entrySet()) {
+ result.getMetaData().put(e.getKey(), e.getValue());
+ }
+ }
+
// determine the modification status
int modified = FetchSchedule.STATUS_UNKNOWN;
if (fetch.getStatus() == CrawlDatum.STATUS_FETCH_NOTMODIFIED) {
@@ -260,13 +267,6 @@ public class CrawlDbReducer implements
result.setSignature(signature);
}
- // https://issues.apache.org/jira/browse/NUTCH-1656
- if (metaFromParse != null) {
- for (Entry<Writable, Writable> e : metaFromParse.entrySet()) {
- result.getMetaData().put(e.getKey(), e.getValue());
- }
- }
-
// if fetchInterval is larger than the system-wide maximum, trigger
// an unconditional recrawl. This prevents the page to be stuck at
// NOTMODIFIED state, when the old fetched copy was already removed with