You are viewing a plain text version of this content. The canonical link for it is here.

Posted to commits@nutch.apache.org by sn...@apache.org on 2017/12/05 09:39:40 UTC

[nutch] branch master updated (55c7f75 -> 3f0ecdf)

This is an automated email from the ASF dual-hosted git repository.

snagel pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git.


    from 55c7f75  Merge pull request #244 from jorgelbg/NUTCH-2464
     new a7bc1a8  NUTCH-2456: Redirected documents are not indexed
     new 17a4cb5  Code style fixes.
     new 9854f7a  Allow index removals even if dbDatum is null.
     new 4592eb6  Fix for previous commit
     new 3f0ecdf  NUTCH-2456 Allow to index pages/URLs not contained in CrawlDb

The 5 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 .../org/apache/nutch/indexer/IndexerMapReduce.java | 49 +++++++++++-----------
 1 file changed, 25 insertions(+), 24 deletions(-)

-- 
To stop receiving notification emails like this one, please contact
"commits@nutch.apache.org" <co...@nutch.apache.org>.

[nutch] 02/05: Code style fixes.

Posted by sn...@apache.org.

This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git

commit 17a4cb5d772649e19cd60037567af53e216cf41b
Author: YossiTamari <33...@users.noreply.github.com>
AuthorDate: Wed Nov 8 17:02:17 2017 +0200

    Code style fixes.
---
 .../org/apache/nutch/indexer/IndexerMapReduce.java | 42 +++++++++++-----------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/src/java/org/apache/nutch/indexer/IndexerMapReduce.java b/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
index 9598a89..12d379e 100644
--- a/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
+++ b/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
@@ -261,14 +261,14 @@ public class IndexerMapReduce extends Configured implements
     }
 
     // Whether to delete pages marked as duplicates
-    if (delete && dbDatum!=null && dbDatum.getStatus() == CrawlDatum.STATUS_DB_DUPLICATE) {
+    if (delete && dbDatum != null && dbDatum.getStatus() == CrawlDatum.STATUS_DB_DUPLICATE) {
       reporter.incrCounter("IndexerStatus", "deleted (duplicates)", 1);
       output.collect(key, DELETE_ACTION);
       return;
     }
 
     // Whether to skip DB_NOTMODIFIED pages
-    if (skip && dbDatum!=null && dbDatum.getStatus() == CrawlDatum.STATUS_DB_NOTMODIFIED) {
+    if (skip && dbDatum != null && dbDatum.getStatus() == CrawlDatum.STATUS_DB_NOTMODIFIED) {
       reporter.incrCounter("IndexerStatus", "skipped (not modified)", 1);
       return;
     }
@@ -308,25 +308,25 @@ public class IndexerMapReduce extends Configured implements
     doc.add("boost", Float.toString(boost));
 
     try {
-      if (dbDatum!=null) {
-	      // Indexing filters may also be interested in the signature
-	      fetchDatum.setSignature(dbDatum.getSignature());
-	      
-	      // extract information from dbDatum and pass it to
-	      // fetchDatum so that indexing filters can use it
-	      final Text url = (Text) dbDatum.getMetaData().get(
-	          Nutch.WRITABLE_REPR_URL_KEY);
-	      if (url != null) {
-	        // Representation URL also needs normalization and filtering.
-	        // If repr URL is excluded by filters we still accept this document
-	        // but represented by its primary URL ("key") which has passed URL
-	        // filters.
-	        String urlString = filterUrl(normalizeUrl(url.toString()));
-	        if (urlString != null) {
-	          url.set(urlString);
-	          fetchDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, url);
-	        }
-	      }
+      if (dbDatum != null) {
+        // Indexing filters may also be interested in the signature
+        fetchDatum.setSignature(dbDatum.getSignature());
+        
+        // extract information from dbDatum and pass it to
+        // fetchDatum so that indexing filters can use it
+        final Text url = (Text) dbDatum.getMetaData().get(
+            Nutch.WRITABLE_REPR_URL_KEY);
+        if (url != null) {
+          // Representation URL also needs normalization and filtering.
+          // If repr URL is excluded by filters we still accept this document
+          // but represented by its primary URL ("key") which has passed URL
+          // filters.
+          String urlString = filterUrl(normalizeUrl(url.toString()));
+          if (urlString != null) {
+            url.set(urlString);
+            fetchDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, url);
+          }
+        }
       }
       // run indexing filters
       doc = this.filters.filter(doc, parse, key, fetchDatum, inlinks);

-- 
To stop receiving notification emails like this one, please contact
"commits@nutch.apache.org" <co...@nutch.apache.org>.

[nutch] 05/05: NUTCH-2456 Allow to index pages/URLs not contained in CrawlDb

Posted by sn...@apache.org.

This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git

commit 3f0ecdf97cb9a8ddb7b17aee1deea74da3ed5b74
Merge: 55c7f75 4592eb6
Author: Sebastian Nagel <sn...@apache.org>
AuthorDate: Tue Dec 5 10:12:32 2017 +0100

    NUTCH-2456 Allow to index pages/URLs not contained in CrawlDb
    
    Merge branch 'YossiTamari/nutch-2456', closes #240

 .../org/apache/nutch/indexer/IndexerMapReduce.java | 49 +++++++++++-----------
 1 file changed, 25 insertions(+), 24 deletions(-)

-- 
To stop receiving notification emails like this one, please contact
"commits@nutch.apache.org" <co...@nutch.apache.org>.

[nutch] 04/05: Fix for previous commit

Posted by sn...@apache.org.

This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git

commit 4592eb695edd0b032dbf37678039368925b3c47d
Author: YossiTamari <33...@users.noreply.github.com>
AuthorDate: Wed Nov 8 17:55:00 2017 +0200

    Fix for previous commit
---
 src/java/org/apache/nutch/indexer/IndexerMapReduce.java | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/java/org/apache/nutch/indexer/IndexerMapReduce.java b/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
index 7e3438c..00829c4 100644
--- a/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
+++ b/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
@@ -238,16 +238,16 @@ public class IndexerMapReduce extends Configured implements
     }
 
     // Whether to delete GONE or REDIRECTS
-    if (delete) {
-      if (fetchDatum != null && fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_GONE
+    if (delete && fetchDatum != null) {
+      if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_GONE
           || dbDatum != null && dbDatum.getStatus() == CrawlDatum.STATUS_DB_GONE) {
         reporter.incrCounter("IndexerStatus", "deleted (gone)", 1);
         output.collect(key, DELETE_ACTION);
         return;
       }
 
-      if (fetchDatum != null && fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_PERM
-          || fetchDatum != null && fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_TEMP
+      if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_PERM
+          || fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_TEMP
           || dbDatum != null && dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_PERM
           || dbDatum != null && dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_TEMP) {
         reporter.incrCounter("IndexerStatus", "deleted (redirects)", 1);

-- 
To stop receiving notification emails like this one, please contact
"commits@nutch.apache.org" <co...@nutch.apache.org>.

[nutch] 01/05: NUTCH-2456: Redirected documents are not indexed

Posted by sn...@apache.org.

This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git

commit a7bc1a8c5a3a5ab9c72574afd98089a354bf0484
Author: YossiTamari <33...@users.noreply.github.com>
AuthorDate: Tue Nov 7 12:13:41 2017 +0200

    NUTCH-2456: Redirected documents are not indexed
    
    This is a defensive, minimal approach for fixing this issue.
---
 .../org/apache/nutch/indexer/IndexerMapReduce.java | 43 +++++++++++-----------
 1 file changed, 22 insertions(+), 21 deletions(-)

diff --git a/src/java/org/apache/nutch/indexer/IndexerMapReduce.java b/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
index cb6e121..9598a89 100644
--- a/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
+++ b/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
@@ -256,20 +256,19 @@ public class IndexerMapReduce extends Configured implements
       }
     }
 
-    if (fetchDatum == null || dbDatum == null || parseText == null
-        || parseData == null) {
+    if (fetchDatum == null || parseText == null || parseData == null) {
       return; // only have inlinks
     }
 
     // Whether to delete pages marked as duplicates
-    if (delete && dbDatum.getStatus() == CrawlDatum.STATUS_DB_DUPLICATE) {
+    if (delete && dbDatum!=null && dbDatum.getStatus() == CrawlDatum.STATUS_DB_DUPLICATE) {
       reporter.incrCounter("IndexerStatus", "deleted (duplicates)", 1);
       output.collect(key, DELETE_ACTION);
       return;
     }
 
     // Whether to skip DB_NOTMODIFIED pages
-    if (skip && dbDatum.getStatus() == CrawlDatum.STATUS_DB_NOTMODIFIED) {
+    if (skip && dbDatum!=null && dbDatum.getStatus() == CrawlDatum.STATUS_DB_NOTMODIFIED) {
       reporter.incrCounter("IndexerStatus", "skipped (not modified)", 1);
       return;
     }
@@ -309,23 +308,25 @@ public class IndexerMapReduce extends Configured implements
     doc.add("boost", Float.toString(boost));
 
     try {
-      // Indexing filters may also be interested in the signature
-      fetchDatum.setSignature(dbDatum.getSignature());
-      
-      // extract information from dbDatum and pass it to
-      // fetchDatum so that indexing filters can use it
-      final Text url = (Text) dbDatum.getMetaData().get(
-          Nutch.WRITABLE_REPR_URL_KEY);
-      if (url != null) {
-        // Representation URL also needs normalization and filtering.
-        // If repr URL is excluded by filters we still accept this document
-        // but represented by its primary URL ("key") which has passed URL
-        // filters.
-        String urlString = filterUrl(normalizeUrl(url.toString()));
-        if (urlString != null) {
-          url.set(urlString);
-          fetchDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, url);
-        }
+      if (dbDatum!=null) {
+	      // Indexing filters may also be interested in the signature
+	      fetchDatum.setSignature(dbDatum.getSignature());
+	      
+	      // extract information from dbDatum and pass it to
+	      // fetchDatum so that indexing filters can use it
+	      final Text url = (Text) dbDatum.getMetaData().get(
+	          Nutch.WRITABLE_REPR_URL_KEY);
+	      if (url != null) {
+	        // Representation URL also needs normalization and filtering.
+	        // If repr URL is excluded by filters we still accept this document
+	        // but represented by its primary URL ("key") which has passed URL
+	        // filters.
+	        String urlString = filterUrl(normalizeUrl(url.toString()));
+	        if (urlString != null) {
+	          url.set(urlString);
+	          fetchDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, url);
+	        }
+	      }
       }
       // run indexing filters
       doc = this.filters.filter(doc, parse, key, fetchDatum, inlinks);

-- 
To stop receiving notification emails like this one, please contact
"commits@nutch.apache.org" <co...@nutch.apache.org>.

[nutch] 03/05: Allow index removals even if dbDatum is null.

Posted by sn...@apache.org.

This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git

commit 9854f7af644a68db884f1b03eaf69359019e212e
Author: YossiTamari <33...@users.noreply.github.com>
AuthorDate: Wed Nov 8 17:13:05 2017 +0200

    Allow index removals even if dbDatum is null.
---
 src/java/org/apache/nutch/indexer/IndexerMapReduce.java | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/java/org/apache/nutch/indexer/IndexerMapReduce.java b/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
index 12d379e..7e3438c 100644
--- a/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
+++ b/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
@@ -238,18 +238,18 @@ public class IndexerMapReduce extends Configured implements
     }
 
     // Whether to delete GONE or REDIRECTS
-    if (delete && fetchDatum != null && dbDatum != null) {
-      if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_GONE
-          || dbDatum.getStatus() == CrawlDatum.STATUS_DB_GONE) {
+    if (delete) {
+      if (fetchDatum != null && fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_GONE
+          || dbDatum != null && dbDatum.getStatus() == CrawlDatum.STATUS_DB_GONE) {
         reporter.incrCounter("IndexerStatus", "deleted (gone)", 1);
         output.collect(key, DELETE_ACTION);
         return;
       }
 
-      if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_PERM
-          || fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_TEMP
-          || dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_PERM
-          || dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_TEMP) {
+      if (fetchDatum != null && fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_PERM
+          || fetchDatum != null && fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_TEMP
+          || dbDatum != null && dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_PERM
+          || dbDatum != null && dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_TEMP) {
         reporter.incrCounter("IndexerStatus", "deleted (redirects)", 1);
         output.collect(key, DELETE_ACTION);
         return;

-- 
To stop receiving notification emails like this one, please contact
"commits@nutch.apache.org" <co...@nutch.apache.org>.