You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2017/12/05 09:39:40 UTC
[nutch] branch master updated (55c7f75 -> 3f0ecdf)
This is an automated email from the ASF dual-hosted git repository.
snagel pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git.
from 55c7f75 Merge pull request #244 from jorgelbg/NUTCH-2464
new a7bc1a8 NUTCH-2456: Redirected documents are not indexed
new 17a4cb5 Code style fixes.
new 9854f7a Allow index removals even if dbDatum is null.
new 4592eb6 Fix for previous commit
new 3f0ecdf NUTCH-2456 Allow to index pages/URLs not contained in CrawlDb
The 5 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails. The revisions
listed as "add" were already present in the repository and have only
been added to this reference.
Summary of changes:
.../org/apache/nutch/indexer/IndexerMapReduce.java | 49 +++++++++++-----------
1 file changed, 25 insertions(+), 24 deletions(-)
--
To stop receiving notification emails like this one, please contact
"commits@nutch.apache.org" <co...@nutch.apache.org>.
[nutch] 02/05: Code style fixes.
Posted by sn...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
commit 17a4cb5d772649e19cd60037567af53e216cf41b
Author: YossiTamari <33...@users.noreply.github.com>
AuthorDate: Wed Nov 8 17:02:17 2017 +0200
Code style fixes.
---
.../org/apache/nutch/indexer/IndexerMapReduce.java | 42 +++++++++++-----------
1 file changed, 21 insertions(+), 21 deletions(-)
diff --git a/src/java/org/apache/nutch/indexer/IndexerMapReduce.java b/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
index 9598a89..12d379e 100644
--- a/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
+++ b/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
@@ -261,14 +261,14 @@ public class IndexerMapReduce extends Configured implements
}
// Whether to delete pages marked as duplicates
- if (delete && dbDatum!=null && dbDatum.getStatus() == CrawlDatum.STATUS_DB_DUPLICATE) {
+ if (delete && dbDatum != null && dbDatum.getStatus() == CrawlDatum.STATUS_DB_DUPLICATE) {
reporter.incrCounter("IndexerStatus", "deleted (duplicates)", 1);
output.collect(key, DELETE_ACTION);
return;
}
// Whether to skip DB_NOTMODIFIED pages
- if (skip && dbDatum!=null && dbDatum.getStatus() == CrawlDatum.STATUS_DB_NOTMODIFIED) {
+ if (skip && dbDatum != null && dbDatum.getStatus() == CrawlDatum.STATUS_DB_NOTMODIFIED) {
reporter.incrCounter("IndexerStatus", "skipped (not modified)", 1);
return;
}
@@ -308,25 +308,25 @@ public class IndexerMapReduce extends Configured implements
doc.add("boost", Float.toString(boost));
try {
- if (dbDatum!=null) {
- // Indexing filters may also be interested in the signature
- fetchDatum.setSignature(dbDatum.getSignature());
-
- // extract information from dbDatum and pass it to
- // fetchDatum so that indexing filters can use it
- final Text url = (Text) dbDatum.getMetaData().get(
- Nutch.WRITABLE_REPR_URL_KEY);
- if (url != null) {
- // Representation URL also needs normalization and filtering.
- // If repr URL is excluded by filters we still accept this document
- // but represented by its primary URL ("key") which has passed URL
- // filters.
- String urlString = filterUrl(normalizeUrl(url.toString()));
- if (urlString != null) {
- url.set(urlString);
- fetchDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, url);
- }
- }
+ if (dbDatum != null) {
+ // Indexing filters may also be interested in the signature
+ fetchDatum.setSignature(dbDatum.getSignature());
+
+ // extract information from dbDatum and pass it to
+ // fetchDatum so that indexing filters can use it
+ final Text url = (Text) dbDatum.getMetaData().get(
+ Nutch.WRITABLE_REPR_URL_KEY);
+ if (url != null) {
+ // Representation URL also needs normalization and filtering.
+ // If repr URL is excluded by filters we still accept this document
+ // but represented by its primary URL ("key") which has passed URL
+ // filters.
+ String urlString = filterUrl(normalizeUrl(url.toString()));
+ if (urlString != null) {
+ url.set(urlString);
+ fetchDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, url);
+ }
+ }
}
// run indexing filters
doc = this.filters.filter(doc, parse, key, fetchDatum, inlinks);
--
To stop receiving notification emails like this one, please contact
"commits@nutch.apache.org" <co...@nutch.apache.org>.
[nutch] 05/05: NUTCH-2456 Allow to index pages/URLs not contained in CrawlDb
Posted by sn...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
commit 3f0ecdf97cb9a8ddb7b17aee1deea74da3ed5b74
Merge: 55c7f75 4592eb6
Author: Sebastian Nagel <sn...@apache.org>
AuthorDate: Tue Dec 5 10:12:32 2017 +0100
NUTCH-2456 Allow to index pages/URLs not contained in CrawlDb
Merge branch 'YossiTamari/nutch-2456', closes #240
.../org/apache/nutch/indexer/IndexerMapReduce.java | 49 +++++++++++-----------
1 file changed, 25 insertions(+), 24 deletions(-)
--
To stop receiving notification emails like this one, please contact
"commits@nutch.apache.org" <co...@nutch.apache.org>.
[nutch] 04/05: Fix for previous commit
Posted by sn...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
commit 4592eb695edd0b032dbf37678039368925b3c47d
Author: YossiTamari <33...@users.noreply.github.com>
AuthorDate: Wed Nov 8 17:55:00 2017 +0200
Fix for previous commit
---
src/java/org/apache/nutch/indexer/IndexerMapReduce.java | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/src/java/org/apache/nutch/indexer/IndexerMapReduce.java b/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
index 7e3438c..00829c4 100644
--- a/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
+++ b/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
@@ -238,16 +238,16 @@ public class IndexerMapReduce extends Configured implements
}
// Whether to delete GONE or REDIRECTS
- if (delete) {
- if (fetchDatum != null && fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_GONE
+ if (delete && fetchDatum != null) {
+ if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_GONE
|| dbDatum != null && dbDatum.getStatus() == CrawlDatum.STATUS_DB_GONE) {
reporter.incrCounter("IndexerStatus", "deleted (gone)", 1);
output.collect(key, DELETE_ACTION);
return;
}
- if (fetchDatum != null && fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_PERM
- || fetchDatum != null && fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_TEMP
+ if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_PERM
+ || fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_TEMP
|| dbDatum != null && dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_PERM
|| dbDatum != null && dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_TEMP) {
reporter.incrCounter("IndexerStatus", "deleted (redirects)", 1);
--
To stop receiving notification emails like this one, please contact
"commits@nutch.apache.org" <co...@nutch.apache.org>.
[nutch] 01/05: NUTCH-2456: Redirected documents are not indexed
Posted by sn...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
commit a7bc1a8c5a3a5ab9c72574afd98089a354bf0484
Author: YossiTamari <33...@users.noreply.github.com>
AuthorDate: Tue Nov 7 12:13:41 2017 +0200
NUTCH-2456: Redirected documents are not indexed
This is a defensive, minimal approach for fixing this issue.
---
.../org/apache/nutch/indexer/IndexerMapReduce.java | 43 +++++++++++-----------
1 file changed, 22 insertions(+), 21 deletions(-)
diff --git a/src/java/org/apache/nutch/indexer/IndexerMapReduce.java b/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
index cb6e121..9598a89 100644
--- a/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
+++ b/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
@@ -256,20 +256,19 @@ public class IndexerMapReduce extends Configured implements
}
}
- if (fetchDatum == null || dbDatum == null || parseText == null
- || parseData == null) {
+ if (fetchDatum == null || parseText == null || parseData == null) {
return; // only have inlinks
}
// Whether to delete pages marked as duplicates
- if (delete && dbDatum.getStatus() == CrawlDatum.STATUS_DB_DUPLICATE) {
+ if (delete && dbDatum!=null && dbDatum.getStatus() == CrawlDatum.STATUS_DB_DUPLICATE) {
reporter.incrCounter("IndexerStatus", "deleted (duplicates)", 1);
output.collect(key, DELETE_ACTION);
return;
}
// Whether to skip DB_NOTMODIFIED pages
- if (skip && dbDatum.getStatus() == CrawlDatum.STATUS_DB_NOTMODIFIED) {
+ if (skip && dbDatum!=null && dbDatum.getStatus() == CrawlDatum.STATUS_DB_NOTMODIFIED) {
reporter.incrCounter("IndexerStatus", "skipped (not modified)", 1);
return;
}
@@ -309,23 +308,25 @@ public class IndexerMapReduce extends Configured implements
doc.add("boost", Float.toString(boost));
try {
- // Indexing filters may also be interested in the signature
- fetchDatum.setSignature(dbDatum.getSignature());
-
- // extract information from dbDatum and pass it to
- // fetchDatum so that indexing filters can use it
- final Text url = (Text) dbDatum.getMetaData().get(
- Nutch.WRITABLE_REPR_URL_KEY);
- if (url != null) {
- // Representation URL also needs normalization and filtering.
- // If repr URL is excluded by filters we still accept this document
- // but represented by its primary URL ("key") which has passed URL
- // filters.
- String urlString = filterUrl(normalizeUrl(url.toString()));
- if (urlString != null) {
- url.set(urlString);
- fetchDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, url);
- }
+ if (dbDatum!=null) {
+ // Indexing filters may also be interested in the signature
+ fetchDatum.setSignature(dbDatum.getSignature());
+
+ // extract information from dbDatum and pass it to
+ // fetchDatum so that indexing filters can use it
+ final Text url = (Text) dbDatum.getMetaData().get(
+ Nutch.WRITABLE_REPR_URL_KEY);
+ if (url != null) {
+ // Representation URL also needs normalization and filtering.
+ // If repr URL is excluded by filters we still accept this document
+ // but represented by its primary URL ("key") which has passed URL
+ // filters.
+ String urlString = filterUrl(normalizeUrl(url.toString()));
+ if (urlString != null) {
+ url.set(urlString);
+ fetchDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, url);
+ }
+ }
}
// run indexing filters
doc = this.filters.filter(doc, parse, key, fetchDatum, inlinks);
--
To stop receiving notification emails like this one, please contact
"commits@nutch.apache.org" <co...@nutch.apache.org>.
[nutch] 03/05: Allow index removals even if dbDatum is null.
Posted by sn...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
commit 9854f7af644a68db884f1b03eaf69359019e212e
Author: YossiTamari <33...@users.noreply.github.com>
AuthorDate: Wed Nov 8 17:13:05 2017 +0200
Allow index removals even if dbDatum is null.
---
src/java/org/apache/nutch/indexer/IndexerMapReduce.java | 14 +++++++-------
1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/src/java/org/apache/nutch/indexer/IndexerMapReduce.java b/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
index 12d379e..7e3438c 100644
--- a/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
+++ b/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
@@ -238,18 +238,18 @@ public class IndexerMapReduce extends Configured implements
}
// Whether to delete GONE or REDIRECTS
- if (delete && fetchDatum != null && dbDatum != null) {
- if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_GONE
- || dbDatum.getStatus() == CrawlDatum.STATUS_DB_GONE) {
+ if (delete) {
+ if (fetchDatum != null && fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_GONE
+ || dbDatum != null && dbDatum.getStatus() == CrawlDatum.STATUS_DB_GONE) {
reporter.incrCounter("IndexerStatus", "deleted (gone)", 1);
output.collect(key, DELETE_ACTION);
return;
}
- if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_PERM
- || fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_TEMP
- || dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_PERM
- || dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_TEMP) {
+ if (fetchDatum != null && fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_PERM
+ || fetchDatum != null && fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_TEMP
+ || dbDatum != null && dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_PERM
+ || dbDatum != null && dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_TEMP) {
reporter.incrCounter("IndexerStatus", "deleted (redirects)", 1);
output.collect(key, DELETE_ACTION);
return;
--
To stop receiving notification emails like this one, please contact
"commits@nutch.apache.org" <co...@nutch.apache.org>.