You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2017/12/18 15:49:54 UTC
[nutch] 10/23: make fully configurable
This is an automated email from the ASF dual-hosted git repository.
snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
commit e0326de05197f8415eeb750d4d8fff764db87aa9
Author: Nicola Marcacci Rossi <ni...@gmail.com>
AuthorDate: Fri Dec 15 14:18:57 2017 +0100
make fully configurable
---
conf/nutch-default.xml | 20 ++++++++++++++--
.../elasticrest/ElasticRestConstants.java | 2 ++
.../elasticrest/ElasticRestIndexWriter.java | 28 ++++++++++++++++++----
3 files changed, 43 insertions(+), 7 deletions(-)
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index bcb2e9e..1d9837f 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -2122,12 +2122,28 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter-->
A list of strings denoting the supported languages (e.g. `en,de,fr,it`).
If this value is empty all documents will be sent to index ${elastic.rest.index}.
If not empty the Rest client will distribute documents in different indices based on their `lang` property.
- Indices are named with the following schema: ${elastic.rest.index}_${lang} (e.g. `nutch_de`).
- Entries with an unsupported `lang` value will be added to index ${elastic.rest.index}_others (e.g. `nutch_others`).
+ Indices are named with the following schema: ${elastic.rest.index}${elastic.rest.separator}${lang} (e.g. `nutch_de`).
+ Entries with an unsupported `lang` value will be added to index ${elastic.rest.index}${elastic.rest.separator}${elastic.rest.sink} (e.g. `nutch_others`).
</description>
</property>
<property>
+ <name>elastic.rest.separator</name>
+ <value>_</value>
+ <description>
+ Default value is `_`. It is used only if `elastic.rest.languages` is defined, as the separator when building the index name (i.e. ${elastic.rest.index}${elastic.rest.separator}${lang}).
+ </description>
+</property>
+
+<property>
+ <name>elastic.rest.sink</name>
+ <value>others</value>
+ <description>
+ Default value is `others`. It is used only if `elastic.rest.languages` is defined, to build the name of the index in which documents with unsupported languages are stored (i.e. ${elastic.rest.index}${elastic.rest.separator}${elastic.rest.sink}).
+ </description>
+</property>
+
+<property>
<name>elastic.rest.type</name>
<value>doc</value>
<description>Default type to send documents to.</description>
diff --git a/src/plugin/indexer-elastic-rest/src/java/org/apache/nutch/indexwriter/elasticrest/ElasticRestConstants.java b/src/plugin/indexer-elastic-rest/src/java/org/apache/nutch/indexwriter/elasticrest/ElasticRestConstants.java
index 74f37eb..c0f5fe7 100644
--- a/src/plugin/indexer-elastic-rest/src/java/org/apache/nutch/indexwriter/elasticrest/ElasticRestConstants.java
+++ b/src/plugin/indexer-elastic-rest/src/java/org/apache/nutch/indexwriter/elasticrest/ElasticRestConstants.java
@@ -32,4 +32,6 @@ public interface ElasticRestConstants {
public static final String HOSTNAME_TRUST = ELASTIC_PREFIX + "trustallhostnames";
public static final String LANGUAGES = ELASTIC_PREFIX + "languages";
+ public static final String SEPARATOR = ELASTIC_PREFIX + "separator";
+ public static final String SINK = ELASTIC_PREFIX + "sink";
}
diff --git a/src/plugin/indexer-elastic-rest/src/java/org/apache/nutch/indexwriter/elasticrest/ElasticRestIndexWriter.java b/src/plugin/indexer-elastic-rest/src/java/org/apache/nutch/indexwriter/elasticrest/ElasticRestIndexWriter.java
index 56cfab1..5e71b3c 100644
--- a/src/plugin/indexer-elastic-rest/src/java/org/apache/nutch/indexwriter/elasticrest/ElasticRestIndexWriter.java
+++ b/src/plugin/indexer-elastic-rest/src/java/org/apache/nutch/indexwriter/elasticrest/ElasticRestIndexWriter.java
@@ -67,7 +67,9 @@ public class ElasticRestIndexWriter implements IndexWriter {
.getLogger(ElasticRestIndexWriter.class);
private static final int DEFAULT_MAX_BULK_DOCS = 250;
- private static final int DEFAULT_MAX_BULK_LENGTH = 2500500;
+ private static final int DEFAULT_MAX_BULK_LENGTH = 2500500;
+ private static final String DEFAULT_SEPARATOR = "_";
+ private static final String DEFAULT_SINK = "others";
private JestClient client;
private String defaultIndex;
@@ -93,6 +95,8 @@ public class ElasticRestIndexWriter implements IndexWriter {
private BasicFuture<JestResult> basicFuture = null;
private String[] languages = null;
+ private String separator = null;
+ private String sink = null;
@Override
public void open(JobConf job, String name) throws IOException {
@@ -104,6 +108,8 @@ public class ElasticRestIndexWriter implements IndexWriter {
https = job.getBoolean(ElasticRestConstants.HTTPS, false);
trustAllHostnames = job.getBoolean(ElasticRestConstants.HOSTNAME_TRUST, false);
languages = job.getStrings(ElasticRestConstants.LANGUAGES);
+ separator = job.get(ElasticRestConstants.SEPARATOR, DEFAULT_SEPARATOR);
+ sink = job.get(ElasticRestConstants.SINK, DEFAULT_SINK);
// trust ALL certificates
SSLContext sslContext = null;
@@ -205,9 +211,9 @@ public class ElasticRestIndexWriter implements IndexWriter {
}
}
if (exists) {
- index = defaultIndex + "_" + language;
+ index = getLanguageIndexName(language);
} else {
- index = defaultIndex + "_others";
+ index = getSinkIndexName();
}
} else {
index = defaultIndex;
@@ -237,9 +243,9 @@ public class ElasticRestIndexWriter implements IndexWriter {
if (languages != null && languages.length > 0) {
Bulk.Builder bulkBuilder = new Bulk.Builder().defaultType(defaultType);
for (String lang : languages) {
- bulkBuilder.addAction(new Delete.Builder(key).index(defaultIndex + "_" + lang).type(defaultType).build());
+ bulkBuilder.addAction(new Delete.Builder(key).index(getLanguageIndexName(lang)).type(defaultType).build());
}
- bulkBuilder.addAction(new Delete.Builder(key).index(defaultIndex + "_others").type(defaultType).build());
+ bulkBuilder.addAction(new Delete.Builder(key).index(getSinkIndexName()).type(defaultType).build());
client.execute(bulkBuilder.build());
} else {
client.execute(new Delete.Builder(key).index(defaultIndex)
@@ -359,4 +365,16 @@ public class ElasticRestIndexWriter implements IndexWriter {
public Configuration getConf() {
return config;
}
+
+ private String getLanguageIndexName(String lang) {
+ return getComposedIndexName(defaultIndex, lang);
+ }
+
+ private String getSinkIndexName() {
+ return getComposedIndexName(defaultIndex, sink);
+ }
+
+ private String getComposedIndexName(String prefix, String postfix) {
+ return prefix + separator + postfix;
+ }
}
--
To stop receiving notification emails like this one, please contact
"commits@nutch.apache.org" <co...@nutch.apache.org>.