You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2017/12/18 15:49:54 UTC

[nutch] 10/23: make fully configurable

This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git

commit e0326de05197f8415eeb750d4d8fff764db87aa9
Author: Nicola Marcacci Rossi <ni...@gmail.com>
AuthorDate: Fri Dec 15 14:18:57 2017 +0100

    make fully configurable
---
 conf/nutch-default.xml                             | 20 ++++++++++++++--
 .../elasticrest/ElasticRestConstants.java          |  2 ++
 .../elasticrest/ElasticRestIndexWriter.java        | 28 ++++++++++++++++++----
 3 files changed, 43 insertions(+), 7 deletions(-)

diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index bcb2e9e..1d9837f 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -2122,12 +2122,28 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter-->
         A list of strings denoting the supported languages (e.g. `en,de,fr,it`).
         If this value is empty all documents will be sent to index ${elastic.rest.index}.
         If not empty the Rest client will distribute documents in different indices based on their `lang` property.
-        Indices are named with the following schema: ${elastic.rest.index}_${lang} (e.g. `nutch_de`).
-        Entries with an unsupported `lang` value will be added to index ${elastic.rest.index}_others (e.g. `nutch_others`).
+        Indices are named with the following schema: ${elastic.rest.index}${elastic.rest.separator}${lang} (e.g. `nutch_de`).
+        Entries with an unsupported `lang` value will be added to index ${elastic.rest.index}${elastic.rest.separator}${elastic.rest.sink} (e.g. `nutch_others`).
     </description>
 </property>
 
 <property>
+    <name>elastic.rest.separator</name>
+    <value>_</value>
+    <description>
+        Default value is `_`. Used only if `elastic.rest.languages` is defined, as the separator when building the index name (i.e. ${elastic.rest.index}${elastic.rest.separator}${lang}).
+    </description>
+</property>
+
+<property>
+	<name>elastic.rest.sink</name>
+	<value>others</value>
+	<description>
+		Default value is `others`. Used only if `elastic.rest.languages` is defined, as the suffix of the index in which documents with unsupported languages are stored (i.e. ${elastic.rest.index}${elastic.rest.separator}${elastic.rest.sink}).
+	</description>
+</property>
+
+<property>
     <name>elastic.rest.type</name>
     <value>doc</value>
     <description>Default type to send documents to.</description>
diff --git a/src/plugin/indexer-elastic-rest/src/java/org/apache/nutch/indexwriter/elasticrest/ElasticRestConstants.java b/src/plugin/indexer-elastic-rest/src/java/org/apache/nutch/indexwriter/elasticrest/ElasticRestConstants.java
index 74f37eb..c0f5fe7 100644
--- a/src/plugin/indexer-elastic-rest/src/java/org/apache/nutch/indexwriter/elasticrest/ElasticRestConstants.java
+++ b/src/plugin/indexer-elastic-rest/src/java/org/apache/nutch/indexwriter/elasticrest/ElasticRestConstants.java
@@ -32,4 +32,6 @@ public interface ElasticRestConstants {
   public static final String HOSTNAME_TRUST = ELASTIC_PREFIX + "trustallhostnames";
   
   public static final String LANGUAGES = ELASTIC_PREFIX + "languages";
+  public static final String SEPARATOR = ELASTIC_PREFIX + "separator";
+  public static final String SINK = ELASTIC_PREFIX + "sink";
 }
diff --git a/src/plugin/indexer-elastic-rest/src/java/org/apache/nutch/indexwriter/elasticrest/ElasticRestIndexWriter.java b/src/plugin/indexer-elastic-rest/src/java/org/apache/nutch/indexwriter/elasticrest/ElasticRestIndexWriter.java
index 56cfab1..5e71b3c 100644
--- a/src/plugin/indexer-elastic-rest/src/java/org/apache/nutch/indexwriter/elasticrest/ElasticRestIndexWriter.java
+++ b/src/plugin/indexer-elastic-rest/src/java/org/apache/nutch/indexwriter/elasticrest/ElasticRestIndexWriter.java
@@ -67,7 +67,9 @@ public class ElasticRestIndexWriter implements IndexWriter {
       .getLogger(ElasticRestIndexWriter.class);
 
   private static final int DEFAULT_MAX_BULK_DOCS = 250;
-  private static final int DEFAULT_MAX_BULK_LENGTH = 2500500;
+  private static final int DEFAULT_MAX_BULK_LENGTH = 2500500;  
+  private static final String DEFAULT_SEPARATOR = "_";
+  private static final String DEFAULT_SINK = "others";
 
   private JestClient client;
   private String defaultIndex;
@@ -93,6 +95,8 @@ public class ElasticRestIndexWriter implements IndexWriter {
   private BasicFuture<JestResult> basicFuture = null;
   
   private String[] languages = null;
+  private String separator = null;
+  private String sink = null;
 
   @Override
   public void open(JobConf job, String name) throws IOException {
@@ -104,6 +108,8 @@ public class ElasticRestIndexWriter implements IndexWriter {
     https = job.getBoolean(ElasticRestConstants.HTTPS, false);
     trustAllHostnames = job.getBoolean(ElasticRestConstants.HOSTNAME_TRUST, false);
     languages = job.getStrings(ElasticRestConstants.LANGUAGES);
+    separator = job.get(ElasticRestConstants.SEPARATOR, DEFAULT_SEPARATOR);
+    sink = job.get(ElasticRestConstants.SINK, DEFAULT_SINK);
 
     // trust ALL certificates
     SSLContext sslContext = null;
@@ -205,9 +211,9 @@ public class ElasticRestIndexWriter implements IndexWriter {
         }
       }
       if (exists) {
-        index = defaultIndex + "_" + language;
+        index = getLanguageIndexName(language);
       } else {
-        index = defaultIndex + "_others";
+        index = getSinkIndexName();
       }
     } else {
       index = defaultIndex;
@@ -237,9 +243,9 @@ public class ElasticRestIndexWriter implements IndexWriter {
       if (languages != null && languages.length > 0) {
         Bulk.Builder bulkBuilder = new Bulk.Builder().defaultType(defaultType);
         for (String lang : languages) {          
-          bulkBuilder.addAction(new Delete.Builder(key).index(defaultIndex + "_" + lang).type(defaultType).build());
+          bulkBuilder.addAction(new Delete.Builder(key).index(getLanguageIndexName(lang)).type(defaultType).build());
         }
-        bulkBuilder.addAction(new Delete.Builder(key).index(defaultIndex + "_others").type(defaultType).build());
+        bulkBuilder.addAction(new Delete.Builder(key).index(getSinkIndexName()).type(defaultType).build());
         client.execute(bulkBuilder.build());
       } else {
         client.execute(new Delete.Builder(key).index(defaultIndex)
@@ -359,4 +365,16 @@ public class ElasticRestIndexWriter implements IndexWriter {
   public Configuration getConf() {
     return config;
   }
+
+  private String getLanguageIndexName(String lang) {
+    return getComposedIndexName(defaultIndex, lang);
+  }
+  
+  private String getSinkIndexName() {
+    return getComposedIndexName(defaultIndex, sink);
+  }
+  
+  private String getComposedIndexName(String prefix, String postfix) {
+    return prefix + separator + postfix;
+  }
 }

-- 
To stop receiving notification emails like this one, please contact
"commits@nutch.apache.org" <co...@nutch.apache.org>.