You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@unomi.apache.org by sh...@apache.org on 2016/12/20 15:38:51 UTC

[2/2] incubator-unomi git commit: - Introduce a new save method that supports batching - Use new save method batching to import Geonames (a lot) faster - Replace slow-running ASCIIFoldingFilter call with faster MappingCharFilter call. See https://issues.apache.org/jira/browse/LUCENE-7525 and https://issues.apache.org/jira/browse/SOLR-2013

- Introduce a new save method that supports batching
- Use new save method batching to import Geonames (a lot) faster
- Replace slow-running ASCIIFoldingFilter call with faster MappingCharFilter call. See https://issues.apache.org/jira/browse/LUCENE-7525 and https://issues.apache.org/jira/browse/SOLR-2013


Project: http://git-wip-us.apache.org/repos/asf/incubator-unomi/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-unomi/commit/ac2c9ba5
Tree: http://git-wip-us.apache.org/repos/asf/incubator-unomi/tree/ac2c9ba5
Diff: http://git-wip-us.apache.org/repos/asf/incubator-unomi/diff/ac2c9ba5

Branch: refs/heads/feature-UNOMI-70-ES5X
Commit: ac2c9ba5c6b955048ac47afb402dc80e1213f16f
Parents: 69b0403
Author: Serge Huber <sh...@apache.org>
Authored: Tue Dec 20 16:38:19 2016 +0100
Committer: Serge Huber <sh...@apache.org>
Committed: Tue Dec 20 16:38:46 2016 +0100

----------------------------------------------------------------------
 .../geonames/services/GeonamesServiceImpl.java  |   26 +-
 .../ElasticSearchPersistenceServiceImpl.java    |   11 +-
 .../conditions/ConditionContextHelper.java      |   31 +-
 .../src/main/resources/mapping-FoldToASCII.txt  | 3813 ++++++++++++++++++
 .../persistence/spi/PersistenceService.java     |   11 +
 5 files changed, 3879 insertions(+), 13 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-unomi/blob/ac2c9ba5/extensions/geonames/services/src/main/java/org/apache/unomi/geonames/services/GeonamesServiceImpl.java
----------------------------------------------------------------------
diff --git a/extensions/geonames/services/src/main/java/org/apache/unomi/geonames/services/GeonamesServiceImpl.java b/extensions/geonames/services/src/main/java/org/apache/unomi/geonames/services/GeonamesServiceImpl.java
index ebe53a0..2b0a0dc 100644
--- a/extensions/geonames/services/src/main/java/org/apache/unomi/geonames/services/GeonamesServiceImpl.java
+++ b/extensions/geonames/services/src/main/java/org/apache/unomi/geonames/services/GeonamesServiceImpl.java
@@ -112,12 +112,15 @@ public class GeonamesServiceImpl implements GeonamesService {
 
             ZipInputStream zipInputStream = new ZipInputStream(new FileInputStream(f));
             ZipEntry zipEntry = zipInputStream.getNextEntry(); // used to advance to the first entry in the ZipInputStream
+            long fileSize = zipEntry.getSize();
             BufferedReader reader = new BufferedReader(new InputStreamReader(zipInputStream, "UTF-8"));
 
             SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
             String line;
             logger.info("Starting to import geonames database from file {}...", f);
-            long lineCount = 0;
+            long charCount = 0;
+            double lastCompletionPourcentage = 0.0;
+            long lastCharCount = 0;
             long importStartTime = System.currentTimeMillis();
             while ((line = reader.readLine()) != null) {
                 String[] values = line.split("\t");
@@ -134,14 +137,25 @@ public class GeonamesServiceImpl implements GeonamesService {
                             values[16], values[17],
                             sdf.parse(values[18]));
 
-                    persistenceService.save(geonameEntry);
+                    persistenceService.save(geonameEntry, true);
                 }
-                lineCount++;
-                if (lineCount % 1000 == 0) {
-                    logger.info("{} lines imported from file {}", lineCount, f);
+                charCount+=line.length();
+                if (fileSize > 0) {
+                    double completionPourcentage = 100.0 * charCount / fileSize;
+                    if (completionPourcentage - lastCompletionPourcentage > 1.0) {
+                        int roundedPourcentage = (int) completionPourcentage;
+                        logger.info("{}% imported from file {}", roundedPourcentage, f);
+                        lastCompletionPourcentage = completionPourcentage;
+                    }
+                } else {
+                    if (charCount - lastCharCount > (100*1024*1024)) {
+                        logger.info("{}MB imported from file {}", charCount / (1024*1024), f);
+                        lastCharCount = charCount;
+                    }
                 }
             }
-            logger.info("{} lines from Geonames database file {} imported in {}ms", lineCount, f, System.currentTimeMillis()-importStartTime);
+            long totalTimeMillis = System.currentTimeMillis()-importStartTime;
+            logger.info("{} characters from Geonames database file {} imported in {}ms. Speed={}MB/s", charCount, f, totalTimeMillis, charCount / (1024*1024) / (totalTimeMillis / 1000));
         } catch (Exception e) {
             logger.error(e.getMessage(), e);
         }

http://git-wip-us.apache.org/repos/asf/incubator-unomi/blob/ac2c9ba5/persistence-elasticsearch/core/src/main/java/org/apache/unomi/persistence/elasticsearch/ElasticSearchPersistenceServiceImpl.java
----------------------------------------------------------------------
diff --git a/persistence-elasticsearch/core/src/main/java/org/apache/unomi/persistence/elasticsearch/ElasticSearchPersistenceServiceImpl.java b/persistence-elasticsearch/core/src/main/java/org/apache/unomi/persistence/elasticsearch/ElasticSearchPersistenceServiceImpl.java
index 49e09d0..e7ed75b 100644
--- a/persistence-elasticsearch/core/src/main/java/org/apache/unomi/persistence/elasticsearch/ElasticSearchPersistenceServiceImpl.java
+++ b/persistence-elasticsearch/core/src/main/java/org/apache/unomi/persistence/elasticsearch/ElasticSearchPersistenceServiceImpl.java
@@ -739,6 +739,11 @@ public class ElasticSearchPersistenceServiceImpl implements PersistenceService,
 
     @Override
     public boolean save(final Item item) {
+        return save(item, false);
+    }
+
+    @Override
+    public boolean save(final Item item, final boolean useBatching) {
 
         return new InClassLoaderExecute<Boolean>() {
             protected Boolean execute(Object... args) {
@@ -769,7 +774,11 @@ public class ElasticSearchPersistenceServiceImpl implements PersistenceService,
                     }
 
                     try {
-                        indexBuilder.execute().actionGet();
+                        if (bulkProcessor == null || !useBatching) {
+                            indexBuilder.execute().actionGet();
+                        } else {
+                            bulkProcessor.add(indexBuilder.request());
+                        }
                     } catch (IndexNotFoundException e) {
                         if (existingIndexNames.contains(index)) {
                             existingIndexNames.remove(index);

http://git-wip-us.apache.org/repos/asf/incubator-unomi/blob/ac2c9ba5/persistence-elasticsearch/core/src/main/java/org/apache/unomi/persistence/elasticsearch/conditions/ConditionContextHelper.java
----------------------------------------------------------------------
diff --git a/persistence-elasticsearch/core/src/main/java/org/apache/unomi/persistence/elasticsearch/conditions/ConditionContextHelper.java b/persistence-elasticsearch/core/src/main/java/org/apache/unomi/persistence/elasticsearch/conditions/ConditionContextHelper.java
index 63964e1..6f3f1b4 100644
--- a/persistence-elasticsearch/core/src/main/java/org/apache/unomi/persistence/elasticsearch/conditions/ConditionContextHelper.java
+++ b/persistence-elasticsearch/core/src/main/java/org/apache/unomi/persistence/elasticsearch/conditions/ConditionContextHelper.java
@@ -20,14 +20,18 @@ package org.apache.unomi.persistence.elasticsearch.conditions;
 import com.google.common.base.Function;
 import com.google.common.collect.Lists;
 import org.apache.commons.lang3.StringUtils;
-import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
-import org.apache.lucene.util.ArrayUtil;
+import org.apache.logging.log4j.core.util.IOUtils;
+import org.apache.lucene.analysis.charfilter.MappingCharFilterFactory;
+import org.apache.lucene.analysis.util.ClasspathResourceLoader;
 import org.apache.unomi.api.conditions.Condition;
 import org.mvel2.MVEL;
 import org.mvel2.ParserConfiguration;
 import org.mvel2.ParserContext;
 
+import java.io.IOException;
+import java.io.Reader;
 import java.io.Serializable;
+import java.io.StringReader;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
@@ -37,6 +41,18 @@ import java.util.concurrent.ConcurrentHashMap;
 public class ConditionContextHelper {
     private static Map<String,Serializable> mvelExpressions = new ConcurrentHashMap<>();
 
+    private static MappingCharFilterFactory mappingCharFilterFactory;
+    static {
+        Map<String,String> args = new HashMap<>();
+        args.put("mapping", "mapping-FoldToASCII.txt");
+        mappingCharFilterFactory = new MappingCharFilterFactory(args);
+        try {
+            mappingCharFilterFactory.inform(new ClasspathResourceLoader(ConditionContextHelper.class.getClassLoader()));
+        } catch (IOException e) {
+            e.printStackTrace();
+        }
+    }
+
     public static Condition getContextualCondition(Condition condition, Map<String, Object> context) {
         if (context.isEmpty() || !hasContextualParameter(condition.getParameterValues())) {
             return condition;
@@ -124,10 +140,13 @@ public class ConditionContextHelper {
     public static String foldToASCII(String s) {
         if (s != null) {
             s = s.toLowerCase();
-            int maxSizeNeeded = 4 * s.length();
-            char[] output = new char[ArrayUtil.oversize(maxSizeNeeded, 2)];
-            int length = ASCIIFoldingFilter.foldToASCII(s.toCharArray(), 0, output, 0, s.length());
-            return new String(output, 0, length);
+            StringReader stringReader = new StringReader(s);
+            Reader foldedStringReader = mappingCharFilterFactory.create(stringReader);
+            try {
+                return IOUtils.toString(foldedStringReader);
+            } catch (IOException e) {
+                e.printStackTrace();
+            }
         }
         return null;
     }