You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@unomi.apache.org by sh...@apache.org on 2016/12/20 15:38:51 UTC
[2/2] incubator-unomi git commit: - Introduce a new save method that
supports batching - Use new save method batching to import Geonames (a lot)
faster - Replace slow running ASCIIFoldingFilter call with faster
MappingCharFilter call. See https://issues.apache.org/jira/browse/LUCENE-7525
and https://issues.apache.org/jira/browse/SOLR-2013
- Introduce a new save method that supports batching
- Use the new batching save method to import Geonames (a lot) faster
- Replace the slow-running ASCIIFoldingFilter call with a faster MappingCharFilter call. See https://issues.apache.org/jira/browse/LUCENE-7525 and https://issues.apache.org/jira/browse/SOLR-2013
-
Project: http://git-wip-us.apache.org/repos/asf/incubator-unomi/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-unomi/commit/ac2c9ba5
Tree: http://git-wip-us.apache.org/repos/asf/incubator-unomi/tree/ac2c9ba5
Diff: http://git-wip-us.apache.org/repos/asf/incubator-unomi/diff/ac2c9ba5
Branch: refs/heads/feature-UNOMI-70-ES5X
Commit: ac2c9ba5c6b955048ac47afb402dc80e1213f16f
Parents: 69b0403
Author: Serge Huber <sh...@apache.org>
Authored: Tue Dec 20 16:38:19 2016 +0100
Committer: Serge Huber <sh...@apache.org>
Committed: Tue Dec 20 16:38:46 2016 +0100
----------------------------------------------------------------------
.../geonames/services/GeonamesServiceImpl.java | 26 +-
.../ElasticSearchPersistenceServiceImpl.java | 11 +-
.../conditions/ConditionContextHelper.java | 31 +-
.../src/main/resources/mapping-FoldToASCII.txt | 3813 ++++++++++++++++++
.../persistence/spi/PersistenceService.java | 11 +
5 files changed, 3879 insertions(+), 13 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-unomi/blob/ac2c9ba5/extensions/geonames/services/src/main/java/org/apache/unomi/geonames/services/GeonamesServiceImpl.java
----------------------------------------------------------------------
diff --git a/extensions/geonames/services/src/main/java/org/apache/unomi/geonames/services/GeonamesServiceImpl.java b/extensions/geonames/services/src/main/java/org/apache/unomi/geonames/services/GeonamesServiceImpl.java
index ebe53a0..2b0a0dc 100644
--- a/extensions/geonames/services/src/main/java/org/apache/unomi/geonames/services/GeonamesServiceImpl.java
+++ b/extensions/geonames/services/src/main/java/org/apache/unomi/geonames/services/GeonamesServiceImpl.java
@@ -112,12 +112,15 @@ public class GeonamesServiceImpl implements GeonamesService {
ZipInputStream zipInputStream = new ZipInputStream(new FileInputStream(f));
ZipEntry zipEntry = zipInputStream.getNextEntry(); // used to advance to the first entry in the ZipInputStream
+ long fileSize = zipEntry.getSize();
BufferedReader reader = new BufferedReader(new InputStreamReader(zipInputStream, "UTF-8"));
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
String line;
logger.info("Starting to import geonames database from file {}...", f);
- long lineCount = 0;
+ long charCount = 0;
+ double lastCompletionPourcentage = 0.0;
+ long lastCharCount = 0;
long importStartTime = System.currentTimeMillis();
while ((line = reader.readLine()) != null) {
String[] values = line.split("\t");
@@ -134,14 +137,25 @@ public class GeonamesServiceImpl implements GeonamesService {
values[16], values[17],
sdf.parse(values[18]));
- persistenceService.save(geonameEntry);
+ persistenceService.save(geonameEntry, true);
}
- lineCount++;
- if (lineCount % 1000 == 0) {
- logger.info("{} lines imported from file {}", lineCount, f);
+ charCount+=line.length();
+ if (fileSize > 0) {
+ double completionPourcentage = 100.0 * charCount / fileSize;
+ if (completionPourcentage - lastCompletionPourcentage > 1.0) {
+ int roundedPourcentage = (int) completionPourcentage;
+ logger.info("{}% imported from file {}", roundedPourcentage, f);
+ lastCompletionPourcentage = completionPourcentage;
+ }
+ } else {
+ if (charCount - lastCharCount > (100*1024*1024)) {
+ logger.info("{}MB imported from file {}", charCount / (1024*1024), f);
+ lastCharCount = charCount;
+ }
}
}
- logger.info("{} lines from Geonames database file {} imported in {}ms", lineCount, f, System.currentTimeMillis()-importStartTime);
+ long totalTimeMillis = System.currentTimeMillis()-importStartTime;
+ logger.info("{} characters from Geonames database file {} imported in {}ms. Speed={}MB/s", charCount, f, totalTimeMillis, charCount / (1024*1024) / (totalTimeMillis / 1000));
} catch (Exception e) {
logger.error(e.getMessage(), e);
}
http://git-wip-us.apache.org/repos/asf/incubator-unomi/blob/ac2c9ba5/persistence-elasticsearch/core/src/main/java/org/apache/unomi/persistence/elasticsearch/ElasticSearchPersistenceServiceImpl.java
----------------------------------------------------------------------
diff --git a/persistence-elasticsearch/core/src/main/java/org/apache/unomi/persistence/elasticsearch/ElasticSearchPersistenceServiceImpl.java b/persistence-elasticsearch/core/src/main/java/org/apache/unomi/persistence/elasticsearch/ElasticSearchPersistenceServiceImpl.java
index 49e09d0..e7ed75b 100644
--- a/persistence-elasticsearch/core/src/main/java/org/apache/unomi/persistence/elasticsearch/ElasticSearchPersistenceServiceImpl.java
+++ b/persistence-elasticsearch/core/src/main/java/org/apache/unomi/persistence/elasticsearch/ElasticSearchPersistenceServiceImpl.java
@@ -739,6 +739,11 @@ public class ElasticSearchPersistenceServiceImpl implements PersistenceService,
@Override
public boolean save(final Item item) {
+ return save(item, false);
+ }
+
+ @Override
+ public boolean save(final Item item, final boolean useBatching) {
return new InClassLoaderExecute<Boolean>() {
protected Boolean execute(Object... args) {
@@ -769,7 +774,11 @@ public class ElasticSearchPersistenceServiceImpl implements PersistenceService,
}
try {
- indexBuilder.execute().actionGet();
+ if (bulkProcessor == null || !useBatching) {
+ indexBuilder.execute().actionGet();
+ } else {
+ bulkProcessor.add(indexBuilder.request());
+ }
} catch (IndexNotFoundException e) {
if (existingIndexNames.contains(index)) {
existingIndexNames.remove(index);
http://git-wip-us.apache.org/repos/asf/incubator-unomi/blob/ac2c9ba5/persistence-elasticsearch/core/src/main/java/org/apache/unomi/persistence/elasticsearch/conditions/ConditionContextHelper.java
----------------------------------------------------------------------
diff --git a/persistence-elasticsearch/core/src/main/java/org/apache/unomi/persistence/elasticsearch/conditions/ConditionContextHelper.java b/persistence-elasticsearch/core/src/main/java/org/apache/unomi/persistence/elasticsearch/conditions/ConditionContextHelper.java
index 63964e1..6f3f1b4 100644
--- a/persistence-elasticsearch/core/src/main/java/org/apache/unomi/persistence/elasticsearch/conditions/ConditionContextHelper.java
+++ b/persistence-elasticsearch/core/src/main/java/org/apache/unomi/persistence/elasticsearch/conditions/ConditionContextHelper.java
@@ -20,14 +20,18 @@ package org.apache.unomi.persistence.elasticsearch.conditions;
import com.google.common.base.Function;
import com.google.common.collect.Lists;
import org.apache.commons.lang3.StringUtils;
-import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
-import org.apache.lucene.util.ArrayUtil;
+import org.apache.logging.log4j.core.util.IOUtils;
+import org.apache.lucene.analysis.charfilter.MappingCharFilterFactory;
+import org.apache.lucene.analysis.util.ClasspathResourceLoader;
import org.apache.unomi.api.conditions.Condition;
import org.mvel2.MVEL;
import org.mvel2.ParserConfiguration;
import org.mvel2.ParserContext;
+import java.io.IOException;
+import java.io.Reader;
import java.io.Serializable;
+import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
@@ -37,6 +41,18 @@ import java.util.concurrent.ConcurrentHashMap;
public class ConditionContextHelper {
private static Map<String,Serializable> mvelExpressions = new ConcurrentHashMap<>();
+ private static MappingCharFilterFactory mappingCharFilterFactory;
+ static {
+ Map<String,String> args = new HashMap<>();
+ args.put("mapping", "mapping-FoldToASCII.txt");
+ mappingCharFilterFactory = new MappingCharFilterFactory(args);
+ try {
+ mappingCharFilterFactory.inform(new ClasspathResourceLoader(ConditionContextHelper.class.getClassLoader()));
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ }
+
public static Condition getContextualCondition(Condition condition, Map<String, Object> context) {
if (context.isEmpty() || !hasContextualParameter(condition.getParameterValues())) {
return condition;
@@ -124,10 +140,13 @@ public class ConditionContextHelper {
public static String foldToASCII(String s) {
if (s != null) {
s = s.toLowerCase();
- int maxSizeNeeded = 4 * s.length();
- char[] output = new char[ArrayUtil.oversize(maxSizeNeeded, 2)];
- int length = ASCIIFoldingFilter.foldToASCII(s.toCharArray(), 0, output, 0, s.length());
- return new String(output, 0, length);
+ StringReader stringReader = new StringReader(s);
+ Reader foldedStringReader = mappingCharFilterFactory.create(stringReader);
+ try {
+ return IOUtils.toString(foldedStringReader);
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
}
return null;
}