You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@accumulo.apache.org by mm...@apache.org on 2019/06/07 15:22:28 UTC
[accumulo-wikisearch] branch master updated: Upgrade to lucene 7.1.0
This is an automated email from the ASF dual-hosted git repository.
mmiller pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/accumulo-wikisearch.git
The following commit(s) were added to refs/heads/master by this push:
new 134b78d Upgrade to lucene 7.1.0
134b78d is described below
commit 134b78df77f45184f2193e6f5d822f1d99071f59
Author: Mike Miller <mm...@apache.org>
AuthorDate: Fri Jun 7 11:21:54 2019 -0400
Upgrade to lucene 7.1.0
---
ingest/pom.xml | 6 +-
.../wikisearch/ingest/ArticleExtractor.java | 8 +-
.../wikisearch/ingest/WikipediaConfiguration.java | 7 --
.../wikisearch/ingest/WikipediaMapper.java | 25 ++----
.../wikisearch/normalizer/NumberNormalizer.java | 42 ----------
.../normalizer/testNumberNormalizer.java | 90 ----------------------
pom.xml | 21 ++---
7 files changed, 18 insertions(+), 181 deletions(-)
diff --git a/ingest/pom.xml b/ingest/pom.xml
index 295c8a8..1f6bc99 100644
--- a/ingest/pom.xml
+++ b/ingest/pom.xml
@@ -57,7 +57,7 @@
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
- <artifactId>lucene-wikipedia</artifactId>
+ <artifactId>lucene-analyzers-common</artifactId>
</dependency>
<dependency>
<groupId>org.apache.zookeeper</groupId>
@@ -84,8 +84,8 @@
<phase>prepare-package</phase>
<configuration>
<outputDirectory>lib</outputDirectory>
- <!-- just grab the non-provided runtime dependencies -->
- <!-- XXX we include guava at the same version as hadoop 2 provides so that we have it on hadoop 1 -->
+ <!-- just grab the non-provided runtime dependencies -->
+ <!-- XXX we include guava at the same version as hadoop 2 provides so that we have it on hadoop 1 -->
<includeArtifactIds>commons-lang,guava,lucene-core,lucene-analyzers,lucene-wikipedia,protobuf-java,accumulo-core,hadoop-core,libthrift,zookeeper,commons-codec,accumulo-fate,accumulo-trace</includeArtifactIds>
<excludeTransitive>false</excludeTransitive>
</configuration>
diff --git a/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/ArticleExtractor.java b/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/ArticleExtractor.java
index 0699cfa..cda08d8 100644
--- a/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/ArticleExtractor.java
+++ b/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/ArticleExtractor.java
@@ -31,7 +31,6 @@ import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import org.apache.accumulo.examples.wikisearch.normalizer.LcNoDiacriticsNormalizer;
-import org.apache.accumulo.examples.wikisearch.normalizer.NumberNormalizer;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
@@ -39,7 +38,6 @@ import org.apache.hadoop.io.Writable;
public class ArticleExtractor {
public final static SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'Z");
- private static NumberNormalizer nn = new NumberNormalizer();
private static LcNoDiacriticsNormalizer lcdn = new LcNoDiacriticsNormalizer();
public static class Article implements Writable {
@@ -91,9 +89,11 @@ public class ArticleExtractor {
public Map<String,String> getNormalizedFieldValues() {
Map<String,String> fields = new HashMap<String,String>();
- fields.put("ID", nn.normalizeFieldValue("ID", this.id));
+ //fields.put("ID", nn.normalizeFieldValue("ID", this.id));
+ fields.put("ID", Integer.toString(this.id));
fields.put("TITLE", lcdn.normalizeFieldValue("TITLE", this.title));
- fields.put("TIMESTAMP", nn.normalizeFieldValue("TIMESTAMP", this.timestamp));
+ //fields.put("TIMESTAMP", nn.normalizeFieldValue("TIMESTAMP", this.timestamp));
+ fields.put("TIMESTAMP", Long.toString(this.timestamp));
fields.put("COMMENTS", lcdn.normalizeFieldValue("COMMENTS", this.comments));
return fields;
}
diff --git a/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaConfiguration.java b/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaConfiguration.java
index 44a3fbc..05ce8d8 100644
--- a/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaConfiguration.java
+++ b/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaConfiguration.java
@@ -28,7 +28,6 @@ import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.SimpleAnalyzer;
public class WikipediaConfiguration {
public final static String INSTANCE_NAME = "wikipedia.accumulo.instance_name";
@@ -107,12 +106,6 @@ public class WikipediaConfiguration {
return new Path(filename);
}
- public static Analyzer getAnalyzer(Configuration conf) throws IOException {
- Class<? extends Analyzer> analyzerClass =
- conf.getClass(ANALYZER, SimpleAnalyzer.class, Analyzer.class);
- return ReflectionUtils.newInstance(analyzerClass, conf);
- }
-
public static Connector getConnector(Configuration conf)
throws AccumuloException, AccumuloSecurityException {
return getInstance(conf).getConnector(getUser(conf), getPassword(conf));
diff --git a/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaMapper.java b/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaMapper.java
index c751637..c2fed03 100644
--- a/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaMapper.java
+++ b/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaMapper.java
@@ -47,8 +47,8 @@ import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.log4j.Logger;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
-import org.apache.lucene.wikipedia.analysis.WikipediaTokenizer;
+import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer;
+import org.apache.lucene.util.Attribute;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;
@@ -223,31 +223,18 @@ public class WikipediaMapper extends Mapper<LongWritable,Text,Text,Mutation> {
/**
* Tokenize the wikipedia content
*/
- static Set<String> getTokens(Article article) throws IOException {
+ static Set<String> getTokens(Article article) {
Set<String> tokenList = new HashSet<>();
- WikipediaTokenizer tok = new WikipediaTokenizer(new StringReader(article.getText()));
- TermAttribute term = tok.addAttribute(TermAttribute.class);
- try {
+ try (WikipediaTokenizer tok = new WikipediaTokenizer(new StringReader(article.getText()))) {
+ Attribute term = tok.addAttribute(Attribute.class);
while (tok.incrementToken()) {
- String token = term.term();
+ String token = term.toString();
if (!StringUtils.isEmpty(token)) {
tokenList.add(token);
}
}
} catch (IOException e) {
log.error("Error tokenizing text", e);
- } finally {
- try {
- tok.end();
- } catch (IOException e) {
- log.error("Error calling end()", e);
- } finally {
- try {
- tok.close();
- } catch (IOException e) {
- log.error("Error closing tokenizer", e);
- }
- }
}
return tokenList;
}
diff --git a/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/normalizer/NumberNormalizer.java b/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/normalizer/NumberNormalizer.java
deleted file mode 100644
index e0a5cc8..0000000
--- a/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/normalizer/NumberNormalizer.java
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.accumulo.examples.wikisearch.normalizer;
-
-import org.apache.commons.lang.math.NumberUtils;
-import org.apache.lucene.util.NumericUtils;
-
-public class NumberNormalizer implements Normalizer {
-
- public String normalizeFieldValue(String field, Object value) {
- if (NumberUtils.isNumber(value.toString())) {
- Number n = NumberUtils.createNumber(value.toString());
- if (n instanceof Integer)
- return NumericUtils.intToPrefixCoded((Integer) n);
- else if (n instanceof Long)
- return NumericUtils.longToPrefixCoded((Long) n);
- else if (n instanceof Float)
- return NumericUtils.floatToPrefixCoded((Float) n);
- else if (n instanceof Double)
- return NumericUtils.doubleToPrefixCoded((Double) n);
- else
- throw new IllegalArgumentException("Unhandled numeric type: " + n.getClass());
- } else {
- throw new IllegalArgumentException("Value is not a number: " + value);
- }
- }
-
-}
diff --git a/ingest/src/test/java/org/apache/accumulo/examples/wikisearch/normalizer/testNumberNormalizer.java b/ingest/src/test/java/org/apache/accumulo/examples/wikisearch/normalizer/testNumberNormalizer.java
deleted file mode 100644
index 470633c..0000000
--- a/ingest/src/test/java/org/apache/accumulo/examples/wikisearch/normalizer/testNumberNormalizer.java
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.accumulo.examples.wikisearch.normalizer;
-
-import static org.junit.Assert.assertTrue;
-
-import org.apache.accumulo.examples.wikisearch.normalizer.NumberNormalizer;
-import org.junit.Test;
-
-public class testNumberNormalizer {
-
- @Test
- public void test1() throws Exception {
- NumberNormalizer nn = new NumberNormalizer();
-
- String n1 = nn.normalizeFieldValue(null, "1");
- String n2 = nn.normalizeFieldValue(null, "1.00000000");
-
- assertTrue(n1.compareTo(n2) < 0);
-
- }
-
- @Test
- public void test2() {
- NumberNormalizer nn = new NumberNormalizer();
-
- String n1 = nn.normalizeFieldValue(null, "-1.0");
- String n2 = nn.normalizeFieldValue(null, "1.0");
-
- assertTrue(n1.compareTo(n2) < 0);
-
- }
-
- @Test
- public void test3() {
- NumberNormalizer nn = new NumberNormalizer();
- String n1 = nn.normalizeFieldValue(null, "-0.0001");
- String n2 = nn.normalizeFieldValue(null, "0");
- String n3 = nn.normalizeFieldValue(null, "0.00001");
-
- assertTrue((n1.compareTo(n2) < 0) && (n2.compareTo(n3) < 0));
- }
-
- @Test
- public void test4() {
- NumberNormalizer nn = new NumberNormalizer();
- String nn1 = nn.normalizeFieldValue(null, Integer.toString(Integer.MAX_VALUE));
- String nn2 = nn.normalizeFieldValue(null, Integer.toString(Integer.MAX_VALUE - 1));
-
- assertTrue((nn2.compareTo(nn1) < 0));
-
- }
-
- @Test
- public void test5() {
- NumberNormalizer nn = new NumberNormalizer();
- String nn1 = nn.normalizeFieldValue(null, "-0.001");
- String nn2 = nn.normalizeFieldValue(null, "-0.0009");
- String nn3 = nn.normalizeFieldValue(null, "-0.00090");
-
- assertTrue((nn3.compareTo(nn2) == 0) && (nn2.compareTo(nn1) > 0));
-
- }
-
- @Test
- public void test6() {
- NumberNormalizer nn = new NumberNormalizer();
- String nn1 = nn.normalizeFieldValue(null, "00.0");
- String nn2 = nn.normalizeFieldValue(null, "0");
- String nn3 = nn.normalizeFieldValue(null, "0.0");
-
- assertTrue((nn3.compareTo(nn2) == 0) && (nn2.compareTo(nn1) == 0));
-
- }
-
-}
diff --git a/pom.xml b/pom.xml
index 48c3c46..ba62cf0 100644
--- a/pom.xml
+++ b/pom.xml
@@ -54,9 +54,8 @@
<version.kryo>1.04</version.kryo>
<version.log4j>1.2.16</version.log4j>
<version.log4j-extras>1.0</version.log4j-extras>
- <version.lucene>3.6.2</version.lucene>
- <version.lucene-analyzers>3.6.2</version.lucene-analyzers>
- <version.lucene-wikipedia>3.0.3</version.lucene-wikipedia>
+ <version.lucene>7.1.0</version.lucene>
+ <version.lucene-analyzers>4.0.0</version.lucene-analyzers>
<version.minlog>1.2</version.minlog>
<version.protobuf>2.5.0</version.protobuf>
<version.thrift>0.12.0</version.thrift>
@@ -226,18 +225,8 @@
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
- <artifactId>lucene-wikipedia</artifactId>
- <version>${version.lucene-wikipedia}</version>
- <exclusions>
- <exclusion>
- <groupId>commons-digester</groupId>
- <artifactId>commons-digester</artifactId>
- </exclusion>
- <exclusion>
- <groupId>commons-logging</groupId>
- <artifactId>commons-logging</artifactId>
- </exclusion>
- </exclusions>
+ <artifactId>lucene-analyzers-common</artifactId>
+ <version>${version.lucene-analyzers}</version>
</dependency>
<dependency>
<groupId>org.apache.thrift</groupId>
@@ -386,7 +375,7 @@
<phase>prepare-package</phase>
<configuration>
<outputDirectory>../../lib</outputDirectory>
- <!-- just grab the non-provided runtime dependencies -->
+ <!-- just grab the non-provided runtime dependencies -->
<includeArtifactIds>commons-collections,commons-configuration,commons-io,commons-lang,jline,log4j,libthrift,commons-jci-core,commons-jci-fam,commons-logging,commons-logging-api</includeArtifactIds>
<excludeGroupIds>accumulo</excludeGroupIds>
<excludeTransitive>true</excludeTransitive>