Posted to commits@accumulo.apache.org by mm...@apache.org on 2019/06/07 15:22:28 UTC

[accumulo-wikisearch] branch master updated: Upgrade to lucene 7.1.0

This is an automated email from the ASF dual-hosted git repository.

mmiller pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/accumulo-wikisearch.git


The following commit(s) were added to refs/heads/master by this push:
     new 134b78d  Upgrade to lucene 7.1.0
134b78d is described below

commit 134b78df77f45184f2193e6f5d822f1d99071f59
Author: Mike Miller <mm...@apache.org>
AuthorDate: Fri Jun 7 11:21:54 2019 -0400

    Upgrade to lucene 7.1.0
---
 ingest/pom.xml                                     |  6 +-
 .../wikisearch/ingest/ArticleExtractor.java        |  8 +-
 .../wikisearch/ingest/WikipediaConfiguration.java  |  7 --
 .../wikisearch/ingest/WikipediaMapper.java         | 25 ++----
 .../wikisearch/normalizer/NumberNormalizer.java    | 42 ----------
 .../normalizer/testNumberNormalizer.java           | 90 ----------------------
 pom.xml                                            | 21 ++---
 7 files changed, 18 insertions(+), 181 deletions(-)

diff --git a/ingest/pom.xml b/ingest/pom.xml
index 295c8a8..1f6bc99 100644
--- a/ingest/pom.xml
+++ b/ingest/pom.xml
@@ -57,7 +57,7 @@
     </dependency>
     <dependency>
       <groupId>org.apache.lucene</groupId>
-      <artifactId>lucene-wikipedia</artifactId>
+      <artifactId>lucene-analyzers-common</artifactId>
     </dependency>
     <dependency>
       <groupId>org.apache.zookeeper</groupId>
@@ -84,8 +84,8 @@
             <phase>prepare-package</phase>
             <configuration>
               <outputDirectory>lib</outputDirectory>
-              <!-- just grab the non-provided runtime dependencies -->
-              <!-- XXX we include guava at the same version as hadoop 2 provides so that we have it on hadoop 1 -->
+              &lt;!&ndash; just grab the non-provided runtime dependencies &ndash;&gt;
+              &lt;!&ndash; XXX we include guava at the same version as hadoop 2 provides so that we have it on hadoop 1 &ndash;&gt;
               <includeArtifactIds>commons-lang,guava,lucene-core,lucene-analyzers,lucene-wikipedia,protobuf-java,accumulo-core,hadoop-core,libthrift,zookeeper,commons-codec,accumulo-fate,accumulo-trace</includeArtifactIds>
               <excludeTransitive>false</excludeTransitive>
             </configuration>
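
Two things stand out in this hunk. First, the restored XML comments come back HTML-escaped (&lt;!&ndash; ... &ndash;&gt; instead of <!-- ... -->), a known side effect of letting an IDE reformat files on commit. Second, the unchanged includeArtifactIds context line still names the retired lucene-wikipedia and lucene-analyzers artifacts and omits the new lucene-analyzers-common, so the copied lib directory would quietly lose the analyzers jar. A hedged sketch of what a follow-up fix might look like (not part of this commit):

    <!-- hypothetical follow-up, not in this commit: drop the retired
         Lucene artifacts and include the renamed analyzers jar -->
    <includeArtifactIds>commons-lang,guava,lucene-core,lucene-analyzers-common,protobuf-java,accumulo-core,hadoop-core,libthrift,zookeeper,commons-codec,accumulo-fate,accumulo-trace</includeArtifactIds>
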
diff --git a/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/ArticleExtractor.java b/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/ArticleExtractor.java
index 0699cfa..cda08d8 100644
--- a/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/ArticleExtractor.java
+++ b/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/ArticleExtractor.java
@@ -31,7 +31,6 @@ import javax.xml.stream.XMLStreamException;
 import javax.xml.stream.XMLStreamReader;
 
 import org.apache.accumulo.examples.wikisearch.normalizer.LcNoDiacriticsNormalizer;
-import org.apache.accumulo.examples.wikisearch.normalizer.NumberNormalizer;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.Writable;
 
@@ -39,7 +38,6 @@ import org.apache.hadoop.io.Writable;
 public class ArticleExtractor {
   
   public final static SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'Z");
-  private static NumberNormalizer nn = new NumberNormalizer();
   private static LcNoDiacriticsNormalizer lcdn = new LcNoDiacriticsNormalizer();
   
   public static class Article implements Writable {
@@ -91,9 +89,11 @@ public class ArticleExtractor {
     
     public Map<String,String> getNormalizedFieldValues() {
       Map<String,String> fields = new HashMap<String,String>();
-      fields.put("ID", nn.normalizeFieldValue("ID", this.id));
+      //fields.put("ID", nn.normalizeFieldValue("ID", this.id));
+      fields.put("ID", Integer.toString(this.id));
       fields.put("TITLE", lcdn.normalizeFieldValue("TITLE", this.title));
-      fields.put("TIMESTAMP", nn.normalizeFieldValue("TIMESTAMP", this.timestamp));
+      //fields.put("TIMESTAMP", nn.normalizeFieldValue("TIMESTAMP", this.timestamp));
+      fields.put("TIMESTAMP", Long.toString(this.timestamp));
       fields.put("COMMENTS", lcdn.normalizeFieldValue("COMMENTS", this.comments));
       return fields;
     }
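
With NumberNormalizer gone, ID and TIMESTAMP are stored as plain decimal strings rather than Lucene's prefix-coded sortable terms: the String-returning encoders the old class wrapped (NumericUtils.intToPrefixCoded and friends) were deprecated in Lucene 4 and no longer exist in 7.x, where numeric search moved to the points API. A minimal sketch of the resulting behavior, using made-up field values:

    // Sketch only, with hypothetical values; mirrors the replacement lines above.
    int id = 42;                           // hypothetical article id
    long timestamp = 1559921054000L;       // hypothetical epoch millis
    Map<String,String> fields = new HashMap<>();
    fields.put("ID", Integer.toString(id));             // "42"
    fields.put("TIMESTAMP", Long.toString(timestamp));  // "1559921054000"
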
diff --git a/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaConfiguration.java b/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaConfiguration.java
index 44a3fbc..05ce8d8 100644
--- a/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaConfiguration.java
+++ b/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaConfiguration.java
@@ -28,7 +28,6 @@ import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.util.ReflectionUtils;
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.SimpleAnalyzer;
 
 public class WikipediaConfiguration {
   public final static String INSTANCE_NAME = "wikipedia.accumulo.instance_name";
@@ -107,12 +106,6 @@ public class WikipediaConfiguration {
     return new Path(filename);
   }
 
-  public static Analyzer getAnalyzer(Configuration conf) throws IOException {
-    Class<? extends Analyzer> analyzerClass =
-        conf.getClass(ANALYZER, SimpleAnalyzer.class, Analyzer.class);
-    return ReflectionUtils.newInstance(analyzerClass, conf);
-  }
-
   public static Connector getConnector(Configuration conf)
       throws AccumuloException, AccumuloSecurityException {
     return getInstance(conf).getConnector(getUser(conf), getPassword(conf));
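
getAnalyzer(Configuration) had to be removed (or rewritten) because SimpleAnalyzer is no longer at org.apache.lucene.analysis.SimpleAnalyzer; in Lucene 7 it lives in org.apache.lucene.analysis.core, inside lucene-analyzers-common. If the factory hook were still wanted, a sketch of the same pattern against Lucene 7 (assumes the class's existing ANALYZER constant; not part of this commit):

    // Sketch: same Hadoop ReflectionUtils pattern, with SimpleAnalyzer
    // imported from its Lucene 7 package.
    import org.apache.lucene.analysis.core.SimpleAnalyzer;

    public static Analyzer getAnalyzer(Configuration conf) {
      Class<? extends Analyzer> analyzerClass =
          conf.getClass(ANALYZER, SimpleAnalyzer.class, Analyzer.class);
      return ReflectionUtils.newInstance(analyzerClass, conf);
    }
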
diff --git a/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaMapper.java b/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaMapper.java
index c751637..c2fed03 100644
--- a/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaMapper.java
+++ b/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaMapper.java
@@ -47,8 +47,8 @@ import org.apache.hadoop.io.Text;
 import org.apache.hadoop.mapreduce.Mapper;
 import org.apache.hadoop.mapreduce.lib.input.FileSplit;
 import org.apache.log4j.Logger;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
-import org.apache.lucene.wikipedia.analysis.WikipediaTokenizer;
+import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer;
+import org.apache.lucene.util.Attribute;
 
 import com.google.common.collect.HashMultimap;
 import com.google.common.collect.Multimap;
@@ -223,31 +223,18 @@ public class WikipediaMapper extends Mapper<LongWritable,Text,Text,Mutation> {
   /**
    * Tokenize the wikipedia content
    */
-  static Set<String> getTokens(Article article) throws IOException {
+  static Set<String> getTokens(Article article) {
     Set<String> tokenList = new HashSet<>();
-    WikipediaTokenizer tok = new WikipediaTokenizer(new StringReader(article.getText()));
-    TermAttribute term = tok.addAttribute(TermAttribute.class);
-    try {
+    try (WikipediaTokenizer tok = new WikipediaTokenizer(new StringReader(article.getText()))) {
+      Attribute term = tok.addAttribute(Attribute.class);
       while (tok.incrementToken()) {
-        String token = term.term();
+        String token = term.toString();
         if (!StringUtils.isEmpty(token)) {
           tokenList.add(token);
         }
       }
     } catch (IOException e) {
       log.error("Error tokenizing text", e);
-    } finally {
-      try {
-        tok.end();
-      } catch (IOException e) {
-        log.error("Error calling end()", e);
-      } finally {
-        try {
-          tok.close();
-        } catch (IOException e) {
-          log.error("Error closing tokenizer", e);
-        }
-      }
     }
     return tokenList;
   }
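
The try-with-resources block replaces the old nested end()/close() finally chains, which is a clear win. Two Lucene 7 details in the new loop are worth flagging, though. The code requests the bare org.apache.lucene.util.Attribute marker interface and relies on its toString(); the conventional way to read token text is CharTermAttribute. And since Lucene 5 a Tokenizer no longer takes a Reader in its constructor: the usual 7.x shape is setReader() followed by reset() before the first incrementToken(), with end() after the loop. A hedged sketch of that conventional shape:

    // Sketch of the conventional Lucene 7 tokenization loop; not the code
    // committed above.
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer;

    static Set<String> getTokens(Article article) {
      Set<String> tokenList = new HashSet<>();
      try (WikipediaTokenizer tok = new WikipediaTokenizer()) {
        tok.setReader(new StringReader(article.getText()));
        CharTermAttribute term = tok.addAttribute(CharTermAttribute.class);
        tok.reset();                        // mandatory before incrementToken()
        while (tok.incrementToken()) {
          String token = term.toString();   // the token's text
          if (!StringUtils.isEmpty(token)) {
            tokenList.add(token);
          }
        }
        tok.end();                          // finalize end-of-stream state
      } catch (IOException e) {
        log.error("Error tokenizing text", e);
      }
      return tokenList;
    }
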
diff --git a/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/normalizer/NumberNormalizer.java b/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/normalizer/NumberNormalizer.java
deleted file mode 100644
index e0a5cc8..0000000
--- a/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/normalizer/NumberNormalizer.java
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.accumulo.examples.wikisearch.normalizer;
-
-import org.apache.commons.lang.math.NumberUtils;
-import org.apache.lucene.util.NumericUtils;
-
-public class NumberNormalizer implements Normalizer {
-  
-  public String normalizeFieldValue(String field, Object value) {
-    if (NumberUtils.isNumber(value.toString())) {
-      Number n = NumberUtils.createNumber(value.toString());
-      if (n instanceof Integer)
-        return NumericUtils.intToPrefixCoded((Integer) n);
-      else if (n instanceof Long)
-        return NumericUtils.longToPrefixCoded((Long) n);
-      else if (n instanceof Float)
-        return NumericUtils.floatToPrefixCoded((Float) n);
-      else if (n instanceof Double)
-        return NumericUtils.doubleToPrefixCoded((Double) n);
-      else
-        throw new IllegalArgumentException("Unhandled numeric type: " + n.getClass());
-    } else {
-      throw new IllegalArgumentException("Value is not a number: " + value);
-    }
-  }
-  
-}
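
This class could not survive the upgrade as written: the String-returning NumericUtils.intToPrefixCoded/longToPrefixCoded/floatToPrefixCoded/doubleToPrefixCoded signatures were deprecated in Lucene 4 and are gone from Lucene 7, where sortable numeric terms gave way to point fields (IntPoint, LongPoint, ...). The commit drops the class rather than porting it. If a sort-preserving string encoding were still needed, one hypothetical stand-in (an illustration only, not this commit's approach and not a Lucene API):

    // Hypothetical: zero-pad so lexicographic order matches numeric order.
    // Handles non-negative longs only; negatives would need a sign/offset trick.
    static String encodeSortable(long v) {
      if (v < 0)
        throw new IllegalArgumentException("non-negative values only");
      return String.format("%019d", v);   // e.g. 42 -> "0000000000000000042"
    }
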
diff --git a/ingest/src/test/java/org/apache/accumulo/examples/wikisearch/normalizer/testNumberNormalizer.java b/ingest/src/test/java/org/apache/accumulo/examples/wikisearch/normalizer/testNumberNormalizer.java
deleted file mode 100644
index 470633c..0000000
--- a/ingest/src/test/java/org/apache/accumulo/examples/wikisearch/normalizer/testNumberNormalizer.java
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.accumulo.examples.wikisearch.normalizer;
-
-import static org.junit.Assert.assertTrue;
-
-import org.apache.accumulo.examples.wikisearch.normalizer.NumberNormalizer;
-import org.junit.Test;
-
-public class testNumberNormalizer {
-  
-  @Test
-  public void test1() throws Exception {
-    NumberNormalizer nn = new NumberNormalizer();
-    
-    String n1 = nn.normalizeFieldValue(null, "1");
-    String n2 = nn.normalizeFieldValue(null, "1.00000000");
-    
-    assertTrue(n1.compareTo(n2) < 0);
-    
-  }
-  
-  @Test
-  public void test2() {
-    NumberNormalizer nn = new NumberNormalizer();
-    
-    String n1 = nn.normalizeFieldValue(null, "-1.0");
-    String n2 = nn.normalizeFieldValue(null, "1.0");
-    
-    assertTrue(n1.compareTo(n2) < 0);
-    
-  }
-  
-  @Test
-  public void test3() {
-    NumberNormalizer nn = new NumberNormalizer();
-    String n1 = nn.normalizeFieldValue(null, "-0.0001");
-    String n2 = nn.normalizeFieldValue(null, "0");
-    String n3 = nn.normalizeFieldValue(null, "0.00001");
-    
-    assertTrue((n1.compareTo(n2) < 0) && (n2.compareTo(n3) < 0));
-  }
-  
-  @Test
-  public void test4() {
-    NumberNormalizer nn = new NumberNormalizer();
-    String nn1 = nn.normalizeFieldValue(null, Integer.toString(Integer.MAX_VALUE));
-    String nn2 = nn.normalizeFieldValue(null, Integer.toString(Integer.MAX_VALUE - 1));
-    
-    assertTrue((nn2.compareTo(nn1) < 0));
-    
-  }
-  
-  @Test
-  public void test5() {
-    NumberNormalizer nn = new NumberNormalizer();
-    String nn1 = nn.normalizeFieldValue(null, "-0.001");
-    String nn2 = nn.normalizeFieldValue(null, "-0.0009");
-    String nn3 = nn.normalizeFieldValue(null, "-0.00090");
-    
-    assertTrue((nn3.compareTo(nn2) == 0) && (nn2.compareTo(nn1) > 0));
-    
-  }
-  
-  @Test
-  public void test6() {
-    NumberNormalizer nn = new NumberNormalizer();
-    String nn1 = nn.normalizeFieldValue(null, "00.0");
-    String nn2 = nn.normalizeFieldValue(null, "0");
-    String nn3 = nn.normalizeFieldValue(null, "0.0");
-    
-    assertTrue((nn3.compareTo(nn2) == 0) && (nn2.compareTo(nn1) == 0));
-    
-  }
-  
-}
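
These tests documented exactly what the prefix coding bought: lexicographic order of the normalized strings tracked numeric order, including negatives and equal values spelled differently ("00.0" == "0"). The plain Integer.toString/Long.toString output that ArticleExtractor now stores does not keep that property, which a two-line check shows:

    // Sketch: plain decimal strings do not sort numerically.
    System.out.println("10".compareTo("9") < 0);  // true lexicographically, yet 10 > 9
    System.out.println("2".compareTo("10") > 0);  // true, yet 2 < 10
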
diff --git a/pom.xml b/pom.xml
index 48c3c46..ba62cf0 100644
--- a/pom.xml
+++ b/pom.xml
@@ -54,9 +54,8 @@
     <version.kryo>1.04</version.kryo>
     <version.log4j>1.2.16</version.log4j>
     <version.log4j-extras>1.0</version.log4j-extras>
-    <version.lucene>3.6.2</version.lucene>
-    <version.lucene-analyzers>3.6.2</version.lucene-analyzers>
-    <version.lucene-wikipedia>3.0.3</version.lucene-wikipedia>
+    <version.lucene>7.1.0</version.lucene>
+    <version.lucene-analyzers>4.0.0</version.lucene-analyzers>
     <version.minlog>1.2</version.minlog>
     <version.protobuf>2.5.0</version.protobuf>
     <version.thrift>0.12.0</version.thrift>
@@ -226,18 +225,8 @@
       </dependency>
       <dependency>
         <groupId>org.apache.lucene</groupId>
-        <artifactId>lucene-wikipedia</artifactId>
-        <version>${version.lucene-wikipedia}</version>
-        <exclusions>
-          <exclusion>
-            <groupId>commons-digester</groupId>
-            <artifactId>commons-digester</artifactId>
-          </exclusion>
-          <exclusion>
-            <groupId>commons-logging</groupId>
-            <artifactId>commons-logging</artifactId>
-          </exclusion>
-        </exclusions>
+        <artifactId>lucene-analyzers-common</artifactId>
+        <version>${version.lucene-analyzers}</version>
       </dependency>
       <dependency>
         <groupId>org.apache.thrift</groupId>
@@ -386,7 +375,7 @@
             <phase>prepare-package</phase>
             <configuration>
               <outputDirectory>../../lib</outputDirectory>
-                <!-- just grab the non-provided runtime dependencies -->
+                &lt;!&ndash; just grab the non-provided runtime dependencies &ndash;&gt;
               <includeArtifactIds>commons-collections,commons-configuration,commons-io,commons-lang,jline,log4j,libthrift,commons-jci-core,commons-jci-fam,commons-logging,commons-logging-api</includeArtifactIds>
               <excludeGroupIds>accumulo</excludeGroupIds>
               <excludeTransitive>true</excludeTransitive>
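
One loose end worth verifying after this hunk: version.lucene is now 7.1.0 while version.lucene-analyzers is 4.0.0, and lucene-analyzers-common above is pinned to the latter, so lucene-core and the analyzers jar would presumably resolve to different major versions. Standard Maven can confirm what actually lands on the classpath:

    mvn -pl ingest dependency:tree -Dincludes=org.apache.lucene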