You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by cp...@apache.org on 2016/10/31 14:06:05 UTC

[12/37] lucene-solr:jira/solr-8542-v2: LUCENE-7429: AnalyzerWrapper can now wrap the normalization analysis chain too.

LUCENE-7429: AnalyzerWrapper can now wrap the normalization analysis chain too.


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/af600480
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/af600480
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/af600480

Branch: refs/heads/jira/solr-8542-v2
Commit: af60048097a83220aae135b09d209a0f2d4ba3c6
Parents: 2172f3e
Author: Adrien Grand <jp...@gmail.com>
Authored: Thu Oct 27 16:27:45 2016 +0200
Committer: Adrien Grand <jp...@gmail.com>
Committed: Thu Oct 27 16:27:45 2016 +0200

----------------------------------------------------------------------
 lucene/CHANGES.txt                              |   3 +
 .../lucene/analysis/custom/CustomAnalyzer.java  |   2 +-
 .../lucene/collation/CollationKeyAnalyzer.java  |   2 +-
 .../org/apache/lucene/analysis/Analyzer.java    |   9 +-
 .../apache/lucene/analysis/AnalyzerWrapper.java |  50 ++++++++-
 .../analysis/DelegatingAnalyzerWrapper.java     |  14 ++-
 .../analysis/TestDelegatingAnalyzerWrapper.java | 107 +++++++++++++++++++
 .../lucene/analysis/MockBytesAnalyzer.java      |   2 +-
 .../apache/solr/analysis/TokenizerChain.java    |   2 +-
 9 files changed, 180 insertions(+), 11 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/af600480/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index d574a8a..5a6601b 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -107,6 +107,9 @@ Bug Fixes
   allTermsRequired is false and context filters are specified (Mike
   McCandless)
 
+* LUCENE-7429: AnalyzerWrapper can now modify the normalization chain too and
+  DelegatingAnalyzerWrapper does the right thing automatically. (Adrien Grand)
+
 Improvements
 
 * LUCENE-7439: FuzzyQuery now matches all terms within the specified

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/af600480/lucene/analysis/common/src/java/org/apache/lucene/analysis/custom/CustomAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/custom/CustomAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/custom/CustomAnalyzer.java
index b2de5e8..466642c 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/custom/CustomAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/custom/CustomAnalyzer.java
@@ -131,7 +131,7 @@ public final class CustomAnalyzer extends Analyzer {
 
   @Override
   protected TokenStreamComponents createComponents(String fieldName) {
-    final Tokenizer tk = tokenizer.create(attributeFactory());
+    final Tokenizer tk = tokenizer.create(attributeFactory(fieldName));
     TokenStream ts = tk;
     for (final TokenFilterFactory filter : tokenFilters) {
       ts = filter.create(ts);

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/af600480/lucene/analysis/common/src/java/org/apache/lucene/collation/CollationKeyAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/collation/CollationKeyAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/collation/CollationKeyAnalyzer.java
index ea98731..4d0f039 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/collation/CollationKeyAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/collation/CollationKeyAnalyzer.java
@@ -85,7 +85,7 @@ public final class CollationKeyAnalyzer extends Analyzer {
   }
 
   @Override
-  protected AttributeFactory attributeFactory() {
+  protected AttributeFactory attributeFactory(String fieldName) {
     return factory;
   }
 

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/af600480/lucene/core/src/java/org/apache/lucene/analysis/Analyzer.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/Analyzer.java b/lucene/core/src/java/org/apache/lucene/analysis/Analyzer.java
index aa4b42d..3a5d41c 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/Analyzer.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/Analyzer.java
@@ -238,7 +238,7 @@ public abstract class Analyzer implements Closeable {
         throw new IllegalStateException("Normalization threw an unexpected exeption", e);
       }
 
-      final AttributeFactory attributeFactory = attributeFactory();
+      final AttributeFactory attributeFactory = attributeFactory(fieldName);
       try (TokenStream ts = normalize(fieldName,
           new StringTokenStream(attributeFactory, filteredText, text.length()))) {
         final TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class);
@@ -286,9 +286,10 @@ public abstract class Analyzer implements Closeable {
 
   /** Return the {@link AttributeFactory} to be used for
    *  {@link #tokenStream analysis} and
-   *  {@link #normalize(String, String) normalization}. The default
-   *  implementation returns {@link TokenStream#DEFAULT_TOKEN_ATTRIBUTE_FACTORY}. */
-  protected AttributeFactory attributeFactory() {
+   *  {@link #normalize(String, String) normalization} on the given
+   *  {@code fieldName}. The default implementation returns
+   *  {@link TokenStream#DEFAULT_TOKEN_ATTRIBUTE_FACTORY}. */
+  protected AttributeFactory attributeFactory(String fieldName) {
     return TokenStream.DEFAULT_TOKEN_ATTRIBUTE_FACTORY;
   }
 

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/af600480/lucene/core/src/java/org/apache/lucene/analysis/AnalyzerWrapper.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/AnalyzerWrapper.java b/lucene/core/src/java/org/apache/lucene/analysis/AnalyzerWrapper.java
index 1e5640f..d23d004 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/AnalyzerWrapper.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/AnalyzerWrapper.java
@@ -19,6 +19,8 @@ package org.apache.lucene.analysis;
 
 import java.io.Reader;
 
+import org.apache.lucene.util.AttributeFactory;
+
 /**
  * Extension to {@link Analyzer} suitable for Analyzers which wrap
  * other Analyzers.
@@ -82,6 +84,22 @@ public abstract class AnalyzerWrapper extends Analyzer {
   }
 
   /**
+   * Wraps / alters the given TokenStream for normalization purposes, taken
+   * from the wrapped Analyzer, to form new components. It is through this
+   * method that new TokenFilters can be added by AnalyzerWrappers. By default,
+   * the given token stream is returned.
+   * 
+   * @param fieldName
+   *          Name of the field which is to be analyzed
+   * @param in
+   *          TokenStream taken from the wrapped Analyzer
+   * @return Wrapped / altered TokenStream.
+   */
+  protected TokenStream wrapTokenStreamForNormalization(String fieldName, TokenStream in) {
+    return in;
+  }
+
+  /**
    * Wraps / alters the given Reader. Through this method AnalyzerWrappers can
    * implement {@link #initReader(String, Reader)}. By default, the given reader
    * is returned.
@@ -95,13 +113,33 @@ public abstract class AnalyzerWrapper extends Analyzer {
   protected Reader wrapReader(String fieldName, Reader reader) {
     return reader;
   }
-  
+
+  /**
+   * Wraps / alters the given Reader. Through this method AnalyzerWrappers can
+   * implement {@link #initReaderForNormalization(String, Reader)}. By default,
+   * the given reader is returned.
+   * 
+   * @param fieldName
+   *          name of the field which is to be analyzed
+   * @param reader
+   *          the reader to wrap
+   * @return the wrapped reader
+   */
+  protected Reader wrapReaderForNormalization(String fieldName, Reader reader) {
+    return reader;
+  }
+
   @Override
   protected final TokenStreamComponents createComponents(String fieldName) {
     return wrapComponents(fieldName, getWrappedAnalyzer(fieldName).createComponents(fieldName));
   }
 
   @Override
+  protected final TokenStream normalize(String fieldName, TokenStream in) {
+    return wrapTokenStreamForNormalization(fieldName, getWrappedAnalyzer(fieldName).normalize(fieldName, in));
+  }
+
+  @Override
   public int getPositionIncrementGap(String fieldName) {
     return getWrappedAnalyzer(fieldName).getPositionIncrementGap(fieldName);
   }
@@ -115,4 +153,14 @@ public abstract class AnalyzerWrapper extends Analyzer {
   public final Reader initReader(String fieldName, Reader reader) {
     return getWrappedAnalyzer(fieldName).initReader(fieldName, wrapReader(fieldName, reader));
   }
+
+  @Override
+  protected final Reader initReaderForNormalization(String fieldName, Reader reader) {
+    return getWrappedAnalyzer(fieldName).initReaderForNormalization(fieldName, wrapReaderForNormalization(fieldName, reader));
+  }
+
+  @Override
+  protected final AttributeFactory attributeFactory(String fieldName) {
+    return getWrappedAnalyzer(fieldName).attributeFactory(fieldName);
+  }
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/af600480/lucene/core/src/java/org/apache/lucene/analysis/DelegatingAnalyzerWrapper.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/DelegatingAnalyzerWrapper.java b/lucene/core/src/java/org/apache/lucene/analysis/DelegatingAnalyzerWrapper.java
index 6f05d4d..edf5b2b 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/DelegatingAnalyzerWrapper.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/DelegatingAnalyzerWrapper.java
@@ -54,12 +54,22 @@ public abstract class DelegatingAnalyzerWrapper extends AnalyzerWrapper {
   protected final TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) {
     return super.wrapComponents(fieldName, components);
   }
-  
+
+  @Override
+  protected final TokenStream wrapTokenStreamForNormalization(String fieldName, TokenStream in) {
+    return super.wrapTokenStreamForNormalization(fieldName, in);
+  }
+
   @Override
   protected final Reader wrapReader(String fieldName, Reader reader) {
     return super.wrapReader(fieldName, reader);
   }
-  
+
+  @Override
+  protected final Reader wrapReaderForNormalization(String fieldName, Reader reader) {
+    return super.wrapReaderForNormalization(fieldName, reader);
+  }
+
   private static final class DelegatingReuseStrategy extends ReuseStrategy {
     DelegatingAnalyzerWrapper wrapper;
     private final ReuseStrategy fallbackStrategy;

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/af600480/lucene/core/src/test/org/apache/lucene/analysis/TestDelegatingAnalyzerWrapper.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/analysis/TestDelegatingAnalyzerWrapper.java b/lucene/core/src/test/org/apache/lucene/analysis/TestDelegatingAnalyzerWrapper.java
new file mode 100644
index 0000000..1d6cf15
--- /dev/null
+++ b/lucene/core/src/test/org/apache/lucene/analysis/TestDelegatingAnalyzerWrapper.java
@@ -0,0 +1,107 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.nio.charset.StandardCharsets;
+
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.LuceneTestCase;
+
+public class TestDelegatingAnalyzerWrapper extends LuceneTestCase {
+
+  public void testDelegatesNormalization() {
+    Analyzer analyzer1 = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
+    DelegatingAnalyzerWrapper w1 = new DelegatingAnalyzerWrapper(Analyzer.GLOBAL_REUSE_STRATEGY) {
+      @Override
+      protected Analyzer getWrappedAnalyzer(String fieldName) {
+        return analyzer1;
+      }
+    };
+    assertEquals(new BytesRef("Ab C"), w1.normalize("foo", "Ab C"));
+
+    Analyzer analyzer2 = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true);
+    DelegatingAnalyzerWrapper w2 = new DelegatingAnalyzerWrapper(Analyzer.GLOBAL_REUSE_STRATEGY) {
+      @Override
+      protected Analyzer getWrappedAnalyzer(String fieldName) {
+        return analyzer2;
+      }
+    };
+    assertEquals(new BytesRef("ab c"), w2.normalize("foo", "Ab C"));
+  }
+
+  public void testDelegatesAttributeFactory() throws Exception {
+    Analyzer analyzer1 = new MockBytesAnalyzer();
+    DelegatingAnalyzerWrapper w1 = new DelegatingAnalyzerWrapper(Analyzer.GLOBAL_REUSE_STRATEGY) {
+      @Override
+      protected Analyzer getWrappedAnalyzer(String fieldName) {
+        return analyzer1;
+      }
+    };
+    assertEquals(new BytesRef("Ab C".getBytes(StandardCharsets.UTF_16LE)), w1.normalize("foo", "Ab C"));
+  }
+
+  public void testDelegatesCharFilter() throws Exception {
+    Analyzer analyzer1 = new Analyzer() {
+      @Override
+      protected Reader initReaderForNormalization(String fieldName, Reader reader) {
+        return new DummyCharFilter(reader, 'b', 'z');
+      }
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName) {
+        Tokenizer tokenizer = new MockTokenizer(attributeFactory(fieldName));
+        return new TokenStreamComponents(tokenizer);
+      }
+    };
+    DelegatingAnalyzerWrapper w1 = new DelegatingAnalyzerWrapper(Analyzer.GLOBAL_REUSE_STRATEGY) {
+      @Override
+      protected Analyzer getWrappedAnalyzer(String fieldName) {
+        return analyzer1;
+      }
+    };
+    assertEquals(new BytesRef("az c"), w1.normalize("foo", "ab c"));
+  }
+
+  private static class DummyCharFilter extends CharFilter {
+
+    private final char match, repl;
+
+    public DummyCharFilter(Reader input, char match, char repl) {
+      super(input);
+      this.match = match;
+      this.repl = repl;
+    }
+
+    @Override
+    protected int correct(int currentOff) {
+      return currentOff;
+    }
+
+    @Override
+    public int read(char[] cbuf, int off, int len) throws IOException {
+      final int read = input.read(cbuf, off, len);
+      for (int i = 0; i < read; ++i) {
+        if (cbuf[off+i] == match) {
+          cbuf[off+i] = repl;
+        }
+      }
+      return read;
+    }
+    
+  }
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/af600480/lucene/test-framework/src/java/org/apache/lucene/analysis/MockBytesAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/MockBytesAnalyzer.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/MockBytesAnalyzer.java
index b8cfc5b..4d51717 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/analysis/MockBytesAnalyzer.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/MockBytesAnalyzer.java
@@ -30,7 +30,7 @@ public final class MockBytesAnalyzer extends Analyzer {
   }
 
   @Override
-  protected AttributeFactory attributeFactory() {
+  protected AttributeFactory attributeFactory(String fieldName) {
     return MockUTF16TermAttributeImpl.UTF16_TERM_ATTRIBUTE_FACTORY;
   }
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/af600480/solr/core/src/java/org/apache/solr/analysis/TokenizerChain.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/analysis/TokenizerChain.java b/solr/core/src/java/org/apache/solr/analysis/TokenizerChain.java
index a5afbec..ab5458c 100644
--- a/solr/core/src/java/org/apache/solr/analysis/TokenizerChain.java
+++ b/solr/core/src/java/org/apache/solr/analysis/TokenizerChain.java
@@ -99,7 +99,7 @@ public final class TokenizerChain extends SolrAnalyzer {
 
   @Override
   protected TokenStreamComponents createComponents(String fieldName) {
-    Tokenizer tk = tokenizer.create(attributeFactory());
+    Tokenizer tk = tokenizer.create(attributeFactory(fieldName));
     TokenStream ts = tk;
     for (TokenFilterFactory filter : filters) {
       ts = filter.create(ts);