Posted to commits@lucene.apache.org by jp...@apache.org on 2016/07/12 16:03:16 UTC

[1/3] lucene-solr:branch_6x: LUCENE-7371: Better compression of values in Lucene60PointsFormat.

Repository: lucene-solr
Updated Branches:
  refs/heads/branch_6x ccd3bc846 -> 1f446872a


LUCENE-7371: Better compression of values in Lucene60PointsFormat.


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/1f446872
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/1f446872
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/1f446872

Branch: refs/heads/branch_6x
Commit: 1f446872aa9346c22643d0fb753ec42942b5a4d2
Parents: 7c2e7a0
Author: Adrien Grand <jp...@gmail.com>
Authored: Tue Jul 5 16:54:19 2016 +0200
Committer: Adrien Grand <jp...@gmail.com>
Committed: Tue Jul 12 18:01:44 2016 +0200

----------------------------------------------------------------------
 lucene/CHANGES.txt                              |   3 +
 .../simpletext/SimpleTextPointsWriter.java      |  16 +-
 .../org/apache/lucene/util/bkd/BKDReader.java   |  65 ++++++-
 .../org/apache/lucene/util/bkd/BKDWriter.java   | 185 +++++++++++++++----
 .../org/apache/lucene/util/bkd/TestBKD.java     |  29 +++
 .../lucene/index/BasePointsFormatTestCase.java  |  29 +++
 6 files changed, 281 insertions(+), 46 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/1f446872/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 83d1782..c5e6f5c 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -92,6 +92,9 @@ Optimizations
 
 * LUCENE-7351: Doc id compression for points. (Adrien Grand)
 
+* LUCENE-7371: Point values are now better compressed using run-length
+  encoding. (Adrien Grand)
+
 Other
 
 * LUCENE-4787: Fixed some highlighting javadocs. (Michael Dodsworth via Adrien

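For readers skimming the diffs below: the change encodes leaf-block values with a simple run-length scheme on the first byte after each dimension's common prefix. The following standalone sketch (hypothetical class and helper names, not code from this patch) shows the framing the writer uses, namely runs of at most 255 equal bytes stored as a (byte, length) pair:

import java.io.ByteArrayOutputStream;
import java.util.Arrays;

public class RunLengthSketch {
  // Encode a sorted column of bytes as (byte, runLen) pairs with runLen <= 255,
  // mirroring how the leaf writer compresses the byte at compressedByteOffset.
  static byte[] encode(byte[] sortedColumn) {
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    for (int i = 0; i < sortedColumn.length; ) {
      byte b = sortedColumn[i];
      int runLen = 1;
      while (i + runLen < sortedColumn.length
          && runLen < 0xFF
          && sortedColumn[i + runLen] == b) {
        runLen++;
      }
      out.write(b);       // the shared byte value
      out.write(runLen);  // number of consecutive values sharing it
      i += runLen;
    }
    return out.toByteArray();
  }

  public static void main(String[] args) {
    byte[] column = new byte[300];
    Arrays.fill(column, (byte) 7);
    // 300 identical bytes need two runs (255 + 45): 4 bytes instead of 300
    System.out.println(encode(column).length); // prints 4
  }
}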
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/1f446872/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextPointsWriter.java
----------------------------------------------------------------------
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextPointsWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextPointsWriter.java
index e54e20a..8d5c034 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextPointsWriter.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextPointsWriter.java
@@ -20,6 +20,7 @@ package org.apache.lucene.codecs.simpletext;
 import java.io.IOException;
 import java.util.HashMap;
 import java.util.Map;
+import java.util.function.IntFunction;
 
 import org.apache.lucene.codecs.PointsReader;
 import org.apache.lucene.codecs.PointsWriter;
@@ -161,12 +162,15 @@ class SimpleTextPointsWriter extends PointsWriter {
         }
 
         @Override
-        protected void writeLeafBlockPackedValue(IndexOutput out, int[] commonPrefixLengths, byte[] bytes, int bytesOffset) throws IOException {
-          // NOTE: we don't do prefix coding, so we ignore commonPrefixLengths
-          write(out, BLOCK_VALUE);
-          write(out, new BytesRef(bytes, bytesOffset, packedBytesLength).toString());
-          newline(out);
-        }          
+        protected void writeLeafBlockPackedValues(IndexOutput out, int[] commonPrefixLengths, int count, int sortedDim, IntFunction<BytesRef> packedValues) throws IOException {
+          for (int i = 0; i < count; ++i) {
+            BytesRef packedValue = packedValues.apply(i);
+            // NOTE: we don't do prefix coding, so we ignore commonPrefixLengths
+            write(out, BLOCK_VALUE);
+            write(out, packedValue.toString());
+            newline(out);
+          }
+        }
       }) {
 
       values.intersect(fieldInfo.name, new IntersectVisitor() {

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/1f446872/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java
index 3566bc1..9ca0bb4 100644
--- a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java
+++ b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java
@@ -20,6 +20,7 @@ import java.io.IOException;
 import java.util.Arrays;
 
 import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.index.CorruptIndexException;
 import org.apache.lucene.index.PointValues.IntersectVisitor;
 import org.apache.lucene.index.PointValues.Relation;
 import org.apache.lucene.store.IndexInput;
@@ -345,6 +346,63 @@ public class BKDReader implements Accountable {
 
   protected void visitDocValues(int[] commonPrefixLengths, byte[] scratchPackedValue, IndexInput in, int[] docIDs, int count, IntersectVisitor visitor) throws IOException {
     visitor.grow(count);
+
+    readCommonPrefixes(commonPrefixLengths, scratchPackedValue, in);
+
+    int compressedDim = version < BKDWriter.VERSION_COMPRESSED_VALUES
+        ? -1
+        : readCompressedDim(in);
+
+    if (compressedDim == -1) {
+      visitRawDocValues(commonPrefixLengths, scratchPackedValue, in, docIDs, count, visitor);
+    } else {
+      visitCompressedDocValues(commonPrefixLengths, scratchPackedValue, in, docIDs, count, visitor, compressedDim);
+    }
+  }
+
+  // Just read suffixes for every dimension
+  private void visitRawDocValues(int[] commonPrefixLengths, byte[] scratchPackedValue, IndexInput in, int[] docIDs, int count, IntersectVisitor visitor) throws IOException {
+    for (int i = 0; i < count; ++i) {
+      for(int dim=0;dim<numDims;dim++) {
+        int prefix = commonPrefixLengths[dim];
+        in.readBytes(scratchPackedValue, dim*bytesPerDim + prefix, bytesPerDim - prefix);
+      }
+      visitor.visit(docIDs[i], scratchPackedValue);
+    }
+  }
+
+  private void visitCompressedDocValues(int[] commonPrefixLengths, byte[] scratchPackedValue, IndexInput in, int[] docIDs, int count, IntersectVisitor visitor, int compressedDim) throws IOException {
+    // the byte at `compressedByteOffset` is compressed using run-length compression,
+    // other suffix bytes are stored verbatim
+    final int compressedByteOffset = compressedDim * bytesPerDim + commonPrefixLengths[compressedDim];
+    commonPrefixLengths[compressedDim]++;
+    int i;
+    for (i = 0; i < count; ) {
+      scratchPackedValue[compressedByteOffset] = in.readByte();
+      final int runLen = Byte.toUnsignedInt(in.readByte());
+      for (int j = 0; j < runLen; ++j) {
+        for(int dim=0;dim<numDims;dim++) {
+          int prefix = commonPrefixLengths[dim];
+          in.readBytes(scratchPackedValue, dim*bytesPerDim + prefix, bytesPerDim - prefix);
+        }
+        visitor.visit(docIDs[i+j], scratchPackedValue);
+      }
+      i += runLen;
+    }
+    if (i != count) {
+      throw new CorruptIndexException("Sub blocks do not add up to the expected count: " + count + " != " + i, in);
+    }
+  }
+
+  private int readCompressedDim(IndexInput in) throws IOException {
+    int compressedDim = in.readByte();
+    if (compressedDim < -1 || compressedDim >= numDims) {
+      throw new CorruptIndexException("Got compressedDim="+compressedDim, in);
+    }
+    return compressedDim;
+  }
+
+  private void readCommonPrefixes(int[] commonPrefixLengths, byte[] scratchPackedValue, IndexInput in) throws IOException {
     for(int dim=0;dim<numDims;dim++) {
       int prefix = in.readVInt();
       commonPrefixLengths[dim] = prefix;
@@ -353,13 +411,6 @@ public class BKDReader implements Accountable {
       }
       //System.out.println("R: " + dim + " of " + numDims + " prefix=" + prefix);
     }
-    for(int i=0;i<count;i++) {
-      for(int dim=0;dim<numDims;dim++) {
-        int prefix = commonPrefixLengths[dim];
-        in.readBytes(scratchPackedValue, dim*bytesPerDim + prefix, bytesPerDim - prefix);
-      }
-      visitor.visit(docIDs[i], scratchPackedValue);
-    }
   }
 
   private void intersect(IntersectState state,

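To complement the reader code above, here is the decoding counterpart of the earlier encode sketch. It only illustrates the (byte, runLen) framing that visitCompressedDocValues consumes; the real reader additionally reads every value's remaining suffix bytes inside each run, and treats a compressedDim of -1 as the uncompressed path (older segments, or blocks whose common prefixes already cover the whole value). The names below are assumptions, not Lucene APIs:

// Sketch: expand (byte, runLen) pairs back into a column of bytes and check
// that the run lengths add up to the expected count, as the reader does.
static byte[] decode(byte[] encoded, int expectedCount) {
  byte[] column = new byte[expectedCount];
  int i = 0;
  for (int pos = 0; pos < encoded.length; pos += 2) {
    byte b = encoded[pos];
    int runLen = Byte.toUnsignedInt(encoded[pos + 1]);
    for (int j = 0; j < runLen; j++) {
      column[i + j] = b;
    }
    i += runLen;
  }
  if (i != expectedCount) {
    throw new IllegalStateException("Sub blocks do not add up to the expected count: " + expectedCount + " != " + i);
  }
  return column;
}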
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/1f446872/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
index 6dfdac2..09e6412 100644
--- a/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
@@ -22,9 +22,12 @@ import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Comparator;
 import java.util.List;
+import java.util.function.IntFunction;
 
 import org.apache.lucene.codecs.CodecUtil;
 import org.apache.lucene.index.MergeState;
+import org.apache.lucene.index.PointValues.IntersectVisitor;
+import org.apache.lucene.index.PointValues.Relation;
 import org.apache.lucene.store.ChecksumIndexInput;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IOContext;
@@ -43,7 +46,6 @@ import org.apache.lucene.util.PriorityQueue;
 import org.apache.lucene.util.StringHelper;
 
 // TODO
-//   - the compression is somewhat stupid now (delta vInt for 1024 docIDs, no compression for the byte[] values even though they have high locality)
 //   - allow variable length byte[] (across docs and dims), but this is quite a bit more hairy
 //   - we could also index "auto-prefix terms" here, and use better compression, and maybe only use for the "fully contained" case so we'd
 //     only index docIDs
@@ -60,7 +62,7 @@ import org.apache.lucene.util.StringHelper;
  *  the requested <code>maxPointsInLeafNode</code>.  Values that fall exactly
  *  on a cell boundary may be in either cell.
  *
- *  <p>The number of dimensions can be 1 to 255, but every byte[] value is fixed length.
+ *  <p>The number of dimensions can be 1 to 8, but every byte[] value is fixed length.
  *
  *  <p>
  *  See <a href="https://www.cs.duke.edu/~pankaj/publications/papers/bkd-sstd.pdf">this paper</a> for details.
@@ -69,7 +71,7 @@ import org.apache.lucene.util.StringHelper;
  *  and then uses up to the specified {@code maxMBSortInHeap} heap space for writing.
  *
  *  <p>
- *  <b>NOTE</b>: This can write at most Integer.MAX_VALUE * <code>maxPointsInLeafNode</code> total points, and
+ *  <b>NOTE</b>: This can write at most Integer.MAX_VALUE * <code>maxPointsInLeafNode</code> total points.
  *
  * @lucene.experimental */
 
@@ -78,7 +80,8 @@ public class BKDWriter implements Closeable {
   public static final String CODEC_NAME = "BKD";
   public static final int VERSION_START = 0;
   public static final int VERSION_COMPRESSED_DOC_IDS = 1;
-  public static final int VERSION_CURRENT = VERSION_COMPRESSED_DOC_IDS;
+  public static final int VERSION_COMPRESSED_VALUES = 2;
+  public static final int VERSION_CURRENT = VERSION_COMPRESSED_VALUES;
 
   /** How many bytes each docs takes in the fixed-width offline format */
   private final int bytesPerDoc;
@@ -312,6 +315,8 @@ public class BKDWriter implements Closeable {
     /** Which leaf block we are up to */
     private int blockID;
 
+    private final byte[] packedValues;
+
     public MergeReader(BKDReader bkd, MergeState.DocMap docMap) throws IOException {
       this.bkd = bkd;
       state = new BKDReader.IntersectState(bkd.in.clone(),
@@ -327,6 +332,7 @@ public class BKDWriter implements Closeable {
         //System.out.println("  leaf fp=" + fp);
       }
       state.in.seek(minFP);
+      this.packedValues = new byte[bkd.maxPointsInLeafNode * bkd.packedBytesLength];
     }
 
     public boolean next() throws IOException {
@@ -341,18 +347,33 @@ public class BKDWriter implements Closeable {
           docsInBlock = bkd.readDocIDs(state.in, state.in.getFilePointer(), state.scratchDocIDs);
           assert docsInBlock > 0;
           docBlockUpto = 0;
-          for(int dim=0;dim<bkd.numDims;dim++) {
-            int prefix = state.in.readVInt();
-            state.commonPrefixLengths[dim] = prefix;
-            if (prefix > 0) {
-              state.in.readBytes(state.scratchPackedValue, dim*bkd.bytesPerDim, prefix);
+          bkd.visitDocValues(state.commonPrefixLengths, state.scratchPackedValue, state.in, state.scratchDocIDs, docsInBlock, new IntersectVisitor() {
+            int i = 0;
+
+            @Override
+            public void visit(int docID) throws IOException {
+              throw new UnsupportedOperationException();
             }
-          }
+
+            @Override
+            public void visit(int docID, byte[] packedValue) throws IOException {
+              assert docID == state.scratchDocIDs[i];
+              System.arraycopy(packedValue, 0, packedValues, i * bkd.packedBytesLength, bkd.packedBytesLength);
+              i++;
+            }
+
+            @Override
+            public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
+              throw new UnsupportedOperationException();
+            }
+
+          });
 
           blockID++;
         }
 
-        int oldDocID = state.scratchDocIDs[docBlockUpto++];
+        final int index = docBlockUpto++;
+        int oldDocID = state.scratchDocIDs[index];
 
         int mappedDocID;
         if (docMap == null) {
@@ -360,13 +381,11 @@ public class BKDWriter implements Closeable {
         } else {
           mappedDocID = docMap.get(oldDocID);
         }
-        for(int dim=0;dim<bkd.numDims;dim++) {
-          int prefix = state.commonPrefixLengths[dim];
-          state.in.readBytes(state.scratchPackedValue, dim*bkd.bytesPerDim + prefix, bkd.bytesPerDim - prefix);
-        }
+        
         if (mappedDocID != -1) {
           // Not deleted!
           docID = mappedDocID;
+          System.arraycopy(packedValues, index * bkd.packedBytesLength, state.scratchPackedValue, 0, bkd.packedBytesLength);
           return true;
         }
       }
@@ -518,10 +537,21 @@ public class BKDWriter implements Closeable {
         writeLeafBlockDocs(out, leafBlockDocIDs, 0, leafCount);
         writeCommonPrefixes(out, commonPrefixLengths, firstPackedValue);
 
-        // Write the full values:
-        for (int i=0;i<leafCount;i++) {
-          writeLeafBlockPackedValue(out, commonPrefixLengths, leafBlockPackedValues[i], 0);
-        }
+        final IntFunction<BytesRef> packedValues = new IntFunction<BytesRef>() {
+          final BytesRef scratch = new BytesRef();
+
+          {
+            scratch.length = packedBytesLength;
+            scratch.offset = 0;
+          }
+
+          @Override
+          public BytesRef apply(int i) {
+            scratch.bytes = leafBlockPackedValues[i];
+            return scratch;
+          }
+        };
+        writeLeafBlockPackedValues(out, commonPrefixLengths, leafCount, 0, packedValues);
 
         leafCount = 0;
       }
@@ -896,13 +926,57 @@ public class BKDWriter implements Closeable {
     DocIdsWriter.writeDocIds(docIDs, start, count, out);
   }
 
-  protected void writeLeafBlockPackedValue(IndexOutput out, int[] commonPrefixLengths, byte[] bytes, int offset) throws IOException {
-    for(int dim=0;dim<numDims;dim++) {
-      int prefix = commonPrefixLengths[dim];
-      out.writeBytes(bytes, offset+dim*bytesPerDim+prefix, bytesPerDim-prefix);
+  protected void writeLeafBlockPackedValues(IndexOutput out, int[] commonPrefixLengths, int count, int sortedDim, IntFunction<BytesRef> packedValues) throws IOException {
+    int prefixLenSum = Arrays.stream(commonPrefixLengths).sum();
+    if (prefixLenSum == packedBytesLength) {
+      // all values in this block are equal
+      out.writeByte((byte) -1);
+    } else {
+      assert commonPrefixLengths[sortedDim] < bytesPerDim;
+      out.writeByte((byte) sortedDim);
+      int compressedByteOffset = sortedDim * bytesPerDim + commonPrefixLengths[sortedDim];
+      commonPrefixLengths[sortedDim]++;
+      for (int i = 0; i < count; ) {
+        // do run-length compression on the byte at compressedByteOffset 
+        int runLen = runLen(packedValues, i, Math.min(i + 0xff, count), compressedByteOffset);
+        assert runLen <= 0xff;
+        BytesRef first = packedValues.apply(i);
+        byte prefixByte = first.bytes[first.offset + compressedByteOffset];
+        out.writeByte(prefixByte);
+        out.writeByte((byte) runLen);
+        writeLeafBlockPackedValuesRange(out, commonPrefixLengths, i, i + runLen, packedValues);
+        i += runLen;
+        assert i <= count;
+      }
     }
   }
 
+  private void writeLeafBlockPackedValuesRange(IndexOutput out, int[] commonPrefixLengths, int start, int end, IntFunction<BytesRef> packedValues) throws IOException {
+    for (int i = start; i < end; ++i) {
+      BytesRef ref = packedValues.apply(i);
+      assert ref.length == packedBytesLength;
+
+      for(int dim=0;dim<numDims;dim++) {
+        int prefix = commonPrefixLengths[dim];
+        out.writeBytes(ref.bytes, ref.offset + dim*bytesPerDim + prefix, bytesPerDim-prefix);
+      }
+    }
+  }
+
+  private static int runLen(IntFunction<BytesRef> packedValues, int start, int end, int byteOffset) {
+    BytesRef first = packedValues.apply(start);
+    byte b = first.bytes[first.offset + byteOffset];
+    for (int i = start + 1; i < end; ++i) {
+      BytesRef ref = packedValues.apply(i);
+      byte b2 = ref.bytes[ref.offset + byteOffset];
+      assert Byte.toUnsignedInt(b2) >= Byte.toUnsignedInt(b);
+      if (b != b2) {
+        return i - start;
+      }
+    }
+    return end - start;
+  }
+
   protected void writeCommonPrefixes(IndexOutput out, int[] commonPrefixes, byte[] packedValue) throws IOException {
     for(int dim=0;dim<numDims;dim++) {
       out.writeVInt(commonPrefixes[dim]);
@@ -1058,6 +1132,11 @@ public class BKDWriter implements Closeable {
     if (nodeID >= leafNodeOffset) {
 
       // Leaf node: write block
+      // We can write the block in any order so by default we write it sorted by the dimension that has the
+      // least number of unique bytes at commonPrefixLengths[dim], which makes compression more efficient
+      int sortedDim = 0;
+      int sortedDimCardinality = Integer.MAX_VALUE;
+
       for (int dim=0;dim<numDims;dim++) {
         if (slices[dim].writer instanceof HeapPointWriter == false) {
           // Adversarial cases can cause this, e.g. very lopsided data, all equal points, such that we started
@@ -1081,9 +1160,29 @@ public class BKDWriter implements Closeable {
             break;
           }
         }
+
+        int prefix = commonPrefixLengths[dim];
+        if (prefix < bytesPerDim) {
+          int cardinality = 1;
+          byte previous = scratch1[offset + prefix];
+          for (long i = 1; i < source.count; ++i) {
+            heapSource.readPackedValue(Math.toIntExact(source.start + i), scratch2);
+            byte b = scratch2[offset + prefix];
+            assert Byte.toUnsignedInt(previous) <= Byte.toUnsignedInt(b);
+            if (b != previous) {
+              cardinality++;
+              previous = b;
+            }
+          }
+          assert cardinality <= 256;
+          if (cardinality < sortedDimCardinality) {
+            sortedDim = dim;
+            sortedDimCardinality = cardinality;
+          }
+        }
       }
 
-      PathSlice source = slices[0];
+      PathSlice source = slices[sortedDim];
 
       // We ensured that maxPointsSortInHeap was >= maxPointsInLeafNode, so we better be in heap at this point:
       HeapPointWriter heapSource = (HeapPointWriter) source.writer;
@@ -1105,15 +1204,21 @@ public class BKDWriter implements Closeable {
       writeCommonPrefixes(out, commonPrefixLengths, scratch1);
 
       // Write the full values:
-      byte[] lastPackedValue = new byte[bytesPerDim];
-      for (int i=0;i<count;i++) {
-        heapSource.getPackedValueSlice(Math.toIntExact(source.start + i), scratchBytesRef);
-        assert numDims != 1 || valueInOrder(i, lastPackedValue, scratchBytesRef.bytes, scratchBytesRef.offset);
-
-        // Make sure this value does in fact fall within this leaf cell:
-        assert valueInBounds(scratchBytesRef, minPackedValue, maxPackedValue);
-        writeLeafBlockPackedValue(out, commonPrefixLengths, scratchBytesRef.bytes, scratchBytesRef.offset);
-      }
+      IntFunction<BytesRef> packedValues = new IntFunction<BytesRef>() {
+        final BytesRef scratch = new BytesRef();
+
+        {
+          scratch.length = packedBytesLength;
+        }
+
+        @Override
+        public BytesRef apply(int i) {
+          heapSource.getPackedValueSlice(Math.toIntExact(source.start + i), scratch);
+          return scratch;
+        }
+      };
+      assert valuesInOrderAndBounds(count, minPackedValue, maxPackedValue, packedValues);
+      writeLeafBlockPackedValues(out, commonPrefixLengths, count, sortedDim, packedValues);
 
     } else {
       // Inner node: partition/recurse
@@ -1216,6 +1321,20 @@ public class BKDWriter implements Closeable {
   }
 
   // only called from assert
+  private boolean valuesInOrderAndBounds(int count, byte[] minPackedValue, byte[] maxPackedValue, IntFunction<BytesRef> values) throws IOException {
+    byte[] lastPackedValue = new byte[bytesPerDim];
+    for (int i=0;i<count;i++) {
+      BytesRef packedValue = values.apply(i);
+      assert packedValue.length == packedBytesLength;
+      assert numDims != 1 || valueInOrder(i, lastPackedValue, packedValue.bytes, packedValue.offset);
+
+      // Make sure this value does in fact fall within this leaf cell:
+      assert valueInBounds(packedValue, minPackedValue, maxPackedValue);
+    }
+    return true;
+  }
+
+  // only called from assert
   private boolean valueInOrder(long ord, byte[] lastPackedValue, byte[] packedValue, int packedValueOffset) {
     if (ord > 0 && StringHelper.compare(bytesPerDim, lastPackedValue, 0, packedValue, packedValueOffset) > 0) {
       throw new AssertionError("values out of order: last value=" + new BytesRef(lastPackedValue) + " current value=" + new BytesRef(packedValue, packedValueOffset, packedBytesLength) + " ord=" + ord);

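The leaf-writing change above picks which dimension to sort and compress on by counting, for each dimension, how many distinct bytes occur at the first position after that dimension's common prefix, and keeping the dimension with the smallest count. A rough standalone version of that selection follows (the array layout and names are assumptions made for the sketch):

// Sketch: columns[dim] holds, for every value in the leaf block, the byte at
// commonPrefixLengths[dim], in the sorted order the writer maintains for that
// dimension. The dimension with the fewest distinct bytes compresses best.
static int pickSortedDim(byte[][] columns) {
  int sortedDim = 0;
  int sortedDimCardinality = Integer.MAX_VALUE;
  for (int dim = 0; dim < columns.length; dim++) {
    int cardinality = 1;
    for (int i = 1; i < columns[dim].length; i++) {
      if (columns[dim][i] != columns[dim][i - 1]) {
        cardinality++;
      }
    }
    if (cardinality < sortedDimCardinality) {
      sortedDim = dim;
      sortedDimCardinality = cardinality;
    }
  }
  return sortedDim;
}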
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/1f446872/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java b/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java
index e8b88fc..9eb1fd3 100644
--- a/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java
+++ b/lucene/core/src/test/org/apache/lucene/util/bkd/TestBKD.java
@@ -507,6 +507,35 @@ public class TestBKD extends LuceneTestCase {
     verify(docValues, null, numDims, numBytesPerDim);
   }
 
+  // this should trigger run-length compression with lengths that are greater than 255
+  public void testOneDimTwoValues() throws Exception {
+    int numBytesPerDim = TestUtil.nextInt(random(), 2, 30);
+    int numDims = TestUtil.nextInt(random(), 1, 5);
+
+    int numDocs = atLeast(1000);
+    int theDim = random().nextInt(numDims);
+    byte[] value1 = new byte[numBytesPerDim];
+    random().nextBytes(value1);
+    byte[] value2 = new byte[numBytesPerDim];
+    random().nextBytes(value2);
+    byte[][][] docValues = new byte[numDocs][][];
+
+    for(int docID=0;docID<numDocs;docID++) {
+      byte[][] values = new byte[numDims][];
+      for(int dim=0;dim<numDims;dim++) {
+        if (dim == theDim) {
+          values[dim] = random().nextBoolean() ? value1 : value2;
+        } else {
+          values[dim] = new byte[numBytesPerDim];
+          random().nextBytes(values[dim]);
+        }
+      }
+      docValues[docID] = values;
+    }
+
+    verify(docValues, null, numDims, numBytesPerDim);
+  }
+
   public void testMultiValued() throws Exception {
     int numBytesPerDim = TestUtil.nextInt(random(), 2, 30);
     int numDims = TestUtil.nextInt(random(), 1, 5);

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/1f446872/lucene/test-framework/src/java/org/apache/lucene/index/BasePointsFormatTestCase.java
----------------------------------------------------------------------
diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/BasePointsFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/index/BasePointsFormatTestCase.java
index 7c42d1c..5891df5 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/index/BasePointsFormatTestCase.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/index/BasePointsFormatTestCase.java
@@ -327,6 +327,35 @@ public abstract class BasePointsFormatTestCase extends BaseIndexFileFormatTestCa
     verify(docValues, null, numDims, numBytesPerDim);
   }
 
+  // this should trigger run-length compression with lengths that are greater than 255
+  public void testOneDimTwoValues() throws Exception {
+    int numBytesPerDim = TestUtil.nextInt(random(), 2, PointValues.MAX_NUM_BYTES);
+    int numDims = TestUtil.nextInt(random(), 1, PointValues.MAX_DIMENSIONS);
+
+    int numDocs = atLeast(1000);
+    int theDim = random().nextInt(numDims);
+    byte[] value1 = new byte[numBytesPerDim];
+    random().nextBytes(value1);
+    byte[] value2 = new byte[numBytesPerDim];
+    random().nextBytes(value2);
+    byte[][][] docValues = new byte[numDocs][][];
+
+    for(int docID=0;docID<numDocs;docID++) {
+      byte[][] values = new byte[numDims][];
+      for(int dim=0;dim<numDims;dim++) {
+        if (dim == theDim) {
+          values[dim] = random().nextBoolean() ? value1 : value2;
+        } else {
+          values[dim] = new byte[numBytesPerDim];
+          random().nextBytes(values[dim]);
+        }
+      }
+      docValues[docID] = values;
+    }
+
+    verify(docValues, null, numDims, numBytesPerDim);
+  }
+
   // Tests on N-dimensional points where each dimension is a BigInteger
   public void testBigIntNDims() throws Exception {
 


[3/3] lucene-solr:branch_6x: LUCENE-7355: Add Analyzer#normalize() and use it in query parsers.

Posted by jp...@apache.org.
LUCENE-7355: Add Analyzer#normalize() and use it in query parsers.


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/7c2e7a0f
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/7c2e7a0f
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/7c2e7a0f

Branch: refs/heads/branch_6x
Commit: 7c2e7a0fb80a5bf733cf710aee6cbf01d02629eb
Parents: ccd3bc8
Author: Adrien Grand <jp...@gmail.com>
Authored: Tue Jun 28 18:23:11 2016 +0200
Committer: Adrien Grand <jp...@gmail.com>
Committed: Tue Jul 12 18:01:44 2016 +0200

----------------------------------------------------------------------
 lucene/CHANGES.txt                              |   7 +
 .../lucene/analysis/ar/ArabicAnalyzer.java      |   8 ++
 .../lucene/analysis/bg/BulgarianAnalyzer.java   |   7 +
 .../lucene/analysis/br/BrazilianAnalyzer.java   |   7 +
 .../lucene/analysis/ca/CatalanAnalyzer.java     |   8 ++
 .../apache/lucene/analysis/cjk/CJKAnalyzer.java |   7 +
 .../lucene/analysis/ckb/SoraniAnalyzer.java     |   9 ++
 .../lucene/analysis/core/SimpleAnalyzer.java    |   6 +
 .../lucene/analysis/core/StopAnalyzer.java      |   6 +
 .../lucene/analysis/custom/CustomAnalyzer.java  |  28 +++-
 .../lucene/analysis/cz/CzechAnalyzer.java       |   7 +
 .../lucene/analysis/da/DanishAnalyzer.java      |   7 +
 .../lucene/analysis/de/GermanAnalyzer.java      |   8 ++
 .../lucene/analysis/el/GreekAnalyzer.java       |   7 +
 .../lucene/analysis/en/EnglishAnalyzer.java     |   7 +
 .../lucene/analysis/es/SpanishAnalyzer.java     |   7 +
 .../lucene/analysis/eu/BasqueAnalyzer.java      |   7 +
 .../lucene/analysis/fa/PersianAnalyzer.java     |  14 +-
 .../lucene/analysis/fi/FinnishAnalyzer.java     |   7 +
 .../lucene/analysis/fr/FrenchAnalyzer.java      |   8 ++
 .../lucene/analysis/ga/IrishAnalyzer.java       |   8 ++
 .../lucene/analysis/gl/GalicianAnalyzer.java    |   7 +
 .../lucene/analysis/hi/HindiAnalyzer.java       |  11 ++
 .../lucene/analysis/hu/HungarianAnalyzer.java   |   7 +
 .../lucene/analysis/hy/ArmenianAnalyzer.java    |   7 +
 .../lucene/analysis/id/IndonesianAnalyzer.java  |   7 +
 .../lucene/analysis/it/ItalianAnalyzer.java     |   8 ++
 .../lucene/analysis/lt/LithuanianAnalyzer.java  |   7 +
 .../lucene/analysis/lv/LatvianAnalyzer.java     |   7 +
 .../lucene/analysis/nl/DutchAnalyzer.java       |   7 +
 .../lucene/analysis/no/NorwegianAnalyzer.java   |   7 +
 .../lucene/analysis/pt/PortugueseAnalyzer.java  |   7 +
 .../lucene/analysis/ro/RomanianAnalyzer.java    |   7 +
 .../lucene/analysis/ru/RussianAnalyzer.java     |   7 +
 .../analysis/standard/ClassicAnalyzer.java      |   5 +
 .../standard/UAX29URLEmailAnalyzer.java         |   5 +
 .../lucene/analysis/sv/SwedishAnalyzer.java     |   7 +
 .../apache/lucene/analysis/th/ThaiAnalyzer.java |   7 +
 .../lucene/analysis/tr/TurkishAnalyzer.java     |   7 +
 .../lucene/collation/CollationKeyAnalyzer.java  |   7 +
 .../core/TestAllAnalyzersHaveFactories.java     |   2 +
 .../lucene/analysis/core/TestAnalyzers.java     |   4 +
 .../lucene/analysis/core/TestRandomChains.java  |  10 +-
 .../analysis/custom/TestCustomAnalyzer.java     | 143 +++++++++++++++++++
 .../lucene/analysis/ja/JapaneseAnalyzer.java    |   7 +
 .../analysis/morfologik/MorfologikAnalyzer.java |   6 +
 .../analysis/cn/smart/SmartChineseAnalyzer.java |   6 +
 .../lucene/analysis/pl/PolishAnalyzer.java      |   7 +
 .../org/apache/lucene/analysis/Analyzer.java    | 135 ++++++++++++++++-
 .../analysis/standard/StandardAnalyzer.java     |   7 +
 .../analysis/standard/TestStandardAnalyzer.java |   6 +
 .../analyzing/AnalyzingQueryParser.java         | 127 ++++++----------
 .../queryparser/classic/QueryParserBase.java    |  35 +----
 .../queryparser/simple/SimpleQueryParser.java   |   9 +-
 .../analyzing/TestAnalyzingQueryParser.java     |  45 +++---
 .../queryparser/util/QueryParserTestBase.java   |   4 +
 .../analysis/BaseTokenStreamTestCase.java       |   5 +-
 .../apache/lucene/analysis/MockAnalyzer.java    |  11 +-
 .../lucene/analysis/MockBytesAnalyzer.java      |   7 +
 .../lucene/analysis/MockLowerCaseFilter.java    |  40 ++++++
 .../apache/solr/analysis/TokenizerChain.java    |  28 +++-
 61 files changed, 808 insertions(+), 150 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 573dd4a..83d1782 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -26,6 +26,9 @@ New Features
   methods Directory.rename and Directory.syncMetaData instead (Robert Muir,
   Uwe Schindler, Mike McCandless)
 
+* LUCENE-7355: Added Analyzer#normalize(), which only applies normalization to
+  an input string. (Adrien Grand)
+
 Bug Fixes
 
 * LUCENE-6662: Fixed potential resource leaks. (Rishabh Patel via Adrien Grand)
@@ -77,6 +80,10 @@ Improvements
 * LUCENE-7276: MatchNoDocsQuery now includes an optional reason for
   why it was used (Jim Ferenczi via Mike McCandless)
 
+* LUCENE-7355: AnalyzingQueryParser now only applies the subset of the analysis
+  chain that is about normalization for range/fuzzy/wildcard queries.
+  (Adrien Grand)
+
 Optimizations
 
 * LUCENE-7330, LUCENE-7339: Speed up conjunction queries. (Adrien Grand)

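A minimal usage sketch for the new entry point described in the CHANGES entries above, assuming the public Analyzer#normalize(String, String) wrapper this patch adds around the protected normalize(String, TokenStream) hook shown in the per-analyzer diffs below. A query parser can normalize a wildcard or range fragment without running tokenization, stemming, or stop-word removal:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.util.BytesRef;

public class NormalizeSketch {
  public static void main(String[] args) {
    try (Analyzer analyzer = new StandardAnalyzer()) {
      // Only the normalization part of the chain runs (lower-casing here),
      // so a wildcard fragment stays one piece instead of being tokenized.
      BytesRef normalized = analyzer.normalize("body", "Wildc");
      System.out.println(normalized.utf8ToString()); // expected: "wildc"
    }
  }
}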
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java
index 71da32d..c68399e 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java
@@ -146,5 +146,13 @@ public final class ArabicAnalyzer extends StopwordAnalyzerBase {
     }
     return new TokenStreamComponents(source, new ArabicStemFilter(result));
   }
+
+  @Override
+  protected TokenStream normalize(String fieldName, TokenStream in) {
+    TokenStream result = new LowerCaseFilter(in);
+    result = new DecimalDigitFilter(result);
+    result = new ArabicNormalizationFilter(result);
+    return result;
+  }
 }
 

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java
index 9cb0657..06c7eea 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/bg/BulgarianAnalyzer.java
@@ -126,4 +126,11 @@ public final class BulgarianAnalyzer extends StopwordAnalyzerBase {
     result = new BulgarianStemFilter(result);
     return new TokenStreamComponents(source, result);
   }
+
+  @Override
+  protected TokenStream normalize(String fieldName, TokenStream in) {
+    TokenStream result = new StandardFilter(in);
+    result = new LowerCaseFilter(result);
+    return result;
+  }
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java
index 5dd0cbc..ad1af92 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java
@@ -127,5 +127,12 @@ public final class BrazilianAnalyzer extends StopwordAnalyzerBase {
       result = new SetKeywordMarkerFilter(result, excltable);
     return new TokenStreamComponents(source, new BrazilianStemFilter(result));
   }
+
+  @Override
+  protected TokenStream normalize(String fieldName, TokenStream in) {
+    TokenStream result = new StandardFilter(in);
+    result = new LowerCaseFilter(result);
+    return result;
+  }
 }
 

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/ca/CatalanAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ca/CatalanAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ca/CatalanAnalyzer.java
index 739b61a..56f36e1 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ca/CatalanAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ca/CatalanAnalyzer.java
@@ -130,4 +130,12 @@ public final class CatalanAnalyzer extends StopwordAnalyzerBase {
     result = new SnowballFilter(result, new CatalanStemmer());
     return new TokenStreamComponents(source, result);
   }
+
+  @Override
+  protected TokenStream normalize(String fieldName, TokenStream in) {
+    TokenStream result = new StandardFilter(in);
+    result = new ElisionFilter(result, DEFAULT_ARTICLES);
+    result = new LowerCaseFilter(result);
+    return result;
+  }
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java
index d500ff9..d4214a1 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java
@@ -92,4 +92,11 @@ public final class CJKAnalyzer extends StopwordAnalyzerBase {
     result = new CJKBigramFilter(result);
     return new TokenStreamComponents(source, new StopFilter(result, stopwords));
   }
+
+  @Override
+  protected TokenStream normalize(String fieldName, TokenStream in) {
+    TokenStream result = new CJKWidthFilter(in);
+    result = new LowerCaseFilter(result);
+    return result;
+  }
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/ckb/SoraniAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ckb/SoraniAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ckb/SoraniAnalyzer.java
index 5fd1bec..7819c66 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ckb/SoraniAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ckb/SoraniAnalyzer.java
@@ -129,4 +129,13 @@ public final class SoraniAnalyzer extends StopwordAnalyzerBase {
     result = new SoraniStemFilter(result);
     return new TokenStreamComponents(source, result);
   }
+
+  @Override
+  protected TokenStream normalize(String fieldName, TokenStream in) {
+    TokenStream result = new StandardFilter(in);
+    result = new SoraniNormalizationFilter(result);
+    result = new LowerCaseFilter(result);
+    result = new DecimalDigitFilter(result);
+    return result;
+  }
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/SimpleAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/SimpleAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/SimpleAnalyzer.java
index d0fdcf6..6e0f2f0 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/SimpleAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/SimpleAnalyzer.java
@@ -19,6 +19,7 @@ package org.apache.lucene.analysis.core;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.TokenStream;
 
 /** An {@link Analyzer} that filters {@link LetterTokenizer} 
  *  with {@link LowerCaseFilter} 
@@ -35,4 +36,9 @@ public final class SimpleAnalyzer extends Analyzer {
   protected TokenStreamComponents createComponents(final String fieldName) {
     return new TokenStreamComponents(new LowerCaseTokenizer());
   }
+
+  @Override
+  protected TokenStream normalize(String fieldName, TokenStream in) {
+    return new LowerCaseFilter(in);
+  }
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopAnalyzer.java
index 3fa4982..7d7f532 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/StopAnalyzer.java
@@ -25,6 +25,7 @@ import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.LowerCaseFilter;
 import org.apache.lucene.analysis.StopFilter;
 import org.apache.lucene.analysis.StopwordAnalyzerBase;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.WordlistLoader;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
@@ -79,5 +80,10 @@ public final class StopAnalyzer extends StopwordAnalyzerBase {
     final Tokenizer source = new LowerCaseTokenizer();
     return new TokenStreamComponents(source, new StopFilter(source, stopwords));
   }
+
+  @Override
+  protected TokenStream normalize(String fieldName, TokenStream in) {
+    return new LowerCaseFilter(in);
+  }
 }
 

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/custom/CustomAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/custom/CustomAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/custom/CustomAnalyzer.java
index f2ed01f..b2de5e8 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/custom/CustomAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/custom/CustomAnalyzer.java
@@ -37,6 +37,7 @@ import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
 import org.apache.lucene.analysis.util.CharFilterFactory;
 import org.apache.lucene.analysis.util.ClasspathResourceLoader;
 import org.apache.lucene.analysis.util.FilesystemResourceLoader;
+import org.apache.lucene.analysis.util.MultiTermAwareComponent;
 import org.apache.lucene.analysis.util.ResourceLoader;
 import org.apache.lucene.analysis.util.ResourceLoaderAware;
 import org.apache.lucene.analysis.util.TokenFilterFactory;
@@ -118,15 +119,38 @@ public final class CustomAnalyzer extends Analyzer {
   }
 
   @Override
+  protected Reader initReaderForNormalization(String fieldName, Reader reader) {
+    for (CharFilterFactory charFilter : charFilters) {
+      if (charFilter instanceof MultiTermAwareComponent) {
+        charFilter = (CharFilterFactory) ((MultiTermAwareComponent) charFilter).getMultiTermComponent();
+        reader = charFilter.create(reader);
+      }
+    }
+    return reader;
+  }
+
+  @Override
   protected TokenStreamComponents createComponents(String fieldName) {
-    final Tokenizer tk = tokenizer.create();
+    final Tokenizer tk = tokenizer.create(attributeFactory());
     TokenStream ts = tk;
     for (final TokenFilterFactory filter : tokenFilters) {
       ts = filter.create(ts);
     }
     return new TokenStreamComponents(tk, ts);
   }
-  
+
+  @Override
+  protected TokenStream normalize(String fieldName, TokenStream in) {
+    TokenStream result = in;
+    for (TokenFilterFactory filter : tokenFilters) {
+      if (filter instanceof MultiTermAwareComponent) {
+        filter = (TokenFilterFactory) ((MultiTermAwareComponent) filter).getMultiTermComponent();
+        result = filter.create(result);
+      }
+    }
+    return result;
+  }
+
   @Override
   public int getPositionIncrementGap(String fieldName) {
     // use default from Analyzer base class if null

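As a hedged illustration of the CustomAnalyzer change above: normalize() only runs token filters whose factories implement MultiTermAwareComponent (for example the lowercase factory) and skips the rest, such as stemming or stop-word filters. The builder calls are the existing CustomAnalyzer API; the printed result is an assumption about this particular chain:

import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.util.BytesRef;

public class CustomNormalizeSketch {
  public static void main(String[] args) throws Exception {
    try (CustomAnalyzer analyzer = CustomAnalyzer.builder()
        .withTokenizer("whitespace")
        .addTokenFilter("lowercase")   // MultiTermAwareComponent: applied by normalize()
        .addTokenFilter("porterstem")  // not multi-term aware: skipped by normalize()
        .build()) {
      BytesRef normalized = analyzer.normalize("field", "Running");
      // lower-cased but not stemmed
      System.out.println(normalized.utf8ToString()); // expected: "running"
    }
  }
}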
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java
index 9777179..fbb9efa 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java
@@ -125,5 +125,12 @@ public final class CzechAnalyzer extends StopwordAnalyzerBase {
     result = new CzechStemFilter(result);
     return new TokenStreamComponents(source, result);
   }
+
+  @Override
+  protected TokenStream normalize(String fieldName, TokenStream in) {
+    TokenStream result = new StandardFilter(in);
+    result = new LowerCaseFilter(result);
+    return result;
+  }
 }
 

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/da/DanishAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/da/DanishAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/da/DanishAnalyzer.java
index f9c316d..ccbd9d1 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/da/DanishAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/da/DanishAnalyzer.java
@@ -124,4 +124,11 @@ public final class DanishAnalyzer extends StopwordAnalyzerBase {
     result = new SnowballFilter(result, new DanishStemmer());
     return new TokenStreamComponents(source, result);
   }
+
+  @Override
+  protected TokenStream normalize(String fieldName, TokenStream in) {
+    TokenStream result = new StandardFilter(in);
+    result = new LowerCaseFilter(result);
+    return result;
+  }
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
index 790fc48..8a39945 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
@@ -139,4 +139,12 @@ public final class GermanAnalyzer extends StopwordAnalyzerBase {
     result = new GermanLightStemFilter(result);
     return new TokenStreamComponents(source, result);
   }
+
+  @Override
+  protected TokenStream normalize(String fieldName, TokenStream in) {
+    TokenStream result = new StandardFilter(in);
+    result = new LowerCaseFilter(result);
+    result = new GermanNormalizationFilter(result);
+    return result;
+  }
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java
index c85b6ec..bd09d25 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java
@@ -104,4 +104,11 @@ public final class GreekAnalyzer extends StopwordAnalyzerBase {
     result = new GreekStemFilter(result);
     return new TokenStreamComponents(source, result);
   }
+
+  @Override
+  protected TokenStream normalize(String fieldName, TokenStream in) {
+    TokenStream result = new StandardFilter(in);
+    result = new GreekLowerCaseFilter(result);
+    return result;
+  }
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/EnglishAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/EnglishAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/EnglishAnalyzer.java
index 16dc0c5..94ba43a 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/EnglishAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/en/EnglishAnalyzer.java
@@ -107,4 +107,11 @@ public final class EnglishAnalyzer extends StopwordAnalyzerBase {
     result = new PorterStemFilter(result);
     return new TokenStreamComponents(source, result);
   }
+
+  @Override
+  protected TokenStream normalize(String fieldName, TokenStream in) {
+    TokenStream result = new StandardFilter(in);
+    result = new LowerCaseFilter(result);
+    return result;
+  }
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishAnalyzer.java
index ab5b6c3..3b21cdd 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/es/SpanishAnalyzer.java
@@ -123,4 +123,11 @@ public final class SpanishAnalyzer extends StopwordAnalyzerBase {
     result = new SpanishLightStemFilter(result);
     return new TokenStreamComponents(source, result);
   }
+
+  @Override
+  protected TokenStream normalize(String fieldName, TokenStream in) {
+    TokenStream result = new StandardFilter(in);
+    result = new LowerCaseFilter(result);
+    return result;
+  }
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/eu/BasqueAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/eu/BasqueAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/eu/BasqueAnalyzer.java
index cff2da0..4bc1ba7 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/eu/BasqueAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/eu/BasqueAnalyzer.java
@@ -121,4 +121,11 @@ public final class BasqueAnalyzer extends StopwordAnalyzerBase {
     result = new SnowballFilter(result, new BasqueStemmer());
     return new TokenStreamComponents(source, result);
   }
+
+  @Override
+  protected TokenStream normalize(String fieldName, TokenStream in) {
+    TokenStream result = new StandardFilter(in);
+    result = new LowerCaseFilter(result);
+    return result;
+  }
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java
index 2515d1e..0d6b80c 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java
@@ -29,6 +29,7 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
 import org.apache.lucene.analysis.core.DecimalDigitFilter;
+import org.apache.lucene.analysis.standard.StandardFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.util.Version;
 
@@ -128,7 +129,18 @@ public final class PersianAnalyzer extends StopwordAnalyzerBase {
      */
     return new TokenStreamComponents(source, new StopFilter(result, stopwords));
   }
-  
+
+  @Override
+  protected TokenStream normalize(String fieldName, TokenStream in) {
+    TokenStream result = new StandardFilter(in);
+    result = new LowerCaseFilter(result);
+    result = new DecimalDigitFilter(result);
+    result = new ArabicNormalizationFilter(result);
+    /* additional persian-specific normalization */
+    result = new PersianNormalizationFilter(result);
+    return result;
+  }
+
   /** 
    * Wraps the Reader with {@link PersianCharFilter}
    */

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishAnalyzer.java
index 6b00101..69cc537 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishAnalyzer.java
@@ -124,4 +124,11 @@ public final class FinnishAnalyzer extends StopwordAnalyzerBase {
     result = new SnowballFilter(result, new FinnishStemmer());
     return new TokenStreamComponents(source, result);
   }
+
+  @Override
+  protected TokenStream normalize(String fieldName, TokenStream in) {
+    TokenStream result = new StandardFilter(in);
+    result = new LowerCaseFilter(result);
+    return result;
+  }
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java
index 5f90246..2e072be 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java
@@ -144,5 +144,13 @@ public final class FrenchAnalyzer extends StopwordAnalyzerBase {
     result = new FrenchLightStemFilter(result);
     return new TokenStreamComponents(source, result);
   }
+
+  @Override
+  protected TokenStream normalize(String fieldName, TokenStream in) {
+    TokenStream result = new StandardFilter(in);
+    result = new ElisionFilter(result, DEFAULT_ARTICLES);
+    result = new LowerCaseFilter(result);
+    return result;
+  }
 }
 

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/ga/IrishAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ga/IrishAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ga/IrishAnalyzer.java
index 1ca3455..3ae366d 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ga/IrishAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ga/IrishAnalyzer.java
@@ -141,4 +141,12 @@ public final class IrishAnalyzer extends StopwordAnalyzerBase {
     result = new SnowballFilter(result, new IrishStemmer());
     return new TokenStreamComponents(source, result);
   }
+
+  @Override
+  protected TokenStream normalize(String fieldName, TokenStream in) {
+    TokenStream result = new StandardFilter(in);
+    result = new ElisionFilter(result, DEFAULT_ARTICLES);
+    result = new IrishLowerCaseFilter(result);
+    return result;
+  }
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianAnalyzer.java
index 372a6ec..4f70596 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/gl/GalicianAnalyzer.java
@@ -122,4 +122,11 @@ public final class GalicianAnalyzer extends StopwordAnalyzerBase {
     result = new GalicianStemFilter(result);
     return new TokenStreamComponents(source, result);
   }
+
+  @Override
+  protected TokenStream normalize(String fieldName, TokenStream in) {
+    TokenStream result = new StandardFilter(in);
+    result = new LowerCaseFilter(result);
+    return result;
+  }
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiAnalyzer.java
index 1b57129..84bfd7a 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hi/HindiAnalyzer.java
@@ -29,6 +29,7 @@ import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.DecimalDigitFilter;
 import org.apache.lucene.analysis.in.IndicNormalizationFilter;
 import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+import org.apache.lucene.analysis.standard.StandardFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.util.Version;
 
@@ -128,4 +129,14 @@ public final class HindiAnalyzer extends StopwordAnalyzerBase {
     result = new HindiStemFilter(result);
     return new TokenStreamComponents(source, result);
   }
+
+  @Override
+  protected TokenStream normalize(String fieldName, TokenStream in) {
+    TokenStream result = new StandardFilter(in);
+    result = new LowerCaseFilter(result);
+    result = new DecimalDigitFilter(result);
+    result = new IndicNormalizationFilter(result);
+    result = new HindiNormalizationFilter(result);
+    return result;
+  }
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianAnalyzer.java
index 0615bdc..e980f5a 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianAnalyzer.java
@@ -124,4 +124,11 @@ public final class HungarianAnalyzer extends StopwordAnalyzerBase {
     result = new SnowballFilter(result, new HungarianStemmer());
     return new TokenStreamComponents(source, result);
   }
+
+  @Override
+  protected TokenStream normalize(String fieldName, TokenStream in) {
+    TokenStream result = new StandardFilter(in);
+    result = new LowerCaseFilter(result);
+    return result;
+  }
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/hy/ArmenianAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hy/ArmenianAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hy/ArmenianAnalyzer.java
index 8c04639..95506e1 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hy/ArmenianAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hy/ArmenianAnalyzer.java
@@ -121,4 +121,11 @@ public final class ArmenianAnalyzer extends StopwordAnalyzerBase {
     result = new SnowballFilter(result, new ArmenianStemmer());
     return new TokenStreamComponents(source, result);
   }
+
+  @Override
+  protected TokenStream normalize(String fieldName, TokenStream in) {
+    TokenStream result = new StandardFilter(in);
+    result = new LowerCaseFilter(result);
+    return result;
+  }
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianAnalyzer.java
index fc9b4d2..9804bea 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/id/IndonesianAnalyzer.java
@@ -119,4 +119,11 @@ public final class IndonesianAnalyzer extends StopwordAnalyzerBase {
     }
     return new TokenStreamComponents(source, new IndonesianStemFilter(result));
   }
+
+  @Override
+  protected TokenStream normalize(String fieldName, TokenStream in) {
+    TokenStream result = new StandardFilter(in);
+    result = new LowerCaseFilter(result);
+    return result;
+  }
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java
index a18aa5d..32f4e30 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/it/ItalianAnalyzer.java
@@ -133,4 +133,12 @@ public final class ItalianAnalyzer extends StopwordAnalyzerBase {
     result = new ItalianLightStemFilter(result);
     return new TokenStreamComponents(source, result);
   }
+
+  @Override
+  protected TokenStream normalize(String fieldName, TokenStream in) {
+    TokenStream result = new StandardFilter(in);
+    result = new ElisionFilter(result, DEFAULT_ARTICLES);
+    result = new LowerCaseFilter(result);
+    return result;
+  }
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/lt/LithuanianAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/lt/LithuanianAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/lt/LithuanianAnalyzer.java
index 5e24cf9..4eccc51 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/lt/LithuanianAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/lt/LithuanianAnalyzer.java
@@ -121,4 +121,11 @@ public final class LithuanianAnalyzer extends StopwordAnalyzerBase {
     result = new SnowballFilter(result, new LithuanianStemmer());
     return new TokenStreamComponents(source, result);
   }
+
+  @Override
+  protected TokenStream normalize(String fieldName, TokenStream in) {
+    TokenStream result = new StandardFilter(in);
+    result = new LowerCaseFilter(result);
+    return result;
+  }
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianAnalyzer.java
index 0a016af..1b08b3b 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/lv/LatvianAnalyzer.java
@@ -122,4 +122,11 @@ public final class LatvianAnalyzer extends StopwordAnalyzerBase {
     result = new LatvianStemFilter(result);
     return new TokenStreamComponents(source, result);
   }
+
+  @Override
+  protected TokenStream normalize(String fieldName, TokenStream in) {
+    TokenStream result = new StandardFilter(in);
+    result = new LowerCaseFilter(result);
+    return result;
+  }
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java
index 0391425..900d9c6 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java
@@ -159,4 +159,11 @@ public final class DutchAnalyzer extends Analyzer {
     result = new SnowballFilter(result, new org.tartarus.snowball.ext.DutchStemmer());
     return new TokenStreamComponents(source, result);
   }
+
+  @Override
+  protected TokenStream normalize(String fieldName, TokenStream in) {
+    TokenStream result = new StandardFilter(in);
+    result = new LowerCaseFilter(result);
+    return result;
+  }
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianAnalyzer.java
index c413793..3570ad4 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/no/NorwegianAnalyzer.java
@@ -124,5 +124,12 @@ public final class NorwegianAnalyzer extends StopwordAnalyzerBase {
     result = new SnowballFilter(result, new NorwegianStemmer());
     return new TokenStreamComponents(source, result);
   }
+
+  @Override
+  protected TokenStream normalize(String fieldName, TokenStream in) {
+    TokenStream result = new StandardFilter(in);
+    result = new LowerCaseFilter(result);
+    return result;
+  }
 }
 

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseAnalyzer.java
index 769e142..8f54803 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseAnalyzer.java
@@ -123,4 +123,11 @@ public final class PortugueseAnalyzer extends StopwordAnalyzerBase {
     result = new PortugueseLightStemFilter(result);
     return new TokenStreamComponents(source, result);
   }
+
+  @Override
+  protected TokenStream normalize(String fieldName, TokenStream in) {
+    TokenStream result = new StandardFilter(in);
+    result = new LowerCaseFilter(result);
+    return result;
+  }
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/ro/RomanianAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ro/RomanianAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ro/RomanianAnalyzer.java
index 06ff999..1b74184 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ro/RomanianAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ro/RomanianAnalyzer.java
@@ -126,4 +126,11 @@ public final class RomanianAnalyzer extends StopwordAnalyzerBase {
     result = new SnowballFilter(result, new RomanianStemmer());
     return new TokenStreamComponents(source, result);
   }
+
+  @Override
+  protected TokenStream normalize(String fieldName, TokenStream in) {
+    TokenStream result = new StandardFilter(in);
+    result = new LowerCaseFilter(result);
+    return result;
+  }
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java
index dfe8ef3..76bf495 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java
@@ -121,4 +121,11 @@ public final class RussianAnalyzer extends StopwordAnalyzerBase {
       result = new SnowballFilter(result, new org.tartarus.snowball.ext.RussianStemmer());
       return new TokenStreamComponents(source, result);
     }
+
+    @Override
+    protected TokenStream normalize(String fieldName, TokenStream in) {
+      TokenStream result = new StandardFilter(in);
+      result = new LowerCaseFilter(result);
+      return result;
+    }
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicAnalyzer.java
index dc6c118..ef2ef7e 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicAnalyzer.java
@@ -100,4 +100,9 @@ public final class ClassicAnalyzer extends StopwordAnalyzerBase {
       }
     };
   }
+
+  @Override
+  protected TokenStream normalize(String fieldName, TokenStream in) {
+    return new LowerCaseFilter(in);
+  }
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailAnalyzer.java
index 9994884..fe71b7e 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailAnalyzer.java
@@ -97,4 +97,9 @@ public final class UAX29URLEmailAnalyzer extends StopwordAnalyzerBase {
       }
     };
   }
+
+  @Override
+  protected TokenStream normalize(String fieldName, TokenStream in) {
+    return new LowerCaseFilter(in);
+  }
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishAnalyzer.java
index fd2aa2e..3896d3e 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishAnalyzer.java
@@ -124,4 +124,11 @@ public final class SwedishAnalyzer extends StopwordAnalyzerBase {
     result = new SnowballFilter(result, new SwedishStemmer());
     return new TokenStreamComponents(source, result);
   }
+
+  @Override
+  protected TokenStream normalize(String fieldName, TokenStream in) {
+    TokenStream result = new StandardFilter(in);
+    result = new LowerCaseFilter(result);
+    return result;
+  }
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java
index 9543c5c..6ab7ba1 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiAnalyzer.java
@@ -104,4 +104,11 @@ public final class ThaiAnalyzer extends StopwordAnalyzerBase {
     result = new StopFilter(result, stopwords);
     return new TokenStreamComponents(source, result);
   }
+
+  @Override
+  protected TokenStream normalize(String fieldName, TokenStream in) {
+    TokenStream result = new LowerCaseFilter(in);
+    result = new DecimalDigitFilter(result);
+    return result;
+  }
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/analysis/tr/TurkishAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/tr/TurkishAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/tr/TurkishAnalyzer.java
index a21495f..719e434 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/tr/TurkishAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/tr/TurkishAnalyzer.java
@@ -127,4 +127,11 @@ public final class TurkishAnalyzer extends StopwordAnalyzerBase {
     result = new SnowballFilter(result, new TurkishStemmer());
     return new TokenStreamComponents(source, result);
   }
+
+  @Override
+  protected TokenStream normalize(String fieldName, TokenStream in) {
+    TokenStream result = new StandardFilter(in);
+    result = new TurkishLowerCaseFilter(result);
+    return result;
+  }
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/java/org/apache/lucene/collation/CollationKeyAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/collation/CollationKeyAnalyzer.java b/lucene/analysis/common/src/java/org/apache/lucene/collation/CollationKeyAnalyzer.java
index f7b15f6..ea98731 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/collation/CollationKeyAnalyzer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/collation/CollationKeyAnalyzer.java
@@ -20,6 +20,8 @@ package org.apache.lucene.collation;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.util.AttributeFactory;
+
 import java.text.Collator;
 
 /**
@@ -83,6 +85,11 @@ public final class CollationKeyAnalyzer extends Analyzer {
   }
 
   @Override
+  protected AttributeFactory attributeFactory() {
+    return factory;
+  }
+
+  @Override
   protected TokenStreamComponents createComponents(String fieldName) {
     KeywordTokenizer tokenizer = new KeywordTokenizer(factory, KeywordTokenizer.DEFAULT_BUFFER_SIZE);
     return new TokenStreamComponents(tokenizer, tokenizer);

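CollationKeyAnalyzer overrides the new attributeFactory() hook so that normalize() goes through the same collation-aware term attribute that indexing uses. A hedged sketch of what that should look like for a caller (the field name and locale are arbitrary examples, not from this commit):

    import java.text.Collator;
    import java.util.Locale;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.collation.CollationKeyAnalyzer;
    import org.apache.lucene.util.BytesRef;

    public class CollationNormalizeDemo {
      public static void main(String[] args) {
        Analyzer analyzer = new CollationKeyAnalyzer(Collator.getInstance(Locale.FRENCH));
        // because attributeFactory() returns the collation-aware factory, the
        // returned bytes should be a collation key rather than UTF-8 text
        BytesRef key = analyzer.normalize("field", "déjà");
        System.out.println(key);   // opaque collation key bytes
        analyzer.close();
      }
    }
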
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAllAnalyzersHaveFactories.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAllAnalyzersHaveFactories.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAllAnalyzersHaveFactories.java
index d826a60..7099566 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAllAnalyzersHaveFactories.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAllAnalyzersHaveFactories.java
@@ -35,6 +35,7 @@ import org.apache.lucene.analysis.MockCharFilter;
 import org.apache.lucene.analysis.MockFixedLengthPayloadFilter;
 import org.apache.lucene.analysis.MockGraphTokenFilter;
 import org.apache.lucene.analysis.MockHoleInjectingTokenFilter;
+import org.apache.lucene.analysis.MockLowerCaseFilter;
 import org.apache.lucene.analysis.MockRandomLookaheadTokenFilter;
 import org.apache.lucene.analysis.MockSynonymFilter;
 import org.apache.lucene.analysis.MockTokenFilter;
@@ -75,6 +76,7 @@ public class TestAllAnalyzersHaveFactories extends LuceneTestCase {
       MockFixedLengthPayloadFilter.class,
       MockGraphTokenFilter.class,
       MockHoleInjectingTokenFilter.class,
+      MockLowerCaseFilter.class,
       MockRandomLookaheadTokenFilter.class,
       MockSynonymFilter.class,
       MockTokenFilter.class,

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAnalyzers.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAnalyzers.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAnalyzers.java
index 8f7f2cd..6d514d1 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAnalyzers.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAnalyzers.java
@@ -52,6 +52,7 @@ public class TestAnalyzers extends BaseTokenStreamTestCase {
                      new String[] { "b" });
     assertAnalyzesTo(a, "\"QUOTED\" word", 
                      new String[] { "quoted", "word" });
+    assertEquals(new BytesRef("\"\\à3[]()! cz@"), a.normalize("dummy", "\"\\À3[]()! Cz@"));
     a.close();
   }
 
@@ -73,6 +74,7 @@ public class TestAnalyzers extends BaseTokenStreamTestCase {
                      new String[] { "2B" });
     assertAnalyzesTo(a, "\"QUOTED\" word", 
                      new String[] { "\"QUOTED\"", "word" });
+    assertEquals(new BytesRef("\"\\À3[]()! Cz@"), a.normalize("dummy", "\"\\À3[]()! Cz@"));
     a.close();
   }
 
@@ -82,6 +84,8 @@ public class TestAnalyzers extends BaseTokenStreamTestCase {
                      new String[] { "foo", "bar", "foo", "bar" });
     assertAnalyzesTo(a, "foo a bar such FOO THESE BAR", 
                      new String[] { "foo", "bar", "foo", "bar" });
+    assertEquals(new BytesRef("\"\\à3[]()! cz@"), a.normalize("dummy", "\"\\À3[]()! Cz@"));
+    assertEquals(new BytesRef("the"), a.normalize("dummy", "the"));
     a.close();
   }
 

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
index 4effc79..25ca7a3 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
@@ -928,6 +928,7 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
           System.out.println("Creating random analyzer:" + a);
         }
         try {
+          checkNormalize(a);
           checkRandomData(random, a, 500*RANDOM_MULTIPLIER, 20, false,
               false /* We already validate our own offsets... */);
         } catch (Throwable e) {
@@ -937,7 +938,14 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
       }
     }
   }
-  
+
+  public void checkNormalize(Analyzer a) {
+    // normalization should not modify characters that may be used for wildcards
+    // or regular expressions
+    String s = "([0-9]+)?*";
+    assertEquals(s, a.normalize("dummy", s).utf8ToString());
+  }
+
   // we might regret this decision...
   public void testRandomChainsWithLargeStrings() throws Throwable {
     int numIterations = TEST_NIGHTLY ? atLeast(20) : 3;

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/common/src/test/org/apache/lucene/analysis/custom/TestCustomAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/custom/TestCustomAnalyzer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/custom/TestCustomAnalyzer.java
index af11927..60633e4 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/custom/TestCustomAnalyzer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/custom/TestCustomAnalyzer.java
@@ -17,6 +17,8 @@
 package org.apache.lucene.analysis.custom;
 
 
+import java.io.IOException;
+import java.io.Reader;
 import java.nio.file.Paths;
 import java.util.Collections;
 import java.util.HashMap;
@@ -24,16 +26,25 @@ import java.util.List;
 import java.util.Map;
 
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.charfilter.HTMLStripCharFilterFactory;
+import org.apache.lucene.analysis.core.KeywordTokenizerFactory;
 import org.apache.lucene.analysis.core.LowerCaseFilterFactory;
+import org.apache.lucene.analysis.core.LowerCaseTokenizer;
 import org.apache.lucene.analysis.core.StopFilterFactory;
 import org.apache.lucene.analysis.core.WhitespaceTokenizerFactory;
 import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilterFactory;
 import org.apache.lucene.analysis.standard.ClassicTokenizerFactory;
 import org.apache.lucene.analysis.standard.StandardTokenizerFactory;
+import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
 import org.apache.lucene.analysis.util.CharFilterFactory;
+import org.apache.lucene.analysis.util.MultiTermAwareComponent;
 import org.apache.lucene.analysis.util.TokenFilterFactory;
 import org.apache.lucene.analysis.util.TokenizerFactory;
+import org.apache.lucene.util.AttributeFactory;
+import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.SetOnce.AlreadySetException;
 import org.apache.lucene.util.Version;
 
@@ -336,4 +347,136 @@ public class TestCustomAnalyzer extends BaseTokenStreamTestCase {
     });
   }
 
+  private static class DummyCharFilter extends CharFilter {
+
+    private final char match, repl;
+
+    public DummyCharFilter(Reader input, char match, char repl) {
+      super(input);
+      this.match = match;
+      this.repl = repl;
+    }
+
+    @Override
+    protected int correct(int currentOff) {
+      return currentOff;
+    }
+
+    @Override
+    public int read(char[] cbuf, int off, int len) throws IOException {
+      final int read = input.read(cbuf, off, len);
+      for (int i = 0; i < read; ++i) {
+        if (cbuf[off+i] == match) {
+          cbuf[off+i] = repl;
+        }
+      }
+      return read;
+    }
+    
+  }
+
+  public static class DummyCharFilterFactory extends CharFilterFactory {
+
+    private final char match, repl;
+
+    public DummyCharFilterFactory(Map<String,String> args) {
+      this(args, '0', '1');
+    }
+
+    DummyCharFilterFactory(Map<String,String> args, char match, char repl) {
+      super(args);
+      this.match = match;
+      this.repl = repl;
+    }
+
+    @Override
+    public Reader create(Reader input) {
+      return new DummyCharFilter(input, match, repl);
+    }
+    
+  }
+
+  public static class DummyMultiTermAwareCharFilterFactory extends DummyCharFilterFactory implements MultiTermAwareComponent {
+
+    public DummyMultiTermAwareCharFilterFactory(Map<String,String> args) {
+      super(args);
+    }
+
+    @Override
+    public AbstractAnalysisFactory getMultiTermComponent() {
+      return new DummyCharFilterFactory(Collections.emptyMap(), '0', '2');
+    }
+
+  }
+
+  public static class DummyTokenizerFactory extends TokenizerFactory {
+
+    public DummyTokenizerFactory(Map<String,String> args) {
+      super(args);
+    }
+
+    @Override
+    public Tokenizer create(AttributeFactory factory) {
+      return new LowerCaseTokenizer(factory);
+    }
+
+  }
+
+  public static class DummyMultiTermAwareTokenizerFactory extends DummyTokenizerFactory implements MultiTermAwareComponent {
+
+    public DummyMultiTermAwareTokenizerFactory(Map<String,String> args) {
+      super(args);
+    }
+
+    @Override
+    public AbstractAnalysisFactory getMultiTermComponent() {
+      return new KeywordTokenizerFactory(getOriginalArgs());
+    }
+    
+  }
+
+  public static class DummyTokenFilterFactory extends TokenFilterFactory {
+
+    public DummyTokenFilterFactory(Map<String,String> args) {
+      super(args);
+    }
+
+    @Override
+    public TokenStream create(TokenStream input) {
+      return input;
+    }
+    
+  }
+
+  public static class DummyMultiTermAwareTokenFilterFactory extends DummyTokenFilterFactory implements MultiTermAwareComponent {
+
+    public DummyMultiTermAwareTokenFilterFactory(Map<String,String> args) {
+      super(args);
+    }
+
+    @Override
+    public AbstractAnalysisFactory getMultiTermComponent() {
+      return new ASCIIFoldingFilterFactory(Collections.emptyMap());
+    }
+    
+  }
+
+  public void testNormalization() throws IOException {
+    CustomAnalyzer analyzer1 = CustomAnalyzer.builder()
+        // none of these components are multi-term aware so they should not be applied
+        .withTokenizer(DummyTokenizerFactory.class, Collections.emptyMap())
+        .addCharFilter(DummyCharFilterFactory.class, Collections.emptyMap())
+        .addTokenFilter(DummyTokenFilterFactory.class, Collections.emptyMap())
+        .build();
+    assertEquals(new BytesRef("0�"), analyzer1.normalize("dummy", "0�"));
+
+    CustomAnalyzer analyzer2 = CustomAnalyzer.builder()
+        // these components are multi-term aware so they should be applied
+        .withTokenizer(DummyMultiTermAwareTokenizerFactory.class, Collections.emptyMap())
+        .addCharFilter(DummyMultiTermAwareCharFilterFactory.class, Collections.emptyMap())
+        .addTokenFilter(DummyMultiTermAwareTokenFilterFactory.class, Collections.emptyMap())
+        .build();
+    assertEquals(new BytesRef("2A"), analyzer2.normalize("dummy", "0�"));
+  }
+
 }

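For CustomAnalyzer, normalize() applies only those char filters, tokenizers and token filters whose factories implement MultiTermAwareComponent, which is exactly what the dummy factories in the test above exercise. A sketch with stock factories, assuming the usual SPI names ("whitespace", "lowercase", "porterstem") resolve on the classpath:

    import java.io.IOException;

    import org.apache.lucene.analysis.custom.CustomAnalyzer;
    import org.apache.lucene.util.BytesRef;

    public class CustomNormalizeDemo {
      public static void main(String[] args) throws IOException {
        CustomAnalyzer analyzer = CustomAnalyzer.builder()
            .withTokenizer("whitespace")
            .addTokenFilter("lowercase")    // factory is multi-term aware: applied by normalize()
            .addTokenFilter("porterstem")   // not multi-term aware: skipped by normalize()
            .build();
        BytesRef term = analyzer.normalize("field", "Runner");
        System.out.println(term.utf8ToString());   // "runner": lowercased but not stemmed
        analyzer.close();
      }
    }
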
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseAnalyzer.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseAnalyzer.java
index 46d40b1..06e119e 100644
--- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseAnalyzer.java
+++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseAnalyzer.java
@@ -94,4 +94,11 @@ public class JapaneseAnalyzer extends StopwordAnalyzerBase {
     stream = new LowerCaseFilter(stream);
     return new TokenStreamComponents(tokenizer, stream);
   }
+
+  @Override
+  protected TokenStream normalize(String fieldName, TokenStream in) {
+    TokenStream result = new CJKWidthFilter(in);
+    result = new LowerCaseFilter(result);
+    return result;
+  }
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikAnalyzer.java b/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikAnalyzer.java
index 091acfd..0caca35 100644
--- a/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikAnalyzer.java
+++ b/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikAnalyzer.java
@@ -23,6 +23,7 @@ import morfologik.stemming.Dictionary;
 import morfologik.stemming.polish.PolishStemmer;
 
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.standard.StandardFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
@@ -69,4 +70,9 @@ public class MorfologikAnalyzer extends Analyzer {
         src, 
         new MorfologikFilter(new StandardFilter(src), dictionary));
   }
+
+  @Override
+  protected TokenStream normalize(String fieldName, TokenStream in) {
+    return new StandardFilter(in);
+  }
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.java b/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.java
index 5f0347b..f604d4b 100644
--- a/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.java
+++ b/lucene/analysis/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.java
@@ -22,6 +22,7 @@ import java.util.Set;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.LowerCaseFilter;
 import org.apache.lucene.analysis.StopFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
@@ -139,4 +140,9 @@ public final class SmartChineseAnalyzer extends Analyzer {
     }
     return new TokenStreamComponents(tokenizer, result);
   }
+
+  @Override
+  protected TokenStream normalize(String fieldName, TokenStream in) {
+    return new LowerCaseFilter(in);
+  }
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/analysis/stempel/src/java/org/apache/lucene/analysis/pl/PolishAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/analysis/stempel/src/java/org/apache/lucene/analysis/pl/PolishAnalyzer.java b/lucene/analysis/stempel/src/java/org/apache/lucene/analysis/pl/PolishAnalyzer.java
index 6ed4fda..2d3ef4c 100644
--- a/lucene/analysis/stempel/src/java/org/apache/lucene/analysis/pl/PolishAnalyzer.java
+++ b/lucene/analysis/stempel/src/java/org/apache/lucene/analysis/pl/PolishAnalyzer.java
@@ -146,4 +146,11 @@ public final class PolishAnalyzer extends StopwordAnalyzerBase {
     result = new StempelFilter(result, new StempelStemmer(stemTable));
     return new TokenStreamComponents(source, result);
   }
+
+  @Override
+  protected TokenStream normalize(String fieldName, TokenStream in) {
+    TokenStream result = new StandardFilter(in);
+    result = new LowerCaseFilter(result);
+    return result;
+  }
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/core/src/java/org/apache/lucene/analysis/Analyzer.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/Analyzer.java b/lucene/core/src/java/org/apache/lucene/analysis/Analyzer.java
index cce740d..0d60d24 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/Analyzer.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/Analyzer.java
@@ -18,11 +18,18 @@ package org.apache.lucene.analysis;
 
 
 import java.io.Closeable;
+import java.io.IOException;
 import java.io.Reader;
+import java.io.StringReader;
 import java.util.HashMap;
 import java.util.Map;
 
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
 import org.apache.lucene.store.AlreadyClosedException;
+import org.apache.lucene.util.AttributeFactory;
+import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CloseableThreadLocal;
 import org.apache.lucene.util.Version;
 
@@ -44,6 +51,12 @@ import org.apache.lucene.util.Version;
  *     filter = new BarFilter(filter);
  *     return new TokenStreamComponents(source, filter);
  *   }
+ *   {@literal @Override}
+ *   protected TokenStream normalize(String fieldName, TokenStream in) {
+ *     // Assuming FooFilter is about normalization and BarFilter is about
+ *     // stemming, only FooFilter should be applied
+ *     return new FooFilter(in);
+ *   }
  * };
  * </pre>
  * For more examples, see the {@link org.apache.lucene.analysis Analysis package documentation}.
@@ -108,6 +121,15 @@ public abstract class Analyzer implements Closeable {
   protected abstract TokenStreamComponents createComponents(String fieldName);
 
   /**
+   * Wrap the given {@link TokenStream} in order to apply normalization filters.
+   * The default implementation returns the {@link TokenStream} as-is. This is
+   * used by {@link #normalize(String, String)}.
+   */
+  protected TokenStream normalize(String fieldName, TokenStream in) {
+    return in;
+  }
+
+  /**
    * Returns a TokenStream suitable for <code>fieldName</code>, tokenizing
    * the contents of <code>reader</code>.
    * <p>
@@ -181,7 +203,65 @@ public abstract class Analyzer implements Closeable {
     components.reusableStringReader = strReader;
     return components.getTokenStream();
   }
-    
+
+  /**
+   * Normalize a string down to the representation that it would have in the
+   * index.
+   * <p>
+   * This is typically used by query parsers in order to generate a query on
+   * a given term, without tokenizing or stemming, which are undesirable if
+   * the string to analyze is a partial word (e.g. in case of a wildcard or
+   * fuzzy query).
+   * <p>
+   * This method uses {@link #initReaderForNormalization(String, Reader)} in
+   * order to apply necessary character-level normalization and then
+   * {@link #normalize(String, TokenStream)} in order to apply the normalizing
+   * token filters.
+   */
+  public final BytesRef normalize(final String fieldName, final String text) {
+    try {
+      // apply char filters
+      final String filteredText;
+      try (Reader reader = new StringReader(text)) {
+        Reader filterReader = initReaderForNormalization(fieldName, reader);
+        char[] buffer = new char[64];
+        StringBuilder builder = new StringBuilder();
+        for (;;) {
+          final int read = filterReader.read(buffer, 0, buffer.length);
+          if (read == -1) {
+            break;
+          }
+          builder.append(buffer, 0, read);
+        }
+        filteredText = builder.toString();
+      } catch (IOException e) {
+        throw new IllegalStateException("Normalization threw an unexpected exception", e);
+      }
+
+      final AttributeFactory attributeFactory = attributeFactory();
+      try (TokenStream ts = normalize(fieldName,
+          new StringTokenStream(attributeFactory, filteredText, text.length()))) {
+        final TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class);
+        ts.reset();
+        if (ts.incrementToken() == false) {
+          throw new IllegalStateException("The normalization token stream is "
+              + "expected to produce exactly 1 token, but got 0 for analyzer "
+              + this + " and input \"" + text + "\"");
+        }
+        final BytesRef term = BytesRef.deepCopyOf(termAtt.getBytesRef());
+        if (ts.incrementToken()) {
+          throw new IllegalStateException("The normalization token stream is "
+              + "expected to produce exactly 1 token, but got 2+ for analyzer "
+              + this + " and input \"" + text + "\"");
+        }
+        ts.end();
+        return term;
+      }
+    } catch (IOException e) {
+      throw new IllegalStateException("Normalization threw an unexpected exception", e);
+    }
+  }
+
   /**
    * Override this if you want to add a CharFilter chain.
    * <p>
@@ -196,6 +276,22 @@ public abstract class Analyzer implements Closeable {
     return reader;
   }
 
+  /** Wrap the given {@link Reader} with {@link CharFilter}s that make sense
+   *  for normalization. This is typically a subset of the {@link CharFilter}s
+   *  that are applied in {@link #initReader(String, Reader)}. This is used by
+   *  {@link #normalize(String, String)}. */
+  protected Reader initReaderForNormalization(String fieldName, Reader reader) {
+    return reader;
+  }
+
+  /** Return the {@link AttributeFactory} to be used for
+   *  {@link #tokenStream analysis} and
+   *  {@link #normalize(String, String) normalization}. The default
+   *  implementation returns {@link AttributeFactory#DEFAULT_ATTRIBUTE_FACTORY}. */
+  protected AttributeFactory attributeFactory() {
+    return AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY;
+  }
+
   /**
    * Invoked before indexing a IndexableField instance if
    * terms have already been added to that field.  This allows custom
@@ -435,4 +531,41 @@ public abstract class Analyzer implements Closeable {
     }
   };
 
+  private static final class StringTokenStream extends TokenStream {
+
+    private final String value;
+    private final int length;
+    private boolean used = true;
+    private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
+    private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
+
+    StringTokenStream(AttributeFactory attributeFactory, String value, int length) {
+      super(attributeFactory);
+      this.value = value;
+      this.length = length;
+    }
+
+    @Override
+    public void reset() {
+      used = false;
+    }
+
+    @Override
+    public boolean incrementToken() {
+      if (used) {
+        return false;
+      }
+      clearAttributes();
+      termAttribute.append(value);
+      offsetAttribute.setOffset(0, length);
+      used = true;
+      return true;
+    }
+
+    @Override
+    public void end() throws IOException {
+      super.end();
+      offsetAttribute.setOffset(length, length);
+    }
+  }
 }

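The net effect of the new Analyzer#normalize(String, String) shown above: the input is run through initReaderForNormalization() and the normalizing filter chain as a single token, which makes it suitable for wildcard, fuzzy, prefix and range terms. A small usage sketch (field name and input are arbitrary examples):

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.util.BytesRef;

    public class NormalizeDemo {
      public static void main(String[] args) {
        try (Analyzer analyzer = new StandardAnalyzer()) {
          // the whole string is treated as a single token: lowercased,
          // but not tokenized, stemmed or stop-filtered
          BytesRef term = analyzer.normalize("body", "Déjà Vu");
          System.out.println(term.utf8ToString());   // "déjà vu"
        }
      }
    }
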
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java
index 251017d..fb57573 100644
--- a/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java
@@ -112,4 +112,11 @@ public final class StandardAnalyzer extends StopwordAnalyzerBase {
       }
     };
   }
+
+  @Override
+  protected TokenStream normalize(String fieldName, TokenStream in) {
+    TokenStream result = new StandardFilter(in);
+    result = new LowerCaseFilter(result);
+    return result;
+  }
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/core/src/test/org/apache/lucene/analysis/standard/TestStandardAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/core/src/test/org/apache/lucene/analysis/standard/TestStandardAnalyzer.java b/lucene/core/src/test/org/apache/lucene/analysis/standard/TestStandardAnalyzer.java
index 6c6ddc8..2cc9274 100644
--- a/lucene/core/src/test/org/apache/lucene/analysis/standard/TestStandardAnalyzer.java
+++ b/lucene/core/src/test/org/apache/lucene/analysis/standard/TestStandardAnalyzer.java
@@ -27,6 +27,7 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockGraphTokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.TestUtil;
 
 public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
@@ -387,4 +388,9 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
     checkRandomData(random, analyzer, 100*RANDOM_MULTIPLIER, 8192);
     analyzer.close();
   }
+
+  public void testNormalize() {
+    Analyzer a = new StandardAnalyzer();
+    assertEquals(new BytesRef("\"\\à3[]()! cz@"), a.normalize("dummy", "\"\\À3[]()! Cz@"));
+  }
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/queryparser/src/java/org/apache/lucene/queryparser/analyzing/AnalyzingQueryParser.java
----------------------------------------------------------------------
diff --git a/lucene/queryparser/src/java/org/apache/lucene/queryparser/analyzing/AnalyzingQueryParser.java b/lucene/queryparser/src/java/org/apache/lucene/queryparser/analyzing/AnalyzingQueryParser.java
index 49690fe..1fab24f 100644
--- a/lucene/queryparser/src/java/org/apache/lucene/queryparser/analyzing/AnalyzingQueryParser.java
+++ b/lucene/queryparser/src/java/org/apache/lucene/queryparser/analyzing/AnalyzingQueryParser.java
@@ -16,15 +16,15 @@
  */
 package org.apache.lucene.queryparser.analyzing;
 
-import java.io.IOException;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.index.Term;
 import org.apache.lucene.queryparser.classic.ParseException;
 import org.apache.lucene.search.Query;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefBuilder;
 
 /**
  * Overrides Lucene's default QueryParser so that Fuzzy-, Prefix-, Range-, and WildcardQuerys
@@ -39,7 +39,7 @@ import org.apache.lucene.search.Query;
  */
 public class AnalyzingQueryParser extends org.apache.lucene.queryparser.classic.QueryParser {
   // gobble escaped chars or find a wildcard character 
-  private final Pattern wildcardPattern = Pattern.compile("(\\.)|([?*]+)");
+  private static final Pattern WILDCARD_PATTERN = Pattern.compile("(\\\\.)|([?*]+)");
   public AnalyzingQueryParser(String field, Analyzer analyzer) {
     super(field, analyzer);
     setAnalyzeRangeTerms(true);
@@ -65,42 +65,41 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryparser.classic.
    */
   @Override
   protected Query getWildcardQuery(String field, String termStr) throws ParseException {
-
-    if (termStr == null){
-      //can't imagine this would ever happen
-      throw new ParseException("Passed null value as term to getWildcardQuery");
-    }
-    if ( ! getAllowLeadingWildcard() && (termStr.startsWith("*") || termStr.startsWith("?"))) {
-      throw new ParseException("'*' or '?' not allowed as first character in WildcardQuery"
-                              + " unless getAllowLeadingWildcard() returns true");
+    if ("*".equals(field)) {
+      if ("*".equals(termStr)) return newMatchAllDocsQuery();
     }
-    
-    Matcher wildcardMatcher = wildcardPattern.matcher(termStr);
-    StringBuilder sb = new StringBuilder();
+    if (getAllowLeadingWildcard() == false && (termStr.startsWith("*") || termStr.startsWith("?")))
+      throw new ParseException("'*' or '?' not allowed as first character in WildcardQuery");
+
+    Term t = new Term(field, analyzeWildcard(field, termStr));
+    return newWildcardQuery(t);
+  }
+
+  private BytesRef analyzeWildcard(String field, String termStr) {
+    // best effort to not pass the wildcard characters and escaped characters through #normalize
+    Matcher wildcardMatcher = WILDCARD_PATTERN.matcher(termStr);
+    BytesRefBuilder sb = new BytesRefBuilder();
     int last = 0;
-  
+
     while (wildcardMatcher.find()){
-      // continue if escaped char
-      if (wildcardMatcher.group(1) != null){
-        continue;
-      }
-     
-      if (wildcardMatcher.start() > 0){
+      if (wildcardMatcher.start() > 0) {
         String chunk = termStr.substring(last, wildcardMatcher.start());
-        String analyzed = analyzeSingleChunk(field, termStr, chunk);
-        sb.append(analyzed);
+        BytesRef normalized = getAnalyzer().normalize(field, chunk);
+        sb.append(normalized);
       }
-      //append the wildcard character
-      sb.append(wildcardMatcher.group(2));
-     
+      //append the matched group - without normalizing
+      sb.append(new BytesRef(wildcardMatcher.group()));
+
       last = wildcardMatcher.end();
     }
     if (last < termStr.length()){
-      sb.append(analyzeSingleChunk(field, termStr, termStr.substring(last)));
+      String chunk = termStr.substring(last);
+      BytesRef normalized = getAnalyzer().normalize(field, chunk);
+      sb.append(normalized);
     }
-    return super.getWildcardQuery(field, sb.toString());
+    return sb.toBytesRef();
   }
-  
+
   /**
    * Called when parser parses an input term
    * that uses prefix notation; that is, contains a single '*' wildcard
@@ -121,8 +120,14 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryparser.classic.
    */
   @Override
   protected Query getPrefixQuery(String field, String termStr) throws ParseException {
-    String analyzed = analyzeSingleChunk(field, termStr, termStr);
-    return super.getPrefixQuery(field, analyzed);
+    if (!getAllowLeadingWildcard() && termStr.startsWith("*"))
+      throw new ParseException("'*' not allowed as first character in PrefixQuery");
+    if (getLowercaseExpandedTerms()) {
+      termStr = termStr.toLowerCase(getLocale());
+    }
+    BytesRef term = getAnalyzer().normalize(field, termStr);
+    Term t = new Term(field, term);
+    return newPrefixQuery(t);
   }
 
   /**
@@ -142,61 +147,9 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryparser.classic.
   protected Query getFuzzyQuery(String field, String termStr, float minSimilarity)
       throws ParseException {
    
-    String analyzed = analyzeSingleChunk(field, termStr, termStr);
-    return super.getFuzzyQuery(field, analyzed, minSimilarity);
+    BytesRef term = getAnalyzer().normalize(field, termStr);
+    Term t = new Term(field, term);
+    return newFuzzyQuery(t, minSimilarity, getFuzzyPrefixLength());
   }
 
-  /**
-   * Returns the analyzed form for the given chunk
-   * 
-   * If the analyzer produces more than one output token from the given chunk,
-   * a ParseException is thrown.
-   *
-   * @param field The target field
-   * @param termStr The full term from which the given chunk is excerpted
-   * @param chunk The portion of the given termStr to be analyzed
-   * @return The result of analyzing the given chunk
-   * @throws ParseException when analysis returns other than one output token
-   */
-  protected String analyzeSingleChunk(String field, String termStr, String chunk) throws ParseException{
-    String analyzed = null;
-    try (TokenStream stream = getAnalyzer().tokenStream(field, chunk)) {
-      stream.reset();
-      CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
-      // get first and hopefully only output token
-      if (stream.incrementToken()) {
-        analyzed = termAtt.toString();
-        
-        // try to increment again, there should only be one output token
-        StringBuilder multipleOutputs = null;
-        while (stream.incrementToken()) {
-          if (null == multipleOutputs) {
-            multipleOutputs = new StringBuilder();
-            multipleOutputs.append('"');
-            multipleOutputs.append(analyzed);
-            multipleOutputs.append('"');
-          }
-          multipleOutputs.append(',');
-          multipleOutputs.append('"');
-          multipleOutputs.append(termAtt.toString());
-          multipleOutputs.append('"');
-        }
-        stream.end();
-        if (null != multipleOutputs) {
-          throw new ParseException(
-              String.format(getLocale(),
-                  "Analyzer created multiple terms for \"%s\": %s", chunk, multipleOutputs.toString()));
-        }
-      } else {
-        // nothing returned by analyzer.  Was it a stop word and the user accidentally
-        // used an analyzer with stop words?
-        stream.end();
-        throw new ParseException(String.format(getLocale(), "Analyzer returned nothing for \"%s\"", chunk));
-      }
-    } catch (IOException e){
-      throw new ParseException(
-          String.format(getLocale(), "IO error while trying to analyze single term: \"%s\"", termStr));
-    }
-    return analyzed;
-  }
 }

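With the rewrite above, AnalyzingQueryParser passes the literal chunks of a wildcard term through Analyzer#normalize while leaving the wildcard characters and escapes untouched. A hedged usage sketch (field name and query string are arbitrary examples):

    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.queryparser.analyzing.AnalyzingQueryParser;
    import org.apache.lucene.queryparser.classic.ParseException;
    import org.apache.lucene.search.Query;

    public class AnalyzingWildcardDemo {
      public static void main(String[] args) throws ParseException {
        AnalyzingQueryParser parser = new AnalyzingQueryParser("body", new StandardAnalyzer());
        // literal chunks are normalized (lowercased here), wildcards are preserved
        Query q = parser.parse("Fooba?");
        System.out.println(q);   // body:fooba?
      }
    }
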
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParserBase.java
----------------------------------------------------------------------
diff --git a/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParserBase.java b/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParserBase.java
index cdfa477..8b0866f 100644
--- a/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParserBase.java
+++ b/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParserBase.java
@@ -16,14 +16,11 @@
  */
 package org.apache.lucene.queryparser.classic;
 
-import java.io.IOException;
 import java.io.StringReader;
 import java.text.DateFormat;
 import java.util.*;
 
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
 import org.apache.lucene.document.DateTools;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.queryparser.classic.QueryParser.Operator;
@@ -41,9 +38,6 @@ import static org.apache.lucene.util.automaton.Operations.DEFAULT_MAX_DETERMINIZ
  * and acts to separate the majority of the Java code from the .jj grammar file. 
  */
 public abstract class QueryParserBase extends QueryBuilder implements CommonQueryParserConfiguration {
-  
-  /** Do not catch this exception in your code, it means you are using methods that you should no longer use. */
-  public static class MethodRemovedUseAnother extends Throwable {}
 
   static final int CONJ_NONE   = 0;
   static final int CONJ_AND    = 1;
@@ -640,31 +634,6 @@ public abstract class QueryParserBase extends QueryBuilder implements CommonQuer
     return new FuzzyQuery(term,numEdits,prefixLength);
   }
 
-  // TODO: Should this be protected instead?
-  private BytesRef analyzeMultitermTerm(String field, String part) {
-    return analyzeMultitermTerm(field, part, getAnalyzer());
-  }
-
-  protected BytesRef analyzeMultitermTerm(String field, String part, Analyzer analyzerIn) {
-    if (analyzerIn == null) analyzerIn = getAnalyzer();
-
-    try (TokenStream source = analyzerIn.tokenStream(field, part)) {
-      source.reset();
-      
-      TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
-
-      if (!source.incrementToken())
-        throw new IllegalArgumentException("analyzer returned no terms for multiTerm term: " + part);
-      BytesRef bytes = BytesRef.deepCopyOf(termAtt.getBytesRef());
-      if (source.incrementToken())
-        throw new IllegalArgumentException("analyzer returned too many terms for multiTerm term: " + part);
-      source.end();
-      return bytes;
-    } catch (IOException e) {
-      throw new RuntimeException("Error analyzing multiTerm term: " + part, e);
-    }
-  }
-
   /**
    * Builds a new {@link TermRangeQuery} instance
    * @param field Field
@@ -681,13 +650,13 @@ public abstract class QueryParserBase extends QueryBuilder implements CommonQuer
     if (part1 == null) {
       start = null;
     } else {
-      start = analyzeRangeTerms ? analyzeMultitermTerm(field, part1) : new BytesRef(part1);
+      start = analyzeRangeTerms ? getAnalyzer().normalize(field, part1) : new BytesRef(part1);
     }
      
     if (part2 == null) {
       end = null;
     } else {
-      end = analyzeRangeTerms ? analyzeMultitermTerm(field, part2) : new BytesRef(part2);
+      end = analyzeRangeTerms ? getAnalyzer().normalize(field, part2) : new BytesRef(part2);
     }
       
     final TermRangeQuery query = new TermRangeQuery(field, start, end, startInclusive, endInclusive);

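QueryParserBase now delegates range-endpoint analysis to Analyzer#normalize as well, replacing analyzeMultitermTerm. A sketch of the observable behaviour when analyzeRangeTerms is enabled (field name and endpoints are arbitrary; the printed form is approximate):

    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.queryparser.classic.ParseException;
    import org.apache.lucene.queryparser.classic.QueryParser;
    import org.apache.lucene.search.Query;

    public class RangeNormalizeDemo {
      public static void main(String[] args) throws ParseException {
        QueryParser parser = new QueryParser("body", new StandardAnalyzer());
        parser.setAnalyzeRangeTerms(true);   // endpoints go through Analyzer#normalize
        Query q = parser.parse("body:[Apple TO Orange]");
        System.out.println(q);   // roughly body:[apple TO orange]
      }
    }
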
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/queryparser/src/java/org/apache/lucene/queryparser/simple/SimpleQueryParser.java
----------------------------------------------------------------------
diff --git a/lucene/queryparser/src/java/org/apache/lucene/queryparser/simple/SimpleQueryParser.java b/lucene/queryparser/src/java/org/apache/lucene/queryparser/simple/SimpleQueryParser.java
index 45a24f7..316d97d 100644
--- a/lucene/queryparser/src/java/org/apache/lucene/queryparser/simple/SimpleQueryParser.java
+++ b/lucene/queryparser/src/java/org/apache/lucene/queryparser/simple/SimpleQueryParser.java
@@ -26,6 +26,7 @@ import org.apache.lucene.search.MatchAllDocsQuery;
 import org.apache.lucene.search.MatchNoDocsQuery;
 import org.apache.lucene.search.PrefixQuery;
 import org.apache.lucene.search.Query;
+import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.QueryBuilder;
 import org.apache.lucene.util.automaton.LevenshteinAutomata;
 
@@ -551,7 +552,9 @@ public class SimpleQueryParser extends QueryBuilder {
     BooleanQuery.Builder bq = new BooleanQuery.Builder();
     bq.setDisableCoord(true);
     for (Map.Entry<String,Float> entry : weights.entrySet()) {
-      Query q = new FuzzyQuery(new Term(entry.getKey(), text), fuzziness);
+      final String fieldName = entry.getKey();
+      final BytesRef term = getAnalyzer().normalize(fieldName, text);
+      Query q = new FuzzyQuery(new Term(fieldName, term), fuzziness);
       float boost = entry.getValue();
       if (boost != 1f) {
         q = new BoostQuery(q, boost);
@@ -587,7 +590,9 @@ public class SimpleQueryParser extends QueryBuilder {
     BooleanQuery.Builder bq = new BooleanQuery.Builder();
     bq.setDisableCoord(true);
     for (Map.Entry<String,Float> entry : weights.entrySet()) {
-      Query q = new PrefixQuery(new Term(entry.getKey(), text));
+      final String fieldName = entry.getKey();
+      final BytesRef term = getAnalyzer().normalize(fieldName, text);
+      Query q = new PrefixQuery(new Term(fieldName, term));
       float boost = entry.getValue();
       if (boost != 1f) {
         q = new BoostQuery(q, boost);


[2/3] lucene-solr:branch_6x: LUCENE-7355: Add Analyzer#normalize() and use it in query parsers.

Posted by jp...@apache.org.
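For reference, the query-parser hunks above (QueryParserBase and SimpleQueryParser) stop hand-rolling multi-term analysis and instead call the new Analyzer#normalize(String field, String text) hook introduced by this change. Below is a minimal sketch of that usage pattern, assuming Lucene 6.2+ with lucene-analyzers-common on the classpath; the analyzer choice, field name and term strings are illustrative only, not part of the commit.

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.util.BytesRef;

public class NormalizeSketch {
  public static void main(String[] args) {
    Analyzer analyzer = new StandardAnalyzer();

    // normalize() runs only the "multi-term safe" parts of the analysis chain
    // (char filters, lowercasing, folding) on the raw text, without tokenizing,
    // and returns the resulting term bytes.
    BytesRef lower = analyzer.normalize("body", "Apache");
    BytesRef upper = analyzer.normalize("body", "Lucene");

    // The normalized bytes are then used verbatim in multi-term queries,
    // mirroring the TermRangeQuery, FuzzyQuery and PrefixQuery hunks above.
    Query range = new TermRangeQuery("body", lower, upper, true, true);
    Query prefix = new PrefixQuery(new Term("body", analyzer.normalize("body", "Luc")));

    System.out.println(range + "  " + prefix);
    analyzer.close();
  }
}

The test-framework and Solr hunks that follow then teach MockAnalyzer, the test analyzers and Solr's TokenizerChain how to answer that normalize() call.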
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/queryparser/src/test/org/apache/lucene/queryparser/analyzing/TestAnalyzingQueryParser.java
----------------------------------------------------------------------
diff --git a/lucene/queryparser/src/test/org/apache/lucene/queryparser/analyzing/TestAnalyzingQueryParser.java b/lucene/queryparser/src/test/org/apache/lucene/queryparser/analyzing/TestAnalyzingQueryParser.java
index bf5f69f..c1c6375 100644
--- a/lucene/queryparser/src/test/org/apache/lucene/queryparser/analyzing/TestAnalyzingQueryParser.java
+++ b/lucene/queryparser/src/test/org/apache/lucene/queryparser/analyzing/TestAnalyzingQueryParser.java
@@ -21,9 +21,8 @@ import java.util.Map;
 import java.util.TreeMap;
 
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.analysis.CharacterUtils;
 import org.apache.lucene.analysis.MockBytesAnalyzer;
-import org.apache.lucene.analysis.MockTokenFilter;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
@@ -100,24 +99,6 @@ public class TestAnalyzingQueryParser extends LuceneTestCase {
       
     a = new ASCIIAnalyzer();
   }
-
-  public void testSingleChunkExceptions() {
-    String termStr = "the*tre";
-      
-    Analyzer stopsAnalyzer = new MockAnalyzer
-        (random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET);
-
-    ParseException expected = expectThrows(ParseException.class, () -> {
-      parseWithAnalyzingQueryParser(termStr, stopsAnalyzer, true);
-    });
-    assertTrue(expected.getMessage().contains("returned nothing"));
-     
-    AnalyzingQueryParser qp = new AnalyzingQueryParser(FIELD, a);
-    expected = expectThrows(ParseException.class, () -> {
-      qp.analyzeSingleChunk(FIELD, "", "not a single chunk");
-    });
-    assertTrue(expected.getMessage().contains("multiple terms"));
-  }
    
   public void testWildcardAlone() throws ParseException {
     //seems like crazy edge case, but can be useful in concordance 
@@ -221,12 +202,36 @@ public class TestAnalyzingQueryParser extends LuceneTestCase {
     }
   }
 
+  final static class LowercaseFilter extends TokenFilter {
+
+    final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+
+    LowercaseFilter(TokenStream input) {
+      super(input);
+    }
+
+    @Override
+    public final boolean incrementToken() throws IOException {
+      if (input.incrementToken()) {
+        CharacterUtils.toLowerCase(termAtt.buffer(), 0, termAtt.length());
+        return true;
+      } else
+        return false;
+    }
+
+  }
+
   final static class ASCIIAnalyzer extends Analyzer {
     @Override
     public TokenStreamComponents createComponents(String fieldName) {
       Tokenizer result = new MockTokenizer(MockTokenizer.WHITESPACE, true);
       return new TokenStreamComponents(result, new FoldingFilter(result));
     }
+
+    @Override
+    protected TokenStream normalize(String fieldName, TokenStream in) {
+      return new FoldingFilter(new LowercaseFilter(in));
+    }
   }
    
 

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/queryparser/src/test/org/apache/lucene/queryparser/util/QueryParserTestBase.java
----------------------------------------------------------------------
diff --git a/lucene/queryparser/src/test/org/apache/lucene/queryparser/util/QueryParserTestBase.java b/lucene/queryparser/src/test/org/apache/lucene/queryparser/util/QueryParserTestBase.java
index d58f660..71c65f6 100644
--- a/lucene/queryparser/src/test/org/apache/lucene/queryparser/util/QueryParserTestBase.java
+++ b/lucene/queryparser/src/test/org/apache/lucene/queryparser/util/QueryParserTestBase.java
@@ -1169,6 +1169,10 @@ public abstract class QueryParserTestBase extends LuceneTestCase {
       Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true);
       return new TokenStreamComponents(tokenizer, new MockCollationFilter(tokenizer));
     }
+    @Override
+    protected TokenStream normalize(String fieldName, TokenStream in) {
+      return new MockCollationFilter(in);
+    }
   }
   
   public void testCollatedRange() throws Exception {

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
----------------------------------------------------------------------
diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
index c57a8bc..0bb623f 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/BaseTokenStreamTestCase.java
@@ -883,7 +883,10 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
       assertTokenStreamContents(ts, 
                                 tokens.toArray(new String[tokens.size()]));
     }
-    
+
+    a.normalize("dummy", text);
+    // TODO: what can we do besides testing that the above method does not throw?
+
     if (field != null) {
       reader = new StringReader(text);
       random = new Random(seed);

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/test-framework/src/java/org/apache/lucene/analysis/MockAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/MockAnalyzer.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/MockAnalyzer.java
index e87bf45..bbeffe9 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/analysis/MockAnalyzer.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/MockAnalyzer.java
@@ -92,7 +92,16 @@ public final class MockAnalyzer extends Analyzer {
     MockTokenFilter filt = new MockTokenFilter(tokenizer, filter);
     return new TokenStreamComponents(tokenizer, maybePayload(filt, fieldName));
   }
-  
+
+  @Override
+  protected TokenStream normalize(String fieldName, TokenStream in) {
+    TokenStream result = in;
+    if (lowerCase) {
+      result = new MockLowerCaseFilter(result);
+    }
+    return result;
+  }
+
   private synchronized TokenFilter maybePayload(TokenFilter stream, String fieldName) {
     Integer val = previousMappings.get(fieldName);
     if (val == null) {

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/test-framework/src/java/org/apache/lucene/analysis/MockBytesAnalyzer.java
----------------------------------------------------------------------
diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/MockBytesAnalyzer.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/MockBytesAnalyzer.java
index 01f3d4d..b8cfc5b 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/analysis/MockBytesAnalyzer.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/MockBytesAnalyzer.java
@@ -16,6 +16,8 @@
  */
 package org.apache.lucene.analysis;
 
+import org.apache.lucene.util.AttributeFactory;
+
 /**
  * Analyzer for testing that encodes terms as UTF-16 bytes.
  */
@@ -26,4 +28,9 @@ public final class MockBytesAnalyzer extends Analyzer {
         MockTokenizer.KEYWORD, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH);
     return new TokenStreamComponents(t);
   }
+
+  @Override
+  protected AttributeFactory attributeFactory() {
+    return MockUTF16TermAttributeImpl.UTF16_TERM_ATTRIBUTE_FACTORY;
+  }
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/lucene/test-framework/src/java/org/apache/lucene/analysis/MockLowerCaseFilter.java
----------------------------------------------------------------------
diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/MockLowerCaseFilter.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/MockLowerCaseFilter.java
new file mode 100644
index 0000000..b1aea3d
--- /dev/null
+++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/MockLowerCaseFilter.java
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+
+/** A lowercasing {@link TokenFilter}. */
+public final class MockLowerCaseFilter extends TokenFilter {
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+
+  /** Sole constructor. */
+  public MockLowerCaseFilter(TokenStream in) {
+    super(in);
+  }
+  
+  @Override
+  public final boolean incrementToken() throws IOException {
+    if (input.incrementToken()) {
+      CharacterUtils.toLowerCase(termAtt.buffer(), 0, termAtt.length());
+      return true;
+    } else
+      return false;
+  }
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7c2e7a0f/solr/core/src/java/org/apache/solr/analysis/TokenizerChain.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/analysis/TokenizerChain.java b/solr/core/src/java/org/apache/solr/analysis/TokenizerChain.java
index c9f263d..a5afbec 100644
--- a/solr/core/src/java/org/apache/solr/analysis/TokenizerChain.java
+++ b/solr/core/src/java/org/apache/solr/analysis/TokenizerChain.java
@@ -18,6 +18,7 @@ package org.apache.solr.analysis;
 
 import org.apache.lucene.analysis.*;
 import org.apache.lucene.analysis.util.CharFilterFactory;
+import org.apache.lucene.analysis.util.MultiTermAwareComponent;
 import org.apache.lucene.analysis.util.TokenFilterFactory;
 import org.apache.lucene.analysis.util.TokenizerFactory;
 
@@ -84,8 +85,21 @@ public final class TokenizerChain extends SolrAnalyzer {
   }
 
   @Override
+  protected Reader initReaderForNormalization(String fieldName, Reader reader) {
+    if (charFilters != null && charFilters.length > 0) {
+      for (CharFilterFactory charFilter : charFilters) {
+        if (charFilter instanceof MultiTermAwareComponent) {
+          charFilter = (CharFilterFactory) ((MultiTermAwareComponent) charFilter).getMultiTermComponent();
+          reader = charFilter.create(reader);
+        }
+      }
+    }
+    return reader;
+  }
+
+  @Override
   protected TokenStreamComponents createComponents(String fieldName) {
-    Tokenizer tk = tokenizer.create();
+    Tokenizer tk = tokenizer.create(attributeFactory());
     TokenStream ts = tk;
     for (TokenFilterFactory filter : filters) {
       ts = filter.create(ts);
@@ -94,6 +108,18 @@ public final class TokenizerChain extends SolrAnalyzer {
   }
 
   @Override
+  protected TokenStream normalize(String fieldName, TokenStream in) {
+    TokenStream result = in;
+    for (TokenFilterFactory filter : filters) {
+      if (filter instanceof MultiTermAwareComponent) {
+        filter = (TokenFilterFactory) ((MultiTermAwareComponent) filter).getMultiTermComponent();
+        result = filter.create(result);
+      }
+    }
+    return result;
+  }
+
+  @Override
   public String toString() {
     StringBuilder sb = new StringBuilder("TokenizerChain(");
     for (CharFilterFactory filter: charFilters) {