Posted to commits@lucene.apache.org by us...@apache.org on 2011/10/25 13:12:14 UTC

svn commit: r1188604 - in /lucene/dev/branches/branch_3x: ./ lucene/ lucene/backwards/src/test/ lucene/contrib/ lucene/contrib/analyzers/common/ lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/ lucene/contrib/analyzers/comm...

Author: uschindler
Date: Tue Oct 25 11:12:14 2011
New Revision: 1188604

URL: http://svn.apache.org/viewvc?rev=1188604&view=rev
Log:
LUCENE-3508: Decompounders based on CompoundWordTokenFilterBase can now be used with custom attributes. All those attributes are preserved and set on all added decompounded tokens.
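
Under the hood, the filter no longer copies a fixed set of six attributes through an intermediate Token; it snapshots the complete attribute state of the original token with captureState() and calls restoreState() before overwriting term text, offsets and position increment for each subword, so any attribute a user adds to the chain survives. A minimal sketch of the same capture/restore pattern in a self-contained filter (the class and its behavior are illustrative only, not part of this commit):

    import java.io.IOException;
    import org.apache.lucene.analysis.TokenFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
    import org.apache.lucene.util.AttributeSource;

    /** Illustrative only: emits each token, then a lowercased copy stacked at
     *  the same position, restoring the full attribute state so that custom
     *  attributes survive on the copy. */
    final class LowercasedCopyFilter extends TokenFilter {
      private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
      private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
      private AttributeSource.State current; // snapshot of ALL attributes, incl. custom ones
      private boolean pending = false;

      LowercasedCopyFilter(TokenStream input) { super(input); }

      @Override
      public boolean incrementToken() throws IOException {
        if (pending) {
          restoreState(current); // brings back every attribute of the original token
          final char[] buf = termAtt.buffer();
          for (int i = 0; i < termAtt.length(); i++)
            buf[i] = Character.toLowerCase(buf[i]);
          posIncAtt.setPositionIncrement(0); // stack the copy at the same position
          pending = false;
          return true;
        }
        if (!input.incrementToken()) return false;
        current = captureState(); // snapshot before emitting the original unchanged
        pending = true;
        return true;
      }

      @Override
      public void reset() throws IOException {
        super.reset();
        pending = false;
        current = null;
      }
    }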

Modified:
    lucene/dev/branches/branch_3x/   (props changed)
    lucene/dev/branches/branch_3x/lucene/   (props changed)
    lucene/dev/branches/branch_3x/lucene/backwards/src/test/   (props changed)
    lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt
    lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/   (props changed)
    lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java
    lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java
    lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java
    lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java
    lucene/dev/branches/branch_3x/solr/   (props changed)

Modified: lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt?rev=1188604&r1=1188603&r2=1188604&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/CHANGES.txt Tue Oct 25 11:12:14 2011
@@ -10,6 +10,9 @@ Changes in backwards compatibility polic
  * LUCENE-3446: Removed BooleanFilter.finalResult() due to change to
    FixedBitSet.  (Uwe Schindler)
 
+ * LUCENE-3508: Changed some method signatures in decompounding TokenFilters
+   to make them no longer use the Token class.  (Uwe Schindler)
+
 New Features
 
  * LUCENE-1824: Add BoundaryScanner interface and its implementation classes,
@@ -79,6 +82,10 @@ Bug Fixes
    Replaced with RandomSampler. For previous behavior use RepeatableSampler.
    (Gilad Barkai, Shai Erera, Doron Cohen)
 
+ * LUCENE-3508: Decompounders based on CompoundWordTokenFilterBase can now be
+   used with custom attributes. All those attributes are preserved and set on all
+   added decompounded tokens.  (Spyros Kapnissis, Uwe Schindler)
+
 API Changes
  
  * LUCENE-3436: Add SuggestMode to the spellchecker, so you can specify the strategy

Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java?rev=1188604&r1=1188603&r2=1188604&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java Tue Oct 25 11:12:14 2011
@@ -25,19 +25,16 @@ import java.util.Locale;
 import java.util.Set;
 
 import org.apache.lucene.analysis.CharArraySet;
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.AttributeSource;
 import org.apache.lucene.util.Version;
 
 /**
- * Base class for decomposition token filters. <a name="version"/>
+ * Base class for decomposition token filters.
  * <p>
  * You must specify the required {@link Version} compatibility when creating
  * CompoundWordTokenFilterBase:
@@ -46,6 +43,13 @@ import org.apache.lucene.util.Version;
  * supplementary characters in strings and char arrays provided as compound word
  * dictionaries.
  * </ul>
+ * <p>If you pass in a {@link org.apache.lucene.analysis.CharArraySet} as dictionary,
+ * it should be case-insensitive unless it contains only lowercased entries and you
+ * have {@link org.apache.lucene.analysis.LowerCaseFilter} before this filter in your analysis chain.
+ * For optimal performance (as this filter does lots of lookups against the dictionary),
+ * you should use the latter analysis chain/CharArraySet setup. Be aware: if you supply
+ * arbitrary {@link Set Sets} to the ctors or {@code String[]} dictionaries, they will
+ * be automatically transformed to case-insensitive!
  */
 public abstract class CompoundWordTokenFilterBase extends TokenFilter {
   /**
@@ -64,26 +68,24 @@ public abstract class CompoundWordTokenF
   public static final int DEFAULT_MAX_SUBWORD_SIZE = 15;
   
   protected final CharArraySet dictionary;
-  protected final LinkedList<Token> tokens;
+  protected final LinkedList<CompoundToken> tokens;
   protected final int minWordSize;
   protected final int minSubwordSize;
   protected final int maxSubwordSize;
   protected final boolean onlyLongestMatch;
   
-  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
-  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
-  private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
+  protected final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  protected final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
   private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
-  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
-  private final PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class);
   
-  private final Token wrapper = new Token();
+  private AttributeSource.State current;
+
   /**
    * @deprecated use {@link #CompoundWordTokenFilterBase(Version, TokenStream, String[], int, int, int, boolean)} instead
    */
   @Deprecated
   protected CompoundWordTokenFilterBase(TokenStream input, String[] dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
-    this(Version.LUCENE_30, input, makeDictionary(dictionary),minWordSize,minSubwordSize,maxSubwordSize, onlyLongestMatch);
+    this(Version.LUCENE_30, input, makeDictionary(Version.LUCENE_30, dictionary),minWordSize,minSubwordSize,maxSubwordSize, onlyLongestMatch);
   }
   
   /**
@@ -91,7 +93,7 @@ public abstract class CompoundWordTokenF
    */
   @Deprecated
   protected CompoundWordTokenFilterBase(TokenStream input, String[] dictionary, boolean onlyLongestMatch) {
-    this(Version.LUCENE_30, input, makeDictionary(dictionary),DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
+    this(Version.LUCENE_30, input, makeDictionary(Version.LUCENE_30, dictionary),DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
   }
   
   /**
@@ -107,7 +109,7 @@ public abstract class CompoundWordTokenF
    */
   @Deprecated
   protected CompoundWordTokenFilterBase(TokenStream input, String[] dictionary) {
-    this(Version.LUCENE_30, input, makeDictionary(dictionary),DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, false);
+    this(Version.LUCENE_30, input, makeDictionary(Version.LUCENE_30, dictionary),DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, false);
   }
   
   /**
@@ -127,11 +129,11 @@ public abstract class CompoundWordTokenF
   }
   
   protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, String[] dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
-    this(matchVersion, input,makeDictionary(dictionary),minWordSize,minSubwordSize,maxSubwordSize, onlyLongestMatch);
+    this(matchVersion, input,makeDictionary(matchVersion, dictionary),minWordSize,minSubwordSize,maxSubwordSize, onlyLongestMatch);
   }
   
   protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, String[] dictionary, boolean onlyLongestMatch) {
-    this(matchVersion, input,makeDictionary(dictionary),DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
+    this(matchVersion, input,makeDictionary(matchVersion, dictionary),DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
   }
 
   protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, Set<?> dictionary, boolean onlyLongestMatch) {
@@ -139,7 +141,7 @@ public abstract class CompoundWordTokenF
   }
 
   protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, String[] dictionary) {
-    this(matchVersion, input,makeDictionary(dictionary),DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, false);
+    this(matchVersion, input,makeDictionary(matchVersion, dictionary),DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, false);
   }
 
   protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, Set<?> dictionary) {
@@ -149,7 +151,7 @@ public abstract class CompoundWordTokenF
   protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, Set<?> dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
     super(input);
     
-    this.tokens=new LinkedList<Token>();
+    this.tokens=new LinkedList<CompoundToken>();
     this.minWordSize=minWordSize;
     this.minSubwordSize=minSubwordSize;
     this.maxSubwordSize=maxSubwordSize;
@@ -158,113 +160,77 @@ public abstract class CompoundWordTokenF
     if (dictionary==null || dictionary instanceof CharArraySet) {
       this.dictionary = (CharArraySet) dictionary;
     } else {
-      this.dictionary = new CharArraySet(matchVersion, dictionary.size(), false);
-      addAllLowerCase(this.dictionary, dictionary);
+      this.dictionary = new CharArraySet(matchVersion, dictionary, true);
     }
   }
-
-  /**
-   * Create a set of words from an array
-   * The resulting Set does case insensitive matching
-   * TODO We should look for a faster dictionary lookup approach.
-   * @param dictionary 
-   * @return {@link Set} of lowercased terms 
-   */
-  public static final Set<?> makeDictionary(final String[] dictionary) {
-    return makeDictionary(Version.LUCENE_30, dictionary);
-  }
   
-  public static final Set<?> makeDictionary(final Version matchVersion, final String[] dictionary) {
+  /** @deprecated Only available for backwards compatibility. */
+  @Deprecated
+  public static CharArraySet makeDictionary(final Version matchVersion, final String[] dictionary) {
     if (dictionary == null) {
       return null;
     }
-    // is the below really case insensitive? 
-    CharArraySet dict = new CharArraySet(matchVersion, dictionary.length, false);
-    addAllLowerCase(dict, Arrays.asList(dictionary));
-    return dict;
-  }
-  
-  private final void setToken(final Token token) throws IOException {
-    clearAttributes();
-    termAtt.copyBuffer(token.buffer(), 0, token.length());
-    flagsAtt.setFlags(token.getFlags());
-    typeAtt.setType(token.type());
-    offsetAtt.setOffset(token.startOffset(), token.endOffset());
-    posIncAtt.setPositionIncrement(token.getPositionIncrement());
-    payloadAtt.setPayload(token.getPayload());
+    return new CharArraySet(matchVersion, Arrays.asList(dictionary), true);
   }
   
   @Override
   public final boolean incrementToken() throws IOException {
-    if (tokens.size() > 0) {
-      setToken(tokens.removeFirst());
+    if (!tokens.isEmpty()) {
+      assert current != null;
+      CompoundToken token = tokens.removeFirst();
+      restoreState(current); // keep all other attributes untouched
+      termAtt.setEmpty().append(token.txt);
+      offsetAtt.setOffset(token.startOffset, token.endOffset);
+      posIncAtt.setPositionIncrement(0);
       return true;
     }
 
-    if (!input.incrementToken())
-      return false;
-    
-    wrapper.copyBuffer(termAtt.buffer(), 0, termAtt.length());
-    wrapper.setStartOffset(offsetAtt.startOffset());
-    wrapper.setEndOffset(offsetAtt.endOffset());
-    wrapper.setFlags(flagsAtt.getFlags());
-    wrapper.setType(typeAtt.type());
-    wrapper.setPositionIncrement(posIncAtt.getPositionIncrement());
-    wrapper.setPayload(payloadAtt.getPayload());
-    
-    decompose(wrapper);
-
-    if (tokens.size() > 0) {
-      setToken(tokens.removeFirst());
+    current = null; // not really needed, but for safety
+    if (input.incrementToken()) {
+      // Only words longer than minWordSize get processed
+      if (termAtt.length() >= this.minWordSize) {
+        decompose();
+        // only capture the state if we really need it for producing new tokens
+        if (!tokens.isEmpty()) {
+          current = captureState();
+        }
+      }
+      // return original token:
       return true;
     } else {
       return false;
     }
   }
-  
-  protected static final void addAllLowerCase(CharArraySet target, Collection<?> col) {
-    for (Object obj : col) {
-      String string = (String) obj;
-      target.add(string.toLowerCase(Locale.ENGLISH));
-    }
-  }
-  
-  protected static char[] makeLowerCaseCopy(final char[] buffer) {
-    char[] result=new char[buffer.length];
-    System.arraycopy(buffer, 0, result, 0, buffer.length);
-    
-    for (int i=0;i<buffer.length;++i) {
-       result[i]=Character.toLowerCase(buffer[i]);
-    }
-    
-    return result;
-  }
-  
-  protected final Token createToken(final int offset, final int length,
-      final Token prototype) {
-    int newStart = prototype.startOffset() + offset;
-    Token t = prototype.clone(prototype.buffer(), offset, length, newStart, newStart+length);
-    t.setPositionIncrement(0);
-    return t;
-  }
-
-  protected void decompose(final Token token) {
-    // In any case we give the original token back
-    tokens.add((Token) token.clone());
 
-    // Only words longer than minWordSize get processed
-    if (token.length() < this.minWordSize) {
-      return;
-    }
-    
-    decomposeInternal(token);
-  }
-  
-  protected abstract void decomposeInternal(final Token token);
+  /** Decomposes the current {@link #termAtt} and places {@link CompoundToken} instances in the {@link #tokens} list.
+   * The original token must not be placed in the list, as it is automatically passed through this filter.
+   */
+  protected abstract void decompose();
 
   @Override
   public void reset() throws IOException {
     super.reset();
     tokens.clear();
+    current = null;
   }
+  
+  /**
+   * Helper class to hold decompounded token information
+   */
+  protected class CompoundToken {
+    public final CharSequence txt;
+    public final int startOffset, endOffset;
+
+    /** Construct the compound token based on a slice of the current {@link CompoundWordTokenFilterBase#termAtt}. */
+    public CompoundToken(int offset, int length) {
+      final int newStart = CompoundWordTokenFilterBase.this.offsetAtt.startOffset() + offset;
+      this.txt = CompoundWordTokenFilterBase.this.termAtt.subSequence(offset, offset + length);
+      // TODO: This ignores the original endOffset, if a CharFilter/Tokenizer/Filter removed
+      // chars from the term, offsets may not match correctly (other filters producing tokens
+      // may also have this problem):
+      this.startOffset = newStart;
+      this.endOffset = newStart + length;
+    }
+
+  }  
 }
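
The new class-level javadoc above boils down to two dictionary setups; a sketch of both follows (the Version constant and sample entries are assumptions, any 3.x Version works the same way):

    import java.io.Reader;
    import org.apache.lucene.analysis.CharArraySet;
    import org.apache.lucene.analysis.LowerCaseFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilter;
    import org.apache.lucene.util.Version;

    public class DictionarySetups {
      // simple setup: a case-insensitive CharArraySet, every lookup folds case internally
      static TokenStream caseInsensitive(Reader reader) {
        CharArraySet dict = new CharArraySet(Version.LUCENE_34, 3, true); // ignoreCase=true
        dict.add("donau"); dict.add("dampf"); dict.add("schiff");
        return new DictionaryCompoundWordTokenFilter(Version.LUCENE_34,
            new WhitespaceTokenizer(Version.LUCENE_34, reader), dict);
      }

      // faster setup per the javadoc: lowercase-only entries with ignoreCase=false,
      // plus a LowerCaseFilter ahead of the decompounder so its input is already lowercased
      static TokenStream lowercasedChain(Reader reader) {
        CharArraySet dict = new CharArraySet(Version.LUCENE_34, 3, false); // ignoreCase=false
        dict.add("donau"); dict.add("dampf"); dict.add("schiff");
        TokenStream ts = new WhitespaceTokenizer(Version.LUCENE_34, reader);
        ts = new LowerCaseFilter(Version.LUCENE_34, ts);
        return new DictionaryCompoundWordTokenFilter(Version.LUCENE_34, ts, dict);
      }
    }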

Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java?rev=1188604&r1=1188603&r2=1188604&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java Tue Oct 25 11:12:14 2011
@@ -20,7 +20,6 @@ package org.apache.lucene.analysis.compo
 
 import java.util.Set;
 
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenFilter; // for javadocs
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.util.Version;
@@ -31,13 +30,26 @@ import org.apache.lucene.util.Version;
  * "Donaudampfschiff" becomes Donau, dampf, schiff so that you can find
  * "Donaudampfschiff" even when you only enter "schiff". 
  *  It uses a brute-force algorithm to achieve this.
- * </p>
+ * <p>
+ * You must specify the required {@link Version} compatibility when creating
+ * CompoundWordTokenFilterBase:
+ * <ul>
+ * <li>As of 3.1, CompoundWordTokenFilterBase correctly handles Unicode 4.0
+ * supplementary characters in strings and char arrays provided as compound word
+ * dictionaries.
+ * </ul>
+ * <p>If you pass in a {@link org.apache.lucene.analysis.CharArraySet} as dictionary,
+ * it should be case-insensitive unless it contains only lowercased entries and you
+ * have {@link org.apache.lucene.analysis.LowerCaseFilter} before this filter in your analysis chain.
+ * For optimal performance (as this filter does lots of lookups against the dictionary),
+ * you should use the latter analysis chain/CharArraySet setup. Be aware: if you supply
+ * arbitrary {@link Set Sets} to the ctors or {@code String[]} dictionaries, they will
+ * be automatically transformed to case-insensitive!
  */
 public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBase {
   
   /**
-   * Creates a new {@link DictionaryCompoundWordTokenFilter}
-   * 
+   * Creates a new {@link DictionaryCompoundWordTokenFilter}.
    * @param input the {@link TokenStream} to process
    * @param dictionary the word dictionary to match against
    * @param minWordSize only words longer than this get processed
@@ -115,7 +127,9 @@ public class DictionaryCompoundWordToken
    *          only subwords shorter than this get to the output stream
    * @param onlyLongestMatch
    *          Add only the longest matching subword to the stream
+   * @deprecated Use the constructors taking {@link Set}
    */
+  @Deprecated
   public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, String[] dictionary,
       int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
     super(matchVersion, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
@@ -134,7 +148,9 @@ public class DictionaryCompoundWordToken
    *          the {@link TokenStream} to process
    * @param dictionary
    *          the word dictionary to match against
+   * @deprecated Use the constructors taking {@link Set}
    */
+  @Deprecated
   public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, String[] dictionary) {
     super(matchVersion, input, dictionary);
   }
@@ -150,12 +166,9 @@ public class DictionaryCompoundWordToken
    * @param input
    *          the {@link TokenStream} to process
    * @param dictionary
-   *          the word dictionary to match against. If this is a
-   *          {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it
-   *          must have set ignoreCase=false and only contain lower case
-   *          strings.
+   *          the word dictionary to match against.
    */
-  public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, Set dictionary) {
+  public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, Set<?> dictionary) {
     super(matchVersion, input, dictionary);
   }
   
@@ -170,10 +183,7 @@ public class DictionaryCompoundWordToken
    * @param input
    *          the {@link TokenStream} to process
    * @param dictionary
-   *          the word dictionary to match against. If this is a
-   *          {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it
-   *          must have set ignoreCase=false and only contain lower case
-   *          strings.
+   *          the word dictionary to match against.
    * @param minWordSize
    *          only words longer than this get processed
    * @param minSubwordSize
@@ -183,37 +193,31 @@ public class DictionaryCompoundWordToken
    * @param onlyLongestMatch
    *          Add only the longest matching subword to the stream
    */
-  public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, Set dictionary,
+  public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, Set<?> dictionary,
       int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
     super(matchVersion, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
   }
 
   @Override
-  protected void decomposeInternal(final Token token) {
-    // Only words longer than minWordSize get processed
-    if (token.length() < this.minWordSize) {
-      return;
-    }
-    
-    char[] lowerCaseTermBuffer=makeLowerCaseCopy(token.buffer());
-    
-    for (int i=0;i<=token.length()-this.minSubwordSize;++i) {
-        Token longestMatchToken=null;
+  protected void decompose() {
+    final int len = termAtt.length();
+    for (int i=0;i<=len-this.minSubwordSize;++i) {
+        CompoundToken longestMatchToken=null;
         for (int j=this.minSubwordSize;j<=this.maxSubwordSize;++j) {
-            if(i+j>token.length()) {
+            if(i+j>len) {
                 break;
             }
-            if(dictionary.contains(lowerCaseTermBuffer, i, j)) {
+            if(dictionary.contains(termAtt.buffer(), i, j)) {
                 if (this.onlyLongestMatch) {
                    if (longestMatchToken!=null) {
-                     if (longestMatchToken.length()<j) {
-                       longestMatchToken=createToken(i,j,token);
+                     if (longestMatchToken.txt.length()<j) {
+                       longestMatchToken=new CompoundToken(i,j);
                      }
                    } else {
-                     longestMatchToken=createToken(i,j,token);
+                     longestMatchToken=new CompoundToken(i,j);
                    }
                 } else {
-                   tokens.add(createToken(i,j,token));
+                   tokens.add(new CompoundToken(i,j));
                 }
             } 
         }
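
Applied to the data from the test further below (dictionary {ab, cd, ef}, input "abcdef"), the brute-force scan above finds a dictionary hit at i=0, i=2 and i=4 and stacks each subword at the position of the original token. A runnable sketch that prints the terms, offsets and position increments the test asserts (the Version constant is an assumption):

    import java.io.StringReader;
    import java.util.Arrays;
    import org.apache.lucene.analysis.CharArraySet;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilter;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
    import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
    import org.apache.lucene.util.Version;

    public class BruteForceDemo {
      public static void main(String[] args) throws Exception {
        CharArraySet dict = new CharArraySet(Version.LUCENE_34,
            Arrays.asList("ab", "cd", "ef"), true);
        TokenStream ts = new DictionaryCompoundWordTokenFilter(Version.LUCENE_34,
            new WhitespaceTokenizer(Version.LUCENE_34, new StringReader("abcdef")), dict);
        CharTermAttribute term = ts.getAttribute(CharTermAttribute.class);
        OffsetAttribute off = ts.getAttribute(OffsetAttribute.class);
        PositionIncrementAttribute inc = ts.getAttribute(PositionIncrementAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
          // prints: abcdef [0,6) +1, then ab [0,2) +0, cd [2,4) +0, ef [4,6) +0
          System.out.println(term + " [" + off.startOffset() + "," + off.endOffset()
              + ") +" + inc.getPositionIncrement());
        }
        ts.close();
      }
    }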

Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java?rev=1188604&r1=1188603&r2=1188604&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java Tue Oct 25 11:12:14 2011
@@ -21,7 +21,6 @@ import java.io.File;
 import java.io.Reader;
 import java.util.Set;
 
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenFilter; // for javadocs
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.compound.hyphenation.Hyphenation;
@@ -35,7 +34,21 @@ import org.xml.sax.InputSource;
  * "Donaudampfschiff" becomes Donau, dampf, schiff so that you can find
  * "Donaudampfschiff" even when you only enter "schiff". It uses a hyphenation
  * grammar and a word dictionary to achieve this.
- * </p>
+ * <p>
+ * You must specify the required {@link Version} compatibility when creating
+ * CompoundWordTokenFilterBase:
+ * <ul>
+ * <li>As of 3.1, CompoundWordTokenFilterBase correctly handles Unicode 4.0
+ * supplementary characters in strings and char arrays provided as compound word
+ * dictionaries.
+ * </ul>
+ * <p>If you pass in a {@link org.apache.lucene.analysis.CharArraySet} as dictionary,
+ * it should be case-insensitive unless it contains only lowercased entries and you
+ * have {@link org.apache.lucene.analysis.LowerCaseFilter} before this filter in your analysis chain.
+ * For optimal performance (as this filter does lots of lookups against the dictionary),
+ * you should use the latter analysis chain/CharArraySet setup. Be aware: if you supply
+ * arbitrary {@link Set Sets} to the ctors or {@code String[]} dictionaries, they will
+ * be automatically transformed to case-insensitive!
  */
 public class HyphenationCompoundWordTokenFilter extends
     CompoundWordTokenFilterBase {
@@ -63,11 +76,13 @@ public class HyphenationCompoundWordToke
    *          only subwords shorter than this get to the output stream
    * @param onlyLongestMatch
    *          Add only the longest matching subword to the stream
+   * @deprecated Use the constructors taking {@link Set}
    */
+  @Deprecated
   public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
       HyphenationTree hyphenator, String[] dictionary, int minWordSize,
       int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
-    this(input, hyphenator, makeDictionary(dictionary), minWordSize,
+    this(matchVersion, input, hyphenator, makeDictionary(matchVersion, dictionary), minWordSize,
         minSubwordSize, maxSubwordSize, onlyLongestMatch);
   }
 
@@ -85,10 +100,12 @@ public class HyphenationCompoundWordToke
    *          the hyphenation pattern tree to use for hyphenation
    * @param dictionary
    *          the word dictionary to match against
+   * @deprecated Use the constructors taking {@link Set}
    */
+  @Deprecated
   public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
       HyphenationTree hyphenator, String[] dictionary) {
-    this(input, hyphenator, makeDictionary(dictionary), DEFAULT_MIN_WORD_SIZE,
+    this(Version.LUCENE_30, input, hyphenator, makeDictionary(Version.LUCENE_30,dictionary), DEFAULT_MIN_WORD_SIZE,
         DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
   }
 
@@ -105,10 +122,7 @@ public class HyphenationCompoundWordToke
    * @param hyphenator
    *          the hyphenation pattern tree to use for hyphenation
    * @param dictionary
-   *          the word dictionary to match against. If this is a
-   *          {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it
-   *          must have set ignoreCase=false and only contain lower case
-   *          strings.
+   *          the word dictionary to match against.
    */
   public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
       HyphenationTree hyphenator, Set<?> dictionary) {
@@ -129,10 +143,7 @@ public class HyphenationCompoundWordToke
    * @param hyphenator
    *          the hyphenation pattern tree to use for hyphenation
    * @param dictionary
-   *          the word dictionary to match against. If this is a
-   *          {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it
-   *          must have set ignoreCase=false and only contain lower case
-   *          strings.
+   *          the word dictionary to match against.
    * @param minWordSize
    *          only words longer than this get processed
    * @param minSubwordSize
@@ -196,7 +207,7 @@ public class HyphenationCompoundWordToke
   public HyphenationCompoundWordTokenFilter(TokenStream input,
       HyphenationTree hyphenator, String[] dictionary, int minWordSize,
       int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
-    this(Version.LUCENE_30, input, hyphenator, makeDictionary(dictionary), minWordSize,
+    this(Version.LUCENE_30, input, hyphenator, makeDictionary(Version.LUCENE_30, dictionary), minWordSize,
         minSubwordSize, maxSubwordSize, onlyLongestMatch);
   }
 
@@ -211,7 +222,7 @@ public class HyphenationCompoundWordToke
   @Deprecated
   public HyphenationCompoundWordTokenFilter(TokenStream input,
       HyphenationTree hyphenator, String[] dictionary) {
-    this(Version.LUCENE_30, input, hyphenator, makeDictionary(dictionary), DEFAULT_MIN_WORD_SIZE,
+    this(Version.LUCENE_30, input, hyphenator, makeDictionary(Version.LUCENE_30, dictionary), DEFAULT_MIN_WORD_SIZE,
         DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
   }
 
@@ -316,22 +327,20 @@ public class HyphenationCompoundWordToke
   }
 
   @Override
-  protected void decomposeInternal(final Token token) {
+  protected void decompose() {
     // get the hyphenation points
-    Hyphenation hyphens = hyphenator.hyphenate(token.buffer(), 0, token
-        .length(), 1, 1);
+    Hyphenation hyphens = hyphenator.hyphenate(termAtt.buffer(), 0, termAtt.length(), 1, 1);
     // No hyphen points found -> exit
     if (hyphens == null) {
       return;
     }
 
     final int[] hyp = hyphens.getHyphenationPoints();
-    char[] lowerCaseTermBuffer=makeLowerCaseCopy(token.buffer());
 
     for (int i = 0; i < hyp.length; ++i) {
       int remaining = hyp.length - i;
       int start = hyp[i];
-      Token longestMatchToken = null;
+      CompoundToken longestMatchToken = null;
       for (int j = 1; j < remaining; j++) {
         int partLength = hyp[i + j] - start;
 
@@ -348,34 +357,33 @@ public class HyphenationCompoundWordToke
         }
 
         // check the dictionary
-        if (dictionary == null || dictionary.contains(lowerCaseTermBuffer, start, partLength)) {
+        if (dictionary == null || dictionary.contains(termAtt.buffer(), start, partLength)) {
           if (this.onlyLongestMatch) {
             if (longestMatchToken != null) {
-              if (longestMatchToken.length() < partLength) {
-                longestMatchToken = createToken(start, partLength, token);
+              if (longestMatchToken.txt.length() < partLength) {
+                longestMatchToken = new CompoundToken(start, partLength);
               }
             } else {
-              longestMatchToken = createToken(start, partLength, token);
+              longestMatchToken = new CompoundToken(start, partLength);
             }
           } else {
-            tokens.add(createToken(start, partLength, token));
+            tokens.add(new CompoundToken(start, partLength));
           }
-        } else if (dictionary.contains(lowerCaseTermBuffer, start,
-            partLength - 1)) {
+        } else if (dictionary.contains(termAtt.buffer(), start, partLength - 1)) {
           // check the dictionary again with a word that is one character
           // shorter
           // to avoid problems with genitive 's characters and other binding
           // characters
           if (this.onlyLongestMatch) {
             if (longestMatchToken != null) {
-              if (longestMatchToken.length() < partLength - 1) {
-                longestMatchToken = createToken(start, partLength - 1, token);
+              if (longestMatchToken.txt.length() < partLength - 1) {
+                longestMatchToken = new CompoundToken(start, partLength - 1);
               }
             } else {
-              longestMatchToken = createToken(start, partLength - 1, token);
+              longestMatchToken = new CompoundToken(start, partLength - 1);
             }
           } else {
-            tokens.add(createToken(start, partLength - 1, token));
+            tokens.add(new CompoundToken(start, partLength - 1));
           }
         }
       }
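
The hyphenation-based subclass gets its candidate split points from a hyphenation grammar instead of scanning every offset, and only keeps parts confirmed by the dictionary (or all parts, if the dictionary is null). A sketch of wiring it up (the grammar file path is hypothetical; FOP/OFFO-style XML grammars work, as in the Danish test in the file below):

    import java.io.File;
    import java.io.Reader;
    import org.apache.lucene.analysis.CharArraySet;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter;
    import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
    import org.apache.lucene.util.Version;
    import org.xml.sax.InputSource;

    public class HyphenationSetup {
      static TokenStream build(Reader reader) throws Exception {
        // load the hyphenation grammar (the path is a placeholder)
        InputSource grammar = new InputSource(new File("de_DR.xml").toURI().toASCIIString());
        HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(grammar);
        CharArraySet dict = new CharArraySet(Version.LUCENE_34, 2, true); // ignoreCase=true
        dict.add("dampf"); dict.add("schiff");
        return new HyphenationCompoundWordTokenFilter(Version.LUCENE_34,
            new WhitespaceTokenizer(Version.LUCENE_34, reader), hyphenator, dict);
      }
    }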

Modified: lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java?rev=1188604&r1=1188603&r2=1188604&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java (original)
+++ lucene/dev/branches/branch_3x/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java Tue Oct 25 11:12:14 2011
@@ -17,15 +17,20 @@ package org.apache.lucene.analysis.compo
  * limitations under the License.
  */
 
+import java.io.IOException;
 import java.io.StringReader;
-import org.xml.sax.InputSource;
 
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.WhitespaceTokenizer;
 import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.util.Attribute;
+import org.apache.lucene.util.AttributeImpl;
+import org.xml.sax.InputSource;
 
 public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
   public void testHyphenationCompoundWordsDA() throws Exception {
@@ -166,45 +171,45 @@ public class TestCompoundWordTokenFilter
     String[] dict = {"ab", "cd", "ef"};
 
     DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT,
-			new WhitespaceTokenizer(TEST_VERSION_CURRENT,
-				new StringReader(
-					"abcdef")
-				),
-			dict,
-			CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
-			CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
-			CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
+      new WhitespaceTokenizer(TEST_VERSION_CURRENT,
+        new StringReader(
+          "abcdef")
+        ),
+      dict,
+      CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
+      CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
+      CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
 
     assertTokenStreamContents(tf,
-			new String[] { "abcdef", "ab", "cd", "ef" },
-			new int[] { 0, 0, 2, 4},
-			new int[] { 6, 2, 4, 6},
-			new int[] { 1, 0, 0, 0}
-			);
+      new String[] { "abcdef", "ab", "cd", "ef" },
+      new int[] { 0, 0, 2, 4},
+      new int[] { 6, 2, 4, 6},
+      new int[] { 1, 0, 0, 0}
+      );
   }
 
   public void testWordComponentWithLessThanMinimumLength() throws Exception {
     String[] dict = {"abc", "d", "efg"};
 
     DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT,
-			new WhitespaceTokenizer(TEST_VERSION_CURRENT,
-				new StringReader(
-					"abcdefg")
-				),
-			dict,
-			CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
-			CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
-			CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
+      new WhitespaceTokenizer(TEST_VERSION_CURRENT,
+        new StringReader(
+          "abcdefg")
+        ),
+      dict,
+      CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
+      CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
+      CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
 
-	// since "d" is shorter than the minimum subword size, it should not be added to the token stream
+    // since "d" is shorter than the minimum subword size, it should not be added to the token stream
     assertTokenStreamContents(tf,
-			new String[] { "abcdefg", "abc", "efg" },
-			new int[] { 0, 0, 4},
-			new int[] { 7, 3, 7},
-			new int[] { 1, 0, 0}
-			);
+      new String[] { "abcdefg", "abc", "efg" },
+      new int[] { 0, 0, 4},
+      new int[] { 7, 3, 7},
+      new int[] { 1, 0, 0}
+      );
   }
-  
+
   public void testReset() throws Exception {
     String[] dict = { "Rind", "Fleisch", "Draht", "Schere", "Gesetz",
         "Aufgabe", "Überwachung" };
@@ -228,4 +233,64 @@ public class TestCompoundWordTokenFilter
     assertEquals("Rindfleischüberwachungsgesetz", termAtt.toString());
   }
 
+  public void testRetainMockAttribute() throws Exception {
+    String[] dict = { "abc", "d", "efg" };
+    Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT,
+        new StringReader("abcdefg"));
+    TokenStream stream = new MockRetainAttributeFilter(tokenizer);
+    stream = new DictionaryCompoundWordTokenFilter(
+        TEST_VERSION_CURRENT, stream, dict,
+        CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
+        CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
+        CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
+    MockRetainAttribute retAtt = stream.addAttribute(MockRetainAttribute.class);
+    while (stream.incrementToken()) {
+      assertTrue("Custom attribute value was lost", retAtt.getRetain());
+    }
+
+  }
+
+  public static interface MockRetainAttribute extends Attribute {
+    void setRetain(boolean attr);
+    boolean getRetain();
+  }
+
+  public static final class MockRetainAttributeImpl extends AttributeImpl implements MockRetainAttribute {
+    private boolean retain = false;
+    @Override
+    public void clear() {
+      retain = false;
+    }
+    public boolean getRetain() {
+      return retain;
+    }
+    public void setRetain(boolean retain) {
+      this.retain = retain;
+    }
+    @Override
+    public void copyTo(AttributeImpl target) {
+      MockRetainAttribute t = (MockRetainAttribute) target;
+      t.setRetain(retain);
+    }
+  }
+
+  private static class MockRetainAttributeFilter extends TokenFilter {
+    
+    MockRetainAttribute retainAtt = addAttribute(MockRetainAttribute.class);
+    
+    MockRetainAttributeFilter(TokenStream input) {
+      super(input);
+    }
+    
+    @Override
+    public boolean incrementToken() throws IOException {
+      if (input.incrementToken()) {
+        retainAtt.setRetain(true);
+        return true;
+      } else {
+        return false;
+      }
+    }
+  }
+
 }