You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-user@lucene.apache.org by Paul Taylor <pa...@fastmail.fm> on 2011/09/15 11:39:08 UTC

Converting from TermAttribute to CharTermAttribute

I have updated from Lucene 3.0 to Lucene 3.1 and am now getting various
deprecation warnings that I'm trying to move away from.

I changed this filter class and now my tests are failing. Is anybody able to
see what I'm missing, please?

Paul

package org.musicbrainz.search.analysis;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

import java.io.IOException;

/**
 * A token filter that replaces accented characters by their unaccented
 * equivalents.
 *
 * <p>This is the Lucene 3.0 form of the filter: the term text is read and
 * written through {@code TermAttribute} (reported as deprecated after the
 * upgrade to Lucene 3.1 described in the accompanying message).
 *
 * <p>Replacement data comes from the {@code UnaccentIndexes},
 * {@code UnaccentPositions} and {@code UnaccentData} tables, which are
 * defined outside this listing.
 *
 * <p>The filter keeps a reusable scratch buffer ({@code output}) and a write
 * cursor ({@code outputPos}) as instance fields and overwrites them for
 * every token.
 */
public class AccentFilter extends TokenFilter {

     // Scratch buffer that receives the unaccented form of the current term.
     // removeAccents() swaps in a larger array when the current one is too small.
     private char[] output = new char[256];
     // Number of valid chars in output after the most recent removeAccents() call.
     private int outputPos;

     // Term text attribute, obtained from this stream in the constructor.
     private TermAttribute termAttr;

     /**
      * Wraps {@code input} and registers the term attribute used to read and
      * rewrite each token's text.
      *
      * @param input the token stream whose terms are to be unaccented
      */
     public AccentFilter(TokenStream input) {
         super(input);
         termAttr = (TermAttribute) addAttribute(TermAttribute.class);
     }

     /**
      * Advances the wrapped stream by one token and, when the term contains at
      * least one replaceable character, overwrites the term with the
      * unaccented text built by {@link #removeAccents(char[], int)}.
      *
      * @return {@code false} when the wrapped stream has no more tokens,
      *         {@code true} otherwise
      * @throws IOException if the wrapped stream fails to advance
      */
     @Override
     public boolean incrementToken() throws IOException {
         if (!input.incrementToken())
             return false;

         final char[] buffer = termAttr.termBuffer();
         final int length    = termAttr.termLength();
         if (removeAccents(buffer, length))  {
             // Copy the first outputPos chars of the scratch buffer into the term.
             termAttr.setTermBuffer(output, 0, outputPos);
         }
         return true;
     }

     /**
      * Builds the unaccented form of {@code input[0..length)} in the
      * {@code output} field and records its length in {@code outputPos}.
      *
      * <p>Note that the parameter {@code input} hides the stream field of the
      * same name inside this method.
      *
      * @param input  chars of the term; only the first {@code length} are read
      * @param length number of valid chars in {@code input}
      * @return {@code true} if at least one character was replaced, in which
      *         case {@code output[0..outputPos)} holds the converted term;
      *         {@code false} if nothing was replaced ({@code outputPos} is 0
      *         and {@code output} holds no meaningful data)
      */
     protected final boolean removeAccents(char[] input, int length) {
         // Budget two output chars per input char.
         // NOTE(review): presumably replacements never exceed this budget in
         // total; the lookup tables are not visible here - confirm.
         final int maxSizeNeeded = 2 * length;
         // Smallest power-of-two multiple of the current capacity that fits the budget.
         int size = output.length;
         while (size < maxSizeNeeded)
             size *= 2;

         // inputPos: index of the first input char not yet copied to output.
         int inputPos = 0;
         outputPos = 0;

         for (int i = 0; i < length; i++) {
             int c = (int) input[i];

             // High bits of the char select a block, low bits a slot within it.
             int block = UnaccentIndexes.indexes[c >> 
UnaccentData.BLOCK_SHIFT];
             int position = c & UnaccentData.BLOCK_MASK;

             // Consecutive entries delimit the replacement inside the block's
             // data array, so their difference is the replacement length.
             short[] positions = UnaccentPositions.positions[block];
             int unacPosition = positions[position];
             int unacLength = positions[position + 1] - unacPosition;

             // A zero-length entry means the char is kept as is.
             if (unacLength > 0) {
                 // Grow the scratch buffer lazily. This can only trigger on the
                 // first replacement of a call, while outputPos is still 0, so
                 // there is nothing to carry over from the old array.
                 if (size != output.length)
                     output = new char[size];
                 // Copy the untouched run of input that precedes this char.
                 if (inputPos < i) {
                     System.arraycopy(input, inputPos, output, 
outputPos, i - inputPos);
                     outputPos += i - inputPos;
                 }
                 // Append the replacement text for this char.
                 System.arraycopy(UnaccentData.data[block], unacPosition,
                         output, outputPos, unacLength);
                 outputPos += unacLength;
                 inputPos = i + 1;
             }
         }

         // inputPos only moves past 0 when a replacement happened, so 0 means
         // the term needs no conversion.
         if (inputPos == 0)
             return false;

         // Copy whatever follows the last replaced char.
         int copyLength = length - inputPos;
         if (copyLength > 0) {
             System.arraycopy(input, inputPos, output, outputPos, 
copyLength);
             outputPos += copyLength;
         }

         return true;
     }

}

to

package org.musicbrainz.search.analysis;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import java.io.IOException;

/**
 * A token filter that replaces accented characters by their unaccented
 * equivalents.
 *
 * <p>This is the Lucene 3.1 form of the filter: the term text is read and
 * written through {@code CharTermAttribute}.
 *
 * <p>Replacement data comes from the {@code UnaccentIndexes},
 * {@code UnaccentPositions} and {@code UnaccentData} tables, which are
 * defined outside this listing.
 *
 * <p>The filter keeps a reusable scratch buffer ({@code output}) and a write
 * cursor ({@code outputPos}) as instance fields and overwrites them for
 * every token.
 */
public class AccentFilter extends TokenFilter {

     // Scratch buffer that receives the unaccented form of the current term.
     // removeAccents() swaps in a larger array when the current one is too small.
     private char[] output = new char[256];
     // Number of valid chars in output after the most recent removeAccents() call.
     private int outputPos;

     // Term text attribute, obtained from this stream in the constructor.
     private CharTermAttribute termAttr;

     /**
      * Wraps {@code input} and registers the term attribute used to read and
      * rewrite each token's text.
      *
      * @param input the token stream whose terms are to be unaccented
      */
     public AccentFilter(TokenStream input) {
         super(input);
         termAttr = (CharTermAttribute) addAttribute(CharTermAttribute.class);
     }

     /**
      * Advances the wrapped stream by one token and, when the term contains at
      * least one replaceable character, overwrites the term with the
      * unaccented text built by {@link #removeAccents(char[], int)}.
      *
      * @return {@code false} when the wrapped stream has no more tokens,
      *         {@code true} otherwise
      * @throws IOException if the wrapped stream fails to advance
      */
     @Override
     public boolean incrementToken() throws IOException {
         if (!input.incrementToken())
             return false;

         final char[] buffer = termAttr.buffer();
         final int length    = termAttr.length();
         if (removeAccents(buffer, length))  {
             // copyBuffer() is the CharTermAttribute counterpart of
             // TermAttribute.setTermBuffer(): it copies the chars into the term
             // and sets the term length. The version of this listing that was
             // posted as failing called termAttr.resizeBuffer(outputPos) here,
             // which only grows the backing array and leaves the term text and
             // length untouched, so tokens passed through unchanged.
             termAttr.copyBuffer(output, 0, outputPos);
         }
         return true;
     }

     /**
      * Builds the unaccented form of {@code input[0..length)} in the
      * {@code output} field and records its length in {@code outputPos}.
      *
      * <p>Note that the parameter {@code input} hides the stream field of the
      * same name inside this method.
      *
      * @param input  chars of the term; only the first {@code length} are read
      * @param length number of valid chars in {@code input}
      * @return {@code true} if at least one character was replaced, in which
      *         case {@code output[0..outputPos)} holds the converted term;
      *         {@code false} if nothing was replaced ({@code outputPos} is 0
      *         and {@code output} holds no meaningful data)
      */
     protected final boolean removeAccents(char[] input, int length) {
         // Budget two output chars per input char.
         // NOTE(review): presumably replacements never exceed this budget in
         // total; the lookup tables are not visible here - confirm.
         final int maxSizeNeeded = 2 * length;
         // Smallest power-of-two multiple of the current capacity that fits the budget.
         int size = output.length;
         while (size < maxSizeNeeded)
             size *= 2;

         // inputPos: index of the first input char not yet copied to output.
         int inputPos = 0;
         outputPos = 0;

         for (int i = 0; i < length; i++) {
             int c = (int) input[i];

             // High bits of the char select a block, low bits a slot within it.
             int block = UnaccentIndexes.indexes[c >> UnaccentData.BLOCK_SHIFT];
             int position = c & UnaccentData.BLOCK_MASK;

             // Consecutive entries delimit the replacement inside the block's
             // data array, so their difference is the replacement length.
             short[] positions = UnaccentPositions.positions[block];
             int unacPosition = positions[position];
             int unacLength = positions[position + 1] - unacPosition;

             // A zero-length entry means the char is kept as is.
             if (unacLength > 0) {
                 // Grow the scratch buffer lazily. This can only trigger on the
                 // first replacement of a call, while outputPos is still 0, so
                 // there is nothing to carry over from the old array.
                 if (size != output.length)
                     output = new char[size];
                 // Copy the untouched run of input that precedes this char.
                 if (inputPos < i) {
                     System.arraycopy(input, inputPos, output, outputPos,
                             i - inputPos);
                     outputPos += i - inputPos;
                 }
                 // Append the replacement text for this char.
                 System.arraycopy(UnaccentData.data[block], unacPosition,
                         output, outputPos, unacLength);
                 outputPos += unacLength;
                 inputPos = i + 1;
             }
         }

         // inputPos only moves past 0 when a replacement happened, so 0 means
         // the term needs no conversion.
         if (inputPos == 0)
             return false;

         // Copy whatever follows the last replaced char.
         int copyLength = length - inputPos;
         if (copyLength > 0) {
             System.arraycopy(input, inputPos, output, outputPos, copyLength);
             outputPos += copyLength;
         }

         return true;
     }

}




---------------------------------------------------------------------
To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-user-help@lucene.apache.org


Re: Converting from TermAttribute to CharTermAttribute

Posted by Paul Taylor <pa...@fastmail.fm>.
On 15/09/2011 11:12, Uwe Schindler wrote:
> Hi,
>
> Your incrementToken method differs between the two implementations:
> the original one uses setTermBuffer(), but the new one only resizes the
> buffer and never changes its contents.
>
Thank you, it is now working. I changed it to:

/**
 * Advances the wrapped stream by one token and, when removeAccents()
 * reports that it replaced at least one character, copies the converted
 * text from the scratch buffer into the term attribute.
 *
 * @return false when the wrapped stream has no more tokens, true otherwise
 * @throws IOException if the wrapped stream fails to advance
 */
@Override
     public boolean incrementToken() throws IOException {
         // Upstream is exhausted: nothing to filter.
         if (!input.incrementToken()) {
             return false;
         }

         // On success the converted term is in output[0..outputPos).
         final boolean replaced = removeAccents(termAttr.buffer(), termAttr.length());
         if (replaced) {
             // copyBuffer() writes the chars into the term and sets its length.
             termAttr.copyBuffer(output, 0, outputPos);
         }
         return true;
     }

---------------------------------------------------------------------
To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-user-help@lucene.apache.org


RE: Converting from TermAttribute to CharTermAttribute

Posted by Uwe Schindler <uw...@thetaphi.de>.
Hi,

Your incrementToken method differs between the two implementations:
the original one uses setTermBuffer(), but the new one only resizes the
buffer and never changes its contents.

Uwe
-----
Uwe Schindler
H.-H.-Meier-Allee 63, D-28213 Bremen
http://www.thetaphi.de
eMail: uwe@thetaphi.de


> -----Original Message-----
> From: Paul Taylor [mailto:paul_t100@fastmail.fm]
> Sent: Thursday, September 15, 2011 11:39 AM
> To: 'java-user@lucene.apache.org'
> Subject: Converting from TermAttribute to CharTermAttribute
> 
> I have updated from Lucene 3.0 to Lucene 3.1 and am now getting various
> deprecation warnings that I'm trying to move away from.
> 
> I changed this filter class and now my tests are failing. Is anybody able to
> see what I'm missing, please?
> 
> Paul
> 
> package org.musicbrainz.search.analysis;
> 
> import org.apache.lucene.analysis.TokenFilter;
> import org.apache.lucene.analysis.TokenStream;
> import org.apache.lucene.analysis.tokenattributes.TermAttribute;
> 
> import java.io.IOException;
> 
> /**
>   * A filter that replaces accented characters by their unaccented
equivalents.
>   */
> public class AccentFilter extends TokenFilter {
> 
>      private char[] output = new char[256];
>      private int outputPos;
> 
>      private TermAttribute termAttr;
> 
>      public AccentFilter(TokenStream input) {
>          super(input);
>          termAttr = (TermAttribute) addAttribute(TermAttribute.class);
>      }
> 
>      @Override
>      public boolean incrementToken() throws IOException {
>          if (!input.incrementToken())
>              return false;
> 
>          final char[] buffer = termAttr.termBuffer();
>          final int length    = termAttr.termLength();
>          if (removeAccents(buffer, length))  {
>              termAttr.setTermBuffer(output, 0, outputPos);
>          }
>          return true;
>      }
> 
>      protected final boolean removeAccents(char[] input, int length) {
>          final int maxSizeNeeded = 2 * length;
>          int size = output.length;
>          while (size < maxSizeNeeded)
>              size *= 2;
> 
>          int inputPos = 0;
>          outputPos = 0;
> 
>          for (int i = 0; i < length; i++) {
>              int c = (int) input[i];
> 
>              int block = UnaccentIndexes.indexes[c >>
UnaccentData.BLOCK_SHIFT];
>              int position = c & UnaccentData.BLOCK_MASK;
> 
>              short[] positions = UnaccentPositions.positions[block];
>              int unacPosition = positions[position];
>              int unacLength = positions[position + 1] - unacPosition;
> 
>              if (unacLength > 0) {
>                  // allocate a new char array, if necessary
>                  if (size != output.length)
>                      output = new char[size];
>                  // copy front of the input
>                  if (inputPos < i) {
>                      System.arraycopy(input, inputPos, output, outputPos,
i - inputPos);
>                      outputPos += i - inputPos;
>                  }
>                  // copy unaccented data
>                  System.arraycopy(UnaccentData.data[block], unacPosition,
>                          output, outputPos, unacLength);
>                  outputPos += unacLength;
>                  inputPos = i + 1;
>              }
>          }
> 
>          // no conversion needed...
>          if (inputPos == 0)
>              return false;
> 
>          // copy rest of the input
>          int copyLength = length - inputPos;
>          if (copyLength > 0) {
>              System.arraycopy(input, inputPos, output, outputPos,
copyLength);
>              outputPos += copyLength;
>          }
> 
>          return true;
>      }
> 
> }
> 
> to
> 
> package org.musicbrainz.search.analysis;
> 
> import org.apache.lucene.analysis.TokenFilter;
> import org.apache.lucene.analysis.TokenStream;
> import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
> 
> import java.io.IOException;
> 
> /**
>   * A filter that replaces accented characters by their unaccented
equivalents.
>   */
> public class AccentFilter extends TokenFilter {
> 
>      private char[] output = new char[256];
>      private int outputPos;
> 
>      private CharTermAttribute termAttr;
> 
>      public AccentFilter(TokenStream input) {
>          super(input);
>          termAttr = (CharTermAttribute)
addAttribute(CharTermAttribute.class);
>      }
> 
>      @Override
>      public boolean incrementToken() throws IOException {
>          if (!input.incrementToken())
>              return false;
> 
>          final char[] buffer = termAttr.buffer();
>          final int length    = termAttr.length();
>          if (removeAccents(buffer, length))  {
>              termAttr.resizeBuffer(outputPos);
>          }
>          return true;
>      }
> 
>      protected final boolean removeAccents(char[] input, int length) {
>          final int maxSizeNeeded = 2 * length;
>          int size = output.length;
>          while (size < maxSizeNeeded)
>              size *= 2;
> 
>          int inputPos = 0;
>          outputPos = 0;
> 
>          for (int i = 0; i < length; i++) {
>              int c = (int) input[i];
> 
>              int block = UnaccentIndexes.indexes[c >>
UnaccentData.BLOCK_SHIFT];
>              int position = c & UnaccentData.BLOCK_MASK;
> 
>              short[] positions = UnaccentPositions.positions[block];
>              int unacPosition = positions[position];
>              int unacLength = positions[position + 1] - unacPosition;
> 
>              if (unacLength > 0) {
>                  // allocate a new char array, if necessary
>                  if (size != output.length)
>                      output = new char[size];
>                  // copy front of the input
>                  if (inputPos < i) {
>                      System.arraycopy(input, inputPos, output, outputPos,
i - inputPos);
>                      outputPos += i - inputPos;
>                  }
>                  // copy unaccented data
>                  System.arraycopy(UnaccentData.data[block], unacPosition,
>                          output, outputPos, unacLength);
>                  outputPos += unacLength;
>                  inputPos = i + 1;
>              }
>          }
> 
>          // no conversion needed...
>          if (inputPos == 0)
>              return false;
> 
>          // copy rest of the input
>          int copyLength = length - inputPos;
>          if (copyLength > 0) {
>              System.arraycopy(input, inputPos, output, outputPos,
copyLength);
>              outputPos += copyLength;
>          }
> 
>          return true;
>      }
> 
> }
> 
> 
> 
> 
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
> For additional commands, e-mail: java-user-help@lucene.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-user-help@lucene.apache.org