You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-user@lucene.apache.org by Paul Taylor <pa...@fastmail.fm> on 2011/09/15 11:39:08 UTC
Converting from TermAttribute to CharTermAttribute
I have updated from Lucene 3.0 to Lucene 3.1 and am now getting various
deprecation warnings that I'm trying to remove.
I changed this filter class and now my tests are failing. Is anybody able to
see what I'm missing, please?
Paul
package org.musicbrainz.search.analysis;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import java.io.IOException;
/**
 * Token filter that rewrites each term, substituting accented characters with
 * their unaccented equivalents looked up in the {@code UnaccentIndexes},
 * {@code UnaccentPositions} and {@code UnaccentData} tables.
 */
public class AccentFilter extends TokenFilter {

    /** Scratch buffer that receives the rewritten term; replaced by a larger array when needed. */
    private char[] output = new char[256];

    /** Number of valid characters in {@link #output} after the last {@link #removeAccents} call. */
    private int outputPos;

    /** Term attribute shared with the wrapped stream. */
    private final TermAttribute termAttr;

    /**
     * Wraps the given stream and registers the term attribute on it.
     *
     * @param input the token stream whose terms are to be unaccented
     */
    public AccentFilter(TokenStream input) {
        super(input);
        termAttr = (TermAttribute) addAttribute(TermAttribute.class);
    }

    /**
     * Advances the wrapped stream by one token and, when the term contained at
     * least one replaceable character, overwrites the term with the rewritten text.
     *
     * @return {@code false} once the wrapped stream is exhausted, {@code true} otherwise
     * @throws IOException if the wrapped stream fails
     */
    @Override
    public boolean incrementToken() throws IOException {
        if (input.incrementToken()) {
            final boolean changed = removeAccents(termAttr.termBuffer(), termAttr.termLength());
            if (changed) {
                termAttr.setTermBuffer(output, 0, outputPos);
            }
            return true;
        }
        return false;
    }

    /**
     * Builds the unaccented form of {@code input[0..length)} in {@link #output},
     * leaving the resulting length in {@link #outputPos}.
     *
     * @param input  characters of the current term
     * @param length number of valid characters in {@code input}
     * @return {@code true} if at least one character was replaced (the result is
     *         then in {@code output}); {@code false} if the term needs no change
     */
    protected final boolean removeAccents(char[] input, int length) {
        // Target capacity: keep doubling the current size until it reaches 2 * length.
        final int required = 2 * length;
        int capacity = output.length;
        while (capacity < required) {
            capacity *= 2;
        }

        int copiedUpTo = 0; // index of the first input char not yet written to output
        outputPos = 0;

        for (int idx = 0; idx < length; idx++) {
            final int codeUnit = input[idx];

            final int block = UnaccentIndexes.indexes[codeUnit >> UnaccentData.BLOCK_SHIFT];
            final int slot = codeUnit & UnaccentData.BLOCK_MASK;

            final short[] offsets = UnaccentPositions.positions[block];
            final int start = offsets[slot];
            final int replacementLength = offsets[slot + 1] - start;

            if (replacementLength <= 0) {
                // Nothing to substitute for this character; it is copied later in bulk.
                continue;
            }

            // Swap in the larger array lazily; this first happens before anything
            // has been written to output, so no content is lost.
            if (output.length != capacity) {
                output = new char[capacity];
            }

            // Flush the untouched run that precedes this character.
            final int pending = idx - copiedUpTo;
            if (pending > 0) {
                System.arraycopy(input, copiedUpTo, output, outputPos, pending);
                outputPos += pending;
            }

            // Append the replacement characters.
            System.arraycopy(UnaccentData.data[block], start, output, outputPos, replacementLength);
            outputPos += replacementLength;
            copiedUpTo = idx + 1;
        }

        // copiedUpTo only moves when a replacement happened.
        if (copiedUpTo == 0) {
            return false;
        }

        // Flush whatever follows the last replaced character.
        final int tail = length - copiedUpTo;
        if (tail > 0) {
            System.arraycopy(input, copiedUpTo, output, outputPos, tail);
            outputPos += tail;
        }
        return true;
    }
}
to
package org.musicbrainz.search.analysis;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import java.io.IOException;
/**
 * A filter that replaces accented characters by their unaccented equivalents.
 *
 * <p>Each term is looked up character by character in the
 * {@code UnaccentIndexes}, {@code UnaccentPositions} and {@code UnaccentData}
 * tables; when at least one character has a replacement, the rewritten text is
 * copied back into the term attribute.
 */
public class AccentFilter extends TokenFilter {

    /** Scratch buffer holding the rewritten term; replaced by a larger array when needed. */
    private char[] output = new char[256];

    /** Number of valid characters in {@link #output} after the last {@link #removeAccents} call. */
    private int outputPos;

    /** Term attribute shared with the wrapped stream. */
    private final CharTermAttribute termAttr;

    /**
     * Wraps the given stream and registers the term attribute on it.
     *
     * @param input the token stream whose terms are to be unaccented
     */
    public AccentFilter(TokenStream input) {
        super(input);
        // addAttribute is generic, so no cast is needed.
        termAttr = addAttribute(CharTermAttribute.class);
    }

    /**
     * Advances the wrapped stream by one token and replaces the term text with
     * its unaccented form when any character was rewritten.
     *
     * @return {@code false} once the wrapped stream is exhausted, {@code true} otherwise
     * @throws IOException if the wrapped stream fails
     */
    @Override
    public boolean incrementToken() throws IOException {
        if (!input.incrementToken()) {
            return false;
        }
        final char[] buffer = termAttr.buffer();
        final int length = termAttr.length();
        if (removeAccents(buffer, length)) {
            // copyBuffer is the CharTermAttribute counterpart of the old
            // TermAttribute.setTermBuffer(char[], int, int): it copies the
            // characters in and sets the term length. resizeBuffer alone only
            // grows the internal array and leaves the accented term unchanged.
            termAttr.copyBuffer(output, 0, outputPos);
        }
        return true;
    }

    /**
     * Builds the unaccented form of {@code input[0..length)} in {@link #output},
     * leaving the resulting length in {@link #outputPos}.
     *
     * @param input  characters of the current term
     * @param length number of valid characters in {@code input}
     * @return {@code true} if at least one character was replaced (the result is
     *         then in {@code output}); {@code false} if the term needs no change
     */
    protected final boolean removeAccents(char[] input, int length) {
        // NOTE(review): the buffer is sized for at most two output chars per
        // input char; confirm that no entry in UnaccentData expands further.
        final int maxSizeNeeded = 2 * length;
        int size = output.length;
        while (size < maxSizeNeeded) {
            size *= 2;
        }

        int inputPos = 0; // first input char not yet written to output
        outputPos = 0;

        for (int i = 0; i < length; i++) {
            final int c = input[i];

            final int block = UnaccentIndexes.indexes[c >> UnaccentData.BLOCK_SHIFT];
            final int position = c & UnaccentData.BLOCK_MASK;

            final short[] positions = UnaccentPositions.positions[block];
            final int unacPosition = positions[position];
            final int unacLength = positions[position + 1] - unacPosition;

            if (unacLength > 0) {
                // Allocate the larger array lazily; this first happens before
                // anything has been written to output, so nothing is lost.
                if (size != output.length) {
                    output = new char[size];
                }
                // Copy the unchanged run in front of this character.
                if (inputPos < i) {
                    System.arraycopy(input, inputPos, output, outputPos, i - inputPos);
                    outputPos += i - inputPos;
                }
                // Copy the unaccented replacement.
                System.arraycopy(UnaccentData.data[block], unacPosition, output, outputPos, unacLength);
                outputPos += unacLength;
                inputPos = i + 1;
            }
        }

        // inputPos only moves on a replacement, so 0 means no conversion was needed.
        if (inputPos == 0) {
            return false;
        }

        // Copy the rest of the input after the last replaced character.
        final int copyLength = length - inputPos;
        if (copyLength > 0) {
            System.arraycopy(input, inputPos, output, outputPos, copyLength);
            outputPos += copyLength;
        }
        return true;
    }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-user-help@lucene.apache.org
Re: Converting from TermAttribute to CharTermAttribute
Posted by Paul Taylor <pa...@fastmail.fm>.
On 15/09/2011 11:12, Uwe Schindler wrote:
> Hi,
>
> Your incrementToken method differs between the two implementations:
> the original one uses setTermBuffer(), but the new one only resizes the
> buffer and never changes its contents.
>
Thank you, it is now working. I changed the method to:
/**
 * Advances the wrapped stream by one token and, when removeAccents reports a
 * change, overwrites the term with the first outputPos characters of output.
 *
 * @return false once the wrapped stream is exhausted, true otherwise
 * @throws IOException if the wrapped stream fails
 */
@Override
public boolean incrementToken() throws IOException {
    if (input.incrementToken()) {
        final boolean rewritten = removeAccents(termAttr.buffer(), termAttr.length());
        if (rewritten) {
            // Copy the rewritten characters into the attribute.
            termAttr.copyBuffer(output, 0, outputPos);
        }
        return true;
    }
    return false;
}
---------------------------------------------------------------------
To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-user-help@lucene.apache.org
RE: Converting from TermAttribute to CharTermAttribute
Posted by Uwe Schindler <uw...@thetaphi.de>.
Hi,
Your incrementToken method differs between the two implementations:
the original one uses setTermBuffer(), but the new one only resizes the
buffer and never changes its contents.
Uwe
-----
Uwe Schindler
H.-H.-Meier-Allee 63, D-28213 Bremen
http://www.thetaphi.de
eMail: uwe@thetaphi.de
> -----Original Message-----
> From: Paul Taylor [mailto:paul_t100@fastmail.fm]
> Sent: Thursday, September 15, 2011 11:39 AM
> To: 'java-user@lucene.apache.org'
> Subject: Converting from TermAttribute to CharTermAttribute
>
> Have updated from Lucene 3.0 to lucene 3.1 an dnow getting various
> deprecations that Im trying to move
>
> I change this filter class and now my test are failing, anybody able to
see what
> Im missing please
>
> Paul
>
> package org.musicbrainz.search.analysis;
>
> import org.apache.lucene.analysis.TokenFilter;
> import org.apache.lucene.analysis.TokenStream;
> import org.apache.lucene.analysis.tokenattributes.TermAttribute;
>
> import java.io.IOException;
>
> /**
> * A filter that replaces accented characters by their unaccented
equivalents.
> */
> public class AccentFilter extends TokenFilter {
>
> private char[] output = new char[256];
> private int outputPos;
>
> private TermAttribute termAttr;
>
> public AccentFilter(TokenStream input) {
> super(input);
> termAttr = (TermAttribute) addAttribute(TermAttribute.class);
> }
>
> @Override
> public boolean incrementToken() throws IOException {
> if (!input.incrementToken())
> return false;
>
> final char[] buffer = termAttr.termBuffer();
> final int length = termAttr.termLength();
> if (removeAccents(buffer, length)) {
> termAttr.setTermBuffer(output, 0, outputPos);
> }
> return true;
> }
>
> protected final boolean removeAccents(char[] input, int length) {
> final int maxSizeNeeded = 2 * length;
> int size = output.length;
> while (size < maxSizeNeeded)
> size *= 2;
>
> int inputPos = 0;
> outputPos = 0;
>
> for (int i = 0; i < length; i++) {
> int c = (int) input[i];
>
> int block = UnaccentIndexes.indexes[c >>
UnaccentData.BLOCK_SHIFT];
> int position = c & UnaccentData.BLOCK_MASK;
>
> short[] positions = UnaccentPositions.positions[block];
> int unacPosition = positions[position];
> int unacLength = positions[position + 1] - unacPosition;
>
> if (unacLength > 0) {
> // allocate a new char array, if necessary
> if (size != output.length)
> output = new char[size];
> // copy front of the input
> if (inputPos < i) {
> System.arraycopy(input, inputPos, output, outputPos,
i - inputPos);
> outputPos += i - inputPos;
> }
> // copy unaccented data
> System.arraycopy(UnaccentData.data[block], unacPosition,
> output, outputPos, unacLength);
> outputPos += unacLength;
> inputPos = i + 1;
> }
> }
>
> // no conversion needed...
> if (inputPos == 0)
> return false;
>
> // copy rest of the input
> int copyLength = length - inputPos;
> if (copyLength > 0) {
> System.arraycopy(input, inputPos, output, outputPos,
copyLength);
> outputPos += copyLength;
> }
>
> return true;
> }
>
> }
>
> to
>
> package org.musicbrainz.search.analysis;
>
> import org.apache.lucene.analysis.TokenFilter;
> import org.apache.lucene.analysis.TokenStream;
> import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
>
> import java.io.IOException;
>
> /**
> * A filter that replaces accented characters by their unaccented
equivalents.
> */
> public class AccentFilter extends TokenFilter {
>
> private char[] output = new char[256];
> private int outputPos;
>
> private CharTermAttribute termAttr;
>
> public AccentFilter(TokenStream input) {
> super(input);
> termAttr = (CharTermAttribute)
addAttribute(CharTermAttribute.class);
> }
>
> @Override
> public boolean incrementToken() throws IOException {
> if (!input.incrementToken())
> return false;
>
> final char[] buffer = termAttr.buffer();
> final int length = termAttr.length();
> if (removeAccents(buffer, length)) {
> termAttr.resizeBuffer(outputPos);
> }
> return true;
> }
>
> protected final boolean removeAccents(char[] input, int length) {
> final int maxSizeNeeded = 2 * length;
> int size = output.length;
> while (size < maxSizeNeeded)
> size *= 2;
>
> int inputPos = 0;
> outputPos = 0;
>
> for (int i = 0; i < length; i++) {
> int c = (int) input[i];
>
> int block = UnaccentIndexes.indexes[c >>
UnaccentData.BLOCK_SHIFT];
> int position = c & UnaccentData.BLOCK_MASK;
>
> short[] positions = UnaccentPositions.positions[block];
> int unacPosition = positions[position];
> int unacLength = positions[position + 1] - unacPosition;
>
> if (unacLength > 0) {
> // allocate a new char array, if necessary
> if (size != output.length)
> output = new char[size];
> // copy front of the input
> if (inputPos < i) {
> System.arraycopy(input, inputPos, output, outputPos,
i - inputPos);
> outputPos += i - inputPos;
> }
> // copy unaccented data
> System.arraycopy(UnaccentData.data[block], unacPosition,
> output, outputPos, unacLength);
> outputPos += unacLength;
> inputPos = i + 1;
> }
> }
>
> // no conversion needed...
> if (inputPos == 0)
> return false;
>
> // copy rest of the input
> int copyLength = length - inputPos;
> if (copyLength > 0) {
> System.arraycopy(input, inputPos, output, outputPos,
copyLength);
> outputPos += copyLength;
> }
>
> return true;
> }
>
> }
>
>
>
>
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
> For additional commands, e-mail: java-user-help@lucene.apache.org
---------------------------------------------------------------------
To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-user-help@lucene.apache.org