You are viewing a plain text version of this content. The canonical link for it is here.
Posted to solr-user@lucene.apache.org by dhaivat dave <dh...@gmail.com> on 2013/08/13 15:52:07 UTC

issue with custom tokenizer

Hello All,

I am trying to develop custom tokeniser (please find code below) and found
some issue while adding multiple document one after another.

it works fine when i add first document and when i add another document
it's not calling "create" method from SampleTokeniserFactory.java but it
calls directly reset method and then call incrementToken(). any one have an
idea on this what's wrong in the code below?  please share your thoughts on
this.

here is the class which extends TokeniserFactory class

=== SampleTokeniserFactory.java

public class SampleTokeniserFactory extends TokenizerFactory {

public SampleTokeniserFactory(Map<String, String> args) {
super(args);
}

public SampleTokeniser create(AttributeFactory factory, Reader reader) {
return new SampleTokeniser(factory, reader);
}

}

here is the class which extends Tokenizer class
====

package ns.solr.analyser;

import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import
org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

public class SampleTokeniser extends Tokenizer {

private List<Token> tokenList = new ArrayList<Token>();

int tokenCounter = -1;

private final CharTermAttribute termAtt =
addAttribute(CharTermAttribute.class);

/**
 * Object that defines the offset attribute
 */
private final OffsetAttribute offsetAttribute = (OffsetAttribute)
addAttribute(OffsetAttribute.class);

/**
 * Object that defines the position attribute
 */
private final PositionIncrementAttribute position =
(PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);

public SampleTokeniser(AttributeFactory factory, Reader reader) {
super(factory, reader);
String textToProcess = null;
try {
textToProcess = readFully(reader);
processText(textToProcess);
} catch (IOException e) {
e.printStackTrace();
}

}

public String readFully(Reader reader) throws IOException {
char[] arr = new char[8 * 1024]; // 8K at a time
StringBuffer buf = new StringBuffer();
int numChars;
while ((numChars = reader.read(arr, 0, arr.length)) > 0) {
buf.append(arr, 0, numChars);
}
return buf.toString();
}

public void processText(String textToProcess) {

String wordsList[] = textToProcess.split(" ");

int startOffset = 0, endOffset = 0;

for (String word : wordsList) {

endOffset = word.length();

Token aToken = new Token("Token." + word, startOffset, endOffset);

aToken.setPositionIncrement(1);

tokenList.add(aToken);

startOffset = endOffset + 1;
}
}

@Override
public boolean incrementToken() throws IOException {

clearAttributes();
tokenCounter++;

if (tokenCounter < tokenList.size()) {
Token aToken = tokenList.get(tokenCounter);

termAtt.append(aToken);
termAtt.setLength(aToken.length());
offsetAttribute.setOffset(correctOffset(aToken.startOffset()),
correctOffset(aToken.endOffset()));
position.setPositionIncrement(aToken.getPositionIncrement());
return true;
}

return false;
}

/**
 * close object
 *
 * @throws IOException
 */
public void close() throws IOException {
super.close();
System.out.println("Close method called");

}

/**
 * called when end method gets called
 *
 * @throws IOException
 */
public void end() throws IOException {
super.end();
// setting final offset
System.out.println("end called with final offset");
}

/**
 * method reset the record
 *
 * @throws IOException
 */
public void reset() throws IOException {
super.reset();
System.out.println("Reset Called");
tokenCounter = -1;

}
}