Posted to java-user@lucene.apache.org by nischal reddy <ni...@gmail.com> on 2013/09/02 09:52:48 UTC
Problem with Custom analyzer
Hi,
I have created a custom analyzer with a custom tokenizer that reads ANTLR tokens
from a file and converts them into Lucene tokens by setting them on the
attribute source.
It works fine when I add one document to the index: I can search with a query
and get hits.
The problem comes when I add another document: the custom tokenizer still seems
to hold the old reader instance, positioned at the end of the first file, so it
does not tokenize the contents of subsequently added files.
My document looks something like this:
Document doc = ...
doc.add(new StringField(FIELD_FILE_PATH, getIndexFilePath(resource), Store.YES));
doc.add(new StringField(FIELD_FILE_TYPE, ifile.getFileExtension().toLowerCase(), Store.YES));

FieldType fieldType = new FieldType();
fieldType.setStoreTermVectors(true);
fieldType.setStoreTermVectorOffsets(true);
fieldType.setStoreTermVectorPositions(true);
fieldType.setStoreTermVectorPayloads(true);
fieldType.setIndexed(true);
fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
doc.add(new Field(FIELD_CONTENTS, new FileReader(file), fieldType));
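For context, the indexing side looks roughly like this (a simplified sketch,
not my exact code - filesToIndex and buildDocument() are illustrative names,
and I am assuming the Lucene 4.x writer APIs):

// Simplified sketch of the indexing loop (illustrative names, Lucene 4.x APIs).
Directory dir = FSDirectory.open(new File("/path/to/index"));
IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_44, new FilesAnalyzer());
IndexWriter writer = new IndexWriter(dir, config);
for (File file : filesToIndex) {
    Document doc = buildDocument(file); // builds the document as shown above
    writer.addDocument(doc); // from the second document on, nothing gets tokenized
}
writer.close();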
My Custom Analyzer:

public class FilesAnalyzer extends Analyzer {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        return new TokenStreamComponents(new FilesTokenizer(reader));
    }
}
My Tokenizer:

public class FilesTokenizer extends Tokenizer {

    /**
     * Tokenizer constants.
     */
    // TODO just the name of the included file - not the path relative to the
    // current propath
    // TODO also handle the case of spaces - "" or '' appears as {my.i} or
    // similar (including params and other junk)
    public static final String INCLUDE_NAME = "include_name";

    // TODO the name of the procedure to run - can this also be given as a
    // path a la include?
    // TODO also handle the case of spaces - "" or '' appears as RUN myProc.p
    public static final String PROC_NAME = "proc_name";

    public static final String[] ALL_TOKEN_TYPES = new String[] { INCLUDE_NAME, PROC_NAME };

    private boolean done = false;
    private Reader input = null;
    protected Lexer lexer;

    // Token term attributes
    private CharTermAttribute charTermAttribute = addAttribute(CharTermAttribute.class);
    private OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
    private TypeAttribute typeAttribute = addAttribute(TypeAttribute.class);
    private PayloadAttribute payloadAttribute = addAttribute(PayloadAttribute.class);
    protected FilesTokenizer(Reader in) {
        super(in);
        if (in instanceof BufferedReader) {
            this.input = in;
        } else {
            this.input = new BufferedReader(in);
        }
        ASTInfo astInfo = new ASTInfo(null);
        astInfo.setMonitor(new NullProgressMonitor());
        // files etc which is not really needed
        lexer = new Lexer(input) {
            int totalOffset = 0;
            int currentLineOffset = 0;

            @Override
            public antlr.Token nextToken() throws TokenStreamException {
                antlr.Token token = super.nextToken();
                // TODO move to interface IASTToken - methods get/setOffset
                AntlrToken myToken = new AntlrToken(token);
                myToken.setOffset(totalOffset);
                return myToken;
            }

            @Override
            public void consume() throws CharStreamException {
                super.consume();
                currentLineOffset = inputState.getColumn();
            }

            @Override
            public void match(char arg0) throws MismatchedCharException, CharStreamException {
                // Track the running character offset across line breaks.
                if (arg0 == '\n') {
                    totalOffset += currentLineOffset;
                    currentLineOffset = 0;
                }
                super.match(arg0);
            }

            @Override
            public void setColumn(int c) {
                super.setColumn(c);
                currentLineOffset += c;
            }
        };
        lexer.setASTInfo(astInfo);
    }
    // We are not interested in all antlr tokens - keep consuming antlr tokens
    // till we find a token of interest, either an include or a run.
    public final TokenType next() throws java.io.IOException {
        antlr.Token nextAntlrToken = null;
        try {
            nextAntlrToken = lexer.nextToken();
            int type = nextAntlrToken.getType();
            while (type != TokenTypes.EOF) {
                if (type == ParserTokenTypes.RUN) { // RUN <procedure>
                    nextAntlrToken = lexer.nextToken();
                    type = nextAntlrToken.getType();
                    if (type == TokenTypes.IDENT) {
                        // TODO move to interface IASTToken - methods get/setOffset
                        int offset = ((AntlrToken) nextAntlrToken).getOffset();
                        // TODO handle the case of a value expression here
                        String text = nextAntlrToken.getText();
                        return new TokenType(text, offset, offset + text.length(), PROC_NAME);
                    }
                }
                // TODO use a proper token type - probably INCLUDE__REF
                if (type == TokenTypes.IDENT) {
                    // include or run for building a digraph; we would need to
                    // index all identifiers anyway.
                    // TODO should we include more info (like global)? this will
                    // make it pretty complicated and will have performance
                    // issues - we would need to "remember" too many tokens in
                    // that case.
                    // Case 1: include - the identifier is of the form
                    // {/abc/d/e.p "someArg"} - we need to store both /abc/d/e.p
                    // and e.p; note that most customer workspaces don't have
                    // repeating filenames.
                    // TODO also handle the case of spaces in the include name
                    String text = nextAntlrToken.getText();
                    if (text.startsWith("{")) { // include
                        // TODO handle all possible cases in the path (spaces, ...) -
                        // preferably use a regex here
                        if (text.contains("/")) {
                            text = text.substring(text.lastIndexOf('/') + 1);
                        }
                        if (text.indexOf(' ') != -1) {
                            text = text.substring(0, text.indexOf(' '));
                        }
                        if (text.startsWith("{")) {
                            text = text.substring(1);
                        }
                        if (text.endsWith("}")) {
                            text = text.substring(0, text.length() - 1);
                        }
                        int lineOffset = ((AntlrToken) nextAntlrToken).getOffset();
                        lineOffset += nextAntlrToken.getText().indexOf(text);
                        // {optional "optional /path / name of file"}
                        return new TokenType(text, lineOffset, lineOffset + text.length(), INCLUDE_NAME);
                    } else {
                        nextAntlrToken = lexer.nextToken();
                        type = nextAntlrToken.getType();
                    }
                } else {
                    nextAntlrToken = lexer.nextToken();
                    type = nextAntlrToken.getType();
                }
            }
        } catch (TokenStreamException e) {
            // TODO proper error handling instead of printing the stack trace
            e.printStackTrace();
        }
        return null;
    }
    @Override
    public boolean incrementToken() throws IOException {
        clearAttributes();
        TokenType token = next();
        if (token != null) {
            charTermAttribute.append(token.getTokenText());
            offsetAttribute.setOffset(token.getStartOffset(), token.getEndOffset());
            typeAttribute.setType(token.getType());
            payloadAttribute.setPayload(new BytesRef(token.getType().getBytes()));
            return true;
        }
        return false;
    }
}
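For reference, TokenType is just a small value holder; its shape (reconstructed
here from how it is used above) is roughly:

// Holder for a token of interest (shape inferred from the usage above).
public class TokenType {
    private final String tokenText;
    private final int startOffset;
    private final int endOffset;
    private final String type; // INCLUDE_NAME or PROC_NAME

    public TokenType(String tokenText, int startOffset, int endOffset, String type) {
        this.tokenText = tokenText;
        this.startOffset = startOffset;
        this.endOffset = endOffset;
        this.type = type;
    }

    public String getTokenText() { return tokenText; }
    public int getStartOffset() { return startOffset; }
    public int getEndOffset() { return endOffset; }
    public String getType() { return type; }
}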
Should I be resetting the reader input somewhere? Am I missing anything here?
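Concretely, I am wondering whether I need a reset() override along these lines
(an untested sketch, assuming the Lucene 4.x setReader()/reset() contract;
createLexer() is a hypothetical helper that would wrap the anonymous Lexer
construction from the constructor):

@Override
public void reset() throws IOException {
    super.reset();
    // 'input' is the protected Reader field inherited from Tokenizer; the
    // analyzer swaps it via setReader() when TokenStreamComponents are
    // reused, so the lexer must be rebuilt here, not only in the constructor.
    lexer = createLexer(input); // hypothetical factory for the anonymous Lexer above
    done = false;
}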
TIA,
Nischal Y