You are viewing a plain text version of this content. The canonical link for it is here.
Posted to solr-user@lucene.apache.org by Abhishek Pratap Singh <ab...@gmail.com> on 2013/04/06 11:26:53 UTC
How to generate multiple tokens on same position through TokenFilter
Hi All,
Objective: I want to create a filter to generate multiple tokens (mentioned
below) of input stream and I want to put all generated tokens at same
position i.e. 1.
Although there is already a tokenizer (PathHierarchyTokenizerFactory) for
similar purpose but I also want my tokens to be stemmed so to achieve my
objective I created a filter, please look at the source code below (I am
not an Java expert, so code may not be optimized):
// File: ExtendedNameFilter.java
// Purpose: To combine multiple tokens such that "apache solr foundation"
generates tokens "apachsolrfoundat", "solrfoundat", "foundat"
package org.apache.lucene.analysis;
import java.io.IOException;
import java.util.LinkedList;
import java.util.ArrayList;
import java.util.Iterator;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharacterUtils;
import org.apache.lucene.util.Version;
import
org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
public final class ExtendedNameFilter extends TokenFilter {
private final CharTermAttribute termAtt =
addAttribute(CharTermAttribute.class);
private PositionIncrementAttribute posIncAttr;
private OffsetAttribute setOffsetAttr;
private final int extendedWordCount;
public ExtendedNameFilter(Version matchVersion, TokenStream in, int
extendedWordCount) {
super(in);
CharacterUtils.getInstance(matchVersion);
this.extendedWordCount = extendedWordCount;
this.posIncAttr = addAttribute(PositionIncrementAttribute.class);
this.setOffsetAttr = addAttribute(OffsetAttribute.class);
}
LinkedList<String> list = new LinkedList<String>();
ArrayList<Integer> startOffsetList = new ArrayList<Integer>();
int endOffset = 0;
int count = 0;
@Override
public final boolean incrementToken() throws IOException {
Iterator<String> iterator;
int len = 0;
while(input.incrementToken()) {
list.add(termAtt.toString());
startOffsetList.add(setOffsetAttr.startOffset());
endOffset = setOffsetAttr.endOffset();
}
iterator = list.iterator();
len = list.size();
if (len > 0 && (extendedWordCount < 0 || count < extendedWordCount)) {
generateToken(iterator);
return true;
}
else {
return false;
}
}
public void generateToken(Iterator<String> iterator) {
termAtt.setEmpty();
while (iterator.hasNext()){
termAtt.append((CharSequence) iterator.next());
}
list.removeFirst();
if(count == 0) {
posIncAttr.setPositionIncrement(1);
}
else {
posIncAttr.setPositionIncrement(0);
}
setOffsetAttr.setOffset(startOffsetList.get(count),endOffset);
count++;
}
}
// Code Ends
On analysis page of solr it worked fine, I've shared screenshot of analysis
page on google, anyone can see this by click on below link
https://docs.google.com/file/d/0BxNUkIJt2ma3TUN0YUF1dW1Pc2s/edit?usp=sharing<https://docs.google.com/file/d/0BxNUkIJt2ma3SEE2SDBLTkpETE0/edit?usp=sharing>
but while indexing documents Solr gives following exception:
Apr 6, 2013 12:05:45 PM org.apache.solr.common.SolrException log
SEVERE: java.lang.IllegalArgumentException: first position increment must
be > 0 (got 0)
at
org.apache.lucene.index.DocInverterPerField.processFields(DocInverterPerField.java:125)
at
org.apache.lucene.index.DocFieldProcessor.processDocument(DocFieldProcessor.java:254)
at
org.apache.lucene.index.DocumentsWriterPerThread.updateDocument(DocumentsWriterPerThread.java:256)
at
org.apache.lucene.index.DocumentsWriter.updateDocument(DocumentsWriter.java:376)
at
org.apache.lucene.index.IndexWriter.updateDocument(IndexWriter.java:1473)
at
org.apache.solr.update.DirectUpdateHandler2.addDoc(DirectUpdateHandler2.java:206)
at
org.apache.solr.update.processor.RunUpdateProcessor.processAdd(RunUpdateProcessorFactory.java:69)
at
org.apache.solr.update.processor.UpdateRequestProcessor.processAdd(UpdateRequestProcessor.java:51)
at
org.apache.solr.update.processor.DistributedUpdateProcessor.versionAdd(DistributedUpdateProcessor.java:477)
at
org.apache.solr.update.processor.DistributedUpdateProcessor.processAdd(DistributedUpdateProcessor.java:346)
at
org.apache.solr.update.processor.LogUpdateProcessor.processAdd(LogUpdateProcessorFactory.java:100)
at
org.apache.solr.handler.loader.XMLLoader.processUpdate(XMLLoader.java:246)
at org.apache.solr.handler.loader.XMLLoader.load(XMLLoader.java:173)
at
org.apache.solr.handler.UpdateRequestHandler$1.load(UpdateRequestHandler.java:92)
at
org.apache.solr.handler.ContentStreamHandlerBase.handleRequestBody(ContentStreamHandlerBase.java:74)
at
org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:135)
at org.apache.solr.core.SolrCore.execute(SolrCore.java:1797)
at
org.apache.solr.servlet.SolrDispatchFilter.execute(SolrDispatchFilter.java:637)
at
org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:343)
at
org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:141)
at
org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1307)
at
org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:453)
at
org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:137)
at
org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:560)
at
org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:231)
at
org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1072)
at
org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:382)
at
org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:193)
at
org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1006)
at
org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:135)
at
org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:255)
at
org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:154)
at
org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:116)
at org.eclipse.jetty.server.Server.handle(Server.java:365)
at
org.eclipse.jetty.server.AbstractHttpConnection.handleRequest(AbstractHttpConnection.java:485)
at
org.eclipse.jetty.server.BlockingHttpConnection.handleRequest(BlockingHttpConnection.java:53)
at
org.eclipse.jetty.server.AbstractHttpConnection.headerComplete(AbstractHttpConnection.java:926)
at
org.eclipse.jetty.server.AbstractHttpConnection$RequestHandler.headerComplete(AbstractHttpConnection.java:988)
at org.eclipse.jetty.http.HttpParser.parseNext(HttpParser.java:642)
at
org.eclipse.jetty.http.HttpParser.parseAvailable(HttpParser.java:235)
at
org.eclipse.jetty.server.BlockingHttpConnection.handle(BlockingHttpConnection.java:72)
at
org.eclipse.jetty.server.bio.SocketConnector$ConnectorEndPoint.run(SocketConnector.java:264)
at
org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:608)
at
org.eclipse.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:543)
at java.lang.Thread.run(Unknown Source)
am I doing something wrong in code, please guide to overcome this exception.
I am also not clear if it is related to the blank starting tokens in the
last filter chain output.
--
Regards
Abhishek Pratap Singh