Posted to solr-user@lucene.apache.org by Abhishek Pratap Singh <ab...@gmail.com> on 2013/04/06 11:26:53 UTC

How to generate multiple tokens at the same position through a TokenFilter

Hi All,

Objective: I want to create a filter that generates multiple tokens (described
below) from the input stream, and I want to put all of the generated tokens at
the same position, i.e. position 1.
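
For example, this is the output I am aiming for (the token text is taken from
the example in the code comment below; the exact stems depend on the stemmer
used earlier in the chain):

    Input: "apache solr foundation"
    Desired tokens (after stemming), all stacked at position 1:
      "apachsolrfoundat"   positionIncrement = 1   (first token advances to position 1)
      "solrfoundat"        positionIncrement = 0   (stacked at the same position)
      "foundat"            positionIncrement = 0   (stacked at the same position)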

There is already a tokenizer (PathHierarchyTokenizerFactory) for a similar
purpose, but I also want my tokens to be stemmed, so to achieve my objective I
created a filter. Please look at the source code below (I am not a Java expert,
so the code may not be optimized):


// File: ExtendedNameFilter.java
// Purpose: to combine the remaining buffered tokens into one token per step, so that
// "apache solr foundation" generates the tokens "apachsolrfoundat", "solrfoundat", "foundat"

package org.apache.lucene.analysis;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedList;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.util.CharacterUtils;
import org.apache.lucene.util.Version;

public final class ExtendedNameFilter extends TokenFilter {

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final PositionIncrementAttribute posIncAttr = addAttribute(PositionIncrementAttribute.class);
  private final OffsetAttribute offsetAttr = addAttribute(OffsetAttribute.class);
  private final int extendedWordCount;

  // Buffered state: all tokens of the underlying stream are read up front,
  // then emitted as progressively shorter concatenations.
  private final LinkedList<String> list = new LinkedList<String>();
  private final ArrayList<Integer> startOffsetList = new ArrayList<Integer>();
  private int endOffset = 0;
  private int count = 0;

  public ExtendedNameFilter(Version matchVersion, TokenStream in, int extendedWordCount) {
    super(in);
    CharacterUtils.getInstance(matchVersion);
    this.extendedWordCount = extendedWordCount;
  }

  @Override
  public final boolean incrementToken() throws IOException {
    // Consume the whole input stream and buffer the terms and their offsets.
    while (input.incrementToken()) {
      list.add(termAtt.toString());
      startOffsetList.add(offsetAttr.startOffset());
      endOffset = offsetAttr.endOffset();
    }

    // Emit one concatenated token per call until the buffer is empty or the
    // configured limit is reached (a negative limit means no limit).
    if (!list.isEmpty() && (extendedWordCount < 0 || count < extendedWordCount)) {
      generateToken(list.iterator());
      return true;
    } else {
      return false;
    }
  }

  // Concatenates all buffered terms into a single token, then drops the first
  // buffered term so that the next call produces a shorter concatenation.
  private void generateToken(Iterator<String> iterator) {
    termAtt.setEmpty();
    while (iterator.hasNext()) {
      termAtt.append(iterator.next());
    }
    list.removeFirst();

    // The first generated token advances the position by one; all of the
    // following tokens are stacked on the same position.
    if (count == 0) {
      posIncAttr.setPositionIncrement(1);
    } else {
      posIncAttr.setPositionIncrement(0);
    }

    offsetAttr.setOffset(startOffsetList.get(count), endOffset);
    count++;
  }
}


// Code Ends
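
For reference, a minimal sketch of how the filter could be driven directly from
Lucene (assuming Lucene 4.x; the demo class name, the field name "name", the
choice of whitespace tokenizer plus Porter stemmer, and the Version constant
are just for illustration and are not my actual Solr configuration):

// File: ExtendedNameFilterDemo.java (illustrative sketch only)
import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.en.PorterStemFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.Version;

public class ExtendedNameFilterDemo {
  public static void main(String[] args) throws Exception {
    Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        // Chain: whitespace tokenizer -> Porter stemmer -> ExtendedNameFilter.
        // Use whichever Version constant matches your Solr/Lucene release.
        Tokenizer source = new WhitespaceTokenizer(Version.LUCENE_40, reader);
        TokenStream chain = new PorterStemFilter(source);
        // -1 means "no limit on the number of generated tokens" in my filter.
        chain = new ExtendedNameFilter(Version.LUCENE_40, chain, -1);
        return new TokenStreamComponents(source, chain);
      }
    };

    // Print each generated token together with its position increment.
    TokenStream ts = analyzer.tokenStream("name", new StringReader("apache solr foundation"));
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posInc = ts.addAttribute(PositionIncrementAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(term.toString() + " (posInc=" + posInc.getPositionIncrement() + ")");
    }
    ts.end();
    ts.close();
  }
}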



On the Solr analysis page it worked fine. I've shared a screenshot of the
analysis page on Google Docs; anyone can view it via the link below:
https://docs.google.com/file/d/0BxNUkIJt2ma3TUN0YUF1dW1Pc2s/edit?usp=sharing

But while indexing documents, Solr throws the following exception:

Apr 6, 2013 12:05:45 PM org.apache.solr.common.SolrException log
SEVERE: java.lang.IllegalArgumentException: first position increment must be > 0 (got 0)
        at org.apache.lucene.index.DocInverterPerField.processFields(DocInverterPerField.java:125)
        at org.apache.lucene.index.DocFieldProcessor.processDocument(DocFieldProcessor.java:254)
        at org.apache.lucene.index.DocumentsWriterPerThread.updateDocument(DocumentsWriterPerThread.java:256)
        at org.apache.lucene.index.DocumentsWriter.updateDocument(DocumentsWriter.java:376)
        at org.apache.lucene.index.IndexWriter.updateDocument(IndexWriter.java:1473)
        at org.apache.solr.update.DirectUpdateHandler2.addDoc(DirectUpdateHandler2.java:206)
        at org.apache.solr.update.processor.RunUpdateProcessor.processAdd(RunUpdateProcessorFactory.java:69)
        at org.apache.solr.update.processor.UpdateRequestProcessor.processAdd(UpdateRequestProcessor.java:51)
        at org.apache.solr.update.processor.DistributedUpdateProcessor.versionAdd(DistributedUpdateProcessor.java:477)
        at org.apache.solr.update.processor.DistributedUpdateProcessor.processAdd(DistributedUpdateProcessor.java:346)
        at org.apache.solr.update.processor.LogUpdateProcessor.processAdd(LogUpdateProcessorFactory.java:100)
        at org.apache.solr.handler.loader.XMLLoader.processUpdate(XMLLoader.java:246)
        at org.apache.solr.handler.loader.XMLLoader.load(XMLLoader.java:173)
        at org.apache.solr.handler.UpdateRequestHandler$1.load(UpdateRequestHandler.java:92)
        at org.apache.solr.handler.ContentStreamHandlerBase.handleRequestBody(ContentStreamHandlerBase.java:74)
        at org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:135)
        at org.apache.solr.core.SolrCore.execute(SolrCore.java:1797)
        at org.apache.solr.servlet.SolrDispatchFilter.execute(SolrDispatchFilter.java:637)
        at org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:343)
        at org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:141)
        at org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1307)
        at org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:453)
        at org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:137)
        at org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:560)
        at org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:231)
        at org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1072)
        at org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:382)
        at org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:193)
        at org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1006)
        at org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:135)
        at org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:255)
        at org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:154)
        at org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:116)
        at org.eclipse.jetty.server.Server.handle(Server.java:365)
        at org.eclipse.jetty.server.AbstractHttpConnection.handleRequest(AbstractHttpConnection.java:485)
        at org.eclipse.jetty.server.BlockingHttpConnection.handleRequest(BlockingHttpConnection.java:53)
        at org.eclipse.jetty.server.AbstractHttpConnection.headerComplete(AbstractHttpConnection.java:926)
        at org.eclipse.jetty.server.AbstractHttpConnection$RequestHandler.headerComplete(AbstractHttpConnection.java:988)
        at org.eclipse.jetty.http.HttpParser.parseNext(HttpParser.java:642)
        at org.eclipse.jetty.http.HttpParser.parseAvailable(HttpParser.java:235)
        at org.eclipse.jetty.server.BlockingHttpConnection.handle(BlockingHttpConnection.java:72)
        at org.eclipse.jetty.server.bio.SocketConnector$ConnectorEndPoint.run(SocketConnector.java:264)
        at org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:608)
        at org.eclipse.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:543)
        at java.lang.Thread.run(Unknown Source)

Am I doing something wrong in the code? Please guide me on how to overcome this exception.


I am also not sure whether it is related to the blank tokens at the start of
the output of the last filter in the chain.
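
One thing I wonder about: Solr reuses the token stream across documents, and my
filter never clears its buffered state, so after the first document `count`
stays greater than zero and the first token of the next document would get a
position increment of 0. A minimal sketch of the reset() override I am
considering adding to ExtendedNameFilter (untested; please correct me if this
is not the right place to clear the state):

  // Clear the buffered state when the stream is reused for the next document,
  // so that the first emitted token gets positionIncrement = 1 again.
  @Override
  public void reset() throws IOException {
    super.reset();
    list.clear();
    startOffsetList.clear();
    endOffset = 0;
    count = 0;
  }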


-- 
Regards
Abhishek Pratap Singh