You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by tm...@apache.org on 2015/10/02 20:52:31 UTC
svn commit: r1706466 - /ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cr/LuceneCollectionReader.java

Author: tmill
Date: Fri Oct  2 18:52:31 2015
New Revision: 1706466

URL: http://svn.apache.org/viewvc?rev=1706466&view=rev
Log:
Added collection reader for lucene indices containing whole documents.

Added:
    ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cr/LuceneCollectionReader.java

Added: ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cr/LuceneCollectionReader.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cr/LuceneCollectionReader.java?rev=1706466&view=auto
==============================================================================
--- ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cr/LuceneCollectionReader.java (added)
+++ ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cr/LuceneCollectionReader.java Fri Oct  2 18:52:31 2015
@@ -0,0 +1,110 @@
+package org.apache.ctakes.core.cr;
+
+import java.io.File;
+import java.io.IOException;
+
+import org.apache.ctakes.typesystem.type.structured.DocumentID;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexableField;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
+import org.apache.uima.UimaContext;
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.CASException;
+import org.apache.uima.collection.CollectionException;
+import org.apache.uima.fit.component.CasCollectionReader_ImplBase;
+import org.apache.uima.fit.descriptor.ConfigurationParameter;
+import org.apache.uima.internal.util.XMLUtils;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.apache.uima.util.Progress;
+import org.apache.uima.util.ProgressImpl;
+
+public class LuceneCollectionReader extends CasCollectionReader_ImplBase {
+
+  public static final String PARAM_INDEX_DIR = "IndexDirectory";
+  @ConfigurationParameter(
+      name = PARAM_INDEX_DIR,
+      description = "Location of lucene index",
+      mandatory = true
+      )
+  private String indexDir;
+  
+  public static final String PARAM_FIELD_NAME = "FieldName";
+  @ConfigurationParameter(
+      name = PARAM_FIELD_NAME,
+      description = "Field to look in for document text",
+      mandatory = false
+      )
+  private String fieldName = "text";
+  
+  public static final String PARAM_MAX_WORDS = "MaxWords";
+  @ConfigurationParameter(
+      name = PARAM_MAX_WORDS,
+      description = "Maximum number of words to process (approximate -- actually based on characters)",
+      mandatory = false
+      )
+  private int maxWords = -1;
+  
+  private int docNum = 0;
+  private DirectoryReader ireader = null;
+  private int wordNum = 0;
+  public static final int CHARS_PER_WORD = 6;
+  
+  @Override
+  public void initialize(UimaContext context)
+      throws ResourceInitializationException {
+    super.initialize(context);
+    
+    Directory dir;
+    try {
+      dir = FSDirectory.open(new File(indexDir));
+      ireader = DirectoryReader.open(dir);
+    } catch (IOException e) {
+      e.printStackTrace();
+      throw new ResourceInitializationException(e);
+    }
+ }
+
+  @Override
+  public void getNext(CAS cas) throws IOException, CollectionException {
+    JCas jcas = null;
+    try {
+      jcas = cas.getJCas();
+    } catch (CASException e) {
+      e.printStackTrace();
+      throw new IOException(e);
+    }
+    
+    Document doc = ireader.document(docNum++);
+    IndexableField textField = doc.getField(fieldName);
+    while(textField == null){
+      doc = ireader.document(docNum++);
+      textField = doc.getField(fieldName);
+    }
+    StringBuffer text = new StringBuffer(textField.stringValue());
+    int pos;
+    while((pos = XMLUtils.checkForNonXmlCharacters(text.toString())) != -1){
+      text.setCharAt(pos, ' ');
+    }
+    jcas.setDocumentText(text.toString().replaceAll("__+", " "));
+    DocumentID docId = new DocumentID(jcas);
+    docId.setDocumentID("doc" + docNum);
+    docId.addToIndexes();
+    
+    wordNum += text.length() / CHARS_PER_WORD;
+  }
+
+  @Override
+  public Progress[] getProgress() {
+    return new Progress[]{ (maxWords < 0 ? new ProgressImpl(docNum, ireader.numDocs(), "Documents") : new ProgressImpl(wordNum, maxWords, "Words"))};
+  }
+
+  @Override
+  public boolean hasNext() throws IOException, CollectionException {
+    return (docNum < ireader.numDocs()) &&
+        (maxWords < 0 || wordNum < maxWords);
+  }
+
+}