You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by se...@apache.org on 2017/11/08 14:48:40 UTC

svn commit: r1814584 - in /ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core: cr/ resource/

Author: seanfinan
Date: Wed Nov  8 14:48:39 2017
New Revision: 1814584

URL: http://svn.apache.org/viewvc?rev=1814584&view=rev
Log:
Core collection reader updates (refactor)

Added:
    ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/resource/FileReadWriteUtil.java
Modified:
    ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cr/JdbcCollectionReader.java
    ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cr/LinesFromFileCollectionReader.java
    ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cr/TextReader.java
    ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cr/XmiCollectionReaderCtakes.java
    ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/resource/LuceneIndexReaderResourceImpl.java
    ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/resource/StringIntegerMapResourceImpl.java

Modified: ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cr/JdbcCollectionReader.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cr/JdbcCollectionReader.java?rev=1814584&r1=1814583&r2=1814584&view=diff
==============================================================================
--- ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cr/JdbcCollectionReader.java (original)
+++ ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cr/JdbcCollectionReader.java Wed Nov  8 14:48:39 2017
@@ -30,7 +30,10 @@ import org.apache.uima.resource.Resource
 import org.apache.uima.util.Progress;
 import org.apache.uima.util.ProgressImpl;
 
-import java.io.*;
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileReader;
+import java.io.IOException;
 import java.sql.*;
 import java.util.ArrayList;
 import java.util.List;
@@ -268,17 +271,9 @@ public class JdbcCollectionReader extend
 
             try
             {
-                // if there's a CAS Initializer, call it
-                if (getCasInitializer() != null)
-                {
-                    Reader reader = new StringReader(document);
-                    getCasInitializer().initializeCas(reader, cas);
-                } else
-                {
-                    // No CAS Initiliazer, so set document text ourselves.
+               // Set the document text directly; a CAS Initializer is no longer consulted.
                     // put document in CAS (assume CAS)
                     cas.getJCas().setDocumentText(document);
-                }
 
                 DocumentID docIdAnnot = new DocumentID(cas
                         .getJCas());

Modified: ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cr/LinesFromFileCollectionReader.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cr/LinesFromFileCollectionReader.java?rev=1814584&r1=1814583&r2=1814584&view=diff
==============================================================================
--- ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cr/LinesFromFileCollectionReader.java (original)
+++ ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cr/LinesFromFileCollectionReader.java Wed Nov  8 14:48:39 2017
@@ -30,7 +30,9 @@ import org.apache.uima.resource.Resource
 import org.apache.uima.util.Progress;
 import org.apache.uima.util.ProgressImpl;
 
-import java.io.*;
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
 
@@ -173,19 +175,9 @@ public class LinesFromFileCollectionRead
 		  	iv_logger.debug("id="+id);
 		  	iv_logger.debug("text="+text);
 		  	
-			//if there's a CAS Initializer, call it	
-			if (getCasInitializer() != null)
-			{
-				Reader reader = new StringReader(text);
-				getCasInitializer().initializeCas(reader, cas);
-				reader.close();
-			}
-			else  //No CAS Initiliazer, so read file and set document text ourselves
-			{				
 				jcas.setDocumentText(text);
-			}
-		   
-		    //set language if it was explicitly specified as a configuration parameter
+
+          //set language if it was explicitly specified as a configuration parameter
 		    if (iv_language != null)
 		    {
 //		      ((DocumentAnnotation)jcas.getDocumentAnnotationFs()).setLanguage(iv_language);

Modified: ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cr/TextReader.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cr/TextReader.java?rev=1814584&r1=1814583&r2=1814584&view=diff
==============================================================================
--- ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cr/TextReader.java (original)
+++ ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cr/TextReader.java Wed Nov  8 14:48:39 2017
@@ -19,9 +19,9 @@
 package org.apache.ctakes.core.cr;
 
 import org.apache.ctakes.core.pipeline.PipeBitInfo;
+import org.apache.ctakes.core.resource.FileReadWriteUtil;
 import org.apache.ctakes.typesystem.type.structured.DocumentID;
 import org.apache.uima.UimaContext;
-import org.apache.uima.collection.CasInitializer;
 import org.apache.uima.collection.CollectionException;
 import org.apache.uima.fit.component.JCasCollectionReader_ImplBase;
 import org.apache.uima.fit.descriptor.ConfigurationParameter;
@@ -30,7 +30,8 @@ import org.apache.uima.resource.Resource
 import org.apache.uima.util.Progress;
 import org.apache.uima.util.ProgressImpl;
 
-import java.io.*;
+import java.io.File;
+import java.io.IOException;
 import java.util.Iterator;
 import java.util.List;
 
@@ -51,68 +52,48 @@ public class TextReader extends JCasColl
       name = PARAM_FILES,
       mandatory = true,
       description = "The text files to be loaded")
-  private List<File> files;
+  private List<File> _files;
 
-  private Iterator<File> filesIter;
+   private Iterator<File> _filesIter;
 
-  private int completed;
+   private int _completed;
 
   @Override
-  public void initialize(UimaContext context) throws ResourceInitializationException {
-    super.initialize(context);
-    this.filesIter = files.iterator();
-    this.completed = 0;
+  public void initialize( final UimaContext context ) throws ResourceInitializationException {
+     super.initialize( context );
+     _filesIter = _files.iterator();
+     _completed = 0;
   }
 
   @Override
   public Progress[] getProgress() {
-    return new Progress[] { new ProgressImpl(this.completed, this.files.size(), Progress.ENTITIES) };
+     return new Progress[]{ new ProgressImpl( _completed, _files.size(), Progress.ENTITIES ) };
   }
 
   @Override
   public boolean hasNext() throws IOException, CollectionException {
-    return this.filesIter.hasNext();
+     return _filesIter.hasNext();
   }
 
+  /**
+   * Reads the next file into the given JCas and adds a DocumentID annotation holding the file's name.
+   *
+   * @param jCas the JCas that receives the document text and the DocumentID.
+   * @throws IOException if the file cannot be read.
+   */
   @Override
   public void getNext(JCas jCas) throws IOException, CollectionException {
-    File currentFile = this.filesIter.next();
-    String filename = currentFile.getName();
-    FileInputStream fileInputStream = new FileInputStream(currentFile);
-    InputStreamReader inputStreamReader = new InputStreamReader(fileInputStream);
-    BufferedReader bufferedReader = new BufferedReader(inputStreamReader);
-    
-    CasInitializer casInitializer = getCasInitializer();
-
-    if (casInitializer != null)
-    {
-      casInitializer.initializeCas(bufferedReader, jCas.getCas());  
-    }
-    else  //No CAS Initializer, so read file and set document text ourselves
-    {       
-      try
-      {
-        byte[] contents = new byte[(int)currentFile.length() ];
-        fileInputStream.read( contents );   
-        String text;
-        text = new String(contents); 
-        //put document in CAS (assume CAS)
-        jCas.setDocumentText(text);
-      }
-      finally
-      {
-        if (fileInputStream != null)
-          fileInputStream.close();
-      }  
-        
-    }
+     final File currentFile = _filesIter.next();
+     final String filename = currentFile.getName();
+     // Read by full path: the bare file name drops the parent directory.
+     final String text = FileReadWriteUtil.readText( currentFile.getPath() );
+     jCas.setDocumentText( text );
 
-    DocumentID documentIDAnnotation = new DocumentID(jCas);
-    documentIDAnnotation.setDocumentID(filename);
+     final DocumentID documentIDAnnotation = new DocumentID( jCas );
+     documentIDAnnotation.setDocumentID( filename );
     documentIDAnnotation.addToIndexes();
 
-    
-    this.completed += 1;
+     _completed++;
   }
 
   

Modified: ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cr/XmiCollectionReaderCtakes.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cr/XmiCollectionReaderCtakes.java?rev=1814584&r1=1814583&r2=1814584&view=diff
==============================================================================
--- ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cr/XmiCollectionReaderCtakes.java (original)
+++ ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cr/XmiCollectionReaderCtakes.java Wed Nov  8 14:48:39 2017
@@ -58,8 +58,8 @@ public class XmiCollectionReaderCtakes e
   public static final String PARAM_FAILUNKNOWN = "FailOnUnknownType";
 
   private Boolean mFailOnUnknownType;
-  
-  private ArrayList mFiles;
+
+   private ArrayList<File> mFiles;
 
   private int mCurrentIndex;
 
@@ -82,8 +82,11 @@ public class XmiCollectionReaderCtakes e
     }
 
     // get list of .xmi files in the specified directory
-    mFiles = new ArrayList();
+     mFiles = new ArrayList<>();
     File[] files = directory.listFiles();
+     if ( files == null ) {
+        return;
+     }
     for (int i = 0; i < files.length; i++) {
       if (!files[i].isDirectory() && files[i].getName().endsWith(".xmi")) {
         mFiles.add(files[i]);

Added: ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/resource/FileReadWriteUtil.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/resource/FileReadWriteUtil.java?rev=1814584&view=auto
==============================================================================
--- ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/resource/FileReadWriteUtil.java (added)
+++ ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/resource/FileReadWriteUtil.java Wed Nov  8 14:48:39 2017
@@ -0,0 +1,91 @@
+package org.apache.ctakes.core.resource;
+
+
+import org.apache.log4j.Logger;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Writer;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Static helpers for reading and writing whole text files or resources.
+ * All reads and writes use UTF-8, so reading and writing agree on the encoding
+ * regardless of the platform default charset.
+ *
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 10/23/2017
+ */
+final public class FileReadWriteUtil {
+
+   private FileReadWriteUtil() {
+   }
+
+   static private final Logger LOGGER = Logger.getLogger( "FileReadWriteUtil" );
+
+   /**
+    * Reading text from a file or resource is done everywhere, but a common implementation is missing from ctakes.
+    * Every line is terminated with a single {@code "\n"}, so the original line separators are normalized
+    * and any source with at least one line yields text that ends with a newline.
+    *
+    * @param path for file or resource.
+    * @return a single string of text from the file.
+    * @throws IOException if the resource cannot be read.
+    */
+   static public String readText( final String path ) throws IOException {
+      final StringBuilder sb = new StringBuilder();
+      // Both the stream and the reader are managed, so the stream is closed even if wrapping it fails.
+      try ( InputStream stream = FileLocator.getAsStream( path );
+            BufferedReader reader = new BufferedReader( new InputStreamReader( stream, StandardCharsets.UTF_8 ) ) ) {
+         String line;
+         while ( (line = reader.readLine()) != null ) {
+            sb.append( line ).append( "\n" );
+         }
+      }
+      return sb.toString();
+   }
+
+   /**
+    * Reads a file or resource as UTF-8 text, one list entry per line.
+    *
+    * @param path for file or resource.
+    * @return a list containing each line of text in the file, without line terminators.
+    * @throws IOException if the resource cannot be read.
+    */
+   static public List<String> readLines( final String path ) throws IOException {
+      final List<String> lines = new ArrayList<>();
+      try ( InputStream stream = FileLocator.getAsStream( path );
+            BufferedReader reader = new BufferedReader( new InputStreamReader( stream, StandardCharsets.UTF_8 ) ) ) {
+         String line;
+         while ( (line = reader.readLine()) != null ) {
+            lines.add( line );
+         }
+      }
+      return lines;
+   }
+
+   /**
+    * Writing text to a file is done everywhere, but a common implementation is missing from ctakes.
+    * The text is encoded as UTF-8.
+    *
+    * @param text     to be written.
+    * @param filepath for output file.
+    * @throws IOException if the file cannot be written.
+    */
+   static public void writeText( final String text, final String filepath ) throws IOException {
+      final Path path = Paths.get( filepath );
+      try ( Writer writer = Files.newBufferedWriter( path, StandardCharsets.UTF_8 ) ) {
+         writer.write( text );
+      }
+   }
+
+
+}

Modified: ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/resource/LuceneIndexReaderResourceImpl.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/resource/LuceneIndexReaderResourceImpl.java?rev=1814584&r1=1814583&r2=1814584&view=diff
==============================================================================
--- ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/resource/LuceneIndexReaderResourceImpl.java (original)
+++ ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/resource/LuceneIndexReaderResourceImpl.java Wed Nov  8 14:48:39 2017
@@ -18,21 +18,21 @@
  */
 package org.apache.ctakes.core.resource;
 
-import java.io.File;
-
 import org.apache.log4j.Logger;
+import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.store.FSDirectory;
 import org.apache.lucene.store.IOContext;
 import org.apache.lucene.store.RAMDirectory;
-
 import org.apache.uima.resource.DataResource;
 import org.apache.uima.resource.ResourceInitializationException;
 import org.apache.uima.resource.SharedResourceObject;
 import org.apache.uima.resource.metadata.ConfigurationParameterSettings;
 
-/**
+import java.io.File;
+
+/**
  * Oct 2010 - convert to lucene 3.0.2
  * @author Mayo Clinic
  */
@@ -55,7 +55,7 @@ public class LuceneIndexReaderResourceIm
         String indexDirStr = (String) cps.getParameterValue("IndexDirectory");
         try {
 
-            File indexDir = FileLocator.locateFile(indexDirStr);
+           File indexDir = FileLocator.getFile( indexDirStr );
 
             if(!indexDir.exists())
             	iv_logger.info("indexDir="+indexDirStr+"  does not exist!");
@@ -64,15 +64,17 @@ public class LuceneIndexReaderResourceIm
             
             if (useMemoryIndex.booleanValue()) {
 
-                iv_logger.info("Loading Lucene Index into memory: " + indexDir);
+                iv_logger.info("Loading Lucene Index into memory: " + indexDir);
                 FSDirectory fsd = FSDirectory.open(indexDir);
                 Directory d = new RAMDirectory(fsd, IOContext.DEFAULT);
-                iv_indexReader = IndexReader.open(d);
+//                iv_indexReader = IndexReader.open(d);
+               iv_indexReader = DirectoryReader.open( d );
             }
             else {
                 iv_logger.info("Loading Lucene Index: " + indexDir);
-                FSDirectory fsd = FSDirectory.open(indexDir);
-                iv_indexReader = IndexReader.open(fsd);
+                FSDirectory fsd = FSDirectory.open(indexDir);
+//                iv_indexReader = IndexReader.open(fsd);
+               iv_indexReader = DirectoryReader.open( fsd );
             }
             iv_logger.info("Loaded Lucene Index, # docs=" + iv_indexReader.numDocs());
         }

Modified: ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/resource/StringIntegerMapResourceImpl.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/resource/StringIntegerMapResourceImpl.java?rev=1814584&r1=1814583&r2=1814584&view=diff
==============================================================================
--- ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/resource/StringIntegerMapResourceImpl.java (original)
+++ ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/resource/StringIntegerMapResourceImpl.java Wed Nov  8 14:48:39 2017
@@ -18,6 +18,11 @@
  */
 package org.apache.ctakes.core.resource;
 
+import org.apache.log4j.Logger;
+import org.apache.uima.resource.DataResource;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.apache.uima.resource.SharedResourceObject;
+
 import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.InputStream;
@@ -26,12 +31,6 @@ import java.util.HashMap;
 import java.util.Map;
 import java.util.StringTokenizer;
 
-import org.apache.log4j.Logger;
-
-import org.apache.uima.resource.DataResource;
-import org.apache.uima.resource.ResourceInitializationException;
-import org.apache.uima.resource.SharedResourceObject;
-
 /**
  * Implementation for StringIntegerMapResource interface.  
  * 
@@ -98,7 +97,7 @@ public class StringIntegerMapResourceImp
     /**
      * Gets a map of the String/Integer values.
      */
-    public Map getMap()
+    public Map<String, Integer> getMap()
     {
         return iv_map;
     }