You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by se...@apache.org on 2017/08/08 18:26:43 UTC

svn commit: r1804455 - /ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cr/FileTreeReader.java

Author: seanfinan
Date: Tue Aug  8 18:26:43 2017
New Revision: 1804455

URL: http://svn.apache.org/viewvc?rev=1804455&view=rev
Log:
Attempt to read with path, util.stream and CharSet

Modified:
    ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cr/FileTreeReader.java

Modified: ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cr/FileTreeReader.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cr/FileTreeReader.java?rev=1804455&r1=1804454&r2=1804455&view=diff
==============================================================================
--- ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cr/FileTreeReader.java (original)
+++ ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cr/FileTreeReader.java Tue Aug  8 18:26:43 2017
@@ -19,10 +19,13 @@ import org.apache.uima.util.Progress;
 import org.apache.uima.util.ProgressImpl;
 
 import java.io.*;
+import java.nio.charset.Charset;
+import java.nio.file.Files;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Collections;
 import java.util.List;
+import java.util.stream.Collectors;
 
 
 /**
@@ -238,6 +241,68 @@ final public class FileTreeReader extend
    public void getNext( final JCas jcas ) throws IOException, CollectionException {
       final File file = _files.get( _currentIndex );
       _currentIndex++;
+      String docText = readFile( file );
+      if ( !docText.isEmpty() && !docText.endsWith( "\n" ) ) {
+         // readByPath joins lines with "\n" and so leaves none after the last line; restore it for non-empty text
+         docText += "\n";
+      }
+      jcas.setDocumentText( docText );  // the ID, ID prefix and path annotations below are indexed in this same jcas
+      final DocumentID documentId = new DocumentID( jcas );
+      final String id = createDocumentID( file, _validExtensions );
+      documentId.setDocumentID( id );
+      documentId.addToIndexes();
+      final DocumentIdPrefix documentIdPrefix = new DocumentIdPrefix( jcas );
+      final String idPrefix = createDocumentIdPrefix( file, _rootDir );
+      documentIdPrefix.setDocumentIdPrefix( idPrefix );
+      documentIdPrefix.addToIndexes();
+      final DocumentPath documentPath = new DocumentPath( jcas );
+      documentPath.setDocumentPath( file.getAbsolutePath() );
+      documentPath.addToIndexes();
+   }
+
+   /**
+    * Reads file using a Path and stream.  If that throws an UncheckedIOException it calls {@link #readByBuffer(File)}
+    *
+    * @param file file to read
+    * @return text in file
+    * @throws IOException if the file could not be read
+    */
+   private String readFile( final File file ) throws IOException {
+      try {
+         return readByPath( file );
+      } catch ( UncheckedIOException uE ) {
+         // A MalformedInputException raised while the stream is collected arrives here wrapped in an
+         // UncheckedIOException.  Only the path is logged (uE is dropped) and the buffered read below takes over
+         LOGGER.warn( "Bad characters in " + file.getPath() );
+      }
+      return readByBuffer( file );
+   }
+
+   /**
+    * Reads file using a Path and stream.  The stream, and so the open file, is closed before returning.
+    *
+    * @param file file to read
+    * @return text in file with lines joined by "\n"
+    * @throws IOException if the file could not be opened.  Failures while reading lines surface as UncheckedIOException
+    */
+   private String readByPath( final File file ) throws IOException {
+      // Files.lines( Path ) decodes as UTF-8, so UTF-8 is the charset when no encoding is set
+      final Charset charset = Charset.forName( _encoding != null && !_encoding.isEmpty() ? _encoding : "UTF-8" );
+      // Files.lines keeps the file open until its stream is closed, so close it as soon as it is collected
+      try ( java.util.stream.Stream<String> lines = Files.lines( file.toPath(), charset ) ) {
+         return lines.collect( Collectors.joining( "\n" ) );
+      }
+   }
+
+
+   /**
+    * Reads file using buffered input stream
+    *
+    * @param file file to read
+    * @return text in file
+    * @throws IOException if the file could not be read
+    */
+   private String readByBuffer( final File file ) throws IOException {
       // Use 8KB as the default buffer size
       byte[] buffer = new byte[ 8192 ];
       final StringBuilder sb = new StringBuilder();
@@ -256,19 +321,7 @@ final public class FileTreeReader extend
       } catch ( FileNotFoundException fnfE ) {
          throw new IOException( fnfE );
       }
-      // put document text and id annotations in CAS (assume CAS)
-      jcas.setDocumentText( sb.toString() );
-      final DocumentID documentId = new DocumentID( jcas );
-      final String id = createDocumentID( file, _validExtensions );
-      documentId.setDocumentID( id );
-      documentId.addToIndexes();
-      final DocumentIdPrefix documentIdPrefix = new DocumentIdPrefix( jcas );
-      final String idPrefix = createDocumentIdPrefix( file, _rootDir );
-      documentIdPrefix.setDocumentIdPrefix( idPrefix );
-      documentIdPrefix.addToIndexes();
-      final DocumentPath documentPath = new DocumentPath( jcas );
-      documentPath.setDocumentPath( file.getAbsolutePath() );
-      documentPath.addToIndexes();
+      return sb.toString();
    }
 
    /**