You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@uima.apache.org by sc...@apache.org on 2008/12/06 20:40:17 UTC

svn commit: r724031 - in /incubator/uima/uimaj/trunk/uimaj-examples/src/main: descriptors/collection_reader/FileSystemCollectionReader.xml java/org/apache/uima/examples/cpe/FileSystemCollectionReader.java

Author: schor
Date: Sat Dec  6 11:40:16 2008
New Revision: 724031

URL: http://svn.apache.org/viewvc?rev=724031&view=rev
Log:
[UIMA-1206] Commit patch with minor changes, including a null check on the new parameter, whose value can be null if it is not set.

Modified:
    incubator/uima/uimaj/trunk/uimaj-examples/src/main/descriptors/collection_reader/FileSystemCollectionReader.xml
    incubator/uima/uimaj/trunk/uimaj-examples/src/main/java/org/apache/uima/examples/cpe/FileSystemCollectionReader.java

Modified: incubator/uima/uimaj/trunk/uimaj-examples/src/main/descriptors/collection_reader/FileSystemCollectionReader.xml
URL: http://svn.apache.org/viewvc/incubator/uima/uimaj/trunk/uimaj-examples/src/main/descriptors/collection_reader/FileSystemCollectionReader.xml?rev=724031&r1=724030&r2=724031&view=diff
==============================================================================
--- incubator/uima/uimaj/trunk/uimaj-examples/src/main/descriptors/collection_reader/FileSystemCollectionReader.xml (original)
+++ incubator/uima/uimaj/trunk/uimaj-examples/src/main/descriptors/collection_reader/FileSystemCollectionReader.xml Sat Dec  6 11:40:16 2008
@@ -63,6 +63,13 @@
                 <multiValued>false</multiValued>
                 <mandatory>false</mandatory>
             </configurationParameter>
+            <configurationParameter>
+                <name>BrowseSubdirectories</name>
+                <description>True means include files of subdirectories, recursively, of the input directory.</description>
+                <type>Boolean</type>
+                <multiValued>false</multiValued>
+                <mandatory>false</mandatory>
+            </configurationParameter>
         </configurationParameters>
 		<configurationParameterSettings>
 		    <nameValuePair>
@@ -71,6 +78,12 @@
 		            <string>C:/Program Files/apache-uima/examples/data</string>
 		        </value>
 		    </nameValuePair>
+            <nameValuePair>
+                <name>BrowseSubdirectories</name>
+                <value>
+                    <boolean>false</boolean>
+                </value>
+            </nameValuePair>
 		</configurationParameterSettings>
 
         <!-- Type System of CASes returned by this Collection Reader -->

Modified: incubator/uima/uimaj/trunk/uimaj-examples/src/main/java/org/apache/uima/examples/cpe/FileSystemCollectionReader.java
URL: http://svn.apache.org/viewvc/incubator/uima/uimaj/trunk/uimaj-examples/src/main/java/org/apache/uima/examples/cpe/FileSystemCollectionReader.java?rev=724031&r1=724030&r2=724031&view=diff
==============================================================================
--- incubator/uima/uimaj/trunk/uimaj-examples/src/main/java/org/apache/uima/examples/cpe/FileSystemCollectionReader.java (original)
+++ incubator/uima/uimaj/trunk/uimaj-examples/src/main/java/org/apache/uima/examples/cpe/FileSystemCollectionReader.java Sat Dec  6 11:40:16 2008
@@ -1,167 +1,193 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- * 
- *   http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.uima.examples.cpe;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.ArrayList;
-
-import org.apache.uima.cas.CAS;
-import org.apache.uima.cas.CASException;
-import org.apache.uima.collection.CollectionException;
-import org.apache.uima.collection.CollectionReader_ImplBase;
-import org.apache.uima.examples.SourceDocumentInformation;
-import org.apache.uima.jcas.JCas;
-import org.apache.uima.jcas.tcas.DocumentAnnotation;
-import org.apache.uima.resource.ResourceConfigurationException;
-import org.apache.uima.resource.ResourceInitializationException;
-import org.apache.uima.util.FileUtils;
-import org.apache.uima.util.Progress;
-import org.apache.uima.util.ProgressImpl;
-
-/**
- * A simple collection reader that reads documents from a directory in the filesystem. It can be
- * configured with the following parameters:
- * <ul>
- * <li><code>InputDirectory</code> - path to directory containing files</li>
- * <li><code>Encoding</code> (optional) - character encoding of the input files</li>
- * <li><code>Language</code> (optional) - language of the input documents</li>
- * </ul>
- * 
- * 
- */
-public class FileSystemCollectionReader extends CollectionReader_ImplBase {
-  /**
-   * Name of configuration parameter that must be set to the path of a directory containing input
-   * files.
-   */
-  public static final String PARAM_INPUTDIR = "InputDirectory";
-
-  /**
-   * Name of configuration parameter that contains the character encoding used by the input files.
-   * If not specified, the default system encoding will be used.
-   */
-  public static final String PARAM_ENCODING = "Encoding";
-
-  /**
-   * Name of optional configuration parameter that contains the language of the documents in the
-   * input directory. If specified this information will be added to the CAS.
-   */
-  public static final String PARAM_LANGUAGE = "Language";
-
-  private ArrayList mFiles;
-
-  private String mEncoding;
-
-  private String mLanguage;
-
-  private int mCurrentIndex;
-
-  /**
-   * @see org.apache.uima.collection.CollectionReader_ImplBase#initialize()
-   */
-  public void initialize() throws ResourceInitializationException {
-    File directory = new File(((String) getConfigParameterValue(PARAM_INPUTDIR)).trim());
-    mEncoding = (String) getConfigParameterValue(PARAM_ENCODING);
-    mLanguage = (String) getConfigParameterValue(PARAM_LANGUAGE);
-    mCurrentIndex = 0;
-
-    // if input directory does not exist or is not a directory, throw exception
-    if (!directory.exists() || !directory.isDirectory()) {
-      throw new ResourceInitializationException(ResourceConfigurationException.DIRECTORY_NOT_FOUND,
-              new Object[] { PARAM_INPUTDIR, this.getMetaData().getName(), directory.getPath() });
-    }
-
-    // get list of files (not subdirectories) in the specified directory
-    mFiles = new ArrayList();
-    File[] files = directory.listFiles();
-    for (int i = 0; i < files.length; i++) {
-      if (!files[i].isDirectory()) {
-        mFiles.add(files[i]);
-      }
-    }
-  }
-
-  /**
-   * @see org.apache.uima.collection.CollectionReader#hasNext()
-   */
-  public boolean hasNext() {
-    return mCurrentIndex < mFiles.size();
-  }
-
-  /**
-   * @see org.apache.uima.collection.CollectionReader#getNext(org.apache.uima.cas.CAS)
-   */
-  public void getNext(CAS aCAS) throws IOException, CollectionException {
-    JCas jcas;
-    try {
-      jcas = aCAS.getJCas();
-    } catch (CASException e) {
-      throw new CollectionException(e);
-    }
-
-    // open input stream to file
-    File file = (File) mFiles.get(mCurrentIndex++);
-    String text = FileUtils.file2String(file, mEncoding);
-      // put document in CAS
-    jcas.setDocumentText(text);
-
-    // set language if it was explicitly specified as a configuration parameter
-    if (mLanguage != null) {
-      ((DocumentAnnotation) jcas.getDocumentAnnotationFs()).setLanguage(mLanguage);
-    }
-
-    // Also store location of source document in CAS. This information is critical
-    // if CAS Consumers will need to know where the original document contents are located.
-    // For example, the Semantic Search CAS Indexer writes this information into the
-    // search index that it creates, which allows applications that use the search index to
-    // locate the documents that satisfy their semantic queries.
-    SourceDocumentInformation srcDocInfo = new SourceDocumentInformation(jcas);
-    srcDocInfo.setUri(file.getAbsoluteFile().toURL().toString());
-    srcDocInfo.setOffsetInSource(0);
-    srcDocInfo.setDocumentSize((int) file.length());
-    srcDocInfo.setLastSegment(mCurrentIndex == mFiles.size());
-    srcDocInfo.addToIndexes();
-  }
-
-  /**
-   * @see org.apache.uima.collection.base_cpm.BaseCollectionReader#close()
-   */
-  public void close() throws IOException {
-  }
-
-  /**
-   * @see org.apache.uima.collection.base_cpm.BaseCollectionReader#getProgress()
-   */
-  public Progress[] getProgress() {
-    return new Progress[] { new ProgressImpl(mCurrentIndex, mFiles.size(), Progress.ENTITIES) };
-  }
-
-  /**
-   * Gets the total number of documents that will be returned by this collection reader. This is not
-   * part of the general collection reader interface.
-   * 
-   * @return the number of documents in the collection
-   */
-  public int getNumberOfDocuments() {
-    return mFiles.size();
-  }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.uima.examples.cpe;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.CASException;
+import org.apache.uima.collection.CollectionException;
+import org.apache.uima.collection.CollectionReader_ImplBase;
+import org.apache.uima.examples.SourceDocumentInformation;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.DocumentAnnotation;
+import org.apache.uima.resource.ResourceConfigurationException;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.apache.uima.util.FileUtils;
+import org.apache.uima.util.Progress;
+import org.apache.uima.util.ProgressImpl;
+
+/**
+ * A simple collection reader that reads documents from a directory in the filesystem. It can be
+ * configured with the following parameters:
+ * <ul>
+ * <li><code>InputDirectory</code> - path to directory containing files</li>
+ * <li><code>Encoding</code> (optional) - character encoding of the input files</li>
+ * <li><code>Language</code> (optional) - language of the input documents</li>
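+ * <li><code>BrowseSubdirectories</code> (optional) - whether to also read files in subdirectories</li>
+ * </ul>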
+ * 
+ */
+public class FileSystemCollectionReader extends CollectionReader_ImplBase {
+  /**
+   * Name of configuration parameter that must be set to the path of a directory containing input
+   * files.
+   */
+  public static final String PARAM_INPUTDIR = "InputDirectory";
+
+  /**
+   * Name of configuration parameter that contains the character encoding used by the input files.
+   * If not specified, the default system encoding will be used.
+   */
+  public static final String PARAM_ENCODING = "Encoding";
+
+  /**
+   * Name of optional configuration parameter that contains the language of the documents in the
+   * input directory. If specified this information will be added to the CAS.
+   */
+  public static final String PARAM_LANGUAGE = "Language";
+
+  /**
+   * Name of optional Boolean configuration parameter that, when true, makes the reader include
+   * files from all subdirectories (recursively) of the input directory. Treated as false if unset.
+   */
+  public static final String PARAM_SUBDIR = "BrowseSubdirectories";
+  
+  private ArrayList<File> mFiles;
+
+  private String mEncoding;
+
+  private String mLanguage;
+  
+  private Boolean mRecursive;
+
+  private int mCurrentIndex;
+
+  /**
+   * @see org.apache.uima.collection.CollectionReader_ImplBase#initialize()
+   */
+  public void initialize() throws ResourceInitializationException {
+    File directory = new File(((String) getConfigParameterValue(PARAM_INPUTDIR)).trim());
+    mEncoding  = (String) getConfigParameterValue(PARAM_ENCODING);
+    mLanguage  = (String) getConfigParameterValue(PARAM_LANGUAGE);
+    mRecursive = (Boolean) getConfigParameterValue(PARAM_SUBDIR);
+    if (null == mRecursive) { // could be null if not set, it is optional
+      mRecursive = Boolean.FALSE;
+    }
+    mCurrentIndex = 0;
+
+    // if input directory does not exist or is not a directory, throw exception
+    if (!directory.exists() || !directory.isDirectory()) {
+      throw new ResourceInitializationException(ResourceConfigurationException.DIRECTORY_NOT_FOUND,
+              new Object[] { PARAM_INPUTDIR, this.getMetaData().getName(), directory.getPath() });
+    }
+
+    // get list of files in the specified directory, and subdirectories if the
+    // parameter PARAM_SUBDIR is set to True
+    mFiles = new ArrayList<File>();
+    addFilesFromDir(directory);
+  }
+  
+  /**
+   * This method adds files in the directory passed in as a parameter to mFiles.
+   * If mRecursive is true, it will include all files in all
+   * subdirectories (recursively), as well. 
+   * 
+   * @param dir the directory whose files (and, if mRecursive is true, subdirectories) are added
+   */
+  private void addFilesFromDir(File dir) {
+    File[] files = dir.listFiles();
+    for (int i = 0; i < files.length; i++) {
+      if (!files[i].isDirectory()) {
+        mFiles.add(files[i]);
+      } else if (mRecursive) {
+        addFilesFromDir(files[i]);
+      }
+    }
+  }
+
+  /**
+   * @see org.apache.uima.collection.CollectionReader#hasNext()
+   */
+  public boolean hasNext() {
+    return mCurrentIndex < mFiles.size();
+  }
+
+  /**
+   * @see org.apache.uima.collection.CollectionReader#getNext(org.apache.uima.cas.CAS)
+   */
+  public void getNext(CAS aCAS) throws IOException, CollectionException {
+    JCas jcas;
+    try {
+      jcas = aCAS.getJCas();
+    } catch (CASException e) {
+      throw new CollectionException(e);
+    }
+
+    // read the next file's text (FileUtils.file2String is given the configured encoding)
+    File file = (File) mFiles.get(mCurrentIndex++);
+    String text = FileUtils.file2String(file, mEncoding);
+      // put document in CAS
+    jcas.setDocumentText(text);
+
+    // set language if it was explicitly specified as a configuration parameter
+    if (mLanguage != null) {
+      ((DocumentAnnotation) jcas.getDocumentAnnotationFs()).setLanguage(mLanguage);
+    }
+
+    // Also store location of source document in CAS. This information is critical
+    // if CAS Consumers will need to know where the original document contents are located.
+    // For example, the Semantic Search CAS Indexer writes this information into the
+    // search index that it creates, which allows applications that use the search index to
+    // locate the documents that satisfy their semantic queries.
+    SourceDocumentInformation srcDocInfo = new SourceDocumentInformation(jcas);
+    srcDocInfo.setUri(file.getAbsoluteFile().toURL().toString());
+    srcDocInfo.setOffsetInSource(0);
+    srcDocInfo.setDocumentSize((int) file.length());
+    srcDocInfo.setLastSegment(mCurrentIndex == mFiles.size());
+    srcDocInfo.addToIndexes();
+  }
+
+  /**
+   * @see org.apache.uima.collection.base_cpm.BaseCollectionReader#close()
+   */
+  public void close() throws IOException {
+  }
+
+  /**
+   * @see org.apache.uima.collection.base_cpm.BaseCollectionReader#getProgress()
+   */
+  public Progress[] getProgress() {
+    return new Progress[] { new ProgressImpl(mCurrentIndex, mFiles.size(), Progress.ENTITIES) };
+  }
+
+  /**
+   * Gets the total number of documents that will be returned by this collection reader. This is not
+   * part of the general collection reader interface.
+   * 
+   * @return the number of documents in the collection
+   */
+  public int getNumberOfDocuments() {
+    return mFiles.size();
+  }
+
+}