You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@uima.apache.org by sc...@apache.org on 2008/12/06 20:40:17 UTC
svn commit: r724031 - in /incubator/uima/uimaj/trunk/uimaj-examples/src/main:
descriptors/collection_reader/FileSystemCollectionReader.xml
java/org/apache/uima/examples/cpe/FileSystemCollectionReader.java
Author: schor
Date: Sat Dec 6 11:40:16 2008
New Revision: 724031
URL: http://svn.apache.org/viewvc?rev=724031&view=rev
Log:
[UIMA-1206] commit patch with minor changes, including checking parameter for "null" which it can have if it is not set.
Modified:
incubator/uima/uimaj/trunk/uimaj-examples/src/main/descriptors/collection_reader/FileSystemCollectionReader.xml
incubator/uima/uimaj/trunk/uimaj-examples/src/main/java/org/apache/uima/examples/cpe/FileSystemCollectionReader.java
Modified: incubator/uima/uimaj/trunk/uimaj-examples/src/main/descriptors/collection_reader/FileSystemCollectionReader.xml
URL: http://svn.apache.org/viewvc/incubator/uima/uimaj/trunk/uimaj-examples/src/main/descriptors/collection_reader/FileSystemCollectionReader.xml?rev=724031&r1=724030&r2=724031&view=diff
==============================================================================
--- incubator/uima/uimaj/trunk/uimaj-examples/src/main/descriptors/collection_reader/FileSystemCollectionReader.xml (original)
+++ incubator/uima/uimaj/trunk/uimaj-examples/src/main/descriptors/collection_reader/FileSystemCollectionReader.xml Sat Dec 6 11:40:16 2008
@@ -63,6 +63,13 @@
<multiValued>false</multiValued>
<mandatory>false</mandatory>
</configurationParameter>
+ <configurationParameter>
+ <name>BrowseSubdirectories</name>
+ <description>True means include files of subdirectories, recursively, of the input directory.</description>
+ <type>Boolean</type>
+ <multiValued>false</multiValued>
+ <mandatory>false</mandatory>
+ </configurationParameter>
</configurationParameters>
<configurationParameterSettings>
<nameValuePair>
@@ -71,6 +78,12 @@
<string>C:/Program Files/apache-uima/examples/data</string>
</value>
</nameValuePair>
+ <nameValuePair>
+ <name>BrowseSubdirectories</name>
+ <value>
+ <boolean>false</boolean>
+ </value>
+ </nameValuePair>
</configurationParameterSettings>
<!-- Type System of CASes returned by this Collection Reader -->
Modified: incubator/uima/uimaj/trunk/uimaj-examples/src/main/java/org/apache/uima/examples/cpe/FileSystemCollectionReader.java
URL: http://svn.apache.org/viewvc/incubator/uima/uimaj/trunk/uimaj-examples/src/main/java/org/apache/uima/examples/cpe/FileSystemCollectionReader.java?rev=724031&r1=724030&r2=724031&view=diff
==============================================================================
--- incubator/uima/uimaj/trunk/uimaj-examples/src/main/java/org/apache/uima/examples/cpe/FileSystemCollectionReader.java (original)
+++ incubator/uima/uimaj/trunk/uimaj-examples/src/main/java/org/apache/uima/examples/cpe/FileSystemCollectionReader.java Sat Dec 6 11:40:16 2008
@@ -1,167 +1,193 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.uima.examples.cpe;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.ArrayList;
-
-import org.apache.uima.cas.CAS;
-import org.apache.uima.cas.CASException;
-import org.apache.uima.collection.CollectionException;
-import org.apache.uima.collection.CollectionReader_ImplBase;
-import org.apache.uima.examples.SourceDocumentInformation;
-import org.apache.uima.jcas.JCas;
-import org.apache.uima.jcas.tcas.DocumentAnnotation;
-import org.apache.uima.resource.ResourceConfigurationException;
-import org.apache.uima.resource.ResourceInitializationException;
-import org.apache.uima.util.FileUtils;
-import org.apache.uima.util.Progress;
-import org.apache.uima.util.ProgressImpl;
-
-/**
- * A simple collection reader that reads documents from a directory in the filesystem. It can be
- * configured with the following parameters:
- * <ul>
- * <li><code>InputDirectory</code> - path to directory containing files</li>
- * <li><code>Encoding</code> (optional) - character encoding of the input files</li>
- * <li><code>Language</code> (optional) - language of the input documents</li>
- * </ul>
- *
- *
- */
-public class FileSystemCollectionReader extends CollectionReader_ImplBase {
- /**
- * Name of configuration parameter that must be set to the path of a directory containing input
- * files.
- */
- public static final String PARAM_INPUTDIR = "InputDirectory";
-
- /**
- * Name of configuration parameter that contains the character encoding used by the input files.
- * If not specified, the default system encoding will be used.
- */
- public static final String PARAM_ENCODING = "Encoding";
-
- /**
- * Name of optional configuration parameter that contains the language of the documents in the
- * input directory. If specified this information will be added to the CAS.
- */
- public static final String PARAM_LANGUAGE = "Language";
-
- private ArrayList mFiles;
-
- private String mEncoding;
-
- private String mLanguage;
-
- private int mCurrentIndex;
-
- /**
- * @see org.apache.uima.collection.CollectionReader_ImplBase#initialize()
- */
- public void initialize() throws ResourceInitializationException {
- File directory = new File(((String) getConfigParameterValue(PARAM_INPUTDIR)).trim());
- mEncoding = (String) getConfigParameterValue(PARAM_ENCODING);
- mLanguage = (String) getConfigParameterValue(PARAM_LANGUAGE);
- mCurrentIndex = 0;
-
- // if input directory does not exist or is not a directory, throw exception
- if (!directory.exists() || !directory.isDirectory()) {
- throw new ResourceInitializationException(ResourceConfigurationException.DIRECTORY_NOT_FOUND,
- new Object[] { PARAM_INPUTDIR, this.getMetaData().getName(), directory.getPath() });
- }
-
- // get list of files (not subdirectories) in the specified directory
- mFiles = new ArrayList();
- File[] files = directory.listFiles();
- for (int i = 0; i < files.length; i++) {
- if (!files[i].isDirectory()) {
- mFiles.add(files[i]);
- }
- }
- }
-
- /**
- * @see org.apache.uima.collection.CollectionReader#hasNext()
- */
- public boolean hasNext() {
- return mCurrentIndex < mFiles.size();
- }
-
- /**
- * @see org.apache.uima.collection.CollectionReader#getNext(org.apache.uima.cas.CAS)
- */
- public void getNext(CAS aCAS) throws IOException, CollectionException {
- JCas jcas;
- try {
- jcas = aCAS.getJCas();
- } catch (CASException e) {
- throw new CollectionException(e);
- }
-
- // open input stream to file
- File file = (File) mFiles.get(mCurrentIndex++);
- String text = FileUtils.file2String(file, mEncoding);
- // put document in CAS
- jcas.setDocumentText(text);
-
- // set language if it was explicitly specified as a configuration parameter
- if (mLanguage != null) {
- ((DocumentAnnotation) jcas.getDocumentAnnotationFs()).setLanguage(mLanguage);
- }
-
- // Also store location of source document in CAS. This information is critical
- // if CAS Consumers will need to know where the original document contents are located.
- // For example, the Semantic Search CAS Indexer writes this information into the
- // search index that it creates, which allows applications that use the search index to
- // locate the documents that satisfy their semantic queries.
- SourceDocumentInformation srcDocInfo = new SourceDocumentInformation(jcas);
- srcDocInfo.setUri(file.getAbsoluteFile().toURL().toString());
- srcDocInfo.setOffsetInSource(0);
- srcDocInfo.setDocumentSize((int) file.length());
- srcDocInfo.setLastSegment(mCurrentIndex == mFiles.size());
- srcDocInfo.addToIndexes();
- }
-
- /**
- * @see org.apache.uima.collection.base_cpm.BaseCollectionReader#close()
- */
- public void close() throws IOException {
- }
-
- /**
- * @see org.apache.uima.collection.base_cpm.BaseCollectionReader#getProgress()
- */
- public Progress[] getProgress() {
- return new Progress[] { new ProgressImpl(mCurrentIndex, mFiles.size(), Progress.ENTITIES) };
- }
-
- /**
- * Gets the total number of documents that will be returned by this collection reader. This is not
- * part of the general collection reader interface.
- *
- * @return the number of documents in the collection
- */
- public int getNumberOfDocuments() {
- return mFiles.size();
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.uima.examples.cpe;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.CASException;
+import org.apache.uima.collection.CollectionException;
+import org.apache.uima.collection.CollectionReader_ImplBase;
+import org.apache.uima.examples.SourceDocumentInformation;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.DocumentAnnotation;
+import org.apache.uima.resource.ResourceConfigurationException;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.apache.uima.util.FileUtils;
+import org.apache.uima.util.Progress;
+import org.apache.uima.util.ProgressImpl;
+
+/**
+ * A simple collection reader that reads documents from a directory in the filesystem. It can be
+ * configured with the following parameters:
+ * <ul>
+ * <li><code>InputDirectory</code> - path to directory containing files</li>
+ * <li><code>Encoding</code> (optional) - character encoding of the input files</li>
+ * <li><code>Language</code> (optional) - language of the input documents</li>
+ * <li><code>BrowseSubdirectories</code> (optional) - true to also read the files found in the
+ * subdirectories of the input directory, recursively; treated as false when not set</li>
+ * </ul>
+ */
+public class FileSystemCollectionReader extends CollectionReader_ImplBase {
+  /**
+   * Name of configuration parameter that must be set to the path of a directory containing input
+   * files.
+   */
+  public static final String PARAM_INPUTDIR = "InputDirectory";
+
+  /**
+   * Name of configuration parameter that contains the character encoding used by the input files.
+   * If not specified, the default system encoding will be used.
+   */
+  public static final String PARAM_ENCODING = "Encoding";
+
+  /**
+   * Name of optional configuration parameter that contains the language of the documents in the
+   * input directory. If specified this information will be added to the CAS.
+   */
+  public static final String PARAM_LANGUAGE = "Language";
+
+  /**
+   * Name of optional configuration parameter that indicates including
+   * the subdirectories (recursively) of the current input directory.
+   */
+  public static final String PARAM_SUBDIR = "BrowseSubdirectories";
+
+  /** Files to be read, in the order in which they were discovered during initialization. */
+  private ArrayList<File> mFiles;
+
+  /** Value of the Encoding parameter; null when the parameter is not set. */
+  private String mEncoding;
+
+  /** Value of the Language parameter; null when the parameter is not set. */
+  private String mLanguage;
+
+  /** True when files in subdirectories are collected as well; false when the parameter is unset. */
+  private boolean mRecursive;
+
+  /** Index into mFiles of the next file to be returned by getNext. */
+  private int mCurrentIndex;
+
+  /**
+   * Reads the configuration parameters and builds the list of files to be processed.
+   *
+   * @throws ResourceInitializationException
+   *           if the configured input directory does not exist or is not a directory
+   * @see org.apache.uima.collection.CollectionReader_ImplBase#initialize()
+   */
+  public void initialize() throws ResourceInitializationException {
+    File directory = new File(((String) getConfigParameterValue(PARAM_INPUTDIR)).trim());
+    mEncoding = (String) getConfigParameterValue(PARAM_ENCODING);
+    mLanguage = (String) getConfigParameterValue(PARAM_LANGUAGE);
+    // the parameter is optional, so its value is null when it has not been set;
+    // normalize it once here so that the rest of the class works with a plain boolean
+    Boolean browseSubdirectories = (Boolean) getConfigParameterValue(PARAM_SUBDIR);
+    mRecursive = browseSubdirectories != null && browseSubdirectories.booleanValue();
+    mCurrentIndex = 0;
+
+    // if input directory does not exist or is not a directory, throw exception
+    if (!directory.exists() || !directory.isDirectory()) {
+      throw new ResourceInitializationException(ResourceConfigurationException.DIRECTORY_NOT_FOUND,
+              new Object[] { PARAM_INPUTDIR, this.getMetaData().getName(), directory.getPath() });
+    }
+
+    // get list of files in the specified directory, and subdirectories if the
+    // parameter PARAM_SUBDIR is set to True
+    mFiles = new ArrayList<File>();
+    addFilesFromDir(directory);
+  }
+
+  /**
+   * Adds the files in the given directory to mFiles. If mRecursive is true, the files in all
+   * subdirectories are added as well, recursively. A directory whose contents cannot be listed
+   * contributes no files.
+   *
+   * @param dir
+   *          the directory whose files are to be added
+   */
+  private void addFilesFromDir(File dir) {
+    File[] entries = dir.listFiles();
+    if (entries == null) {
+      // listFiles() returns null on an I/O error or when the directory cannot be read
+      // (for example an unreadable subdirectory); skip it instead of failing with a
+      // NullPointerException
+      return;
+    }
+    for (File entry : entries) {
+      if (!entry.isDirectory()) {
+        mFiles.add(entry);
+      } else if (mRecursive) {
+        addFilesFromDir(entry);
+      }
+    }
+  }
+
+  /**
+   * @return true if at least one file has not yet been returned by getNext
+   * @see org.apache.uima.collection.CollectionReader#hasNext()
+   */
+  public boolean hasNext() {
+    return mCurrentIndex < mFiles.size();
+  }
+
+  /**
+   * Puts the text of the next file into the given CAS, together with the configured language (if
+   * any) and a SourceDocumentInformation annotation describing where the text came from.
+   *
+   * @param aCAS
+   *          the CAS to populate
+   * @throws IOException
+   *           if the file cannot be read
+   * @throws CollectionException
+   *           if a JCas cannot be obtained from the given CAS
+   * @see org.apache.uima.collection.CollectionReader#getNext(org.apache.uima.cas.CAS)
+   */
+  public void getNext(CAS aCAS) throws IOException, CollectionException {
+    JCas jcas;
+    try {
+      jcas = aCAS.getJCas();
+    } catch (CASException e) {
+      throw new CollectionException(e);
+    }
+
+    // read the contents of the next file and advance to the following one
+    File file = mFiles.get(mCurrentIndex++);
+    String text = FileUtils.file2String(file, mEncoding);
+    // put document in CAS
+    jcas.setDocumentText(text);
+
+    // set language if it was explicitly specified as a configuration parameter
+    if (mLanguage != null) {
+      ((DocumentAnnotation) jcas.getDocumentAnnotationFs()).setLanguage(mLanguage);
+    }
+
+    // Also store location of source document in CAS. This information is critical
+    // if CAS Consumers will need to know where the original document contents are located.
+    // For example, the Semantic Search CAS Indexer writes this information into the
+    // search index that it creates, which allows applications that use the search index to
+    // locate the documents that satisfy their semantic queries.
+    SourceDocumentInformation srcDocInfo = new SourceDocumentInformation(jcas);
+    // NOTE(review): File.toURL() is deprecated and does not escape characters such as spaces.
+    // It is kept because switching to toURI().toURL() would change the URI strings that
+    // downstream consumers receive.
+    srcDocInfo.setUri(file.getAbsoluteFile().toURL().toString());
+    srcDocInfo.setOffsetInSource(0);
+    srcDocInfo.setDocumentSize((int) file.length());
+    // mCurrentIndex has already been advanced, so equality means this was the last file
+    srcDocInfo.setLastSegment(mCurrentIndex == mFiles.size());
+    srcDocInfo.addToIndexes();
+  }
+
+  /**
+   * Does nothing; this reader holds no open resources between calls.
+   *
+   * @see org.apache.uima.collection.base_cpm.BaseCollectionReader#close()
+   */
+  public void close() throws IOException {
+  }
+
+  /**
+   * @return a single-element array reporting the number of files returned so far out of the
+   *         total number of files, measured in entities
+   * @see org.apache.uima.collection.base_cpm.BaseCollectionReader#getProgress()
+   */
+  public Progress[] getProgress() {
+    return new Progress[] { new ProgressImpl(mCurrentIndex, mFiles.size(), Progress.ENTITIES) };
+  }
+
+  /**
+   * Gets the total number of documents that will be returned by this collection reader. This is not
+   * part of the general collection reader interface.
+   *
+   * @return the number of documents in the collection
+   */
+  public int getNumberOfDocuments() {
+    return mFiles.size();
+  }
+
+}