You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@uima.apache.org by ea...@apache.org on 2013/08/05 23:34:40 UTC
svn commit: r1510744 - in
/uima/sandbox/uima-ducc/trunk/uima-ducc-examples/src/main:
java/org/apache/uima/ducc/sampleapps/
resources/org/apache/uima/ducc/sampleapps/
Author: eae
Date: Mon Aug 5 21:34:40 2013
New Revision: 1510744
URL: http://svn.apache.org/r1510744
Log:
UIMA-3149 First part of the text sample app
Added:
uima/sandbox/uima-ducc/trunk/uima-ducc-examples/src/main/java/org/apache/uima/ducc/sampleapps/
uima/sandbox/uima-ducc/trunk/uima-ducc-examples/src/main/java/org/apache/uima/ducc/sampleapps/DuccDocumentInfo.java
uima/sandbox/uima-ducc/trunk/uima-ducc-examples/src/main/java/org/apache/uima/ducc/sampleapps/DuccDocumentInfo_Type.java
uima/sandbox/uima-ducc/trunk/uima-ducc-examples/src/main/java/org/apache/uima/ducc/sampleapps/DuccJobTextCR.java
uima/sandbox/uima-ducc/trunk/uima-ducc-examples/src/main/java/org/apache/uima/ducc/sampleapps/DuccTextCC.java
uima/sandbox/uima-ducc/trunk/uima-ducc-examples/src/main/java/org/apache/uima/ducc/sampleapps/DuccTextCM.java
uima/sandbox/uima-ducc/trunk/uima-ducc-examples/src/main/resources/org/apache/uima/ducc/sampleapps/
uima/sandbox/uima-ducc/trunk/uima-ducc-examples/src/main/resources/org/apache/uima/ducc/sampleapps/DuccDocumentInfoTS.xml
uima/sandbox/uima-ducc/trunk/uima-ducc-examples/src/main/resources/org/apache/uima/ducc/sampleapps/DuccJobTextCR.xml
uima/sandbox/uima-ducc/trunk/uima-ducc-examples/src/main/resources/org/apache/uima/ducc/sampleapps/DuccTextCC.xml
uima/sandbox/uima-ducc/trunk/uima-ducc-examples/src/main/resources/org/apache/uima/ducc/sampleapps/DuccTextCM.xml
Added: uima/sandbox/uima-ducc/trunk/uima-ducc-examples/src/main/java/org/apache/uima/ducc/sampleapps/DuccDocumentInfo.java
URL: http://svn.apache.org/viewvc/uima/sandbox/uima-ducc/trunk/uima-ducc-examples/src/main/java/org/apache/uima/ducc/sampleapps/DuccDocumentInfo.java?rev=1510744&view=auto
==============================================================================
--- uima/sandbox/uima-ducc/trunk/uima-ducc-examples/src/main/java/org/apache/uima/ducc/sampleapps/DuccDocumentInfo.java (added)
+++ uima/sandbox/uima-ducc/trunk/uima-ducc-examples/src/main/java/org/apache/uima/ducc/sampleapps/DuccDocumentInfo.java Mon Aug 5 21:34:40 2013
@@ -0,0 +1,126 @@
+
+
+/* First created by JCasGen Wed Jul 31 15:14:59 EDT 2013 */
+package org.apache.uima.ducc.sampleapps;
+
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.JCasRegistry;
+import org.apache.uima.jcas.cas.TOP_Type;
+
+import org.apache.uima.jcas.cas.TOP;
+
+
+/**
+ * JCas cover class for the UIMA type {@code org.apache.uima.ducc.sampleapps.DuccDocumentInfo}.
+ * It exposes two string features ({@code inputfile}, {@code outputfile}) and two int features
+ * ({@code docseq}, {@code byteoffset}). Every accessor delegates to the low-level CAS using the
+ * feature codes held by {@link DuccDocumentInfo_Type}.
+ * This file is produced by JCasGen; regenerate it from the type system descriptor named below
+ * rather than editing the generated members by hand.
+ * Updated by JCasGen Thu Aug 01 14:48:37 EDT 2013
+ * XML source: /users1/eae/workspace-ducc/uima-ducc/uima-ducc-examples/src/main/resources/org/apache/uima/ducc/sampleapps/DuccDocumentInfoTS.xml
+ * @generated */
+public class DuccDocumentInfo extends TOP {
+ /** Index returned by {@link JCasRegistry#register} for this cover class.
+ * @generated
+ * @ordered
+ */
+ public final static int typeIndexID = JCasRegistry.register(DuccDocumentInfo.class);
+ /** Alias of {@link #typeIndexID}.
+ * @generated
+ * @ordered
+ */
+ public final static int type = typeIndexID;
+ /** Returns {@link #typeIndexID}.
+ * @generated */
+ public int getTypeIndexID() {return typeIndexID;}
+
+ /** Never called. Disable default constructor
+ * @generated */
+ protected DuccDocumentInfo() {}
+
+ /** Internal - constructor used by generator
+ * @param addr low-level CAS address passed to the superclass
+ * @param type the JCas type object passed to the superclass
+ * @generated */
+ public DuccDocumentInfo(int addr, TOP_Type type) {
+ super(addr, type);
+ readObject();
+ }
+
+ /** Creates a new instance in the given JCas.
+ * @param jcas the JCas passed to the superclass constructor
+ * @generated */
+ public DuccDocumentInfo(JCas jcas) {
+ super(jcas);
+ readObject();
+ }
+
+ /** <!-- begin-user-doc -->
+ * Write your own initialization here
+ * <!-- end-user-doc -->
+ @generated modifiable */
+ private void readObject() {}
+
+
+
+ //*--------------*
+ //* Feature: inputfile
+
+ /** Gets the value of the {@code inputfile} string feature.
+ * Calls {@code throwFeatMissing} first when {@code featOkTst} is set and the feature was not found.
+ * @return the stored string value
+ * @generated */
+ public String getInputfile() {
+ if (DuccDocumentInfo_Type.featOkTst && ((DuccDocumentInfo_Type)jcasType).casFeat_inputfile == null)
+ jcasType.jcas.throwFeatMissing("inputfile", "org.apache.uima.ducc.sampleapps.DuccDocumentInfo");
+ return jcasType.ll_cas.ll_getStringValue(addr, ((DuccDocumentInfo_Type)jcasType).casFeatCode_inputfile);}
+
+ /** Sets the value of the {@code inputfile} string feature.
+ * Calls {@code throwFeatMissing} first when {@code featOkTst} is set and the feature was not found.
+ * @param v the value to store
+ * @generated */
+ public void setInputfile(String v) {
+ if (DuccDocumentInfo_Type.featOkTst && ((DuccDocumentInfo_Type)jcasType).casFeat_inputfile == null)
+ jcasType.jcas.throwFeatMissing("inputfile", "org.apache.uima.ducc.sampleapps.DuccDocumentInfo");
+ jcasType.ll_cas.ll_setStringValue(addr, ((DuccDocumentInfo_Type)jcasType).casFeatCode_inputfile, v);}
+
+
+ //*--------------*
+ //* Feature: outputfile
+
+ /** Gets the value of the {@code outputfile} string feature.
+ * Calls {@code throwFeatMissing} first when {@code featOkTst} is set and the feature was not found.
+ * @return the stored string value
+ * @generated */
+ public String getOutputfile() {
+ if (DuccDocumentInfo_Type.featOkTst && ((DuccDocumentInfo_Type)jcasType).casFeat_outputfile == null)
+ jcasType.jcas.throwFeatMissing("outputfile", "org.apache.uima.ducc.sampleapps.DuccDocumentInfo");
+ return jcasType.ll_cas.ll_getStringValue(addr, ((DuccDocumentInfo_Type)jcasType).casFeatCode_outputfile);}
+
+ /** Sets the value of the {@code outputfile} string feature.
+ * Calls {@code throwFeatMissing} first when {@code featOkTst} is set and the feature was not found.
+ * @param v the value to store
+ * @generated */
+ public void setOutputfile(String v) {
+ if (DuccDocumentInfo_Type.featOkTst && ((DuccDocumentInfo_Type)jcasType).casFeat_outputfile == null)
+ jcasType.jcas.throwFeatMissing("outputfile", "org.apache.uima.ducc.sampleapps.DuccDocumentInfo");
+ jcasType.ll_cas.ll_setStringValue(addr, ((DuccDocumentInfo_Type)jcasType).casFeatCode_outputfile, v);}
+
+
+ //*--------------*
+ //* Feature: docseq
+
+ /** Gets the {@code docseq} int feature: the document sequence within the work item.
+ * Calls {@code throwFeatMissing} first when {@code featOkTst} is set and the feature was not found.
+ * @return the stored int value
+ * @generated */
+ public int getDocseq() {
+ if (DuccDocumentInfo_Type.featOkTst && ((DuccDocumentInfo_Type)jcasType).casFeat_docseq == null)
+ jcasType.jcas.throwFeatMissing("docseq", "org.apache.uima.ducc.sampleapps.DuccDocumentInfo");
+ return jcasType.ll_cas.ll_getIntValue(addr, ((DuccDocumentInfo_Type)jcasType).casFeatCode_docseq);}
+
+ /** Sets the {@code docseq} int feature: the document sequence within the work item.
+ * Calls {@code throwFeatMissing} first when {@code featOkTst} is set and the feature was not found.
+ * @param v the value to store
+ * @generated */
+ public void setDocseq(int v) {
+ if (DuccDocumentInfo_Type.featOkTst && ((DuccDocumentInfo_Type)jcasType).casFeat_docseq == null)
+ jcasType.jcas.throwFeatMissing("docseq", "org.apache.uima.ducc.sampleapps.DuccDocumentInfo");
+ jcasType.ll_cas.ll_setIntValue(addr, ((DuccDocumentInfo_Type)jcasType).casFeatCode_docseq, v);}
+
+
+ //*--------------*
+ //* Feature: byteoffset
+
+ /** Gets the {@code byteoffset} int feature: the byte location of the first character in the document.
+ * Calls {@code throwFeatMissing} first when {@code featOkTst} is set and the feature was not found.
+ * @return the stored int value
+ * @generated */
+ public int getByteoffset() {
+ if (DuccDocumentInfo_Type.featOkTst && ((DuccDocumentInfo_Type)jcasType).casFeat_byteoffset == null)
+ jcasType.jcas.throwFeatMissing("byteoffset", "org.apache.uima.ducc.sampleapps.DuccDocumentInfo");
+ return jcasType.ll_cas.ll_getIntValue(addr, ((DuccDocumentInfo_Type)jcasType).casFeatCode_byteoffset);}
+
+ /** Sets the {@code byteoffset} int feature: the byte location of the first character in the document.
+ * Calls {@code throwFeatMissing} first when {@code featOkTst} is set and the feature was not found.
+ * @param v the value to store
+ * @generated */
+ public void setByteoffset(int v) {
+ if (DuccDocumentInfo_Type.featOkTst && ((DuccDocumentInfo_Type)jcasType).casFeat_byteoffset == null)
+ jcasType.jcas.throwFeatMissing("byteoffset", "org.apache.uima.ducc.sampleapps.DuccDocumentInfo");
+ jcasType.ll_cas.ll_setIntValue(addr, ((DuccDocumentInfo_Type)jcasType).casFeatCode_byteoffset, v);}
+ }
+
+
\ No newline at end of file
Added: uima/sandbox/uima-ducc/trunk/uima-ducc-examples/src/main/java/org/apache/uima/ducc/sampleapps/DuccDocumentInfo_Type.java
URL: http://svn.apache.org/viewvc/uima/sandbox/uima-ducc/trunk/uima-ducc-examples/src/main/java/org/apache/uima/ducc/sampleapps/DuccDocumentInfo_Type.java?rev=1510744&view=auto
==============================================================================
--- uima/sandbox/uima-ducc/trunk/uima-ducc-examples/src/main/java/org/apache/uima/ducc/sampleapps/DuccDocumentInfo_Type.java (added)
+++ uima/sandbox/uima-ducc/trunk/uima-ducc-examples/src/main/java/org/apache/uima/ducc/sampleapps/DuccDocumentInfo_Type.java Mon Aug 5 21:34:40 2013
@@ -0,0 +1,145 @@
+
+/* First created by JCasGen Wed Jul 31 15:14:59 EDT 2013 */
+package org.apache.uima.ducc.sampleapps;
+
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.JCasRegistry;
+import org.apache.uima.cas.impl.CASImpl;
+import org.apache.uima.cas.impl.FSGenerator;
+import org.apache.uima.cas.FeatureStructure;
+import org.apache.uima.cas.impl.TypeImpl;
+import org.apache.uima.cas.Type;
+import org.apache.uima.cas.impl.FeatureImpl;
+import org.apache.uima.cas.Feature;
+import org.apache.uima.jcas.cas.TOP_Type;
+
+/**
+ * JCasGen-generated companion of {@link DuccDocumentInfo}. It holds the CAS feature objects and
+ * feature codes for {@code inputfile}, {@code outputfile}, {@code docseq} and {@code byteoffset},
+ * offers address-based accessors for them, and registers the generator that creates
+ * {@link DuccDocumentInfo} instances. Regenerate rather than editing the generated members by hand.
+ * Updated by JCasGen Thu Aug 01 14:48:37 EDT 2013
+ * @generated */
+public class DuccDocumentInfo_Type extends TOP_Type {
+ /** Returns the generator held in {@code fsGenerator}.
+ * @generated */
+ protected FSGenerator getFSGenerator() {return fsGenerator;}
+ /** Generator that builds a {@link DuccDocumentInfo} for a CAS address. When
+ * {@code useExistingInstance} is set, the instance stored for that address is looked up first
+ * and a newly created one is stored for later lookups.
+ * @generated */
+ private final FSGenerator fsGenerator =
+ new FSGenerator() {
+ public FeatureStructure createFS(int addr, CASImpl cas) {
+ if (DuccDocumentInfo_Type.this.useExistingInstance) {
+ // Return the instance already stored for this address, creating and storing one if absent
+ FeatureStructure fs = DuccDocumentInfo_Type.this.jcas.getJfsFromCaddr(addr);
+ if (null == fs) {
+ fs = new DuccDocumentInfo(addr, DuccDocumentInfo_Type.this);
+ DuccDocumentInfo_Type.this.jcas.putJfsFromCaddr(addr, fs);
+ return fs;
+ }
+ return fs;
+ } else return new DuccDocumentInfo(addr, DuccDocumentInfo_Type.this);
+ }
+ };
+ /** Same value as {@link DuccDocumentInfo#typeIndexID}.
+ * @generated */
+ public final static int typeIndexID = DuccDocumentInfo.typeIndexID;
+ /** Flag obtained from {@link JCasRegistry#getFeatOkTst}; every accessor checks it together with
+ * the feature being null before calling {@code throwFeatMissing}.
+ * @generated
+ @modifiable */
+ public final static boolean featOkTst = JCasRegistry.getFeatOkTst("org.apache.uima.ducc.sampleapps.DuccDocumentInfo");
+
+ /** CAS feature object for {@code inputfile}; assigned in the constructor.
+ * @generated */
+ final Feature casFeat_inputfile;
+ /** Feature code for {@code inputfile}, or {@code JCas.INVALID_FEATURE_CODE} when the feature is null.
+ * @generated */
+ final int casFeatCode_inputfile;
+ /** Reads the {@code inputfile} string value of the feature structure at {@code addr}.
+ * @generated */
+ public String getInputfile(int addr) {
+ if (featOkTst && casFeat_inputfile == null)
+ jcas.throwFeatMissing("inputfile", "org.apache.uima.ducc.sampleapps.DuccDocumentInfo");
+ return ll_cas.ll_getStringValue(addr, casFeatCode_inputfile);
+ }
+ /** Writes the {@code inputfile} string value of the feature structure at {@code addr}.
+ * @generated */
+ public void setInputfile(int addr, String v) {
+ if (featOkTst && casFeat_inputfile == null)
+ jcas.throwFeatMissing("inputfile", "org.apache.uima.ducc.sampleapps.DuccDocumentInfo");
+ ll_cas.ll_setStringValue(addr, casFeatCode_inputfile, v);}
+
+
+
+ /** CAS feature object for {@code outputfile}; assigned in the constructor.
+ * @generated */
+ final Feature casFeat_outputfile;
+ /** Feature code for {@code outputfile}, or {@code JCas.INVALID_FEATURE_CODE} when the feature is null.
+ * @generated */
+ final int casFeatCode_outputfile;
+ /** Reads the {@code outputfile} string value of the feature structure at {@code addr}.
+ * @generated */
+ public String getOutputfile(int addr) {
+ if (featOkTst && casFeat_outputfile == null)
+ jcas.throwFeatMissing("outputfile", "org.apache.uima.ducc.sampleapps.DuccDocumentInfo");
+ return ll_cas.ll_getStringValue(addr, casFeatCode_outputfile);
+ }
+ /** Writes the {@code outputfile} string value of the feature structure at {@code addr}.
+ * @generated */
+ public void setOutputfile(int addr, String v) {
+ if (featOkTst && casFeat_outputfile == null)
+ jcas.throwFeatMissing("outputfile", "org.apache.uima.ducc.sampleapps.DuccDocumentInfo");
+ ll_cas.ll_setStringValue(addr, casFeatCode_outputfile, v);}
+
+
+
+ /** CAS feature object for {@code docseq}; assigned in the constructor.
+ * @generated */
+ final Feature casFeat_docseq;
+ /** Feature code for {@code docseq}, or {@code JCas.INVALID_FEATURE_CODE} when the feature is null.
+ * @generated */
+ final int casFeatCode_docseq;
+ /** Reads the {@code docseq} int value of the feature structure at {@code addr}.
+ * @generated */
+ public int getDocseq(int addr) {
+ if (featOkTst && casFeat_docseq == null)
+ jcas.throwFeatMissing("docseq", "org.apache.uima.ducc.sampleapps.DuccDocumentInfo");
+ return ll_cas.ll_getIntValue(addr, casFeatCode_docseq);
+ }
+ /** Writes the {@code docseq} int value of the feature structure at {@code addr}.
+ * @generated */
+ public void setDocseq(int addr, int v) {
+ if (featOkTst && casFeat_docseq == null)
+ jcas.throwFeatMissing("docseq", "org.apache.uima.ducc.sampleapps.DuccDocumentInfo");
+ ll_cas.ll_setIntValue(addr, casFeatCode_docseq, v);}
+
+
+
+ /** CAS feature object for {@code byteoffset}; assigned in the constructor.
+ * @generated */
+ final Feature casFeat_byteoffset;
+ /** Feature code for {@code byteoffset}, or {@code JCas.INVALID_FEATURE_CODE} when the feature is null.
+ * @generated */
+ final int casFeatCode_byteoffset;
+ /** Reads the {@code byteoffset} int value of the feature structure at {@code addr}.
+ * @generated */
+ public int getByteoffset(int addr) {
+ if (featOkTst && casFeat_byteoffset == null)
+ jcas.throwFeatMissing("byteoffset", "org.apache.uima.ducc.sampleapps.DuccDocumentInfo");
+ return ll_cas.ll_getIntValue(addr, casFeatCode_byteoffset);
+ }
+ /** Writes the {@code byteoffset} int value of the feature structure at {@code addr}.
+ * @generated */
+ public void setByteoffset(int addr, int v) {
+ if (featOkTst && casFeat_byteoffset == null)
+ jcas.throwFeatMissing("byteoffset", "org.apache.uima.ducc.sampleapps.DuccDocumentInfo");
+ ll_cas.ll_setIntValue(addr, casFeatCode_byteoffset, v);}
+
+
+
+
+
+ /** initialize variables to correspond with Cas Type and Features
+ * Registers the generator for this type, then looks up each of the four features with
+ * {@code getRequiredFeatureDE} and stores its code, or {@code JCas.INVALID_FEATURE_CODE}
+ * when the lookup returned null.
+ * @param jcas the JCas this type object belongs to
+ * @param casType the CAS type the features are looked up on
+ * @generated */
+ public DuccDocumentInfo_Type(JCas jcas, Type casType) {
+ super(jcas, casType);
+ casImpl.getFSClassRegistry().addGeneratorForType((TypeImpl)this.casType, getFSGenerator());
+
+
+ casFeat_inputfile = jcas.getRequiredFeatureDE(casType, "inputfile", "uima.cas.String", featOkTst);
+ casFeatCode_inputfile = (null == casFeat_inputfile) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_inputfile).getCode();
+
+
+ casFeat_outputfile = jcas.getRequiredFeatureDE(casType, "outputfile", "uima.cas.String", featOkTst);
+ casFeatCode_outputfile = (null == casFeat_outputfile) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_outputfile).getCode();
+
+
+ casFeat_docseq = jcas.getRequiredFeatureDE(casType, "docseq", "uima.cas.Integer", featOkTst);
+ casFeatCode_docseq = (null == casFeat_docseq) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_docseq).getCode();
+
+
+ casFeat_byteoffset = jcas.getRequiredFeatureDE(casType, "byteoffset", "uima.cas.Integer", featOkTst);
+ casFeatCode_byteoffset = (null == casFeat_byteoffset) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_byteoffset).getCode();
+
+ }
+}
+
+
+
+
\ No newline at end of file
Added: uima/sandbox/uima-ducc/trunk/uima-ducc-examples/src/main/java/org/apache/uima/ducc/sampleapps/DuccJobTextCR.java
URL: http://svn.apache.org/viewvc/uima/sandbox/uima-ducc/trunk/uima-ducc-examples/src/main/java/org/apache/uima/ducc/sampleapps/DuccJobTextCR.java?rev=1510744&view=auto
==============================================================================
--- uima/sandbox/uima-ducc/trunk/uima-ducc-examples/src/main/java/org/apache/uima/ducc/sampleapps/DuccJobTextCR.java (added)
+++ uima/sandbox/uima-ducc/trunk/uima-ducc-examples/src/main/java/org/apache/uima/ducc/sampleapps/DuccJobTextCR.java Mon Aug 5 21:34:40 2013
@@ -0,0 +1,310 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.uima.ducc.sampleapps;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.CASException;
+import org.apache.uima.collection.CollectionException;
+import org.apache.uima.collection.CollectionReader_ImplBase;
+import org.apache.uima.ducc.Workitem;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.resource.ResourceConfigurationException;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.apache.uima.util.Level;
+import org.apache.uima.util.Logger;
+import org.apache.uima.util.Progress;
+import org.apache.uima.util.ProgressImpl;
+
+/**
+ * A simple DUCC Job collection reader that reads text files from a directory in the filesystem.
+ * It can be configured with the following parameters:
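+ * <ul>
+ * <li><code>InputDirectory</code> - path to directory containing input files</li>
+ * <li><code>OutputDirectory</code> - path to base directory for output files</li>
+ * <li><code>IgnorePreviousOutput</code> (optional) - if true, existing output files are regenerated instead of being skipped</li>
+ * <li><code>Encoding</code> (optional) - character encoding of the input files</li>
+ * <li><code>Language</code> (optional) - language of the input documents</li>
+ * <li><code>BlockSize</code> (optional) - block size, in bytes, used to split input files into work items</li>
+ * <li><code>SendToLast</code> (optional) - boolean copied to the SendToLast feature of each work item</li>
+ * <li><code>Debug</code> (optional) - boolean debug flag</li>
+ * </ul>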
+ *
+ */
+public class DuccJobTextCR extends CollectionReader_ImplBase {
+ /**
+ * Name of configuration parameter that must be set to the path of a directory containing input
+ * files.
+ */
+ public static final String PARAM_INPUTDIR = "InputDirectory";
+
+ /**
+ * Name of configuration parameter that must be set to the path of the base directory
+ * where output files will be created.
+ */
+ public static final String PARAM_OUTPUTDIR = "OutputDirectory";
+
+ /**
+ * Name of configuration parameter that indicates if previous output should be ignored.
+ */
+ public static final String PARAM_IGNOREPREVIOUS = "IgnorePreviousOutput";
+
+ /**
+ * Name of configuration parameter that contains the character encoding used by the input files.
+ * If not specified, the default system encoding will be used.
+ */
+ public static final String PARAM_ENCODING = "Encoding";
+
+ /**
+ * Name of optional configuration parameter that contains the language of the documents in the
+ * input directory. If specified this information will be added to the CAS.
+ */
+ public static final String PARAM_LANGUAGE = "Language";
+
+ /**
+ * Name of configuration parameter specifying the block size used to break input files into work-items.
+ * Output files will correspond to the input data found in each block.
+ * If not specified, the entire file will be processed as a single work-item.
+ */
+ public static final String PARAM_BLOCKSIZE = "BlockSize";
+
+ /**
+ * Name of optional boolean configuration parameter whose value is copied to the SendToLast
+ * feature of every work item emitted by this reader.
+ * If not specified, false is used.
+ */
+ public static final String PARAM_SENDTOLAST = "SendToLast";
+
+ public static final String PARAM_DEBUG = "Debug";
+
+ /**
+  * Describes one unit of work: an input file, or one block of it, together with the
+  * output file that marks the unit as processed.
+  */
+ class WorkItem {
+   /** Absolute path of the input file. */
+   String filename;
+   /** Path of the output file for this unit. */
+   String outname;
+   /** Block number within the input file. */
+   int index;
+   /** Byte offset of the block within the input file. */
+   int offset;
+   /** Number of bytes in the block. */
+   int length;
+   /** Whether this is the final block of the input file. */
+   boolean last;
+
+   /**
+    * NOTE(review): byteLength and byteOffset are narrowed from long to int, so values above
+    * Integer.MAX_VALUE are silently truncated.
+    */
+   public WorkItem(String inputPath, String outputPath, int blockIndex, long byteLength, long byteOffset, boolean lastBlock) {
+     this.filename = inputPath;
+     this.outname = outputPath;
+     this.index = blockIndex;
+     this.length = (int) byteLength;
+     this.offset = (int) byteOffset;
+     this.last = lastBlock;
+   }
+ }
+
+ private ArrayList<WorkItem> mWorkList;
+
+ private String mInputdirectory;
+
+ private String mOutputdirectory;
+
+ private Boolean mIgnorePrevious;
+
+ private String mEncoding;
+
+ private String mLanguage;
+
+ private int mBlocksize;
+
+ private int mCurrentIndex;
+
+ private Boolean mSendToLast;
+
+ private Boolean mDebug;
+
+ private int mPreviouslyDone;
+
+ private Logger logger;
+
+ /**
+  * Reads the configuration parameters, validates the input and output locations and builds
+  * the list of work items.
+  * <p>
+  * The optional boolean parameters default to false. When the output directory does not exist
+  * yet, previous output is ignored because there is none to preserve.
+  *
+  * @throws ResourceInitializationException if InputDirectory or OutputDirectory is not set,
+  *         the input directory is missing or not a directory, the output path is an existing
+  *         file, or BlockSize is negative
+  * @see org.apache.uima.collection.CollectionReader_ImplBase#initialize()
+  */
+ public void initialize() throws ResourceInitializationException {
+   logger = getUimaContext().getLogger();
+   mInputdirectory = requiredStringParameter(PARAM_INPUTDIR);
+   mOutputdirectory = requiredStringParameter(PARAM_OUTPUTDIR);
+   mIgnorePrevious = (Boolean) getConfigParameterValue(PARAM_IGNOREPREVIOUS);
+   mEncoding = (String) getConfigParameterValue(PARAM_ENCODING);
+   mLanguage = (String) getConfigParameterValue(PARAM_LANGUAGE);
+   mSendToLast = (Boolean) getConfigParameterValue(PARAM_SENDTOLAST);
+   mDebug = (Boolean) getConfigParameterValue(PARAM_DEBUG);
+
+   if (null == mIgnorePrevious) {
+     mIgnorePrevious = Boolean.FALSE;
+   }
+   if (null == mSendToLast) {
+     mSendToLast = Boolean.FALSE;
+   }
+   if (null == mDebug) {
+     mDebug = Boolean.FALSE;
+   }
+   mCurrentIndex = 0;
+   mPreviouslyDone = 0;
+
+   // the input directory must exist and be a directory
+   File inDirectory = new File(mInputdirectory);
+   if (!inDirectory.exists() || !inDirectory.isDirectory()) {
+     throw new ResourceInitializationException(ResourceConfigurationException.DIRECTORY_NOT_FOUND,
+         new Object[] { PARAM_INPUTDIR, this.getMetaData().getName(), inDirectory.getPath() });
+   }
+
+   // the output path may be absent, but it must not be an existing regular file
+   File outDirectory = new File(mOutputdirectory);
+   if (outDirectory.exists() && !outDirectory.isDirectory()) {
+     throw new ResourceInitializationException(new RuntimeException("Specified output directory "+mOutputdirectory+" is a file"));
+   }
+   if (!outDirectory.exists()) {
+     // nothing has been written yet, so there is no previous output to preserve
+     mIgnorePrevious = true;
+   }
+
+   mBlocksize = 0;
+   logger.log(Level.INFO, "Processing input files from "+mInputdirectory);
+   Object blockSize = getConfigParameterValue(PARAM_BLOCKSIZE);
+   if (null != blockSize) {
+     mBlocksize = (Integer) blockSize;
+     if (mBlocksize < 0) {
+       // a negative block size would make the block-splitting loop run forever
+       throw new ResourceInitializationException(
+           new IllegalArgumentException(PARAM_BLOCKSIZE+" must not be negative: "+mBlocksize));
+     }
+     logger.log(Level.INFO, "Using blocksize "+ mBlocksize);
+   }
+   if (mIgnorePrevious) {
+     logger.log(Level.INFO, "Overwriting previous outfiles");
+   }
+
+   // collect the files (or file blocks) found directly in the input directory
+   mWorkList = new ArrayList<WorkItem>();
+   addFilesFromDir(inDirectory);
+   if (0 < mPreviouslyDone) {
+     logger.log(Level.INFO, "Preserving "+mPreviouslyDone+" output files in "+mOutputdirectory);
+   }
+   logger.log(Level.INFO, "Processing "+mWorkList.size()+" output files in "+mOutputdirectory);
+ }
+
+ /**
+  * Returns the trimmed value of a mandatory string parameter.
+  *
+  * @param name the configuration parameter name
+  * @return the trimmed parameter value
+  * @throws ResourceInitializationException if the parameter is not set to a string
+  */
+ private String requiredStringParameter(String name) throws ResourceInitializationException {
+   Object value = getConfigParameterValue(name);
+   if (!(value instanceof String)) {
+     throw new ResourceInitializationException(
+         new IllegalArgumentException("Required configuration parameter "+name+" is not set"));
+   }
+   return ((String) value).trim();
+ }
+
+ /**
+  * Adds a work item for every regular file found directly in the given directory (one per
+  * block when a block size is configured), unless its output file already exists and
+  * mIgnorePrevious is false. Subdirectories are not visited.
+  * <p>
+  * Output names are built from the output directory and the input file name, so they do not
+  * depend on how the input directory path was written (relative, trailing separator, ...).
+  *
+  * @param dir the directory to scan
+  */
+ private void addFilesFromDir(File dir) {
+   File[] files = dir.listFiles();
+   if (files == null) {
+     // listFiles() returns null when the directory cannot be read
+     logger.log(Level.WARNING, "Unable to list files in "+dir.getAbsolutePath());
+     return;
+   }
+   for (File file : files) {
+     if (file.isDirectory()) {
+       continue;
+     }
+     String inputPath = file.getAbsolutePath();
+     String outfilename = new File(mOutputdirectory, file.getName()).getPath();
+     if (mBlocksize <= 0) {
+       // no usable block size: the whole file is a single work item
+       addUnlessAlreadyDone(inputPath, outfilename, 0, file.length(), 0, false);
+       continue;
+     }
+     long remaining = file.length();
+     long offset = 0;
+     for (int block = 0; remaining > 0; block++) {
+       long length = Math.min(remaining, mBlocksize);
+       addUnlessAlreadyDone(inputPath, outfilename+"_"+block, block, length, offset, remaining == length);
+       remaining -= length;
+       offset += length;
+     }
+   }
+ }
+
+ /**
+  * Queues one work item, or counts it as previously done when its ".processed" output file
+  * exists and previous output is being preserved.
+  */
+ private void addUnlessAlreadyDone(String inputPath, String outputBase, int index, long length, long offset, boolean last) {
+   String outputPath = outputBase+".processed";
+   if (!mIgnorePrevious && new File(outputPath).exists()) {
+     mPreviouslyDone++;
+     return;
+   }
+   mWorkList.add(new WorkItem(inputPath, outputPath, index, length, offset, last));
+   logger.log(Level.FINE, "adding "+outputBase);
+ }
+
+ /**
+  * Reports whether at least one work item has not been handed out yet.
+  *
+  * @see org.apache.uima.collection.CollectionReader#hasNext()
+  */
+ public boolean hasNext() {
+   final int total = mWorkList.size();
+   return total > mCurrentIndex;
+ }
+
+ /**
+  * Fills the CAS with a Workitem feature structure describing the next entry of the work
+  * list, sets a short descriptive document text and advances to the following entry.
+  *
+  * @see org.apache.uima.collection.CollectionReader#getNext(org.apache.uima.cas.CAS)
+  */
+ public void getNext(CAS aCAS) throws IOException, CollectionException {
+   try {
+     JCas jcas = aCAS.getJCas();
+     Workitem wi = new Workitem(jcas);
+     WorkItem current = mWorkList.get(mCurrentIndex);
+
+     wi.setInputspec(current.filename);
+     wi.setOutputspec(current.outname);
+     wi.setBlockindex(current.index);
+     wi.setBlocksize(mBlocksize);
+     wi.setBytelength(current.length);
+     if (mEncoding != null) {
+       wi.setEncoding(mEncoding);
+     }
+     if (mLanguage != null) {
+       wi.setLanguage(mLanguage);
+     }
+     wi.setSendToLast(mSendToLast);
+     // NOTE(review): lastBlock is set after the FS is indexed, as in the original ordering
+     wi.addToIndexes();
+     wi.setLastBlock(current.last);
+
+     logger.log(Level.INFO, "Sending "+wi.getInputspec()+" index="+wi.getBlockindex()+" last="+wi.getLastBlock()+" length="+wi.getBytelength());
+     mCurrentIndex++;
+     jcas.setDocumentText(wi.getInputspec()+" index="+wi.getBlockindex()+" length="+wi.getBytelength());
+   } catch (CASException e) {
+     throw new CollectionException(e);
+   }
+ }
+
+ /**
+ * Intentionally empty: the method body performs no work.
+ *
+ * @see org.apache.uima.collection.base_cpm.BaseCollectionReader#close()
+ */
+ public void close() throws IOException {
+ }
+
+ /**
+ * @see org.apache.uima.collection.base_cpm.BaseCollectionReader#getProgress()
+ */
+ public Progress[] getProgress() {
+ return new Progress[] { new ProgressImpl(mCurrentIndex, mWorkList.size(), Progress.ENTITIES) };
+ }
+
+ /**
+  * Returns the size of the work list built during initialization. This method is not part of
+  * the general collection reader interface.
+  *
+  * @return the number of work items in the work list
+  */
+ public int getNumberOfDocuments() {
+   final int workItemCount = mWorkList.size();
+   return workItemCount;
+ }
+
+}
Added: uima/sandbox/uima-ducc/trunk/uima-ducc-examples/src/main/java/org/apache/uima/ducc/sampleapps/DuccTextCC.java
URL: http://svn.apache.org/viewvc/uima/sandbox/uima-ducc/trunk/uima-ducc-examples/src/main/java/org/apache/uima/ducc/sampleapps/DuccTextCC.java?rev=1510744&view=auto
==============================================================================
--- uima/sandbox/uima-ducc/trunk/uima-ducc-examples/src/main/java/org/apache/uima/ducc/sampleapps/DuccTextCC.java (added)
+++ uima/sandbox/uima-ducc/trunk/uima-ducc-examples/src/main/java/org/apache/uima/ducc/sampleapps/DuccTextCC.java Mon Aug 5 21:34:40 2013
@@ -0,0 +1,122 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.uima.ducc.sampleapps;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipOutputStream;
+
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.cas.FeatureStructure;
+import org.apache.uima.cas.impl.XmiCasSerializer;
+import org.apache.uima.ducc.Workitem;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.apache.uima.util.Level;
+import org.apache.uima.util.Logger;
+import org.apache.uima.util.XMLSerializer;
+import org.xml.sax.SAXException;
+
+/**
+ * CAS consumer of the DUCC text sample application. Each CAS carrying a
+ * {@link DuccDocumentInfo} is serialized as XMI into one entry of a zip file written next to
+ * the final output name with a "_temp" suffix. A CAS carrying a {@link Workitem} acts as the
+ * flush signal: the zip is closed and renamed to the work item's output name.
+ * <p>
+ * I/O and serialization failures are reported as {@link AnalysisEngineProcessException} so that
+ * a document is never silently dropped or written into the wrong zip file.
+ */
+public class DuccTextCC extends JCasAnnotator_ImplBase {
+  private Logger logger;
+  /** Final name of the output currently being written, or null when none is open. */
+  private String outputFilename=null;
+  /** Temporary file backing the open zip stream. */
+  private File outFile;
+  private FileOutputStream fos;
+  private ZipOutputStream zos;
+
+
+  public void initialize(UimaContext aContext) throws ResourceInitializationException {
+    super.initialize(aContext);
+    logger = aContext.getLogger();
+  }
+
+  /**
+   * Flushes the current output when the CAS holds a Workitem; otherwise appends the CAS to the
+   * zip file named by its DuccDocumentInfo, opening a new zip file when the name changes.
+   *
+   * @param jcas the CAS to consume
+   * @throws AnalysisEngineProcessException if the CAS has neither a Workitem nor a
+   *         DuccDocumentInfo, the flush does not match the open output, or writing fails
+   */
+  public void process(JCas jcas) throws AnalysisEngineProcessException {
+    Iterator<FeatureStructure> fsit = jcas.getIndexRepository().getAllIndexedFS(jcas.getCasType(Workitem.type));
+    if (fsit.hasNext()) {
+      flushOutput((Workitem) fsit.next());
+      return;
+    }
+
+    fsit = jcas.getIndexRepository().getAllIndexedFS(jcas.getCasType(DuccDocumentInfo.type));
+    if (!fsit.hasNext()) {
+      throw new AnalysisEngineProcessException(new RuntimeException("No DuccDocumentInfo FS in CAS"));
+    }
+    DuccDocumentInfo di = (DuccDocumentInfo) fsit.next();
+    String outputfile = di.getOutputfile();
+    if (outputfile == null) {
+      throw new AnalysisEngineProcessException(new RuntimeException("DuccDocumentInfo has no outputfile"));
+    }
+    if (!outputfile.equals(outputFilename)) {
+      openOutput(outputfile);
+    }
+    writeDocument(jcas, di.getDocseq());
+  }
+
+  /**
+   * Closes any stream that was left open. No rename is performed, so an unflushed output keeps
+   * its "_temp" name.
+   */
+  public void destroy() {
+    closeAbandonedOutput();
+    super.destroy();
+  }
+
+  /** Closes the open zip file and renames it to the output name announced by the work item. */
+  private void flushOutput(Workitem wi) throws AnalysisEngineProcessException {
+    String expected = wi.getOutputspec();
+    // outputFilename is null when no document arrived before the flush signal
+    if (outputFilename == null || !outputFilename.equals(expected)) {
+      throw new AnalysisEngineProcessException(new RuntimeException("flush mismatch: "+outputFilename+" != "+expected));
+    }
+    File tempFile = outFile;
+    try {
+      closeStreams();
+      if (!tempFile.renameTo(new File(expected))) {
+        throw new IOException("Rename failed for "+expected);
+      }
+    } catch (IOException e) {
+      throw new AnalysisEngineProcessException(e);
+    } finally {
+      // forget the finished output so a later document with the same name reopens the file
+      outputFilename = null;
+      outFile = null;
+    }
+    logger.log(Level.INFO, "DuccTextCC: Flushed "+expected);
+  }
+
+  /** Opens the temporary zip file for a new output, creating parent directories as needed. */
+  private void openOutput(String outputfile) throws AnalysisEngineProcessException {
+    closeAbandonedOutput();
+    try {
+      outFile = new File(outputfile+"_temp");
+      File outDir = outFile.getParentFile();
+      if (outDir != null && !outDir.exists()) {
+        outDir.mkdirs();
+      }
+      fos = new FileOutputStream(outFile);
+      zos = new ZipOutputStream(fos);
+      zos.setLevel(7); //TODO turn off compression for binary
+      // only remember the name once the file is really open
+      outputFilename = outputfile;
+    } catch (IOException e) {
+      throw new AnalysisEngineProcessException(e);
+    }
+  }
+
+  /** Serializes the CAS as XMI into a new zip entry named after the document sequence. */
+  private void writeDocument(JCas jcas, int docseq) throws AnalysisEngineProcessException {
+    ZipEntry ze = new ZipEntry("doc_"+docseq);
+    ze.setMethod(ZipEntry.DEFLATED);
+    try {
+      zos.putNextEntry(ze);
+      // write XMI
+      XmiCasSerializer ser = new XmiCasSerializer(jcas.getTypeSystem());
+      XMLSerializer xmlSer = new XMLSerializer(zos, false);
+      ser.serialize(jcas.getCas(), xmlSer.getContentHandler());
+      zos.closeEntry();
+    } catch (IOException e) {
+      throw new AnalysisEngineProcessException(e);
+    } catch (SAXException e) {
+      throw new AnalysisEngineProcessException(e);
+    }
+  }
+
+  /** Closes an output that was never flushed; close failures are logged, not propagated. */
+  private void closeAbandonedOutput() {
+    if (zos == null && fos == null) {
+      return;
+    }
+    logger.log(Level.WARNING, "DuccTextCC: Closing unflushed output "+outputFilename);
+    try {
+      closeStreams();
+    } catch (IOException e) {
+      logger.log(Level.WARNING, "DuccTextCC: Error closing unflushed output "+outputFilename+": "+e);
+    }
+    outputFilename = null;
+    outFile = null;
+  }
+
+  /** Closes the zip and file streams and clears both fields, even when closing the zip fails. */
+  private void closeStreams() throws IOException {
+    ZipOutputStream zip = zos;
+    FileOutputStream file = fos;
+    zos = null;
+    fos = null;
+    try {
+      if (zip != null) {
+        zip.close();
+      }
+    } finally {
+      if (file != null) {
+        file.close();
+      }
+    }
+  }
+
+}
Added: uima/sandbox/uima-ducc/trunk/uima-ducc-examples/src/main/java/org/apache/uima/ducc/sampleapps/DuccTextCM.java
URL: http://svn.apache.org/viewvc/uima/sandbox/uima-ducc/trunk/uima-ducc-examples/src/main/java/org/apache/uima/ducc/sampleapps/DuccTextCM.java?rev=1510744&view=auto
==============================================================================
--- uima/sandbox/uima-ducc/trunk/uima-ducc-examples/src/main/java/org/apache/uima/ducc/sampleapps/DuccTextCM.java (added)
+++ uima/sandbox/uima-ducc/trunk/uima-ducc-examples/src/main/java/org/apache/uima/ducc/sampleapps/DuccTextCM.java Mon Aug 5 21:34:40 2013
@@ -0,0 +1,300 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.uima.ducc.sampleapps;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.nio.channels.FileChannel;
+import java.util.Arrays;
+import java.util.Iterator;
+
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_component.JCasMultiplier_ImplBase;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.cas.AbstractCas;
+import org.apache.uima.cas.FeatureStructure;
+import org.apache.uima.ducc.Workitem;
+import org.apache.uima.ducc.sampleapps.DuccDocumentInfo;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.apache.uima.util.Level;
+import org.apache.uima.util.Logger;
+
+public class DuccTextCM extends JCasMultiplier_ImplBase {
+ // Holds the bytes of the current work item; findnextdoc() may append the following
+ // block at offset bytelength when a document runs past the end of the work item.
+ private byte[] buffer = null;
+ // Size bookkeeping for buffer, consulted by process() to decide whether to re-allocate.
+ private int buffsize;
+ // Input stream for the current work item's file, opened in openInputFile().
+ private FileInputStream fis;
+ // Input and output specs copied from the work item; stored in every DuccDocumentInfo.
+ private String inputFileName;
+ private String outputFileName;
+ // Language copied from the work item and set on every CAS returned by next().
+ private String language;
+ // Character encoding used to decode document bytes; "UTF-8" when the work item has none.
+ private String encoding;
+ // Text of the document located by the most recent successful findnextdoc() call.
+ private String nextDoc;
+ // Index within buffer at which nextDoc starts.
+ private int nextDocOffset;
+ // Number of valid bytes in buffer: the work item's byte length, plus whatever
+ // findnextdoc() read from the following block.
+ private int bytelength;
+ // Block index copied from the work item.
+ private int blockindex;
+ // True from openInputFile() until findnextdoc() has loaded the work item into buffer.
+ private boolean newWI;
+ // Set once the following block has been read to complete a document; hasNext() then
+ // reports no further documents for this work item.
+ private boolean spilled;
+ // NOTE(review): never assigned anywhere in this class, so it keeps its default (false);
+ // it is only read by the FIRSTDOC scan in findnextdoc().
+ private boolean firstdoc;
+ // Last-block flag copied from the work item.
+ private boolean lastblock;
+ // Sequence number of the next document within the current work item; reset by process().
+ private int docInWI;
+ // Size of the input file as reported by its channel in openInputFile().
+ private long filesize;
+ // Work item found in the CAS most recently passed to process().
+ private Workitem wi;
+ // Scan position within buffer used by findnextdoc().
+ private int currentindex;
+ // Logger obtained from the UIMA context in initialize().
+ private Logger logger;
+ // Channel of fis, obtained in openInputFile(). Package-private in the original source.
+ FileChannel fc;
+
+ // Scanning modes for findnextdoc(): FIRSTDOC is selected by process() for block 0,
+ // NORMAL for every other block, and SEP_IN_LASTBLOCK is used by findnextdoc() itself
+ // after it read the following block while skipping separator newlines.
+ private enum NextDoc { FIRSTDOC, SEP_IN_LASTBLOCK, NORMAL };
+ // Mode handed to findnextdoc() by hasNext().
+ private NextDoc strategy;
+
+ // Buffer size in bytes used by process() when the work item's block size is not positive.
+ private final int DEFAULT_BUFFER_SIZE = 20000000;
+
+ public boolean hasNext() throws AnalysisEngineProcessException {
+ if (spilled) {
+ return false;
+ }
+ try {
+ return findnextdoc(strategy);
+ } catch (IOException e) {
+ throw new AnalysisEngineProcessException(e);
+ }
+ }
+
+ public AbstractCas next() throws AnalysisEngineProcessException {
+ JCas newcas = getEmptyJCas();
+ newcas.setDocumentText(getNextDocument());
+ newcas.setDocumentLanguage(language);
+ DuccDocumentInfo di = new DuccDocumentInfo(newcas);
+ di.setInputfile(inputFileName);
+ di.setOutputfile(outputFileName);
+ di.setDocseq(docInWI++);
+ di.setByteoffset(wi.getBlockindex() * wi.getBlocksize() + nextDocOffset);
+ di.addToIndexes();
+ return newcas;
+ }
+
+ @Override
+ public void process(JCas jcas) throws AnalysisEngineProcessException {
+ Iterator<FeatureStructure> fsit = jcas.getIndexRepository().getAllIndexedFS(jcas.getCasType(Workitem.type));
+ if (!fsit.hasNext()) {
+ throw new AnalysisEngineProcessException(new RuntimeException("No workitem FS in CAS"));
+ }
+ wi = (Workitem) fsit.next();
+ logger.log(Level.INFO, "DuccTextCM: "+wi.getInputspec()+" at block "+wi.getBlockindex()+" length "+wi.getBytelength()+
+ " offset "+wi.getBlockindex() * wi.getBlocksize()+" outputs "+wi.getOutputspec());
+ try {
+ openInputFile(wi);
+ } catch (IOException e) {
+ throw new AnalysisEngineProcessException(e);
+ }
+
+ if (buffer == null) {
+ if (wi.getBlocksize()>0) {
+ buffer = new byte[wi.getBlocksize() * 2];
+ buffsize = wi.getBlocksize() * 2;
+ }
+ else {
+ buffer = new byte[DEFAULT_BUFFER_SIZE];
+ buffsize = DEFAULT_BUFFER_SIZE;
+ }
+ }
+ else {
+ if (wi.getBytelength() > buffsize) {
+ buffer = new byte[wi.getBytelength() * 2];
+ buffsize = wi.getBytelength();
+ }
+ }
+
+ spilled = false;
+ docInWI = 0;
+ strategy = (blockindex == 0) ? NextDoc.FIRSTDOC : NextDoc.NORMAL;
+ }
+
+
+ /**
+  * Initializes the component and keeps the logger supplied by the UIMA context.
+  *
+  * @param aContext the UIMA context for this component
+  * @throws ResourceInitializationException if the superclass initialization fails
+  */
+ @Override
+ public void initialize(UimaContext aContext) throws ResourceInitializationException {
+   super.initialize(aContext);
+   logger = aContext.getLogger();
+ }
+
+
+ /**
+  * Copies the work item's settings into this component and opens its input file
+  * positioned at the first byte of the work item's block.
+  *
+  * @param wi the work item describing the input file, block and output
+  * @throws IOException if the file cannot be opened or positioned, or if the block's
+  *         start offset lies beyond the end of the file
+  */
+ private void openInputFile(Workitem wi) throws IOException {
+   inputFileName = wi.getInputspec();
+   outputFileName = wi.getOutputspec();
+   bytelength = wi.getBytelength();
+   blockindex = wi.getBlockindex();
+   lastblock = wi.getLastBlock();
+   language = wi.getLanguage();
+   encoding = (null==wi.getEncoding()) ? "UTF-8" : wi.getEncoding();
+
+   // widen before multiplying so offsets beyond 2 GB do not overflow int arithmetic
+   long start = (long) wi.getBlockindex() * wi.getBlocksize();
+
+   fis = new FileInputStream(new File(inputFileName));
+   boolean positioned = false;
+   try {
+     fc = fis.getChannel();
+     filesize = fc.size();
+     if (start > filesize) {
+       throw new IOException("Specified start position beyond end of input file "+inputFileName);
+     }
+     // The channel shares its position with the stream. Unlike skip(), which may skip
+     // fewer bytes than requested, position() either succeeds or throws.
+     fc.position(start);
+     positioned = true;
+   } finally {
+     if (!positioned) {
+       // do not leak the file handle when the work item cannot be processed
+       try {
+         fis.close();
+       } catch (IOException ignored) {
+         // the failure that brought us here is the one worth reporting
+       }
+     }
+   }
+   newWI = true;
+ }
+
+ /**
+  * Locates the next document in the buffer and stores it in nextDoc / nextDocOffset.
+  * <p>
+  * Documents are delimited by two consecutive newline bytes (value 10). On the first call
+  * for a work item the work item's bytes are read into the buffer. If a document (or the
+  * separator that precedes it) reaches the end of the work item and this is not the last
+  * block, the following block is read in behind the current data, the input stream is
+  * closed and spilled is set so that no further document is looked for afterwards.
+  *
+  * @param condition scanning mode: FIRSTDOC skips leading newlines of block 0, NORMAL
+  *        first advances to the next separator, SEP_IN_LASTBLOCK continues in freshly
+  *        read data of the following block
+  * @return true if a document was found, false if the work item holds no further document
+  * @throws IOException if the work item cannot be read completely or reading the
+  *         following block yields no data
+  * @throws RuntimeException if no separator is found after continuing in the following
+  *         block, or (guarded by firstdoc) if the first block holds only newlines
+  */
+ private boolean findnextdoc(NextDoc condition) throws IOException {
+   int startloc=-1;
+
+   if (newWI) {
+     // first call for this work item: load its bytes into the start of the buffer
+     newWI = false;
+     int len = fis.read(buffer,0,bytelength);
+     if (len != bytelength) {
+       throw new IOException("Read "+len+" bytes, expected "+bytelength);
+     }
+     currentindex = 0;
+   }
+
+   if (condition.equals(NextDoc.SEP_IN_LASTBLOCK)) {
+     // separator found at end of last block
+     if (10 == buffer[currentindex] && 10 == buffer[currentindex+1]) {
+       return false;
+     }
+     if (10 == buffer[currentindex]) {
+       currentindex++; // point at first char in Doc
+     }
+     startloc=currentindex;
+
+     // find end of next doc
+     int endloc=0; // 0 doubles as the "no separator found" marker
+     while (currentindex < (bytelength-1)) {
+       if (10 == buffer[currentindex] && 10 == buffer[currentindex+1]) {
+         endloc = currentindex - 1;
+         break;
+       }
+       else {
+         currentindex++;
+       }
+     }
+     if (endloc == 0) {
+       throw new RuntimeException("Document larger than "+bytelength+" found in "+inputFileName+" block "+blockindex);
+     }
+     // NOTE(review): copyOfRange treats endloc as exclusive, so with endloc = currentindex - 1
+     // the byte directly before the separator is not copied -- confirm this is intended.
+     byte [] docbytes = Arrays.copyOfRange(buffer, startloc, endloc);
+     nextDoc = new String(docbytes, encoding);
+     nextDocOffset = startloc;
+     return true;
+   }
+
+   if (condition.equals(NextDoc.FIRSTDOC)) {
+     // special handling at beginning of first block
+     // skip any leading EOL to find start of first doc
+     // only execute this once
+     strategy = NextDoc.NORMAL;
+     // NOTE(review): firstdoc is never set in this class, so the exception below cannot
+     // fire and the scan is not bounded by bytelength -- confirm the intended behavior
+     // for a first block consisting only of newlines.
+     while (10 == buffer[currentindex]) {
+       currentindex++;
+       if (currentindex == bytelength) {
+         if (firstdoc) {
+           throw new RuntimeException("All newlines found in "+inputFileName+" block "+blockindex);
+         }
+       }
+     }
+   }
+
+   if (condition.equals(NextDoc.NORMAL)) {
+     // currentindex either pointing at start of a segmentation, or
+     // if a new block then possibly the middle of a previous document
+     if (!(10 == buffer[currentindex] && 10 == buffer[currentindex+1])) {
+       // in the middle of a spilled Doc. Find next segmentation
+       while (currentindex < (bytelength-1)) {
+         if (10 == buffer[currentindex] && 10 == buffer[currentindex+1]) {
+           break;
+         }
+         else {
+           currentindex++;
+         }
+       }
+     }
+     if ( currentindex == bytelength-1) {
+       // no separator left in this work item, hence no further document starts here
+       fis.close();
+       return false;
+     }
+     // now pointing at start of a segmentation, find start/end of next Doc
+     while (10 == buffer[currentindex]) {
+       currentindex++;
+       if (currentindex == bytelength) {
+         if (lastblock) {
+           fis.close();
+           return false;
+         }
+         // read next block and continue looking for end of Doc
+         int len = fis.read(buffer,bytelength,bytelength);
+         if (len <= 0) {
+           // parenthesized so the message names the following block number instead of
+           // appending the digit 1 to the current one
+           throw new IOException("Read "+len+" bytes for "+inputFileName+" block "+(blockindex+1));
+         }
+         fis.close();
+         spilled = true;
+         bytelength += len;
+         return findnextdoc(NextDoc.SEP_IN_LASTBLOCK);
+       }
+     }
+   }
+
+   startloc = currentindex;
+   // find end of Doc
+   int endloc=0; // 0 doubles as the "no separator found" marker
+   while (currentindex < (bytelength-1)) {
+     if (10 == buffer[currentindex] && 10 == buffer[currentindex+1]) {
+       endloc = currentindex - 1;
+       break;
+     }
+     else {
+       currentindex++;
+     }
+   }
+
+   if (endloc == 0) {
+     if (lastblock) {
+       endloc = bytelength-1;
+     }
+     else {
+       // read next block and continue looking for end of Doc
+       int len = fis.read(buffer,bytelength,bytelength);
+       if (len <= 0) {
+         // parenthesized so the message names the following block number instead of
+         // appending the digit 1 to the current one
+         throw new IOException("Read "+len+" bytes for "+inputFileName+" block "+(blockindex+1));
+       }
+       fis.close();
+       spilled = true;
+       bytelength += len;
+     }
+     while (currentindex < (bytelength-1)) {
+       if (10 == buffer[currentindex] && 10 == buffer[currentindex+1]) {
+         endloc = currentindex - 1;
+         break;
+       }
+       else {
+         currentindex++;
+       }
+     }
+     // whether or not a separator was found, the document ends relative to currentindex
+     endloc = currentindex - 1;
+   }
+   // NOTE(review): copyOfRange treats endloc as exclusive, so with endloc = currentindex - 1
+   // the byte directly before the separator is not copied -- confirm this is intended.
+   byte [] docbytes = Arrays.copyOfRange(buffer, startloc, endloc);
+   nextDoc = new String(docbytes, encoding);
+   nextDocOffset = startloc;
+   return true;
+ }
+
+ /**
+  * Returns the text of the document located by the most recent findnextdoc() call.
+  *
+  * @return the current document text, or null if none has been found yet
+  */
+ private String getNextDocument() {
+   return this.nextDoc;
+ }
+
+}
Added: uima/sandbox/uima-ducc/trunk/uima-ducc-examples/src/main/resources/org/apache/uima/ducc/sampleapps/DuccDocumentInfoTS.xml
URL: http://svn.apache.org/viewvc/uima/sandbox/uima-ducc/trunk/uima-ducc-examples/src/main/resources/org/apache/uima/ducc/sampleapps/DuccDocumentInfoTS.xml?rev=1510744&view=auto
==============================================================================
--- uima/sandbox/uima-ducc/trunk/uima-ducc-examples/src/main/resources/org/apache/uima/ducc/sampleapps/DuccDocumentInfoTS.xml (added)
+++ uima/sandbox/uima-ducc/trunk/uima-ducc-examples/src/main/resources/org/apache/uima/ducc/sampleapps/DuccDocumentInfoTS.xml Mon Aug 5 21:34:40 2013
@@ -0,0 +1,35 @@
+<?xml version="1.0" encoding="UTF-8"?><typeSystemDescription xmlns="http://uima.apache.org/resourceSpecifier">
+ <name>Ducc Document Info</name>
+ <description>Type for communication between CR, CM, FC and CC</description>
+ <version>1.0</version>
+ <vendor>Apache UIMA</vendor>
+ <types>
+ <typeDescription>
+ <name>org.apache.uima.ducc.sampleapps.DuccDocumentInfo</name>
+ <description/>
+ <supertypeName>uima.cas.TOP</supertypeName>
+ <features>
+ <featureDescription>
+ <name>inputfile</name>
+ <description/>
+ <rangeTypeName>uima.cas.String</rangeTypeName>
+ </featureDescription>
+ <featureDescription>
+ <name>outputfile</name>
+ <description/>
+ <rangeTypeName>uima.cas.String</rangeTypeName>
+ </featureDescription>
+ <featureDescription>
+ <name>docseq</name>
+ <description>document sequence within work item</description>
+ <rangeTypeName>uima.cas.Integer</rangeTypeName>
+ </featureDescription>
+ <featureDescription>
+ <name>byteoffset</name>
+ <description>offset of byte location of first character in document</description>
+ <rangeTypeName>uima.cas.Integer</rangeTypeName>
+ </featureDescription>
+ </features>
+ </typeDescription>
+ </types>
+</typeSystemDescription>
\ No newline at end of file
Added: uima/sandbox/uima-ducc/trunk/uima-ducc-examples/src/main/resources/org/apache/uima/ducc/sampleapps/DuccJobTextCR.xml
URL: http://svn.apache.org/viewvc/uima/sandbox/uima-ducc/trunk/uima-ducc-examples/src/main/resources/org/apache/uima/ducc/sampleapps/DuccJobTextCR.xml?rev=1510744&view=auto
==============================================================================
--- uima/sandbox/uima-ducc/trunk/uima-ducc-examples/src/main/resources/org/apache/uima/ducc/sampleapps/DuccJobTextCR.xml (added)
+++ uima/sandbox/uima-ducc/trunk/uima-ducc-examples/src/main/resources/org/apache/uima/ducc/sampleapps/DuccJobTextCR.xml Mon Aug 5 21:34:40 2013
@@ -0,0 +1,117 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+ <!--
+ ***************************************************************
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ ***************************************************************
+ -->
+
+<collectionReaderDescription xmlns="http://uima.apache.org/resourceSpecifier">
+ <frameworkImplementation>org.apache.uima.java</frameworkImplementation>
+ <implementationName>org.apache.uima.ducc.sampleapps.DuccJobTextCR</implementationName>
+ <processingResourceMetaData>
+ <name>DuccJobTextCR</name>
+ <description>Generates CASes with reference to input.</description>
+ <version>1.0</version>
+ <vendor>Apache UIMA</vendor>
+ <configurationParameters>
+ <configurationParameter>
+ <name>InputDirectory</name>
+ <type>String</type>
+ <multiValued>false</multiValued>
+ <mandatory>true</mandatory>
+ </configurationParameter>
+ <configurationParameter>
+ <name>OutputDirectory</name>
+ <description>The base output directory</description>
+ <type>String</type>
+ <multiValued>false</multiValued>
+ <mandatory>true</mandatory>
+ </configurationParameter>
+ <configurationParameter>
+ <name>IgnorePreviousOutput</name>
+ <type>Boolean</type>
+ <multiValued>false</multiValued>
+ <mandatory>false</mandatory>
+ </configurationParameter>
+ <configurationParameter>
+ <name>Encoding</name>
+ <type>String</type>
+ <multiValued>false</multiValued>
+ <mandatory>false</mandatory>
+ </configurationParameter>
+ <configurationParameter>
+ <name>Language</name>
+ <type>String</type>
+ <multiValued>false</multiValued>
+ <mandatory>false</mandatory>
+ </configurationParameter>
+ <configurationParameter>
+ <name>BlockSize</name>
+ <type>Integer</type>
+ <multiValued>false</multiValued>
+ <mandatory>false</mandatory>
+ </configurationParameter>
+ <configurationParameter>
+ <name>SendToLast</name>
+ <type>Boolean</type>
+ <multiValued>false</multiValued>
+ <mandatory>false</mandatory>
+ </configurationParameter>
+ <configurationParameter>
+ <name>Debug</name>
+ <type>Boolean</type>
+ <multiValued>false</multiValued>
+ <mandatory>false</mandatory>
+ </configurationParameter>
+ </configurationParameters>
+ <configurationParameterSettings>
+ <nameValuePair>
+ <name>InputDirectory</name>
+ <value>
+ <string>/tmp</string>
+ </value>
+ </nameValuePair>
+ <nameValuePair>
+ <name>OutputDirectory</name>
+ <value>
+ <string>/tmp</string>
+ </value>
+ </nameValuePair>
+ </configurationParameterSettings>
+ <typeSystemDescription>
+ <imports>
+ <import name="org.apache.uima.ducc.common.uima.DuccJobFlowControlTS"/>
+ </imports>
+ </typeSystemDescription>
+ <typePriorities/>
+ <fsIndexCollection/>
+ <capabilities>
+ <capability>
+ <inputs/>
+ <outputs/>
+ <languagesSupported/>
+ </capability>
+ </capabilities>
+ <operationalProperties>
+ <modifiesCas>true</modifiesCas>
+ <multipleDeploymentAllowed>false</multipleDeploymentAllowed>
+ <outputsNewCASes>true</outputsNewCASes>
+ </operationalProperties>
+ </processingResourceMetaData>
+</collectionReaderDescription>
Added: uima/sandbox/uima-ducc/trunk/uima-ducc-examples/src/main/resources/org/apache/uima/ducc/sampleapps/DuccTextCC.xml
URL: http://svn.apache.org/viewvc/uima/sandbox/uima-ducc/trunk/uima-ducc-examples/src/main/resources/org/apache/uima/ducc/sampleapps/DuccTextCC.xml?rev=1510744&view=auto
==============================================================================
--- uima/sandbox/uima-ducc/trunk/uima-ducc-examples/src/main/resources/org/apache/uima/ducc/sampleapps/DuccTextCC.xml (added)
+++ uima/sandbox/uima-ducc/trunk/uima-ducc-examples/src/main/resources/org/apache/uima/ducc/sampleapps/DuccTextCC.xml Mon Aug 5 21:34:40 2013
@@ -0,0 +1,58 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+ <!--
+ ***************************************************************
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ ***************************************************************
+ -->
+
+<analysisEngineDescription xmlns="http://uima.apache.org/resourceSpecifier">
+ <frameworkImplementation>org.apache.uima.java</frameworkImplementation>
+ <primitive>true</primitive>
+ <annotatorImplementationName>org.apache.uima.ducc.sampleapps.DuccTextCC</annotatorImplementationName>
+ <analysisEngineMetaData>
+ <name>DuccTextCC</name>
+ <description/>
+ <version>1.0</version>
+ <vendor/>
+ <configurationParameters/>
+ <configurationParameterSettings/>
+ <typeSystemDescription>
+ <imports>
+ <import name="org.apache.uima.ducc.common.uima.DuccJobFlowControlTS"/>
+ <import name="org.apache.uima.ducc.sampleapps.DuccDocumentInfoTS"/>
+ </imports>
+ </typeSystemDescription>
+ <typePriorities/>
+ <fsIndexCollection/>
+ <capabilities>
+ <capability>
+ <inputs/>
+ <outputs/>
+ <languagesSupported/>
+ </capability>
+ </capabilities>
+ <operationalProperties>
+ <modifiesCas>true</modifiesCas>
+ <multipleDeploymentAllowed>true</multipleDeploymentAllowed>
+ <outputsNewCASes>false</outputsNewCASes>
+ </operationalProperties>
+ </analysisEngineMetaData>
+ <resourceManagerConfiguration/>
+</analysisEngineDescription>
+
\ No newline at end of file
Added: uima/sandbox/uima-ducc/trunk/uima-ducc-examples/src/main/resources/org/apache/uima/ducc/sampleapps/DuccTextCM.xml
URL: http://svn.apache.org/viewvc/uima/sandbox/uima-ducc/trunk/uima-ducc-examples/src/main/resources/org/apache/uima/ducc/sampleapps/DuccTextCM.xml?rev=1510744&view=auto
==============================================================================
--- uima/sandbox/uima-ducc/trunk/uima-ducc-examples/src/main/resources/org/apache/uima/ducc/sampleapps/DuccTextCM.xml (added)
+++ uima/sandbox/uima-ducc/trunk/uima-ducc-examples/src/main/resources/org/apache/uima/ducc/sampleapps/DuccTextCM.xml Mon Aug 5 21:34:40 2013
@@ -0,0 +1,57 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+ <!--
+ ***************************************************************
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ ***************************************************************
+ -->
+
+<analysisEngineDescription xmlns="http://uima.apache.org/resourceSpecifier">
+ <frameworkImplementation>org.apache.uima.java</frameworkImplementation>
+ <primitive>true</primitive>
+ <annotatorImplementationName>org.apache.uima.ducc.sampleapps.DuccTextCM</annotatorImplementationName>
+ <analysisEngineMetaData>
+ <name>DuccTextCM</name>
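+ <description>Reads the block of the input text file described by the Workitem in the incoming CAS and outputs one CAS per document, where documents are separated by blank lines.</description>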
+ <version>1.0</version>
+ <vendor/>
+ <configurationParameters/>
+ <configurationParameterSettings/>
+ <typeSystemDescription>
+ <imports>
+ <import name="org.apache.uima.ducc.common.uima.DuccJobFlowControlTS"/>
+ <import name="org.apache.uima.ducc.sampleapps.DuccDocumentInfoTS"/>
+ </imports>
+ </typeSystemDescription>
+ <typePriorities/>
+ <fsIndexCollection/>
+ <capabilities>
+ <capability>
+ <inputs/>
+ <outputs/>
+ <languagesSupported/>
+ </capability>
+ </capabilities>
+ <operationalProperties>
+ <modifiesCas>true</modifiesCas>
+ <multipleDeploymentAllowed>true</multipleDeploymentAllowed>
+ <outputsNewCASes>true</outputsNewCASes>
+ </operationalProperties>
+ </analysisEngineMetaData>
+ <resourceManagerConfiguration/>
+</analysisEngineDescription>