You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by ja...@apache.org on 2012/10/31 06:26:55 UTC
svn commit: r1403989 [14/28] - in /incubator/ctakes/branches/SHARPn-cTAKES:
Constituency Parser/src/org/chboston/cnlp/ctakes/parser/ Constituency
Parser/src/org/chboston/cnlp/ctakes/parser/uima/ae/ Constituency
Parser/src/org/chboston/cnlp/ctakes/parse...
Modified: incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ci/HyphenTextModifierImpl.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ci/HyphenTextModifierImpl.java?rev=1403989&r1=1403988&r2=1403989&view=diff
==============================================================================
--- incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ci/HyphenTextModifierImpl.java (original)
+++ incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ci/HyphenTextModifierImpl.java Wed Oct 31 05:26:43 2012
@@ -1,18 +1,11 @@
/*
- * Copyright: (c) 2009 Mayo Foundation for Medical Education and
- * Research (MFMER). All rights reserved. MAYO, MAYO CLINIC, and the
- * triple-shield Mayo logo are trademarks and service marks of MFMER.
- *
- * Except as contained in the copyright notice above, or as used to identify
- * MFMER as the author of this software, the trade names, trademarks, service
- * marks, or product names of the copyright holder shall not be used in
- * advertising, promotion or otherwise in connection with this software without
- * prior written authorization of the copyright holder.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
@@ -21,287 +14,287 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-/*
- * Created on May 23, 2005
- *
- * To change the template for this generated file go to
- * Window>Preferences>Java>Code Generation>Code and Comments
- */
-package edu.mayo.bmi.uima.core.ci;
-
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileNotFoundException;
-import java.io.FileReader;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-
-import edu.mayo.bmi.nlp.tokenizer.Token;
-import edu.mayo.bmi.nlp.tokenizer.Tokenizer;
-
-/**
- * @author Mayo Clinic
- *
- */
-public class HyphenTextModifierImpl implements TextModifier {
-
- private Map iv_shouldbeHyphenMap = null;
- private int iv_windowSize = 3; // default lookahead window
- private Tokenizer iv_tokenizer = null;
-
- /**
- * Default constructor takes a name of the file containing hyphenated
- * phrases, with their frequency.
- * Currently the frequency is unused.<br>
- * The case of the words in the file is unimportant - we lowercase
- * everything when doing compares.<br>
- * The file is delimited with "|" and has two fields:<br>
- * hyphen-term|frequency
- */
- public HyphenTextModifierImpl(String hyphenfilename, int windowSize) {
- iv_windowSize = windowSize;
- iv_tokenizer = new Tokenizer();
- BufferedReader br;
- try {
- br = new BufferedReader(new FileReader(new File(hyphenfilename)));
-
- String line = "";
-
- iv_shouldbeHyphenMap = new HashMap();
- while ((line = br.readLine()) != null) {
- String[] toks = line.split("\\|");
- String[] unh = toks[0].split("\\-");
- String shouldbehyphen = "";
- for (int i = 0; i < unh.length; i++) {
- shouldbehyphen += " " + unh[i];
- }
- shouldbehyphen = shouldbehyphen.trim().toLowerCase();
- iv_shouldbeHyphenMap.put(shouldbehyphen, new Integer(1));
- }
- } catch (FileNotFoundException e) {
- System.err.println("Cannot find the hyphenation file:" + hyphenfilename);
- e.printStackTrace();
- } catch (IOException e) {
- System.err.println("IOException accessing the hyphenation file:" + hyphenfilename);
- e.printStackTrace();
- }
-
- }
-
- /**
- * Filters out unwanted tokens - newlines.
- *
- * @param tokenList
- */
- private void filterTokens(List tokenList) {
-
- List removalList = new ArrayList();
- Iterator tokenItr = tokenList.iterator();
-
- while (tokenItr.hasNext()) {
- Token token = (Token) tokenItr.next();
- if (token.getType() == Token.TYPE_EOL) {
- removalList.add(token);
- }
- }
-
- tokenList.removeAll(removalList);
- }
-
- /*
- * (non-Javadoc)
- *
- * @see edu.mayo.bmi.uima.util.ci.TextModifier#modify(java.lang.String)
- */
- public TextModification[] modify(String in) throws Exception {
-
- // intermediate data structure to use for easy adding of new
- // TextModification objects
- ArrayList textmods = new ArrayList();
-
- // Tokenize the input to get offset information
- List inputtoks = iv_tokenizer.tokenizeAndSort(in);
-
- filterTokens(inputtoks);
-
- int orig_startOffset = 0;
- int orig_endOffset = 0;
- int new_startOffset = 0;
- int new_endOffset = 0;
-
- int i = 0;
- int j = 0;
- int end_offset_adj = 0;
- int start_offset_adj = 0;
-
- while (i < inputtoks.size()) {
-
- if (inputtoks.size() - (i + 1) < iv_windowSize) {
- j = inputtoks.size() - 1;
- } else {
- j = i + iv_windowSize;
- }
-
- while (j > i) {
-
- StringBuffer candSB = new StringBuffer();
- for (int k = i; k <= j; k++) {
- Token currtok = (Token) inputtoks.get(k);
- candSB.append(" ");
- candSB.append(currtok.getText());
- }
- String cand = candSB.toString().trim();
-
- // Attempt to look up the candidate in the hyphen map
- if (iv_shouldbeHyphenMap.containsKey(cand.toLowerCase())) {
-
- // set the initial offsets
- orig_startOffset = ((Token) inputtoks.get(i)).getStartOffset();
- orig_endOffset = ((Token) inputtoks.get(j)).getEndOffset();
- new_startOffset = orig_startOffset;
- new_endOffset = orig_endOffset;
-
- // compile new text
- String newText = "";
- for (int k = i; k <= j; k++) {
- Token currtok = (Token) inputtoks.get(k);
- newText += currtok.getText() + "-";
- }
- newText = newText.substring(0, newText.length() - 1);
-
- // Get the new and old lengths of hyphenated spans
- int new_Length = newText.length();
- int orig_Length = orig_endOffset - orig_startOffset;
-
- // Pad the end offset adjuster by the new amount
- end_offset_adj += orig_Length - new_Length;
-
- // Create a new modification object
- TextModification tm = new TextModification(orig_startOffset, orig_endOffset, new_startOffset
- - start_offset_adj, new_endOffset - end_offset_adj, newText);
-
- // Adjust the start offset on the next Text Modification
- // object
- start_offset_adj += orig_Length - new_Length;
-
- // Put the newly created TextMod object into a temporary
- // holder
- textmods.add(tm);
-
- i = j;
- }
- j--;
- }
-
- i++;
- }
-
- // generate the expected return as an array of TextModification objects
- TextModification[] tma = new TextModification[textmods.size()];
- for (int y = 0; y < tma.length; y++) {
- tma[y] = (TextModification) textmods.get(y);
- }
-
- return tma;
- }
-
-
- /**
- * Apply text modifier to the text <br>
- * TODO - move this to <code>TextModifier</code> and take a <code>Logger</code>
- * See <code>HyphenTextModifierImpl</code>
- * @param tm TextModifier to apply
- * @param text Original text
- * @param sb Buffer containing text to apply modifier to
- * @return unableToModifyText true if modifier would require offset changes, which is not supported by this method
- * @throws Exception
- */
- private static boolean applyTextModifier(TextModifier tm, String text, StringBuffer sb) throws Exception {
- boolean unableToModifyText = false;
- TextModification[] textModArr = tm.modify(text);
- for (int i = 0; i < textModArr.length; i++) {
-
- TextModification textMod = textModArr[i];
-
- if ((textMod.getOrigStartOffset() != textMod.getNewStartOffset())
- || (textMod.getOrigEndOffset() != textMod.getNewEndOffset())) {
- System.err.println("UNSUPPORTED: TextModification with offset changes.");
- unableToModifyText = true;
- }
- else {
- sb.replace(textMod.getOrigStartOffset(),
- textMod.getOrigEndOffset(),
- textMod.getNewText());
- }
- }
- return unableToModifyText;
- }
-
- public static ArrayList<String> test(HyphenTextModifierImpl tm, String text) {
- ArrayList<String> messages = new ArrayList<String>();
- try {
- TextModification[] tma = tm.modify(text);
- StringBuffer sb = new StringBuffer(text);
- boolean errorModifyingText = applyTextModifier(tm,text,sb);
- messages.add("Orig: " + text);
- if (!errorModifyingText) {
- messages.add("New: " + sb);
- }
- else {
- System.err.println("New: (new text not generated, see previous messages)");
- }
- // Regardless of whether was able to modify the text
- // without
- // (_apply_ the TextModifier), output the
- // the
- for (int u = 0; u < tma.length; u++) {
- TextModification tmo = (TextModification) tma[u];
- messages.add(tmo.getNewText() + " Orig: " + tmo.getOrigStartOffset() + "-"
- + tmo.getOrigEndOffset() + " New: " + tmo.getNewStartOffset() + "-" + tmo.getNewEndOffset());
- }
- } catch (Exception e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- }
- return messages;
-
- }
- /**
- * Simple tests of <code>TextModification</code>
- * <br>
- * Output expected:<br>
- * UNSUPPORTED: TextModification with offset changes.<br>
- * UNSUPPORTED: TextModification with offset changes.<br>
- * UNSUPPORTED: TextModification with offset changes.<br>
- * Orig: Non Hodgkin's the x ray without any non small cell complications.<br>
- * New: (new text not generated, see previous messages)
- * Non-Hodgkin Orig: 0-12 New: 0-11<br>
- * x-ray Orig: 19-25 New: 18-23<br>
- * non-small-cell Orig: 38-53 New: 36-50<br>
- *
- * Orig: Non Hodgkin's the x ray without any non small cell complications.<br>
- * New: Non-Hodgkin's the x-ray without any non-small-cell complications.<br>
- * Non-Hodgkin Orig: 0-11 New: 0-11<br>
- * x-ray Orig: 18-23 New: 18-23<br>
- * non-small-cell Orig: 36-50 New: 36-50<br>
- * Note the case of the words doesn't matter.
- * @param args hyphen text filename (each line: hyphenated-word|freq)
- */
- public static void main(String[] args) {
- ArrayList<String> messages;
- HyphenTextModifierImpl tm = new HyphenTextModifierImpl(args[0], 7);
-
- String t = "Non Hodgkin's the x ray without any non small cell complications.";
- messages = test(tm, t); // extra blanks
- for (String s : messages) { System.out.println(s); }
-
- t = t.replace(" ", " "); // change text to only have single blanks between words
- messages = test(tm, t); // single blanks
- for (String s : messages) { System.out.println(s); }
- }
-
-}
+/*
+ * Created on May 23, 2005
+ *
+ * To change the template for this generated file go to
+ * Window>Preferences>Java>Code Generation>Code and Comments
+ */
+package edu.mayo.bmi.uima.core.ci;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
+import edu.mayo.bmi.nlp.tokenizer.Token;
+import edu.mayo.bmi.nlp.tokenizer.Tokenizer;
+
+/**
+ * @author Mayo Clinic
+ *
+ */
+public class HyphenTextModifierImpl implements TextModifier {
+
+ private Map iv_shouldbeHyphenMap = null;
+ private int iv_windowSize = 3; // default lookahead window
+ private Tokenizer iv_tokenizer = null;
+
+ /**
+ * Default constructor takes a name of the file containing hyphenated
+ * phrases, with their frequency.
+ * Currently the frequency is unused.<br>
+ * The case of the words in the file is unimportant - we lowercase
+ * everything when doing compares.<br>
+ * The file is delimited with "|" and has two fields:<br>
+ * hyphen-term|frequency
+ */
+ public HyphenTextModifierImpl(String hyphenfilename, int windowSize) {
+ iv_windowSize = windowSize;
+ iv_tokenizer = new Tokenizer();
+ BufferedReader br;
+ try {
+ br = new BufferedReader(new FileReader(new File(hyphenfilename)));
+
+ String line = "";
+
+ iv_shouldbeHyphenMap = new HashMap();
+ while ((line = br.readLine()) != null) {
+ String[] toks = line.split("\\|");
+ String[] unh = toks[0].split("\\-");
+ String shouldbehyphen = "";
+ for (int i = 0; i < unh.length; i++) {
+ shouldbehyphen += " " + unh[i];
+ }
+ shouldbehyphen = shouldbehyphen.trim().toLowerCase();
+ iv_shouldbeHyphenMap.put(shouldbehyphen, new Integer(1));
+ }
+ } catch (FileNotFoundException e) {
+ System.err.println("Cannot find the hyphenation file:" + hyphenfilename);
+ e.printStackTrace();
+ } catch (IOException e) {
+ System.err.println("IOException accessing the hyphenation file:" + hyphenfilename);
+ e.printStackTrace();
+ }
+
+ }
+
+ /**
+ * Filters out unwanted tokens - newlines.
+ *
+ * @param tokenList
+ */
+ private void filterTokens(List tokenList) {
+
+ List removalList = new ArrayList();
+ Iterator tokenItr = tokenList.iterator();
+
+ while (tokenItr.hasNext()) {
+ Token token = (Token) tokenItr.next();
+ if (token.getType() == Token.TYPE_EOL) {
+ removalList.add(token);
+ }
+ }
+
+ tokenList.removeAll(removalList);
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see edu.mayo.bmi.uima.util.ci.TextModifier#modify(java.lang.String)
+ */
+ public TextModification[] modify(String in) throws Exception {
+
+ // intermediate data structure to use for easy adding of new
+ // TextModification objects
+ ArrayList textmods = new ArrayList();
+
+ // Tokenize the input to get offset information
+ List inputtoks = iv_tokenizer.tokenizeAndSort(in);
+
+ filterTokens(inputtoks);
+
+ int orig_startOffset = 0;
+ int orig_endOffset = 0;
+ int new_startOffset = 0;
+ int new_endOffset = 0;
+
+ int i = 0;
+ int j = 0;
+ int end_offset_adj = 0;
+ int start_offset_adj = 0;
+
+ while (i < inputtoks.size()) {
+
+ if (inputtoks.size() - (i + 1) < iv_windowSize) {
+ j = inputtoks.size() - 1;
+ } else {
+ j = i + iv_windowSize;
+ }
+
+ while (j > i) {
+
+ StringBuffer candSB = new StringBuffer();
+ for (int k = i; k <= j; k++) {
+ Token currtok = (Token) inputtoks.get(k);
+ candSB.append(" ");
+ candSB.append(currtok.getText());
+ }
+ String cand = candSB.toString().trim();
+
+ // Attempt to look up the candidate in the hyphen map
+ if (iv_shouldbeHyphenMap.containsKey(cand.toLowerCase())) {
+
+ // set the initial offsets
+ orig_startOffset = ((Token) inputtoks.get(i)).getStartOffset();
+ orig_endOffset = ((Token) inputtoks.get(j)).getEndOffset();
+ new_startOffset = orig_startOffset;
+ new_endOffset = orig_endOffset;
+
+ // compile new text
+ String newText = "";
+ for (int k = i; k <= j; k++) {
+ Token currtok = (Token) inputtoks.get(k);
+ newText += currtok.getText() + "-";
+ }
+ newText = newText.substring(0, newText.length() - 1);
+
+ // Get the new and old lengths of hyphenated spans
+ int new_Length = newText.length();
+ int orig_Length = orig_endOffset - orig_startOffset;
+
+ // Pad the end offset adjuster by the new amount
+ end_offset_adj += orig_Length - new_Length;
+
+ // Create a new modification object
+ TextModification tm = new TextModification(orig_startOffset, orig_endOffset, new_startOffset
+ - start_offset_adj, new_endOffset - end_offset_adj, newText);
+
+ // Adjust the start offset on the next Text Modification
+ // object
+ start_offset_adj += orig_Length - new_Length;
+
+ // Put the newly created TextMod object into a temporary
+ // holder
+ textmods.add(tm);
+
+ i = j;
+ }
+ j--;
+ }
+
+ i++;
+ }
+
+ // generate the expected return as an array of TextModification objects
+ TextModification[] tma = new TextModification[textmods.size()];
+ for (int y = 0; y < tma.length; y++) {
+ tma[y] = (TextModification) textmods.get(y);
+ }
+
+ return tma;
+ }
+
+
+ /**
+ * Apply text modifier to the text <br>
+ * TODO - move this to <code>TextModifier</code> and take a <code>Logger</code>
+ * See <code>HyphenTextModifierImpl</code>
+ * @param tm TextModifier to apply
+ * @param text Original text
+ * @param sb Buffer containing text to apply modifier to
+ * @return unableToModifyText true if modifier would require offset changes, which is not supported by this method
+ * @throws Exception
+ */
+ private static boolean applyTextModifier(TextModifier tm, String text, StringBuffer sb) throws Exception {
+ boolean unableToModifyText = false;
+ TextModification[] textModArr = tm.modify(text);
+ for (int i = 0; i < textModArr.length; i++) {
+
+ TextModification textMod = textModArr[i];
+
+ if ((textMod.getOrigStartOffset() != textMod.getNewStartOffset())
+ || (textMod.getOrigEndOffset() != textMod.getNewEndOffset())) {
+ System.err.println("UNSUPPORTED: TextModification with offset changes.");
+ unableToModifyText = true;
+ }
+ else {
+ sb.replace(textMod.getOrigStartOffset(),
+ textMod.getOrigEndOffset(),
+ textMod.getNewText());
+ }
+ }
+ return unableToModifyText;
+ }
+
+ public static ArrayList<String> test(HyphenTextModifierImpl tm, String text) {
+ ArrayList<String> messages = new ArrayList<String>();
+ try {
+ TextModification[] tma = tm.modify(text);
+ StringBuffer sb = new StringBuffer(text);
+ boolean errorModifyingText = applyTextModifier(tm,text,sb);
+ messages.add("Orig: " + text);
+ if (!errorModifyingText) {
+ messages.add("New: " + sb);
+ }
+ else {
+ System.err.println("New: (new text not generated, see previous messages)");
+ }
+ // Regardless of whether was able to modify the text
+ // without
+ // (_apply_ the TextModifier), output the
+ // the
+ for (int u = 0; u < tma.length; u++) {
+ TextModification tmo = (TextModification) tma[u];
+ messages.add(tmo.getNewText() + " Orig: " + tmo.getOrigStartOffset() + "-"
+ + tmo.getOrigEndOffset() + " New: " + tmo.getNewStartOffset() + "-" + tmo.getNewEndOffset());
+ }
+ } catch (Exception e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+ return messages;
+
+ }
+ /**
+ * Simple tests of <code>TextModification</code>
+ * <br>
+ * Output expected:<br>
+ * UNSUPPORTED: TextModification with offset changes.<br>
+ * UNSUPPORTED: TextModification with offset changes.<br>
+ * UNSUPPORTED: TextModification with offset changes.<br>
+ * Orig: Non Hodgkin's the x ray without any non small cell complications.<br>
+ * New: (new text not generated, see previous messages)
+ * Non-Hodgkin Orig: 0-12 New: 0-11<br>
+ * x-ray Orig: 19-25 New: 18-23<br>
+ * non-small-cell Orig: 38-53 New: 36-50<br>
+ *
+ * Orig: Non Hodgkin's the x ray without any non small cell complications.<br>
+ * New: Non-Hodgkin's the x-ray without any non-small-cell complications.<br>
+ * Non-Hodgkin Orig: 0-11 New: 0-11<br>
+ * x-ray Orig: 18-23 New: 18-23<br>
+ * non-small-cell Orig: 36-50 New: 36-50<br>
+ * Note the case of the words doesn't matter.
+ * @param args hyphen text filename (each line: hyphenated-word|freq)
+ */
+ public static void main(String[] args) {
+ ArrayList<String> messages;
+ HyphenTextModifierImpl tm = new HyphenTextModifierImpl(args[0], 7);
+
+ String t = "Non Hodgkin's the x ray without any non small cell complications.";
+ messages = test(tm, t); // extra blanks
+ for (String s : messages) { System.out.println(s); }
+
+ t = t.replace(" ", " "); // change text to only have single blanks between words
+ messages = test(tm, t); // single blanks
+ for (String s : messages) { System.out.println(s); }
+ }
+
+}
Modified: incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ci/TextModification.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ci/TextModification.java?rev=1403989&r1=1403988&r2=1403989&view=diff
==============================================================================
--- incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ci/TextModification.java (original)
+++ incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ci/TextModification.java Wed Oct 31 05:26:43 2012
@@ -1,18 +1,11 @@
/*
- * Copyright: (c) 2009 Mayo Foundation for Medical Education and
- * Research (MFMER). All rights reserved. MAYO, MAYO CLINIC, and the
- * triple-shield Mayo logo are trademarks and service marks of MFMER.
- *
- * Except as contained in the copyright notice above, or as used to identify
- * MFMER as the author of this software, the trade names, trademarks, service
- * marks, or product names of the copyright holder shall not be used in
- * advertising, promotion or otherwise in connection with this software without
- * prior written authorization of the copyright holder.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
@@ -21,62 +14,62 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package edu.mayo.bmi.uima.core.ci;
-
-/**
- * Value object class that describes a modification of document text. This
- * object tracks the original text and the new replacement text.
- */
-public class TextModification
-{
- private int iv_origStartOffset;
- private int iv_origEndOffset;
-
- private int iv_newStartOffset;
- private int iv_newEndOffset;
- private String iv_newText;
-
- /**
- * Constructor
- *
- * @param origStartOffset
- * @param origEndOffset
- * @param newStartOffset
- * @param newEndOffset
- * @param newText
- */
- public TextModification(int origStartOffset, int origEndOffset,
- int newStartOffset, int newEndOffset, String newText)
- {
- iv_origStartOffset = origStartOffset;
- iv_origEndOffset = origEndOffset;
- iv_newStartOffset = newStartOffset;
- iv_newEndOffset = newEndOffset;
- iv_newText = newText;
- }
-
- public int getNewEndOffset()
- {
- return iv_newEndOffset;
- }
-
- public int getNewStartOffset()
- {
- return iv_newStartOffset;
- }
-
- public String getNewText()
- {
- return iv_newText;
- }
-
- public int getOrigEndOffset()
- {
- return iv_origEndOffset;
- }
-
- public int getOrigStartOffset()
- {
- return iv_origStartOffset;
- }
-}
\ No newline at end of file
+package edu.mayo.bmi.uima.core.ci;
+
+/**
+ * Value object class that describes a modification of document text. This
+ * object tracks the original text and the new replacement text.
+ */
+public class TextModification
+{
+ private int iv_origStartOffset;
+ private int iv_origEndOffset;
+
+ private int iv_newStartOffset;
+ private int iv_newEndOffset;
+ private String iv_newText;
+
+ /**
+ * Constructor
+ *
+ * @param origStartOffset
+ * @param origEndOffset
+ * @param newStartOffset
+ * @param newEndOffset
+ * @param newText
+ */
+ public TextModification(int origStartOffset, int origEndOffset,
+ int newStartOffset, int newEndOffset, String newText)
+ {
+ iv_origStartOffset = origStartOffset;
+ iv_origEndOffset = origEndOffset;
+ iv_newStartOffset = newStartOffset;
+ iv_newEndOffset = newEndOffset;
+ iv_newText = newText;
+ }
+
+ public int getNewEndOffset()
+ {
+ return iv_newEndOffset;
+ }
+
+ public int getNewStartOffset()
+ {
+ return iv_newStartOffset;
+ }
+
+ public String getNewText()
+ {
+ return iv_newText;
+ }
+
+ public int getOrigEndOffset()
+ {
+ return iv_origEndOffset;
+ }
+
+ public int getOrigStartOffset()
+ {
+ return iv_origStartOffset;
+ }
+}
Modified: incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ci/TextModifier.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ci/TextModifier.java?rev=1403989&r1=1403988&r2=1403989&view=diff
==============================================================================
--- incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ci/TextModifier.java (original)
+++ incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ci/TextModifier.java Wed Oct 31 05:26:43 2012
@@ -1,18 +1,11 @@
/*
- * Copyright: (c) 2009 Mayo Foundation for Medical Education and
- * Research (MFMER). All rights reserved. MAYO, MAYO CLINIC, and the
- * triple-shield Mayo logo are trademarks and service marks of MFMER.
- *
- * Except as contained in the copyright notice above, or as used to identify
- * MFMER as the author of this software, the trade names, trademarks, service
- * marks, or product names of the copyright holder shall not be used in
- * advertising, promotion or otherwise in connection with this software without
- * prior written authorization of the copyright holder.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
@@ -21,21 +14,21 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package edu.mayo.bmi.uima.core.ci;
-
-/**
- * Defines a generic interface for modifying text.
- */
-public interface TextModifier
-{
- /**
- * Generates modifications for the specified text.
- *
- * @param text
- * Original document text.
- * @return Array of TextModification objects that describe the
- * modifications. Offset values are relative to the String object.
- * @throws Exception
- */
- public TextModification[] modify(String text) throws Exception;
-}
\ No newline at end of file
+package edu.mayo.bmi.uima.core.ci;
+
+/**
+ * Defines a generic interface for modifying text.
+ */
+public interface TextModifier
+{
+ /**
+ * Generates modifications for the specified text.
+ *
+ * @param text
+ * Original document text.
+ * @return Array of TextModification objects that describe the
+ * modifications. Offset values are relative to the String object.
+ * @throws Exception
+ */
+ public TextModification[] modify(String text) throws Exception;
+}
Modified: incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/cr/FilesInDirectoryCollectionCyclicalReads.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/cr/FilesInDirectoryCollectionCyclicalReads.java?rev=1403989&r1=1403988&r2=1403989&view=diff
==============================================================================
--- incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/cr/FilesInDirectoryCollectionCyclicalReads.java (original)
+++ incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/cr/FilesInDirectoryCollectionCyclicalReads.java Wed Oct 31 05:26:43 2012
@@ -1,18 +1,11 @@
/*
- * Copyright: (c) 2009 Mayo Foundation for Medical Education and
- * Research (MFMER). All rights reserved. MAYO, MAYO CLINIC, and the
- * triple-shield Mayo logo are trademarks and service marks of MFMER.
- *
- * Except as contained in the copyright notice above, or as used to identify
- * MFMER as the author of this software, the trade names, trademarks, service
- * marks, or product names of the copyright holder shall not be used in
- * advertising, promotion or otherwise in connection with this software without
- * prior written authorization of the copyright holder.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
@@ -21,183 +14,183 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package edu.mayo.bmi.uima.core.cr;
-
-/**
- * @author Mayo Clinic
- * @version 1.0
- * The original code was copied from org.apache.uima.examples.cpe.FileSystemCollectionReader
- * and modified for Mayo use. This inherits from FilesInDirectoryCollectionReader and adds
- * the capability to specify the number of documents to process.
- *
- * A simple collection reader that reads documents from a directory
- * in the filesystem. It can be configured with the following parameters:
- * <ul>
- * <li><code>InputDirectory</code> - path to directory containing files</li>
- * <li><code>Encoding</code> (optional) - character encoding of the input
- * files</li>
- * <li><code>Language</code> (optional) - language of the input documents</li>
- * <li><code>Extensions</code> (optional) - Name of optional configuration
- * parameter that specifies the extensions of the files that the
- * collection reader will read. </li>
- * <li><code>NumberOfIterations</code> (optional) - actual number of files to be processed</li>
- * </ul>
- *
- * TODO We may need to provide a way to specify some portion of the path of the file
- * to be included in the id of the document especially if we extend to recursively
- * gather files in the directory from sub directories.
- */
-
-import java.io.IOException;
-
-
-import org.apache.uima.cas.CAS;
-import org.apache.uima.collection.CollectionException;
-import org.apache.uima.resource.ResourceInitializationException;
-import org.apache.uima.util.Progress;
-import org.apache.uima.util.ProgressImpl;
-
-
-public class FilesInDirectoryCollectionCyclicalReads extends FilesInDirectoryCollectionReader
-{
- /**
- * Name of configuration parameter that must be set to the path of
- * a directory containing input files.
- */
- public static final String PARAM_INPUTDIR = "InputDirectory";
-
- /**
- * Name of configuration parameter that contains the character encoding used
- * by the input files. If not specified, the default system encoding will
- * be used.
- */
- public static final String PARAM_ENCODING = "Encoding";
-
- /**
- * Name of optional configuration parameter that contains the language of
- * the documents in the input directory. If specified this information will
- * be added to the CAS.
- */
- public static final String PARAM_LANGUAGE = "Language";
-
- /**Name of optional configuration parameter that specifies the extensions
- * of the files that the collection reader will read. Values for this
- * parameter should not begin with a dot <code>'.'</code>.
- */
-
- public static final String PARAM_EXTENSIONS = "Extensions";
-
- /**Arguement to equate to # of times it should read the files.
- * Takes this argument to equate to # of times it should read the files.
- */
-
- public static final String PARAM_NUMREADS = "NumberOfIterations";
-
- public static final String PARAM_RECURSE = "Recurse";
- private int iv_iteration;
- private int scaleTime, totalNumFiles, remainTimes;
-
- /**
- * @see org.apache.uima.collection.CollectionReader_ImplBase#initialize()
- */
- public void initialize() throws ResourceInitializationException {
-
- super.initialize();
- totalNumFiles = iv_files.size();
- iv_iteration = ((Integer) getConfigParameterValue(PARAM_NUMREADS))
- .intValue();
- if (iv_iteration > totalNumFiles) {
- scaleTime = iv_iteration / totalNumFiles;
- remainTimes = iv_iteration % totalNumFiles;
- } else
- scaleTime = -1;
-
- }
-
- /**
- * Similar to 'org.apache.uima.collection.CollectionReader' method hasNext() except
- * interations represents the actual number of documents to be processed, so if the
- * total number of documents in a queue is more than the 'Iterations' value then only
- * the iteration amount will be processed. Multiples of the total available documents
- * will be provided to supplement the list required to meet the total iteration value.
- */
- public boolean hasNext()
- {
-
- // If hasNext false then start over only if count that has been passed to the contructor hasn't been reached.
- boolean doNext = iv_currentIndex < totalNumFiles;
-
- if ((!doNext) && (scaleTime > 0)) {
- scaleTime--;
- if (scaleTime > 0) {
- iv_currentIndex = 0;
- doNext = true;
- }
- else if (remainTimes > 0){
- iv_currentIndex = 0;
- totalNumFiles = remainTimes;
- remainTimes=0;
- doNext = true;
- }
-
- }
- if (scaleTime == -1) {
- if (iv_currentIndex < iv_iteration)
- doNext = true;
- else
- doNext = false;
-
- }
-
- return doNext;
- }
-
- /**
- * @see org.apache.uima.collection.CollectionReader#getNext(org.apache.uima.cas.CAS)
- */
- public void getNext(CAS aCAS) throws IOException, CollectionException
- {
-
- super.getNext(aCAS);
-
-
- }
-
-
- /**
- * @see org.apache.uima.collection.base_cpm.BaseCollectionReader#close()
- */
- public void close() throws IOException
- {
- super.close();
- }
-
- /**
- * @see org.apache.uima.collection.base_cpm.BaseCollectionReader#getProgress()
- */
- public Progress[] getProgress() {
- int offSet = iv_currentIndex;
- if ((scaleTime > 0) && (iv_currentIndex > 0))
- offSet = iv_currentIndex*(1/scaleTime);
- if (scaleTime == 0){
- offSet = iv_iteration + remainTimes;
- }
-
- return new Progress[] { new ProgressImpl( offSet ,
- iv_iteration, Progress.ENTITIES) };
- }
-
- /**
- * Gets the total number of documents that will be returned by this
- * collection reader. This is not part of the general collection reader
- * interface.
- *
- * @return the number of documents in the collection
- */
- public int getNumberOfDocuments()
- {
- return iv_files.size();
- }
-
-
-}
+package edu.mayo.bmi.uima.core.cr;
+
+/**
+ * @author Mayo Clinic
+ * @version 1.0
+ * The original code was copied from org.apache.uima.examples.cpe.FileSystemCollectionReader
+ * and modified for Mayo use. This inherits from FilesInDirectoryCollectionReader and adds
+ * the capability to specify the number of documents to process.
+ *
+ * A simple collection reader that reads documents from a directory
+ * in the filesystem. It can be configured with the following parameters:
+ * <ul>
+ * <li><code>InputDirectory</code> - path to directory containing files</li>
+ * <li><code>Encoding</code> (optional) - character encoding of the input
+ * files</li>
+ * <li><code>Language</code> (optional) - language of the input documents</li>
+ * <li><code>Extensions</code> (optional) - Name of optional configuration
+ * parameter that specifies the extensions of the files that the
+ * collection reader will read. </li>
+ * <li><code>NumberOfIterations</code> (optional) - actual number of files to be processed</li>
+ * </ul>
+ *
+ * TODO We may need to provide a way to specify some portion of the path of the file
+ * to be included in the id of the document especially if we extend to recursively
+ * gather files in the directory from sub directories.
+ */
+
+import java.io.IOException;
+
+
+import org.apache.uima.cas.CAS;
+import org.apache.uima.collection.CollectionException;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.apache.uima.util.Progress;
+import org.apache.uima.util.ProgressImpl;
+
+
+public class FilesInDirectoryCollectionCyclicalReads extends FilesInDirectoryCollectionReader
+{
+ /**
+ * Name of configuration parameter that must be set to the path of
+ * a directory containing input files.
+ */
+ public static final String PARAM_INPUTDIR = "InputDirectory";
+
+ /**
+ * Name of configuration parameter that contains the character encoding used
+ * by the input files. If not specified, the default system encoding will
+ * be used.
+ */
+ public static final String PARAM_ENCODING = "Encoding";
+
+ /**
+ * Name of optional configuration parameter that contains the language of
+ * the documents in the input directory. If specified this information will
+ * be added to the CAS.
+ */
+ public static final String PARAM_LANGUAGE = "Language";
+
+ /**Name of optional configuration parameter that specifies the extensions
+ * of the files that the collection reader will read. Values for this
+ * parameter should not begin with a dot <code>'.'</code>.
+ */
+
+ public static final String PARAM_EXTENSIONS = "Extensions";
+
+ /**Arguement to equate to # of times it should read the files.
+ * Takes this argument to equate to # of times it should read the files.
+ */
+
+ public static final String PARAM_NUMREADS = "NumberOfIterations";
+
+ public static final String PARAM_RECURSE = "Recurse";
+ private int iv_iteration;
+ private int scaleTime, totalNumFiles, remainTimes;
+
+ /**
+ * @see org.apache.uima.collection.CollectionReader_ImplBase#initialize()
+ */
+ public void initialize() throws ResourceInitializationException {
+
+ super.initialize();
+ totalNumFiles = iv_files.size();
+ iv_iteration = ((Integer) getConfigParameterValue(PARAM_NUMREADS))
+ .intValue();
+ if (iv_iteration > totalNumFiles) {
+ scaleTime = iv_iteration / totalNumFiles;
+ remainTimes = iv_iteration % totalNumFiles;
+ } else
+ scaleTime = -1;
+
+ }
+
+ /**
+ * Similar to 'org.apache.uima.collection.CollectionReader' method hasNext() except
+ * interations represents the actual number of documents to be processed, so if the
+ * total number of documents in a queue is more than the 'Iterations' value then only
+ * the iteration amount will be processed. Multiples of the total available documents
+ * will be provided to supplement the list required to meet the total iteration value.
+ */
+ public boolean hasNext()
+ {
+
+ // If hasNext false then start over only if count that has been passed to the contructor hasn't been reached.
+ boolean doNext = iv_currentIndex < totalNumFiles;
+
+ if ((!doNext) && (scaleTime > 0)) {
+ scaleTime--;
+ if (scaleTime > 0) {
+ iv_currentIndex = 0;
+ doNext = true;
+ }
+ else if (remainTimes > 0){
+ iv_currentIndex = 0;
+ totalNumFiles = remainTimes;
+ remainTimes=0;
+ doNext = true;
+ }
+
+ }
+ if (scaleTime == -1) {
+ if (iv_currentIndex < iv_iteration)
+ doNext = true;
+ else
+ doNext = false;
+
+ }
+
+ return doNext;
+ }
+
+ /**
+ * @see org.apache.uima.collection.CollectionReader#getNext(org.apache.uima.cas.CAS)
+ */
+ public void getNext(CAS aCAS) throws IOException, CollectionException
+ {
+
+ super.getNext(aCAS);
+
+
+ }
+
+
+ /**
+ * @see org.apache.uima.collection.base_cpm.BaseCollectionReader#close()
+ */
+ public void close() throws IOException
+ {
+ super.close();
+ }
+
+ /**
+ * @see org.apache.uima.collection.base_cpm.BaseCollectionReader#getProgress()
+ */
+ public Progress[] getProgress() {
+ int offSet = iv_currentIndex;
+ if ((scaleTime > 0) && (iv_currentIndex > 0))
+ offSet = iv_currentIndex*(1/scaleTime);
+ if (scaleTime == 0){
+ offSet = iv_iteration + remainTimes;
+ }
+
+ return new Progress[] { new ProgressImpl( offSet ,
+ iv_iteration, Progress.ENTITIES) };
+ }
+
+ /**
+ * Gets the total number of documents that will be returned by this
+ * collection reader. This is not part of the general collection reader
+ * interface.
+ *
+ * @return the number of documents in the collection
+ */
+ public int getNumberOfDocuments()
+ {
+ return iv_files.size();
+ }
+
+
+}
Modified: incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/cr/FilesInDirectoryCollectionReader.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/cr/FilesInDirectoryCollectionReader.java?rev=1403989&r1=1403988&r2=1403989&view=diff
==============================================================================
--- incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/cr/FilesInDirectoryCollectionReader.java (original)
+++ incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/cr/FilesInDirectoryCollectionReader.java Wed Oct 31 05:26:43 2012
@@ -1,18 +1,11 @@
/*
- * Copyright: (c) 2009 Mayo Foundation for Medical Education and
- * Research (MFMER). All rights reserved. MAYO, MAYO CLINIC, and the
- * triple-shield Mayo logo are trademarks and service marks of MFMER.
- *
- * Except as contained in the copyright notice above, or as used to identify
- * MFMER as the author of this software, the trade names, trademarks, service
- * marks, or product names of the copyright holder shall not be used in
- * advertising, promotion or otherwise in connection with this software without
- * prior written authorization of the copyright holder.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
@@ -21,295 +14,295 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package edu.mayo.bmi.uima.core.cr;
-
-/**
- * The original code was copied from org.apache.uima.examples.cpe.FileSystemCollectionReader
- * and modified for Mayo use.
- *
- * A simple collection reader that reads documents from a directory
- * in the filesystem. It can be configured with the following parameters:
- * <ul>
- * <li><code>InputDirectory</code> - path to directory containing files</li>
- * <li><code>Encoding</code> (optional) - character encoding of the input
- * files</li>
- * <li><code>Language</code> (optional) - language of the input documents</li>
- * <li><code>Extensions</code> (optional) - Name of optional configuration
- * parameter that specifies the extensions of the files that the
- * collection reader will read.
- * </ul>
- *
- * TODO We may need to provide a way to specify some portion of the path of the file
- * to be included in the id of the document especially if we extend to recursively
- * gather files in the directory from sub directories.
- */
-
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.io.Reader;
-import java.util.ArrayList;
-import java.util.List;
-
-import org.apache.uima.cas.CAS;
-import org.apache.uima.cas.CASException;
-import org.apache.uima.collection.CollectionException;
-import org.apache.uima.collection.CollectionReader_ImplBase;
-import org.apache.uima.jcas.JCas;
-//import org.apache.uima.jcas.tcas.DocumentAnnotation;
-import org.apache.uima.resource.ResourceConfigurationException;
-import org.apache.uima.resource.ResourceInitializationException;
-import org.apache.uima.util.Progress;
-import org.apache.uima.util.ProgressImpl;
-
-import edu.mayo.bmi.uima.core.type.structured.DocumentID;
-
-public class FilesInDirectoryCollectionReader extends CollectionReader_ImplBase
-{
- /**
- * Name of configuration parameter that must be set to the path of
- * a directory containing input files.
- */
- public static final String PARAM_INPUTDIR = "InputDirectory";
-
- /**
- * Name of configuration parameter that contains the character encoding used
- * by the input files. If not specified, the default system encoding will
- * be used.
- */
- public static final String PARAM_ENCODING = "Encoding";
-
- /**
- * Name of optional configuration parameter that contains the language of
- * the documents in the input directory. If specified this information will
- * be added to the CAS.
- */
- public static final String PARAM_LANGUAGE = "Language";
-
- /**Name of optional configuration parameter that specifies the extensions
- * of the files that the collection reader will read. Values for this
- * parameter should not begin with a dot <code>'.'</code>.
- */
-
- public static final String PARAM_EXTENSIONS = "Extensions";
-
- public static final String PARAM_RECURSE = "Recurse";
-
- protected ArrayList iv_files;
- private String iv_encoding;
- private String iv_language;
- private static String[] iv_extensions;
-
- protected int iv_currentIndex;
-
- private boolean iv_recurse = false;
-
- private String iv_rootPath = "";
-
- /**
- * @see org.apache.uima.collection.CollectionReader_ImplBase#initialize()
- */
- public void initialize() throws ResourceInitializationException
- {
- File directory = new File((String)getConfigParameterValue(PARAM_INPUTDIR));
- iv_encoding = (String)getConfigParameterValue(PARAM_ENCODING);
- iv_language = (String)getConfigParameterValue(PARAM_LANGUAGE);
- iv_extensions = (String[]) getConfigParameterValue(PARAM_EXTENSIONS);
-
- iv_currentIndex = 0;
-
- iv_recurse = false;
- Boolean recurse = (Boolean) getConfigParameterValue(PARAM_RECURSE);
- if(recurse != null)
- iv_recurse = recurse.booleanValue();
- iv_rootPath = directory.getPath();
-
- //if input directory does not exist or is not a directory, throw exception
- if (!directory.exists() || !directory.isDirectory())
- {
- throw new ResourceInitializationException(
- ResourceConfigurationException.DIRECTORY_NOT_FOUND,
- new Object[]{PARAM_INPUTDIR, this.getMetaData().getName(), directory.getPath()});
- }
-
-
- //get list of files (not subdirectories) in the specified directory
- iv_files = new ArrayList();
- if(!iv_recurse)
- {
- File[] files = directory.listFiles();
- for (int i = 0; i < files.length; i++)
- {
- if (!files[i].isDirectory() && hasValidExtension(files[i]))
- {
- iv_files.add(files[i]);
- }
- }
- }
- else
- {
- try
- {
- collectFiles(directory, iv_files);
- System.out.println("iv_files.size()="+iv_files.size());
- }
- catch(IOException ioe)
- {
- throw new ResourceInitializationException(ioe);
- }
- }
- }
-
- private void collectFiles(File directory, List files) throws IOException
- {
- File[] dirFiles = directory.listFiles();
- for(int i=0; i<dirFiles.length;i++)
- {
- if(dirFiles[i].isDirectory())
- {
- collectFiles(dirFiles[i], files);
- }
- else if(hasValidExtension(dirFiles[i]))
- {
- files.add(dirFiles[i]);
- }
- }
- }
-
-
- private boolean hasValidExtension(File file)
- {
- if(iv_extensions == null) return true;
- for (int i = 0; i < iv_extensions.length; i++)
- {
- if(file.getName().endsWith("."+iv_extensions[i]))
- {
- return true;
- }
- }
- return false;
- }
-
-
- /**
- * @see org.apache.uima.collection.CollectionReader#hasNext()
- */
- public boolean hasNext()
- {
- return iv_currentIndex < iv_files.size();
- }
-
- /**
- * @see org.apache.uima.collection.CollectionReader#getNext(org.apache.uima.cas.CAS)
- */
- public void getNext(CAS aCAS) throws IOException, CollectionException
- {
- JCas jcas;
- InputStream fileInputStream = null;
- Reader fileReader = null;
-
- try
- {
- jcas = aCAS.getJCas();
-
- //open input stream to file
- File file = (File)iv_files.get(iv_currentIndex);
- fileInputStream = new FileInputStream(file);
- fileReader = new BufferedReader(new InputStreamReader(fileInputStream));
-
- DocumentID documentIDAnnotation = new DocumentID(jcas);
- String docID = createDocID(file);
- documentIDAnnotation.setDocumentID(docID);
- documentIDAnnotation.addToIndexes();
-
- //if there's a CAS Initializer, call it
- if (getCasInitializer() != null)
- {
- getCasInitializer().initializeCas(fileReader, aCAS);
- }
- else //No CAS Initializer, so read file and set document text ourselves
- {
- byte[] contents = new byte[(int)file.length() ];
- fileInputStream.read( contents );
- String text;
- if (iv_encoding != null)
- {
- text = new String(contents, iv_encoding);
- }
- else
- {
- text = new String(contents);
- }
- if(text == null)
- {
- System.out.println("text ==null!");
- System.out.println("docID = "+docID);
- }
- //put document in CAS (assume CAS)
- jcas.setDocumentText(text);
- }
-
- //set language if it was explicitly specified as a configuration parameter
- if (iv_language != null)
- {
- // ((DocumentAnnotation)jcas.getDocumentAnnotationFs()).setLanguage(iv_language);
- }
-
- }
- catch (CASException e)
- {
- throw new CollectionException(e);
- }
- finally
- {
- if (fileInputStream != null)
- fileInputStream.close();
- iv_currentIndex++;
- }
- }
-
- private String createDocID(File file)
- {
- String docID = file.getPath();
- if(iv_rootPath.endsWith(""+File.separator) ||
- iv_rootPath.equals(""))
- {
- docID = docID.substring(iv_rootPath.length());
- }
- else
- docID = docID.substring(iv_rootPath.length()+1);
- return docID;
- }
- /**
- * @see org.apache.uima.collection.base_cpm.BaseCollectionReader#close()
- */
- public void close() throws IOException
- {
- }
-
- /**
- * @see org.apache.uima.collection.base_cpm.BaseCollectionReader#getProgress()
- */
- public Progress[] getProgress()
- {
- return new Progress[]{
- new ProgressImpl(iv_currentIndex, iv_files.size(),Progress.ENTITIES)};
- }
-
- /**
- * Gets the total number of documents that will be returned by this
- * collection reader. This is not part of the general collection reader
- * interface.
- *
- * @return the number of documents in the collection
- */
- public int getNumberOfDocuments()
- {
- return iv_files.size();
- }
-
-
-}
+package edu.mayo.bmi.uima.core.cr;
+
+/**
+ * The original code was copied from org.apache.uima.examples.cpe.FileSystemCollectionReader
+ * and modified for Mayo use.
+ *
+ * A simple collection reader that reads documents from a directory
+ * in the filesystem. It can be configured with the following parameters:
+ * <ul>
+ * <li><code>InputDirectory</code> - path to directory containing files</li>
+ * <li><code>Encoding</code> (optional) - character encoding of the input
+ * files</li>
+ * <li><code>Language</code> (optional) - language of the input documents</li>
+ * <li><code>Extensions</code> (optional) - Name of optional configuration
+ * parameter that specifies the extensions of the files that the
+ * collection reader will read.
+ * </ul>
+ *
+ * TODO We may need to provide a way to specify some portion of the path of the file
+ * to be included in the id of the document especially if we extend to recursively
+ * gather files in the directory from sub directories.
+ */
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.CASException;
+import org.apache.uima.collection.CollectionException;
+import org.apache.uima.collection.CollectionReader_ImplBase;
+import org.apache.uima.jcas.JCas;
+//import org.apache.uima.jcas.tcas.DocumentAnnotation;
+import org.apache.uima.resource.ResourceConfigurationException;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.apache.uima.util.Progress;
+import org.apache.uima.util.ProgressImpl;
+
+import edu.mayo.bmi.uima.core.type.structured.DocumentID;
+
+public class FilesInDirectoryCollectionReader extends CollectionReader_ImplBase
+{
+ /**
+ * Name of configuration parameter that must be set to the path of
+ * a directory containing input files.
+ */
+ public static final String PARAM_INPUTDIR = "InputDirectory";
+
+ /**
+ * Name of configuration parameter that contains the character encoding used
+ * by the input files. If not specified, the default system encoding will
+ * be used.
+ */
+ public static final String PARAM_ENCODING = "Encoding";
+
+ /**
+ * Name of optional configuration parameter that contains the language of
+ * the documents in the input directory. If specified this information will
+ * be added to the CAS.
+ */
+ public static final String PARAM_LANGUAGE = "Language";
+
+ /**Name of optional configuration parameter that specifies the extensions
+ * of the files that the collection reader will read. Values for this
+ * parameter should not begin with a dot <code>'.'</code>.
+ */
+
+ public static final String PARAM_EXTENSIONS = "Extensions";
+
+ public static final String PARAM_RECURSE = "Recurse";
+
+ protected ArrayList iv_files;
+ private String iv_encoding;
+ private String iv_language;
+ private static String[] iv_extensions;
+
+ protected int iv_currentIndex;
+
+ private boolean iv_recurse = false;
+
+ private String iv_rootPath = "";
+
+ /**
+ * @see org.apache.uima.collection.CollectionReader_ImplBase#initialize()
+ */
+ public void initialize() throws ResourceInitializationException
+ {
+ File directory = new File((String)getConfigParameterValue(PARAM_INPUTDIR));
+ iv_encoding = (String)getConfigParameterValue(PARAM_ENCODING);
+ iv_language = (String)getConfigParameterValue(PARAM_LANGUAGE);
+ iv_extensions = (String[]) getConfigParameterValue(PARAM_EXTENSIONS);
+
+ iv_currentIndex = 0;
+
+ iv_recurse = false;
+ Boolean recurse = (Boolean) getConfigParameterValue(PARAM_RECURSE);
+ if(recurse != null)
+ iv_recurse = recurse.booleanValue();
+ iv_rootPath = directory.getPath();
+
+ //if input directory does not exist or is not a directory, throw exception
+ if (!directory.exists() || !directory.isDirectory())
+ {
+ throw new ResourceInitializationException(
+ ResourceConfigurationException.DIRECTORY_NOT_FOUND,
+ new Object[]{PARAM_INPUTDIR, this.getMetaData().getName(), directory.getPath()});
+ }
+
+
+ //get list of files (not subdirectories) in the specified directory
+ iv_files = new ArrayList();
+ if(!iv_recurse)
+ {
+ File[] files = directory.listFiles();
+ for (int i = 0; i < files.length; i++)
+ {
+ if (!files[i].isDirectory() && hasValidExtension(files[i]))
+ {
+ iv_files.add(files[i]);
+ }
+ }
+ }
+ else
+ {
+ try
+ {
+ collectFiles(directory, iv_files);
+ System.out.println("iv_files.size()="+iv_files.size());
+ }
+ catch(IOException ioe)
+ {
+ throw new ResourceInitializationException(ioe);
+ }
+ }
+ }
+
+ private void collectFiles(File directory, List files) throws IOException
+ {
+ File[] dirFiles = directory.listFiles();
+ for(int i=0; i<dirFiles.length;i++)
+ {
+ if(dirFiles[i].isDirectory())
+ {
+ collectFiles(dirFiles[i], files);
+ }
+ else if(hasValidExtension(dirFiles[i]))
+ {
+ files.add(dirFiles[i]);
+ }
+ }
+ }
+
+
+ private boolean hasValidExtension(File file)
+ {
+ if(iv_extensions == null) return true;
+ for (int i = 0; i < iv_extensions.length; i++)
+ {
+ if(file.getName().endsWith("."+iv_extensions[i]))
+ {
+ return true;
+ }
+ }
+ return false;
+ }
+
+
+ /**
+ * @see org.apache.uima.collection.CollectionReader#hasNext()
+ */
+ public boolean hasNext()
+ {
+ return iv_currentIndex < iv_files.size();
+ }
+
+ /**
+ * @see org.apache.uima.collection.CollectionReader#getNext(org.apache.uima.cas.CAS)
+ */
+ public void getNext(CAS aCAS) throws IOException, CollectionException
+ {
+ JCas jcas;
+ InputStream fileInputStream = null;
+ Reader fileReader = null;
+
+ try
+ {
+ jcas = aCAS.getJCas();
+
+ //open input stream to file
+ File file = (File)iv_files.get(iv_currentIndex);
+ fileInputStream = new FileInputStream(file);
+ fileReader = new BufferedReader(new InputStreamReader(fileInputStream));
+
+ DocumentID documentIDAnnotation = new DocumentID(jcas);
+ String docID = createDocID(file);
+ documentIDAnnotation.setDocumentID(docID);
+ documentIDAnnotation.addToIndexes();
+
+ //if there's a CAS Initializer, call it
+ if (getCasInitializer() != null)
+ {
+ getCasInitializer().initializeCas(fileReader, aCAS);
+ }
+ else //No CAS Initializer, so read file and set document text ourselves
+ {
+ byte[] contents = new byte[(int)file.length() ];
+ fileInputStream.read( contents );
+ String text;
+ if (iv_encoding != null)
+ {
+ text = new String(contents, iv_encoding);
+ }
+ else
+ {
+ text = new String(contents);
+ }
+ if(text == null)
+ {
+ System.out.println("text ==null!");
+ System.out.println("docID = "+docID);
+ }
+ //put document in CAS (assume CAS)
+ jcas.setDocumentText(text);
+ }
+
+ //set language if it was explicitly specified as a configuration parameter
+ if (iv_language != null)
+ {
+ // ((DocumentAnnotation)jcas.getDocumentAnnotationFs()).setLanguage(iv_language);
+ }
+
+ }
+ catch (CASException e)
+ {
+ throw new CollectionException(e);
+ }
+ finally
+ {
+ if (fileInputStream != null)
+ fileInputStream.close();
+ iv_currentIndex++;
+ }
+ }
+
+ private String createDocID(File file)
+ {
+ String docID = file.getPath();
+ if(iv_rootPath.endsWith(""+File.separator) ||
+ iv_rootPath.equals(""))
+ {
+ docID = docID.substring(iv_rootPath.length());
+ }
+ else
+ docID = docID.substring(iv_rootPath.length()+1);
+ return docID;
+ }
+ /**
+ * @see org.apache.uima.collection.base_cpm.BaseCollectionReader#close()
+ */
+ public void close() throws IOException
+ {
+ }
+
+ /**
+ * @see org.apache.uima.collection.base_cpm.BaseCollectionReader#getProgress()
+ */
+ public Progress[] getProgress()
+ {
+ return new Progress[]{
+ new ProgressImpl(iv_currentIndex, iv_files.size(),Progress.ENTITIES)};
+ }
+
+ /**
+ * Gets the total number of documents that will be returned by this
+ * collection reader. This is not part of the general collection reader
+ * interface.
+ *
+ * @return the number of documents in the collection
+ */
+ public int getNumberOfDocuments()
+ {
+ return iv_files.size();
+ }
+
+
+}