You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by ja...@apache.org on 2012/10/31 06:26:55 UTC

svn commit: r1403989 [5/28] - in /incubator/ctakes/branches/SHARPn-cTAKES: Constituency Parser/src/org/chboston/cnlp/ctakes/parser/ Constituency Parser/src/org/chboston/cnlp/ctakes/parser/uima/ae/ Constituency Parser/src/org/chboston/cnlp/ctakes/parser...

Modified: incubator/ctakes/branches/SHARPn-cTAKES/PAD term spotter/utils/src/edu/mayo/bmi/utils/xcas_comparison/XcasFile.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/branches/SHARPn-cTAKES/PAD%20term%20spotter/utils/src/edu/mayo/bmi/utils/xcas_comparison/XcasFile.java?rev=1403989&r1=1403988&r2=1403989&view=diff
==============================================================================
--- incubator/ctakes/branches/SHARPn-cTAKES/PAD term spotter/utils/src/edu/mayo/bmi/utils/xcas_comparison/XcasFile.java (original)
+++ incubator/ctakes/branches/SHARPn-cTAKES/PAD term spotter/utils/src/edu/mayo/bmi/utils/xcas_comparison/XcasFile.java Wed Oct 31 05:26:43 2012
@@ -1,18 +1,11 @@
 /*
- * Copyright: (c) 2009   Mayo Foundation for Medical Education and 
- * Research (MFMER). All rights reserved. MAYO, MAYO CLINIC, and the
- * triple-shield Mayo logo are trademarks and service marks of MFMER.
- *
- * Except as contained in the copyright notice above, or as used to identify 
- * MFMER as the author of this software, the trade names, trademarks, service
- * marks, or product names of the copyright holder shall not be used in
- * advertising, promotion or otherwise in connection with this software without
- * prior written authorization of the copyright holder.
- * 
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
  * http://www.apache.org/licenses/LICENSE-2.0 
  * 
  * Unless required by applicable law or agreed to in writing, software
@@ -21,227 +14,227 @@
  * See the License for the specific language governing permissions and 
  * limitations under the License. 
  */
-package edu.mayo.bmi.utils.xcas_comparison;
-import java.util.Hashtable;
-import java.util.LinkedList;
-import java.util.Vector;
-import java.io.File;
-
-/**
- * An <code>XcasFile</code> wraps all <code>XcasAnnotations</code>s
- * in it and also contains their position information in terms of
- * line and column numbers.
- * @author Mayo Clinic
- *
- */
-public class XcasFile implements Cloneable {
-
-	protected File f;
-	protected Hashtable<Integer, XcasAnnotation> annotations;
-	protected Hashtable<XcasAnnotation, String> positions;
-
-	/**
-	 * Default constructor.
-	 */
-	public XcasFile () {
-		annotations = new Hashtable<Integer, XcasAnnotation>();
-		positions = new Hashtable<XcasAnnotation, String>();
-	}
-
-	/**
-	 * Constructs an <code>XcasFile</code> with the specified name.
-	 * @param f File name.
-	 */
-	public XcasFile (String f) { this(); this.f = new File(f); }
-
-	/**
-	 * Constructs an <code>XcasFile</code> with the specified name.
-	 * @param f A File object.
-	 */
-	public XcasFile (File f) { this(); this.f = f; }
-
-	/**
-	 * Creates a new <code>XcasFile</code> object from the specified file.
-	 * Avoid using this method if you plan to parse multiple files,
-	 * as this method creates an anonymous <code>XcasProcessor</code> instance
-	 * each time called, which could be used to parse multiple files.
-	 * @param f A string containing the XCAS file name.
-	 * @return A parsed <code>XcasFile</code> object.
-	 */
-	public static XcasFile process (String f) {
-		return (new XcasProcessor()).process(f);
-	}
-
-	/**
-	 * Inserts a new annotation with the specified internal <code>id</code>
-	 * to this <code>XcasFile</code>.
-	 * @param id UIMA CAS internal <code>_id</code>.
-	 * @param a An <code>XcasAnnotation</code> object to add.
-	 * @see #addAnnotation(int, XcasAnnotation, String)
-	 * @see #addAnnotation(int, XcasAnnotation, int, int)
-	 */
-	public void addAnnotation (int id, XcasAnnotation a) { annotations.put(id, a); }
-
-	/**
-	 * Inserts a new annotation, along with its position in the file,
-	 * to this <code>XcasFile</code> object.
-	 * @param id UIMA CAS internal <code>_id</code>.
-	 * @param a An <code>XcasAnnotation</code> object to add.
-	 * @param pos Line and column number of the specified annotation,
-	 *        in the form of <code>line_number:column_number</code>.
-	 */
-	public void addAnnotation (int id, XcasAnnotation a, String pos) { addAnnotation(id, a); positions.put(a, pos); }
-
-	/**
-	 * Inserts a new annotation, along with its position in the file,
-	 * to this <code>XcasFile</code> object.
-	 * @param id UIMA CAS internal <code>_id</code>.
-	 * @param a An <code>XcasAnnotation</code> object to add.
-	 * @param lineNum Line number of the specified annotation.
-	 * @param colNum Column number of the specified annotation.
-	 */
-	public void addAnnotation (int id, XcasAnnotation a, int lineNum, int colNum) { addAnnotation(id, a, Integer.toString(lineNum)+":"+Integer.toString(colNum)); }
-
-	/**
-	 * Returns the <code>XcasAnnotation</code> object associated with
-	 * the specified internal id.
-	 * @param id UIMA CAS internal <code>_id</code>.
-	 * @return The <code>XcasAnnotation</code> with the specified id.
-	 */
-	public XcasAnnotation getAnnotation (int id) { return annotations.get(id); }
-
-	public java.util.Collection<XcasAnnotation> getAllAnnotations () { return annotations.values(); }
-	public String getFileName () { return f.getName(); }
-
-	/**
-	 * Returns the line and column numbers of the specified <code>XcasAnnotation</code>,
-	 * which is included in this <code>XcasFile</code> object.
-	 * @param a
-	 * @return A string containing the line and column numbers of the specified object,
-	 *         in the form of <code>line_number:column_number</code>.
-	 * @see #getPositionOwn(int)
-	 * @see #getPositionOther(XcasAnnotation)
-	 */
-	public String getPositionOwn (XcasAnnotation a) { return positions.get(a); }
-
-	/**
-	 * Returns the line and column numbers of the <code>XcasAnnotation</code>,
-	 * specified by the original XCAS internal <code>_id</code> field.
-	 * @param id UIMA CAS internal <code>_id</code>.
-	 * @return A string containing the line and column numbers of the specified object,
-	 *         in the form of <code>line_number:column_number</code>.
-	 * @see #getPositionOwn(XcasAnnotation)
-	 * @see #getPositionOther(XcasAnnotation)
-	 */
-	public String getPositionOwn (int id) { return positions.get(annotations.get(id)); }
-
-	/**
-	 * Finds an <code>XcasAnnotation</code> with the same attributes as specified,
-	 * and returns its line and column numbers. 
-	 * @param a
-	 * @return A string containing the line and column numbers of the specified object,
-	 *         in the form of <code>line_number:column_number</code>.
-	 * @see #getPositionOwn(int)
-	 * @see #getPositionOwn(XcasAnnotation)
-	 */
-	public String getPositionOther (XcasAnnotation a) {
-		for (XcasAnnotation o : positions.keySet())
-			if (o.equals(a)) return positions.get(o);
-		return null;
-	}
-
-	/**
-	 * Returns the line and column numbers of the specified <code>XcasAnnotation</code>.
-	 * <p>
-	 * Do not use this method if you know the specified <code>XcasAnnotation</code>
-	 * object is in this <code>XcasFile</code>. Instead, use
-	 * {@link #getPositionOwn(XcasAnnotation)}, which is faster.
-	 * @param a An 
-	 * @return A string containing the line and column numbers of the specified object,
-	 *         in the form of <code>line_number:column_number</code>.
-	 * @see #getPositionOwn(XcasAnnotation)
-	 * @see #getPositionOther(XcasAnnotation)
-	 */
-	public String getPosition (XcasAnnotation a) {
-		if (positions.keySet().contains(a)) return positions.get(a);
-		else return getPositionOther(a);
-	}
-
-	/**
-	 * Finds an <code>XcasAnnotation</code> of the same type as specified, and
-	 * a same text span, then returns its line and column number.
-	 * @param a An <code>XcasAnnotation</code> against which a similar
-	 *          <code>XcasAnnotation</code> in this <code>XcasFile</code>
-	 *          is to be matched.
-	 * @return A string containing the line and column numbers of the specified object,
-	 *         in the form of <code>line_number:column_number</code>.
-	 * @see #getPositionOther(XcasAnnotation)
-	 */
-	public String getPositionSimilar (XcasAnnotation a) {
-		for (XcasAnnotation o : positions.keySet())
-			if (o.type.equals(a.type)) {
-				int oBegin = o.attributes.containsKey("begin") ? Integer.parseInt(o.getAttribute("begin")) : -1;
-				int oEnd = o.attributes.containsKey("end") ? Integer.parseInt(o.getAttribute("end")) : -1;
-				int aBegin = a.attributes.containsKey("begin") ? Integer.parseInt(a.getAttribute("begin")) : -2;
-				int aEnd = a.attributes.containsKey("end") ? Integer.parseInt(a.getAttribute("end")) : -2;
-				if (oBegin==aBegin && oEnd==aEnd) return positions.get(o);
-				else if (o.attributes.containsKey("key") && a.attributes.containsKey("key") && o.getAttribute("key").equals(a.getAttribute("key")))
-					return positions.get(o);
-			}
-		return null;
-	}
-
-	/**
-	 * Checks whether this XCAS file has an annotation with the specified id.
-	 * @param id UIMA CAS internal <code>_id</code>.
-	 * @return <code>true</code> if file has an annotation with the specified id,
-	 *         <code>false</code> otherwise.
-	 */
-	public boolean hasAnnotation (int id) { return annotations.containsKey(id); }
-
-	/**
-	 * Checks whether this XCAS file has the specified <code>XcasAnnotation</code>.
-	 * If there is an <code>XcasAnnotation</code> object that has exactly the same
-	 * type, attributes, and references, return <code>true</code>.
-	 * @param a An <code>XcasAnnotation</code> to check.
-	 * @return <code>true</code> if there is one <code>XcasAnnotation</code> equals
-	 *         the specified one, <code>false</code> otherwise.
-	 * @see XcasAnnotation#equals(Object)
-	 */
-	public boolean hasAnnotation (XcasAnnotation a) { return annotations.containsValue(a); }
-
-	/**
-	 * Checks whether the specified object has the same set of annotations. First check
-	 * whether the specified is an <code>XcasFile</code> object. If so, check whether
-	 * its annotation set is of the same size as in this <code>XcasFile</code>, then check
-	 * whether these two sets are equal. 
-	 * @param obj An object to compare to.
-	 * @return <code>true if the specified object is an <code>XcasFile</code> object and
-	 *         has a same set of <code>XcasAnnotations</code>, <code>false</code> otherwise.
-	 */
-	public boolean equals (Object obj) {
-		if (obj.getClass()!=getClass() || annotations.values().size()!=((XcasFile)obj).annotations.values().size()) return false;
-		return annotations.values().containsAll(((XcasFile)obj).annotations.values());
-	}
-
-	public LinkedList<XcasAnnotation> annotationsClone () {
-		LinkedList<XcasAnnotation> ret = new LinkedList<XcasAnnotation>();
-		Hashtable<XcasAnnotation, XcasAnnotation> cloneMap = new Hashtable<XcasAnnotation, XcasAnnotation>();
-		for (XcasAnnotation a : annotations.values()) {
-			XcasAnnotation c = a.shallowCopy();
-			cloneMap.put(a, c);
-			ret.add(c);
-		}
-		for (XcasAnnotation a : annotations.values())
-			for (String s : a.references.keySet())
-				for (XcasAnnotation r : (Vector<XcasAnnotation>)a.references.get(s))
-					((Vector<XcasAnnotation>)cloneMap.get(a).references.get(s)).add(cloneMap.get(r));
-		return ret;
-	}
-
-	public Object clone () {
-		return null; //TODO implement clone?
-		// Should not use XcasAnnotation.clone()
-		// otherwise, XcasAnnotation objects referenced by multiple objects will be cloned more than once.
-	}
-}
+package edu.mayo.bmi.utils.xcas_comparison;
+import java.util.Hashtable;
+import java.util.LinkedList;
+import java.util.Vector;
+import java.io.File;
+
+/**
+ * An <code>XcasFile</code> wraps all <code>XcasAnnotation</code>s
+ * in it and also contains their position information in terms of
+ * line and column numbers.
+ * @author Mayo Clinic
+ *
+ */
+public class XcasFile implements Cloneable {
+
+	protected File f;
+	protected Hashtable<Integer, XcasAnnotation> annotations;
+	protected Hashtable<XcasAnnotation, String> positions;
+
+	/**
+	 * Default constructor.
+	 */
+	public XcasFile () {
+		annotations = new Hashtable<Integer, XcasAnnotation>();
+		positions = new Hashtable<XcasAnnotation, String>();
+	}
+
+	/**
+	 * Constructs an <code>XcasFile</code> with the specified name.
+	 * @param f File name.
+	 */
+	public XcasFile (String f) { this(); this.f = new File(f); }
+
+	/**
+	 * Constructs an <code>XcasFile</code> with the specified name.
+	 * @param f A File object.
+	 */
+	public XcasFile (File f) { this(); this.f = f; }
+
+	/**
+	 * Creates a new <code>XcasFile</code> object from the specified file.
+	 * Avoid using this method if you plan to parse multiple files:
+	 * it creates a new <code>XcasProcessor</code> instance on every call,
+	 * whereas a single instance can be reused to parse multiple files.
+	 * @param f A string containing the XCAS file name.
+	 * @return A parsed <code>XcasFile</code> object.
+	 */
+	public static XcasFile process (String f) {
+		return (new XcasProcessor()).process(f);
+	}
+
+	/**
+	 * Inserts a new annotation with the specified internal <code>id</code>
+	 * to this <code>XcasFile</code>.
+	 * @param id UIMA CAS internal <code>_id</code>.
+	 * @param a An <code>XcasAnnotation</code> object to add.
+	 * @see #addAnnotation(int, XcasAnnotation, String)
+	 * @see #addAnnotation(int, XcasAnnotation, int, int)
+	 */
+	public void addAnnotation (int id, XcasAnnotation a) { annotations.put(id, a); }
+
+	/**
+	 * Inserts a new annotation, along with its position in the file,
+	 * to this <code>XcasFile</code> object.
+	 * @param id UIMA CAS internal <code>_id</code>.
+	 * @param a An <code>XcasAnnotation</code> object to add.
+	 * @param pos Line and column number of the specified annotation,
+	 *        in the form of <code>line_number:column_number</code>.
+	 */
+	public void addAnnotation (int id, XcasAnnotation a, String pos) { addAnnotation(id, a); positions.put(a, pos); }
+
+	/**
+	 * Inserts a new annotation, along with its position in the file,
+	 * to this <code>XcasFile</code> object.
+	 * @param id UIMA CAS internal <code>_id</code>.
+	 * @param a An <code>XcasAnnotation</code> object to add.
+	 * @param lineNum Line number of the specified annotation.
+	 * @param colNum Column number of the specified annotation.
+	 */
+	public void addAnnotation (int id, XcasAnnotation a, int lineNum, int colNum) { addAnnotation(id, a, Integer.toString(lineNum)+":"+Integer.toString(colNum)); }
+
+	/**
+	 * Returns the <code>XcasAnnotation</code> object associated with
+	 * the specified internal id.
+	 * @param id UIMA CAS internal <code>_id</code>.
+	 * @return The <code>XcasAnnotation</code> with the specified id.
+	 */
+	public XcasAnnotation getAnnotation (int id) { return annotations.get(id); }
+
+	public java.util.Collection<XcasAnnotation> getAllAnnotations () { return annotations.values(); }
+	public String getFileName () { return f.getName(); }
+
+	/**
+	 * Returns the line and column numbers of the specified <code>XcasAnnotation</code>,
+	 * which is included in this <code>XcasFile</code> object.
+	 * @param a An <code>XcasAnnotation</code> contained in this <code>XcasFile</code>.
+	 * @return A string containing the line and column numbers of the specified object,
+	 *         in the form of <code>line_number:column_number</code>.
+	 * @see #getPositionOwn(int)
+	 * @see #getPositionOther(XcasAnnotation)
+	 */
+	public String getPositionOwn (XcasAnnotation a) { return positions.get(a); }
+
+	/**
+	 * Returns the line and column numbers of the <code>XcasAnnotation</code>,
+	 * specified by the original XCAS internal <code>_id</code> field.
+	 * @param id UIMA CAS internal <code>_id</code>.
+	 * @return A string containing the line and column numbers of the specified object,
+	 *         in the form of <code>line_number:column_number</code>.
+	 * @see #getPositionOwn(XcasAnnotation)
+	 * @see #getPositionOther(XcasAnnotation)
+	 */
+	public String getPositionOwn (int id) { return positions.get(annotations.get(id)); }
+
+	/**
+	 * Finds an <code>XcasAnnotation</code> with the same attributes as specified,
+	 * and returns its line and column numbers. 
+	 * @param a An <code>XcasAnnotation</code> whose equal counterpart is to be located.
+	 * @return A string containing the line and column numbers of the specified object,
+	 *         in the form of <code>line_number:column_number</code>.
+	 * @see #getPositionOwn(int)
+	 * @see #getPositionOwn(XcasAnnotation)
+	 */
+	public String getPositionOther (XcasAnnotation a) {
+		for (XcasAnnotation o : positions.keySet())
+			if (o.equals(a)) return positions.get(o);
+		return null;
+	}
+
+	/**
+	 * Returns the line and column numbers of the specified <code>XcasAnnotation</code>.
+	 * <p>
+	 * Do not use this method if you know the specified <code>XcasAnnotation</code>
+	 * object is in this <code>XcasFile</code>. Instead, use
+	 * {@link #getPositionOwn(XcasAnnotation)}, which is faster.
+	 * @param a An <code>XcasAnnotation</code> whose position is to be looked up.
+	 * @return A string containing the line and column numbers of the specified object,
+	 *         in the form of <code>line_number:column_number</code>.
+	 * @see #getPositionOwn(XcasAnnotation)
+	 * @see #getPositionOther(XcasAnnotation)
+	 */
+	public String getPosition (XcasAnnotation a) {
+		if (positions.keySet().contains(a)) return positions.get(a);
+		else return getPositionOther(a);
+	}
+
+	/**
+	 * Finds an <code>XcasAnnotation</code> of the same type as specified, and
+	 * the same text span, then returns its line and column numbers.
+	 * @param a An <code>XcasAnnotation</code> against which a similar
+	 *          <code>XcasAnnotation</code> in this <code>XcasFile</code>
+	 *          is to be matched.
+	 * @return A string containing the line and column numbers of the specified object,
+	 *         in the form of <code>line_number:column_number</code>.
+	 * @see #getPositionOther(XcasAnnotation)
+	 */
+	public String getPositionSimilar (XcasAnnotation a) {
+		for (XcasAnnotation o : positions.keySet())
+			if (o.type.equals(a.type)) {
+				int oBegin = o.attributes.containsKey("begin") ? Integer.parseInt(o.getAttribute("begin")) : -1;
+				int oEnd = o.attributes.containsKey("end") ? Integer.parseInt(o.getAttribute("end")) : -1;
+				int aBegin = a.attributes.containsKey("begin") ? Integer.parseInt(a.getAttribute("begin")) : -2;
+				int aEnd = a.attributes.containsKey("end") ? Integer.parseInt(a.getAttribute("end")) : -2;
+				if (oBegin==aBegin && oEnd==aEnd) return positions.get(o);
+				else if (o.attributes.containsKey("key") && a.attributes.containsKey("key") && o.getAttribute("key").equals(a.getAttribute("key")))
+					return positions.get(o);
+			}
+		return null;
+	}
+
+	/**
+	 * Checks whether this XCAS file has an annotation with the specified id.
+	 * @param id UIMA CAS internal <code>_id</code>.
+	 * @return <code>true</code> if file has an annotation with the specified id,
+	 *         <code>false</code> otherwise.
+	 */
+	public boolean hasAnnotation (int id) { return annotations.containsKey(id); }
+
+	/**
+	 * Checks whether this XCAS file has the specified <code>XcasAnnotation</code>.
+	 * If there is an <code>XcasAnnotation</code> object that has exactly the same
+	 * type, attributes, and references, return <code>true</code>.
+	 * @param a An <code>XcasAnnotation</code> to check.
+	 * @return <code>true</code> if there is an <code>XcasAnnotation</code> that equals
+	 *         the specified one, <code>false</code> otherwise.
+	 * @see XcasAnnotation#equals(Object)
+	 */
+	public boolean hasAnnotation (XcasAnnotation a) { return annotations.containsValue(a); }
+
+	/**
+	 * Checks whether the specified object has the same set of annotations. First check
+	 * whether the specified is an <code>XcasFile</code> object. If so, check whether
+	 * its annotation set is of the same size as in this <code>XcasFile</code>, then check
+	 * whether these two sets are equal. 
+	 * @param obj An object to compare to.
+	 * @return <code>true</code> if the specified object is an <code>XcasFile</code> object and
+	 *         has the same set of <code>XcasAnnotation</code>s, <code>false</code> otherwise.
+	 */
+	public boolean equals (Object obj) {
+		if (obj.getClass()!=getClass() || annotations.values().size()!=((XcasFile)obj).annotations.values().size()) return false;
+		return annotations.values().containsAll(((XcasFile)obj).annotations.values());
+	}
+
+	public LinkedList<XcasAnnotation> annotationsClone () {
+		LinkedList<XcasAnnotation> ret = new LinkedList<XcasAnnotation>();
+		Hashtable<XcasAnnotation, XcasAnnotation> cloneMap = new Hashtable<XcasAnnotation, XcasAnnotation>();
+		for (XcasAnnotation a : annotations.values()) {
+			XcasAnnotation c = a.shallowCopy();
+			cloneMap.put(a, c);
+			ret.add(c);
+		}
+		for (XcasAnnotation a : annotations.values())
+			for (String s : a.references.keySet())
+				for (XcasAnnotation r : (Vector<XcasAnnotation>)a.references.get(s))
+					((Vector<XcasAnnotation>)cloneMap.get(a).references.get(s)).add(cloneMap.get(r));
+		return ret;
+	}
+
+	public Object clone () {
+		return null; //TODO implement clone?
+		// Should not use XcasAnnotation.clone()
+		// otherwise, XcasAnnotation objects referenced by multiple objects will be cloned more than once.
+	}
+}

Modified: incubator/ctakes/branches/SHARPn-cTAKES/PAD term spotter/utils/src/edu/mayo/bmi/utils/xcas_comparison/XcasProcessor.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/branches/SHARPn-cTAKES/PAD%20term%20spotter/utils/src/edu/mayo/bmi/utils/xcas_comparison/XcasProcessor.java?rev=1403989&r1=1403988&r2=1403989&view=diff
==============================================================================
--- incubator/ctakes/branches/SHARPn-cTAKES/PAD term spotter/utils/src/edu/mayo/bmi/utils/xcas_comparison/XcasProcessor.java (original)
+++ incubator/ctakes/branches/SHARPn-cTAKES/PAD term spotter/utils/src/edu/mayo/bmi/utils/xcas_comparison/XcasProcessor.java Wed Oct 31 05:26:43 2012
@@ -1,18 +1,11 @@
 /*
- * Copyright: (c) 2009   Mayo Foundation for Medical Education and 
- * Research (MFMER). All rights reserved. MAYO, MAYO CLINIC, and the
- * triple-shield Mayo logo are trademarks and service marks of MFMER.
- *
- * Except as contained in the copyright notice above, or as used to identify 
- * MFMER as the author of this software, the trade names, trademarks, service
- * marks, or product names of the copyright holder shall not be used in
- * advertising, promotion or otherwise in connection with this software without
- * prior written authorization of the copyright holder.
- * 
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
  * http://www.apache.org/licenses/LICENSE-2.0 
  * 
  * Unless required by applicable law or agreed to in writing, software
@@ -21,171 +14,171 @@
  * See the License for the specific language governing permissions and 
  * limitations under the License. 
  */
-package edu.mayo.bmi.utils.xcas_comparison;
-import java.util.Hashtable;
-import java.util.HashMap;
-import java.util.LinkedList;
-import java.io.File;
-
-import javax.xml.parsers.SAXParser;
-import javax.xml.parsers.SAXParserFactory;
-import org.xml.sax.Attributes;
-import org.xml.sax.SAXException;
-import org.xml.sax.SAXParseException;
-import org.xml.sax.Locator;
-import org.xml.sax.helpers.DefaultHandler;
-
-/**
- * A SAX parser that parses an XCAS file.
- * This is done without referencing to the UIMA Type System definition.
- * Therefore, {@link XcasDiff} can be used to compare XCASes from
- * different type systems.
- * @author Mayo Clinic
- *
- */
-public class XcasProcessor extends DefaultHandler {
-
-	private Locator loc;
-	private SAXParser sp;
-	private XcasFile xcasf;
-	private Hashtable<String, Integer> pendingRef;
-	private Hashtable<Integer, int[]> pendingArr;
-	private Hashtable<Integer, int[]> pendingIntArr;
-	private HashMap<Integer, int[]> pendingList;
-	private String parentTag;
-	private int arrID;
-	private int arrInd = -1;
-	private StringBuffer val;
-
-	public void setDocumentLocator(Locator locator) { loc = locator; }
-
-	/**
-	 * Default constructor.
-	 */
-	public XcasProcessor () {
-		pendingRef = new Hashtable<String, Integer>();
-		pendingArr = new Hashtable<Integer, int[]>();
-		pendingIntArr = new Hashtable<Integer, int[]>();
-		pendingList = new HashMap<Integer, int[]>();
-		val = new StringBuffer();
-		try { sp = SAXParserFactory.newInstance().newSAXParser(); }
-		catch (Exception e) { e.printStackTrace(); }
-	}
-
-	/**
-	 * Parses the specified file and returns a parsed <code>XcasFile</code> object.
-	 * @param f A File object.
-	 * @return An <code>XcasFile</code> object.
-	 */
-	public XcasFile process (File f) {
-		xcasf = new XcasFile(f);
-		pendingRef.clear();
-		pendingArr.clear();
-		pendingIntArr.clear();
-		pendingList.clear();
-		val.delete(0, val.length());
-		arrInd = -1;
-		try { sp.parse(f, this); }
-		catch (SAXParseException spe) {
-			System.err.println("Error parsing XCAS file: "+f+" at line"+spe.getLineNumber());
-			System.err.println(spe.getMessage());
-		}
-		catch (Exception e) { e.printStackTrace(); }
-		return xcasf;
-	}
-
-	/**
-	 * Parses the specified file and returns a parsed <code>XcasFile</code> object.
-	 * @param f File name.
-	 * @return An <code>XcasFile</code> object.
-	 */
-	public XcasFile process (String f) {
-		return process(new File(f));
-	}
-
-	public void characters (char[] ch, int start, int length) throws SAXException {
-		val.append(ch, start, length);
-	}
-
-	public void startElement (String uri, String localName, String qName, Attributes attributes) throws SAXException {
-		val.delete(0, val.length());
-		String s = attributes.getValue(Const.ID);
-		int id;
-		if (s==null) return;
-		else id = Integer.parseInt(s);
-		if (qName.equalsIgnoreCase(Const.UIMA_FSARRAY)) {
-			pendingArr.put(id, new int[Integer.parseInt(attributes.getValue(Const.UIMA_ARRAY_SIZE_KEYWORD))]);
-			parentTag = qName;
-			arrID = id;
-			arrInd = 0;
-		}
-		else if (qName.equalsIgnoreCase(Const.UIMA_INTARRAY)) {
-			pendingIntArr.put(id, new int[Integer.parseInt(attributes.getValue(Const.UIMA_ARRAY_SIZE_KEYWORD))]);
-			parentTag = qName;
-			arrID = id;
-			arrInd = 0;
-		}
-		else if (qName.equalsIgnoreCase(Const.UIMA_NONEMPTY_FSLIST)) {
-			int[] ref = {Integer.parseInt(attributes.getValue(Const.UIMA_LIST_HEAD_KEYWORD)),
-					Integer.parseInt(attributes.getValue(Const.UIMA_LIST_TAIL_KEYWORD))};
-			pendingList.put(id, ref);
-		}
-		else if (qName.equalsIgnoreCase(Const.UIMA_EMPTY_FSLIST)) {
-			pendingList.put(id, null);
-		}
-		else if (!qName.equalsIgnoreCase(Const.UIMA_ARRAY_INDEX_KEYWORD)) {
-			XcasAnnotation a = new XcasAnnotation(qName);
-			for (int i = attributes.getLength(); i > 0; i--) {
-				String q = attributes.getQName(i-1);
-				String v = attributes.getValue(i-1);
-				if (q.equalsIgnoreCase(Const.ID) || Const.ATTRIBUTES_TO_IGNORE.contains(q)) continue;
-				else if (q.startsWith(Const.REF_PREFIX)) pendingRef.put(Integer.toString(id)+":"+q, Integer.parseInt(v));
-				else a.insertAttribute(q, v);
-			}
-			xcasf.addAnnotation(id, a, loc.getLineNumber()+":"+loc.getColumnNumber());
-		}
-	}
-
-	public void endElement (String uri, String localName, String qName) throws SAXException {
-		if (qName.equalsIgnoreCase(Const.UIMA_ARRAY_INDEX_KEYWORD)) {
-			if (parentTag.equalsIgnoreCase(Const.UIMA_FSARRAY))
-				pendingArr.get(arrID)[arrInd++] = Integer.parseInt(val.toString());
-			else if (parentTag.equalsIgnoreCase(Const.UIMA_INTARRAY))
-				pendingIntArr.get(arrID)[arrInd++] = Integer.parseInt(val.toString());
-		} else if (qName.equalsIgnoreCase(Const.UIMA_FSARRAY)) {
-			arrInd = -1;
-		} else if (qName.equalsIgnoreCase(Const.UIMA_INTARRAY)) {
-			arrInd = -1;
-		} else if (qName.equalsIgnoreCase(Const.UIMA_CAS)) {
-			for (String s : pendingRef.keySet()) {
-				String[] ref = s.split(":");
-				int refID = pendingRef.get(s);
-				XcasAnnotation a = xcasf.getAnnotation(Integer.parseInt(ref[0]));
-				if (pendingIntArr.containsKey(refID)) {
-					a.insertIntReference(ref[1], pendingIntArr.get(refID));
-					continue;
-				}
-				int[] arr;
-				if (pendingArr.containsKey(refID))
-					arr = pendingArr.get(refID);
-				else if (pendingList.containsKey(refID)) {
-					LinkedList<Integer> ll = new LinkedList<Integer>();
-					int[] l = pendingList.get(refID);
-					while (l!=null) {
-						ll.add(l[0]);
-						l = pendingList.get(l[1]);
-					}
-					arr = new int[ll.size()];
-					for (int i = 0; i < arr.length; i++)
-						arr[i] = ll.get(i);
-				}
-				else { arr = new int[1]; arr[0] = refID; }
-				for (int i : arr)
-					a.insertReference(ref[1], xcasf.getAnnotation(i));
-			}
-		}
-		if (val.length()>0 && !qName.equalsIgnoreCase(Const.UIMA_ARRAY_INDEX_KEYWORD) && !qName.equalsIgnoreCase(Const.UIMA_TCAS_DOCUMENT))
-			System.err.println("Unexpected text ("+qName+"): \""+val.toString()+"\"");
-		val.delete(0, val.length());
-	}
-}
+package edu.mayo.bmi.utils.xcas_comparison;
+import java.util.Hashtable;
+import java.util.HashMap;
+import java.util.LinkedList;
+import java.io.File;
+
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.SAXParseException;
+import org.xml.sax.Locator;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * A SAX parser that parses an XCAS file.
+ * This is done without reference to the UIMA Type System definition.
+ * Therefore, {@link XcasDiff} can be used to compare XCASes from
+ * different type systems.
+ * @author Mayo Clinic
+ *
+ */
+public class XcasProcessor extends DefaultHandler {
+
+	private Locator loc;
+	private SAXParser sp;
+	private XcasFile xcasf;
+	private Hashtable<String, Integer> pendingRef;
+	private Hashtable<Integer, int[]> pendingArr;
+	private Hashtable<Integer, int[]> pendingIntArr;
+	private HashMap<Integer, int[]> pendingList;
+	private String parentTag;
+	private int arrID;
+	private int arrInd = -1;
+	private StringBuffer val;
+
+	public void setDocumentLocator(Locator locator) { loc = locator; }
+
+	/**
+	 * Default constructor.
+	 */
+	public XcasProcessor () {
+		pendingRef = new Hashtable<String, Integer>();
+		pendingArr = new Hashtable<Integer, int[]>();
+		pendingIntArr = new Hashtable<Integer, int[]>();
+		pendingList = new HashMap<Integer, int[]>();
+		val = new StringBuffer();
+		try { sp = SAXParserFactory.newInstance().newSAXParser(); }
+		catch (Exception e) { e.printStackTrace(); }
+	}
+
+	/**
+	 * Parses the specified file and returns a parsed <code>XcasFile</code> object.
+	 * @param f A File object.
+	 * @return An <code>XcasFile</code> object.
+	 */
+	public XcasFile process (File f) {
+		xcasf = new XcasFile(f);
+		pendingRef.clear();
+		pendingArr.clear();
+		pendingIntArr.clear();
+		pendingList.clear();
+		val.delete(0, val.length());
+		arrInd = -1;
+		try { sp.parse(f, this); }
+		catch (SAXParseException spe) {
+			System.err.println("Error parsing XCAS file: "+f+" at line"+spe.getLineNumber());
+			System.err.println(spe.getMessage());
+		}
+		catch (Exception e) { e.printStackTrace(); }
+		return xcasf;
+	}
+
+	/**
+	 * Parses the specified file and returns a parsed <code>XcasFile</code> object.
+	 * @param f File name.
+	 * @return An <code>XcasFile</code> object.
+	 */
+	public XcasFile process (String f) {
+		return process(new File(f));
+	}
+
+	public void characters (char[] ch, int start, int length) throws SAXException {
+		val.append(ch, start, length);
+	}
+
+	public void startElement (String uri, String localName, String qName, Attributes attributes) throws SAXException {
+		val.delete(0, val.length());
+		String s = attributes.getValue(Const.ID);
+		int id;
+		if (s==null) return;
+		else id = Integer.parseInt(s);
+		if (qName.equalsIgnoreCase(Const.UIMA_FSARRAY)) {
+			pendingArr.put(id, new int[Integer.parseInt(attributes.getValue(Const.UIMA_ARRAY_SIZE_KEYWORD))]);
+			parentTag = qName;
+			arrID = id;
+			arrInd = 0;
+		}
+		else if (qName.equalsIgnoreCase(Const.UIMA_INTARRAY)) {
+			pendingIntArr.put(id, new int[Integer.parseInt(attributes.getValue(Const.UIMA_ARRAY_SIZE_KEYWORD))]);
+			parentTag = qName;
+			arrID = id;
+			arrInd = 0;
+		}
+		else if (qName.equalsIgnoreCase(Const.UIMA_NONEMPTY_FSLIST)) {
+			int[] ref = {Integer.parseInt(attributes.getValue(Const.UIMA_LIST_HEAD_KEYWORD)),
+					Integer.parseInt(attributes.getValue(Const.UIMA_LIST_TAIL_KEYWORD))};
+			pendingList.put(id, ref);
+		}
+		else if (qName.equalsIgnoreCase(Const.UIMA_EMPTY_FSLIST)) {
+			pendingList.put(id, null);
+		}
+		else if (!qName.equalsIgnoreCase(Const.UIMA_ARRAY_INDEX_KEYWORD)) {
+			XcasAnnotation a = new XcasAnnotation(qName);
+			for (int i = attributes.getLength(); i > 0; i--) {
+				String q = attributes.getQName(i-1);
+				String v = attributes.getValue(i-1);
+				if (q.equalsIgnoreCase(Const.ID) || Const.ATTRIBUTES_TO_IGNORE.contains(q)) continue;
+				else if (q.startsWith(Const.REF_PREFIX)) pendingRef.put(Integer.toString(id)+":"+q, Integer.parseInt(v));
+				else a.insertAttribute(q, v);
+			}
+			xcasf.addAnnotation(id, a, loc.getLineNumber()+":"+loc.getColumnNumber());
+		}
+	}
+
+	public void endElement (String uri, String localName, String qName) throws SAXException {
+		if (qName.equalsIgnoreCase(Const.UIMA_ARRAY_INDEX_KEYWORD)) {
+			if (parentTag.equalsIgnoreCase(Const.UIMA_FSARRAY))
+				pendingArr.get(arrID)[arrInd++] = Integer.parseInt(val.toString());
+			else if (parentTag.equalsIgnoreCase(Const.UIMA_INTARRAY))
+				pendingIntArr.get(arrID)[arrInd++] = Integer.parseInt(val.toString());
+		} else if (qName.equalsIgnoreCase(Const.UIMA_FSARRAY)) {
+			arrInd = -1;
+		} else if (qName.equalsIgnoreCase(Const.UIMA_INTARRAY)) {
+			arrInd = -1;
+		} else if (qName.equalsIgnoreCase(Const.UIMA_CAS)) {
+			for (String s : pendingRef.keySet()) {
+				String[] ref = s.split(":");
+				int refID = pendingRef.get(s);
+				XcasAnnotation a = xcasf.getAnnotation(Integer.parseInt(ref[0]));
+				if (pendingIntArr.containsKey(refID)) {
+					a.insertIntReference(ref[1], pendingIntArr.get(refID));
+					continue;
+				}
+				int[] arr;
+				if (pendingArr.containsKey(refID))
+					arr = pendingArr.get(refID);
+				else if (pendingList.containsKey(refID)) {
+					LinkedList<Integer> ll = new LinkedList<Integer>();
+					int[] l = pendingList.get(refID);
+					while (l!=null) {
+						ll.add(l[0]);
+						l = pendingList.get(l[1]);
+					}
+					arr = new int[ll.size()];
+					for (int i = 0; i < arr.length; i++)
+						arr[i] = ll.get(i);
+				}
+				else { arr = new int[1]; arr[0] = refID; }
+				for (int i : arr)
+					a.insertReference(ref[1], xcasf.getAnnotation(i));
+			}
+		}
+		if (val.length()>0 && !qName.equalsIgnoreCase(Const.UIMA_ARRAY_INDEX_KEYWORD) && !qName.equalsIgnoreCase(Const.UIMA_TCAS_DOCUMENT))
+			System.err.println("Unexpected text ("+qName+"): \""+val.toString()+"\"");
+		val.delete(0, val.length());
+	}
+}

Modified: incubator/ctakes/branches/SHARPn-cTAKES/POS tagger/scripts/java/data/pos/dictionary/ListTags.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/branches/SHARPn-cTAKES/POS%20tagger/scripts/java/data/pos/dictionary/ListTags.java?rev=1403989&r1=1403988&r2=1403989&view=diff
==============================================================================
--- incubator/ctakes/branches/SHARPn-cTAKES/POS tagger/scripts/java/data/pos/dictionary/ListTags.java (original)
+++ incubator/ctakes/branches/SHARPn-cTAKES/POS tagger/scripts/java/data/pos/dictionary/ListTags.java Wed Oct 31 05:26:43 2012
@@ -1,18 +1,11 @@
 /*
- * Copyright: (c) 2009   Mayo Foundation for Medical Education and 
- * Research (MFMER). All rights reserved. MAYO, MAYO CLINIC, and the
- * triple-shield Mayo logo are trademarks and service marks of MFMER.
- *
- * Except as contained in the copyright notice above, or as used to identify 
- * MFMER as the author of this software, the trade names, trademarks, service
- * marks, or product names of the copyright holder shall not be used in
- * advertising, promotion or otherwise in connection with this software without
- * prior written authorization of the copyright holder.
- * 
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
  * http://www.apache.org/licenses/LICENSE-2.0 
  * 
  * Unless required by applicable law or agreed to in writing, software
@@ -21,143 +14,143 @@
  * See the License for the specific language governing permissions and 
  * limitations under the License. 
  */
-package data.pos.dictionary;
-
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileNotFoundException;
-import java.io.FileReader;
-import java.io.IOException;
-import java.io.Reader;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.ArrayList;
-
-
-/**
- * From a POS corpus in OpenNLP format, create a list of the POS tags found within the corpus
- * <br>Outputs the list of tags to stdout, and for each tag, outputs one word/token that 
- * had been tagged with that tag
- * @author Mayo Clinic
- */
-
-public class ListTags {
-
-	/*  
-	 * writes the list of tags to stdout, together with an example / a word found tagged with that tag
-	 */
-	private static void writeTagList(File f, HashMap<String, String> tagList) throws IOException {
-		// sort them before outputting them
-        ArrayList<String> list = new ArrayList<String>();
-		for (Object key : tagList.keySet()) {
-			list.add(key.toString());
-		}
-        Collections.sort(list);
-        
-        // output to stdout
-        System.out.println("\nFor file " + f.getName() + ":");
-        for (String s : list) {
-    		System.out.println(s + "\t   which was a tag for '" + tagList.get(s)+ "'"); // output the tagList entry to stdout        	
-        }
-	}
-
-	// Use a HashMap so we can keep an example, for each tag, of what was tagged with the tag
-	private static HashMap<String, String> createTagList(BufferedReader br) throws IOException {
-		HashMap<String, String> tagList;
-		tagList = new HashMap<String, String> (100); // initial size is arbitrary
-		String line;
-		String tag;
-		int pos; // position of last underscore
-		String taggedThing;
-		while((line = br.readLine()) != null) {
-			for (String token : line.split(" ")) {
-				pos = token.lastIndexOf('_');
-				if (pos < 0) {
-					System.err.println("ERROR: didn't find underscore within '" + token + "'");
-				}
-				taggedThing = token.substring(0, pos);
-				tag = token.substring(pos+1);
-				if (tagList.get(tag)==null) {
-					tagList.put(tag, taggedThing);
-				}
-				else {
-					//	System.out.println(tag + " already was seen for " + taggedThing);
-				}	
-			}
-		}
-		return tagList;
-	}
-
-	private static BufferedReader getBufferedReader(String filename) throws FileNotFoundException {
-		File f = new File(filename);
-		Reader r;
-		try {
-			r = new FileReader(f);
-		} catch (FileNotFoundException e) {
-			System.err.println("Error reading from file " + filename);
-			throw e;
-		}
-
-		return new BufferedReader(r);		
-	}
-
-
-	/**
-	 * Read a file containing POS-tagged tokens in OpenNLP format,
-	 * and output to stdout the list of tags found<br>
-	 *  Example input:
-	 *  <br>body_NN
-	 *  <br>winning_VBG<br>
-	 *  <br>body_NN<br>
-	 *  Example output:
-	 *  <br>NN
-	 *  <br>VBG
-	 * @param args args[0] is required - the name of the input file containing 
-	 * POS-tagged tokens in OpenNLP format.
-	 * <br>E.g. data/pos/ptb-pos-training.txt
-	 *  
-	 */
-	public static void main(String[] args) {
-
-		if (args[0]==null || args[0].length()==0) {
-			System.err.println("ERROR: corpus name required");
-			return;
-		}
-		
-		String arg0 = args[0].trim();
-		if (arg0.equals("-h") || (arg0.equals("--help"))) {
-			System.out.println("Usage: java ListTags <corpus-name>");
-			System.out.println("  where <corpus-name> is something like   data/pos/ptb-pos-training.txt");
-			System.out.println("Usage: java ListTags <directory>");
-			System.out.println("  where <directory> is something like   data/pos/");
-			return;
-		}
-		
-		String inputPath = args[0];
-		
-		File f = new File(inputPath);
-		File [] files; // list of files to process
-		if (f.isDirectory()) { // directory name was input
-			files = f.listFiles(); // process all within the dir
-		}
-		else { // name of a regular file was input
-			files = new File[1];
-			files[0] = f;
-		}
-		HashMap<String, String> tagList;
-		
-		try {
-			for (File file : files) {
-				if (file.isDirectory()) continue; // skip subdirectories
-				if (file.getName().endsWith(".lnk")) continue; // skip shortcuts
-				BufferedReader br = getBufferedReader(file.getAbsolutePath());
-				tagList = createTagList(br);
-				writeTagList(file, tagList);
-			}
-		} catch (IOException e) {
-			System.err.println("Failed");
-		}
-    	
-    }
-	
-}
+package data.pos.dictionary;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.Reader;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.ArrayList;
+
+
+/**
+ * From a POS corpus in OpenNLP format, create a list of the POS tags found within the corpus
+ * <br>Outputs the list of tags to stdout, and for each tag, outputs one word/token that 
+ * had been tagged with that tag
+ * @author Mayo Clinic
+ */
+
+public class ListTags {
+
+	/**
+	 * Writes the list of tags to stdout in sorted order, each together with
+	 * an example, i.e. a word that was found tagged with that tag.
+	 * @param f the file the tags came from; only its name is printed
+	 * @param tagList map from tag to one example token
+	 */
+	private static void writeTagList(File f, HashMap<String, String> tagList) throws IOException {
+		// sort them before outputting them
+		ArrayList<String> list = new ArrayList<String>(tagList.keySet());
+		Collections.sort(list);
+
+		// output to stdout
+		System.out.println("\nFor file " + f.getName() + ":");
+		for (String s : list) {
+			System.out.println(s + "\t   which was a tag for '" + tagList.get(s)+ "'");
+		}
+	}
+
+	/**
+	 * Reads space-separated <code>word_TAG</code> tokens from the reader and
+	 * collects the tags. A HashMap is used so that, for each tag, the first
+	 * token seen with that tag is kept as an example.
+	 * Tokens without an underscore are reported on stderr and skipped;
+	 * empty tokens (blank lines, repeated spaces) are skipped silently.
+	 * @param br source of the tagged corpus; not closed by this method
+	 * @return map from tag to the first token found with that tag
+	 */
+	private static HashMap<String, String> createTagList(BufferedReader br) throws IOException {
+		HashMap<String, String> tagList = new HashMap<String, String> (100); // initial size is arbitrary
+		String line;
+		while((line = br.readLine()) != null) {
+			for (String token : line.split(" ")) {
+				if (token.length() == 0) {
+					continue;
+				}
+				int pos = token.lastIndexOf('_'); // position of last underscore
+				if (pos < 0) {
+					System.err.println("ERROR: didn't find underscore within '" + token + "'");
+					// without this, substring(0, -1) below would throw
+					continue;
+				}
+				String taggedThing = token.substring(0, pos);
+				String tag = token.substring(pos+1);
+				if (!tagList.containsKey(tag)) {
+					tagList.put(tag, taggedThing);
+				}
+			}
+		}
+		return tagList;
+	}
+
+	/**
+	 * Opens the named file for buffered reading.
+	 * @throws FileNotFoundException if the file cannot be opened; the
+	 * failure is also reported on stderr
+	 */
+	private static BufferedReader getBufferedReader(String filename) throws FileNotFoundException {
+		File f = new File(filename);
+		Reader r;
+		try {
+			r = new FileReader(f);
+		} catch (FileNotFoundException e) {
+			System.err.println("Error reading from file " + filename);
+			throw e;
+		}
+
+		return new BufferedReader(r);
+	}
+
+
+	/**
+	 * Read a file containing POS-tagged tokens in OpenNLP format,
+	 * and output to stdout the list of tags found<br>
+	 *  Example input:
+	 *  <br>body_NN
+	 *  <br>winning_VBG<br>
+	 *  <br>body_NN<br>
+	 *  Example output:
+	 *  <br>NN
+	 *  <br>VBG
+	 * @param args args[0] is required - the name of the input file containing 
+	 * POS-tagged tokens in OpenNLP format, or a directory whose regular files
+	 * are each processed in turn.
+	 * <br>E.g. data/pos/ptb-pos-training.txt
+	 *  
+	 */
+	public static void main(String[] args) {
+
+		// check the length first: with no arguments args[0] does not exist
+		if (args.length == 0 || args[0]==null || args[0].length()==0) {
+			System.err.println("ERROR: corpus name required");
+			return;
+		}
+
+		String arg0 = args[0].trim();
+		if (arg0.equals("-h") || (arg0.equals("--help"))) {
+			System.out.println("Usage: java ListTags <corpus-name>");
+			System.out.println("  where <corpus-name> is something like   data/pos/ptb-pos-training.txt");
+			System.out.println("Usage: java ListTags <directory>");
+			System.out.println("  where <directory> is something like   data/pos/");
+			return;
+		}
+
+		String inputPath = args[0];
+
+		File f = new File(inputPath);
+		File [] files; // list of files to process
+		if (f.isDirectory()) { // directory name was input
+			files = f.listFiles(); // process all within the dir
+			if (files == null) { // listFiles() returns null on an I/O error
+				System.err.println("ERROR: unable to list directory " + inputPath);
+				return;
+			}
+		}
+		else { // name of a regular file was input
+			files = new File[] { f };
+		}
+
+		try {
+			for (File file : files) {
+				if (file.isDirectory()) continue; // skip subdirectories
+				if (file.getName().endsWith(".lnk")) continue; // skip shortcuts
+				BufferedReader br = getBufferedReader(file.getAbsolutePath());
+				HashMap<String, String> tagList;
+				try {
+					tagList = createTagList(br);
+				} finally {
+					br.close(); // release the file handle even if reading fails
+				}
+				writeTagList(file, tagList);
+			}
+		} catch (IOException e) {
+			System.err.println("Failed: " + e.getMessage());
+		}
+
+	}
+
+}

Modified: incubator/ctakes/branches/SHARPn-cTAKES/POS tagger/scripts/java/data/pos/training/GeniaPosTrainingDataExtractor.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/branches/SHARPn-cTAKES/POS%20tagger/scripts/java/data/pos/training/GeniaPosTrainingDataExtractor.java?rev=1403989&r1=1403988&r2=1403989&view=diff
==============================================================================
--- incubator/ctakes/branches/SHARPn-cTAKES/POS tagger/scripts/java/data/pos/training/GeniaPosTrainingDataExtractor.java (original)
+++ incubator/ctakes/branches/SHARPn-cTAKES/POS tagger/scripts/java/data/pos/training/GeniaPosTrainingDataExtractor.java Wed Oct 31 05:26:43 2012
@@ -1,18 +1,11 @@
 /*
- * Copyright: (c) 2009   Mayo Foundation for Medical Education and 
- * Research (MFMER). All rights reserved. MAYO, MAYO CLINIC, and the
- * triple-shield Mayo logo are trademarks and service marks of MFMER.
- *
- * Except as contained in the copyright notice above, or as used to identify 
- * MFMER as the author of this software, the trade names, trademarks, service
- * marks, or product names of the copyright holder shall not be used in
- * advertising, promotion or otherwise in connection with this software without
- * prior written authorization of the copyright holder.
- * 
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
  * http://www.apache.org/licenses/LICENSE-2.0 
  * 
  * Unless required by applicable law or agreed to in writing, software
@@ -21,203 +14,203 @@
  * See the License for the specific language governing permissions and 
  * limitations under the License. 
  */
-package data.pos.training;
-
-import java.io.File;
-import java.io.PrintStream;
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.List;
-
-import org.jdom.Element;
-import org.jdom.JDOMException;
-import org.jdom.input.SAXBuilder;
-
-/**
- * This class reads in the GENIA corpus and produces part-of-speech training
- * data. It reads in the corpus file GENIAcorpus3.02.pos.xml and writes out the
- * file found at data/pos/training/genia-pos-training.txt.
- * 
- * see also data/pos/training/README
- * 
- * @author Mayo Clinic
- * 
- */
-public class GeniaPosTrainingDataExtractor implements Iterator<data.pos.training.GeniaPosTrainingDataExtractor.TaggedAbstract> {
-
-	Iterator<?> articles;
-	Element article;
-	
-	public GeniaPosTrainingDataExtractor(String geniaCorpusFileName) throws JDOMException{
-		File geniaCorpusFile = new File(geniaCorpusFileName);
-		SAXBuilder builder = new SAXBuilder();
-		builder.setDTDHandler(null);
-		Element root = builder.build(geniaCorpusFile).getRootElement();
-		articles = root.getChildren("article").iterator();
-	}
-
-	
-	public boolean hasNext() {
-		if(article != null)
-			return true;
-		else {
-			if (articles.hasNext()) {
-				article = (Element) articles.next();
-				return true;
-			}
-		}
-		return false;
-	}
-
-	public TaggedAbstract next() {
-		if(hasNext()) {
-			TaggedAbstract taggedAbstract = parseArticle(article);
-			article = null;
-			return taggedAbstract;
-		}
-		return null;
-	}
-
-	public void remove() {}
-
-	public TaggedAbstract parseArticle(Element article) {
-		List<TaggedSentence> taggedSentences = new ArrayList<TaggedSentence>();
-		
-		Element title = article.getChild("title");
-		if (title != null)
-			taggedSentences.addAll(parseAbstract(title));
-		Element abstractElement = article.getChild("abstract");
-		if (abstractElement != null)
-			taggedSentences.addAll(parseAbstract(abstractElement));
-		return new TaggedAbstract(taggedSentences);
-	}
-	
-	public List<TaggedSentence> parseAbstract(Element titleOrAbstract){
-		List<TaggedSentence> taggedSentences = new ArrayList<TaggedSentence>();
-		Iterator<?> sentences = titleOrAbstract.getChildren("sentence").iterator();
-		while (sentences.hasNext()) {
-			Element sentence = (Element) sentences.next();
-			TaggedSentence taggedSentence = parseSentence(sentence);
-			taggedSentences.add(taggedSentence);
-		}
-		return taggedSentences;
-	}
-	
-	public TaggedSentence parseSentence(Element sentence){
-		List<TaggedWord> wordTags = new ArrayList<TaggedWord>();
-		Iterator<?> words = sentence.getChildren("w").iterator();
-		while (words.hasNext()) {
-			Element word = (Element) words.next();
-			String wordText = word.getText();
-			String posTag = word.getAttributeValue("c");
-			/**
-			 * If the posTag is an asterisk, then we want to find the next word that has a 
-			 * an actual posTag.  
-			 */
-			while (posTag.equals("*")) {
-				word = (Element) words.next();
-				wordText = wordText + word.getText();
-				posTag = word.getAttributeValue("c");
-			}
-			
-			if(posTag.indexOf("|") != -1)
-				System.out.println(wordText+":  "+posTag);
-			posTag = posTag.split("\\|")[0];
-			/**
-			 * some of the tags in Genia have white space that messes things up.  Just remove
-			 * the whitespace from these words.
-			 */
-			wordText = wordText.replaceAll("\\s", "");
-			wordTags.add(new TaggedWord(wordText, posTag));
-		}
-		return new TaggedSentence(wordTags);
-	}
-	
-	public class TaggedAbstract{
-		List<TaggedSentence> taggedSentences;
-
-		public TaggedAbstract(List<TaggedSentence> taggedSentences) {
-			super();
-			this.taggedSentences = taggedSentences;
-		}
-
-		public List<TaggedSentence> getTaggedSentences() {
-			return taggedSentences;
-		}
-
-		public void setTaggedSentences(List<TaggedSentence> taggedSentences) {
-			this.taggedSentences = taggedSentences;
-		}
-		
-	}
-	
-	public class TaggedSentence{
-		List<TaggedWord> taggedWords;
-
-		public TaggedSentence(List<TaggedWord> taggedWords) {
-			super();
-			this.taggedWords = taggedWords;
-		}
-
-		public List<TaggedWord> getTaggedWords() {
-			return taggedWords;
-		}
-
-		public void setTaggedWords(List<TaggedWord> taggedWords) {
-			this.taggedWords = taggedWords;
-		}
-	}
-	
-	public class TaggedWord{
-		String word;
-		String tag;
-		public String getWord() {
-			return word;
-		}
-		public void setWord(String word) {
-			this.word = word;
-		}
-		public String getTag() {
-			return tag;
-		}
-		public void setTag(String tag) {
-			this.tag = tag;
-		}
-		public TaggedWord(String word, String tag) {
-			super();
-			this.word = word;
-			this.tag = tag;
-		}
-		
-	}
-	
-	
-	public static void main(String[] args) {
-		try {
-			System.out.println("Usage: java GeniaPosExtractor GENIAcorpus3.02.pos.xml data/pos/training/genia-pos-training.txt");
-			String geniaCorpusFileName = args[0];
-			String outputFileName = args[1];
-
-			PrintStream out = new PrintStream(outputFileName);
-
-			GeniaPosTrainingDataExtractor gptde = new GeniaPosTrainingDataExtractor(geniaCorpusFileName);
-			while(gptde.hasNext()) {
-				TaggedAbstract taggedAbstract = gptde.next();
-				for(TaggedSentence taggedSentence : taggedAbstract.getTaggedSentences()) {
-					for(TaggedWord taggedWord : taggedSentence.getTaggedWords()) {
-						out.print(taggedWord.getWord()+"_"+taggedWord.getTag()+" ");
-					}
-					out.println();
-				}
-			}
-
-			out.flush();
-			out.close();
-
-		} catch (Exception e) {
-			e.printStackTrace();
-		}
-
-	}
-
-}
+package data.pos.training;
+
+import java.io.File;
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+import org.jdom.Element;
+import org.jdom.JDOMException;
+import org.jdom.input.SAXBuilder;
+
+/**
+ * This class reads in the GENIA corpus and produces part-of-speech training
+ * data. It reads in the corpus file GENIAcorpus3.02.pos.xml and writes out the
+ * file found at data/pos/training/genia-pos-training.txt.
+ * <p>
+ * The class is an iterator over the articles of the corpus; each call to
+ * {@link #next()} parses one article into a {@link TaggedAbstract}.
+ * 
+ * see also data/pos/training/README
+ * 
+ * @author Mayo Clinic
+ * 
+ */
+public class GeniaPosTrainingDataExtractor implements Iterator<data.pos.training.GeniaPosTrainingDataExtractor.TaggedAbstract> {
+
+	/** Iterator over the "article" children of the corpus root element. */
+	Iterator<?> articles;
+	/** Look-ahead: the article fetched by hasNext() and not yet consumed by next(), or null. */
+	Element article;
+
+	/**
+	 * Loads the corpus file and positions the iterator before its first article.
+	 * @param geniaCorpusFileName path of the GENIA corpus XML file
+	 */
+	public GeniaPosTrainingDataExtractor(String geniaCorpusFileName) throws JDOMException{
+		File geniaCorpusFile = new File(geniaCorpusFileName);
+		SAXBuilder builder = new SAXBuilder();
+		builder.setDTDHandler(null);
+		Element root = builder.build(geniaCorpusFile).getRootElement();
+		articles = root.getChildren("article").iterator();
+	}
+
+
+	/**
+	 * Fetches the next article into the look-ahead slot if it is empty.
+	 * @return true if an article is available for {@link #next()}
+	 */
+	public boolean hasNext() {
+		if(article != null)
+			return true;
+		else {
+			if (articles.hasNext()) {
+				article = (Element) articles.next();
+				return true;
+			}
+		}
+		return false;
+	}
+
+	/**
+	 * Parses and returns the next article.
+	 * @return the tagged title and abstract of the next article, or
+	 * <code>null</code> when no articles remain (this method does not throw
+	 * <code>NoSuchElementException</code>)
+	 */
+	public TaggedAbstract next() {
+		if(hasNext()) {
+			TaggedAbstract taggedAbstract = parseArticle(article);
+			article = null; // consume the look-ahead
+			return taggedAbstract;
+		}
+		return null;
+	}
+
+	/**
+	 * Removal is not supported; the corpus is read-only.
+	 * @throws UnsupportedOperationException always
+	 */
+	public void remove() {
+		throw new UnsupportedOperationException("remove");
+	}
+
+	/**
+	 * Collects the tagged sentences of an article's title followed by those
+	 * of its abstract; either part may be absent.
+	 */
+	public TaggedAbstract parseArticle(Element article) {
+		List<TaggedSentence> taggedSentences = new ArrayList<TaggedSentence>();
+
+		Element title = article.getChild("title");
+		if (title != null)
+			taggedSentences.addAll(parseAbstract(title));
+		Element abstractElement = article.getChild("abstract");
+		if (abstractElement != null)
+			taggedSentences.addAll(parseAbstract(abstractElement));
+		return new TaggedAbstract(taggedSentences);
+	}
+
+	/**
+	 * Parses every "sentence" child of a title or abstract element.
+	 */
+	public List<TaggedSentence> parseAbstract(Element titleOrAbstract){
+		List<TaggedSentence> taggedSentences = new ArrayList<TaggedSentence>();
+		Iterator<?> sentences = titleOrAbstract.getChildren("sentence").iterator();
+		while (sentences.hasNext()) {
+			Element sentence = (Element) sentences.next();
+			TaggedSentence taggedSentence = parseSentence(sentence);
+			taggedSentences.add(taggedSentence);
+		}
+		return taggedSentences;
+	}
+
+	/**
+	 * Converts the "w" children of a sentence element into tagged words.
+	 * The POS tag is read from the "c" attribute. A word whose tag is missing,
+	 * or whose "*" tag is never followed by a real tag, is reported on stderr
+	 * and left out of the result.
+	 */
+	public TaggedSentence parseSentence(Element sentence){
+		List<TaggedWord> wordTags = new ArrayList<TaggedWord>();
+		Iterator<?> words = sentence.getChildren("w").iterator();
+		while (words.hasNext()) {
+			Element word = (Element) words.next();
+			String wordText = word.getText();
+			String posTag = word.getAttributeValue("c");
+			/*
+			 * If the posTag is an asterisk, then we want to find the next word that has
+			 * an actual posTag, concatenating the text of the fragments on the way.
+			 * Stop if the sentence runs out of words instead of reading past the end.
+			 */
+			while ("*".equals(posTag) && words.hasNext()) {
+				word = (Element) words.next();
+				wordText = wordText + word.getText();
+				posTag = word.getAttributeValue("c");
+			}
+			if (posTag == null || "*".equals(posTag)) {
+				System.err.println("WARNING: skipping word without a usable POS tag: '" + wordText + "'");
+				continue;
+			}
+
+			// when several alternative tags are given, report them and keep the first
+			if(posTag.indexOf("|") != -1)
+				System.out.println(wordText+":  "+posTag);
+			posTag = posTag.split("\\|")[0];
+			/*
+			 * some of the tags in Genia have white space that messes things up.  Just remove
+			 * the whitespace from these words.
+			 */
+			wordText = wordText.replaceAll("\\s", "");
+			wordTags.add(new TaggedWord(wordText, posTag));
+		}
+		return new TaggedSentence(wordTags);
+	}
+
+	/** The tagged sentences of one article (title sentences first, then abstract). */
+	public class TaggedAbstract{
+		List<TaggedSentence> taggedSentences;
+
+		public TaggedAbstract(List<TaggedSentence> taggedSentences) {
+			super();
+			this.taggedSentences = taggedSentences;
+		}
+
+		public List<TaggedSentence> getTaggedSentences() {
+			return taggedSentences;
+		}
+
+		public void setTaggedSentences(List<TaggedSentence> taggedSentences) {
+			this.taggedSentences = taggedSentences;
+		}
+
+	}
+
+	/** One sentence as an ordered list of tagged words. */
+	public class TaggedSentence{
+		List<TaggedWord> taggedWords;
+
+		public TaggedSentence(List<TaggedWord> taggedWords) {
+			super();
+			this.taggedWords = taggedWords;
+		}
+
+		public List<TaggedWord> getTaggedWords() {
+			return taggedWords;
+		}
+
+		public void setTaggedWords(List<TaggedWord> taggedWords) {
+			this.taggedWords = taggedWords;
+		}
+	}
+
+	/** A word together with its part-of-speech tag. */
+	public class TaggedWord{
+		String word;
+		String tag;
+		public String getWord() {
+			return word;
+		}
+		public void setWord(String word) {
+			this.word = word;
+		}
+		public String getTag() {
+			return tag;
+		}
+		public void setTag(String tag) {
+			this.tag = tag;
+		}
+		public TaggedWord(String word, String tag) {
+			super();
+			this.word = word;
+			this.tag = tag;
+		}
+
+	}
+
+
+	/**
+	 * Converts the corpus to training data, one sentence per line, each word
+	 * written as <code>word_TAG</code> followed by a space.
+	 * @param args args[0] is the GENIA corpus XML file, args[1] the output file
+	 */
+	public static void main(String[] args) {
+		System.out.println("Usage: java GeniaPosExtractor GENIAcorpus3.02.pos.xml data/pos/training/genia-pos-training.txt");
+		if (args.length < 2) {
+			// nothing to do without both file names
+			return;
+		}
+		PrintStream out = null;
+		try {
+			String geniaCorpusFileName = args[0];
+			String outputFileName = args[1];
+
+			// parse the corpus before opening the output, so a bad corpus
+			// does not leave behind an empty or truncated output file
+			GeniaPosTrainingDataExtractor gptde = new GeniaPosTrainingDataExtractor(geniaCorpusFileName);
+			out = new PrintStream(outputFileName);
+
+			while(gptde.hasNext()) {
+				TaggedAbstract taggedAbstract = gptde.next();
+				for(TaggedSentence taggedSentence : taggedAbstract.getTaggedSentences()) {
+					for(TaggedWord taggedWord : taggedSentence.getTaggedWords()) {
+						out.print(taggedWord.getWord()+"_"+taggedWord.getTag()+" ");
+					}
+					out.println();
+				}
+			}
+
+			out.flush();
+
+		} catch (Exception e) {
+			e.printStackTrace();
+		} finally {
+			if (out != null)
+				out.close(); // always release the output file
+		}
+
+	}
+
+}

Modified: incubator/ctakes/branches/SHARPn-cTAKES/POS tagger/scripts/java/data/pos/training/GeniaPosTrainingDataExtractorTests.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/branches/SHARPn-cTAKES/POS%20tagger/scripts/java/data/pos/training/GeniaPosTrainingDataExtractorTests.java?rev=1403989&r1=1403988&r2=1403989&view=diff
==============================================================================
--- incubator/ctakes/branches/SHARPn-cTAKES/POS tagger/scripts/java/data/pos/training/GeniaPosTrainingDataExtractorTests.java (original)
+++ incubator/ctakes/branches/SHARPn-cTAKES/POS tagger/scripts/java/data/pos/training/GeniaPosTrainingDataExtractorTests.java Wed Oct 31 05:26:43 2012
@@ -1,18 +1,11 @@
 /*
- * Copyright: (c) 2009   Mayo Foundation for Medical Education and 
- * Research (MFMER). All rights reserved. MAYO, MAYO CLINIC, and the
- * triple-shield Mayo logo are trademarks and service marks of MFMER.
- *
- * Except as contained in the copyright notice above, or as used to identify 
- * MFMER as the author of this software, the trade names, trademarks, service
- * marks, or product names of the copyright holder shall not be used in
- * advertising, promotion or otherwise in connection with this software without
- * prior written authorization of the copyright holder.
- * 
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
  * http://www.apache.org/licenses/LICENSE-2.0 
  * 
  * Unless required by applicable law or agreed to in writing, software
@@ -21,94 +14,94 @@
  * See the License for the specific language governing permissions and 
  * limitations under the License. 
  */
-package data.pos.training;
-
-import static org.junit.Assert.assertEquals;
-
-import org.jdom.JDOMException;
-import org.junit.Test;
-
-import data.pos.training.GeniaPosTrainingDataExtractor.TaggedAbstract;
-import data.pos.training.GeniaPosTrainingDataExtractor.TaggedSentence;
-import data.pos.training.GeniaPosTrainingDataExtractor.TaggedWord;
-
-public class GeniaPosTrainingDataExtractorTests {
-
-	@Test
-    public void test() throws JDOMException {
-		GeniaPosTrainingDataExtractor gptde = new GeniaPosTrainingDataExtractor("test/data/GENIAcorpus3.02.pos.test.xml");  
-
-		TaggedAbstract taggedAbstract = gptde.next();
-		TaggedSentence taggedSentence;
-		TaggedWord taggedWord;
-		
-		//test one full sentence from title
-		taggedSentence = taggedAbstract.getTaggedSentences().get(0);
-		taggedWord = taggedSentence.getTaggedWords().get(0);
-		assertEquals("Pancreatic", taggedWord.getWord());
-		assertEquals("JJ", taggedWord.getTag());
-		taggedWord = taggedSentence.getTaggedWords().get(1);
-		assertEquals("development", taggedWord.getWord());
-		assertEquals("NN", taggedWord.getTag());
-		taggedWord = taggedSentence.getTaggedWords().get(2);
-		assertEquals("and", taggedWord.getWord());
-		assertEquals("CC", taggedWord.getTag());
-		taggedWord = taggedSentence.getTaggedWords().get(3);
-		assertEquals("maturation", taggedWord.getWord());
-		assertEquals("NN", taggedWord.getTag());
-		taggedWord = taggedSentence.getTaggedWords().get(4);
-		assertEquals("of", taggedWord.getWord());
-		assertEquals("IN", taggedWord.getTag());
-		taggedWord = taggedSentence.getTaggedWords().get(5);
-		assertEquals("the", taggedWord.getWord());
-		assertEquals("DT", taggedWord.getTag());
-		taggedWord = taggedSentence.getTaggedWords().get(6);
-		assertEquals("islet", taggedWord.getWord());
-		assertEquals("NN", taggedWord.getTag());
-		taggedWord = taggedSentence.getTaggedWords().get(7);
-		assertEquals("B", taggedWord.getWord());
-		assertEquals("NN", taggedWord.getTag());
-		taggedWord = taggedSentence.getTaggedWords().get(8);
-		assertEquals("cell", taggedWord.getWord());
-		assertEquals("NN", taggedWord.getTag());
-		taggedWord = taggedSentence.getTaggedWords().get(9);
-		assertEquals(".", taggedWord.getWord());
-		assertEquals(".", taggedWord.getTag());
-
-		//test one full sentence from abstract
-		//<sentence><w c="DT">The</w> <w c="CD">three</w> <w c="NNS">compartments</w> <w c="VBP">are</w> <w c="VBN">thought</w> <w c="TO">to</w> <w c="VB">be</w> <w c="IN">of</w> <w c="JJ">common</w> <w c="JJ">endodermal</w> <w c="NN">origin</w><w c=":">;</w> <w c="IN">in</w> <w c="NN">contrast</w> <w c="TO">to</w> <w c="JJR">earlier</w> <w c="NNS">hypotheses</w><w c=",">,</w> <w c="WDT">which</w> <w c="VBD">suggested</w> <w c="IN">that</w> <w c="DT">the</w> <w c="JJ">endocrine</w> <w c="NN">compartment</w> <w c="VBD">was</w> <w c="IN">of</w> <w c="JJ">neuroectodermal</w> <w c="NN">origin</w><w c=".">.</w></sentence>
-		taggedSentence = taggedAbstract.getTaggedSentences().get(5);
-		taggedWord = taggedSentence.getTaggedWords().get(0);
-		assertEquals("The", taggedWord.getWord());
-		assertEquals("DT", taggedWord.getTag());
-
-		
-		taggedSentence = taggedAbstract.getTaggedSentences().get(1);
-		assertEquals(6, taggedSentence.getTaggedWords().size());
-
-		taggedSentence = taggedAbstract.getTaggedSentences().get(2);
-		taggedWord = taggedSentence.getTaggedWords().get(0);
-		assertEquals("Pancreas", taggedWord.getWord());
-		assertEquals("NN", taggedWord.getTag());
-		taggedWord = taggedSentence.getTaggedWords().get(11);
-		assertEquals("anlage", taggedWord.getWord());
-		assertEquals("NNS", taggedWord.getTag());
-		taggedWord = taggedSentence.getTaggedWords().get(17);
-		assertEquals(".", taggedWord.getWord());
-		assertEquals(".", taggedWord.getTag());
-
-
-		
-		taggedAbstract = gptde.next();
-		taggedSentence = taggedAbstract.getTaggedSentences().get(4);
-		taggedWord = taggedSentence.getTaggedWords().get(0);
-		assertEquals("We", taggedWord.getWord());
-		assertEquals("PRP", taggedWord.getTag());
-		taggedWord = taggedSentence.getTaggedWords().get(37);
-		assertEquals("non-octamer", taggedWord.getWord());
-		assertEquals("JJ", taggedWord.getTag());
-		
-	}
-}
-
-
+package data.pos.training;
+
+import static org.junit.Assert.assertEquals;
+
+import org.jdom.JDOMException;
+import org.junit.Test;
+
+import data.pos.training.GeniaPosTrainingDataExtractor.TaggedAbstract;
+import data.pos.training.GeniaPosTrainingDataExtractor.TaggedSentence;
+import data.pos.training.GeniaPosTrainingDataExtractor.TaggedWord;
+
+public class GeniaPosTrainingDataExtractorTests {
+
+	@Test
+    public void test() throws JDOMException {
+		GeniaPosTrainingDataExtractor gptde = new GeniaPosTrainingDataExtractor("test/data/GENIAcorpus3.02.pos.test.xml");  
+
+		TaggedAbstract taggedAbstract = gptde.next();
+		TaggedSentence taggedSentence;
+		TaggedWord taggedWord;
+		
+		//test one full sentence from title
+		taggedSentence = taggedAbstract.getTaggedSentences().get(0);
+		taggedWord = taggedSentence.getTaggedWords().get(0);
+		assertEquals("Pancreatic", taggedWord.getWord());
+		assertEquals("JJ", taggedWord.getTag());
+		taggedWord = taggedSentence.getTaggedWords().get(1);
+		assertEquals("development", taggedWord.getWord());
+		assertEquals("NN", taggedWord.getTag());
+		taggedWord = taggedSentence.getTaggedWords().get(2);
+		assertEquals("and", taggedWord.getWord());
+		assertEquals("CC", taggedWord.getTag());
+		taggedWord = taggedSentence.getTaggedWords().get(3);
+		assertEquals("maturation", taggedWord.getWord());
+		assertEquals("NN", taggedWord.getTag());
+		taggedWord = taggedSentence.getTaggedWords().get(4);
+		assertEquals("of", taggedWord.getWord());
+		assertEquals("IN", taggedWord.getTag());
+		taggedWord = taggedSentence.getTaggedWords().get(5);
+		assertEquals("the", taggedWord.getWord());
+		assertEquals("DT", taggedWord.getTag());
+		taggedWord = taggedSentence.getTaggedWords().get(6);
+		assertEquals("islet", taggedWord.getWord());
+		assertEquals("NN", taggedWord.getTag());
+		taggedWord = taggedSentence.getTaggedWords().get(7);
+		assertEquals("B", taggedWord.getWord());
+		assertEquals("NN", taggedWord.getTag());
+		taggedWord = taggedSentence.getTaggedWords().get(8);
+		assertEquals("cell", taggedWord.getWord());
+		assertEquals("NN", taggedWord.getTag());
+		taggedWord = taggedSentence.getTaggedWords().get(9);
+		assertEquals(".", taggedWord.getWord());
+		assertEquals(".", taggedWord.getTag());
+
+		//test one full sentence from abstract
+		//<sentence><w c="DT">The</w> <w c="CD">three</w> <w c="NNS">compartments</w> <w c="VBP">are</w> <w c="VBN">thought</w> <w c="TO">to</w> <w c="VB">be</w> <w c="IN">of</w> <w c="JJ">common</w> <w c="JJ">endodermal</w> <w c="NN">origin</w><w c=":">;</w> <w c="IN">in</w> <w c="NN">contrast</w> <w c="TO">to</w> <w c="JJR">earlier</w> <w c="NNS">hypotheses</w><w c=",">,</w> <w c="WDT">which</w> <w c="VBD">suggested</w> <w c="IN">that</w> <w c="DT">the</w> <w c="JJ">endocrine</w> <w c="NN">compartment</w> <w c="VBD">was</w> <w c="IN">of</w> <w c="JJ">neuroectodermal</w> <w c="NN">origin</w><w c=".">.</w></sentence>
+		taggedSentence = taggedAbstract.getTaggedSentences().get(5);
+		taggedWord = taggedSentence.getTaggedWords().get(0);
+		assertEquals("The", taggedWord.getWord());
+		assertEquals("DT", taggedWord.getTag());
+
+		
+		taggedSentence = taggedAbstract.getTaggedSentences().get(1);
+		assertEquals(6, taggedSentence.getTaggedWords().size());
+
+		taggedSentence = taggedAbstract.getTaggedSentences().get(2);
+		taggedWord = taggedSentence.getTaggedWords().get(0);
+		assertEquals("Pancreas", taggedWord.getWord());
+		assertEquals("NN", taggedWord.getTag());
+		taggedWord = taggedSentence.getTaggedWords().get(11);
+		assertEquals("anlage", taggedWord.getWord());
+		assertEquals("NNS", taggedWord.getTag());
+		taggedWord = taggedSentence.getTaggedWords().get(17);
+		assertEquals(".", taggedWord.getWord());
+		assertEquals(".", taggedWord.getTag());
+
+
+		
+		taggedAbstract = gptde.next();
+		taggedSentence = taggedAbstract.getTaggedSentences().get(4);
+		taggedWord = taggedSentence.getTaggedWords().get(0);
+		assertEquals("We", taggedWord.getWord());
+		assertEquals("PRP", taggedWord.getTag());
+		taggedWord = taggedSentence.getTaggedWords().get(37);
+		assertEquals("non-octamer", taggedWord.getWord());
+		assertEquals("JJ", taggedWord.getTag());
+		
+	}
+}
+
+

Modified: incubator/ctakes/branches/SHARPn-cTAKES/POS tagger/src/edu/mayo/bmi/uima/pos_tagger/OpenNLPPOSCollectionReader.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/branches/SHARPn-cTAKES/POS%20tagger/src/edu/mayo/bmi/uima/pos_tagger/OpenNLPPOSCollectionReader.java?rev=1403989&r1=1403988&r2=1403989&view=diff
==============================================================================
--- incubator/ctakes/branches/SHARPn-cTAKES/POS tagger/src/edu/mayo/bmi/uima/pos_tagger/OpenNLPPOSCollectionReader.java (original)
+++ incubator/ctakes/branches/SHARPn-cTAKES/POS tagger/src/edu/mayo/bmi/uima/pos_tagger/OpenNLPPOSCollectionReader.java Wed Oct 31 05:26:43 2012
@@ -1,18 +1,11 @@
 /*
- * Copyright: (c) 2009   Mayo Foundation for Medical Education and 
- * Research (MFMER). All rights reserved. MAYO, MAYO CLINIC, and the
- * triple-shield Mayo logo are trademarks and service marks of MFMER.
- *
- * Except as contained in the copyright notice above, or as used to identify 
- * MFMER as the author of this software, the trade names, trademarks, service
- * marks, or product names of the copyright holder shall not be used in
- * advertising, promotion or otherwise in connection with this software without
- * prior written authorization of the copyright holder.
- * 
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
  * http://www.apache.org/licenses/LICENSE-2.0 
  * 
  * Unless required by applicable law or agreed to in writing, software
@@ -21,134 +14,134 @@
  * See the License for the specific language governing permissions and 
  * limitations under the License. 
  */
-package edu.mayo.bmi.uima.pos_tagger;
-
-import java.io.BufferedReader;
-import java.io.FileNotFoundException;
-import java.io.FileReader;
-import java.io.IOException;
-
-import org.apache.uima.cas.CAS;
-import org.apache.uima.cas.CASException;
-import org.apache.uima.collection.CollectionException;
-import org.apache.uima.collection.CollectionReader_ImplBase;
-import org.apache.uima.jcas.JCas;
-import org.apache.uima.resource.ResourceInitializationException;
-import org.apache.uima.util.Progress;
-
-import edu.mayo.bmi.uima.core.type.syntax.BaseToken;
-import edu.mayo.bmi.uima.core.type.textspan.Sentence;
-
-/**
- * This collection reader reads in part-of-speech training/test data in the
- * OpenNLP format. See note below for POS_DATA_FILE_PARAM for details about this
- * format. Each line in the file will correspond to a "document" - i.e.
- * getNext() will populate the CAS with information from one line of the file.
- */
-public class OpenNLPPOSCollectionReader extends CollectionReader_ImplBase {
-
-	/**
-	 * "PosDataFile" is a required, single, string parameter that specifies the
-	 * location of a data file that contains part-of-speech data in it. The
-	 * format of the file should have one sentence per line where each word is
-	 * followed immediately by "_" and it pos tag followed by a space.
-	 * 
-	 * <pre>
-	 * 	 		IL-2_NN gene_NN expression_NN and_CC ...
-	 * 	 	
-	 * </pre>
-	 */
-	public static final String POS_DATA_FILE_PARAM = "PosDataFile";
-
-	/**
-	 * "LoadWordsOnly" is a optional, single, boolean parameter that determines
-	 * whether or not the part-of-speech tags associated with each word will be
-	 * loaded into the CAS or not. The default value is false.
-	 */
-	public static final String LOAD_WORDS_ONLY_PARAM = "LoadWordsOnly";
-
-	BufferedReader input;
-	String line = null;
-
-	boolean loadWordsOnly;
-
-	@Override
-	public void initialize() throws ResourceInitializationException {
-		try {
-			String posDataFile = (String) getConfigParameterValue(POS_DATA_FILE_PARAM);
-			input = new BufferedReader(new FileReader(posDataFile));
-			Boolean paramValue = (Boolean) getConfigParameterValue(LOAD_WORDS_ONLY_PARAM);
-			loadWordsOnly = paramValue == null ? false : paramValue;
-
-		} catch (FileNotFoundException fnfe) {
-			throw new ResourceInitializationException(fnfe);
-		}
-	}
-
-	/**
-	 * Some of the code in this method is based loosely on
-	 * opennlp.tools.postag.POSEventCollector
-	 */
-	public void getNext(CAS cas) throws IOException, CollectionException {
-		try {
-			if (hasNext()) {
-				JCas jCas = cas.getJCas();
-				String[] tokens = line.split(" ");
-				int wordStart = 0;
-				int wordEnd = 0;
-				int wordNumber = 0;
-				StringBuffer documentText = new StringBuffer();
-				for (String token : tokens) {
-					int split = token.lastIndexOf("_");
-					if(split == token.length()-1) {
-						split = token.substring(0, token.length()-1).lastIndexOf("_");
-					}
-					if (split == -1) {
-						line = null;
-						throw new CollectionException("There is a problem in your training data: " + token
-								+ " does not conform to the format WORD_TAG.", null);
-					}
-					String word = token.substring(0, split);
-					wordEnd = wordStart + word.length();
-					// Consider creating a token similar to the way
-					// TokenConverter.convert method creates BaseToken's
-					BaseToken baseToken = new BaseToken(jCas, wordStart, wordEnd);
-					if (!loadWordsOnly) {
-						String tag = token.substring(split + 1);
-						baseToken.setPartOfSpeech(tag);
-					}
-					baseToken.setTokenNumber(wordNumber++);
-					baseToken.addToIndexes();
-
-					documentText.append(word + " ");
-					wordStart = wordEnd + 1;
-				}
-				Sentence sentence = new Sentence(jCas, 0, wordEnd);
-				sentence.setSentenceNumber(0);
-				sentence.addToIndexes();
-				jCas.setDocumentText(documentText.toString());
-			}
-		} catch (CASException ce) {
-			throw new CollectionException(ce);
-		}
-		line = null;
-	}
-
-	public void close() throws IOException {
-		input.close();
-	}
-
-	public Progress[] getProgress() {
-		return null;
-	}
-
-	public boolean hasNext() throws IOException, CollectionException {
-		if (line == null) {
-			line = input.readLine();
-		}
-		if (line == null)
-			return false;
-		return true;
-	}
-
-}
+package edu.mayo.bmi.uima.pos_tagger;
+
+import java.io.BufferedReader;
+import java.io.FileNotFoundException;
+import java.io.FileReader;
+import java.io.IOException;
+
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.CASException;
+import org.apache.uima.collection.CollectionException;
+import org.apache.uima.collection.CollectionReader_ImplBase;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.apache.uima.util.Progress;
+
+import edu.mayo.bmi.uima.core.type.syntax.BaseToken;
+import edu.mayo.bmi.uima.core.type.textspan.Sentence;
+
+/**
+ * This collection reader reads in part-of-speech training/test data in the
+ * OpenNLP format. See note below for POS_DATA_FILE_PARAM for details about this
+ * format. Each line in the file will correspond to a "document" - i.e.
+ * getNext() will populate the CAS with information from one line of the file.
+ */
+public class OpenNLPPOSCollectionReader extends CollectionReader_ImplBase {
+
+	/**
+	 * "PosDataFile" is a required, single, string parameter that specifies the
+	 * location of a data file that contains part-of-speech data in it. The
+	 * format of the file should have one sentence per line where each word is
+	 * followed immediately by "_" and it pos tag followed by a space.
+	 * 
+	 * <pre>
+	 * 	 		IL-2_NN gene_NN expression_NN and_CC ...
+	 * 	 	
+	 * </pre>
+	 */
+	public static final String POS_DATA_FILE_PARAM = "PosDataFile";
+
+	/**
+	 * "LoadWordsOnly" is a optional, single, boolean parameter that determines
+	 * whether or not the part-of-speech tags associated with each word will be
+	 * loaded into the CAS or not. The default value is false.
+	 */
+	public static final String LOAD_WORDS_ONLY_PARAM = "LoadWordsOnly";
+
+	BufferedReader input;
+	String line = null;
+
+	boolean loadWordsOnly;
+
+	@Override
+	public void initialize() throws ResourceInitializationException {
+		try {
+			String posDataFile = (String) getConfigParameterValue(POS_DATA_FILE_PARAM);
+			input = new BufferedReader(new FileReader(posDataFile));
+			Boolean paramValue = (Boolean) getConfigParameterValue(LOAD_WORDS_ONLY_PARAM);
+			loadWordsOnly = paramValue == null ? false : paramValue;
+
+		} catch (FileNotFoundException fnfe) {
+			throw new ResourceInitializationException(fnfe);
+		}
+	}
+
+	/**
+	 * Some of the code in this method is based loosely on
+	 * opennlp.tools.postag.POSEventCollector
+	 */
+	public void getNext(CAS cas) throws IOException, CollectionException {
+		try {
+			if (hasNext()) {
+				JCas jCas = cas.getJCas();
+				String[] tokens = line.split(" ");
+				int wordStart = 0;
+				int wordEnd = 0;
+				int wordNumber = 0;
+				StringBuffer documentText = new StringBuffer();
+				for (String token : tokens) {
+					int split = token.lastIndexOf("_");
+					if(split == token.length()-1) {
+						split = token.substring(0, token.length()-1).lastIndexOf("_");
+					}
+					if (split == -1) {
+						line = null;
+						throw new CollectionException("There is a problem in your training data: " + token
+								+ " does not conform to the format WORD_TAG.", null);
+					}
+					String word = token.substring(0, split);
+					wordEnd = wordStart + word.length();
+					// Consider creating a token similar to the way
+					// TokenConverter.convert method creates BaseToken's
+					BaseToken baseToken = new BaseToken(jCas, wordStart, wordEnd);
+					if (!loadWordsOnly) {
+						String tag = token.substring(split + 1);
+						baseToken.setPartOfSpeech(tag);
+					}
+					baseToken.setTokenNumber(wordNumber++);
+					baseToken.addToIndexes();
+
+					documentText.append(word + " ");
+					wordStart = wordEnd + 1;
+				}
+				Sentence sentence = new Sentence(jCas, 0, wordEnd);
+				sentence.setSentenceNumber(0);
+				sentence.addToIndexes();
+				jCas.setDocumentText(documentText.toString());
+			}
+		} catch (CASException ce) {
+			throw new CollectionException(ce);
+		}
+		line = null;
+	}
+
+	public void close() throws IOException {
+		input.close();
+	}
+
+	public Progress[] getProgress() {
+		return null;
+	}
+
+	public boolean hasNext() throws IOException, CollectionException {
+		if (line == null) {
+			line = input.readLine();
+		}
+		if (line == null)
+			return false;
+		return true;
+	}
+
+}

Modified: incubator/ctakes/branches/SHARPn-cTAKES/POS tagger/src/edu/mayo/bmi/uima/pos_tagger/POSTagger.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/branches/SHARPn-cTAKES/POS%20tagger/src/edu/mayo/bmi/uima/pos_tagger/POSTagger.java?rev=1403989&r1=1403988&r2=1403989&view=diff
==============================================================================
--- incubator/ctakes/branches/SHARPn-cTAKES/POS tagger/src/edu/mayo/bmi/uima/pos_tagger/POSTagger.java (original)
+++ incubator/ctakes/branches/SHARPn-cTAKES/POS tagger/src/edu/mayo/bmi/uima/pos_tagger/POSTagger.java Wed Oct 31 05:26:43 2012
@@ -1,18 +1,11 @@
 /*
- * Copyright: (c) 2009   Mayo Foundation for Medical Education and 
- * Research (MFMER). All rights reserved. MAYO, MAYO CLINIC, and the
- * triple-shield Mayo logo are trademarks and service marks of MFMER.
- *
- * Except as contained in the copyright notice above, or as used to identify 
- * MFMER as the author of this software, the trade names, trademarks, service
- * marks, or product names of the copyright holder shall not be used in
- * advertising, promotion or otherwise in connection with this software without
- * prior written authorization of the copyright holder.
- * 
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
  * http://www.apache.org/licenses/LICENSE-2.0 
  * 
  * Unless required by applicable law or agreed to in writing, software