You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by ja...@apache.org on 2012/10/31 06:26:55 UTC
svn commit: r1403989 [5/28] - in /incubator/ctakes/branches/SHARPn-cTAKES:
Constituency Parser/src/org/chboston/cnlp/ctakes/parser/ Constituency
Parser/src/org/chboston/cnlp/ctakes/parser/uima/ae/ Constituency
Parser/src/org/chboston/cnlp/ctakes/parser...
Modified: incubator/ctakes/branches/SHARPn-cTAKES/PAD term spotter/utils/src/edu/mayo/bmi/utils/xcas_comparison/XcasFile.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/branches/SHARPn-cTAKES/PAD%20term%20spotter/utils/src/edu/mayo/bmi/utils/xcas_comparison/XcasFile.java?rev=1403989&r1=1403988&r2=1403989&view=diff
==============================================================================
--- incubator/ctakes/branches/SHARPn-cTAKES/PAD term spotter/utils/src/edu/mayo/bmi/utils/xcas_comparison/XcasFile.java (original)
+++ incubator/ctakes/branches/SHARPn-cTAKES/PAD term spotter/utils/src/edu/mayo/bmi/utils/xcas_comparison/XcasFile.java Wed Oct 31 05:26:43 2012
@@ -1,18 +1,11 @@
/*
- * Copyright: (c) 2009 Mayo Foundation for Medical Education and
- * Research (MFMER). All rights reserved. MAYO, MAYO CLINIC, and the
- * triple-shield Mayo logo are trademarks and service marks of MFMER.
- *
- * Except as contained in the copyright notice above, or as used to identify
- * MFMER as the author of this software, the trade names, trademarks, service
- * marks, or product names of the copyright holder shall not be used in
- * advertising, promotion or otherwise in connection with this software without
- * prior written authorization of the copyright holder.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
@@ -21,227 +14,227 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package edu.mayo.bmi.utils.xcas_comparison;
-import java.util.Hashtable;
-import java.util.LinkedList;
-import java.util.Vector;
-import java.io.File;
-
-/**
- * An <code>XcasFile</code> wraps all <code>XcasAnnotations</code>s
- * in it and also contains their position information in terms of
- * line and column numbers.
- * @author Mayo Clinic
- *
- */
-public class XcasFile implements Cloneable {
-
- protected File f;
- protected Hashtable<Integer, XcasAnnotation> annotations;
- protected Hashtable<XcasAnnotation, String> positions;
-
- /**
- * Default constructor.
- */
- public XcasFile () {
- annotations = new Hashtable<Integer, XcasAnnotation>();
- positions = new Hashtable<XcasAnnotation, String>();
- }
-
- /**
- * Constructs an <code>XcasFile</code> with the specified name.
- * @param f File name.
- */
- public XcasFile (String f) { this(); this.f = new File(f); }
-
- /**
- * Constructs an <code>XcasFile</code> with the specified name.
- * @param f A File object.
- */
- public XcasFile (File f) { this(); this.f = f; }
-
- /**
- * Creates a new <code>XcasFile</code> object from the specified file.
- * Avoid using this method if you plan to parse multiple files,
- * as this method creates an anonymous <code>XcasProcessor</code> instance
- * each time called, which could be used to parse multiple files.
- * @param f A string containing the XCAS file name.
- * @return A parsed <code>XcasFile</code> object.
- */
- public static XcasFile process (String f) {
- return (new XcasProcessor()).process(f);
- }
-
- /**
- * Inserts a new annotation with the specified internal <code>id</code>
- * to this <code>XcasFile</code>.
- * @param id UIMA CAS internal <code>_id</code>.
- * @param a An <code>XcasAnnotation</code> object to add.
- * @see #addAnnotation(int, XcasAnnotation, String)
- * @see #addAnnotation(int, XcasAnnotation, int, int)
- */
- public void addAnnotation (int id, XcasAnnotation a) { annotations.put(id, a); }
-
- /**
- * Inserts a new annotation, along with its position in the file,
- * to this <code>XcasFile</code> object.
- * @param id UIMA CAS internal <code>_id</code>.
- * @param a An <code>XcasAnnotation</code> object to add.
- * @param pos Line and column number of the specified annotation,
- * in the form of <code>line_number:column_number</code>.
- */
- public void addAnnotation (int id, XcasAnnotation a, String pos) { addAnnotation(id, a); positions.put(a, pos); }
-
- /**
- * Inserts a new annotation, along with its position in the file,
- * to this <code>XcasFile</code> object.
- * @param id UIMA CAS internal <code>_id</code>.
- * @param a An <code>XcasAnnotation</code> object to add.
- * @param lineNum Line number of the specified annotation.
- * @param colNum Column number of the specified annotation.
- */
- public void addAnnotation (int id, XcasAnnotation a, int lineNum, int colNum) { addAnnotation(id, a, Integer.toString(lineNum)+":"+Integer.toString(colNum)); }
-
- /**
- * Returns the <code>XcasAnnotation</code> object associated with
- * the specified internal id.
- * @param id UIMA CAS internal <code>_id</code>.
- * @return The <code>XcasAnnotation</code> with the specified id.
- */
- public XcasAnnotation getAnnotation (int id) { return annotations.get(id); }
-
- public java.util.Collection<XcasAnnotation> getAllAnnotations () { return annotations.values(); }
- public String getFileName () { return f.getName(); }
-
- /**
- * Returns the line and column numbers of the specified <code>XcasAnnotation</code>,
- * which is included in this <code>XcasFile</code> object.
- * @param a
- * @return A string containing the line and column numbers of the specified object,
- * in the form of <code>line_number:column_number</code>.
- * @see #getPositionOwn(int)
- * @see #getPositionOther(XcasAnnotation)
- */
- public String getPositionOwn (XcasAnnotation a) { return positions.get(a); }
-
- /**
- * Returns the line and column numbers of the <code>XcasAnnotation</code>,
- * specified by the original XCAS internal <code>_id</code> field.
- * @param id UIMA CAS internal <code>_id</code>.
- * @return A string containing the line and column numbers of the specified object,
- * in the form of <code>line_number:column_number</code>.
- * @see #getPositionOwn(XcasAnnotation)
- * @see #getPositionOther(XcasAnnotation)
- */
- public String getPositionOwn (int id) { return positions.get(annotations.get(id)); }
-
- /**
- * Finds an <code>XcasAnnotation</code> with the same attributes as specified,
- * and returns its line and column numbers.
- * @param a
- * @return A string containing the line and column numbers of the specified object,
- * in the form of <code>line_number:column_number</code>.
- * @see #getPositionOwn(int)
- * @see #getPositionOwn(XcasAnnotation)
- */
- public String getPositionOther (XcasAnnotation a) {
- for (XcasAnnotation o : positions.keySet())
- if (o.equals(a)) return positions.get(o);
- return null;
- }
-
- /**
- * Returns the line and column numbers of the specified <code>XcasAnnotation</code>.
- * <p>
- * Do not use this method if you know the specified <code>XcasAnnotation</code>
- * object is in this <code>XcasFile</code>. Instead, use
- * {@link #getPositionOwn(XcasAnnotation)}, which is faster.
- * @param a An
- * @return A string containing the line and column numbers of the specified object,
- * in the form of <code>line_number:column_number</code>.
- * @see #getPositionOwn(XcasAnnotation)
- * @see #getPositionOther(XcasAnnotation)
- */
- public String getPosition (XcasAnnotation a) {
- if (positions.keySet().contains(a)) return positions.get(a);
- else return getPositionOther(a);
- }
-
- /**
- * Finds an <code>XcasAnnotation</code> of the same type as specified, and
- * a same text span, then returns its line and column number.
- * @param a An <code>XcasAnnotation</code> against which a similar
- * <code>XcasAnnotation</code> in this <code>XcasFile</code>
- * is to be matched.
- * @return A string containing the line and column numbers of the specified object,
- * in the form of <code>line_number:column_number</code>.
- * @see #getPositionOther(XcasAnnotation)
- */
- public String getPositionSimilar (XcasAnnotation a) {
- for (XcasAnnotation o : positions.keySet())
- if (o.type.equals(a.type)) {
- int oBegin = o.attributes.containsKey("begin") ? Integer.parseInt(o.getAttribute("begin")) : -1;
- int oEnd = o.attributes.containsKey("end") ? Integer.parseInt(o.getAttribute("end")) : -1;
- int aBegin = a.attributes.containsKey("begin") ? Integer.parseInt(a.getAttribute("begin")) : -2;
- int aEnd = a.attributes.containsKey("end") ? Integer.parseInt(a.getAttribute("end")) : -2;
- if (oBegin==aBegin && oEnd==aEnd) return positions.get(o);
- else if (o.attributes.containsKey("key") && a.attributes.containsKey("key") && o.getAttribute("key").equals(a.getAttribute("key")))
- return positions.get(o);
- }
- return null;
- }
-
- /**
- * Checks whether this XCAS file has an annotation with the specified id.
- * @param id UIMA CAS internal <code>_id</code>.
- * @return <code>true</code> if file has an annotation with the specified id,
- * <code>false</code> otherwise.
- */
- public boolean hasAnnotation (int id) { return annotations.containsKey(id); }
-
- /**
- * Checks whether this XCAS file has the specified <code>XcasAnnotation</code>.
- * If there is an <code>XcasAnnotation</code> object that has exactly the same
- * type, attributes, and references, return <code>true</code>.
- * @param a An <code>XcasAnnotation</code> to check.
- * @return <code>true</code> if there is one <code>XcasAnnotation</code> equals
- * the specified one, <code>false</code> otherwise.
- * @see XcasAnnotation#equals(Object)
- */
- public boolean hasAnnotation (XcasAnnotation a) { return annotations.containsValue(a); }
-
- /**
- * Checks whether the specified object has the same set of annotations. First check
- * whether the specified is an <code>XcasFile</code> object. If so, check whether
- * its annotation set is of the same size as in this <code>XcasFile</code>, then check
- * whether these two sets are equal.
- * @param obj An object to compare to.
- * @return <code>true if the specified object is an <code>XcasFile</code> object and
- * has a same set of <code>XcasAnnotations</code>, <code>false</code> otherwise.
- */
- public boolean equals (Object obj) {
- if (obj.getClass()!=getClass() || annotations.values().size()!=((XcasFile)obj).annotations.values().size()) return false;
- return annotations.values().containsAll(((XcasFile)obj).annotations.values());
- }
-
- public LinkedList<XcasAnnotation> annotationsClone () {
- LinkedList<XcasAnnotation> ret = new LinkedList<XcasAnnotation>();
- Hashtable<XcasAnnotation, XcasAnnotation> cloneMap = new Hashtable<XcasAnnotation, XcasAnnotation>();
- for (XcasAnnotation a : annotations.values()) {
- XcasAnnotation c = a.shallowCopy();
- cloneMap.put(a, c);
- ret.add(c);
- }
- for (XcasAnnotation a : annotations.values())
- for (String s : a.references.keySet())
- for (XcasAnnotation r : (Vector<XcasAnnotation>)a.references.get(s))
- ((Vector<XcasAnnotation>)cloneMap.get(a).references.get(s)).add(cloneMap.get(r));
- return ret;
- }
-
- public Object clone () {
- return null; //TODO implement clone?
- // Should not use XcasAnnotation.clone()
- // otherwise, XcasAnnotation objects referenced by multiple objects will be cloned more than once.
- }
-}
+package edu.mayo.bmi.utils.xcas_comparison;
+import java.util.Hashtable;
+import java.util.LinkedList;
+import java.util.Vector;
+import java.io.File;
+
+/**
+ * An <code>XcasFile</code> wraps all <code>XcasAnnotation</code>s
+ * in it and also contains their position information in terms of
+ * line and column numbers.
+ * @author Mayo Clinic
+ *
+ */
+public class XcasFile implements Cloneable {
+
+ protected File f;
+ protected Hashtable<Integer, XcasAnnotation> annotations;
+ protected Hashtable<XcasAnnotation, String> positions;
+
+ /**
+ * Default constructor.
+ */
+ public XcasFile () {
+ annotations = new Hashtable<Integer, XcasAnnotation>();
+ positions = new Hashtable<XcasAnnotation, String>();
+ }
+
+ /**
+ * Constructs an <code>XcasFile</code> with the specified name.
+ * @param f File name.
+ */
+ public XcasFile (String f) { this(); this.f = new File(f); }
+
+ /**
+ * Constructs an <code>XcasFile</code> with the specified name.
+ * @param f A File object.
+ */
+ public XcasFile (File f) { this(); this.f = f; }
+
+ /**
+ * Creates a new <code>XcasFile</code> object from the specified file.
+ * Avoid using this method if you plan to parse multiple files:
+ * it creates a new anonymous <code>XcasProcessor</code> instance on every
+ * call, whereas a single processor instance can be reused for many files.
+ * @param f A string containing the XCAS file name.
+ * @return A parsed <code>XcasFile</code> object.
+ */
+ public static XcasFile process (String f) {
+ return (new XcasProcessor()).process(f);
+ }
+
+ /**
+ * Inserts a new annotation with the specified internal <code>id</code>
+ * to this <code>XcasFile</code>.
+ * @param id UIMA CAS internal <code>_id</code>.
+ * @param a An <code>XcasAnnotation</code> object to add.
+ * @see #addAnnotation(int, XcasAnnotation, String)
+ * @see #addAnnotation(int, XcasAnnotation, int, int)
+ */
+ public void addAnnotation (int id, XcasAnnotation a) { annotations.put(id, a); }
+
+ /**
+ * Inserts a new annotation, along with its position in the file,
+ * to this <code>XcasFile</code> object.
+ * @param id UIMA CAS internal <code>_id</code>.
+ * @param a An <code>XcasAnnotation</code> object to add.
+ * @param pos Line and column number of the specified annotation,
+ * in the form of <code>line_number:column_number</code>.
+ */
+ public void addAnnotation (int id, XcasAnnotation a, String pos) { addAnnotation(id, a); positions.put(a, pos); }
+
+ /**
+ * Inserts a new annotation, along with its position in the file,
+ * to this <code>XcasFile</code> object.
+ * @param id UIMA CAS internal <code>_id</code>.
+ * @param a An <code>XcasAnnotation</code> object to add.
+ * @param lineNum Line number of the specified annotation.
+ * @param colNum Column number of the specified annotation.
+ */
+ public void addAnnotation (int id, XcasAnnotation a, int lineNum, int colNum) { addAnnotation(id, a, Integer.toString(lineNum)+":"+Integer.toString(colNum)); }
+
+ /**
+ * Returns the <code>XcasAnnotation</code> object associated with
+ * the specified internal id.
+ * @param id UIMA CAS internal <code>_id</code>.
+ * @return The <code>XcasAnnotation</code> with the specified id.
+ */
+ public XcasAnnotation getAnnotation (int id) { return annotations.get(id); }
+
+ public java.util.Collection<XcasAnnotation> getAllAnnotations () { return annotations.values(); }
+ public String getFileName () { return f.getName(); }
+
+ /**
+ * Returns the line and column numbers of the specified <code>XcasAnnotation</code>,
+ * which is included in this <code>XcasFile</code> object.
+ * @param a An <code>XcasAnnotation</code> contained in this <code>XcasFile</code>.
+ * @return A string containing the line and column numbers of the specified object,
+ * in the form of <code>line_number:column_number</code>.
+ * @see #getPositionOwn(int)
+ * @see #getPositionOther(XcasAnnotation)
+ */
+ public String getPositionOwn (XcasAnnotation a) { return positions.get(a); }
+
+ /**
+ * Returns the line and column numbers of the <code>XcasAnnotation</code>,
+ * specified by the original XCAS internal <code>_id</code> field.
+ * @param id UIMA CAS internal <code>_id</code>.
+ * @return A string containing the line and column numbers of the specified object,
+ * in the form of <code>line_number:column_number</code>.
+ * @see #getPositionOwn(XcasAnnotation)
+ * @see #getPositionOther(XcasAnnotation)
+ */
+ public String getPositionOwn (int id) { return positions.get(annotations.get(id)); }
+
+ /**
+ * Finds an <code>XcasAnnotation</code> with the same attributes as specified,
+ * and returns its line and column numbers.
+ * @param a An <code>XcasAnnotation</code> to match against the annotations in this <code>XcasFile</code>.
+ * @return A string containing the line and column numbers of the specified object,
+ * in the form of <code>line_number:column_number</code>.
+ * @see #getPositionOwn(int)
+ * @see #getPositionOwn(XcasAnnotation)
+ */
+ public String getPositionOther (XcasAnnotation a) {
+ for (XcasAnnotation o : positions.keySet())
+ if (o.equals(a)) return positions.get(o);
+ return null;
+ }
+
+ /**
+ * Returns the line and column numbers of the specified <code>XcasAnnotation</code>.
+ * <p>
+ * Do not use this method if you know the specified <code>XcasAnnotation</code>
+ * object is in this <code>XcasFile</code>. Instead, use
+ * {@link #getPositionOwn(XcasAnnotation)}, which is faster.
+ * @param a An <code>XcasAnnotation</code> whose position is to be looked up.
+ * @return A string containing the line and column numbers of the specified object,
+ * in the form of <code>line_number:column_number</code>.
+ * @see #getPositionOwn(XcasAnnotation)
+ * @see #getPositionOther(XcasAnnotation)
+ */
+ public String getPosition (XcasAnnotation a) {
+ if (positions.keySet().contains(a)) return positions.get(a);
+ else return getPositionOther(a);
+ }
+
+ /**
+ * Finds an <code>XcasAnnotation</code> of the same type as the one specified and
+ * with the same text span (or the same <code>key</code> attribute), then returns its line and column numbers.
+ * @param a An <code>XcasAnnotation</code> against which a similar
+ * <code>XcasAnnotation</code> in this <code>XcasFile</code>
+ * is to be matched.
+ * @return A string containing the line and column numbers of the specified object,
+ * in the form of <code>line_number:column_number</code>.
+ * @see #getPositionOther(XcasAnnotation)
+ */
+ public String getPositionSimilar (XcasAnnotation a) {
+ for (XcasAnnotation o : positions.keySet())
+ if (o.type.equals(a.type)) {
+ int oBegin = o.attributes.containsKey("begin") ? Integer.parseInt(o.getAttribute("begin")) : -1;
+ int oEnd = o.attributes.containsKey("end") ? Integer.parseInt(o.getAttribute("end")) : -1;
+ int aBegin = a.attributes.containsKey("begin") ? Integer.parseInt(a.getAttribute("begin")) : -2;
+ int aEnd = a.attributes.containsKey("end") ? Integer.parseInt(a.getAttribute("end")) : -2;
+ if (oBegin==aBegin && oEnd==aEnd) return positions.get(o);
+ else if (o.attributes.containsKey("key") && a.attributes.containsKey("key") && o.getAttribute("key").equals(a.getAttribute("key")))
+ return positions.get(o);
+ }
+ return null;
+ }
+
+ /**
+ * Checks whether this XCAS file has an annotation with the specified id.
+ * @param id UIMA CAS internal <code>_id</code>.
+ * @return <code>true</code> if file has an annotation with the specified id,
+ * <code>false</code> otherwise.
+ */
+ public boolean hasAnnotation (int id) { return annotations.containsKey(id); }
+
+ /**
+ * Checks whether this XCAS file has the specified <code>XcasAnnotation</code>.
+ * If there is an <code>XcasAnnotation</code> object that has exactly the same
+ * type, attributes, and references, return <code>true</code>.
+ * @param a An <code>XcasAnnotation</code> to check.
+ * @return <code>true</code> if there is an <code>XcasAnnotation</code> that equals
+ * the specified one, <code>false</code> otherwise.
+ * @see XcasAnnotation#equals(Object)
+ */
+ public boolean hasAnnotation (XcasAnnotation a) { return annotations.containsValue(a); }
+
+ /**
+ * Checks whether the specified object has the same set of annotations. First checks
+ * whether the specified object is an <code>XcasFile</code>. If so, checks whether
+ * its annotation set is of the same size as in this <code>XcasFile</code>, then checks
+ * whether these two sets are equal.
+ * @param obj An object to compare to.
+ * @return <code>true</code> if the specified object is an <code>XcasFile</code> object and
+ * has the same set of <code>XcasAnnotation</code>s, <code>false</code> otherwise.
+ */
+ public boolean equals (Object obj) {
+ if (obj.getClass()!=getClass() || annotations.values().size()!=((XcasFile)obj).annotations.values().size()) return false;
+ return annotations.values().containsAll(((XcasFile)obj).annotations.values());
+ }
+
+ public LinkedList<XcasAnnotation> annotationsClone () {
+ LinkedList<XcasAnnotation> ret = new LinkedList<XcasAnnotation>();
+ Hashtable<XcasAnnotation, XcasAnnotation> cloneMap = new Hashtable<XcasAnnotation, XcasAnnotation>();
+ for (XcasAnnotation a : annotations.values()) {
+ XcasAnnotation c = a.shallowCopy();
+ cloneMap.put(a, c);
+ ret.add(c);
+ }
+ for (XcasAnnotation a : annotations.values())
+ for (String s : a.references.keySet())
+ for (XcasAnnotation r : (Vector<XcasAnnotation>)a.references.get(s))
+ ((Vector<XcasAnnotation>)cloneMap.get(a).references.get(s)).add(cloneMap.get(r));
+ return ret;
+ }
+
+ public Object clone () {
+ return null; //TODO implement clone?
+ // Should not use XcasAnnotation.clone()
+ // otherwise, XcasAnnotation objects referenced by multiple objects will be cloned more than once.
+ }
+}
Modified: incubator/ctakes/branches/SHARPn-cTAKES/PAD term spotter/utils/src/edu/mayo/bmi/utils/xcas_comparison/XcasProcessor.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/branches/SHARPn-cTAKES/PAD%20term%20spotter/utils/src/edu/mayo/bmi/utils/xcas_comparison/XcasProcessor.java?rev=1403989&r1=1403988&r2=1403989&view=diff
==============================================================================
--- incubator/ctakes/branches/SHARPn-cTAKES/PAD term spotter/utils/src/edu/mayo/bmi/utils/xcas_comparison/XcasProcessor.java (original)
+++ incubator/ctakes/branches/SHARPn-cTAKES/PAD term spotter/utils/src/edu/mayo/bmi/utils/xcas_comparison/XcasProcessor.java Wed Oct 31 05:26:43 2012
@@ -1,18 +1,11 @@
/*
- * Copyright: (c) 2009 Mayo Foundation for Medical Education and
- * Research (MFMER). All rights reserved. MAYO, MAYO CLINIC, and the
- * triple-shield Mayo logo are trademarks and service marks of MFMER.
- *
- * Except as contained in the copyright notice above, or as used to identify
- * MFMER as the author of this software, the trade names, trademarks, service
- * marks, or product names of the copyright holder shall not be used in
- * advertising, promotion or otherwise in connection with this software without
- * prior written authorization of the copyright holder.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
@@ -21,171 +14,171 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package edu.mayo.bmi.utils.xcas_comparison;
-import java.util.Hashtable;
-import java.util.HashMap;
-import java.util.LinkedList;
-import java.io.File;
-
-import javax.xml.parsers.SAXParser;
-import javax.xml.parsers.SAXParserFactory;
-import org.xml.sax.Attributes;
-import org.xml.sax.SAXException;
-import org.xml.sax.SAXParseException;
-import org.xml.sax.Locator;
-import org.xml.sax.helpers.DefaultHandler;
-
-/**
- * A SAX parser that parses an XCAS file.
- * This is done without referencing to the UIMA Type System definition.
- * Therefore, {@link XcasDiff} can be used to compare XCASes from
- * different type systems.
- * @author Mayo Clinic
- *
- */
-public class XcasProcessor extends DefaultHandler {
-
- private Locator loc;
- private SAXParser sp;
- private XcasFile xcasf;
- private Hashtable<String, Integer> pendingRef;
- private Hashtable<Integer, int[]> pendingArr;
- private Hashtable<Integer, int[]> pendingIntArr;
- private HashMap<Integer, int[]> pendingList;
- private String parentTag;
- private int arrID;
- private int arrInd = -1;
- private StringBuffer val;
-
- public void setDocumentLocator(Locator locator) { loc = locator; }
-
- /**
- * Default constructor.
- */
- public XcasProcessor () {
- pendingRef = new Hashtable<String, Integer>();
- pendingArr = new Hashtable<Integer, int[]>();
- pendingIntArr = new Hashtable<Integer, int[]>();
- pendingList = new HashMap<Integer, int[]>();
- val = new StringBuffer();
- try { sp = SAXParserFactory.newInstance().newSAXParser(); }
- catch (Exception e) { e.printStackTrace(); }
- }
-
- /**
- * Parses the specified file and returns a parsed <code>XcasFile</code> object.
- * @param f A File object.
- * @return An <code>XcasFile</code> object.
- */
- public XcasFile process (File f) {
- xcasf = new XcasFile(f);
- pendingRef.clear();
- pendingArr.clear();
- pendingIntArr.clear();
- pendingList.clear();
- val.delete(0, val.length());
- arrInd = -1;
- try { sp.parse(f, this); }
- catch (SAXParseException spe) {
- System.err.println("Error parsing XCAS file: "+f+" at line"+spe.getLineNumber());
- System.err.println(spe.getMessage());
- }
- catch (Exception e) { e.printStackTrace(); }
- return xcasf;
- }
-
- /**
- * Parses the specified file and returns a parsed <code>XcasFile</code> object.
- * @param f File name.
- * @return An <code>XcasFile</code> object.
- */
- public XcasFile process (String f) {
- return process(new File(f));
- }
-
- public void characters (char[] ch, int start, int length) throws SAXException {
- val.append(ch, start, length);
- }
-
- public void startElement (String uri, String localName, String qName, Attributes attributes) throws SAXException {
- val.delete(0, val.length());
- String s = attributes.getValue(Const.ID);
- int id;
- if (s==null) return;
- else id = Integer.parseInt(s);
- if (qName.equalsIgnoreCase(Const.UIMA_FSARRAY)) {
- pendingArr.put(id, new int[Integer.parseInt(attributes.getValue(Const.UIMA_ARRAY_SIZE_KEYWORD))]);
- parentTag = qName;
- arrID = id;
- arrInd = 0;
- }
- else if (qName.equalsIgnoreCase(Const.UIMA_INTARRAY)) {
- pendingIntArr.put(id, new int[Integer.parseInt(attributes.getValue(Const.UIMA_ARRAY_SIZE_KEYWORD))]);
- parentTag = qName;
- arrID = id;
- arrInd = 0;
- }
- else if (qName.equalsIgnoreCase(Const.UIMA_NONEMPTY_FSLIST)) {
- int[] ref = {Integer.parseInt(attributes.getValue(Const.UIMA_LIST_HEAD_KEYWORD)),
- Integer.parseInt(attributes.getValue(Const.UIMA_LIST_TAIL_KEYWORD))};
- pendingList.put(id, ref);
- }
- else if (qName.equalsIgnoreCase(Const.UIMA_EMPTY_FSLIST)) {
- pendingList.put(id, null);
- }
- else if (!qName.equalsIgnoreCase(Const.UIMA_ARRAY_INDEX_KEYWORD)) {
- XcasAnnotation a = new XcasAnnotation(qName);
- for (int i = attributes.getLength(); i > 0; i--) {
- String q = attributes.getQName(i-1);
- String v = attributes.getValue(i-1);
- if (q.equalsIgnoreCase(Const.ID) || Const.ATTRIBUTES_TO_IGNORE.contains(q)) continue;
- else if (q.startsWith(Const.REF_PREFIX)) pendingRef.put(Integer.toString(id)+":"+q, Integer.parseInt(v));
- else a.insertAttribute(q, v);
- }
- xcasf.addAnnotation(id, a, loc.getLineNumber()+":"+loc.getColumnNumber());
- }
- }
-
- public void endElement (String uri, String localName, String qName) throws SAXException {
- if (qName.equalsIgnoreCase(Const.UIMA_ARRAY_INDEX_KEYWORD)) {
- if (parentTag.equalsIgnoreCase(Const.UIMA_FSARRAY))
- pendingArr.get(arrID)[arrInd++] = Integer.parseInt(val.toString());
- else if (parentTag.equalsIgnoreCase(Const.UIMA_INTARRAY))
- pendingIntArr.get(arrID)[arrInd++] = Integer.parseInt(val.toString());
- } else if (qName.equalsIgnoreCase(Const.UIMA_FSARRAY)) {
- arrInd = -1;
- } else if (qName.equalsIgnoreCase(Const.UIMA_INTARRAY)) {
- arrInd = -1;
- } else if (qName.equalsIgnoreCase(Const.UIMA_CAS)) {
- for (String s : pendingRef.keySet()) {
- String[] ref = s.split(":");
- int refID = pendingRef.get(s);
- XcasAnnotation a = xcasf.getAnnotation(Integer.parseInt(ref[0]));
- if (pendingIntArr.containsKey(refID)) {
- a.insertIntReference(ref[1], pendingIntArr.get(refID));
- continue;
- }
- int[] arr;
- if (pendingArr.containsKey(refID))
- arr = pendingArr.get(refID);
- else if (pendingList.containsKey(refID)) {
- LinkedList<Integer> ll = new LinkedList<Integer>();
- int[] l = pendingList.get(refID);
- while (l!=null) {
- ll.add(l[0]);
- l = pendingList.get(l[1]);
- }
- arr = new int[ll.size()];
- for (int i = 0; i < arr.length; i++)
- arr[i] = ll.get(i);
- }
- else { arr = new int[1]; arr[0] = refID; }
- for (int i : arr)
- a.insertReference(ref[1], xcasf.getAnnotation(i));
- }
- }
- if (val.length()>0 && !qName.equalsIgnoreCase(Const.UIMA_ARRAY_INDEX_KEYWORD) && !qName.equalsIgnoreCase(Const.UIMA_TCAS_DOCUMENT))
- System.err.println("Unexpected text ("+qName+"): \""+val.toString()+"\"");
- val.delete(0, val.length());
- }
-}
+package edu.mayo.bmi.utils.xcas_comparison;
+import java.util.Hashtable;
+import java.util.HashMap;
+import java.util.LinkedList;
+import java.io.File;
+
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.SAXParseException;
+import org.xml.sax.Locator;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * A SAX parser that parses an XCAS file.
+ * This is done without reference to the UIMA Type System definition.
+ * Therefore, {@link XcasDiff} can be used to compare XCASes from
+ * different type systems.
+ * @author Mayo Clinic
+ *
+ */
+public class XcasProcessor extends DefaultHandler {
+
+ private Locator loc;
+ private SAXParser sp;
+ private XcasFile xcasf;
+ private Hashtable<String, Integer> pendingRef;
+ private Hashtable<Integer, int[]> pendingArr;
+ private Hashtable<Integer, int[]> pendingIntArr;
+ private HashMap<Integer, int[]> pendingList;
+ private String parentTag;
+ private int arrID;
+ private int arrInd = -1;
+ private StringBuffer val;
+
+ public void setDocumentLocator(Locator locator) { loc = locator; }
+
+ /**
+ * Default constructor.
+ */
+ public XcasProcessor () {
+ pendingRef = new Hashtable<String, Integer>();
+ pendingArr = new Hashtable<Integer, int[]>();
+ pendingIntArr = new Hashtable<Integer, int[]>();
+ pendingList = new HashMap<Integer, int[]>();
+ val = new StringBuffer();
+ try { sp = SAXParserFactory.newInstance().newSAXParser(); }
+ catch (Exception e) { e.printStackTrace(); }
+ }
+
+ /**
+ * Parses the specified file and returns a parsed <code>XcasFile</code> object.
+ * @param f A File object.
+ * @return An <code>XcasFile</code> object.
+ */
+ public XcasFile process (File f) {
+ xcasf = new XcasFile(f);
+ pendingRef.clear();
+ pendingArr.clear();
+ pendingIntArr.clear();
+ pendingList.clear();
+ val.delete(0, val.length());
+ arrInd = -1;
+ try { sp.parse(f, this); }
+ catch (SAXParseException spe) {
+ System.err.println("Error parsing XCAS file: "+f+" at line"+spe.getLineNumber());
+ System.err.println(spe.getMessage());
+ }
+ catch (Exception e) { e.printStackTrace(); }
+ return xcasf;
+ }
+
+ /**
+ * Parses the specified file and returns a parsed <code>XcasFile</code> object.
+ * @param f File name.
+ * @return An <code>XcasFile</code> object.
+ */
+ public XcasFile process (String f) {
+ return process(new File(f));
+ }
+
+ public void characters (char[] ch, int start, int length) throws SAXException {
+ val.append(ch, start, length);
+ }
+
+ public void startElement (String uri, String localName, String qName, Attributes attributes) throws SAXException {
+ val.delete(0, val.length());
+ String s = attributes.getValue(Const.ID);
+ int id;
+ if (s==null) return;
+ else id = Integer.parseInt(s);
+ if (qName.equalsIgnoreCase(Const.UIMA_FSARRAY)) {
+ pendingArr.put(id, new int[Integer.parseInt(attributes.getValue(Const.UIMA_ARRAY_SIZE_KEYWORD))]);
+ parentTag = qName;
+ arrID = id;
+ arrInd = 0;
+ }
+ else if (qName.equalsIgnoreCase(Const.UIMA_INTARRAY)) {
+ pendingIntArr.put(id, new int[Integer.parseInt(attributes.getValue(Const.UIMA_ARRAY_SIZE_KEYWORD))]);
+ parentTag = qName;
+ arrID = id;
+ arrInd = 0;
+ }
+ else if (qName.equalsIgnoreCase(Const.UIMA_NONEMPTY_FSLIST)) {
+ int[] ref = {Integer.parseInt(attributes.getValue(Const.UIMA_LIST_HEAD_KEYWORD)),
+ Integer.parseInt(attributes.getValue(Const.UIMA_LIST_TAIL_KEYWORD))};
+ pendingList.put(id, ref);
+ }
+ else if (qName.equalsIgnoreCase(Const.UIMA_EMPTY_FSLIST)) {
+ pendingList.put(id, null);
+ }
+ else if (!qName.equalsIgnoreCase(Const.UIMA_ARRAY_INDEX_KEYWORD)) {
+ XcasAnnotation a = new XcasAnnotation(qName);
+ for (int i = attributes.getLength(); i > 0; i--) {
+ String q = attributes.getQName(i-1);
+ String v = attributes.getValue(i-1);
+ if (q.equalsIgnoreCase(Const.ID) || Const.ATTRIBUTES_TO_IGNORE.contains(q)) continue;
+ else if (q.startsWith(Const.REF_PREFIX)) pendingRef.put(Integer.toString(id)+":"+q, Integer.parseInt(v));
+ else a.insertAttribute(q, v);
+ }
+ xcasf.addAnnotation(id, a, loc.getLineNumber()+":"+loc.getColumnNumber());
+ }
+ }
+
+ public void endElement (String uri, String localName, String qName) throws SAXException {
+ if (qName.equalsIgnoreCase(Const.UIMA_ARRAY_INDEX_KEYWORD)) {
+ if (parentTag.equalsIgnoreCase(Const.UIMA_FSARRAY))
+ pendingArr.get(arrID)[arrInd++] = Integer.parseInt(val.toString());
+ else if (parentTag.equalsIgnoreCase(Const.UIMA_INTARRAY))
+ pendingIntArr.get(arrID)[arrInd++] = Integer.parseInt(val.toString());
+ } else if (qName.equalsIgnoreCase(Const.UIMA_FSARRAY)) {
+ arrInd = -1;
+ } else if (qName.equalsIgnoreCase(Const.UIMA_INTARRAY)) {
+ arrInd = -1;
+ } else if (qName.equalsIgnoreCase(Const.UIMA_CAS)) {
+ for (String s : pendingRef.keySet()) {
+ String[] ref = s.split(":");
+ int refID = pendingRef.get(s);
+ XcasAnnotation a = xcasf.getAnnotation(Integer.parseInt(ref[0]));
+ if (pendingIntArr.containsKey(refID)) {
+ a.insertIntReference(ref[1], pendingIntArr.get(refID));
+ continue;
+ }
+ int[] arr;
+ if (pendingArr.containsKey(refID))
+ arr = pendingArr.get(refID);
+ else if (pendingList.containsKey(refID)) {
+ LinkedList<Integer> ll = new LinkedList<Integer>();
+ int[] l = pendingList.get(refID);
+ while (l!=null) {
+ ll.add(l[0]);
+ l = pendingList.get(l[1]);
+ }
+ arr = new int[ll.size()];
+ for (int i = 0; i < arr.length; i++)
+ arr[i] = ll.get(i);
+ }
+ else { arr = new int[1]; arr[0] = refID; }
+ for (int i : arr)
+ a.insertReference(ref[1], xcasf.getAnnotation(i));
+ }
+ }
+ if (val.length()>0 && !qName.equalsIgnoreCase(Const.UIMA_ARRAY_INDEX_KEYWORD) && !qName.equalsIgnoreCase(Const.UIMA_TCAS_DOCUMENT))
+ System.err.println("Unexpected text ("+qName+"): \""+val.toString()+"\"");
+ val.delete(0, val.length());
+ }
+}
Modified: incubator/ctakes/branches/SHARPn-cTAKES/POS tagger/scripts/java/data/pos/dictionary/ListTags.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/branches/SHARPn-cTAKES/POS%20tagger/scripts/java/data/pos/dictionary/ListTags.java?rev=1403989&r1=1403988&r2=1403989&view=diff
==============================================================================
--- incubator/ctakes/branches/SHARPn-cTAKES/POS tagger/scripts/java/data/pos/dictionary/ListTags.java (original)
+++ incubator/ctakes/branches/SHARPn-cTAKES/POS tagger/scripts/java/data/pos/dictionary/ListTags.java Wed Oct 31 05:26:43 2012
@@ -1,18 +1,11 @@
/*
- * Copyright: (c) 2009 Mayo Foundation for Medical Education and
- * Research (MFMER). All rights reserved. MAYO, MAYO CLINIC, and the
- * triple-shield Mayo logo are trademarks and service marks of MFMER.
- *
- * Except as contained in the copyright notice above, or as used to identify
- * MFMER as the author of this software, the trade names, trademarks, service
- * marks, or product names of the copyright holder shall not be used in
- * advertising, promotion or otherwise in connection with this software without
- * prior written authorization of the copyright holder.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
@@ -21,143 +14,143 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package data.pos.dictionary;
-
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileNotFoundException;
-import java.io.FileReader;
-import java.io.IOException;
-import java.io.Reader;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.ArrayList;
-
-
-/**
- * From a POS corpus in OpenNLP format, create a list of the POS tags found within the corpus
- * <br>Outputs the list of tags to stdout, and for each tag, outputs one word/token that
- * had been tagged with that tag
- * @author Mayo Clinic
- */
-
-public class ListTags {
-
- /*
- * writes the list of tags to stdout, together with an example / a word found tagged with that tag
- */
- private static void writeTagList(File f, HashMap<String, String> tagList) throws IOException {
- // sort them before outputting them
- ArrayList<String> list = new ArrayList<String>();
- for (Object key : tagList.keySet()) {
- list.add(key.toString());
- }
- Collections.sort(list);
-
- // output to stdout
- System.out.println("\nFor file " + f.getName() + ":");
- for (String s : list) {
- System.out.println(s + "\t which was a tag for '" + tagList.get(s)+ "'"); // output the tagList entry to stdout
- }
- }
-
- // Use a HashMap so we can keep an example, for each tag, of what was tagged with the tag
- private static HashMap<String, String> createTagList(BufferedReader br) throws IOException {
- HashMap<String, String> tagList;
- tagList = new HashMap<String, String> (100); // initial size is arbitrary
- String line;
- String tag;
- int pos; // position of last underscore
- String taggedThing;
- while((line = br.readLine()) != null) {
- for (String token : line.split(" ")) {
- pos = token.lastIndexOf('_');
- if (pos < 0) {
- System.err.println("ERROR: didn't find underscore within '" + token + "'");
- }
- taggedThing = token.substring(0, pos);
- tag = token.substring(pos+1);
- if (tagList.get(tag)==null) {
- tagList.put(tag, taggedThing);
- }
- else {
- // System.out.println(tag + " already was seen for " + taggedThing);
- }
- }
- }
- return tagList;
- }
-
- private static BufferedReader getBufferedReader(String filename) throws FileNotFoundException {
- File f = new File(filename);
- Reader r;
- try {
- r = new FileReader(f);
- } catch (FileNotFoundException e) {
- System.err.println("Error reading from file " + filename);
- throw e;
- }
-
- return new BufferedReader(r);
- }
-
-
- /**
- * Read a file containing POS-tagged tokens in OpenNLP format,
- * and output to stdout the list of tags found<br>
- * Example input:
- * <br>body_NN
- * <br>winning_VBG<br>
- * <br>body_NN<br>
- * Example output:
- * <br>NN
- * <br>VBG
- * @param args args[0] is required - the name of the input file containing
- * POS-tagged tokens in OpenNLP format.
- * <br>E.g. data/pos/ptb-pos-training.txt
- *
- */
- public static void main(String[] args) {
-
- if (args[0]==null || args[0].length()==0) {
- System.err.println("ERROR: corpus name required");
- return;
- }
-
- String arg0 = args[0].trim();
- if (arg0.equals("-h") || (arg0.equals("--help"))) {
- System.out.println("Usage: java ListTags <corpus-name>");
- System.out.println(" where <corpus-name> is something like data/pos/ptb-pos-training.txt");
- System.out.println("Usage: java ListTags <directory>");
- System.out.println(" where <directory> is something like data/pos/");
- return;
- }
-
- String inputPath = args[0];
-
- File f = new File(inputPath);
- File [] files; // list of files to process
- if (f.isDirectory()) { // directory name was input
- files = f.listFiles(); // process all within the dir
- }
- else { // name of a regular file was input
- files = new File[1];
- files[0] = f;
- }
- HashMap<String, String> tagList;
-
- try {
- for (File file : files) {
- if (file.isDirectory()) continue; // skip subdirectories
- if (file.getName().endsWith(".lnk")) continue; // skip shortcuts
- BufferedReader br = getBufferedReader(file.getAbsolutePath());
- tagList = createTagList(br);
- writeTagList(file, tagList);
- }
- } catch (IOException e) {
- System.err.println("Failed");
- }
-
- }
-
-}
+package data.pos.dictionary;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.Reader;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.ArrayList;
+
+
+/**
+ * From a POS corpus in OpenNLP format, create a list of the POS tags found within the corpus
+ * <br>Outputs the list of tags to stdout, and for each tag, outputs one word/token that
+ * had been tagged with that tag
+ * @author Mayo Clinic
+ */
+
+public class ListTags {
+
+    /**
+     * Writes the tags of one corpus file to stdout in sorted order, each followed by
+     * one example token that carried that tag.
+     *
+     * @param f       the corpus file the tags came from; only its name is printed
+     * @param tagList map from tag to an example token that was tagged with it
+     * @throws IOException kept in the signature for callers; nothing in the body raises it
+     */
+    private static void writeTagList(File f, HashMap<String, String> tagList) throws IOException {
+        // Sort the tags so the listing is stable from run to run.
+        ArrayList<String> list = new ArrayList<String>(tagList.keySet());
+        Collections.sort(list);
+
+        System.out.println("\nFor file " + f.getName() + ":");
+        for (String s : list) {
+            System.out.println(s + "\t which was a tag for '" + tagList.get(s) + "'");
+        }
+    }
+
+    /**
+     * Reads space-separated <code>token_TAG</code> items from every line of the reader and
+     * collects each distinct tag together with the first token seen carrying it.
+     * The tag is the text after the LAST underscore of an item.
+     *
+     * @param br reader positioned at the start of the corpus; not closed here
+     * @return map from tag to the first token found with that tag
+     * @throws IOException if reading a line fails
+     */
+    private static HashMap<String, String> createTagList(BufferedReader br) throws IOException {
+        HashMap<String, String> tagList = new HashMap<String, String>(100); // initial size is arbitrary
+        String line;
+        while ((line = br.readLine()) != null) {
+            for (String token : line.split(" ")) {
+                if (token.length() == 0) {
+                    continue; // blank line or repeated spaces: nothing to record
+                }
+                int pos = token.lastIndexOf('_'); // position of last underscore
+                if (pos < 0) {
+                    System.err.println("ERROR: didn't find underscore within '" + token + "'");
+                    // Skip the malformed item; falling through would call substring(0, -1)
+                    // and abort the whole run with StringIndexOutOfBoundsException.
+                    continue;
+                }
+                String tag = token.substring(pos + 1);
+                if (!tagList.containsKey(tag)) {
+                    tagList.put(tag, token.substring(0, pos)); // keep only the first example
+                }
+            }
+        }
+        return tagList;
+    }
+
+    /**
+     * Opens the named file for buffered reading.
+     *
+     * @param filename path of the file to open
+     * @return a buffered reader over the file; the caller is responsible for closing it
+     * @throws FileNotFoundException if the file cannot be opened (also reported on stderr)
+     */
+    private static BufferedReader getBufferedReader(String filename) throws FileNotFoundException {
+        File f = new File(filename);
+        Reader r;
+        try {
+            r = new FileReader(f);
+        } catch (FileNotFoundException e) {
+            System.err.println("Error reading from file " + filename);
+            throw e;
+        }
+
+        return new BufferedReader(r);
+    }
+
+
+    /**
+     * Read a file containing POS-tagged tokens in OpenNLP format,
+     * and output to stdout the list of tags found<br>
+     * Example input:
+     * <br>body_NN
+     * <br>winning_VBG<br>
+     * <br>body_NN<br>
+     * Example output:
+     * <br>NN
+     * <br>VBG
+     * @param args args[0] is required - the name of the input file containing
+     * POS-tagged tokens in OpenNLP format, or a directory whose regular files are
+     * each processed (subdirectories and <code>.lnk</code> files are skipped).
+     * <br>E.g. data/pos/ptb-pos-training.txt
+     *
+     */
+    public static void main(String[] args) {
+
+        // Check the length first: indexing an empty array would throw before any message is shown.
+        if (args == null || args.length == 0 || args[0] == null || args[0].length() == 0) {
+            System.err.println("ERROR: corpus name required");
+            return;
+        }
+
+        String arg0 = args[0].trim();
+        if (arg0.equals("-h") || (arg0.equals("--help"))) {
+            System.out.println("Usage: java ListTags <corpus-name>");
+            System.out.println(" where <corpus-name> is something like data/pos/ptb-pos-training.txt");
+            System.out.println("Usage: java ListTags <directory>");
+            System.out.println(" where <directory> is something like data/pos/");
+            return;
+        }
+
+        String inputPath = args[0];
+
+        File f = new File(inputPath);
+        File[] files; // list of files to process
+        if (f.isDirectory()) { // directory name was input
+            files = f.listFiles(); // process all within the dir
+            if (files == null) { // listFiles() returns null when the directory cannot be read
+                System.err.println("ERROR: unable to list files in " + inputPath);
+                return;
+            }
+        } else { // name of a regular file was input
+            files = new File[] { f };
+        }
+
+        try {
+            for (File file : files) {
+                if (file.isDirectory()) continue; // skip subdirectories
+                if (file.getName().endsWith(".lnk")) continue; // skip shortcuts
+                BufferedReader br = getBufferedReader(file.getAbsolutePath());
+                try {
+                    writeTagList(file, createTagList(br));
+                } finally {
+                    br.close(); // release the file handle before moving to the next file
+                }
+            }
+        } catch (IOException e) {
+            System.err.println("Failed: " + e.getMessage());
+        }
+
+    }
+
+}
Modified: incubator/ctakes/branches/SHARPn-cTAKES/POS tagger/scripts/java/data/pos/training/GeniaPosTrainingDataExtractor.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/branches/SHARPn-cTAKES/POS%20tagger/scripts/java/data/pos/training/GeniaPosTrainingDataExtractor.java?rev=1403989&r1=1403988&r2=1403989&view=diff
==============================================================================
--- incubator/ctakes/branches/SHARPn-cTAKES/POS tagger/scripts/java/data/pos/training/GeniaPosTrainingDataExtractor.java (original)
+++ incubator/ctakes/branches/SHARPn-cTAKES/POS tagger/scripts/java/data/pos/training/GeniaPosTrainingDataExtractor.java Wed Oct 31 05:26:43 2012
@@ -1,18 +1,11 @@
/*
- * Copyright: (c) 2009 Mayo Foundation for Medical Education and
- * Research (MFMER). All rights reserved. MAYO, MAYO CLINIC, and the
- * triple-shield Mayo logo are trademarks and service marks of MFMER.
- *
- * Except as contained in the copyright notice above, or as used to identify
- * MFMER as the author of this software, the trade names, trademarks, service
- * marks, or product names of the copyright holder shall not be used in
- * advertising, promotion or otherwise in connection with this software without
- * prior written authorization of the copyright holder.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
@@ -21,203 +14,203 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package data.pos.training;
-
-import java.io.File;
-import java.io.PrintStream;
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.List;
-
-import org.jdom.Element;
-import org.jdom.JDOMException;
-import org.jdom.input.SAXBuilder;
-
-/**
- * This class reads in the GENIA corpus and produces part-of-speech training
- * data. It reads in the corpus file GENIAcorpus3.02.pos.xml and writes out the
- * file found at data/pos/training/genia-pos-training.txt.
- *
- * see also data/pos/training/README
- *
- * @author Mayo Clinic
- *
- */
-public class GeniaPosTrainingDataExtractor implements Iterator<data.pos.training.GeniaPosTrainingDataExtractor.TaggedAbstract> {
-
- Iterator<?> articles;
- Element article;
-
- public GeniaPosTrainingDataExtractor(String geniaCorpusFileName) throws JDOMException{
- File geniaCorpusFile = new File(geniaCorpusFileName);
- SAXBuilder builder = new SAXBuilder();
- builder.setDTDHandler(null);
- Element root = builder.build(geniaCorpusFile).getRootElement();
- articles = root.getChildren("article").iterator();
- }
-
-
- public boolean hasNext() {
- if(article != null)
- return true;
- else {
- if (articles.hasNext()) {
- article = (Element) articles.next();
- return true;
- }
- }
- return false;
- }
-
- public TaggedAbstract next() {
- if(hasNext()) {
- TaggedAbstract taggedAbstract = parseArticle(article);
- article = null;
- return taggedAbstract;
- }
- return null;
- }
-
- public void remove() {}
-
- public TaggedAbstract parseArticle(Element article) {
- List<TaggedSentence> taggedSentences = new ArrayList<TaggedSentence>();
-
- Element title = article.getChild("title");
- if (title != null)
- taggedSentences.addAll(parseAbstract(title));
- Element abstractElement = article.getChild("abstract");
- if (abstractElement != null)
- taggedSentences.addAll(parseAbstract(abstractElement));
- return new TaggedAbstract(taggedSentences);
- }
-
- public List<TaggedSentence> parseAbstract(Element titleOrAbstract){
- List<TaggedSentence> taggedSentences = new ArrayList<TaggedSentence>();
- Iterator<?> sentences = titleOrAbstract.getChildren("sentence").iterator();
- while (sentences.hasNext()) {
- Element sentence = (Element) sentences.next();
- TaggedSentence taggedSentence = parseSentence(sentence);
- taggedSentences.add(taggedSentence);
- }
- return taggedSentences;
- }
-
- public TaggedSentence parseSentence(Element sentence){
- List<TaggedWord> wordTags = new ArrayList<TaggedWord>();
- Iterator<?> words = sentence.getChildren("w").iterator();
- while (words.hasNext()) {
- Element word = (Element) words.next();
- String wordText = word.getText();
- String posTag = word.getAttributeValue("c");
- /**
- * If the posTag is an asterisk, then we want to find the next word that has a
- * an actual posTag.
- */
- while (posTag.equals("*")) {
- word = (Element) words.next();
- wordText = wordText + word.getText();
- posTag = word.getAttributeValue("c");
- }
-
- if(posTag.indexOf("|") != -1)
- System.out.println(wordText+": "+posTag);
- posTag = posTag.split("\\|")[0];
- /**
- * some of the tags in Genia have white space that messes things up. Just remove
- * the whitespace from these words.
- */
- wordText = wordText.replaceAll("\\s", "");
- wordTags.add(new TaggedWord(wordText, posTag));
- }
- return new TaggedSentence(wordTags);
- }
-
- public class TaggedAbstract{
- List<TaggedSentence> taggedSentences;
-
- public TaggedAbstract(List<TaggedSentence> taggedSentences) {
- super();
- this.taggedSentences = taggedSentences;
- }
-
- public List<TaggedSentence> getTaggedSentences() {
- return taggedSentences;
- }
-
- public void setTaggedSentences(List<TaggedSentence> taggedSentences) {
- this.taggedSentences = taggedSentences;
- }
-
- }
-
- public class TaggedSentence{
- List<TaggedWord> taggedWords;
-
- public TaggedSentence(List<TaggedWord> taggedWords) {
- super();
- this.taggedWords = taggedWords;
- }
-
- public List<TaggedWord> getTaggedWords() {
- return taggedWords;
- }
-
- public void setTaggedWords(List<TaggedWord> taggedWords) {
- this.taggedWords = taggedWords;
- }
- }
-
- public class TaggedWord{
- String word;
- String tag;
- public String getWord() {
- return word;
- }
- public void setWord(String word) {
- this.word = word;
- }
- public String getTag() {
- return tag;
- }
- public void setTag(String tag) {
- this.tag = tag;
- }
- public TaggedWord(String word, String tag) {
- super();
- this.word = word;
- this.tag = tag;
- }
-
- }
-
-
- public static void main(String[] args) {
- try {
- System.out.println("Usage: java GeniaPosExtractor GENIAcorpus3.02.pos.xml data/pos/training/genia-pos-training.txt");
- String geniaCorpusFileName = args[0];
- String outputFileName = args[1];
-
- PrintStream out = new PrintStream(outputFileName);
-
- GeniaPosTrainingDataExtractor gptde = new GeniaPosTrainingDataExtractor(geniaCorpusFileName);
- while(gptde.hasNext()) {
- TaggedAbstract taggedAbstract = gptde.next();
- for(TaggedSentence taggedSentence : taggedAbstract.getTaggedSentences()) {
- for(TaggedWord taggedWord : taggedSentence.getTaggedWords()) {
- out.print(taggedWord.getWord()+"_"+taggedWord.getTag()+" ");
- }
- out.println();
- }
- }
-
- out.flush();
- out.close();
-
- } catch (Exception e) {
- e.printStackTrace();
- }
-
- }
-
-}
+package data.pos.training;
+
+import java.io.File;
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+import org.jdom.Element;
+import org.jdom.JDOMException;
+import org.jdom.input.SAXBuilder;
+
+/**
+ * This class reads in the GENIA corpus and produces part-of-speech training
+ * data. It reads in the corpus file GENIAcorpus3.02.pos.xml and writes out the
+ * file found at data/pos/training/genia-pos-training.txt.
+ *
+ * see also data/pos/training/README
+ *
+ * @author Mayo Clinic
+ *
+ */
+public class GeniaPosTrainingDataExtractor implements Iterator<data.pos.training.GeniaPosTrainingDataExtractor.TaggedAbstract> {
+
+    /** Iterator over the "article" children of the corpus root element. */
+    Iterator<?> articles;
+    /** Article fetched by {@link #hasNext()} but not yet consumed by {@link #next()}; null if none is buffered. */
+    Element article;
+
+    /**
+     * Parses the corpus file and positions the extractor before its first article.
+     *
+     * @param geniaCorpusFileName path of the GENIA POS corpus XML file
+     * @throws JDOMException if the document cannot be built
+     */
+    public GeniaPosTrainingDataExtractor(String geniaCorpusFileName) throws JDOMException{
+        File geniaCorpusFile = new File(geniaCorpusFileName);
+        SAXBuilder builder = new SAXBuilder();
+        builder.setDTDHandler(null);
+        Element root = builder.build(geniaCorpusFile).getRootElement();
+        articles = root.getChildren("article").iterator();
+    }
+
+
+    /**
+     * Reports whether another article is available, buffering it in {@link #article}
+     * so that repeated calls do not advance the underlying iterator.
+     */
+    public boolean hasNext() {
+        if (article != null) {
+            return true;
+        }
+        if (articles.hasNext()) {
+            article = (Element) articles.next();
+            return true;
+        }
+        return false;
+    }
+
+    /**
+     * Parses and returns the next article.
+     *
+     * @return the tagged title and abstract sentences of the next article, or
+     *         <code>null</code> when the corpus is exhausted. NOTE(review): the
+     *         java.util.Iterator contract calls for NoSuchElementException here; the
+     *         null return is kept so existing callers are not affected.
+     */
+    public TaggedAbstract next() {
+        if (hasNext()) {
+            TaggedAbstract taggedAbstract = parseArticle(article);
+            article = null; // consumed; the next hasNext() call fetches a fresh one
+            return taggedAbstract;
+        }
+        return null;
+    }
+
+    /** Does nothing: removal is not supported, and the call is silently ignored. */
+    public void remove() {}
+
+    /**
+     * Collects the tagged sentences of an article: those of its "title" child (if any)
+     * followed by those of its "abstract" child (if any).
+     */
+    public TaggedAbstract parseArticle(Element article) {
+        List<TaggedSentence> taggedSentences = new ArrayList<TaggedSentence>();
+
+        Element title = article.getChild("title");
+        if (title != null)
+            taggedSentences.addAll(parseAbstract(title));
+        Element abstractElement = article.getChild("abstract");
+        if (abstractElement != null)
+            taggedSentences.addAll(parseAbstract(abstractElement));
+        return new TaggedAbstract(taggedSentences);
+    }
+
+    /** Parses every "sentence" child of the given title or abstract element, in document order. */
+    public List<TaggedSentence> parseAbstract(Element titleOrAbstract){
+        List<TaggedSentence> taggedSentences = new ArrayList<TaggedSentence>();
+        Iterator<?> sentences = titleOrAbstract.getChildren("sentence").iterator();
+        while (sentences.hasNext()) {
+            Element sentence = (Element) sentences.next();
+            taggedSentences.add(parseSentence(sentence));
+        }
+        return taggedSentences;
+    }
+
+    /**
+     * Converts the "w" children of a sentence into tagged words. The tag is read from
+     * the "c" attribute; when it lists alternatives separated by '|', the first one is
+     * kept and the word is echoed to stdout.
+     * Assumes every "w" element carries a "c" attribute - TODO confirm against the corpus.
+     */
+    public TaggedSentence parseSentence(Element sentence){
+        List<TaggedWord> wordTags = new ArrayList<TaggedWord>();
+        Iterator<?> words = sentence.getChildren("w").iterator();
+        while (words.hasNext()) {
+            Element word = (Element) words.next();
+            String wordText = word.getText();
+            String posTag = word.getAttributeValue("c");
+            /*
+             * If the posTag is an asterisk, append the following words until one with
+             * an actual posTag is found. The hasNext() guard stops at the end of the
+             * sentence instead of throwing when the last word is tagged "*"; in that
+             * case the merged text is emitted with the "*" tag.
+             */
+            while ("*".equals(posTag) && words.hasNext()) {
+                word = (Element) words.next();
+                wordText = wordText + word.getText();
+                posTag = word.getAttributeValue("c");
+            }
+
+            if (posTag.indexOf("|") != -1)
+                System.out.println(wordText + ": " + posTag);
+            posTag = posTag.split("\\|")[0];
+            /*
+             * Some of the words in Genia contain white space that messes things up.
+             * Just remove the whitespace from these words.
+             */
+            wordText = wordText.replaceAll("\\s", "");
+            wordTags.add(new TaggedWord(wordText, posTag));
+        }
+        return new TaggedSentence(wordTags);
+    }
+
+    /** The tagged sentences of one article (title sentences first, then abstract sentences). */
+    public class TaggedAbstract{
+        List<TaggedSentence> taggedSentences;
+
+        public TaggedAbstract(List<TaggedSentence> taggedSentences) {
+            super();
+            this.taggedSentences = taggedSentences;
+        }
+
+        public List<TaggedSentence> getTaggedSentences() {
+            return taggedSentences;
+        }
+
+        public void setTaggedSentences(List<TaggedSentence> taggedSentences) {
+            this.taggedSentences = taggedSentences;
+        }
+
+    }
+
+    /** The tagged words of one sentence, in order. */
+    public class TaggedSentence{
+        List<TaggedWord> taggedWords;
+
+        public TaggedSentence(List<TaggedWord> taggedWords) {
+            super();
+            this.taggedWords = taggedWords;
+        }
+
+        public List<TaggedWord> getTaggedWords() {
+            return taggedWords;
+        }
+
+        public void setTaggedWords(List<TaggedWord> taggedWords) {
+            this.taggedWords = taggedWords;
+        }
+    }
+
+    /** A word (whitespace removed) paired with its part-of-speech tag. */
+    public class TaggedWord{
+        String word;
+        String tag;
+        public String getWord() {
+            return word;
+        }
+        public void setWord(String word) {
+            this.word = word;
+        }
+        public String getTag() {
+            return tag;
+        }
+        public void setTag(String tag) {
+            this.tag = tag;
+        }
+        public TaggedWord(String word, String tag) {
+            super();
+            this.word = word;
+            this.tag = tag;
+        }
+
+    }
+
+
+    /**
+     * Writes the corpus as training data: one sentence per line, each word as
+     * <code>word_TAG</code> followed by a space.
+     *
+     * @param args args[0] is the GENIA corpus XML file, args[1] the output file
+     */
+    public static void main(String[] args) {
+        // Show the usage only when the arguments are missing, rather than on every run.
+        if (args == null || args.length < 2) {
+            System.err.println("Usage: java GeniaPosTrainingDataExtractor GENIAcorpus3.02.pos.xml data/pos/training/genia-pos-training.txt");
+            return;
+        }
+
+        String geniaCorpusFileName = args[0];
+        String outputFileName = args[1];
+
+        PrintStream out = null;
+        try {
+            out = new PrintStream(outputFileName);
+
+            GeniaPosTrainingDataExtractor gptde = new GeniaPosTrainingDataExtractor(geniaCorpusFileName);
+            while (gptde.hasNext()) {
+                TaggedAbstract taggedAbstract = gptde.next();
+                for (TaggedSentence taggedSentence : taggedAbstract.getTaggedSentences()) {
+                    for (TaggedWord taggedWord : taggedSentence.getTaggedWords()) {
+                        out.print(taggedWord.getWord() + "_" + taggedWord.getTag() + " ");
+                    }
+                    out.println();
+                }
+            }
+
+            out.flush();
+        } catch (Exception e) {
+            e.printStackTrace();
+        } finally {
+            // Close the output even when extraction fails part-way through.
+            if (out != null) {
+                out.close();
+            }
+        }
+
+    }
+
+}
Modified: incubator/ctakes/branches/SHARPn-cTAKES/POS tagger/scripts/java/data/pos/training/GeniaPosTrainingDataExtractorTests.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/branches/SHARPn-cTAKES/POS%20tagger/scripts/java/data/pos/training/GeniaPosTrainingDataExtractorTests.java?rev=1403989&r1=1403988&r2=1403989&view=diff
==============================================================================
--- incubator/ctakes/branches/SHARPn-cTAKES/POS tagger/scripts/java/data/pos/training/GeniaPosTrainingDataExtractorTests.java (original)
+++ incubator/ctakes/branches/SHARPn-cTAKES/POS tagger/scripts/java/data/pos/training/GeniaPosTrainingDataExtractorTests.java Wed Oct 31 05:26:43 2012
@@ -1,18 +1,11 @@
/*
- * Copyright: (c) 2009 Mayo Foundation for Medical Education and
- * Research (MFMER). All rights reserved. MAYO, MAYO CLINIC, and the
- * triple-shield Mayo logo are trademarks and service marks of MFMER.
- *
- * Except as contained in the copyright notice above, or as used to identify
- * MFMER as the author of this software, the trade names, trademarks, service
- * marks, or product names of the copyright holder shall not be used in
- * advertising, promotion or otherwise in connection with this software without
- * prior written authorization of the copyright holder.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
@@ -21,94 +14,94 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package data.pos.training;
-
-import static org.junit.Assert.assertEquals;
-
-import org.jdom.JDOMException;
-import org.junit.Test;
-
-import data.pos.training.GeniaPosTrainingDataExtractor.TaggedAbstract;
-import data.pos.training.GeniaPosTrainingDataExtractor.TaggedSentence;
-import data.pos.training.GeniaPosTrainingDataExtractor.TaggedWord;
-
-public class GeniaPosTrainingDataExtractorTests {
-
- @Test
- public void test() throws JDOMException {
- GeniaPosTrainingDataExtractor gptde = new GeniaPosTrainingDataExtractor("test/data/GENIAcorpus3.02.pos.test.xml");
-
- TaggedAbstract taggedAbstract = gptde.next();
- TaggedSentence taggedSentence;
- TaggedWord taggedWord;
-
- //test one full sentence from title
- taggedSentence = taggedAbstract.getTaggedSentences().get(0);
- taggedWord = taggedSentence.getTaggedWords().get(0);
- assertEquals("Pancreatic", taggedWord.getWord());
- assertEquals("JJ", taggedWord.getTag());
- taggedWord = taggedSentence.getTaggedWords().get(1);
- assertEquals("development", taggedWord.getWord());
- assertEquals("NN", taggedWord.getTag());
- taggedWord = taggedSentence.getTaggedWords().get(2);
- assertEquals("and", taggedWord.getWord());
- assertEquals("CC", taggedWord.getTag());
- taggedWord = taggedSentence.getTaggedWords().get(3);
- assertEquals("maturation", taggedWord.getWord());
- assertEquals("NN", taggedWord.getTag());
- taggedWord = taggedSentence.getTaggedWords().get(4);
- assertEquals("of", taggedWord.getWord());
- assertEquals("IN", taggedWord.getTag());
- taggedWord = taggedSentence.getTaggedWords().get(5);
- assertEquals("the", taggedWord.getWord());
- assertEquals("DT", taggedWord.getTag());
- taggedWord = taggedSentence.getTaggedWords().get(6);
- assertEquals("islet", taggedWord.getWord());
- assertEquals("NN", taggedWord.getTag());
- taggedWord = taggedSentence.getTaggedWords().get(7);
- assertEquals("B", taggedWord.getWord());
- assertEquals("NN", taggedWord.getTag());
- taggedWord = taggedSentence.getTaggedWords().get(8);
- assertEquals("cell", taggedWord.getWord());
- assertEquals("NN", taggedWord.getTag());
- taggedWord = taggedSentence.getTaggedWords().get(9);
- assertEquals(".", taggedWord.getWord());
- assertEquals(".", taggedWord.getTag());
-
- //test one full sentence from abstract
- //<sentence><w c="DT">The</w> <w c="CD">three</w> <w c="NNS">compartments</w> <w c="VBP">are</w> <w c="VBN">thought</w> <w c="TO">to</w> <w c="VB">be</w> <w c="IN">of</w> <w c="JJ">common</w> <w c="JJ">endodermal</w> <w c="NN">origin</w><w c=":">;</w> <w c="IN">in</w> <w c="NN">contrast</w> <w c="TO">to</w> <w c="JJR">earlier</w> <w c="NNS">hypotheses</w><w c=",">,</w> <w c="WDT">which</w> <w c="VBD">suggested</w> <w c="IN">that</w> <w c="DT">the</w> <w c="JJ">endocrine</w> <w c="NN">compartment</w> <w c="VBD">was</w> <w c="IN">of</w> <w c="JJ">neuroectodermal</w> <w c="NN">origin</w><w c=".">.</w></sentence>
- taggedSentence = taggedAbstract.getTaggedSentences().get(5);
- taggedWord = taggedSentence.getTaggedWords().get(0);
- assertEquals("The", taggedWord.getWord());
- assertEquals("DT", taggedWord.getTag());
-
-
- taggedSentence = taggedAbstract.getTaggedSentences().get(1);
- assertEquals(6, taggedSentence.getTaggedWords().size());
-
- taggedSentence = taggedAbstract.getTaggedSentences().get(2);
- taggedWord = taggedSentence.getTaggedWords().get(0);
- assertEquals("Pancreas", taggedWord.getWord());
- assertEquals("NN", taggedWord.getTag());
- taggedWord = taggedSentence.getTaggedWords().get(11);
- assertEquals("anlage", taggedWord.getWord());
- assertEquals("NNS", taggedWord.getTag());
- taggedWord = taggedSentence.getTaggedWords().get(17);
- assertEquals(".", taggedWord.getWord());
- assertEquals(".", taggedWord.getTag());
-
-
-
- taggedAbstract = gptde.next();
- taggedSentence = taggedAbstract.getTaggedSentences().get(4);
- taggedWord = taggedSentence.getTaggedWords().get(0);
- assertEquals("We", taggedWord.getWord());
- assertEquals("PRP", taggedWord.getTag());
- taggedWord = taggedSentence.getTaggedWords().get(37);
- assertEquals("non-octamer", taggedWord.getWord());
- assertEquals("JJ", taggedWord.getTag());
-
- }
-}
-
-
+package data.pos.training;
+
+import static org.junit.Assert.assertEquals;
+
+import org.jdom.JDOMException;
+import org.junit.Test;
+
+import data.pos.training.GeniaPosTrainingDataExtractor.TaggedAbstract;
+import data.pos.training.GeniaPosTrainingDataExtractor.TaggedSentence;
+import data.pos.training.GeniaPosTrainingDataExtractor.TaggedWord;
+
+public class GeniaPosTrainingDataExtractorTests {
+
+    /** Checks that the word at the given position of the sentence has the expected text and tag. */
+    private static void assertTagged(TaggedSentence sentence, int index, String word, String tag) {
+        TaggedWord tagged = sentence.getTaggedWords().get(index);
+        assertEquals(word, tagged.getWord());
+        assertEquals(tag, tagged.getTag());
+    }
+
+    /**
+     * Reads the first two articles of the test corpus and spot-checks words and tags
+     * in their title and abstract sentences.
+     */
+    @Test
+    public void test() throws JDOMException {
+        GeniaPosTrainingDataExtractor extractor = new GeniaPosTrainingDataExtractor("test/data/GENIAcorpus3.02.pos.test.xml");
+
+        TaggedAbstract firstArticle = extractor.next();
+
+        // the whole title sentence, word by word
+        String[][] expectedTitle = {
+            { "Pancreatic", "JJ" },
+            { "development", "NN" },
+            { "and", "CC" },
+            { "maturation", "NN" },
+            { "of", "IN" },
+            { "the", "DT" },
+            { "islet", "NN" },
+            { "B", "NN" },
+            { "cell", "NN" },
+            { ".", "." },
+        };
+        TaggedSentence title = firstArticle.getTaggedSentences().get(0);
+        for (int i = 0; i < expectedTitle.length; i++) {
+            assertTagged(title, i, expectedTitle[i][0], expectedTitle[i][1]);
+        }
+
+        // first word of the abstract sentence that begins "The three compartments are thought to be ..."
+        assertTagged(firstArticle.getTaggedSentences().get(5), 0, "The", "DT");
+
+        // word count of the second sentence
+        assertEquals(6, firstArticle.getTaggedSentences().get(1).getTaggedWords().size());
+
+        // selected positions of the third sentence
+        TaggedSentence third = firstArticle.getTaggedSentences().get(2);
+        assertTagged(third, 0, "Pancreas", "NN");
+        assertTagged(third, 11, "anlage", "NNS");
+        assertTagged(third, 17, ".", ".");
+
+        // second article: fifth sentence
+        TaggedAbstract secondArticle = extractor.next();
+        TaggedSentence fifth = secondArticle.getTaggedSentences().get(4);
+        assertTagged(fifth, 0, "We", "PRP");
+        assertTagged(fifth, 37, "non-octamer", "JJ");
+    }
+}
+
+
Modified: incubator/ctakes/branches/SHARPn-cTAKES/POS tagger/src/edu/mayo/bmi/uima/pos_tagger/OpenNLPPOSCollectionReader.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/branches/SHARPn-cTAKES/POS%20tagger/src/edu/mayo/bmi/uima/pos_tagger/OpenNLPPOSCollectionReader.java?rev=1403989&r1=1403988&r2=1403989&view=diff
==============================================================================
--- incubator/ctakes/branches/SHARPn-cTAKES/POS tagger/src/edu/mayo/bmi/uima/pos_tagger/OpenNLPPOSCollectionReader.java (original)
+++ incubator/ctakes/branches/SHARPn-cTAKES/POS tagger/src/edu/mayo/bmi/uima/pos_tagger/OpenNLPPOSCollectionReader.java Wed Oct 31 05:26:43 2012
@@ -1,18 +1,11 @@
/*
- * Copyright: (c) 2009 Mayo Foundation for Medical Education and
- * Research (MFMER). All rights reserved. MAYO, MAYO CLINIC, and the
- * triple-shield Mayo logo are trademarks and service marks of MFMER.
- *
- * Except as contained in the copyright notice above, or as used to identify
- * MFMER as the author of this software, the trade names, trademarks, service
- * marks, or product names of the copyright holder shall not be used in
- * advertising, promotion or otherwise in connection with this software without
- * prior written authorization of the copyright holder.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
@@ -21,134 +14,134 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package edu.mayo.bmi.uima.pos_tagger;
-
-import java.io.BufferedReader;
-import java.io.FileNotFoundException;
-import java.io.FileReader;
-import java.io.IOException;
-
-import org.apache.uima.cas.CAS;
-import org.apache.uima.cas.CASException;
-import org.apache.uima.collection.CollectionException;
-import org.apache.uima.collection.CollectionReader_ImplBase;
-import org.apache.uima.jcas.JCas;
-import org.apache.uima.resource.ResourceInitializationException;
-import org.apache.uima.util.Progress;
-
-import edu.mayo.bmi.uima.core.type.syntax.BaseToken;
-import edu.mayo.bmi.uima.core.type.textspan.Sentence;
-
-/**
- * This collection reader reads in part-of-speech training/test data in the
- * OpenNLP format. See note below for POS_DATA_FILE_PARAM for details about this
- * format. Each line in the file will correspond to a "document" - i.e.
- * getNext() will populate the CAS with information from one line of the file.
- */
-public class OpenNLPPOSCollectionReader extends CollectionReader_ImplBase {
-
- /**
- * "PosDataFile" is a required, single, string parameter that specifies the
- * location of a data file that contains part-of-speech data in it. The
- * format of the file should have one sentence per line where each word is
- * followed immediately by "_" and it pos tag followed by a space.
- *
- * <pre>
- * IL-2_NN gene_NN expression_NN and_CC ...
- *
- * </pre>
- */
- public static final String POS_DATA_FILE_PARAM = "PosDataFile";
-
- /**
- * "LoadWordsOnly" is a optional, single, boolean parameter that determines
- * whether or not the part-of-speech tags associated with each word will be
- * loaded into the CAS or not. The default value is false.
- */
- public static final String LOAD_WORDS_ONLY_PARAM = "LoadWordsOnly";
-
- BufferedReader input;
- String line = null;
-
- boolean loadWordsOnly;
-
- @Override
- public void initialize() throws ResourceInitializationException {
- try {
- String posDataFile = (String) getConfigParameterValue(POS_DATA_FILE_PARAM);
- input = new BufferedReader(new FileReader(posDataFile));
- Boolean paramValue = (Boolean) getConfigParameterValue(LOAD_WORDS_ONLY_PARAM);
- loadWordsOnly = paramValue == null ? false : paramValue;
-
- } catch (FileNotFoundException fnfe) {
- throw new ResourceInitializationException(fnfe);
- }
- }
-
- /**
- * Some of the code in this method is based loosely on
- * opennlp.tools.postag.POSEventCollector
- */
- public void getNext(CAS cas) throws IOException, CollectionException {
- try {
- if (hasNext()) {
- JCas jCas = cas.getJCas();
- String[] tokens = line.split(" ");
- int wordStart = 0;
- int wordEnd = 0;
- int wordNumber = 0;
- StringBuffer documentText = new StringBuffer();
- for (String token : tokens) {
- int split = token.lastIndexOf("_");
- if(split == token.length()-1) {
- split = token.substring(0, token.length()-1).lastIndexOf("_");
- }
- if (split == -1) {
- line = null;
- throw new CollectionException("There is a problem in your training data: " + token
- + " does not conform to the format WORD_TAG.", null);
- }
- String word = token.substring(0, split);
- wordEnd = wordStart + word.length();
- // Consider creating a token similar to the way
- // TokenConverter.convert method creates BaseToken's
- BaseToken baseToken = new BaseToken(jCas, wordStart, wordEnd);
- if (!loadWordsOnly) {
- String tag = token.substring(split + 1);
- baseToken.setPartOfSpeech(tag);
- }
- baseToken.setTokenNumber(wordNumber++);
- baseToken.addToIndexes();
-
- documentText.append(word + " ");
- wordStart = wordEnd + 1;
- }
- Sentence sentence = new Sentence(jCas, 0, wordEnd);
- sentence.setSentenceNumber(0);
- sentence.addToIndexes();
- jCas.setDocumentText(documentText.toString());
- }
- } catch (CASException ce) {
- throw new CollectionException(ce);
- }
- line = null;
- }
-
- public void close() throws IOException {
- input.close();
- }
-
- public Progress[] getProgress() {
- return null;
- }
-
- public boolean hasNext() throws IOException, CollectionException {
- if (line == null) {
- line = input.readLine();
- }
- if (line == null)
- return false;
- return true;
- }
-
-}
+package edu.mayo.bmi.uima.pos_tagger;
+
+import java.io.BufferedReader;
+import java.io.FileNotFoundException;
+import java.io.FileReader;
+import java.io.IOException;
+
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.CASException;
+import org.apache.uima.collection.CollectionException;
+import org.apache.uima.collection.CollectionReader_ImplBase;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.apache.uima.util.Progress;
+
+import edu.mayo.bmi.uima.core.type.syntax.BaseToken;
+import edu.mayo.bmi.uima.core.type.textspan.Sentence;
+
+/**
+ * This collection reader reads in part-of-speech training/test data in the
+ * OpenNLP format. See note below for POS_DATA_FILE_PARAM for details about this
+ * format. Each line in the file will correspond to a "document" - i.e.
+ * getNext() will populate the CAS with information from one line of the file.
+ */
+public class OpenNLPPOSCollectionReader extends CollectionReader_ImplBase {
+
+ /**
+ * "PosDataFile" is a required, single, string parameter that specifies the
+ * location of a data file that contains part-of-speech data in it. The
+ * format of the file should have one sentence per line where each word is
+ * followed immediately by "_" and its POS tag, followed by a space.
+ *
+ * <pre>
+ * IL-2_NN gene_NN expression_NN and_CC ...
+ *
+ * </pre>
+ */
+ public static final String POS_DATA_FILE_PARAM = "PosDataFile";
+
+ /**
+ * "LoadWordsOnly" is an optional, single, boolean parameter. When true, only
+ * the words are loaded into the CAS and the part-of-speech tags associated
+ * with each word are skipped. The default value is false.
+ */
+ public static final String LOAD_WORDS_ONLY_PARAM = "LoadWordsOnly";
+
+ BufferedReader input;
+ String line = null;
+
+ boolean loadWordsOnly;
+
+ @Override
+ public void initialize() throws ResourceInitializationException {
+ try {
+ String posDataFile = (String) getConfigParameterValue(POS_DATA_FILE_PARAM);
+ input = new BufferedReader(new FileReader(posDataFile));
+ Boolean paramValue = (Boolean) getConfigParameterValue(LOAD_WORDS_ONLY_PARAM);
+ loadWordsOnly = paramValue == null ? false : paramValue;
+
+ } catch (FileNotFoundException fnfe) {
+ throw new ResourceInitializationException(fnfe);
+ }
+ }
+
+ /**
+ * Some of the code in this method is based loosely on
+ * opennlp.tools.postag.POSEventCollector
+ */
+ public void getNext(CAS cas) throws IOException, CollectionException {
+ try {
+ if (hasNext()) {
+ JCas jCas = cas.getJCas();
+ String[] tokens = line.split(" ");
+ int wordStart = 0;
+ int wordEnd = 0;
+ int wordNumber = 0;
+ StringBuffer documentText = new StringBuffer();
+ for (String token : tokens) {
+ int split = token.lastIndexOf("_");
+ if(split == token.length()-1) {
+ split = token.substring(0, token.length()-1).lastIndexOf("_");
+ }
+ if (split == -1) {
+ line = null;
+ throw new CollectionException("There is a problem in your training data: " + token
+ + " does not conform to the format WORD_TAG.", null);
+ }
+ String word = token.substring(0, split);
+ wordEnd = wordStart + word.length();
+ // Consider creating a token similar to the way
+ // TokenConverter.convert method creates BaseToken's
+ BaseToken baseToken = new BaseToken(jCas, wordStart, wordEnd);
+ if (!loadWordsOnly) {
+ String tag = token.substring(split + 1);
+ baseToken.setPartOfSpeech(tag);
+ }
+ baseToken.setTokenNumber(wordNumber++);
+ baseToken.addToIndexes();
+
+ documentText.append(word + " ");
+ wordStart = wordEnd + 1;
+ }
+ Sentence sentence = new Sentence(jCas, 0, wordEnd);
+ sentence.setSentenceNumber(0);
+ sentence.addToIndexes();
+ jCas.setDocumentText(documentText.toString());
+ }
+ } catch (CASException ce) {
+ throw new CollectionException(ce);
+ }
+ line = null;
+ }
+
+ public void close() throws IOException {
+ input.close();
+ }
+
+ public Progress[] getProgress() {
+ return null;
+ }
+
+ public boolean hasNext() throws IOException, CollectionException {
+ if (line == null) {
+ line = input.readLine();
+ }
+ if (line == null)
+ return false;
+ return true;
+ }
+
+}
Modified: incubator/ctakes/branches/SHARPn-cTAKES/POS tagger/src/edu/mayo/bmi/uima/pos_tagger/POSTagger.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/branches/SHARPn-cTAKES/POS%20tagger/src/edu/mayo/bmi/uima/pos_tagger/POSTagger.java?rev=1403989&r1=1403988&r2=1403989&view=diff
==============================================================================
--- incubator/ctakes/branches/SHARPn-cTAKES/POS tagger/src/edu/mayo/bmi/uima/pos_tagger/POSTagger.java (original)
+++ incubator/ctakes/branches/SHARPn-cTAKES/POS tagger/src/edu/mayo/bmi/uima/pos_tagger/POSTagger.java Wed Oct 31 05:26:43 2012
@@ -1,18 +1,11 @@
/*
- * Copyright: (c) 2009 Mayo Foundation for Medical Education and
- * Research (MFMER). All rights reserved. MAYO, MAYO CLINIC, and the
- * triple-shield Mayo logo are trademarks and service marks of MFMER.
- *
- * Except as contained in the copyright notice above, or as used to identify
- * MFMER as the author of this software, the trade names, trademarks, service
- * marks, or product names of the copyright holder shall not be used in
- * advertising, promotion or otherwise in connection with this software without
- * prior written authorization of the copyright holder.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software