You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by br...@apache.org on 2013/07/07 21:23:07 UTC
svn commit: r1500511 [5/6] - in /ctakes/sandbox/ctakes-scrubber-deid/src: ./
main/ main/java/ main/java/org/ main/java/org/apache/
main/java/org/apache/uima/ main/java/org/apache/uima/examples/
main/java/org/spin/ main/java/org/spin/scrubber/ main/java...
Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/AnnotationsDAO.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/AnnotationsDAO.java?rev=1500511&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/AnnotationsDAO.java (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/AnnotationsDAO.java Sun Jul 7 19:23:05 2013
@@ -0,0 +1,505 @@
+/*******************************************************************************
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ ******************************************************************************/
+/**
+ *
+ */
+package org.spin.scrubber.uima.dao;
+
+import com.mysql.jdbc.PreparedStatement;
+import org.spin.scrubber.beans.Annot;
+
+import java.sql.Connection;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.util.*;
+
+/**
+ * @author britt fitch bf19
+ * @link http://lists.mysql.com/mysql/202489
+ */
+public class AnnotationsDAO extends BaseDAO
+{
+ public AnnotationsDAO(String tableSuffix)
+ {
+ this.tableSuffix = tableSuffix;
+ }
+
+ public List<Annot> selectPosWordAnnotations(String subjectId) throws Exception
+ {
+ PreparedStatement ps = null;
+ Connection conn = null;
+ ResultSet rs = null;
+ List<Annot> results = new ArrayList<Annot>();
+
+ StringBuilder sql = new StringBuilder("select * from machine_annotations"+tableSuffix+" where filename_short = ? and annot_type_short = 'WordToken' and match_source = 'pos';");
+
+ try
+ {
+ conn = getConnectionToScrubber();
+ ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+ int i=1;
+ ps.setString(i++, subjectId);
+
+ rs = ps.executeQuery();
+
+ if(rs!=null)
+ {
+ while (rs.next())
+ {
+ Annot annot = new Annot();
+ annot.setAnnot_type_long(rs.getString("annot_type_long"));
+ annot.setAnnot_type_short(rs.getString("annot_type_short"));
+ annot.setEndIdx(rs.getInt("end_idx"));
+ annot.setFilename_long(rs.getString("filename_long"));
+ annot.setFilename_short(rs.getString("filename_short"));
+ annot.setId(rs.getInt("id"));
+ annot.setStartIdx(rs.getInt("start_idx"));
+ annot.setToken(rs.getString("token"));
+ annot.setMatch_source(rs.getString("match_source"));
+ annot.setMatch_value(rs.getString("match_value"));
+
+ results.add(annot);
+ }
+ }
+ }
+ catch (SQLException e)
+ {
+ log.error("failed selecting from machine_annotations"+tableSuffix+" : ", e);
+ }
+ finally
+ {
+ close(rs, ps, conn);
+ }
+
+ return results;
+ }
+
+
+ public List<String> selectDistinctFilenames() throws Exception
+ {
+ PreparedStatement ps = null;
+ Connection conn = null;
+ ResultSet rs = null;
+ List<String> results = new ArrayList<String>();
+
+ StringBuilder sql = new StringBuilder("select distinct filename_short from machine_annotations"+tableSuffix+" ;");
+
+ try
+ {
+ conn = getConnectionToScrubber();
+ ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+
+ rs = ps.executeQuery();
+
+ if(rs!=null)
+ {
+ while (rs.next())
+ {
+ results.add(rs.getString("filename_short"));
+ }
+ }
+ }
+ catch (SQLException e)
+ {
+ log.error("failed selecting machine_annotations"+tableSuffix+" .subject_id",e);
+ }
+ finally
+ {
+ close(rs, ps, conn);
+ }
+
+ return results;
+ }
+
+ /**
+ * return map of lists. key is filename, lists contain annot objects.
+ * @return TODO ???
+ * @throws Exception
+ */
+ public List<Annot> selectAllAnnotByFilename(String filename) throws Exception
+ {
+ PreparedStatement ps = null;
+ Connection conn = null;
+ ResultSet rs = null;
+ List<Annot> results = new ArrayList<Annot>();
+
+ StringBuilder sql = new StringBuilder("select match_source, match_value, filename_short, start_idx, end_idx from machine_annotations"+tableSuffix+" where filename_short = '"+filename+"';");
+
+ try
+ {
+ conn = getConnectionToScrubber();
+ ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+
+ rs = ps.executeQuery();
+
+ if(rs!=null)
+ {
+ while (rs.next())
+ {
+ Annot a = new Annot();
+ a.setMatch_source(rs.getString("match_source"));
+ a.setMatch_value(rs.getString("match_value"));
+ a.setStartIdx(rs.getInt("start_idx"));
+ a.setEndIdx(rs.getInt("end_idx"));
+ a.setFilename_short(rs.getString("filename_short"));
+
+ results.add(a);
+ }
+ }
+ }
+ catch (SQLException e)
+ {
+ log.error("Failed selecting for Annot List: ", e);
+ }
+ finally
+ {
+ close(rs, ps, conn);
+ }
+
+ return results;
+ }
+
+ /**
+ * return map of lists. key is filename, lists contain annot objects.
+ * @return TODO ???
+ * @throws Exception
+ */
+ @Deprecated
+ public Map<String, List<Annot>> selectAllAnnotByFilename() throws Exception
+ {
+ PreparedStatement ps = null;
+ Connection conn = null;
+ ResultSet rs = null;
+ Map<String, List<Annot>> results = new HashMap<String, List<Annot>>();
+
+ StringBuilder sql = new StringBuilder("select match_source, match_value, filename_short, start_idx, end_idx from machine_annotations"+tableSuffix+" ;");
+
+ try
+ {
+ conn = getConnectionToScrubber();
+ ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+
+ rs = ps.executeQuery();
+
+ if(rs!=null)
+ {
+ while (rs.next())
+ {
+ Annot a = new Annot();
+ a.setMatch_source(rs.getString("match_source"));
+ a.setMatch_value(rs.getString("match_value"));
+ a.setStartIdx(rs.getInt("start_idx"));
+ a.setEndIdx(rs.getInt("end_idx"));
+ a.setFilename_short(rs.getString("filename_short"));
+
+ if (results.get(a.getFilename_short())==null)
+ {
+ List<Annot> lst = new ArrayList<Annot>();
+ lst.add(a);
+ results.put(a.getFilename_short(), lst);
+ }
+ else
+ {
+ results.get(a.getFilename_short()).add(a);
+ }
+
+ }
+ }
+ }
+ catch (SQLException e)
+ {
+ log.error("Failed selecting for Annot map: ", e);
+ }
+ finally
+ {
+ close(rs, ps, conn);
+ }
+
+ return results;
+ }
+
+ /**
+ * select all KnownPHI for a given file.
+ * data structure will be used to force classification of all tokens matching
+ * a piece of known PHI for a given file.
+ *
+ * @param filename
+ * @return Map containing token to annot link
+ * @throws Exception
+ */
+ public Map<String, Annot> selectKnownPHIAnnotations(String filename) throws Exception
+ {
+ PreparedStatement ps = null;
+ Connection conn = null;
+ ResultSet rs = null;
+ Map<String, Annot> results = new HashMap<String, Annot>();
+
+ //todo: isn't this supposed to be human_annotations_*?
+ StringBuilder sql = new StringBuilder("select * from machine_annotations"+tableSuffix+" where filename_short = ? and annot_type_short = 'KnownPHI';");
+
+ try
+ {
+ conn = getConnectionToScrubber();
+ ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+ int i=1;
+ ps.setString(i++, filename);
+
+ rs = ps.executeQuery();
+
+ if(rs!=null)
+ {
+ while (rs.next())
+ {
+ String token = rs.getString("token");
+
+ Annot annot = new Annot();
+ annot.setAnnot_type_long(rs.getString("annot_type_long"));
+ annot.setAnnot_type_short(rs.getString("annot_type_short"));
+ annot.setEndIdx(rs.getInt("end_idx"));
+ annot.setFilename_long(rs.getString("filename_long"));
+ annot.setFilename_short(rs.getString("filename_short"));
+ annot.setId(rs.getInt("id"));
+ annot.setStartIdx(rs.getInt("start_idx"));
+ annot.setToken(token);
+ annot.setMatch_source(rs.getString("match_source"));
+ annot.setMatch_value(rs.getString("match_value"));
+
+ results.put(token.toLowerCase(),annot);
+ }
+ }
+ }
+ catch (SQLException e)
+ {
+ log.error("failed selecting from machine_annotations"+tableSuffix+" : ",e);
+ }
+ finally
+ {
+ close(rs, ps, conn);
+ }
+
+ return results;
+ }
+
+ public List<Annot> selectNumAnnotations(String subjectId) throws Exception
+ {
+ PreparedStatement ps = null;
+ Connection conn = null;
+ ResultSet rs = null;
+ List<Annot> results = new ArrayList<Annot>();
+
+ StringBuilder sql = new StringBuilder("select * from machine_annotations"+tableSuffix+" where filename_short = ? and annot_type_short = 'NumToken';");
+
+ try
+ {
+ conn = this.getConnectionToScrubber();
+ ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+ int i=1;
+ ps.setString(i++, subjectId);
+
+ rs = ps.executeQuery();
+
+ if(rs!=null)
+ {
+ while (rs.next())
+ {
+ Annot annot = new Annot();
+ annot.setAnnot_type_long(rs.getString("annot_type_long"));
+ annot.setAnnot_type_short(rs.getString("annot_type_short"));
+ annot.setEndIdx(rs.getInt("end_idx"));
+ annot.setFilename_long(rs.getString("filename_long"));
+ annot.setFilename_short(rs.getString("filename_short"));
+ annot.setId(rs.getInt("id"));
+ annot.setStartIdx(rs.getInt("start_idx"));
+ annot.setToken(rs.getString("token"));
+ annot.setMatch_source(rs.getString("match_source"));
+ annot.setMatch_value(rs.getString("match_value"));
+
+ results.add(annot);
+ }
+ }
+ }
+ catch (SQLException e)
+ {
+ log.error("Failed selecting from machine_annotations"+tableSuffix+" : ",e);
+ }
+ finally
+ {
+ close(rs, ps, conn);
+ }
+
+ return results;
+ }
+
+ //TODO: DEPRECATED !
+
+ @Deprecated
+ public Map<String, String> selectFeaturesByFilenameIdx(String filename, int startIdx) throws Exception
+ {
+ PreparedStatement ps = null;
+ Connection conn = null;
+ ResultSet rs = null;
+ Map<String, String> results = new HashMap<String, String>();
+
+ StringBuilder sql = new StringBuilder("select match_source, match_value from machine_annotations"+tableSuffix+" where filename_short=? and ? between start_idx and end_idx;");
+
+ try
+ {
+ conn = this.getConnectionToScrubber();
+ ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+ ps.setString(1, filename);
+ ps.setInt(2, startIdx);
+
+ rs = ps.executeQuery();
+
+ if(rs!=null)
+ {
+ while (rs.next())
+ {
+ results.put(rs.getString("match_value"),rs.getString("match_source"));
+ }
+ }
+ }
+ catch (SQLException e)
+ {
+ System.out.println("ERROR: failed selecting features for subject_id/idx: " + filename + "/" + startIdx + " " + e.getMessage());
+ e.printStackTrace();
+ }
+ finally
+ {
+ close(rs, ps, conn);
+ }
+
+ return results;
+ }
+
+ @Deprecated
+ public List<String> selectDistinctPOS() throws Exception
+ {
+ PreparedStatement ps = null;
+ Connection conn = null;
+ ResultSet rs = null;
+ List<String> results = new ArrayList<String>();
+
+ StringBuilder sql = new StringBuilder("select distinct pos from machine_annotations"+tableSuffix+" ;");
+
+ try
+ {
+ conn = this.getConnectionToScrubber();
+ ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+
+ rs = ps.executeQuery();
+
+ if(rs!=null)
+ {
+ while (rs.next())
+ {
+ results.add(rs.getString("pos"));
+ }
+ }
+ }
+ catch (SQLException e)
+ {
+ System.out.println("ERROR: failed selecting machine_annotations"+tableSuffix+" .pos: " + e.getMessage());
+ e.printStackTrace();
+ }
+ finally
+ {
+ close(rs, ps, conn);
+ }
+
+ return results;
+ }
+
+ @Deprecated
+ public Set<String> selectDistinctFeatures() throws Exception
+ {
+ PreparedStatement ps = null;
+ Connection conn = null;
+ ResultSet rs = null;
+ Set<String> results = new HashSet<String>();
+
+ StringBuilder sql = new StringBuilder("select concat_ws('|', match_source, match_value) as feature from machine_annotations"+tableSuffix+" ;");
+
+ try
+ {
+ conn = this.getConnectionToScrubber();
+ ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+
+ rs = ps.executeQuery();
+
+ if(rs!=null)
+ {
+ while (rs.next())
+ {
+ results.add(rs.getString("feature"));
+ }
+ }
+ }
+ catch (SQLException e)
+ {
+ System.out.println("ERROR: failed selecting machine_annotations"+tableSuffix+" features: " + e.getMessage());
+ e.printStackTrace();
+ }
+ finally
+ {
+ close(rs, ps, conn);
+ }
+
+ return results;
+ }
+
+ @Deprecated
+ public Set<String> selectDistinctOntology() throws Exception
+ {
+ PreparedStatement ps = null;
+ Connection conn = null;
+ ResultSet rs = null;
+ Set<String> results = new HashSet<String>();
+
+ StringBuilder sql = new StringBuilder("select distinct match_source as feature from machine_annotations"+tableSuffix+" ;");
+
+ try
+ {
+ conn = this.getConnectionToScrubber();
+ ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+
+ rs = ps.executeQuery();
+
+ if(rs!=null)
+ {
+ while (rs.next())
+ {
+ results.add(rs.getString("feature"));
+ }
+ }
+ }
+ catch (SQLException e)
+ {
+ System.out.println("ERROR: failed selecting machine_annotations"+tableSuffix+" ontologies: " + e.getMessage());
+ e.printStackTrace();
+ }
+ finally
+ {
+ close(rs, ps, conn);
+ }
+
+ return results;
+ }
+
+}
Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/AnnotationsDAO.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/AnnotationsPubsDAO.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/AnnotationsPubsDAO.java?rev=1500511&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/AnnotationsPubsDAO.java (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/AnnotationsPubsDAO.java Sun Jul 7 19:23:05 2013
@@ -0,0 +1,141 @@
+/*******************************************************************************
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ ******************************************************************************/
+/**
+ *
+ */
+package org.spin.scrubber.uima.dao;
+
+import com.mysql.jdbc.PreparedStatement;
+
+import java.sql.Connection;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * @author bf19
+ * @link http://lists.mysql.com/mysql/202489
+ */
+public class AnnotationsPubsDAO extends BaseDAO
+{
+ public void insertPubsPOS(String pos, int posCnt) throws Exception
+ {
+ PreparedStatement ps = null;
+ Connection conn = null;
+
+ StringBuilder sql = new StringBuilder("insert into pubs_pos (pos, pos_cnt) values (?,?); ");
+
+ try
+ {
+ conn = getConnectionToScrubber();
+ ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+ ps.setString(1, pos);
+ ps.setInt(2, posCnt);
+ int updated = ps.executeUpdate();
+
+ log.info("inserted " + updated + " rows to pubs_pos.");
+ }
+ catch (SQLException e)
+ {
+ log.error("Failed inserting annotations.pubs_pos",e);
+ e.printStackTrace();
+ }
+ finally
+ {
+ close(ps, conn);
+ }
+ }
+
+ public Map<String,Integer> selectDistinctPOS(Map<String,Integer> pubPosMap, String filename) throws Exception
+ {
+ PreparedStatement ps = null;
+ Connection conn = null;
+ ResultSet rs = null;
+
+ StringBuilder sql = new StringBuilder("select count(*) cnt, match_value from machine_annotations_pubs where filename_short=? and match_source='pos' group by match_value;");
+
+ try
+ {
+ conn = getConnectionToScrubber();
+ ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+ ps.setString(1, filename);
+ rs = ps.executeQuery();
+
+ if(rs!=null)
+ {
+ while (rs.next())
+ {
+ String pos = rs.getString("match_value");
+ Integer cnt = rs.getInt("cnt");
+
+ pubPosMap.put(pos, (pubPosMap.containsKey(pos) ? pubPosMap.get(pos) + cnt : cnt));
+ }
+ }
+ }
+ catch (SQLException e)
+ {
+ log.error("Failed selecting annotations.pos ",e);
+ }
+ finally
+ {
+ close(rs, ps, conn);
+ }
+
+ return pubPosMap;
+ }
+
+
+ public List<String> selectDistinctFilenameShort() throws Exception
+ {
+ PreparedStatement ps = null;
+ Connection conn = null;
+ ResultSet rs = null;
+ List<String> results = new ArrayList<String>();
+
+ StringBuilder sql = new StringBuilder("select distinct filename_short from machine_annotations_pubs;");
+
+ try
+ {
+ conn = getConnectionToScrubber();
+ ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+
+ rs = ps.executeQuery();
+
+ if(rs!=null)
+ {
+ while (rs.next())
+ {
+ results.add(rs.getString("filename_short"));
+ }
+ }
+ }
+ catch (SQLException e)
+ {
+ log.error("Failed selecting distinct filename_short",e);
+ }
+ finally
+ {
+ close(rs, ps, conn);
+ }
+
+ return results;
+ }
+}
Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/AnnotationsPubsDAO.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/BaseDAO.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/BaseDAO.java?rev=1500511&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/BaseDAO.java (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/BaseDAO.java Sun Jul 7 19:23:05 2013
@@ -0,0 +1,192 @@
+/*******************************************************************************
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ ******************************************************************************/
+/**
+ *
+ */
+package org.spin.scrubber.uima.dao;
+
+import com.mysql.jdbc.PreparedStatement;
+import org.apache.log4j.Logger;
+import org.spin.scrubber.ScrubberProperties;
+
+import java.sql.*;
+
+/**
+ * @author bf19
+ *
+ */
+public class BaseDAO
+{
+ protected static Logger log = Logger.getLogger(BaseDAO.class);
+ protected String tableSuffix;
+
+ public static Connection getConnectionToScrubber() throws Exception
+ {
+ return getConnection(ScrubberProperties.getDbDriver(), ScrubberProperties.getDbURI(), ScrubberProperties.getDbUser(), ScrubberProperties.getDbPassword());
+ }
+
+ public static Connection getConnection(String driver, String uri, String user, String pw) throws SQLException, ClassNotFoundException
+ {
+ Connection conn;
+
+ try
+ {
+ Class.forName(driver);
+ conn = DriverManager.getConnection(uri, user, pw);
+ }
+ catch (ClassNotFoundException e)
+ {
+ log.error("ERROR: could not load driver: " + e.getMessage());
+ throw e;
+ }
+ catch (SQLException e)
+ {
+ log.error("ERROR: unable to get db conn: " + e.getMessage());
+ throw e;
+ }
+ return conn;
+ }
+
+ /**
+ * log & swallow exceptions here.
+ * dropping indexes is not a requirement, it just speeds up the process.
+ *
+ * in general its bad news to embed params directly into sql but the normal ps.setString() option makes the var a quoted string.
+ *
+ * @param table drop an index from this talbe
+ * @param index name of index to drop
+ * @throws Exception
+ */
+ public static void dropIndex(String table, String index) throws Exception
+ {
+ Connection conn = getConnectionToScrubber();
+ PreparedStatement ps = null;
+ StringBuilder sql = new StringBuilder("alter table "+table + " drop index "+index+";");
+
+ try
+ {
+ ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+ ps.execute();
+ }
+ catch (SQLException e)
+ {
+ log.warn("unable to drop index " + index + " on table "+ table);
+ }
+ finally
+ {
+ if (ps!=null)
+ {
+ ps.close();
+ }
+ if (conn!=null)
+ {
+ conn.close();
+ }
+ }
+ }
+
+ public void runPreparedSQL(String sql) throws Exception
+ {
+ Connection conn = getConnectionToScrubber();
+ PreparedStatement ps = null;
+
+ try
+ {
+ ps = (PreparedStatement) conn.prepareStatement(sql);
+ ps.execute();
+ }
+ catch (SQLException e)
+ {
+ e.printStackTrace();
+ throw e;
+ }
+ finally
+ {
+ if (ps!=null)
+ {
+ ps.close();
+ }
+ if (conn!=null)
+ {
+ conn.close();
+ }
+ }
+ }
+
+ public static void closeConnection(Connection conn)
+ {
+ close(conn);
+ }
+ public static void closeRSPS(ResultSet rs, Statement stmt)
+ {
+ close(rs, stmt);
+ }
+
+ public static void close(Connection conn)
+ {
+ if (conn != null)
+ {
+ try
+ {
+ conn.close();
+ }
+ catch (Exception e)
+ {
+ log.error(e.getMessage(), e);
+ }
+ }
+ }
+
+ public static void close(ResultSet rs)
+ {
+ if (rs != null) {
+ try {
+ rs.close();
+ } catch (Exception e) {
+ log.error(e.getMessage(), e);
+ }
+ }
+ }
+
+ public static void close(Statement stmt) {
+ if (stmt != null) {
+ try {
+ stmt.close();
+ } catch (Exception e) {
+ log.error(e.getMessage(), e);
+ }
+ }
+ }
+
+ public static void close(ResultSet rs, Statement stmt) {
+ close(rs);
+ close(stmt);
+ }
+
+ public static void close(Statement stmt, Connection conn) {
+ close(stmt);
+ close(conn);
+ }
+
+ public static void close(ResultSet rs, Statement stmt, Connection conn) {
+ close(rs);
+ close(stmt);
+ close(conn);
+ }
+}
Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/BaseDAO.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/FeatureMatrixDAO.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/FeatureMatrixDAO.java?rev=1500511&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/FeatureMatrixDAO.java (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/FeatureMatrixDAO.java Sun Jul 7 19:23:05 2013
@@ -0,0 +1,815 @@
+/*******************************************************************************
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ ******************************************************************************/
+package org.spin.scrubber.uima.dao;
+
+import com.mysql.jdbc.PreparedStatement;
+import org.spin.scrubber.beans.CaseFeature;
+
+import java.sql.Connection;
+import java.sql.ResultSet;
+import java.util.ArrayList;
+import java.util.List;
+
+public class FeatureMatrixDAO extends BaseDAO
+{
+ Connection conn = null;
+
+ public FeatureMatrixDAO(String tableSuffix) throws Exception
+ {
+ log.info("Constructing CaseFeaturesDAO...");
+
+ this.tableSuffix = tableSuffix;
+ conn = getConnectionToScrubber();
+ }
+
+ public static void main(String[] args) throws Exception
+ {
+ if(args.length != 1)
+ {
+ //TODO: refactor
+ System.out.println("Usage: ");
+ System.out.println("FeatureMatrixDAO tableSuffix \r\n");
+
+ System.out.println("Example: ");
+ System.out.println("FeatureMatrixDAO");
+ System.out.println("FeatureMatrixDAO _train");
+ System.out.println("FeatureMatrixDAO _test");
+ }
+ else
+ {
+ String tableSuffix = args[0];
+ FeatureMatrixDAO dao = new FeatureMatrixDAO(tableSuffix);
+
+ dao.selectDataSetTrain();
+ }
+ }
+
    /**
     * Builds the TEST data set: one comma-separated record per feature_matrix
     * row, assembled by MySQL CONCAT_WS. The hamming counts are scaled by 10e6
     * in SQL so they survive the string conversion with useful precision.
     *
     * @return CSV-style records; empty on error (exceptions are logged, not thrown)
     */
    public List<String> selectDataSetTest()
    {
        List<String> results = new ArrayList<String>();
        PreparedStatement ps = null;
        ResultSet rs = null;

        StringBuilder sql = new StringBuilder();
        sql.append(" SELECT CONCAT_WS(',', id, ");
        sql.append(" pos,");
        sql.append(" pos_bin, ");
        sql.append(" has_capital,");
        sql.append(" cnt_regex_phon, cnt_regex_date, cnt_regex_age, cnt_regex_id, cnt_regex_pat, cnt_regex_doc, cnt_regex_loc, cnt_regex_hosp, ");
        sql.append(" cnt_priv, cnt_hosp, cnt_name,");
        sql.append(" cnt_dict_costar, cnt_dict_hl7v25, cnt_dict_hl7v30, cnt_dict_icd10cm, cnt_dict_icd10pcs, cnt_dict_icd9cm, cnt_dict_lnc, cnt_dict_msh, cnt_dict_rxnorm, cnt_dict_snomedct, ");
        sql.append(" cnt_ham_w_pos*10e6, ");
        sql.append(" cnt_ham_wo_pos*10e6,");
        sql.append(" phi_type) as record ");
        sql.append(" FROM feature_matrix"+tableSuffix+" ");
        sql.append(" where pos is not null "); //required to avoid pulling in knownPHI tokens from xml header

        try
        {
            // uses the DAO's shared connection opened in the constructor
            ps = (PreparedStatement) conn.prepareStatement(sql.toString());

            rs = ps.executeQuery();

            if (rs!=null)
            {
                while(rs.next())
                {
                    results.add(rs.getString("record"));
                }
            }
        }
        catch (Exception e)
        {
            log.error("ERROR: failed selecting TEST data set: ", e);
        }
        finally
        {
            // the shared connection is intentionally left open for reuse
            close(rs, ps);
        }

        return results;
    }
+
    /**
     * Builds the TRAIN data set: all positive (is_phi) records followed by a
     * random sample of negative records.
     *
     * @return combined positive + negative CSV-style records
     */
    public List<String> selectDataSetTrain()
    {
        List<String> results = new ArrayList<String>();

        //get all positive cases
        results.addAll(selectDataSetTrainPositive());

        //get an equal number of negative cases
//        results.addAll(selectDataSetTrainNegative(results.size()));
        // NOTE(review): the balanced sampling above is disabled; a fixed cap
        // of 5,000,000 negatives is used instead
        results.addAll(selectDataSetTrainNegative(5000*1000)); //TODO: refactor

        return results;
    }
+
    /**
     * Selects every positive (is_phi = 1) training record as a CSV string,
     * using the same column layout as {@link #selectDataSetTest()}.
     *
     * @return CSV-style records; empty on error (exceptions are logged, not thrown)
     */
    private List<String> selectDataSetTrainPositive()
    {
        List<String> results = new ArrayList<String>();
        PreparedStatement ps = null;
        ResultSet rs = null;

        //positive case
        StringBuilder sql = new StringBuilder();
        sql.append(" SELECT CONCAT_WS(',', id, ");
        sql.append(" pos,");
        sql.append(" pos_bin, ");
        sql.append(" has_capital,");
        sql.append(" cnt_regex_phon, cnt_regex_date, cnt_regex_age, cnt_regex_id, cnt_regex_pat, cnt_regex_doc, cnt_regex_loc, cnt_regex_hosp, ");
        sql.append(" cnt_priv, cnt_hosp, cnt_name,");
        sql.append(" cnt_dict_costar, cnt_dict_hl7v25, cnt_dict_hl7v30, cnt_dict_icd10cm, cnt_dict_icd10pcs, cnt_dict_icd9cm, cnt_dict_lnc, cnt_dict_msh, cnt_dict_rxnorm, cnt_dict_snomedct, ");
        sql.append(" cnt_ham_w_pos*10e6, ");
        sql.append(" cnt_ham_wo_pos*10e6,");
        sql.append(" phi_type) as record ");
        sql.append(" FROM feature_matrix"+tableSuffix+" ");
        sql.append(" where is_phi = 1 ;");

        try
        {
            // uses the DAO's shared connection opened in the constructor
            ps = (PreparedStatement) conn.prepareStatement(sql.toString());

            rs = ps.executeQuery();

            if (rs!=null)
            {
                while(rs.next())
                {
                    results.add(rs.getString("record"));
                }
            }
        }
        catch (Exception e)
        {
            log.error("ERROR: failed selecting TEST data set (positive): ", e);
        }
        finally
        {
            close(rs, ps);
        }

        return results;
    }
+
    /**
     * Selects a random sample of negative (is_phi = 0) training records as CSV
     * strings. Rows whose phi_type/classified_as are not 'NA' are excluded to
     * avoid pulling knownPHI tokens that originate from the xml header.
     *
     * @param limit maximum number of negative records to return (embedded
     *              directly in the SQL; safe because it is an int)
     * @return CSV-style records; empty on error (exceptions are logged, not thrown)
     */
    private List<String> selectDataSetTrainNegative(int limit)
    {
        List<String> results = new ArrayList<String>();
        PreparedStatement ps = null;
        ResultSet rs = null;

        //negative case
        StringBuilder sql = new StringBuilder();
        sql.append(" SELECT CONCAT_WS(',', id, ");
        sql.append(" pos,");
        sql.append(" pos_bin, ");
        sql.append(" has_capital,");
        sql.append(" cnt_regex_phon, cnt_regex_date, cnt_regex_age, cnt_regex_id, cnt_regex_pat, cnt_regex_doc, cnt_regex_loc, cnt_regex_hosp, ");
        sql.append(" cnt_priv, cnt_hosp, cnt_name,");
        sql.append(" cnt_dict_costar, cnt_dict_hl7v25, cnt_dict_hl7v30, cnt_dict_icd10cm, cnt_dict_icd10pcs, cnt_dict_icd9cm, cnt_dict_lnc, cnt_dict_msh, cnt_dict_rxnorm, cnt_dict_snomedct, ");
        sql.append(" cnt_ham_w_pos*10e6, ");
        sql.append(" cnt_ham_wo_pos*10e6,");
        sql.append(" phi_type) as record ");
        sql.append(" FROM feature_matrix"+tableSuffix+" ");
        sql.append(" where is_phi = 0 ");
        sql.append(" and phi_type = 'NA' "); //required to avoid pulling in knownPHI tokens from xml header
        sql.append(" and classified_as = 'NA' "); //required to avoid pulling in knownPHI tokens from xml header
        sql.append(" order by rand() ");
        sql.append(" limit "+limit+" ;");

        try
        {
            // uses the DAO's shared connection opened in the constructor
            ps = (PreparedStatement) conn.prepareStatement(sql.toString());

            rs = ps.executeQuery();

            if (rs!=null)
            {
                while(rs.next())
                {
                    results.add(rs.getString("record"));
                }
            }
        }
        catch (Exception e)
        {
            log.error("ERROR: failed selecting TEST data set (negative): ", e);
        }
        finally
        {
            close(rs, ps);
        }

        return results;
    }
+
    /**
     * Batch-inserts one row per CaseFeature into feature_matrix+tableSuffix.
     * All rows are added to a single JDBC batch and executed once; failures
     * are logged and swallowed, so a failed batch inserts nothing silently.
     *
     * @param list features to persist; the bind order below must match the
     *             column list in the INSERT exactly
     */
    public void insertCaseFeatureBatch(List<CaseFeature> list)
    {
        PreparedStatement ps = null;
        StringBuilder sql = new StringBuilder("insert into feature_matrix"+tableSuffix+" ");
        sql.append("(token, start_idx, end_idx, filename_short, pos, pos_bin, has_capital, ");
        sql.append("cnt_regex_phon, cnt_regex_date, cnt_regex_age, cnt_regex_id, cnt_regex_pat, cnt_regex_doc, cnt_regex_loc, cnt_regex_hosp, ");
        sql.append("cnt_priv, cnt_hosp, cnt_name, ");
        sql.append("cnt_dict_costar, cnt_dict_hl7v25, cnt_dict_hl7v30, cnt_dict_icd10cm, cnt_dict_icd10pcs, cnt_dict_icd9cm, cnt_dict_lnc, cnt_dict_msh, cnt_dict_rxnorm, cnt_dict_snomedct, ");
        sql.append("cnt_ham_w_pos, cnt_ham_wo_pos, is_phi, phi_type, classified_as) ");
        sql.append("values (?,?,?,?,?,?,?,?,?,?, ?,?,?,?,?,?,?,?,?,?, ?,?,?,?,?,?,?,?,?,?, ?,?,?) ");
        sql.append(";");

        try
        {
            // uses the DAO's shared connection opened in the constructor
            ps = (PreparedStatement) conn.prepareStatement(sql.toString());
            for (CaseFeature cf : list)
            {
                // i walks the 33 placeholders in column-list order
                int i=1;
                ps.setString(i++, cf.getToken());
                ps.setInt(i++, cf.getStartIdx());
                ps.setInt(i++, cf.getEndIdx());
                ps.setString(i++, cf.getFilename_short());
                ps.setString(i++, cf.getPos());
                ps.setString(i++, cf.getPos_bin());
                ps.setInt(i++, cf.getHas_capital());

                ps.setInt(i++, cf.getCnt_regex_phon());
                ps.setInt(i++, cf.getCnt_regex_date());
                ps.setInt(i++, cf.getCnt_regex_age());
                ps.setInt(i++, cf.getCnt_regex_id());
                ps.setInt(i++, cf.getCnt_regex_pat());
                ps.setInt(i++, cf.getCnt_regex_doc());
                ps.setInt(i++, cf.getCnt_regex_loc());
                ps.setInt(i++, cf.getCnt_regex_hosp());

                ps.setInt(i++, cf.getCnt_priv());
                ps.setInt(i++, cf.getCnt_hosp());
                ps.setInt(i++, cf.getCnt_name());

                ps.setInt(i++, cf.getCnt_dict_costar());
                ps.setInt(i++, cf.getCnt_dict_hl7v25());
                ps.setInt(i++, cf.getCnt_dict_hl7v30());
                ps.setInt(i++, cf.getCnt_dict_icd10cm());
                ps.setInt(i++, cf.getCnt_dict_icd10pcs());
                ps.setInt(i++, cf.getCnt_dict_icd9cm());
                ps.setInt(i++, cf.getCnt_dict_lnc());
                ps.setInt(i++, cf.getCnt_dict_msh());
                ps.setInt(i++, cf.getCnt_dict_rxnorm());
                ps.setInt(i++, cf.getCnt_dict_snomedct());

                ps.setFloat(i++, cf.getCnt_ham_w_pos());
                ps.setFloat(i++, cf.getCnt_ham_wo_pos());

                ps.setInt(i++, cf.getIs_phi());
                ps.setString(i++, cf.getPhi_type());
                ps.setString(i++, cf.getClassified_as());
                ps.addBatch();
            }

            ps.executeBatch();
        }
        catch (Exception e)
        {
            log.error("Failed inserting caseFeatureBatch.", e);
        }
        finally
        {
            close(ps);
        }
    }
+
+ /**
+ * Updates the classified_as column for a single feature_matrix row.
+ *
+ * @param classifiedAs new classification label (a PHI type, or 'NA')
+ * @param id primary key of the row to update
+ */
+ public void updateClassification(String classifiedAs, int id)
+ {
+     PreparedStatement ps = null;
+     StringBuilder sql = new StringBuilder("update feature_matrix"+tableSuffix+" set classified_as = ? where id = ? ;");
+
+     try
+     {
+         ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+         int i=1;
+         ps.setString(i++, classifiedAs);
+         ps.setInt(i++, id);
+
+         ps.executeUpdate();
+     }
+     catch (Exception e)
+     {
+         // Use the class logger (with the cause attached) instead of System.out,
+         // for consistency with insertCaseFeatureBatch and the other DAO methods.
+         log.error("ERROR: failed updating feature_matrix"+tableSuffix+" for classified_as field.", e);
+     }
+     finally
+     {
+         close(ps);
+     }
+ }
+
+ /**
+ * Loads every row of feature_matrix as a partially populated CaseFeature:
+ * id, token (lower-cased), pos, start/end indices, classified_as, phi_type
+ * and filename_short. Failures are logged and an empty list is returned.
+ *
+ * @return all feature rows, or an empty list on error
+ */
+ public List<CaseFeature> selectAllCaseFeatures()
+ {
+     PreparedStatement ps = null;
+     ResultSet rs = null;
+     StringBuilder sql = new StringBuilder("select * from feature_matrix"+tableSuffix+" ");
+
+     List<CaseFeature> results = new ArrayList<CaseFeature>();
+     try
+     {
+         ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+
+         rs = ps.executeQuery();
+
+         if (rs!=null)
+         {
+             CaseFeature cf;
+             while(rs.next())
+             {
+                 cf = new CaseFeature();
+
+                 cf.setId(rs.getInt("id"));
+                 cf.setToken(rs.getString("token").toLowerCase());
+                 cf.setPos(rs.getString("pos"));
+                 cf.setStartIdx(rs.getInt("start_idx"));
+                 cf.setEndIdx(rs.getInt("end_idx"));
+                 cf.setClassified_as(rs.getString("classified_as"));
+                 cf.setPhi_type(rs.getString("phi_type"));
+                 cf.setFilename_short(rs.getString("filename_short"));
+
+                 //TODO: finish populating the cf object...
+                 results.add(cf);
+             }
+         }
+     }
+     catch (Exception e)
+     {
+         // Log through the class logger (with cause) for consistency with the other DAO methods.
+         log.error("ERROR: failed selecting all caseFeature: ", e);
+     }
+     finally
+     {
+         close(rs, ps);
+     }
+
+     return results;
+ }
+
+ /**
+ * select all records for a given filename
+ * from the feature_matrix table
+ * where the record is classified_as *ANY* type of phi
+ * (ie. NOT 'NA')
+ *
+ * @param filename_short short filename identifying the source document
+ * @return matching rows as partially populated CaseFeatures; empty list on error
+ */
+ public List<CaseFeature> selectPHICaseFeaturesByFilename(String filename_short)
+ {
+     PreparedStatement ps = null;
+     ResultSet rs = null;
+     StringBuilder sql = new StringBuilder("select * from feature_matrix"+tableSuffix+" where filename_short = ? and classified_as <> 'NA'");
+
+     List<CaseFeature> results = new ArrayList<CaseFeature>();
+     try
+     {
+         ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+         ps.setString(1, filename_short);
+         rs = ps.executeQuery();
+
+         if (rs!=null)
+         {
+             CaseFeature cf;
+             while(rs.next())
+             {
+                 cf = new CaseFeature();
+                 cf.setId(rs.getInt("id"));
+                 cf.setToken(rs.getString("token").toLowerCase());
+                 cf.setPos(rs.getString("pos"));
+                 cf.setStartIdx(rs.getInt("start_idx"));
+                 cf.setEndIdx(rs.getInt("end_idx"));
+                 cf.setClassified_as(rs.getString("classified_as"));
+                 cf.setPhi_type(rs.getString("phi_type"));
+                 cf.setFilename_short(rs.getString("filename_short"));
+
+                 //TODO: cf object is populated from 3 methods. need to write seperate method that takes RS and returns a CF obj.
+                 results.add(cf);
+             }
+         }
+     }
+     catch (Exception e)
+     {
+         // Log through the class logger (with cause) for consistency with the other DAO methods.
+         log.error("ERROR: failed selecting caseFeatures by filename: ", e);
+     }
+     finally
+     {
+         close(rs, ps);
+     }
+
+     return results;
+ }
+
+ /**
+ * Lists the distinct filename_short values present in feature_matrix,
+ * i.e. every document that has at least one feature row.
+ *
+ * @return distinct short filenames; empty list on error
+ */
+ public List<String> selectDistinctFilenames()
+ {
+     PreparedStatement ps = null;
+     ResultSet rs = null;
+     StringBuilder sql = new StringBuilder("select distinct filename_short from feature_matrix"+tableSuffix+" ");
+
+     List<String> results = new ArrayList<String>();
+     try
+     {
+         ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+
+         rs = ps.executeQuery();
+
+         if (rs!=null)
+         {
+             while(rs.next())
+             {
+                 results.add(rs.getString("filename_short"));
+             }
+         }
+     }
+     catch (Exception e)
+     {
+         // Log through the class logger (with cause) for consistency with the other DAO methods.
+         log.error("ERROR: failed selecting distinct filename_short from caseFeature: ", e);
+     }
+     finally
+     {
+         close(rs, ps);
+     }
+
+     return results;
+ }
+
+ @Deprecated
+ public List<CaseFeature> selectAllTestCaseFeatures()
+ {
+ PreparedStatement ps = null;
+ ResultSet rs = null;
+ StringBuilder sql = new StringBuilder("select * from feature_matrix"+tableSuffix+" ");
+
+ List<CaseFeature> results = new ArrayList<CaseFeature>();
+ try
+ {
+ ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+
+ rs = ps.executeQuery();
+
+ if (rs!=null)
+ {
+ CaseFeature cf;
+ while(rs.next())
+ {
+ cf = new CaseFeature();
+ cf.setId(rs.getInt("id"));
+ cf.setToken(rs.getString("token").toLowerCase());
+ cf.setPos(rs.getString("pos"));
+ cf.setStartIdx(rs.getInt("start_idx"));
+ cf.setEndIdx(rs.getInt("end_idx"));
+ cf.setClassified_as(rs.getString("classified_as"));
+ cf.setPhi_type(rs.getString("phi_type"));
+ cf.setFilename_short(rs.getString("filename_short"));
+
+ //TODO: finish populating the cf object...
+ results.add(cf);
+ }
+ }
+ }
+ catch (Exception e)
+ {
+ System.out.println("ERROR: failed selecting all caseFeature: " + e.getMessage());
+ e.printStackTrace();
+ }
+ finally
+ {
+ close(rs, ps);
+// close(rs, ps, conn);
+ }
+
+ return results;
+ }
+
+
+ /**
+ * Get case features containing the character and position of where scrubber found PHI.
+ *
+ * @param filename name of the actual physical file to be deidentified (redacted)
+ * @return list of PHI classifications in REVERSE (desc) order of start_idx,
+ *         so redaction can proceed back-to-front without shifting offsets;
+ *         empty list on error
+ */
+ public List<CaseFeature> selectClassifiedAsPHI(String filename)
+ {
+     PreparedStatement ps = null;
+     ResultSet rs = null;
+     StringBuilder sql = new StringBuilder("select filename_short, token, start_idx, end_idx, classified_as from feature_matrix"+tableSuffix+" where filename_short = ? and classified_as <> 'NA' order by start_idx desc");
+
+     List<CaseFeature> results = new ArrayList<CaseFeature>();
+     try
+     {
+         ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+         int i = 1;
+         ps.setString(i++, filename);
+
+         rs = ps.executeQuery();
+
+         if (rs!=null)
+         {
+             CaseFeature cf;
+             while(rs.next())
+             {
+                 cf = new CaseFeature();
+                 cf.setStartIdx(rs.getInt("start_idx"));
+                 cf.setEndIdx(rs.getInt("end_idx"));
+                 cf.setToken(rs.getString("token").toLowerCase());
+                 cf.setFilename_short(rs.getString("filename_short"));
+                 cf.setClassified_as(rs.getString("classified_as"));
+
+                 //TODO: finish populating the cf object...
+                 results.add(cf);
+             }
+         }
+     }
+     catch (Exception e)
+     {
+         // Log through the class logger (with cause) for consistency with the other DAO methods.
+         log.error("ERROR: failed selecting caseFeatures: ", e);
+     }
+     finally
+     {
+         close(rs, ps);
+     }
+
+     return results;
+ }
+
+ public List<CaseFeature> selectClassifiedAsPHITest(String filename)
+ {
+ PreparedStatement ps = null;
+ ResultSet rs = null;
+ StringBuilder sql = new StringBuilder("select filename_short, token, start_idx, end_idx, classified_as from feature_matrix"+tableSuffix+" where filename_short = ? and classified_as <> 'NA' order by start_idx desc");
+
+ List<CaseFeature> results = new ArrayList<CaseFeature>();
+ try
+ {
+ ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+ int i = 1;
+ ps.setString(i++, filename);
+
+ rs = ps.executeQuery();
+
+ if (rs!=null)
+ {
+ CaseFeature cf;
+ while(rs.next())
+ {
+ cf = new CaseFeature();
+ cf.setStartIdx(rs.getInt("start_idx"));
+ cf.setEndIdx(rs.getInt("end_idx"));
+ cf.setToken(rs.getString("token").toLowerCase());
+ cf.setFilename_short(rs.getString("filename_short"));
+ cf.setClassified_as(rs.getString("classified_as"));
+
+ //TODO: finish populating the cf object...
+ results.add(cf);
+ }
+ }
+ }
+ catch (Exception e)
+ {
+ System.out.println("ERROR: failed selecting caseFeatures: " + e.getMessage());
+ e.printStackTrace();
+ }
+ finally
+ {
+ close(rs, ps);
+ }
+
+ return results;
+ }
+
+
+
+ /**
+ * Sets the phi_type column for a single feature_matrix row (test pipeline).
+ *
+ * @param id primary key of the row to update
+ * @param type PHI type label to store
+ */
+ public void updateCaseFeaturePHITypeTest(int id, String type)
+ {
+     PreparedStatement ps = null;
+     StringBuilder sql = new StringBuilder("update feature_matrix"+tableSuffix+" set phi_type = ? where id = ?");
+
+     try
+     {
+         ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+         int i=1;
+         ps.setString(i++, type);
+         ps.setInt(i++, id);
+         int updated = ps.executeUpdate();
+
+         System.out.println("updated count: " + updated);
+     }
+     catch (Exception e)
+     {
+         // Class logger (with cause) instead of System.out, consistent with the other DAO methods.
+         log.error("ERROR: failed updating (test) caseFeature phi_type: ", e);
+     }
+     finally
+     {
+         // The original declared an unused ResultSet; only the statement needs closing.
+         close(ps);
+     }
+ }
+
+ /**
+ * Updates the term-frequency columns (cnt_ham_w_pos, cnt_ham_wo_pos) for one row.
+ *
+ * @param id - id of row to be updated.
+ * @param pcount - count of terms w/ same token AND pos.
+ * @param tcount - count of terms w/ same token.
+ */
+ public void updateCaseFeatureTFAllPubs(int id, float pcount, float tcount)
+ {
+     PreparedStatement ps = null;
+     StringBuilder sql = new StringBuilder("update feature_matrix"+tableSuffix+" set cnt_ham_w_pos = ?, cnt_ham_wo_pos=? where id = ?");
+
+     try
+     {
+         ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+         int i=1;
+         ps.setFloat(i++, pcount);
+         ps.setFloat(i++, tcount);
+         ps.setInt(i++, id);
+         int updated = ps.executeUpdate();
+
+         System.out.println("updated count: " + updated);
+     }
+     catch (Exception e)
+     {
+         // Class logger (with cause) instead of System.out, consistent with the other DAO methods.
+         log.error("ERROR: failed updating caseFeature - pubs: ", e);
+     }
+     finally
+     {
+         // The original declared an unused ResultSet; only the statement needs closing.
+         close(ps);
+     }
+ }
+
+ /**
+ *
+ * @param id - id of row to be updated.
+ * @param pcount - count of terms w/ same token AND pos.
+ * @param tcount - count of terms w/ same token.
+ */
+ public void updateTestCaseFeatureTFAllPubs(int id, float pcount, float tcount)
+ {
+ PreparedStatement ps = null;
+ ResultSet rs = null;
+ StringBuilder sql = new StringBuilder("update feature_matrix"+tableSuffix+" set cnt_ham_w_pos = ?, cnt_ham_wo_pos=? where id = ?");
+
+ try
+ {
+ ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+ int i=1;
+ ps.setFloat(i++, pcount);
+ ps.setFloat(i++, tcount);
+ ps.setInt(i++, id);
+ int updated = ps.executeUpdate();
+
+ System.out.println("updated count: " + updated);
+ }
+ catch (Exception e)
+ {
+ System.out.println("ERROR: failed updating caseFeature - pubs: " + e.getMessage());
+ e.printStackTrace();
+ }
+ finally
+ {
+ close(rs, ps);
+ }
+ }
+
+ /**
+ * Updates classified_as for the row matching a filename / start-index pair.
+ *
+ * @deprecated superseded by {@link #updateClassification(String, int)}, which
+ * addresses the row by primary key.
+ * @param classifiedAs new classification label
+ * @param filename short filename identifying the document
+ * @param startIdx token start offset within the document
+ */
+ @Deprecated
+ public void updateClassification(String classifiedAs, String filename, int startIdx)
+ {
+ PreparedStatement ps = null;
+ StringBuilder sql = new StringBuilder("update feature_matrix"+tableSuffix+" set classified_as = ? where filename_short = ? and start_idx = ? ;");
+
+ try
+ {
+ ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+ int i=1;
+ ps.setString(i++, classifiedAs);
+ ps.setString(i++, filename);
+ ps.setInt(i++, startIdx);
+
+ ps.executeUpdate();
+ }
+ catch (Exception e)
+ {
+ System.out.println("ERROR: failed updating feature_matrix"+tableSuffix+" for classified_as field.");
+ e.printStackTrace();
+ }
+ finally
+ {
+ close(ps);
+ }
+ }
+
+ /**
+ * Sets the phi_type column for a single row (training pipeline).
+ *
+ * @deprecated identical SQL to {@link #updateCaseFeaturePHITypeTest(int, String)};
+ * retained only for old training-pipeline call sites.
+ * @param id primary key of the row to update
+ * @param type PHI type label to store
+ */
+ @Deprecated
+ public void updateCaseFeaturePHITypeTrain(int id, String type)
+ {
+ PreparedStatement ps = null;
+ ResultSet rs = null;
+ StringBuilder sql = new StringBuilder("update feature_matrix"+tableSuffix+" set phi_type = ? where id = ?");
+
+ try
+ {
+ ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+ int i=1;
+ ps.setString(i++, type);
+ ps.setInt(i++, id);
+ int updated = ps.executeUpdate();
+
+ System.out.println("updated count: " + updated);
+ }
+ catch (Exception e)
+ {
+ System.out.println("ERROR: failed updating (train) caseFeature phi_type: " + e.getMessage());
+ e.printStackTrace();
+ }
+ finally
+ {
+ // rs is never assigned here; close(rs, ps) effectively closes only the statement.
+ close(rs, ps);
+ }
+ }
+
+ /**
+ * Inserts a single feature_matrix row (no end_idx, phi_type or classified_as).
+ *
+ * @deprecated use {@link #insertCaseFeatureBatch(List)} instead, which also
+ * persists end_idx, phi_type and classified_as and batches inserts.
+ * @param cf case feature to persist
+ */
+ @Deprecated
+ public void insertCaseFeature(CaseFeature cf)
+ {
+     PreparedStatement ps = null;
+
+     StringBuilder sql = new StringBuilder("insert into feature_matrix"+tableSuffix+" ");
+     sql.append("(token, start_idx, filename_short, pos, pos_bin, has_capital, ");
+     sql.append("cnt_regex_phon, cnt_regex_date, cnt_regex_age, cnt_regex_id, cnt_regex_pat, cnt_regex_doc, cnt_regex_loc, cnt_regex_hosp, ");
+     sql.append("cnt_priv, cnt_hosp, cnt_name, ");
+     sql.append("cnt_dict_costar, cnt_dict_hl7v25, cnt_dict_hl7v30, cnt_dict_icd10cm, cnt_dict_icd10pcs, cnt_dict_icd9cm, cnt_dict_lnc, cnt_dict_msh, cnt_dict_rxnorm, cnt_dict_snomedct, ");
+     sql.append("cnt_ham_w_pos, cnt_ham_wo_pos, is_phi) ");
+     // BUGFIX: the VALUES clause listed 34 placeholders for only 30 columns/bindings,
+     // which made every insert fail with a missing-parameter SQLException.
+     // 30 placeholders, grouped in tens, matching the 30 columns above.
+     sql.append("values (?,?,?,?,?,?,?,?,?,?, ?,?,?,?,?,?,?,?,?,?, ?,?,?,?,?,?,?,?,?,?);");
+
+     try
+     {
+         ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+         int i=1;
+         ps.setString(i++, cf.getToken());
+         ps.setInt(i++, cf.getStartIdx());
+         ps.setString(i++, cf.getFilename_short());
+         ps.setString(i++, cf.getPos());
+         ps.setString(i++, cf.getPos_bin());
+         ps.setInt(i++, cf.getHas_capital());
+         ps.setInt(i++, cf.getCnt_regex_phon());
+         ps.setInt(i++, cf.getCnt_regex_date());
+         ps.setInt(i++, cf.getCnt_regex_age());
+         ps.setInt(i++, cf.getCnt_regex_id());
+         ps.setInt(i++, cf.getCnt_regex_pat());
+         ps.setInt(i++, cf.getCnt_regex_doc());
+         ps.setInt(i++, cf.getCnt_regex_loc());
+         ps.setInt(i++, cf.getCnt_regex_hosp());
+         ps.setInt(i++, cf.getCnt_priv());
+         ps.setInt(i++, cf.getCnt_hosp());
+         ps.setInt(i++, cf.getCnt_name());
+         ps.setInt(i++, cf.getCnt_dict_costar());
+         ps.setInt(i++, cf.getCnt_dict_hl7v25());
+         ps.setInt(i++, cf.getCnt_dict_hl7v30());
+         ps.setInt(i++, cf.getCnt_dict_icd10cm());
+         ps.setInt(i++, cf.getCnt_dict_icd10pcs());
+         ps.setInt(i++, cf.getCnt_dict_icd9cm());
+         ps.setInt(i++, cf.getCnt_dict_lnc());
+         ps.setInt(i++, cf.getCnt_dict_msh());
+         ps.setInt(i++, cf.getCnt_dict_rxnorm());
+         ps.setInt(i++, cf.getCnt_dict_snomedct());
+         ps.setFloat(i++, cf.getCnt_ham_w_pos());
+         ps.setFloat(i++, cf.getCnt_ham_wo_pos());
+         ps.setInt(i++, cf.getIs_phi());
+
+         ps.execute();
+     }
+     catch (Exception e)
+     {
+         log.error("Failed inserting caseFeature: " + cf.getFilename_short() + ":" + cf.getToken(), e);
+     }
+     finally
+     {
+         close(ps);
+     }
+ }
+}
Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/FeatureMatrixDAO.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/HumanAnnotationsDAO.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/HumanAnnotationsDAO.java?rev=1500511&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/HumanAnnotationsDAO.java (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/HumanAnnotationsDAO.java Sun Jul 7 19:23:05 2013
@@ -0,0 +1,233 @@
+/*******************************************************************************
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ ******************************************************************************/
+package org.spin.scrubber.uima.dao;
+
+import com.mysql.jdbc.PreparedStatement;
+
+import java.sql.Connection;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.util.HashSet;
+import java.util.Set;
+
+/**
+ * DAO for the human_annotations table, which holds gold-standard PHI
+ * annotations keyed by subject (document) id and character offsets.
+ * Opens one shared connection in the constructor; callers must invoke
+ * {@link #close()} when finished.
+ */
+public class HumanAnnotationsDAO extends BaseDAO
+{
+ // Shared connection for the non-deprecated methods; closed via close().
+ Connection conn;
+
+ public HumanAnnotationsDAO(String tableSuffix) throws Exception
+ {
+ this.tableSuffix = tableSuffix;
+ conn = getConnectionToScrubber();
+ }
+
+ /**
+ * Inserts one annotation row. SQLExceptions are logged and swallowed.
+ *
+ * @param subjectId document/subject identifier
+ * @param tagName annotation tag (PHI type)
+ * @param tagValue annotated text
+ * @param startIdx start character offset
+ * @param endIdx end character offset
+ */
+ public void insert(String subjectId, String tagName, String tagValue, int startIdx, int endIdx) throws Exception
+ {
+ PreparedStatement ps = null;
+
+ StringBuilder sql = new StringBuilder("insert into human_annotations"+tableSuffix+" (subject_id, tag_name, tag_value, start_idx, end_idx) values (?,?,?,?,?);");
+
+ try
+ {
+ ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+ int i=1;
+ ps.setString(i++, subjectId);
+ ps.setString(i++, tagName);
+ ps.setString(i++, tagValue);
+ ps.setInt(i++, startIdx);
+ ps.setInt(i++, endIdx);
+ ps.execute();
+ }
+ catch (SQLException e)
+ {
+ log.error("Failed inserting : " + subjectId + ":" + tagName +":"+ tagValue, e);
+ }
+ finally
+ {
+ close(ps);
+ }
+ }
+
+ /**
+ * Returns the tag_name of an annotation covering the given offset, or null
+ * if none. NOTE(review): when several annotations cover startIdx, the LAST
+ * non-empty row wins (the loop keeps overwriting retVal) - confirm this is
+ * the intended tie-break.
+ *
+ * @param filename subject/document id
+ * @param startIdx character offset to probe
+ * @return matching tag_name, or null if no annotation covers the offset
+ */
+ public String selectPHIType(String filename, int startIdx) throws Exception
+ {
+ PreparedStatement ps = null;
+ ResultSet rs = null;
+ String retVal = null;
+ StringBuilder sql = new StringBuilder("select tag_name from human_annotations"+tableSuffix+" where subject_id=? and ? between start_idx and end_idx;");
+
+ try
+ {
+ ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+ int i=1;
+ ps.setString(i++, filename);
+ ps.setInt(i++, startIdx);
+ rs = ps.executeQuery();
+
+ if(rs!=null)
+ {
+ while (rs.next())
+ {
+ String tmp = rs.getString("tag_name");
+ if (tmp!=null && tmp.length()>0)
+ {
+ retVal = tmp;
+ }
+ }
+ }
+ }
+ catch (SQLException e)
+ {
+ log.error("Failed selectPHIType", e);
+ }
+ finally
+ {
+ close(rs, ps);
+ }
+
+ return retVal;
+ }
+
+ /**
+ * Closes the shared connection opened by the constructor.
+ * Errors are reported to stdout and swallowed.
+ */
+ public void close()
+ {
+ try
+ {
+ close(conn);
+ }
+ catch(Exception e)
+ {
+ System.out.println("ERROR: issue closing connection for HumanAnnotationsDAO.");
+ }
+ }
+
+ /**
+ * Selects tag values of name/place annotations for a subject, plus a
+ * period-stripped variant of any value containing '.' (e.g. middle initials).
+ *
+ * @deprecated opens and closes its own connection per call; superseded by
+ * offset-based lookups on the shared connection.
+ * @param subjectId document/subject identifier
+ * @return distinct tag values (possibly empty)
+ */
+ @Deprecated
+ public Set<String> selectTagValueBySubjectId(String subjectId) throws Exception
+ {
+ PreparedStatement ps = null;
+ Connection conn = null;
+ ResultSet rs = null;
+ Set<String> results = new HashSet<String>();
+
+ StringBuilder sql = new StringBuilder("select tag_value from human_annotations"+tableSuffix+" where tag_name in ('HOSPITAL', 'PATIENT', 'DOCTOR', 'LOCATION') and subject_id = ?;");
+
+ try
+ {
+ conn = this.getConnectionToScrubber();
+ ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+ int i=1;
+ ps.setString(i++, subjectId);
+ rs = ps.executeQuery();
+
+ if(rs!=null)
+ {
+ while (rs.next())
+ {
+ String tagValue = rs.getString("tag_value");
+ results.add(tagValue);
+
+ //adding version w/o period (in cases of middle name, etc...)
+ if (tagValue.contains("."))
+ {
+ results.add(tagValue.replace(".", " ").trim());
+ }
+ }
+ }
+ }
+ catch (SQLException e)
+ {
+ System.out.println("ERROR: failed selecting human_annotations"+tableSuffix+" .tag_value: " + e.getMessage());
+ e.printStackTrace();
+ }
+ finally
+ {
+ close(rs, ps, conn);
+ }
+
+ return results;
+ }
+
+ /**
+ * Returns 1 if any annotation covers the given offset in the given file,
+ * 0 otherwise (or on error, since failures are swallowed).
+ *
+ * @deprecated use {@link #selectPHIType(String, int)} which also reports the type.
+ * @param filename subject/document id
+ * @param startIdx character offset to probe
+ * @return 1 if the offset falls inside an annotation, else 0
+ */
+ @Deprecated
+ public int isPHI(String filename, int startIdx) throws Exception
+ {
+ PreparedStatement ps = null;
+ ResultSet rs = null;
+ int retVal = 0;
+ StringBuilder sql = new StringBuilder("select count(*) cnt from human_annotations"+tableSuffix+" where subject_id=? and ? between start_idx and end_idx;");
+
+ try
+ {
+ ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+ int i=1;
+ ps.setString(i++, filename);
+ ps.setInt(i++, startIdx);
+ rs = ps.executeQuery();
+
+ if(rs!=null)
+ {
+ while (rs.next())
+ {
+ int tmp = rs.getInt("cnt");
+ if (tmp>0)
+ {
+ retVal = 1;
+ }
+ }
+ }
+ }
+ catch (SQLException e)
+ {
+ System.out.println("ERROR: failed isPHI check: " + e.getMessage());
+ e.printStackTrace();
+ }
+ finally
+ {
+ close(rs, ps);
+ }
+
+ return retVal;
+ }
+
+ /**
+ * Inserts an annotation row without offsets.
+ *
+ * @deprecated use {@link #insert(String, String, String, int, int)} which
+ * records start/end offsets and reuses the shared connection.
+ */
+ @Deprecated
+ public void insert(String subjectId, String tagName, String tagValue) throws Exception
+ {
+ PreparedStatement ps = null;
+ Connection conn = null;
+
+ StringBuilder sql = new StringBuilder("insert into human_annotations"+tableSuffix+" (subject_id, tag_name, tag_value) values (?,?,?);");
+
+ try
+ {
+ conn = this.getConnectionToScrubber();
+ ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+ int i=1;
+ ps.setString(i++, subjectId);
+ ps.setString(i++, tagName);
+ ps.setString(i++, tagValue);
+ ps.execute();
+ }
+ catch (SQLException e)
+ {
+ System.out.println("ERROR: failed inserting : " + subjectId + ":" + tagName +":"+ tagValue);
+ e.printStackTrace();
+ }
+ finally
+ {
+ close(ps, conn);
+ }
+ }
+}
Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/HumanAnnotationsDAO.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/PubDAO.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/PubDAO.java?rev=1500511&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/PubDAO.java (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/PubDAO.java Sun Jul 7 19:23:05 2013
@@ -0,0 +1,417 @@
+/*******************************************************************************
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ ******************************************************************************/
+/**
+ *
+ */
+package org.spin.scrubber.uima.dao;
+
+import com.mysql.jdbc.PreparedStatement;
+
+import java.sql.Connection;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * @author bf19
+ *
+ */
+public class PubDAO extends BaseDAO
+{
+ private Connection conn;
+
+ public PubDAO() throws Exception
+ {
+ conn = getConnectionToScrubber();
+ }
+
+ public Map<Integer,String> selectRandomPubContent() throws Exception
+ {
+ PreparedStatement ps = null;
+ Connection conn = null;
+ ResultSet rs = null;
+ Map<Integer, String> pubMap = new HashMap<Integer,String>();
+
+ StringBuilder sql = new StringBuilder("select distinct p.id, p.body ");
+ sql.append(" from pubs p right join keywords k on p.id = k.pub_id ");
+ sql.append(" where rand()>0.9 limit 10500; ");
+
+ try
+ {
+ conn = getConn();
+ ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+
+ rs = ps.executeQuery();
+
+ if(rs!=null)
+ {
+ while (rs.next())
+ {
+ pubMap.put(rs.getInt("id"), rs.getString("body"));
+ }
+ }
+ }
+ catch (SQLException e)
+ {
+ log.error("Failed selecting RANDOM pub set: " + e.getMessage());
+ e.printStackTrace();
+ }
+ finally
+ {
+ close(rs, ps, conn);
+ }
+
+ return pubMap;
+ }
+
+ public Map<Integer,String> selectAllPubContentById(int id) throws Exception
+ {
+ PreparedStatement ps = null;
+ Connection conn = null;
+ ResultSet rs = null;
+ Map<Integer, String> pubMap = new HashMap<Integer,String>();
+
+// StringBuilder sql = new StringBuilder("select p.id, p.body ");
+// sql.append(" from pubs p right join keywords k on p.id = k.pub_id where p.id = ? ");
+
+ StringBuilder sql = new StringBuilder("select p.id, p.body from pubs p where p.id = ? ");
+
+ try
+ {
+ conn = getConn();
+ ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+ ps.setInt(1, id);
+
+ rs = ps.executeQuery();
+
+ if(rs!=null)
+ {
+ while (rs.next())
+ {
+ pubMap.put(rs.getInt("id"), rs.getString("body"));
+ }
+ }
+ }
+ catch (SQLException e)
+ {
+ log.error("Failed selecting ALL pub set: " + e.getMessage());
+ e.printStackTrace();
+ }
+ finally
+ {
+ close(rs, ps);
+ }
+
+ return pubMap;
+ }
+
+ public List<Integer> selectAllPubIds() throws Exception
+ {
+ PreparedStatement ps = null;
+ Connection conn = null;
+ ResultSet rs = null;
+ List<Integer> results = new ArrayList<Integer>();
+
+// StringBuilder sql = new StringBuilder("select distinct p.id ");
+// sql.append(" from pubs p right join keywords k on p.id = k.pub_id ");
+
+ StringBuilder sql = new StringBuilder("select distinct p.id from pubs p");
+
+ try
+ {
+ conn = getConn();
+ ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+
+ rs = ps.executeQuery();
+
+ if(rs!=null)
+ {
+ while (rs.next())
+ {
+ results.add(rs.getInt("id"));
+ }
+ }
+ }
+ catch (SQLException e)
+ {
+ log.error("Failed selecting ALL pub IDs: " + e.getMessage());
+ e.printStackTrace();
+ }
+ finally
+ {
+ close(rs, ps);
+ }
+
+ return results;
+ }
+
+ public Map<Integer,String> selectPubContentByLikeKeywords(String[] keywordFragList) throws Exception
+ {
+ PreparedStatement ps = null;
+ Connection conn = null;
+ ResultSet rs = null;
+ Map<Integer, String> pubMap = new HashMap<Integer,String>();
+ List<Integer> pubIdList = selectPubIDByLikeKeywords(keywordFragList);
+
+ StringBuilder sql = new StringBuilder("select id, body from pubs where id in ( ");
+ for (Integer i : pubIdList)
+ {
+ sql.append(i+",");
+ }
+ sql.append(" 0);");
+
+ try
+ {
+ conn = getConn();
+ ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+
+ rs = ps.executeQuery();
+
+ if(rs!=null)
+ {
+ while (rs.next())
+ {
+ pubMap.put(rs.getInt("id"), rs.getString("body"));
+ }
+ }
+ }
+ catch (SQLException e)
+ {
+ log.error("Failed selecting pubs by LIKE keyword: " + e.getMessage());
+ }
+ finally
+ {
+ close(rs, ps, conn);
+ }
+
+ return pubMap;
+ }
+
+ public List<Integer> selectPubIDByLikeKeywords(String[] keywordFragList) throws Exception
+ {
+ PreparedStatement ps = null;
+ Connection conn = null;
+ ResultSet rs = null;
+ List<Integer> results = new ArrayList<Integer>();
+
+ StringBuilder sql = new StringBuilder("select distinct pub_id from keywords where keyword like ? ");
+ for (int k=1; k<keywordFragList.length; k++)
+ {
+ sql.append(" or keyword like ? ");
+ }
+
+ try
+ {
+ conn = getConn();
+ ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+ int i=1;
+ for (String s : keywordFragList)
+ {
+ ps.setString(i++, s);
+ }
+
+ rs = ps.executeQuery();
+
+ if(rs!=null)
+ {
+ while (rs.next())
+ {
+ results.add(rs.getInt("pub_id"));
+ }
+ }
+ }
+ catch (SQLException e)
+ {
+ log.error("Failed selecting pubs by LIKE keyword: " + e.getMessage());
+ }
+ finally
+ {
+ close(rs, ps);
+ }
+
+ return results;
+ }
+ public void insertAuthor(int pubId, String surname, String givenName) throws Exception
+ {
+ PreparedStatement ps = null;
+ Connection conn = null;
+ int updated = 0;
+ StringBuilder sql = new StringBuilder("insert into authors (pub_id, surname, given_name) values (?,?,?);");
+
+ try
+ {
+ conn = getConn();
+ ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+ int i=1;
+ ps.setInt(i++, pubId);
+ ps.setString(i++, surname);
+ ps.setString(i++, givenName);
+
+ updated = ps.executeUpdate();
+
+ if(updated!=1) { throw new SQLException("ERROR: db insert count="+updated + ". expected=1"); }
+ }
+ catch (SQLException e)
+ {
+ log.error("Failed inserting author: " + e.getMessage());
+ }
+ finally
+ {
+ close(ps);
+ }
+ }
+ public void insertRef(int pubId, String surname, String givenName) throws Exception
+ {
+ PreparedStatement ps = null;
+ Connection conn = null;
+ int updated = 0;
+ StringBuilder sql = new StringBuilder("insert into refs (pub_id, surname, given_name) values (?,?,?);");
+
+ try
+ {
+ conn = getConn();
+ ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+ int i=1;
+ ps.setInt(i++, pubId);
+ ps.setString(i++, surname);
+ ps.setString(i++, givenName);
+
+ updated = ps.executeUpdate();
+
+ if(updated!=1) { throw new SQLException("ERROR: db insert count="+updated + ". expected=1"); }
+ }
+ catch (SQLException e)
+ {
+ log.error("Failed inserting ref: " + e.getMessage());
+ }
+ finally
+ {
+ close(ps);
+ }
+ }
+ public void insertKeyword(int pubId, String keyword) throws Exception
+ {
+ PreparedStatement ps = null;
+ Connection conn = null;
+ int updated = 0;
+ StringBuilder sql = new StringBuilder("insert into keywords (pub_id, keyword) values (?,?);");
+
+ try
+ {
+ conn = getConn();
+ ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+ int i=1;
+ ps.setInt(i++, pubId);
+ ps.setString(i++, keyword);
+
+ updated = ps.executeUpdate();
+
+ if(updated!=1) { throw new SQLException("ERROR: db insert count="+updated + ". expected=1"); }
+ }
+ catch (SQLException e)
+ {
+ log.error("Failed inserting keyword: " + e.getMessage());
+ }
+ finally
+ {
+ close(ps);
+ }
+ }
+
+ public int selectNewestPub() throws Exception
+ {
+ PreparedStatement ps = null;
+ Connection conn = null;
+ ResultSet rs = null;
+ int id = 0;
+ StringBuilder sql = new StringBuilder("select max(id) id from pubs;");
+
+ try
+ {
+ conn = getConn();
+ ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+
+ rs = ps.executeQuery();
+
+ if(rs!=null)
+ {
+ while (rs.next())
+ {
+ id = rs.getInt("id");
+ }
+ }
+ }
+ catch (SQLException e)
+ {
+ log.error("Failed inserting keyword: " + e.getMessage());
+ }
+ finally
+ {
+ close(rs, ps);
+ }
+
+ return id;
+ }
+ public int insertPub(String journalTitle, String articleTitle, String pmc, String pmid, String doi, String body, String filepath) throws Exception
+ {
+ PreparedStatement ps = null;
+ Connection conn = null;
+ int updated = 0;
+ int pubId=0;
+ StringBuilder sql = new StringBuilder("insert into pubs (journal_title, article_title, pmc, pmid, doi, body, filepath) values (?,?,?,?,?,?,?);");
+
+ try
+ {
+ conn = getConn();
+ ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+ int i=1;
+ ps.setString(i++, journalTitle);
+ ps.setString(i++, articleTitle);
+ ps.setString(i++, pmc);
+ ps.setString(i++, pmid);
+ ps.setString(i++, doi);
+ ps.setString(i++, body);
+ ps.setString(i++, filepath);
+
+ updated = ps.executeUpdate();
+
+ if(updated!=1) { throw new SQLException("ERROR: db insert count="+updated + ". expected=1"); }
+
+ pubId = selectNewestPub();
+ }
+ catch (SQLException e)
+ {
+ log.error("Failed inserting pub: " + e.getMessage());
+ }
+ finally
+ {
+ close(ps);
+ }
+
+ return pubId;
+ }
+
+ public Connection getConn()
+ {
+ return conn;
+ }
+}
Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/PubDAO.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/TfDAO.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/TfDAO.java?rev=1500511&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/TfDAO.java (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/TfDAO.java Sun Jul 7 19:23:05 2013
@@ -0,0 +1,85 @@
+/*******************************************************************************
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ ******************************************************************************/
+package org.spin.scrubber.uima.dao;
+
+import com.mysql.jdbc.PreparedStatement;
+
+import java.sql.Connection;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.util.HashMap;
+import java.util.Map;
+
+//TODO: rename to TermFrequencyDAO
+public class TfDAO extends BaseDAO
+{
+
+	/**
+	 * Creates a map of lowercase token to number of occurrences, with and without the POS.
+	 * For each row of lookup_term_frequency three keys are accumulated:
+	 * the bare lowercased token, "token|pos", and "pos|&lt;pos&gt;".
+	 * The special key "totalPubCount" holds the total count over all rows
+	 * (later used for computing tf as count/total).
+	 *
+	 * @return accumulated term-frequency map; may be partial if a SQLException
+	 *         occurred mid-read (logged and swallowed - existing best-effort contract)
+	 * @throws Exception propagated from getConnectionToScrubber()
+	 */
+	public Map<String,Integer> selectPubTFMap() throws Exception
+	{
+		// Program to the java.sql interface; no need to cast to the MySQL
+		// driver's concrete PreparedStatement class.
+		java.sql.PreparedStatement ps = null;
+		Connection conn = null;
+		ResultSet rs = null;
+		Map<String,Integer> results = new HashMap<String,Integer>();
+		int total = 0;
+
+		String sql = "select * from lookup_term_frequency;";
+
+		try
+		{
+			conn = this.getConnectionToScrubber();
+			ps = conn.prepareStatement(sql);
+			rs = ps.executeQuery();
+
+			// executeQuery() never returns null per the JDBC contract, so no
+			// null check is needed on rs.
+			while (rs.next())
+			{
+				String token = rs.getString("token").toLowerCase();
+				String pos = rs.getString("pos");
+				int cnt = rs.getInt("cnt");
+				total += cnt;
+
+				increment(results, token, cnt);
+				increment(results, token + "|" + pos, cnt);
+				increment(results, "pos|" + pos, cnt);
+			}
+			// add in total count of all tokens - later used for computing tf (count/total)
+			results.put("totalPubCount", total);
+		}
+		catch (SQLException e)
+		{
+			// Log with the throwable so the stack trace goes to the log
+			// instead of raw stderr (was e.printStackTrace()).
+			log.error("Selecting all_pubs_tf: " + e.getMessage(), e);
+		}
+		finally
+		{
+			close(rs, ps, conn);
+		}
+
+		return results;
+	}
+
+	/** Adds cnt to the current count for key, treating a missing key as zero. */
+	private static void increment(Map<String,Integer> map, String key, int cnt)
+	{
+		Integer prev = map.get(key);
+		map.put(key, (prev == null) ? cnt : prev + cnt);
+	}
+}
Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/TfDAO.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/reader/FileSystemCollectionReader.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/reader/FileSystemCollectionReader.java?rev=1500511&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/reader/FileSystemCollectionReader.java (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/reader/FileSystemCollectionReader.java Sun Jul 7 19:23:05 2013
@@ -0,0 +1,211 @@
+/*******************************************************************************
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ ******************************************************************************/
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.spin.scrubber.uima.reader;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.CASException;
+import org.apache.uima.collection.CollectionException;
+import org.apache.uima.collection.CollectionReader_ImplBase;
+import org.apache.uima.examples.SourceDocumentInformation;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.DocumentAnnotation;
+import org.apache.uima.resource.ResourceConfigurationException;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.apache.uima.util.FileUtils;
+import org.apache.uima.util.Progress;
+import org.apache.uima.util.ProgressImpl;
+
+/**
+ * A simple collection reader that reads documents from a directory in the filesystem. It can be
+ * configured with the following parameters:
+ * <ul>
+ * <li><code>InputDirectory</code> - path to directory containing files</li>
+ * <li><code>Encoding</code> (optional) - character encoding of the input files</li>
+ * <li><code>Language</code> (optional) - language of the input documents</li>
+ * </ul>
+ *
+ *
+ */
+public class FileSystemCollectionReader extends CollectionReader_ImplBase {
+ /**
+ * Name of configuration parameter that must be set to the path of a directory containing input
+ * files.
+ */
+ public static final String PARAM_INPUTDIR = "InputDirectory";
+
+ /**
+ * Name of configuration parameter that contains the character encoding used by the input files.
+ * If not specified, the default system encoding will be used.
+ */
+ public static final String PARAM_ENCODING = "Encoding";
+
+ /**
+ * Name of optional configuration parameter that contains the language of the documents in the
+ * input directory. If specified this information will be added to the CAS.
+ */
+ public static final String PARAM_LANGUAGE = "Language";
+
+ /**
+ * Name of optional configuration parameter that indicates including
+ * the subdirectories (recursively) of the current input directory.
+ */
+ public static final String PARAM_SUBDIR = "BrowseSubdirectories";
+
+ private ArrayList<File> mFiles;
+
+ private String mEncoding;
+
+ private String mLanguage;
+
+ private Boolean mRecursive;
+
+ private int mCurrentIndex;
+
+ /**
+ * @see org.apache.uima.collection.CollectionReader_ImplBase#initialize()
+ */
+ public void initialize() throws ResourceInitializationException {
+ File directory = new File(((String) getConfigParameterValue(PARAM_INPUTDIR)).trim());
+ mEncoding = (String) getConfigParameterValue(PARAM_ENCODING);
+ mLanguage = (String) getConfigParameterValue(PARAM_LANGUAGE);
+ mRecursive = (Boolean) getConfigParameterValue(PARAM_SUBDIR);
+ if (null == mRecursive) { // could be null if not set, it is optional
+ mRecursive = Boolean.FALSE;
+ }
+ mCurrentIndex = 0;
+
+ // if input directory does not exist or is not a directory, throw exception
+ if (!directory.exists() || !directory.isDirectory()) {
+ throw new ResourceInitializationException(ResourceConfigurationException.DIRECTORY_NOT_FOUND,
+ new Object[] { PARAM_INPUTDIR, this.getMetaData().getName(), directory.getPath() });
+ }
+
+ // get list of files in the specified directory, and subdirectories if the
+ // parameter PARAM_SUBDIR is set to True
+ mFiles = new ArrayList<File>();
+ addFilesFromDir(directory);
+ }
+
+ /**
+ * This method adds files in the directory passed in as a parameter to mFiles.
+ * If mRecursive is true, it will include all files in all
+ * subdirectories (recursively), as well.
+ *
+ * @param dir
+ */
+ private void addFilesFromDir(File dir) {
+ File[] files = dir.listFiles();
+ for (int i = 0; i < files.length; i++) {
+ if (!files[i].isDirectory()) {
+ mFiles.add(files[i]);
+ } else if (mRecursive) {
+ addFilesFromDir(files[i]);
+ }
+ }
+ }
+
+ /**
+ * @see org.apache.uima.collection.CollectionReader#hasNext()
+ */
+ public boolean hasNext() {
+ return mCurrentIndex < mFiles.size();
+ }
+
+ /**
+ * @see org.apache.uima.collection.CollectionReader#getNext(org.apache.uima.cas.CAS)
+ */
+ public void getNext(CAS aCAS) throws IOException, CollectionException {
+ JCas jcas;
+ try {
+ jcas = aCAS.getJCas();
+ } catch (CASException e) {
+ throw new CollectionException(e);
+ }
+
+ // open input stream to file
+ File file = (File) mFiles.get(mCurrentIndex++);
+ String text = FileUtils.file2String(file, mEncoding);
+ // put document in CAS
+ jcas.setDocumentText(text);
+
+ // set language if it was explicitly specified as a configuration parameter
+ if (mLanguage != null) {
+ ((DocumentAnnotation) jcas.getDocumentAnnotationFs()).setLanguage(mLanguage);
+ }
+
+ // Also store location of source document in CAS. This information is critical
+ // if CAS Consumers will need to know where the original document contents are located.
+ // For example, the Semantic Search CAS Indexer writes this information into the
+ // search index that it creates, which allows applications that use the search index to
+ // locate the documents that satisfy their semantic queries.
+ SourceDocumentInformation srcDocInfo = new SourceDocumentInformation(jcas);
+ srcDocInfo.setUri(file.getAbsoluteFile().toURL().toString());
+ srcDocInfo.setOffsetInSource(0);
+ srcDocInfo.setDocumentSize((int) file.length());
+ srcDocInfo.setLastSegment(mCurrentIndex == mFiles.size());
+ srcDocInfo.addToIndexes();
+ }
+
+ /**
+ * @see org.apache.uima.collection.base_cpm.BaseCollectionReader#close()
+ */
+ public void close() throws IOException {
+ }
+
+ /**
+ * @see org.apache.uima.collection.base_cpm.BaseCollectionReader#getProgress()
+ */
+ public Progress[] getProgress() {
+ return new Progress[] { new ProgressImpl(mCurrentIndex, mFiles.size(), Progress.ENTITIES) };
+ }
+
+ /**
+ * Gets the total number of documents that will be returned by this collection reader. This is not
+ * part of the general collection reader interface.
+ *
+ * @return the number of documents in the collection
+ */
+ public int getNumberOfDocuments() {
+ return mFiles.size();
+ }
+
+}
Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/reader/FileSystemCollectionReader.java
------------------------------------------------------------------------------
svn:mime-type = text/plain