Posted to commits@ctakes.apache.org by br...@apache.org on 2013/07/07 21:23:07 UTC

svn commit: r1500511 [5/6] - in /ctakes/sandbox/ctakes-scrubber-deid/src: ./ main/ main/java/ main/java/org/ main/java/org/apache/ main/java/org/apache/uima/ main/java/org/apache/uima/examples/ main/java/org/spin/ main/java/org/spin/scrubber/ main/java...

Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/AnnotationsDAO.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/AnnotationsDAO.java?rev=1500511&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/AnnotationsDAO.java (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/AnnotationsDAO.java Sun Jul  7 19:23:05 2013
@@ -0,0 +1,505 @@
+/*******************************************************************************
+ *  Licensed to the Apache Software Foundation (ASF) under one
+ *  or more contributor license agreements.  See the NOTICE file
+ *  distributed with this work for additional information
+ *  regarding copyright ownership.  The ASF licenses this file
+ *  to you under the Apache License, Version 2.0 (the
+ *  "License"); you may not use this file except in compliance
+ *  with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ *  Unless required by applicable law or agreed to in writing,
+ *  software distributed under the License is distributed on an
+ *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ *  KIND, either express or implied.  See the License for the
+ *  specific language governing permissions and limitations
+ *  under the License.
+ ******************************************************************************/
+/**
+ * 
+ */
+package org.spin.scrubber.uima.dao;
+
+import com.mysql.jdbc.PreparedStatement;
+import org.spin.scrubber.beans.Annot;
+
+import java.sql.Connection;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.util.*;
+
+/**
+ * @author britt fitch bf19
+ * @see http://lists.mysql.com/mysql/202489
+ */
+public class AnnotationsDAO extends BaseDAO
+{
+	public AnnotationsDAO(String tableSuffix)
+	{
+		this.tableSuffix = tableSuffix;
+	}
+	
+    public List<Annot> selectPosWordAnnotations(String subjectId) throws Exception
+	{
+		PreparedStatement ps = null;
+		Connection conn = null;
+		ResultSet rs = null;
+		List<Annot> results = new ArrayList<Annot>();
+
+		StringBuilder sql = new StringBuilder("select * from machine_annotations"+tableSuffix+"  where filename_short = ? and annot_type_short = 'WordToken' and match_source = 'pos';");
+
+		try
+		{
+			conn = getConnectionToScrubber();
+			ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+			int i=1;
+			ps.setString(i++, subjectId);
+
+			rs = ps.executeQuery();
+
+		    if(rs!=null)
+		    {
+		    	while (rs.next())
+				{
+		    		Annot annot = new Annot();
+					annot.setAnnot_type_long(rs.getString("annot_type_long"));
+					annot.setAnnot_type_short(rs.getString("annot_type_short"));
+					annot.setEndIdx(rs.getInt("end_idx"));
+					annot.setFilename_long(rs.getString("filename_long"));
+					annot.setFilename_short(rs.getString("filename_short"));
+					annot.setId(rs.getInt("id"));
+					annot.setStartIdx(rs.getInt("start_idx"));
+					annot.setToken(rs.getString("token"));
+					annot.setMatch_source(rs.getString("match_source"));
+					annot.setMatch_value(rs.getString("match_value"));
+
+					results.add(annot);
+				}
+		    }
+		}
+		catch (SQLException e)
+		{
+			log.error("failed selecting from machine_annotations"+tableSuffix+" : ", e);
+		}
+		finally
+		{
+			close(rs, ps, conn);
+		}
+
+		return results;
+	}
+
+
+	public List<String> selectDistinctFilenames() throws Exception
+	{
+		PreparedStatement ps = null;
+		Connection conn = null;
+		ResultSet rs = null;
+		List<String> results = new ArrayList<String>();
+		
+		StringBuilder sql = new StringBuilder("select distinct filename_short from machine_annotations"+tableSuffix+" ;");
+					
+		try 
+		{
+			conn = getConnectionToScrubber();
+			ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+			
+			rs = ps.executeQuery();
+		    
+		    if(rs!=null)
+		    {
+		    	while (rs.next())
+				{
+		    		results.add(rs.getString("filename_short"));
+				}
+		    }
+		}
+		catch (SQLException e)
+		{
+			log.error("failed selecting machine_annotations"+tableSuffix+" .filename_short",e);
+		}
+		finally
+		{
+			close(rs, ps, conn);
+		}
+		
+		return results;
+	}
+	
+	/**
+	 * Select all annotations recorded for a single file.
+	 * @param filename short filename whose annotations to fetch
+	 * @return list of Annot objects for the given file
+	 * @throws Exception
+	 */
+	public List<Annot> selectAllAnnotByFilename(String filename) throws Exception
+	{
+		PreparedStatement ps = null;
+		Connection conn = null;
+		ResultSet rs = null;
+		List<Annot> results = new ArrayList<Annot>();
+		
+		StringBuilder sql = new StringBuilder("select match_source, match_value, filename_short, start_idx, end_idx from machine_annotations"+tableSuffix+" where filename_short = ?;");
+					
+		try 
+		{
+			conn = getConnectionToScrubber();
+			ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+			ps.setString(1, filename);
+			
+			rs = ps.executeQuery();
+		    
+		    if(rs!=null)
+		    {
+		    	while (rs.next())
+				{
+		    		Annot a = new Annot();
+		    		a.setMatch_source(rs.getString("match_source"));
+		    		a.setMatch_value(rs.getString("match_value"));
+		    		a.setStartIdx(rs.getInt("start_idx"));
+		    		a.setEndIdx(rs.getInt("end_idx"));
+		    		a.setFilename_short(rs.getString("filename_short"));
+		    		
+		    		results.add(a);
+				}
+		    }
+		}
+		catch (SQLException e)
+		{
+			log.error("Failed selecting for Annot List: ", e);
+		}
+		finally
+		{
+			close(rs, ps, conn);
+		}
+		
+		return results;
+	}
+	
+	/**
+	 * Return a map of lists: key is the short filename, each list contains the Annot objects for that file.
+	 * @return map from filename_short to that file's annotations
+	 * @throws Exception
+	 */
+	@Deprecated
+	public Map<String, List<Annot>> selectAllAnnotByFilename() throws Exception
+	{
+		PreparedStatement ps = null;
+		Connection conn = null;
+		ResultSet rs = null;
+		Map<String, List<Annot>> results = new HashMap<String, List<Annot>>();
+		
+		StringBuilder sql = new StringBuilder("select match_source, match_value, filename_short, start_idx, end_idx from machine_annotations"+tableSuffix+" ;");
+					
+		try 
+		{
+			conn = getConnectionToScrubber();
+			ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+			
+			rs = ps.executeQuery();
+		    
+		    if(rs!=null)
+		    {
+		    	while (rs.next())
+				{
+		    		Annot a = new Annot();
+		    		a.setMatch_source(rs.getString("match_source"));
+		    		a.setMatch_value(rs.getString("match_value"));
+		    		a.setStartIdx(rs.getInt("start_idx"));
+		    		a.setEndIdx(rs.getInt("end_idx"));
+		    		a.setFilename_short(rs.getString("filename_short"));
+		    		
+		    		if (results.get(a.getFilename_short())==null)
+		    		{
+		    			List<Annot> lst = new ArrayList<Annot>();
+		    			lst.add(a);
+		    			results.put(a.getFilename_short(), lst);
+		    		}
+		    		else
+		    		{
+		    			results.get(a.getFilename_short()).add(a);
+		    		}
+		    		
+				}
+		    }
+		}
+		catch (SQLException e)
+		{
+			log.error("Failed selecting for Annot map: ", e);
+		}
+		finally
+		{
+			close(rs, ps, conn);
+		}
+		
+		return results;
+	}
+
+	/**
+	 * select all KnownPHI for a given file.
+	 * data structure will be used to force classification of all tokens matching
+	 * a piece of known PHI for a given file.
+	 *
+	 * @param filename
+	 * @return Map containing token to annot link
+	 * @throws Exception
+	 */
+	public Map<String, Annot> selectKnownPHIAnnotations(String filename) throws Exception
+	{
+		PreparedStatement ps = null;
+		Connection conn = null;
+		ResultSet rs = null;
+		Map<String, Annot> results = new HashMap<String, Annot>();
+
+        //todo: isn't this supposed to be human_annotations_*?
+		StringBuilder sql = new StringBuilder("select * from machine_annotations"+tableSuffix+"  where filename_short = ? and annot_type_short = 'KnownPHI';");
+
+		try
+		{
+			conn = getConnectionToScrubber();
+			ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+			int i=1;
+			ps.setString(i++, filename);
+
+			rs = ps.executeQuery();
+
+		    if(rs!=null)
+		    {
+		    	while (rs.next())
+				{
+		    		String token = rs.getString("token");
+
+		    		Annot annot = new Annot();
+					annot.setAnnot_type_long(rs.getString("annot_type_long"));
+					annot.setAnnot_type_short(rs.getString("annot_type_short"));
+					annot.setEndIdx(rs.getInt("end_idx"));
+					annot.setFilename_long(rs.getString("filename_long"));
+					annot.setFilename_short(rs.getString("filename_short"));
+					annot.setId(rs.getInt("id"));
+					annot.setStartIdx(rs.getInt("start_idx"));
+					annot.setToken(token);
+					annot.setMatch_source(rs.getString("match_source"));
+					annot.setMatch_value(rs.getString("match_value"));
+
+					results.put(token.toLowerCase(),annot);
+				}
+		    }
+		}
+		catch (SQLException e)
+		{
+			log.error("failed selecting from machine_annotations"+tableSuffix+" : ",e);
+		}
+		finally
+		{
+			close(rs, ps, conn);
+		}
+
+		return results;
+	}
+
+	public List<Annot> selectNumAnnotations(String subjectId) throws Exception
+	{
+		PreparedStatement ps = null;
+		Connection conn = null;
+		ResultSet rs = null;
+		List<Annot> results = new ArrayList<Annot>();
+
+		StringBuilder sql = new StringBuilder("select * from machine_annotations"+tableSuffix+"  where filename_short = ? and annot_type_short = 'NumToken';");
+
+		try
+		{
+			conn = this.getConnectionToScrubber();
+			ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+			int i=1;
+			ps.setString(i++, subjectId);
+
+			rs = ps.executeQuery();
+
+		    if(rs!=null)
+		    {
+		    	while (rs.next())
+				{
+		    		Annot annot = new Annot();
+					annot.setAnnot_type_long(rs.getString("annot_type_long"));
+					annot.setAnnot_type_short(rs.getString("annot_type_short"));
+					annot.setEndIdx(rs.getInt("end_idx"));
+					annot.setFilename_long(rs.getString("filename_long"));
+					annot.setFilename_short(rs.getString("filename_short"));
+					annot.setId(rs.getInt("id"));
+					annot.setStartIdx(rs.getInt("start_idx"));
+					annot.setToken(rs.getString("token"));
+					annot.setMatch_source(rs.getString("match_source"));
+					annot.setMatch_value(rs.getString("match_value"));
+
+					results.add(annot);
+				}
+		    }
+		}
+		catch (SQLException e)
+		{
+			log.error("Failed selecting from machine_annotations"+tableSuffix+" : ",e);
+		}
+		finally
+		{
+			close(rs, ps, conn);
+		}
+
+		return results;
+	}
+
+    //TODO: DEPRECATED !
+
+	@Deprecated
+	public Map<String, String> selectFeaturesByFilenameIdx(String filename, int startIdx) throws Exception
+	{
+		PreparedStatement ps = null;
+		Connection conn = null;
+		ResultSet rs = null;
+		Map<String, String> results = new HashMap<String, String>();
+		
+		StringBuilder sql = new StringBuilder("select match_source, match_value from machine_annotations"+tableSuffix+"  where filename_short=? and ? between start_idx and end_idx;");
+					
+		try 
+		{
+			conn = this.getConnectionToScrubber();
+			ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+			ps.setString(1, filename);
+			ps.setInt(2, startIdx);
+			
+			rs = ps.executeQuery();
+		    
+		    if(rs!=null)
+		    {
+		    	while (rs.next())
+				{
+		    		results.put(rs.getString("match_value"),rs.getString("match_source"));
+				}
+		    }
+		}
+		catch (SQLException e)
+		{
+			System.out.println("ERROR: failed selecting features for subject_id/idx: " + filename + "/" + startIdx + " " + e.getMessage());
+			e.printStackTrace();
+		}
+		finally
+		{
+			close(rs, ps, conn);
+		}
+		
+		return results;
+	}
+
+    @Deprecated
+	public List<String> selectDistinctPOS() throws Exception
+	{
+		PreparedStatement ps = null;
+		Connection conn = null;
+		ResultSet rs = null;
+		List<String> results = new ArrayList<String>();
+		
+		StringBuilder sql = new StringBuilder("select distinct pos from machine_annotations"+tableSuffix+" ;");
+					
+		try 
+		{
+			conn = this.getConnectionToScrubber();
+			ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+			
+			rs = ps.executeQuery();
+		    
+		    if(rs!=null)
+		    {
+		    	while (rs.next())
+				{
+		    		results.add(rs.getString("pos"));
+				}
+		    }
+		}
+		catch (SQLException e)
+		{
+			System.out.println("ERROR: failed selecting machine_annotations"+tableSuffix+" .pos: " + e.getMessage());
+			e.printStackTrace();
+		}
+		finally
+		{
+			close(rs, ps, conn);
+		}
+		
+		return results;
+	}
+
+    @Deprecated
+	public Set<String> selectDistinctFeatures() throws Exception
+	{
+		PreparedStatement ps = null;
+		Connection conn = null;
+		ResultSet rs = null;
+		Set<String> results = new HashSet<String>();
+		
+		StringBuilder sql = new StringBuilder("select concat_ws('|', match_source, match_value) as feature from machine_annotations"+tableSuffix+" ;");
+					
+		try 
+		{
+			conn = this.getConnectionToScrubber();
+			ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+			
+			rs = ps.executeQuery();
+		    
+		    if(rs!=null)
+		    {
+		    	while (rs.next())
+				{
+		    		results.add(rs.getString("feature"));
+				}
+		    }
+		}
+		catch (SQLException e)
+		{
+			System.out.println("ERROR: failed selecting machine_annotations"+tableSuffix+"  features: " + e.getMessage());
+			e.printStackTrace();
+		}
+		finally
+		{
+			close(rs, ps, conn);
+		}
+		
+		return results;
+	}
+
+    @Deprecated
+	public Set<String> selectDistinctOntology() throws Exception
+	{
+		PreparedStatement ps = null;
+		Connection conn = null;
+		ResultSet rs = null;
+		Set<String> results = new HashSet<String>();
+		
+		StringBuilder sql = new StringBuilder("select distinct match_source as feature from machine_annotations"+tableSuffix+" ;");
+					
+		try 
+		{
+			conn = this.getConnectionToScrubber();
+			ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+			
+			rs = ps.executeQuery();
+		    
+		    if(rs!=null)
+		    {
+		    	while (rs.next())
+				{
+		    		results.add(rs.getString("feature"));
+				}
+		    }
+		}
+		catch (SQLException e)
+		{
+			System.out.println("ERROR: failed selecting machine_annotations"+tableSuffix+"  ontologies: " + e.getMessage());
+			e.printStackTrace();
+		}
+		finally
+		{
+			close(rs, ps, conn);
+		}
+		
+		return results;
+	}
+
+}

Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/AnnotationsDAO.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain
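
For orientation, a minimal sketch of how AnnotationsDAO is typically driven. The
driver class name and the "_train" table suffix are illustrative only, and
ScrubberProperties is assumed to point at a reachable scrubber database:

    import java.util.List;
    import java.util.Map;

    import org.spin.scrubber.beans.Annot;
    import org.spin.scrubber.uima.dao.AnnotationsDAO;

    public class AnnotationsDAOExample
    {
        public static void main(String[] args) throws Exception
        {
            // "_train" reads machine_annotations_train; "" would read machine_annotations.
            AnnotationsDAO dao = new AnnotationsDAO("_train");

            // Walk every file that has machine annotations.
            for (String filename : dao.selectDistinctFilenames())
            {
                // Known PHI, keyed by lower-cased token, used to force classification.
                Map<String, Annot> knownPhi = dao.selectKnownPHIAnnotations(filename);

                // All annotations for the file, e.g. to build per-token features.
                List<Annot> annots = dao.selectAllAnnotByFilename(filename);

                System.out.println(filename + ": " + annots.size() + " annotations, "
                        + knownPhi.size() + " known PHI tokens");
            }
        }
    }

Each call opens and closes its own connection through BaseDAO, so the loop needs
no explicit cleanup.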

Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/AnnotationsPubsDAO.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/AnnotationsPubsDAO.java?rev=1500511&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/AnnotationsPubsDAO.java (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/AnnotationsPubsDAO.java Sun Jul  7 19:23:05 2013
@@ -0,0 +1,141 @@
+/*******************************************************************************
+ *  Licensed to the Apache Software Foundation (ASF) under one
+ *  or more contributor license agreements.  See the NOTICE file
+ *  distributed with this work for additional information
+ *  regarding copyright ownership.  The ASF licenses this file
+ *  to you under the Apache License, Version 2.0 (the
+ *  "License"); you may not use this file except in compliance
+ *  with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ *  Unless required by applicable law or agreed to in writing,
+ *  software distributed under the License is distributed on an
+ *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ *  KIND, either express or implied.  See the License for the
+ *  specific language governing permissions and limitations
+ *  under the License.
+ ******************************************************************************/
+/**
+ * 
+ */
+package org.spin.scrubber.uima.dao;
+
+import com.mysql.jdbc.PreparedStatement;
+
+import java.sql.Connection;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * @author bf19
+ * @see http://lists.mysql.com/mysql/202489
+ */
+public class AnnotationsPubsDAO extends BaseDAO
+{
+    public void insertPubsPOS(String pos, int posCnt) throws Exception
+    {
+        PreparedStatement ps = null;
+        Connection conn = null;
+
+        StringBuilder sql = new StringBuilder("insert into pubs_pos (pos, pos_cnt) values (?,?); ");
+
+        try
+        {
+            conn = getConnectionToScrubber();
+            ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+            ps.setString(1, pos);
+            ps.setInt(2, posCnt);
+            int updated = ps.executeUpdate();
+
+            log.info("inserted "  + updated + " rows to pubs_pos.");
+        }
+        catch (SQLException e)
+        {
+            log.error("Failed inserting annotations.pubs_pos",e);
+            e.printStackTrace();
+        }
+        finally
+        {
+            close(ps, conn);
+        }
+    }
+
+	public Map<String,Integer> selectDistinctPOS(Map<String,Integer> pubPosMap, String filename) throws Exception
+	{
+		PreparedStatement ps = null;
+		Connection conn = null;
+		ResultSet rs = null;
+		
+		StringBuilder sql = new StringBuilder("select count(*) cnt, match_value from machine_annotations_pubs where filename_short=? and match_source='pos' group by match_value;");
+					
+		try 
+		{
+			conn = getConnectionToScrubber();
+			ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+			ps.setString(1, filename);
+			rs = ps.executeQuery();
+		    
+		    if(rs!=null)
+		    {
+		    	while (rs.next())
+				{
+		    		String pos = rs.getString("match_value");
+		    		Integer cnt = rs.getInt("cnt");
+		    		
+		    		pubPosMap.put(pos, (pubPosMap.containsKey(pos) ? pubPosMap.get(pos) + cnt : cnt));
+				}
+		    }
+		}
+		catch (SQLException e)
+		{
+			log.error("Failed selecting annotations.pos ",e);
+		}
+		finally
+		{
+			close(rs, ps, conn);
+		}
+		
+		return pubPosMap;
+	}
+	
+
+	public List<String> selectDistinctFilenameShort() throws Exception
+	{
+		PreparedStatement ps = null;
+		Connection conn = null;
+		ResultSet rs = null;
+		List<String> results = new ArrayList<String>();
+		
+		StringBuilder sql = new StringBuilder("select distinct filename_short from machine_annotations_pubs;");
+					
+		try 
+		{
+			conn = getConnectionToScrubber();
+			ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+			
+			rs = ps.executeQuery();
+		    
+		    if(rs!=null)
+		    {
+		    	while (rs.next())
+				{
+		    		results.add(rs.getString("filename_short"));
+				}
+		    }
+		}
+		catch (SQLException e)
+		{
+			log.error("Failed selecting distinct filename_short",e);
+		}
+		finally
+		{
+			close(rs, ps, conn);
+		}
+		
+		return results;
+	}
+}

Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/AnnotationsPubsDAO.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain
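
selectDistinctPOS() above folds one file's part-of-speech counts into the map it
is handed, so corpus-wide counts accumulate across calls. A sketch of that
driving loop; pairing it with insertPubsPOS() to persist the totals is an
assumption, not something this commit shows:

    import java.util.HashMap;
    import java.util.Map;

    import org.spin.scrubber.uima.dao.AnnotationsPubsDAO;

    public class PubsPosCountExample
    {
        public static void main(String[] args) throws Exception
        {
            AnnotationsPubsDAO dao = new AnnotationsPubsDAO();

            // Accumulate POS counts across every publication file.
            Map<String, Integer> posCounts = new HashMap<String, Integer>();
            for (String filename : dao.selectDistinctFilenameShort())
            {
                posCounts = dao.selectDistinctPOS(posCounts, filename);
            }

            // Persist the corpus-wide totals, one row per POS tag.
            for (Map.Entry<String, Integer> e : posCounts.entrySet())
            {
                dao.insertPubsPOS(e.getKey(), e.getValue());
            }
        }
    }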

Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/BaseDAO.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/BaseDAO.java?rev=1500511&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/BaseDAO.java (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/BaseDAO.java Sun Jul  7 19:23:05 2013
@@ -0,0 +1,192 @@
+/*******************************************************************************
+ *  Licensed to the Apache Software Foundation (ASF) under one
+ *  or more contributor license agreements.  See the NOTICE file
+ *  distributed with this work for additional information
+ *  regarding copyright ownership.  The ASF licenses this file
+ *  to you under the Apache License, Version 2.0 (the
+ *  "License"); you may not use this file except in compliance
+ *  with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ *  Unless required by applicable law or agreed to in writing,
+ *  software distributed under the License is distributed on an
+ *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ *  KIND, either express or implied.  See the License for the
+ *  specific language governing permissions and limitations
+ *  under the License.
+ ******************************************************************************/
+/**
+ * 
+ */
+package org.spin.scrubber.uima.dao;
+
+import com.mysql.jdbc.PreparedStatement;
+import org.apache.log4j.Logger;
+import org.spin.scrubber.ScrubberProperties;
+
+import java.sql.*;
+
+/**
+ * @author bf19
+ *
+ */
+public class BaseDAO
+{
+    protected static Logger log =  Logger.getLogger(BaseDAO.class);
+    protected String tableSuffix; 
+
+	public static Connection getConnectionToScrubber() throws Exception
+	{
+		return getConnection(ScrubberProperties.getDbDriver(), ScrubberProperties.getDbURI(), ScrubberProperties.getDbUser(), ScrubberProperties.getDbPassword());
+	}
+	
+	public static Connection getConnection(String driver, String uri, String user, String pw) throws SQLException, ClassNotFoundException
+	{
+		Connection conn;
+		  
+		try 
+		{
+			Class.forName(driver);
+			conn = DriverManager.getConnection(uri, user, pw);
+		}
+        catch (ClassNotFoundException e)
+        {
+            log.error("ERROR: could not load driver: " + e.getMessage());
+            throw e;
+        }
+		catch (SQLException e)
+		{
+			log.error("ERROR: unable to get db conn: " + e.getMessage());
+			throw e;
+		}
+		return conn;		  
+	}
+
+    /**
+     * Log and swallow exceptions here:
+     * dropping indexes is not a requirement, it just speeds up the process.
+     *
+     * In general it is bad practice to embed parameters directly into SQL, but identifiers cannot
+     * be bound with the normal ps.setString() option, which would turn them into quoted strings.
+     * (A caller-side identifier check is sketched after this file.)
+     *
+     * @param table drop an index from this table
+     * @param index name of index to drop
+     * @throws Exception
+     */
+    public static void dropIndex(String table, String index) throws Exception
+    {
+        Connection conn      = getConnectionToScrubber();
+        PreparedStatement ps = null;
+        StringBuilder sql    = new StringBuilder("alter table "+table + " drop index "+index+";");
+
+        try
+        {
+            ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+            ps.execute();
+        }
+        catch (SQLException e)
+        {
+            log.warn("unable to drop index " + index + " on table "+ table);
+        }
+        finally
+        {
+            if (ps!=null)
+            {
+                ps.close();
+            }
+            if (conn!=null)
+            {
+                conn.close();
+            }
+        }
+    }
+
+    public void runPreparedSQL(String sql) throws Exception
+	{
+		Connection conn = getConnectionToScrubber();
+		PreparedStatement ps = null;
+
+		try
+		{
+			ps = (PreparedStatement) conn.prepareStatement(sql);
+			ps.execute();
+		}
+		catch (SQLException e)
+		{
+			e.printStackTrace();
+			throw e;
+		}
+		finally
+		{
+			if (ps!=null)
+			{
+				ps.close();
+			}
+			if (conn!=null)
+			{
+				conn.close();
+			}
+		}
+	}
+
+	public static void closeConnection(Connection conn)
+	{
+		close(conn);
+	}
+	public static void closeRSPS(ResultSet rs, Statement stmt)
+	{
+		close(rs, stmt);
+	}
+	
+	public static void close(Connection conn)
+    {
+    	if (conn != null) 
+    	{
+    		try 
+    		{
+		        conn.close();    		    
+    		} 
+    		catch (Exception e) 
+    		{
+    			log.error(e.getMessage(), e);
+    		}
+    	}
+    }
+
+	public static void close(ResultSet rs)
+    {
+        if (rs != null) {
+            try {
+                rs.close();
+            } catch (Exception e) {
+                log.error(e.getMessage(), e);
+            }
+        }
+    }
+    
+    public static void close(Statement stmt) {
+        if (stmt != null) {
+            try {
+                stmt.close();
+            } catch (Exception e) {
+                log.error(e.getMessage(), e);
+            }
+        }
+    }
+    
+    public static void close(ResultSet rs, Statement stmt) {
+    	close(rs);
+    	close(stmt);
+    }
+    
+    public static void close(Statement stmt, Connection conn) {
+        close(stmt);
+        close(conn);
+    }
+    
+    public static void close(ResultSet rs, Statement stmt, Connection conn) {
+        close(rs);
+        close(stmt);
+        close(conn);
+    }
+}

Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/BaseDAO.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain
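
Because dropIndex() must splice the table and index names directly into the DDL
(JDBC can bind values but not identifiers), callers may want to vet those names
first. A hedged sketch of such a check; the helper class and the pattern are
illustrative, not part of this commit:

    import java.util.regex.Pattern;

    public final class SqlIdentifiers
    {
        // Conservative whitelist: letters, digits and underscore, as in the table names above.
        private static final Pattern SAFE = Pattern.compile("[A-Za-z0-9_]+");

        private SqlIdentifiers() {}

        public static String requireSafe(String identifier)
        {
            if (identifier == null || !SAFE.matcher(identifier).matches())
            {
                throw new IllegalArgumentException("unsafe SQL identifier: " + identifier);
            }
            return identifier;
        }
    }

    // usage: BaseDAO.dropIndex(SqlIdentifiers.requireSafe(table), SqlIdentifiers.requireSafe(index));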

Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/FeatureMatrixDAO.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/FeatureMatrixDAO.java?rev=1500511&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/FeatureMatrixDAO.java (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/FeatureMatrixDAO.java Sun Jul  7 19:23:05 2013
@@ -0,0 +1,815 @@
+/*******************************************************************************
+ *  Licensed to the Apache Software Foundation (ASF) under one
+ *  or more contributor license agreements.  See the NOTICE file
+ *  distributed with this work for additional information
+ *  regarding copyright ownership.  The ASF licenses this file
+ *  to you under the Apache License, Version 2.0 (the
+ *  "License"); you may not use this file except in compliance
+ *  with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ *  Unless required by applicable law or agreed to in writing,
+ *  software distributed under the License is distributed on an
+ *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ *  KIND, either express or implied.  See the License for the
+ *  specific language governing permissions and limitations
+ *  under the License.
+ ******************************************************************************/
+package org.spin.scrubber.uima.dao;
+
+import com.mysql.jdbc.PreparedStatement;
+import org.spin.scrubber.beans.CaseFeature;
+
+import java.sql.Connection;
+import java.sql.ResultSet;
+import java.util.ArrayList;
+import java.util.List;
+
+public class FeatureMatrixDAO extends BaseDAO
+{
+	Connection conn = null;
+
+	public FeatureMatrixDAO(String tableSuffix) throws Exception
+	{
+		log.info("Constructing FeatureMatrixDAO...");
+		
+		this.tableSuffix = tableSuffix;
+        conn = getConnectionToScrubber();
+	}
+
+    public static void main(String[] args) throws Exception
+    {
+        if(args.length != 1)
+        {
+            //TODO: refactor
+            System.out.println("Usage: ");
+            System.out.println("FeatureMatrixDAO tableSuffix \r\n");
+
+            System.out.println("Example: ");
+            System.out.println("FeatureMatrixDAO");
+            System.out.println("FeatureMatrixDAO _train");
+            System.out.println("FeatureMatrixDAO _test");
+        }
+        else
+        {
+            String tableSuffix = args[0];
+            FeatureMatrixDAO dao = new FeatureMatrixDAO(tableSuffix);
+
+            dao.selectDataSetTrain();
+        }
+    }
+
+	public List<String> selectDataSetTest()
+	{
+		List<String> results = new ArrayList<String>();
+		PreparedStatement ps = null;
+		ResultSet rs = null;
+		
+		StringBuilder sql = new StringBuilder();
+		sql.append(" SELECT CONCAT_WS(',', id, ");
+		sql.append(" pos,");
+		sql.append(" pos_bin, ");
+		sql.append(" has_capital,");
+		sql.append(" cnt_regex_phon, cnt_regex_date, cnt_regex_age, cnt_regex_id, cnt_regex_pat, cnt_regex_doc, cnt_regex_loc, cnt_regex_hosp, ");
+		sql.append(" cnt_priv, cnt_hosp, cnt_name,");
+		sql.append(" cnt_dict_costar, cnt_dict_hl7v25, cnt_dict_hl7v30, cnt_dict_icd10cm, cnt_dict_icd10pcs, cnt_dict_icd9cm, cnt_dict_lnc, cnt_dict_msh, cnt_dict_rxnorm, cnt_dict_snomedct, ");
+		sql.append(" cnt_ham_w_pos*10e6, ");
+		sql.append(" cnt_ham_wo_pos*10e6,");
+		sql.append(" phi_type) as record ");
+		sql.append(" FROM feature_matrix"+tableSuffix+" ");
+		sql.append(" where pos is not null ");		//required to avoid pulling in knownPHI tokens from xml header
+
+		try 
+		{
+			ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+						
+			rs = ps.executeQuery();
+			
+			if (rs!=null)
+			{
+				while(rs.next())
+				{
+					results.add(rs.getString("record"));
+				}
+			}
+		}
+		catch (Exception e)
+		{
+			log.error("ERROR: failed selecting TEST data set: ", e);
+		}
+		finally
+		{
+			close(rs, ps);
+		}
+		
+		return results;
+	}
+	
+	public List<String> selectDataSetTrain()
+	{
+		List<String> results = new ArrayList<String>();
+		
+		//get all positive cases
+		results.addAll(selectDataSetTrainPositive());
+		
+		//get an equal number of negative cases
+//		results.addAll(selectDataSetTrainNegative(results.size()));
+        results.addAll(selectDataSetTrainNegative(5000*1000)); //TODO: refactor
+		
+		return results;
+	}
+	
+	private List<String> selectDataSetTrainPositive()
+	{
+		List<String> results = new ArrayList<String>();
+		PreparedStatement ps = null;
+		ResultSet rs = null;
+		
+		//positive case 
+		StringBuilder sql = new StringBuilder();
+		sql.append(" SELECT CONCAT_WS(',', id, ");
+		sql.append(" pos,");
+		sql.append(" pos_bin, ");
+		sql.append(" has_capital,");
+		sql.append(" cnt_regex_phon, cnt_regex_date, cnt_regex_age, cnt_regex_id, cnt_regex_pat, cnt_regex_doc, cnt_regex_loc, cnt_regex_hosp, ");
+		sql.append(" cnt_priv, cnt_hosp, cnt_name,");
+		sql.append(" cnt_dict_costar, cnt_dict_hl7v25, cnt_dict_hl7v30, cnt_dict_icd10cm, cnt_dict_icd10pcs, cnt_dict_icd9cm, cnt_dict_lnc, cnt_dict_msh, cnt_dict_rxnorm, cnt_dict_snomedct, ");
+		sql.append(" cnt_ham_w_pos*10e6, ");
+		sql.append(" cnt_ham_wo_pos*10e6,");
+		sql.append(" phi_type) as record ");
+		sql.append(" FROM feature_matrix"+tableSuffix+" ");
+		sql.append(" where is_phi = 1 ;");
+		
+		try 
+		{
+			ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+						
+			rs = ps.executeQuery();
+			
+			if (rs!=null)
+			{
+				while(rs.next())
+				{
+					results.add(rs.getString("record"));
+				}
+			}
+		}
+		catch (Exception e)
+		{
+			log.error("ERROR: failed selecting TEST data set (positive): ", e);
+		}
+		finally
+		{
+			close(rs, ps);
+		}
+		
+		return results;
+	}
+	
+	private List<String> selectDataSetTrainNegative(int limit)
+	{
+		List<String> results = new ArrayList<String>();
+		PreparedStatement ps = null;
+		ResultSet rs = null;
+		
+		//negative case 
+		StringBuilder sql = new StringBuilder();
+		sql.append(" SELECT CONCAT_WS(',', id, ");
+		sql.append(" pos,");
+		sql.append(" pos_bin, ");
+		sql.append(" has_capital,");
+		sql.append(" cnt_regex_phon, cnt_regex_date, cnt_regex_age, cnt_regex_id, cnt_regex_pat, cnt_regex_doc, cnt_regex_loc, cnt_regex_hosp, ");
+		sql.append(" cnt_priv, cnt_hosp, cnt_name,");
+		sql.append(" cnt_dict_costar, cnt_dict_hl7v25, cnt_dict_hl7v30, cnt_dict_icd10cm, cnt_dict_icd10pcs, cnt_dict_icd9cm, cnt_dict_lnc, cnt_dict_msh, cnt_dict_rxnorm, cnt_dict_snomedct, ");
+		sql.append(" cnt_ham_w_pos*10e6, ");
+		sql.append(" cnt_ham_wo_pos*10e6,");
+		sql.append(" phi_type) as record ");
+		sql.append(" FROM feature_matrix"+tableSuffix+" ");
+		sql.append(" where is_phi = 0 ");
+		sql.append(" and phi_type = 'NA' "); 		//required to avoid pulling in knownPHI tokens from xml header
+		sql.append(" and classified_as = 'NA' ");	//required to avoid pulling in knownPHI tokens from xml header
+		sql.append(" order by rand() ");
+		sql.append(" limit "+limit+" ;");
+		
+		try 
+		{
+			ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+						
+			rs = ps.executeQuery();
+			
+			if (rs!=null)
+			{
+				while(rs.next())
+				{
+					results.add(rs.getString("record"));
+				}
+			}
+		}
+		catch (Exception e)
+		{
+			log.error("ERROR: failed selecting TEST data set (negative): ", e);
+		}
+		finally
+		{
+			close(rs, ps);
+		}
+		
+		return results;
+	}
+	
+	public void insertCaseFeatureBatch(List<CaseFeature> list)
+	{
+		PreparedStatement ps = null;
+		StringBuilder sql = new StringBuilder("insert into feature_matrix"+tableSuffix+"  ");
+		sql.append("(token, start_idx, end_idx, filename_short, pos, pos_bin, has_capital, ");
+		sql.append("cnt_regex_phon, cnt_regex_date, cnt_regex_age, cnt_regex_id, cnt_regex_pat, cnt_regex_doc, cnt_regex_loc, cnt_regex_hosp, ");
+		sql.append("cnt_priv, cnt_hosp, cnt_name, ");
+		sql.append("cnt_dict_costar, cnt_dict_hl7v25, cnt_dict_hl7v30, cnt_dict_icd10cm, cnt_dict_icd10pcs, cnt_dict_icd9cm, cnt_dict_lnc, cnt_dict_msh, cnt_dict_rxnorm, cnt_dict_snomedct, ");
+		sql.append("cnt_ham_w_pos, cnt_ham_wo_pos, is_phi, phi_type, classified_as) ");
+		sql.append("values (?,?,?,?,?,?,?,?,?,?,  ?,?,?,?,?,?,?,?,?,?,   ?,?,?,?,?,?,?,?,?,?,  ?,?,?) ");
+		sql.append(";");
+		
+		try 
+		{
+			ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+			for (CaseFeature cf : list)
+			{
+				int i=1;
+				ps.setString(i++, cf.getToken());
+				ps.setInt(i++, cf.getStartIdx());
+				ps.setInt(i++, cf.getEndIdx());
+				ps.setString(i++, cf.getFilename_short());
+				ps.setString(i++, cf.getPos());
+				ps.setString(i++, cf.getPos_bin());
+				ps.setInt(i++, cf.getHas_capital());
+				
+				ps.setInt(i++, cf.getCnt_regex_phon());
+				ps.setInt(i++, cf.getCnt_regex_date());
+				ps.setInt(i++, cf.getCnt_regex_age());
+				ps.setInt(i++, cf.getCnt_regex_id());
+				ps.setInt(i++, cf.getCnt_regex_pat());
+				ps.setInt(i++, cf.getCnt_regex_doc());
+				ps.setInt(i++, cf.getCnt_regex_loc());
+				ps.setInt(i++, cf.getCnt_regex_hosp());
+				
+				ps.setInt(i++, cf.getCnt_priv());
+				ps.setInt(i++, cf.getCnt_hosp());
+				ps.setInt(i++, cf.getCnt_name());
+				
+				ps.setInt(i++, cf.getCnt_dict_costar());
+				ps.setInt(i++, cf.getCnt_dict_hl7v25());
+				ps.setInt(i++, cf.getCnt_dict_hl7v30());
+				ps.setInt(i++, cf.getCnt_dict_icd10cm());
+				ps.setInt(i++, cf.getCnt_dict_icd10pcs());
+				ps.setInt(i++, cf.getCnt_dict_icd9cm());
+				ps.setInt(i++, cf.getCnt_dict_lnc());
+				ps.setInt(i++, cf.getCnt_dict_msh());
+				ps.setInt(i++, cf.getCnt_dict_rxnorm());
+				ps.setInt(i++, cf.getCnt_dict_snomedct());
+				
+				ps.setFloat(i++, cf.getCnt_ham_w_pos());
+				ps.setFloat(i++, cf.getCnt_ham_wo_pos());
+				
+				ps.setInt(i++, cf.getIs_phi());
+				ps.setString(i++, cf.getPhi_type());
+				ps.setString(i++, cf.getClassified_as());
+				ps.addBatch();
+			}
+			
+			ps.executeBatch();		    
+		}
+		catch (Exception e)
+		{
+			log.error("Failed inserting caseFeatureBatch.", e);
+		}
+		finally
+		{
+			close(ps);
+		}
+	}
+		
+	public void updateClassification(String classifiedAs, int id)
+	{
+		PreparedStatement ps = null;
+		StringBuilder sql = new StringBuilder("update feature_matrix"+tableSuffix+"  set classified_as = ? where id = ? ;");
+		
+		try 
+		{
+			ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+			int i=1;
+			ps.setString(i++, classifiedAs);
+			ps.setInt(i++, id);
+			
+			ps.executeUpdate();		    
+		}
+		catch (Exception e)
+		{
+			System.out.println("ERROR: failed updating feature_matrix"+tableSuffix+"  for classified_as field.");
+			e.printStackTrace();
+		}
+		finally
+		{
+			close(ps);
+		}
+	}
+		
+	public List<CaseFeature> selectAllCaseFeatures()
+	{
+		PreparedStatement ps = null;
+//		Connection conn = null;
+		ResultSet rs = null;
+		StringBuilder sql = new StringBuilder("select * from feature_matrix"+tableSuffix+"  ");
+			
+		List<CaseFeature> results = new ArrayList<CaseFeature>();
+		try 
+		{
+//			conn = this.getConnectionToScrubber();
+			ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+						
+			rs = ps.executeQuery();
+			
+			if (rs!=null)
+			{
+				CaseFeature cf; 
+				while(rs.next())
+				{
+					cf = new CaseFeature();
+
+					cf.setId(rs.getInt("id"));
+					cf.setToken(rs.getString("token").toLowerCase());
+					cf.setPos(rs.getString("pos"));
+					cf.setStartIdx(rs.getInt("start_idx"));
+					cf.setEndIdx(rs.getInt("end_idx"));
+					cf.setClassified_as(rs.getString("classified_as"));
+					cf.setPhi_type(rs.getString("phi_type"));
+					cf.setFilename_short(rs.getString("filename_short"));
+					
+					//TODO: finish populating the cf object...
+					results.add(cf);
+				}
+			}
+		}
+		catch (Exception e)
+		{
+			System.out.println("ERROR: failed selecting all caseFeature: " + e.getMessage());
+			e.printStackTrace();
+		}
+		finally
+		{
+			close(rs, ps);
+//			close(rs, ps, conn);
+		}
+		
+		return results;
+	}
+	
+	/**
+	 * Select all records for a given filename
+	 * from the feature_matrix table
+	 * where the record is classified_as *ANY* type of PHI
+	 * (i.e. NOT 'NA').
+	 *
+	 * @param filename_short short filename to select
+	 * @return list of CaseFeature records classified as PHI
+	 */
+	public List<CaseFeature> selectPHICaseFeaturesByFilename(String filename_short)
+	{
+		PreparedStatement ps = null;
+		ResultSet rs = null;
+		StringBuilder sql = new StringBuilder("select * from feature_matrix"+tableSuffix+"  where filename_short = ? and classified_as <> 'NA'");
+			
+		List<CaseFeature> results = new ArrayList<CaseFeature>();
+		try 
+		{
+			ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+			ps.setString(1, filename_short);			
+			rs = ps.executeQuery();
+			
+			if (rs!=null)
+			{
+				CaseFeature cf; 
+				while(rs.next())
+				{
+					cf = new CaseFeature();
+					cf.setId(rs.getInt("id"));
+					cf.setToken(rs.getString("token").toLowerCase());
+					cf.setPos(rs.getString("pos"));
+					cf.setStartIdx(rs.getInt("start_idx"));
+					cf.setEndIdx(rs.getInt("end_idx"));
+					cf.setClassified_as(rs.getString("classified_as"));
+					cf.setPhi_type(rs.getString("phi_type"));
+					cf.setFilename_short(rs.getString("filename_short"));
+					
+					//TODO: cf object is populated from 3 methods. need to write separate method that takes RS and returns a CF obj.
+					results.add(cf);
+				}
+			}
+		}
+		catch (Exception e)
+		{
+			System.out.println("ERROR: failed selecting caseFeatures by filename: " + e.getMessage());
+			e.printStackTrace();
+		}
+		finally
+		{
+			close(rs, ps);
+//			close(rs, ps, conn);
+		}
+		
+		return results;
+	}
+	
+	public List<String> selectDistinctFilenames()
+	{
+		PreparedStatement ps = null;
+		ResultSet rs = null;
+		StringBuilder sql = new StringBuilder("select distinct filename_short from feature_matrix"+tableSuffix+"  ");
+			
+		List<String> results = new ArrayList<String>();
+		try 
+		{
+			ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+						
+			rs = ps.executeQuery();
+			
+			if (rs!=null)
+			{
+				while(rs.next())
+				{
+					results.add(rs.getString("filename_short"));
+				}
+			}
+		}
+		catch (Exception e)
+		{
+			System.out.println("ERROR: failed selecting distinct filename_short from caseFeature: " + e.getMessage());
+			e.printStackTrace();
+		}
+		finally
+		{
+			close(rs, ps);
+		}
+		
+		return results;
+	}
+	
+	@Deprecated
+	public List<CaseFeature> selectAllTestCaseFeatures()
+	{
+		PreparedStatement ps = null;
+		ResultSet rs = null;
+		StringBuilder sql = new StringBuilder("select * from feature_matrix"+tableSuffix+"  ");
+			
+		List<CaseFeature> results = new ArrayList<CaseFeature>();
+		try 
+		{
+			ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+						
+			rs = ps.executeQuery();
+			
+			if (rs!=null)
+			{
+				CaseFeature cf; 
+				while(rs.next())
+				{
+					cf = new CaseFeature();
+					cf.setId(rs.getInt("id"));
+					cf.setToken(rs.getString("token").toLowerCase());
+					cf.setPos(rs.getString("pos"));
+					cf.setStartIdx(rs.getInt("start_idx"));
+					cf.setEndIdx(rs.getInt("end_idx"));
+					cf.setClassified_as(rs.getString("classified_as"));
+					cf.setPhi_type(rs.getString("phi_type"));
+					cf.setFilename_short(rs.getString("filename_short"));
+					
+					//TODO: finish populating the cf object...
+					results.add(cf);
+				}
+			}
+		}
+		catch (Exception e)
+		{
+			System.out.println("ERROR: failed selecting all caseFeature: " + e.getMessage());
+			e.printStackTrace();
+		}
+		finally
+		{
+			close(rs, ps);
+//				close(rs, ps, conn);
+		}
+		
+		return results;
+	}
+
+
+    /**
+     * Get the case features (token text and character offsets) for every place scrubber classified PHI.
+     *
+     * @param filename name of the actual physical file to be deidentified (redacted)
+     * @return list of PHI classifications in REVERSE (descending start_idx) order, so callers can
+     *         redact from the end of the file without shifting earlier offsets (sketched after this file)
+     */
+	public List<CaseFeature> selectClassifiedAsPHI(String filename)
+	{
+		PreparedStatement ps = null;
+		ResultSet rs = null;
+		StringBuilder sql = new StringBuilder("select filename_short, token, start_idx, end_idx, classified_as from feature_matrix"+tableSuffix+"  where filename_short = ? and classified_as <> 'NA' order by start_idx desc");
+			
+		List<CaseFeature> results = new ArrayList<CaseFeature>();
+		try 
+		{
+			ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+			int i = 1;
+			ps.setString(i++, filename);
+						
+			rs = ps.executeQuery();
+			
+			if (rs!=null)
+			{
+				CaseFeature cf; 
+				while(rs.next())
+				{
+					cf = new CaseFeature();
+					cf.setStartIdx(rs.getInt("start_idx"));
+					cf.setEndIdx(rs.getInt("end_idx"));
+					cf.setToken(rs.getString("token").toLowerCase());
+					cf.setFilename_short(rs.getString("filename_short"));
+					cf.setClassified_as(rs.getString("classified_as"));
+					
+					//TODO: finish populating the cf object...
+					results.add(cf);
+				}
+			}
+		}
+		catch (Exception e)
+		{
+			System.out.println("ERROR: failed selecting caseFeatures: " + e.getMessage());
+			e.printStackTrace();
+		}
+		finally
+		{
+			close(rs, ps);
+		}
+		
+		return results;
+	}
+	
+	public List<CaseFeature> selectClassifiedAsPHITest(String filename)
+	{
+		PreparedStatement ps = null;
+		ResultSet rs = null;
+		StringBuilder sql = new StringBuilder("select filename_short, token, start_idx, end_idx, classified_as from feature_matrix"+tableSuffix+"  where filename_short = ? and classified_as <> 'NA' order by start_idx desc");
+			
+		List<CaseFeature> results = new ArrayList<CaseFeature>();
+		try 
+		{
+			ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+			int i = 1;
+			ps.setString(i++, filename);
+						
+			rs = ps.executeQuery();
+			
+			if (rs!=null)
+			{
+				CaseFeature cf; 
+				while(rs.next())
+				{
+					cf = new CaseFeature();
+					cf.setStartIdx(rs.getInt("start_idx"));
+					cf.setEndIdx(rs.getInt("end_idx"));
+					cf.setToken(rs.getString("token").toLowerCase());
+					cf.setFilename_short(rs.getString("filename_short"));
+					cf.setClassified_as(rs.getString("classified_as"));
+					
+					//TODO: finish populating the cf object...
+					results.add(cf);
+				}
+			}
+		}
+		catch (Exception e)
+		{
+			System.out.println("ERROR: failed selecting caseFeatures: " + e.getMessage());
+			e.printStackTrace();
+		}
+		finally
+		{
+			close(rs, ps);
+		}
+		
+		return results;
+	}
+		
+
+	
+	public void updateCaseFeaturePHITypeTest(int id, String type)
+	{
+		PreparedStatement ps = null;
+		ResultSet rs = null;
+		StringBuilder sql = new StringBuilder("update feature_matrix"+tableSuffix+"  set phi_type = ? where id = ?");
+			
+		try 
+		{
+			ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+			int i=1;
+			ps.setString(i++, type);
+			ps.setInt(i++, id);
+			int updated = ps.executeUpdate();
+			
+			System.out.println("updated count: " + updated);
+		}
+		catch (Exception e)
+		{
+			System.out.println("ERROR: failed updating (test) caseFeature phi_type: " + e.getMessage());
+			e.printStackTrace();
+		}
+		finally
+		{
+			close(rs, ps);
+		}
+	}
+	
+	/**
+	 * 
+	 * @param id - id of row to be updated.
+	 * @param pcount - count of terms w/ same token AND pos.
+	 * @param tcount - count of terms w/ same token.
+	 */
+	public void updateCaseFeatureTFAllPubs(int id, float pcount, float tcount)
+	{
+		PreparedStatement ps = null;
+		ResultSet rs = null;
+		StringBuilder sql = new StringBuilder("update feature_matrix"+tableSuffix+"  set cnt_ham_w_pos = ?, cnt_ham_wo_pos=? where id = ?");
+			
+		try 
+		{
+			ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+			int i=1;
+			ps.setFloat(i++, pcount);
+			ps.setFloat(i++, tcount);
+			ps.setInt(i++, id);
+			int updated = ps.executeUpdate();
+			
+			System.out.println("updated count: " + updated);
+		}
+		catch (Exception e)
+		{
+			System.out.println("ERROR: failed updating caseFeature - pubs: " + e.getMessage());
+			e.printStackTrace();
+		}
+		finally
+		{
+			close(rs, ps);
+		}
+	}
+	
+	/**
+	 * 
+	 * @param id - id of row to be updated.
+	 * @param pcount - count of terms w/ same token AND pos.
+	 * @param tcount - count of terms w/ same token.
+	 */
+	public void updateTestCaseFeatureTFAllPubs(int id, float pcount, float tcount)
+	{
+		PreparedStatement ps = null;
+		ResultSet rs = null;
+		StringBuilder sql = new StringBuilder("update feature_matrix"+tableSuffix+"  set cnt_ham_w_pos = ?, cnt_ham_wo_pos=? where id = ?");
+			
+		try 
+		{
+			ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+			int i=1;
+			ps.setFloat(i++, pcount);
+			ps.setFloat(i++, tcount);
+			ps.setInt(i++, id);
+			int updated = ps.executeUpdate();
+			
+			System.out.println("updated count: " + updated);
+		}
+		catch (Exception e)
+		{
+			System.out.println("ERROR: failed updating caseFeature - pubs: " + e.getMessage());
+			e.printStackTrace();
+		}
+		finally
+		{
+			close(rs, ps);
+		}
+	}
+
+    @Deprecated
+	public void updateClassification(String classifiedAs, String filename, int startIdx)
+	{
+		PreparedStatement ps = null;
+		StringBuilder sql = new StringBuilder("update feature_matrix"+tableSuffix+"  set classified_as = ? where filename_short = ? and start_idx = ? ;");
+
+		try
+		{
+			ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+			int i=1;
+			ps.setString(i++, classifiedAs);
+			ps.setString(i++, filename);
+			ps.setInt(i++, startIdx);
+
+			ps.executeUpdate();
+		}
+		catch (Exception e)
+		{
+			System.out.println("ERROR: failed updating feature_matrix"+tableSuffix+"  for classified_as field.");
+			e.printStackTrace();
+		}
+		finally
+		{
+			close(ps);
+		}
+	}
+
+    @Deprecated
+    public void updateCaseFeaturePHITypeTrain(int id, String type)
+	{
+		PreparedStatement ps = null;
+		ResultSet rs = null;
+		StringBuilder sql = new StringBuilder("update feature_matrix"+tableSuffix+"  set phi_type = ? where id = ?");
+
+		try
+		{
+			ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+			int i=1;
+			ps.setString(i++, type);
+			ps.setInt(i++, id);
+			int updated = ps.executeUpdate();
+
+			System.out.println("updated count: " + updated);
+		}
+		catch (Exception e)
+		{
+			System.out.println("ERROR: failed updating (train) caseFeature phi_type: " + e.getMessage());
+			e.printStackTrace();
+		}
+		finally
+		{
+			close(rs, ps);
+		}
+	}
+
+    @Deprecated
+	public void insertCaseFeature(CaseFeature cf)
+	{
+		PreparedStatement ps = null;
+//		Connection conn = null;
+
+		StringBuilder sql = new StringBuilder("insert into feature_matrix"+tableSuffix+"  ");
+		sql.append("(token, start_idx, filename_short, pos, pos_bin, has_capital, ");
+		sql.append("cnt_regex_phon, cnt_regex_date, cnt_regex_age, cnt_regex_id, cnt_regex_pat, cnt_regex_doc, cnt_regex_loc, cnt_regex_hosp, ");
+		sql.append("cnt_priv, cnt_hosp, cnt_name, ");
+		sql.append("cnt_dict_costar, cnt_dict_hl7v25, cnt_dict_hl7v30, cnt_dict_icd10cm, cnt_dict_icd10pcs, cnt_dict_icd9cm, cnt_dict_lnc, cnt_dict_msh, cnt_dict_rxnorm, cnt_dict_snomedct, ");
+		sql.append("cnt_ham_w_pos, cnt_ham_wo_pos, is_phi) ");
+		sql.append("values (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?);");
+
+		try
+		{
+//			conn = this.getConnectionToScrubber();
+			ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+			int i=1;
+			ps.setString(i++, cf.getToken());
+			ps.setInt(i++, cf.getStartIdx());
+			ps.setString(i++, cf.getFilename_short());
+			ps.setString(i++, cf.getPos());
+			ps.setString(i++, cf.getPos_bin());
+			ps.setInt(i++, cf.getHas_capital());
+			ps.setInt(i++, cf.getCnt_regex_phon());
+			ps.setInt(i++, cf.getCnt_regex_date());
+			ps.setInt(i++, cf.getCnt_regex_age());
+			ps.setInt(i++, cf.getCnt_regex_id());
+			ps.setInt(i++, cf.getCnt_regex_pat());
+			ps.setInt(i++, cf.getCnt_regex_doc());
+			ps.setInt(i++, cf.getCnt_regex_loc());
+			ps.setInt(i++, cf.getCnt_regex_hosp());
+			ps.setInt(i++, cf.getCnt_priv());
+			ps.setInt(i++, cf.getCnt_hosp());
+			ps.setInt(i++, cf.getCnt_name());
+			ps.setInt(i++, cf.getCnt_dict_costar());
+			ps.setInt(i++, cf.getCnt_dict_hl7v25());
+			ps.setInt(i++, cf.getCnt_dict_hl7v30());
+			ps.setInt(i++, cf.getCnt_dict_icd10cm());
+			ps.setInt(i++, cf.getCnt_dict_icd10pcs());
+			ps.setInt(i++, cf.getCnt_dict_icd9cm());
+			ps.setInt(i++, cf.getCnt_dict_lnc());
+			ps.setInt(i++, cf.getCnt_dict_msh());
+			ps.setInt(i++, cf.getCnt_dict_rxnorm());
+			ps.setInt(i++, cf.getCnt_dict_snomedct());
+			ps.setFloat(i++, cf.getCnt_ham_w_pos());
+			ps.setFloat(i++, cf.getCnt_ham_wo_pos());
+			ps.setInt(i++, cf.getIs_phi());
+
+			ps.execute();
+		}
+		catch (Exception e)
+		{
+			log.error("Failed inserting caseFeature: " + cf.getFilename_short() + ":" + cf.getToken(), e);
+		}
+		finally
+		{
+			close(ps);
+		}
+	}
+}

Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/FeatureMatrixDAO.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain
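
selectClassifiedAsPHI() returns rows ordered by start_idx descending precisely so
a caller can redact back-to-front without invalidating the offsets of rows not yet
processed. A minimal sketch, assuming CaseFeature exposes getters matching the
setters above and that end_idx is exclusive (as StringBuilder.replace() expects):

    import java.util.List;

    import org.spin.scrubber.beans.CaseFeature;
    import org.spin.scrubber.uima.dao.FeatureMatrixDAO;

    public class RedactionExample
    {
        public static String redact(String documentText, String filenameShort, String tableSuffix)
                throws Exception
        {
            FeatureMatrixDAO dao = new FeatureMatrixDAO(tableSuffix);
            StringBuilder sb = new StringBuilder(documentText);

            // Rows arrive in descending start_idx order, so each replacement
            // leaves the offsets of the earlier, still-pending rows intact.
            for (CaseFeature cf : dao.selectClassifiedAsPHI(filenameShort))
            {
                sb.replace(cf.getStartIdx(), cf.getEndIdx(), "[" + cf.getClassified_as() + "]");
            }
            return sb.toString();
        }
    }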

Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/HumanAnnotationsDAO.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/HumanAnnotationsDAO.java?rev=1500511&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/HumanAnnotationsDAO.java (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/HumanAnnotationsDAO.java Sun Jul  7 19:23:05 2013
@@ -0,0 +1,233 @@
+/*******************************************************************************
+ *  Licensed to the Apache Software Foundation (ASF) under one
+ *  or more contributor license agreements.  See the NOTICE file
+ *  distributed with this work for additional information
+ *  regarding copyright ownership.  The ASF licenses this file
+ *  to you under the Apache License, Version 2.0 (the
+ *  "License"); you may not use this file except in compliance
+ *  with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ *  Unless required by applicable law or agreed to in writing,
+ *  software distributed under the License is distributed on an
+ *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ *  KIND, either express or implied.  See the License for the
+ *  specific language governing permissions and limitations
+ *  under the License.
+ ******************************************************************************/
+package org.spin.scrubber.uima.dao;
+
+import com.mysql.jdbc.PreparedStatement;
+
+import java.sql.Connection;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.util.HashSet;
+import java.util.Set;
+
+public class HumanAnnotationsDAO extends BaseDAO
+{
+	Connection conn; 
+	
+	public HumanAnnotationsDAO(String tableSuffix) throws Exception
+	{
+		this.tableSuffix = tableSuffix;
+		conn = getConnectionToScrubber();
+	}
+
+	public void insert(String subjectId, String tagName, String tagValue, int startIdx, int endIdx) throws Exception
+	{
+		PreparedStatement ps = null;
+				
+		StringBuilder sql = new StringBuilder("insert into human_annotations"+tableSuffix+"  (subject_id, tag_name, tag_value, start_idx, end_idx) values (?,?,?,?,?);");
+					
+		try 
+		{
+			ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+			int i=1;
+			ps.setString(i++, subjectId);
+			ps.setString(i++, tagName);
+			ps.setString(i++, tagValue);
+			ps.setInt(i++, startIdx);
+			ps.setInt(i++, endIdx);
+			ps.execute();		    
+		}
+		catch (SQLException e)
+		{
+			log.error("Failed inserting : " + subjectId + ":" + tagName +":"+ tagValue, e);
+		}
+		finally
+		{
+			close(ps);
+		}
+	}
+
+	public String selectPHIType(String filename, int startIdx) throws Exception
+	{
+		PreparedStatement ps = null;
+		ResultSet rs = null;
+		String retVal = null;
+		StringBuilder sql = new StringBuilder("select tag_name from human_annotations"+tableSuffix+"  where subject_id=? and ? between start_idx and end_idx;");
+					
+		try 
+		{
+			ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+			int i=1;
+			ps.setString(i++, filename);
+			ps.setInt(i++, startIdx);
+			rs = ps.executeQuery();
+		    
+		    if(rs!=null)
+		    {
+		    	while (rs.next())
+				{
+		    		String tmp = rs.getString("tag_name");
+		    		if (tmp!=null && tmp.length()>0)
+		    		{
+		    			retVal = tmp;
+		    		}
+				}
+		    }
+		}
+		catch (SQLException e)
+		{
+			log.error("Failed selectPHIType", e);
+		}
+		finally
+		{
+			close(rs, ps);
+		}
+		
+		return retVal;
+	}
+
+	public void close()
+	{
+		try
+		{
+			close(conn);
+		}
+		catch(Exception e)
+		{
+			System.out.println("ERROR: issue closing connection for HumanAnnotationsDAO.");
+		}
+	}
+
+    @Deprecated
+    public Set<String> selectTagValueBySubjectId(String subjectId) throws Exception
+	{
+		PreparedStatement ps = null;
+		Connection conn = null;
+		ResultSet rs = null;
+		Set<String> results = new HashSet<String>();
+
+		StringBuilder sql = new StringBuilder("select tag_value from human_annotations"+tableSuffix+"  where tag_name in ('HOSPITAL', 'PATIENT', 'DOCTOR', 'LOCATION') and subject_id = ?;");
+
+		try
+		{
+			conn = this.getConnectionToScrubber();
+			ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+			int i=1;
+			ps.setString(i++, subjectId);
+			rs = ps.executeQuery();
+
+		    if(rs!=null)
+		    {
+		    	while (rs.next())
+				{
+		    		String tagValue = rs.getString("tag_value");
+		    		results.add(tagValue);
+
+		    		//adding version w/o period (in cases of middle name, etc...)
+		    		if (tagValue.contains("."))
+		    		{
+		    			results.add(tagValue.replace(".", " ").trim());
+		    		}
+				}
+		    }
+		}
+		catch (SQLException e)
+		{
+			System.out.println("ERROR: failed selecting human_annotations"+tableSuffix+" .tag_value: " + e.getMessage());
+			e.printStackTrace();
+		}
+		finally
+		{
+			close(rs, ps, conn);
+		}
+
+		return results;
+	}
+
+    @Deprecated
+	public int isPHI(String filename, int startIdx) throws Exception
+	{
+		PreparedStatement ps = null;
+		ResultSet rs = null;
+		int retVal = 0;
+		StringBuilder sql = new StringBuilder("select count(*) cnt from human_annotations"+tableSuffix+"  where subject_id=? and ? between start_idx and end_idx;");
+
+		try
+		{
+			ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+			int i=1;
+			ps.setString(i++, filename);
+			ps.setInt(i++, startIdx);
+			rs = ps.executeQuery();
+
+		    if(rs!=null)
+		    {
+		    	while (rs.next())
+				{
+		    		int tmp = rs.getInt("cnt");
+		    		if (tmp>0)
+		    		{
+		    			retVal = 1;
+		    		}
+				}
+		    }
+		}
+		catch (SQLException e)
+		{
+			System.out.println("ERROR: failed isPHI check: " + e.getMessage());
+			e.printStackTrace();
+		}
+		finally
+		{
+			close(rs, ps); // the shared connection is released separately via close()
+		}
+
+		return retVal;
+	}
+
+    @Deprecated
+	public void insert(String subjectId, String tagName, String tagValue) throws Exception
+	{
+		PreparedStatement ps = null;
+		Connection conn = null;
+
+		StringBuilder sql = new StringBuilder("insert into human_annotations"+tableSuffix+"  (subject_id, tag_name, tag_value) values (?,?,?);");
+
+		try
+		{
+			conn = this.getConnectionToScrubber();
+			ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+			int i=1;
+			ps.setString(i++, subjectId);
+			ps.setString(i++, tagName);
+			ps.setString(i++, tagValue);
+			ps.execute();
+		}
+		catch (SQLException e)
+		{
+			System.out.println("ERROR: failed inserting : " + subjectId + ":" + tagName +":"+ tagValue);
+			e.printStackTrace();
+		}
+		finally
+		{
+			close(ps, conn);
+		}
+	}
+}

Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/HumanAnnotationsDAO.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/PubDAO.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/PubDAO.java?rev=1500511&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/PubDAO.java (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/PubDAO.java Sun Jul  7 19:23:05 2013
@@ -0,0 +1,417 @@
+/*******************************************************************************
+ *  Licensed to the Apache Software Foundation (ASF) under one
+ *  or more contributor license agreements.  See the NOTICE file
+ *  distributed with this work for additional information
+ *  regarding copyright ownership.  The ASF licenses this file
+ *  to you under the Apache License, Version 2.0 (the
+ *  "License"); you may not use this file except in compliance
+ *  with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ *  Unless required by applicable law or agreed to in writing,
+ *  software distributed under the License is distributed on an
+ *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ *  KIND, either express or implied.  See the License for the
+ *  specific language governing permissions and limitations
+ *  under the License.
+ ******************************************************************************/
+/**
+ * 
+ */
+package org.spin.scrubber.uima.dao;
+
+import com.mysql.jdbc.PreparedStatement;
+
+import java.sql.Connection;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * @author bf19
+ *
+ */
+public class PubDAO extends BaseDAO
+{
+	private Connection conn;
+
+	public PubDAO() throws Exception
+	{
+		conn = getConnectionToScrubber();
+	}
+
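+	/**
+	 * Select a random sample of publication bodies keyed by pub id:
+	 * rand()>0.9 keeps roughly 10% of the joined rows, capped at 10,500.
+	 */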
+	public Map<Integer,String> selectRandomPubContent() throws Exception
+	{
+		PreparedStatement ps = null;
+		Connection conn = null;
+		ResultSet rs = null;
+		Map<Integer, String> pubMap = new HashMap<Integer,String>();
+		
+		StringBuilder sql = new StringBuilder("select distinct p.id, p.body ");
+		sql.append(" from pubs p right join keywords k on p.id = k.pub_id ");
+		sql.append(" where rand()>0.9 limit 10500; ");
+					
+		try 
+		{
+			conn = getConn();
+			ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+			
+			rs = ps.executeQuery();
+		    
+		    if(rs!=null)
+		    {
+		    	while (rs.next())
+				{
+		    		pubMap.put(rs.getInt("id"), rs.getString("body"));
+				}
+		    }
+		}
+		catch (SQLException e)
+		{
+			log.error("Failed selecting RANDOM pub set: " + e.getMessage());
+			e.printStackTrace();
+		}
+		finally
+		{
+			close(rs, ps); // keep the shared DAO connection open for later calls
+		}
+		
+		return pubMap;
+	}
+	
+	public Map<Integer,String> selectAllPubContentById(int id) throws Exception
+	{
+		PreparedStatement ps = null;
+		Connection conn = null;
+		ResultSet rs = null;
+		Map<Integer, String> pubMap = new HashMap<Integer,String>();
+		
+		StringBuilder sql = new StringBuilder("select p.id, p.body from pubs p where p.id = ? ");
+					
+		try 
+		{
+			conn = getConn();
+			ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+			ps.setInt(1, id);
+			
+			rs = ps.executeQuery();
+		    
+		    if(rs!=null)
+		    {
+		    	while (rs.next())
+				{
+		    		pubMap.put(rs.getInt("id"), rs.getString("body"));
+				}
+		    }
+		}
+		catch (SQLException e)
+		{
+			log.error("Failed selecting ALL pub set: " + e.getMessage());
+			e.printStackTrace();
+		}
+		finally
+		{
+			close(rs, ps);
+		}
+		
+		return pubMap;
+	}
+	
+	public List<Integer> selectAllPubIds() throws Exception
+	{
+		PreparedStatement ps = null;
+		Connection conn = null;
+		ResultSet rs = null;
+		List<Integer> results = new ArrayList<Integer>();
+		
+		StringBuilder sql = new StringBuilder("select distinct p.id from pubs p");
+					
+		try 
+		{
+			conn = getConn();
+			ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+			
+			rs = ps.executeQuery();
+		    
+		    if(rs!=null)
+		    {
+		    	while (rs.next())
+				{
+		    		results.add(rs.getInt("id"));
+				}
+		    }
+		}
+		catch (SQLException e)
+		{
+			log.error("Failed selecting ALL pub IDs: " + e.getMessage());
+			e.printStackTrace();
+		}
+		finally
+		{
+			close(rs, ps);
+		}
+		
+		return results;
+	}
+	
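+	/**
+	 * Select pub bodies whose keywords match any of the given LIKE fragments.
+	 * The matching ids are inlined into an IN (...) list; the trailing 0 keeps
+	 * the list syntactically valid when no ids match.
+	 */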
+	public Map<Integer,String> selectPubContentByLikeKeywords(String[] keywordFragList) throws Exception
+	{
+		PreparedStatement ps = null;
+		Connection conn = null;
+		ResultSet rs = null;
+		Map<Integer, String> pubMap = new HashMap<Integer,String>();
+		List<Integer> pubIdList = selectPubIDByLikeKeywords(keywordFragList);
+		
+		StringBuilder sql = new StringBuilder("select id, body from pubs where id in ( ");
+		for (Integer i : pubIdList)
+		{
+			sql.append(i+",");
+		}		
+		sql.append(" 0);");
+					
+		try 
+		{
+			conn = getConn();
+			ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+			
+			rs = ps.executeQuery();
+		    
+		    if(rs!=null)
+		    {
+		    	while (rs.next())
+				{
+		    		pubMap.put(rs.getInt("id"), rs.getString("body"));
+				}
+		    }
+		}
+		catch (SQLException e)
+		{
+			log.error("Failed selecting pubs by LIKE keyword: " + e.getMessage());
+		}
+		finally
+		{
+			close(rs, ps); // keep the shared DAO connection open for later calls
+		}
+		
+		return pubMap;
+	}
+	
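+	/**
+	 * Select distinct pub ids whose keyword matches any of the fragments. Each
+	 * fragment is bound as-is to a LIKE parameter, so callers are expected to
+	 * supply any SQL wildcards (e.g. "%heart%") themselves.
+	 */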
+	public List<Integer> selectPubIDByLikeKeywords(String[] keywordFragList) throws Exception
+	{
+		PreparedStatement ps = null;
+		Connection conn = null;
+		ResultSet rs = null;
+		List<Integer> results = new ArrayList<Integer>();
+		
+		StringBuilder sql = new StringBuilder("select distinct pub_id from keywords where keyword like ? ");
+		for (int k=1; k<keywordFragList.length; k++)
+		{
+			sql.append(" or keyword like ? ");
+		}
+			
+		try 
+		{
+			conn = getConn();
+			ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+			int i=1;
+			for (String s : keywordFragList)
+			{
+				ps.setString(i++, s);
+			}
+			
+			rs = ps.executeQuery();
+		    
+		    if(rs!=null)
+		    {
+		    	while (rs.next())
+				{
+		    		results.add(rs.getInt("pub_id"));
+				}
+		    }
+		}
+		catch (SQLException e)
+		{
+			log.error("Failed selecting pubs by LIKE keyword: " + e.getMessage());
+		}
+		finally
+		{
+			close(rs, ps);
+		}
+		
+		return results;
+	}
+
+	public void insertAuthor(int pubId, String surname, String givenName) throws Exception
+	{
+		PreparedStatement ps = null;
+		Connection conn = null;
+		int updated = 0;
+		StringBuilder sql = new StringBuilder("insert into authors (pub_id, surname, given_name) values (?,?,?);");
+			
+		try 
+		{
+			conn = getConn();
+			ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+			int i=1;
+			ps.setInt(i++, pubId);
+			ps.setString(i++, surname);
+			ps.setString(i++, givenName);
+			
+			updated = ps.executeUpdate();
+		    
+		    if(updated!=1) { throw new SQLException("ERROR: db insert count="+updated + ". expected=1"); } 
+		}
+		catch (SQLException e)
+		{
+			log.error("Failed inserting author: " + e.getMessage());
+		}
+		finally
+		{
+			close(ps);
+		}
+	}
+
+	public void insertRef(int pubId, String surname, String givenName) throws Exception
+	{
+		PreparedStatement ps = null;
+		Connection conn = null;
+		int updated = 0;
+		StringBuilder sql = new StringBuilder("insert into refs (pub_id, surname, given_name) values (?,?,?);");
+			
+		try 
+		{
+			conn = getConn();
+			ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+			int i=1;
+			ps.setInt(i++, pubId);
+			ps.setString(i++, surname);
+			ps.setString(i++, givenName);
+			
+			updated = ps.executeUpdate();
+		    
+		    if(updated!=1) { throw new SQLException("ERROR: db insert count="+updated + ". expected=1"); } 
+		}
+		catch (SQLException e)
+		{
+			log.error("Failed inserting ref: " + e.getMessage());
+		}
+		finally
+		{
+			close(ps);
+		}
+	}
+
+	public void insertKeyword(int pubId, String keyword) throws Exception
+	{
+		PreparedStatement ps = null;
+		Connection conn = null;
+		int updated = 0;
+		StringBuilder sql = new StringBuilder("insert into keywords (pub_id, keyword) values (?,?);");
+			
+		try 
+		{
+			conn = getConn();
+			ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+			int i=1;
+			ps.setInt(i++, pubId);
+			ps.setString(i++, keyword);
+			
+			updated = ps.executeUpdate();
+		    
+		    if(updated!=1) { throw new SQLException("ERROR: db insert count="+updated + ". expected=1"); } 
+		}
+		catch (SQLException e)
+		{
+			log.error("Failed inserting keyword: " + e.getMessage());
+		}
+		finally
+		{
+			close(ps);
+		}
+	}
+	
+	public int selectNewestPub() throws Exception
+	{
+		PreparedStatement ps = null;
+		Connection conn = null;
+		ResultSet rs = null;
+		int id = 0;
+		StringBuilder sql = new StringBuilder("select max(id) id from pubs;");
+			
+		try 
+		{
+			conn = getConn();
+			ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+			
+			rs = ps.executeQuery();
+		    
+		    if(rs!=null)
+		    {
+		    	while (rs.next())
+				{
+		    		id = rs.getInt("id");
+				}
+		    }
+		}
+		catch (SQLException e)
+		{
+			log.error("Failed inserting keyword: " + e.getMessage());
+		}
+		finally
+		{
+			close(rs, ps);
+		}
+		
+		return id;
+	}
+
+	public int insertPub(String journalTitle, String articleTitle, String pmc, String pmid, String doi, String body, String filepath) throws Exception
+	{
+		PreparedStatement ps = null;
+		Connection conn = null;
+		int updated = 0;
+		int pubId=0;
+		StringBuilder sql = new StringBuilder("insert into pubs (journal_title, article_title, pmc, pmid, doi, body, filepath) values (?,?,?,?,?,?,?);");
+		
+		try 
+		{
+			conn = getConn();
+			ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+			int i=1;
+			ps.setString(i++, journalTitle);
+			ps.setString(i++, articleTitle);
+			ps.setString(i++, pmc);
+			ps.setString(i++, pmid);
+			ps.setString(i++, doi);
+			ps.setString(i++, body);
+			ps.setString(i++, filepath);
+			
+			updated = ps.executeUpdate();
+		    
+		    if(updated!=1) { throw new SQLException("ERROR: db insert count="+updated + ". expected=1"); }
+		    
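+		    // NOTE: reading back max(id) assumes a single concurrent writer;
+		    // Statement.getGeneratedKeys() would be the concurrency-safe alternative.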
+		    pubId = selectNewestPub();
+		}
+		catch (SQLException e)
+		{
+			log.error("Failed inserting pub: " + e.getMessage());
+		}
+		finally
+		{
+			close(ps);
+		}
+		
+		return pubId;
+	}
+
+	public Connection getConn()
+	{
+		return conn;
+	}
+}

Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/PubDAO.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/TfDAO.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/TfDAO.java?rev=1500511&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/TfDAO.java (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/TfDAO.java Sun Jul  7 19:23:05 2013
@@ -0,0 +1,85 @@
+/*******************************************************************************
+ *  Licensed to the Apache Software Foundation (ASF) under one
+ *  or more contributor license agreements.  See the NOTICE file
+ *  distributed with this work for additional information
+ *  regarding copyright ownership.  The ASF licenses this file
+ *  to you under the Apache License, Version 2.0 (the
+ *  "License"); you may not use this file except in compliance
+ *  with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ *  Unless required by applicable law or agreed to in writing,
+ *  software distributed under the License is distributed on an
+ *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ *  KIND, either express or implied.  See the License for the
+ *  specific language governing permissions and limitations
+ *  under the License.
+ ******************************************************************************/
+package org.spin.scrubber.uima.dao;
+
+import com.mysql.jdbc.PreparedStatement;
+
+import java.sql.Connection;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.util.HashMap;
+import java.util.Map;
+
+//TODO: rename to TermFrequencyDAO
+public class TfDAO extends BaseDAO
+{
+		
+	/**
+	 * Create a map from lowercase token to its occurrence count, keyed both with
+	 * and without the POS (token, token|POS, and pos|POS composite keys).
+	 * @return
+	 * @throws Exception
+	 */
+	public Map<String,Integer> selectPubTFMap() throws Exception
+	{
+		PreparedStatement ps = null;
+		Connection conn = null;
+		ResultSet rs = null;
+		Map<String,Integer> results = new HashMap<String,Integer>();
+		int total=0;
+		
+		StringBuilder sql = new StringBuilder("select * from lookup_term_frequency;");
+					
+		try 
+		{
+			conn = this.getConnectionToScrubber();
+			ps = (PreparedStatement) conn.prepareStatement(sql.toString());
+			rs = ps.executeQuery();
+		    
+		    if(rs!=null)
+		    {
+		    	while (rs.next())
+				{
+		    		String token = rs.getString("token").toLowerCase();
+		    		String pos = rs.getString("pos");
+		    		String compositeToken = token+"|"+pos;
+		    		int cnt = rs.getInt("cnt");
+		    		String compositePOSToken = "pos|"+pos;
+		    		total+=cnt;
+		    		
+		    		results.put(token,(results.get(token)==null) ? cnt : results.get(token) + cnt);
+		    		results.put(compositeToken,(results.get(compositeToken)==null) ? cnt : results.get(compositeToken) + cnt);
+		    		results.put(compositePOSToken,(results.get(compositePOSToken)==null) ? cnt : results.get(compositePOSToken) + cnt);
+				}
+		    	//add in total count of all tokens - later used for computing tf (count/total)
+		    	results.put("totalPubCount", total);
+		    }
+		}
+		catch (SQLException e)
+		{
+			log.error("Selecting all_pubs_tf: " + e.getMessage());
+			e.printStackTrace();
+		}
+		finally
+		{
+			close(rs, ps, conn);
+		}
+		
+		return results;
+	}
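+
+	// Illustrative sketch only (not called by the pipeline): computing a term
+	// frequency from the returned map. The key names follow the conventions of
+	// selectPubTFMap() above; the token "heart" is a made-up example.
+	//
+	//   Map<String,Integer> counts = new TfDAO().selectPubTFMap();
+	//   Integer cnt = counts.get("heart");           // plain token count
+	//   int total = counts.get("totalPubCount");     // total count of all tokens
+	//   double tf = (cnt == null) ? 0.0 : cnt / (double) total;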
+}

Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/dao/TfDAO.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/reader/FileSystemCollectionReader.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/reader/FileSystemCollectionReader.java?rev=1500511&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/reader/FileSystemCollectionReader.java (added)
+++ ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/reader/FileSystemCollectionReader.java Sun Jul  7 19:23:05 2013
@@ -0,0 +1,211 @@
+/*******************************************************************************
+ *  Licensed to the Apache Software Foundation (ASF) under one
+ *  or more contributor license agreements.  See the NOTICE file
+ *  distributed with this work for additional information
+ *  regarding copyright ownership.  The ASF licenses this file
+ *  to you under the Apache License, Version 2.0 (the
+ *  "License"); you may not use this file except in compliance
+ *  with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ *  Unless required by applicable law or agreed to in writing,
+ *  software distributed under the License is distributed on an
+ *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ *  KIND, either express or implied.  See the License for the
+ *  specific language governing permissions and limitations
+ *  under the License.
+ ******************************************************************************/
+
+package org.spin.scrubber.uima.reader;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.CASException;
+import org.apache.uima.collection.CollectionException;
+import org.apache.uima.collection.CollectionReader_ImplBase;
+import org.apache.uima.examples.SourceDocumentInformation;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.DocumentAnnotation;
+import org.apache.uima.resource.ResourceConfigurationException;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.apache.uima.util.FileUtils;
+import org.apache.uima.util.Progress;
+import org.apache.uima.util.ProgressImpl;
+
+/**
+ * A simple collection reader that reads documents from a directory in the filesystem. It can be
+ * configured with the following parameters:
+ * <ul>
+ * <li><code>InputDirectory</code> - path to directory containing files</li>
+ * <li><code>Encoding</code> (optional) - character encoding of the input files</li>
+ * <li><code>Language</code> (optional) - language of the input documents</li>
+ * </ul>
+ * 
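+ * A minimal illustrative fragment of the matching descriptor configuration
+ * (standard UIMA nameValuePair syntax; the path is made up):
+ * <pre>
+ *   &lt;nameValuePair&gt;
+ *     &lt;name&gt;InputDirectory&lt;/name&gt;
+ *     &lt;value&gt;&lt;string&gt;/data/input&lt;/string&gt;&lt;/value&gt;
+ *   &lt;/nameValuePair&gt;
+ * </pre>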
+ * 
+ */
+public class FileSystemCollectionReader extends CollectionReader_ImplBase {
+  /**
+   * Name of configuration parameter that must be set to the path of a directory containing input
+   * files.
+   */
+  public static final String PARAM_INPUTDIR = "InputDirectory";
+
+  /**
+   * Name of configuration parameter that contains the character encoding used by the input files.
+   * If not specified, the default system encoding will be used.
+   */
+  public static final String PARAM_ENCODING = "Encoding";
+
+  /**
+   * Name of optional configuration parameter that contains the language of the documents in the
+   * input directory. If specified this information will be added to the CAS.
+   */
+  public static final String PARAM_LANGUAGE = "Language";
+
+  /**
+   * Name of optional configuration parameter that indicates including
+   * the subdirectories (recursively) of the current input directory.
+   */
+  public static final String PARAM_SUBDIR = "BrowseSubdirectories";
+  
+  private ArrayList<File> mFiles;
+
+  private String mEncoding;
+
+  private String mLanguage;
+  
+  private Boolean mRecursive;
+
+  private int mCurrentIndex;
+
+  /**
+   * @see org.apache.uima.collection.CollectionReader_ImplBase#initialize()
+   */
+  public void initialize() throws ResourceInitializationException {
+    File directory = new File(((String) getConfigParameterValue(PARAM_INPUTDIR)).trim());
+    mEncoding  = (String) getConfigParameterValue(PARAM_ENCODING);
+    mLanguage  = (String) getConfigParameterValue(PARAM_LANGUAGE);
+    mRecursive = (Boolean) getConfigParameterValue(PARAM_SUBDIR);
+    if (null == mRecursive) { // may be null since the parameter is optional
+      mRecursive = Boolean.FALSE;
+    }
+    mCurrentIndex = 0;
+
+    // if input directory does not exist or is not a directory, throw exception
+    if (!directory.exists() || !directory.isDirectory()) {
+      throw new ResourceInitializationException(ResourceConfigurationException.DIRECTORY_NOT_FOUND,
+              new Object[] { PARAM_INPUTDIR, this.getMetaData().getName(), directory.getPath() });
+    }
+
+    // get list of files in the specified directory, and subdirectories if the
+    // parameter PARAM_SUBDIR is set to True
+    mFiles = new ArrayList<File>();
+    addFilesFromDir(directory);
+  }
+  
+  /**
+   * This method adds files in the directory passed in as a parameter to mFiles.
+   * If mRecursive is true, it will include all files in all
+   * subdirectories (recursively), as well. 
+   * 
+   * @param dir
+   */
+  private void addFilesFromDir(File dir) {
+    File[] files = dir.listFiles();
+    if (files == null) { // listFiles() returns null if the path is not readable
+      return;
+    }
+    for (int i = 0; i < files.length; i++) {
+      if (!files[i].isDirectory()) {
+        mFiles.add(files[i]);
+      } else if (mRecursive) {
+        addFilesFromDir(files[i]);
+      }
+    }
+  }
+
+  /**
+   * @see org.apache.uima.collection.CollectionReader#hasNext()
+   */
+  public boolean hasNext() {
+    return mCurrentIndex < mFiles.size();
+  }
+
+  /**
+   * @see org.apache.uima.collection.CollectionReader#getNext(org.apache.uima.cas.CAS)
+   */
+  public void getNext(CAS aCAS) throws IOException, CollectionException {
+    JCas jcas;
+    try {
+      jcas = aCAS.getJCas();
+    } catch (CASException e) {
+      throw new CollectionException(e);
+    }
+
+    // read the next file into a string
+    File file = mFiles.get(mCurrentIndex++);
+    String text = FileUtils.file2String(file, mEncoding);
+    // put document text in the CAS
+    jcas.setDocumentText(text);
+
+    // set language if it was explicitly specified as a configuration parameter
+    if (mLanguage != null) {
+      ((DocumentAnnotation) jcas.getDocumentAnnotationFs()).setLanguage(mLanguage);
+    }
+
+    // Also store location of source document in CAS. This information is critical
+    // if CAS Consumers will need to know where the original document contents are located.
+    // For example, the Semantic Search CAS Indexer writes this information into the
+    // search index that it creates, which allows applications that use the search index to
+    // locate the documents that satisfy their semantic queries.
+    SourceDocumentInformation srcDocInfo = new SourceDocumentInformation(jcas);
+    srcDocInfo.setUri(file.getAbsoluteFile().toURI().toURL().toString());
+    srcDocInfo.setOffsetInSource(0);
+    srcDocInfo.setDocumentSize((int) file.length());
+    srcDocInfo.setLastSegment(mCurrentIndex == mFiles.size());
+    srcDocInfo.addToIndexes();
+  }
+
+  /**
+   * @see org.apache.uima.collection.base_cpm.BaseCollectionReader#close()
+   */
+  public void close() throws IOException {
+  }
+
+  /**
+   * @see org.apache.uima.collection.base_cpm.BaseCollectionReader#getProgress()
+   */
+  public Progress[] getProgress() {
+    return new Progress[] { new ProgressImpl(mCurrentIndex, mFiles.size(), Progress.ENTITIES) };
+  }
+
+  /**
+   * Gets the total number of documents that will be returned by this collection reader. This is not
+   * part of the general collection reader interface.
+   * 
+   * @return the number of documents in the collection
+   */
+  public int getNumberOfDocuments() {
+    return mFiles.size();
+  }
+
+}

Propchange: ctakes/sandbox/ctakes-scrubber-deid/src/main/java/org/spin/scrubber/uima/reader/FileSystemCollectionReader.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain