You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by se...@apache.org on 2023/05/10 16:39:36 UTC

[ctakes] branch main updated: Dictionary creator without poor man wsd DictionaryComparator util

This is an automated email from the ASF dual-hosted git repository.

seanfinan pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/ctakes.git


The following commit(s) were added to refs/heads/main by this push:
     new 065885e  Dictionary creator without poor man wsd DictionaryComparator util
065885e is described below

commit 065885e46419775ba3647228ee19206c563a665d
Author: Sean Finan <se...@childrens.harvard.edu>
AuthorDate: Wed May 10 12:39:21 2023 -0400

    Dictionary creator without poor man wsd
    DictionaryComparator util
---
 .../lookup2/util/tool/DictionaryComparator.java    | 84 ++++++++++++++++++++++
 .../ctakes/gui/dictionary/DictionaryBuilder.java   |  2 +-
 2 files changed, 85 insertions(+), 1 deletion(-)

diff --git a/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/util/tool/DictionaryComparator.java b/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/util/tool/DictionaryComparator.java
new file mode 100644
index 0000000..e778b15
--- /dev/null
+++ b/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/util/tool/DictionaryComparator.java
@@ -0,0 +1,84 @@
+package org.apache.ctakes.dictionary.lookup2.util.tool;
+
+import org.apache.ctakes.core.util.StringUtil;
+
+import java.io.*;
+import java.util.Collection;
+import java.util.HashSet;
+
+/**
+ * Compares two dictionary sql scripts by the CUIs and synonyms in their CUI_TERMS insert lines,
+ * printing the size of each set and of each one-way difference between the scripts.
+ * @author SPF , chip-nlp
+ * @since {5/10/2023}
+ */
+final public class DictionaryComparator {
+
+   static private final String INSERT_SQL = "INSERT INTO CUI_TERMS VALUES(";
+
+   private DictionaryComparator() {}
+
+   /** @param args paths of the two sql scripts to compare. */
+   public static void main( String... args ) {
+      if ( args.length < 2 ) {
+         System.err.println( "Usage: DictionaryComparator <script1> <script2>" );
+         System.exit( 1 );
+      }
+      compareScriptCuis( new File( args[ 0 ] ), new File( args[ 1 ] ) );
+   }
+
+   /**
+    * Prints the CUI and synonym counts of both scripts and of their one-way differences.
+    * @param script1 first sql script.
+    * @param script2 second sql script.
+    */
+   static private void compareScriptCuis( final File script1, final File script2 ) {
+      final Collection<String> cuis1 = new HashSet<>();
+      final Collection<String> synonyms1 = new HashSet<>();
+      readScript( script1, cuis1, synonyms1 );
+      final Collection<String> cuis2 = new HashSet<>();
+      final Collection<String> synonyms2 = new HashSet<>();
+      readScript( script2, cuis2, synonyms2 );
+      printComparison( "CUIs", script1.getName(), cuis1, script2.getName(), cuis2 );
+      printComparison( "Synonyms", script1.getName(), synonyms1, script2.getName(), synonyms2 );
+   }
+
+   /**
+    * For each insert line, adds element 0 of a comma split (cui) and element 1 of a quote split (synonym).
+    * A read failure is reported on stderr; anything collected before it is kept.
+    */
+   static private void readScript( final File script, final Collection<String> cuis,
+                                   final Collection<String> synonyms ) {
+      try ( BufferedReader reader = new BufferedReader( new FileReader( script ) ) ) {
+         String line = reader.readLine();
+         while ( line != null ) {
+            if ( line.startsWith( INSERT_SQL ) ) {
+               final String values = line.substring( INSERT_SQL.length() );
+               cuis.add( StringUtil.fastSplit( values, ',' )[ 0 ] );
+               final String[] quoted = StringUtil.fastSplit( values, '\'' );
+               // Fewer than two quote-split pieces means no synonym; skip rather than abort the run.
+               if ( quoted.length > 1 ) {
+                  synonyms.add( quoted[ 1 ] );
+               }
+            }
+            line = reader.readLine();
+         }
+      } catch ( IOException ioE ) {
+         System.err.println( ioE.getMessage() );
+      }
+   }
+
+   /** Prints the size of each collection and of each one-way difference, labelled by type. */
+   static private void printComparison( final String type, final String name1, final Collection<String> items1,
+                                        final String name2, final Collection<String> items2 ) {
+      System.out.println( type + " in " + name1 + " : " + items1.size() );
+      System.out.println( type + " in " + name2 + " : " + items2.size() );
+      final Collection<String> onlyIn1 = new HashSet<>( items1 );
+      onlyIn1.removeAll( items2 );
+      System.out.println( type + " in " + name1 + " not in " + name2 + " : " + onlyIn1.size() );
+      final Collection<String> onlyIn2 = new HashSet<>( items2 );
+      onlyIn2.removeAll( items1 );
+      System.out.println( type + " in " + name2 + " not in " + name1 + " : " + onlyIn2.size() );
+   }
+
+}
diff --git a/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/DictionaryBuilder.java b/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/DictionaryBuilder.java
index f89f1e1..09f3e66 100644
--- a/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/DictionaryBuilder.java
+++ b/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/DictionaryBuilder.java
@@ -72,7 +72,7 @@ final class DictionaryBuilder {
       // Fill in information for all valid concepts
       MrconsoParser.parseAllConcepts( umlsDirPath, conceptMap, wantedSources, wantedTargets, umlsTermUtil,
             wantedLanguages, true, MIN_CHAR_LENGTH, MAX_CHAR_LENGTH, MAX_WORD_COUNT, MAX_SYM_COUNT );
-      removeWsdRarities( conceptMap, wantedTuis, WSD_DIVISOR, ANAT_MULTIPLIER );
+//      removeWsdRarities( conceptMap, wantedTuis, WSD_DIVISOR, ANAT_MULTIPLIER );
 //      removeUnwantedDrugs( conceptMap, wantedTuis );
       // Cull non-ANAT texts by ANAT texts as determined by ANAT tuis
 //      removeAnatTexts( conceptMap, wantedTuis );