You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by se...@apache.org on 2023/05/10 16:39:36 UTC
[ctakes] branch main updated: Dictionary creator without poor man wsd DictionaryComparator util
This is an automated email from the ASF dual-hosted git repository.
seanfinan pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/ctakes.git
The following commit(s) were added to refs/heads/main by this push:
new 065885e Dictionary creator without poor man wsd DictionaryComparator util
065885e is described below
commit 065885e46419775ba3647228ee19206c563a665d
Author: Sean Finan <se...@childrens.harvard.edu>
AuthorDate: Wed May 10 12:39:21 2023 -0400
Dictionary creator without poor man wsd
DictionaryComparator util
---
.../lookup2/util/tool/DictionaryComparator.java | 84 ++++++++++++++++++++++
.../ctakes/gui/dictionary/DictionaryBuilder.java | 2 +-
2 files changed, 85 insertions(+), 1 deletion(-)
diff --git a/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/util/tool/DictionaryComparator.java b/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/util/tool/DictionaryComparator.java
new file mode 100644
index 0000000..e778b15
--- /dev/null
+++ b/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/util/tool/DictionaryComparator.java
@@ -0,0 +1,84 @@
+package org.apache.ctakes.dictionary.lookup2.util.tool;
+
+import org.apache.ctakes.core.util.StringUtil;
+
+import java.io.*;
+import java.util.Collection;
+import java.util.HashSet;
+
+/**
+ * @author SPF , chip-nlp
+ * @since {5/10/2023}
+ */
+final public class DictionaryComparator {
+
+ static private final String INSERT_SQL = "INSERT INTO CUI_TERMS VALUES(";
+
+ public static void main( String... args ) {
+ final File script1 = new File( args[0] );
+ final File script2 = new File( args[1] );
+ compareScriptCuis( script1, script2 );
+ }
+
+ static private void compareScriptCuis( final File script1, final File script2 ) {
+ final int sql_length = INSERT_SQL.length();
+ final Collection<String> cuis1 = new HashSet<>();
+ final Collection<String> synonyms1 = new HashSet<>();
+ try ( BufferedReader reader = new BufferedReader( new FileReader( script1 ) ) ) {
+ String line = "";
+ while ( line != null ) {
+ if ( line.startsWith( INSERT_SQL ) ) {
+ line = line.substring( sql_length );
+ final String[] splits = StringUtil.fastSplit( line, ',' );
+ cuis1.add( splits[ 0 ] );
+ final String[] splitz = StringUtil.fastSplit( line, '\'' );
+ synonyms1.add( splitz[ 1 ] );
+ }
+ line = reader.readLine();
+ }
+ } catch ( IOException ioE ) {
+ System.err.println( ioE.getMessage() );
+ }
+ final Collection<String> cuis2 = new HashSet<>();
+ final Collection<String> synonyms2 = new HashSet<>();
+ try ( BufferedReader reader = new BufferedReader( new FileReader( script2 ) ) ) {
+ String line = "";
+ while ( line != null ) {
+ if ( line.startsWith( INSERT_SQL ) ) {
+ line = line.substring( sql_length );
+ final String[] splits = StringUtil.fastSplit( line, ',' );
+ cuis2.add( splits[ 0 ] );
+ final String[] splitz = StringUtil.fastSplit( line, '\'' );
+ synonyms2.add( splitz[ 1 ] );
+ }
+ line = reader.readLine();
+ }
+ } catch ( IOException ioE ) {
+ System.err.println( ioE.getMessage() );
+ }
+ System.out.println( "CUIs in " + script1.getName() + " : " + cuis1.size() );
+ System.out.println( "CUIs in " + script2.getName() + " : " + cuis2.size() );
+
+ final Collection<String> missingFrom1 = new HashSet<>( cuis1 );
+ missingFrom1.removeAll( cuis2 );
+ System.out.println( "CUIs in " + script1.getName() + " not in " + script2.getName() + " : " + missingFrom1.size() );
+
+ final Collection<String> missingFrom2 = new HashSet<>( cuis2 );
+ missingFrom2.removeAll( cuis1 );
+ System.out.println( "CUIs in " + script2.getName() + " not in " + script1.getName() + " : " + missingFrom2.size() );
+
+
+
+ System.out.println( "Synonyms in " + script1.getName() + " : " + synonyms1.size() );
+ System.out.println( "Synonyms in " + script2.getName() + " : " + synonyms2.size() );
+
+ final Collection<String> mizzingFrom1 = new HashSet<>( synonyms1 );
+ mizzingFrom1.removeAll( synonyms2 );
+ System.out.println( "Synonyms in " + script1.getName() + " not in " + script2.getName() + " : " + mizzingFrom1.size() );
+
+ final Collection<String> mizzingFrom2 = new HashSet<>( synonyms2 );
+ mizzingFrom2.removeAll( synonyms1 );
+ System.out.println( "Synonyms in " + script2.getName() + " not in " + script1.getName() + " : " + mizzingFrom2.size() );
+ }
+
+}
diff --git a/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/DictionaryBuilder.java b/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/DictionaryBuilder.java
index f89f1e1..09f3e66 100644
--- a/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/DictionaryBuilder.java
+++ b/ctakes-gui/src/main/java/org/apache/ctakes/gui/dictionary/DictionaryBuilder.java
@@ -72,7 +72,7 @@ final class DictionaryBuilder {
// Fill in information for all valid concepts
MrconsoParser.parseAllConcepts( umlsDirPath, conceptMap, wantedSources, wantedTargets, umlsTermUtil,
wantedLanguages, true, MIN_CHAR_LENGTH, MAX_CHAR_LENGTH, MAX_WORD_COUNT, MAX_SYM_COUNT );
- removeWsdRarities( conceptMap, wantedTuis, WSD_DIVISOR, ANAT_MULTIPLIER );
+// removeWsdRarities( conceptMap, wantedTuis, WSD_DIVISOR, ANAT_MULTIPLIER );
// removeUnwantedDrugs( conceptMap, wantedTuis );
// Cull non-ANAT texts by ANAT texts as determined by ANAT tuis
// removeAnatTexts( conceptMap, wantedTuis );