You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by tm...@apache.org on 2014/07/15 22:39:03 UTC

svn commit: r1610844 - in /ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/distsem: ./ DistanceMetrics.java WordEmbeddings.java WordVector.java WordVectorReader.java

Author: tmill
Date: Tue Jul 15 20:39:03 2014
New Revision: 1610844

URL: http://svn.apache.org/r1610844
Log:
CTAKES-304: Added classes for reading/querying neural word embeddings.

Added:
    ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/distsem/
    ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/distsem/DistanceMetrics.java
    ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/distsem/WordEmbeddings.java
    ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/distsem/WordVector.java
    ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/distsem/WordVectorReader.java

Added: ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/distsem/DistanceMetrics.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/distsem/DistanceMetrics.java?rev=1610844&view=auto
==============================================================================
--- ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/distsem/DistanceMetrics.java (added)
+++ ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/distsem/DistanceMetrics.java Tue Jul 15 20:39:03 2014
@@ -0,0 +1,89 @@
+package org.apache.ctakes.utils.distsem;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Scanner;
+
+/**
+ * Interactive command-line tool for querying a set of word embeddings.
+ * <p>
+ * After loading the embeddings file named by the first argument, it reads
+ * queries from standard input, one per line: a single word prints the words
+ * most similar to it with their similarity scores, and two comma-separated
+ * words print the similarity between them.
+ */
+public class DistanceMetrics {
+
+  /** Number of similar words printed for a single-word query. */
+  private static final int NUM_NEIGHBORS = 20;
+
+  private static final String PROMPT = "Enter a single word to get neighbors, or two comma separated words for similarity score (or <ctrl>+d to exit):";
+
+  /**
+   * Loads the embeddings, then answers queries until standard input ends.
+   *
+   * @param args {@code args[0]} is the path of the embeddings file
+   * @throws IOException if the embeddings file cannot be read
+   */
+  public static void main(String[] args) throws IOException{
+    if(args.length < 1){
+      // Give a usage hint rather than an ArrayIndexOutOfBoundsException.
+      System.err.println("Usage: DistanceMetrics <embeddings-file>");
+      return;
+    }
+    System.out.println("Reading embeddings...");
+    WordEmbeddings words = WordVectorReader.getEmbeddings(args[0]);
+
+    System.out.println("Word Distance similarities:");
+    System.out.println(PROMPT);
+    // try-with-resources closes the scanner even if a query throws.
+    try(Scanner scanner = new Scanner(System.in)){
+      while(scanner.hasNextLine()){
+        String[] input = scanner.nextLine().trim().split(",");
+
+        if(input.length == 1){
+          printNeighbors(words, input[0].trim());
+        }else if(input.length == 2){
+          printSimilarity(words, input[0].trim(), input[1].trim());
+        }else{
+          System.err.println("Input should be one or two words only!");
+        }
+        System.out.println(PROMPT);
+      }
+    }
+  }
+
+  /** Prints each word similar to {@code word} and its score, tab-separated, one per line. */
+  private static void printNeighbors(WordEmbeddings words, String word){
+    if(!words.containsKey(word)){
+      reportMissing(word);
+      return;
+    }
+    List<String> simWords = words.getSimilarWords(word, NUM_NEIGHBORS);
+    for(String neighbor : simWords){
+      System.out.print(neighbor);
+      System.out.print('\t');
+      System.out.print(words.getSimilarity(word, neighbor));
+      System.out.println();
+    }
+  }
+
+  /** Prints the similarity of two words, or an error if either one has no vector. */
+  private static void printSimilarity(WordEmbeddings words, String word1, String word2){
+    // Check both words up front: a similarity cannot be computed without both vectors.
+    for(String word : new String[]{word1, word2}){
+      if(!words.containsKey(word)){
+        reportMissing(word);
+        return;
+      }
+    }
+    double sim = words.getSimilarity(word1, word2);
+    System.out.println("Similarity of (" + word1 + ',' + word2 + ") = " + sim);
+  }
+
+  /** Reports on standard error that no vector is available for {@code word}. */
+  private static void reportMissing(String word){
+    System.err.println("Do not have vectors for your word: " + word);
+    System.err.flush();
+  }
+}

Added: ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/distsem/WordEmbeddings.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/distsem/WordEmbeddings.java?rev=1610844&view=auto
==============================================================================
--- ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/distsem/WordEmbeddings.java (added)
+++ ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/distsem/WordEmbeddings.java Tue Jul 15 20:39:03 2014
@@ -0,0 +1,173 @@
+package org.apache.ctakes.utils.distsem;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * An in-memory table of word vectors keyed by word, supporting cosine
+ * similarity and most-similar-word queries.
+ */
+public class WordEmbeddings {
+
+  private final Map<String,WordVector> vectors;
+  private final int dimensionality;
+
+  /**
+   * Creates an empty table to be filled through {@link #add(String)}.
+   *
+   * @param dim the declared number of dimensions of each vector
+   */
+  public WordEmbeddings(int dim){
+    this.vectors = new HashMap<>();
+    this.dimensionality = dim;
+  }
+
+  /**
+   * Wraps an existing word-to-vector map; the map is used directly, not copied.
+   *
+   * @param vectors the vectors to query; {@code null} is treated as empty
+   */
+  public WordEmbeddings(Map<String,WordVector> vectors){
+    this.vectors = (vectors != null) ? vectors : new HashMap<String,WordVector>();
+    // Take the dimensionality from the first vector present (0 if there is none)
+    // so that getDimensionality() is meaningful for this constructor as well.
+    int dim = 0;
+    for(WordVector vector : this.vectors.values()){
+      if(vector != null){
+        dim = vector.size();
+        break;
+      }
+    }
+    this.dimensionality = dim;
+  }
+
+  /**
+   * Computes the cosine similarity between the vectors of two words.
+   *
+   * @param word1 the first word
+   * @param word2 the second word
+   * @return the dot product of the two vectors divided by the product of their
+   *         lengths; NaN if either vector has zero length
+   * @throws IllegalArgumentException if either word has no vector, or if the
+   *         two vectors differ in size
+   */
+  public double getSimilarity(String word1, String word2){
+    WordVector vec1 = requireVector(word1);
+    WordVector vec2 = requireVector(word2);
+
+    // An explicit check, because an assert is skipped unless the JVM runs with -ea.
+    if(vec1.size() != vec2.size()){
+      throw new IllegalArgumentException("Vector sizes differ: " + word1 + " has "
+          + vec1.size() + ", " + word2 + " has " + vec2.size());
+    }
+    double sim = 0.0;
+    for(int i = 0; i < vec1.size(); i++){
+      sim += (vec1.getValue(i) * vec2.getValue(i));
+    }
+
+    return sim / (vec1.getLength()*vec2.getLength());
+  }
+
+  /**
+   * Parses one line of the form {@code word v1 v2 ... vn} and stores the vector,
+   * replacing any vector already stored for that word. Fields may be separated
+   * by any run of whitespace; blank lines are ignored.
+   *
+   * @param line the line to parse
+   * @throws IllegalArgumentException if the line holds a word but no values
+   * @throws NumberFormatException if a value is not a valid number
+   */
+  public void add(String line){
+    String[] fields = line.trim().split("\\s+");
+    if(fields[0].isEmpty()){
+      // Blank line, e.g. a trailing newline at the end of the file.
+      return;
+    }
+    if(fields.length < 2){
+      throw new IllegalArgumentException("No vector values for word: " + fields[0]);
+    }
+    double[] vector = new double[fields.length-1];
+    for(int i = 0; i < vector.length; i++){
+      vector[i] = Double.parseDouble(fields[i+1]);
+    }
+    vectors.put(fields[0], new WordVector(fields[0], vector));
+  }
+
+  /** Returns whether a vector is stored for {@code word}. */
+  public boolean containsKey(String word){
+    return vectors.containsKey(word);
+  }
+
+  /** Returns the vector stored for {@code word}, or {@code null} if there is none. */
+  public WordVector getVector(String word){
+    return vectors.get(word);
+  }
+
+  /**
+   * Returns the dimensionality given to the constructor, or, for the map
+   * constructor, the size of the first vector found in the map (0 if none).
+   */
+  public int getDimensionality(){
+    return this.dimensionality;
+  }
+
+  /**
+   * Finds the words most similar to {@code word}, excluding the word itself.
+   *
+   * @param word the query word
+   * @param maxWords the maximum number of words to return
+   * @return at most {@code maxWords} words in order of decreasing similarity;
+   *         empty if {@code word} has no vector or {@code maxWords} is not positive
+   */
+  public List<String> getSimilarWords(String word, int maxWords) {
+    List<String> words = new ArrayList<>();
+    if(maxWords <= 0 || getVector(word) == null){
+      return words;
+    }
+    // Never allocate more slots than there are candidate words.
+    int limit = Math.min(maxWords, vectors.size());
+    double[] sims = new double[limit];
+    for(String comp : vectors.keySet()){
+      if(word.equals(comp)){
+        continue;
+      }
+      double sim = getSimilarity(word, comp);
+      if(Double.isNaN(sim)){
+        // Zero-length vectors have no defined similarity and would break the ordering.
+        continue;
+      }
+      boolean full = (words.size() == limit);
+      if(full && sim < sims[limit-1]){
+        // most words won't be greater than the minimum similarity -- quit right away
+        continue;
+      }
+      // Start at the last slot (appending one while there is still room) and
+      // shift lower-scoring entries right until the new word's position is found.
+      int pos = limit-1;
+      if(!full){
+        pos = words.size();
+        words.add(comp);
+      }
+      while(pos > 0 && sim > sims[pos-1]){
+        sims[pos] = sims[pos-1];
+        words.set(pos, words.get(pos-1));
+        pos--;
+      }
+      sims[pos] = sim;
+      words.set(pos, comp);
+    }
+    return words;
+  }
+
+  /** Returns the vector for {@code word}, failing with a descriptive message if absent. */
+  private WordVector requireVector(String word){
+    WordVector vector = vectors.get(word);
+    if(vector == null){
+      throw new IllegalArgumentException("No vector for word: " + word);
+    }
+    return vector;
+  }
+}

Added: ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/distsem/WordVector.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/distsem/WordVector.java?rev=1610844&view=auto
==============================================================================
--- ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/distsem/WordVector.java (added)
+++ ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/distsem/WordVector.java Tue Jul 15 20:39:03 2014
@@ -0,0 +1,58 @@
+package org.apache.ctakes.utils.distsem;
+
+/**
+ * A word paired with its vector of real-valued components.
+ */
+public class WordVector {
+
+  String word;
+  double[] vector;
+  int size;
+
+  /**
+   * Keeps the given word and array (the array is stored by reference, not
+   * copied) and records the array's length as this vector's size.
+   *
+   * @param word the word the vector belongs to
+   * @param vector the component values
+   */
+  public WordVector(String word, double[] vector){
+    this.size = vector.length;
+    this.vector = vector;
+    this.word = word;
+  }
+
+  /**
+   * Returns the component at index {@code i}.
+   *
+   * @throws ArrayIndexOutOfBoundsException if {@code i} is negative or not
+   *         below the array's length
+   */
+  public double getValue(int i){
+    boolean outOfRange = i < 0 || i >= vector.length;
+    if(outOfRange){
+      throw new ArrayIndexOutOfBoundsException();
+    }
+    return vector[i];
+  }
+
+  /**
+   * Returns the Euclidean length of the vector: the square root of the sum of
+   * the squares of its first {@link #size()} components.
+   */
+  public double getLength(){
+    double sumOfSquares = 0;
+    int i = 0;
+    while(i < size){
+      double value = vector[i];
+      sumOfSquares += value*value;
+      i++;
+    }
+    return Math.sqrt(sumOfSquares);
+  }
+
+  /** Returns the number of components recorded for this vector. */
+  public int size(){
+    return size;
+  }
+}

Added: ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/distsem/WordVectorReader.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/distsem/WordVectorReader.java?rev=1610844&view=auto
==============================================================================
--- ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/distsem/WordVectorReader.java (added)
+++ ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/distsem/WordVectorReader.java Tue Jul 15 20:39:03 2014
@@ -0,0 +1,85 @@
+package org.apache.ctakes.utils.distsem;
+
+import java.io.BufferedReader;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.nio.charset.StandardCharsets;
+import java.util.NoSuchElementException;
+import java.util.Scanner;
+
+/**
+ * Reads word embeddings from text: a header line holding the number of words
+ * and the vector dimensionality, then one line per word that is handed to
+ * {@link WordEmbeddings#add(String)}.
+ */
+public class WordVectorReader {
+  private WordEmbeddings embeddings = null;
+  private int dimensionality = 0;
+  private int numWords = 0;
+
+  /**
+   * Reads all embeddings from the stream, which is closed before returning,
+   * whether or not reading succeeds.
+   *
+   * @param in the stream to read, decoded as UTF-8
+   * @throws IOException if the stream cannot be read, is empty, or its header
+   *         line does not start with two integers
+   */
+  public WordVectorReader(InputStream in) throws IOException{
+    // try-with-resources so the stream is released even when parsing fails.
+    // The charset is fixed so results do not depend on the platform default.
+    try(BufferedReader reader = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8))){
+      String line = reader.readLine();
+      if(line == null){
+        throw new IOException("Embeddings input is empty: missing header line");
+      }
+      try(Scanner scanner = new Scanner(line)){
+        numWords = scanner.nextInt();
+        dimensionality = scanner.nextInt();
+      }catch(NoSuchElementException e){
+        // Also covers InputMismatchException, which is a subclass.
+        throw new IOException("Malformed header line (expected \"<numWords> <dimensionality>\"): " + line, e);
+      }
+      embeddings = new WordEmbeddings(dimensionality);
+
+      while((line = reader.readLine()) != null){
+        line = line.trim();
+        if(line.isEmpty()){
+          // Tolerate blank lines, such as one at the end of the file.
+          continue;
+        }
+        embeddings.add(line);
+      }
+    }
+  }
+
+  /** Returns the embeddings read by the constructor. */
+  public WordEmbeddings getEmbeddings(){
+    return this.embeddings;
+  }
+
+  /**
+   * Reads embeddings from the named file.
+   *
+   * @param fn path of the embeddings file
+   * @throws IOException if the file cannot be opened or read
+   */
+  public static WordEmbeddings getEmbeddings(String fn) throws IOException{
+    try(InputStream in = new FileInputStream(fn)){
+      return getEmbeddings(in);
+    }
+  }
+
+  /**
+   * Reads embeddings from the stream, which is closed before returning.
+   *
+   * @param in the stream to read
+   * @throws IOException if the stream cannot be read or is malformed
+   */
+  public static WordEmbeddings getEmbeddings(InputStream in) throws IOException {
+    WordVectorReader reader = new WordVectorReader(in);
+    return reader.getEmbeddings();
+  }
+}