You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by tm...@apache.org on 2014/07/15 22:39:03 UTC
svn commit: r1610844 - in
/ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/distsem:
./ DistanceMetrics.java WordEmbeddings.java WordVector.java
WordVectorReader.java
Author: tmill
Date: Tue Jul 15 20:39:03 2014
New Revision: 1610844
URL: http://svn.apache.org/r1610844
Log:
CTAKES-304: Added classes for reading/querying neural word embeddings.
Added:
ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/distsem/
ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/distsem/DistanceMetrics.java
ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/distsem/WordEmbeddings.java
ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/distsem/WordVector.java
ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/distsem/WordVectorReader.java
Added: ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/distsem/DistanceMetrics.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/distsem/DistanceMetrics.java?rev=1610844&view=auto
==============================================================================
--- ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/distsem/DistanceMetrics.java (added)
+++ ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/distsem/DistanceMetrics.java Tue Jul 15 20:39:03 2014
@@ -0,0 +1,79 @@
+package org.apache.ctakes.utils.distsem;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Scanner;
+
+/**
+ * Interactive console tool for exploring word embeddings. A single word prints its
+ * nearest neighbors; two comma-separated words print their similarity score.
+ */
+public class DistanceMetrics {
+
+  /** Number of nearest neighbors printed for a single-word query. */
+  private static final int NUM_NEIGHBORS = 20;
+
+  /**
+   * Loads the embeddings file named by {@code args[0]}, then answers queries read from
+   * standard input until end of input.
+   *
+   * @param args command line; {@code args[0]} is the path of the embeddings file
+   * @throws IOException if the embeddings file cannot be read
+   */
+  public static void main(String[] args) throws IOException {
+    if (args.length < 1) {
+      // Give a usage hint instead of failing with ArrayIndexOutOfBoundsException.
+      System.err.println("Usage: DistanceMetrics <embeddings-file>");
+      System.exit(1);
+    }
+    System.out.println("Reading embeddings...");
+    WordEmbeddings words = WordVectorReader.getEmbeddings(args[0]);
+
+    String prompt = "Enter a single word to get neighbors, or two comma separated words for similarity score (or <ctrl>+d to exit):";
+    System.out.println("Word Distance similarities:");
+    System.out.println(prompt);
+    try (Scanner scanner = new Scanner(System.in)) {
+      while (scanner.hasNextLine()) {
+        String[] input = scanner.nextLine().trim().split(",");
+        if (input.length == 1) {
+          printNeighbors(words, input[0].trim());
+        } else if (input.length == 2) {
+          printSimilarity(words, input[0].trim(), input[1].trim());
+        } else {
+          System.err.println("Input should be one or two words only!");
+        }
+        System.out.println(prompt);
+      }
+    }
+  }
+
+  /** Prints the nearest neighbors of {@code word}, one per line, each followed by its similarity. */
+  private static void printNeighbors(WordEmbeddings words, String word) {
+    if (!hasVector(words, word)) {
+      return;
+    }
+    List<String> simWords = words.getSimilarWords(word, NUM_NEIGHBORS);
+    for (String neighbor : simWords) {
+      System.out.println(neighbor + "\t" + words.getSimilarity(word, neighbor));
+    }
+  }
+
+  /** Prints the similarity of two words, provided both have vectors. */
+  private static void printSimilarity(WordEmbeddings words, String first, String second) {
+    // Check both words up front: getSimilarity cannot handle a word without a vector.
+    if (hasVector(words, first) && hasVector(words, second)) {
+      double sim = words.getSimilarity(first, second);
+      System.out.println("Similarity of (" + first + "," + second + ") = " + sim);
+    }
+  }
+
+  /** Returns whether {@code word} has a vector, reporting on standard error when it does not. */
+  private static boolean hasVector(WordEmbeddings words, String word) {
+    if (words.containsKey(word)) {
+      return true;
+    }
+    System.err.println("Do not have vectors for your word: " + word);
+    System.err.flush();
+    return false;
+  }
+}
Added: ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/distsem/WordEmbeddings.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/distsem/WordEmbeddings.java?rev=1610844&view=auto
==============================================================================
--- ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/distsem/WordEmbeddings.java (added)
+++ ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/distsem/WordEmbeddings.java Tue Jul 15 20:39:03 2014
@@ -0,0 +1,152 @@
+package org.apache.ctakes.utils.distsem;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * In-memory vocabulary of word vectors supporting cosine-similarity and
+ * nearest-neighbor queries.
+ */
+public class WordEmbeddings {
+
+  private final Map<String,WordVector> vectors;
+  private final int dimensionality;
+
+  /**
+   * Creates an empty vocabulary.
+   *
+   * @param dim value reported by {@link #getDimensionality()}
+   */
+  public WordEmbeddings(int dim) {
+    this.vectors = new HashMap<>();
+    this.dimensionality = dim;
+  }
+
+  /**
+   * Wraps an existing word-to-vector map; the map is used directly, not copied.
+   *
+   * @param vectors vectors keyed by word; {@code null} is treated as an empty vocabulary
+   */
+  public WordEmbeddings(Map<String,WordVector> vectors) {
+    this.vectors = (vectors != null) ? vectors : new HashMap<String,WordVector>();
+    // Take the dimensionality from the first vector found rather than leaving it at 0.
+    int dim = 0;
+    for (WordVector vector : this.vectors.values()) {
+      if (vector != null) {
+        dim = vector.size();
+        break;
+      }
+    }
+    this.dimensionality = dim;
+  }
+
+  /**
+   * Computes the cosine similarity of two words: the dot product of their vectors
+   * divided by the product of the vector lengths.
+   *
+   * @param word1 first word
+   * @param word2 second word
+   * @return the cosine similarity; NaN when either vector has length zero
+   * @throws IllegalArgumentException if either word has no vector or the vector sizes differ
+   */
+  public double getSimilarity(String word1, String word2) {
+    WordVector vec1 = getVector(word1);
+    WordVector vec2 = getVector(word2);
+    if (vec1 == null || vec2 == null) {
+      throw new IllegalArgumentException("No vector for word: " + (vec1 == null ? word1 : word2));
+    }
+    if (vec1.size() != vec2.size()) {
+      throw new IllegalArgumentException("Vector sizes differ: " + vec1.size() + " vs " + vec2.size());
+    }
+    double dot = 0.0;
+    for (int i = 0; i < vec1.size(); i++) {
+      dot += vec1.getValue(i) * vec2.getValue(i);
+    }
+    return dot / (vec1.getLength() * vec2.getLength());
+  }
+
+  /**
+   * Parses a line of the form {@code word v1 v2 ... vn} and stores the vector under
+   * the word. Fields may be separated by any run of whitespace; blank lines are ignored.
+   *
+   * @param line one line of a text-format embeddings file
+   * @throws IllegalArgumentException if the line holds a word but no values
+   * @throws NumberFormatException if a value cannot be parsed as a double
+   */
+  public void add(String line) {
+    String[] fields = line.trim().split("\\s+");
+    if (fields[0].isEmpty()) {
+      return;
+    }
+    if (fields.length < 2) {
+      throw new IllegalArgumentException("No vector values on line: " + line);
+    }
+    double[] vector = new double[fields.length - 1];
+    for (int i = 0; i < vector.length; i++) {
+      vector[i] = Double.parseDouble(fields[i + 1]);
+    }
+    vectors.put(fields[0], new WordVector(fields[0], vector));
+  }
+
+  /** Returns whether a vector is stored for {@code word}. */
+  public boolean containsKey(String word) {
+    return vectors.containsKey(word);
+  }
+
+  /** Returns the vector stored for {@code word}, or {@code null} if there is none. */
+  public WordVector getVector(String word) {
+    return vectors.get(word);
+  }
+
+  /** Returns the dimensionality given at construction, or inferred from a supplied map. */
+  public int getDimensionality() {
+    return this.dimensionality;
+  }
+
+  /**
+   * Finds the words whose vectors are most similar to that of {@code word}.
+   *
+   * @param word the query word; it is never part of the result
+   * @param maxWords the maximum number of neighbors to return
+   * @return up to {@code maxWords} words ordered from most to least similar; empty if
+   *         {@code maxWords} is not positive or {@code word} has no vector
+   */
+  public List<String> getSimilarWords(String word, int maxWords) {
+    if (maxWords <= 0 || !containsKey(word)) {
+      return new ArrayList<>();
+    }
+    // Never track more entries than the vocabulary holds, however large maxWords is.
+    int limit = Math.min(maxWords, vectors.size());
+    // sims[i] is the similarity of words.get(i); both stay in descending order.
+    double[] sims = new double[limit];
+    List<String> words = new ArrayList<>(limit);
+    for (String comp : vectors.keySet()) {
+      if (word.equals(comp)) {
+        continue;
+      }
+      double sim = getSimilarity(word, comp);
+      int count = words.size();
+      if (Double.isNaN(sim) || (count == limit && sim <= sims[count - 1])) {
+        // Undefined score, or the list is full and this word does not beat its weakest entry.
+        continue;
+      }
+      // Walk back from the end to the slot where the new score keeps the order.
+      int pos = count;
+      while (pos > 0 && sim > sims[pos - 1]) {
+        pos--;
+      }
+      if (count == limit) {
+        // Drop the weakest entry to make room.
+        words.remove(count - 1);
+        count--;
+      }
+      System.arraycopy(sims, pos, sims, pos + 1, count - pos);
+      sims[pos] = sim;
+      words.add(pos, comp);
+    }
+    return words;
+  }
+}
Added: ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/distsem/WordVector.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/distsem/WordVector.java?rev=1610844&view=auto
==============================================================================
--- ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/distsem/WordVector.java (added)
+++ ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/distsem/WordVector.java Tue Jul 15 20:39:03 2014
@@ -0,0 +1,54 @@
+package org.apache.ctakes.utils.distsem;
+
+/** A word paired with its embedding vector. */
+public class WordVector {
+
+  String word;
+  double[] vector;
+  int size;
+
+  /**
+   * Creates a vector for a word. The array is stored as given, not copied.
+   *
+   * @param word the word the vector belongs to
+   * @param vector the components of the vector
+   * @throws NullPointerException if {@code vector} is {@code null}
+   */
+  public WordVector(String word, double[] vector) {
+    if (vector == null) {
+      throw new NullPointerException("vector");
+    }
+    this.word = word;
+    this.vector = vector;
+    this.size = vector.length;
+  }
+
+  /**
+   * Returns one component of the vector.
+   *
+   * @param i zero-based index of the component
+   * @return the value at index {@code i}
+   * @throws ArrayIndexOutOfBoundsException if {@code i} is negative or not less than the vector's length
+   */
+  public double getValue(int i) {
+    if (i < 0 || i >= vector.length) {
+      throw new ArrayIndexOutOfBoundsException(
+          "Index " + i + " out of bounds for vector of length " + vector.length);
+    }
+    return vector[i];
+  }
+
+  /** Returns the Euclidean length of the vector. */
+  public double getLength() {
+    double sumOfSquares = 0;
+    for (int i = 0; i < size; i++) {
+      sumOfSquares += vector[i] * vector[i];
+    }
+    return Math.sqrt(sumOfSquares);
+  }
+
+  /** Returns the number of components in the vector. */
+  public int size() {
+    return size;
+  }
+}
Added: ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/distsem/WordVectorReader.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/distsem/WordVectorReader.java?rev=1610844&view=auto
==============================================================================
--- ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/distsem/WordVectorReader.java (added)
+++ ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/distsem/WordVectorReader.java Tue Jul 15 20:39:03 2014
@@ -0,0 +1,86 @@
+package org.apache.ctakes.utils.distsem;
+
+import java.io.BufferedReader;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.nio.charset.StandardCharsets;
+import java.util.Scanner;
+
+/**
+ * Reads word embeddings from text: a header line {@code <numWords> <dimensionality>}
+ * followed by one {@code word v1 v2 ... vn} line per word.
+ */
+public class WordVectorReader {
+  private WordEmbeddings embeddings = null;
+  private int dimensionality = 0;
+  private int numWords = 0;
+
+  /**
+   * Reads all embeddings from a stream, decoding it as UTF-8. The stream is closed
+   * before this constructor returns, whether or not reading succeeds.
+   *
+   * @param in stream positioned at the header line
+   * @throws IOException if the stream cannot be read, is empty, or has a malformed header
+   */
+  public WordVectorReader(InputStream in) throws IOException {
+    try (BufferedReader reader = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8))) {
+      String header = reader.readLine();
+      if (header == null) {
+        throw new IOException("Embeddings input is empty; expected a header line");
+      }
+      String[] counts = header.trim().split("\\s+");
+      if (counts.length < 2) {
+        throw new IOException("Malformed header line: " + header);
+      }
+      try {
+        numWords = Integer.parseInt(counts[0]);
+        dimensionality = Integer.parseInt(counts[1]);
+      } catch (NumberFormatException e) {
+        throw new IOException("Malformed header line: " + header, e);
+      }
+      embeddings = new WordEmbeddings(dimensionality);
+
+      String line;
+      while ((line = reader.readLine()) != null) {
+        String entry = line.trim();
+        // Skip blank lines (e.g. trailing newlines), which do not hold a word and vector.
+        if (!entry.isEmpty()) {
+          embeddings.add(entry);
+        }
+      }
+    }
+  }
+
+  /** Returns the embeddings read by the constructor. */
+  public WordEmbeddings getEmbeddings() {
+    return this.embeddings;
+  }
+
+  /**
+   * Reads embeddings from a file.
+   *
+   * @param fn path of the embeddings file
+   * @return the embeddings read from the file
+   * @throws IOException if the file cannot be opened or read
+   */
+  public static WordEmbeddings getEmbeddings(String fn) throws IOException {
+    // try-with-resources guarantees the file handle is released even if reading fails.
+    try (InputStream in = new FileInputStream(fn)) {
+      return getEmbeddings(in);
+    }
+  }
+
+  /**
+   * Reads embeddings from a stream; the stream is closed when reading finishes.
+   *
+   * @param in stream positioned at the header line
+   * @return the embeddings read from the stream
+   * @throws IOException if the stream cannot be read, is empty, or has a malformed header
+   */
+  public static WordEmbeddings getEmbeddings(InputStream in) throws IOException {
+    WordVectorReader reader = new WordVectorReader(in);
+    return reader.getEmbeddings();
+  }
+}