You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by tm...@apache.org on 2013/05/21 13:34:23 UTC
svn commit: r1484764 - in /ctakes/sandbox/ctakes-spelling-corrector: ./ .settings/ lib/ src/org/apache/ctakes/spelling/mistakes/ src/org/apache/ctakes/spelling/mistakes/edit/ src/org/apache/ctakes/spelling/test/

Author: tmill
Date: Tue May 21 11:34:23 2013
New Revision: 1484764

URL: http://svn.apache.org/r1484764
Log:
Addresses ctakes-58: Adds code to build pseudo-supervised training examples from dictionary and context data.

Added:
    ctakes/sandbox/ctakes-spelling-corrector/README
    ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/GenerateTrainingExamples.java
Removed:
    ctakes/sandbox/ctakes-spelling-corrector/lib/uimafit-1.2.0.jar
Modified:
    ctakes/sandbox/ctakes-spelling-corrector/   (props changed)
    ctakes/sandbox/ctakes-spelling-corrector/.settings/org.eclipse.jdt.core.prefs
    ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/GenerateContextTriples.java
    ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/GenerateTermNeighborhoods.java
    ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/edit/SimpleEditDistanceModel.java
    ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/test/TestTrie.java

Propchange: ctakes/sandbox/ctakes-spelling-corrector/
------------------------------------------------------------------------------
--- svn:ignore (added)
+++ svn:ignore Tue May 21 11:34:23 2013
@@ -0,0 +1 @@
+outputs

Modified: ctakes/sandbox/ctakes-spelling-corrector/.settings/org.eclipse.jdt.core.prefs
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-spelling-corrector/.settings/org.eclipse.jdt.core.prefs?rev=1484764&r1=1484763&r2=1484764&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-spelling-corrector/.settings/org.eclipse.jdt.core.prefs (original)
+++ ctakes/sandbox/ctakes-spelling-corrector/.settings/org.eclipse.jdt.core.prefs Tue May 21 11:34:23 2013
@@ -8,4 +8,5 @@ org.eclipse.jdt.core.compiler.debug.loca
 org.eclipse.jdt.core.compiler.debug.sourceFile=generate
 org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
 org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
+org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
 org.eclipse.jdt.core.compiler.source=1.6

Added: ctakes/sandbox/ctakes-spelling-corrector/README
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-spelling-corrector/README?rev=1484764&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-spelling-corrector/README (added)
+++ ctakes/sandbox/ctakes-spelling-corrector/README Tue May 21 11:34:23 2013
@@ -0,0 +1,18 @@
+Standard workflow for unsupervised training of spellchecker:
+
+1) Generate a file of word unigram counts using UnigramPriorGenerator
+2) Sort the list by count, highest first, with sort -k3 -nr, writing the result to a new sorted file.
+3) Prune list using heuristic models
+  For example:
+  a) Plot data in gnuplot (log y for visibility)
+  b) Choose cutoff point where curve flattens
+  c) Inspect bottom of list for typo/rare word ratio.
+  d) If there are no 'real' words left, go back to b) and select a new point accordingly
+  e) Shorten sorted list to desired length to create dictionary.txt (retaining counts information)
+  f) Replace all " : " with "\t" in dictionary file for trie data structure.
+4) Create frequency-filtered term neighborhoods (edit distance-based errors) using GenerateTermNeighborhoods:
+   Inputs: Dictionary file
+   Output: term neighborhood file
+5) Create context triples using GenerateContextTriples:
+   Inputs: Neighborhood file (see step 4), raw text directory, output directory name
+   Output: Context triple file for each term in dictionary

Modified: ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/GenerateContextTriples.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/GenerateContextTriples.java?rev=1484764&r1=1484763&r2=1484764&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/GenerateContextTriples.java (original)
+++ ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/GenerateContextTriples.java Tue May 21 11:34:23 2013
@@ -45,8 +45,9 @@ public class GenerateContextTriples {
 
 	/**
 	 * @param args
+	 * @throws IOException 
 	 */
-	public static void main(String[] args) {
+	public static void main(String[] args) throws IOException {
 		if(args.length < 3){
 			System.err.println("Required arguments: <neighborhood file> <input files> <output dir>");
 			System.exit(-1);
@@ -55,6 +56,9 @@ public class GenerateContextTriples {
 		HashMap<String,String[]> dict = new HashMap<String,String[]>();
 //		HashMap<String,ClusterNode> dict = new HashMap<String,ClusterNode>();
 		HashMap<String,CounterMap<String>> contexts = new HashMap<String,CounterMap<String>>();
+//		File hashStorage = Files.createTempDir();
+//		DiskBackedMap<String, CounterDiskMap<String>> contexts = 
+//		    new DiskBackedMap<String, CounterDiskMap<String>>(hashStorage.getAbsolutePath());
 //		
 		Scanner scanner = null;
 		try {
@@ -117,25 +121,35 @@ public class GenerateContextTriples {
 		String prev = null;
 		String cur = null;
 		String next = null;
+		JCas jcas = null;
+		// this creates a huge hashmap -- maybe better off writing to disk if we have the space?
+		// maybe a disk-based map so we can have counts rather than raw lines?
 		while(casIter.hasNext()){
-			JCas jcas = casIter.next();
+		  try{
+		    jcas = casIter.next();
+		  }catch(Exception e){
+		    System.err.println("Skipping example.");
+		    continue;
+		  }
 			Collection<Sentence> sents = JCasUtil.select(jcas, Sentence.class);
 			for(Sentence sent : sents){
 				List<BaseToken> tokens = JCasUtil.selectCovered(jcas, BaseToken.class, sent);
 				for(int i = 0; i < tokens.size(); i++){
-					cur = tokens.get(i).getCoveredText();
+					cur = tokens.get(i).getCoveredText().toLowerCase();
 					if(!dict.containsKey(cur)) continue;
+					if(dict.size() == 1) continue;
 					if(i == 0){
 						prev = "<Start>";
 					}else{
-						prev = tokens.get(i-1).getCoveredText();
+						prev = tokens.get(i-1).getCoveredText().toLowerCase();
 					}
 					if(i == tokens.size()-1){
 						next = "<End>";
 					}else{
-						next = tokens.get(i+1).getCoveredText();
+						next = tokens.get(i+1).getCoveredText().toLowerCase();
 					}
 					if(!contexts.containsKey(cur)){
+//						contexts.put(cur, new CounterDiskMap<String>());
 						contexts.put(cur, new CounterMap<String>());
 					}
 					contexts.get(cur).add(prev + "-" + next);
@@ -143,11 +157,14 @@ public class GenerateContextTriples {
 			}
 		}
 		
+		File outputDir = new File(args[2]);
+		if(!outputDir.exists()) outputDir.mkdir();
+		
 		// for each word write the contexts and their values
 		for(String word : contexts.keySet()){
 			PrintWriter out;
 			try {
-				out = new PrintWriter(new File(args[2], word));
+				out = new PrintWriter(new File(outputDir, word));
 				for(String context : contexts.get(word).keySet()){
 					out.print(context);
 					out.print(" : ");
@@ -160,6 +177,7 @@ public class GenerateContextTriples {
 				System.exit(-1);
 			}
 		}
+//		contexts.close();
 		System.out.printf("Completed in %f seconds\n", (System.currentTimeMillis()-start) / 1000.0);
 	}
 	
@@ -172,5 +190,4 @@ public class GenerateContextTriples {
 			return word.equals(((ClusterNode)n).word);
 		}
 	}
-
 }

Modified: ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/GenerateTermNeighborhoods.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/GenerateTermNeighborhoods.java?rev=1484764&r1=1484763&r2=1484764&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/GenerateTermNeighborhoods.java (original)
+++ ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/GenerateTermNeighborhoods.java Tue May 21 11:34:23 2013
@@ -22,14 +22,19 @@ import java.io.File;
 import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.PrintWriter;
+import java.util.ArrayList;
+import java.util.HashMap;
 import java.util.HashSet;
+import java.util.List;
 import java.util.Scanner;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 
 import org.apache.commons.lang.StringUtils;
-import org.apache.lucene.search.suggest.jaspell.JaspellTernarySearchTrie;
+import org.apache.ctakes.spelling.util.JaspellTernarySearchTrie;
 
 public class GenerateTermNeighborhoods {
-
+  public static final Pattern garbage = Pattern.compile("^\\d+\\p{Alpha}+");
 	/**
 	 * @param args
 	 */
@@ -41,6 +46,8 @@ public class GenerateTermNeighborhoods {
 		
 		long start = System.currentTimeMillis();
 		JaspellTernarySearchTrie trie=null;
+		HashMap<String,Integer> wordCounts = new HashMap<String,Integer>();
+		
 		try {
 			trie = new JaspellTernarySearchTrie(new File(args[0]));
 		} catch (IOException e) {
@@ -64,6 +71,15 @@ public class GenerateTermNeighborhoods {
 		
 		while(scanner.hasNextLine()){
 			String word = scanner.next();
+			
+			// make sure it isn't one of those garbage words like 2glasses which is a mistake but
+			// is not a simple misspelling.
+			Matcher m = garbage.matcher(word);
+			if(m.matches()){
+			  scanner.nextLine();
+			  continue;
+			}
+			
 			int maxDiff;
 			if(word.length() <= 4){
 				maxDiff = 1;
@@ -72,17 +88,22 @@ public class GenerateTermNeighborhoods {
 			}else{
 				maxDiff = 3;
 			}
+//			float count = (float) scanner.nextInt();
+			
 			float count = (Float) trie.get(word);
 			HashSet<String> neighbors = new HashSet<String>();
-			for(int diff = 0; diff <= maxDiff; diff++){
-				trie.setMatchAlmostDiff(diff);
-				neighbors.addAll(trie.matchAlmost(word));				
+
+			for(int diff = 1; diff <= maxDiff; diff++){
+			  trie.setMatchAlmostDiff(diff);
+			  neighbors.addAll(trie.matchAlmost(word));			  
 			}
+
+//			HashSet<String> neighbors = getEditDistanceWords(word, maxDiff, trie);
 			
 			HashSet<String> toRemove = new HashSet<String>();
 			for(String neighbor : neighbors){
-				float nCount = (Float) trie.get(neighbor);
-				if(count / nCount < 10){
+				Float nCount = (Float) trie.get(neighbor);
+				if(nCount == null || nCount / count < 10){
 					toRemove.add(neighbor);
 				}
 			}
@@ -100,5 +121,75 @@ public class GenerateTermNeighborhoods {
 		}
 		System.out.printf("Completed after %f seconds\n", (System.currentTimeMillis()-start) / 1000.0);
 	}
+	
+//  private static void addInsDelWords(int errs, String word, List<String> insDelWords, int pos) {
+//    for(int i = pos; i < word.length(); i++){
+//      if(i == 0){
+//        
+//      }
+//    }
+//  }
+  private static HashSet<String> getEditDistanceWords(String word, int d, JaspellTernarySearchTrie trie) {
+    HashSet<String> ed1words = new HashSet<String>();
+    StringBuilder buff = new StringBuilder(word);
+    
+    char curChar;
+    for(int i = 0; i < word.length(); i++){
+      curChar = word.charAt(i);
+      
+      // first delete
+      buff.deleteCharAt(i);
+      if(i == 0 || trie.matchPrefix(buff.substring(0,i)).size() > 0){
+        ed1words.add(buff.toString().trim());
+      }
+      // restore
+      buff.insert(i, curChar);
+      
+      for(char c = 'a'; c < 'z'; c++){
+        // insertion
+        buff.insert(i, c);
+        ed1words.add(buff.toString().trim());
+        // restore
+        buff.deleteCharAt(i);
+        
+        // now substitution
+        if(c != curChar){
+          buff.setCharAt(i, c);
+        }
+        ed1words.add(buff.toString().trim());
+        // restore
+        buff.setCharAt(i, curChar);
+      }
+      
+      if(i+1 < word.length()){
+        // swap
+        buff.setCharAt(i, word.charAt(i+1));
+        buff.setCharAt(i+1, curChar);
+        ed1words.add(buff.toString().trim());
+        // restore
+        buff.setCharAt(i+1, buff.charAt(i));
+        buff.setCharAt(i, curChar);
+      }
+    }
+    
+    // now try adding at the end of word
+    buff.append(' ');
+    int endInd = buff.length()-1;
+    for(char c = 'a'; c < 'z'; c++){
+      buff.setCharAt(endInd, c);
+      ed1words.add(buff.toString());
+    }
+    
+    if(d == 1){
+      return ed1words;
+    }else{
+      HashSet<String> allWords = new HashSet<String>();
+      allWords.addAll(ed1words);
+      for(String w : ed1words){
+        allWords.addAll(getEditDistanceWords(w, d-1, trie));
+      }
+      return allWords;
+    }
+  }
 
 }

Added: ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/GenerateTrainingExamples.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/GenerateTrainingExamples.java?rev=1484764&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/GenerateTrainingExamples.java (added)
+++ ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/GenerateTrainingExamples.java Tue May 21 11:34:23 2013
@@ -0,0 +1,81 @@
+package org.apache.ctakes.spelling.mistakes;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.PrintWriter;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Scanner;
+
+public class GenerateTrainingExamples {
+
+  public static int THRESHOLD = 2;
+  
+  /**
+   * @param args
+   * @throws FileNotFoundException 
+   */
+  public static void main(String[] args) throws FileNotFoundException {
+    if(args.length < 3){
+      System.err.println("Required arguments: <term neighborhood file> <contexts directory> <output file>");
+      System.exit(-1);
+    }
+    
+    Scanner scanner = new Scanner(new File(args[0]));
+    File conDir = new File(args[1]);
+    if(!conDir.exists() || !conDir.isDirectory()){
+      System.err.println("Argument 2 must be a directory of contexts!");
+      System.exit(-1);
+    }
+    PrintWriter out = new PrintWriter(args[2]);
+        
+    while(scanner.hasNextLine()){
+      String line = scanner.nextLine().trim();
+      String[] pair = line.split(" : ");
+      if(pair.length == 1) continue;
+      String word = pair[0];
+      String[] neighbors = pair[1].split(":");
+      if(neighbors.length == 0) continue;
+      
+      Map<String,Integer> wordContexts = getWordContexts(conDir, word);
+      
+      for(String neighbor : neighbors){
+        Map<String,Integer> neighborContexts = getWordContexts(conDir, neighbor);
+        int numErrors = 0;
+        for(String context : wordContexts.keySet()){
+          if(!neighborContexts.containsKey(context)) continue;
+          if(neighborContexts.get(context) > wordContexts.get(context)){
+            System.err.println("Found context with likely mistake: " + context);
+            numErrors++;
+          }
+        }
+        if(numErrors == 0) continue;
+        out.printf("%s\t%s : %d\n", word, neighbor, numErrors);
+        out.flush();
+        break;
+      }
+    }
+    out.close();
+    scanner.close();
+  }
+
+  private static Map<String,Integer> getWordContexts(File contextDir, String word) throws FileNotFoundException{
+    HashMap<String,Integer> wordCounts = new HashMap<String,Integer>();
+    
+    File contextFile = new File(contextDir, word);
+    if(contextFile.exists()){
+      Scanner scanner = new Scanner(contextFile);
+      while(scanner.hasNextLine()){
+        String line = scanner.nextLine().trim();
+        if(line.length() == 0) continue;
+        String[] parts = line.split(" : ");
+        if(parts.length < 2) continue;
+        if(Integer.parseInt(parts[1]) >= THRESHOLD){
+          wordCounts.put(parts[0], Integer.parseInt(parts[1]));
+        }
+      }
+      scanner.close();
+    }
+    return wordCounts;
+  }
+}

Modified: ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/edit/SimpleEditDistanceModel.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/edit/SimpleEditDistanceModel.java?rev=1484764&r1=1484763&r2=1484764&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/edit/SimpleEditDistanceModel.java (original)
+++ ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/edit/SimpleEditDistanceModel.java Tue May 21 11:34:23 2013
@@ -47,7 +47,7 @@ public class SimpleEditDistanceModel imp
 		return words;
 	}
 	
-	public HashSet<String> getEditDistanceWords(String word, int d) {
+	public static HashSet<String> getEditDistanceWords(String word, int d) {
 		HashSet<String> ed1words = new HashSet<String>();
 		StringBuilder buff = new StringBuilder(word);
 		

Modified: ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/test/TestTrie.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/test/TestTrie.java?rev=1484764&r1=1484763&r2=1484764&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/test/TestTrie.java (original)
+++ ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/test/TestTrie.java Tue May 21 11:34:23 2013
@@ -24,7 +24,7 @@ import java.util.ArrayList;
 import java.util.List;
 import java.util.Scanner;
 
-import org.apache.lucene.search.suggest.jaspell.JaspellTernarySearchTrie;
+import org.apache.ctakes.spelling.util.JaspellTernarySearchTrie;
 
 public class TestTrie {
 
@@ -59,7 +59,7 @@ public class TestTrie {
 				maxDiff = 3;
 			}
 			List<String> neighbors = new ArrayList<String>();
-			for(int diff = 0; diff <= maxDiff; diff++){
+			for(int diff = 1; diff <= maxDiff; diff++){
 				trie.setMatchAlmostDiff(diff);
 				neighbors.addAll(trie.matchAlmost(word));				
 			}
@@ -68,5 +68,6 @@ public class TestTrie {
 				System.out.println("\t" + neighbor + "\t" + trie.get(neighbor));
 			}
 		}
+		scanner.close();
 	}
 }