You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by tm...@apache.org on 2013/05/21 13:34:23 UTC
svn commit: r1484764 - in /ctakes/sandbox/ctakes-spelling-corrector: ./
.settings/ lib/ src/org/apache/ctakes/spelling/mistakes/
src/org/apache/ctakes/spelling/mistakes/edit/
src/org/apache/ctakes/spelling/test/
Author: tmill
Date: Tue May 21 11:34:23 2013
New Revision: 1484764
URL: http://svn.apache.org/r1484764
Log:
Addresses ctakes-58: Adds code to build pseudo-supervised training examples from dictionary and context data.
Added:
ctakes/sandbox/ctakes-spelling-corrector/README
ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/GenerateTrainingExamples.java
Removed:
ctakes/sandbox/ctakes-spelling-corrector/lib/uimafit-1.2.0.jar
Modified:
ctakes/sandbox/ctakes-spelling-corrector/ (props changed)
ctakes/sandbox/ctakes-spelling-corrector/.settings/org.eclipse.jdt.core.prefs
ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/GenerateContextTriples.java
ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/GenerateTermNeighborhoods.java
ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/edit/SimpleEditDistanceModel.java
ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/test/TestTrie.java
Propchange: ctakes/sandbox/ctakes-spelling-corrector/
------------------------------------------------------------------------------
--- svn:ignore (added)
+++ svn:ignore Tue May 21 11:34:23 2013
@@ -0,0 +1 @@
+outputs
Modified: ctakes/sandbox/ctakes-spelling-corrector/.settings/org.eclipse.jdt.core.prefs
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-spelling-corrector/.settings/org.eclipse.jdt.core.prefs?rev=1484764&r1=1484763&r2=1484764&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-spelling-corrector/.settings/org.eclipse.jdt.core.prefs (original)
+++ ctakes/sandbox/ctakes-spelling-corrector/.settings/org.eclipse.jdt.core.prefs Tue May 21 11:34:23 2013
@@ -8,4 +8,5 @@ org.eclipse.jdt.core.compiler.debug.loca
org.eclipse.jdt.core.compiler.debug.sourceFile=generate
org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
+org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
org.eclipse.jdt.core.compiler.source=1.6
Added: ctakes/sandbox/ctakes-spelling-corrector/README
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-spelling-corrector/README?rev=1484764&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-spelling-corrector/README (added)
+++ ctakes/sandbox/ctakes-spelling-corrector/README Tue May 21 11:34:23 2013
@@ -0,0 +1,18 @@
+Standard workflow for unsupervised training of spellchecker:
+
+1) Generate a file of word unigram counts using UnigramPriorGenerator
+2) Sort the list by descending count (sort -k3 -nr) into a new sorted file.
+3) Prune list using heuristic models
+ For example:
+ a) Plot data in gnuplot (log y for visibility)
+ b) Choose cutoff point where curve flattens
+ c) Inspect bottom of list for typo/rare word ratio.
+ d) If there are no 'real' words left, go back to b) and select a new point accordingly
+ e) Shorten sorted list to desired length to create dictionary.txt (retaining counts information)
+ f) Replace all " : " with "\t" in dictionary file for trie data structure.
+4) Create frequency-filtered term neighborhoods (edit distance-based errors) using GenerateTermNeighborhoods:
+ Inputs: Dictionary file
+ Output: term neighborhood file
+5) Create context triples using GenerateContextTriples:
+ Inputs: Neighborhood file (see step 4), raw text directory, output directory name
+ Output: Context triple file for each term in dictionary
Modified: ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/GenerateContextTriples.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/GenerateContextTriples.java?rev=1484764&r1=1484763&r2=1484764&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/GenerateContextTriples.java (original)
+++ ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/GenerateContextTriples.java Tue May 21 11:34:23 2013
@@ -45,8 +45,9 @@ public class GenerateContextTriples {
/**
* @param args
+ * @throws IOException
*/
- public static void main(String[] args) {
+ public static void main(String[] args) throws IOException {
if(args.length < 3){
System.err.println("Required arguments: <neighborhood file> <input files> <output dir>");
System.exit(-1);
@@ -55,6 +56,9 @@ public class GenerateContextTriples {
HashMap<String,String[]> dict = new HashMap<String,String[]>();
// HashMap<String,ClusterNode> dict = new HashMap<String,ClusterNode>();
HashMap<String,CounterMap<String>> contexts = new HashMap<String,CounterMap<String>>();
+// File hashStorage = Files.createTempDir();
+// DiskBackedMap<String, CounterDiskMap<String>> contexts =
+// new DiskBackedMap<String, CounterDiskMap<String>>(hashStorage.getAbsolutePath());
//
Scanner scanner = null;
try {
@@ -117,25 +121,35 @@ public class GenerateContextTriples {
String prev = null;
String cur = null;
String next = null;
+ JCas jcas = null;
+ // this creates a huge hashmap -- maybe better off writing to disk if we have the space?
+ // maybe a disk-based map so we can have counts rather than raw lines?
while(casIter.hasNext()){
- JCas jcas = casIter.next();
+ try{
+ jcas = casIter.next();
+ }catch(Exception e){
+ System.err.println("Skipping example.");
+ continue;
+ }
Collection<Sentence> sents = JCasUtil.select(jcas, Sentence.class);
for(Sentence sent : sents){
List<BaseToken> tokens = JCasUtil.selectCovered(jcas, BaseToken.class, sent);
for(int i = 0; i < tokens.size(); i++){
- cur = tokens.get(i).getCoveredText();
+ cur = tokens.get(i).getCoveredText().toLowerCase();
if(!dict.containsKey(cur)) continue;
+ if(dict.size() == 1) continue;
if(i == 0){
prev = "<Start>";
}else{
- prev = tokens.get(i-1).getCoveredText();
+ prev = tokens.get(i-1).getCoveredText().toLowerCase();
}
if(i == tokens.size()-1){
next = "<End>";
}else{
- next = tokens.get(i+1).getCoveredText();
+ next = tokens.get(i+1).getCoveredText().toLowerCase();
}
if(!contexts.containsKey(cur)){
+// contexts.put(cur, new CounterDiskMap<String>());
contexts.put(cur, new CounterMap<String>());
}
contexts.get(cur).add(prev + "-" + next);
@@ -143,11 +157,14 @@ public class GenerateContextTriples {
}
}
+ File outputDir = new File(args[2]);
+ if(!outputDir.exists()) outputDir.mkdir();
+
// for each word write the contexts and their values
for(String word : contexts.keySet()){
PrintWriter out;
try {
- out = new PrintWriter(new File(args[2], word));
+ out = new PrintWriter(new File(outputDir, word));
for(String context : contexts.get(word).keySet()){
out.print(context);
out.print(" : ");
@@ -160,6 +177,7 @@ public class GenerateContextTriples {
System.exit(-1);
}
}
+// contexts.close();
System.out.printf("Completed in %f seconds\n", (System.currentTimeMillis()-start) / 1000.0);
}
@@ -172,5 +190,4 @@ public class GenerateContextTriples {
return word.equals(((ClusterNode)n).word);
}
}
-
}
Modified: ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/GenerateTermNeighborhoods.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/GenerateTermNeighborhoods.java?rev=1484764&r1=1484763&r2=1484764&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/GenerateTermNeighborhoods.java (original)
+++ ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/GenerateTermNeighborhoods.java Tue May 21 11:34:23 2013
@@ -22,14 +22,19 @@ import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.PrintWriter;
+import java.util.ArrayList;
+import java.util.HashMap;
import java.util.HashSet;
+import java.util.List;
import java.util.Scanner;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
import org.apache.commons.lang.StringUtils;
-import org.apache.lucene.search.suggest.jaspell.JaspellTernarySearchTrie;
+import org.apache.ctakes.spelling.util.JaspellTernarySearchTrie;
public class GenerateTermNeighborhoods {
-
+ public static final Pattern garbage = Pattern.compile("^\\d+\\p{Alpha}+");
/**
* @param args
*/
@@ -41,6 +46,8 @@ public class GenerateTermNeighborhoods {
long start = System.currentTimeMillis();
JaspellTernarySearchTrie trie=null;
+ HashMap<String,Integer> wordCounts = new HashMap<String,Integer>();
+
try {
trie = new JaspellTernarySearchTrie(new File(args[0]));
} catch (IOException e) {
@@ -64,6 +71,15 @@ public class GenerateTermNeighborhoods {
while(scanner.hasNextLine()){
String word = scanner.next();
+
+ // make sure it isn't one of those garbage words like 2glasses which is a mistake but
+ // is not a simple misspelling.
+ Matcher m = garbage.matcher(word);
+ if(m.matches()){
+ scanner.nextLine();
+ continue;
+ }
+
int maxDiff;
if(word.length() <= 4){
maxDiff = 1;
@@ -72,17 +88,22 @@ public class GenerateTermNeighborhoods {
}else{
maxDiff = 3;
}
+// float count = (float) scanner.nextInt();
+
float count = (Float) trie.get(word);
HashSet<String> neighbors = new HashSet<String>();
- for(int diff = 0; diff <= maxDiff; diff++){
- trie.setMatchAlmostDiff(diff);
- neighbors.addAll(trie.matchAlmost(word));
+
+ for(int diff = 1; diff <= maxDiff; diff++){
+ trie.setMatchAlmostDiff(diff);
+ neighbors.addAll(trie.matchAlmost(word));
}
+
+// HashSet<String> neighbors = getEditDistanceWords(word, maxDiff, trie);
HashSet<String> toRemove = new HashSet<String>();
for(String neighbor : neighbors){
- float nCount = (Float) trie.get(neighbor);
- if(count / nCount < 10){
+ Float nCount = (Float) trie.get(neighbor);
+ if(nCount == null || nCount / count < 10){
toRemove.add(neighbor);
}
}
@@ -100,5 +121,75 @@ public class GenerateTermNeighborhoods {
}
System.out.printf("Completed after %f seconds\n", (System.currentTimeMillis()-start) / 1000.0);
}
+
+// private static void addInsDelWords(int errs, String word, List<String> insDelWords, int pos) {
+// for(int i = pos; i < word.length(); i++){
+// if(i == 0){
+//
+// }
+// }
+// }
+ private static HashSet<String> getEditDistanceWords(String word, int d, JaspellTernarySearchTrie trie) {
+ HashSet<String> ed1words = new HashSet<String>();
+ StringBuilder buff = new StringBuilder(word);
+
+ char curChar;
+ for(int i = 0; i < word.length(); i++){
+ curChar = word.charAt(i);
+
+ // first delete
+ buff.deleteCharAt(i);
+ if(i == 0 || trie.matchPrefix(buff.substring(0,i)).size() > 0){
+ ed1words.add(buff.toString().trim());
+ }
+ // restore
+ buff.insert(i, curChar);
+
+ for(char c = 'a'; c < 'z'; c++){
+ // insertion
+ buff.insert(i, c);
+ ed1words.add(buff.toString().trim());
+ // restore
+ buff.deleteCharAt(i);
+
+ // now substitution
+ if(c != curChar){
+ buff.setCharAt(i, c);
+ }
+ ed1words.add(buff.toString().trim());
+ // restore
+ buff.setCharAt(i, curChar);
+ }
+
+ if(i+1 < word.length()){
+ // swap
+ buff.setCharAt(i, word.charAt(i+1));
+ buff.setCharAt(i+1, curChar);
+ ed1words.add(buff.toString().trim());
+ // restore
+ buff.setCharAt(i+1, buff.charAt(i));
+ buff.setCharAt(i, curChar);
+ }
+ }
+
+ // now try adding at the end of word
+ buff.append(' ');
+ int endInd = buff.length()-1;
+ for(char c = 'a'; c < 'z'; c++){
+ buff.setCharAt(endInd, c);
+ ed1words.add(buff.toString());
+ }
+
+ if(d == 1){
+ return ed1words;
+ }else{
+ HashSet<String> allWords = new HashSet<String>();
+ allWords.addAll(ed1words);
+ for(String w : ed1words){
+ allWords.addAll(getEditDistanceWords(w, d-1, trie));
+ }
+ return allWords;
+ }
+ }
}
Added: ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/GenerateTrainingExamples.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/GenerateTrainingExamples.java?rev=1484764&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/GenerateTrainingExamples.java (added)
+++ ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/GenerateTrainingExamples.java Tue May 21 11:34:23 2013
@@ -0,0 +1,81 @@
+package org.apache.ctakes.spelling.mistakes;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.PrintWriter;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Scanner;
+
+public class GenerateTrainingExamples {
+
+ public static int THRESHOLD = 2;
+
+ /**
+ * @param args
+ * @throws FileNotFoundException
+ */
+ public static void main(String[] args) throws FileNotFoundException {
+ if(args.length < 3){
+ System.err.println("Required arguments: <term neighborhood file> <contexts directory> <output file>");
+ System.exit(-1);
+ }
+
+ Scanner scanner = new Scanner(new File(args[0]));
+ File conDir = new File(args[1]);
+ if(!conDir.exists() || !conDir.isDirectory()){
+ System.err.println("Argument 2 must be a directory of contexts!");
+ System.exit(-1);
+ }
+ PrintWriter out = new PrintWriter(args[2]);
+
+ while(scanner.hasNextLine()){
+ String line = scanner.nextLine().trim();
+ String[] pair = line.split(" : ");
+ if(pair.length == 1) continue;
+ String word = pair[0];
+ String[] neighbors = pair[1].split(":");
+ if(neighbors.length == 0) continue;
+
+ Map<String,Integer> wordContexts = getWordContexts(conDir, word);
+
+ for(String neighbor : neighbors){
+ Map<String,Integer> neighborContexts = getWordContexts(conDir, neighbor);
+ int numErrors = 0;
+ for(String context : wordContexts.keySet()){
+ if(!neighborContexts.containsKey(context)) continue;
+ if(neighborContexts.get(context) > wordContexts.get(context)){
+ System.err.println("Found context with likely mistake: " + context);
+ numErrors++;
+ }
+ }
+ if(numErrors == 0) continue;
+ out.printf("%s\t%s : %d\n", word, neighbor, numErrors);
+ out.flush();
+ break;
+ }
+ }
+ out.close();
+ scanner.close();
+ }
+
+ private static Map<String,Integer> getWordContexts(File contextDir, String word) throws FileNotFoundException{
+ HashMap<String,Integer> wordCounts = new HashMap<String,Integer>();
+
+ File contextFile = new File(contextDir, word);
+ if(contextFile.exists()){
+ Scanner scanner = new Scanner(contextFile);
+ while(scanner.hasNextLine()){
+ String line = scanner.nextLine().trim();
+ if(line.length() == 0) continue;
+ String[] parts = line.split(" : ");
+ if(parts.length < 2) continue;
+ if(Integer.parseInt(parts[1]) >= THRESHOLD){
+ wordCounts.put(parts[0], Integer.parseInt(parts[1]));
+ }
+ }
+ scanner.close();
+ }
+ return wordCounts;
+ }
+}
Modified: ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/edit/SimpleEditDistanceModel.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/edit/SimpleEditDistanceModel.java?rev=1484764&r1=1484763&r2=1484764&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/edit/SimpleEditDistanceModel.java (original)
+++ ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/edit/SimpleEditDistanceModel.java Tue May 21 11:34:23 2013
@@ -47,7 +47,7 @@ public class SimpleEditDistanceModel imp
return words;
}
- public HashSet<String> getEditDistanceWords(String word, int d) {
+ public static HashSet<String> getEditDistanceWords(String word, int d) {
HashSet<String> ed1words = new HashSet<String>();
StringBuilder buff = new StringBuilder(word);
Modified: ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/test/TestTrie.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/test/TestTrie.java?rev=1484764&r1=1484763&r2=1484764&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/test/TestTrie.java (original)
+++ ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/test/TestTrie.java Tue May 21 11:34:23 2013
@@ -24,7 +24,7 @@ import java.util.ArrayList;
import java.util.List;
import java.util.Scanner;
-import org.apache.lucene.search.suggest.jaspell.JaspellTernarySearchTrie;
+import org.apache.ctakes.spelling.util.JaspellTernarySearchTrie;
public class TestTrie {
@@ -59,7 +59,7 @@ public class TestTrie {
maxDiff = 3;
}
List<String> neighbors = new ArrayList<String>();
- for(int diff = 0; diff <= maxDiff; diff++){
+ for(int diff = 1; diff <= maxDiff; diff++){
trie.setMatchAlmostDiff(diff);
neighbors.addAll(trie.matchAlmost(word));
}
@@ -68,5 +68,6 @@ public class TestTrie {
System.out.println("\t" + neighbor + "\t" + trie.get(neighbor));
}
}
+ scanner.close();
}
}