You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by tm...@apache.org on 2013/04/01 22:17:45 UTC
svn commit: r1463284 - in /ctakes/sandbox/ctakes-spelling-corrector: ./
src/org/apache/ctakes/spelling/ src/org/apache/ctakes/spelling/mistakes/
src/org/apache/ctakes/spelling/mistakes/edit/
src/org/apache/ctakes/spelling/priors/ src/org/apache/ctakes/...
Author: tmill
Date: Mon Apr 1 20:17:44 2013
New Revision: 1463284
URL: http://svn.apache.org/r1463284
Log:
ctakes-58: Adds code for building term neighborhoods and context triples. Uses JaspellTernarySearchTrie in lucene, so new dependency in pom.
Added:
ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/GenerateContextTriples.java
ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/GenerateTermNeighborhoods.java
ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/test/TestTrie.java
Modified:
ctakes/sandbox/ctakes-spelling-corrector/pom.xml
ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/SpellingCorrector.java
ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/SpellingDistanceModel.java
ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/edit/SimpleEditDistanceModel.java
ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/priors/WordPriorModel.java
ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/priors/unigram/SimpleUnigramPrior.java
ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/priors/unigram/UnigramPriorGenerator.java
ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/test/TestOnCorpus.java
ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/test/TestSpellingCorrector.java
Modified: ctakes/sandbox/ctakes-spelling-corrector/pom.xml
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-spelling-corrector/pom.xml?rev=1463284&r1=1463283&r2=1463284&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-spelling-corrector/pom.xml (original)
+++ ctakes/sandbox/ctakes-spelling-corrector/pom.xml Mon Apr 1 20:17:44 2013
@@ -38,5 +38,10 @@
<groupId>org.apache.ctakes</groupId>
<artifactId>ctakes-clinical-pipeline</artifactId>
</dependency>
+ <dependency>
+ <groupId>org.apache.lucene</groupId>
+ <artifactId>lucene-spellchecker</artifactId>
+ <version>3.6.2</version>
+ </dependency>
</dependencies>
</project>
\ No newline at end of file
Modified: ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/SpellingCorrector.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/SpellingCorrector.java?rev=1463284&r1=1463283&r2=1463284&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/SpellingCorrector.java (original)
+++ ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/SpellingCorrector.java Mon Apr 1 20:17:44 2013
@@ -1,12 +1,24 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
package org.apache.ctakes.spelling;
-import java.io.File;
-import java.io.FileNotFoundException;
-import java.util.HashMap;
import java.util.List;
-import java.util.Scanner;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
import org.apache.ctakes.spelling.mistakes.SpellingDistanceModel;
import org.apache.ctakes.spelling.priors.WordPriorModel;
Added: ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/GenerateContextTriples.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/GenerateContextTriples.java?rev=1463284&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/GenerateContextTriples.java (added)
+++ ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/GenerateContextTriples.java Mon Apr 1 20:17:44 2013
@@ -0,0 +1,175 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.spelling.mistakes;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Scanner;
+
+import org.apache.ctakes.core.cr.FilesInDirectoryCollectionReader;
+import org.apache.ctakes.typesystem.type.syntax.BaseToken;
+import org.apache.ctakes.typesystem.type.textspan.Sentence;
+import org.apache.ctakes.utils.struct.CounterMap;
+import org.apache.uima.UIMAException;
+import org.apache.uima.analysis_engine.AnalysisEngine;
+import org.apache.uima.collection.CollectionReader;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.uimafit.factory.AnalysisEngineFactory;
+import org.uimafit.factory.CollectionReaderFactory;
+import org.uimafit.pipeline.JCasIterable;
+import org.uimafit.util.JCasUtil;
+
+public class GenerateContextTriples {
+
+    /**
+     * Counts, for every dictionary term found in a corpus, the (previous token, next token)
+     * contexts it occurs in, and writes one file per term into the output directory.
+     * <p>
+     * Each output line has the form {@code prev-next : count}. A term at the start of a
+     * sentence gets {@code <Start>} as its previous token; a term at the end of a sentence
+     * gets {@code <End>} as its next token.
+     * <p>
+     * The process exits with status -1 if the arguments are missing, the neighborhood file
+     * cannot be opened, the UIMA pipeline cannot be created, or an output file cannot be created.
+     *
+     * @param args {@code args[0]} neighborhood file, {@code args[1]} directory of input
+     *             {@code .txt} files (read recursively), {@code args[2]} output directory
+     */
+    public static void main(String[] args) {
+        if(args.length < 3){
+            System.err.println("Required arguments: <neighborhood file> <input files> <output dir>");
+            System.exit(-1);
+        }
+
+        HashMap<String,String[]> dict = readNeighborhoods(args[0]);
+
+        CollectionReader reader = null;
+        AnalysisEngine ae = null;
+        try {
+            ae = AnalysisEngineFactory.createAnalysisEngineFromPath("../ctakes-core/desc/analysis_engine/AggregateAE.xml");
+            reader = CollectionReaderFactory.createCollectionReader(FilesInDirectoryCollectionReader.class
+                    ,FilesInDirectoryCollectionReader.PARAM_INPUTDIR
+                    ,args[1]
+                    ,FilesInDirectoryCollectionReader.PARAM_RECURSE
+                    ,true
+                    ,FilesInDirectoryCollectionReader.PARAM_EXTENSIONS
+                    ,new String[]{"txt"}
+                    );
+        } catch (ResourceInitializationException e) {
+            e.printStackTrace();
+            System.exit(-1);
+        } catch (UIMAException e) {
+            e.printStackTrace();
+            System.exit(-1);
+        } catch (IOException e) {
+            e.printStackTrace();
+            System.exit(-1);
+        }
+
+        JCasIterable casIter = null;
+        try {
+            casIter = new JCasIterable(reader, ae);
+        } catch (UIMAException e) {
+            e.printStackTrace();
+            System.exit(-1);
+        } catch (IOException e) {
+            e.printStackTrace();
+            System.exit(-1);
+        }
+
+        HashMap<String,CounterMap<String>> contexts = countContexts(casIter, dict);
+        writeContexts(contexts, args[2]);
+    }
+
+    /**
+     * Loads the neighborhood file into a map from term to its neighbor list.
+     * <p>
+     * Each non-blank line is trimmed and split on single spaces. When that yields exactly three
+     * fields, the third field is split on {@code ':'} to form the neighbor list; otherwise the
+     * term (first field) is registered with no neighbors. Blank lines are skipped so that they
+     * do not register an empty-string term.
+     */
+    private static HashMap<String,String[]> readNeighborhoods(String path){
+        HashMap<String,String[]> dict = new HashMap<String,String[]>();
+        Scanner scanner = null;
+        try {
+            scanner = new Scanner(new File(path));
+            while(scanner.hasNextLine()){
+                String line = scanner.nextLine().trim();
+                if(line.length() == 0){
+                    continue;
+                }
+                String[] parts = line.split(" ");
+                if(parts.length == 3){
+                    dict.put(parts[0], parts[2].split(":"));
+                }else{
+                    dict.put(parts[0], new String[]{});
+                }
+            }
+        } catch (FileNotFoundException e) {
+            e.printStackTrace();
+            System.err.println("Error reading dictinoary file");
+            System.exit(-1);
+        } finally {
+            // Release the file handle once the dictionary is in memory.
+            if(scanner != null){
+                scanner.close();
+            }
+        }
+        return dict;
+    }
+
+    /**
+     * Walks every sentence of every CAS and, for each token whose covered text is a key of
+     * {@code dict}, increments the counter of its {@code prev-next} context.
+     */
+    private static HashMap<String,CounterMap<String>> countContexts(JCasIterable casIter, HashMap<String,String[]> dict){
+        HashMap<String,CounterMap<String>> contexts = new HashMap<String,CounterMap<String>>();
+        while(casIter.hasNext()){
+            JCas jcas = casIter.next();
+            Collection<Sentence> sents = JCasUtil.select(jcas, Sentence.class);
+            for(Sentence sent : sents){
+                List<BaseToken> tokens = JCasUtil.selectCovered(jcas, BaseToken.class, sent);
+                int last = tokens.size() - 1;
+                for(int i = 0; i <= last; i++){
+                    String cur = tokens.get(i).getCoveredText();
+                    if(!dict.containsKey(cur)){
+                        continue;
+                    }
+                    // Sentence boundaries are represented by marker tokens.
+                    String prev = (i == 0) ? "<Start>" : tokens.get(i-1).getCoveredText();
+                    String next = (i == last) ? "<End>" : tokens.get(i+1).getCoveredText();
+
+                    CounterMap<String> counts = contexts.get(cur);
+                    if(counts == null){
+                        counts = new CounterMap<String>();
+                        contexts.put(cur, counts);
+                    }
+                    counts.add(prev + "-" + next);
+                }
+            }
+        }
+        return contexts;
+    }
+
+    /**
+     * Writes one file per term, named after the term, into {@code outputDir}; each line is
+     * {@code context : count}. Exits with status -1 if a file cannot be created.
+     */
+    private static void writeContexts(HashMap<String,CounterMap<String>> contexts, String outputDir){
+        for(String word : contexts.keySet()){
+            CounterMap<String> counts = contexts.get(word);
+            PrintWriter out = null;
+            try {
+                out = new PrintWriter(new File(outputDir, word));
+                for(String context : counts.keySet()){
+                    out.print(context);
+                    out.print(" : ");
+                    out.println(counts.get(context));
+                }
+                // PrintWriter never throws on write failures; ask it explicitly.
+                if(out.checkError()){
+                    System.err.println("Error writing context file for word: " + word);
+                }
+            } catch (FileNotFoundException e) {
+                e.printStackTrace();
+                System.err.println("Error creating context file!");
+                System.exit(-1);
+            } finally {
+                if(out != null){
+                    out.close();
+                }
+            }
+        }
+    }
+
+    /**
+     * Node identified solely by its word. Not referenced by the code in this class.
+     */
+    class ClusterNode{
+        public String word;
+
+        /**
+         * Two nodes are equal when their words are equal (both null counts as equal).
+         * Returns false, rather than throwing, for null or non-ClusterNode arguments.
+         */
+        @Override
+        public boolean equals(Object n){
+            if(this == n){
+                return true;
+            }
+            if(!(n instanceof ClusterNode)){
+                return false;
+            }
+            String other = ((ClusterNode) n).word;
+            return word == null ? other == null : word.equals(other);
+        }
+
+        /** Hash consistent with {@link #equals(Object)}: derived from the word only. */
+        @Override
+        public int hashCode(){
+            return word == null ? 0 : word.hashCode();
+        }
+    }
+
+}
Added: ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/GenerateTermNeighborhoods.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/GenerateTermNeighborhoods.java?rev=1463284&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/GenerateTermNeighborhoods.java (added)
+++ ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/GenerateTermNeighborhoods.java Mon Apr 1 20:17:44 2013
@@ -0,0 +1,102 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.spelling.mistakes;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.util.HashSet;
+import java.util.Scanner;
+
+import org.apache.commons.lang.StringUtils;
+import org.apache.lucene.search.suggest.jaspell.JaspellTernarySearchTrie;
+
+public class GenerateTermNeighborhoods {
+
+    /**
+     * Builds, for every term of a dictionary file, the set of near-matching terms that are
+     * at least ten times rarer than the term itself, and writes one line per term in the
+     * form {@code word : n1:n2:...} to the output file.
+     * <p>
+     * The dictionary file is loaded into a {@link JaspellTernarySearchTrie} and then read a
+     * second time line by line; the first whitespace-separated field of each non-blank line
+     * is the term. The number of differences allowed when matching grows with the term
+     * length: 1 for up to 4 characters, 2 for up to 12, otherwise 3.
+     * <p>
+     * The process exits with status -1 if the arguments are missing or a file cannot be
+     * opened. Terms for which the trie returns no numeric value are reported on stderr and
+     * skipped.
+     *
+     * @param args {@code args[0]} dictionary file, {@code args[1]} output file
+     */
+    public static void main(String[] args) {
+        if(args.length < 2){
+            System.err.println("Args: <dictionary file> <output file>");
+            System.exit(-1);
+        }
+
+        JaspellTernarySearchTrie trie=null;
+        try {
+            trie = new JaspellTernarySearchTrie(new File(args[0]));
+        } catch (IOException e) {
+            e.printStackTrace();
+            System.err.println("Could not read dictionary file!");
+            System.exit(-1);
+        }
+        System.err.println("Trie loaded...");
+
+        // look through terms file
+        Scanner scanner = null;
+        PrintWriter out = null;
+        try {
+            scanner = new Scanner(new File(args[0]));
+            out = new PrintWriter(args[1]);
+
+            while(scanner.hasNextLine()){
+                // Read whole lines so that a blank line or a missing final newline
+                // cannot make the scanner throw NoSuchElementException.
+                String line = scanner.nextLine().trim();
+                if(line.length() == 0){
+                    continue;
+                }
+                String word = line.split("\\s+")[0];
+
+                // NOTE(review): the value type stored by the trie is not visible here, so it
+                // is only assumed to be some Number - confirm against the Lucene version in use.
+                Object wordValue = trie.get(word);
+                if(!(wordValue instanceof Number)){
+                    System.err.println("No count found for term, skipping: " + word);
+                    continue;
+                }
+                double count = ((Number) wordValue).doubleValue();
+
+                int maxDiff;
+                if(word.length() <= 4){
+                    maxDiff = 1;
+                }else if(word.length() <= 12){
+                    maxDiff = 2;
+                }else{
+                    maxDiff = 3;
+                }
+
+                HashSet<String> neighbors = new HashSet<String>();
+                for(int diff = 0; diff <= maxDiff; diff++){
+                    trie.setMatchAlmostDiff(diff);
+                    neighbors.addAll(trie.matchAlmost(word));
+                }
+
+                HashSet<String> toRemove = new HashSet<String>();
+                for(String neighbor : neighbors){
+                    if(!isMuchRarer(count, trie.get(neighbor))){
+                        toRemove.add(neighbor);
+                    }
+                }
+                neighbors.removeAll(toRemove);
+
+                out.print(word);
+                out.print(" : ");
+                String joinedNeighbors = StringUtils.join(neighbors, ":");
+                out.println(joinedNeighbors);
+                out.flush();
+            }
+        } catch (FileNotFoundException e) {
+            e.printStackTrace();
+            System.err.println("Could not open input/output file!");
+            System.exit(-1);
+        } finally {
+            if(out != null){
+                out.close();
+            }
+            if(scanner != null){
+                scanner.close();
+            }
+        }
+    }
+
+    /**
+     * Decides whether a neighbor is kept: its count must be a positive number and the term's
+     * count must be at least ten times as large. Written as a multiplication so that a zero
+     * neighbor count cannot cause a division by zero; a neighbor without a usable count is
+     * not kept because the ratio cannot be evaluated.
+     */
+    private static boolean isMuchRarer(double wordCount, Object neighborValue){
+        if(!(neighborValue instanceof Number)){
+            return false;
+        }
+        double neighborCount = ((Number) neighborValue).doubleValue();
+        return neighborCount > 0 && wordCount >= 10 * neighborCount;
+    }
+
+}
Modified: ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/SpellingDistanceModel.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/SpellingDistanceModel.java?rev=1463284&r1=1463283&r2=1463284&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/SpellingDistanceModel.java (original)
+++ ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/SpellingDistanceModel.java Mon Apr 1 20:17:44 2013
@@ -1,3 +1,21 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
package org.apache.ctakes.spelling.mistakes;
import java.util.List;
Modified: ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/edit/SimpleEditDistanceModel.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/edit/SimpleEditDistanceModel.java?rev=1463284&r1=1463283&r2=1463284&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/edit/SimpleEditDistanceModel.java (original)
+++ ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/edit/SimpleEditDistanceModel.java Mon Apr 1 20:17:44 2013
@@ -1,3 +1,21 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
package org.apache.ctakes.spelling.mistakes.edit;
import java.util.ArrayList;
Modified: ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/priors/WordPriorModel.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/priors/WordPriorModel.java?rev=1463284&r1=1463283&r2=1463284&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/priors/WordPriorModel.java (original)
+++ ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/priors/WordPriorModel.java Mon Apr 1 20:17:44 2013
@@ -1,3 +1,21 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
package org.apache.ctakes.spelling.priors;
public interface WordPriorModel {
Modified: ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/priors/unigram/SimpleUnigramPrior.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/priors/unigram/SimpleUnigramPrior.java?rev=1463284&r1=1463283&r2=1463284&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/priors/unigram/SimpleUnigramPrior.java (original)
+++ ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/priors/unigram/SimpleUnigramPrior.java Mon Apr 1 20:17:44 2013
@@ -1,3 +1,21 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
package org.apache.ctakes.spelling.priors.unigram;
import java.io.File;
Modified: ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/priors/unigram/UnigramPriorGenerator.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/priors/unigram/UnigramPriorGenerator.java?rev=1463284&r1=1463283&r2=1463284&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/priors/unigram/UnigramPriorGenerator.java (original)
+++ ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/priors/unigram/UnigramPriorGenerator.java Mon Apr 1 20:17:44 2013
@@ -1,12 +1,27 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
package org.apache.ctakes.spelling.priors.unigram;
-import java.io.File;
-import java.io.FilenameFilter;
import java.io.IOException;
import java.io.PrintStream;
import java.util.HashMap;
import java.util.Map;
-import java.util.Scanner;
import org.apache.ctakes.core.cr.FilesInDirectoryCollectionReader;
import org.apache.ctakes.typesystem.type.syntax.BaseToken;
@@ -16,11 +31,10 @@ import org.apache.uima.analysis_engine.A
import org.apache.uima.cas.FSIterator;
import org.apache.uima.collection.CollectionReader;
import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
import org.uimafit.factory.AnalysisEngineFactory;
import org.uimafit.factory.CollectionReaderFactory;
-import org.uimafit.factory.JCasFactory;
import org.uimafit.pipeline.JCasIterable;
-import org.uimafit.pipeline.SimplePipeline;
public class UnigramPriorGenerator {
static HashMap<String,Integer> wordCounts = new HashMap<String,Integer>();
@@ -55,7 +69,7 @@ public class UnigramPriorGenerator {
JCasIterable casIter = new JCasIterable(reader, ae);
while(casIter.hasNext()){
JCas jcas = casIter.next();
- FSIterator iter = jcas.getAnnotationIndex(BaseToken.type).iterator();
+ FSIterator<Annotation> iter = jcas.getAnnotationIndex(BaseToken.type).iterator();
while(iter.hasNext()){
BaseToken tok = (BaseToken) iter.next();
if(tok instanceof WordToken){
@@ -68,21 +82,21 @@ public class UnigramPriorGenerator {
writePriorModel(new PrintStream(args[0]));
}
- private static boolean startsWith(String word, String[] prefixes){
+ public static boolean startsWith(String word, String[] prefixes){
for(String prefix : prefixes){
if(word.startsWith(prefix)) return true;
}
return false;
}
- private static boolean endsWith(String word, String[] prefixes){
+ public static boolean endsWith(String word, String[] prefixes){
for(String prefix : prefixes){
if(word.endsWith(prefix)) return true;
}
return false;
}
- private static String removeStarting(String word, String[] prefixes){
+ public static String removeStarting(String word, String[] prefixes){
for(String prefix : prefixes){
// if(word.startsWith(prefix)){d
word = word.replaceFirst("^"+prefix, "");
@@ -91,7 +105,7 @@ public class UnigramPriorGenerator {
return word;
}
- private static String removeEnding(String word, String[] suffixes){
+ public static String removeEnding(String word, String[] suffixes){
for(String suffix : suffixes){
// if(word.endsWith(suffix)){
word = word.replaceFirst(suffix+"$", "");
@@ -100,7 +114,7 @@ public class UnigramPriorGenerator {
return word;
}
- private static void handleWord(String w){
+ public static void handleWord(String w){
String word = w.toLowerCase();
// if(word.matches("^.*\\p{Digit}.*$")) return; // ignore words with numbers
if(word.length() == 0) return;
@@ -111,7 +125,7 @@ public class UnigramPriorGenerator {
numWords++;
}
- private static void writePriorModel(PrintStream out){
+ public static void writePriorModel(PrintStream out){
for(Map.Entry<String,Integer> entry : wordCounts.entrySet()){
out.print(entry.getKey());
out.print(" : ");
Modified: ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/test/TestOnCorpus.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/test/TestOnCorpus.java?rev=1463284&r1=1463283&r2=1463284&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/test/TestOnCorpus.java (original)
+++ ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/test/TestOnCorpus.java Mon Apr 1 20:17:44 2013
@@ -1,3 +1,21 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
package org.apache.ctakes.spelling.test;
import java.io.File;
Modified: ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/test/TestSpellingCorrector.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/test/TestSpellingCorrector.java?rev=1463284&r1=1463283&r2=1463284&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/test/TestSpellingCorrector.java (original)
+++ ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/test/TestSpellingCorrector.java Mon Apr 1 20:17:44 2013
@@ -1,3 +1,21 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
package org.apache.ctakes.spelling.test;
import java.io.FileNotFoundException;
Added: ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/test/TestTrie.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/test/TestTrie.java?rev=1463284&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/test/TestTrie.java (added)
+++ ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/test/TestTrie.java Mon Apr 1 20:17:44 2013
@@ -0,0 +1,72 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.spelling.test;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Scanner;
+
+import org.apache.lucene.search.suggest.jaspell.JaspellTernarySearchTrie;
+
+public class TestTrie {
+
+    /**
+     * Interactive probe for the dictionary trie. Loads the given dictionary file, then for
+     * every line typed on standard input prints the trimmed line with its trie value,
+     * followed by each near match and that match's trie value (tab-indented).
+     * <p>
+     * Near matches are collected for every difference budget from 0 up to a limit that
+     * depends on the query length: 1 for up to 4 characters, 2 for up to 12, otherwise 3.
+     * Exits with status -1 when the argument is missing or the dictionary cannot be read.
+     *
+     * @param args {@code args[0]} is the dictionary file
+     */
+    public static void main(String[] args) {
+        if(args.length < 1){
+            System.err.println("Args: <dictionary file>");
+            System.exit(-1);
+        }
+
+        JaspellTernarySearchTrie dictionary = null;
+        try {
+            dictionary = new JaspellTernarySearchTrie(new File(args[0]));
+        } catch (IOException e) {
+            e.printStackTrace();
+            System.err.println("Could not read dictionary file!");
+            System.exit(-1);
+        }
+        System.err.println("Trie loaded...");
+
+        Scanner stdin = new Scanner(System.in);
+        while(stdin.hasNextLine()){
+            String query = stdin.nextLine().trim();
+
+            // Longer queries tolerate more differing characters.
+            int length = query.length();
+            int budget = (length <= 4) ? 1 : ((length <= 12) ? 2 : 3);
+
+            List<String> matches = new ArrayList<String>();
+            int diff = 0;
+            while(diff <= budget){
+                dictionary.setMatchAlmostDiff(diff);
+                matches.addAll(dictionary.matchAlmost(query));
+                diff++;
+            }
+
+            System.out.println(query + "\t" + dictionary.get(query));
+            for(int i = 0; i < matches.size(); i++){
+                String match = matches.get(i);
+                System.out.println("\t" + match + "\t" + dictionary.get(match));
+            }
+        }
+    }
+}