You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by tm...@apache.org on 2012/12/17 17:18:04 UTC
svn commit: r1423003 - in /incubator/ctakes/trunk/ctakes-utils: ./
src/main/java/org/apache/ctakes/utils/struct/
src/main/java/org/apache/ctakes/utils/wiki/
Author: tmill
Date: Mon Dec 17 16:18:03 2012
New Revision: 1423003
URL: http://svn.apache.org/viewvc?rev=1423003&view=rev
Log:
Addresses JIRA issue CTAKES-117: add features to coreference -- a wiki index with cosine similarity for two queries.
Working locally with Lucene 4.0, but I am new to this API, so I may not be using it correctly.
Added:
incubator/ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/struct/
incubator/ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/struct/CounterMap.java (with props)
incubator/ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/struct/CounterTreeMap.java (with props)
incubator/ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/wiki/
incubator/ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/wiki/ApproximateMath.java (with props)
incubator/ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/wiki/ApproximateSimilarity.java (with props)
incubator/ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/wiki/SearchResult.java (with props)
incubator/ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/wiki/TestCosineSimilarity.java (with props)
incubator/ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/wiki/WikiIndex.java (with props)
Modified:
incubator/ctakes/trunk/ctakes-utils/pom.xml
Modified: incubator/ctakes/trunk/ctakes-utils/pom.xml
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-utils/pom.xml?rev=1423003&r1=1423002&r2=1423003&view=diff
==============================================================================
--- incubator/ctakes/trunk/ctakes-utils/pom.xml (original)
+++ incubator/ctakes/trunk/ctakes-utils/pom.xml Mon Dec 17 16:18:03 2012
@@ -42,5 +42,21 @@
<artifactId>junit</artifactId>
<scope>compile</scope>
</dependency>
+ <dependency>
+ <groupId>org.apache.lucene</groupId>
+ <artifactId>lucene-core</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.lucene</groupId>
+ <artifactId>lucene-queries</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.lucene</groupId>
+ <artifactId>lucene-queryparser</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.lucene</groupId>
+ <artifactId>lucene-analyzers-common</artifactId>
+ </dependency>
</dependencies>
</project>
Added: incubator/ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/struct/CounterMap.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/struct/CounterMap.java?rev=1423003&view=auto
==============================================================================
--- incubator/ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/struct/CounterMap.java (added)
+++ incubator/ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/struct/CounterMap.java Mon Dec 17 16:18:03 2012
@@ -0,0 +1,28 @@
+package org.apache.ctakes.utils.struct;
+
+import java.util.HashMap;
+
+// A HashMap specialised for counting: it removes the boilerplate of checking whether a key
+// has been seen before. Calling add(key) bumps that key's count by 1, and a key that has
+// never been added reads back as zero.
+public class CounterMap<K> extends HashMap<K, java.lang.Integer> {
+
+    /**
+     * Look up the count stored for a key.
+     *
+     * @param key the key to look up
+     * @return the stored value when the key is present, otherwise 0 (the key is not inserted)
+     */
+    @Override
+    public Integer get(Object key) {
+        // Both branches are Integer so a stored null is handed back as-is rather than unboxed.
+        return super.containsKey(key) ? super.get(key) : Integer.valueOf(0);
+    }
+
+    /**
+     * Increment the count of a key by one.
+     *
+     * @param key the key to count
+     */
+    public void add(K key){
+        add(key, 1);
+    }
+
+    /**
+     * Increase the count of a key by an arbitrary amount.
+     *
+     * @param key the key to count
+     * @param i the amount to add to the key's current count
+     */
+    public void add(K key, Integer i){
+        final boolean unseen = !super.containsKey(key);
+        if(unseen){
+            // seed the entry so the read below always finds a value
+            super.put(key,0);
+        }
+        final int updated = super.get(key)+i;
+        super.put(key, updated);
+    }
+}
Propchange: incubator/ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/struct/CounterMap.java
------------------------------------------------------------------------------
svn:executable = *
Propchange: incubator/ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/struct/CounterMap.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/struct/CounterTreeMap.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/struct/CounterTreeMap.java?rev=1423003&view=auto
==============================================================================
--- incubator/ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/struct/CounterTreeMap.java (added)
+++ incubator/ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/struct/CounterTreeMap.java Mon Dec 17 16:18:03 2012
@@ -0,0 +1,46 @@
+package org.apache.ctakes.utils.struct;
+
+import java.util.Comparator;
+import java.util.Set;
+import java.util.TreeMap;
+
+// This class is a simplifying class which makes it easy to build hashes to keep track of counts
+// and write less boilerplate code. If you just call it with an object, it will increment the
+// object's count by 1, initializing it to zero first if necessary.
+public class CounterTreeMap<K> implements Comparator<K>{
+
+// IntValueComparator<K> comp = new IntValueComparator<K>();
+ TreeMap<K,Integer> map = null;
+
+ public CounterTreeMap(){
+ map = new TreeMap<K,Integer>(this);
+// super(this);
+ }
+
+ public Integer get(Object key) {
+ if(map.containsKey(key)) return map.get(key);
+ return 0;
+ }
+
+ public void add(K key){
+ add(key, 1);
+ }
+
+ public void add(K key, Integer i){
+ if(!map.containsKey(key)){
+ map.put(key,0);
+ }
+ map.put(key, map.get(key)+i);
+ }
+
+
+ @Override
+ public int compare(K o1, K o2) {
+ return map.get(o1) - map.get(o2);
+ }
+
+ public Set<K> keySet() {
+ return map.keySet();
+ }
+
+}
Propchange: incubator/ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/struct/CounterTreeMap.java
------------------------------------------------------------------------------
svn:executable = *
Propchange: incubator/ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/struct/CounterTreeMap.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/wiki/ApproximateMath.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/wiki/ApproximateMath.java?rev=1423003&view=auto
==============================================================================
--- incubator/ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/wiki/ApproximateMath.java (added)
+++ incubator/ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/wiki/ApproximateMath.java Mon Dec 17 16:18:03 2012
@@ -0,0 +1,40 @@
+package org.apache.ctakes.utils.wiki;
+
+/**
+ * Table-driven approximations of square root and natural logarithm.
+ *
+ * Results for arguments in [1, table size) are read from a table indexed by the argument
+ * truncated to an int, so a fractional argument yields the value for its integer part.
+ * Arguments outside that range are handed to java.lang.Math. In particular values below 1,
+ * where truncation would collapse everything onto index 0, and negative values, which
+ * cannot index the table, are computed exactly.
+ *
+ * The hit/miss counters are plain static ints and are updated without synchronization.
+ */
+public class ApproximateMath {
+    private static final int numRoots = 10000;
+    private static final float[] roots = new float[numRoots];
+    private static int cacheHit = 0;
+    private static int cacheMiss = 0;
+
+    private static final int numLogs = 10000;
+    private static final float[] logs = new float[numLogs];
+    static{
+        for(int i = 0; i < numRoots; i++){
+            roots[i] = (float) Math.sqrt(i);
+        }
+
+        for(int i = 0; i < numLogs; i++){
+            logs[i] = (float) Math.log(i);
+        }
+    }
+
+    /**
+     * Square root of an int.
+     *
+     * @param i the argument
+     * @return the tabulated (float precision) root for 0 &lt;= i &lt; 10000, otherwise Math.sqrt(i),
+     *         which is NaN for a negative argument
+     */
+    public static final double asqrt(int i){
+        if(i >= 0 && i < numRoots){ cacheHit++; return roots[i]; }
+        else{ cacheMiss++; return Math.sqrt(i); }
+    }
+
+    /**
+     * Approximate square root of a double. This overload does not update the hit/miss counters.
+     *
+     * @param d the argument
+     * @return for 1 &lt;= d &lt; 10000 the tabulated root of d truncated to an int, otherwise
+     *         Math.sqrt(d)
+     */
+    public static final double asqrt(double d){
+        // below 1 the truncated index is 0, which would report a root of zero
+        if(d >= 1 && d < numRoots) return roots[(int)d];
+        else return Math.sqrt(d);
+    }
+
+    /**
+     * Approximate natural logarithm of a double.
+     *
+     * @param d the argument
+     * @return for 1 &lt;= d &lt; 10000 the tabulated logarithm of d truncated to an int, otherwise
+     *         Math.log(d)
+     */
+    public static final double alog(double d){
+        // below 1 the truncated index is 0, whose table entry is log(0) = -Infinity
+        if(d >= 1 && d < numLogs){ cacheHit++; return logs[(int)d];}
+        else{ cacheMiss++; return Math.log(d);}
+    }
+
+    /** Print the hit and miss counters to standard output. */
+    public static void dumpCache(){
+        System.out.println(cacheHit + " cache hits");
+        System.out.println(cacheMiss + " cache misses");
+    }
+}
Propchange: incubator/ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/wiki/ApproximateMath.java
------------------------------------------------------------------------------
svn:executable = *
Propchange: incubator/ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/wiki/ApproximateMath.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/wiki/ApproximateSimilarity.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/wiki/ApproximateSimilarity.java?rev=1423003&view=auto
==============================================================================
--- incubator/ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/wiki/ApproximateSimilarity.java (added)
+++ incubator/ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/wiki/ApproximateSimilarity.java Mon Dec 17 16:18:03 2012
@@ -0,0 +1,26 @@
+package org.apache.ctakes.utils.wiki;
+
+import org.apache.lucene.search.similarities.DefaultSimilarity;
+
+
+/**
+ * A DefaultSimilarity whose tf and idf factors are computed through ApproximateMath
+ * rather than by calling the java.lang.Math functions directly.
+ */
+public class ApproximateSimilarity extends DefaultSimilarity {
+
+    private static final long serialVersionUID = 1L;
+
+    /**
+     * Term-frequency factor. According to the lucene javadocs, DefaultSimilarity.tf is simply
+     * a square root, so ApproximateMath.asqrt is used in its place to save time.
+     *
+     * @param termFreq the term frequency
+     * @return ApproximateMath.asqrt(termFreq) narrowed to float
+     */
+    @Override
+    public final float tf(int termFreq){
+        final double root = ApproximateMath.asqrt(termFreq);
+        return (float) root;
+    }
+
+    /**
+     * Inverse-document-frequency factor.
+     *
+     * @param docFreq the document frequency passed in by the caller
+     * @param numDocs the document count passed in by the caller
+     * @return ApproximateMath.alog(numDocs / (docFreq + 1)) narrowed to float, plus one
+     */
+    @Override
+    public final float idf(long docFreq, long numDocs){
+        final double ratio = numDocs / (docFreq+1.0);
+        return (float)ApproximateMath.alog(ratio)+1;
+    }
+}
Propchange: incubator/ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/wiki/ApproximateSimilarity.java
------------------------------------------------------------------------------
svn:executable = *
Propchange: incubator/ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/wiki/ApproximateSimilarity.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/wiki/SearchResult.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/wiki/SearchResult.java?rev=1423003&view=auto
==============================================================================
--- incubator/ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/wiki/SearchResult.java (added)
+++ incubator/ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/wiki/SearchResult.java Mon Dec 17 16:18:03 2012
@@ -0,0 +1,16 @@
+package org.apache.ctakes.utils.wiki;
+
+/** One search hit: a document title paired with a score. */
+public class SearchResult {
+
+    public float documentScore;
+    public String documentTitle;
+
+    /**
+     * @param documentTitle the title to store
+     * @param documentScore the score to store
+     */
+    public SearchResult(String documentTitle, float documentScore) {
+        this.documentScore = documentScore;
+        this.documentTitle = documentTitle;
+    }
+
+    /** Render as the title followed by the score in parentheses. */
+    @Override
+    public String toString() {
+        final Object[] parts = { documentTitle, documentScore };
+        return String.format("%s (%s)", parts);
+    }
+}
Propchange: incubator/ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/wiki/SearchResult.java
------------------------------------------------------------------------------
svn:executable = *
Propchange: incubator/ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/wiki/SearchResult.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/wiki/TestCosineSimilarity.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/wiki/TestCosineSimilarity.java?rev=1423003&view=auto
==============================================================================
--- incubator/ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/wiki/TestCosineSimilarity.java (added)
+++ incubator/ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/wiki/TestCosineSimilarity.java Mon Dec 17 16:18:03 2012
@@ -0,0 +1,69 @@
+package org.apache.ctakes.utils.wiki;
+
+import java.io.IOException;
+import java.util.Scanner;
+
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.queryparser.classic.ParseException;
+
+public class TestCosineSimilarity {
+
+ public static void main(String[] args) throws CorruptIndexException, IOException, ParseException {
+ boolean approx = true;
+// WikiIndex wikipediaIndex = new WikiIndex(WikiIndex.defaultMaxHits, "/home/tmill/Documents/wiki/index_nometa", "text", approx);
+ WikiIndex wikipediaIndex = new WikiIndex(5, "/home/tmill/Documents/wiki/index_med_5k", "text", approx);
+ // WikiIndex wikipediaIndex = new WikiIndex(WikiIndex.defaultMaxHits, "/home/tmill/mnt/prv/data/index_vectors_notext", "text");
+
+
+ wikipediaIndex.initialize();
+ // wikipediaIndex.useCache = true;
+ System.out.println("Index loaded... Press enter to continue...");
+ Scanner scanner = new Scanner(System.in);
+ String line = scanner.nextLine();
+// System.out.println("Thanks for typing: " + line);
+
+ double sim = wikipediaIndex.getCosineSimilarity("the procedure", "continuous Baker Baker dialysis");
+ System.out.println("Sim is: " + sim);
+ sim = wikipediaIndex.getCosineSimilarity("the procedure", "an orthotic liver transplant");
+ System.out.println("Sim is: " + sim);
+ sim = wikipediaIndex.getCosineSimilarity("the procedure", "transplant");
+ System.out.println("Sim is: " + sim);
+
+ long start = System.currentTimeMillis();
+
+ for(int i = 0; i < 10; i++){
+ System.out.println("i = " + i);
+ if(i == 1) start = System.currentTimeMillis();
+
+ double cosine0 = wikipediaIndex.getCosineSimilarity("heart disease", "microsoft");
+ System.out.println("Similarity score: " + cosine0 + " took " + (System.currentTimeMillis()-start) + " ms to compute.");
+
+ double cosine1 = wikipediaIndex.getCosineSimilarity("heart disease", "smoking");
+ System.out.println("Similarity score: " + cosine1 + " took " + (System.currentTimeMillis()-start) + " ms to compute.");
+
+ double cosine2 = wikipediaIndex.getCosineSimilarity("aspirin", "tylenol");
+ System.out.println("Similarity score: " + cosine2 + " took " + (System.currentTimeMillis()-start) + " ms to compute.");
+
+ double cosine3 = wikipediaIndex.getCosineSimilarity("aspirin", "ibuprofen");
+ System.out.println("Similarity score: " + cosine3 + " took " + (System.currentTimeMillis()-start) + " ms to compute.");
+
+ double cosine4 = wikipediaIndex.getCosineSimilarity("advil", "ibuprofen");
+ System.out.println("Similarity score: " + cosine4 + " took " + (System.currentTimeMillis()-start) + " ms to compute.");
+ }
+ System.out.println("10 iterations took: " + (System.currentTimeMillis()-start) + " ms to compute.");
+ ApproximateMath.dumpCache();
+ // Scanner scanner = new Scanner(System.in);
+ // System.out.println("Enter concept 1:");
+ // while(scanner.hasNextLine()){
+ // String con1 = scanner.nextLine().trim();
+ // System.out.println("Enter concept 2: ");
+ // String con2 = scanner.nextLine().trim();
+ // double cos = wikipediaIndex.getCosineSimilarity(con1, con2);
+ // System.out.println("Similarity is: " + cos);
+ // }
+ // wikipediaIndex.close();
+
+
+ wikipediaIndex.close();
+ }
+}
Propchange: incubator/ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/wiki/TestCosineSimilarity.java
------------------------------------------------------------------------------
svn:executable = *
Propchange: incubator/ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/wiki/TestCosineSimilarity.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/wiki/WikiIndex.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/wiki/WikiIndex.java?rev=1423003&view=auto
==============================================================================
--- incubator/ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/wiki/WikiIndex.java (added)
+++ incubator/ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/wiki/WikiIndex.java Mon Dec 17 16:18:03 2012
@@ -0,0 +1,407 @@
+package org.apache.ctakes.utils.wiki;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.ctakes.utils.struct.CounterMap;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.queryparser.classic.ParseException;
+import org.apache.lucene.queryparser.classic.QueryParser;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.similarities.DefaultSimilarity;
+import org.apache.lucene.search.similarities.Similarity;
+import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.Version;
+
+/**
+ * A wrapper for a wikipedia lucene index.
+ *
+ * Usage: construct, call initialize(), issue queries through search() or
+ * getCosineSimilarity(), then call close().
+ *
+ * @author dmitriy dligach
+ *
+ */
+public class WikiIndex {
+
+    public static int defaultMaxHits = 10;
+    public static String defaultIndexPath = "/home/dima/i2b2/wiki-index/index_nometa";
+    public static String defaultSearchField = "text";
+
+    private int maxHits;
+    private String indexPath;
+    private String searchField;
+
+    private IndexReader indexReader;
+    private IndexSearcher indexSearcher;
+    private Analyzer standardAnalyzer;
+    private QueryParser queryParser;
+    private DefaultSimilarity similarity;
+    private int numDocs;
+
+    // when true, the vectors of the most recent pair of queries are reused
+    private boolean useCache = true;
+    private Cache lastQuery = null;
+
+    /**
+     * @param maxHits maximum number of documents retrieved per query
+     * @param indexPath file-system path of the lucene index
+     * @param searchField name of the field that queries are parsed against
+     * @param approximate true to compute tf/idf with ApproximateSimilarity, false for DefaultSimilarity
+     */
+    public WikiIndex(int maxHits, String indexPath, String searchField, boolean approximate) {
+        this.maxHits = maxHits;
+        this.indexPath = indexPath;
+        this.searchField = searchField;
+        this.similarity = approximate ? new ApproximateSimilarity() : new DefaultSimilarity();
+    }
+
+    /** Same as the four-argument constructor with approximate = false. */
+    public WikiIndex(int maxHits, String indexPath, String searchField){
+        this(maxHits, indexPath, searchField, false);
+    }
+
+    /** Use the static defaults. Delegates so that the similarity is initialised as well. */
+    public WikiIndex() {
+        this(defaultMaxHits, defaultIndexPath, defaultSearchField);
+    }
+
+    /**
+     * Open the index and create the searcher, analyzer, query parser and query cache.
+     * Must be called before any query method.
+     */
+    public void initialize() throws CorruptIndexException, IOException {
+
+        indexReader = IndexReader.open(FSDirectory.open(new File(indexPath)));
+        numDocs = indexReader.numDocs();
+        indexSearcher = new IndexSearcher(indexReader);
+        standardAnalyzer = new StandardAnalyzer(Version.LUCENE_40);
+        queryParser = new QueryParser(Version.LUCENE_40, searchField, standardAnalyzer);
+        lastQuery = new Cache();
+    }
+
+    /**
+     * Search the index. Return a list of article titles and their scores.
+     * Each hit is passed through redirect handling first, so the title and score are
+     * those of the redirect target when one is found.
+     *
+     * @param queryText free text; it is escaped before being parsed
+     * @throws org.apache.lucene.queryparser.classic.ParseException
+     */
+    public ArrayList<SearchResult> search(String queryText) throws ParseException, IOException {
+
+        ArrayList<SearchResult> articleTitles = new ArrayList<SearchResult>();
+
+        String escaped = QueryParser.escape(queryText);
+        Query query = queryParser.parse(escaped);
+
+        ScoreDoc[] scoreDocs = indexSearcher.search(query, null, maxHits).scoreDocs;
+        for(ScoreDoc scoreDoc : scoreDocs) {
+            ScoreDoc redirectScoreDoc = handlePossibleRedirect(scoreDoc);
+            Document doc = indexSearcher.doc(redirectScoreDoc.doc);
+            articleTitles.add(new SearchResult(doc.get("title"), redirectScoreDoc.score));
+        }
+
+        return articleTitles;
+    }
+
+    /**
+     * Send two queries to the index.
+     * For each query, form a tfidf vector that represents N top matching documents.
+     * Return cosine similarity between the two tfidf vectors.
+     *
+     * @return the cosine similarity, or 0 when either query yields an empty vector
+     *         (no hits, or e.g. redirects to a non-existent page) or a zero-length vector
+     */
+    public double getCosineSimilarity(String queryText1, String queryText2) throws ParseException, IOException {
+        HashMap<String, Double> vector1 = cachedOrFreshVector(queryText1);
+        if(vector1.size() == 0) {
+            return 0;
+        }
+
+        HashMap<String, Double> vector2 = cachedOrFreshVector(queryText2);
+        if(vector2.size() == 0) {
+            return 0;
+        }
+
+        // remember this pair only once both vectors are known to be usable
+        if(useCache){
+            lastQuery.t1 = queryText1;
+            lastQuery.v1 = vector1;
+            lastQuery.t2 = queryText2;
+            lastQuery.v2 = vector2;
+        }
+
+        double dotProduct = computeDotProduct(vector1, vector2);
+        double norm1 = computeEuclideanNorm(vector1);
+        double norm2 = computeEuclideanNorm(vector2);
+
+        double denominator = norm1 * norm2;
+        if(denominator == 0) {
+            return 0; // a zero-length vector has no direction to compare
+        }
+        return dotProduct / denominator;
+    }
+
+    /**
+     * Return the tfidf vector for a query, reusing the vector remembered from the previous
+     * call to getCosineSimilarity when the query text matches and caching is enabled.
+     * Returns an empty map when the query has no hits.
+     */
+    private HashMap<String, Double> cachedOrFreshVector(String queryText) throws ParseException, IOException {
+        if(useCache && lastQuery.t1 != null && lastQuery.t1.equals(queryText)){
+            return lastQuery.v1;
+        }
+        if(useCache && lastQuery.t2 != null && lastQuery.t2.equals(queryText)){
+            return lastQuery.v2;
+        }
+
+        // start from scratch
+        ArrayList<Terms> termFreqVectors = getTermFreqVectors(queryText);
+        if(termFreqVectors.size() == 0) {
+            return new HashMap<String, Double>();
+        }
+        return makeTfIdfVector(termFreqVectors);
+    }
+
+    /**
+     * Run a query and collect the "text" term vector of every hit (after redirect handling).
+     *
+     * @return one entry per hit; an entry is whatever IndexReader.getTermVector returned for
+     *         that document, which makeTfIdfVector treats as possibly null
+     */
+    public ArrayList<Terms> getTermFreqVectors(String queryString) throws ParseException, IOException{
+        String escaped = QueryParser.escape(queryString);
+        Query query = queryParser.parse(escaped);
+        ScoreDoc[] scoreDocs = indexSearcher.search(query, maxHits).scoreDocs;
+
+        ArrayList<Terms> termFreqVectors = new ArrayList<Terms>();
+        for(ScoreDoc scoreDoc : scoreDocs) {
+            ScoreDoc redirectScoreDoc = handlePossibleRedirect(scoreDoc);
+            Terms termFreqVector = indexReader.getTermVector(redirectScoreDoc.doc, "text");
+            termFreqVectors.add(termFreqVector);
+        }
+
+        return termFreqVectors;
+    }
+
+    /**
+     * Return the document to which the input document redirects.
+     * Return the same document if there is no redirect for the input document,
+     * or if the redirect target cannot be found in the index.
+     */
+    private ScoreDoc handlePossibleRedirect(ScoreDoc scoreDoc) throws ParseException, CorruptIndexException, IOException {
+
+        Document doc = indexSearcher.doc(scoreDoc.doc);
+        String redirectTitle = doc.get("redirect");
+
+        // check if there is a redirect
+        if(redirectTitle == null) {
+            return scoreDoc;
+        }
+
+        // NOTE(review): this parser uses LUCENE_30 while initialize() uses LUCENE_40 -- confirm whether that is intended
+        QueryParser redirectQueryParser = new QueryParser(Version.LUCENE_30, "title", standardAnalyzer);
+
+        String redirectTitleNoUnderscores = redirectTitle.replace('_', ' ');
+        // Escape first and quote afterwards: escaping the already-quoted string would also
+        // escape the quotes, and the title would no longer be parsed as a phrase.
+        String redirectTitleQuoted = '"' + QueryParser.escape(redirectTitleNoUnderscores) + '"';
+        Query redirectQuery = redirectQueryParser.parse(redirectTitleQuoted);
+
+        ScoreDoc[] redirectScoreDocs = indexSearcher.search(redirectQuery, null, 1).scoreDocs;
+        if(redirectScoreDocs.length < 1) {
+            System.out.println("failed redirect: " + doc.get("title") + " -> " + redirectTitle);
+            return scoreDoc; // redirect query did not return any results
+        }
+
+        return redirectScoreDocs[0];
+    }
+
+    /**
+     * Return a hash table that maps terms to their tfidf values.
+     * The input is a list of term vectors, one per matching document; null entries are skipped.
+     *
+     * The tf argument for a term is the number of times the term was enumerated across all
+     * of the input vectors (one increment per enumerated term per vector); the idf argument
+     * is the term's document frequency in the "text" field of the whole index.
+     */
+    private HashMap<String, Double> makeTfIdfVector(ArrayList<Terms> termFreqVectors) throws IOException {
+
+        CounterMap<String> countVector = new CounterMap<String>();
+
+        for(Terms terms : termFreqVectors) {
+            if(terms == null) {
+                continue; // some documents are empty
+            }
+
+            TermsEnum termsEnum = terms.iterator(null);
+            while(termsEnum.next() != null){
+                BytesRef term = termsEnum.term();
+                countVector.add(term.utf8ToString());
+            }
+        }
+
+        // Weights depend only on the final counts, so they are computed once after all
+        // vectors have been read instead of being recomputed after every document.
+        HashMap<String, Double> tfIdfVector = new HashMap<String, Double>();
+        for(String key : countVector.keySet()){
+            double tf = similarity.tf((long)countVector.get(key));
+            double idf = similarity.idf(indexReader.docFreq(new Term("text", key)), numDocs);
+            tfIdfVector.put(key, tf*idf);
+        }
+        return tfIdfVector;
+    }
+
+    /** Return the square root of the sum of the squared values of the vector. */
+    private double computeEuclideanNorm(HashMap<String, Double> tfIdfVector) {
+
+        double sumOfSquares = 0;
+
+        for(double tfidf : tfIdfVector.values()) {
+            sumOfSquares = sumOfSquares + tfidf*tfidf;
+        }
+
+        // Exact root: ApproximateMath.asqrt(double) truncates its argument to an int, which
+        // distorts the norm and reports zero for any sum below one.
+        return Math.sqrt(sumOfSquares);
+    }
+
+    /** Return the sum, over the terms present in both vectors, of the product of their values. */
+    private double computeDotProduct(HashMap<String, Double> vector1, HashMap<String, Double> vector2) {
+
+        // iterate over the smaller map and probe the larger one
+        Map<String, Double> smallSet = vector1;
+        Map<String, Double> largeSet = vector2;
+        if(vector1.size() > vector2.size()){
+            smallSet = vector2;
+            largeSet = vector1;
+        }
+
+        double dotProduct = 0;
+        for(Map.Entry<String, Double> entry : smallSet.entrySet()) {
+            Double other = largeSet.get(entry.getKey());
+            if(other != null) {
+                dotProduct = dotProduct + entry.getValue() * other;
+            }
+        }
+
+        return dotProduct;
+    }
+
+    /**
+     * Return a map holding, for every term present in BOTH vectors, the sum of its two values.
+     * Terms present in only one vector are not included. Currently not called.
+     */
+    private HashMap<String, Double> addVectors(HashMap<String, Double> vector1, HashMap<String, Double> vector2) {
+
+        Map<String, Double> smallSet = vector1;
+        Map<String, Double> largeSet = vector2;
+        if(vector1.size() > vector2.size()){
+            smallSet = vector2;
+            largeSet = vector1;
+        }
+
+        HashMap<String, Double> sum = new HashMap<String, Double>();
+        for(Map.Entry<String, Double> entry : smallSet.entrySet()) {
+            Double other = largeSet.get(entry.getKey());
+            if(other != null) {
+                sum.put(entry.getKey(), entry.getValue() + other);
+            }
+        }
+
+        return sum;
+    }
+
+    /**
+     * Close the index reader and the analyzer. Safe to call when initialize() was never
+     * called; the analyzer is closed even if closing the reader fails.
+     */
+    public void close() throws IOException {
+
+        try {
+            if(indexReader != null) {
+                indexReader.close();
+            }
+        } finally {
+            if(standardAnalyzer != null) {
+                standardAnalyzer.close();
+            }
+        }
+    }
+}
+
+// Plain mutable holder for two query strings (t1, t2) and the vector belonging to each
+// (v1, v2). Every field starts out null.
+class Cache{
+    String t1, t2;
+    HashMap<String,Double> v1, v2;
+}
Propchange: incubator/ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/wiki/WikiIndex.java
------------------------------------------------------------------------------
svn:executable = *
Propchange: incubator/ctakes/trunk/ctakes-utils/src/main/java/org/apache/ctakes/utils/wiki/WikiIndex.java
------------------------------------------------------------------------------
svn:mime-type = text/plain