You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by dl...@apache.org on 2015/08/18 19:29:19 UTC
svn commit: r1696463 - /ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/index/SearchUtility.java

Author: dligach
Date: Tue Aug 18 17:29:18 2015
New Revision: 1696463

URL: http://svn.apache.org/r1696463
Log:
a quick class to search lucene index

Added:
    ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/index/SearchUtility.java   (with props)

Added: ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/index/SearchUtility.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/index/SearchUtility.java?rev=1696463&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/index/SearchUtility.java (added)
+++ ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/index/SearchUtility.java Tue Aug 18 17:29:18 2015
@@ -0,0 +1,74 @@
+package org.apache.ctakes.index;
+
+import java.io.File;
+import java.io.IOException;
+
+import javax.swing.JOptionPane;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.PhraseQuery;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.store.FSDirectory;
+
+/**
+ * Small command-line tool that runs an exact phrase query against a Lucene
+ * index and prints each hit with a little surrounding context.
+ */
+public class SearchUtility {
+
+  /**
+   * Asks for a query in a dialog, searches the "content" field of the index
+   * and prints one context line per returned document.
+   *
+   * @param args ignored
+   * @throws IOException if the index cannot be opened or read
+   */
+  public static void main(String[] args) throws IOException {
+
+    final int maxHits = 250;
+    final String searchField = "content";
+    final String indexLocation = "/Users/Dima/Boston/Data/Mimic/Index/";
+
+    // showInputDialog returns null when the dialog is cancelled.
+    String queryText = JOptionPane.showInputDialog("Enter query");
+    if(queryText == null || queryText.trim().length() == 0) {
+      System.out.println("no query entered");
+      return;
+    }
+    // Collapse whitespace so stray spaces do not become empty phrase terms.
+    queryText = queryText.trim().replaceAll("\\s+", " ");
+
+    FSDirectory directory = FSDirectory.open(new File(indexLocation));
+    try {
+      IndexReader indexReader = IndexReader.open(directory);
+      try {
+        IndexSearcher indexSearcher = new IndexSearcher(indexReader);
+
+        // Slop 0: the words must appear adjacent and in the given order.
+        PhraseQuery phraseQuery = new PhraseQuery();
+        for(String word : queryText.split(" ")) {
+          phraseQuery.add(new Term(searchField, word));
+        }
+        phraseQuery.setSlop(0);
+
+        TopDocs topDocs = indexSearcher.search(phraseQuery, maxHits);
+        ScoreDoc[] scoreDocs = topDocs.scoreDocs;
+
+        // The document text is lower-cased below, so the needle must be too.
+        String needle = queryText.toLowerCase();
+        for(ScoreDoc scoreDoc : scoreDocs) {
+          Document document = indexSearcher.doc(scoreDoc.doc);
+          String text = document.get(searchField).toLowerCase().replace('\n', ' ');
+          System.out.println(getContext(needle, text, 20));
+        }
+
+        // Number of hits actually returned, i.e. at most maxHits.
+        System.out.println("total hits: " + scoreDocs.length);
+      } finally {
+        indexReader.close();
+      }
+    } finally {
+      directory.close();
+    }
+  }
+
+  /**
+   * Get context for a string. Return "" if string not found in text.
+   *
+   * TODO: Occasionally no context is found when the indexer removed certain
+   * characters which still exist in the source text. E.g. when "... pain, and swelling"
+   * is in the source document, the query "pain and swelling" will return this document.
+   * However, this method will not find the occurrence of "pain and swelling" in the
+   * document because of the comma.
+   *
+   * @param string phrase to look for (case-sensitive); null yields ""
+   * @param text text to search, newlines are treated as spaces; null yields ""
+   * @param characterWindow characters of context to keep on each side of the
+   *        match; negative values are treated as 0
+   * @return the first occurrence of string with its context, or "" if absent
+   */
+  public static String getContext(String string, String text, int characterWindow) {
+
+    if(string == null || text == null) {
+      return "";
+    }
+
+    String noEOL = text.replace('\n', ' ');
+    int begin = noEOL.indexOf(string);
+    if(begin == -1) {
+      return "";
+    }
+
+    int end = begin + string.length();
+    int window = Math.max(0, characterWindow);
+    int contextBegin = Math.max(0, begin - window);
+    // Clamp against the remaining length rather than adding first, so a very
+    // large window cannot overflow int and produce a negative end index.
+    int contextEnd = end + Math.min(window, noEOL.length() - end);
+
+    return noEOL.substring(contextBegin, contextEnd);
+  }
+}
+

Propchange: ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/index/SearchUtility.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain