You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ctakes.apache.org by dl...@apache.org on 2015/08/18 19:29:19 UTC
svn commit: r1696463 -
/ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/index/SearchUtility.java
Author: dligach
Date: Tue Aug 18 17:29:18 2015
New Revision: 1696463
URL: http://svn.apache.org/r1696463
Log:
a quick class to search lucene index
Added:
ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/index/SearchUtility.java (with props)
Added: ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/index/SearchUtility.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/index/SearchUtility.java?rev=1696463&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/index/SearchUtility.java (added)
+++ ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/index/SearchUtility.java Tue Aug 18 17:29:18 2015
@@ -0,0 +1,74 @@
+package org.apache.ctakes.index;
+
+import java.io.File;
+import java.io.IOException;
+
+import javax.swing.JOptionPane;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.PhraseQuery;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.store.FSDirectory;
+
+public class SearchUtility {
+
+  /** Asks for a phrase, runs it as an exact phrase query and prints each hit in context. */
+  public static void main(String[] args) throws IOException {
+    final int maxHits = 250;
+    final String searchField = "content";
+    final String indexLocation = "/Users/Dima/Boston/Data/Mimic/Index/";
+
+    String input = JOptionPane.showInputDialog("Enter query");
+    if(input == null || input.trim().isEmpty()) {
+      return; // dialog cancelled or left blank: nothing to search for
+    }
+    // collapse whitespace runs so that split(" ") never yields an empty term
+    String queryText = input.trim().replaceAll("\\s+", " ");
+
+    IndexReader indexReader = IndexReader.open(FSDirectory.open(new File(indexLocation)));
+    try {
+      IndexSearcher indexSearcher = new IndexSearcher(indexReader);
+      PhraseQuery phraseQuery = new PhraseQuery();
+      for(String word : queryText.split(" ")) {
+        phraseQuery.add(new Term(searchField, word));
+      }
+      phraseQuery.setSlop(0);
+
+      TopDocs topDocs = indexSearcher.search(phraseQuery, maxHits);
+      for(ScoreDoc scoreDoc : topDocs.scoreDocs) {
+        Document document = indexSearcher.doc(scoreDoc.doc);
+        // the text is lower-cased for matching, so the query has to be as well
+        String text = document.get(searchField).toLowerCase();
+        System.out.println(getContext(queryText.toLowerCase(), text, 20));
+      }
+      System.out.println("total hits: " + topDocs.scoreDocs.length);
+    } finally {
+      indexReader.close(); // release the index files even if the search fails
+    }
+  }
+
+  /**
+   * Returns the first occurrence of string in text (line breaks read as spaces)
+   * plus up to characterWindow characters on either side, or "" if it is absent.
+   *
+   * TODO: the indexer drops characters that are still in the source text, e.g. the
+   * query "pain and swelling" hits "... pain, and swelling" but is not found here.
+   */
+  public static String getContext(String string, String text, int characterWindow) {
+    String noEOL = text.replace('\n', ' ');
+    int begin = noEOL.indexOf(string);
+    if(begin == -1) {
+      return "";
+    }
+    int end = begin + string.length();
+    int contextBegin = Math.max(0, begin - characterWindow);
+    // clamp the window rather than the sum: end + characterWindow can overflow int
+    int contextEnd = end + Math.min(characterWindow, noEOL.length() - end);
+    return noEOL.substring(contextBegin, contextEnd);
+  }
+}
+
Propchange: ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/index/SearchUtility.java
------------------------------------------------------------------------------
svn:mime-type = text/plain