You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@lucene.apache.org by Lingviston <vl...@outlook.com> on 2013/08/09 14:37:42 UTC

How to get hits coordinates in Lucene 4.4.0

Hi, I'm trying to use Lucene in my Android project. To start with, I've
created a small demo app. It works with .txt files, but I need to work with
.pdf files. Analyzing my code, I understand that it will have some issues
with PDFs due to memory management. However, the question I want to ask here
is not related to memory but to hit highlighting. Highlighting works now, but
using the `Highlighter` class with PDFs is not what I want. To implement my
own highlighting, I need to know some kind of coordinates of the found words
in the text. How can I get them? I'm using Lucene 4.4.0, while all of the
examples, like the one here
<http://stackoverflow.com/questions/1311199/finding-the-position-of-search-hits-from-lucene> 
are for much older versions. Here is my code:

    public class MainActivity extends Activity {
   
//-----------------------------------------------------------------------------------------------------
    //
    // Constants
    //
   
//-----------------------------------------------------------------------------------------------------
	public static final String FIELD_PATH = "path";
	public static final String FIELD_CONTENTS = "contents";
	
   
//-----------------------------------------------------------------------------------------------------
    //
    // Fields
    //
   
//-----------------------------------------------------------------------------------------------------
	private EditText mEditText;
	private TextView mTextView;
	
   
//-----------------------------------------------------------------------------------------------------
    //
    // Methods
    //
   
//-----------------------------------------------------------------------------------------------------
    /**
     * Activity entry point: inflates the main layout, binds and wires the views,
     * then builds the search index.
     *
     * NOTE(review): createIndex() is called synchronously from here, so it appears to
     * run on whichever thread invokes onCreate — confirm that is acceptable for large
     * input directories.
     */
    @Override
    protected void onCreate(Bundle savedInstanceState) {
        super.onCreate(savedInstanceState);
        this.setContentView(R.layout.activity_main);

        // The view fields must be populated before a listener is attached to them.
        this.findViews();
        this.initViews();

        this.createIndex();
    }

    /** Looks up the query input and the result view in the inflated layout and caches them in fields. */
    private void findViews() {
        this.mEditText = (EditText) this.findViewById(R.id.activity_main_edittext);
        this.mTextView = (TextView) this.findViewById(R.id.activity_main_textview);
    }
    
    /** Attaches the editor-action listener to the query input so IME actions reach this activity. */
    private void initViews() {
        this.mEditText.setOnEditorActionListener(this.mEditorActionListener);
    }

    /**
     * Parses {@code searchString}, runs it against the index stored in the app's external
     * files directory and renders every hit (file path plus highlighted contents) into
     * {@code mTextView}. Any failure is logged and leaves the view untouched.
     *
     * @param searchString query text in query-parser syntax; searched in FIELD_CONTENTS
     */
    private void performSearch(String searchString) {
        final String tag = getClass().getSimpleName();
        Directory directory = null;
        DirectoryReader ireader = null;
        try {
            directory = NIOFSDirectory.open(getExternalFilesDir(null));
            ireader = DirectoryReader.open(directory);
            IndexSearcher isearcher = new IndexSearcher(ireader);

            Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44);
            QueryParser queryParser =
                    new AnalyzingQueryParser(Version.LUCENE_44, FIELD_CONTENTS, analyzer);
            Query query = queryParser.parse(searchString);
            TopDocs topDocs = isearcher.search(query, null, 1000);
            ScoreDoc[] docs = topDocs.scoreDocs;

            // The highlighter depends only on the query, so build it once instead of per hit.
            QueryScorer scorer = new QueryScorer(query);
            Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter("", ""), scorer);
            highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer, Integer.MAX_VALUE));
            // The fragmenter asks for the whole document as one fragment; lift the
            // highlighter's default analysis limit as well so long texts are not cut off.
            highlighter.setMaxDocCharsToAnalyze(Integer.MAX_VALUE);

            StringBuilder result = new StringBuilder();
            StringBuilder debugInfo = new StringBuilder();
            debugInfo.append("Number of hits: ");
            debugInfo.append(docs.length);
            debugInfo.append("\n");

            // Iterate through the results:
            for (int i = 0; i < docs.length; i++) {
                Document hitDoc = isearcher.doc(docs[i].doc);

                String path = hitDoc.get(FIELD_PATH);
                debugInfo.append("Path: ");
                debugInfo.append(path);
                debugInfo.append("\n");

                result.append("-------------------------------------------------------");
                result.append("File: ");
                result.append(path);
                result.append("-------------------------------------------------------");
                result.append("<br>");

                String content = hitDoc.get(FIELD_CONTENTS);
                String highlighted = highlighter.getBestFragment(analyzer, FIELD_CONTENTS, content);
                if (highlighted == null) {
                    // No fragment scored for this hit: show the plain text rather than "null".
                    highlighted = content;
                }

                result.append("-------------------------------------------------------");
                result.append("Contents: ");
                result.append("-------------------------------------------------------");
                result.append("<br>");
                result.append(highlighted);
                result.append("<br><br><br>");
            }

            //not working
            /*PostingsHighlighter highlighter = new PostingsHighlighter();
            String highlights[] = highlighter.highlight(FIELD_CONTENTS, query, isearcher, topDocs);*/
            mTextView.setText(Html.fromHtml(result.toString()));
            Log.d(tag, debugInfo.toString());
        } catch (Exception e) {
            // Pass the throwable itself: e.getMessage() may be null, which Log.e rejects.
            Log.e(tag, "Search failed for query: " + searchString, e);
        } finally {
            // Release the index file handles even when the search fails part-way.
            if (ireader != null) {
                try {
                    ireader.close();
                } catch (IOException e) {
                    Log.e(tag, "Failed to close index reader", e);
                }
            }
            if (directory != null) {
                try {
                    directory.close();
                } catch (IOException e) {
                    Log.e(tag, "Failed to close index directory", e);
                }
            }
        }
    }
    
    private void createIndex() {
    	try {
    		//Create directory for index.
    		Directory indexDirectory = new
NIOFSDirectory(getExternalFilesDir(null));
    		
			Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44);
			
			IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_44,
analyzer);
			config.setOpenMode(OpenMode.CREATE);
			
			IndexWriter indexWriter = new IndexWriter(indexDirectory, config);
			
			//Loop through files in specified directory and adding them to index.
			File dir = new File(Environment.getExternalStorageDirectory() +
"/lucene");
			File[] files = dir.listFiles();
			for (File file : files) {
				Document document = new Document();
	
				{
					FieldType fieldType = new FieldType(TextField.TYPE_STORED);
				
fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
					
					String path = file.getCanonicalPath();
					document.add(new Field(FIELD_PATH, path, fieldType));
				}
	
				{
					FieldType fieldType = new FieldType(TextField.TYPE_STORED);
					fieldType.setIndexed(true);
				
fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
					fieldType.setStored(true);
					fieldType.setStoreTermVectors(true);
					fieldType.setTokenized(true);
					fieldType.setStoreTermVectorOffsets(true);
					String content = readFully(new FileReader(file)); //we can't store
Reader objects but we need to be able to access the content for highlighting
					document.add(new Field(FIELD_CONTENTS, content, fieldType));
				}
	
				indexWriter.addDocument(document);
			}
			indexWriter.close();
    	} catch (Exception e) {
    		e.printStackTrace();
    	}
	}
    
    /**
     * Reads every remaining character from {@code reader} and returns it as one string.
     * The reader is not closed here; the caller remains responsible for closing it.
     *
     * @param reader source to drain
     * @return all characters read, or an empty string if the reader is already at its end
     * @throws IOException if reading from {@code reader} fails
     */
    public static String readFully(Reader reader) throws IOException {
        char[] chunk = new char[8 * 1024]; // 8K at a time
        StringBuilder text = new StringBuilder(); // method-local, so no synchronization needed
        int numChars;

        // Only -1 signals end of stream; a read that delivers 0 characters must not stop the loop.
        while ((numChars = reader.read(chunk, 0, chunk.length)) != -1) {
            text.append(chunk, 0, numChars);
        }

        return text.toString();
    }
    
    /**
     * Inflates {@code R.menu.main} into the supplied options menu.
     *
     * @return always {@code true}
     */
    @Override
    public boolean onCreateOptionsMenu(Menu menu) {
        this.getMenuInflater().inflate(R.menu.main, menu);
        return true;
    }
    
   
//-----------------------------------------------------------------------------------------------------
    //
    // Listeners
    //
   
//-----------------------------------------------------------------------------------------------------
    /** Runs a search with the editor's current text when the IME "search" action is received. */
    private OnEditorActionListener mEditorActionListener = new OnEditorActionListener() {
        @Override
        public boolean onEditorAction(TextView v, int actionId, KeyEvent event) {
            // Every action other than "search" is reported as not handled.
            if (actionId != EditorInfo.IME_ACTION_SEARCH) {
                return false;
            }
            performSearch(v.getText().toString());
            return true;
        }
    };
}

So how can I get the hit coordinates? Also, do you have any other advice
about what I'm doing wrong?



--
View this message in context: http://lucene.472066.n3.nabble.com/How-to-get-hits-coordinates-in-Lucene-4-4-0-tp4083508.html
Sent from the Lucene - Java Developer mailing list archive at Nabble.com.

---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscribe@lucene.apache.org
For additional commands, e-mail: dev-help@lucene.apache.org


Re: How to get hits coordinates in Lucene 4.4.0

Posted by Michael McCandless <lu...@mikemccandless.com>.
Hi, could you please re-ask this on the user's list
(java-user@lucene.apache.org)?

The dev list is for discussing the development of Lucene's source code.

Thanks.

Mike McCandless

http://blog.mikemccandless.com


On Fri, Aug 9, 2013 at 8:37 AM, Lingviston
<vl...@outlook.com> wrote:
> Hi, I'm trying to use Lucene in my Android project. To start with I've
> created a small demo app. It works with .txt files but I need to work with
> .pdf. So analyzing my code I understand that it will have some issues with
> .pdfs due to memory management. However the question I want to ask here is
> not related to memory but to hit highlighting. It works now but using of
> `Highlighter` class with pdfs is not what I want. So to implement my own
> highlighting I need to know some kind of coordinates of found words in the
> text. How can I get them? I'm using lucene 4.4.0 while all of the examples
> like  here
> <http://stackoverflow.com/questions/1311199/finding-the-position-of-search-hits-from-lucene>
> are for much older versions. Here is my code:
>
>     public class MainActivity extends Activity {
>
> //-----------------------------------------------------------------------------------------------------
>     //
>     // Constants
>     //
>
> //-----------------------------------------------------------------------------------------------------
>         public static final String FIELD_PATH = "path";
>         public static final String FIELD_CONTENTS = "contents";
>
>
> //-----------------------------------------------------------------------------------------------------
>     //
>     // Fields
>     //
>
> //-----------------------------------------------------------------------------------------------------
>         private EditText mEditText;
>         private TextView mTextView;
>
>
> //-----------------------------------------------------------------------------------------------------
>     //
>     // Methods
>     //
>
> //-----------------------------------------------------------------------------------------------------
>     @Override
>     protected void onCreate(Bundle savedInstanceState) {
>         super.onCreate(savedInstanceState);
>         setContentView(R.layout.activity_main);
>         findViews();
>         initViews();
>         createIndex();
>     }
>
>     private void findViews() {
>         mEditText = (EditText) findViewById(R.id.activity_main_edittext);
>         mTextView = (TextView) findViewById(R.id.activity_main_textview);
>     }
>
>     private void initViews() {
>         mEditText.setOnEditorActionListener(mEditorActionListener);
>     }
>
>     private void performSearch(String searchString) {
>         try {
>                         Directory directory = NIOFSDirectory.open(getExternalFilesDir(null));
>                         DirectoryReader ireader = DirectoryReader.open(directory);
>                     IndexSearcher isearcher = new IndexSearcher(ireader);
>
>                         Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44);
>                         QueryParser queryParser = new AnalyzingQueryParser(Version.LUCENE_44,
> FIELD_CONTENTS, analyzer);
>                         Query query = queryParser.parse(searchString);
>                         TopDocs topDocs = isearcher.search(query, null, 1000);
>                         ScoreDoc[] docs = topDocs.scoreDocs;
>
>                         StringBuilder result = new StringBuilder();
>                         StringBuilder debugInfo = new StringBuilder();
>                         debugInfo.append("Number of hits: ");
>                         debugInfo.append(docs.length);
>                         debugInfo.append("\n");
>
>                         // Iterate through the results:
>                         for (int i = 0; i < docs.length; i++) {
>                                 Document hitDoc = isearcher.doc(docs[i].doc);
>
>                                 String path = hitDoc.get(FIELD_PATH);
>                                 debugInfo.append("Path: ");
>                                 debugInfo.append(path);
>                                 debugInfo.append("\n");
>
>
> result.append("-------------------------------------------------------");
>                                 result.append("File: ");
>                                 result.append(path);
>
> result.append("-------------------------------------------------------");
>                                 result.append("<br>");
>
>                                 String content = hitDoc.get(FIELD_CONTENTS);
>                                 QueryScorer scorer = new QueryScorer(query);
>                                 Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter("",
> ""), scorer);
>                                 highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer,
> Integer.MAX_VALUE));
>                                 String highlighted = highlighter.getBestFragment(analyzer,
> FIELD_CONTENTS, content);
>
> result.append("-------------------------------------------------------");
>                                 result.append("Contents: ");
>
> result.append("-------------------------------------------------------");
>                                 result.append("<br>");
>                                 result.append(highlighted);
>                                 result.append("<br><br><br>");
>                         }
>
>                         //not working
>                         /*PostingsHighlighter highlighter = new PostingsHighlighter();
>                         String highlights[] = highlighter.highlight(FIELD_CONTENTS, query,
> isearcher, topDocs);*/
>                         mTextView.setText(Html.fromHtml(result.toString()));
>                         Log.d(getClass().getSimpleName(), debugInfo.toString());
>         } catch (Exception e) {
>                 e.printStackTrace();
>                 Log.e(getClass().getSimpleName(), e.getMessage());
>         }
>
>     }
>
>     private void createIndex() {
>         try {
>                 //Create directory for index.
>                 Directory indexDirectory = new
> NIOFSDirectory(getExternalFilesDir(null));
>
>                         Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44);
>
>                         IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_44,
> analyzer);
>                         config.setOpenMode(OpenMode.CREATE);
>
>                         IndexWriter indexWriter = new IndexWriter(indexDirectory, config);
>
>                         //Loop through files in specified directory and adding them to index.
>                         File dir = new File(Environment.getExternalStorageDirectory() +
> "/lucene");
>                         File[] files = dir.listFiles();
>                         for (File file : files) {
>                                 Document document = new Document();
>
>                                 {
>                                         FieldType fieldType = new FieldType(TextField.TYPE_STORED);
>
> fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
>
>                                         String path = file.getCanonicalPath();
>                                         document.add(new Field(FIELD_PATH, path, fieldType));
>                                 }
>
>                                 {
>                                         FieldType fieldType = new FieldType(TextField.TYPE_STORED);
>                                         fieldType.setIndexed(true);
>
> fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
>                                         fieldType.setStored(true);
>                                         fieldType.setStoreTermVectors(true);
>                                         fieldType.setTokenized(true);
>                                         fieldType.setStoreTermVectorOffsets(true);
>                                         String content = readFully(new FileReader(file)); //we can't store
> Reader objects but we need to be able to access the content for highlighting
>                                         document.add(new Field(FIELD_CONTENTS, content, fieldType));
>                                 }
>
>                                 indexWriter.addDocument(document);
>                         }
>                         indexWriter.close();
>         } catch (Exception e) {
>                 e.printStackTrace();
>         }
>         }
>
>     public static String readFully(Reader reader) throws IOException {
>           char[] arr = new char[8*1024]; // 8K at a time
>           StringBuffer buf = new StringBuffer();
>           int numChars;
>
>           while ((numChars = reader.read(arr, 0, arr.length)) > 0) {
>               buf.append(arr, 0, numChars);
>           }
>
>           return buf.toString();
>             }
>
>     @Override
>     public boolean onCreateOptionsMenu(Menu menu) {
>         getMenuInflater().inflate(R.menu.main, menu);
>         return true;
>     }
>
>
> //-----------------------------------------------------------------------------------------------------
>     //
>     // Listeners
>     //
>
> //-----------------------------------------------------------------------------------------------------
>     private OnEditorActionListener mEditorActionListener = new
> OnEditorActionListener() {
>                 @Override
>                 public boolean onEditorAction(TextView v, int actionId, KeyEvent event) {
>                         if (actionId == EditorInfo.IME_ACTION_SEARCH) {
>                     performSearch(v.getText().toString());
>                     return true;
>                 }
>                 return false;
>         }
> };
> }
>
> So how can I get hit coordinates and maybe you have any other advices what
> I'm doing wrong?
>
>
>
> --
> View this message in context: http://lucene.472066.n3.nabble.com/How-to-get-hits-coordinates-in-Lucene-4-4-0-tp4083508.html
> Sent from the Lucene - Java Developer mailing list archive at Nabble.com.
>
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: dev-unsubscribe@lucene.apache.org
> For additional commands, e-mail: dev-help@lucene.apache.org
>

---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscribe@lucene.apache.org
For additional commands, e-mail: dev-help@lucene.apache.org