You are viewing a plain text version of this content. The canonical link for it is here.

Posted to java-user@lucene.apache.org by Lingviston <vl...@outlook.com> on 2013/08/12 10:02:12 UTC

How to get hits coordinates in Lucene 4.4.0

Hi, I'm trying to use Lucene in my Android project. To start with, I've
created a small demo app. It works with .txt files, but I need to work with
.pdf files. Analyzing my code, I understand that it will have some issues
with PDFs due to memory management. However, the question I want to ask here
is not related to memory but to hit highlighting. It works now, but using the
`Highlighter` class with PDFs is not what I want. To implement my own
highlighting I need to know some kind of coordinates of the found words in
the text. How can I get them? I'm using Lucene 4.4.0, while all of the
examples I can find (like the one here) are for much older versions. Here is my code: 

    public class MainActivity extends Activity { 
   
//----------------------------------------------------------------------------------------------------- 
    // 
    // Constants 
    // 
   
//----------------------------------------------------------------------------------------------------- 
        public static final String FIELD_PATH = "path"; 
        public static final String FIELD_CONTENTS = "contents"; 
        
   
//----------------------------------------------------------------------------------------------------- 
    // 
    // Fields 
    // 
   
//----------------------------------------------------------------------------------------------------- 
        private EditText mEditText; 
        private TextView mTextView; 
        
   
//----------------------------------------------------------------------------------------------------- 
    // 
    // Methods 
    // 
   
//----------------------------------------------------------------------------------------------------- 
    /**
     * Activity entry point: installs the layout, looks up and wires the views,
     * then builds the search index.
     *
     * @param savedInstanceState passed straight through to the superclass
     */
    @Override 
    protected void onCreate(Bundle savedInstanceState) { 
        super.onCreate(savedInstanceState); 
        setContentView(R.layout.activity_main); 
        // Views must be looked up before initViews() attaches a listener to mEditText.
        findViews(); 
        initViews(); 
        // NOTE(review): createIndex() reads files synchronously; presumably this
        // runs on the UI thread here - confirm and consider moving it off-thread.
        createIndex(); 
    } 

    /** Resolves the query input and the result output widgets from the inflated layout. */
    private void findViews() { 
    mEditText = (EditText) findViewById(R.id.activity_main_edittext); 
    mTextView = (TextView) findViewById(R.id.activity_main_textview); 
    } 
    
    /** Attaches the editor-action listener to the query input so an IME action can start a search. */
    private void initViews() { 
    mEditText.setOnEditorActionListener(mEditorActionListener); 
    } 

    /**
     * Searches the index stored in the app's external files directory and shows
     * the hits in {@code mTextView}.
     *
     * <p>The query is parsed against {@link #FIELD_CONTENTS} with a
     * {@code StandardAnalyzer}; at most 1000 hits are fetched. For every hit the
     * stored path and the stored contents are rendered as HTML. A summary (hit
     * count and paths) is written to the debug log. Any failure is logged and
     * the text view is left untouched.
     *
     * @param searchString raw query text handed to the Lucene query parser
     */
    private void performSearch(String searchString) {
        final String tag = getClass().getSimpleName();
        Directory directory = null;
        DirectoryReader ireader = null;
        try {
            directory = NIOFSDirectory.open(getExternalFilesDir(null));
            ireader = DirectoryReader.open(directory);
            IndexSearcher isearcher = new IndexSearcher(ireader);

            Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44);
            QueryParser queryParser =
                    new AnalyzingQueryParser(Version.LUCENE_44, FIELD_CONTENTS, analyzer);
            Query query = queryParser.parse(searchString);
            TopDocs topDocs = isearcher.search(query, null, 1000);
            ScoreDoc[] docs = topDocs.scoreDocs;

            // The scorer and highlighter depend only on the query, so build them once
            // instead of once per hit. Integer.MAX_VALUE keeps each document in a
            // single fragment.
            QueryScorer scorer = new QueryScorer(query);
            Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter("", ""), scorer);
            highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer, Integer.MAX_VALUE));

            StringBuilder result = new StringBuilder();
            StringBuilder debugInfo = new StringBuilder();
            debugInfo.append("Number of hits: ");
            debugInfo.append(docs.length);
            debugInfo.append("\n");

            // Iterate through the results:
            for (int i = 0; i < docs.length; i++) {
                Document hitDoc = isearcher.doc(docs[i].doc);

                String path = hitDoc.get(FIELD_PATH);
                debugInfo.append("Path: ");
                debugInfo.append(path);
                debugInfo.append("\n");

                result.append("-------------------------------------------------------");
                result.append("File: ");
                result.append(path);
                result.append("-------------------------------------------------------");
                result.append("<br>");

                String content = hitDoc.get(FIELD_CONTENTS);
                String highlighted =
                        highlighter.getBestFragment(analyzer, FIELD_CONTENTS, content);
                if (highlighted == null) {
                    // getBestFragment() returns null when no fragment scored; show the
                    // stored text rather than the literal string "null".
                    highlighted = content;
                }

                result.append("-------------------------------------------------------");
                result.append("Contents: ");
                result.append("-------------------------------------------------------");
                result.append("<br>");
                result.append(highlighted);
                result.append("<br><br><br>");
            }

            // Not working (kept from the original post for reference):
            // PostingsHighlighter highlighter = new PostingsHighlighter();
            // String highlights[] =
            //         highlighter.highlight(FIELD_CONTENTS, query, isearcher, topDocs);
            mTextView.setText(Html.fromHtml(result.toString()));
            Log.d(tag, debugInfo.toString());
        } catch (Exception e) {
            // Pass the throwable itself: e.getMessage() may be null, and a null
            // message makes Log.e() throw.
            Log.e(tag, "Search failed for query: " + searchString, e);
        } finally {
            // Release the index reader and directory even when the search failed.
            if (ireader != null) {
                try {
                    ireader.close();
                } catch (IOException e) {
                    Log.e(tag, "Failed to close index reader", e);
                }
            }
            if (directory != null) {
                try {
                    directory.close();
                } catch (IOException e) {
                    Log.e(tag, "Failed to close index directory", e);
                }
            }
        }
    }
    
    private void createIndex() { 
    try { 
    //Create directory for index. 
    Directory indexDirectory = new
NIOFSDirectory(getExternalFilesDir(null)); 
    
                        Analyzer analyzer = new
StandardAnalyzer(Version.LUCENE_44); 
                        
                        IndexWriterConfig config = new
IndexWriterConfig(Version.LUCENE_44, analyzer); 
                        config.setOpenMode(OpenMode.CREATE); 
                        
                        IndexWriter indexWriter = new
IndexWriter(indexDirectory, config); 
                        
                        //Loop through files in specified directory and
adding them to index. 
                        File dir = new
File(Environment.getExternalStorageDirectory() + "/lucene"); 
                        File[] files = dir.listFiles(); 
                        for (File file : files) { 
                                Document document = new Document(); 
        
                                { 
                                        FieldType fieldType = new
FieldType(TextField.TYPE_STORED); 
                                       
fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); 
                                        
                                        String path =
file.getCanonicalPath(); 
                                        document.add(new Field(FIELD_PATH,
path, fieldType)); 
                                } 
        
                                { 
                                        FieldType fieldType = new
FieldType(TextField.TYPE_STORED); 
                                        fieldType.setIndexed(true); 
                                       
fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); 
                                        fieldType.setStored(true); 
                                        fieldType.setStoreTermVectors(true); 
                                        fieldType.setTokenized(true); 
                                       
fieldType.setStoreTermVectorOffsets(true); 
                                        String content = readFully(new
FileReader(file)); //we can't store Reader objects but we need to be able to
access the content for highlighting 
                                        document.add(new
Field(FIELD_CONTENTS, content, fieldType)); 
                                } 
        
                                indexWriter.addDocument(document); 
                        } 
                        indexWriter.close(); 
    } catch (Exception e) { 
    e.printStackTrace(); 
    } 
        } 
    
    /**
     * Reads all remaining characters from {@code reader} into a single string.
     *
     * <p>The reader is consumed but not closed; closing it is the caller's
     * responsibility.
     *
     * @param reader source of characters
     * @return everything read from {@code reader}, or an empty string if it had no data
     * @throws IOException if reading from {@code reader} fails
     */
    public static String readFully(Reader reader) throws IOException {
        final char[] chunk = new char[8 * 1024]; // 8K at a time
        // StringBuilder: the buffer never leaves this method, so StringBuffer's
        // synchronization is pure overhead.
        final StringBuilder text = new StringBuilder();

        int numChars;
        while ((numChars = reader.read(chunk, 0, chunk.length)) > 0) {
            text.append(chunk, 0, numChars);
        }

        return text.toString();
    }
    
    /**
     * Inflates {@code R.menu.main} into the given options menu.
     *
     * @param menu menu to populate
     * @return always {@code true}
     */
    @Override 
    public boolean onCreateOptionsMenu(Menu menu) { 
        getMenuInflater().inflate(R.menu.main, menu); 
        return true; 
    } 
    
   
//----------------------------------------------------------------------------------------------------- 
    // 
    // Listeners 
    // 
   
//----------------------------------------------------------------------------------------------------- 
    /**
     * Starts a search with the view's current text when the IME "search" action
     * fires; every other editor action is reported as not handled.
     */
    private OnEditorActionListener mEditorActionListener = new OnEditorActionListener() {
        @Override
        public boolean onEditorAction(TextView v, int actionId, KeyEvent event) {
            // Guard clause: anything but the search action is left to the framework.
            if (actionId != EditorInfo.IME_ACTION_SEARCH) {
                return false;
            }
            final String queryText = v.getText().toString();
            performSearch(queryText);
            return true;
        }
    };
} 

So how can I get the hit coordinates, and do you have any other advice about
what I'm doing wrong? I think this is a rather common task, so it should be
rather simple.



--
View this message in context: http://lucene.472066.n3.nabble.com/How-to-get-hits-coordinates-in-Lucene-4-4-0-tp4083913.html
Sent from the Lucene - Java Users mailing list archive at Nabble.com.

---------------------------------------------------------------------
To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-user-help@lucene.apache.org

Re: How to get hits coordinates in Lucene 4.4.0

Posted by Karl Wettin <ka...@kodapan.se>.

On Aug 13, 2013, at 12:55 PM, Michael McCandless wrote:

> I'm less familiar with the older highlighters but likely it's possible
> to get the absolute offsets from them as well.


Using the vector highlighter, I've achieved that by extending ScoreOrderFragmentsBuilder and cloning the code of its makeFragment method, something like this:

          // Collects the {start, end} character offsets of every matched term that
          // makeFragment() sees. Typed as List<int[]> so the loop at the bottom compiles.
          final List<int[]> offsets = new ArrayList<>();

          ScoreOrderFragmentsBuilder fragmentsBuilder = new ScoreOrderFragmentsBuilder(new String[]{highlightBlockStart}, new String[]{highlightBlockEnd}) {
            @Override
            protected String makeFragment(StringBuilder buffer, int[] index, Field[] values, FieldFragList.WeightedFragInfo fragInfo, String[] preTags, String[] postTags, Encoder encoder) {
              // todo You might not want to keep the StringBuilder if only accessing offsets...
              StringBuilder fragment = new StringBuilder();
              final int s = fragInfo.getStartOffset();
              // getFragmentSourceMSO() may move the fragment start; the adjusted value
              // comes back in modifiedStartOffset[0] and is used to index into src.
              int[] modifiedStartOffset = {s};
              String src = getFragmentSourceMSO(buffer, index, values, s, fragInfo.getEndOffset(), modifiedStartOffset);
              int srcIndex = 0;
              for (FieldFragList.WeightedFragInfo.SubInfo subInfo : fragInfo.getSubInfos()) {
                for (FieldPhraseList.WeightedPhraseInfo.Toffs to : subInfo.getTermsOffsets()) {

                  // Record the offsets exactly as reported by the term, i.e. before the
                  // fragment-relative adjustment applied below.
                  offsets.add(new int[]{to.getStartOffset(), to.getEndOffset()});

                  fragment
                      .append(encoder.encodeText(src.substring(srcIndex, to.getStartOffset() - modifiedStartOffset[0])))
                      .append(getPreTag(preTags, subInfo.getSeqnum()))
                      .append(encoder.encodeText(src.substring(to.getStartOffset() - modifiedStartOffset[0], to.getEndOffset() - modifiedStartOffset[0])))
                      .append(getPostTag(postTags, subInfo.getSeqnum()));
                  srcIndex = to.getEndOffset() - modifiedStartOffset[0];
                }
              }
              // Append whatever follows the last match.
              fragment.append(encoder.encodeText(src.substring(srcIndex)));
              return fragment.toString();
            }

          };

          FastVectorHighlighter fastVectorHighlighter = new FastVectorHighlighter(true, true, fragListBuilder, fragmentsBuilder);
          // Highlighting runs makeFragment(), which fills `offsets` as a side effect.
          String fragment = fastVectorHighlighter.getBestFragment(fieldQuery, finalReader, 0, fieldName, fragmentLength);

          for (int[] offset : offsets) {
             ...
          }




			kalle

Re: How to get hits coordinates in Lucene 4.4.0

Posted by Jon Stewart <jo...@lightboxtechnologies.com>.

Done. https://issues.apache.org/jira/browse/LUCENE-5181


Jon



On Mon, Aug 19, 2013 at 1:26 PM, Michael McCandless <
lucene@mikemccandless.com> wrote:

> Hi Jon,
>
> Can you open an issue for this?  We can explore how/whether to get the
> current docID to the formatter...
>
> Mike McCandless
>
> http://blog.mikemccandless.com
>
>
> On Mon, Aug 19, 2013 at 1:07 PM, Jon Stewart
> <jo...@lightboxtechnologies.com> wrote:
> > Iterating over term matches is a recent need for me, too (experimenting
> > with ranking matches/passages independently, across documents). I'm using
> > the new PostingsHighlighter and giving it my own PassageFormatter. This
> > does no formatting, but does store away the offsets from each Passage.
> >
> > One big problem with Passage is that it does not give you the Document or
> > the docID. So, AFAICT, if the Document is needed, then you must manually
> > iterate TopDocs and call PostingsHighlighter.highlightFields() with a
> > single docID.
> >
> >
> > Jon
> >
> >
> >
> > On Tue, Aug 13, 2013 at 6:55 AM, Michael McCandless <
> > lucene@mikemccandless.com> wrote:
> >
> >> If you use PostingsHighlighter, then Passage.getMatchStarts/Ends gives
> >> you the offsets of each match.  You'd need a custom PassageFormatter
> >> that takes these ints and saves them somewhere; or possibly the patch
> >> on LUCENE-4906 (allowing you to return custom objects, not just
> >> String) from your highlighter.
> >>
> >> I'm less familiar with the older highlighters but likely it's possible
> >> to get the absolute offsets from them as well.
> >>
> >> Mike McCandless
> >>
> >> http://blog.mikemccandless.com
> >>
> >>
> >> On Mon, Aug 12, 2013 at 1:20 PM, Lingviston
> >> <vl...@outlook.com> wrote:
> >> > I think that's OK for me. I just need to know the right way to get
> them.
> >> > Notice that queries must support boolean operators, *, ? and qoutes.
> >> >
> >> >
> >> >
> >> > --
> >> > View this message in context:
> >>
> http://lucene.472066.n3.nabble.com/How-to-get-hits-coordinates-in-Lucene-4-4-0-tp4083913p4084046.html
> >> > Sent from the Lucene - Java Users mailing list archive at Nabble.com.
> >> >
> >> > ---------------------------------------------------------------------
> >> > To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
> >> > For additional commands, e-mail: java-user-help@lucene.apache.org
> >> >
> >>
> >> ---------------------------------------------------------------------
> >> To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
> >> For additional commands, e-mail: java-user-help@lucene.apache.org
> >>
> >>
> >
> >
> > --
> > Jon Stewart, Principal
> > (646) 719-0317 | jon@lightboxtechnologies.com | Arlington, VA
>
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
> For additional commands, e-mail: java-user-help@lucene.apache.org
>
>


-- 
Jon Stewart, Principal
(646) 719-0317 | jon@lightboxtechnologies.com | Arlington, VA

Re: How to get hits coordinates in Lucene 4.4.0

Posted by Michael McCandless <lu...@mikemccandless.com>.

Hi Jon,

Can you open an issue for this?  We can explore how/whether to get the
current docID to the formatter...

Mike McCandless

http://blog.mikemccandless.com


On Mon, Aug 19, 2013 at 1:07 PM, Jon Stewart
<jo...@lightboxtechnologies.com> wrote:
> Iterating over term matches is a recent need for me, too (experimenting
> with ranking matches/passages independently, across documents). I'm using
> the new PostingsHighlighter and giving it my own PassageFormatter. This
> does no formatting, but does store away the offsets from each Passage.
>
> One big problem with Passage is that it does not give you the Document or
> the docID. So, AFAICT, if the Document is needed, then you must manually
> iterate TopDocs and call PostingsHighlighter.highlightFields() with a
> single docID.
>
>
> Jon
>
>
>
> On Tue, Aug 13, 2013 at 6:55 AM, Michael McCandless <
> lucene@mikemccandless.com> wrote:
>
>> If you use PostingsHighlighter, then Passage.getMatchStarts/Ends gives
>> you the offsets of each match.  You'd need a custom PassageFormatter
>> that takes these ints and saves them somewhere; or possibly the patch
>> on LUCENE-4906 (allowing you to return custom objects, not just
>> String) from your highlighter.
>>
>> I'm less familiar with the older highlighters but likely it's possible
>> to get the absolute offsets from them as well.
>>
>> Mike McCandless
>>
>> http://blog.mikemccandless.com
>>
>>
>> On Mon, Aug 12, 2013 at 1:20 PM, Lingviston
>> <vl...@outlook.com> wrote:
>> > I think that's OK for me. I just need to know the right way to get them.
>> > Notice that queries must support boolean operators, *, ? and qoutes.
>> >
>> >
>> >
>> > --
>> > View this message in context:
>> http://lucene.472066.n3.nabble.com/How-to-get-hits-coordinates-in-Lucene-4-4-0-tp4083913p4084046.html
>> > Sent from the Lucene - Java Users mailing list archive at Nabble.com.
>> >
>> > ---------------------------------------------------------------------
>> > To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
>> > For additional commands, e-mail: java-user-help@lucene.apache.org
>> >
>>
>> ---------------------------------------------------------------------
>> To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
>> For additional commands, e-mail: java-user-help@lucene.apache.org
>>
>>
>
>
> --
> Jon Stewart, Principal
> (646) 719-0317 | jon@lightboxtechnologies.com | Arlington, VA

---------------------------------------------------------------------
To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-user-help@lucene.apache.org

Re: How to get hits coordinates in Lucene 4.4.0

Posted by Jon Stewart <jo...@lightboxtechnologies.com>.

Iterating over term matches is a recent need for me, too (experimenting
with ranking matches/passages independently, across documents). I'm using
the new PostingsHighlighter and giving it my own PassageFormatter. This
does no formatting, but does store away the offsets from each Passage.

One big problem with Passage is that it does not give you the Document or
the docID. So, AFAICT, if the Document is needed, then you must manually
iterate TopDocs and call PostingsHighlighter.highlightFields() with a
single docID.


Jon



On Tue, Aug 13, 2013 at 6:55 AM, Michael McCandless <
lucene@mikemccandless.com> wrote:

> If you use PostingsHighlighter, then Passage.getMatchStarts/Ends gives
> you the offsets of each match.  You'd need a custom PassageFormatter
> that takes these ints and saves them somewhere; or possibly the patch
> on LUCENE-4906 (allowing you to return custom objects, not just
> String) from your highlighter.
>
> I'm less familiar with the older highlighters but likely it's possible
> to get the absolute offsets from them as well.
>
> Mike McCandless
>
> http://blog.mikemccandless.com
>
>
> On Mon, Aug 12, 2013 at 1:20 PM, Lingviston
> <vl...@outlook.com> wrote:
> > I think that's OK for me. I just need to know the right way to get them.
> > Notice that queries must support boolean operators, *, ? and qoutes.
> >
> >
> >
> > --
> > View this message in context:
> http://lucene.472066.n3.nabble.com/How-to-get-hits-coordinates-in-Lucene-4-4-0-tp4083913p4084046.html
> > Sent from the Lucene - Java Users mailing list archive at Nabble.com.
> >
> > ---------------------------------------------------------------------
> > To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
> > For additional commands, e-mail: java-user-help@lucene.apache.org
> >
>
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
> For additional commands, e-mail: java-user-help@lucene.apache.org
>
>


-- 
Jon Stewart, Principal
(646) 719-0317 | jon@lightboxtechnologies.com | Arlington, VA

Re: How to get hits coordinates in Lucene 4.4.0

Posted by Lingviston <vl...@outlook.com>.

I'm currently using this snippet (with older Highlighter):

// The collector is passed to Highlighter in place of a real formatter, so it
// records match offsets instead of producing markup.
HitPositionCollector collector = new HitPositionCollector();
				highlighter = new Highlighter(collector, scorer);
				// Integer.MAX_VALUE as the fragment size keeps the whole text in one fragment.
				highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer,
Integer.MAX_VALUE));
				
				TokenStream stream =
TokenSources.getAnyTokenStream(isearcher.getIndexReader(),
                        docs[i].doc,
                        FIELD_CONTENTS,
                        hitDoc,
                        analyzer);
				// The returned fragment is not used below; the call is made so that the
				// collector's highlightTerm() gets invoked.
				String fragment = highlighter.getBestFragment(stream, content);
				ArrayList<MatchOffset> list = collector.getMatchList();


	public static class HitPositionCollector implements Formatter{

		// MatchOffset is a simple DTO
		private ArrayList<MatchOffset> matchList;
		public HitPositionCollector(){

		    matchList= new ArrayList<MatchOffset>();

		}
		// this ie where the term start and end offset as well as the actual term
is captured
		@Override
		public String highlightTerm(String originalText, TokenGroup tokenGroup) {
		  if (tokenGroup.getTotalScore() <= 0) {

		  }
		      else{
		        MatchOffset mo= new MatchOffset(tokenGroup.getToken(0).toString(),
tokenGroup.getStartOffset(),tokenGroup.getEndOffset());
		        getMatchList().add(mo);
		      }
		      return originalText;
		}

		/**
		 * @return the matchList
		 */
		public ArrayList<MatchOffset> getMatchList() {
		    return matchList;
		}
		
	   
//-----------------------------------------------------------------------------------------------------
	    //
	    // Inner classes
	    //
	   
//-----------------------------------------------------------------------------------------------------
		public static class MatchOffset {
			public String smth;
			public int start;
			public int end;
			
			public MatchOffset(String smth, int start, int end) {
				this.smth = smth;
				this.start = start;
				this.end = end;
			}
		}
	}

Is the solution with PostingsHighlighter similar to this? I mean, here I have
a custom Formatter; do I need a custom one for PostingsHighlighter too?



--
View this message in context: http://lucene.472066.n3.nabble.com/How-to-get-hits-coordinates-in-Lucene-4-4-0-tp4083913p4084233.html
Sent from the Lucene - Java Users mailing list archive at Nabble.com.

---------------------------------------------------------------------
To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-user-help@lucene.apache.org

Re: How to get hits coordinates in Lucene 4.4.0

Posted by Michael McCandless <lu...@mikemccandless.com>.

If you use PostingsHighlighter, then Passage.getMatchStarts/Ends gives
you the offsets of each match.  You'd need a custom PassageFormatter
that takes these ints and saves them somewhere; or possibly the patch
on LUCENE-4906 (allowing you to return custom objects, not just
String) from your highlighter.

I'm less familiar with the older highlighters but likely it's possible
to get the absolute offsets from them as well.

Mike McCandless

http://blog.mikemccandless.com

On Mon, Aug 12, 2013 at 1:20 PM, Lingviston
<vl...@outlook.com> wrote:
> I think that's OK for me. I just need to know the right way to get them.
> Notice that queries must support boolean operators, *, ? and qoutes.
>
>
>
> --
> View this message in context: http://lucene.472066.n3.nabble.com/How-to-get-hits-coordinates-in-Lucene-4-4-0-tp4083913p4084046.html
> Sent from the Lucene - Java Users mailing list archive at Nabble.com.
>
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
> For additional commands, e-mail: java-user-help@lucene.apache.org
>

---------------------------------------------------------------------
To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-user-help@lucene.apache.org

Re: How to get hits coordinates in Lucene 4.4.0

Posted by Lingviston <vl...@outlook.com>.

I think that's OK for me. I just need to know the right way to get them.
Note that queries must support boolean operators, *, ? and quotes.



--
View this message in context: http://lucene.472066.n3.nabble.com/How-to-get-hits-coordinates-in-Lucene-4-4-0-tp4083913p4084046.html
Sent from the Lucene - Java Users mailing list archive at Nabble.com.

---------------------------------------------------------------------
To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-user-help@lucene.apache.org

Re: How to get hits coordinates in Lucene 4.4.0

Posted by Michael McCandless <lu...@mikemccandless.com>.

OK.

But note that the offsets refer to the plain text you get after filtering the
PDF document, not, e.g., to offsets in the original PDF content.


Mike McCandless

http://blog.mikemccandless.com


On Mon, Aug 12, 2013 at 9:58 AM, Lingviston
<vl...@outlook.com> wrote:
> Like I said I will work with pdf files. So I will draw highlights by myself
> over the rendered pdf file (as far as I know lucene can't work with pdf by
> default).
>
> Yes, offsets is what I'm looking for.
>
>
>
> --
> View this message in context: http://lucene.472066.n3.nabble.com/How-to-get-hits-coordinates-in-Lucene-4-4-0-tp4083913p4083989.html
> Sent from the Lucene - Java Users mailing list archive at Nabble.com.
>
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
> For additional commands, e-mail: java-user-help@lucene.apache.org
>

---------------------------------------------------------------------
To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-user-help@lucene.apache.org

Re: How to get hits coordinates in Lucene 4.4.0

Posted by Lingviston <vl...@outlook.com>.

As I said, I will be working with PDF files, so I will draw the highlights
myself over the rendered PDF file (as far as I know, Lucene can't work with
PDF by default).

Yes, offsets are what I'm looking for.



--
View this message in context: http://lucene.472066.n3.nabble.com/How-to-get-hits-coordinates-in-Lucene-4-4-0-tp4083913p4083989.html
Sent from the Lucene - Java Users mailing list archive at Nabble.com.

---------------------------------------------------------------------
To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-user-help@lucene.apache.org

Re: How to get hits coordinates in Lucene 4.4.0

Posted by Michael McCandless <lu...@mikemccandless.com>.

I think you're asking for what Lucene calls "offsets", i.e. the
character indices into the original indexed text, telling you where
each hit occurred.

All highlighters use offsets to find the matches in the original indexed text.

One option, which both Highlighter and FastVectorHighlighter use, is
to store the offsets in term vectors.  But this is generally slow /
takes a lot of index space.  A better, newer, option is to use
postings offsets, which is what PostingsHighlighter does.

Your code is storing both term vectors w/ offsets, and postings
offsets, which is rather wasteful (you should only need one).

If you only need the highlighted snippets you should be able to just
use the highlighter APIs as-is?   Why do you also need to know the
offsets?

Mike McCandless

http://blog.mikemccandless.com


On Mon, Aug 12, 2013 at 4:02 AM, Lingviston
<vl...@outlook.com> wrote:
> Hi, I'm trying to use Lucene in my Android project. To start with I've
> created a small demo app. It works with .txt files but I need to work with
> .pdf. So analyzing my code I understand that it will have some issues with
> .pdfs due to memory management. However the question I want to ask here is
> not related to memory but to hit highlighting. It works now but using of
> `Highlighter` class with pdfs is not what I want. So to implement my own
> highlighting I need to know some kind of coordinates of found words in the
> text. How can I get them? I'm using lucene 4.4.0 while all of the examples
> like here are for much older versions. Here is my code:
>
>     public class MainActivity extends Activity {
>
> //-----------------------------------------------------------------------------------------------------
>     //
>     // Constants
>     //
>
> //-----------------------------------------------------------------------------------------------------
>         public static final String FIELD_PATH = "path";
>         public static final String FIELD_CONTENTS = "contents";
>
>
> //-----------------------------------------------------------------------------------------------------
>     //
>     // Fields
>     //
>
> //-----------------------------------------------------------------------------------------------------
>         private EditText mEditText;
>         private TextView mTextView;
>
>
> //-----------------------------------------------------------------------------------------------------
>     //
>     // Methods
>     //
>
> //-----------------------------------------------------------------------------------------------------
>     @Override
>     protected void onCreate(Bundle savedInstanceState) {
>         super.onCreate(savedInstanceState);
>         setContentView(R.layout.activity_main);
>         findViews();
>         initViews();
>         createIndex();
>     }
>
>     private void findViews() {
>     mEditText = (EditText) findViewById(R.id.activity_main_edittext);
>     mTextView = (TextView) findViewById(R.id.activity_main_textview);
>     }
>
>     private void initViews() {
>     mEditText.setOnEditorActionListener(mEditorActionListener);
>     }
>
>     private void performSearch(String searchString) {
>     try {
>                         Directory directory =
> NIOFSDirectory.open(getExternalFilesDir(null));
>                         DirectoryReader ireader =
> DirectoryReader.open(directory);
>                     IndexSearcher isearcher = new IndexSearcher(ireader);
>
>                         Analyzer analyzer = new
> StandardAnalyzer(Version.LUCENE_44);
>                         QueryParser queryParser = new
> AnalyzingQueryParser(Version.LUCENE_44, FIELD_CONTENTS, analyzer);
>                         Query query = queryParser.parse(searchString);
>                         TopDocs topDocs = isearcher.search(query, null,
> 1000);
>                         ScoreDoc[] docs = topDocs.scoreDocs;
>
>                         StringBuilder result = new StringBuilder();
>                         StringBuilder debugInfo = new StringBuilder();
>                         debugInfo.append("Number of hits: ");
>                         debugInfo.append(docs.length);
>                         debugInfo.append("\n");
>
>                         // Iterate through the results:
>                         for (int i = 0; i < docs.length; i++) {
>                                 Document hitDoc =
> isearcher.doc(docs[i].doc);
>
>                                 String path = hitDoc.get(FIELD_PATH);
>                                 debugInfo.append("Path: ");
>                                 debugInfo.append(path);
>                                 debugInfo.append("\n");
>
>
> result.append("-------------------------------------------------------");
>                                 result.append("File: ");
>                                 result.append(path);
>
> result.append("-------------------------------------------------------");
>                                 result.append("<br>");
>
>                                 String content = hitDoc.get(FIELD_CONTENTS);
>                                 QueryScorer scorer = new QueryScorer(query);
>                                 Highlighter highlighter = new
> Highlighter(new SimpleHTMLFormatter("", ""), scorer);
>                                 highlighter.setTextFragmenter(new
> SimpleSpanFragmenter(scorer, Integer.MAX_VALUE));
>                                 String highlighted =
> highlighter.getBestFragment(analyzer, FIELD_CONTENTS, content);
>
> result.append("-------------------------------------------------------");
>                                 result.append("Contents: ");
>
> result.append("-------------------------------------------------------");
>                                 result.append("<br>");
>                                 result.append(highlighted);
>                                 result.append("<br><br><br>");
>                         }
>
>                         //not working
>                         /*PostingsHighlighter highlighter = new
> PostingsHighlighter();
>                         String highlights[] =
> highlighter.highlight(FIELD_CONTENTS, query, isearcher, topDocs);*/
>                         mTextView.setText(Html.fromHtml(result.toString()));
>                         Log.d(getClass().getSimpleName(),
> debugInfo.toString());
>     } catch (Exception e) {
>     e.printStackTrace();
>     Log.e(getClass().getSimpleName(), e.getMessage());
>     }
>
>     }
>
>     private void createIndex() {
>     try {
>     //Create directory for index.
>     Directory indexDirectory = new
> NIOFSDirectory(getExternalFilesDir(null));
>
>                         Analyzer analyzer = new
> StandardAnalyzer(Version.LUCENE_44);
>
>                         IndexWriterConfig config = new
> IndexWriterConfig(Version.LUCENE_44, analyzer);
>                         config.setOpenMode(OpenMode.CREATE);
>
>                         IndexWriter indexWriter = new
> IndexWriter(indexDirectory, config);
>
>                         //Loop through files in specified directory and
> adding them to index.
>                         File dir = new
> File(Environment.getExternalStorageDirectory() + "/lucene");
>                         File[] files = dir.listFiles();
>                         for (File file : files) {
>                                 Document document = new Document();
>
>                                 {
>                                         FieldType fieldType = new
> FieldType(TextField.TYPE_STORED);
>
> fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
>
>                                         String path =
> file.getCanonicalPath();
>                                         document.add(new Field(FIELD_PATH,
> path, fieldType));
>                                 }
>
>                                 {
>                                         FieldType fieldType = new
> FieldType(TextField.TYPE_STORED);
>                                         fieldType.setIndexed(true);
>
> fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
>                                         fieldType.setStored(true);
>                                         fieldType.setStoreTermVectors(true);
>                                         fieldType.setTokenized(true);
>
> fieldType.setStoreTermVectorOffsets(true);
>                                         String content = readFully(new
> FileReader(file)); //we can't store Reader objects but we need to be able to
> access the content for highlighting
>                                         document.add(new
> Field(FIELD_CONTENTS, content, fieldType));
>                                 }
>
>                                 indexWriter.addDocument(document);
>                         }
>                         indexWriter.close();
>     } catch (Exception e) {
>     e.printStackTrace();
>     }
>         }
>
>     public static String readFully(Reader reader) throws IOException {
>      char[] arr = new char[8*1024]; // 8K at a time
>      StringBuffer buf = new StringBuffer();
>      int numChars;
>
>      while ((numChars = reader.read(arr, 0, arr.length)) > 0) {
>          buf.append(arr, 0, numChars);
>      }
>
>      return buf.toString();
>        }
>
>     @Override
>     public boolean onCreateOptionsMenu(Menu menu) {
>         getMenuInflater().inflate(R.menu.main, menu);
>         return true;
>     }
>
>
> //-----------------------------------------------------------------------------------------------------
>     //
>     // Listeners
>     //
>
> //-----------------------------------------------------------------------------------------------------
>     private OnEditorActionListener mEditorActionListener = new
> OnEditorActionListener() {
>                 @Override
>                 public boolean onEditorAction(TextView v, int actionId,
> KeyEvent event) {
>                         if (actionId == EditorInfo.IME_ACTION_SEARCH) {
>                     performSearch(v.getText().toString());
>                     return true;
>                 }
>                 return false;
>         }
> };
> }
>
> So how can I get hit coordinates and maybe you have any other advices what
> I'm doing wrong? This is rather common task I think so it must be rather
> simple.
>
>
>
> --
> View this message in context: http://lucene.472066.n3.nabble.com/How-to-get-hits-coordinates-in-Lucene-4-4-0-tp4083913.html
> Sent from the Lucene - Java Users mailing list archive at Nabble.com.
>
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
> For additional commands, e-mail: java-user-help@lucene.apache.org
>

---------------------------------------------------------------------
To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-user-help@lucene.apache.org

Re: How to get hits coordinates in Lucene 4.4.0

Posted by Darren Hoffman <da...@jnamics.com>.

Lingviston,

Can you tell me what IDE and process you are using to build your APK file?

I am having issues with loading the Lucene42Codec and I see the code you
are using is just like mine. However, when I try to run the app, I get an
exception stating that it can't find the codec.

I am using IntelliJ to build the APK file using the discrete Lucene
library jars.

Thanks,
Darren


On 8/12/13 1:02 AM, "Lingviston" <vl...@outlook.com> wrote:

>Hi, I'm trying to use Lucene in my Android project. To start with I've
>created a small demo app. It works with .txt files but I need to work with
>.pdf. So analyzing my code I understand that it will have some issues with
>.pdfs due to memory management. However the question I want to ask here is
>not related to memory but to hit highlighting. It works now but using of
>`Highlighter` class with pdfs is not what I want. So to implement my own
>highlighting I need to know some kind of coordinates of found words in the
>text. How can I get them? I'm using lucene 4.4.0 while all of the examples
>like here are for much older versions. Here is my code:
>
>    public class MainActivity extends Activity {
>   
>//------------------------------------------------------------------------
>-----------------------------
>    // 
>    // Constants 
>    // 
>   
>//------------------------------------------------------------------------
>-----------------------------
>        public static final String FIELD_PATH = "path";
>        public static final String FIELD_CONTENTS = "contents";
>        
>   
>//------------------------------------------------------------------------
>-----------------------------
>    // 
>    // Fields 
>    // 
>   
>//------------------------------------------------------------------------
>-----------------------------
>        private EditText mEditText;
>        private TextView mTextView;
>        
>   
>//------------------------------------------------------------------------
>-----------------------------
>    // 
>    // Methods 
>    // 
>   
>//------------------------------------------------------------------------
>-----------------------------
>    @Override 
>    protected void onCreate(Bundle savedInstanceState) {
>        super.onCreate(savedInstanceState);
>        setContentView(R.layout.activity_main);
>        findViews();
>        initViews();
>        createIndex();
>    } 
>
>    private void findViews() {
>    mEditText = (EditText) findViewById(R.id.activity_main_edittext);
>    mTextView = (TextView) findViewById(R.id.activity_main_textview);
>    } 
>    
>    private void initViews() {
>    mEditText.setOnEditorActionListener(mEditorActionListener);
>    } 
>
>    private void performSearch(String searchString) {
>    try { 
>                        Directory directory =
>NIOFSDirectory.open(getExternalFilesDir(null));
>                        DirectoryReader ireader =
>DirectoryReader.open(directory);
>                    IndexSearcher isearcher = new IndexSearcher(ireader);
>        
>                        Analyzer analyzer = new
>StandardAnalyzer(Version.LUCENE_44);
>                        QueryParser queryParser = new
>AnalyzingQueryParser(Version.LUCENE_44, FIELD_CONTENTS, analyzer);
>                        Query query = queryParser.parse(searchString);
>                        TopDocs topDocs = isearcher.search(query, null,
>1000); 
>                        ScoreDoc[] docs = topDocs.scoreDocs;
>                  
>                        StringBuilder result = new StringBuilder();
>                        StringBuilder debugInfo = new StringBuilder();
>                        debugInfo.append("Number of hits: ");
>                        debugInfo.append(docs.length);
>                        debugInfo.append("\n");
>                  
>                        // Iterate through the results:
>                        for (int i = 0; i < docs.length; i++) {
>                                Document hitDoc =
>isearcher.doc(docs[i].doc);
>                  
>                                String path = hitDoc.get(FIELD_PATH);
>                                debugInfo.append("Path: ");
>                                debugInfo.append(path);
>                                debugInfo.append("\n");
>                  
>                  
>result.append("-------------------------------------------------------");
>                                result.append("File: ");
>                                result.append(path);
>                  
>result.append("-------------------------------------------------------");
>                                result.append("<br>");
>                  
>                                String content =
>hitDoc.get(FIELD_CONTENTS);
>                                QueryScorer scorer = new
>QueryScorer(query);
>                                Highlighter highlighter = new
>Highlighter(new SimpleHTMLFormatter("", ""), scorer);
>                                highlighter.setTextFragmenter(new
>SimpleSpanFragmenter(scorer, Integer.MAX_VALUE));
>                                String highlighted =
>highlighter.getBestFragment(analyzer, FIELD_CONTENTS, content);
>                  
>result.append("-------------------------------------------------------");
>                                result.append("Contents: ");
>                  
>result.append("-------------------------------------------------------");
>                                result.append("<br>");
>                                result.append(highlighted);
>                                result.append("<br><br><br>");
>                        }
>                  
>                        //not working
>                        /*PostingsHighlighter highlighter = new
>PostingsHighlighter();
>                        String highlights[] =
>highlighter.highlight(FIELD_CONTENTS, query, isearcher, topDocs);*/
>                  
>mTextView.setText(Html.fromHtml(result.toString()));
>                        Log.d(getClass().getSimpleName(),
>debugInfo.toString());
>    } catch (Exception e) {
>    e.printStackTrace();
>    Log.e(getClass().getSimpleName(), e.getMessage());
>    } 
>
>    } 
>    
>    private void createIndex() {
>    try { 
>    //Create directory for index.
>    Directory indexDirectory = new
>NIOFSDirectory(getExternalFilesDir(null));
>    
>                        Analyzer analyzer = new
>StandardAnalyzer(Version.LUCENE_44);
>                  
>                        IndexWriterConfig config = new
>IndexWriterConfig(Version.LUCENE_44, analyzer);
>                        config.setOpenMode(OpenMode.CREATE);
>                  
>                        IndexWriter indexWriter = new
>IndexWriter(indexDirectory, config);
>                  
>                        //Loop through files in specified directory and
>adding them to index.
>                        File dir = new
>File(Environment.getExternalStorageDirectory() + "/lucene");
>                        File[] files = dir.listFiles();
>                        for (File file : files) {
>                                Document document = new Document();
>        
>                                {
>                                        FieldType fieldType = new
>FieldType(TextField.TYPE_STORED);
>                  
>fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OF
>FSETS); 
>                  
>                                        String path =
>file.getCanonicalPath();
>                                        document.add(new Field(FIELD_PATH,
>path, fieldType));
>                                }
>        
>                                {
>                                        FieldType fieldType = new
>FieldType(TextField.TYPE_STORED);
>                                        fieldType.setIndexed(true);
>                  
>fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OF
>FSETS); 
>                                        fieldType.setStored(true);
>                  
>fieldType.setStoreTermVectors(true);
>                                        fieldType.setTokenized(true);
>                  
>fieldType.setStoreTermVectorOffsets(true);
>                                        String content = readFully(new
>FileReader(file)); //we can't store Reader objects but we need to be able
>to
>access the content for highlighting
>                                        document.add(new
>Field(FIELD_CONTENTS, content, fieldType));
>                                }
>        
>                                indexWriter.addDocument(document);
>                        }
>                        indexWriter.close();
>    } catch (Exception e) {
>    e.printStackTrace();
>    } 
>        } 
>    
>    public static String readFully(Reader reader) throws IOException {
>     char[] arr = new char[8*1024]; // 8K at a time
>     StringBuffer buf = new StringBuffer();
>     int numChars;
>
>     while ((numChars = reader.read(arr, 0, arr.length)) > 0) {
>         buf.append(arr, 0, numChars);
>     } 
>
>     return buf.toString();
>       } 
>    
>    @Override 
>    public boolean onCreateOptionsMenu(Menu menu) {
>        getMenuInflater().inflate(R.menu.main, menu);
>        return true;
>    } 
>    
>   
>//------------------------------------------------------------------------
>-----------------------------
>    // 
>    // Listeners 
>    // 
>   
>//------------------------------------------------------------------------
>-----------------------------
>    private OnEditorActionListener mEditorActionListener = new
>OnEditorActionListener() {
>                @Override
>                public boolean onEditorAction(TextView v, int actionId,
>KeyEvent event) { 
>                        if (actionId == EditorInfo.IME_ACTION_SEARCH) {
>                    performSearch(v.getText().toString());
>                    return true;
>                } 
>                return false;
>        } 
>}; 
>} 
>
>So how can I get hit coordinates and maybe you have any other advices what
>I'm doing wrong? This is rather common task I think so it must be rather
>simple.
>
>
>
>--
>View this message in context:
>http://lucene.472066.n3.nabble.com/How-to-get-hits-coordinates-in-Lucene-4
>-4-0-tp4083913.html
>Sent from the Lucene - Java Users mailing list archive at Nabble.com.
>
>---------------------------------------------------------------------
>To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
>For additional commands, e-mail: java-user-help@lucene.apache.org
>



---------------------------------------------------------------------
To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-user-help@lucene.apache.org