You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@lucene.apache.org by Lingviston <vl...@outlook.com> on 2013/08/09 14:37:42 UTC
How to get hits coordinates in Lucene 4.4.0
Hi, I'm trying to use Lucene in my Android project. To start with, I've
created a small demo app. It works with .txt files, but I need to work with
.pdf files. Analyzing my code, I understand that it will have some issues with
PDFs due to memory management. However, the question I want to ask here is
not related to memory but to hit highlighting. It works now, but using the
`Highlighter` class with PDFs is not what I want. To implement my own
highlighting, I need to know some kind of coordinates of the found words in
the text. How can I get them? I'm using Lucene 4.4.0, while all of the
examples, like the one here
<http://stackoverflow.com/questions/1311199/finding-the-position-of-search-hits-from-lucene>,
are for much older versions. Here is my code:
public class MainActivity extends Activity {
//-----------------------------------------------------------------------------------------------------
//
// Constants
//
//-----------------------------------------------------------------------------------------------------
public static final String FIELD_PATH = "path";
public static final String FIELD_CONTENTS = "contents";
//-----------------------------------------------------------------------------------------------------
//
// Fields
//
//-----------------------------------------------------------------------------------------------------
private EditText mEditText;
private TextView mTextView;
//-----------------------------------------------------------------------------------------------------
//
// Methods
//
//-----------------------------------------------------------------------------------------------------
/**
 * Sets up the screen: inflates the layout, resolves and wires the views, and
 * then rebuilds the search index.
 *
 * <p>The order matters: the layout must be set before views are looked up,
 * and views must be looked up before listeners are attached. The index is
 * built by a direct call, i.e. on the thread that runs this callback.
 *
 * @param savedInstanceState state bundle, forwarded unchanged to the superclass
 */
@Override
protected void onCreate(Bundle savedInstanceState) {
    super.onCreate(savedInstanceState);
    this.setContentView(R.layout.activity_main);

    // View lookup first, then listener wiring.
    this.findViews();
    this.initViews();

    // Build the index last.
    this.createIndex();
}
/**
 * Resolves the query input field and the result text view from the current
 * layout and caches them in {@code mEditText} / {@code mTextView}.
 */
private void findViews() {
    this.mEditText = (EditText) this.findViewById(R.id.activity_main_edittext);
    this.mTextView = (TextView) this.findViewById(R.id.activity_main_textview);
}
/**
 * Attaches {@code mEditorActionListener} to the query input field so that
 * editor actions on it are reported to this activity.
 */
private void initViews() {
    this.mEditText.setOnEditorActionListener(this.mEditorActionListener);
}
/**
 * Parses {@code searchString} against the {@code FIELD_CONTENTS} field, runs
 * it on the index stored in the app's external files directory, and renders
 * up to 1000 hits (path plus highlighted contents) into {@code mTextView}.
 *
 * <p>Failures are logged, not rethrown. The index reader and directory are
 * always released, including when parsing or searching fails.
 *
 * @param searchString query text in Lucene query-parser syntax
 */
private void performSearch(String searchString) {
    final String tag = getClass().getSimpleName();
    Directory directory = null;
    DirectoryReader ireader = null;
    try {
        directory = NIOFSDirectory.open(getExternalFilesDir(null));
        ireader = DirectoryReader.open(directory);
        IndexSearcher isearcher = new IndexSearcher(ireader);

        // Must be the same analyzer that was used at index time.
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44);
        QueryParser queryParser = new AnalyzingQueryParser(Version.LUCENE_44,
                FIELD_CONTENTS, analyzer);
        Query query = queryParser.parse(searchString);
        TopDocs topDocs = isearcher.search(query, null, 1000);
        ScoreDoc[] docs = topDocs.scoreDocs;

        // The scorer/highlighter depend only on the query, so build them once
        // instead of once per hit.
        QueryScorer scorer = new QueryScorer(query);
        Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter("", ""), scorer);
        // One fragment spanning the whole document.
        highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer, Integer.MAX_VALUE));

        StringBuilder result = new StringBuilder();
        StringBuilder debugInfo = new StringBuilder();
        debugInfo.append("Number of hits: ");
        debugInfo.append(docs.length);
        debugInfo.append("\n");

        // Iterate through the results:
        for (int i = 0; i < docs.length; i++) {
            Document hitDoc = isearcher.doc(docs[i].doc);

            String path = hitDoc.get(FIELD_PATH);
            debugInfo.append("Path: ");
            debugInfo.append(path);
            debugInfo.append("\n");

            result.append("-------------------------------------------------------");
            result.append("File: ");
            result.append(path);
            result.append("-------------------------------------------------------");
            result.append("<br>");

            String content = hitDoc.get(FIELD_CONTENTS);
            String highlighted = highlighter.getBestFragment(analyzer, FIELD_CONTENTS, content);
            if (highlighted == null) {
                // No fragment scored for this document; show the raw stored
                // text instead of the literal string "null".
                highlighted = content;
            }

            result.append("-------------------------------------------------------");
            result.append("Contents: ");
            result.append("-------------------------------------------------------");
            result.append("<br>");
            result.append(highlighted);
            result.append("<br><br><br>");
        }

        //not working
        // PostingsHighlighter highlighter = new PostingsHighlighter();
        // String highlights[] = highlighter.highlight(FIELD_CONTENTS, query,
        //         isearcher, topDocs);

        mTextView.setText(Html.fromHtml(result.toString()));
        Log.d(tag, debugInfo.toString());
    } catch (Exception e) {
        // Pass the throwable itself: e.getMessage() may be null, and the
        // three-argument overload also records the stack trace.
        Log.e(tag, "Search failed for query: " + searchString, e);
    } finally {
        // Release the reader before the directory it was opened on.
        try {
            if (ireader != null) {
                ireader.close();
            }
        } catch (IOException e) {
            Log.w(tag, "Failed to close index reader", e);
        }
        try {
            if (directory != null) {
                directory.close();
            }
        } catch (IOException e) {
            Log.w(tag, "Failed to close index directory", e);
        }
    }
}
private void createIndex() {
try {
//Create directory for index.
Directory indexDirectory = new
NIOFSDirectory(getExternalFilesDir(null));
Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44);
IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_44,
analyzer);
config.setOpenMode(OpenMode.CREATE);
IndexWriter indexWriter = new IndexWriter(indexDirectory, config);
//Loop through files in specified directory and adding them to index.
File dir = new File(Environment.getExternalStorageDirectory() +
"/lucene");
File[] files = dir.listFiles();
for (File file : files) {
Document document = new Document();
{
FieldType fieldType = new FieldType(TextField.TYPE_STORED);
fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
String path = file.getCanonicalPath();
document.add(new Field(FIELD_PATH, path, fieldType));
}
{
FieldType fieldType = new FieldType(TextField.TYPE_STORED);
fieldType.setIndexed(true);
fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
fieldType.setStored(true);
fieldType.setStoreTermVectors(true);
fieldType.setTokenized(true);
fieldType.setStoreTermVectorOffsets(true);
String content = readFully(new FileReader(file)); //we can't store
Reader objects but we need to be able to access the content for highlighting
document.add(new Field(FIELD_CONTENTS, content, fieldType));
}
indexWriter.addDocument(document);
}
indexWriter.close();
} catch (Exception e) {
e.printStackTrace();
}
}
/**
 * Drains {@code reader} into a single string, reading in 8K-character chunks.
 * Reading stops at the first {@code read} call that returns zero or a
 * negative count. The reader is not closed here.
 *
 * @param reader source of characters
 * @return everything read from {@code reader}, possibly empty
 * @throws IOException if reading from {@code reader} fails
 */
public static String readFully(Reader reader) throws IOException {
    final StringBuilder text = new StringBuilder();
    final char[] chunk = new char[8 * 1024]; // 8K at a time
    for (int count = reader.read(chunk, 0, chunk.length);
            count > 0;
            count = reader.read(chunk, 0, chunk.length)) {
        text.append(chunk, 0, count);
    }
    return text.toString();
}
/**
 * Inflates {@code R.menu.main} into the supplied options menu.
 *
 * @param menu menu to populate
 * @return always {@code true}
 */
@Override
public boolean onCreateOptionsMenu(Menu menu) {
    this.getMenuInflater().inflate(R.menu.main, menu);
    return true;
}
//-----------------------------------------------------------------------------------------------------
//
// Listeners
//
//-----------------------------------------------------------------------------------------------------
/**
 * Runs a search with the view's current text when the editor reports the
 * "search" action; every other action is left unhandled.
 */
private OnEditorActionListener mEditorActionListener = new OnEditorActionListener() {
    @Override
    public boolean onEditorAction(TextView v, int actionId, KeyEvent event) {
        // Anything other than the search action is not ours to consume.
        if (actionId != EditorInfo.IME_ACTION_SEARCH) {
            return false;
        }
        final String queryText = v.getText().toString();
        performSearch(queryText);
        return true;
    }
};
}
So how can I get the hit coordinates? Also, do you have any other advice on
what I'm doing wrong?
--
View this message in context: http://lucene.472066.n3.nabble.com/How-to-get-hits-coordinates-in-Lucene-4-4-0-tp4083508.html
Sent from the Lucene - Java Developer mailing list archive at Nabble.com.
---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscribe@lucene.apache.org
For additional commands, e-mail: dev-help@lucene.apache.org
Re: How to get hits coordinates in Lucene 4.4.0
Posted by Michael McCandless <lu...@mikemccandless.com>.
Hi, could you please re-ask this on the user's list
(java-user@lucene.apache.org)?
The dev list is for discussing the development of Lucene's source code.
Thanks.
Mike McCandless
http://blog.mikemccandless.com
On Fri, Aug 9, 2013 at 8:37 AM, Lingviston
<vl...@outlook.com> wrote:
> Hi, I'm trying to use Lucene in my Android project. To start with I've
> created a small demo app. It works with .txt files but I need to work with
> .pdf. So analyzing my code I understand that it will have some issues with
> .pdfs due to memory management. However the question I want to ask here is
> not related to memory but to hit highlighting. It works now but using of
> `Highlighter` class with pdfs is not what I want. So to implement my own
> highlighting I need to know some kind of coordinates of found words in the
> text. How can I get them? I'm using lucene 4.4.0 while all of the examples
> like here
> <http://stackoverflow.com/questions/1311199/finding-the-position-of-search-hits-from-lucene>
> are for much older versions. Here is my code:
>
> public class MainActivity extends Activity {
>
> //-----------------------------------------------------------------------------------------------------
> //
> // Constants
> //
>
> //-----------------------------------------------------------------------------------------------------
> public static final String FIELD_PATH = "path";
> public static final String FIELD_CONTENTS = "contents";
>
>
> //-----------------------------------------------------------------------------------------------------
> //
> // Fields
> //
>
> //-----------------------------------------------------------------------------------------------------
> private EditText mEditText;
> private TextView mTextView;
>
>
> //-----------------------------------------------------------------------------------------------------
> //
> // Methods
> //
>
> //-----------------------------------------------------------------------------------------------------
> @Override
> protected void onCreate(Bundle savedInstanceState) {
> super.onCreate(savedInstanceState);
> setContentView(R.layout.activity_main);
> findViews();
> initViews();
> createIndex();
> }
>
> private void findViews() {
> mEditText = (EditText) findViewById(R.id.activity_main_edittext);
> mTextView = (TextView) findViewById(R.id.activity_main_textview);
> }
>
> private void initViews() {
> mEditText.setOnEditorActionListener(mEditorActionListener);
> }
>
> private void performSearch(String searchString) {
> try {
> Directory directory = NIOFSDirectory.open(getExternalFilesDir(null));
> DirectoryReader ireader = DirectoryReader.open(directory);
> IndexSearcher isearcher = new IndexSearcher(ireader);
>
> Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44);
> QueryParser queryParser = new AnalyzingQueryParser(Version.LUCENE_44,
> FIELD_CONTENTS, analyzer);
> Query query = queryParser.parse(searchString);
> TopDocs topDocs = isearcher.search(query, null, 1000);
> ScoreDoc[] docs = topDocs.scoreDocs;
>
> StringBuilder result = new StringBuilder();
> StringBuilder debugInfo = new StringBuilder();
> debugInfo.append("Number of hits: ");
> debugInfo.append(docs.length);
> debugInfo.append("\n");
>
> // Iterate through the results:
> for (int i = 0; i < docs.length; i++) {
> Document hitDoc = isearcher.doc(docs[i].doc);
>
> String path = hitDoc.get(FIELD_PATH);
> debugInfo.append("Path: ");
> debugInfo.append(path);
> debugInfo.append("\n");
>
>
> result.append("-------------------------------------------------------");
> result.append("File: ");
> result.append(path);
>
> result.append("-------------------------------------------------------");
> result.append("<br>");
>
> String content = hitDoc.get(FIELD_CONTENTS);
> QueryScorer scorer = new QueryScorer(query);
> Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter("",
> ""), scorer);
> highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer,
> Integer.MAX_VALUE));
> String highlighted = highlighter.getBestFragment(analyzer,
> FIELD_CONTENTS, content);
>
> result.append("-------------------------------------------------------");
> result.append("Contents: ");
>
> result.append("-------------------------------------------------------");
> result.append("<br>");
> result.append(highlighted);
> result.append("<br><br><br>");
> }
>
> //not working
> /*PostingsHighlighter highlighter = new PostingsHighlighter();
> String highlights[] = highlighter.highlight(FIELD_CONTENTS, query,
> isearcher, topDocs);*/
> mTextView.setText(Html.fromHtml(result.toString()));
> Log.d(getClass().getSimpleName(), debugInfo.toString());
> } catch (Exception e) {
> e.printStackTrace();
> Log.e(getClass().getSimpleName(), e.getMessage());
> }
>
> }
>
> private void createIndex() {
> try {
> //Create directory for index.
> Directory indexDirectory = new
> NIOFSDirectory(getExternalFilesDir(null));
>
> Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44);
>
> IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_44,
> analyzer);
> config.setOpenMode(OpenMode.CREATE);
>
> IndexWriter indexWriter = new IndexWriter(indexDirectory, config);
>
> //Loop through files in specified directory and adding them to index.
> File dir = new File(Environment.getExternalStorageDirectory() +
> "/lucene");
> File[] files = dir.listFiles();
> for (File file : files) {
> Document document = new Document();
>
> {
> FieldType fieldType = new FieldType(TextField.TYPE_STORED);
>
> fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
>
> String path = file.getCanonicalPath();
> document.add(new Field(FIELD_PATH, path, fieldType));
> }
>
> {
> FieldType fieldType = new FieldType(TextField.TYPE_STORED);
> fieldType.setIndexed(true);
>
> fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
> fieldType.setStored(true);
> fieldType.setStoreTermVectors(true);
> fieldType.setTokenized(true);
> fieldType.setStoreTermVectorOffsets(true);
> String content = readFully(new FileReader(file)); //we can't store
> Reader objects but we need to be able to access the content for highlighting
> document.add(new Field(FIELD_CONTENTS, content, fieldType));
> }
>
> indexWriter.addDocument(document);
> }
> indexWriter.close();
> } catch (Exception e) {
> e.printStackTrace();
> }
> }
>
> public static String readFully(Reader reader) throws IOException {
> char[] arr = new char[8*1024]; // 8K at a time
> StringBuffer buf = new StringBuffer();
> int numChars;
>
> while ((numChars = reader.read(arr, 0, arr.length)) > 0) {
> buf.append(arr, 0, numChars);
> }
>
> return buf.toString();
> }
>
> @Override
> public boolean onCreateOptionsMenu(Menu menu) {
> getMenuInflater().inflate(R.menu.main, menu);
> return true;
> }
>
>
> //-----------------------------------------------------------------------------------------------------
> //
> // Listeners
> //
>
> //-----------------------------------------------------------------------------------------------------
> private OnEditorActionListener mEditorActionListener = new
> OnEditorActionListener() {
> @Override
> public boolean onEditorAction(TextView v, int actionId, KeyEvent event) {
> if (actionId == EditorInfo.IME_ACTION_SEARCH) {
> performSearch(v.getText().toString());
> return true;
> }
> return false;
> }
> };
> }
>
> So how can I get hit coordinates and maybe you have any other advices what
> I'm doing wrong?
>
>
>
> --
> View this message in context: http://lucene.472066.n3.nabble.com/How-to-get-hits-coordinates-in-Lucene-4-4-0-tp4083508.html
> Sent from the Lucene - Java Developer mailing list archive at Nabble.com.
>
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: dev-unsubscribe@lucene.apache.org
> For additional commands, e-mail: dev-help@lucene.apache.org
>
---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscribe@lucene.apache.org
For additional commands, e-mail: dev-help@lucene.apache.org