You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-user@lucene.apache.org by Grant Ingersoll <gs...@apache.org> on 2009/01/04 03:23:42 UTC
Fwd: Search Test file
Begin forwarded message:
> From: Grant Ingersoll <gs...@apache.org>
> Date: January 3, 2009 8:19:14 PM EST
> To: java-dev@lucene.apache.org
> Subject: Fwd: Search Test file
> Reply-To: java-dev@lucene.apache.org
>
> Hi Amin,
>
> I see a couple of issues with your program below, and one that is
> the cause of the problem of not finding "amin" as a query term.
>
> When you construct your IndexWriter, you are doing:
>> IndexWriter indexWriter = new
>> IndexWriter(getDirectory(),getAnalyzer(),new
>> IndexWriter.MaxFieldLength(2));
>
> The MaxFieldLength parameter specifies the maximum number of tokens
> allowed in a Field. Everything else after that is dropped. See http://lucene.apache.org/java/2_4_0/api/core/org/apache/lucene/index/IndexWriter.html#IndexWriter(org.apache.lucene.store.Directory,%20org.apache.lucene.analysis.Analyzer,%20org.apache.lucene.index.IndexWriter.MaxFieldLength)
> and http://lucene.apache.org/java/2_4_0/api/core/org/apache/lucene/index/IndexWriter.MaxFieldLength.html
>
> Also,
> TopDocs topDocs = multiSearcher.search(query,
> BooleanQuery.getMaxClauseCount());
>
> strikes me as really odd. Why are you passing in the max clause
> count as the number of results you want returned?
>
> Cheers,
> Grant
>
>
>
> Begin forwarded message:
>
>> From: "aminmc@gmail.com" <am...@gmail.com>
>> Date: January 3, 2009 3:24:52 PM EST
>> To: gsingers@apache.org
>> Subject: Search Test file
>>
>> I've shared a document with you called "Search Test file":
>> http://docs.google.com/Doc?id=d77xf5q_0n6hb38fx&invite=cjq79zj
>>
>> It's not an attachment -- it's stored online at Google Docs. To
>> open this document, just click the link above.
>> ---
>>
>> Hi
>>
>> I have uploaded the test file at google docs. It is currently a txt
>> file but if you change the extension to .java it should work.
>>
>> package com.amin.app.lucene.search.impl;
>>
>> import static org.junit.Assert.assertEquals;
>> import static org.junit.Assert.assertNotNull;
>> import static org.junit.Assert.assertNotSame;
>> import static org.junit.Assert.assertTrue;
>>
>> import java.io.File;
>> import java.io.FileInputStream;
>> import java.io.FileOutputStream;
>> import java.io.IOException;
>> import java.io.InputStream;
>> import java.io.OutputStream;
>>
>> import javax.swing.text.BadLocationException;
>> import javax.swing.text.DefaultStyledDocument;
>> import javax.swing.text.rtf.RTFEditorKit;
>>
>> import org.apache.commons.lang.StringUtils;
>> import org.apache.lucene.analysis.Analyzer;
>> import org.apache.lucene.analysis.standard.StandardAnalyzer;
>> import org.apache.lucene.ant.DocumentHandler;
>> import org.apache.lucene.ant.DocumentHandlerException;
>> import org.apache.lucene.document.Document;
>> import org.apache.lucene.document.Field;
>> import org.apache.lucene.index.CorruptIndexException;
>> import org.apache.lucene.index.IndexReader;
>> import org.apache.lucene.index.IndexWriter;
>> import org.apache.lucene.queryParser.MultiFieldQueryParser;
>> import org.apache.lucene.queryParser.QueryParser;
>> import org.apache.lucene.search.BooleanQuery;
>> import org.apache.lucene.search.IndexSearcher;
>> import org.apache.lucene.search.MultiSearcher;
>> import org.apache.lucene.search.Query;
>> import org.apache.lucene.search.ScoreDoc;
>> import org.apache.lucene.search.Searchable;
>> import org.apache.lucene.search.TopDocs;
>> import org.apache.lucene.store.Directory;
>> import org.apache.lucene.store.FSDirectory;
>> import org.junit.After;
>> import org.junit.Before;
>> import org.junit.Test;
>>
>> import com.amin.app.lucene.util.WorkItem.IndexerType;
>>
>> public class SearchTest {
>>
>> private File rtfFile = null;
>> private static final String RTF_FILE_NAME = "rtfDocumentToIndex.rtf";
>>
>> @Before
>> public void setUp() throws Exception {
>> InputStream inputStream =
>> this.getClass().getClassLoader().getResourceAsStream(RTF_FILE_NAME);
>> rtfFile = new File(RTF_FILE_NAME);
>> convertInputStreamToFile(inputStream, rtfFile);
>> }
>>
>>
>>
>> @Test
>> public void testCanCreateLuceneDocumentForRTFDocument() throws
>> Exception {
>> JavaBuiltInRTFHandler builtInRTFHandler = new
>> JavaBuiltInRTFHandler();
>> Document document = builtInRTFHandler.getDocument(rtfFile);
>> assertNotNull(document);
>> String value = document.get(FieldNameEnum.BODY.getDescription());
>> assertNotNull(value);
>> assertNotSame("", value);
>> assertTrue(value.contains("Amin Mohammed-Coleman"));
>> assertTrue(value.contains("This is a test rtf document that will be
>> indexed."));
>> String path = document.get(FieldNameEnum.PATH.getDescription());
>> assertNotNull(path);
>> assertTrue(path.contains(".rtf"));
>> String fileName = document.get(FieldNameEnum.NAME.getDescription());
>> assertNotNull(fileName);
>> assertEquals(RTF_FILE_NAME, fileName);
>> assertEquals(WorkItem.IndexerType.RTF_INDEXER.name(),
>> document.get(FieldNameEnum.TYPE.getDescription()));
>>
>> }
>>
>>
>>
>> @Test
>> public void testCanSearchRtfDocument() throws Exception {
>> JavaBuiltInRTFHandler builtInRTFHandler = new
>> JavaBuiltInRTFHandler();
>> Document document = builtInRTFHandler.getDocument(rtfFile);
>> IndexWriter indexWriter = new
>> IndexWriter(getDirectory(),getAnalyzer(),new
>> IndexWriter.MaxFieldLength(2));
>> try {
>> indexWriter.addDocument(document);
>> commitAndCloseWriter(indexWriter);
>> } catch (CorruptIndexException e) {
>> throw new IllegalStateException(e);
>> } catch (IOException e) {
>> throw new IllegalStateException(e);
>> }
>>
>> //I plan to use other searchers later
>> IndexSearcher indexSearcher = new IndexSearcher(getDirectory());
>> MultiSearcher multiSearcher = new MultiSearcher(new Searchable[]
>> {indexSearcher});
>> QueryParser queryParser = new MultiFieldQueryParser(new String[]
>> {FieldNameEnum.BODY.getDescription()}, new StandardAnalyzer());
>> Query query = queryParser.parse("amin");
>> TopDocs topDocs = multiSearcher.search(query,
>> BooleanQuery.getMaxClauseCount());
>> assertNotNull(topDocs);
>> assertEquals(1, topDocs.totalHits);
>> ScoreDoc[] scoreDocs = topDocs.scoreDocs;
>> for (ScoreDoc scoreDoc : scoreDocs) {
>> Document documentFromSearch = indexSearcher.doc(scoreDoc.doc);
>> assertNotNull(documentFromSearch);
>> String bodyText =
>> documentFromSearch.get(FieldNameEnum.BODY.getDescription());
>> assertNotNull(bodyText);
>> assertNotSame("", bodyText);
>> assertTrue(bodyText.contains("Amin Mohammed-Coleman"));
>> assertTrue(bodyText.contains("This is a test rtf document that will
>> be indexed."));
>>
>> }
>> multiSearcher.close();
>>
>> }
>>
>> @After
>> public void tearDown() throws Exception {
>> rtfFile.delete();
>> if (getDirectory().list() != null && getDirectory().list().length >
>> 0) {
>> IndexReader reader = IndexReader.open(getDirectory());
>> for(int i = 0; i < reader.maxDoc();i++) {
>> reader.deleteDocument(i);
>> }
>> reader.close();
>> }
>> }
>>
>> private void commitAndCloseWriter(IndexWriter indexWriter) throws
>> CorruptIndexException,IOException {
>> indexWriter.commit();
>> indexWriter.close();
>> }
>>
>>
>> public Directory getDirectory() throws IOException {
>> return FSDirectory.getDirectory("/tmp/lucene/rtf");
>> }
>>
>> public Analyzer getAnalyzer() {
>> return new StandardAnalyzer();
>> }
>> private static void convertInputStreamToFile(InputStream
>> inputStream, File file) {
>> try
>> {
>> OutputStream out=new FileOutputStream(file);
>> byte buf[]=new byte[1024];
>> int len;
>> while((len=inputStream.read(buf))>0)
>> out.write(buf,0,len);
>> out.close();
>> inputStream.close();
>>
>> }catch (IOException e){
>> throw new IllegalStateException(e);
>> }
>> }
>> private static class JavaBuiltInRTFHandler implements
>> DocumentHandler{
>>
>> public Document getDocument(File file) throws
>> DocumentHandlerException {
>> String bodyText = null;
>> DefaultStyledDocument styledDoc = new DefaultStyledDocument();
>> try {
>> InputStream inputStream = new FileInputStream(file);
>> new RTFEditorKit().read(inputStream, styledDoc, 0);
>> bodyText = styledDoc.getText(0, styledDoc.getLength());
>> } catch (IOException ioex) {
>> throw new IllegalStateException(ioex);
>> } catch (BadLocationException e) {
>> throw new IllegalArgumentException(e);
>> }
>> //create Document object using body
>> if (bodyText != null) {
>> Document document = new Document();
>> String trimmedBodyText = StringUtils.trimToEmpty(bodyText);
>> trimmedBodyText = trimmedBodyText.replaceAll("\n", "");
>> Field field = new
>> Field(FieldNameEnum.BODY.getDescription(),trimmedBodyText,
>> Field.Store.YES, Field.Index.ANALYZED);
>> document.add(field);
>>
>> String pathToFile = file.getPath();
>> Field pathToFileField = new
>> Field(FieldNameEnum.PATH.getDescription(),pathToFile,
>> Field.Store.YES, Field.Index.NOT_ANALYZED);
>> document.add(pathToFileField);
>>
>> String fileName = file.getName();
>> Field fileNameField = new
>> Field(FieldNameEnum.NAME.getDescription(),fileName,
>> Field.Store.YES, Field.Index.NOT_ANALYZED);
>> document.add(fileNameField);
>>
>> Field typeField = new
>> Field
>> (FieldNameEnum
>> .TYPE.getDescription(),IndexerType.RTF_INDEXER.name(),
>> Field.Store.YES, Field.Index.NOT_ANALYZED);
>> document.add(typeField);
>>
>> String summary = bodyText.substring(0, 10);
>>
>> Field summaryField = new
>> Field(FieldNameEnum.SUMMARY.getDescription(),summary,
>> Field.Store.YES, Field.Index.NOT_ANALYZED);
>> document.add(summaryField);
>>
>> return document;
>> }
>> return null;
>> }
>> }
>>
>> private static class WorkItem {
>>
>> public enum WorkItemEvent {
>> ADD,
>> UPDATE,
>> DELETE;
>> }
>>
>> public enum IndexerType {
>> RTF_INDEXER,
>> PDF_INDEXER,
>> XML_INDEXER,
>> PLAIN_TEXT_INDEXER,
>> MS_WORD_INDEXER,
>> MS_EXCEL_INDEXER,
>> MS_POWERPOINT_INDEXER;
>> }
>>
>>
>> private final Document workLoad;
>>
>> private final WorkItemEvent workItemEvent;
>>
>> private final IndexerType indexerType;
>>
>>
>> public WorkItem(final Document workLoad, final WorkItemEvent
>> workItemEvent) {
>> this.workLoad = workLoad;
>> this.workItemEvent = workItemEvent;
>> String type = this.workLoad.get("type");
>> this.indexerType = IndexerType.valueOf(type);
>> }
>>
>> public IndexerType getIndexerType() {
>> return indexerType;
>> }
>>
>> public Document getWorkLoad() {
>> return workLoad;
>> }
>>
>> public WorkItemEvent getWorkItemEvent() {
>> return workItemEvent;
>> }
>> }
>>
>> private enum FieldNameEnum {
>>
>> AUTHOR("author"),
>> BODY("body"),
>> TITLE("title"),
>> SUBJECT("subject"),
>> KEYWORDS("keywords"),
>> PATH("path"), NAME ("name"),
>> TYPE("type"),
>> ID ("id"),
>> SUMMARY ("summary");
>>
>> private final String description;
>>
>> private FieldNameEnum(final String description) {
>> this.description = description;
>> }
>>
>> public String getDescription() {
>> return this.description;
>> }
>> }
>> }
>
> --------------------------
> Grant Ingersoll
>
> Lucene Helpful Hints:
> http://wiki.apache.org/lucene-java/BasicsOfPerformance
> http://wiki.apache.org/lucene-java/LuceneFAQ
>
>
>
>
>
>
>
>
>
>
--------------------------
Grant Ingersoll
Lucene Helpful Hints:
http://wiki.apache.org/lucene-java/BasicsOfPerformance
http://wiki.apache.org/lucene-java/LuceneFAQ
Re: Search Test file
Posted by Amin Mohammed-Coleman <am...@gmail.com>.
Hi,
Please ignore my last email; I had only just woken up when I wrote it. After
looking at the index further in Luke, it looks like the token is being stored as
"indexed.amin", which is why searching for "amin" wasn't working. Making the changes
that you recommended worked.
I will investigate further why the "amin" token is being stored as
"indexed.amin".
Thanks again for all the help.
Cheers
Amin
On 4 Jan 2009, at 02:23, Grant Ingersoll wrote:
>
>
> Begin forwarded message:
>
>> From: Grant Ingersoll <gs...@apache.org>
>> Date: January 3, 2009 8:19:14 PM EST
>> To: java-dev@lucene.apache.org
>> Subject: Fwd: Search Test file
>> Reply-To: java-dev@lucene.apache.org
>>
>> Hi Amin,
>>
>> I see a couple of issues with your program below, and one that is
>> the cause of the problem of not finding "amin" as a query term.
>>
>> When you construct your IndexWriter, you are doing:
>>> IndexWriter indexWriter = new
>>> IndexWriter(getDirectory(),getAnalyzer(),new
>>> IndexWriter.MaxFieldLength(2));
>>
>> The MaxFieldLength parameter specifies the maximum number of tokens
>> allowed in a Field. Everything else after that is dropped. See http://lucene.apache.org/java/2_4_0/api/core/org/apache/lucene/index/IndexWriter.html#IndexWriter(org.apache.lucene.store.Directory,%20org.apache.lucene.analysis.Analyzer,%20org.apache.lucene.index.IndexWriter.MaxFieldLength)
>> and http://lucene.apache.org/java/2_4_0/api/core/org/apache/lucene/index/IndexWriter.MaxFieldLength.html
>>
>> Also,
>> TopDocs topDocs = multiSearcher.search(query,
>> BooleanQuery.getMaxClauseCount());
>>
>> strikes me as really odd. Why are you passing in the max clause
>> count as the number of results you want returned?
>>
>> Cheers,
>> Grant
>>
>>
>>
>> Begin forwarded message:
>>
>>> From: "aminmc@gmail.com" <am...@gmail.com>
>>> Date: January 3, 2009 3:24:52 PM EST
>>> To: gsingers@apache.org
>>> Subject: Search Test file
>>>
>>> I've shared a document with you called "Search Test file":
>>> http://docs.google.com/Doc?id=d77xf5q_0n6hb38fx&invite=cjq79zj
>>>
>>> It's not an attachment -- it's stored online at Google Docs. To
>>> open this document, just click the link above.
>>> ---
>>>
>>> Hi
>>>
>>> I have uploaded the test file at google docs. It is currently a
>>> txt file but if you change the extension to .java it should work.
>>>
>>> package com.amin.app.lucene.search.impl;
>>>
>>> import static org.junit.Assert.assertEquals;
>>> import static org.junit.Assert.assertNotNull;
>>> import static org.junit.Assert.assertNotSame;
>>> import static org.junit.Assert.assertTrue;
>>>
>>> import java.io.File;
>>> import java.io.FileInputStream;
>>> import java.io.FileOutputStream;
>>> import java.io.IOException;
>>> import java.io.InputStream;
>>> import java.io.OutputStream;
>>>
>>> import javax.swing.text.BadLocationException;
>>> import javax.swing.text.DefaultStyledDocument;
>>> import javax.swing.text.rtf.RTFEditorKit;
>>>
>>> import org.apache.commons.lang.StringUtils;
>>> import org.apache.lucene.analysis.Analyzer;
>>> import org.apache.lucene.analysis.standard.StandardAnalyzer;
>>> import org.apache.lucene.ant.DocumentHandler;
>>> import org.apache.lucene.ant.DocumentHandlerException;
>>> import org.apache.lucene.document.Document;
>>> import org.apache.lucene.document.Field;
>>> import org.apache.lucene.index.CorruptIndexException;
>>> import org.apache.lucene.index.IndexReader;
>>> import org.apache.lucene.index.IndexWriter;
>>> import org.apache.lucene.queryParser.MultiFieldQueryParser;
>>> import org.apache.lucene.queryParser.QueryParser;
>>> import org.apache.lucene.search.BooleanQuery;
>>> import org.apache.lucene.search.IndexSearcher;
>>> import org.apache.lucene.search.MultiSearcher;
>>> import org.apache.lucene.search.Query;
>>> import org.apache.lucene.search.ScoreDoc;
>>> import org.apache.lucene.search.Searchable;
>>> import org.apache.lucene.search.TopDocs;
>>> import org.apache.lucene.store.Directory;
>>> import org.apache.lucene.store.FSDirectory;
>>> import org.junit.After;
>>> import org.junit.Before;
>>> import org.junit.Test;
>>>
>>> import com.amin.app.lucene.util.WorkItem.IndexerType;
>>>
>>> public class SearchTest {
>>>
>>> private File rtfFile = null;
>>> private static final String RTF_FILE_NAME =
>>> "rtfDocumentToIndex.rtf";
>>>
>>> @Before
>>> public void setUp() throws Exception {
>>> InputStream inputStream =
>>> this.getClass().getClassLoader().getResourceAsStream(RTF_FILE_NAME);
>>> rtfFile = new File(RTF_FILE_NAME);
>>> convertInputStreamToFile(inputStream, rtfFile);
>>> }
>>>
>>>
>>>
>>> @Test
>>> public void testCanCreateLuceneDocumentForRTFDocument() throws
>>> Exception {
>>> JavaBuiltInRTFHandler builtInRTFHandler = new
>>> JavaBuiltInRTFHandler();
>>> Document document = builtInRTFHandler.getDocument(rtfFile);
>>> assertNotNull(document);
>>> String value = document.get(FieldNameEnum.BODY.getDescription());
>>> assertNotNull(value);
>>> assertNotSame("", value);
>>> assertTrue(value.contains("Amin Mohammed-Coleman"));
>>> assertTrue(value.contains("This is a test rtf document that will
>>> be indexed."));
>>> String path = document.get(FieldNameEnum.PATH.getDescription());
>>> assertNotNull(path);
>>> assertTrue(path.contains(".rtf"));
>>> String fileName = document.get(FieldNameEnum.NAME.getDescription());
>>> assertNotNull(fileName);
>>> assertEquals(RTF_FILE_NAME, fileName);
>>> assertEquals(WorkItem.IndexerType.RTF_INDEXER.name(),
>>> document.get(FieldNameEnum.TYPE.getDescription()));
>>>
>>> }
>>>
>>>
>>>
>>> @Test
>>> public void testCanSearchRtfDocument() throws Exception {
>>> JavaBuiltInRTFHandler builtInRTFHandler = new
>>> JavaBuiltInRTFHandler();
>>> Document document = builtInRTFHandler.getDocument(rtfFile);
>>> IndexWriter indexWriter = new
>>> IndexWriter(getDirectory(),getAnalyzer(),new
>>> IndexWriter.MaxFieldLength(2));
>>> try {
>>> indexWriter.addDocument(document);
>>> commitAndCloseWriter(indexWriter);
>>> } catch (CorruptIndexException e) {
>>> throw new IllegalStateException(e);
>>> } catch (IOException e) {
>>> throw new IllegalStateException(e);
>>> }
>>>
>>> //I plan to use other searchers later
>>> IndexSearcher indexSearcher = new IndexSearcher(getDirectory());
>>> MultiSearcher multiSearcher = new MultiSearcher(new Searchable[]
>>> {indexSearcher});
>>> QueryParser queryParser = new MultiFieldQueryParser(new String[]
>>> {FieldNameEnum.BODY.getDescription()}, new StandardAnalyzer());
>>> Query query = queryParser.parse("amin");
>>> TopDocs topDocs = multiSearcher.search(query,
>>> BooleanQuery.getMaxClauseCount());
>>> assertNotNull(topDocs);
>>> assertEquals(1, topDocs.totalHits);
>>> ScoreDoc[] scoreDocs = topDocs.scoreDocs;
>>> for (ScoreDoc scoreDoc : scoreDocs) {
>>> Document documentFromSearch = indexSearcher.doc(scoreDoc.doc);
>>> assertNotNull(documentFromSearch);
>>> String bodyText =
>>> documentFromSearch.get(FieldNameEnum.BODY.getDescription());
>>> assertNotNull(bodyText);
>>> assertNotSame("", bodyText);
>>> assertTrue(bodyText.contains("Amin Mohammed-Coleman"));
>>> assertTrue(bodyText.contains("This is a test rtf document that
>>> will be indexed."));
>>>
>>> }
>>> multiSearcher.close();
>>>
>>> }
>>>
>>> @After
>>> public void tearDown() throws Exception {
>>> rtfFile.delete();
>>> if (getDirectory().list() != null && getDirectory().list().length
>>> > 0) {
>>> IndexReader reader = IndexReader.open(getDirectory());
>>> for(int i = 0; i < reader.maxDoc();i++) {
>>> reader.deleteDocument(i);
>>> }
>>> reader.close();
>>> }
>>> }
>>>
>>> private void commitAndCloseWriter(IndexWriter indexWriter) throws
>>> CorruptIndexException,IOException {
>>> indexWriter.commit();
>>> indexWriter.close();
>>> }
>>>
>>>
>>> public Directory getDirectory() throws IOException {
>>> return FSDirectory.getDirectory("/tmp/lucene/rtf");
>>> }
>>>
>>> public Analyzer getAnalyzer() {
>>> return new StandardAnalyzer();
>>> }
>>> private static void convertInputStreamToFile(InputStream
>>> inputStream, File file) {
>>> try
>>> {
>>> OutputStream out=new FileOutputStream(file);
>>> byte buf[]=new byte[1024];
>>> int len;
>>> while((len=inputStream.read(buf))>0)
>>> out.write(buf,0,len);
>>> out.close();
>>> inputStream.close();
>>>
>>> }catch (IOException e){
>>> throw new IllegalStateException(e);
>>> }
>>> }
>>> private static class JavaBuiltInRTFHandler implements
>>> DocumentHandler{
>>>
>>> public Document getDocument(File file) throws
>>> DocumentHandlerException {
>>> String bodyText = null;
>>> DefaultStyledDocument styledDoc = new DefaultStyledDocument();
>>> try {
>>> InputStream inputStream = new FileInputStream(file);
>>> new RTFEditorKit().read(inputStream, styledDoc, 0);
>>> bodyText = styledDoc.getText(0, styledDoc.getLength());
>>> } catch (IOException ioex) {
>>> throw new IllegalStateException(ioex);
>>> } catch (BadLocationException e) {
>>> throw new IllegalArgumentException(e);
>>> }
>>> //create Document object using body
>>> if (bodyText != null) {
>>> Document document = new Document();
>>> String trimmedBodyText = StringUtils.trimToEmpty(bodyText);
>>> trimmedBodyText = trimmedBodyText.replaceAll("\n", "");
>>> Field field = new
>>> Field(FieldNameEnum.BODY.getDescription(),trimmedBodyText,
>>> Field.Store.YES, Field.Index.ANALYZED);
>>> document.add(field);
>>>
>>> String pathToFile = file.getPath();
>>> Field pathToFileField = new
>>> Field(FieldNameEnum.PATH.getDescription(),pathToFile,
>>> Field.Store.YES, Field.Index.NOT_ANALYZED);
>>> document.add(pathToFileField);
>>>
>>> String fileName = file.getName();
>>> Field fileNameField = new
>>> Field(FieldNameEnum.NAME.getDescription(),fileName,
>>> Field.Store.YES, Field.Index.NOT_ANALYZED);
>>> document.add(fileNameField);
>>>
>>> Field typeField = new
>>> Field
>>> (FieldNameEnum
>>> .TYPE.getDescription(),IndexerType.RTF_INDEXER.name(),
>>> Field.Store.YES, Field.Index.NOT_ANALYZED);
>>> document.add(typeField);
>>>
>>> String summary = bodyText.substring(0, 10);
>>>
>>> Field summaryField = new
>>> Field(FieldNameEnum.SUMMARY.getDescription(),summary,
>>> Field.Store.YES, Field.Index.NOT_ANALYZED);
>>> document.add(summaryField);
>>>
>>> return document;
>>> }
>>> return null;
>>> }
>>> }
>>>
>>> private static class WorkItem {
>>>
>>> public enum WorkItemEvent {
>>> ADD,
>>> UPDATE,
>>> DELETE;
>>> }
>>>
>>> public enum IndexerType {
>>> RTF_INDEXER,
>>> PDF_INDEXER,
>>> XML_INDEXER,
>>> PLAIN_TEXT_INDEXER,
>>> MS_WORD_INDEXER,
>>> MS_EXCEL_INDEXER,
>>> MS_POWERPOINT_INDEXER;
>>> }
>>>
>>>
>>> private final Document workLoad;
>>>
>>> private final WorkItemEvent workItemEvent;
>>>
>>> private final IndexerType indexerType;
>>>
>>>
>>> public WorkItem(final Document workLoad, final WorkItemEvent
>>> workItemEvent) {
>>> this.workLoad = workLoad;
>>> this.workItemEvent = workItemEvent;
>>> String type = this.workLoad.get("type");
>>> this.indexerType = IndexerType.valueOf(type);
>>> }
>>>
>>> public IndexerType getIndexerType() {
>>> return indexerType;
>>> }
>>>
>>> public Document getWorkLoad() {
>>> return workLoad;
>>> }
>>>
>>> public WorkItemEvent getWorkItemEvent() {
>>> return workItemEvent;
>>> }
>>> }
>>>
>>> private enum FieldNameEnum {
>>>
>>> AUTHOR("author"),
>>> BODY("body"),
>>> TITLE("title"),
>>> SUBJECT("subject"),
>>> KEYWORDS("keywords"),
>>> PATH("path"), NAME ("name"),
>>> TYPE("type"),
>>> ID ("id"),
>>> SUMMARY ("summary");
>>>
>>> private final String description;
>>>
>>> private FieldNameEnum(final String description) {
>>> this.description = description;
>>> }
>>>
>>> public String getDescription() {
>>> return this.description;
>>> }
>>> }
>>> }
>>
>> --------------------------
>> Grant Ingersoll
>>
>> Lucene Helpful Hints:
>> http://wiki.apache.org/lucene-java/BasicsOfPerformance
>> http://wiki.apache.org/lucene-java/LuceneFAQ
>>
>>
>>
>>
>>
>>
>>
>>
>>
>>
>
> --------------------------
> Grant Ingersoll
>
> Lucene Helpful Hints:
> http://wiki.apache.org/lucene-java/BasicsOfPerformance
> http://wiki.apache.org/lucene-java/LuceneFAQ
>
>
>
>
>
>
>
>
>
>
---------------------------------------------------------------------
To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-user-help@lucene.apache.org
Re: Search Test file
Posted by Amin Mohammed-Coleman <am...@gmail.com>.
Hi
Test case passing now. Thanks for your help. I kind of thought it was
probably something I was doing wrong!
Cheers
Amin
On 4 Jan 2009, at 16:59, Grant Ingersoll <gs...@apache.org> wrote:
>
> On Jan 4, 2009, at 2:49 AM, Amin Mohammed-Coleman wrote:
>
>> Hi Grant
>>
>> Thank you for looking at the test case. I have updated the
>> IndexWriter to use UNLIMITED for MaxFieldLength. I tried using
>> Integer.MAX_VALUE for
>>
>>>> Also,
>>>> TopDocs topDocs = multiSearcher.search(query,
>>>> BooleanQuery.getMaxClauseCount());
>>>>
>>>> strikes me as really odd. Why are you passing in the max clause
>>>> count as the number of results you want returned?
>>
>>
>
> Just pass in something like "10".
>
>> However I get the following exception :
>>
>> java.lang.NegativeArraySizeException
>> at
>> org.apache.lucene.util.PriorityQueue.initialize(PriorityQueue.java:
>> 41)
>> at org.apache.lucene.search.HitQueue.<init>(HitQueue.java:24)
>> at
>> org.apache.lucene.search.MultiSearcher.search(MultiSearcher.java:200)
>> at org.apache.lucene.search.Searcher.search(Searcher.java:136)
>> at org.apache.lucene.search.Searcher.search(Searcher.java:146)
>> at
>> com.
>> amin.
>> app.
>> lucene.
>> search.impl.SearchTest.testCanSearchRtfDocument(SearchTest.java:101)
>> at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
>> at
>> sun.
>> reflect.
>> NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:39)
>> at
>> sun.
>> reflect.
>> DelegatingMethodAccessorImpl.
>> invoke(DelegatingMethodAccessorImpl.java:25)
>> at java.lang.reflect.Method.invoke(Method.java:597)
>> at org.junit.internal.runners.TestMethod.invoke(TestMethod.java:
>> 59)
>> at
>> org.
>> junit.internal.runners.MethodRoadie.runTestMethod(MethodRoadie.java:
>> 98)
>> at org.junit.internal.runners.MethodRoadie
>> $2.run(MethodRoadie.java:79)
>> at
>> org.
>> junit.
>> internal.
>> runners.MethodRoadie.runBeforesThenTestThenAfters(MethodRoadie.java:
>> 87)
>> at
>> org.junit.internal.runners.MethodRoadie.runTest(MethodRoadie.java:77)
>> at org.junit.internal.runners.MethodRoadie.run(MethodRoadie.java:
>> 42)
>> at
>> org.
>> junit.
>> internal.
>> runners.JUnit4ClassRunner.invokeTestMethod(JUnit4ClassRunner.java:88)
>> at
>> org.
>> junit.
>> internal.
>> runners.JUnit4ClassRunner.runMethods(JUnit4ClassRunner.java:51)
>> at org.junit.internal.runners.JUnit4ClassRunner
>> $1.run(JUnit4ClassRunner.java:44)
>> at
>> org.
>> junit.internal.runners.ClassRoadie.runUnprotected(ClassRoadie.java:
>> 27)
>> at
>> org.
>> junit.internal.runners.ClassRoadie.runProtected(ClassRoadie.java:37)
>> at
>> org.
>> junit.internal.runners.JUnit4ClassRunner.run(JUnit4ClassRunner.java:
>> 42)
>> at
>> org.
>> eclipse.
>> jdt.
>> internal.
>> junit4.runner.JUnit4TestReference.run(JUnit4TestReference.java:45)
>> at
>> org.
>> eclipse.
>> jdt.internal.junit.runner.TestExecution.run(TestExecution.java:38)
>> at
>> org.
>> eclipse.
>> jdt.
>> internal.
>> junit.runner.RemoteTestRunner.runTests(RemoteTestRunner.java:460)
>> at
>> org.
>> eclipse.
>> jdt.
>> internal.
>> junit.runner.RemoteTestRunner.runTests(RemoteTestRunner.java:673)
>> at
>> org.
>> eclipse.
>> jdt.
>> internal.junit.runner.RemoteTestRunner.run(RemoteTestRunner.java:386)
>> at
>> org.
>> eclipse.
>> jdt.
>> internal.junit.runner.RemoteTestRunner.main(RemoteTestRunner.java:
>> 196)
>>
>>
>> I know that this is an issue (not being able to use
>> Integer.MAX_VALUE). I tried using 100 and my test still doesn't
>> pass.
>>
>>
>> Cheers
>> Amin
>>
>>
>> On 4 Jan 2009, at 02:23, Grant Ingersoll wrote:
>>
>>>
>>>
>>> Begin forwarded message:
>>>
>>>> From: Grant Ingersoll <gs...@apache.org>
>>>> Date: January 3, 2009 8:19:14 PM EST
>>>> To: java-dev@lucene.apache.org
>>>> Subject: Fwd: Search Test file
>>>> Reply-To: java-dev@lucene.apache.org
>>>>
>>>> Hi Amin,
>>>>
>>>> I see a couple of issues with your program below, and one that is
>>>> the cause of the problem of not finding "amin" as a query term.
>>>>
>>>> When you construct your IndexWriter, you are doing:
>>>>> IndexWriter indexWriter = new
>>>>> IndexWriter(getDirectory(),getAnalyzer(),new
>>>>> IndexWriter.MaxFieldLength(2));
>>>>
>>>> The MaxFieldLength parameter specifies the maximum number of
>>>> tokens allowed in a Field. Everything else after that is
>>>> dropped. See http://lucene.apache.org/java/2_4_0/api/core/org/apache/lucene/index/IndexWriter.html#IndexWriter(org.apache.lucene.store.Directory,%20org.apache.lucene.analysis.Analyzer,%20org.apache.lucene.index.IndexWriter.MaxFieldLength)
>>>> and http://lucene.apache.org/java/2_4_0/api/core/org/apache/lucene/index/IndexWriter.MaxFieldLength.html
>>>>
>>>> Also,
>>>> TopDocs topDocs = multiSearcher.search(query,
>>>> BooleanQuery.getMaxClauseCount());
>>>>
>>>> strikes me as really odd. Why are you passing in the max clause
>>>> count as the number of results you want returned?
>>>>
>>>> Cheers,
>>>> Grant
>>>>
>>>>
>>>>
>>>> Begin forwarded message:
>>>>
>>>>> From: "aminmc@gmail.com" <am...@gmail.com>
>>>>> Date: January 3, 2009 3:24:52 PM EST
>>>>> To: gsingers@apache.org
>>>>> Subject: Search Test file
>>>>>
>>>>> I've shared a document with you called "Search Test file":
>>>>> http://docs.google.com/Doc?id=d77xf5q_0n6hb38fx&invite=cjq79zj
>>>>>
>>>>> It's not an attachment -- it's stored online at Google Docs. To
>>>>> open this document, just click the link above.
>>>>> ---
>>>>>
>>>>> Hi
>>>>>
>>>>> I have uploaded the test file at google docs. It is currently a
>>>>> txt file but if you change the extension to .java it should work.
>>>>>
>>>>> package com.amin.app.lucene.search.impl;
>>>>>
>>>>> import static org.junit.Assert.assertEquals;
>>>>> import static org.junit.Assert.assertNotNull;
>>>>> import static org.junit.Assert.assertNotSame;
>>>>> import static org.junit.Assert.assertTrue;
>>>>>
>>>>> import java.io.File;
>>>>> import java.io.FileInputStream;
>>>>> import java.io.FileOutputStream;
>>>>> import java.io.IOException;
>>>>> import java.io.InputStream;
>>>>> import java.io.OutputStream;
>>>>>
>>>>> import javax.swing.text.BadLocationException;
>>>>> import javax.swing.text.DefaultStyledDocument;
>>>>> import javax.swing.text.rtf.RTFEditorKit;
>>>>>
>>>>> import org.apache.commons.lang.StringUtils;
>>>>> import org.apache.lucene.analysis.Analyzer;
>>>>> import org.apache.lucene.analysis.standard.StandardAnalyzer;
>>>>> import org.apache.lucene.ant.DocumentHandler;
>>>>> import org.apache.lucene.ant.DocumentHandlerException;
>>>>> import org.apache.lucene.document.Document;
>>>>> import org.apache.lucene.document.Field;
>>>>> import org.apache.lucene.index.CorruptIndexException;
>>>>> import org.apache.lucene.index.IndexReader;
>>>>> import org.apache.lucene.index.IndexWriter;
>>>>> import org.apache.lucene.queryParser.MultiFieldQueryParser;
>>>>> import org.apache.lucene.queryParser.QueryParser;
>>>>> import org.apache.lucene.search.BooleanQuery;
>>>>> import org.apache.lucene.search.IndexSearcher;
>>>>> import org.apache.lucene.search.MultiSearcher;
>>>>> import org.apache.lucene.search.Query;
>>>>> import org.apache.lucene.search.ScoreDoc;
>>>>> import org.apache.lucene.search.Searchable;
>>>>> import org.apache.lucene.search.TopDocs;
>>>>> import org.apache.lucene.store.Directory;
>>>>> import org.apache.lucene.store.FSDirectory;
>>>>> import org.junit.After;
>>>>> import org.junit.Before;
>>>>> import org.junit.Test;
>>>>>
>>>>> import com.amin.app.lucene.util.WorkItem.IndexerType;
>>>>>
>>>>> public class SearchTest {
>>>>>
>>>>> private File rtfFile = null;
>>>>> private static final String RTF_FILE_NAME =
>>>>> "rtfDocumentToIndex.rtf";
>>>>>
>>>>> @Before
>>>>> public void setUp() throws Exception {
>>>>> InputStream inputStream =
>>>>> this.
>>>>> getClass().getClassLoader().getResourceAsStream(RTF_FILE_NAME);
>>>>> rtfFile = new File(RTF_FILE_NAME);
>>>>> convertInputStreamToFile(inputStream, rtfFile);
>>>>> }
>>>>>
>>>>>
>>>>>
>>>>> @Test
>>>>> public void testCanCreateLuceneDocumentForRTFDocument() throws
>>>>> Exception {
>>>>> JavaBuiltInRTFHandler builtInRTFHandler = new
>>>>> JavaBuiltInRTFHandler();
>>>>> Document document = builtInRTFHandler.getDocument(rtfFile);
>>>>> assertNotNull(document);
>>>>> String value = document.get(FieldNameEnum.BODY.getDescription());
>>>>> assertNotNull(value);
>>>>> assertNotSame("", value);
>>>>> assertTrue(value.contains("Amin Mohammed-Coleman"));
>>>>> assertTrue(value.contains("This is a test rtf document that will
>>>>> be indexed."));
>>>>> String path = document.get(FieldNameEnum.PATH.getDescription());
>>>>> assertNotNull(path);
>>>>> assertTrue(path.contains(".rtf"));
>>>>> String fileName =
>>>>> document.get(FieldNameEnum.NAME.getDescription());
>>>>> assertNotNull(fileName);
>>>>> assertEquals(RTF_FILE_NAME, fileName);
>>>>> assertEquals(WorkItem.IndexerType.RTF_INDEXER.name(),
>>>>> document.get(FieldNameEnum.TYPE.getDescription()));
>>>>>
>>>>> }
>>>>>
>>>>>
>>>>>
>>>>> @Test
>>>>> public void testCanSearchRtfDocument() throws Exception {
>>>>> JavaBuiltInRTFHandler builtInRTFHandler = new
>>>>> JavaBuiltInRTFHandler();
>>>>> Document document = builtInRTFHandler.getDocument(rtfFile);
>>>>> IndexWriter indexWriter = new
>>>>> IndexWriter(getDirectory(),getAnalyzer(),new
>>>>> IndexWriter.MaxFieldLength(2));
>>>>> try {
>>>>> indexWriter.addDocument(document);
>>>>> commitAndCloseWriter(indexWriter);
>>>>> } catch (CorruptIndexException e) {
>>>>> throw new IllegalStateException(e);
>>>>> } catch (IOException e) {
>>>>> throw new IllegalStateException(e);
>>>>> }
>>>>>
>>>>> //I plan to use other searchers later
>>>>> IndexSearcher indexSearcher = new IndexSearcher(getDirectory());
>>>>> MultiSearcher multiSearcher = new MultiSearcher(new Searchable[]
>>>>> {indexSearcher});
>>>>> QueryParser queryParser = new MultiFieldQueryParser(new String[]
>>>>> {FieldNameEnum.BODY.getDescription()}, new StandardAnalyzer());
>>>>> Query query = queryParser.parse("amin");
>>>>> TopDocs topDocs = multiSearcher.search(query,
>>>>> BooleanQuery.getMaxClauseCount());
>>>>> assertNotNull(topDocs);
>>>>> assertEquals(1, topDocs.totalHits);
>>>>> ScoreDoc[] scoreDocs = topDocs.scoreDocs;
>>>>> for (ScoreDoc scoreDoc : scoreDocs) {
>>>>> Document documentFromSearch = indexSearcher.doc(scoreDoc.doc);
>>>>> assertNotNull(documentFromSearch);
>>>>> String bodyText =
>>>>> documentFromSearch.get(FieldNameEnum.BODY.getDescription());
>>>>> assertNotNull(bodyText);
>>>>> assertNotSame("", bodyText);
>>>>> assertTrue(bodyText.contains("Amin Mohammed-Coleman"));
>>>>> assertTrue(bodyText.contains("This is a test rtf document that
>>>>> will be indexed."));
>>>>>
>>>>> }
>>>>> multiSearcher.close();
>>>>>
>>>>> }
>>>>>
>>>>> @After
>>>>> public void tearDown() throws Exception {
>>>>> rtfFile.delete();
>>>>> if (getDirectory().list() != null &&
>>>>> getDirectory().list().length > 0) {
>>>>> IndexReader reader = IndexReader.open(getDirectory());
>>>>> for(int i = 0; i < reader.maxDoc();i++) {
>>>>> reader.deleteDocument(i);
>>>>> }
>>>>> reader.close();
>>>>> }
>>>>> }
>>>>>
>>>>> private void commitAndCloseWriter(IndexWriter indexWriter)
>>>>> throws CorruptIndexException,IOException {
>>>>> indexWriter.commit();
>>>>> indexWriter.close();
>>>>> }
>>>>>
>>>>>
>>>>> public Directory getDirectory() throws IOException {
>>>>> return FSDirectory.getDirectory("/tmp/lucene/rtf");
>>>>> }
>>>>>
>>>>> public Analyzer getAnalyzer() {
>>>>> return new StandardAnalyzer();
>>>>> }
>>>>> private static void convertInputStreamToFile(InputStream
>>>>> inputStream, File file) {
>>>>> try
>>>>> {
>>>>> OutputStream out=new FileOutputStream(file);
>>>>> byte buf[]=new byte[1024];
>>>>> int len;
>>>>> while((len=inputStream.read(buf))>0)
>>>>> out.write(buf,0,len);
>>>>> out.close();
>>>>> inputStream.close();
>>>>>
>>>>> }catch (IOException e){
>>>>> throw new IllegalStateException(e);
>>>>> }
>>>>> }
>>>>> private static class JavaBuiltInRTFHandler implements
>>>>> DocumentHandler{
>>>>>
>>>>> public Document getDocument(File file) throws
>>>>> DocumentHandlerException {
>>>>> String bodyText = null;
>>>>> DefaultStyledDocument styledDoc = new DefaultStyledDocument();
>>>>> try {
>>>>> InputStream inputStream = new FileInputStream(file);
>>>>> new RTFEditorKit().read(inputStream, styledDoc, 0);
>>>>> bodyText = styledDoc.getText(0, styledDoc.getLength());
>>>>> } catch (IOException ioex) {
>>>>> throw new IllegalStateException(ioex);
>>>>> } catch (BadLocationException e) {
>>>>> throw new IllegalArgumentException(e);
>>>>> }
>>>>> //create Document object using body
>>>>> if (bodyText != null) {
>>>>> Document document = new Document();
>>>>> String trimmedBodyText = StringUtils.trimToEmpty(bodyText);
>>>>> trimmedBodyText = trimmedBodyText.replaceAll("\n", "");
>>>>> Field field = new
>>>>> Field(FieldNameEnum.BODY.getDescription(),trimmedBodyText,
>>>>> Field.Store.YES, Field.Index.ANALYZED);
>>>>> document.add(field);
>>>>>
>>>>> String pathToFile = file.getPath();
>>>>> Field pathToFileField = new
>>>>> Field(FieldNameEnum.PATH.getDescription(),pathToFile,
>>>>> Field.Store.YES, Field.Index.NOT_ANALYZED);
>>>>> document.add(pathToFileField);
>>>>>
>>>>> String fileName = file.getName();
>>>>> Field fileNameField = new
>>>>> Field(FieldNameEnum.NAME.getDescription(),fileName,
>>>>> Field.Store.YES, Field.Index.NOT_ANALYZED);
>>>>> document.add(fileNameField);
>>>>>
>>>>> Field typeField = new
>>>>> Field(
>>>>> FieldNameEnum.
>>>>> TYPE.getDescription(),IndexerType.RTF_INDEXER.name(),
>>>>> Field.Store.YES, Field.Index.NOT_ANALYZED);
>>>>> document.add(typeField);
>>>>>
>>>>> String summary = bodyText.substring(0, 10);
>>>>>
>>>>> Field summaryField = new
>>>>> Field(FieldNameEnum.SUMMARY.getDescription(),summary,
>>>>> Field.Store.YES, Field.Index.NOT_ANALYZED);
>>>>> document.add(summaryField);
>>>>>
>>>>> return document;
>>>>> }
>>>>> return null;
>>>>> }
>>>>> }
>>>>>
>>>>> private static class WorkItem {
>>>>>
>>>>> public enum WorkItemEvent {
>>>>> ADD,
>>>>> UPDATE,
>>>>> DELETE;
>>>>> }
>>>>>
>>>>> public enum IndexerType {
>>>>> RTF_INDEXER,
>>>>> PDF_INDEXER,
>>>>> XML_INDEXER,
>>>>> PLAIN_TEXT_INDEXER,
>>>>> MS_WORD_INDEXER,
>>>>> MS_EXCEL_INDEXER,
>>>>> MS_POWERPOINT_INDEXER;
>>>>> }
>>>>>
>>>>>
>>>>> private final Document workLoad;
>>>>>
>>>>> private final WorkItemEvent workItemEvent;
>>>>>
>>>>> private final IndexerType indexerType;
>>>>>
>>>>>
>>>>> public WorkItem(final Document workLoad, final WorkItemEvent
>>>>> workItemEvent) {
>>>>> this.workLoad = workLoad;
>>>>> this.workItemEvent = workItemEvent;
>>>>> String type = this.workLoad.get("type");
>>>>> this.indexerType = IndexerType.valueOf(type);
>>>>> }
>>>>>
>>>>> public IndexerType getIndexerType() {
>>>>> return indexerType;
>>>>> }
>>>>>
>>>>> public Document getWorkLoad() {
>>>>> return workLoad;
>>>>> }
>>>>>
>>>>> public WorkItemEvent getWorkItemEvent() {
>>>>> return workItemEvent;
>>>>> }
>>>>> }
>>>>>
>>>>> private enum FieldNameEnum {
>>>>>
>>>>> AUTHOR("author"),
>>>>> BODY("body"),
>>>>> TITLE("title"),
>>>>> SUBJECT("subject"),
>>>>> KEYWORDS("keywords"),
>>>>> PATH("path"), NAME ("name"),
>>>>> TYPE("type"),
>>>>> ID ("id"),
>>>>> SUMMARY ("summary");
>>>>>
>>>>> private final String description;
>>>>>
>>>>> private FieldNameEnum(final String description) {
>>>>> this.description = description;
>>>>> }
>>>>>
>>>>> public String getDescription() {
>>>>> return this.description;
>>>>> }
>>>>> }
>>>>> }
>>>>
>>>> --------------------------
>>>> Grant Ingersoll
>>>>
>>>> Lucene Helpful Hints:
>>>> http://wiki.apache.org/lucene-java/BasicsOfPerformance
>>>> http://wiki.apache.org/lucene-java/LuceneFAQ
>>>>
>>>>
>>>>
>>>>
>>>>
>>>>
>>>>
>>>>
>>>>
>>>>
>>>
>>> --------------------------
>>> Grant Ingersoll
>>>
>>> Lucene Helpful Hints:
>>> http://wiki.apache.org/lucene-java/BasicsOfPerformance
>>> http://wiki.apache.org/lucene-java/LuceneFAQ
>>>
>>>
>>>
>>>
>>>
>>>
>>>
>>>
>>>
>>>
>>
>>
>> ---------------------------------------------------------------------
>> To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
>> For additional commands, e-mail: java-user-help@lucene.apache.org
>>
>
> --------------------------
> Grant Ingersoll
>
> Lucene Helpful Hints:
> http://wiki.apache.org/lucene-java/BasicsOfPerformance
> http://wiki.apache.org/lucene-java/LuceneFAQ
>
>
>
>
>
>
>
>
>
>
>
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
> For additional commands, e-mail: java-user-help@lucene.apache.org
>
---------------------------------------------------------------------
To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-user-help@lucene.apache.org
Re: Search Test file
Posted by Grant Ingersoll <gs...@apache.org>.
On Jan 4, 2009, at 2:49 AM, Amin Mohammed-Coleman wrote:
> Hi Grant
>
> Thank you for looking at the test case. I have updated the
> IndexWriter to use UNLIMITED for MaxFieldLength. I tried using
> Integer.MAX_VALUE for
>
>>> Also,
>>> TopDocs topDocs = multiSearcher.search(query,
>>> BooleanQuery.getMaxClauseCount());
>>>
>>> strikes me as really odd. Why are you passing in the max clause
>>> count as the number of results you want returned?
>
>
Just pass in something like "10".
> However I get the following exception :
>
> java.lang.NegativeArraySizeException
> at org.apache.lucene.util.PriorityQueue.initialize(PriorityQueue.java:41)
> at org.apache.lucene.search.HitQueue.<init>(HitQueue.java:24)
> at org.apache.lucene.search.MultiSearcher.search(MultiSearcher.java:200)
> at org.apache.lucene.search.Searcher.search(Searcher.java:136)
> at org.apache.lucene.search.Searcher.search(Searcher.java:146)
> at com.amin.app.lucene.search.impl.SearchTest.testCanSearchRtfDocument(SearchTest.java:101)
> at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
> at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:39)
> at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:25)
> at java.lang.reflect.Method.invoke(Method.java:597)
> at org.junit.internal.runners.TestMethod.invoke(TestMethod.java:59)
> at org.junit.internal.runners.MethodRoadie.runTestMethod(MethodRoadie.java:98)
> at org.junit.internal.runners.MethodRoadie$2.run(MethodRoadie.java:79)
> at org.junit.internal.runners.MethodRoadie.runBeforesThenTestThenAfters(MethodRoadie.java:87)
> at org.junit.internal.runners.MethodRoadie.runTest(MethodRoadie.java:77)
> at org.junit.internal.runners.MethodRoadie.run(MethodRoadie.java:42)
> at org.junit.internal.runners.JUnit4ClassRunner.invokeTestMethod(JUnit4ClassRunner.java:88)
> at org.junit.internal.runners.JUnit4ClassRunner.runMethods(JUnit4ClassRunner.java:51)
> at org.junit.internal.runners.JUnit4ClassRunner$1.run(JUnit4ClassRunner.java:44)
> at org.junit.internal.runners.ClassRoadie.runUnprotected(ClassRoadie.java:27)
> at org.junit.internal.runners.ClassRoadie.runProtected(ClassRoadie.java:37)
> at org.junit.internal.runners.JUnit4ClassRunner.run(JUnit4ClassRunner.java:42)
> at org.eclipse.jdt.internal.junit4.runner.JUnit4TestReference.run(JUnit4TestReference.java:45)
> at org.eclipse.jdt.internal.junit.runner.TestExecution.run(TestExecution.java:38)
> at org.eclipse.jdt.internal.junit.runner.RemoteTestRunner.runTests(RemoteTestRunner.java:460)
> at org.eclipse.jdt.internal.junit.runner.RemoteTestRunner.runTests(RemoteTestRunner.java:673)
> at org.eclipse.jdt.internal.junit.runner.RemoteTestRunner.run(RemoteTestRunner.java:386)
> at org.eclipse.jdt.internal.junit.runner.RemoteTestRunner.main(RemoteTestRunner.java:196)
>
>
> I know that this is an issue (not being able to use
> Integer.MAX_VALUE). I tried using 100 and my test still doesn't pass.
>
>
> Cheers
> Amin
>
>
> On 4 Jan 2009, at 02:23, Grant Ingersoll wrote:
>
>>
>>
>> Begin forwarded message:
>>
>>> From: Grant Ingersoll <gs...@apache.org>
>>> Date: January 3, 2009 8:19:14 PM EST
>>> To: java-dev@lucene.apache.org
>>> Subject: Fwd: Search Test file
>>> Reply-To: java-dev@lucene.apache.org
>>>
>>> Hi Amin,
>>>
>>> I see a couple of issues with your program below, and one that is
>>> the cause of the problem of not finding "amin" as a query term.
>>>
>>> When you construct your IndexWriter, you are doing:
>>>> IndexWriter indexWriter = new
>>>> IndexWriter(getDirectory(),getAnalyzer(),new
>>>> IndexWriter.MaxFieldLength(2));
>>>
>>> The MaxFieldLength parameter specifies the maximum number of
>>> tokens allowed in a Field. Everything else after that is
>>> dropped. See http://lucene.apache.org/java/2_4_0/api/core/org/apache/lucene/index/IndexWriter.html#IndexWriter(org.apache.lucene.store.Directory,%20org.apache.lucene.analysis.Analyzer,%20org.apache.lucene.index.IndexWriter.MaxFieldLength)
>>> and http://lucene.apache.org/java/2_4_0/api/core/org/apache/lucene/index/IndexWriter.MaxFieldLength.html
>>>
>>> Also,
>>> TopDocs topDocs = multiSearcher.search(query,
>>> BooleanQuery.getMaxClauseCount());
>>>
>>> strikes me as really odd. Why are you passing in the max clause
>>> count as the number of results you want returned?
>>>
>>> Cheers,
>>> Grant
>>>
>>>
>>>
>>> Begin forwarded message:
>>>
>>>> From: "aminmc@gmail.com" <am...@gmail.com>
>>>> Date: January 3, 2009 3:24:52 PM EST
>>>> To: gsingers@apache.org
>>>> Subject: Search Test file
>>>>
>>>> I've shared a document with you called "Search Test file":
>>>> http://docs.google.com/Doc?id=d77xf5q_0n6hb38fx&invite=cjq79zj
>>>>
>>>> It's not an attachment -- it's stored online at Google Docs. To
>>>> open this document, just click the link above.
>>>> ---
>>>>
>>>> Hi
>>>>
>>>> I have uploaded the test file at google docs. It is currently a
>>>> txt file but if you change the extension to .java it should work.
>>>>
>>>> package com.amin.app.lucene.search.impl;
>>>>
>>>> import static org.junit.Assert.assertEquals;
>>>> import static org.junit.Assert.assertNotNull;
>>>> import static org.junit.Assert.assertNotSame;
>>>> import static org.junit.Assert.assertTrue;
>>>>
>>>> import java.io.File;
>>>> import java.io.FileInputStream;
>>>> import java.io.FileOutputStream;
>>>> import java.io.IOException;
>>>> import java.io.InputStream;
>>>> import java.io.OutputStream;
>>>>
>>>> import javax.swing.text.BadLocationException;
>>>> import javax.swing.text.DefaultStyledDocument;
>>>> import javax.swing.text.rtf.RTFEditorKit;
>>>>
>>>> import org.apache.commons.lang.StringUtils;
>>>> import org.apache.lucene.analysis.Analyzer;
>>>> import org.apache.lucene.analysis.standard.StandardAnalyzer;
>>>> import org.apache.lucene.ant.DocumentHandler;
>>>> import org.apache.lucene.ant.DocumentHandlerException;
>>>> import org.apache.lucene.document.Document;
>>>> import org.apache.lucene.document.Field;
>>>> import org.apache.lucene.index.CorruptIndexException;
>>>> import org.apache.lucene.index.IndexReader;
>>>> import org.apache.lucene.index.IndexWriter;
>>>> import org.apache.lucene.queryParser.MultiFieldQueryParser;
>>>> import org.apache.lucene.queryParser.QueryParser;
>>>> import org.apache.lucene.search.BooleanQuery;
>>>> import org.apache.lucene.search.IndexSearcher;
>>>> import org.apache.lucene.search.MultiSearcher;
>>>> import org.apache.lucene.search.Query;
>>>> import org.apache.lucene.search.ScoreDoc;
>>>> import org.apache.lucene.search.Searchable;
>>>> import org.apache.lucene.search.TopDocs;
>>>> import org.apache.lucene.store.Directory;
>>>> import org.apache.lucene.store.FSDirectory;
>>>> import org.junit.After;
>>>> import org.junit.Before;
>>>> import org.junit.Test;
>>>>
>>>> import com.amin.app.lucene.util.WorkItem.IndexerType;
>>>>
>>>> public class SearchTest {
>>>>
>>>> private File rtfFile = null;
>>>> private static final String RTF_FILE_NAME =
>>>> "rtfDocumentToIndex.rtf";
>>>>
>>>> @Before
>>>> public void setUp() throws Exception {
>>>> InputStream inputStream =
>>>> this
>>>> .getClass().getClassLoader().getResourceAsStream(RTF_FILE_NAME);
>>>> rtfFile = new File(RTF_FILE_NAME);
>>>> convertInputStreamToFile(inputStream, rtfFile);
>>>> }
>>>>
>>>>
>>>>
>>>> @Test
>>>> public void testCanCreateLuceneDocumentForRTFDocument() throws
>>>> Exception {
>>>> JavaBuiltInRTFHandler builtInRTFHandler = new
>>>> JavaBuiltInRTFHandler();
>>>> Document document = builtInRTFHandler.getDocument(rtfFile);
>>>> assertNotNull(document);
>>>> String value = document.get(FieldNameEnum.BODY.getDescription());
>>>> assertNotNull(value);
>>>> assertNotSame("", value);
>>>> assertTrue(value.contains("Amin Mohammed-Coleman"));
>>>> assertTrue(value.contains("This is a test rtf document that will
>>>> be indexed."));
>>>> String path = document.get(FieldNameEnum.PATH.getDescription());
>>>> assertNotNull(path);
>>>> assertTrue(path.contains(".rtf"));
>>>> String fileName =
>>>> document.get(FieldNameEnum.NAME.getDescription());
>>>> assertNotNull(fileName);
>>>> assertEquals(RTF_FILE_NAME, fileName);
>>>> assertEquals(WorkItem.IndexerType.RTF_INDEXER.name(),
>>>> document.get(FieldNameEnum.TYPE.getDescription()));
>>>>
>>>> }
>>>>
>>>>
>>>>
>>>> @Test
>>>> public void testCanSearchRtfDocument() throws Exception {
>>>> JavaBuiltInRTFHandler builtInRTFHandler = new
>>>> JavaBuiltInRTFHandler();
>>>> Document document = builtInRTFHandler.getDocument(rtfFile);
>>>> IndexWriter indexWriter = new
>>>> IndexWriter(getDirectory(),getAnalyzer(),new
>>>> IndexWriter.MaxFieldLength(2));
>>>> try {
>>>> indexWriter.addDocument(document);
>>>> commitAndCloseWriter(indexWriter);
>>>> } catch (CorruptIndexException e) {
>>>> throw new IllegalStateException(e);
>>>> } catch (IOException e) {
>>>> throw new IllegalStateException(e);
>>>> }
>>>>
>>>> //I plan to use other searchers later
>>>> IndexSearcher indexSearcher = new IndexSearcher(getDirectory());
>>>> MultiSearcher multiSearcher = new MultiSearcher(new Searchable[]
>>>> {indexSearcher});
>>>> QueryParser queryParser = new MultiFieldQueryParser(new String[]
>>>> {FieldNameEnum.BODY.getDescription()}, new StandardAnalyzer());
>>>> Query query = queryParser.parse("amin");
>>>> TopDocs topDocs = multiSearcher.search(query,
>>>> BooleanQuery.getMaxClauseCount());
>>>> assertNotNull(topDocs);
>>>> assertEquals(1, topDocs.totalHits);
>>>> ScoreDoc[] scoreDocs = topDocs.scoreDocs;
>>>> for (ScoreDoc scoreDoc : scoreDocs) {
>>>> Document documentFromSearch = indexSearcher.doc(scoreDoc.doc);
>>>> assertNotNull(documentFromSearch);
>>>> String bodyText =
>>>> documentFromSearch.get(FieldNameEnum.BODY.getDescription());
>>>> assertNotNull(bodyText);
>>>> assertNotSame("", bodyText);
>>>> assertTrue(bodyText.contains("Amin Mohammed-Coleman"));
>>>> assertTrue(bodyText.contains("This is a test rtf document that
>>>> will be indexed."));
>>>>
>>>> }
>>>> multiSearcher.close();
>>>>
>>>> }
>>>>
>>>> @After
>>>> public void tearDown() throws Exception {
>>>> rtfFile.delete();
>>>> if (getDirectory().list() != null && getDirectory().list().length
>>>> > 0) {
>>>> IndexReader reader = IndexReader.open(getDirectory());
>>>> for(int i = 0; i < reader.maxDoc();i++) {
>>>> reader.deleteDocument(i);
>>>> }
>>>> reader.close();
>>>> }
>>>> }
>>>>
>>>> private void commitAndCloseWriter(IndexWriter indexWriter) throws
>>>> CorruptIndexException,IOException {
>>>> indexWriter.commit();
>>>> indexWriter.close();
>>>> }
>>>>
>>>>
>>>> public Directory getDirectory() throws IOException {
>>>> return FSDirectory.getDirectory("/tmp/lucene/rtf");
>>>> }
>>>>
>>>> public Analyzer getAnalyzer() {
>>>> return new StandardAnalyzer();
>>>> }
>>>> private static void convertInputStreamToFile(InputStream
>>>> inputStream, File file) {
>>>> try
>>>> {
>>>> OutputStream out=new FileOutputStream(file);
>>>> byte buf[]=new byte[1024];
>>>> int len;
>>>> while((len=inputStream.read(buf))>0)
>>>> out.write(buf,0,len);
>>>> out.close();
>>>> inputStream.close();
>>>>
>>>> }catch (IOException e){
>>>> throw new IllegalStateException(e);
>>>> }
>>>> }
>>>> private static class JavaBuiltInRTFHandler implements
>>>> DocumentHandler{
>>>>
>>>> public Document getDocument(File file) throws
>>>> DocumentHandlerException {
>>>> String bodyText = null;
>>>> DefaultStyledDocument styledDoc = new DefaultStyledDocument();
>>>> try {
>>>> InputStream inputStream = new FileInputStream(file);
>>>> new RTFEditorKit().read(inputStream, styledDoc, 0);
>>>> bodyText = styledDoc.getText(0, styledDoc.getLength());
>>>> } catch (IOException ioex) {
>>>> throw new IllegalStateException(ioex);
>>>> } catch (BadLocationException e) {
>>>> throw new IllegalArgumentException(e);
>>>> }
>>>> //create Document object using body
>>>> if (bodyText != null) {
>>>> Document document = new Document();
>>>> String trimmedBodyText = StringUtils.trimToEmpty(bodyText);
>>>> trimmedBodyText = trimmedBodyText.replaceAll("\n", "");
>>>> Field field = new
>>>> Field(FieldNameEnum.BODY.getDescription(),trimmedBodyText,
>>>> Field.Store.YES, Field.Index.ANALYZED);
>>>> document.add(field);
>>>>
>>>> String pathToFile = file.getPath();
>>>> Field pathToFileField = new
>>>> Field(FieldNameEnum.PATH.getDescription(),pathToFile,
>>>> Field.Store.YES, Field.Index.NOT_ANALYZED);
>>>> document.add(pathToFileField);
>>>>
>>>> String fileName = file.getName();
>>>> Field fileNameField = new
>>>> Field(FieldNameEnum.NAME.getDescription(),fileName,
>>>> Field.Store.YES, Field.Index.NOT_ANALYZED);
>>>> document.add(fileNameField);
>>>>
>>>> Field typeField = new
>>>> Field
>>>> (FieldNameEnum
>>>> .TYPE.getDescription(),IndexerType.RTF_INDEXER.name(),
>>>> Field.Store.YES, Field.Index.NOT_ANALYZED);
>>>> document.add(typeField);
>>>>
>>>> String summary = bodyText.substring(0, 10);
>>>>
>>>> Field summaryField = new
>>>> Field(FieldNameEnum.SUMMARY.getDescription(),summary,
>>>> Field.Store.YES, Field.Index.NOT_ANALYZED);
>>>> document.add(summaryField);
>>>>
>>>> return document;
>>>> }
>>>> return null;
>>>> }
>>>> }
>>>>
>>>> private static class WorkItem {
>>>>
>>>> public enum WorkItemEvent {
>>>> ADD,
>>>> UPDATE,
>>>> DELETE;
>>>> }
>>>>
>>>> public enum IndexerType {
>>>> RTF_INDEXER,
>>>> PDF_INDEXER,
>>>> XML_INDEXER,
>>>> PLAIN_TEXT_INDEXER,
>>>> MS_WORD_INDEXER,
>>>> MS_EXCEL_INDEXER,
>>>> MS_POWERPOINT_INDEXER;
>>>> }
>>>>
>>>>
>>>> private final Document workLoad;
>>>>
>>>> private final WorkItemEvent workItemEvent;
>>>>
>>>> private final IndexerType indexerType;
>>>>
>>>>
>>>> public WorkItem(final Document workLoad, final WorkItemEvent
>>>> workItemEvent) {
>>>> this.workLoad = workLoad;
>>>> this.workItemEvent = workItemEvent;
>>>> String type = this.workLoad.get("type");
>>>> this.indexerType = IndexerType.valueOf(type);
>>>> }
>>>>
>>>> public IndexerType getIndexerType() {
>>>> return indexerType;
>>>> }
>>>>
>>>> public Document getWorkLoad() {
>>>> return workLoad;
>>>> }
>>>>
>>>> public WorkItemEvent getWorkItemEvent() {
>>>> return workItemEvent;
>>>> }
>>>> }
>>>>
>>>> private enum FieldNameEnum {
>>>>
>>>> AUTHOR("author"),
>>>> BODY("body"),
>>>> TITLE("title"),
>>>> SUBJECT("subject"),
>>>> KEYWORDS("keywords"),
>>>> PATH("path"), NAME ("name"),
>>>> TYPE("type"),
>>>> ID ("id"),
>>>> SUMMARY ("summary");
>>>>
>>>> private final String description;
>>>>
>>>> private FieldNameEnum(final String description) {
>>>> this.description = description;
>>>> }
>>>>
>>>> public String getDescription() {
>>>> return this.description;
>>>> }
>>>> }
>>>> }
>>>
>>> --------------------------
>>> Grant Ingersoll
>>>
>>> Lucene Helpful Hints:
>>> http://wiki.apache.org/lucene-java/BasicsOfPerformance
>>> http://wiki.apache.org/lucene-java/LuceneFAQ
>>>
>>>
>>>
>>>
>>>
>>>
>>>
>>>
>>>
>>>
>>
>> --------------------------
>> Grant Ingersoll
>>
>> Lucene Helpful Hints:
>> http://wiki.apache.org/lucene-java/BasicsOfPerformance
>> http://wiki.apache.org/lucene-java/LuceneFAQ
>>
>>
>>
>>
>>
>>
>>
>>
>>
>>
>
>
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
> For additional commands, e-mail: java-user-help@lucene.apache.org
>
--------------------------
Grant Ingersoll
Lucene Helpful Hints:
http://wiki.apache.org/lucene-java/BasicsOfPerformance
http://wiki.apache.org/lucene-java/LuceneFAQ
---------------------------------------------------------------------
To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-user-help@lucene.apache.org
Re: Search Test file
Posted by Amin Mohammed-Coleman <am...@gmail.com>.
Hi Grant
Thank you for looking at the test case. I have updated the
IndexWriter to use UNLIMITED for MaxFieldLength. I tried using
Integer.MAX_VALUE for the number of results in the search call you
asked about:
>> Also,
>> TopDocs topDocs = multiSearcher.search(query,
>> BooleanQuery.getMaxClauseCount());
>>
>> strikes me as really odd. Why are you passing in the max clause
>> count as the number of results you want returned?
However I get the following exception :
java.lang.NegativeArraySizeException
at org.apache.lucene.util.PriorityQueue.initialize(PriorityQueue.java:41)
at org.apache.lucene.search.HitQueue.<init>(HitQueue.java:24)
at org.apache.lucene.search.MultiSearcher.search(MultiSearcher.java:200)
at org.apache.lucene.search.Searcher.search(Searcher.java:136)
at org.apache.lucene.search.Searcher.search(Searcher.java:146)
at com.amin.app.lucene.search.impl.SearchTest.testCanSearchRtfDocument(SearchTest.java:101)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:39)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:25)
at java.lang.reflect.Method.invoke(Method.java:597)
at org.junit.internal.runners.TestMethod.invoke(TestMethod.java:59)
at org.junit.internal.runners.MethodRoadie.runTestMethod(MethodRoadie.java:98)
at org.junit.internal.runners.MethodRoadie$2.run(MethodRoadie.java:79)
at org.junit.internal.runners.MethodRoadie.runBeforesThenTestThenAfters(MethodRoadie.java:87)
at org.junit.internal.runners.MethodRoadie.runTest(MethodRoadie.java:77)
at org.junit.internal.runners.MethodRoadie.run(MethodRoadie.java:42)
at org.junit.internal.runners.JUnit4ClassRunner.invokeTestMethod(JUnit4ClassRunner.java:88)
at org.junit.internal.runners.JUnit4ClassRunner.runMethods(JUnit4ClassRunner.java:51)
at org.junit.internal.runners.JUnit4ClassRunner$1.run(JUnit4ClassRunner.java:44)
at org.junit.internal.runners.ClassRoadie.runUnprotected(ClassRoadie.java:27)
at org.junit.internal.runners.ClassRoadie.runProtected(ClassRoadie.java:37)
at org.junit.internal.runners.JUnit4ClassRunner.run(JUnit4ClassRunner.java:42)
at org.eclipse.jdt.internal.junit4.runner.JUnit4TestReference.run(JUnit4TestReference.java:45)
at org.eclipse.jdt.internal.junit.runner.TestExecution.run(TestExecution.java:38)
at org.eclipse.jdt.internal.junit.runner.RemoteTestRunner.runTests(RemoteTestRunner.java:460)
at org.eclipse.jdt.internal.junit.runner.RemoteTestRunner.runTests(RemoteTestRunner.java:673)
at org.eclipse.jdt.internal.junit.runner.RemoteTestRunner.run(RemoteTestRunner.java:386)
at org.eclipse.jdt.internal.junit.runner.RemoteTestRunner.main(RemoteTestRunner.java:196)
I know that this is an issue (not being able to use
Integer.MAX_VALUE). I tried using 100 and my test still doesn't pass.
Cheers
Amin
On 4 Jan 2009, at 02:23, Grant Ingersoll wrote:
>
>
> Begin forwarded message:
>
>> From: Grant Ingersoll <gs...@apache.org>
>> Date: January 3, 2009 8:19:14 PM EST
>> To: java-dev@lucene.apache.org
>> Subject: Fwd: Search Test file
>> Reply-To: java-dev@lucene.apache.org
>>
>> Hi Amin,
>>
>> I see a couple of issues with your program below, and one that is
>> the cause of the problem of not finding "amin" as a query term.
>>
>> When you construct your IndexWriter, you are doing:
>>> IndexWriter indexWriter = new
>>> IndexWriter(getDirectory(),getAnalyzer(),new
>>> IndexWriter.MaxFieldLength(2));
>>
>> The MaxFieldLength parameter specifies the maximum number of tokens
>> allowed in a Field. Everything else after that is dropped. See http://lucene.apache.org/java/2_4_0/api/core/org/apache/lucene/index/IndexWriter.html#IndexWriter(org.apache.lucene.store.Directory,%20org.apache.lucene.analysis.Analyzer,%20org.apache.lucene.index.IndexWriter.MaxFieldLength)
>> and http://lucene.apache.org/java/2_4_0/api/core/org/apache/lucene/index/IndexWriter.MaxFieldLength.html
>>
>> Also,
>> TopDocs topDocs = multiSearcher.search(query,
>> BooleanQuery.getMaxClauseCount());
>>
>> strikes me as really odd. Why are you passing in the max clause
>> count as the number of results you want returned?
>>
>> Cheers,
>> Grant
>>
>>
>>
>> Begin forwarded message:
>>
>>> From: "aminmc@gmail.com" <am...@gmail.com>
>>> Date: January 3, 2009 3:24:52 PM EST
>>> To: gsingers@apache.org
>>> Subject: Search Test file
>>>
>>> I've shared a document with you called "Search Test file":
>>> http://docs.google.com/Doc?id=d77xf5q_0n6hb38fx&invite=cjq79zj
>>>
>>> It's not an attachment -- it's stored online at Google Docs. To
>>> open this document, just click the link above.
>>> ---
>>>
>>> Hi
>>>
>>> I have uploaded the test file at google docs. It is currently a
>>> txt file but if you change the extension to .java it should work.
>>>
>>> package com.amin.app.lucene.search.impl;
>>>
>>> import static org.junit.Assert.assertEquals;
>>> import static org.junit.Assert.assertNotNull;
>>> import static org.junit.Assert.assertNotSame;
>>> import static org.junit.Assert.assertTrue;
>>>
>>> import java.io.File;
>>> import java.io.FileInputStream;
>>> import java.io.FileOutputStream;
>>> import java.io.IOException;
>>> import java.io.InputStream;
>>> import java.io.OutputStream;
>>>
>>> import javax.swing.text.BadLocationException;
>>> import javax.swing.text.DefaultStyledDocument;
>>> import javax.swing.text.rtf.RTFEditorKit;
>>>
>>> import org.apache.commons.lang.StringUtils;
>>> import org.apache.lucene.analysis.Analyzer;
>>> import org.apache.lucene.analysis.standard.StandardAnalyzer;
>>> import org.apache.lucene.ant.DocumentHandler;
>>> import org.apache.lucene.ant.DocumentHandlerException;
>>> import org.apache.lucene.document.Document;
>>> import org.apache.lucene.document.Field;
>>> import org.apache.lucene.index.CorruptIndexException;
>>> import org.apache.lucene.index.IndexReader;
>>> import org.apache.lucene.index.IndexWriter;
>>> import org.apache.lucene.queryParser.MultiFieldQueryParser;
>>> import org.apache.lucene.queryParser.QueryParser;
>>> import org.apache.lucene.search.BooleanQuery;
>>> import org.apache.lucene.search.IndexSearcher;
>>> import org.apache.lucene.search.MultiSearcher;
>>> import org.apache.lucene.search.Query;
>>> import org.apache.lucene.search.ScoreDoc;
>>> import org.apache.lucene.search.Searchable;
>>> import org.apache.lucene.search.TopDocs;
>>> import org.apache.lucene.store.Directory;
>>> import org.apache.lucene.store.FSDirectory;
>>> import org.junit.After;
>>> import org.junit.Before;
>>> import org.junit.Test;
>>>
>>> import com.amin.app.lucene.util.WorkItem.IndexerType;
>>>
>>> public class SearchTest {
>>>
>>> private File rtfFile = null;
>>> private static final String RTF_FILE_NAME =
>>> "rtfDocumentToIndex.rtf";
>>>
>>> @Before
>>> public void setUp() throws Exception {
>>> InputStream inputStream =
>>> this.getClass().getClassLoader().getResourceAsStream(RTF_FILE_NAME);
>>> rtfFile = new File(RTF_FILE_NAME);
>>> convertInputStreamToFile(inputStream, rtfFile);
>>> }
>>>
>>>
>>>
>>> /**
>>>  * Checks that the RTF handler turns the fixture file into a document
>>>  * whose body, path, name and type fields hold the expected values.
>>>  *
>>>  * @throws Exception if the handler fails to read the fixture
>>>  */
>>> @Test
>>> public void testCanCreateLuceneDocumentForRTFDocument() throws Exception {
>>>     JavaBuiltInRTFHandler builtInRTFHandler = new JavaBuiltInRTFHandler();
>>>     Document document = builtInRTFHandler.getDocument(rtfFile);
>>>     assertNotNull(document);
>>>
>>>     String value = document.get(FieldNameEnum.BODY.getDescription());
>>>     assertNotNull(value);
>>>     // assertNotSame only compares references, so it cannot detect an
>>>     // empty body; check the length instead.
>>>     assertTrue(value.length() > 0);
>>>     assertTrue(value.contains("Amin Mohammed-Coleman"));
>>>     assertTrue(value.contains("This is a test rtf document that will be indexed."));
>>>
>>>     String path = document.get(FieldNameEnum.PATH.getDescription());
>>>     assertNotNull(path);
>>>     assertTrue(path.contains(".rtf"));
>>>
>>>     String fileName = document.get(FieldNameEnum.NAME.getDescription());
>>>     assertNotNull(fileName);
>>>     assertEquals(RTF_FILE_NAME, fileName);
>>>
>>>     assertEquals(WorkItem.IndexerType.RTF_INDEXER.name(),
>>>             document.get(FieldNameEnum.TYPE.getDescription()));
>>> }
>>>
>>>
>>>
>>> /**
>>>  * Indexes the RTF fixture, searches the body field for "amin" and checks
>>>  * that exactly one document comes back with the expected stored body.
>>>  *
>>>  * @throws Exception if indexing, query parsing or searching fails
>>>  */
>>> @Test
>>> public void testCanSearchRtfDocument() throws Exception {
>>>     JavaBuiltInRTFHandler builtInRTFHandler = new JavaBuiltInRTFHandler();
>>>     Document document = builtInRTFHandler.getDocument(rtfFile);
>>>     // MaxFieldLength is the number of tokens indexed per field; everything
>>>     // after the limit is dropped, so a tiny limit makes most of the body
>>>     // unsearchable. Index the whole field.
>>>     IndexWriter indexWriter = new IndexWriter(getDirectory(), getAnalyzer(),
>>>             IndexWriter.MaxFieldLength.UNLIMITED);
>>>     try {
>>>         indexWriter.addDocument(document);
>>>         commitAndCloseWriter(indexWriter);
>>>     } catch (IOException e) {
>>>         // CorruptIndexException is an IOException, so one handler covers both.
>>>         throw new IllegalStateException(e);
>>>     }
>>>
>>>     // Upper bound on returned hits; only one document is indexed here.
>>>     final int maxHits = 10;
>>>     final String bodyField = FieldNameEnum.BODY.getDescription();
>>>
>>>     //I plan to use other searchers later
>>>     IndexSearcher indexSearcher = new IndexSearcher(getDirectory());
>>>     MultiSearcher multiSearcher =
>>>             new MultiSearcher(new Searchable[] {indexSearcher});
>>>     try {
>>>         QueryParser queryParser =
>>>                 new MultiFieldQueryParser(new String[] {bodyField}, getAnalyzer());
>>>         Query query = queryParser.parse("amin");
>>>         TopDocs topDocs = multiSearcher.search(query, maxHits);
>>>         assertNotNull(topDocs);
>>>         assertEquals(1, topDocs.totalHits);
>>>
>>>         for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
>>>             // Doc ids in the hits belong to the MultiSearcher, so load the
>>>             // stored document through it rather than through a sub-searcher.
>>>             Document documentFromSearch = multiSearcher.doc(scoreDoc.doc);
>>>             assertNotNull(documentFromSearch);
>>>             String bodyText = documentFromSearch.get(bodyField);
>>>             assertNotNull(bodyText);
>>>             // assertNotSame compares references only; check the length.
>>>             assertTrue(bodyText.length() > 0);
>>>             assertTrue(bodyText.contains("Amin Mohammed-Coleman"));
>>>             assertTrue(bodyText.contains("This is a test rtf document that will be indexed."));
>>>         }
>>>     } finally {
>>>         // Release the searcher even when an assertion above fails.
>>>         multiSearcher.close();
>>>     }
>>> }
>>>
>>> @After
>>> public void tearDown() throws Exception {
>>> rtfFile.delete();
>>> if (getDirectory().list() != null && getDirectory().list().length
>>> > 0) {
>>> IndexReader reader = IndexReader.open(getDirectory());
>>> for(int i = 0; i < reader.maxDoc();i++) {
>>> reader.deleteDocument(i);
>>> }
>>> reader.close();
>>> }
>>> }
>>>
>>> /**
>>>  * Commits pending changes and closes the writer. The writer is closed
>>>  * even when the commit fails, so it is not left open after an error.
>>>  *
>>>  * @param indexWriter the writer to commit and close
>>>  * @throws CorruptIndexException if the index is corrupt
>>>  * @throws IOException if committing or closing fails
>>>  */
>>> private void commitAndCloseWriter(IndexWriter indexWriter)
>>>         throws CorruptIndexException, IOException {
>>>     try {
>>>         indexWriter.commit();
>>>     } finally {
>>>         indexWriter.close();
>>>     }
>>> }
>>>
>>>
>>> /**
>>>  * Returns the file-system directory that holds the test index.
>>>  *
>>>  * @return the directory obtained from {@code FSDirectory.getDirectory}
>>>  * @throws IOException if the directory cannot be obtained
>>>  */
>>> public Directory getDirectory() throws IOException {
>>>     final String indexPath = "/tmp/lucene/rtf";
>>>     return FSDirectory.getDirectory(indexPath);
>>> }
>>>
>>> /**
>>>  * Creates the analyzer used by the tests.
>>>  *
>>>  * @return a new {@code StandardAnalyzer}
>>>  */
>>> public Analyzer getAnalyzer() {
>>>     final Analyzer analyzer = new StandardAnalyzer();
>>>     return analyzer;
>>> }
>>> /**
>>>  * Copies everything readable from {@code inputStream} into {@code file}
>>>  * and closes both streams, whether or not the copy succeeds.
>>>  *
>>>  * @param inputStream source of the bytes; must not be null
>>>  * @param file destination file, created or overwritten
>>>  * @throws NullPointerException if {@code inputStream} is null
>>>  * @throws IllegalStateException if reading or writing fails
>>>  */
>>> private static void convertInputStreamToFile(InputStream inputStream,
>>>         File file) {
>>>     // Reject a null stream before the output file is created.
>>>     if (inputStream == null) {
>>>         throw new NullPointerException("inputStream must not be null");
>>>     }
>>>     try {
>>>         try {
>>>             OutputStream out = new FileOutputStream(file);
>>>             try {
>>>                 byte[] buf = new byte[1024];
>>>                 int len;
>>>                 // read() signals end of stream with -1; a 0-byte read
>>>                 // does not mean the stream is exhausted.
>>>                 while ((len = inputStream.read(buf)) != -1) {
>>>                     out.write(buf, 0, len);
>>>                 }
>>>             } finally {
>>>                 out.close();
>>>             }
>>>         } finally {
>>>             inputStream.close();
>>>         }
>>>     } catch (IOException e) {
>>>         throw new IllegalStateException(e);
>>>     }
>>> }
>>> private static class JavaBuiltInRTFHandler implements
>>> DocumentHandler{
>>>
>>> public Document getDocument(File file) throws
>>> DocumentHandlerException {
>>> String bodyText = null;
>>> DefaultStyledDocument styledDoc = new DefaultStyledDocument();
>>> try {
>>> InputStream inputStream = new FileInputStream(file);
>>> new RTFEditorKit().read(inputStream, styledDoc, 0);
>>> bodyText = styledDoc.getText(0, styledDoc.getLength());
>>> } catch (IOException ioex) {
>>> throw new IllegalStateException(ioex);
>>> } catch (BadLocationException e) {
>>> throw new IllegalArgumentException(e);
>>> }
>>> //create Document object using body
>>> if (bodyText != null) {
>>> Document document = new Document();
>>> String trimmedBodyText = StringUtils.trimToEmpty(bodyText);
>>> trimmedBodyText = trimmedBodyText.replaceAll("\n", "");
>>> Field field = new
>>> Field(FieldNameEnum.BODY.getDescription(),trimmedBodyText,
>>> Field.Store.YES, Field.Index.ANALYZED);
>>> document.add(field);
>>>
>>> String pathToFile = file.getPath();
>>> Field pathToFileField = new
>>> Field(FieldNameEnum.PATH.getDescription(),pathToFile,
>>> Field.Store.YES, Field.Index.NOT_ANALYZED);
>>> document.add(pathToFileField);
>>>
>>> String fileName = file.getName();
>>> Field fileNameField = new
>>> Field(FieldNameEnum.NAME.getDescription(),fileName,
>>> Field.Store.YES, Field.Index.NOT_ANALYZED);
>>> document.add(fileNameField);
>>>
>>> Field typeField = new
>>> Field
>>> (FieldNameEnum
>>> .TYPE.getDescription(),IndexerType.RTF_INDEXER.name(),
>>> Field.Store.YES, Field.Index.NOT_ANALYZED);
>>> document.add(typeField);
>>>
>>> String summary = bodyText.substring(0, 10);
>>>
>>> Field summaryField = new
>>> Field(FieldNameEnum.SUMMARY.getDescription(),summary,
>>> Field.Store.YES, Field.Index.NOT_ANALYZED);
>>> document.add(summaryField);
>>>
>>> return document;
>>> }
>>> return null;
>>> }
>>> }
>>>
>>> /**
>>>  * Immutable pairing of a Lucene document with the event to apply to it.
>>>  * The indexer type is read from the document's stored type field.
>>>  */
>>> private static class WorkItem {
>>>
>>>     /** Kind of change requested for the document. */
>>>     public enum WorkItemEvent {
>>>         ADD,
>>>         UPDATE,
>>>         DELETE;
>>>     }
>>>
>>>     /** Indexers a document can be associated with. */
>>>     public enum IndexerType {
>>>         RTF_INDEXER,
>>>         PDF_INDEXER,
>>>         XML_INDEXER,
>>>         PLAIN_TEXT_INDEXER,
>>>         MS_WORD_INDEXER,
>>>         MS_EXCEL_INDEXER,
>>>         MS_POWERPOINT_INDEXER;
>>>     }
>>>
>>>     private final Document workLoad;
>>>
>>>     private final WorkItemEvent workItemEvent;
>>>
>>>     private final IndexerType indexerType;
>>>
>>>     /**
>>>      * Creates a work item and resolves its indexer type from the
>>>      * document's type field.
>>>      *
>>>      * @param workLoad the document to process
>>>      * @param workItemEvent the event to apply to the document
>>>      */
>>>     public WorkItem(final Document workLoad,
>>>             final WorkItemEvent workItemEvent) {
>>>         this.workLoad = workLoad;
>>>         this.workItemEvent = workItemEvent;
>>>         // Use the shared field-name constant (value "type") rather than
>>>         // repeating the literal, so reader and writer of the field agree.
>>>         String type = this.workLoad.get(FieldNameEnum.TYPE.getDescription());
>>>         this.indexerType = IndexerType.valueOf(type);
>>>     }
>>>
>>>     /** @return the indexer type resolved from the document */
>>>     public IndexerType getIndexerType() {
>>>         return indexerType;
>>>     }
>>>
>>>     /** @return the document passed to the constructor */
>>>     public Document getWorkLoad() {
>>>         return workLoad;
>>>     }
>>>
>>>     /** @return the event passed to the constructor */
>>>     public WorkItemEvent getWorkItemEvent() {
>>>         return workItemEvent;
>>>     }
>>> }
>>>
>>> /**
>>>  * Names of the index fields used by the test; each constant carries the
>>>  * field name string as its description.
>>>  */
>>> private enum FieldNameEnum {
>>>
>>>     AUTHOR("author"),
>>>     BODY("body"),
>>>     TITLE("title"),
>>>     SUBJECT("subject"),
>>>     KEYWORDS("keywords"),
>>>     PATH("path"),
>>>     NAME("name"),
>>>     TYPE("type"),
>>>     ID("id"),
>>>     SUMMARY("summary");
>>>
>>>     /** Field name string for this constant. */
>>>     private final String description;
>>>
>>>     private FieldNameEnum(final String fieldName) {
>>>         description = fieldName;
>>>     }
>>>
>>>     /**
>>>      * @return the field name string given to this constant
>>>      */
>>>     public String getDescription() {
>>>         return description;
>>>     }
>>> }
>>> }
>>
>> --------------------------
>> Grant Ingersoll
>>
>> Lucene Helpful Hints:
>> http://wiki.apache.org/lucene-java/BasicsOfPerformance
>> http://wiki.apache.org/lucene-java/LuceneFAQ
>>
>>
>>
>>
>>
>>
>>
>>
>>
>>
>
> --------------------------
> Grant Ingersoll
>
> Lucene Helpful Hints:
> http://wiki.apache.org/lucene-java/BasicsOfPerformance
> http://wiki.apache.org/lucene-java/LuceneFAQ
>
>
>
>
>
>
>
>
>
>
---------------------------------------------------------------------
To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-user-help@lucene.apache.org