You are viewing a plain text version of this content. The canonical link for it is here.

Posted to java-user@lucene.apache.org by Grant Ingersoll <gs...@apache.org> on 2009/01/04 03:23:42 UTC

Fwd: Search Test file


Begin forwarded message:

> From: Grant Ingersoll <gs...@apache.org>
> Date: January 3, 2009 8:19:14 PM EST
> To: java-dev@lucene.apache.org
> Subject: Fwd: Search Test file
> Reply-To: java-dev@lucene.apache.org
>
> Hi Amin,
>
> I see a couple of issues with your program below, and one that is  
> the cause of the problem of not finding "amin" as a query term.
>
> When you construct your IndexWriter, you are doing:
>> IndexWriter indexWriter = new  
>> IndexWriter(getDirectory(),getAnalyzer(),new  
>> IndexWriter.MaxFieldLength(2));
>
> The MaxFieldLength parameter specifies the maximum number of tokens  
> allowed in a Field.  Everything else after that is dropped.  See http://lucene.apache.org/java/2_4_0/api/core/org/apache/lucene/index/IndexWriter.html#IndexWriter(org.apache.lucene.store.Directory,%20org.apache.lucene.analysis.Analyzer,%20org.apache.lucene.index.IndexWriter.MaxFieldLength) 
>  and http://lucene.apache.org/java/2_4_0/api/core/org/apache/lucene/index/IndexWriter.MaxFieldLength.html
>
> Also,
>  TopDocs topDocs = multiSearcher.search(query,  
> BooleanQuery.getMaxClauseCount());
>
> strikes me as really odd.  Why are you passing in the max clause  
> count as the number of results you want returned?
>
> Cheers,
> Grant
>
>
>
> Begin forwarded message:
>
>> From: "aminmc@gmail.com" <am...@gmail.com>
>> Date: January 3, 2009 3:24:52 PM EST
>> To: gsingers@apache.org
>> Subject: Search Test file
>>
>> I've shared a document with you called "Search Test file":
>> http://docs.google.com/Doc?id=d77xf5q_0n6hb38fx&invite=cjq79zj
>>
>> It's not an attachment -- it's stored online at Google Docs. To  
>> open this document, just click the link above.
>> ---
>>
>> Hi
>>
>> I have uploaded the test file at google docs. It is currently a txt  
>> file but if you change the extension to .java it should work.
>>
>> package com.amin.app.lucene.search.impl;
>>
>> import static org.junit.Assert.assertEquals;
>> import static org.junit.Assert.assertNotNull;
>> import static org.junit.Assert.assertNotSame;
>> import static org.junit.Assert.assertTrue;
>>
>> import java.io.File;
>> import java.io.FileInputStream;
>> import java.io.FileOutputStream;
>> import java.io.IOException;
>> import java.io.InputStream;
>> import java.io.OutputStream;
>>
>> import javax.swing.text.BadLocationException;
>> import javax.swing.text.DefaultStyledDocument;
>> import javax.swing.text.rtf.RTFEditorKit;
>>
>> import org.apache.commons.lang.StringUtils;
>> import org.apache.lucene.analysis.Analyzer;
>> import org.apache.lucene.analysis.standard.StandardAnalyzer;
>> import org.apache.lucene.ant.DocumentHandler;
>> import org.apache.lucene.ant.DocumentHandlerException;
>> import org.apache.lucene.document.Document;
>> import org.apache.lucene.document.Field;
>> import org.apache.lucene.index.CorruptIndexException;
>> import org.apache.lucene.index.IndexReader;
>> import org.apache.lucene.index.IndexWriter;
>> import org.apache.lucene.queryParser.MultiFieldQueryParser;
>> import org.apache.lucene.queryParser.QueryParser;
>> import org.apache.lucene.search.BooleanQuery;
>> import org.apache.lucene.search.IndexSearcher;
>> import org.apache.lucene.search.MultiSearcher;
>> import org.apache.lucene.search.Query;
>> import org.apache.lucene.search.ScoreDoc;
>> import org.apache.lucene.search.Searchable;
>> import org.apache.lucene.search.TopDocs;
>> import org.apache.lucene.store.Directory;
>> import org.apache.lucene.store.FSDirectory;
>> import org.junit.After;
>> import org.junit.Before;
>> import org.junit.Test;
>>
>> import com.amin.app.lucene.util.WorkItem.IndexerType;
>>
>> public class SearchTest {
>>
>> private File rtfFile = null;
>> private static final String RTF_FILE_NAME = "rtfDocumentToIndex.rtf";
>>
>> @Before
>> public void setUp() throws Exception {
>> InputStream inputStream =  
>> this.getClass().getClassLoader().getResourceAsStream(RTF_FILE_NAME);
>> rtfFile = new File(RTF_FILE_NAME);
>> convertInputStreamToFile(inputStream, rtfFile);
>> }
>>
>>
>>
>> @Test
>> public void testCanCreateLuceneDocumentForRTFDocument() throws  
>> Exception {
>> JavaBuiltInRTFHandler builtInRTFHandler = new  
>> JavaBuiltInRTFHandler();
>> Document document = builtInRTFHandler.getDocument(rtfFile);
>> assertNotNull(document);
>> String value = document.get(FieldNameEnum.BODY.getDescription());
>> assertNotNull(value);
>> assertNotSame("", value);
>> assertTrue(value.contains("Amin Mohammed-Coleman"));
>> assertTrue(value.contains("This is a test rtf document that will be  
>> indexed."));
>> String path = document.get(FieldNameEnum.PATH.getDescription());
>> assertNotNull(path);
>> assertTrue(path.contains(".rtf"));
>> String fileName = document.get(FieldNameEnum.NAME.getDescription());
>> assertNotNull(fileName);
>> assertEquals(RTF_FILE_NAME, fileName);
>> assertEquals(WorkItem.IndexerType.RTF_INDEXER.name(),  
>> document.get(FieldNameEnum.TYPE.getDescription()));
>>
>> }
>>
>>
>>
>> @Test
>> public void testCanSearchRtfDocument() throws Exception {
>> JavaBuiltInRTFHandler builtInRTFHandler = new  
>> JavaBuiltInRTFHandler();
>> Document document = builtInRTFHandler.getDocument(rtfFile);
>> IndexWriter indexWriter = new  
>> IndexWriter(getDirectory(),getAnalyzer(),new  
>> IndexWriter.MaxFieldLength(2));
>> try {
>> indexWriter.addDocument(document);
>> commitAndCloseWriter(indexWriter);
>> } catch (CorruptIndexException e) {
>> throw new IllegalStateException(e);
>> } catch (IOException e) {
>> throw new IllegalStateException(e);
>> }
>>
>> //I plan to use other searchers later
>> IndexSearcher indexSearcher = new IndexSearcher(getDirectory());
>> MultiSearcher multiSearcher = new MultiSearcher(new Searchable[]  
>> {indexSearcher});
>> QueryParser queryParser = new MultiFieldQueryParser(new String[]  
>> {FieldNameEnum.BODY.getDescription()}, new StandardAnalyzer());
>> Query query = queryParser.parse("amin");
>> TopDocs topDocs = multiSearcher.search(query,  
>> BooleanQuery.getMaxClauseCount());
>> assertNotNull(topDocs);
>> assertEquals(1, topDocs.totalHits);
>> ScoreDoc[] scoreDocs = topDocs.scoreDocs;
>> for (ScoreDoc scoreDoc : scoreDocs) {
>> Document documentFromSearch = indexSearcher.doc(scoreDoc.doc);
>> assertNotNull(documentFromSearch);
>> String bodyText =  
>> documentFromSearch.get(FieldNameEnum.BODY.getDescription());
>> assertNotNull(bodyText);
>> assertNotSame("", bodyText);
>> assertTrue(bodyText.contains("Amin Mohammed-Coleman"));
>> assertTrue(bodyText.contains("This is a test rtf document that will  
>> be indexed."));
>>
>> }
>> multiSearcher.close();
>>
>> }
>>
>> @After
>> public void tearDown() throws Exception {
>> rtfFile.delete();
>> if (getDirectory().list() != null && getDirectory().list().length >  
>> 0) {
>> IndexReader reader = IndexReader.open(getDirectory());
>> for(int i = 0; i < reader.maxDoc();i++) {
>> reader.deleteDocument(i);
>> }
>> reader.close();
>> }
>> }
>>
>> private void commitAndCloseWriter(IndexWriter indexWriter) throws  
>> CorruptIndexException,IOException {
>> indexWriter.commit();
>> indexWriter.close();
>> }
>>
>>
>> public Directory getDirectory() throws IOException {
>> return FSDirectory.getDirectory("/tmp/lucene/rtf");
>> }
>>
>> public Analyzer getAnalyzer() {
>> return new StandardAnalyzer();
>> }
>> private static void convertInputStreamToFile(InputStream  
>> inputStream, File file) {
>> try
>>     {
>>     OutputStream out=new FileOutputStream(file);
>>     byte buf[]=new byte[1024];
>>     int len;
>>     while((len=inputStream.read(buf))>0)
>>     out.write(buf,0,len);
>>     out.close();
>>     inputStream.close();
>>
>>     }catch (IOException e){
>>     throw new IllegalStateException(e);
>>     }
>> }
>> private static class JavaBuiltInRTFHandler implements  
>> DocumentHandler{
>>
>> public Document getDocument(File file) throws  
>> DocumentHandlerException {
>> String bodyText = null;
>> DefaultStyledDocument styledDoc = new DefaultStyledDocument();
>> try {
>> InputStream inputStream = new FileInputStream(file);
>> new RTFEditorKit().read(inputStream, styledDoc, 0);
>> bodyText = styledDoc.getText(0, styledDoc.getLength());
>> } catch (IOException ioex) {
>> throw new IllegalStateException(ioex);
>> } catch (BadLocationException e) {
>> throw new IllegalArgumentException(e);
>> }
>> //create Document object using body
>> if (bodyText != null) {
>> Document document = new Document();
>> String trimmedBodyText = StringUtils.trimToEmpty(bodyText);
>> trimmedBodyText = trimmedBodyText.replaceAll("\n", "");
>> Field field = new  
>> Field(FieldNameEnum.BODY.getDescription(),trimmedBodyText,  
>> Field.Store.YES, Field.Index.ANALYZED);
>> document.add(field);
>>
>> String pathToFile = file.getPath();
>> Field pathToFileField = new  
>> Field(FieldNameEnum.PATH.getDescription(),pathToFile,  
>> Field.Store.YES, Field.Index.NOT_ANALYZED);
>> document.add(pathToFileField);
>>
>> String fileName = file.getName();
>> Field fileNameField = new  
>> Field(FieldNameEnum.NAME.getDescription(),fileName,  
>> Field.Store.YES, Field.Index.NOT_ANALYZED);
>> document.add(fileNameField);
>>
>> Field typeField = new  
>> Field 
>> (FieldNameEnum 
>> .TYPE.getDescription(),IndexerType.RTF_INDEXER.name(),  
>> Field.Store.YES, Field.Index.NOT_ANALYZED);
>> document.add(typeField);
>>
>> String summary = bodyText.substring(0, 10);
>>
>> Field summaryField = new  
>> Field(FieldNameEnum.SUMMARY.getDescription(),summary,  
>> Field.Store.YES, Field.Index.NOT_ANALYZED);
>> document.add(summaryField);
>>
>> return document;
>> }
>> return null;
>> }
>> }
>>
>> private static class WorkItem {
>>
>> public enum WorkItemEvent {
>> ADD,
>> UPDATE,
>> DELETE;
>> }
>>
>> public enum IndexerType {
>> RTF_INDEXER,
>> PDF_INDEXER,
>> XML_INDEXER,
>> PLAIN_TEXT_INDEXER,
>> MS_WORD_INDEXER,
>> MS_EXCEL_INDEXER,
>> MS_POWERPOINT_INDEXER;
>> }
>>
>>
>> private final Document workLoad;
>>
>> private final WorkItemEvent workItemEvent;
>>
>> private final IndexerType indexerType;
>>
>>
>> public WorkItem(final Document workLoad, final WorkItemEvent  
>> workItemEvent) {
>> this.workLoad = workLoad;
>> this.workItemEvent = workItemEvent;
>> String type = this.workLoad.get("type");
>> this.indexerType = IndexerType.valueOf(type);
>> }
>>
>> public IndexerType getIndexerType() {
>> return indexerType;
>> }
>>
>> public Document getWorkLoad() {
>> return workLoad;
>> }
>>
>> public WorkItemEvent getWorkItemEvent() {
>> return workItemEvent;
>> }
>> }
>>
>> private enum FieldNameEnum {
>>
>> AUTHOR("author"),
>> BODY("body"),
>> TITLE("title"),
>> SUBJECT("subject"),
>> KEYWORDS("keywords"),
>> PATH("path"), NAME ("name"),
>> TYPE("type"),
>> ID ("id"),
>> SUMMARY ("summary");
>>
>> private final String description;
>>
>> private FieldNameEnum(final String description) {
>> this.description = description;
>> }
>>
>> public String getDescription() {
>> return this.description;
>> }
>> }
>> }
>
> --------------------------
> Grant Ingersoll
>
> Lucene Helpful Hints:
> http://wiki.apache.org/lucene-java/BasicsOfPerformance
> http://wiki.apache.org/lucene-java/LuceneFAQ
>
>
>
>
>
>
>
>
>
>

--------------------------
Grant Ingersoll

Lucene Helpful Hints:
http://wiki.apache.org/lucene-java/BasicsOfPerformance
http://wiki.apache.org/lucene-java/LuceneFAQ

Re: Search Test file

Posted by Amin Mohammed-Coleman <am...@gmail.com>.

Hi,

Please ignore my last email.  I had just woken up when I wrote it.  After  
looking at Luke further, it looks like the token is being stored as  
"indexed.amin", which is why "amin" wasn't working.  Making the changes  
that you recommended worked.

I will investigate further why the "amin" token is being stored as  
"indexed.amin".


Thanks again for all the help.

Cheers

Amin


On 4 Jan 2009, at 02:23, Grant Ingersoll wrote:

>
>
> Begin forwarded message:
>
>> From: Grant Ingersoll <gs...@apache.org>
>> Date: January 3, 2009 8:19:14 PM EST
>> To: java-dev@lucene.apache.org
>> Subject: Fwd: Search Test file
>> Reply-To: java-dev@lucene.apache.org
>>
>> Hi Amin,
>>
>> I see a couple of issues with your program below, and one that is  
>> the cause of the problem of not finding "amin" as a query term.
>>
>> When you construct your IndexWriter, you are doing:
>>> IndexWriter indexWriter = new  
>>> IndexWriter(getDirectory(),getAnalyzer(),new  
>>> IndexWriter.MaxFieldLength(2));
>>
>> The MaxFieldLength parameter specifies the maximum number of tokens  
>> allowed in a Field.  Everything else after that is dropped.  See http://lucene.apache.org/java/2_4_0/api/core/org/apache/lucene/index/IndexWriter.html#IndexWriter(org.apache.lucene.store.Directory,%20org.apache.lucene.analysis.Analyzer,%20org.apache.lucene.index.IndexWriter.MaxFieldLength) 
>>  and http://lucene.apache.org/java/2_4_0/api/core/org/apache/lucene/index/IndexWriter.MaxFieldLength.html
>>
>> Also,
>> TopDocs topDocs = multiSearcher.search(query,  
>> BooleanQuery.getMaxClauseCount());
>>
>> strikes me as really odd.  Why are you passing in the max clause  
>> count as the number of results you want returned?
>>
>> Cheers,
>> Grant
>>
>>
>>
>> Begin forwarded message:
>>
>>> From: "aminmc@gmail.com" <am...@gmail.com>
>>> Date: January 3, 2009 3:24:52 PM EST
>>> To: gsingers@apache.org
>>> Subject: Search Test file
>>>
>>> I've shared a document with you called "Search Test file":
>>> http://docs.google.com/Doc?id=d77xf5q_0n6hb38fx&invite=cjq79zj
>>>
>>> It's not an attachment -- it's stored online at Google Docs. To  
>>> open this document, just click the link above.
>>> ---
>>>
>>> Hi
>>>
>>> I have uploaded the test file at google docs. It is currently a  
>>> txt file but if you change the extension to .java it should work.
>>>
>>> package com.amin.app.lucene.search.impl;
>>>
>>> import static org.junit.Assert.assertEquals;
>>> import static org.junit.Assert.assertNotNull;
>>> import static org.junit.Assert.assertNotSame;
>>> import static org.junit.Assert.assertTrue;
>>>
>>> import java.io.File;
>>> import java.io.FileInputStream;
>>> import java.io.FileOutputStream;
>>> import java.io.IOException;
>>> import java.io.InputStream;
>>> import java.io.OutputStream;
>>>
>>> import javax.swing.text.BadLocationException;
>>> import javax.swing.text.DefaultStyledDocument;
>>> import javax.swing.text.rtf.RTFEditorKit;
>>>
>>> import org.apache.commons.lang.StringUtils;
>>> import org.apache.lucene.analysis.Analyzer;
>>> import org.apache.lucene.analysis.standard.StandardAnalyzer;
>>> import org.apache.lucene.ant.DocumentHandler;
>>> import org.apache.lucene.ant.DocumentHandlerException;
>>> import org.apache.lucene.document.Document;
>>> import org.apache.lucene.document.Field;
>>> import org.apache.lucene.index.CorruptIndexException;
>>> import org.apache.lucene.index.IndexReader;
>>> import org.apache.lucene.index.IndexWriter;
>>> import org.apache.lucene.queryParser.MultiFieldQueryParser;
>>> import org.apache.lucene.queryParser.QueryParser;
>>> import org.apache.lucene.search.BooleanQuery;
>>> import org.apache.lucene.search.IndexSearcher;
>>> import org.apache.lucene.search.MultiSearcher;
>>> import org.apache.lucene.search.Query;
>>> import org.apache.lucene.search.ScoreDoc;
>>> import org.apache.lucene.search.Searchable;
>>> import org.apache.lucene.search.TopDocs;
>>> import org.apache.lucene.store.Directory;
>>> import org.apache.lucene.store.FSDirectory;
>>> import org.junit.After;
>>> import org.junit.Before;
>>> import org.junit.Test;
>>>
>>> import com.amin.app.lucene.util.WorkItem.IndexerType;
>>>
>>> public class SearchTest {
>>>
>>> private File rtfFile = null;
>>> private static final String RTF_FILE_NAME =  
>>> "rtfDocumentToIndex.rtf";
>>>
>>> @Before
>>> public void setUp() throws Exception {
>>> InputStream inputStream =  
>>> this.getClass().getClassLoader().getResourceAsStream(RTF_FILE_NAME);
>>> rtfFile = new File(RTF_FILE_NAME);
>>> convertInputStreamToFile(inputStream, rtfFile);
>>> }
>>>
>>>
>>>
>>> @Test
>>> public void testCanCreateLuceneDocumentForRTFDocument() throws  
>>> Exception {
>>> JavaBuiltInRTFHandler builtInRTFHandler = new  
>>> JavaBuiltInRTFHandler();
>>> Document document = builtInRTFHandler.getDocument(rtfFile);
>>> assertNotNull(document);
>>> String value = document.get(FieldNameEnum.BODY.getDescription());
>>> assertNotNull(value);
>>> assertNotSame("", value);
>>> assertTrue(value.contains("Amin Mohammed-Coleman"));
>>> assertTrue(value.contains("This is a test rtf document that will  
>>> be indexed."));
>>> String path = document.get(FieldNameEnum.PATH.getDescription());
>>> assertNotNull(path);
>>> assertTrue(path.contains(".rtf"));
>>> String fileName = document.get(FieldNameEnum.NAME.getDescription());
>>> assertNotNull(fileName);
>>> assertEquals(RTF_FILE_NAME, fileName);
>>> assertEquals(WorkItem.IndexerType.RTF_INDEXER.name(),  
>>> document.get(FieldNameEnum.TYPE.getDescription()));
>>>
>>> }
>>>
>>>
>>>
>>> @Test
>>> public void testCanSearchRtfDocument() throws Exception {
>>> JavaBuiltInRTFHandler builtInRTFHandler = new  
>>> JavaBuiltInRTFHandler();
>>> Document document = builtInRTFHandler.getDocument(rtfFile);
>>> IndexWriter indexWriter = new  
>>> IndexWriter(getDirectory(),getAnalyzer(),new  
>>> IndexWriter.MaxFieldLength(2));
>>> try {
>>> indexWriter.addDocument(document);
>>> commitAndCloseWriter(indexWriter);
>>> } catch (CorruptIndexException e) {
>>> throw new IllegalStateException(e);
>>> } catch (IOException e) {
>>> throw new IllegalStateException(e);
>>> }
>>>
>>> //I plan to use other searchers later
>>> IndexSearcher indexSearcher = new IndexSearcher(getDirectory());
>>> MultiSearcher multiSearcher = new MultiSearcher(new Searchable[]  
>>> {indexSearcher});
>>> QueryParser queryParser = new MultiFieldQueryParser(new String[]  
>>> {FieldNameEnum.BODY.getDescription()}, new StandardAnalyzer());
>>> Query query = queryParser.parse("amin");
>>> TopDocs topDocs = multiSearcher.search(query,  
>>> BooleanQuery.getMaxClauseCount());
>>> assertNotNull(topDocs);
>>> assertEquals(1, topDocs.totalHits);
>>> ScoreDoc[] scoreDocs = topDocs.scoreDocs;
>>> for (ScoreDoc scoreDoc : scoreDocs) {
>>> Document documentFromSearch = indexSearcher.doc(scoreDoc.doc);
>>> assertNotNull(documentFromSearch);
>>> String bodyText =  
>>> documentFromSearch.get(FieldNameEnum.BODY.getDescription());
>>> assertNotNull(bodyText);
>>> assertNotSame("", bodyText);
>>> assertTrue(bodyText.contains("Amin Mohammed-Coleman"));
>>> assertTrue(bodyText.contains("This is a test rtf document that  
>>> will be indexed."));
>>>
>>> }
>>> multiSearcher.close();
>>>
>>> }
>>>
>>> @After
>>> public void tearDown() throws Exception {
>>> rtfFile.delete();
>>> if (getDirectory().list() != null && getDirectory().list().length  
>>> > 0) {
>>> IndexReader reader = IndexReader.open(getDirectory());
>>> for(int i = 0; i < reader.maxDoc();i++) {
>>> reader.deleteDocument(i);
>>> }
>>> reader.close();
>>> }
>>> }
>>>
>>> private void commitAndCloseWriter(IndexWriter indexWriter) throws  
>>> CorruptIndexException,IOException {
>>> indexWriter.commit();
>>> indexWriter.close();
>>> }
>>>
>>>
>>> public Directory getDirectory() throws IOException {
>>> return FSDirectory.getDirectory("/tmp/lucene/rtf");
>>> }
>>>
>>> public Analyzer getAnalyzer() {
>>> return new StandardAnalyzer();
>>> }
>>> private static void convertInputStreamToFile(InputStream  
>>> inputStream, File file) {
>>> try
>>>    {
>>>    OutputStream out=new FileOutputStream(file);
>>>    byte buf[]=new byte[1024];
>>>    int len;
>>>    while((len=inputStream.read(buf))>0)
>>>    out.write(buf,0,len);
>>>    out.close();
>>>    inputStream.close();
>>>
>>>    }catch (IOException e){
>>>    throw new IllegalStateException(e);
>>>    }
>>> }
>>> private static class JavaBuiltInRTFHandler implements  
>>> DocumentHandler{
>>>
>>> public Document getDocument(File file) throws  
>>> DocumentHandlerException {
>>> String bodyText = null;
>>> DefaultStyledDocument styledDoc = new DefaultStyledDocument();
>>> try {
>>> InputStream inputStream = new FileInputStream(file);
>>> new RTFEditorKit().read(inputStream, styledDoc, 0);
>>> bodyText = styledDoc.getText(0, styledDoc.getLength());
>>> } catch (IOException ioex) {
>>> throw new IllegalStateException(ioex);
>>> } catch (BadLocationException e) {
>>> throw new IllegalArgumentException(e);
>>> }
>>> //create Document object using body
>>> if (bodyText != null) {
>>> Document document = new Document();
>>> String trimmedBodyText = StringUtils.trimToEmpty(bodyText);
>>> trimmedBodyText = trimmedBodyText.replaceAll("\n", "");
>>> Field field = new  
>>> Field(FieldNameEnum.BODY.getDescription(),trimmedBodyText,  
>>> Field.Store.YES, Field.Index.ANALYZED);
>>> document.add(field);
>>>
>>> String pathToFile = file.getPath();
>>> Field pathToFileField = new  
>>> Field(FieldNameEnum.PATH.getDescription(),pathToFile,  
>>> Field.Store.YES, Field.Index.NOT_ANALYZED);
>>> document.add(pathToFileField);
>>>
>>> String fileName = file.getName();
>>> Field fileNameField = new  
>>> Field(FieldNameEnum.NAME.getDescription(),fileName,  
>>> Field.Store.YES, Field.Index.NOT_ANALYZED);
>>> document.add(fileNameField);
>>>
>>> Field typeField = new  
>>> Field 
>>> (FieldNameEnum 
>>> .TYPE.getDescription(),IndexerType.RTF_INDEXER.name(),  
>>> Field.Store.YES, Field.Index.NOT_ANALYZED);
>>> document.add(typeField);
>>>
>>> String summary = bodyText.substring(0, 10);
>>>
>>> Field summaryField = new  
>>> Field(FieldNameEnum.SUMMARY.getDescription(),summary,  
>>> Field.Store.YES, Field.Index.NOT_ANALYZED);
>>> document.add(summaryField);
>>>
>>> return document;
>>> }
>>> return null;
>>> }
>>> }
>>>
>>> private static class WorkItem {
>>>
>>> public enum WorkItemEvent {
>>> ADD,
>>> UPDATE,
>>> DELETE;
>>> }
>>>
>>> public enum IndexerType {
>>> RTF_INDEXER,
>>> PDF_INDEXER,
>>> XML_INDEXER,
>>> PLAIN_TEXT_INDEXER,
>>> MS_WORD_INDEXER,
>>> MS_EXCEL_INDEXER,
>>> MS_POWERPOINT_INDEXER;
>>> }
>>>
>>>
>>> private final Document workLoad;
>>>
>>> private final WorkItemEvent workItemEvent;
>>>
>>> private final IndexerType indexerType;
>>>
>>>
>>> public WorkItem(final Document workLoad, final WorkItemEvent  
>>> workItemEvent) {
>>> this.workLoad = workLoad;
>>> this.workItemEvent = workItemEvent;
>>> String type = this.workLoad.get("type");
>>> this.indexerType = IndexerType.valueOf(type);
>>> }
>>>
>>> public IndexerType getIndexerType() {
>>> return indexerType;
>>> }
>>>
>>> public Document getWorkLoad() {
>>> return workLoad;
>>> }
>>>
>>> public WorkItemEvent getWorkItemEvent() {
>>> return workItemEvent;
>>> }
>>> }
>>>
>>> private enum FieldNameEnum {
>>>
>>> AUTHOR("author"),
>>> BODY("body"),
>>> TITLE("title"),
>>> SUBJECT("subject"),
>>> KEYWORDS("keywords"),
>>> PATH("path"), NAME ("name"),
>>> TYPE("type"),
>>> ID ("id"),
>>> SUMMARY ("summary");
>>>
>>> private final String description;
>>>
>>> private FieldNameEnum(final String description) {
>>> this.description = description;
>>> }
>>>
>>> public String getDescription() {
>>> return this.description;
>>> }
>>> }
>>> }
>>
>> --------------------------
>> Grant Ingersoll
>>
>> Lucene Helpful Hints:
>> http://wiki.apache.org/lucene-java/BasicsOfPerformance
>> http://wiki.apache.org/lucene-java/LuceneFAQ
>>
>>
>>
>>
>>
>>
>>
>>
>>
>>
>
> --------------------------
> Grant Ingersoll
>
> Lucene Helpful Hints:
> http://wiki.apache.org/lucene-java/BasicsOfPerformance
> http://wiki.apache.org/lucene-java/LuceneFAQ
>
>
>
>
>
>
>
>
>
>


---------------------------------------------------------------------
To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-user-help@lucene.apache.org

Re: Search Test file

Posted by Amin Mohammed-Coleman <am...@gmail.com>.

Hi

Test case passing now. Thanks for your help. I kind of thought it was  
probably something I was doing wrong!

Cheers

Amin

On 4 Jan 2009, at 16:59, Grant Ingersoll <gs...@apache.org> wrote:

>
> On Jan 4, 2009, at 2:49 AM, Amin Mohammed-Coleman wrote:
>
>> Hi Grant
>>
>> Thank you for looking at the test case.  I have updated the  
>> IndexWriter to use UNLIMITED for MaxFieldLength.   I tried using  
>> Integer.MAX_VALUE for
>>
>>>> Also,
>>>> TopDocs topDocs = multiSearcher.search(query,  
>>>> BooleanQuery.getMaxClauseCount());
>>>>
>>>> strikes me as really odd.  Why are you passing in the max clause  
>>>> count as the number of results you want returned?
>>
>>
>
> Just pass in something like "10".
>
>> However I get the following exception :
>>
>> java.lang.NegativeArraySizeException
>>    at  
>> org.apache.lucene.util.PriorityQueue.initialize(PriorityQueue.java: 
>> 41)
>>    at org.apache.lucene.search.HitQueue.<init>(HitQueue.java:24)
>>    at  
>> org.apache.lucene.search.MultiSearcher.search(MultiSearcher.java:200)
>>    at org.apache.lucene.search.Searcher.search(Searcher.java:136)
>>    at org.apache.lucene.search.Searcher.search(Searcher.java:146)
>>    at  
>> com. 
>> amin. 
>> app. 
>> lucene. 
>> search.impl.SearchTest.testCanSearchRtfDocument(SearchTest.java:101)
>>    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
>>    at  
>> sun. 
>> reflect. 
>> NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:39)
>>    at  
>> sun. 
>> reflect. 
>> DelegatingMethodAccessorImpl. 
>> invoke(DelegatingMethodAccessorImpl.java:25)
>>    at java.lang.reflect.Method.invoke(Method.java:597)
>>    at org.junit.internal.runners.TestMethod.invoke(TestMethod.java: 
>> 59)
>>    at  
>> org. 
>> junit.internal.runners.MethodRoadie.runTestMethod(MethodRoadie.java: 
>> 98)
>>    at org.junit.internal.runners.MethodRoadie 
>> $2.run(MethodRoadie.java:79)
>>    at  
>> org. 
>> junit. 
>> internal. 
>> runners.MethodRoadie.runBeforesThenTestThenAfters(MethodRoadie.java: 
>> 87)
>>    at  
>> org.junit.internal.runners.MethodRoadie.runTest(MethodRoadie.java:77)
>>    at org.junit.internal.runners.MethodRoadie.run(MethodRoadie.java: 
>> 42)
>>    at  
>> org. 
>> junit. 
>> internal. 
>> runners.JUnit4ClassRunner.invokeTestMethod(JUnit4ClassRunner.java:88)
>>    at  
>> org. 
>> junit. 
>> internal. 
>> runners.JUnit4ClassRunner.runMethods(JUnit4ClassRunner.java:51)
>>    at org.junit.internal.runners.JUnit4ClassRunner 
>> $1.run(JUnit4ClassRunner.java:44)
>>    at  
>> org. 
>> junit.internal.runners.ClassRoadie.runUnprotected(ClassRoadie.java: 
>> 27)
>>    at  
>> org. 
>> junit.internal.runners.ClassRoadie.runProtected(ClassRoadie.java:37)
>>    at  
>> org. 
>> junit.internal.runners.JUnit4ClassRunner.run(JUnit4ClassRunner.java: 
>> 42)
>>    at  
>> org. 
>> eclipse. 
>> jdt. 
>> internal. 
>> junit4.runner.JUnit4TestReference.run(JUnit4TestReference.java:45)
>>    at  
>> org. 
>> eclipse. 
>> jdt.internal.junit.runner.TestExecution.run(TestExecution.java:38)
>>    at  
>> org. 
>> eclipse. 
>> jdt. 
>> internal. 
>> junit.runner.RemoteTestRunner.runTests(RemoteTestRunner.java:460)
>>    at  
>> org. 
>> eclipse. 
>> jdt. 
>> internal. 
>> junit.runner.RemoteTestRunner.runTests(RemoteTestRunner.java:673)
>>    at  
>> org. 
>> eclipse. 
>> jdt. 
>> internal.junit.runner.RemoteTestRunner.run(RemoteTestRunner.java:386)
>>    at  
>> org. 
>> eclipse. 
>> jdt. 
>> internal.junit.runner.RemoteTestRunner.main(RemoteTestRunner.java: 
>> 196)
>>
>>
>> I know that this is an issue (not being able to use  
>> Integer.MAX_VALUE).  I tried using 100 and my test still doesn't  
>> pass.
>>
>>
>> Cheers
>> Amin
>>
>>
>> On 4 Jan 2009, at 02:23, Grant Ingersoll wrote:
>>
>>>
>>>
>>> Begin forwarded message:
>>>
>>>> From: Grant Ingersoll <gs...@apache.org>
>>>> Date: January 3, 2009 8:19:14 PM EST
>>>> To: java-dev@lucene.apache.org
>>>> Subject: Fwd: Search Test file
>>>> Reply-To: java-dev@lucene.apache.org
>>>>
>>>> Hi Amin,
>>>>
>>>> I see a couple of issues with your program below, and one that is  
>>>> the cause of the problem of not finding "amin" as a query term.
>>>>
>>>> When you construct your IndexWriter, you are doing:
>>>>> IndexWriter indexWriter = new  
>>>>> IndexWriter(getDirectory(),getAnalyzer(),new  
>>>>> IndexWriter.MaxFieldLength(2));
>>>>
>>>> The MaxFieldLength parameter specifies the maximum number of  
>>>> tokens allowed in a Field.  Everything else after that is  
>>>> dropped.  See http://lucene.apache.org/java/2_4_0/api/core/org/apache/lucene/index/IndexWriter.html#IndexWriter(org.apache.lucene.store.Directory,%20org.apache.lucene.analysis.Analyzer,%20org.apache.lucene.index.IndexWriter.MaxFieldLength 
>>>> ) and http://lucene.apache.org/java/2_4_0/api/core/org/apache/lucene/index/IndexWriter.MaxFieldLength.html
>>>>
>>>> Also,
>>>> TopDocs topDocs = multiSearcher.search(query,  
>>>> BooleanQuery.getMaxClauseCount());
>>>>
>>>> strikes me as really odd.  Why are you passing in the max clause  
>>>> count as the number of results you want returned?
>>>>
>>>> Cheers,
>>>> Grant
>>>>
>>>>
>>>>
>>>> Begin forwarded message:
>>>>
>>>>> From: "aminmc@gmail.com" <am...@gmail.com>
>>>>> Date: January 3, 2009 3:24:52 PM EST
>>>>> To: gsingers@apache.org
>>>>> Subject: Search Test file
>>>>>
>>>>> I've shared a document with you called "Search Test file":
>>>>> http://docs.google.com/Doc?id=d77xf5q_0n6hb38fx&invite=cjq79zj
>>>>>
>>>>> It's not an attachment -- it's stored online at Google Docs. To  
>>>>> open this document, just click the link above.
>>>>> ---
>>>>>
>>>>> Hi
>>>>>
>>>>> I have uploaded the test file at google docs. It is currently a  
>>>>> txt file but if you change the extension to .java it should work.
>>>>>
>>>>> package com.amin.app.lucene.search.impl;
>>>>>
>>>>> import static org.junit.Assert.assertEquals;
>>>>> import static org.junit.Assert.assertNotNull;
>>>>> import static org.junit.Assert.assertNotSame;
>>>>> import static org.junit.Assert.assertTrue;
>>>>>
>>>>> import java.io.File;
>>>>> import java.io.FileInputStream;
>>>>> import java.io.FileOutputStream;
>>>>> import java.io.IOException;
>>>>> import java.io.InputStream;
>>>>> import java.io.OutputStream;
>>>>>
>>>>> import javax.swing.text.BadLocationException;
>>>>> import javax.swing.text.DefaultStyledDocument;
>>>>> import javax.swing.text.rtf.RTFEditorKit;
>>>>>
>>>>> import org.apache.commons.lang.StringUtils;
>>>>> import org.apache.lucene.analysis.Analyzer;
>>>>> import org.apache.lucene.analysis.standard.StandardAnalyzer;
>>>>> import org.apache.lucene.ant.DocumentHandler;
>>>>> import org.apache.lucene.ant.DocumentHandlerException;
>>>>> import org.apache.lucene.document.Document;
>>>>> import org.apache.lucene.document.Field;
>>>>> import org.apache.lucene.index.CorruptIndexException;
>>>>> import org.apache.lucene.index.IndexReader;
>>>>> import org.apache.lucene.index.IndexWriter;
>>>>> import org.apache.lucene.queryParser.MultiFieldQueryParser;
>>>>> import org.apache.lucene.queryParser.QueryParser;
>>>>> import org.apache.lucene.search.BooleanQuery;
>>>>> import org.apache.lucene.search.IndexSearcher;
>>>>> import org.apache.lucene.search.MultiSearcher;
>>>>> import org.apache.lucene.search.Query;
>>>>> import org.apache.lucene.search.ScoreDoc;
>>>>> import org.apache.lucene.search.Searchable;
>>>>> import org.apache.lucene.search.TopDocs;
>>>>> import org.apache.lucene.store.Directory;
>>>>> import org.apache.lucene.store.FSDirectory;
>>>>> import org.junit.After;
>>>>> import org.junit.Before;
>>>>> import org.junit.Test;
>>>>>
>>>>> import com.amin.app.lucene.util.WorkItem.IndexerType;
>>>>>
>>>>> public class SearchTest {
>>>>>
>>>>> private File rtfFile = null;
>>>>> private static final String RTF_FILE_NAME =  
>>>>> "rtfDocumentToIndex.rtf";
>>>>>
>>>>> @Before
>>>>> public void setUp() throws Exception {
>>>>> InputStream inputStream =  
>>>>> this. 
>>>>> getClass().getClassLoader().getResourceAsStream(RTF_FILE_NAME);
>>>>> rtfFile = new File(RTF_FILE_NAME);
>>>>> convertInputStreamToFile(inputStream, rtfFile);
>>>>> }
>>>>>
>>>>>
>>>>>
>>>>> @Test
>>>>> public void testCanCreateLuceneDocumentForRTFDocument() throws  
>>>>> Exception {
>>>>> JavaBuiltInRTFHandler builtInRTFHandler = new  
>>>>> JavaBuiltInRTFHandler();
>>>>> Document document = builtInRTFHandler.getDocument(rtfFile);
>>>>> assertNotNull(document);
>>>>> String value = document.get(FieldNameEnum.BODY.getDescription());
>>>>> assertNotNull(value);
>>>>> assertNotSame("", value);
>>>>> assertTrue(value.contains("Amin Mohammed-Coleman"));
>>>>> assertTrue(value.contains("This is a test rtf document that will  
>>>>> be indexed."));
>>>>> String path = document.get(FieldNameEnum.PATH.getDescription());
>>>>> assertNotNull(path);
>>>>> assertTrue(path.contains(".rtf"));
>>>>> String fileName =  
>>>>> document.get(FieldNameEnum.NAME.getDescription());
>>>>> assertNotNull(fileName);
>>>>> assertEquals(RTF_FILE_NAME, fileName);
>>>>> assertEquals(WorkItem.IndexerType.RTF_INDEXER.name(),  
>>>>> document.get(FieldNameEnum.TYPE.getDescription()));
>>>>>
>>>>> }
>>>>>
>>>>>
>>>>>
>>>>> @Test
>>>>> public void testCanSearchRtfDocument() throws Exception {
>>>>> JavaBuiltInRTFHandler builtInRTFHandler = new  
>>>>> JavaBuiltInRTFHandler();
>>>>> Document document = builtInRTFHandler.getDocument(rtfFile);
>>>>> IndexWriter indexWriter = new  
>>>>> IndexWriter(getDirectory(),getAnalyzer(),new  
>>>>> IndexWriter.MaxFieldLength(2));
>>>>> try {
>>>>> indexWriter.addDocument(document);
>>>>> commitAndCloseWriter(indexWriter);
>>>>> } catch (CorruptIndexException e) {
>>>>> throw new IllegalStateException(e);
>>>>> } catch (IOException e) {
>>>>> throw new IllegalStateException(e);
>>>>> }
>>>>>
>>>>> //I plan to use other searchers later
>>>>> IndexSearcher indexSearcher = new IndexSearcher(getDirectory());
>>>>> MultiSearcher multiSearcher = new MultiSearcher(new Searchable[]  
>>>>> {indexSearcher});
>>>>> QueryParser queryParser = new MultiFieldQueryParser(new String[]  
>>>>> {FieldNameEnum.BODY.getDescription()}, new StandardAnalyzer());
>>>>> Query query = queryParser.parse("amin");
>>>>> TopDocs topDocs = multiSearcher.search(query,  
>>>>> BooleanQuery.getMaxClauseCount());
>>>>> assertNotNull(topDocs);
>>>>> assertEquals(1, topDocs.totalHits);
>>>>> ScoreDoc[] scoreDocs = topDocs.scoreDocs;
>>>>> for (ScoreDoc scoreDoc : scoreDocs) {
>>>>> Document documentFromSearch = indexSearcher.doc(scoreDoc.doc);
>>>>> assertNotNull(documentFromSearch);
>>>>> String bodyText =  
>>>>> documentFromSearch.get(FieldNameEnum.BODY.getDescription());
>>>>> assertNotNull(bodyText);
>>>>> assertNotSame("", bodyText);
>>>>> assertTrue(bodyText.contains("Amin Mohammed-Coleman"));
>>>>> assertTrue(bodyText.contains("This is a test rtf document that  
>>>>> will be indexed."));
>>>>>
>>>>> }
>>>>> multiSearcher.close();
>>>>>
>>>>> }
>>>>>
>>>>> @After
>>>>> public void tearDown() throws Exception {
>>>>> rtfFile.delete();
>>>>> if (getDirectory().list() != null &&  
>>>>> getDirectory().list().length > 0) {
>>>>> IndexReader reader = IndexReader.open(getDirectory());
>>>>> for(int i = 0; i < reader.maxDoc();i++) {
>>>>> reader.deleteDocument(i);
>>>>> }
>>>>> reader.close();
>>>>> }
>>>>> }
>>>>>
>>>>> private void commitAndCloseWriter(IndexWriter indexWriter)  
>>>>> throws CorruptIndexException,IOException {
>>>>> indexWriter.commit();
>>>>> indexWriter.close();
>>>>> }
>>>>>
>>>>>
>>>>> public Directory getDirectory() throws IOException {
>>>>> return FSDirectory.getDirectory("/tmp/lucene/rtf");
>>>>> }
>>>>>
>>>>> public Analyzer getAnalyzer() {
>>>>> return new StandardAnalyzer();
>>>>> }
>>>>> private static void convertInputStreamToFile(InputStream  
>>>>> inputStream, File file) {
>>>>> try
>>>>>  {
>>>>>  OutputStream out=new FileOutputStream(file);
>>>>>  byte buf[]=new byte[1024];
>>>>>  int len;
>>>>>  while((len=inputStream.read(buf))>0)
>>>>>  out.write(buf,0,len);
>>>>>  out.close();
>>>>>  inputStream.close();
>>>>>
>>>>>  }catch (IOException e){
>>>>>  throw new IllegalStateException(e);
>>>>>  }
>>>>> }
>>>>> private static class JavaBuiltInRTFHandler implements  
>>>>> DocumentHandler{
>>>>>
>>>>> public Document getDocument(File file) throws  
>>>>> DocumentHandlerException {
>>>>> String bodyText = null;
>>>>> DefaultStyledDocument styledDoc = new DefaultStyledDocument();
>>>>> try {
>>>>> InputStream inputStream = new FileInputStream(file);
>>>>> new RTFEditorKit().read(inputStream, styledDoc, 0);
>>>>> bodyText = styledDoc.getText(0, styledDoc.getLength());
>>>>> } catch (IOException ioex) {
>>>>> throw new IllegalStateException(ioex);
>>>>> } catch (BadLocationException e) {
>>>>> throw new IllegalArgumentException(e);
>>>>> }
>>>>> //create Document object using body
>>>>> if (bodyText != null) {
>>>>> Document document = new Document();
>>>>> String trimmedBodyText = StringUtils.trimToEmpty(bodyText);
>>>>> trimmedBodyText = trimmedBodyText.replaceAll("\n", "");
>>>>> Field field = new  
>>>>> Field(FieldNameEnum.BODY.getDescription(),trimmedBodyText,  
>>>>> Field.Store.YES, Field.Index.ANALYZED);
>>>>> document.add(field);
>>>>>
>>>>> String pathToFile = file.getPath();
>>>>> Field pathToFileField = new  
>>>>> Field(FieldNameEnum.PATH.getDescription(),pathToFile,  
>>>>> Field.Store.YES, Field.Index.NOT_ANALYZED);
>>>>> document.add(pathToFileField);
>>>>>
>>>>> String fileName = file.getName();
>>>>> Field fileNameField = new  
>>>>> Field(FieldNameEnum.NAME.getDescription(),fileName,  
>>>>> Field.Store.YES, Field.Index.NOT_ANALYZED);
>>>>> document.add(fileNameField);
>>>>>
>>>>> Field typeField = new  
>>>>> Field( 
>>>>> FieldNameEnum. 
>>>>> TYPE.getDescription(),IndexerType.RTF_INDEXER.name(),  
>>>>> Field.Store.YES, Field.Index.NOT_ANALYZED);
>>>>> document.add(typeField);
>>>>>
>>>>> String summary = bodyText.substring(0, 10);
>>>>>
>>>>> Field summaryField = new  
>>>>> Field(FieldNameEnum.SUMMARY.getDescription(),summary,  
>>>>> Field.Store.YES, Field.Index.NOT_ANALYZED);
>>>>> document.add(summaryField);
>>>>>
>>>>> return document;
>>>>> }
>>>>> return null;
>>>>> }
>>>>> }
>>>>>
>>>>> private static class WorkItem {
>>>>>
>>>>> public enum WorkItemEvent {
>>>>> ADD,
>>>>> UPDATE,
>>>>> DELETE;
>>>>> }
>>>>>
>>>>> public enum IndexerType {
>>>>> RTF_INDEXER,
>>>>> PDF_INDEXER,
>>>>> XML_INDEXER,
>>>>> PLAIN_TEXT_INDEXER,
>>>>> MS_WORD_INDEXER,
>>>>> MS_EXCEL_INDEXER,
>>>>> MS_POWERPOINT_INDEXER;
>>>>> }
>>>>>
>>>>>
>>>>> private final Document workLoad;
>>>>>
>>>>> private final WorkItemEvent workItemEvent;
>>>>>
>>>>> private final IndexerType indexerType;
>>>>>
>>>>>
>>>>> public WorkItem(final Document workLoad, final WorkItemEvent  
>>>>> workItemEvent) {
>>>>> this.workLoad = workLoad;
>>>>> this.workItemEvent = workItemEvent;
>>>>> String type = this.workLoad.get("type");
>>>>> this.indexerType = IndexerType.valueOf(type);
>>>>> }
>>>>>
>>>>> public IndexerType getIndexerType() {
>>>>> return indexerType;
>>>>> }
>>>>>
>>>>> public Document getWorkLoad() {
>>>>> return workLoad;
>>>>> }
>>>>>
>>>>> public WorkItemEvent getWorkItemEvent() {
>>>>> return workItemEvent;
>>>>> }
>>>>> }
>>>>>
>>>>> private enum FieldNameEnum {
>>>>>
>>>>> AUTHOR("author"),
>>>>> BODY("body"),
>>>>> TITLE("title"),
>>>>> SUBJECT("subject"),
>>>>> KEYWORDS("keywords"),
>>>>> PATH("path"), NAME ("name"),
>>>>> TYPE("type"),
>>>>> ID ("id"),
>>>>> SUMMARY ("summary");
>>>>>
>>>>> private final String description;
>>>>>
>>>>> private FieldNameEnum(final String description) {
>>>>> this.description = description;
>>>>> }
>>>>>
>>>>> public String getDescription() {
>>>>> return this.description;
>>>>> }
>>>>> }
>>>>> }
>>>>
>>>> --------------------------
>>>> Grant Ingersoll
>>>>
>>>> Lucene Helpful Hints:
>>>> http://wiki.apache.org/lucene-java/BasicsOfPerformance
>>>> http://wiki.apache.org/lucene-java/LuceneFAQ
>>>>
>>>>
>>>>
>>>>
>>>>
>>>>
>>>>
>>>>
>>>>
>>>>
>>>
>>> --------------------------
>>> Grant Ingersoll
>>>
>>> Lucene Helpful Hints:
>>> http://wiki.apache.org/lucene-java/BasicsOfPerformance
>>> http://wiki.apache.org/lucene-java/LuceneFAQ
>>>
>>>
>>>
>>>
>>>
>>>
>>>
>>>
>>>
>>>
>>
>>
>> ---------------------------------------------------------------------
>> To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
>> For additional commands, e-mail: java-user-help@lucene.apache.org
>>
>
> --------------------------
> Grant Ingersoll
>
> Lucene Helpful Hints:
> http://wiki.apache.org/lucene-java/BasicsOfPerformance
> http://wiki.apache.org/lucene-java/LuceneFAQ
>
>
>
>
>
>
>
>
>
>
>
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
> For additional commands, e-mail: java-user-help@lucene.apache.org
>

---------------------------------------------------------------------
To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-user-help@lucene.apache.org

Re: Search Test file

Posted by Grant Ingersoll <gs...@apache.org>.

On Jan 4, 2009, at 2:49 AM, Amin Mohammed-Coleman wrote:

> Hi Grant
>
> Thank you for looking at the test case.  I have updated the  
> IndexWriter to use UNLIMITED for MaxFieldLength.   I tried using  
> Integer.MAX_VALUE for
>
>>> Also,
>>> TopDocs topDocs = multiSearcher.search(query,  
>>> BooleanQuery.getMaxClauseCount());
>>>
>>> strikes me as really odd.  Why are you passing in the max clause  
>>> count as the number of results you want returned?
>
>

Just pass in something like "10".

> However I get the following exception :
>
> java.lang.NegativeArraySizeException
> 	at  
> org.apache.lucene.util.PriorityQueue.initialize(PriorityQueue.java:41)
> 	at org.apache.lucene.search.HitQueue.<init>(HitQueue.java:24)
> 	at org.apache.lucene.search.MultiSearcher.search(MultiSearcher.java: 
> 200)
> 	at org.apache.lucene.search.Searcher.search(Searcher.java:136)
> 	at org.apache.lucene.search.Searcher.search(Searcher.java:146)
> 	at  
> com 
> .amin 
> .app 
> .lucene 
> .search.impl.SearchTest.testCanSearchRtfDocument(SearchTest.java:101)
> 	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
> 	at  
> sun 
> .reflect 
> .NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:39)
> 	at  
> sun 
> .reflect 
> .DelegatingMethodAccessorImpl 
> .invoke(DelegatingMethodAccessorImpl.java:25)
> 	at java.lang.reflect.Method.invoke(Method.java:597)
> 	at org.junit.internal.runners.TestMethod.invoke(TestMethod.java:59)
> 	at  
> org 
> .junit.internal.runners.MethodRoadie.runTestMethod(MethodRoadie.java: 
> 98)
> 	at org.junit.internal.runners.MethodRoadie$2.run(MethodRoadie.java: 
> 79)
> 	at  
> org 
> .junit 
> .internal 
> .runners.MethodRoadie.runBeforesThenTestThenAfters(MethodRoadie.java: 
> 87)
> 	at  
> org.junit.internal.runners.MethodRoadie.runTest(MethodRoadie.java:77)
> 	at org.junit.internal.runners.MethodRoadie.run(MethodRoadie.java:42)
> 	at  
> org 
> .junit 
> .internal 
> .runners.JUnit4ClassRunner.invokeTestMethod(JUnit4ClassRunner.java:88)
> 	at  
> org 
> .junit 
> .internal 
> .runners.JUnit4ClassRunner.runMethods(JUnit4ClassRunner.java:51)
> 	at org.junit.internal.runners.JUnit4ClassRunner 
> $1.run(JUnit4ClassRunner.java:44)
> 	at  
> org 
> .junit.internal.runners.ClassRoadie.runUnprotected(ClassRoadie.java: 
> 27)
> 	at  
> org.junit.internal.runners.ClassRoadie.runProtected(ClassRoadie.java: 
> 37)
> 	at  
> org 
> .junit.internal.runners.JUnit4ClassRunner.run(JUnit4ClassRunner.java: 
> 42)
> 	at  
> org 
> .eclipse 
> .jdt 
> .internal 
> .junit4.runner.JUnit4TestReference.run(JUnit4TestReference.java:45)
> 	at  
> org 
> .eclipse 
> .jdt.internal.junit.runner.TestExecution.run(TestExecution.java:38)
> 	at  
> org 
> .eclipse 
> .jdt 
> .internal 
> .junit.runner.RemoteTestRunner.runTests(RemoteTestRunner.java:460)
> 	at  
> org 
> .eclipse 
> .jdt 
> .internal 
> .junit.runner.RemoteTestRunner.runTests(RemoteTestRunner.java:673)
> 	at  
> org 
> .eclipse 
> .jdt 
> .internal.junit.runner.RemoteTestRunner.run(RemoteTestRunner.java:386)
> 	at  
> org 
> .eclipse 
> .jdt 
> .internal.junit.runner.RemoteTestRunner.main(RemoteTestRunner.java: 
> 196)
>
>
> I know that this is an issue (not being able to use  
> Integer.MAX_VALUE).  I tried using 100 and my test still doesn't pass.
>
>
> Cheers
> Amin
>
>
> On 4 Jan 2009, at 02:23, Grant Ingersoll wrote:
>
>>
>>
>> Begin forwarded message:
>>
>>> From: Grant Ingersoll <gs...@apache.org>
>>> Date: January 3, 2009 8:19:14 PM EST
>>> To: java-dev@lucene.apache.org
>>> Subject: Fwd: Search Test file
>>> Reply-To: java-dev@lucene.apache.org
>>>
>>> Hi Amin,
>>>
>>> I see a couple of issues with your program below, and one that is  
>>> the cause of the problem of not finding "amin" as a query term.
>>>
>>> When you construct your IndexWriter, you are doing:
>>>> IndexWriter indexWriter = new  
>>>> IndexWriter(getDirectory(),getAnalyzer(),new  
>>>> IndexWriter.MaxFieldLength(2));
>>>
>>> The MaxFieldLength parameter specifies the maximum number of  
>>> tokens allowed in a Field.  Everything else after that is  
>>> dropped.  See http://lucene.apache.org/java/2_4_0/api/core/org/apache/lucene/index/IndexWriter.html#IndexWriter(org.apache.lucene.store.Directory,%20org.apache.lucene.analysis.Analyzer,%20org.apache.lucene.index.IndexWriter.MaxFieldLength) 
>>>  and http://lucene.apache.org/java/2_4_0/api/core/org/apache/lucene/index/IndexWriter.MaxFieldLength.html
>>>
>>> Also,
>>> TopDocs topDocs = multiSearcher.search(query,  
>>> BooleanQuery.getMaxClauseCount());
>>>
>>> strikes me as really odd.  Why are you passing in the max clause  
>>> count as the number of results you want returned?
>>>
>>> Cheers,
>>> Grant
>>>
>>>
>>>
>>> Begin forwarded message:
>>>
>>>> From: "aminmc@gmail.com" <am...@gmail.com>
>>>> Date: January 3, 2009 3:24:52 PM EST
>>>> To: gsingers@apache.org
>>>> Subject: Search Test file
>>>>
>>>> I've shared a document with you called "Search Test file":
>>>> http://docs.google.com/Doc?id=d77xf5q_0n6hb38fx&invite=cjq79zj
>>>>
>>>> It's not an attachment -- it's stored online at Google Docs. To  
>>>> open this document, just click the link above.
>>>> ---
>>>>
>>>> Hi
>>>>
>>>> I have uploaded the test file at google docs. It is currently a  
>>>> txt file but if you change the extension to .java it should work.
>>>>
>>>> package com.amin.app.lucene.search.impl;
>>>>
>>>> import static org.junit.Assert.assertEquals;
>>>> import static org.junit.Assert.assertNotNull;
>>>> import static org.junit.Assert.assertNotSame;
>>>> import static org.junit.Assert.assertTrue;
>>>>
>>>> import java.io.File;
>>>> import java.io.FileInputStream;
>>>> import java.io.FileOutputStream;
>>>> import java.io.IOException;
>>>> import java.io.InputStream;
>>>> import java.io.OutputStream;
>>>>
>>>> import javax.swing.text.BadLocationException;
>>>> import javax.swing.text.DefaultStyledDocument;
>>>> import javax.swing.text.rtf.RTFEditorKit;
>>>>
>>>> import org.apache.commons.lang.StringUtils;
>>>> import org.apache.lucene.analysis.Analyzer;
>>>> import org.apache.lucene.analysis.standard.StandardAnalyzer;
>>>> import org.apache.lucene.ant.DocumentHandler;
>>>> import org.apache.lucene.ant.DocumentHandlerException;
>>>> import org.apache.lucene.document.Document;
>>>> import org.apache.lucene.document.Field;
>>>> import org.apache.lucene.index.CorruptIndexException;
>>>> import org.apache.lucene.index.IndexReader;
>>>> import org.apache.lucene.index.IndexWriter;
>>>> import org.apache.lucene.queryParser.MultiFieldQueryParser;
>>>> import org.apache.lucene.queryParser.QueryParser;
>>>> import org.apache.lucene.search.BooleanQuery;
>>>> import org.apache.lucene.search.IndexSearcher;
>>>> import org.apache.lucene.search.MultiSearcher;
>>>> import org.apache.lucene.search.Query;
>>>> import org.apache.lucene.search.ScoreDoc;
>>>> import org.apache.lucene.search.Searchable;
>>>> import org.apache.lucene.search.TopDocs;
>>>> import org.apache.lucene.store.Directory;
>>>> import org.apache.lucene.store.FSDirectory;
>>>> import org.junit.After;
>>>> import org.junit.Before;
>>>> import org.junit.Test;
>>>>
>>>> import com.amin.app.lucene.util.WorkItem.IndexerType;
>>>>
>>>> public class SearchTest {
>>>>
>>>> private File rtfFile = null;
>>>> private static final String RTF_FILE_NAME =  
>>>> "rtfDocumentToIndex.rtf";
>>>>
>>>> @Before
>>>> public void setUp() throws Exception {
>>>> InputStream inputStream =  
>>>> this 
>>>> .getClass().getClassLoader().getResourceAsStream(RTF_FILE_NAME);
>>>> rtfFile = new File(RTF_FILE_NAME);
>>>> convertInputStreamToFile(inputStream, rtfFile);
>>>> }
>>>>
>>>>
>>>>
>>>> @Test
>>>> public void testCanCreateLuceneDocumentForRTFDocument() throws  
>>>> Exception {
>>>> JavaBuiltInRTFHandler builtInRTFHandler = new  
>>>> JavaBuiltInRTFHandler();
>>>> Document document = builtInRTFHandler.getDocument(rtfFile);
>>>> assertNotNull(document);
>>>> String value = document.get(FieldNameEnum.BODY.getDescription());
>>>> assertNotNull(value);
>>>> assertNotSame("", value);
>>>> assertTrue(value.contains("Amin Mohammed-Coleman"));
>>>> assertTrue(value.contains("This is a test rtf document that will  
>>>> be indexed."));
>>>> String path = document.get(FieldNameEnum.PATH.getDescription());
>>>> assertNotNull(path);
>>>> assertTrue(path.contains(".rtf"));
>>>> String fileName =  
>>>> document.get(FieldNameEnum.NAME.getDescription());
>>>> assertNotNull(fileName);
>>>> assertEquals(RTF_FILE_NAME, fileName);
>>>> assertEquals(WorkItem.IndexerType.RTF_INDEXER.name(),  
>>>> document.get(FieldNameEnum.TYPE.getDescription()));
>>>>
>>>> }
>>>>
>>>>
>>>>
>>>> @Test
>>>> public void testCanSearchRtfDocument() throws Exception {
>>>> JavaBuiltInRTFHandler builtInRTFHandler = new  
>>>> JavaBuiltInRTFHandler();
>>>> Document document = builtInRTFHandler.getDocument(rtfFile);
>>>> IndexWriter indexWriter = new  
>>>> IndexWriter(getDirectory(),getAnalyzer(),new  
>>>> IndexWriter.MaxFieldLength(2));
>>>> try {
>>>> indexWriter.addDocument(document);
>>>> commitAndCloseWriter(indexWriter);
>>>> } catch (CorruptIndexException e) {
>>>> throw new IllegalStateException(e);
>>>> } catch (IOException e) {
>>>> throw new IllegalStateException(e);
>>>> }
>>>>
>>>> //I plan to use other searchers later
>>>> IndexSearcher indexSearcher = new IndexSearcher(getDirectory());
>>>> MultiSearcher multiSearcher = new MultiSearcher(new Searchable[]  
>>>> {indexSearcher});
>>>> QueryParser queryParser = new MultiFieldQueryParser(new String[]  
>>>> {FieldNameEnum.BODY.getDescription()}, new StandardAnalyzer());
>>>> Query query = queryParser.parse("amin");
>>>> TopDocs topDocs = multiSearcher.search(query,  
>>>> BooleanQuery.getMaxClauseCount());
>>>> assertNotNull(topDocs);
>>>> assertEquals(1, topDocs.totalHits);
>>>> ScoreDoc[] scoreDocs = topDocs.scoreDocs;
>>>> for (ScoreDoc scoreDoc : scoreDocs) {
>>>> Document documentFromSearch = indexSearcher.doc(scoreDoc.doc);
>>>> assertNotNull(documentFromSearch);
>>>> String bodyText =  
>>>> documentFromSearch.get(FieldNameEnum.BODY.getDescription());
>>>> assertNotNull(bodyText);
>>>> assertNotSame("", bodyText);
>>>> assertTrue(bodyText.contains("Amin Mohammed-Coleman"));
>>>> assertTrue(bodyText.contains("This is a test rtf document that  
>>>> will be indexed."));
>>>>
>>>> }
>>>> multiSearcher.close();
>>>>
>>>> }
>>>>
>>>> @After
>>>> public void tearDown() throws Exception {
>>>> rtfFile.delete();
>>>> if (getDirectory().list() != null && getDirectory().list().length  
>>>> > 0) {
>>>> IndexReader reader = IndexReader.open(getDirectory());
>>>> for(int i = 0; i < reader.maxDoc();i++) {
>>>> reader.deleteDocument(i);
>>>> }
>>>> reader.close();
>>>> }
>>>> }
>>>>
>>>> private void commitAndCloseWriter(IndexWriter indexWriter) throws  
>>>> CorruptIndexException,IOException {
>>>> indexWriter.commit();
>>>> indexWriter.close();
>>>> }
>>>>
>>>>
>>>> public Directory getDirectory() throws IOException {
>>>> return FSDirectory.getDirectory("/tmp/lucene/rtf");
>>>> }
>>>>
>>>> public Analyzer getAnalyzer() {
>>>> return new StandardAnalyzer();
>>>> }
>>>> private static void convertInputStreamToFile(InputStream  
>>>> inputStream, File file) {
>>>> try
>>>>   {
>>>>   OutputStream out=new FileOutputStream(file);
>>>>   byte buf[]=new byte[1024];
>>>>   int len;
>>>>   while((len=inputStream.read(buf))>0)
>>>>   out.write(buf,0,len);
>>>>   out.close();
>>>>   inputStream.close();
>>>>
>>>>   }catch (IOException e){
>>>>   throw new IllegalStateException(e);
>>>>   }
>>>> }
>>>> private static class JavaBuiltInRTFHandler implements  
>>>> DocumentHandler{
>>>>
>>>> public Document getDocument(File file) throws  
>>>> DocumentHandlerException {
>>>> String bodyText = null;
>>>> DefaultStyledDocument styledDoc = new DefaultStyledDocument();
>>>> try {
>>>> InputStream inputStream = new FileInputStream(file);
>>>> new RTFEditorKit().read(inputStream, styledDoc, 0);
>>>> bodyText = styledDoc.getText(0, styledDoc.getLength());
>>>> } catch (IOException ioex) {
>>>> throw new IllegalStateException(ioex);
>>>> } catch (BadLocationException e) {
>>>> throw new IllegalArgumentException(e);
>>>> }
>>>> //create Document object using body
>>>> if (bodyText != null) {
>>>> Document document = new Document();
>>>> String trimmedBodyText = StringUtils.trimToEmpty(bodyText);
>>>> trimmedBodyText = trimmedBodyText.replaceAll("\n", "");
>>>> Field field = new  
>>>> Field(FieldNameEnum.BODY.getDescription(),trimmedBodyText,  
>>>> Field.Store.YES, Field.Index.ANALYZED);
>>>> document.add(field);
>>>>
>>>> String pathToFile = file.getPath();
>>>> Field pathToFileField = new  
>>>> Field(FieldNameEnum.PATH.getDescription(),pathToFile,  
>>>> Field.Store.YES, Field.Index.NOT_ANALYZED);
>>>> document.add(pathToFileField);
>>>>
>>>> String fileName = file.getName();
>>>> Field fileNameField = new  
>>>> Field(FieldNameEnum.NAME.getDescription(),fileName,  
>>>> Field.Store.YES, Field.Index.NOT_ANALYZED);
>>>> document.add(fileNameField);
>>>>
>>>> Field typeField = new  
>>>> Field 
>>>> (FieldNameEnum 
>>>> .TYPE.getDescription(),IndexerType.RTF_INDEXER.name(),  
>>>> Field.Store.YES, Field.Index.NOT_ANALYZED);
>>>> document.add(typeField);
>>>>
>>>> String summary = bodyText.substring(0, 10);
>>>>
>>>> Field summaryField = new  
>>>> Field(FieldNameEnum.SUMMARY.getDescription(),summary,  
>>>> Field.Store.YES, Field.Index.NOT_ANALYZED);
>>>> document.add(summaryField);
>>>>
>>>> return document;
>>>> }
>>>> return null;
>>>> }
>>>> }
>>>>
>>>> private static class WorkItem {
>>>>
>>>> public enum WorkItemEvent {
>>>> ADD,
>>>> UPDATE,
>>>> DELETE;
>>>> }
>>>>
>>>> public enum IndexerType {
>>>> RTF_INDEXER,
>>>> PDF_INDEXER,
>>>> XML_INDEXER,
>>>> PLAIN_TEXT_INDEXER,
>>>> MS_WORD_INDEXER,
>>>> MS_EXCEL_INDEXER,
>>>> MS_POWERPOINT_INDEXER;
>>>> }
>>>>
>>>>
>>>> private final Document workLoad;
>>>>
>>>> private final WorkItemEvent workItemEvent;
>>>>
>>>> private final IndexerType indexerType;
>>>>
>>>>
>>>> public WorkItem(final Document workLoad, final WorkItemEvent  
>>>> workItemEvent) {
>>>> this.workLoad = workLoad;
>>>> this.workItemEvent = workItemEvent;
>>>> String type = this.workLoad.get("type");
>>>> this.indexerType = IndexerType.valueOf(type);
>>>> }
>>>>
>>>> public IndexerType getIndexerType() {
>>>> return indexerType;
>>>> }
>>>>
>>>> public Document getWorkLoad() {
>>>> return workLoad;
>>>> }
>>>>
>>>> public WorkItemEvent getWorkItemEvent() {
>>>> return workItemEvent;
>>>> }
>>>> }
>>>>
>>>> private enum FieldNameEnum {
>>>>
>>>> AUTHOR("author"),
>>>> BODY("body"),
>>>> TITLE("title"),
>>>> SUBJECT("subject"),
>>>> KEYWORDS("keywords"),
>>>> PATH("path"), NAME ("name"),
>>>> TYPE("type"),
>>>> ID ("id"),
>>>> SUMMARY ("summary");
>>>>
>>>> private final String description;
>>>>
>>>> private FieldNameEnum(final String description) {
>>>> this.description = description;
>>>> }
>>>>
>>>> public String getDescription() {
>>>> return this.description;
>>>> }
>>>> }
>>>> }
>>>
>>> --------------------------
>>> Grant Ingersoll
>>>
>>> Lucene Helpful Hints:
>>> http://wiki.apache.org/lucene-java/BasicsOfPerformance
>>> http://wiki.apache.org/lucene-java/LuceneFAQ
>>>
>>>
>>>
>>>
>>>
>>>
>>>
>>>
>>>
>>>
>>
>> --------------------------
>> Grant Ingersoll
>>
>> Lucene Helpful Hints:
>> http://wiki.apache.org/lucene-java/BasicsOfPerformance
>> http://wiki.apache.org/lucene-java/LuceneFAQ
>>
>>
>>
>>
>>
>>
>>
>>
>>
>>
>
>
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
> For additional commands, e-mail: java-user-help@lucene.apache.org
>

--------------------------
Grant Ingersoll

Lucene Helpful Hints:
http://wiki.apache.org/lucene-java/BasicsOfPerformance
http://wiki.apache.org/lucene-java/LuceneFAQ











---------------------------------------------------------------------
To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-user-help@lucene.apache.org

Re: Search Test file

Posted by Amin Mohammed-Coleman <am...@gmail.com>.

Hi Grant

Thank you for looking at the test case.  I have updated the  
IndexWriter to use UNLIMITED for MaxFieldLength.  I also tried using  
Integer.MAX_VALUE as the number of results in the search call you asked about:

>> Also,
>> TopDocs topDocs = multiSearcher.search(query,  
>> BooleanQuery.getMaxClauseCount());
>>
>> strikes me as really odd.  Why are you passing in the max clause  
>> count as the number of results you want returned?


However I get the following exception :

java.lang.NegativeArraySizeException
	at org.apache.lucene.util.PriorityQueue.initialize(PriorityQueue.java: 
41)
	at org.apache.lucene.search.HitQueue.<init>(HitQueue.java:24)
	at org.apache.lucene.search.MultiSearcher.search(MultiSearcher.java: 
200)
	at org.apache.lucene.search.Searcher.search(Searcher.java:136)
	at org.apache.lucene.search.Searcher.search(Searcher.java:146)
	at  
com 
.amin 
.app 
.lucene 
.search.impl.SearchTest.testCanSearchRtfDocument(SearchTest.java:101)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at  
sun 
.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java: 
39)
	at  
sun 
.reflect 
.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java: 
25)
	at java.lang.reflect.Method.invoke(Method.java:597)
	at org.junit.internal.runners.TestMethod.invoke(TestMethod.java:59)
	at  
org 
.junit.internal.runners.MethodRoadie.runTestMethod(MethodRoadie.java:98)
	at org.junit.internal.runners.MethodRoadie$2.run(MethodRoadie.java:79)
	at  
org 
.junit 
.internal 
.runners.MethodRoadie.runBeforesThenTestThenAfters(MethodRoadie.java:87)
	at org.junit.internal.runners.MethodRoadie.runTest(MethodRoadie.java: 
77)
	at org.junit.internal.runners.MethodRoadie.run(MethodRoadie.java:42)
	at  
org 
.junit 
.internal 
.runners.JUnit4ClassRunner.invokeTestMethod(JUnit4ClassRunner.java:88)
	at  
org 
.junit 
.internal.runners.JUnit4ClassRunner.runMethods(JUnit4ClassRunner.java: 
51)
	at org.junit.internal.runners.JUnit4ClassRunner 
$1.run(JUnit4ClassRunner.java:44)
	at  
org.junit.internal.runners.ClassRoadie.runUnprotected(ClassRoadie.java: 
27)
	at  
org.junit.internal.runners.ClassRoadie.runProtected(ClassRoadie.java:37)
	at  
org 
.junit.internal.runners.JUnit4ClassRunner.run(JUnit4ClassRunner.java:42)
	at  
org 
.eclipse 
.jdt 
.internal 
.junit4.runner.JUnit4TestReference.run(JUnit4TestReference.java:45)
	at  
org 
.eclipse 
.jdt.internal.junit.runner.TestExecution.run(TestExecution.java:38)
	at  
org 
.eclipse 
.jdt 
.internal.junit.runner.RemoteTestRunner.runTests(RemoteTestRunner.java: 
460)
	at  
org 
.eclipse 
.jdt 
.internal.junit.runner.RemoteTestRunner.runTests(RemoteTestRunner.java: 
673)
	at  
org 
.eclipse 
.jdt.internal.junit.runner.RemoteTestRunner.run(RemoteTestRunner.java: 
386)
	at  
org 
.eclipse 
.jdt.internal.junit.runner.RemoteTestRunner.main(RemoteTestRunner.java: 
196)


I know that not being able to use Integer.MAX_VALUE as the number of  
results is a known issue.  I also tried using 100 instead, and my test  
still doesn't pass.


Cheers
Amin


On 4 Jan 2009, at 02:23, Grant Ingersoll wrote:

>
>
> Begin forwarded message:
>
>> From: Grant Ingersoll <gs...@apache.org>
>> Date: January 3, 2009 8:19:14 PM EST
>> To: java-dev@lucene.apache.org
>> Subject: Fwd: Search Test file
>> Reply-To: java-dev@lucene.apache.org
>>
>> Hi Amin,
>>
>> I see a couple of issues with your program below, and one that is  
>> the cause of the problem of not finding "amin" as a query term.
>>
>> When you construct your IndexWriter, you are doing:
>>> IndexWriter indexWriter = new  
>>> IndexWriter(getDirectory(),getAnalyzer(),new  
>>> IndexWriter.MaxFieldLength(2));
>>
>> The MaxFieldLength parameter specifies the maximum number of tokens  
>> allowed in a Field.  Everything else after that is dropped.  See http://lucene.apache.org/java/2_4_0/api/core/org/apache/lucene/index/IndexWriter.html#IndexWriter(org.apache.lucene.store.Directory,%20org.apache.lucene.analysis.Analyzer,%20org.apache.lucene.index.IndexWriter.MaxFieldLength) 
>>  and http://lucene.apache.org/java/2_4_0/api/core/org/apache/lucene/index/IndexWriter.MaxFieldLength.html
>>
>> Also,
>> TopDocs topDocs = multiSearcher.search(query,  
>> BooleanQuery.getMaxClauseCount());
>>
>> strikes me as really odd.  Why are you passing in the max clause  
>> count as the number of results you want returned?
>>
>> Cheers,
>> Grant
>>
>>
>>
>> Begin forwarded message:
>>
>>> From: "aminmc@gmail.com" <am...@gmail.com>
>>> Date: January 3, 2009 3:24:52 PM EST
>>> To: gsingers@apache.org
>>> Subject: Search Test file
>>>
>>> I've shared a document with you called "Search Test file":
>>> http://docs.google.com/Doc?id=d77xf5q_0n6hb38fx&invite=cjq79zj
>>>
>>> It's not an attachment -- it's stored online at Google Docs. To  
>>> open this document, just click the link above.
>>> ---
>>>
>>> Hi
>>>
>>> I have uploaded the test file at google docs. It is currently a  
>>> txt file but if you change the extension to .java it should work.
>>>
>>> package com.amin.app.lucene.search.impl;
>>>
>>> import static org.junit.Assert.assertEquals;
>>> import static org.junit.Assert.assertNotNull;
>>> import static org.junit.Assert.assertNotSame;
>>> import static org.junit.Assert.assertTrue;
>>>
>>> import java.io.File;
>>> import java.io.FileInputStream;
>>> import java.io.FileOutputStream;
>>> import java.io.IOException;
>>> import java.io.InputStream;
>>> import java.io.OutputStream;
>>>
>>> import javax.swing.text.BadLocationException;
>>> import javax.swing.text.DefaultStyledDocument;
>>> import javax.swing.text.rtf.RTFEditorKit;
>>>
>>> import org.apache.commons.lang.StringUtils;
>>> import org.apache.lucene.analysis.Analyzer;
>>> import org.apache.lucene.analysis.standard.StandardAnalyzer;
>>> import org.apache.lucene.ant.DocumentHandler;
>>> import org.apache.lucene.ant.DocumentHandlerException;
>>> import org.apache.lucene.document.Document;
>>> import org.apache.lucene.document.Field;
>>> import org.apache.lucene.index.CorruptIndexException;
>>> import org.apache.lucene.index.IndexReader;
>>> import org.apache.lucene.index.IndexWriter;
>>> import org.apache.lucene.queryParser.MultiFieldQueryParser;
>>> import org.apache.lucene.queryParser.QueryParser;
>>> import org.apache.lucene.search.BooleanQuery;
>>> import org.apache.lucene.search.IndexSearcher;
>>> import org.apache.lucene.search.MultiSearcher;
>>> import org.apache.lucene.search.Query;
>>> import org.apache.lucene.search.ScoreDoc;
>>> import org.apache.lucene.search.Searchable;
>>> import org.apache.lucene.search.TopDocs;
>>> import org.apache.lucene.store.Directory;
>>> import org.apache.lucene.store.FSDirectory;
>>> import org.junit.After;
>>> import org.junit.Before;
>>> import org.junit.Test;
>>>
>>> import com.amin.app.lucene.util.WorkItem.IndexerType;
>>>
>>> public class SearchTest {
>>>
>>> private File rtfFile = null;
>>> private static final String RTF_FILE_NAME =  
>>> "rtfDocumentToIndex.rtf";
>>>
>>> /**
>>>  * Copies the RTF test resource from the classpath into a local file so the
>>>  * tests can pass a real {@link File} to the document handler.
>>>  *
>>>  * @throws Exception if the resource is missing or cannot be copied
>>>  */
>>> @Before
>>> public void setUp() throws Exception {
>>>     // Assign first so tearDown() always has a file to delete.
>>>     rtfFile = new File(RTF_FILE_NAME);
>>>     InputStream inputStream =
>>>             this.getClass().getClassLoader().getResourceAsStream(RTF_FILE_NAME);
>>>     // getResourceAsStream returns null for a missing resource; fail here with a
>>>     // clear message instead of a NullPointerException inside the copy loop.
>>>     assertNotNull("Test resource not found on classpath: " + RTF_FILE_NAME,
>>>             inputStream);
>>>     convertInputStreamToFile(inputStream, rtfFile);
>>> }
>>>
>>>
>>>
>>> /**
>>>  * Verifies that the RTF handler produces a document whose body, path, name
>>>  * and type fields are populated from the test file.
>>>  *
>>>  * @throws Exception if the document cannot be built
>>>  */
>>> @Test
>>> public void testCanCreateLuceneDocumentForRTFDocument() throws Exception {
>>>     JavaBuiltInRTFHandler builtInRTFHandler = new JavaBuiltInRTFHandler();
>>>     Document document = builtInRTFHandler.getDocument(rtfFile);
>>>     assertNotNull(document);
>>>
>>>     String value = document.get(FieldNameEnum.BODY.getDescription());
>>>     assertNotNull(value);
>>>     // assertNotSame compares references, not contents, so it could not detect
>>>     // an empty body; check the length instead.
>>>     assertTrue(value.length() > 0);
>>>     assertTrue(value.contains("Amin Mohammed-Coleman"));
>>>     assertTrue(value.contains("This is a test rtf document that will be indexed."));
>>>
>>>     String path = document.get(FieldNameEnum.PATH.getDescription());
>>>     assertNotNull(path);
>>>     assertTrue(path.contains(".rtf"));
>>>
>>>     String fileName = document.get(FieldNameEnum.NAME.getDescription());
>>>     assertNotNull(fileName);
>>>     assertEquals(RTF_FILE_NAME, fileName);
>>>
>>>     assertEquals(WorkItem.IndexerType.RTF_INDEXER.name(),
>>>             document.get(FieldNameEnum.TYPE.getDescription()));
>>> }
>>>
>>>
>>>
>>> @Test
>>> public void testCanSearchRtfDocument() throws Exception {
>>> JavaBuiltInRTFHandler builtInRTFHandler = new  
>>> JavaBuiltInRTFHandler();
>>> Document document = builtInRTFHandler.getDocument(rtfFile);
>>> IndexWriter indexWriter = new  
>>> IndexWriter(getDirectory(),getAnalyzer(),new  
>>> IndexWriter.MaxFieldLength(2));
>>> try {
>>> indexWriter.addDocument(document);
>>> commitAndCloseWriter(indexWriter);
>>> } catch (CorruptIndexException e) {
>>> throw new IllegalStateException(e);
>>> } catch (IOException e) {
>>> throw new IllegalStateException(e);
>>> }
>>>
>>> //I plan to use other searchers later
>>> IndexSearcher indexSearcher = new IndexSearcher(getDirectory());
>>> MultiSearcher multiSearcher = new MultiSearcher(new Searchable[]  
>>> {indexSearcher});
>>> QueryParser queryParser = new MultiFieldQueryParser(new String[]  
>>> {FieldNameEnum.BODY.getDescription()}, new StandardAnalyzer());
>>> Query query = queryParser.parse("amin");
>>> TopDocs topDocs = multiSearcher.search(query,  
>>> BooleanQuery.getMaxClauseCount());
>>> assertNotNull(topDocs);
>>> assertEquals(1, topDocs.totalHits);
>>> ScoreDoc[] scoreDocs = topDocs.scoreDocs;
>>> for (ScoreDoc scoreDoc : scoreDocs) {
>>> Document documentFromSearch = indexSearcher.doc(scoreDoc.doc);
>>> assertNotNull(documentFromSearch);
>>> String bodyText =  
>>> documentFromSearch.get(FieldNameEnum.BODY.getDescription());
>>> assertNotNull(bodyText);
>>> assertNotSame("", bodyText);
>>> assertTrue(bodyText.contains("Amin Mohammed-Coleman"));
>>> assertTrue(bodyText.contains("This is a test rtf document that  
>>> will be indexed."));
>>>
>>> }
>>> multiSearcher.close();
>>>
>>> }
>>>
>>> @After
>>> public void tearDown() throws Exception {
>>> rtfFile.delete();
>>> if (getDirectory().list() != null && getDirectory().list().length  
>>> > 0) {
>>> IndexReader reader = IndexReader.open(getDirectory());
>>> for(int i = 0; i < reader.maxDoc();i++) {
>>> reader.deleteDocument(i);
>>> }
>>> reader.close();
>>> }
>>> }
>>>
>>> /**
>>>  * Commits the writer's pending changes and then closes it.
>>>  *
>>>  * @param indexWriter the writer to commit and close
>>>  * @throws CorruptIndexException if the index is corrupt
>>>  * @throws IOException if committing or closing fails
>>>  */
>>> private void commitAndCloseWriter(IndexWriter indexWriter) throws
>>>         CorruptIndexException, IOException {
>>>     try {
>>>         indexWriter.commit();
>>>     } finally {
>>>         // Still attempt to close when commit() fails so the writer is not left open.
>>>         indexWriter.close();
>>>     }
>>> }
>>>
>>>
>>> /**
>>>  * Returns the file-system directory that holds the test index.
>>>  *
>>>  * @return the Lucene directory for {@code /tmp/lucene/rtf}
>>>  * @throws IOException if the directory cannot be obtained
>>>  */
>>> public Directory getDirectory() throws IOException {
>>>     final String indexPath = "/tmp/lucene/rtf";
>>>     return FSDirectory.getDirectory(indexPath);
>>> }
>>>
>>> /**
>>>  * Creates the analyzer used by these tests.
>>>  *
>>>  * @return a new {@link StandardAnalyzer}
>>>  */
>>> public Analyzer getAnalyzer() {
>>>     final Analyzer standardAnalyzer = new StandardAnalyzer();
>>>     return standardAnalyzer;
>>> }
>>> /**
>>>  * Copies everything readable from {@code inputStream} into {@code file}.
>>>  * Both streams are closed before this method returns, whether or not the
>>>  * copy succeeds.
>>>  *
>>>  * @param inputStream source of the bytes; closed by this method
>>>  * @param file destination file
>>>  * @throws IllegalStateException if reading, writing or closing fails
>>>  */
>>> private static void convertInputStreamToFile(InputStream inputStream, File file) {
>>>     try {
>>>         try {
>>>             OutputStream out = new FileOutputStream(file);
>>>             try {
>>>                 byte[] buf = new byte[1024];
>>>                 int len;
>>>                 // read() returns -1 once the end of the stream is reached.
>>>                 while ((len = inputStream.read(buf)) != -1) {
>>>                     out.write(buf, 0, len);
>>>                 }
>>>             } finally {
>>>                 out.close();
>>>             }
>>>         } finally {
>>>             inputStream.close();
>>>         }
>>>     } catch (IOException e) {
>>>         throw new IllegalStateException(e);
>>>     }
>>> }
>>> private static class JavaBuiltInRTFHandler implements  
>>> DocumentHandler{
>>>
>>> public Document getDocument(File file) throws  
>>> DocumentHandlerException {
>>> String bodyText = null;
>>> DefaultStyledDocument styledDoc = new DefaultStyledDocument();
>>> try {
>>> InputStream inputStream = new FileInputStream(file);
>>> new RTFEditorKit().read(inputStream, styledDoc, 0);
>>> bodyText = styledDoc.getText(0, styledDoc.getLength());
>>> } catch (IOException ioex) {
>>> throw new IllegalStateException(ioex);
>>> } catch (BadLocationException e) {
>>> throw new IllegalArgumentException(e);
>>> }
>>> //create Document object using body
>>> if (bodyText != null) {
>>> Document document = new Document();
>>> String trimmedBodyText = StringUtils.trimToEmpty(bodyText);
>>> trimmedBodyText = trimmedBodyText.replaceAll("\n", "");
>>> Field field = new  
>>> Field(FieldNameEnum.BODY.getDescription(),trimmedBodyText,  
>>> Field.Store.YES, Field.Index.ANALYZED);
>>> document.add(field);
>>>
>>> String pathToFile = file.getPath();
>>> Field pathToFileField = new  
>>> Field(FieldNameEnum.PATH.getDescription(),pathToFile,  
>>> Field.Store.YES, Field.Index.NOT_ANALYZED);
>>> document.add(pathToFileField);
>>>
>>> String fileName = file.getName();
>>> Field fileNameField = new  
>>> Field(FieldNameEnum.NAME.getDescription(),fileName,  
>>> Field.Store.YES, Field.Index.NOT_ANALYZED);
>>> document.add(fileNameField);
>>>
>>> Field typeField = new  
>>> Field 
>>> (FieldNameEnum 
>>> .TYPE.getDescription(),IndexerType.RTF_INDEXER.name(),  
>>> Field.Store.YES, Field.Index.NOT_ANALYZED);
>>> document.add(typeField);
>>>
>>> String summary = bodyText.substring(0, 10);
>>>
>>> Field summaryField = new  
>>> Field(FieldNameEnum.SUMMARY.getDescription(),summary,  
>>> Field.Store.YES, Field.Index.NOT_ANALYZED);
>>> document.add(summaryField);
>>>
>>> return document;
>>> }
>>> return null;
>>> }
>>> }
>>>
>>> /**
>>>  * Immutable pairing of a Lucene document with the indexing event to apply to
>>>  * it; the indexer type is read from the document's type field.
>>>  */
>>> private static class WorkItem {
>>>
>>>     /** Kind of change requested for a document. */
>>>     public enum WorkItemEvent {
>>>         ADD,
>>>         UPDATE,
>>>         DELETE;
>>>     }
>>>
>>>     /** Indexers a document can be routed to. */
>>>     public enum IndexerType {
>>>         RTF_INDEXER,
>>>         PDF_INDEXER,
>>>         XML_INDEXER,
>>>         PLAIN_TEXT_INDEXER,
>>>         MS_WORD_INDEXER,
>>>         MS_EXCEL_INDEXER,
>>>         MS_POWERPOINT_INDEXER;
>>>     }
>>>
>>>     private final Document workLoad;
>>>
>>>     private final WorkItemEvent workItemEvent;
>>>
>>>     private final IndexerType indexerType;
>>>
>>>     /**
>>>      * Creates a work item and resolves its indexer type from the document.
>>>      *
>>>      * @param workLoad the document to process; its type field must hold the
>>>      *        name of an {@link IndexerType} constant
>>>      * @param workItemEvent the event to apply to the document
>>>      * @throws IllegalArgumentException if the type field names no
>>>      *         {@link IndexerType} constant
>>>      */
>>>     public WorkItem(final Document workLoad, final WorkItemEvent workItemEvent) {
>>>         this.workLoad = workLoad;
>>>         this.workItemEvent = workItemEvent;
>>>         // Use the shared field-name constant instead of repeating the "type" literal.
>>>         String type = this.workLoad.get(FieldNameEnum.TYPE.getDescription());
>>>         this.indexerType = IndexerType.valueOf(type);
>>>     }
>>>
>>>     /** @return the indexer type resolved from the document's type field */
>>>     public IndexerType getIndexerType() {
>>>         return indexerType;
>>>     }
>>>
>>>     /** @return the document passed to the constructor */
>>>     public Document getWorkLoad() {
>>>         return workLoad;
>>>     }
>>>
>>>     /** @return the event passed to the constructor */
>>>     public WorkItemEvent getWorkItemEvent() {
>>>         return workItemEvent;
>>>     }
>>> }
>>>
>>> /** Names of the Lucene fields used by the documents in this test. */
>>> private enum FieldNameEnum {
>>>
>>>     AUTHOR("author"),
>>>     BODY("body"),
>>>     TITLE("title"),
>>>     SUBJECT("subject"),
>>>     KEYWORDS("keywords"),
>>>     PATH("path"),
>>>     NAME("name"),
>>>     TYPE("type"),
>>>     ID("id"),
>>>     SUMMARY("summary");
>>>
>>>     /** Field name as stored in the index. */
>>>     private final String description;
>>>
>>>     // Enum constructors are implicitly private.
>>>     FieldNameEnum(final String fieldName) {
>>>         description = fieldName;
>>>     }
>>>
>>>     /** @return the field name for this constant */
>>>     public String getDescription() {
>>>         return description;
>>>     }
>>> }
>>> }
>>
>> --------------------------
>> Grant Ingersoll
>>
>> Lucene Helpful Hints:
>> http://wiki.apache.org/lucene-java/BasicsOfPerformance
>> http://wiki.apache.org/lucene-java/LuceneFAQ
>>
>>
>>
>>
>>
>>
>>
>>
>>
>>
>
> --------------------------
> Grant Ingersoll
>
> Lucene Helpful Hints:
> http://wiki.apache.org/lucene-java/BasicsOfPerformance
> http://wiki.apache.org/lucene-java/LuceneFAQ
>
>
>
>
>
>
>
>
>
>


---------------------------------------------------------------------
To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-user-help@lucene.apache.org