You are viewing a plain text version of this content. The canonical link for it is here.

Posted to dev@lucene.apache.org by "Gaurav Gupta (JIRA)" <ji...@apache.org> on 2007/03/08 03:29:24 UTC

[jira] Created: (LUCENE-827) Lucene Spell Index Not giving Proper Result

Lucene Spell Index Not giving Proper Result
-------------------------------------------

                 Key: LUCENE-827
                 URL: https://issues.apache.org/jira/browse/LUCENE-827
             Project: Lucene - Java
          Issue Type: Bug
         Environment: Windows XP, Linux
            Reporter: Gaurav Gupta


I am passing the list of words 'Mayur Vihar Center Circle Udyog Vihar Noida Gurgaon' to create a spell index from a Lucene index. When I search for the correct word for 'Centrer', i.e. 'Center', it doesn't find it. I checked whether the word is present in the spell index, and it is not there.

Building the spell index directly from a plain text dictionary gives me the correct word for 'centre', i.e. 'center'. I can't understand why it is behaving like this.



Also attaching the source -: 

CreateDataStructure creates the Lucene Index and initializeSpellChecker initializes the spell Checker.



import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.sql.SQLException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.search.spell.LuceneDictionary;
import org.apache.lucene.search.spell.SpellChecker;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

/**
 * Spell checker backed by two on-disk Lucene indexes that live under a common
 * base directory: a "simple" index holding the known-good words and a "spell"
 * index that {@link SpellChecker} derives from it.
 *
 * Typical use: construct with an existing base directory, call
 * {@link #initialize(String)} once to build both indexes, then query with
 * {@link #GetMatches(String)} / {@link #GetBestMatch(String)}. Words can be
 * added later through the {@code addWords} overloads, each of which rebuilds
 * the spell index.
 */
public class SpellCheckImpl implements SpellCheck{

	/** Name of the single indexed field that holds the dictionary words. */
	private static final String DEFAULT_FIELD="field";

	/** Location of the Lucene index holding the known-good words. */
	private final String simpleDirectory;

	/** Location of the spell-checker index derived from {@link #simpleDirectory}. */
	private final String spellDirectory;

	/** Path of the word-list file handed to {@link #initialize(String)}. */
	private String dataTextFile;


	/**
	 * Configures the index locations as the {@code test} and {@code sp}
	 * sub-directories of the given base directory, creating them if needed.
	 *
	 * @param directoryPath path of an existing directory
	 * @throws IllegalArgumentException if {@code directoryPath} is null or is
	 *         not an existing directory
	 * @throws IllegalStateException if a sub-directory cannot be created
	 */
	public SpellCheckImpl(String directoryPath){

		// Fail fast: silently accepting a bad path would leave both index
		// locations unset and surface later as an unrelated-looking error.
		if(directoryPath == null || !new File(directoryPath).isDirectory()){
			throw new IllegalArgumentException("Not an existing directory: "+directoryPath);
		}

		this.simpleDirectory = directoryPath+"/test";
		this.spellDirectory = directoryPath+"/sp";

		ensureDirectory(new File(this.simpleDirectory));
		ensureDirectory(new File(this.spellDirectory));
	}


	/**
	 * Creates the directory unless it already exists.
	 *
	 * @param dir directory to create
	 * @throws IllegalStateException if the directory is missing and cannot be created
	 */
	private static void ensureDirectory(File dir){
		// Re-check isDirectory() after a failed mkdir() so that a directory
		// created concurrently by someone else is not reported as an error.
		if(!dir.isDirectory() && !dir.mkdir() && !dir.isDirectory()){
			throw new IllegalStateException("Could not create directory: "+dir.getPath());
		}
	}


	/**
	 * Builds the word index and then the spell-checker index derived from it.
	 * Failures are reported on standard output and not propagated.
	 *
	 * @param filePath path of the word-list file; the file must exist
	 */
	public void initialize(String filePath){

		this.dataTextFile = filePath;

		try{
			createDataStructure(simpleDirectory);
			initializeSpellChecker(simpleDirectory,spellDirectory);
		}catch(Exception e){
			// Print the exception itself rather than only getMessage(): the
			// message is null for many exceptions (e.g. NullPointerException).
			System.out.println("Initialization failed "+e);
		}
	}


	/**
	 * Creates a fresh index of good words at the given location.
	 *
	 * NOTE(review): the indexed text is still the hard-coded sample word list;
	 * the data file is opened (so a missing file is reported) but its content
	 * is not read. Confirm whether the file content should be indexed instead.
	 *
	 * @param origDirLocation location of the word index to (re)create
	 * @throws IOException if the data file cannot be opened or the index
	 *         cannot be written
	 */
	private void createDataStructure(String origDirLocation) throws IOException{

		// Open the data file before touching the index, so that a missing
		// file fails here and no half-initialised IndexWriter is left open.
		InputStream is = new FileInputStream(new File(dataTextFile));
		try{
			Directory directory = FSDirectory.getDirectory(origDirLocation, true);
			Analyzer analyzer = new StandardAnalyzer();
			IndexWriter iwriter = new IndexWriter(directory, analyzer, true);
			try{
				long time=System.currentTimeMillis();

				Document doc = new Document();
				doc.add(new Field(DEFAULT_FIELD, "Mayur Vihar Center Circle Udyog Vihar Noida Gurgaon", Field.Store.YES, Field.Index.TOKENIZED));

				iwriter.addDocument(doc);
				iwriter.optimize();
				time=System.currentTimeMillis()-time;
				System.out.println("time to Create Lucene Index "+time);
			}finally{
				// Always close the writer, including when indexing fails.
				iwriter.close();
			}
		}finally{
			is.close();
		}
	}


	/**
	 * Builds the spell-checker index from the terms of {@link #DEFAULT_FIELD}
	 * in the word index.
	 *
	 * @param origDirLocation location of the existing word index
	 * @param spellDirLocation location of the spell-checker index; opened with
	 *        {@code create=true}
	 * @throws IOException if either index cannot be read or written
	 */
	private void initializeSpellChecker(String origDirLocation, String spellDirLocation) throws IOException{
		FSDirectory origDir = FSDirectory.getDirectory(origDirLocation, false);
		FSDirectory spellDir = FSDirectory.getDirectory(spellDirLocation, true);

		long time=System.currentTimeMillis();

		IndexReader indexReader = IndexReader.open(origDir);
		try{
			SpellChecker spellChecker = new SpellChecker(spellDir);
			spellChecker.indexDictionnary(new LuceneDictionary(indexReader, DEFAULT_FIELD));
		}finally{
			// The reader is only needed while the dictionary is being indexed.
			indexReader.close();
		}

		time=System.currentTimeMillis()-time;

		System.out.println("time to build Spell Checker Dictionary "+time);
	}


	/**
	 * Returns up to 25 suggestions for the given word from the spell index.
	 *
	 * @param badWord the word to find suggestions for
	 * @return the suggestions as returned by {@code SpellChecker.suggestSimilar}
	 * @throws ParseException if the spell index cannot be read; the underlying
	 *         {@link IOException} is attached as the cause
	 */
	public String[] GetMatches(String badWord) throws ParseException
	{
		try
		{
			SpellChecker spellChecker = new SpellChecker(FSDirectory.getDirectory(spellDirectory,false));

			spellChecker.setAccuraty(0);

			// Diagnostic output: the queried word is itself in the spell index.
			if(spellChecker.exist(badWord)){
				System.out.println("here");
			}

			return spellChecker.suggestSimilar(badWord, 25);
		}
		catch (IOException e)
		{
			// Attach the cause with initCause(): no ParseException constructor
			// that accepts a cause is assumed to be available.
			ParseException wrapped = new ParseException(e.getMessage());
			wrapped.initCause(e);
			throw wrapped;
		}
	}


	/**
	 * Returns the first suggestion for the given word.
	 *
	 * @param badWord the word to correct
	 * @return the first suggestion, or the literal text
	 *         {@code "No Correct Spelling Found"} when there is none
	 * @throws ParseException if the spell index cannot be read
	 */
	public String GetBestMatch(String badWord) throws ParseException
	{
		String[] correctWords = GetMatches(badWord);

		if(correctWords != null && correctWords.length > 0){
			return correctWords[0];
		}

		return "No Correct Spelling Found";
	}


	/**
	 * Adds a single entry to the word index and rebuilds the spell index.
	 * A null or empty entry adds nothing, but the spell index is still rebuilt.
	 *
	 * @param word the text to add
	 * @throws IOException if an index cannot be read or written
	 */
	public void addWords(String word) throws IOException{
		addWords(new String[]{ word });
	}


	/**
	 * Adds each non-null, non-empty entry to the word index as its own
	 * document, then rebuilds the spell index.
	 *
	 * @param word the entries to add; may be null, in which case nothing is added
	 * @throws IOException if an index cannot be read or written
	 */
	public void addWords(String[]	word) throws IOException{

		long time=System.currentTimeMillis();
		Analyzer analyzer = new StandardAnalyzer();
		IndexWriter writer   = new IndexWriter(simpleDirectory, analyzer, false);

		try{
			if(word != null){
				for(int i=0;i<word.length;i++){
					if(word[i] != null && !"".equals(word[i])){
						Document doc = new Document();
						doc.add(new Field(DEFAULT_FIELD, word[i] , Field.Store.YES,
								Field.Index.TOKENIZED));
						writer.addDocument(doc);
					}
				}
			}

			writer.optimize();
		}finally{
			// Always close the writer, including when adding a document fails.
			writer.close();
		}

		time=System.currentTimeMillis()-time;
		initializeSpellChecker(simpleDirectory,spellDirectory);
		System.out.println("time to add words "+time);
	}

}


-- 
This message is automatically generated by JIRA.
-
You can reply to this email to add a comment to the issue online.


---------------------------------------------------------------------
To unsubscribe, e-mail: java-dev-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-dev-help@lucene.apache.org

[jira] Resolved: (LUCENE-827) Lucene Spell Index Not giving Proper Result

Posted by "Hoss Man (JIRA)" <ji...@apache.org>.

     [ https://issues.apache.org/jira/browse/LUCENE-827?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

Hoss Man resolved LUCENE-827.
-----------------------------

    Resolution: Invalid


If you have questions about using any Lucene Java code, please start by emailing the java-user mailing list ... bugs should only be filed once you have confirmed a bug truly does exist (after discussion with other users who confirm that your assumptions/methodology are correct).

Based on a quick reading of your problem description, I would suspect that if SpellChecker works for you when you build a "Dictionary" from plain text, but not when you use a LuceneDictionary, it may be because of the Analyzer you are using when building your initial index ... but please follow up in email on the user list.

> Lucene Spell Index Not giving Proper Result
> -------------------------------------------
>
>                 Key: LUCENE-827
>                 URL: https://issues.apache.org/jira/browse/LUCENE-827
>             Project: Lucene - Java
>          Issue Type: Bug
>         Environment: Windows XP, Linux
>            Reporter: Gaurav Gupta
>
> I am passing List of words 'Mayur Vihar Center Circle Udyog Vihar Noida Gurgaon' to create spell index from Lucene Index. when i searches for correct word for 'Centrer' i.e 'Center', it does'nt find it. I checked it whether its there in spell Index, i didnt find it there.
> By making the spell Index directly from Plain text Dictionary gives me the correct word for 'centre' i.e 'center'. I cant understand why it is behaving like this.
> Also attaching the source -: 
> CreateDataStructure creates the Lucene Index and initializeSpellChecker initializes the spell Checker.
> import java.io.File;
> import java.io.FileInputStream;
> import java.io.IOException;
> import java.io.InputStream;
> import java.io.InputStreamReader;
> import java.io.Reader;
> import java.sql.SQLException;
> import org.apache.lucene.analysis.Analyzer;
> import org.apache.lucene.analysis.standard.StandardAnalyzer;
> import org.apache.lucene.document.Document;
> import org.apache.lucene.document.Field;
> import org.apache.lucene.index.IndexReader;
> import org.apache.lucene.index.IndexWriter;
> import org.apache.lucene.queryParser.ParseException;
> import org.apache.lucene.search.spell.LuceneDictionary;
> import org.apache.lucene.search.spell.SpellChecker;
> import org.apache.lucene.store.Directory;
> import org.apache.lucene.store.FSDirectory;
> public class SpellCheckImpl implements SpellCheck{
> 	
> 	private String SIMPLE_DIRECTORY;
> 	private String SPELL_DIRECTORY;
> 	private String DATA_TEXT_FILE;
> 	private String DEFAULT_FIELD="field";
> 	
> 	
> 	
> 	private SpellCheckImpl(){
> 		
> 	}
> 	
> 	//Configure the directories
> 	
> 	public SpellCheckImpl(String directoryPath){
> 		
> 		File f = new File(directoryPath);
> 		
> 		if(f.isDirectory()){
> 			
> 			this.SIMPLE_DIRECTORY = directoryPath+"/test";
> 			this.SPELL_DIRECTORY = directoryPath+"/sp";
> 			
> 			File simple = new File(this.SIMPLE_DIRECTORY);
> 			File spell = new File(this.SPELL_DIRECTORY);
> 			
> 			if(!simple.isDirectory()){
> 				simple.mkdir();
> 			}
> 			
> 			if(!spell.isDirectory()){
> 				spell.mkdir();
> 			}
> 			
> 		}
> 		
> 	}
> 	
> 	
> 	/**
> 	 * Initialize the Dictionary with given Keywords
> 	 */
> 	public void initialize(String filePath){
> 		
> 		this.DATA_TEXT_FILE = filePath;
> 		
> 		try{
> 			
> 			createDataStructure(SIMPLE_DIRECTORY);
> 			initializeSpellChecker(SIMPLE_DIRECTORY,SPELL_DIRECTORY);
> 			
> 		}catch(Exception e){
> 			System.out.println("Initialization failed "+e.getMessage());
> 		}
> 	}
> 	
> 	
> 	/**
> 	 * This method creates the index for the list of good words at the given location.
> 	 * @param origDirLocation
> 	 * @param dictionaryType
> 	 * @throws IOException
> 	 * @throws InstantiationException
> 	 * @throws IllegalAccessException
> 	 * @throws ClassNotFoundException
> 	 * @throws SQLException
> 	 */
> 	private void createDataStructure(String origDirLocation) throws IOException, InstantiationException, 
> 	IllegalAccessException, ClassNotFoundException, SQLException{
> 		Directory directory = FSDirectory.getDirectory(origDirLocation, true);
> 		Analyzer analyzer = new StandardAnalyzer();
> 		IndexWriter iwriter = new IndexWriter(directory, analyzer, true);
> 		
> 		long time=System.currentTimeMillis();
> 		
> 		InputStream is = null;
> 		
> 		is = new FileInputStream(new File(DATA_TEXT_FILE));
> 		
> 		Document doc = new Document();
> 		
> 		//doc.add(Field.Text(DEFAULT_FIELD, (Reader) new InputStreamReader(is)));
> 		doc.add(new Field(DEFAULT_FIELD, "Mayur Vihar Center Circle Udyog Vihar Noida Gurgaon", Field.Store.YES, Field.Index.TOKENIZED));
> 		
> 		
> 		iwriter.addDocument(doc);
> 		iwriter.optimize();
> 		time=System.currentTimeMillis()-time;
> 		System.out.println("time to Create Lucene Index "+time);
> 		
> 		iwriter.close();
> 	}
> 	
> 	
> 	
> 	/**
> 	 * This method creates the spell checker dictionary from the words directory at the specified location.
> 	 * @param origDirLocation
> 	 * @param spellDirLocation
> 	 * @throws IOException
> 	 */
> 	private void initializeSpellChecker(String origDirLocation, String spellDirLocation) throws IOException{
> 		FSDirectory origDir = FSDirectory.getDirectory(origDirLocation, false);
> 		FSDirectory spellDir = FSDirectory.getDirectory(spellDirLocation, true);
> 		
> 		long time=System.currentTimeMillis();
> 		
> 		IndexReader indexReader = null;
> 	    indexReader = IndexReader.open(origDir);
> 	    
> 	    SpellChecker  spellChecker = new SpellChecker(spellDir);
> 	    spellChecker.indexDictionnary(new LuceneDictionary(indexReader, DEFAULT_FIELD));
> 	    
> 	    time=System.currentTimeMillis()-time;
> 	    
> 		System.out.println("time to build Spell Checker Dictionary "+time);
> 		
> 	}
> 	
> 	
> 	public String[] GetMatches(String badWord) throws ParseException 
> 	{
> 		SpellChecker spellChecker = null;
> 		try
> 		{
> 			spellChecker = new SpellChecker(FSDirectory.getDirectory(SPELL_DIRECTORY,false));
> 			
> 			spellChecker.setAccuraty(0);
> 			
> 			if(spellChecker.exist(badWord)){
> 				System.out.println("here");
> 			}
> 			
> 			String[] similarWords = spellChecker.suggestSimilar(badWord, 25);
> 	
> 			return similarWords;
> 		} 
> 		catch (IOException e) 
> 		{
> 			throw new ParseException(e.getMessage());
> 		}
> 	}
> 	
> 	
> 	public String GetBestMatch(String badWord) throws ParseException
> 	{	
> 		
> 		String[] correctWords = GetMatches(badWord);
> 		
> 		if(correctWords != null && correctWords.length > 0){
> 			return correctWords[0];
> 		}
> 		
> 		return "No Correct Spelling Found";
> 		
> 	}
> 	
> 	public void addWords(String word) throws IOException{
> 		
> 		
> 		long time=System.currentTimeMillis();
> 		Analyzer analyzer = new StandardAnalyzer();
> 		IndexWriter writer   = new IndexWriter(SIMPLE_DIRECTORY, analyzer, false);
> 		
> 		if(word != null && !"".equals(word)){
> 			Document doc = new Document();
> 				doc.add(new Field(DEFAULT_FIELD, word , Field.Store.YES,
> 						Field.Index.TOKENIZED));
> 			writer.addDocument(doc);
> 		}
> 		
>         writer.optimize();
>         writer.close();
>         time=System.currentTimeMillis()-time;
>         initializeSpellChecker(SIMPLE_DIRECTORY,SPELL_DIRECTORY);
> 		System.out.println("time to add words "+time);
> 	}
> 	
> 	
> 		public void addWords(String[]	word) throws IOException{
> 		
> 		long time=System.currentTimeMillis();
> 		Analyzer analyzer = new StandardAnalyzer();
> 		IndexWriter writer   = new IndexWriter(SIMPLE_DIRECTORY, analyzer, false);
> 		
> 		if(word != null){
> 			for(int i=0;i<word.length;i++){
> 				if(word[i] != null && !"".equals(word[i])){
> 				Document doc = new Document();
> 					doc.add(new Field(DEFAULT_FIELD, word[i] , Field.Store.YES,
> 							Field.Index.TOKENIZED));
> 					writer.addDocument(doc);
> 				}	
> 			}
> 		}
> 		
>         writer.optimize();
>         writer.close();
>         time=System.currentTimeMillis()-time;
>         initializeSpellChecker(SIMPLE_DIRECTORY,SPELL_DIRECTORY);
> 		System.out.println("time to add words "+time);
> 	}
> 	
> }

-- 
This message is automatically generated by JIRA.
-
You can reply to this email to add a comment to the issue online.


---------------------------------------------------------------------
To unsubscribe, e-mail: java-dev-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-dev-help@lucene.apache.org