You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@lucene.apache.org by "Gaurav Gupta (JIRA)" <ji...@apache.org> on 2007/03/08 03:29:24 UTC
[jira] Created: (LUCENE-827) Lucene Spell Index Not giving Proper
Result
Lucene Spell Index Not giving Proper Result
-------------------------------------------
Key: LUCENE-827
URL: https://issues.apache.org/jira/browse/LUCENE-827
Project: Lucene - Java
Issue Type: Bug
Environment: Windows XP, Linux
Reporter: Gaurav Gupta
I am passing the list of words 'Mayur Vihar Center Circle Udyog Vihar Noida Gurgaon' to create a spell index from a Lucene index. When I search for the correct word for 'Centrer', i.e. 'Center', it doesn't find it. I checked whether the word is present in the spell index, and it is not there.
Building the spell index directly from a plain-text dictionary gives me the correct word for 'centre', i.e. 'center'. I can't understand why it behaves like this.
The source is attached below:
createDataStructure creates the Lucene index and initializeSpellChecker initializes the spell checker.
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.sql.SQLException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.search.spell.LuceneDictionary;
import org.apache.lucene.search.spell.SpellChecker;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
/**
 * {@link SpellCheck} implementation that keeps two Lucene indexes under a base directory:
 * a word index written with {@link StandardAnalyzer} (sub-directory {@code test}) and a
 * {@link SpellChecker} index (sub-directory {@code sp}) that is rebuilt from the word index
 * through a {@link LuceneDictionary}.
 */
public class SpellCheckImpl implements SpellCheck {

    /** Name of the single field that stores the dictionary words in the word index. */
    private static final String DEFAULT_FIELD = "field";

    /** Location of the word index; stays {@code null} if the base path was not a directory. */
    private String simpleDirectory;

    /** Location of the spell-checker index; stays {@code null} if the base path was not a directory. */
    private String spellDirectory;

    /** Path of the dictionary text file handed to {@link #initialize(String)}. */
    private String dataTextFile;

    private SpellCheckImpl() {
    }

    /**
     * Configures the index directories below {@code directoryPath} and creates them when missing.
     * If {@code directoryPath} is not an existing directory nothing is configured and the
     * directory fields remain {@code null}.
     *
     * @param directoryPath existing base directory that will contain both indexes
     */
    public SpellCheckImpl(String directoryPath) {
        File base = new File(directoryPath);
        if (!base.isDirectory()) {
            return;
        }

        this.simpleDirectory = directoryPath + "/test";
        this.spellDirectory = directoryPath + "/sp";

        File simple = new File(this.simpleDirectory);
        if (!simple.isDirectory()) {
            simple.mkdir();
        }

        File spell = new File(this.spellDirectory);
        if (!spell.isDirectory()) {
            spell.mkdir();
        }
    }

    /**
     * Builds the word index and then the spell-checker index. Any failure is reported on
     * standard output and otherwise suppressed.
     *
     * @param filePath path of the dictionary text file
     */
    public void initialize(String filePath) {
        this.dataTextFile = filePath;
        try {
            createDataStructure(simpleDirectory);
            initializeSpellChecker(simpleDirectory, spellDirectory);
        } catch (Exception e) {
            System.out.println("Initialization failed " + e.getMessage());
        }
    }

    /**
     * Creates (overwriting any previous content) the index of good words at the given location.
     *
     * @param origDirLocation directory of the word index
     * @throws IOException if the dictionary file cannot be opened or the index cannot be written
     */
    private void createDataStructure(String origDirLocation) throws IOException {
        Directory directory = FSDirectory.getDirectory(origDirLocation, true);
        Analyzer analyzer = new StandardAnalyzer();
        IndexWriter iwriter = new IndexWriter(directory, analyzer, true);
        try {
            long time = System.currentTimeMillis();

            // The dictionary file is still opened so that a missing or unreadable file fails
            // the build as before, but its content is not indexed (the fixed word list below
            // is used instead). Close it immediately so the file handle is not leaked.
            InputStream is = new FileInputStream(new File(dataTextFile));
            is.close();

            Document doc = new Document();
            doc.add(new Field(DEFAULT_FIELD, "Mayur Vihar Center Circle Udyog Vihar Noida Gurgaon",
                    Field.Store.YES, Field.Index.TOKENIZED));

            iwriter.addDocument(doc);
            iwriter.optimize();
            time = System.currentTimeMillis() - time;
            System.out.println("time to Create Lucene Index " + time);
        } finally {
            // Always release the writer (and its write lock), even when indexing fails.
            iwriter.close();
        }
    }

    /**
     * Recreates the spell-checker index from the terms of the word index.
     *
     * @param origDirLocation directory of the word index
     * @param spellDirLocation directory of the spell-checker index (recreated on every call)
     * @throws IOException if either index cannot be read or written
     */
    private void initializeSpellChecker(String origDirLocation, String spellDirLocation)
            throws IOException {
        FSDirectory origDir = FSDirectory.getDirectory(origDirLocation, false);
        FSDirectory spellDir = FSDirectory.getDirectory(spellDirLocation, true);

        long time = System.currentTimeMillis();

        IndexReader indexReader = IndexReader.open(origDir);
        try {
            SpellChecker spellChecker = new SpellChecker(spellDir);
            // NOTE(review): early Lucene 2.x LuceneDictionary iterators are reported to skip
            // the first term of the field - verify against the Lucene version in use if the
            // alphabetically first word is missing from the spell index.
            spellChecker.indexDictionnary(new LuceneDictionary(indexReader, DEFAULT_FIELD));
        } finally {
            // The reader is only needed while the dictionary is being enumerated.
            indexReader.close();
        }

        time = System.currentTimeMillis() - time;
        System.out.println("time to build Spell Checker Dictionary " + time);
    }

    /**
     * Returns up to 25 suggestions for the given word, using a minimum accuracy of 0.
     *
     * @param badWord word to look up
     * @return the suggestions returned by the spell checker
     * @throws ParseException if the spell index cannot be read; the underlying
     *         {@link IOException} is attached as the cause
     */
    public String[] GetMatches(String badWord) throws ParseException {
        try {
            SpellChecker spellChecker =
                    new SpellChecker(FSDirectory.getDirectory(spellDirectory, false));
            spellChecker.setAccuraty(0);

            if (spellChecker.exist(badWord)) {
                System.out.println("here");
            }

            return spellChecker.suggestSimilar(badWord, 25);
        } catch (IOException e) {
            // ParseException has no cause-taking constructor, so attach the cause explicitly.
            ParseException wrapped = new ParseException(e.getMessage());
            wrapped.initCause(e);
            throw wrapped;
        }
    }

    /**
     * Returns the first suggestion for the given word.
     *
     * @param badWord word to look up
     * @return the first suggestion, or {@code "No Correct Spelling Found"} when there is none
     * @throws ParseException if the spell index cannot be read
     */
    public String GetBestMatch(String badWord) throws ParseException {
        String[] correctWords = GetMatches(badWord);
        if (correctWords != null && correctWords.length > 0) {
            return correctWords[0];
        }
        return "No Correct Spelling Found";
    }

    /**
     * Adds a single entry to the word index and rebuilds the spell-checker index.
     * A {@code null} or empty entry is ignored, but the indexes are still optimized and rebuilt.
     *
     * @param word text to add
     * @throws IOException if an index cannot be read or written
     */
    public void addWords(String word) throws IOException {
        addWords(new String[] {word});
    }

    /**
     * Adds each non-empty entry as its own document to the word index and rebuilds the
     * spell-checker index. A {@code null} array, and {@code null} or empty entries, are ignored.
     *
     * @param word entries to add
     * @throws IOException if an index cannot be read or written
     */
    public void addWords(String[] word) throws IOException {
        long time = System.currentTimeMillis();
        Analyzer analyzer = new StandardAnalyzer();
        IndexWriter writer = new IndexWriter(simpleDirectory, analyzer, false);
        try {
            if (word != null) {
                for (int i = 0; i < word.length; i++) {
                    if (word[i] != null && !"".equals(word[i])) {
                        Document doc = new Document();
                        doc.add(new Field(DEFAULT_FIELD, word[i], Field.Store.YES,
                                Field.Index.TOKENIZED));
                        writer.addDocument(doc);
                    }
                }
            }
            writer.optimize();
        } finally {
            // Always release the writer (and its write lock), even when indexing fails.
            writer.close();
        }
        time = System.currentTimeMillis() - time;
        initializeSpellChecker(simpleDirectory, spellDirectory);
        System.out.println("time to add words " + time);
    }
}
--
This message is automatically generated by JIRA.
-
You can reply to this email to add a comment to the issue online.
---------------------------------------------------------------------
To unsubscribe, e-mail: java-dev-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-dev-help@lucene.apache.org
[jira] Resolved: (LUCENE-827) Lucene Spell Index Not giving Proper
Result
Posted by "Hoss Man (JIRA)" <ji...@apache.org>.
[ https://issues.apache.org/jira/browse/LUCENE-827?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Hoss Man resolved LUCENE-827.
-----------------------------
Resolution: Invalid
If you have questions about using any Lucene Java code, please start by emailing the java-user mailing list ... bugs should only be filed once you have confirmed that a bug truly does exist (after discussion with other users who confirm that your assumptions/methodology are correct).
Based on a quick reading of your problem description, I would suspect that if SpellChecker works for you when you build a "Dictionary" from plain text, but not when you use a LuceneDictionary, it may be because of the Analyzer you are using when building your initial index ... but please follow up in email on the user list.
> Lucene Spell Index Not giving Proper Result
> -------------------------------------------
>
> Key: LUCENE-827
> URL: https://issues.apache.org/jira/browse/LUCENE-827
> Project: Lucene - Java
> Issue Type: Bug
> Environment: Windows XP, Linux
> Reporter: Gaurav Gupta
>
> I am passing List of words 'Mayur Vihar Center Circle Udyog Vihar Noida Gurgaon' to create spell index from Lucene Index. when i searches for correct word for 'Centrer' i.e 'Center', it does'nt find it. I checked it whether its there in spell Index, i didnt find it there.
> By making the spell Index directly from Plain text Dictionary gives me the correct word for 'centre' i.e 'center'. I cant understand why it is behaving like this.
> Also attaching the source -:
> CreateDataStructure creates the Lucene Index and initializeSpellChecker initializes the spell Checker.
> import java.io.File;
> import java.io.FileInputStream;
> import java.io.IOException;
> import java.io.InputStream;
> import java.io.InputStreamReader;
> import java.io.Reader;
> import java.sql.SQLException;
> import org.apache.lucene.analysis.Analyzer;
> import org.apache.lucene.analysis.standard.StandardAnalyzer;
> import org.apache.lucene.document.Document;
> import org.apache.lucene.document.Field;
> import org.apache.lucene.index.IndexReader;
> import org.apache.lucene.index.IndexWriter;
> import org.apache.lucene.queryParser.ParseException;
> import org.apache.lucene.search.spell.LuceneDictionary;
> import org.apache.lucene.search.spell.SpellChecker;
> import org.apache.lucene.store.Directory;
> import org.apache.lucene.store.FSDirectory;
> public class SpellCheckImpl implements SpellCheck{
>
> private String SIMPLE_DIRECTORY;
> private String SPELL_DIRECTORY;
> private String DATA_TEXT_FILE;
> private String DEFAULT_FIELD="field";
>
>
>
> private SpellCheckImpl(){
>
> }
>
> //Configure the directories
>
> public SpellCheckImpl(String directoryPath){
>
> File f = new File(directoryPath);
>
> if(f.isDirectory()){
>
> this.SIMPLE_DIRECTORY = directoryPath+"/test";
> this.SPELL_DIRECTORY = directoryPath+"/sp";
>
> File simple = new File(this.SIMPLE_DIRECTORY);
> File spell = new File(this.SPELL_DIRECTORY);
>
> if(!simple.isDirectory()){
> simple.mkdir();
> }
>
> if(!spell.isDirectory()){
> spell.mkdir();
> }
>
> }
>
> }
>
>
> /**
> * Initialize the Dictionary with given Keywords
> */
> public void initialize(String filePath){
>
> this.DATA_TEXT_FILE = filePath;
>
> try{
>
> createDataStructure(SIMPLE_DIRECTORY);
> initializeSpellChecker(SIMPLE_DIRECTORY,SPELL_DIRECTORY);
>
> }catch(Exception e){
> System.out.println("Initialization failed "+e.getMessage());
> }
> }
>
>
> /**
> * This method creates the index for the list of good words at the given location.
> * @param origDirLocation
> * @param dictionaryType
> * @throws IOException
> * @throws InstantiationException
> * @throws IllegalAccessException
> * @throws ClassNotFoundException
> * @throws SQLException
> */
> private void createDataStructure(String origDirLocation) throws IOException, InstantiationException,
> IllegalAccessException, ClassNotFoundException, SQLException{
> Directory directory = FSDirectory.getDirectory(origDirLocation, true);
> Analyzer analyzer = new StandardAnalyzer();
> IndexWriter iwriter = new IndexWriter(directory, analyzer, true);
>
> long time=System.currentTimeMillis();
>
> InputStream is = null;
>
> is = new FileInputStream(new File(DATA_TEXT_FILE));
>
> Document doc = new Document();
>
> //doc.add(Field.Text(DEFAULT_FIELD, (Reader) new InputStreamReader(is)));
> doc.add(new Field(DEFAULT_FIELD, "Mayur Vihar Center Circle Udyog Vihar Noida Gurgaon", Field.Store.YES, Field.Index.TOKENIZED));
>
>
> iwriter.addDocument(doc);
> iwriter.optimize();
> time=System.currentTimeMillis()-time;
> System.out.println("time to Create Lucene Index "+time);
>
> iwriter.close();
> }
>
>
>
> /**
> * This method creates the spell checker dictionary from the words directory at the specified location.
> * @param origDirLocation
> * @param spellDirLocation
> * @throws IOException
> */
> private void initializeSpellChecker(String origDirLocation, String spellDirLocation) throws IOException{
> FSDirectory origDir = FSDirectory.getDirectory(origDirLocation, false);
> FSDirectory spellDir = FSDirectory.getDirectory(spellDirLocation, true);
>
> long time=System.currentTimeMillis();
>
> IndexReader indexReader = null;
> indexReader = IndexReader.open(origDir);
>
> SpellChecker spellChecker = new SpellChecker(spellDir);
> spellChecker.indexDictionnary(new LuceneDictionary(indexReader, DEFAULT_FIELD));
>
> time=System.currentTimeMillis()-time;
>
> System.out.println("time to build Spell Checker Dictionary "+time);
>
> }
>
>
> public String[] GetMatches(String badWord) throws ParseException
> {
> SpellChecker spellChecker = null;
> try
> {
> spellChecker = new SpellChecker(FSDirectory.getDirectory(SPELL_DIRECTORY,false));
>
> spellChecker.setAccuraty(0);
>
> if(spellChecker.exist(badWord)){
> System.out.println("here");
> }
>
> String[] similarWords = spellChecker.suggestSimilar(badWord, 25);
>
> return similarWords;
> }
> catch (IOException e)
> {
> throw new ParseException(e.getMessage());
> }
> }
>
>
> public String GetBestMatch(String badWord) throws ParseException
> {
>
> String[] correctWords = GetMatches(badWord);
>
> if(correctWords != null && correctWords.length > 0){
> return correctWords[0];
> }
>
> return "No Correct Spelling Found";
>
> }
>
> public void addWords(String word) throws IOException{
>
>
> long time=System.currentTimeMillis();
> Analyzer analyzer = new StandardAnalyzer();
> IndexWriter writer = new IndexWriter(SIMPLE_DIRECTORY, analyzer, false);
>
> if(word != null && !"".equals(word)){
> Document doc = new Document();
> doc.add(new Field(DEFAULT_FIELD, word , Field.Store.YES,
> Field.Index.TOKENIZED));
> writer.addDocument(doc);
> }
>
> writer.optimize();
> writer.close();
> time=System.currentTimeMillis()-time;
> initializeSpellChecker(SIMPLE_DIRECTORY,SPELL_DIRECTORY);
> System.out.println("time to add words "+time);
> }
>
>
> public void addWords(String[] word) throws IOException{
>
> long time=System.currentTimeMillis();
> Analyzer analyzer = new StandardAnalyzer();
> IndexWriter writer = new IndexWriter(SIMPLE_DIRECTORY, analyzer, false);
>
> if(word != null){
> for(int i=0;i<word.length;i++){
> if(word[i] != null && !"".equals(word[i])){
> Document doc = new Document();
> doc.add(new Field(DEFAULT_FIELD, word[i] , Field.Store.YES,
> Field.Index.TOKENIZED));
> writer.addDocument(doc);
> }
> }
> }
>
> writer.optimize();
> writer.close();
> time=System.currentTimeMillis()-time;
> initializeSpellChecker(SIMPLE_DIRECTORY,SPELL_DIRECTORY);
> System.out.println("time to add words "+time);
> }
>
> }
--
This message is automatically generated by JIRA.
-
You can reply to this email to add a comment to the issue online.
---------------------------------------------------------------------
To unsubscribe, e-mail: java-dev-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-dev-help@lucene.apache.org