You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@lucene.apache.org by eh...@apache.org on 2004/03/11 04:05:36 UTC
cvs commit: jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/nl DutchAnalyzer.java DutchStemFilter.java DutchStemmer.java WordlistLoader.java
ehatcher 2004/03/10 19:05:36
Modified: contributions/analyzers/src/java/org/apache/lucene/analysis
LengthFilter.java
contributions/analyzers/src/java/org/apache/lucene/analysis/br
BrazilianAnalyzer.java BrazilianStemFilter.java
contributions/analyzers/src/java/org/apache/lucene/analysis/cjk
CJKAnalyzer.java
contributions/analyzers/src/java/org/apache/lucene/analysis/cz
CzechAnalyzer.java
contributions/analyzers/src/java/org/apache/lucene/analysis/fr
FrenchAnalyzer.java FrenchStemFilter.java
contributions/analyzers/src/java/org/apache/lucene/analysis/nl
DutchAnalyzer.java DutchStemFilter.java
DutchStemmer.java WordlistLoader.java
Log:
bringing sandbox analyzers up to date with changes to the core StopFilter and migrating away from using Hashtable
Revision Changes Path
1.2 +2 -2 jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/LengthFilter.java
Index: LengthFilter.java
===================================================================
RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/LengthFilter.java,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -r1.1 -r1.2
--- LengthFilter.java 2 Mar 2004 12:52:16 -0000 1.1
+++ LengthFilter.java 11 Mar 2004 03:05:36 -0000 1.2
@@ -35,7 +35,7 @@
*/
public LengthFilter(TokenStream in, int min, int max)
{
- input = in;
+ super(in);
this.min = min;
this.max =max;
}
1.4 +10 -9 jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java
Index: BrazilianAnalyzer.java
===================================================================
RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -r1.3 -r1.4
--- BrazilianAnalyzer.java 22 Jan 2004 20:54:46 -0000 1.3
+++ BrazilianAnalyzer.java 11 Mar 2004 03:05:36 -0000 1.4
@@ -64,6 +64,7 @@
import java.io.File;
import java.io.Reader;
import java.util.Hashtable;
+import java.util.HashSet;
/**
* Analyzer for brazilian language. Supports an external list of stopwords (words that
@@ -102,57 +103,57 @@
/**
* Contains the stopwords used with the StopFilter.
*/
- private Hashtable stoptable = new Hashtable();
+ private HashSet stoptable = new HashSet();
/**
* Contains words that should be indexed but not stemmed.
*/
- private Hashtable excltable = new Hashtable();
+ private HashSet excltable = new HashSet();
/**
* Builds an analyzer.
*/
public BrazilianAnalyzer() {
- stoptable = StopFilter.makeStopTable( BRAZILIAN_STOP_WORDS );
+ stoptable = StopFilter.makeStopSet( BRAZILIAN_STOP_WORDS );
}
/**
* Builds an analyzer with the given stop words.
*/
public BrazilianAnalyzer( String[] stopwords ) {
- stoptable = StopFilter.makeStopTable( stopwords );
+ stoptable = StopFilter.makeStopSet( stopwords );
}
/**
* Builds an analyzer with the given stop words.
*/
public BrazilianAnalyzer( Hashtable stopwords ) {
- stoptable = stopwords;
+ stoptable = new HashSet(stopwords.keySet());
}
/**
* Builds an analyzer with the given stop words.
*/
public BrazilianAnalyzer( File stopwords ) {
- stoptable = WordlistLoader.getWordtable( stopwords );
+ stoptable = new HashSet(WordlistLoader.getWordtable( stopwords ).keySet());
}
/**
* Builds an exclusionlist from an array of Strings.
*/
public void setStemExclusionTable( String[] exclusionlist ) {
- excltable = StopFilter.makeStopTable( exclusionlist );
+ excltable = StopFilter.makeStopSet( exclusionlist );
}
/**
* Builds an exclusionlist from a Hashtable.
*/
public void setStemExclusionTable( Hashtable exclusionlist ) {
- excltable = exclusionlist;
+ excltable = new HashSet(exclusionlist.keySet());
}
/**
* Builds an exclusionlist from the words contained in the given file.
*/
public void setStemExclusionTable( File exclusionlist ) {
- excltable = WordlistLoader.getWordtable( exclusionlist );
+ excltable = new HashSet(WordlistLoader.getWordtable( exclusionlist ).keySet());
}
/**
1.5 +9 -1 jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/br/BrazilianStemFilter.java
Index: BrazilianStemFilter.java
===================================================================
RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/br/BrazilianStemFilter.java,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -r1.4 -r1.5
--- BrazilianStemFilter.java 22 Jan 2004 20:54:46 -0000 1.4
+++ BrazilianStemFilter.java 11 Mar 2004 03:05:36 -0000 1.5
@@ -59,6 +59,7 @@
import org.apache.lucene.analysis.TokenStream;
import java.io.IOException;
import java.util.Hashtable;
+import java.util.HashSet;
/**
* Based on (copied) the GermanStemFilter
@@ -79,7 +80,7 @@
*/
private Token token = null;
private BrazilianStemmer stemmer = null;
- private Hashtable exclusions = null;
+ private HashSet exclusions = null;
public BrazilianStemFilter( TokenStream in ) {
super(in);
@@ -88,8 +89,15 @@
/**
* Builds a BrazilianStemFilter that uses an exclusiontable.
+ *
+ * @deprecated
*/
public BrazilianStemFilter( TokenStream in, Hashtable exclusiontable ) {
+ this( in );
+ this.exclusions = new HashSet(exclusiontable.keySet());
+ }
+
+ public BrazilianStemFilter( TokenStream in, HashSet exclusiontable ) {
this( in );
this.exclusions = exclusiontable;
}
1.3 +5 -4 jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java
Index: CJKAnalyzer.java
===================================================================
RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -r1.2 -r1.3
--- CJKAnalyzer.java 22 Jan 2004 20:54:47 -0000 1.2
+++ CJKAnalyzer.java 11 Mar 2004 03:05:36 -0000 1.3
@@ -63,6 +63,7 @@
import java.io.Reader;
import java.util.Hashtable;
+import java.util.HashSet;
/**
@@ -91,7 +92,7 @@
//~ Instance fields --------------------------------------------------------
/** stop word list */
- private Hashtable stopTable;
+ private HashSet stopTable;
//~ Constructors -----------------------------------------------------------
@@ -99,7 +100,7 @@
* Builds an analyzer which removes words in STOP_WORDS.
*/
public CJKAnalyzer() {
- stopTable = StopFilter.makeStopTable(stopWords);
+ stopTable = StopFilter.makeStopSet(stopWords);
}
/**
@@ -108,7 +109,7 @@
* @param stopWords stop word array
*/
public CJKAnalyzer(String[] stopWords) {
- stopTable = StopFilter.makeStopTable(stopWords);
+ stopTable = StopFilter.makeStopSet(stopWords);
}
//~ Methods ----------------------------------------------------------------
1.3 +14 -7 jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java
Index: CzechAnalyzer.java
===================================================================
RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -r1.2 -r1.3
--- CzechAnalyzer.java 22 Jan 2004 20:54:47 -0000 1.2
+++ CzechAnalyzer.java 11 Mar 2004 03:05:36 -0000 1.3
@@ -64,6 +64,7 @@
import java.io.*;
import java.util.Hashtable;
+import java.util.HashSet;
/**
* Analyzer for Czech language. Supports an external list of stopwords (words that
@@ -102,26 +103,32 @@
/**
* Contains the stopwords used with the StopFilter.
*/
- private Hashtable stoptable = new Hashtable();
+ private HashSet stoptable;
/**
* Builds an analyzer.
*/
public CzechAnalyzer() {
- stoptable = StopFilter.makeStopTable( STOP_WORDS );
+ stoptable = StopFilter.makeStopSet( STOP_WORDS );
}
/**
* Builds an analyzer with the given stop words.
*/
public CzechAnalyzer( String[] stopwords ) {
- stoptable = StopFilter.makeStopTable( stopwords );
+ stoptable = StopFilter.makeStopSet( stopwords );
}
/**
* Builds an analyzer with the given stop words.
+ *
+ * @deprecated
*/
public CzechAnalyzer( Hashtable stopwords ) {
+ stoptable = new HashSet(stopwords.keySet());
+ }
+
+ public CzechAnalyzer( HashSet stopwords ) {
stoptable = stopwords;
}
@@ -129,7 +136,7 @@
* Builds an analyzer with the given stop words.
*/
public CzechAnalyzer( File stopwords ) {
- stoptable = WordlistLoader.getWordtable( stopwords );
+ stoptable = new HashSet(WordlistLoader.getWordtable( stopwords ).keySet());
}
/**
@@ -139,12 +146,12 @@
*/
public void loadStopWords( InputStream wordfile, String encoding ) {
if ( wordfile == null ) {
- stoptable = new Hashtable();
+ stoptable = new HashSet();
return;
}
try {
// clear any previous table (if present)
- stoptable = new Hashtable();
+ stoptable = new HashSet();
InputStreamReader isr;
if (encoding == null)
@@ -156,7 +163,7 @@
LineNumberReader lnr = new LineNumberReader(isr);
String word;
while ( ( word = lnr.readLine() ) != null ) {
- stoptable.put(word, word);
+ stoptable.add(word);
}
} catch ( IOException e ) {
1.4 +14 -10 jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java
Index: FrenchAnalyzer.java
===================================================================
RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -r1.3 -r1.4
--- FrenchAnalyzer.java 23 Jan 2004 12:49:34 -0000 1.3
+++ FrenchAnalyzer.java 11 Mar 2004 03:05:36 -0000 1.4
@@ -63,6 +63,8 @@
import java.io.File;
import java.io.Reader;
import java.util.Hashtable;
+import java.util.HashSet;
+
import org.apache.lucene.analysis.de.WordlistLoader;
/**
@@ -108,57 +110,59 @@
/**
* Contains the stopwords used with the StopFilter.
*/
- private Hashtable stoptable = new Hashtable();
+ private HashSet stoptable = new HashSet();
/**
* Contains words that should be indexed but not stemmed.
*/
- private Hashtable excltable = new Hashtable();
+ private HashSet excltable = new HashSet();
/**
* Builds an analyzer.
*/
public FrenchAnalyzer() {
- stoptable = StopFilter.makeStopTable( FRENCH_STOP_WORDS );
+ stoptable = StopFilter.makeStopSet( FRENCH_STOP_WORDS );
}
/**
* Builds an analyzer with the given stop words.
*/
public FrenchAnalyzer( String[] stopwords ) {
- stoptable = StopFilter.makeStopTable( stopwords );
+ stoptable = StopFilter.makeStopSet( stopwords );
}
/**
* Builds an analyzer with the given stop words.
+ *
+ * @deprecated
*/
public FrenchAnalyzer( Hashtable stopwords ) {
- stoptable = stopwords;
+ stoptable = new HashSet(stopwords.keySet());
}
/**
* Builds an analyzer with the given stop words.
*/
public FrenchAnalyzer( File stopwords ) {
- stoptable = WordlistLoader.getWordtable( stopwords );
+ stoptable = new HashSet(WordlistLoader.getWordtable( stopwords ).keySet());
}
/**
* Builds an exclusionlist from an array of Strings.
*/
public void setStemExclusionTable( String[] exclusionlist ) {
- excltable = StopFilter.makeStopTable( exclusionlist );
+ excltable = StopFilter.makeStopSet( exclusionlist );
}
/**
* Builds an exclusionlist from a Hashtable.
*/
public void setStemExclusionTable( Hashtable exclusionlist ) {
- excltable = exclusionlist;
+ excltable = new HashSet(exclusionlist.keySet());
}
/**
* Builds an exclusionlist from the words contained in the given file.
*/
public void setStemExclusionTable( File exclusionlist ) {
- excltable = WordlistLoader.getWordtable( exclusionlist );
+ excltable = new HashSet(WordlistLoader.getWordtable( exclusionlist ).keySet());
}
/**
1.3 +10 -2 jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/fr/FrenchStemFilter.java
Index: FrenchStemFilter.java
===================================================================
RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/fr/FrenchStemFilter.java,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -r1.2 -r1.3
--- FrenchStemFilter.java 22 Jan 2004 20:54:47 -0000 1.2
+++ FrenchStemFilter.java 11 Mar 2004 03:05:36 -0000 1.3
@@ -59,6 +59,7 @@
import org.apache.lucene.analysis.TokenStream;
import java.io.IOException;
import java.util.Hashtable;
+import java.util.HashSet;
/**
* A filter that stemms french words. It supports a table of words that should
@@ -74,7 +75,7 @@
*/
private Token token = null;
private FrenchStemmer stemmer = null;
- private Hashtable exclusions = null;
+ private HashSet exclusions = null;
public FrenchStemFilter( TokenStream in ) {
super(in);
@@ -83,9 +84,16 @@
/**
* Builds a FrenchStemFilter that uses an exclusiontable.
+ *
+ * @deprecated
*/
public FrenchStemFilter( TokenStream in, Hashtable exclusiontable ) {
this( in );
+ exclusions = new HashSet(exclusiontable.keySet());
+ }
+
+ public FrenchStemFilter( TokenStream in, HashSet exclusiontable ) {
+ this( in );
exclusions = exclusiontable;
}
@@ -122,7 +130,7 @@
* Set an alternative exclusion list for this filter.
*/
public void setExclusionTable( Hashtable exclusiontable ) {
- exclusions = exclusiontable;
+ exclusions = new HashSet(exclusiontable.keySet());
}
}
1.2 +127 -138 jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java
Index: DutchAnalyzer.java
===================================================================
RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -r1.1 -r1.2
--- DutchAnalyzer.java 9 Mar 2004 14:55:08 -0000 1.1
+++ DutchAnalyzer.java 11 Mar 2004 03:05:36 -0000 1.2
@@ -21,148 +21,137 @@
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
-import org.apache.lucene.analysis.Token;
+
import java.io.File;
-import java.io.*;
import java.io.Reader;
-import java.util.Hashtable;
+import java.util.HashMap;
+import java.util.HashSet;
/**
- *
* @author Edwin de Jonge
- *
- * Analyzer for Dutch language. Supports an external list of stopwords (words that
- * will not be indexed at all), an external list of exclusions (word that will
- * not be stemmed, but indexed) and an external list of word-stem pairs that overrule
- * the algorithm (dictionary stemming).
- * A default set of stopwords is used unless an alternative list is specified, the
- * exclusion list is empty by default.
- * As start for the Analyzer the German Analyzer was used. The stemming algorithm
- * implemented can be found at @link
+ * <p/>
+ * Analyzer for Dutch language. Supports an external list of stopwords (words that
+ * will not be indexed at all), an external list of exclusions (word that will
+ * not be stemmed, but indexed) and an external list of word-stem pairs that overrule
+ * the algorithm (dictionary stemming).
+ * A default set of stopwords is used unless an alternative list is specified, the
+ * exclusion list is empty by default.
+ * As start for the Analyzer the German Analyzer was used. The stemming algorithm
+ * implemented can be found at @link
*/
-public class DutchAnalyzer extends Analyzer
-{
- /**
- * List of typical Dutch stopwords.
- */
- private String[] DUTCH_STOP_WORDS =
- {
- "de","en","van","ik","te","dat","die","in","een",
- "hij","het","niet","zijn","is","was","op","aan","met","als","voor","had",
- "er","maar","om","hem","dan","zou","of","wat","mijn","men","dit","zo",
- "door","over","ze","zich","bij","ook","tot","je","mij","uit","der","daar",
- "haar","naar","heb","hoe","heeft","hebben","deze","u","want","nog","zal",
- "me","zij","nu","ge","geen","omdat","iets","worden","toch","al","waren",
- "veel","meer","doen","toen","moet","ben","zonder","kan","hun","dus",
- "alles","onder","ja","eens","hier","wie","werd","altijd","doch","wordt",
- "wezen","kunnen","ons","zelf","tegen","na","reeds","wil","kon","niets",
- "uw","iemand","geweest","andere"
- };
-
-
- /**
- * Contains the stopwords used with the StopFilter.
- */
- private Hashtable stoptable = new Hashtable();
-
- /**
- * Contains words that should be indexed but not stemmed.
- */
- private Hashtable excltable = new Hashtable();
-
- private Hashtable _stemdict = new Hashtable();
-
-
- /**
- * Builds an analyzer.
- */
- public DutchAnalyzer()
- {
- stoptable = StopFilter.makeStopTable( DUTCH_STOP_WORDS );
- _stemdict.put("fiets","fiets"); //otherwise fiet
- _stemdict.put("bromfiets","bromfiets"); //otherwise bromfiet
- _stemdict.put("ei","eier");
- _stemdict.put("kind","kinder");
- }
-
- /**
- * Builds an analyzer with the given stop words.
- *
- * @param stopwords
- */
- public DutchAnalyzer( String[] stopwords )
- {
- stoptable = StopFilter.makeStopTable( stopwords );
- }
-
- /**
- * Builds an analyzer with the given stop words.
- *
- * @param stopwords
- */
- public DutchAnalyzer( Hashtable stopwords )
- {
- stoptable = stopwords;
- }
-
- /**
- * Builds an analyzer with the given stop words.
- *
- * @param stopwords
- */
- public DutchAnalyzer( File stopwords )
- {
- stoptable = WordlistLoader.getWordtable( stopwords );
- }
-
- /**
- * Builds an exclusionlist from an array of Strings.
- *
- * @param exclusionlist
- */
- public void setStemExclusionTable( String[] exclusionlist )
- {
- excltable = StopFilter.makeStopTable( exclusionlist );
- }
-
- /**
- * Builds an exclusionlist from a Hashtable.
- */
- public void setStemExclusionTable( Hashtable exclusionlist )
- {
- excltable = exclusionlist;
- }
-
- /**
- * Builds an exclusionlist from the words contained in the given file.
- */
- public void setStemExclusionTable(File exclusionlist)
- {
- excltable = WordlistLoader.getWordtable(exclusionlist);
- }
-
- /**
- * Reads a stemdictionary file , that overrules the stemming algorithm
- * This is a textfile that contains per line
- * word\tstem
- * i.e: tabseperated
- */
- public void setStemDictionary(File stemdict)
- {
- _stemdict = WordlistLoader.getStemDict(stemdict);
- }
-
- /**
- * Creates a TokenStream which tokenizes all the text in the provided TextReader.
- *
- * @return A TokenStream build from a StandardTokenizer filtered with StandardFilter, StopFilter, GermanStemFilter
- */
- public TokenStream tokenStream(String fieldName, Reader reader)
- {
- TokenStream result = new StandardTokenizer( reader );
- result = new StandardFilter( result );
- result = new StopFilter( result, stoptable );
- result = new DutchStemFilter( result, excltable, _stemdict);
- return result;
- }
+public class DutchAnalyzer extends Analyzer {
+ /**
+ * List of typical Dutch stopwords.
+ */
+ private String[] DUTCH_STOP_WORDS =
+ {
+ "de", "en", "van", "ik", "te", "dat", "die", "in", "een",
+ "hij", "het", "niet", "zijn", "is", "was", "op", "aan", "met", "als", "voor", "had",
+ "er", "maar", "om", "hem", "dan", "zou", "of", "wat", "mijn", "men", "dit", "zo",
+ "door", "over", "ze", "zich", "bij", "ook", "tot", "je", "mij", "uit", "der", "daar",
+ "haar", "naar", "heb", "hoe", "heeft", "hebben", "deze", "u", "want", "nog", "zal",
+ "me", "zij", "nu", "ge", "geen", "omdat", "iets", "worden", "toch", "al", "waren",
+ "veel", "meer", "doen", "toen", "moet", "ben", "zonder", "kan", "hun", "dus",
+ "alles", "onder", "ja", "eens", "hier", "wie", "werd", "altijd", "doch", "wordt",
+ "wezen", "kunnen", "ons", "zelf", "tegen", "na", "reeds", "wil", "kon", "niets",
+ "uw", "iemand", "geweest", "andere"
+ };
+
+
+ /**
+ * Contains the stopwords used with the StopFilter.
+ */
+ private HashSet stoptable = new HashSet();
+
+ /**
+ * Contains words that should be indexed but not stemmed.
+ */
+ private HashSet excltable = new HashSet();
+
+ private HashMap _stemdict = new HashMap();
+
+
+ /**
+ * Builds an analyzer.
+ */
+ public DutchAnalyzer() {
+ stoptable = StopFilter.makeStopSet(DUTCH_STOP_WORDS);
+ _stemdict.put("fiets", "fiets"); //otherwise fiet
+ _stemdict.put("bromfiets", "bromfiets"); //otherwise bromfiet
+ _stemdict.put("ei", "eier");
+ _stemdict.put("kind", "kinder");
+ }
+
+ /**
+ * Builds an analyzer with the given stop words.
+ *
+ * @param stopwords
+ */
+ public DutchAnalyzer(String[] stopwords) {
+ stoptable = StopFilter.makeStopSet(stopwords);
+ }
+
+ /**
+ * Builds an analyzer with the given stop words.
+ *
+ * @param stopwords
+ */
+ public DutchAnalyzer(HashSet stopwords) {
+ stoptable = stopwords;
+ }
+
+ /**
+ * Builds an analyzer with the given stop words.
+ *
+ * @param stopwords
+ */
+ public DutchAnalyzer(File stopwords) {
+ stoptable = new HashSet(WordlistLoader.getWordtable(stopwords).keySet());
+ }
+
+ /**
+ * Builds an exclusionlist from an array of Strings.
+ *
+ * @param exclusionlist
+ */
+ public void setStemExclusionTable(String[] exclusionlist) {
+ excltable = StopFilter.makeStopSet(exclusionlist);
+ }
+
+ /**
+ * Builds an exclusionlist from a Hashtable.
+ */
+ public void setStemExclusionTable(HashSet exclusionlist) {
+ excltable = exclusionlist;
+ }
+
+ /**
+ * Builds an exclusionlist from the words contained in the given file.
+ */
+ public void setStemExclusionTable(File exclusionlist) {
+ excltable = new HashSet(WordlistLoader.getWordtable(exclusionlist).keySet());
+ }
+
+ /**
+ * Reads a stemdictionary file , that overrules the stemming algorithm
+ * This is a textfile that contains per line
+ * word\tstem
+ * i.e: tabseperated
+ */
+ public void setStemDictionary(File stemdict) {
+ _stemdict = WordlistLoader.getStemDict(stemdict);
+ }
+
+ /**
+ * Creates a TokenStream which tokenizes all the text in the provided TextReader.
+ *
+ * @return A TokenStream build from a StandardTokenizer filtered with StandardFilter, StopFilter, GermanStemFilter
+ */
+ public TokenStream tokenStream(String fieldName, Reader reader) {
+ TokenStream result = new StandardTokenizer(reader);
+ result = new StandardFilter(result);
+ result = new StopFilter(result, stoptable);
+ result = new DutchStemFilter(result, excltable, _stemdict);
+ return result;
+ }
}
1.2 +82 -96 jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/nl/DutchStemFilter.java
Index: DutchStemFilter.java
===================================================================
RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/nl/DutchStemFilter.java,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -r1.1 -r1.2
--- DutchStemFilter.java 9 Mar 2004 14:55:08 -0000 1.1
+++ DutchStemFilter.java 11 Mar 2004 03:05:36 -0000 1.2
@@ -19,105 +19,91 @@
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
+
import java.io.IOException;
-import java.util.Hashtable;
+import java.util.HashMap;
+import java.util.HashSet;
/**
- *
* @author Edwin de Jonge
- *
- * A filter that stems Dutch words. It supports a table of words that should
- * not be stemmed at all. The stemmer used can be changed at runtime after the
- * filter object is created (as long as it is a DutchStemmer).
+ * <p/>
+ * A filter that stems Dutch words. It supports a table of words that should
+ * not be stemmed at all. The stemmer used can be changed at runtime after the
+ * filter object is created (as long as it is a DutchStemmer).
*/
-public final class DutchStemFilter extends TokenFilter
-{
- /**
- * The actual token in the input stream.
- */
- private Token token = null;
- private DutchStemmer stemmer = null;
- private Hashtable exclusions = null;
-
- public DutchStemFilter( TokenStream _in )
- {
- super(_in);
- stemmer = new DutchStemmer();
- }
-
- /**
- * Builds a DutchStemFilter that uses an exclusiontable.
- */
- public DutchStemFilter( TokenStream _in, Hashtable exclusiontable )
- {
- this(_in);
- exclusions = exclusiontable;
- }
-
- /**
- * @param stemdictionary Dictionary of word stem pairs, that overrule the algorithm
- */
- public DutchStemFilter( TokenStream _in, Hashtable exclusiontable , Hashtable stemdictionary)
- {
- this(_in, exclusiontable);
- stemmer.setStemDictionary(stemdictionary);
- }
-
- /**
- * @return Returns the next token in the stream, or null at EOS
- */
- public Token next() throws IOException
-
- {
- if ( ( token = input.next() ) == null )
- {
- return null;
- }
-
- // Check the exclusiontable
- else if ( exclusions != null && exclusions.contains( token.termText() ) )
- {
- return token;
- }
- else
- {
- String s = stemmer.stem( token.termText() );
- // If not stemmed, dont waste the time creating a new token
- if ( !s.equals( token.termText() ) )
- {
- return new Token( s, token.startOffset(),
- token.endOffset(), token.type() );
- }
- return token;
- }
- }
-
- /**
- * Set a alternative/custom DutchStemmer for this filter.
- */
- public void setStemmer( DutchStemmer stemmer )
- {
- if ( stemmer != null )
- {
- this.stemmer = stemmer;
- }
- }
-
- /**
- * Set an alternative exclusion list for this filter.
- */
- public void setExclusionTable( Hashtable exclusiontable )
- {
- exclusions = exclusiontable;
- }
-
- /**
- * Set dictionary for stemming, this dictionary overrules the algorithm,
- * so you can correct for a particular unwanted word-stem pair.
- */
- public void setStemDictionary(Hashtable dict)
- {
- if (stemmer != null)
- stemmer.setStemDictionary(dict);
- }
+public final class DutchStemFilter extends TokenFilter {
+ /**
+ * The actual token in the input stream.
+ */
+ private Token token = null;
+ private DutchStemmer stemmer = null;
+ private HashSet exclusions = null;
+
+ public DutchStemFilter(TokenStream _in) {
+ super(_in);
+ stemmer = new DutchStemmer();
+ }
+
+ /**
+ * Builds a DutchStemFilter that uses an exclusiontable.
+ */
+ public DutchStemFilter(TokenStream _in, HashSet exclusiontable) {
+ this(_in);
+ exclusions = exclusiontable;
+ }
+
+ /**
+ * @param stemdictionary Dictionary of word stem pairs, that overrule the algorithm
+ */
+ public DutchStemFilter(TokenStream _in, HashSet exclusiontable, HashMap stemdictionary) {
+ this(_in, exclusiontable);
+ stemmer.setStemDictionary(stemdictionary);
+ }
+
+ /**
+ * @return Returns the next token in the stream, or null at EOS
+ */
+ public Token next() throws IOException {
+ if ((token = input.next()) == null) {
+ return null;
+ }
+
+ // Check the exclusiontable
+ else if (exclusions != null && exclusions.contains(token.termText())) {
+ return token;
+ } else {
+ String s = stemmer.stem(token.termText());
+ // If not stemmed, dont waste the time creating a new token
+ if (!s.equals(token.termText())) {
+ return new Token(s, token.startOffset(),
+ token.endOffset(), token.type());
+ }
+ return token;
+ }
+ }
+
+ /**
+ * Set a alternative/custom DutchStemmer for this filter.
+ */
+ public void setStemmer(DutchStemmer stemmer) {
+ if (stemmer != null) {
+ this.stemmer = stemmer;
+ }
+ }
+
+ /**
+ * Set an alternative exclusion list for this filter.
+ */
+ public void setExclusionTable(HashSet exclusiontable) {
+ exclusions = exclusiontable;
+ }
+
+ /**
+ * Set dictionary for stemming, this dictionary overrules the algorithm,
+ * so you can correct for a particular unwanted word-stem pair.
+ */
+ public void setStemDictionary(HashMap dict) {
+ if (stemmer != null)
+ stemmer.setStemDictionary(dict);
+ }
}
1.2 +379 -425 jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/nl/DutchStemmer.java
Index: DutchStemmer.java
===================================================================
RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/nl/DutchStemmer.java,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -r1.1 -r1.2
--- DutchStemmer.java 9 Mar 2004 14:55:08 -0000 1.1
+++ DutchStemmer.java 11 Mar 2004 03:05:36 -0000 1.2
@@ -16,9 +16,8 @@
* limitations under the License.
*/
-import java.util.Hashtable;
-import java.util.ArrayList;
-import java.io.*;
+import java.util.HashMap;
+
/*
* @author Edwin de Jonge (ejne@cbs.nl)
*
@@ -26,427 +25,382 @@
* the <see cref="http://snowball.tartarus.org/dutch/stemmer.html">dutch stemming</see>
* algorithm in snowball. Snowball is a project of Martin Porter (does Porter Stemmer ring a bell?):
*/
-public class DutchStemmer
-{
- /**
- * Buffer for the terms while stemming them.
- */
- private StringBuffer sb = new StringBuffer();
- private boolean _removedE;
- private Hashtable _stemDict;
-
- private int _R1;
- private int _R2;
-
- //TODO convert to internal
- /*
- * Stemms the given term to an unique <tt>discriminator</tt>.
- *
- * @param term The term that should be stemmed.
- * @return Discriminator for <tt>term</tt>
- */
- public String stem( String term )
- {
- term = term.toLowerCase();
- if ( !isStemmable( term ) )
- return term;
- if (_stemDict != null && _stemDict.contains(term))
- if (_stemDict.get(term) instanceof String)
- return (String)_stemDict.get(term);
- else return null;
-
- // Reset the StringBuffer.
- sb.delete(0, sb.length());
- sb.insert(0, term);
- // Stemming starts here...
- substitute(sb);
- storeYandI(sb);
- _R1 = getRIndex(sb, 0);
- _R1 = Math.max(3,_R1);
- step1(sb);
- step2(sb);
- _R2 = getRIndex(sb, _R1);
- step3a(sb);
- step3b(sb);
- step4(sb);
- reStoreYandI(sb);
- return sb.toString();
- }
-
- private boolean enEnding(StringBuffer sb)
- {
- String[] enend = new String[]{"ene","en"};
- for (int i = 0; i < enend.length; i++)
- {
- String end = enend[i];
- String s = sb.toString();
- int index = s.length() - end.length();
- if ( s.endsWith(end) &&
- index >= _R1 &&
- isValidEnEnding(sb,index-1)
- )
- {
- sb.delete(index, index + end.length());
- unDouble(sb,index);
- return true;
- }
- }
- return false;
- }
-
-
- private void step1(StringBuffer sb)
- {
- if (_R1 >= sb.length())
- return;
-
- String s = sb.toString();
- int lengthR1 = sb.length() - _R1;
- int index;
-
- if (s.endsWith("heden"))
- {
- sb.replace(_R1, lengthR1 + _R1, sb.substring(_R1, lengthR1 + _R1).replaceAll("heden", "heid"));
- return;
- }
-
- if (enEnding(sb))
- return;
-
- if (s.endsWith("se") &&
- (index = s.length() - 2) >= _R1 &&
- isValidSEnding(sb, index -1)
- )
- {
- sb.delete(index, index + 2);
- return;
- }
- if (s.endsWith("s") &&
- (index = s.length() - 1) >= _R1 &&
- isValidSEnding(sb, index - 1))
- {
- sb.delete(index, index + 1);
- }
- }
-
- /**
- * Delete suffix e if in R1 and
- * preceded by a non-vowel, and then undouble the ending
- *
- * @param sb String being stemmed
- */
- private void step2(StringBuffer sb)
- {
- _removedE = false;
- if (_R1 >= sb.length())
- return;
- String s = sb.toString();
- int index = s.length() - 1;
- if ( index >= _R1 &&
- s.endsWith("e") &&
- !isVowel(sb.charAt(index-1)))
- {
- sb.delete(index, index + 1);
- unDouble(sb);
- _removedE = true;
- }
- }
-
- /**
- * Delete "heid"
- *
- * @param sb String being stemmed
- */
- private void step3a(StringBuffer sb)
- {
- if (_R2 >= sb.length())
- return;
- String s = sb.toString();
- int index = s.length() - 4;
- if (s.endsWith("heid")&& index >= _R2 && sb.charAt(index - 1) != 'c')
- {
- sb.delete(index, index + 4); //remove heid
- enEnding(sb);
- }
- }
-
- /**
- * <p>A d-suffix, or derivational suffix, enables a new word,
- * often with a different grammatical category, or with a different
- * sense, to be built from another word. Whether a d-suffix can be
- * attached is discovered not from the rules of grammar, but by
- * referring to a dictionary. So in English, ness can be added to
- * certain adjectives to form corresponding nouns (littleness,
- * kindness, foolishness ...) but not to all adjectives
- * (not for example, to big, cruel, wise ...) d-suffixes can be
- * used to change meaning, often in rather exotic ways.</p>
- * Remove "ing", "end", "ig", "lijk", "baar" and "bar"
- *
- * @param sb String being stemmed
- */
- private void step3b(StringBuffer sb)
- {
- if (_R2 >= sb.length())
- return;
- String s = sb.toString();
- int index;
-
- if ((s.endsWith("end") || s.endsWith("ing")) &&
- (index = s.length() - 3) >= _R2)
- {
- sb.delete(index, index + 3);
- if (sb.charAt(index - 2) == 'i' &&
- sb.charAt(index - 1) == 'g')
- {
- if (sb.charAt(index - 3) != 'e' & index-2 >= _R2)
- {
- index -= 2;
- sb.delete(index, index + 2);
- }
- }
- else
- {
- unDouble(sb,index);
- }
- return;
- }
- if ( s.endsWith("ig") &&
- (index = s.length() - 2) >= _R2
- )
- {
- if (sb.charAt(index - 1) != 'e')
- sb.delete(index, index + 2);
- return;
- }
- if (s.endsWith("lijk") &&
- (index = s.length() - 4) >= _R2
- )
- {
- sb.delete(index, index + 4);
- step2(sb);
- return;
- }
- if (s.endsWith("baar") &&
- (index = s.length() - 4) >= _R2
- )
- {
- sb.delete(index, index + 4);
- return;
- }
- if (s.endsWith("bar") &&
- (index = s.length() - 3) >= _R2
- )
- {
- if (_removedE)
- sb.delete(index, index + 3);
- return;
- }
- }
-
- /**
- * undouble vowel
- * If the words ends CVD, where C is a non-vowel, D is a non-vowel other than I, and V is double a, e, o or u, remove one of the vowels from V (for example, maan -> man, brood -> brod).
- *
- * @param sb String being stemmed
- */
- private void step4(StringBuffer sb)
- {
- if (sb.length() < 4)
- return;
- String end = sb.substring(sb.length() - 4, sb.length());
- char c = end.charAt(0);
- char v1 = end.charAt(1);
- char v2 = end.charAt(2);
- char d = end.charAt(3);
- if (v1 == v2 &&
- d != 'I' &&
- v1 != 'i' &&
- isVowel(v1) &&
- !isVowel(d) &&
- !isVowel(c))
- {
- sb.delete(sb.length() - 2, sb.length() - 1);
- }
- }
-
- /**
- * Checks if a term could be stemmed.
- *
- * @return true if, and only if, the given term consists in letters.
- */
- private boolean isStemmable( String term )
- {
- for ( int c = 0; c < term.length(); c++ )
- {
- if ( !Character.isLetter(term.charAt(c))) return false;
- }
- return true;
- }
-
- /**
- * Substitute �, �, �, �, �, � , �, �, �, �
- */
- private void substitute( StringBuffer buffer )
- {
- for ( int i = 0; i < buffer.length(); i++ )
- {
- switch (buffer.charAt(i))
- {
- case '�':
- case '�':
- {
- buffer.setCharAt(i, 'a');
- break;
- }
- case '�':
- case '�':
- {
- buffer.setCharAt(i, 'e');
- break;
- }
- case '�':
- case '�':
- {
- buffer.setCharAt(i, 'u');
- break;
- }
- case '�':
- case 'i':
- {
- buffer.setCharAt(i, 'i');
- break;
- }
- case '�':
- case '�':
- {
- buffer.setCharAt(i, 'o');
- break;
- }
- }
- }
- }
-
- private boolean isValidSEnding(StringBuffer sb)
- {
- return isValidSEnding(sb,sb.length() - 1);
- }
-
- private boolean isValidSEnding(StringBuffer sb, int index)
- {
- char c = sb.charAt(index);
- if (isVowel(c) || c == 'j')
- return false;
- return true;
- }
-
- private boolean isValidEnEnding(StringBuffer sb)
- {
- return isValidEnEnding(sb,sb.length() - 1);
- }
-
- private boolean isValidEnEnding(StringBuffer sb, int index)
- {
- char c = sb.charAt(index);
- if (isVowel(c))
- return false;
- if (c < 3)
- return false;
- // ends with "gem"?
- if (c == 'm' && sb.charAt(index - 2) == 'g' && sb.charAt(index-1) == 'e')
- return false;
- return true;
- }
-
- private void unDouble(StringBuffer sb)
- {
- unDouble(sb, sb.length());
- }
-
- private void unDouble(StringBuffer sb, int endIndex)
- {
- String s = sb.substring(0, endIndex);
- if (s.endsWith("kk") || s.endsWith("tt") || s.endsWith("dd") || s.endsWith("nn")|| s.endsWith("mm")|| s.endsWith("ff"))
- {
- sb.delete(endIndex-1, endIndex);
- }
- }
-
- private int getRIndex(StringBuffer sb, int start)
- {
- if (start == 0)
- start = 1;
- int i = start;
- for (; i < sb.length(); i++)
- {
- //first non-vowel preceded by a vowel
- if (!isVowel(sb.charAt(i)) && isVowel(sb.charAt(i-1)))
- {
- return i + 1;
- }
- }
- return i + 1;
- }
-
- private void storeYandI(StringBuffer sb)
- {
- if (sb.charAt(0) == 'y')
- sb.setCharAt(0, 'Y');
-
- char c;
- int last = sb.length() - 1;
-
- for (int i = 1; i < last; i++)
- {
- switch (sb.charAt(i))
- {
- case 'i':
- {
- if (isVowel(sb.charAt(i-1)) &&
- isVowel(sb.charAt(i+1))
- )
- sb.setCharAt(i, 'I');
- break;
- }
- case 'y':
- {
- if (isVowel(sb.charAt(i-1)))
- sb.setCharAt(i, 'Y');
- break;
- }
- }
- }
- if (last > 0 && sb.charAt(last)=='y' && isVowel(sb.charAt(last-1)))
- sb.setCharAt(last, 'Y');
- }
-
- private void reStoreYandI(StringBuffer sb)
- {
- String tmp = sb.toString();
- sb.delete(0, sb.length());
- sb.insert(0, tmp.replaceAll("I","i").replaceAll("Y","y"));
- }
-
- private boolean isVowel(char c)
- {
- switch (c)
- {
- case 'e':
- case 'a':
- case 'o':
- case 'i':
- case 'u':
- case 'y':
- case '�':
- {
- return true;
- }
- }
- return false;
- }
-
- void setStemDictionary(Hashtable dict)
- {
- _stemDict = dict;
- }
+
public class DutchStemmer {
  /**
   * Buffer for the terms while stemming them.
   */
  private StringBuffer sb = new StringBuffer();
  // Set by step2 when the trailing 'e' was removed; step3b's "bar" rule needs it.
  private boolean _removedE;
  // Optional word -> stem overrides, consulted before running the algorithm.
  private HashMap _stemDict;

  // Start offsets of the R1 and R2 regions (Snowball Dutch definition).
  private int _R1;
  private int _R2;

  //TODO convert to internal
  /*
   * Stems the given term to an unique <tt>discriminator</tt>.
   *
   * @param term The term that should be stemmed.
   * @return Discriminator for <tt>term</tt>; the term itself when it is not
   *         stemmable, or null when the stem dictionary maps it to a
   *         non-String value.
   */
  public String stem(String term) {
    term = term.toLowerCase();
    if (!isStemmable(term))
      return term;
    // Guard: an empty term would make storeYandI() index past the buffer.
    if (term.length() == 0)
      return term;
    // A stem-dictionary entry overrules the algorithm entirely.
    // (Single lookup instead of the previous containsKey + double get.)
    if (_stemDict != null && _stemDict.containsKey(term)) {
      Object stem = _stemDict.get(term);
      return (stem instanceof String) ? (String) stem : null;
    }

    // Reset the StringBuffer.
    sb.delete(0, sb.length());
    sb.insert(0, term);
    // Stemming starts here...
    substitute(sb);
    storeYandI(sb);
    _R1 = getRIndex(sb, 0);
    _R1 = Math.max(3, _R1); // R1 never starts before index 3
    step1(sb);
    step2(sb);
    _R2 = getRIndex(sb, _R1);
    step3a(sb);
    step3b(sb);
    step4(sb);
    reStoreYandI(sb);
    return sb.toString();
  }

  /**
   * Removes a trailing "ene" or "en" lying in R1 and preceded by a valid
   * en-ending, then undoubles the resulting ending.
   *
   * @return true when a suffix was removed
   */
  private boolean enEnding(StringBuffer sb) {
    String[] enend = new String[]{"ene", "en"};
    for (int i = 0; i < enend.length; i++) {
      String end = enend[i];
      String s = sb.toString();
      int index = s.length() - end.length();
      if (s.endsWith(end) &&
          index >= _R1 &&
          isValidEnEnding(sb, index - 1)
      ) {
        sb.delete(index, index + end.length());
        unDouble(sb, index);
        return true;
      }
    }
    return false;
  }

  /**
   * Step 1: handle the suffixes "heden" (replaced by "heid"), "ene"/"en",
   * "se" and "s", each restricted to the R1 region.
   *
   * @param sb String being stemmed
   */
  private void step1(StringBuffer sb) {
    if (_R1 >= sb.length())
      return;

    String s = sb.toString();
    int lengthR1 = sb.length() - _R1;
    int index;

    if (s.endsWith("heden")) {
      // Replace "heden" by "heid" inside the R1 region (lengthR1 + _R1 == sb.length()).
      sb.replace(_R1, lengthR1 + _R1, sb.substring(_R1, lengthR1 + _R1).replaceAll("heden", "heid"));
      return;
    }

    if (enEnding(sb))
      return;

    if (s.endsWith("se") &&
        (index = s.length() - 2) >= _R1 &&
        isValidSEnding(sb, index - 1)
    ) {
      sb.delete(index, index + 2);
      return;
    }
    if (s.endsWith("s") &&
        (index = s.length() - 1) >= _R1 &&
        isValidSEnding(sb, index - 1)) {
      sb.delete(index, index + 1);
    }
  }

  /**
   * Delete suffix e if in R1 and
   * preceded by a non-vowel, and then undouble the ending
   *
   * @param sb String being stemmed
   */
  private void step2(StringBuffer sb) {
    _removedE = false;
    if (_R1 >= sb.length())
      return;
    String s = sb.toString();
    int index = s.length() - 1;
    if (index >= _R1 &&
        s.endsWith("e") &&
        !isVowel(sb.charAt(index - 1))) {
      sb.delete(index, index + 1);
      unDouble(sb);
      _removedE = true;
    }
  }

  /**
   * Delete "heid" (in R2, not preceded by 'c'), then retry the en-ending rule.
   *
   * @param sb String being stemmed
   */
  private void step3a(StringBuffer sb) {
    if (_R2 >= sb.length())
      return;
    String s = sb.toString();
    int index = s.length() - 4;
    if (s.endsWith("heid") && index >= _R2 && sb.charAt(index - 1) != 'c') {
      sb.delete(index, index + 4); //remove heid
      enEnding(sb);
    }
  }

  /**
   * <p>A d-suffix, or derivational suffix, enables a new word,
   * often with a different grammatical category, or with a different
   * sense, to be built from another word. Whether a d-suffix can be
   * attached is discovered not from the rules of grammar, but by
   * referring to a dictionary. So in English, ness can be added to
   * certain adjectives to form corresponding nouns (littleness,
   * kindness, foolishness ...) but not to all adjectives
   * (not for example, to big, cruel, wise ...) d-suffixes can be
   * used to change meaning, often in rather exotic ways.</p>
   * Remove "ing", "end", "ig", "lijk", "baar" and "bar"
   *
   * @param sb String being stemmed
   */
  private void step3b(StringBuffer sb) {
    if (_R2 >= sb.length())
      return;
    String s = sb.toString();
    int index = 0;

    if ((s.endsWith("end") || s.endsWith("ing")) &&
        (index = s.length() - 3) >= _R2) {
      sb.delete(index, index + 3);
      if (sb.charAt(index - 2) == 'i' &&
          sb.charAt(index - 1) == 'g') {
        // "ig" remains: delete it too unless preceded by 'e'.
        // (Short-circuit && replaces the original non-short-circuit '&';
        // both operands are side-effect free, so behavior is unchanged.)
        if (sb.charAt(index - 3) != 'e' && index - 2 >= _R2) {
          index -= 2;
          sb.delete(index, index + 2);
        }
      } else {
        unDouble(sb, index);
      }
      return;
    }
    if (s.endsWith("ig") &&
        (index = s.length() - 2) >= _R2
    ) {
      if (sb.charAt(index - 1) != 'e')
        sb.delete(index, index + 2);
      return;
    }
    if (s.endsWith("lijk") &&
        (index = s.length() - 4) >= _R2
    ) {
      sb.delete(index, index + 4);
      step2(sb);
      return;
    }
    if (s.endsWith("baar") &&
        (index = s.length() - 4) >= _R2
    ) {
      sb.delete(index, index + 4);
      return;
    }
    if (s.endsWith("bar") &&
        (index = s.length() - 3) >= _R2
    ) {
      // "bar" is only removed when step2 removed an 'e'.
      if (_removedE)
        sb.delete(index, index + 3);
      return;
    }
  }

  /**
   * undouble vowel
   * If the words ends CVD, where C is a non-vowel, D is a non-vowel other than I, and V is double a, e, o or u, remove one of the vowels from V (for example, maan -> man, brood -> brod).
   *
   * @param sb String being stemmed
   */
  private void step4(StringBuffer sb) {
    if (sb.length() < 4)
      return;
    String end = sb.substring(sb.length() - 4, sb.length());
    char c = end.charAt(0);
    char v1 = end.charAt(1);
    char v2 = end.charAt(2);
    char d = end.charAt(3);
    if (v1 == v2 &&
        d != 'I' &&
        v1 != 'i' &&
        isVowel(v1) &&
        !isVowel(d) &&
        !isVowel(c)) {
      sb.delete(sb.length() - 2, sb.length() - 1);
    }
  }

  /**
   * Checks if a term could be stemmed.
   *
   * @return true if, and only if, the given term consists in letters.
   */
  private boolean isStemmable(String term) {
    for (int c = 0; c < term.length(); c++) {
      if (!Character.isLetter(term.charAt(c))) return false;
    }
    return true;
  }

  /**
   * Substitute accented vowels by their base letter:
   * a-umlaut/a-acute -> a, e-umlaut/e-acute -> e, u-umlaut/u-acute -> u,
   * i-umlaut/i-acute -> i, o-umlaut/o-acute -> o.
   * NOTE(review): the archived copy of this file had these char literals
   * corrupted by a charset round-trip (identical replacement characters,
   * which are illegal duplicate case labels). The literals below are
   * reconstructed from the Snowball Dutch stemmer reference, written as
   * Unicode escapes so the file is encoding-proof — confirm against the
   * repository original.
   */
  private void substitute(StringBuffer buffer) {
    for (int i = 0; i < buffer.length(); i++) {
      switch (buffer.charAt(i)) {
        case '\u00E4': // ä
        case '\u00E1': // á
        {
          buffer.setCharAt(i, 'a');
          break;
        }
        case '\u00EB': // ë
        case '\u00E9': // é
        {
          buffer.setCharAt(i, 'e');
          break;
        }
        case '\u00FC': // ü
        case '\u00FA': // ú
        {
          buffer.setCharAt(i, 'u');
          break;
        }
        case '\u00EF': // ï
        case '\u00ED': // í
        {
          buffer.setCharAt(i, 'i');
          break;
        }
        case '\u00F6': // ö
        case '\u00F3': // ó
        {
          buffer.setCharAt(i, 'o');
          break;
        }
      }
    }
  }

  private boolean isValidSEnding(StringBuffer sb) {
    return isValidSEnding(sb, sb.length() - 1);
  }

  /**
   * A valid s-ending is a non-vowel other than 'j'.
   */
  private boolean isValidSEnding(StringBuffer sb, int index) {
    char c = sb.charAt(index);
    if (isVowel(c) || c == 'j')
      return false;
    return true;
  }

  private boolean isValidEnEnding(StringBuffer sb) {
    return isValidEnEnding(sb, sb.length() - 1);
  }

  /**
   * A valid en-ending is a non-vowel, and not "gem".
   * BUGFIX: the original guard read "if (c < 3)", comparing the *character*
   * against 3 — always false for letters, leaving the "gem" look-behind
   * unprotected for very short buffers. The intended short-buffer guard is
   * now folded into the look-behind itself.
   */
  private boolean isValidEnEnding(StringBuffer sb, int index) {
    char c = sb.charAt(index);
    if (isVowel(c))
      return false;
    // ends with "gem"? (only possible when there are two chars before index)
    if (c == 'm' && index >= 2 &&
        sb.charAt(index - 2) == 'g' && sb.charAt(index - 1) == 'e')
      return false;
    return true;
  }

  private void unDouble(StringBuffer sb) {
    unDouble(sb, sb.length());
  }

  /**
   * Undoubles a trailing kk/tt/dd/nn/mm/ff ending at endIndex.
   */
  private void unDouble(StringBuffer sb, int endIndex) {
    String s = sb.substring(0, endIndex);
    if (s.endsWith("kk") || s.endsWith("tt") || s.endsWith("dd") || s.endsWith("nn") || s.endsWith("mm") || s.endsWith("ff")) {
      sb.delete(endIndex - 1, endIndex);
    }
  }

  /**
   * Returns the index after the first non-vowel that follows a vowel,
   * scanning from <tt>start</tt>; this is how the Snowball R-regions are found.
   */
  private int getRIndex(StringBuffer sb, int start) {
    if (start == 0)
      start = 1;
    int i = start;
    for (; i < sb.length(); i++) {
      //first non-vowel preceded by a vowel
      if (!isVowel(sb.charAt(i)) && isVowel(sb.charAt(i - 1))) {
        return i + 1;
      }
    }
    return i + 1;
  }

  /**
   * Marks y/i that act as consonants by upper-casing them (Y/I), so the
   * suffix rules do not treat them as vowels; reStoreYandI undoes this.
   * Callers must pass a non-empty buffer (stem() guarantees this).
   */
  private void storeYandI(StringBuffer sb) {
    if (sb.charAt(0) == 'y')
      sb.setCharAt(0, 'Y');

    int last = sb.length() - 1;

    for (int i = 1; i < last; i++) {
      switch (sb.charAt(i)) {
        case 'i':
        {
          // 'i' between two vowels acts as a consonant.
          if (isVowel(sb.charAt(i - 1)) &&
              isVowel(sb.charAt(i + 1))
          )
            sb.setCharAt(i, 'I');
          break;
        }
        case 'y':
        {
          // 'y' after a vowel acts as a consonant.
          if (isVowel(sb.charAt(i - 1)))
            sb.setCharAt(i, 'Y');
          break;
        }
      }
    }
    if (last > 0 && sb.charAt(last) == 'y' && isVowel(sb.charAt(last - 1)))
      sb.setCharAt(last, 'Y');
  }

  /**
   * Restores the Y/I markers set by storeYandI back to lower case.
   */
  private void reStoreYandI(StringBuffer sb) {
    String tmp = sb.toString();
    sb.delete(0, sb.length());
    sb.insert(0, tmp.replaceAll("I", "i").replaceAll("Y", "y"));
  }

  private boolean isVowel(char c) {
    switch (c) {
      case 'e':
      case 'a':
      case 'o':
      case 'i':
      case 'u':
      case 'y':
      case '\u00E8': // è — reconstructed (see substitute); confirm vs. repository
      {
        return true;
      }
    }
    return false;
  }

  /**
   * Installs the word -> stem dictionary that overrules the algorithm.
   */
  void setStemDictionary(HashMap dict) {
    _stemDict = dict;
  }
}
1.2 +92 -111 jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/nl/WordlistLoader.java
Index: WordlistLoader.java
===================================================================
RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/nl/WordlistLoader.java,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -r1.1 -r1.2
--- WordlistLoader.java 9 Mar 2004 14:55:08 -0000 1.1
+++ WordlistLoader.java 11 Mar 2004 03:05:36 -0000 1.2
@@ -20,123 +20,104 @@
import java.io.FileReader;
import java.io.IOException;
import java.io.LineNumberReader;
-import java.util.Hashtable;
+import java.util.HashMap;
/**
- *
* @author Gerhard Schwarz
- *
- * Loads a text file and adds every line as an entry to a Hashtable. Every line
- * should contain only one word. If the file is not found or on any error, an
- * empty table is returned.
+ * <p/>
+ * Loads a text file and adds every line as an entry to a HashMap. Every line
+ * should contain only one word. If the file is not found or on any error, an
+ * empty table is returned.
*/
-public class WordlistLoader
-{
- /**
- * @param path Path to the wordlist
- * @param wordfile Name of the wordlist
- */
- public static Hashtable getWordtable( String path, String wordfile )
- {
- if ( path == null || wordfile == null )
- {
- return new Hashtable();
- }
- return getWordtable(new File(path, wordfile));
- }
+public class WordlistLoader {
+ /**
+ * @param path Path to the wordlist
+ * @param wordfile Name of the wordlist
+ */
+ public static HashMap getWordtable(String path, String wordfile) {
+ if (path == null || wordfile == null) {
+ return new HashMap();
+ }
+ return getWordtable(new File(path, wordfile));
+ }
- /**
- * @param wordfile Complete path to the wordlist
- */
- public static Hashtable getWordtable( String wordfile )
- {
- if ( wordfile == null )
- {
- return new Hashtable();
- }
- return getWordtable( new File( wordfile ) );
- }
+ /**
+ * @param wordfile Complete path to the wordlist
+ */
+ public static HashMap getWordtable(String wordfile) {
+ if (wordfile == null) {
+ return new HashMap();
+ }
+ return getWordtable(new File(wordfile));
+ }
- /**
- * Reads a stemsdictionary. Each line contains:
- * word \t stem
- * i.e. tab seperated)
- *
- * @return Stem dictionary that overrules, the stemming algorithm
- */
- public static Hashtable getStemDict( File wordstemfile)
- {
- if ( wordstemfile == null )
- {
- return new Hashtable();
- }
- Hashtable result = new Hashtable();
- try
- {
- LineNumberReader lnr = new LineNumberReader(new FileReader(wordstemfile));
- String line;
- String[] wordstem;
- while ((line = lnr.readLine()) != null)
- {
- wordstem = line.split("\t", 2);
- result.put(wordstem[0], wordstem[1]);
- }
- }
- catch (IOException e)
- {}
- return result;
- }
+ /**
+ * Reads a stemsdictionary. Each line contains:
+ * word \t stem
+ * i.e. tab seperated)
+ *
+ * @return Stem dictionary that overrules, the stemming algorithm
+ */
+ public static HashMap getStemDict(File wordstemfile) {
+ if (wordstemfile == null) {
+ return new HashMap();
+ }
+ HashMap result = new HashMap();
+ try {
+ LineNumberReader lnr = new LineNumberReader(new FileReader(wordstemfile));
+ String line;
+ String[] wordstem;
+ while ((line = lnr.readLine()) != null) {
+ wordstem = line.split("\t", 2);
+ result.put(wordstem[0], wordstem[1]);
+ }
+ } catch (IOException e) {
+ }
+ return result;
+ }
- /**
- * @param wordfile File containing the wordlist
- */
- public static Hashtable getWordtable( File wordfile )
- {
- if ( wordfile == null )
- {
- return new Hashtable();
- }
- Hashtable result = null;
- try
- {
- LineNumberReader lnr = new LineNumberReader(new FileReader(wordfile));
- String word = null;
- String[] stopwords = new String[100];
- int wordcount = 0;
- while ( ( word = lnr.readLine() ) != null )
- {
- wordcount++;
- if ( wordcount == stopwords.length )
- {
- String[] tmp = new String[stopwords.length + 50];
- System.arraycopy( stopwords, 0, tmp, 0, wordcount );
- stopwords = tmp;
- }
- stopwords[wordcount-1] = word;
- }
- result = makeWordTable( stopwords, wordcount );
- }
- // On error, use an empty table
- catch (IOException e)
- {
- result = new Hashtable();
- }
- return result;
- }
+ /**
+ * @param wordfile File containing the wordlist
+ */
+ public static HashMap getWordtable(File wordfile) {
+ if (wordfile == null) {
+ return new HashMap();
+ }
+ HashMap result = null;
+ try {
+ LineNumberReader lnr = new LineNumberReader(new FileReader(wordfile));
+ String word = null;
+ String[] stopwords = new String[100];
+ int wordcount = 0;
+ while ((word = lnr.readLine()) != null) {
+ wordcount++;
+ if (wordcount == stopwords.length) {
+ String[] tmp = new String[stopwords.length + 50];
+ System.arraycopy(stopwords, 0, tmp, 0, wordcount);
+ stopwords = tmp;
+ }
+ stopwords[wordcount - 1] = word;
+ }
+ result = makeWordTable(stopwords, wordcount);
+ }
+ // On error, use an empty table
+ catch (IOException e) {
+ result = new HashMap();
+ }
+ return result;
+ }
- /**
- * Builds the wordlist table.
- *
- * @param words Word that where read
- * @param length Amount of words that where read into <tt>words</tt>
- */
- private static Hashtable makeWordTable( String[] words, int length )
- {
- Hashtable table = new Hashtable( length );
- for ( int i = 0; i < length; i++ )
- {
- table.put(words[i], words[i]);
- }
- return table;
- }
+ /**
+ * Builds the wordlist table.
+ *
+ * @param words Word that where read
+ * @param length Amount of words that where read into <tt>words</tt>
+ */
+ private static HashMap makeWordTable(String[] words, int length) {
+ HashMap table = new HashMap(length);
+ for (int i = 0; i < length; i++) {
+ table.put(words[i], words[i]);
+ }
+ return table;
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: lucene-dev-unsubscribe@jakarta.apache.org
For additional commands, e-mail: lucene-dev-help@jakarta.apache.org