You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2010/05/19 13:58:37 UTC
svn commit: r946139 - in /lucene/dev/trunk: lucene/contrib/
modules/analysis/common/src/java/org/apache/lucene/analysis/compound/
modules/analysis/common/src/test/org/apache/lucene/analysis/compound/
Author: rmuir
Date: Wed May 19 11:58:37 2010
New Revision: 946139
URL: http://svn.apache.org/viewvc?rev=946139&view=rev
Log:
LUCENE-1287: Allow usage of HyphenationCompoundWordTokenFilter without a dictionary
Modified:
lucene/dev/trunk/lucene/contrib/CHANGES.txt
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java
lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/package.html
lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java
Modified: lucene/dev/trunk/lucene/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/CHANGES.txt?rev=946139&r1=946138&r2=946139&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/contrib/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/contrib/CHANGES.txt Wed May 19 11:58:37 2010
@@ -160,6 +160,9 @@ New features
* LUCENE-2463: Add a Greek inflectional stemmer. GreekAnalyzer will now stem words
when Version is set to 3.1 or higher. (Robert Muir)
+ * LUCENE-1287: Allow usage of HyphenationCompoundWordTokenFilter without dictionary.
+ (Thomas Peuss via Robert Muir)
+
Build
* LUCENE-2124: Moved the JDK-based collation support from contrib/collation
Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java?rev=946139&r1=946138&r2=946139&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java Wed May 19 11:58:37 2010
@@ -154,7 +154,7 @@ public abstract class CompoundWordTokenF
this.maxSubwordSize=maxSubwordSize;
this.onlyLongestMatch=onlyLongestMatch;
- if (dictionary instanceof CharArraySet) {
+ if (dictionary==null || dictionary instanceof CharArraySet) {
this.dictionary = (CharArraySet) dictionary;
} else {
this.dictionary = new CharArraySet(matchVersion, dictionary.size(), false);
@@ -181,6 +181,9 @@ public abstract class CompoundWordTokenF
}
public static final Set<?> makeDictionary(final Version matchVersion, final String[] dictionary) {
+ if (dictionary == null) {
+ return null;
+ }
// is the below really case insensitive?
CharArraySet dict = new CharArraySet(matchVersion, dictionary.length, false);
addAllLowerCase(dict, Arrays.asList(dictionary));
Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java?rev=946139&r1=946138&r2=946139&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java Wed May 19 11:58:37 2010
@@ -154,6 +154,33 @@ public class HyphenationCompoundWordToke
}
/**
+ * Create a HyphenationCompoundWordTokenFilter with no dictionary.
+ * <p>
+ * Calls {@link #HyphenationCompoundWordTokenFilter(Version, TokenStream, HyphenationTree, Set, int, int, int, boolean)
+ * HyphenationCompoundWordTokenFilter(matchVersion, input, hyphenator,
+ * null, minWordSize, minSubwordSize, maxSubwordSize }
+ */
+ public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
+ HyphenationTree hyphenator, int minWordSize, int minSubwordSize,
+ int maxSubwordSize) {
+ this(matchVersion, input, hyphenator, (Set<?>) null, minWordSize, minSubwordSize,
+ maxSubwordSize, false);
+ }
+
+ /**
+ * Create a HyphenationCompoundWordTokenFilter with no dictionary.
+ * <p>
+ * Calls {@link #HyphenationCompoundWordTokenFilter(Version, TokenStream, HyphenationTree, int, int, int)
+ * HyphenationCompoundWordTokenFilter(matchVersion, input, hyphenator,
+ * DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE }
+ */
+ public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
+ HyphenationTree hyphenator) {
+ this(matchVersion, input, hyphenator, DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE,
+ DEFAULT_MAX_SUBWORD_SIZE);
+ }
+
+ /**
* Creates a new {@link HyphenationCompoundWordTokenFilter} instance.
*
* @param input the {@link TokenStream} to process
@@ -305,7 +332,7 @@ public class HyphenationCompoundWordToke
}
// check the dictionary
- if (dictionary.contains(lowerCaseTermBuffer, start, partLength)) {
+ if (dictionary == null || dictionary.contains(lowerCaseTermBuffer, start, partLength)) {
if (this.onlyLongestMatch) {
if (longestMatchToken != null) {
if (longestMatchToken.termLength() < partLength) {
Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/package.html
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/package.html?rev=946139&r1=946138&r2=946139&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/package.html (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/package.html Wed May 19 11:58:37 2010
@@ -81,8 +81,9 @@ filter available:
The {@link
org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter
HyphenationCompoundWordTokenFilter} uses hyphenation grammars to find
-potential subwords that a worth to check against the dictionary. The
-quality of the output tokens is directly connected to the quality of the
+potential subwords that a worth to check against the dictionary. It can be used
+without a dictionary as well but then produces a lot of "nonword" tokens.
+The quality of the output tokens is directly connected to the quality of the
grammar file you use. For languages like German they are quite good.
<h5>Grammar file</h5>
Unfortunately we cannot bundle the hyphenation grammar files with Lucene
@@ -157,8 +158,24 @@ This decision matrix should help you:
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
- Token t;
- while ((t=tf.next())!=null) {
+ CharTermAttribute t = tf.addAttribute(CharTermAttribute.class);
+ while (tf.incrementToken()) {
+ System.out.println(t);
+ }
+ }
+
+ public void testHyphenationCompoundWordsWithoutDictionaryDE() throws Exception {
+ Reader reader = new FileReader("de_DR.xml");
+
+ HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
+ .getHyphenationTree(reader);
+
+ HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(
+ new WhitespaceTokenizer(new StringReader(
+ "Rindfleischüberwachungsgesetz Drahtschere abba")), hyphenator);
+
+ CharTermAttribute t = tf.addAttribute(CharTermAttribute.class);
+ while (tf.incrementToken()) {
System.out.println(t);
}
}
@@ -173,8 +190,8 @@ This decision matrix should help you:
new StringReader(
"Bildörr Bilmotor Biltak Slagborr Hammarborr Pelarborr Glasögonfodral Basfiolsfodral Basfiolsfodralmakaregesäll Skomakare Vindrutetorkare Vindrutetorkarblad abba")),
dict);
- Token t;
- while ((t=tf.next())!=null) {
+ CharTermAttribute t = tf.addAttribute(CharTermAttribute.class);
+ while (tf.incrementToken()) {
System.out.println(t);
}
}
Modified: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java?rev=946139&r1=946138&r2=946139&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java Wed May 19 11:58:37 2010
@@ -70,6 +70,54 @@ public class TestCompoundWordTokenFilter
}
+ /**
+ * With hyphenation-only, you can get a lot of nonsense tokens.
+ * This can be controlled with the min/max subword size.
+ */
+ public void testHyphenationOnly() throws Exception {
+ Reader reader = getHyphenationReader();
+ HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
+ .getHyphenationTree(reader);
+
+ HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(
+ TEST_VERSION_CURRENT,
+ new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("basketballkurv")),
+ hyphenator,
+ CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
+ 2, 4);
+
+ // min=2, max=4
+ assertTokenStreamContents(tf,
+ new String[] { "basketballkurv", "ba", "sket", "bal", "ball", "kurv" }
+ );
+
+ tf = new HyphenationCompoundWordTokenFilter(
+ TEST_VERSION_CURRENT,
+ new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("basketballkurv")),
+ hyphenator,
+ CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
+ 4, 6);
+
+ // min=4, max=6
+ assertTokenStreamContents(tf,
+ new String[] { "basketballkurv", "basket", "sket", "ball", "lkurv", "kurv" }
+ );
+
+ tf = new HyphenationCompoundWordTokenFilter(
+ TEST_VERSION_CURRENT,
+ new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("basketballkurv")),
+ hyphenator,
+ CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
+ 4, 10);
+
+ // min=4, max=10
+ assertTokenStreamContents(tf,
+ new String[] { "basketballkurv", "basket", "basketbal", "basketball", "sket",
+ "sketbal", "sketball", "ball", "ballkurv", "lkurv", "kurv" }
+ );
+
+ }
+
public void testDumbCompoundWordsSE() throws Exception {
String[] dict = { "Bil", "Dörr", "Motor", "Tak", "Borr", "Slag", "Hammar",
"Pelar", "Glas", "Ögon", "Fodral", "Bas", "Fiol", "Makare", "Gesäll",