You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2010/05/19 13:58:37 UTC

svn commit: r946139 - in /lucene/dev/trunk: lucene/contrib/ modules/analysis/common/src/java/org/apache/lucene/analysis/compound/ modules/analysis/common/src/test/org/apache/lucene/analysis/compound/

Author: rmuir
Date: Wed May 19 11:58:37 2010
New Revision: 946139

URL: http://svn.apache.org/viewvc?rev=946139&view=rev
Log:
LUCENE-1287: Allow usage of HyphenationCompoundWordTokenFilter without a dictionary

Modified:
    lucene/dev/trunk/lucene/contrib/CHANGES.txt
    lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java
    lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java
    lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/package.html
    lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java

Modified: lucene/dev/trunk/lucene/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/CHANGES.txt?rev=946139&r1=946138&r2=946139&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/contrib/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/contrib/CHANGES.txt Wed May 19 11:58:37 2010
@@ -160,6 +160,9 @@ New features
  * LUCENE-2463: Add a Greek inflectional stemmer. GreekAnalyzer will now stem words
    when Version is set to 3.1 or higher.  (Robert Muir)
 
+ * LUCENE-1287: Allow usage of HyphenationCompoundWordTokenFilter without dictionary.
+   (Thomas Peuss via Robert Muir)
+
 Build
 
  * LUCENE-2124: Moved the JDK-based collation support from contrib/collation 

Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java?rev=946139&r1=946138&r2=946139&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java Wed May 19 11:58:37 2010
@@ -154,7 +154,7 @@ public abstract class CompoundWordTokenF
     this.maxSubwordSize=maxSubwordSize;
     this.onlyLongestMatch=onlyLongestMatch;
     
-    if (dictionary instanceof CharArraySet) {
+    if (dictionary==null || dictionary instanceof CharArraySet) {
       this.dictionary = (CharArraySet) dictionary;
     } else {
       this.dictionary = new CharArraySet(matchVersion, dictionary.size(), false);
@@ -181,6 +181,9 @@ public abstract class CompoundWordTokenF
   }
   
   public static final Set<?> makeDictionary(final Version matchVersion, final String[] dictionary) {
+    if (dictionary == null) {
+      return null;
+    }
     // is the below really case insensitive? 
     CharArraySet dict = new CharArraySet(matchVersion, dictionary.length, false);
     addAllLowerCase(dict, Arrays.asList(dictionary));

Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java?rev=946139&r1=946138&r2=946139&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java Wed May 19 11:58:37 2010
@@ -154,6 +154,33 @@ public class HyphenationCompoundWordToke
   }
 
   /**
+   * Create a HyphenationCompoundWordTokenFilter with no dictionary.
+   * <p>
+   * Calls {@link #HyphenationCompoundWordTokenFilter(Version, TokenStream, HyphenationTree, Set, int, int, int, boolean)
+   * HyphenationCompoundWordTokenFilter(matchVersion, input, hyphenator,
+   * null, minWordSize, minSubwordSize, maxSubwordSize, false)}
+   */
+  public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
+      HyphenationTree hyphenator, int minWordSize, int minSubwordSize,
+      int maxSubwordSize) {
+    this(matchVersion, input, hyphenator, (Set<?>) null, minWordSize, minSubwordSize,
+        maxSubwordSize, false);
+  }
+  
+  /**
+   * Create a HyphenationCompoundWordTokenFilter with no dictionary.
+   * <p>
+   * Calls {@link #HyphenationCompoundWordTokenFilter(Version, TokenStream, HyphenationTree, int, int, int) 
+   * HyphenationCompoundWordTokenFilter(matchVersion, input, hyphenator, 
+   * DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE)}
+   */
+  public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
+      HyphenationTree hyphenator) {
+    this(matchVersion, input, hyphenator, DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, 
+        DEFAULT_MAX_SUBWORD_SIZE);
+  }
+
+  /**
    * Creates a new {@link HyphenationCompoundWordTokenFilter} instance.
    * 
    * @param input the {@link TokenStream} to process
@@ -305,7 +332,7 @@ public class HyphenationCompoundWordToke
         }
 
         // check the dictionary
-        if (dictionary.contains(lowerCaseTermBuffer, start, partLength)) {
+        if (dictionary == null || dictionary.contains(lowerCaseTermBuffer, start, partLength)) {
           if (this.onlyLongestMatch) {
             if (longestMatchToken != null) {
               if (longestMatchToken.termLength() < partLength) {

Modified: lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/package.html
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/package.html?rev=946139&r1=946138&r2=946139&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/package.html (original)
+++ lucene/dev/trunk/modules/analysis/common/src/java/org/apache/lucene/analysis/compound/package.html Wed May 19 11:58:37 2010
@@ -81,8 +81,9 @@ filter available:
 The {@link
 org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter
 HyphenationCompoundWordTokenFilter} uses hyphenation grammars to find
-potential subwords that a worth to check against the dictionary. The
-quality of the output tokens is directly connected to the quality of the
+potential subwords that are worth checking against the dictionary. It can be used
+without a dictionary as well, but then produces a lot of "nonword" tokens.
+The quality of the output tokens is directly connected to the quality of the
 grammar file you use. For languages like German they are quite good.
 <h5>Grammar file</h5>
 Unfortunately we cannot bundle the hyphenation grammar files with Lucene
@@ -157,8 +158,24 @@ This decision matrix should help you:
         CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
         CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
         
-    Token t;
-    while ((t=tf.next())!=null) {
+    CharTermAttribute t = tf.addAttribute(CharTermAttribute.class);
+    while (tf.incrementToken()) {
+       System.out.println(t);
+    }
+  }
+
+  public void testHyphenationCompoundWordsWithoutDictionaryDE() throws Exception {
+    Reader reader = new FileReader("de_DR.xml");
+
+    HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
+        .getHyphenationTree(reader);
+
+    HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(
+        new WhitespaceTokenizer(new StringReader(
+            "Rindfleisch&uuml;berwachungsgesetz Drahtschere abba")), hyphenator);
+        
+    CharTermAttribute t = tf.addAttribute(CharTermAttribute.class);
+    while (tf.incrementToken()) {
        System.out.println(t);
     }
   }
@@ -173,8 +190,8 @@ This decision matrix should help you:
             new StringReader(
                 "Bild&ouml;rr Bilmotor Biltak Slagborr Hammarborr Pelarborr Glas&ouml;gonfodral Basfiolsfodral Basfiolsfodralmakareges&auml;ll Skomakare Vindrutetorkare Vindrutetorkarblad abba")),
         dict);
-    Token t;
-    while ((t=tf.next())!=null) {
+    CharTermAttribute t = tf.addAttribute(CharTermAttribute.class);
+    while (tf.incrementToken()) {
        System.out.println(t);
     }
   }

Modified: lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java?rev=946139&r1=946138&r2=946139&view=diff
==============================================================================
--- lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java (original)
+++ lucene/dev/trunk/modules/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java Wed May 19 11:58:37 2010
@@ -70,6 +70,54 @@ public class TestCompoundWordTokenFilter
 
   }
 
+  /**
+   * With hyphenation-only, you can get a lot of nonsense tokens.
+   * This can be controlled with the min/max subword size.
+   */
+  public void testHyphenationOnly() throws Exception {
+    Reader reader = getHyphenationReader();
+    HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
+      .getHyphenationTree(reader);
+    
+    HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(
+        TEST_VERSION_CURRENT,
+        new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("basketballkurv")),
+        hyphenator,
+        CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
+        2, 4);
+    
+    // min=2, max=4
+    assertTokenStreamContents(tf,
+        new String[] { "basketballkurv", "ba", "sket", "bal", "ball", "kurv" }
+    );
+    
+    tf = new HyphenationCompoundWordTokenFilter(
+        TEST_VERSION_CURRENT,
+        new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("basketballkurv")),
+        hyphenator,
+        CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
+        4, 6);
+    
+    // min=4, max=6
+    assertTokenStreamContents(tf,
+        new String[] { "basketballkurv", "basket", "sket", "ball", "lkurv", "kurv" }
+    );
+    
+    tf = new HyphenationCompoundWordTokenFilter(
+        TEST_VERSION_CURRENT,
+        new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("basketballkurv")),
+        hyphenator,
+        CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
+        4, 10);
+    
+    // min=4, max=10
+    assertTokenStreamContents(tf,
+        new String[] { "basketballkurv", "basket", "basketbal", "basketball", "sket", 
+                       "sketbal", "sketball", "ball", "ballkurv", "lkurv", "kurv" }
+    );
+    
+  }
+
   public void testDumbCompoundWordsSE() throws Exception {
     String[] dict = { "Bil", "Dörr", "Motor", "Tak", "Borr", "Slag", "Hammar",
         "Pelar", "Glas", "Ögon", "Fodral", "Bas", "Fiol", "Makare", "Gesäll",