You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/08/05 00:37:14 UTC

svn commit: r1369502 - in /lucene/dev/trunk/lucene: ./ analysis/common/src/java/org/apache/lucene/analysis/cjk/ analysis/common/src/test/org/apache/lucene/analysis/cjk/

Author: rmuir
Date: Sat Aug  4 22:37:14 2012
New Revision: 1369502

URL: http://svn.apache.org/viewvc?rev=1369502&view=rev
Log:
LUCENE-4286: add unibigram option to CJKBigramFilter

Modified:
    lucene/dev/trunk/lucene/CHANGES.txt
    lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKBigramFilter.java
    lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKBigramFilterFactory.java
    lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKBigramFilter.java
    lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKBigramFilterFactory.java

Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1369502&r1=1369501&r2=1369502&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Sat Aug  4 22:37:14 2012
@@ -47,6 +47,11 @@ New features
   int docID), to attempt deletion by docID as long as the provided
   reader is an NRT reader, and the segment has not yet been merged
   away (Mike McCandless).
+  
+* LUCENE-4286: Added option to CJKBigramFilter to always also output
+  unigrams. This can be used for a unigram+bigram approach, or at 
+  index-time only for better support of short queries.
+  (Tom Burton-West, Robert Muir)
 
 API Changes
 

Modified: lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKBigramFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKBigramFilter.java?rev=1369502&r1=1369501&r2=1369502&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKBigramFilter.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKBigramFilter.java Sat Aug  4 22:37:14 2012
@@ -24,6 +24,8 @@ import org.apache.lucene.analysis.TokenS
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.apache.lucene.util.ArrayUtil;
 
@@ -35,6 +37,12 @@ import org.apache.lucene.util.ArrayUtil;
  * {@link #CJKBigramFilter(TokenStream, int)} to explicitly control which
  * of the CJK scripts are turned into bigrams.
  * <p>
+ * By default, when a CJK character has no adjacent characters to form
+ * a bigram, it is output in unigram form. If you want to always output
+ * both unigrams and bigrams, set the <code>outputUnigrams</code>
+ * flag in {@link CJKBigramFilter#CJKBigramFilter(TokenStream, int, boolean)}.
+ * This can be used for a combined unigram+bigram approach.
+ * <p>
  * In all cases, all non-CJK input is passed thru unmodified.
  */
 public final class CJKBigramFilter extends TokenFilter {
@@ -67,10 +75,16 @@ public final class CJKBigramFilter exten
   private final Object doHiragana;
   private final Object doKatakana;
   private final Object doHangul;
+  
+  // true if we should output unigram tokens always
+  private final boolean outputUnigrams;
+  private boolean ngramState; // false = output unigram, true = output bigram
     
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
   private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+  private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
+  private final PositionLengthAttribute posLengthAtt = addAttribute(PositionLengthAttribute.class);
   
   // buffers containing codepoint and offsets in parallel
   int buffer[] = new int[8];
@@ -88,23 +102,36 @@ public final class CJKBigramFilter exten
   
   /** 
    * Calls {@link CJKBigramFilter#CJKBigramFilter(TokenStream, int)
-   *       CJKBigramFilter(HAN | HIRAGANA | KATAKANA | HANGUL)}
+   *       CJKBigramFilter(in, HAN | HIRAGANA | KATAKANA | HANGUL)}
    */
   public CJKBigramFilter(TokenStream in) {
     this(in, HAN | HIRAGANA | KATAKANA | HANGUL);
   }
   
   /** 
-   * Create a new CJKBigramFilter, specifying which writing systems should be bigrammed.
+   * Calls {@link CJKBigramFilter#CJKBigramFilter(TokenStream, int, boolean)
+   *       CJKBigramFilter(in, flags, false)}
+   */
+  public CJKBigramFilter(TokenStream in, int flags) {
+    this(in, flags, false);
+  }
+  
+  /**
+   * Create a new CJKBigramFilter, specifying which writing systems should be bigrammed,
+   * and whether or not unigrams should also be output.
    * @param flags OR'ed set from {@link CJKBigramFilter#HAN}, {@link CJKBigramFilter#HIRAGANA}, 
    *        {@link CJKBigramFilter#KATAKANA}, {@link CJKBigramFilter#HANGUL}
+   * @param outputUnigrams true if unigrams for the selected writing systems should also be output.
+   *        When this is false, unigrams are only output when there are no adjacent characters
+   *        to form a bigram.
    */
-  public CJKBigramFilter(TokenStream in, int flags) {
+  public CJKBigramFilter(TokenStream in, int flags, boolean outputUnigrams) {
     super(in);
     doHan =      (flags & HAN) == 0      ? NO : HAN_TYPE;
     doHiragana = (flags & HIRAGANA) == 0 ? NO : HIRAGANA_TYPE;
     doKatakana = (flags & KATAKANA) == 0 ? NO : KATAKANA_TYPE;
     doHangul =   (flags & HANGUL) == 0   ? NO : HANGUL_TYPE;
+    this.outputUnigrams = outputUnigrams;
   }
   
   /*
@@ -120,7 +147,24 @@ public final class CJKBigramFilter exten
         // case 1: we have multiple remaining codepoints buffered,
         // so we can emit a bigram here.
         
-        flushBigram();
+        if (outputUnigrams) {
+
+          // when also outputting unigrams, we output the unigram first,
+          // then rewind back to revisit the bigram.
+          // so an input of ABC is A + (rewind)AB + B + (rewind)BC + C
+          // the logic in hasBufferedUnigram ensures we output the C, 
+          // even though it did actually have adjacent CJK characters.
+
+          if (ngramState) {
+            flushBigram();
+          } else {
+            flushUnigram();
+            index--;
+          }
+          ngramState = !ngramState;
+        } else {
+          flushBigram();
+        }
         return true;
       } else if (doNext()) {
         
@@ -260,6 +304,11 @@ public final class CJKBigramFilter exten
     termAtt.setLength(len2);
     offsetAtt.setOffset(startOffset[index], endOffset[index+1]);
     typeAtt.setType(DOUBLE_TYPE);
+    // when outputting unigrams, all bigrams are synonyms that span two unigrams
+    if (outputUnigrams) {
+      posIncAtt.setPositionIncrement(0);
+      posLengthAtt.setPositionLength(2);
+    }
     index++;
   }
   
@@ -292,7 +341,13 @@ public final class CJKBigramFilter exten
    * inputs.
    */
   private boolean hasBufferedUnigram() {
-    return bufferLen == 1 && index == 0;
+    if (outputUnigrams) {
+      // when outputting unigrams always
+      return bufferLen - index == 1;
+    } else {
+      // otherwise it's only when we have a lone CJK character
+      return bufferLen == 1 && index == 0;
+    }
   }
 
   @Override
@@ -303,5 +358,6 @@ public final class CJKBigramFilter exten
     lastEndOffset = 0;
     loneState = null;
     exhausted = false;
+    ngramState = false;
   }
 }

Modified: lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKBigramFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKBigramFilterFactory.java?rev=1369502&r1=1369501&r2=1369502&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKBigramFilterFactory.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKBigramFilterFactory.java Sat Aug  4 22:37:14 2012
@@ -33,12 +33,13 @@ import org.apache.lucene.analysis.util.T
  *     &lt;filter class="solr.LowerCaseFilterFactory"/&gt;
  *     &lt;filter class="solr.CJKBigramFilterFactory" 
  *       han="true" hiragana="true" 
- *       katakana="true" hangul="true" /&gt;
+ *       katakana="true" hangul="true" outputUnigrams="false" /&gt;
  *   &lt;/analyzer&gt;
  * &lt;/fieldType&gt;</pre>
  */
 public class CJKBigramFilterFactory extends TokenFilterFactory {
   int flags;
+  boolean outputUnigrams;
 
   @Override
   public void init(Map<String,String> args) {
@@ -56,10 +57,11 @@ public class CJKBigramFilterFactory exte
     if (getBoolean("hangul", true)) {
       flags |= CJKBigramFilter.HANGUL;
     }
+    outputUnigrams = getBoolean("outputUnigrams", false);
   }
   
   @Override
   public TokenStream create(TokenStream input) {
-    return new CJKBigramFilter(input, flags);
+    return new CJKBigramFilter(input, flags, outputUnigrams);
   }
 }

Modified: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKBigramFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKBigramFilter.java?rev=1369502&r1=1369501&r2=1369502&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKBigramFilter.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKBigramFilter.java Sat Aug  4 22:37:14 2012
@@ -18,6 +18,7 @@ package org.apache.lucene.analysis.cjk;
  */
 
 import java.io.Reader;
+import java.util.Random;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
@@ -33,6 +34,15 @@ public class TestCJKBigramFilter extends
     }
   };
   
+  Analyzer unibiAnalyzer = new Analyzer() {
+    @Override
+    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+      Tokenizer t = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
+      return new TokenStreamComponents(t, 
+          new CJKBigramFilter(t, 0xff, true));
+    }
+  };
+  
   public void testHuge() throws Exception {
     assertAnalyzesTo(analyzer, "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた"
      + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた"
@@ -62,6 +72,96 @@ public class TestCJKBigramFilter extends
       }
     };
     assertAnalyzesTo(a, "多くの学生が試験に落ちた。",
-        new String[] { "多", "く", "の",  "学生", "が",  "試験", "に",  "落", "ち", "た" });
+        new String[] { "多", "く", "の",  "学生", "が",  "試験", "に",  "落", "ち", "た" },
+        new int[] { 0, 1, 2, 3, 5, 6, 8, 9, 10, 11 },
+        new int[] { 1, 2, 3, 5, 6, 8, 9, 10, 11, 12 },
+        new String[] { "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>", "<DOUBLE>", "<HIRAGANA>", "<DOUBLE>", 
+                       "<HIRAGANA>", "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>", "<SINGLE>" },
+        new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
+        new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 });
+  }
+  
+  public void testAllScripts() throws Exception {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer t = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
+        return new TokenStreamComponents(t, 
+            new CJKBigramFilter(t, 0xff, false));
+      }
+    };
+    assertAnalyzesTo(a, "多くの学生が試験に落ちた。",
+        new String[] { "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた" });
+  }
+  
+  public void testUnigramsAndBigramsAllScripts() throws Exception {
+    assertAnalyzesTo(unibiAnalyzer, "多くの学生が試験に落ちた。",
+        new String[] { 
+        "多", "多く", "く",  "くの", "の",  "の学", "学", "学生", "生", 
+        "生が", "が",  "が試", "試", "試験", "験", "験に", "に", 
+                "に落", "落", "落ち", "ち", "ちた", "た" 
+        },
+        new int[] { 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6,
+                    6, 7, 7, 8, 8, 9, 9, 10, 10, 11 },
+        new int[] { 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 
+                    8, 8, 9, 9, 10, 10, 11, 11, 12, 12 },
+        new String[] { "<SINGLE>", "<DOUBLE>", "<SINGLE>", "<DOUBLE>", "<SINGLE>", "<DOUBLE>", "<SINGLE>", "<DOUBLE>",
+                       "<SINGLE>", "<DOUBLE>", "<SINGLE>", "<DOUBLE>", "<SINGLE>", "<DOUBLE>", "<SINGLE>", "<DOUBLE>",
+                       "<SINGLE>", "<DOUBLE>", "<SINGLE>", "<DOUBLE>", "<SINGLE>", "<DOUBLE>", "<SINGLE>" },
+        new int[] { 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 
+                    0, 1, 0, 1, 0, 1, 0, 1, 0, 1 },
+        new int[] { 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 
+                    2, 1, 2, 1, 2, 1, 2, 1, 2, 1 }
+    );
+  }
+  
+  public void testUnigramsAndBigramsHanOnly() throws Exception {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer t = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
+        return new TokenStreamComponents(t, new CJKBigramFilter(t, CJKBigramFilter.HAN, true));
+      }
+    };
+    assertAnalyzesTo(a, "多くの学生が試験に落ちた。",
+        new String[] { "多", "く", "の",  "学", "学生", "生", "が",  "試", "試験", "験", "に",  "落", "ち", "た" },
+        new int[] { 0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 10, 11 },
+        new int[] { 1, 2, 3, 4, 5, 5, 6, 7, 8, 8, 9, 10, 11, 12 },
+        new String[] { "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>", "<SINGLE>", "<DOUBLE>", 
+                       "<SINGLE>", "<HIRAGANA>", "<SINGLE>", "<DOUBLE>", "<SINGLE>", 
+                       "<HIRAGANA>", "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>", "<SINGLE>" },
+        new int[] { 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1 },
+        new int[] { 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1 });
+  }
+  
+  public void testUnigramsAndBigramsHuge() throws Exception {
+    assertAnalyzesTo(unibiAnalyzer, "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた"
+     + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた"
+     + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた",
+       new String[] { 
+        "多", "多く", "く",  "くの", "の", "の学", "学", "学生", "生", "生が", "が",  "が試", "試", "試験", "験", "験に", "に",  "に落", "落", "落ち", "ち", "ちた", "た", "た多",
+        "多", "多く", "く",  "くの", "の", "の学", "学", "学生", "生", "生が", "が",  "が試", "試", "試験", "験", "験に", "に",  "に落", "落", "落ち", "ち", "ちた", "た", "た多",
+        "多", "多く", "く",  "くの", "の", "の学", "学", "学生", "生", "生が", "が",  "が試", "試", "試験", "験", "験に", "に",  "に落", "落", "落ち", "ち", "ちた", "た", "た多",
+        "多", "多く", "く",  "くの", "の", "の学", "学", "学生", "生", "生が", "が",  "が試", "試", "試験", "験", "験に", "に",  "に落", "落", "落ち", "ち", "ちた", "た", "た多",
+        "多", "多く", "く",  "くの", "の", "の学", "学", "学生", "生", "生が", "が",  "が試", "試", "試験", "験", "験に", "に",  "に落", "落", "落ち", "ち", "ちた", "た", "た多",
+        "多", "多く", "く",  "くの", "の", "の学", "学", "学生", "生", "生が", "が",  "が試", "試", "試験", "験", "験に", "に",  "に落", "落", "落ち", "ち", "ちた", "た", "た多",
+        "多", "多く", "く",  "くの", "の", "の学", "学", "学生", "生", "生が", "が",  "が試", "試", "試験", "験", "験に", "に",  "に落", "落", "落ち", "ち", "ちた", "た", "た多",
+        "多", "多く", "く",  "くの", "の", "の学", "学", "学生", "生", "生が", "が",  "が試", "試", "試験", "験", "験に", "に",  "に落", "落", "落ち", "ち", "ちた", "た", "た多",
+        "多", "多く", "く",  "くの", "の", "の学", "学", "学生", "生", "生が", "が",  "が試", "試", "試験", "験", "験に", "に",  "に落", "落", "落ち", "ち", "ちた", "た", "た多",
+        "多", "多く", "く",  "くの", "の", "の学", "学", "学生", "生", "生が", "が",  "が試", "試", "試験", "験", "験に", "に",  "に落", "落", "落ち", "ち", "ちた", "た", "た多",
+        "多", "多く", "く",  "くの", "の", "の学", "学", "学生", "生", "生が", "が",  "が試", "試", "試験", "験", "験に", "に",  "に落", "落", "落ち", "ち", "ちた", "た"
+       }    
+    );
+  }
+  
+  /** blast some random strings through the analyzer */
+  public void testRandomUnibiStrings() throws Exception {
+    checkRandomData(random(), unibiAnalyzer, 1000*RANDOM_MULTIPLIER);
+  }
+  
+  /** blast some enormous random strings through the analyzer */
+  public void testRandomUnibiHugeStrings() throws Exception {
+    Random random = random();
+    checkRandomData(random, unibiAnalyzer, 100*RANDOM_MULTIPLIER, 8192);
   }
 }

Modified: lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKBigramFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKBigramFilterFactory.java?rev=1369502&r1=1369501&r2=1369502&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKBigramFilterFactory.java (original)
+++ lucene/dev/trunk/lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKBigramFilterFactory.java Sat Aug  4 22:37:14 2012
@@ -52,4 +52,16 @@ public class TestCJKBigramFilterFactory 
     assertTokenStreamContents(stream,
         new String[] { "多", "く", "の",  "学生", "が",  "試験", "に",  "落", "ち", "た" });
   }
+  
+  public void testHanOnlyUnigrams() throws Exception {
+    Reader reader = new StringReader("多くの学生が試験に落ちた。");
+    CJKBigramFilterFactory factory = new CJKBigramFilterFactory();
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("hiragana", "false");
+    args.put("outputUnigrams", "true");
+    factory.init(args);
+    TokenStream stream = factory.create(new StandardTokenizer(TEST_VERSION_CURRENT, reader));
+    assertTokenStreamContents(stream,
+        new String[] { "多", "く", "の",  "学", "学生", "生", "が",  "試", "試験", "験", "に",  "落", "ち", "た" });
+  }
 }