You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/01/04 06:47:30 UTC

svn commit: r1227053 - in /lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src: java/org/apache/lucene/analysis/kuromoji/ java/org/apache/lucene/analysis/kuromoji/dict/ java/org/apache/lucene/analysis/kuromoji/tokenAttributes/ resources/ test/...

Author: rmuir
Date: Wed Jan  4 05:47:27 2012
New Revision: 1227053

URL: http://svn.apache.org/viewvc?rev=1227053&view=rev
Log:
LUCENE-3305: shave 2MB off the jar and expose baseForm

Added:
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenAttributes/
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenAttributes/BasicFormAttribute.java   (with props)
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenAttributes/KuromojiAttributeImpl.java   (with props)
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SimpleBench.java   (with props)
Modified:
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionary.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/tid.dat
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/tid_map.dat
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TokenizerTest.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java?rev=1227053&r1=1227052&r2=1227053&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java Wed Jan  4 05:47:27 2012
@@ -65,6 +65,7 @@ public final class KuromojiTokenizer ext
     termAtt.copyBuffer(buffer, sentenceStart + position, length);
     int startOffset = offset + sentenceStart + position;
     offsetAtt.setOffset(correctOffset(startOffset), correctOffset(startOffset+length));
+    // nocommit: this is currently very very expensive and should be lazy!
     typeAtt.setType(token.getPartOfSpeech());
     tokenIndex++;
     return true;

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java?rev=1227053&r1=1227052&r2=1227053&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java Wed Jan  4 05:47:27 2012
@@ -76,6 +76,13 @@ public class Token {
   }
   
   /**
+   * @return basic form that this token's dictionary reports for its word ID, or null if token is not inflected
+   */
+  public String getBasicForm() {
+    return dictionary.getBasicForm(wordId);
+  }
+  
+  /**
    * Returns true if this token is known word
    * @return true if this token is in standard dictionary. false if not.
    */

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionary.java?rev=1227053&r1=1227052&r2=1227053&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionary.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionary.java Wed Jan  4 05:47:27 2012
@@ -71,6 +71,13 @@ public interface Dictionary {
   public String getReading(int wordId);
   
   /**
+   * Get base (basic) form of the word with the given ID.
+   * @param wordId word ID of token
+   * @return base form, or null when none is reported (it only differs from the surface form for inflected words)
+   */
+  public String getBasicForm(int wordId);
+  
+  /**
    * Get feature(s) of tokens
    * @param wordId word ID token
    * @param fields array of index. If this is empty, return all features.

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary.java?rev=1227053&r1=1227052&r2=1227053&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary.java Wed Jan  4 05:47:27 2012
@@ -176,7 +176,12 @@ public class TokenInfoDictionary impleme
     return getFeature(wordId, 0, 1, 2, 3);
   }
   
-  
+  @Override
+  public String getBasicForm(int wordId) {
+    final String stored = getFeature(wordId, 6); // a stored "*" is reported to callers as null
+    return !"*".equals(stored) ? stored : null;
+  }
+
   /**
    * Write dictionary in file
    * Dictionary format is:

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java?rev=1227053&r1=1227052&r2=1227053&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java Wed Jan  4 05:47:27 2012
@@ -122,6 +122,11 @@ public class UserDictionary implements D
   }
   
   @Override
+  public String getBasicForm(int wordId) {
+    return null; // TODO: add support? Until then every wordId yields null (no basic form) here.
+  }
+  
+  @Override
   public String[] getAllFeaturesArray(int wordId) {
     String allFeatures = featureEntries.get(wordId);
     if(allFeatures == null) {

Added: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenAttributes/BasicFormAttribute.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenAttributes/BasicFormAttribute.java?rev=1227053&view=auto
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenAttributes/BasicFormAttribute.java (added)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenAttributes/BasicFormAttribute.java Wed Jan  4 05:47:27 2012
@@ -0,0 +1,32 @@
+package org.apache.lucene.analysis.kuromoji.tokenAttributes;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.kuromoji.Token;
+import org.apache.lucene.util.Attribute;
+
+/**
+ * Attribute exposing the value of {@link Token#getBasicForm()}.
+ * <p>
+ * Note: the basic form does not apply to every part of speech; where it
+ * does not apply, {@link #getBasicForm()} returns null.
+ */
+public interface BasicFormAttribute extends Attribute {
+  String getBasicForm();
+  void setToken(Token token);
+}

Added: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenAttributes/KuromojiAttributeImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenAttributes/KuromojiAttributeImpl.java?rev=1227053&view=auto
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenAttributes/KuromojiAttributeImpl.java (added)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenAttributes/KuromojiAttributeImpl.java Wed Jan  4 05:47:27 2012
@@ -0,0 +1,52 @@
+package org.apache.lucene.analysis.kuromoji.tokenAttributes;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.kuromoji.Token;
+import org.apache.lucene.util.AttributeImpl;
+import org.apache.lucene.util.AttributeReflector;
+
+// TODO: cache the lazily decoded state (POS/basicForm/reading/etc) here and implement all of those
+// attributes, so the (currently expensive) metadata decoding is not repeated for the same token.
+public class KuromojiAttributeImpl extends AttributeImpl implements BasicFormAttribute, Cloneable {
+  private Token token;
+  
+  public String getBasicForm() {
+    return token != null ? token.getBasicForm() : null;
+  }
+  
+  public void setToken(Token token) {
+    this.token = token;
+  }
+
+  @Override
+  public void clear() {
+    token = null;
+  }
+
+  @Override
+  public void copyTo(AttributeImpl target) {
+    // hand the same Token reference to the target; nothing is decoded here
+    ((BasicFormAttribute) target).setToken(token);
+  }
+  
+  @Override
+  public void reflectWith(AttributeReflector reflector) {
+    reflector.reflect(BasicFormAttribute.class, "basicForm", getBasicForm());
+  }
+}

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/tid.dat
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/tid.dat?rev=1227053&r1=1227052&r2=1227053&view=diff
==============================================================================
Binary files - no diff available.

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/tid_map.dat
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/tid_map.dat?rev=1227053&r1=1227052&r2=1227053&view=diff
==============================================================================
Binary files - no diff available.

Added: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SimpleBench.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SimpleBench.java?rev=1227053&view=auto
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SimpleBench.java (added)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SimpleBench.java Wed Jan  4 05:47:27 2012
@@ -0,0 +1,53 @@
+package org.apache.lucene.analysis.kuromoji;
+
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.cjk.CJKAnalyzer;
+import org.apache.lucene.analysis.kuromoji.Tokenizer.Mode;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.util.Version;
+
+// nocommit: just for basic testing (crude timing of KuromojiAnalyzer vs CJKAnalyzer, printed in ms)
+// java -cp modules/analysis/build/kuromoji/classes/java:modules/analysis/build/kuromoji/classes/test:modules/analysis/build/common/classes/java:lucene/build/classes/java org.apache.lucene.analysis.kuromoji.SimpleBench
+public class SimpleBench {
+  public static void main(String[] args) throws Exception {
+    org.apache.lucene.analysis.kuromoji.Tokenizer tokenizer = 
+        org.apache.lucene.analysis.kuromoji.Tokenizer.builder().mode(Mode.NORMAL).build();
+    Analyzer a = new KuromojiAnalyzer(tokenizer);
+    Analyzer b = new CJKAnalyzer(Version.LUCENE_CURRENT);
+    
+    /* slight warmup */
+    consume(a, "fsdfdsfsdfdsf sdfdsfds fsdf dsfds");
+    consume(a, "多くの学生が試験に落ちた。");
+    
+    for (int i = 0; i < 4; i++) {
+      long start = System.nanoTime(); // elapsed-time clock, not tied to wall-clock adjustments
+      for (int j = 0; j < 50000; j++) {
+        consume(a, "魔女狩大将マシュー・ホプキンス。 魔女狩大将マシュー・ホプキンス。");
+      }
+      long afterKuromoji = System.nanoTime();
+      for (int j = 0; j < 50000; j++) {
+        consume(b, "魔女狩大将マシュー・ホプキンス。 魔女狩大将マシュー・ホプキンス。");
+      }
+      long afterCjk = System.nanoTime();
+      if (i != 0 /* exclude first round */) {
+        System.out.println("KUROMOJI: " + ((afterKuromoji - start) / 1000000L));
+        System.out.println("CJK: " + ((afterCjk - afterKuromoji) / 1000000L));
+      }
+    }
+  }
+  
+  public static void consume(Analyzer a, String s) throws Exception {
+    TokenStream ts = a.tokenStream("foo", new StringReader(s));
+    try {
+      ts.reset();
+      ts.addAttribute(CharTermAttribute.class);
+      while (ts.incrementToken()) { /* nothing: just drain the stream */ }
+      ts.end();
+    } finally {
+      ts.close(); // release the stream even when tokenization throws
+    }
+  }
+}
\ No newline at end of file

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TokenizerTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TokenizerTest.java?rev=1227053&r1=1227052&r2=1227053&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TokenizerTest.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TokenizerTest.java Wed Jan  4 05:47:27 2012
@@ -76,6 +76,21 @@ public class TokenizerTest extends Lucen
     assertEquals(tokens.get(5).getReading(), "。");
   }
   
+  @Test
+  public void testBasicForms() {
+    List<Token> tokens = tokenizer.tokenize("それはまだ実験段階にあります。");
+    assertEquals(9, tokens.size());
+    // Only token 6 is expected to carry a basic form; every other token must report null.
+    for (int i = 0; i < tokens.size(); i++) {
+      if (i == 6) {
+        // JUnit order is (expected, actual), so a failure message reads the right way round
+        assertEquals("ある", tokens.get(i).getBasicForm());
+      } else {
+        assertNull("token " + i, tokens.get(i).getBasicForm());
+      }
+    }
+  }
+  
   public void testBocchan() throws Exception {
     doTestBocchan(1);
   }

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java?rev=1227053&r1=1227052&r2=1227053&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java Wed Jan  4 05:47:27 2012
@@ -152,9 +152,45 @@ public class TokenInfoDictionaryBuilder 
    * 12	- surface form
    * 13	- surface reading
    */
+  
+  // TODO: encoding in the current way (basically as CSV) is transparent, but we can
+  // save a lot of space, RAM, and some cpu if we use a binary format (via ByteBuffer) 
+  // instead: (example entry from IPADIC):
+  //
+  // 零れ話,1285,1285,5622,名詞,一般,*,*,*,*,零れ話,コボレバナシ,コボレバナシ
+  //
+  // 1. (名詞,一般,*,*,*,*) POS parts are huge but don't have many unique entries. 
+  //    they can be deduplicated into separate 'indexes' (arrays) and encoded as a byte or vint refs.
+  // 2. surface (零れ話) == basicForm (零れ話) very often, I think for all words except adjectives/verbs.
+  //    in these common cases we can use a bit (or even '*' as basic form) to mark this.
+  // 3. reading (コボレバナシ) == pronunciation (コボレバナシ) often, we can do the same as 2 here.
+  // 4. readings and pronunciations are often completely katakana, cuts their size in half 
+  //    to mark this as a bit and encode singlebyte instead of utf-16: writeByte(ch - 0x30A0)
+  // 
+  // we can associate lazy decoding of these additional features in Token and speed things up a lot:
+  // in lucene-gosen we have an enum 'loadState' (3-stages of laziness):
+  //   NONE, BASIC (pos and base forms), FULL (BASIC + reading/pronunciation).
+  // the idea is that reading/pronunciation is rarely used by consumers, so it's worth it to lazily
+  // load only what's needed, and in the rarer case where someone asks for reading/pronunciation it's
+  // not too slow to skip over the BASIC stuff we already read in previously.
+  // in some cases people just tokenize and don't even use POS or anything, and this really helps then.
+  public String[] compactEntry(String[] features) {
+    // Compacts in place: the given array is rewritten and that same instance is returned.
+    // 2. a basicForm (index 10) that repeats the surfaceForm (index 0) collapses to a "*" marker;
+    //    any other basicForm is kept as it is.
+    final String surfaceForm = features[0];
+    features[10] = surfaceForm.equals(features[10]) ? "*" : features[10];
+    // 3. a pronunciation (index 12) that repeats the reading (index 11) collapses the same way.
+    // TODO: is there also additional redundancy in the unidic case here that we can improve? 
+    final String pronunciation = features[12];
+    features[12] = pronunciation.equals(features[11]) ? "*" : pronunciation;
+    
+    return features;
+  }
+  
   public String[] formatEntry(String[] features) {
     if (this.format == DictionaryFormat.IPADIC) {
-      return features;
+      return compactEntry(features);
     } else {
       String[] features2 = new String[13];
       features2[0] = features[0];
@@ -178,7 +214,7 @@ public class TokenInfoDictionaryBuilder 
         features2[11] = features[13];
         features2[12] = features[13];
       }			
-      return features2;
+      return compactEntry(features2);
     }
   }