You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/01/04 17:29:42 UTC

svn commit: r1227211 - in /lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji: ./ tokenAttributes/

Author: rmuir
Date: Wed Jan  4 16:29:41 2012
New Revision: 1227211

URL: http://svn.apache.org/viewvc?rev=1227211&view=rev
Log:
LUCENE-3305: lazy metadata decode (not the best yet, but a 4x perf improvement if you don't use those atts)

Added:
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenAttributes/BasicFormAttributeImpl.java
      - copied, changed from r1227053, lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenAttributes/KuromojiAttributeImpl.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenAttributes/PartOfSpeechAttribute.java   (with props)
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenAttributes/PartOfSpeechAttributeImpl.java   (with props)
Removed:
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenAttributes/KuromojiAttributeImpl.java
Modified:
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java?rev=1227211&r1=1227210&r2=1227211&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java Wed Jan  4 16:29:41 2012
@@ -22,16 +22,18 @@ import java.text.BreakIterator;
 import java.util.List;
 import java.util.Locale;
 
+import org.apache.lucene.analysis.kuromoji.tokenAttributes.BasicFormAttribute;
+import org.apache.lucene.analysis.kuromoji.tokenAttributes.PartOfSpeechAttribute;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.apache.lucene.analysis.util.SegmentingTokenizerBase;
 
 public final class KuromojiTokenizer extends SegmentingTokenizerBase {
   private static final BreakIterator proto = BreakIterator.getSentenceInstance(Locale.JAPAN);
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
-  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
+  private final BasicFormAttribute basicFormAtt = addAttribute(BasicFormAttribute.class);
+  private final PartOfSpeechAttribute posAtt = addAttribute(PartOfSpeechAttribute.class);
   private final org.apache.lucene.analysis.kuromoji.Tokenizer tokenizer;
   
   private List<Token> tokens; 
@@ -65,8 +67,8 @@ public final class KuromojiTokenizer ext
     termAtt.copyBuffer(buffer, sentenceStart + position, length);
     int startOffset = offset + sentenceStart + position;
     offsetAtt.setOffset(correctOffset(startOffset), correctOffset(startOffset+length));
-    // nocommit: this is currently very very expensive and should be lazy!
-    typeAtt.setType(token.getPartOfSpeech());
+    basicFormAtt.setToken(token);
+    posAtt.setToken(token);
     tokenIndex++;
     return true;
   }

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java?rev=1227211&r1=1227210&r2=1227211&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java Wed Jan  4 16:29:41 2012
@@ -20,6 +20,8 @@ package org.apache.lucene.analysis.kurom
 import org.apache.lucene.analysis.kuromoji.dict.Dictionary;
 import org.apache.lucene.analysis.kuromoji.viterbi.ViterbiNode.Type;
 
+// TODO: somehow this thing needs to keep state, so that once it decodes metadata
+// it never does it again.
 public class Token {
   private final Dictionary dictionary;
   
@@ -47,21 +49,6 @@ public class Token {
   }
   
   /**
-   * @return all features
-   */
-  public String getAllFeatures() {
-    return dictionary.getAllFeatures(wordId);
-  }
-  
-  /**
-   * @return all features as array
-   */
-  public String[] getAllFeaturesArray() {
-    return dictionary.getAllFeaturesArray(wordId);
-  }
-  
-  
-  /**
    * @return reading. null if token doesn't have reading.
    */
   public String getReading() {

Copied: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenAttributes/BasicFormAttributeImpl.java (from r1227053, lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenAttributes/KuromojiAttributeImpl.java)
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenAttributes/BasicFormAttributeImpl.java?p2=lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenAttributes/BasicFormAttributeImpl.java&p1=lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenAttributes/KuromojiAttributeImpl.java&r1=1227053&r2=1227211&rev=1227211&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenAttributes/KuromojiAttributeImpl.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenAttributes/BasicFormAttributeImpl.java Wed Jan  4 16:29:41 2012
@@ -21,9 +21,7 @@ import org.apache.lucene.analysis.kuromo
 import org.apache.lucene.util.AttributeImpl;
 import org.apache.lucene.util.AttributeReflector;
 
-// TODO: we need to cache lazy state of POS/basicForm/reading/etc here (and implement all those attributes)
-// so that we don't do the (currently expensive) decoding of the metadata multiple times.
-public class KuromojiAttributeImpl extends AttributeImpl implements BasicFormAttribute, Cloneable {
+public class BasicFormAttributeImpl extends AttributeImpl implements BasicFormAttribute, Cloneable {
   private Token token;
   
   public String getBasicForm() {

Added: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenAttributes/PartOfSpeechAttribute.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenAttributes/PartOfSpeechAttribute.java?rev=1227211&view=auto
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenAttributes/PartOfSpeechAttribute.java (added)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenAttributes/PartOfSpeechAttribute.java Wed Jan  4 16:29:41 2012
@@ -0,0 +1,29 @@
+package org.apache.lucene.analysis.kuromoji.tokenAttributes;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.kuromoji.Token;
+import org.apache.lucene.util.Attribute;
+
+/**
+ * Attribute exposing a {@link Token}'s part of speech, read via {@link Token#getPartOfSpeech()}.
+ */
+public interface PartOfSpeechAttribute extends Attribute {
+  public String getPartOfSpeech(); // part of speech of the token supplied through setToken
+  public void setToken(Token token); // token from which the part of speech will be read
+}

Added: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenAttributes/PartOfSpeechAttributeImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenAttributes/PartOfSpeechAttributeImpl.java?rev=1227211&view=auto
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenAttributes/PartOfSpeechAttributeImpl.java (added)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenAttributes/PartOfSpeechAttributeImpl.java Wed Jan  4 16:29:41 2012
@@ -0,0 +1,51 @@
+package org.apache.lucene.analysis.kuromoji.tokenAttributes;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.kuromoji.Token;
+import org.apache.lucene.util.AttributeImpl;
+import org.apache.lucene.util.AttributeReflector;
+
+public class PartOfSpeechAttributeImpl extends AttributeImpl implements PartOfSpeechAttribute, Cloneable { // keeps the Token and asks it for the part of speech only on request
+  private Token token; // source token; null until setToken is called, and again after clear()
+  
+  public String getPartOfSpeech() {
+    return token == null ? null : token.getPartOfSpeech(); // no token set -> null; otherwise delegate to the token now
+  }
+  
+  public void setToken(Token token) {
+    this.token = token; // only the reference is stored; nothing is read from the token here
+  }
+
+  @Override
+  public void clear() {
+    token = null; // drop the reference so getPartOfSpeech() returns null
+  }
+
+  @Override
+  public void copyTo(AttributeImpl target) {
+    PartOfSpeechAttribute t = (PartOfSpeechAttribute) target; // target must implement PartOfSpeechAttribute, otherwise this cast throws
+    t.setToken(token); // target receives the same Token reference, not a copy
+  }
+  
+  @Override
+  public void reflectWith(AttributeReflector reflector) {
+    String partOfSpeech = getPartOfSpeech(); // null when no token is set
+    reflector.reflect(PartOfSpeechAttribute.class, "partOfSpeech", partOfSpeech); // reported under the key "partOfSpeech"
+  }
+}
\ No newline at end of file