You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by rm...@apache.org on 2012/01/04 06:47:30 UTC
svn commit: r1227053 - in
/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src:
java/org/apache/lucene/analysis/kuromoji/
java/org/apache/lucene/analysis/kuromoji/dict/
java/org/apache/lucene/analysis/kuromoji/tokenAttributes/ resources/ test/...
Author: rmuir
Date: Wed Jan 4 05:47:27 2012
New Revision: 1227053
URL: http://svn.apache.org/viewvc?rev=1227053&view=rev
Log:
LUCENE-3305: shave 2MB off the jar and expose baseForm
Added:
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenAttributes/
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenAttributes/BasicFormAttribute.java (with props)
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenAttributes/KuromojiAttributeImpl.java (with props)
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SimpleBench.java (with props)
Modified:
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionary.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/tid.dat
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/tid_map.dat
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TokenizerTest.java
lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java?rev=1227053&r1=1227052&r2=1227053&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/KuromojiTokenizer.java Wed Jan 4 05:47:27 2012
@@ -65,6 +65,7 @@ public final class KuromojiTokenizer ext
termAtt.copyBuffer(buffer, sentenceStart + position, length);
int startOffset = offset + sentenceStart + position;
offsetAtt.setOffset(correctOffset(startOffset), correctOffset(startOffset+length));
+ // nocommit: this is currently very very expensive and should be lazy!
typeAtt.setType(token.getPartOfSpeech());
tokenIndex++;
return true;
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java?rev=1227053&r1=1227052&r2=1227053&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java Wed Jan 4 05:47:27 2012
@@ -76,6 +76,13 @@ public class Token {
}
/**
+ * @return basic form or null if token is not inflected
+ */
+ public String getBasicForm() {
+ return dictionary.getBasicForm(wordId);
+ }
+
+ /**
* Returns true if this token is known word
* @return true if this token is in standard dictionary. false if not.
*/
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionary.java?rev=1227053&r1=1227052&r2=1227053&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionary.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionary.java Wed Jan 4 05:47:27 2012
@@ -71,6 +71,13 @@ public interface Dictionary {
public String getReading(int wordId);
/**
+ * Get base form of word
+ * @param wordId word ID of token
+ * @return Base form (only different for inflected words, otherwise null)
+ */
+ public String getBasicForm(int wordId);
+
+ /**
* Get feature(s) of tokens
* @param wordId word ID token
* @param fields array of index. If this is empty, return all features.
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary.java?rev=1227053&r1=1227052&r2=1227053&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary.java Wed Jan 4 05:47:27 2012
@@ -176,7 +176,12 @@ public class TokenInfoDictionary impleme
return getFeature(wordId, 0, 1, 2, 3);
}
-
+ @Override
+ public String getBasicForm(int wordId) {
+ String form = getFeature(wordId, 6);
+ return "*".equals(form) ? null : form;
+ }
+
/**
* Write dictionary in file
* Dictionary format is:
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java?rev=1227053&r1=1227052&r2=1227053&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java Wed Jan 4 05:47:27 2012
@@ -122,6 +122,11 @@ public class UserDictionary implements D
}
@Override
+ public String getBasicForm(int wordId) {
+ return null; // TODO: add support?
+ }
+
+ @Override
public String[] getAllFeaturesArray(int wordId) {
String allFeatures = featureEntries.get(wordId);
if(allFeatures == null) {
Added: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenAttributes/BasicFormAttribute.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenAttributes/BasicFormAttribute.java?rev=1227053&view=auto
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenAttributes/BasicFormAttribute.java (added)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenAttributes/BasicFormAttribute.java Wed Jan 4 05:47:27 2012
@@ -0,0 +1,32 @@
+package org.apache.lucene.analysis.kuromoji.tokenAttributes;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.kuromoji.Token;
+import org.apache.lucene.util.Attribute;
+
+/**
+ * Attribute for {@link Token#getBasicForm()}.
+ * <p>
+ * Note: depending on part of speech, this value may not be applicable,
+ * and will be null.
+ */
+public interface BasicFormAttribute extends Attribute {
+ public String getBasicForm();
+ public void setToken(Token token);
+}
Added: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenAttributes/KuromojiAttributeImpl.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenAttributes/KuromojiAttributeImpl.java?rev=1227053&view=auto
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenAttributes/KuromojiAttributeImpl.java (added)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/tokenAttributes/KuromojiAttributeImpl.java Wed Jan 4 05:47:27 2012
@@ -0,0 +1,52 @@
+package org.apache.lucene.analysis.kuromoji.tokenAttributes;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.kuromoji.Token;
+import org.apache.lucene.util.AttributeImpl;
+import org.apache.lucene.util.AttributeReflector;
+
+// TODO: we need to cache lazy state of POS/basicForm/reading/etc here (and implement all those attributes)
+// so that we don't do the (currently expensive) decoding of the metadata multiple times.
+public class KuromojiAttributeImpl extends AttributeImpl implements BasicFormAttribute, Cloneable {
+ private Token token;
+
+ public String getBasicForm() {
+ return token == null ? null : token.getBasicForm();
+ }
+
+ public void setToken(Token token) {
+ this.token = token;
+ }
+
+ @Override
+ public void clear() {
+ token = null;
+ }
+
+ @Override
+ public void copyTo(AttributeImpl target) {
+ BasicFormAttribute t = (BasicFormAttribute) target;
+ t.setToken(token);
+ }
+
+ @Override
+ public void reflectWith(AttributeReflector reflector) {
+ reflector.reflect(BasicFormAttribute.class, "basicForm", getBasicForm());
+ }
+}
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/tid.dat
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/tid.dat?rev=1227053&r1=1227052&r2=1227053&view=diff
==============================================================================
Binary files - no diff available.
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/tid_map.dat
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/tid_map.dat?rev=1227053&r1=1227052&r2=1227053&view=diff
==============================================================================
Binary files - no diff available.
Added: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SimpleBench.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SimpleBench.java?rev=1227053&view=auto
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SimpleBench.java (added)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/SimpleBench.java Wed Jan 4 05:47:27 2012
@@ -0,0 +1,53 @@
+package org.apache.lucene.analysis.kuromoji;
+
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.cjk.CJKAnalyzer;
+import org.apache.lucene.analysis.kuromoji.Tokenizer.Mode;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.util.Version;
+
+// nocommit: just for basic testing
+// java -cp modules/analysis/build/kuromoji/classes/java:modules/analysis/build/kuromoji/classes/test:modules/analysis/build/common/classes/java:lucene/build/classes/java org.apache.lucene.analysis.kuromoji.SimpleBench
+public class SimpleBench {
+
+ public static void main(String args[]) throws Exception {
+ org.apache.lucene.analysis.kuromoji.Tokenizer tokenizer =
+ org.apache.lucene.analysis.kuromoji.Tokenizer.builder().mode(Mode.NORMAL).build();
+ Analyzer a = new KuromojiAnalyzer(tokenizer);
+ Analyzer b = new CJKAnalyzer(Version.LUCENE_CURRENT);
+
+ /* slight warmup */
+ consume(a, "fsdfdsfsdfdsf sdfdsfds fsdf dsfds");
+ consume(a, "多くの学生が試験に落ちた。");
+
+ for (int i = 0; i < 4; i++) {
+ long ms = System.currentTimeMillis();
+ for (int j = 0; j < 50000; j++) {
+ consume(a, "魔女狩大将マシュー・ホプキンス。 魔女狩大将マシュー・ホプキンス。");
+ }
+ long ms2 = System.currentTimeMillis();
+ for (int j = 0; j < 50000; j++) {
+ consume(b, "魔女狩大将マシュー・ホプキンス。 魔女狩大将マシュー・ホプキンス。");
+ }
+ long ms3 = System.currentTimeMillis();
+ if (i != 0 /* exclude first round */) {
+ System.out.println("KUROMOJI: " + (ms2 - ms));
+ System.out.println("CJK: " + (ms3 - ms2));
+ }
+ }
+ }
+
+ public static void consume(Analyzer a, String s) throws Exception {
+ TokenStream ts = a.tokenStream("foo", new StringReader(s));
+ ts.reset();
+ ts.addAttribute(CharTermAttribute.class);
+ while (ts.incrementToken()) {
+ // nothing
+ }
+ ts.end();
+ ts.close();
+ }
+}
\ No newline at end of file
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TokenizerTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TokenizerTest.java?rev=1227053&r1=1227052&r2=1227053&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TokenizerTest.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/TokenizerTest.java Wed Jan 4 05:47:27 2012
@@ -76,6 +76,21 @@ public class TokenizerTest extends Lucen
assertEquals(tokens.get(5).getReading(), "。");
}
+ @Test
+ public void testBasicForms() {
+ List<Token> tokens = tokenizer.tokenize("それはまだ実験段階にあります。");
+ assertEquals(9, tokens.size());
+ assertNull(tokens.get(0).getBasicForm());
+ assertNull(tokens.get(1).getBasicForm());
+ assertNull(tokens.get(2).getBasicForm());
+ assertNull(tokens.get(3).getBasicForm());
+ assertNull(tokens.get(4).getBasicForm());
+ assertNull(tokens.get(5).getBasicForm());
+ assertEquals(tokens.get(6).getBasicForm(), "ある");
+ assertNull(tokens.get(7).getBasicForm());
+ assertNull(tokens.get(8).getBasicForm());
+ }
+
public void testBocchan() throws Exception {
doTestBocchan(1);
}
Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java?rev=1227053&r1=1227052&r2=1227053&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/TokenInfoDictionaryBuilder.java Wed Jan 4 05:47:27 2012
@@ -152,9 +152,45 @@ public class TokenInfoDictionaryBuilder
* 12 - surface form
* 13 - surface reading
*/
+
+ // TODO: encoding in the current way (basically as CSV) is transparent, but we can
+ // save a lot of space, RAM, and some cpu if we use a binary format (via ByteBuffer)
+ // instead: (example entry from IPADIC):
+ //
+ // 零れ話,1285,1285,5622,名詞,一般,*,*,*,*,零れ話,コボレバナシ,コボレバナシ
+ //
+ // 1. (名詞,一般,*,*,*,*) POS parts are huge but don't have many unique entries.
+ // they can be deduplicated into separate 'indexes' (arrays) and encoded as a byte or vint refs.
+ // 2. surface (零れ話) == basicForm (零れ話) very often, i think all words except adjectives/verbs.
+ // in these common cases we can use a bit (or even '*' as basic form) to mark this.
+ // 3. reading (コボレバナシ) == pronunciation (コボレバナシ) often, we can do the same as 2 here.
+ // 4. readings and pronunciations are often completely katakana, cuts their size in half
+ // to mark this as a bit and encode singlebyte instead of utf-16: writeByte(ch - 0x30A0)
+ //
+ // we can associate lazy decoding of these additional features in Token and speed things up a lot:
+ // in lucene-gosen we have an enum 'loadState' (3-stages of laziness):
+ // NONE, BASIC (pos and base forms), FULL (BASIC + reading/pronunciation).
+ // the idea is that reading/pronunciation is rarely used by consumers, so its worth it to lazily
+ // only load whats needed, and in the rarer case where someone asks for reading/pronunciation its
+ // not too slow to skip over the BASIC stuff we already read in previously.
+ // in some cases people just tokenize and don't even use POS or anything and this really helps then.
+ public String[] compactEntry(String[] features) {
+ // 2. when basicForm == surfaceForm, just indicate this with a * instead of duplicating it.
+ if (features[0].equals(features[10])) {
+ features[10] = "*";
+ }
+ // 3. when pronunciation == reading, just indicate this with a * instead of duplicating it.
+ // TODO: is there also additional redundancy in the unidic case here that we can improve?
+ if (features[12].equals(features[11])) {
+ features[12] = "*";
+ }
+
+ return features;
+ }
+
public String[] formatEntry(String[] features) {
if (this.format == DictionaryFormat.IPADIC) {
- return features;
+ return compactEntry(features);
} else {
String[] features2 = new String[13];
features2[0] = features[0];
@@ -178,7 +214,7 @@ public class TokenInfoDictionaryBuilder
features2[11] = features[13];
features2[12] = features[13];
}
- return features2;
+ return compactEntry(features2);
}
}