You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by si...@apache.org on 2009/07/01 12:32:24 UTC
svn commit: r790102 - in /lucene/java/trunk/contrib/analyzers/src:
java/org/apache/lucene/analysis/cn/ java/org/apache/lucene/analysis/cn/smart/
java/org/apache/lucene/analysis/cn/smart/hhmm/
resources/org/apache/lucene/analysis/cn/
Author: simonw
Date: Wed Jul 1 10:32:23 2009
New Revision: 790102
URL: http://svn.apache.org/viewvc?rev=790102&view=rev
Log:
LUCENE-1722: SmartChineseAnalyzer JavaDoc improvements - Replacing Chinese JavaDoc with English version. Robert Muir via Simon Willnauer
Modified:
lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/SmartChineseAnalyzer.java
lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/package.html
lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/AnalyzerProfile.java
lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/CharType.java
lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java
lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/Utility.java
lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/WordSegmenter.java
lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/WordTokenizer.java
lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/WordType.java
lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/AbstractDictionary.java
lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BiSegGraph.java
lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BigramDictionary.java
lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/HHMMSegmenter.java
lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/PathNode.java
lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegGraph.java
lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegToken.java
lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegTokenFilter.java
lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegTokenPair.java
lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/WordDictionary.java
lucene/java/trunk/contrib/analyzers/src/resources/org/apache/lucene/analysis/cn/stopwords.txt
Modified: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/SmartChineseAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/SmartChineseAnalyzer.java?rev=790102&r1=790101&r2=790102&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/SmartChineseAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/SmartChineseAnalyzer.java Wed Jul 1 10:32:23 2009
@@ -33,23 +33,26 @@
import org.apache.lucene.analysis.cn.smart.WordSegmenter;
import org.apache.lucene.analysis.cn.smart.WordTokenizer;
+import org.apache.lucene.analysis.cn.smart.AnalyzerProfile; // for javadoc
+
/**
- *
- * SmartChineseAnalyzer æ¯ä¸ä¸ªæºè½ä¸æåè¯æ¨¡åï¼ è½å¤å©ç¨æ¦ç对æ±è¯å¥åè¿è¡æä¼ååï¼
- * 并å
åµè±ætokenizerï¼è½ææå¤çä¸è±ææ··åçææ¬å
容ã
- *
- * å®çåçåºäºèªç¶è¯è¨å¤çé¢åçé马å°ç§å¤«æ¨¡å(HMM)ï¼ å©ç¨å¤§éè¯æåºçè®ç»æ¥ç»è®¡æ±è¯è¯æ±çè¯é¢å跳转æ¦çï¼
- * ä»èæ ¹æ®è¿äºç»è®¡ç»æ对æ´ä¸ªæ±è¯å¥å计ç®æä¼¼ç¶(likelihood)çååã
- *
- * å 为æºè½åè¯éè¦è¯å
¸æ¥ä¿åè¯æ±çç»è®¡å¼ï¼SmartChineseAnalyzerçè¿è¡éè¦æå®è¯å
¸ä½ç½®ï¼å¦ä½æå®è¯å
¸ä½ç½®è¯·åè
- * org.apache.lucene.analysis.cn.smart.AnalyzerProfile
- *
- * SmartChineseAnalyzerçç®æ³åè¯æåºè¯å
¸æ¥èªäºictclas1.0项ç®(http://www.ictclas.org)ï¼
- * å
¶ä¸è¯å
¸å·²è·åwww.ictclas.orgçapache license v2(APLv2)çææãå¨éµå¾ªAPLv2çæ¡ä»¶ä¸ï¼æ¬¢è¿ç¨æ·ä½¿ç¨ã
- * å¨æ¤æè°¢www.ictclas.org以åictclasåè¯è½¯ä»¶çå·¥ä½äººåçæ ç§å¥ç®ï¼
- *
- * @see org.apache.lucene.analysis.cn.smart.AnalyzerProfile
- *
+ * <p>
+ * SmartChineseAnalyzer is an analyzer for Chinese or mixed Chinese-English text.
+ * The analyzer uses probabilistic knowledge to find the optimal word segmentation for Simplified Chinese text.
+ * The text is first broken into sentences, then each sentence is segmented into words.
+ * </p>
+ * <p>
+ * Segmentation is based upon the <a href="http://en.wikipedia.org/wiki/Hidden_Markov_Model">Hidden Markov Model</a>.
+ * A large training corpus was used to calculate Chinese word frequency probability.
+ * </p>
+ * <p>
+ * This analyzer requires a dictionary to provide statistical data.
+ * To specify the location of the dictionary data, refer to {@link AnalyzerProfile}
+ * </p>
+ * <p>
+ * The included dictionary data is from <a href="http://www.ictclas.org">ICTCLAS1.0</a>.
+ * Thanks to ICTCLAS for their hard work, and for contributing the data under the Apache 2 License!
+ * </p>
*/
public class SmartChineseAnalyzer extends Analyzer {
@@ -57,15 +60,23 @@
private WordSegmenter wordSegment;
+ /**
+ * Create a new SmartChineseAnalyzer, using the default stopword list.
+ */
public SmartChineseAnalyzer() {
this(true);
}
/**
- * SmartChineseAnalyzerå
é¨å¸¦æé»è®¤åæ¢è¯åºï¼ä¸»è¦æ¯æ ç¹ç¬¦å·ãå¦æä¸å¸æç»æä¸åºç°æ ç¹ç¬¦å·ï¼
- * å¯ä»¥å°useDefaultStopWords设为trueï¼ useDefaultStopWords为falseæ¶ä¸ä½¿ç¨ä»»ä½åæ¢è¯
+ * <p>
+ * Create a new SmartChineseAnalyzer, optionally using the default stopword list.
+ * </p>
+ * <p>
+ * The included default stopword list is simply a list of punctuation.
+ * If you do not use this list, punctuation will not be removed from the text!
+ * </p>
*
- * @param useDefaultStopWords
+ * @param useDefaultStopWords true to use the default stopword list.
*/
public SmartChineseAnalyzer(boolean useDefaultStopWords) {
if (useDefaultStopWords) {
@@ -76,10 +87,14 @@
}
/**
- * 使ç¨èªå®ä¹çèä¸ä½¿ç¨å
ç½®çåæ¢è¯åºï¼åæ¢è¯å¯ä»¥ä½¿ç¨SmartChineseAnalyzer.loadStopWords(InputStream)å è½½
- *
- * @param stopWords
- * @see SmartChineseAnalyzer.loadStopWords(InputStream)
+ * <p>
+ * Create a new SmartChineseAnalyzer, using the provided {@link Set} of stopwords.
+ * </p>
+ * <p>
+ * Note: the set should include punctuation, unless you want to index punctuation!
+ * </p>
+ * @param stopWords {@link Set} of stopwords to use.
+ * @see SmartChineseAnalyzer#loadStopWords(InputStream)
*/
public SmartChineseAnalyzer(Set stopWords) {
this.stopWords = stopWords;
@@ -90,8 +105,8 @@
TokenStream result = new SentenceTokenizer(reader);
result = new WordTokenizer(result, wordSegment);
// result = new LowerCaseFilter(result);
- // ä¸åéè¦LowerCaseFilterï¼å 为SegTokenFilterå·²ç»å°ææè±æå符转æ¢æå°å
- // stemå¤ªä¸¥æ ¼äº, This is not bug, this feature:)
+ // LowerCaseFilter is not needed, as SegTokenFilter lowercases Basic Latin text.
+ // The porter stemming is too strict, this is not a bug, this is a feature:)
result = new PorterStemFilter(result);
if (stopWords != null) {
result = new StopFilter(result, stopWords, false);
@@ -100,13 +115,17 @@
}
/**
- * ä»åç¨è¯æ件ä¸å è½½åç¨è¯ï¼ åç¨è¯æ件æ¯æ®éUTF-8ç¼ç çææ¬æä»¶ï¼ æ¯ä¸è¡æ¯ä¸ä¸ªåç¨è¯ï¼æ³¨éå©ç¨â//âï¼ åç¨è¯ä¸å
æ¬ä¸ææ ç¹ç¬¦å·ï¼ ä¸æç©ºæ ¼ï¼
- * 以å使ç¨ç太é«è对索å¼æä¹ä¸å¤§çè¯ã
+ * Utility function to return a {@link Set} of stopwords from a UTF-8 encoded {@link InputStream}.
+ * The comment "//" can be used in the stopword list.
*
- * @param input åç¨è¯æ件
- * @return åç¨è¯ç»æçHashSet
+ * @param input {@link InputStream} of UTF-8 encoded stopwords
+ * @return {@link Set} of stopwords.
*/
public static Set loadStopWords(InputStream input) {
+ /*
+ * Note: WordListLoader is not used here because this method allows for inline "//" comments.
+ * WordListLoader will only filter out these comments if they are on a separate line.
+ */
String line;
Set stopWords = new HashSet();
try {
Modified: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/package.html
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/package.html?rev=790102&r1=790101&r2=790102&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/package.html (original)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/package.html Wed Jul 1 10:32:23 2009
@@ -1,51 +1,22 @@
<html>
<head></head>
<body>
-Analyzer for Chinese.
+Analyzers for Chinese.
+<p>
+Three analyzers are provided for Chinese, each of which treats Chinese text in a different way.
+<ul>
+ <li>ChineseAnalyzer: Index unigrams (individual Chinese characters) as a token.
+ <li>CJKAnalyzer: Index bigrams (overlapping groups of two adjacent Chinese characters) as tokens.
+ <li>SmartChineseAnalyzer: Index words (attempt to segment Chinese text into words) as tokens.
+</ul>
-
-<h2>About SmartChineseAnalyzer</h2>
-<p>SmartChineseAnalyzer æ¯ä¸ä¸ªæºè½ä¸æåè¯æ¨¡åï¼ ä¸ ChineseAnalyzer ï¼ååæ¯ä¸ªæ±åï¼å
-CJKAnalyzer ï¼ç»åæ¯ä¸¤ä¸ªæ±åï¼ä¸åï¼ å®è½å¤å©ç¨æ¦ç对æ±è¯å¥åè¿è¡æä¼ååï¼ å¹¶å
åµè±ætokenizerï¼
-è½ææå¤çä¸è±ææ··åçææ¬å
容ãç®åSmartChineseAnalyzerçè¯å
¸åºåªæ¯æç®ä½ä¸æã</p>
-
-<p>å®çåçåºäºèªç¶è¯è¨å¤çé¢åçé马å°ç§å¤«æ¨¡å(HMM)ï¼ å©ç¨å¤§éè¯æåºçè®ç»æ¥ç»è®¡æ±è¯è¯æ±çè¯é¢å跳转æ¦çï¼
-ä»èæ ¹æ®è¿äºç»è®¡ç»æ对æ´ä¸ªæ±è¯å¥å计ç®æä¼¼ç¶(likelihood)çååã</p>
-
-<p>ä¸ç§åè¯æ¨¡åçåè¯ç»ææ¯è¾, ç±æ¤å¯ä»¥çåºæºè½åè¯æ´ç¬¦åå¥åçåæ¬è¯ä¹ï¼ ä»èæé«æç´¢çåç¡®çã
-<pre>è¯å¥ï¼ ææ¯ä¸å½äºº</pre>
+Example phrase: "ææ¯ä¸å½äºº"
<ol>
- <li>SmartChineseAnalyzer: æï¼æ¯ï¼ä¸å½ï¼äºº</li>
<li>ChineseAnalyzer: æï¼æ¯ï¼ä¸ï¼å½ï¼äºº</li>
<li>CJKAnalyzer: ææ¯ï¼æ¯ä¸ï¼ä¸å½ï¼å½äºº</li>
+ <li>SmartChineseAnalyzer: æï¼æ¯ï¼ä¸å½ï¼äºº</li>
</ol>
</p>
-<h3>åè¯è¯å
¸ç设置</h3>
-<p>å 为æºè½åè¯éè¦è¯å
¸æ¥ä¿åè¯æ±çç»è®¡å¼ï¼é»è®¤æ
åµä¸ï¼SmartChineseAnalyzer使ç¨å
ç½®çè¯å
¸åºï¼å½éè¦æå®çè¯å
¸åºæ¶ï¼éè¦æå®è¯å
¸ä½ç½®ï¼å¦ä½æå®è¯å
¸ä½ç½®è¯·åè
-org.apache.lucene.analysis.cn.smart.AnalyzerProfileã</p>
-
-<p><b>è¯åºçä¸è½½å°å为ï¼<a
- href="http://code.google.com/p/imdict-chinese-analyzer/downloads/list">http://code.google.com/p/imdict-chinese-analyzer/downloads/list</a>
-</b> ä¸è½½æ件analysis-data.zipä¿åå°æ¬å°ï¼è§£åå³å¯ä½¿ç¨ã</p>
-
-<p>æç®åçæå®è¯å
¸åºçåæ³å°±æ¯è¿è¡æ¶å ä¸åæ°-Danalysis.data.dir
-<pre>å¦ï¼ java -Danalysis.data.dir=/path/to/analysis-data com.example.YourApplication</pre>
-</p>
-
-<h3>çæ¬è¦æ±</h3>
-<p>SmartChineseAnalyzerçJVMè¦æ±java 1.4å以ä¸çæ¬ï¼Lucene
-è¦æ±2.4.0å以ä¸çæ¬ï¼Lucene 2.3.Xçåºè¯¥ä¹å¯ä»¥ä½¿ç¨ï¼ä½æªç»æµè¯ï¼æéè¦çç¨æ·å¯èªè¡æµè¯ã</p>
-
-<h3>æºæ件åææ¬ç¼ç </h3>
-é¤ç¹å®çäºè¿å¶ç æ件å¤ï¼SmartChineseAnalyzerçææææ¬åJavaæºç é½éç¨UTF-8ç¼ç ï¼
-å æ¤å¨è¯»åææ¬åç¼è¯Javaæºç æ¯è¯·æ³¨æéç¨æ£ç¡®çæ¹å¼ï¼ä»¥é¿å
产çä¹±ç é误ã
-
-<h3>SmartChineseAnalyzerçææ</h3>
-<p>SmartChineseAnalyzerçç®æ³åè¯æåºè¯å
¸æ¥èªäºictclas1.0项ç®(<a
- href="http://www.ictclas.org">http://www.ictclas.org</a>)ï¼
-å
¶ä¸è¯å
¸å·²ç»èä½æ人www.ictclas.orgå
许ï¼ä»¥apache license
-v2(APLv2)åè®®åå¸ãå¨éµå¾ªAPLv2çæ¡ä»¶ä¸ï¼æ¬¢è¿ç¨æ·ä½¿ç¨ã
-å¨æ¤æè°¢www.ictclas.org以åictclasåè¯è½¯ä»¶çå·¥ä½äººåçè¾å¤å·¥ä½åæ ç§å¥ç®ï¼</p>
</body>
</html>
Modified: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/AnalyzerProfile.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/AnalyzerProfile.java?rev=790102&r1=790101&r2=790102&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/AnalyzerProfile.java (original)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/AnalyzerProfile.java Wed Jul 1 10:32:23 2009
@@ -23,38 +23,37 @@
import java.util.Properties;
/**
- * å¨é»è®¤æ
åµä¸ï¼SmartChineseAnalyzerå
ç½®æè¯å
¸åºãé»è®¤åæ¢è¯åºï¼å·²ç»ç»è¿å°è£
ï¼ç¨æ·å¯ä»¥ç´æ¥ä½¿ç¨ã
- *
- * ç¹æ®æ
åµä¸ï¼ç¨æ·éè¦ä½¿ç¨æå®çè¯å
¸åºååæ¢è¯åºï¼æ¤æ¶éè¦å é¤org.apache.lucene.analysis.cn.smart. hhmmä¸ç
- * coredict.mem å bigramdict.memï¼ ç¶å使ç¨AnalyzerProfileæ¥æå®è¯å
¸åºç®å½ã
- *
- * AnalyzerProfile ç¨æ¥å¯»æ¾åæ¾åè¯è¯åºæ°æ® ååç¨è¯æ°æ®çç®å½ï¼ 该ç®å½ä¸åºè¯¥æ bigramdict.dct, coredict.dct,
- * stopwords_utf8.txt, æ¥æ¾è¿ç¨ä¾æ¬¡å¦ä¸ï¼
+ * Configure analysis data for SmartChineseAnalyzer
+ * <p>
+ * SmartChineseAnalyzer has a built-in dictionary and stopword list out-of-box.
+ * </p>
+ * <p>
+ * In special circumstances a user may wish to configure SmartChineseAnalyzer with a custom data directory location.
+ * </p>
+ * AnalyzerProfile is used to determine the location of the data directory containing bigramdict.dct and coredict.dct.
+ * The following order is used to determine the location of the data directory:
*
* <ol>
- * <li>读åç³»ç»è¿è¡æ¶åæ°ï¼-Danalysis.data.dir=/path/to/analysis-dataï¼å¦æ没æï¼ç»§ç»ä¸ä¸æ¡</li>
- * <li>æ§è¡å½ä»¤çå½åç®å½ä¸æ¯å¦åå¨analysis-dataç®å½</li>
- * <li>æ§è¡å½ä»¤çlib/ç®å½ä¸æ¯å¦åå¨analysis-dataç®å½</li>
- * <li>æ§è¡å½ä»¤çå½åç®å½ä¸æ¯å¦åå¨analysis.propertiesæ件</li>
- * <li>æ§è¡å½ä»¤çlib/ç®å½ä¸æ¯å¦åå¨analysis.propertiesæ件</li>
+ * <li>System property: -Danalysis.data.dir=/path/to/analysis-data</li>
+ * <li>Relative path: analysis-data</li>
+ * <li>Relative path: lib/analysis-data</li>
+ * <li>Property file: analysis.data.dir property from relative path analysis.properties</li>
+ * <li>Property file: analysis.data.dir property from relative path lib/analysis.properties</li>
* </ol>
*
- * å
¶ä¸analysis.propertiesæ件analysis.data.dirææanalysis-dataç®å½æå¨ä½ç½®.
- * analysis.propertiesæ件çå
容示ä¾ï¼
+ * Example property file:
*
* <pre>
* analysis.data.dir=D:/path/to/analysis-data/
* </pre>
*
- * å½æ¾ä¸å°analysis-dataç®å½æ¶ï¼ANALYSIS_DATA_DIR设置为""ï¼å æ¤å¨ä½¿ç¨åï¼å¿
é¡»å¨ç¨åºéæ¾å¼æå®dataç®å½ï¼ä¾å¦ï¼
- *
- * <pre>
- * AnalyzerProfile.ANALYSIS_DATA_DIR = "/path/to/analysis-data";
- * </pre>
*
*/
public class AnalyzerProfile {
+ /**
+ * Global indicating the configured analysis data directory
+ */
public static String ANALYSIS_DATA_DIR = "";
static {
@@ -65,7 +64,7 @@
String dirName = "analysis-data";
String propName = "analysis.properties";
- // 读åç³»ç»è®¾ç½®ï¼å¨è¿è¡æ¶å å
¥åæ°ï¼-Danalysis.data.dir=/path/to/analysis-data
+ // Try the system property: -Danalysis.data.dir=/path/to/analysis-data
ANALYSIS_DATA_DIR = System.getProperty("analysis.data.dir", "");
if (ANALYSIS_DATA_DIR.length() != 0)
return;
@@ -86,9 +85,9 @@
}
if (ANALYSIS_DATA_DIR.length() == 0) {
- // æ示ç¨æ·æªæ¾å°è¯å
¸æ件夹
+ // Dictionary directory cannot be found.
System.err
- .println("WARNING: Can not found lexical dictionary directory!");
+ .println("WARNING: Can not find lexical dictionary directory!");
System.err
.println("WARNING: This will cause unpredictable exceptions in your application!");
System.err
Modified: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/CharType.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/CharType.java?rev=790102&r1=790101&r2=790102&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/CharType.java (original)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/CharType.java Wed Jul 1 10:32:23 2009
@@ -17,23 +17,49 @@
package org.apache.lucene.analysis.cn.smart;
+/**
+ * Internal SmartChineseAnalyzer character type constants.
+ */
public class CharType {
+ /**
+ * Punctuation Characters
+ */
public final static int DELIMITER = 0;
+ /**
+ * Letters
+ */
public final static int LETTER = 1;
+ /**
+ * Numeric Digits
+ */
public final static int DIGIT = 2;
+ /**
+ * Han Ideographs
+ */
public final static int HANZI = 3;
+ /**
+ * Characters that act as a space
+ */
public final static int SPACE_LIKE = 4;
- // (å
¨è§åè§)æ ç¹ç¬¦å·ï¼åè§ï¼åæ¯ï¼æ°åï¼ï¼æ±åï¼ç©ºæ ¼ï¼"\t\r\n"çç©ºæ ¼ææ¢è¡å符
+ /**
+ * Full-Width letters
+ */
public final static int FULLWIDTH_LETTER = 5;
- public final static int FULLWIDTH_DIGIT = 6; // å
¨è§å符ï¼åæ¯ï¼æ°å
-
+ /**
+ * Full-Width alphanumeric characters
+ */
+ public final static int FULLWIDTH_DIGIT = 6;
+
+ /**
+ * Other (not fitting any of the other categories)
+ */
public final static int OTHER = 7;
}
Modified: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java?rev=790102&r1=790101&r2=790102&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java (original)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java Wed Jul 1 10:32:23 2009
@@ -25,14 +25,12 @@
import org.apache.lucene.analysis.Tokenizer;
/**
- *
- * å
å«ä¸ä¸ªå®æ´å¥åçTokenï¼ä»æ件ä¸è¯»åºï¼æ¯ä¸ä¸æ¥åè¯ç对象
- *
+ * Tokenizes input into sentences.
*/
public class SentenceTokenizer extends Tokenizer {
/**
- * ç¨æ¥åæå¥åçæ ç¹ç¬¦å· ãï¼ï¼ï¼ï¼,!?;
+ * End of sentence punctuation: ãï¼ï¼ï¼ï¼,!?;
*/
public final static String PUNCTION = "ãï¼ï¼ï¼ï¼,!?;";
@@ -62,7 +60,7 @@
if (ci == -1) {
break;
} else if (PUNCTION.indexOf(ch) != -1) {
- // æ¾å°äºå¥åæ«å°¾
+ // End of a sentence
buffer.append(ch);
tokenEnd++;
break;
@@ -78,8 +76,7 @@
pch = ch;
ci = bufferInput.read();
ch = (char) ci;
- // å¦æ碰ä¸äºä¸¤ä¸ªè¿ç»çskipå符ï¼ä¾å¦ä¸¤ä¸ªå车ï¼ä¸¤ä¸ªç©ºæ ¼æè
ï¼
- // ä¸ä¸ªå车ï¼ä¸ä¸ªç©ºæ ¼ççï¼å°å
¶è§ä¸ºå¥åç»æï¼ä»¥å
å¥å太é¿èå
åä¸è¶³
+ // Two spaces, such as CR, LF
if (Utility.SPACES.indexOf(ch) != -1
&& Utility.SPACES.indexOf(pch) != -1) {
// buffer.append(ch);
Modified: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/Utility.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/Utility.java?rev=790102&r1=790101&r2=790102&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/Utility.java (original)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/Utility.java Wed Jul 1 10:32:23 2009
@@ -17,6 +17,12 @@
package org.apache.lucene.analysis.cn.smart;
+import org.apache.lucene.analysis.cn.smart.hhmm.BiSegGraph; // for javadoc
+import org.apache.lucene.analysis.cn.smart.hhmm.SegTokenFilter; // for javadoc
+
+/**
+ * SmartChineseAnalyzer utility constants and methods
+ */
public class Utility {
public static final char[] STRING_CHAR_ARRAY = new String("æª##串")
@@ -30,24 +36,29 @@
public static final char[] END_CHAR_ARRAY = new String("æ«##æ«").toCharArray();
+ /**
+ * Delimiters will be filtered to this character by {@link SegTokenFilter}
+ */
public static final char[] COMMON_DELIMITER = new char[] { ',' };
/**
- * éè¦è·³è¿ç符å·ï¼ä¾å¦å¶è¡¨ç¬¦ï¼å车ï¼æ¢è¡ççã
+ * Space-like characters that need to be skipped: such as space, tab, newline, carriage return.
*/
public static final String SPACES = " ã\t\r\n";
+ /**
+ * Maximum bigram frequency (used in the {@link BiSegGraph} smoothing function).
+ */
public static final int MAX_FREQUENCE = 2079997 + 80000;
/**
- * æ¯è¾ä¸¤ä¸ªæ´æ°æ°ç»ç大å°, åå«ä»æ°ç»çä¸å®ä½ç½®å¼å§é个æ¯è¾, å½ä¾æ¬¡ç¸çä¸é½å°è¾¾æ«å°¾æ¶, è¿åç¸ç, å¦åæªå°è¾¾æ«å°¾ç大äºå°è¾¾æ«å°¾ç;
- * å½æªå°è¾¾æ«å°¾æ¶æä¸ä½ä¸ç¸ç, 该ä½ç½®æ°å¼å¤§çæ°ç»å¤§äºå°ç
+ * Compare two arrays starting at the specified offsets.
*
- * @param larray
- * @param lstartIndex larrayçèµ·å§ä½ç½®
- * @param rarray
- * @param rstartIndex rarrayçèµ·å§ä½ç½®
- * @return 0表示ç¸çï¼1表示larray > rarray, -1表示larray < rarray
+ * @param larray left array
+ * @param lstartIndex start offset into larray
+ * @param rarray right array
+ * @param rstartIndex start offset into rarray
+ * @return 0 if the arrays are equal, 1 if larray > rarray, -1 if larray < rarray
*/
public static int compareArray(char[] larray, int lstartIndex, char[] rarray,
int rstartIndex) {
@@ -74,21 +85,19 @@
}
if (li == larray.length) {
if (ri == rarray.length) {
- // 两è
ä¸ç´ç¸çå°æ«å°¾ï¼å æ¤è¿åç¸çï¼ä¹å°±æ¯ç»æ0
+ // Both arrays are equivalent, return 0.
return 0;
} else {
- // æ¤æ¶ä¸å¯è½ri>rarray.lengthå æ¤åªæri<rarray.length
- // 表示larrayå·²ç»ç»æï¼rarray没æç»æï¼å æ¤larray < rarrayï¼è¿å-1
+ // larray < rarray because larray has ended first.
return -1;
}
} else {
- // æ¤æ¶ä¸å¯è½li>larray.lengthå æ¤åªæli < larray.lengthï¼è¡¨ç¤ºli没æå°è¾¾larrayæ«å°¾
+ // differing lengths
if (ri == rarray.length) {
- // larray没æç»æï¼ä½æ¯rarrayå·²ç»ç»æï¼å æ¤larray > rarray
+ // larray > rarray because rarray has ended first.
return 1;
} else {
- // æ¤æ¶ä¸å¯è½ri>rarray.lengthå æ¤åªæri < rarray.length
- // 表示larrayårarrayé½æ²¡æç»æï¼å æ¤æä¸ä¸ä¸ªæ°ç大å°å¤æ
+ // determine by comparison
if (larray[li] > rarray[ri])
return 1;
else
@@ -98,18 +107,20 @@
}
/**
- * æ ¹æ®åç¼æ¥å¤æ两个å符æ°ç»ç大å°ï¼å½åè
为åè
çåç¼æ¶ï¼è¡¨ç¤ºç¸çï¼å½ä¸ä¸ºåç¼æ¶ï¼æç
§æ®éå符串æ¹å¼æ¯è¾
+ * Compare two arrays, starting at the specified offsets, but treating shortArray as a prefix to longArray.
+ * As long as shortArray is a prefix of longArray, return 0.
+ * Otherwise, behave as {@link Utility#compareArray(char[], int, char[], int)}
*
- * @param shortArray
- * @param shortIndex
- * @param longArray
- * @param longIndex
- * @return
+ * @param shortArray prefix array
+ * @param shortIndex offset into shortArray
+ * @param longArray long array (word)
+ * @param longIndex offset into longArray
+ * @return 0 if shortArray is a prefix of longArray, otherwise act as {@link Utility#compareArray(char[], int, char[], int)}
*/
public static int compareArrayByPrefix(char[] shortArray, int shortIndex,
char[] longArray, int longIndex) {
- // 空æ°ç»æ¯æææ°ç»çåç¼ï¼ä¸èèindex
+ // a null prefix is a prefix of longArray
if (shortArray == null)
return 0;
else if (longArray == null)
@@ -122,24 +133,27 @@
li++;
}
if (si == shortArray.length) {
- // shortArray æ¯ longArrayçprefix
+ // shortArray is a prefix of longArray
return 0;
} else {
- // æ¤æ¶ä¸å¯è½si>shortArray.lengthå æ¤åªæsi <
- // shortArray.lengthï¼è¡¨ç¤ºsi没æå°è¾¾shortArrayæ«å°¾
-
- // shortArray没æç»æï¼ä½æ¯longArrayå·²ç»ç»æï¼å æ¤shortArray > longArray
+ // shortArray > longArray because longArray ended first.
if (li == longArray.length)
return 1;
else
- // æ¤æ¶ä¸å¯è½li>longArray.lengthå æ¤åªæli < longArray.length
- // 表示shortArrayålongArrayé½æ²¡æç»æï¼å æ¤æä¸ä¸ä¸ªæ°ç大å°å¤æ
+ // determine by comparison
return (shortArray[si] > longArray[li]) ? 1 : -1;
}
}
+ /**
+ * Return the internal {@link CharType} constant of a given character.
+ * @param ch input character
+ * @return constant from {@link CharType} describing the character type.
+ *
+ * @see CharType
+ */
public static int getCharType(char ch) {
- // æå¤çæ¯æ±å
+ // Most (but not all!) of these are Han Ideographic Characters
if (ch >= 0x4E00 && ch <= 0x9FA5)
return CharType.HANZI;
if ((ch >= 0x0041 && ch <= 0x005A) || (ch >= 0x0061 && ch <= 0x007A))
@@ -148,12 +162,12 @@
return CharType.DIGIT;
if (ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n' || ch == 'ã')
return CharType.SPACE_LIKE;
- // æåé¢çå
¶å®çé½æ¯æ ç¹ç¬¦å·äº
+ // Punctuation Marks
if ((ch >= 0x0021 && ch <= 0x00BB) || (ch >= 0x2010 && ch <= 0x2642)
|| (ch >= 0x3001 && ch <= 0x301E))
return CharType.DELIMITER;
- // å
¨è§å符åºå
+ // Full-Width range
if ((ch >= 0xFF21 && ch <= 0xFF3A) || (ch >= 0xFF41 && ch <= 0xFF5A))
return CharType.FULLWIDTH_LETTER;
if (ch >= 0xFF10 && ch <= 0xFF19)
Modified: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/WordSegmenter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/WordSegmenter.java?rev=790102&r1=790101&r2=790102&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/WordSegmenter.java (original)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/WordSegmenter.java Wed Jul 1 10:32:23 2009
@@ -25,6 +25,9 @@
import org.apache.lucene.analysis.cn.smart.hhmm.SegToken;
import org.apache.lucene.analysis.cn.smart.hhmm.SegTokenFilter;
+/**
+ * Segment a sentence of Chinese text into words.
+ */
public class WordSegmenter {
private HHMMSegmenter hhmmSegmenter = new HHMMSegmenter();
@@ -32,20 +35,19 @@
private SegTokenFilter tokenFilter = new SegTokenFilter();
/**
- * è°ç¨HHMMSegmentç¨åºå°å½åçsentence Tokenåè¯ï¼è¿ååè¯ç»æï¼ä¿åå¨Token Listä¸
+ * Segment a sentence into words with {@link HHMMSegmenter}
*
- * @param sentenceToken å¥åçToken
- * @param shortPathCount HHMMç®æ³åè¯æéè¦çä¼ååçæçè·¯å¾ä¸ªæ°ãä¸è¬è¶å¤§åè¯ç»æè¶ç²¾ç¡®ï¼ä½æ¯è®¡ç®ä»£ä»·ä¹è¾é«ã
- * @return åè¯ç»æçToken List
+ * @param sentenceToken sentence {@link Token}
+ * @return {@link List} of {@link SegToken}
*/
- public List segmentSentence(Token sentenceToken, int shortPathCount) {
+ public List segmentSentence(Token sentenceToken) {
String sentence = sentenceToken.term();
List segTokenList = hhmmSegmenter.process(sentence);
List result = new ArrayList();
- // iä»1å°rawTokens.length-2ï¼ä¹å°±æ¯è¯´å°âå§##å§âï¼âæ«##æ«â两个RawTokenå»æ
+ // tokens from sentence, excluding WordType.SENTENCE_BEGIN and WordType.SENTENCE_END
for (int i = 1; i < segTokenList.size() - 1; i++) {
result.add(convertSegToken((SegToken) segTokenList.get(i), sentence,
sentenceToken.startOffset(), "word"));
@@ -55,14 +57,13 @@
}
/**
+ * Convert a {@link SegToken} to a Lucene {@link Token}
*
- * å°RawTokenç±»å转æ¢æç´¢å¼éè¦çTokenç±»åï¼ å 为索å¼éè¦RawTokenå¨åå¥ä¸çå
å®¹ï¼ å æ¤è½¬æ¢æ¶éè¦æå®åå¥åã
- *
- * @param rt
- * @param sentence 转æ¢éè¦çå¥åå
容
- * @param sentenceStartOffset sentenceå¨æç« ä¸çåå§ä½ç½®
- * @param type tokenç±»åï¼é»è®¤åºè¯¥æ¯word
- * @return
+ * @param st input {@link SegToken}
+ * @param sentence associated Sentence
+ * @param sentenceStartOffset offset into sentence
+ * @param type token type, default is word
+ * @return Lucene {@link Token}
*/
public Token convertSegToken(SegToken st, String sentence,
int sentenceStartOffset, String type) {
Modified: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/WordTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/WordTokenizer.java?rev=790102&r1=790101&r2=790102&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/WordTokenizer.java (original)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/WordTokenizer.java Wed Jul 1 10:32:23 2009
@@ -25,11 +25,11 @@
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
+/**
+ * A {@link Tokenizer} that breaks sentences into words.
+ */
public class WordTokenizer extends Tokenizer {
- /**
- * åè¯ä¸»ç¨åºï¼WordTokenizeråå§åæ¶å è½½ã
- */
private WordSegmenter wordSegmenter;
private TokenStream in;
@@ -41,13 +41,10 @@
private Token sentenceToken = new Token();
/**
- * 设计ä¸æ¯SentenceTokenizerçä¸ä¸å¤çå±ãå°SentenceTokenizerçå¥å读åºï¼
- * å©ç¨HHMMSegment主ç¨åºå°å¥ååè¯ï¼ç¶åå°åè¯ç»æè¿åã
+ * Construct a new WordTokenizer.
*
- * @param in å¥åçToken
- * @param smooth å¹³æ»å½æ°
- * @param dataPath è£
è½½æ ¸å¿åå
¸ä¸äºååå
¸çç®å½
- * @see init()
+ * @param in {@link TokenStream} of sentences
+ * @param wordSegmenter {@link WordSegmenter} to break sentences into words
*/
public WordTokenizer(TokenStream in, WordSegmenter wordSegmenter) {
this.in = in;
@@ -66,17 +63,16 @@
}
/**
- * å½å½åçå¥ååè¯å¹¶ç´¢å¼å®æ¯æ¶ï¼éè¦è¯»åä¸ä¸ä¸ªå¥åTokenï¼ æ¬å½æ°è´è´£è°ç¨ä¸ä¸å±çSentenceTokenizerå»å è½½ä¸ä¸ä¸ªå¥åï¼ å¹¶å°å
¶åè¯ï¼
- * å°åè¯ç»æä¿åæTokenæ¾å¨tokenBufferä¸
+ * Process the next input sentence, placing tokens into tokenBuffer
*
- * @return 读å并å¤çä¸ä¸ä¸ªå¥åæåä¸å¦ï¼å¦æ没ææåï¼è¯´ææ件å¤çå®æ¯ï¼åé¢æ²¡æTokenäº
+ * @return true if more tokens were placed into tokenBuffer.
* @throws IOException
*/
private boolean processNextSentence() throws IOException {
sentenceToken = in.next(sentenceToken);
if (sentenceToken == null)
return false;
- tokenBuffer = wordSegmenter.segmentSentence(sentenceToken, 1);
+ tokenBuffer = wordSegmenter.segmentSentence(sentenceToken);
tokenIter = tokenBuffer.iterator();
return tokenBuffer != null && tokenIter.hasNext();
}
Modified: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/WordType.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/WordType.java?rev=790102&r1=790101&r2=790102&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/WordType.java (original)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/WordType.java Wed Jul 1 10:32:23 2009
@@ -17,22 +17,49 @@
package org.apache.lucene.analysis.cn.smart;
+/**
+ * Internal SmartChineseAnalyzer token type constants
+ */
public class WordType {
+ /**
+ * Start of a Sentence
+ */
public final static int SENTENCE_BEGIN = 0;
- public final static int SENTENCE_END = 1;// å¥åçå¼å¤´åç»æ
-
- public final static int CHINESE_WORD = 2;// ä¸æè¯
-
+ /**
+ * End of a Sentence
+ */
+ public final static int SENTENCE_END = 1;
+
+ /**
+ * Chinese Word
+ */
+ public final static int CHINESE_WORD = 2;
+
+ /**
+ * ASCII String
+ */
public final static int STRING = 3;
- public final static int NUMBER = 4; // asciiå符串åæ°å
-
- public final static int DELIMITER = 5; // æææ ç¹ç¬¦å·
-
+ /**
+ * ASCII Alphanumeric
+ */
+ public final static int NUMBER = 4;
+
+ /**
+ * Punctuation Symbol
+ */
+ public final static int DELIMITER = 5;
+
+ /**
+ * Full-Width String
+ */
public final static int FULLWIDTH_STRING = 6;
- public final static int FULLWIDTH_NUMBER = 7;// å«æå
¨è§å符çå符串ï¼å«å
¨è§æ°åçæ°å
+ /**
+ * Full-Width Alphanumeric
+ */
+ public final static int FULLWIDTH_NUMBER = 7;
}
Modified: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/AbstractDictionary.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/AbstractDictionary.java?rev=790102&r1=790101&r2=790102&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/AbstractDictionary.java (original)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/AbstractDictionary.java Wed Jul 1 10:32:23 2009
@@ -19,19 +19,29 @@
import java.io.UnsupportedEncodingException;
+/**
+ * <p>
+ * SmartChineseAnalyzer abstract dictionary implementation.
+ * </p>
+ * <p>
+ * Contains methods for dealing with GB2312 encoding.
+ * </p>
+ */
public abstract class AbstractDictionary {
/**
- * 第ä¸ä¸ªæ±å为âåâï¼ä»åé¢æ15个åºï¼å
±15*94个å符
+ * First Chinese Character in GB2312 (15 * 94)
+ * Characters in GB2312 are arranged in a grid of 94 * 94, 0-14 are unassigned or punctuation.
*/
public static final int GB2312_FIRST_CHAR = 1410;
/**
- * GB2312å符éä¸01~87çå符éæå¯è½ææï¼å
±8178个
+ * Last Chinese Character in GB2312 (87 * 94).
+ * Characters in GB2312 are arranged in a grid of 94 * 94, 88-94 are unassigned.
*/
public static final int GB2312_CHAR_NUM = 87 * 94;
/**
- * è¯åºæ件ä¸æ¶å½äº6768个æ±åçè¯é¢ç»è®¡
+ * Dictionary data contains 6768 Chinese characters with frequency statistics.
*/
public static final int CHAR_NUM_IN_FILE = 6768;
@@ -45,33 +55,33 @@
// B0F0 æ¢ æ¦ è ç» æ£ ç£
è é å è°¤ è è å
è¤ å¥
// =====================================================
//
- // GB2312 å符éçåºä½åå¸è¡¨ï¼
- // åºå· åæ° å符类å«
- // 01 94 ä¸è¬ç¬¦å·
- // 02 72 顺åºå·ç
- // 03 94 æä¸åæ¯
- // 04 83 æ¥æåå
// GB2312 character set:
+ // 01 94 Symbols
+ // 02 72 Numbers
+ // 03 94 Latin
+ // 04 83 Kana
// 05 86 Katakana
- // 06 48 å¸è
åæ¯
- // 07 66 ä¿æåæ¯
- // 08 63 æ±è¯æ¼é³ç¬¦å·
- // 09 76 å¾å½¢ç¬¦å·
- // 10-15 å¤ç¨åº
- // 16-55 3755 ä¸çº§æ±åï¼ä»¥æ¼é³ä¸ºåº
- // 56-87 3008 äºçº§æ±åï¼ä»¥ç¬å为åº
- // 88-94 å¤ç¨åº
+ // 06 48 Greek
+ // 07 66 Cyrillic
+ // 08 63 Phonetic Symbols
+ // 09 76 Drawing Symbols
+ // 10-15 Unassigned
+ // 16-55 3755 Plane 1, in pinyin order
+ // 56-87 3008 Plane 2, in radical/stroke order
+ // 88-94 Unassigned
// ======================================================
/**
- * GB2312 å
±æ¶å½æ 7445 个å符ï¼å
¶ä¸ç®åæ±å 6763 个ï¼åæ¯åç¬¦å· 682 个ã
+ * <p>
+ * Transcode from GB2312 ID to Unicode
+ * </p>
+ * <p>
+ * GB2312 is divided into a 94 * 94 grid, containing 7445 characters consisting of 6763 Chinese characters and 682 symbols.
+ * Some regions are unassigned (reserved).
+ * </p>
*
- * GB2312 å°ææ¶å½çå符å为 94 个åºï¼ç¼å·ä¸º 01 åºè³ 94 åºï¼æ¯ä¸ªåºæ¶å½ 94 个å符ï¼ç¼å·ä¸º 01 ä½è³ 94
- * ä½ï¼01为起å§ä¸0xA1ï¼94ä½å¤äº0xFEãGB2312 çæ¯ä¸ä¸ªå符é½ç±ä¸å
¶å¯ä¸å¯¹åºçåºå·åä½å·æç¡®å®ãä¾å¦ï¼æ±åâåâï¼ç¼å·ä¸º 16 åº 01
- * ä½ã
- */
- /**
- * @param ccid
- * @return
+ * @param ccid GB2312 id
+ * @return unicode String
*/
public String getCCByGB2312Id(int ccid) {
if (ccid < 0 || ccid > WordDictionary.GB2312_CHAR_NUM)
@@ -90,16 +100,16 @@
}
/**
- * æ ¹æ®è¾å
¥çUnicodeå符ï¼è·åå®çGB2312ç¼ç æè
asciiç¼ç ï¼
+ * Transcode from Unicode to GB2312
*
- * @param ch è¾å
¥çGB2312ä¸æå符æè
ASCIIå符(128个)
- * @return chå¨GB2312ä¸çä½ç½®ï¼-1表示该å符ä¸è®¤è¯
+ * @param ch input character in Unicode, or character in Basic Latin range.
+ * @return position in GB2312
*/
public short getGB2312Id(char ch) {
try {
byte[] buffer = Character.toString(ch).getBytes("GB2312");
if (buffer.length != 2) {
- // æ£å¸¸æ
åµä¸bufferåºè¯¥æ¯ä¸¤ä¸ªåèï¼å¦å说æchä¸å±äºGB2312ç¼ç ï¼æ
è¿å'?'ï¼æ¤æ¶è¯´æä¸è®¤è¯è¯¥å符
+ // Should be a two-byte character
return -1;
}
int b0 = (int) (buffer[0] & 0x0FF) - 161; // 编码从A1开始，因此减去0xA1=161
@@ -112,12 +122,10 @@
}
/**
- * æ¹è¿ç32ä½FNV hashç®æ³ï¼ç¨ä½æ¬ç¨åºä¸ç第ä¸hashå½æ°.第ä¸å第äºhashå½æ°ç¨æ¥èå计ç®hashè¡¨ï¼ ä½¿å
¶åååå¸ï¼
- * 并è½é¿å
å hash表è¿å¯è导è´çé¿æ¶é´è®¡ç®çé®é¢
+ * 32-bit FNV Hash Function
*
- * @param c å¾
hashçUnicodeå符
- * @return cçåå¸å¼
- * @see Utility.hash2()
+ * @param c input character
+ * @return hashcode
*/
public long hash1(char c) {
final long p = 1099511628211L;
@@ -133,9 +141,10 @@
}
/**
- * @see Utility.hash1(char[])
- * @param carray
- * @return
+ * 32-bit FNV Hash Function
+ *
+ * @param carray character array
+ * @return hashcode
*/
public long hash1(char carray[]) {
final long p = 1099511628211L;
@@ -155,16 +164,14 @@
}
/**
- * djb2åå¸ç®æ³ï¼ç¨ä½æ¬ç¨åºä¸ç第äºhashå½æ°
- *
* djb2 hash algorithm, this algorithm (k=33) was first reported by dan
* bernstein many years ago in comp.lang.c. another version of this algorithm
* (now favored by bernstein) uses xor: hash(i) = hash(i - 1) * 33 ^ str[i];
* the magic of number 33 (why it works better than many other constants,
* prime or not) has never been adequately explained.
*
- * @param c
- * @return
+ * @param c character
+ * @return hashcode
*/
public int hash2(char c) {
int hash = 5381;
@@ -177,9 +184,14 @@
}
/**
- * @see Utility.hash2(char[])
- * @param carray
- * @return
+ * djb2 hash algorithm, this algorithm (k=33) was first reported by dan
+ * bernstein many years ago in comp.lang.c. another version of this algorithm
+ * (now favored by bernstein) uses xor: hash(i) = hash(i - 1) * 33 ^ str[i];
+ * the magic of number 33 (why it works better than many other constants,
+ * prime or not) has never been adequately explained.
+ *
+ * @param carray character array
+ * @return hashcode
*/
public int hash2(char carray[]) {
int hash = 5381;
Modified: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BiSegGraph.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BiSegGraph.java?rev=790102&r1=790101&r2=790102&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BiSegGraph.java (original)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BiSegGraph.java Wed Jul 1 10:32:23 2009
@@ -26,6 +26,12 @@
import org.apache.lucene.analysis.cn.smart.Utility;
+/**
+ * Graph representing possible token pairs (bigrams) at each start offset in the sentence.
+ * <p>
+ * For each start offset, a list of possible token pairs is stored.
+ * </p>
+ */
public class BiSegGraph {
private Map tokenPairListTable = new HashMap();
@@ -39,15 +45,8 @@
generateBiSegGraph(segGraph);
}
- /**
- * çæ两两è¯ä¹é´çäºåå¾è¡¨ï¼å°ç»æä¿åå¨ä¸ä¸ªMultiTokenPairMapä¸
- *
- * @param segGraph ææçTokenå表
- * @param smooth å¹³æ»ç³»æ°
- * @param biDict äºåè¯å
¸
- * @return
- *
- * @see MultiTokenPairMap
+ /*
+ * Generate a BiSegGraph based upon a SegGraph
*/
private void generateBiSegGraph(SegGraph segGraph) {
double smooth = 0.1;
@@ -57,7 +56,7 @@
int next;
char[] idBuffer;
- // 为segGraphä¸çæ¯ä¸ªå
ç´ èµä»¥ä¸ä¸ªä¸æ
+ // get the list of tokens ordered and indexed
segTokenList = segGraph.makeIndex();
// å 为startTokenï¼"å§##å§"ï¼çèµ·å§ä½ç½®æ¯-1å æ¤key为-1æ¶å¯ä»¥ååºstartToken
int key = -1;
@@ -119,31 +118,29 @@
}
/**
- * æ¥çSegTokenPairçç»æä½ç½®ä¸ºto(SegTokenPair.to为to)æ¯å¦åå¨SegTokenPairï¼
- * å¦æ没æå说ætoå¤æ²¡æSegTokenPairæè
è¿æ²¡ææ·»å
+ * Returns true if there is a list of token pairs at this offset (index of the second token)
*
- * @param to SegTokenPair.to
- * @return
+ * @param to index of the second token in the token pair
+ * @return true if a token pair exists
*/
public boolean isToExist(int to) {
return tokenPairListTable.get(new Integer(to)) != null;
}
/**
- * ååºSegTokenPair.to为toçææSegTokenPairï¼å¦æ没æåè¿ånull
+ * Return a {@link List} of all token pairs at this offset (index of the second token)
*
- * @param to
- * @return ææç¸åSegTokenPair.toçSegTokenPairçåºå
+ * @param to index of the second token in the token pair
+ * @return {@link List} of token pairs.
*/
public List getToList(int to) {
return (List) tokenPairListTable.get(new Integer(to));
}
/**
- * åBiSegGraphä¸å¢å ä¸ä¸ªSegTokenPairï¼è¿äºSegTokenPairæç
§ç¸åSegTokenPair.
- * toæ¾å¨åä¸ä¸ªArrayListä¸
+ * Add a {@link SegTokenPair}
*
- * @param tokenPair
+ * @param tokenPair {@link SegTokenPair}
*/
public void addSegTokenPair(SegTokenPair tokenPair) {
int to = tokenPair.to;
@@ -158,16 +155,16 @@
}
/**
- * @return TokenPairçåæ°ï¼ä¹å°±æ¯Mapä¸ä¸ååå·çTokenPairç§æ°ã
+ * Get the number of {@link SegTokenPair} entries in the table.
+ * @return number of {@link SegTokenPair} entries
*/
public int getToCount() {
return tokenPairListTable.size();
}
/**
- * ç¨veterbiç®æ³è®¡ç®ä»èµ·ç¹å°ç»ç¹çæçè·¯å¾
- *
- * @return
+ * Find the shortest path with the Viterbi algorithm.
+ * @return {@link List}
*/
public List getShortPath() {
int current;
@@ -198,7 +195,7 @@
path.add(newNode);
}
- // æ¥ä¸æ¥ä»nodePathsä¸è®¡ç®ä»èµ·ç¹å°ç»ç¹ççå®è·¯å¾
+ // Calculate PathNodes
int preNode, lastNode;
lastNode = path.size() - 1;
current = lastNode;
Modified: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BigramDictionary.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BigramDictionary.java?rev=790102&r1=790101&r2=790102&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BigramDictionary.java (original)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BigramDictionary.java Wed Jul 1 10:32:23 2009
@@ -32,6 +32,9 @@
import org.apache.lucene.analysis.cn.smart.AnalyzerProfile;
+/**
+ * SmartChineseAnalyzer Bigram dictionary.
+ */
public class BigramDictionary extends AbstractDictionary {
private BigramDictionary() {
@@ -43,12 +46,8 @@
public static final int PRIME_BIGRAM_LENGTH = 402137;
- /**
- * bigramTable æ¥åå¨è¯ä¸è¯ä¹é´ç跳转é¢çï¼ bigramHashTable å frequencyTable
- * å°±æ¯ç¨æ¥åå¨è¿äºé¢ççæ°æ®ç»æã 为äºæé«æ¥è¯¢é度åèçå
åï¼ éç¨ hash å¼æ¥ä»£æ¿å
³èè¯ä½ä¸ºæ¥è¯¢ä¾æ®ï¼ å
³èè¯å°±æ¯
- * (formWord+'@'+toWord) ï¼ å©ç¨ FNV1 hash ç®æ³æ¥è®¡ç®å
³èè¯çhashå¼ ï¼å¹¶ä¿åå¨ bigramHashTable
- * ä¸ï¼å©ç¨ hash å¼æ¥ä»£æ¿å
³èè¯æå¯è½ä¼äº§çå¾å°æ¦ççå²çªï¼ ä½æ¯ long ç±»å
- * (64bit)çhashå¼ææå°å°æ¤æ¦çéå°æä½ãbigramHashTable[i]ä¸frequencyTable[i]ä¸ä¸å¯¹åº
+ /*
+ * The word associations are stored as FNV1 hashcodes, which have a small probability of collision, but save memory.
*/
private long[] bigramHashTable;
@@ -128,7 +127,7 @@
bigramHashTable = new long[PRIME_BIGRAM_LENGTH];
frequencyTable = new int[PRIME_BIGRAM_LENGTH];
for (int i = 0; i < PRIME_BIGRAM_LENGTH; i++) {
- // å®é
ä¸å°0ä½ä¸ºåå§å¼æä¸ç¹é®é¢ï¼å 为æ个å符串å¯è½hashå¼ä¸º0ï¼ä½æ¯æ¦çé常å°ï¼å æ¤å½±åä¸å¤§
+ // it is possible for a value to hash to 0, but the probability is extremely low
bigramHashTable[i] = 0;
frequencyTable[i] = 0;
}
@@ -141,10 +140,9 @@
}
/**
- * å°è¯åºæ件å è½½å°WordDictionaryçç¸å
³æ°æ®ç»æä¸ï¼åªæ¯å è½½ï¼æ²¡æè¿è¡å并åä¿®æ¹æä½
+ * Load the datafile into this BigramDictionary
*
- * @param dctFilePath
- * @return
+ * @param dctFilePath path to the Bigramdictionary (bigramdict.mem)
* @throws FileNotFoundException
* @throws IOException
* @throws UnsupportedEncodingException
@@ -159,14 +157,14 @@
String tmpword;
RandomAccessFile dctFile = new RandomAccessFile(dctFilePath, "r");
- // åå
¸æ件ä¸ç¬¬ä¸ä¸ªæ±ååºç°çä½ç½®æ¯0ï¼æåä¸ä¸ªæ¯6768
+ // GB2312 characters 0 - 6768
for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) {
String currentStr = getCCByGB2312Id(i);
// if (i == 5231)
// System.out.println(i);
- dctFile.read(intBuffer);// åè¯åºæ件å¨cä¸å¼åï¼æ以åå
¥çæ件为little
- // endianç¼ç ï¼èjava为big endianï¼å¿
须转æ¢è¿æ¥
+ dctFile.read(intBuffer);
+ // the dictionary was developed for C, and byte order must be converted to work with Java
cnt = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN).getInt();
if (cnt <= 0) {
continue;
@@ -272,9 +270,8 @@
return -1;
}
- /**
- * @param c
- * @return
+ /*
+ * lookup the index into the frequency array.
*/
private int getBigramItemIndex(char carray[]) {
long hashId = hash1(carray);
Modified: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/HHMMSegmenter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/HHMMSegmenter.java?rev=790102&r1=790101&r2=790102&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/HHMMSegmenter.java (original)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/HHMMSegmenter.java Wed Jul 1 10:32:23 2009
@@ -23,18 +23,18 @@
import org.apache.lucene.analysis.cn.smart.Utility;
import org.apache.lucene.analysis.cn.smart.WordType;
+/**
+ * Finds the optimal segmentation of a sentence into Chinese words
+ */
public class HHMMSegmenter {
private static WordDictionary wordDict = WordDictionary.getInstance();
/**
- * 寻æ¾sentenceä¸ææå¯è½çTokenï¼æååæ·»å 两个ç¹æ®Tokenï¼"å§##å§",
- * "æ«##æ«"ï¼"å§##å§"Tokençèµ·å§ä½ç½®æ¯-1,"æ«##æ«"Tokençèµ·å§ä½ç½®æ¯å¥åçé¿åº¦
+ * Create the {@link SegGraph} for a sentence.
*
- * @param sentence è¾å
¥çå¥åï¼ä¸å
å«"å§##å§","æ«##æ«"ç
- * @param coreDict æ ¸å¿åå
¸
- * @return ææå¯è½çToken
- * @see MultiTokenMap
+ * @param sentence input sentence, without start and end markers
+ * @return {@link SegGraph} corresponding to the input sentence.
*/
private SegGraph createSegGraph(String sentence) {
int i = 0, j;
@@ -168,16 +168,16 @@
}
/**
- * 为sentenceä¸çæ¯ä¸ªå符确å®å¯ä¸çå符类å
+ * Get the character types for every character in a sentence.
*
* @see Utility.charType(char)
- * @param sentence è¾å
¥çå®æå¥å
- * @return è¿åçå符类åæ°ç»ï¼å¦æè¾å
¥ä¸ºnullï¼è¿åä¹æ¯null
+ * @param sentence input sentence
+ * @return array of character types corresponding to character positions in the sentence
*/
private static int[] getCharTypes(String sentence) {
int length = sentence.length();
int[] charTypeArray = new int[length];
- // çæ对åºå个æ±åçå符类åæ°ç»
+ // the type of each character by position
for (int i = 0; i < length; i++) {
charTypeArray[i] = Utility.getCharType(sentence.charAt(i));
}
@@ -185,6 +185,11 @@
return charTypeArray;
}
+ /**
+ * Return a list of {@link PathNode} representing the best segmentation of a sentence
+ * @param sentence input sentence
+ * @return best segmentation as a {@link List}
+ */
public List process(String sentence) {
SegGraph segGraph = createSegGraph(sentence);
BiSegGraph biSegGraph = new BiSegGraph(segGraph);
Modified: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/PathNode.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/PathNode.java?rev=790102&r1=790101&r2=790102&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/PathNode.java (original)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/PathNode.java Wed Jul 1 10:32:23 2009
@@ -17,6 +17,12 @@
package org.apache.lucene.analysis.cn.smart.hhmm;
+/**
+ * SmartChineseAnalyzer internal node representation
+ * <p>
+ * Used by {@link BiSegGraph} to maximize the segmentation with the Viterbi algorithm.
+ * </p>
+ */
public class PathNode implements Comparable {
public double weight;
Modified: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegGraph.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegGraph.java?rev=790102&r1=790101&r2=790102&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegGraph.java (original)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegGraph.java Wed Jul 1 10:32:23 2009
@@ -23,42 +23,53 @@
import java.util.List;
import java.util.Map;
+/**
+ * Graph representing possible tokens at each start offset in the sentence.
+ * <p>
+ * For each start offset, a list of possible tokens is stored.
+ * </p>
+ */
public class SegGraph {
/**
- * ç¨ä¸ä¸ªArrayListè®°å½startOffsetç¸åçTokenï¼è¿ä¸ªstartOffsetå°±æ¯Tokençkey
+ * Map of start offsets to ArrayList of tokens at that position
*/
- private Map tokenListTable = new HashMap();
+ private Map /* <Integer, ArrayList<SegToken>> */ tokenListTable = new HashMap();
private int maxStart = -1;
/**
- * æ¥çstartOffset为sçTokenæ¯å¦åå¨ï¼å¦æ没æå说æså¤æ²¡æTokenæè
è¿æ²¡ææ·»å
+ * Returns true if a mapping for the specified start offset exists
*
* @param s startOffset
- * @return
+ * @return true if there are tokens for the startOffset
*/
public boolean isStartExist(int s) {
return tokenListTable.get(new Integer(s)) != null;
}
/**
- * ååºstartOffset为sçææTokensï¼å¦æ没æåè¿ånull
+ * Get the list of tokens at the specified start offset
*
- * @param s
- * @return ææç¸åstartOffsetçTokençåºå
+ * @param s startOffset
+ * @return List of tokens at the specified start offset.
*/
public List getStartList(int s) {
return (List) tokenListTable.get(new Integer(s));
}
+ /**
+ * Get the highest start offset in the map
+ *
+ * @return maximum start offset, or -1 if the map is empty.
+ */
public int getMaxStart() {
return maxStart;
}
/**
- * 为SegGraphä¸çææTokensçæä¸ä¸ªç»ä¸çindexï¼indexä»0å¼å§ï¼
- * æç
§startOffsetéå¢ç顺åºæåºï¼ç¸åstartOffsetçTokensæç
§æ¾ç½®å
å顺åºæåº
+ * Set the {@link SegToken#index} for each token, based upon its order by startOffset.
+ * @return a {@link List} of these ordered tokens.
*/
public List makeIndex() {
List result = new ArrayList();
@@ -82,9 +93,8 @@
}
/**
- * åMapä¸å¢å ä¸ä¸ªTokenï¼è¿äºTokenæç
§ç¸åstartOffsetæ¾å¨åä¸ä¸ªå表ä¸ï¼
- *
- * @param token
+ * Add a {@link SegToken} to the mapping, creating a new mapping at the token's startOffset if one does not exist.
+ * @param token {@link SegToken}
*/
public void addToken(SegToken token) {
int s = token.startOffset;
@@ -101,18 +111,18 @@
}
/**
- * è·åSegGraphä¸ä¸åèµ·å§ï¼Startï¼ä½ç½®Tokenç±»ç个æ°ï¼æ¯ä¸ªå¼å§ä½ç½®å¯è½æå¤ä¸ªTokenï¼å æ¤ä½ç½®æ°ä¸Tokenæ°å¹¶ä¸ä¸è´
- *
- * @return
+ * Get the number of startOffsets.
+ *
+ * @return number of startOffsets in the mapping
*/
public int getStartCount() {
return tokenListTable.size();
}
/**
- * å°Mapä¸åå¨çææTokenæç
§èµ·å§ä½ç½®ä»å°å°å¤§çæ¹å¼ç»æä¸ä¸ªå表
+ * Return a {@link List} of all tokens in the map, ordered by startOffset.
*
- * @return
+ * @return {@link List} of all tokens in the map.
*/
public List toTokenList() {
List result = new ArrayList();
Modified: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegToken.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegToken.java?rev=790102&r1=790101&r2=790102&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegToken.java (original)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegToken.java Wed Jul 1 10:32:23 2009
@@ -17,6 +17,9 @@
package org.apache.lucene.analysis.cn.smart.hhmm;
+/**
+ * SmartChineseAnalyzer internal token
+ */
public class SegToken {
public char[] charArray;
@@ -51,13 +54,6 @@
// + endOffset + ")/w(" + weight + ")t(" + wordType + ")";
// }
- /**
- * å¤æ两个Tokenç¸ççå
è¦æ¡ä»¶æ¯ä»ä»¬çèµ·å§ä½ç½®ç¸çï¼å 为è¿æ ·ä»ä»¬çåå¥ä¸çå
容ä¸æ ·ï¼
- * èposä¸weighté½å¯ä»¥ä»è¯å
¸ä¸æ¥å°å¤ä¸ªï¼å¯ä»¥ç¨ä¸å¯¹å¤çæ¹æ³è¡¨ç¤ºï¼å æ¤åªéè¦ä¸ä¸ªToken
- *
- * @param t
- * @return
- */
// public boolean equals(RawToken t) {
// return this.startOffset == t.startOffset
// && this.endOffset == t.endOffset;
Modified: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegTokenFilter.java?rev=790102&r1=790101&r2=790102&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegTokenFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegTokenFilter.java Wed Jul 1 10:32:23 2009
@@ -20,27 +20,43 @@
import org.apache.lucene.analysis.cn.smart.Utility;
import org.apache.lucene.analysis.cn.smart.WordType;
+/**
+ * <p>
+ * Filters a {@link SegToken} by converting full-width latin to half-width, then lowercasing latin.
+ * Additionally, all punctuation is converted into {@link Utility#COMMON_DELIMITER}
+ * </p>
+ */
public class SegTokenFilter {
+ /**
+ * Filter an input {@link SegToken}
+ * <p>
+ * Full-width latin will be converted to half-width, then all latin will be lowercased.
+ * All punctuation is converted into {@link Utility#COMMON_DELIMITER}
+ * </p>
+ *
+ * @param token input {@link SegToken}
+ * @return normalized {@link SegToken}
+ */
public SegToken filter(SegToken token) {
switch (token.wordType) {
case WordType.FULLWIDTH_NUMBER:
- case WordType.FULLWIDTH_STRING:
+ case WordType.FULLWIDTH_STRING: /* first convert full-width -> half-width */
for (int i = 0; i < token.charArray.length; i++) {
if (token.charArray[i] >= 0xFF10)
token.charArray[i] -= 0xFEE0;
- if (token.charArray[i] >= 0x0041 && token.charArray[i] <= 0x005A)
+ if (token.charArray[i] >= 0x0041 && token.charArray[i] <= 0x005A) /* lowercase latin */
token.charArray[i] += 0x0020;
}
break;
case WordType.STRING:
for (int i = 0; i < token.charArray.length; i++) {
- if (token.charArray[i] >= 0x0041 && token.charArray[i] <= 0x005A)
+ if (token.charArray[i] >= 0x0041 && token.charArray[i] <= 0x005A) /* lowercase latin */
token.charArray[i] += 0x0020;
}
break;
- case WordType.DELIMITER:
+ case WordType.DELIMITER: /* convert all punctuation to Utility.COMMON_DELIMITER */
token.charArray = Utility.COMMON_DELIMITER;
break;
default:
Modified: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegTokenPair.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegTokenPair.java?rev=790102&r1=790101&r2=790102&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegTokenPair.java (original)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegTokenPair.java Wed Jul 1 10:32:23 2009
@@ -17,15 +17,21 @@
package org.apache.lucene.analysis.cn.smart.hhmm;
+/**
+ * A pair of tokens in {@link SegGraph}
+ */
public class SegTokenPair {
public char[] charArray;
/**
- * fromåtoæ¯Token对çindexå·ï¼è¡¨ç¤ºæ¬TokenPairç两个Tokenå¨segGraghä¸çä½ç½®ã
+ * index of the first token in {@link SegGraph}
*/
public int from;
+ /**
+ * index of the second token in {@link SegGraph}
+ */
public int to;
public double weight;
Modified: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/WordDictionary.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/WordDictionary.java?rev=790102&r1=790101&r2=790102&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/WordDictionary.java (original)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/WordDictionary.java Wed Jul 1 10:32:23 2009
@@ -33,6 +33,10 @@
import org.apache.lucene.analysis.cn.smart.AnalyzerProfile;
import org.apache.lucene.analysis.cn.smart.Utility;
+/**
+ * SmartChineseAnalyzer Word Dictionary
+ *
+ */
public class WordDictionary extends AbstractDictionary {
private WordDictionary() {
@@ -41,7 +45,7 @@
private static WordDictionary singleInstance;
/**
- * ä¸ä¸ªè¾å¤§çç´ æ°ï¼ä¿è¯hashæ¥æ¾è½å¤éåææä½ç½®
+ * Large prime number for hash function
*/
public static final int PRIME_INDEX_LENGTH = 12071;
@@ -66,6 +70,10 @@
// static Logger log = Logger.getLogger(WordDictionary.class);
+ /**
+ * Get the singleton dictionary instance.
+ * @return singleton
+ */
public synchronized static WordDictionary getInstance() {
if (singleInstance == null) {
singleInstance = new WordDictionary();
@@ -82,10 +90,9 @@
}
/**
- * ä»å¤é¨æ件夹dctFileRootå è½½è¯å
¸åºæ件ï¼é¦å
æµè¯æ¯å¦æcoredict.memæä»¶ï¼ å¦ææåç´æ¥ä½ä¸ºåºåå对象å è½½ï¼
- * å¦æ没æåå è½½è¯å
¸åºæºæ件coredict.dct
+ * Attempt to load dictionary from provided directory, first trying coredict.mem, falling back on coredict.dct
*
- * @param dctFileName è¯å
¸åºæ件çè·¯å¾
+ * @param dctFileRoot path to dictionary directory
*/
public void load(String dctFileRoot) {
String dctFilePath = dctFileRoot + "/coredict.dct";
@@ -119,9 +126,8 @@
}
/**
- * ä»jarå
é¨å è½½è¯å
¸åºæ件ï¼è¦æ±ä¿è¯WordDictionaryç±»å½åè·¯å¾ä¸æcoredict.memæ件ï¼ä»¥å°å
¶ä½ä¸ºåºåå对象å è½½
+ * Load coredict.mem internally from the jar file.
*
- * @param dctFileName è¯å
¸åºæ件çè·¯å¾
* @throws ClassNotFoundException
* @throws IOException
*/
@@ -171,10 +177,10 @@
}
/**
- * å°è¯åºæ件å è½½å°WordDictionaryçç¸å
³æ°æ®ç»æä¸ï¼åªæ¯å è½½ï¼æ²¡æè¿è¡å并åä¿®æ¹æä½
+ * Load the datafile into this WordDictionary
*
- * @param dctFilePath
- * @return
+ * @param dctFilePath path to word dictionary (coredict.mem)
+ * @return number of words read
* @throws FileNotFoundException
* @throws IOException
* @throws UnsupportedEncodingException
@@ -188,13 +194,13 @@
String tmpword;
RandomAccessFile dctFile = new RandomAccessFile(dctFilePath, "r");
- // åå
¸æ件ä¸ç¬¬ä¸ä¸ªæ±ååºç°çä½ç½®æ¯0ï¼æåä¸ä¸ªæ¯6768
+ // GB2312 characters 0 - 6768
for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) {
// if (i == 5231)
// System.out.println(i);
- dctFile.read(intBuffer);// åè¯åºæ件å¨cä¸å¼åï¼æ以åå
¥çæ件为little
- // endianç¼ç ï¼èjava为big endianï¼å¿
须转æ¢è¿æ¥
+ dctFile.read(intBuffer);
+ // the dictionary was developed for C, and byte order must be converted to work with Java
cnt = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN).getInt();
if (cnt <= 0) {
wordItem_charArrayTable[i] = null;
@@ -287,8 +293,8 @@
wordItem_frequencyTable[delimiterIndex] = null;
}
- /**
- * æ¬ç¨åºä¸åè¯æ§æ 注ï¼å æ¤å°ç¸åè¯ä¸åè¯æ§çé¢çå并å°åä¸ä¸ªè¯ä¸ï¼ä»¥åå°åå¨ç©ºé´ï¼å å¿«æç´¢é度
+ /*
+ * since we aren't doing POS-tagging, merge the frequencies for entries of the same word (with different POS)
*/
private void mergeSameWords() {
int i;
@@ -350,12 +356,9 @@
}
}
- /**
+ /*
* 计ç®å符cå¨åå¸è¡¨ä¸åºè¯¥å¨çä½ç½®ï¼ç¶åå°å°åå表ä¸è¯¥ä½ç½®çå¼åå§å
*
- * @param c
- * @param j
- * @return
*/
private boolean setTableIndex(char c, int j) {
int index = getAvaliableTableIndex(c);
@@ -390,10 +393,6 @@
return -1;
}
- /**
- * @param c
- * @return
- */
private short getWordItemTableIndex(char c) {
int hash1 = (int) (hash1(c) % PRIME_INDEX_LENGTH);
int hash2 = hash2(c) % PRIME_INDEX_LENGTH;
@@ -465,32 +464,33 @@
}
/**
- * charArrayè¿ä¸ªåè¯å¯¹åºçè¯ç»å¨ä¸å¨WordDictionaryä¸åºç°
+ * Returns true if the input word appears in the dictionary
*
- * @param charArray
- * @return true表示åå¨ï¼false表示ä¸åå¨
+ * @param charArray input word
+ * @return true if the word exists
*/
public boolean isExist(char[] charArray) {
return findInTable(charArray) != -1;
}
/**
- * @see{getPrefixMatch(char[] charArray, int knownStart)}
- * @param charArray
- * @return
+ * Find the first word in the dictionary that starts with the supplied prefix
+ *
+ * @see #getPrefixMatch(char[], int)
+ * @param charArray input prefix
+ * @return index of word, or -1 if not found
*/
public int getPrefixMatch(char[] charArray) {
return getPrefixMatch(charArray, 0);
}
/**
- * ä»è¯å
¸ä¸æ¥æ¾ä»¥charArray对åºçåè¯ä¸ºåç¼(prefix)çåè¯çä½ç½®, 并è¿å第ä¸ä¸ªæ»¡è¶³æ¡ä»¶çä½ç½®ã为äºåå°æ索代价,
- * å¯ä»¥æ ¹æ®å·²æç¥è¯è®¾ç½®èµ·å§æç´¢ä½ç½®, å¦æä¸ç¥éèµ·å§ä½ç½®ï¼é»è®¤æ¯0
+ * Find the nth word in the dictionary that starts with the supplied prefix
*
- * @see{getPrefixMatch(char[] charArray)}
- * @param charArray åç¼åè¯
- * @param knownStart å·²ç¥çèµ·å§ä½ç½®
- * @return 满足åç¼æ¡ä»¶ç第ä¸ä¸ªåè¯çä½ç½®
+ * @see #getPrefixMatch(char[])
+ * @param charArray input prefix
+ * @param knownStart relative position in the dictionary to start
+ * @return index of word, or -1 if not found
*/
public int getPrefixMatch(char[] charArray, int knownStart) {
short index = getWordItemTableIndex(charArray[0]);
@@ -521,11 +521,10 @@
}
/**
- * è·åidArray对åºçè¯çè¯é¢ï¼è¥pos为-1åè·åææè¯æ§çè¯é¢
+ * Get the frequency of a word from the dictionary
*
- * @param charArray è¾å
¥çåè¯å¯¹åºçcharArray
- * @param pos è¯æ§ï¼-1表示è¦æ±æ±åºææçè¯æ§çè¯é¢
- * @return idArray对åºçè¯é¢
+ * @param charArray input word
+ * @return word frequency, or zero if the word is not found
*/
public int getFrequency(char[] charArray) {
short hashIndex = getWordItemTableIndex(charArray[0]);
@@ -539,12 +538,11 @@
}
/**
- * å¤æcharArray对åºçå符串æ¯å¦è·è¯å
¸ä¸charArray[0]对åºçwordIndexçcharArrayç¸ç,
- * ä¹å°±æ¯è¯´charArrayçä½ç½®æ¥æ¾ç»ææ¯ä¸æ¯å°±æ¯wordIndex
+ * Return true if the dictionary entry at itemIndex for table charArray[0] is charArray
*
- * @param charArray è¾å
¥çcharArrayè¯ç»ï¼ç¬¬ä¸ä¸ªæ°è¡¨ç¤ºè¯å
¸ä¸çç´¢å¼å·
- * @param itemIndex ä½ç½®ç¼å·
- * @return æ¯å¦ç¸ç
+ * @param charArray input word
+ * @param itemIndex item index for table charArray[0]
+ * @return true if the entry exists
*/
public boolean isEqual(char[] charArray, int itemIndex) {
short hashIndex = getWordItemTableIndex(charArray[0]);
Modified: lucene/java/trunk/contrib/analyzers/src/resources/org/apache/lucene/analysis/cn/stopwords.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/resources/org/apache/lucene/analysis/cn/stopwords.txt?rev=790102&r1=790101&r2=790102&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/resources/org/apache/lucene/analysis/cn/stopwords.txt (original)
+++ lucene/java/trunk/contrib/analyzers/src/resources/org/apache/lucene/analysis/cn/stopwords.txt Wed Jul 1 10:32:23 2009
@@ -1,4 +1,4 @@
-////////// å°æ ç¹ç¬¦å·å
¨é¨å»æ ////////////////
+////////// Punctuation tokens to remove ////////////////
,
.
`
@@ -51,8 +51,8 @@
ï¼»
ï¼½
â
-ã//ä¸æç©ºæ ¼å符
+ã//IDEOGRAPHIC SPACE character (Used as a space in Chinese)
-//////////////// è±æåç¨è¯ ////////////////
+//////////////// English Stop Words ////////////////
-//////////////// ä¸æåç¨è¯ ////////////////
+//////////////// Chinese Stop Words ////////////////