Posted to java-commits@lucene.apache.org by si...@apache.org on 2009/07/01 12:32:24 UTC

svn commit: r790102 - in /lucene/java/trunk/contrib/analyzers/src: java/org/apache/lucene/analysis/cn/ java/org/apache/lucene/analysis/cn/smart/ java/org/apache/lucene/analysis/cn/smart/hhmm/ resources/org/apache/lucene/analysis/cn/

Author: simonw
Date: Wed Jul  1 10:32:23 2009
New Revision: 790102

URL: http://svn.apache.org/viewvc?rev=790102&view=rev
Log:
LUCENE-1722: SmartChineseAnalyzer JavaDoc improvements - Replacing Chinese JavaDoc with English version. Robert Muir via Simon Willnauer

Modified:
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/SmartChineseAnalyzer.java
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/package.html
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/AnalyzerProfile.java
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/CharType.java
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/Utility.java
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/WordSegmenter.java
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/WordTokenizer.java
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/WordType.java
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/AbstractDictionary.java
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BiSegGraph.java
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BigramDictionary.java
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/HHMMSegmenter.java
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/PathNode.java
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegGraph.java
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegToken.java
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegTokenFilter.java
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegTokenPair.java
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/WordDictionary.java
    lucene/java/trunk/contrib/analyzers/src/resources/org/apache/lucene/analysis/cn/stopwords.txt

Modified: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/SmartChineseAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/SmartChineseAnalyzer.java?rev=790102&r1=790101&r2=790102&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/SmartChineseAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/SmartChineseAnalyzer.java Wed Jul  1 10:32:23 2009
@@ -33,23 +33,26 @@
 import org.apache.lucene.analysis.cn.smart.WordSegmenter;
 import org.apache.lucene.analysis.cn.smart.WordTokenizer;
 
+import org.apache.lucene.analysis.cn.smart.AnalyzerProfile; // for javadoc
+
 /**
- * 
- * SmartChineseAnalyzer is an intelligent Chinese word segmentation module that uses probability to find the
- * optimal segmentation of a Chinese sentence, with an embedded English tokenizer so that mixed
- * Chinese-English text is handled effectively.
- * 
- * It is based on the Hidden Markov Model (HMM) from natural language processing, using a large training
- * corpus to estimate Chinese word frequencies and transition probabilities, and from these statistics
- * computing the most likely segmentation of the whole sentence.
- * 
- * Because intelligent segmentation needs a dictionary holding these statistics, SmartChineseAnalyzer
- * requires the dictionary location to be specified; for how to do so, see
- * org.apache.lucene.analysis.cn.smart.AnalyzerProfile
- * 
- * The algorithm and corpus dictionary of SmartChineseAnalyzer come from the ictclas1.0 project
- * (http://www.ictclas.org); the dictionary has been licensed by www.ictclas.org under the Apache License
- * v2 (APLv2), and users are welcome to use it under the terms of the APLv2.
- * Many thanks to www.ictclas.org and the staff of the ictclas segmentation software for their selfless contribution!
- * 
- * @see org.apache.lucene.analysis.cn.smart.AnalyzerProfile
- * 
+ * <p>
+ * SmartChineseAnalyzer is an analyzer for Chinese or mixed Chinese-English text.
+ * The analyzer uses probabilistic knowledge to find the optimal word segmentation for Simplified Chinese text.
+ * The text is first broken into sentences, then each sentence is segmented into words.
+ * </p>
+ * <p>
+ * Segmentation is based upon the <a href="http://en.wikipedia.org/wiki/Hidden_Markov_Model">Hidden Markov Model</a>. 
+ * A large training corpus was used to calculate Chinese word frequency probability.
+ * </p>
+ * <p>
+ * This analyzer requires a dictionary to provide statistical data. 
+ * To specify the location of the dictionary data, refer to {@link AnalyzerProfile}
+ * </p>
+ * <p>
+ * The included dictionary data is from <a href="http://www.ictclas.org">ICTCLAS1.0</a>.
+ * Thanks to ICTCLAS for their hard work, and for contributing the data under the Apache 2 License!
+ * </p>
  */
 public class SmartChineseAnalyzer extends Analyzer {
 
@@ -57,15 +60,23 @@
 
   private WordSegmenter wordSegment;
 
+  /**
+   * Create a new SmartChineseAnalyzer, using the default stopword list.
+   */
   public SmartChineseAnalyzer() {
     this(true);
   }
 
   /**
-   * SmartChineseAnalyzer has a built-in default stopword list, consisting mainly of punctuation. If you do not want punctuation in the results,
-   * set useDefaultStopWords to true; when useDefaultStopWords is false, no stopwords are used at all
+   * <p>
+   * Create a new SmartChineseAnalyzer, optionally using the default stopword list.
+   * </p>
+   * <p>
+   * The included default stopword list is simply a list of punctuation.
+   * If you do not use this list, punctuation will not be removed from the text!
+   * </p>
    * 
-   * @param useDefaultStopWords
+   * @param useDefaultStopWords true to use the default stopword list.
    */
   public SmartChineseAnalyzer(boolean useDefaultStopWords) {
     if (useDefaultStopWords) {
@@ -76,10 +87,14 @@
   }
 
   /**
-   * Use a custom stopword list instead of the built-in one; stopwords can be loaded with SmartChineseAnalyzer.loadStopWords(InputStream)
-   * 
-   * @param stopWords
-   * @see SmartChineseAnalyzer.loadStopWords(InputStream)
+   * <p>
+   * Create a new SmartChineseAnalyzer, using the provided {@link Set} of stopwords.
+   * </p>
+   * <p>
+   * Note: the set should include punctuation, unless you want to index punctuation!
+   * </p>
+   * @param stopWords {@link Set} of stopwords to use.
+   * @see SmartChineseAnalyzer#loadStopWords(InputStream)
    */
   public SmartChineseAnalyzer(Set stopWords) {
     this.stopWords = stopWords;
@@ -90,8 +105,8 @@
     TokenStream result = new SentenceTokenizer(reader);
     result = new WordTokenizer(result, wordSegment);
     // result = new LowerCaseFilter(result);
-    // LowerCaseFilter is no longer needed, because SegTokenFilter already converts all English characters to lowercase
-    // stemming is too strict; this is not a bug, this is a feature:)
+    // LowerCaseFilter is not needed, as SegTokenFilter lowercases Basic Latin text.
+    // The porter stemming is too strict, this is not a bug, this is a feature:)
     result = new PorterStemFilter(result);
     if (stopWords != null) {
       result = new StopFilter(result, stopWords, false);
@@ -100,13 +115,17 @@
   }
 
   /**
-   * Load stopwords from a stopword file: a plain UTF-8 encoded text file, one stopword per line, with "//" for comments. The stopwords include Chinese punctuation, the Chinese space,
-   * and words used so frequently that they carry little meaning for the index.
+   * Utility function to return a {@link Set} of stopwords from a UTF-8 encoded {@link InputStream}.
+   * The comment "//" can be used in the stopword list.
    * 
-   * @param input the stopword file
-   * @return a HashSet of the stopwords
+   * @param input {@link InputStream} of UTF-8 encoded stopwords
+   * @return {@link Set} of stopwords.
    */
   public static Set loadStopWords(InputStream input) {
+    /*
+     * Note: WordListLoader is not used here because this method allows for inline "//" comments.
+     * WordListLoader will only filter out these comments if they are on a separate line.
+     */
     String line;
     Set stopWords = new HashSet();
     try {

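For reference, a minimal usage sketch of the analyzer documented above, assuming the Lucene 2.4-era TokenStream API (next(Token) returns the next token or null; the demo class name is illustrative):

    import java.io.IOException;
    import java.io.StringReader;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.cn.SmartChineseAnalyzer;

    public class SmartChineseDemo {
      public static void main(String[] args) throws IOException {
        // true = filter with the built-in stopword list (punctuation)
        Analyzer analyzer = new SmartChineseAnalyzer(true);
        TokenStream stream = analyzer.tokenStream("content", new StringReader("我是中国人"));
        Token token = new Token();
        while ((token = stream.next(token)) != null) {
          System.out.println(token.term()); // expected terms: 我 是 中国 人
        }
      }
    }
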
Modified: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/package.html
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/package.html?rev=790102&r1=790101&r2=790102&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/package.html (original)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/package.html Wed Jul  1 10:32:23 2009
@@ -1,51 +1,22 @@
 <html>
 <head></head>
 <body>
-Analyzer for Chinese.
+Analyzers for Chinese.
+<p>
+Three analyzers are provided for Chinese, each of which treats Chinese text in a different way.
+<ul>
+	<li>ChineseAnalyzer: Index unigrams (individual Chinese characters) as tokens.
+	<li>CJKAnalyzer: Index bigrams (overlapping groups of two adjacent Chinese characters) as tokens.
+	<li>SmartChineseAnalyzer: Index words (attempt to segment Chinese text into words) as tokens.
+</ul>
 
-
-<h2>About SmartChineseAnalyzer</h2>
-<p>SmartChineseAnalyzer is an intelligent Chinese word segmentation module. Unlike ChineseAnalyzer (which splits out every Chinese character) and
-CJKAnalyzer (which combines every two adjacent Chinese characters), it uses probability to find the optimal segmentation of a Chinese sentence, with an embedded English tokenizer
-that handles mixed Chinese-English text effectively. Currently the SmartChineseAnalyzer dictionary only supports Simplified Chinese.</p>
-
-<p>It is based on the Hidden Markov Model (HMM) from natural language processing, using a large training corpus to estimate Chinese word frequencies and transition probabilities,
-and from these statistics computing the most likely segmentation of the whole sentence.</p>
-
-<p>A comparison of the segmentation results of the three modules shows that intelligent segmentation better matches the original meaning of the sentence, improving search precision.
-<pre>Sentence: 我是中国人</pre>
+Example phrase: "我是中国人"
 <ol>
-	<li>SmartChineseAnalyzer: 我-是-中国-人</li>
 	<li>ChineseAnalyzer: 我-是-中-国-人</li>
 	<li>CJKAnalyzer: 我是-是中-中国-国人</li>
+	<li>SmartChineseAnalyzer: 我-是-中国-人</li>
 </ol>
 </p>
 
-<h3>Configuring the segmentation dictionary</h3>
-<p>Intelligent segmentation needs a dictionary holding word statistics. By default, SmartChineseAnalyzer uses its built-in dictionary; to use a specific dictionary, its location must be specified, as described in
-org.apache.lucene.analysis.cn.smart.AnalyzerProfile.</p>
-
-<p><b>The dictionary data can be downloaded from: <a
-	href="http://code.google.com/p/imdict-chinese-analyzer/downloads/list">http://code.google.com/p/imdict-chinese-analyzer/downloads/list</a>
-</b> Download the file analysis-data.zip, save it locally, and unzip it; it is then ready for use.</p>
-
-<p>The simplest way to specify the dictionary location is the runtime parameter -Danalysis.data.dir
-<pre>e.g.: java -Danalysis.data.dir=/path/to/analysis-data com.example.YourApplication</pre>
-</p>
-
-<h3>Version requirements</h3>
-<p>SmartChineseAnalyzer requires Java 1.4 or later, and Lucene
-2.4.0 or later; Lucene 2.3.X should also work, but it is untested, and users who need it can test it themselves.</p>
-
-<h3>Source files and text encoding</h3>
-Apart from certain binary files, all text and Java source in SmartChineseAnalyzer is UTF-8 encoded,
-so take care to read the text and compile the Java source in the correct way, to avoid character-corruption errors.
-
-<h3>SmartChineseAnalyzer licensing</h3>
-<p>The algorithm and corpus dictionary of SmartChineseAnalyzer come from the ictclas1.0 project (<a
-	href="http://www.ictclas.org">http://www.ictclas.org</a>),
-whose dictionary has been released, with the permission of the copyright holder www.ictclas.org, under the Apache License
-v2 (APLv2). Users are welcome to use it under the terms of the APLv2.
-Many thanks to www.ictclas.org and the staff of the ictclas segmentation software for their hard work and selfless contribution!</p>
 </body>
 </html>

Modified: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/AnalyzerProfile.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/AnalyzerProfile.java?rev=790102&r1=790101&r2=790102&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/AnalyzerProfile.java (original)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/AnalyzerProfile.java Wed Jul  1 10:32:23 2009
@@ -23,38 +23,37 @@
 import java.util.Properties;
 
 /**
- * By default, SmartChineseAnalyzer ships with a built-in dictionary and a default stopword list, already packaged, so users can use it directly.
- * 
- * In the special case where a user needs a specific dictionary and stopword list, delete coredict.mem and bigramdict.mem
- * under org.apache.lucene.analysis.cn.smart.hhmm, then use AnalyzerProfile to specify the dictionary directory.
- * 
- * AnalyzerProfile locates the directory holding the segmentation dictionary data and the stopword data; that directory should contain bigramdict.dct, coredict.dct,
- * stopwords_utf8.txt. The lookup proceeds in the following order:
+ * Configure analysis data for SmartChineseAnalyzer
+ * <p>
+ * SmartChineseAnalyzer has a built-in dictionary and stopword list out of the box.
+ * </p>
+ * <p>
+ * In special circumstances a user may wish to configure SmartChineseAnalyzer with a custom data directory location.
+ * </p>
+ * AnalyzerProfile is used to determine the location of the data directory containing bigramdict.dct and coredict.dct.
+ * The following order is used to determine the location of the data directory:
  * 
  * <ol>
- * <li>Read the runtime system property: -Danalysis.data.dir=/path/to/analysis-data; if absent, continue to the next step</li>
- * <li>Check whether an analysis-data directory exists in the current working directory</li>
- * <li>Check whether an analysis-data directory exists in the lib/ directory under the working directory</li>
- * <li>Check whether an analysis.properties file exists in the current working directory</li>
- * <li>Check whether an analysis.properties file exists in the lib/ directory under the working directory</li>
+ * <li>System property: -Danalysis.data.dir=/path/to/analysis-data</li>
+ * <li>Relative path: analysis-data</li>
+ * <li>Relative path: lib/analysis-data</li>
+ * <li>Property file: analysis.data.dir property from relative path analysis.properties</li>
+ * <li>Property file: analysis.data.dir property from relative path lib/analysis.properties</li>
  * </ol>
  * 
- * In the analysis.properties file, the analysis.data.dir property indicates the location of the analysis-data directory.
- * Example analysis.properties content:
+ * Example property file:
  * 
  * <pre>
  * analysis.data.dir=D:/path/to/analysis-data/
  * </pre>
  * 
- * When the analysis-data directory cannot be found, ANALYSIS_DATA_DIR is set to "", so before use the data directory must be set explicitly in the program, for example:
- * 
- * <pre>
- * AnalyzerProfile.ANALYSIS_DATA_DIR = &quot;/path/to/analysis-data&quot;;
- * </pre>
  * 
  */
 public class AnalyzerProfile {
 
+  /**
+   * Global indicating the configured analysis data directory
+   */
   public static String ANALYSIS_DATA_DIR = "";
 
   static {
@@ -65,7 +64,7 @@
     String dirName = "analysis-data";
     String propName = "analysis.properties";
 
-    // Read the system setting, supplied at runtime as: -Danalysis.data.dir=/path/to/analysis-data
+    // Try the system property: -Danalysis.data.dir=/path/to/analysis-data
     ANALYSIS_DATA_DIR = System.getProperty("analysis.data.dir", "");
     if (ANALYSIS_DATA_DIR.length() != 0)
       return;
@@ -86,9 +85,9 @@
     }
 
     if (ANALYSIS_DATA_DIR.length() == 0) {
-      // Warn the user that the dictionary directory was not found
+      // Dictionary directory cannot be found.
       System.err
-          .println("WARNING: Can not found lexical dictionary directory!");
+          .println("WARNING: Can not find lexical dictionary directory!");
       System.err
           .println("WARNING: This will cause unpredictable exceptions in your application!");
       System.err

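A hedged sketch of configuring a custom data directory per the lookup order above (the demo class name is illustrative; note that AnalyzerProfile reads the system property in a static initializer, so it must be configured before that class first loads):

    import org.apache.lucene.analysis.cn.smart.AnalyzerProfile;

    public class ConfigureAnalysisData {
      public static void main(String[] args) {
        // Option 1: equivalent to java -Danalysis.data.dir=/path/to/analysis-data ...
        System.setProperty("analysis.data.dir", "/path/to/analysis-data");
        // Option 2: assign the global directly, before the dictionaries first load
        AnalyzerProfile.ANALYSIS_DATA_DIR = "/path/to/analysis-data";
        // ... construct SmartChineseAnalyzer afterwards, so WordDictionary and
        // BigramDictionary pick up the configured directory.
      }
    }
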
Modified: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/CharType.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/CharType.java?rev=790102&r1=790101&r2=790102&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/CharType.java (original)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/CharType.java Wed Jul  1 10:32:23 2009
@@ -17,23 +17,49 @@
 
 package org.apache.lucene.analysis.cn.smart;
 
+/**
+ * Internal SmartChineseAnalyzer character type constants.
+ */
 public class CharType {
 
+  /**
+   * Punctuation Characters
+   */
   public final static int DELIMITER = 0;
 
+  /**
+   * Letters
+   */
   public final static int LETTER = 1;
 
+  /**
+   * Numeric Digits
+   */
   public final static int DIGIT = 2;
 
+  /**
+   * Han Ideographs
+   */
   public final static int HANZI = 3;
 
+  /**
+   * Characters that act as a space
+   */
   public final static int SPACE_LIKE = 4;
 
-  // (full- and half-width) punctuation, half-width (letters, digits), Chinese characters, spaces, and space/newline characters such as "\t\r\n"
+  /**
+   * Full-Width letters
+   */
   public final static int FULLWIDTH_LETTER = 5;
 
-  public final static int FULLWIDTH_DIGIT = 6; // full-width characters, letters, digits
-
+  /**
+   * Full-Width alphanumeric characters
+   */
+  public final static int FULLWIDTH_DIGIT = 6;
+
+  /**
+   * Other (not fitting any of the other categories)
+   */
   public final static int OTHER = 7;
 
 }

Modified: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java?rev=790102&r1=790101&r2=790102&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java (original)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java Wed Jul  1 10:32:23 2009
@@ -25,14 +25,12 @@
 import org.apache.lucene.analysis.Tokenizer;
 
 /**
- * 
- * A Token containing a complete sentence, read from the input, which is the unit for the next segmentation step
- * 
+ * Tokenizes input into sentences.
  */
 public class SentenceTokenizer extends Tokenizer {
 
   /**
-   * Punctuation used to break sentences: 。,!?;,!?;
+   * End of sentence punctuation: 。,!?;,!?;
    */
   public final static String PUNCTION = "。,!?;,!?;";
 
@@ -62,7 +60,7 @@
       if (ci == -1) {
         break;
       } else if (PUNCTION.indexOf(ch) != -1) {
-        // Found the end of the sentence
+        // End of a sentence
         buffer.append(ch);
         tokenEnd++;
         break;
@@ -78,8 +76,7 @@
         pch = ch;
         ci = bufferInput.read();
         ch = (char) ci;
-        // If two consecutive skip characters are encountered, e.g. two carriage returns, two spaces,
-        // or a carriage return plus a space, treat it as the end of the sentence, so an overly long sentence does not exhaust memory
+        // Treat two consecutive space-like characters (such as CR, LF) as a sentence break
         if (Utility.SPACES.indexOf(ch) != -1
             && Utility.SPACES.indexOf(pch) != -1) {
           // buffer.append(ch);

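As a toy illustration of the sentence-breaking rule above (buffer characters, emit a sentence whenever a PUNCTION character is seen), a standalone sketch rather than the tokenizer's actual code:

    public class SentenceSplitSketch {
      public final static String PUNCTION = "。,!?;,!?;";

      public static void main(String[] args) {
        String text = "今天天气很好。我们去公园吧!好的。";
        StringBuffer buffer = new StringBuffer();
        for (int i = 0; i < text.length(); i++) {
          char ch = text.charAt(i);
          buffer.append(ch);
          if (PUNCTION.indexOf(ch) != -1) { // end of a sentence
            System.out.println(buffer.toString());
            buffer.setLength(0);
          }
        }
        if (buffer.length() > 0) // trailing text without closing punctuation
          System.out.println(buffer.toString());
      }
    }
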
Modified: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/Utility.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/Utility.java?rev=790102&r1=790101&r2=790102&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/Utility.java (original)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/Utility.java Wed Jul  1 10:32:23 2009
@@ -17,6 +17,12 @@
 
 package org.apache.lucene.analysis.cn.smart;
 
+import org.apache.lucene.analysis.cn.smart.hhmm.BiSegGraph; // for javadoc
+import org.apache.lucene.analysis.cn.smart.hhmm.SegTokenFilter; // for javadoc
+
+/**
+ * SmartChineseAnalyzer utility constants and methods
+ */
 public class Utility {
 
   public static final char[] STRING_CHAR_ARRAY = new String("未##串")
@@ -30,24 +36,29 @@
 
   public static final char[] END_CHAR_ARRAY = new String("末##末").toCharArray();
 
+  /**
+   * Delimiters will be filtered to this character by {@link SegTokenFilter}
+   */
   public static final char[] COMMON_DELIMITER = new char[] { ',' };
 
   /**
-   * Symbols that need to be skipped, such as tab, carriage return, newline, and so on.
+   * Space-like characters that need to be skipped, such as space, tab, newline, and carriage return.
    */
   public static final String SPACES = "  \t\r\n";
 
+  /**
+   * Maximum bigram frequency (used in the {@link BiSegGraph} smoothing function). 
+   */
   public static final int MAX_FREQUENCE = 2079997 + 80000;
 
   /**
-   * Compare two arrays element by element, starting from the given positions. If the elements are equal all the way and both arrays end together, the arrays are equal; otherwise the array that has not ended is greater than the one that has;
-   * if a differing element is found before either end is reached, the array with the larger value at that position is the greater one
+   * Compare two arrays starting at the specified offsets.
    * 
-   * @param larray
-   * @param lstartIndex start position in larray
-   * @param rarray
-   * @param rstartIndex start position in rarray
-   * @return 0 means equal, 1 means larray > rarray, -1 means larray < rarray
+   * @param larray left array
+   * @param lstartIndex start offset into larray
+   * @param rarray right array
+   * @param rstartIndex start offset into rarray
+   * @return 0 if the arrays are equal, 1 if larray > rarray, -1 if larray < rarray
    */
   public static int compareArray(char[] larray, int lstartIndex, char[] rarray,
       int rstartIndex) {
@@ -74,21 +85,19 @@
     }
     if (li == larray.length) {
       if (ri == rarray.length) {
-        // Equal all the way to the end of both, so return equal, i.e. 0
+        // Both arrays are equivalent, return 0.
         return 0;
       } else {
-        // ri > rarray.length is impossible here, so ri < rarray.length,
-        // meaning larray has ended but rarray has not, so larray < rarray; return -1
+        // larray < rarray because larray has ended first.
         return -1;
       }
     } else {
-      // li > larray.length is impossible here, so li < larray.length, meaning li has not reached the end of larray
+      // differing lengths
       if (ri == rarray.length) {
-        // larray has not ended but rarray has, so larray > rarray
+        // larray > rarray because rarray has ended first.
         return 1;
       } else {
-        // ri > rarray.length is impossible here, so ri < rarray.length,
-        // meaning neither larray nor rarray has ended; decide by the next element
+        // determine by comparison
         if (larray[li] > rarray[ri])
           return 1;
         else
@@ -98,18 +107,20 @@
   }
 
   /**
-   * Compare two character arrays by prefix: when the former is a prefix of the latter, they are considered equal; when it is not a prefix, compare as ordinary strings
+   * Compare two arrays, starting at the specified offsets, but treating shortArray as a prefix to longArray.
+   * As long as shortArray is a prefix of longArray, return 0.
+   * Otherwise, behave as {@link Utility#compareArray(char[], int, char[], int)}
    * 
-   * @param shortArray
-   * @param shortIndex
-   * @param longArray
-   * @param longIndex
-   * @return
+   * @param shortArray prefix array
+   * @param shortIndex offset into shortArray
+   * @param longArray long array (word)
+   * @param longIndex offset into longArray
+   * @return 0 if shortArray is a prefix of longArray, otherwise act as {@link Utility#compareArray(char[], int, char[], int)}
    */
   public static int compareArrayByPrefix(char[] shortArray, int shortIndex,
       char[] longArray, int longIndex) {
 
-    // An empty array is a prefix of every array, regardless of index
+    // a null prefix is a prefix of longArray
     if (shortArray == null)
       return 0;
     else if (longArray == null)
@@ -122,24 +133,27 @@
       li++;
     }
     if (si == shortArray.length) {
-      // shortArray is a prefix of longArray
+      // shortArray is a prefix of longArray
       return 0;
     } else {
-      // si > shortArray.length is impossible here, so si <
-      // shortArray.length, meaning si has not reached the end of shortArray
-
-      // shortArray has not ended but longArray has, so shortArray > longArray
+      // shortArray > longArray because longArray ended first.
       if (li == longArray.length)
         return 1;
       else
-        // li > longArray.length is impossible here, so li < longArray.length,
-        // meaning neither shortArray nor longArray has ended; decide by the next element
+        // determine by comparison
         return (shortArray[si] > longArray[li]) ? 1 : -1;
     }
   }
 
+  /**
+   * Return the internal {@link CharType} constant of a given character. 
+   * @param ch input character
+   * @return constant from {@link CharType} describing the character type.
+   * 
+   * @see CharType
+   */
   public static int getCharType(char ch) {
-    // Chinese characters are the most common
+    // Most (but not all!) of these are Han Ideographic Characters
     if (ch >= 0x4E00 && ch <= 0x9FA5)
       return CharType.HANZI;
     if ((ch >= 0x0041 && ch <= 0x005A) || (ch >= 0x0061 && ch <= 0x007A))
@@ -148,12 +162,12 @@
       return CharType.DIGIT;
     if (ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n' || ch == ' ')
       return CharType.SPACE_LIKE;
-    // The remaining characters at the front are all punctuation
+    // Punctuation Marks
     if ((ch >= 0x0021 && ch <= 0x00BB) || (ch >= 0x2010 && ch <= 0x2642)
         || (ch >= 0x3001 && ch <= 0x301E))
       return CharType.DELIMITER;
 
-    // Full-width character range
+    // Full-Width range
     if ((ch >= 0xFF21 && ch <= 0xFF3A) || (ch >= 0xFF41 && ch <= 0xFF5A))
       return CharType.FULLWIDTH_LETTER;
     if (ch >= 0xFF10 && ch <= 0xFF19)

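A small illustrative check of the character classification above (the demo class name is mine; CharType and Utility are the classes in this diff):

    import org.apache.lucene.analysis.cn.smart.CharType;
    import org.apache.lucene.analysis.cn.smart.Utility;

    public class CharTypeDemo {
      public static void main(String[] args) {
        // Han ideograph, Basic Latin letter, digit, and full-width letter, per the ranges above
        System.out.println(Utility.getCharType('中') == CharType.HANZI);            // true
        System.out.println(Utility.getCharType('a') == CharType.LETTER);           // true
        System.out.println(Utility.getCharType('7') == CharType.DIGIT);            // true
        System.out.println(Utility.getCharType('Ａ') == CharType.FULLWIDTH_LETTER); // true
      }
    }
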
Modified: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/WordSegmenter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/WordSegmenter.java?rev=790102&r1=790101&r2=790102&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/WordSegmenter.java (original)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/WordSegmenter.java Wed Jul  1 10:32:23 2009
@@ -25,6 +25,9 @@
 import org.apache.lucene.analysis.cn.smart.hhmm.SegToken;
 import org.apache.lucene.analysis.cn.smart.hhmm.SegTokenFilter;
 
+/**
+ * Segment a sentence of Chinese text into words.
+ */
 public class WordSegmenter {
 
   private HHMMSegmenter hhmmSegmenter = new HHMMSegmenter();
@@ -32,20 +35,19 @@
   private SegTokenFilter tokenFilter = new SegTokenFilter();
 
   /**
-   * Invoke the HHMMSegment program to segment the current sentence Token, returning the segmentation result in a Token List
+   * Segment a sentence into words with {@link HHMMSegmenter}
    * 
-   * @param sentenceToken the sentence Token
-   * @param shortPathCount number of pre-optimization shortest paths needed by the HHMM segmentation algorithm. Larger values generally give more precise segmentation, at a higher computational cost.
-   * @return Token List with the segmentation result
+   * @param sentenceToken sentence {@link Token}
+   * @return {@link List} of {@link SegToken}
    */
-  public List segmentSentence(Token sentenceToken, int shortPathCount) {
+  public List segmentSentence(Token sentenceToken) {
     String sentence = sentenceToken.term();
 
     List segTokenList = hhmmSegmenter.process(sentence);
 
     List result = new ArrayList();
 
-    // i runs from 1 to rawTokens.length-2, i.e. the two RawTokens "始##始" and "末##末" are dropped
+    // tokens from sentence, excluding WordType.SENTENCE_BEGIN and WordType.SENTENCE_END
     for (int i = 1; i < segTokenList.size() - 1; i++) {
       result.add(convertSegToken((SegToken) segTokenList.get(i), sentence,
           sentenceToken.startOffset(), "word"));
@@ -55,14 +57,13 @@
   }
 
   /**
+   * Convert a {@link SegToken} to a Lucene {@link Token}
    * 
-   * Convert the RawToken type into the Token type needed for indexing; since indexing needs the RawToken's content in the original sentence, the original sentence must be supplied for the conversion.
-   * 
-   * @param rt
-   * @param sentence the sentence content needed for the conversion
-   * @param sentenceStartOffset the start position of sentence within the document
-   * @param type the token type, which should normally be word
-   * @return
+   * @param st input {@link SegToken}
+   * @param sentence associated Sentence
+   * @param sentenceStartOffset offset into sentence
+   * @param type token type, default is word
+   * @return Lucene {@link Token}
    */
   public Token convertSegToken(SegToken st, String sentence,
       int sentenceStartOffset, String type) {

Modified: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/WordTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/WordTokenizer.java?rev=790102&r1=790101&r2=790102&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/WordTokenizer.java (original)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/WordTokenizer.java Wed Jul  1 10:32:23 2009
@@ -25,11 +25,11 @@
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 
+/**
+ * A {@link Tokenizer} that breaks sentences into words.
+ */
 public class WordTokenizer extends Tokenizer {
 
-  /**
-   * The main segmentation program, loaded when WordTokenizer is initialized.
-   */
   private WordSegmenter wordSegmenter;
 
   private TokenStream in;
@@ -41,13 +41,10 @@
   private Token sentenceToken = new Token();
 
   /**
-   * By design, this is the processing layer after SentenceTokenizer: it reads each sentence from SentenceTokenizer,
-   * segments it with the HHMMSegment main program, and returns the segmentation result.
+   * Construct a new WordTokenizer.
    * 
-   * @param in the sentence Token
-   * @param smooth the smoothing function
-   * @param dataPath the directory from which the core dictionary and bigram dictionary are loaded
-   * @see init()
+   * @param in {@link TokenStream} of sentences
+   * @param wordSegmenter {@link WordSegmenter} to break sentences into words 
    */
   public WordTokenizer(TokenStream in, WordSegmenter wordSegmenter) {
     this.in = in;
@@ -66,17 +63,16 @@
   }
 
   /**
-   * When the current sentence has been fully segmented and indexed, the next sentence Token must be read; this function calls the upstream SentenceTokenizer to load the next sentence, segments it,
-   * and stores the segmentation result as Tokens in tokenBuffer
+   * Process the next input sentence, placing tokens into tokenBuffer
    * 
-   * @return whether the next sentence was read and processed successfully; if not, the input is finished and there are no more Tokens
+   * @return true if more tokens were placed into tokenBuffer.
    * @throws IOException
    */
   private boolean processNextSentence() throws IOException {
     sentenceToken = in.next(sentenceToken);
     if (sentenceToken == null)
       return false;
-    tokenBuffer = wordSegmenter.segmentSentence(sentenceToken, 1);
+    tokenBuffer = wordSegmenter.segmentSentence(sentenceToken);
     tokenIter = tokenBuffer.iterator();
     return tokenBuffer != null && tokenIter.hasNext();
   }

Modified: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/WordType.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/WordType.java?rev=790102&r1=790101&r2=790102&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/WordType.java (original)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/WordType.java Wed Jul  1 10:32:23 2009
@@ -17,22 +17,49 @@
 
 package org.apache.lucene.analysis.cn.smart;
 
+/**
+ * Internal SmartChineseAnalyzer token type constants
+ */
 public class WordType {
 
+  /**
+   * Start of a Sentence
+   */
   public final static int SENTENCE_BEGIN = 0;
 
-  public final static int SENTENCE_END = 1;// start and end of a sentence
-
-  public final static int CHINESE_WORD = 2;// Chinese word
-
+  /**
+   * End of a Sentence
+   */
+  public final static int SENTENCE_END = 1;
+
+  /**
+   * Chinese Word 
+   */
+  public final static int CHINESE_WORD = 2;
+
+  /**
+   * ASCII String
+   */
   public final static int STRING = 3;
 
-  public final static int NUMBER = 4; // ASCII strings and numbers
-
-  public final static int DELIMITER = 5; // all punctuation
-
+  /**
+   * ASCII Alphanumeric 
+   */
+  public final static int NUMBER = 4;
+
+  /**
+   * Punctuation Symbol
+   */
+  public final static int DELIMITER = 5;
+
+  /**
+   * Full-Width String
+   */
   public final static int FULLWIDTH_STRING = 6;
 
-  public final static int FULLWIDTH_NUMBER = 7;// strings containing full-width characters, and numbers containing full-width digits
+  /**
+   * Full-Width Alphanumeric
+   */
+  public final static int FULLWIDTH_NUMBER = 7;
 
 }

Modified: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/AbstractDictionary.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/AbstractDictionary.java?rev=790102&r1=790101&r2=790102&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/AbstractDictionary.java (original)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/AbstractDictionary.java Wed Jul  1 10:32:23 2009
@@ -19,19 +19,29 @@
 
 import java.io.UnsupportedEncodingException;
 
+/**
+ * <p>
+ * SmartChineseAnalyzer abstract dictionary implementation.
+ * </p>
+ * <p>
+ * Contains methods for dealing with GB2312 encoding.
+ * </p>
+ */
 public abstract class AbstractDictionary {
   /**
-   * The first Chinese character is "啊"; 15 rows (15*94 characters) precede it
+   * First Chinese Character in GB2312 (15 * 94)
+   * Characters in GB2312 are arranged in a 94 * 94 grid; rows 0-14 are unassigned or punctuation.
    */
   public static final int GB2312_FIRST_CHAR = 1410;
 
   /**
-   * Only rows 01~87 of the GB2312 character set can contain valid characters, 8178 in total
+   * Last Chinese Character in GB2312 (87 * 94). 
+   * Characters in GB2312 are arranged in a 94 * 94 grid; rows 88-94 are unassigned.
    */
   public static final int GB2312_CHAR_NUM = 87 * 94;
 
   /**
-   * The dictionary file contains frequency statistics for 6768 Chinese characters
+   * Dictionary data contains 6768 Chinese characters with frequency statistics.
    */
   public static final int CHAR_NUM_IN_FILE = 6768;
 
@@ -45,33 +55,33 @@
   // B0F0 梆 榜 膀 绑 棒 磅 蚌 镑 傍 谤 苞 胞 包 褒 剥
   // =====================================================
   //
-  // Layout of the GB2312 character set by row:
-  // Row  Count  Character category
-  // 01 94 General symbols
-  // 02 72 Sequence numbers
-  // 03 94 Latin letters
-  // 04 83 Hiragana
+  // GB2312 character set:
+  // 01 94 Symbols
+  // 02 72 Numbers
+  // 03 94 Latin
+  // 04 83 Kana
   // 05 86 Katakana
-  // 06 48 Greek letters
-  // 07 66 Cyrillic letters
-  // 08 63 Pinyin symbols
-  // 09 76 Drawing symbols
-  // 10-15 Reserved
-  // 16-55 3755 Level-1 Chinese characters, in pinyin order
-  // 56-87 3008 Level-2 Chinese characters, in stroke order
-  // 88-94 Reserved
+  // 06 48 Greek
+  // 07 66 Cyrillic
+  // 08 63 Phonetic Symbols
+  // 09 76 Drawing Symbols
+  // 10-15 Unassigned
+  // 16-55 3755 Plane 1, in pinyin order
+  // 56-87 3008 Plane 2, in radical/stroke order
+  // 88-94 Unassigned
   // ======================================================
 
   /**
-   * GB2312 contains 7445 characters in total: 6763 simplified Chinese characters plus 682 letters and symbols.
+   * <p>
+   * Transcode from GB2312 ID to Unicode
+   * </p>
+   * <p>
+   * GB2312 is divided into a 94 * 94 grid, containing 7445 characters consisting of 6763 Chinese characters and 682 symbols.
+   * Some regions are unassigned (reserved).
+   * </p>
    * 
-   * GB2312 divides its characters into 94 rows, numbered 01 to 94; each row holds 94 characters, numbered position 01 to 94.
-   * Position 01 starts at 0xA1 and position 94 is at 0xFE. Every GB2312 character is uniquely determined by its row and position; for example, the character "啊" is row 16, position
-   * 01.
-   */
-  /**
-   * @param ccid
-   * @return
+   * @param ccid GB2312 id
+   * @return Unicode String
    */
   public String getCCByGB2312Id(int ccid) {
     if (ccid < 0 || ccid > WordDictionary.GB2312_CHAR_NUM)
@@ -90,16 +100,16 @@
   }
 
   /**
-   * For an input Unicode character, get its GB2312 encoding or ASCII encoding,
+   * Transcode from Unicode to GB2312
    * 
-   * @param ch an input GB2312 Chinese character or one of the 128 ASCII characters
-   * @return the position of ch in GB2312, with -1 meaning the character is not recognized
+   * @param ch input character in Unicode, or character in Basic Latin range.
+   * @return position in GB2312
    */
   public short getGB2312Id(char ch) {
     try {
       byte[] buffer = Character.toString(ch).getBytes("GB2312");
       if (buffer.length != 2) {
-        // Normally the buffer should be two bytes; otherwise ch is not in the GB2312 encoding, so return '?', meaning the character is not recognized
+        // Should be a two-byte character
         return -1;
       }
       int b0 = (int) (buffer[0] & 0x0FF) - 161; // the encoding starts at 0xA1, so subtract 0xA1 = 161
@@ -112,12 +122,10 @@
   }
 
   /**
-   * Improved 32-bit FNV hash algorithm, used as the first hash function in this program. The first and second hash functions jointly compute the hash table, keeping it evenly distributed
-   * and avoiding the long computations caused by an overly dense hash table
+   * 32-bit FNV Hash Function
    * 
-   * @param c the Unicode character to hash
-   * @return the hash value of c
-   * @see Utility.hash2()
+   * @param c input character
+   * @return hashcode
    */
   public long hash1(char c) {
     final long p = 1099511628211L;
@@ -133,9 +141,10 @@
   }
 
   /**
-   * @see Utility.hash1(char[])
-   * @param carray
-   * @return
+   * 32-bit FNV Hash Function
+   * 
+   * @param carray character array
+   * @return hashcode
    */
   public long hash1(char carray[]) {
     final long p = 1099511628211L;
@@ -155,16 +164,14 @@
   }
 
   /**
-   * djb2 hash algorithm, used as the second hash function in this program
-   * 
    * djb2 hash algorithm; this algorithm (k=33) was first reported by dan
    * bernstein many years ago in comp.lang.c. another version of this algorithm
    * (now favored by bernstein) uses xor: hash(i) = hash(i - 1) * 33 ^ str[i];
    * the magic of number 33 (why it works better than many other constants,
    * prime or not) has never been adequately explained.
    * 
-   * @param c
-   * @return
+   * @param c character
+   * @return hashcode
    */
   public int hash2(char c) {
     int hash = 5381;
@@ -177,9 +184,14 @@
   }
 
   /**
-   * @see Utility.hash2(char[])
-   * @param carray
-   * @return
+   * djb2 hash algorithm; this algorithm (k=33) was first reported by dan
+   * bernstein many years ago in comp.lang.c. another version of this algorithm
+   * (now favored by bernstein) uses xor: hash(i) = hash(i - 1) * 33 ^ str[i];
+   * the magic of number 33 (why it works better than many other constants,
+   * prime or not) has never been adequately explained.
+   * 
+   * @param carray character array
+   * @return hashcode
    */
   public int hash2(char carray[]) {
     int hash = 5381;

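To make the row/position arithmetic above concrete, a hedged, self-contained sketch of the GB2312 transcoding the two methods describe (class and method names are illustrative; the real implementations are getCCByGB2312Id and getGB2312Id above):

    import java.io.UnsupportedEncodingException;

    public class GB2312Sketch {
      // ccid = (row - 1) * 94 + (position - 1); bytes are row + 0xA0 and position + 0xA0
      static String charForId(int ccid) throws UnsupportedEncodingException {
        int row = ccid / 94 + 1, pos = ccid % 94 + 1;
        byte[] buffer = { (byte) (row + 0xA0), (byte) (pos + 0xA0) };
        return new String(buffer, "GB2312");
      }

      static int idForChar(char ch) throws UnsupportedEncodingException {
        byte[] buffer = Character.toString(ch).getBytes("GB2312");
        if (buffer.length != 2) return -1; // not a two-byte GB2312 character
        int row = (buffer[0] & 0xFF) - 0xA0, pos = (buffer[1] & 0xFF) - 0xA0;
        return (row - 1) * 94 + (pos - 1);
      }

      public static void main(String[] args) throws UnsupportedEncodingException {
        System.out.println(charForId(1410)); // GB2312_FIRST_CHAR -> 啊 (row 16, position 01)
        System.out.println(idForChar('啊')); // 1410
      }
    }
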
Modified: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BiSegGraph.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BiSegGraph.java?rev=790102&r1=790101&r2=790102&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BiSegGraph.java (original)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BiSegGraph.java Wed Jul  1 10:32:23 2009
@@ -26,6 +26,12 @@
 
 import org.apache.lucene.analysis.cn.smart.Utility;
 
+/**
+ * Graph representing possible token pairs (bigrams) at each start offset in the sentence.
+ * <p>
+ * For each start offset, a list of possible token pairs is stored.
+ * </p>
+ */
 public class BiSegGraph {
 
   private Map tokenPairListTable = new HashMap();
@@ -39,15 +45,8 @@
     generateBiSegGraph(segGraph);
   }
 
-  /**
-   * Generate the bigram graph between adjacent words, storing the result in a MultiTokenPairMap
-   * 
-   * @param segGraph the list of all Tokens
-   * @param smooth the smoothing factor
-   * @param biDict the bigram dictionary
-   * @return
-   * 
-   * @see MultiTokenPairMap
+  /*
+   * Generate a BiSegGraph based upon a SegGraph
    */
   private void generateBiSegGraph(SegGraph segGraph) {
     double smooth = 0.1;
@@ -57,7 +56,7 @@
 
     int next;
     char[] idBuffer;
-    // assign an index to every element in segGraph
+    // get the list of tokens ordered and indexed
     segTokenList = segGraph.makeIndex();
     // since the startToken ("始##始") has start position -1, the startToken can be retrieved with key -1
     int key = -1;
@@ -119,31 +118,29 @@
   }
 
   /**
-   * Check whether a SegTokenPair ending at to (i.e. with SegTokenPair.to equal to to) exists;
-   * if not, there is no SegTokenPair at to, or none has been added yet
+   * Returns true if there is a list of token pairs at this offset (index of the second token)
    * 
-   * @param to SegTokenPair.to
-   * @return
+   * @param to index of the second token in the token pair
+   * @return true if a token pair exists
    */
   public boolean isToExist(int to) {
     return tokenPairListTable.get(new Integer(to)) != null;
   }
 
   /**
-   * Get all SegTokenPairs whose SegTokenPair.to equals to, or null if there are none
+   * Return a {@link List} of all token pairs at this offset (index of the second token)
    * 
-   * @param to
-   * @return the sequence of SegTokenPairs sharing the same SegTokenPair.to
+   * @param to index of the second token in the token pair
+   * @return {@link List} of token pairs.
    */
   public List getToList(int to) {
     return (List) tokenPairListTable.get(new Integer(to));
   }
 
   /**
-   * Add a SegTokenPair to the BiSegGraph; SegTokenPairs with the same SegTokenPair.to
-   * are kept in the same ArrayList
+   * Add a {@link SegTokenPair}
    * 
-   * @param tokenPair
+   * @param tokenPair {@link SegTokenPair}
    */
   public void addSegTokenPair(SegTokenPair tokenPair) {
     int to = tokenPair.to;
@@ -158,16 +155,16 @@
   }
 
   /**
-   * @return the number of TokenPair columns, i.e. the number of distinct column indexes among the TokenPairs in the Map.
+   * Get the number of {@link SegTokenPair} entries in the table.
+   * @return number of {@link SegTokenPair} entries
    */
   public int getToCount() {
     return tokenPairListTable.size();
   }
 
   /**
-   * Compute the shortest path from start to end with the Viterbi algorithm
-   * 
-   * @return
+   * Find the shortest path with the Viterbi algorithm.
+   * @return {@link List}
    */
   public List getShortPath() {
     int current;
@@ -198,7 +195,7 @@
       path.add(newNode);
     }
 
-    // Next, compute the actual start-to-end path from nodePaths
+    // Calculate PathNodes
     int preNode, lastNode;
     lastNode = path.size() - 1;
     current = lastNode;

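A hedged sketch of the Viterbi-style shortest path that getShortPath computes, reduced to its dynamic-programming core (the node/edge representation here is illustrative, not BiSegGraph's actual internals): nodes are assumed indexed in topological order, and each node keeps its best predecessor, which is then backtracked.

    import java.util.Arrays;
    import java.util.LinkedList;
    import java.util.List;

    public class ViterbiSketch {
      // edgeWeight[from][to] < 0 means "no edge"; node 0 is the start, nodeCount-1 the end
      public static List shortestPath(int nodeCount, double[][] edgeWeight) {
        double[] best = new double[nodeCount];
        int[] pre = new int[nodeCount];
        Arrays.fill(best, Double.POSITIVE_INFINITY);
        best[0] = 0.0;
        for (int to = 1; to < nodeCount; to++) {
          for (int from = 0; from < to; from++) {
            if (edgeWeight[from][to] >= 0 && best[from] + edgeWeight[from][to] < best[to]) {
              best[to] = best[from] + edgeWeight[from][to]; // relax the edge
              pre[to] = from;                               // remember the best predecessor
            }
          }
        }
        LinkedList path = new LinkedList();                  // backtrack from the end node
        for (int n = nodeCount - 1; n != 0; n = pre[n])
          path.addFirst(new Integer(n));
        path.addFirst(new Integer(0));
        return path;
      }
    }
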
Modified: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BigramDictionary.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BigramDictionary.java?rev=790102&r1=790101&r2=790102&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BigramDictionary.java (original)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BigramDictionary.java Wed Jul  1 10:32:23 2009
@@ -32,6 +32,9 @@
 
 import org.apache.lucene.analysis.cn.smart.AnalyzerProfile;
 
+/**
+ * SmartChineseAnalyzer Bigram dictionary.
+ */
 public class BigramDictionary extends AbstractDictionary {
 
   private BigramDictionary() {
@@ -43,12 +46,8 @@
 
   public static final int PRIME_BIGRAM_LENGTH = 402137;
 
-  /**
-   * bigramTable stores the transition frequencies between words; bigramHashTable and frequencyTable
-   * are the data structures holding these frequencies. To speed up lookups and save memory, a hash value replaces the word association as the lookup key; a word association is
-   * (formWord+'@'+toWord), and the FNV1 hash algorithm computes the association's hash value, which is stored in bigramHashTable.
-   * Replacing associations with hash values can produce collisions with very small probability, but the 64-bit long
-   * hash value keeps that probability extremely low. bigramHashTable[i] corresponds one-to-one with frequencyTable[i]
+  /*
+   * The word associations are stored as FNV1 hashcodes, which have a small probability of collision, but save memory.  
    */
   private long[] bigramHashTable;
 
@@ -128,7 +127,7 @@
         bigramHashTable = new long[PRIME_BIGRAM_LENGTH];
         frequencyTable = new int[PRIME_BIGRAM_LENGTH];
         for (int i = 0; i < PRIME_BIGRAM_LENGTH; i++) {
-          // Using 0 as the initial value is actually slightly problematic, since some string might hash to 0, but the probability is so small that the impact is negligible
+          // it is possible for a value to hash to 0, but the probability is extremely low
           bigramHashTable[i] = 0;
           frequencyTable[i] = 0;
         }
@@ -141,10 +140,9 @@
   }
 
   /**
-   * Load the dictionary file into the relevant WordDictionary data structures; this only loads, without any merging or modification
+   * Load the datafile into this BigramDictionary
    * 
-   * @param dctFilePath
-   * @return
+   * @param dctFilePath path to the Bigramdictionary (bigramdict.mem)
    * @throws FileNotFoundException
    * @throws IOException
    * @throws UnsupportedEncodingException
@@ -159,14 +157,14 @@
     String tmpword;
     RandomAccessFile dctFile = new RandomAccessFile(dctFilePath, "r");
 
-    // The first Chinese character in the dictionary file is at position 0, the last at 6768
+    // GB2312 characters 0 - 6768
     for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) {
       String currentStr = getCCByGB2312Id(i);
       // if (i == 5231)
       // System.out.println(i);
 
-      dctFile.read(intBuffer);// the original dictionary file was developed in C, so it was written in little-
-      // endian byte order, while Java is big endian; it must be converted
+      dctFile.read(intBuffer);
+      // the dictionary was developed for C, and byte order must be converted to work with Java
       cnt = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN).getInt();
       if (cnt <= 0) {
         continue;
@@ -272,9 +270,8 @@
       return -1;
   }
 
-  /**
-   * @param c
-   * @return
+  /*
+   * Look up the index into the frequency array.
    */
   private int getBigramItemIndex(char carray[]) {
     long hashId = hash1(carray);

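A hedged sketch of the FNV1 hashing of word associations described above. The (formWord+'@'+toWord) key format is from the removed javadoc; the offset basis below is the standard 64-bit FNV-1 one, and whether hash1 uses the same basis is an assumption:

    public class FnvSketch {
      // 64-bit FNV-1: hash = (hash * prime) ^ byte, starting from the offset basis
      public static long fnv1(char[] carray) {
        final long prime = 1099511628211L;        // FNV prime, as in AbstractDictionary.hash1
        long hash = 0xcbf29ce484222325L;          // standard FNV-1 offset basis (assumed)
        for (int i = 0; i < carray.length; i++) {
          hash *= prime;
          hash ^= carray[i];
        }
        return hash;
      }

      public static void main(String[] args) {
        // key for the association between 中国 and 人, per the removed javadoc
        System.out.println(fnv1("中国@人".toCharArray()));
      }
    }
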
Modified: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/HHMMSegmenter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/HHMMSegmenter.java?rev=790102&r1=790101&r2=790102&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/HHMMSegmenter.java (original)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/HHMMSegmenter.java Wed Jul  1 10:32:23 2009
@@ -23,18 +23,18 @@
 import org.apache.lucene.analysis.cn.smart.Utility;
 import org.apache.lucene.analysis.cn.smart.WordType;
 
+/**
+ * Finds the optimal segmentation of a sentence into Chinese words
+ */
 public class HHMMSegmenter {
 
   private static WordDictionary wordDict = WordDictionary.getInstance();
 
   /**
-   * Find all possible Tokens in the sentence, then add the two special Tokens "始##始" and
-   * "末##末"; the "始##始" Token starts at position -1 and the "末##末" Token starts at the sentence length
+   * Create the {@link SegGraph} for a sentence.
    * 
-   * @param sentence the input sentence, without "始##始", "末##末", etc.
-   * @param coreDict the core dictionary
-   * @return all possible Tokens
-   * @see MultiTokenMap
+   * @param sentence input sentence, without start and end markers
+   * @return {@link SegGraph} corresponding to the input sentence.
    */
   private SegGraph createSegGraph(String sentence) {
     int i = 0, j;
@@ -168,16 +168,16 @@
   }
 
   /**
-   * Determine a unique character type for every character in the sentence
+   * Get the character types for every character in a sentence.
    * 
    * @see Utility#getCharType(char)
-   * @param sentence the complete input sentence
-   * @return the array of character types; if the input is null, the return value is also null
+   * @param sentence input sentence
+   * @return array of character types corresponding to character positions in the sentence
    */
   private static int[] getCharTypes(String sentence) {
     int length = sentence.length();
     int[] charTypeArray = new int[length];
-    // build the character type array, one entry per character
+    // the type of each character by position
     for (int i = 0; i < length; i++) {
       charTypeArray[i] = Utility.getCharType(sentence.charAt(i));
     }
@@ -185,6 +185,11 @@
     return charTypeArray;
   }
 
+  /**
+   * Return a list of {@link SegToken} representing the best segmentation of a sentence
+   * @param sentence input sentence
+   * @return best segmentation as a {@link List}
+   */
   public List process(String sentence) {
     SegGraph segGraph = createSegGraph(sentence);
     BiSegGraph biSegGraph = new BiSegGraph(segGraph);

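A hedged sketch of driving the segmenter directly (the dictionary data must be locatable as described in AnalyzerProfile; the demo class name is mine, the rest matches the signatures in this diff):

    import java.util.List;
    import org.apache.lucene.analysis.cn.smart.hhmm.HHMMSegmenter;
    import org.apache.lucene.analysis.cn.smart.hhmm.SegToken;

    public class SegmentSketch {
      public static void main(String[] args) {
        HHMMSegmenter segmenter = new HHMMSegmenter();
        // the returned list includes the sentence start/end marker tokens
        List segTokenList = segmenter.process("我是中国人");
        for (int i = 0; i < segTokenList.size(); i++) {
          SegToken token = (SegToken) segTokenList.get(i);
          System.out.println(new String(token.charArray));
        }
      }
    }
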
Modified: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/PathNode.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/PathNode.java?rev=790102&r1=790101&r2=790102&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/PathNode.java (original)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/PathNode.java Wed Jul  1 10:32:23 2009
@@ -17,6 +17,12 @@
 
 package org.apache.lucene.analysis.cn.smart.hhmm;
 
+/**
+ * SmartChineseAnalyzer internal node representation
+ * <p>
+ * Used by {@link BiSegGraph} to maximize the segmentation with the Viterbi algorithm.
+ * </p>
+ */
 public class PathNode implements Comparable {
   public double weight;
 

Modified: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegGraph.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegGraph.java?rev=790102&r1=790101&r2=790102&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegGraph.java (original)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegGraph.java Wed Jul  1 10:32:23 2009
@@ -23,42 +23,53 @@
 import java.util.List;
 import java.util.Map;
 
+/**
+ * Graph representing possible tokens at each start offset in the sentence.
+ * <p>
+ * For each start offset, a list of possible tokens is stored.
+ * </p>
+ */
 public class SegGraph {
 
   /**
-   * An ArrayList records the Tokens sharing the same startOffset; that startOffset is the Tokens' key
+   * Map of start offsets to ArrayList of tokens at that position
    */
-  private Map tokenListTable = new HashMap();
+  private Map /* <Integer, ArrayList<SegToken>> */ tokenListTable = new HashMap();
 
   private int maxStart = -1;
 
   /**
-   * Check whether a Token with startOffset s exists; if not, there is no Token at s, or none has been added yet
+   * Returns true if a mapping for the specified start offset exists
    * 
    * @param s startOffset
-   * @return
+   * @return true if there are tokens for the startOffset
    */
   public boolean isStartExist(int s) {
     return tokenListTable.get(new Integer(s)) != null;
   }
 
   /**
-   * Get all Tokens with startOffset s, or null if there are none
+   * Get the list of tokens at the specified start offset
    * 
-   * @param s
-   * @return the sequence of Tokens sharing the same startOffset
+   * @param s startOffset
+   * @return List of tokens at the specified start offset.
    */
   public List getStartList(int s) {
     return (List) tokenListTable.get(new Integer(s));
   }
 
+  /**
+   * Get the highest start offset in the map
+   * 
+   * @return maximum start offset, or -1 if the map is empty.
+   */
   public int getMaxStart() {
     return maxStart;
   }
 
   /**
-   * Generate a uniform index for all Tokens in the SegGraph, starting from 0
-   * and ordered by increasing startOffset; Tokens sharing a startOffset keep their insertion order
+   * Set the {@link SegToken#index} for each token, based upon its order by startOffset. 
+   * @return a {@link List} of these ordered tokens.
    */
   public List makeIndex() {
     List result = new ArrayList();
@@ -82,9 +93,8 @@
   }
 
   /**
-   * Add a Token to the Map; Tokens sharing the same startOffset are kept in the same list,
-   * 
-   * @param token
+   * Add a {@link SegToken} to the mapping, creating a new mapping at the token's startOffset if one does not exist. 
+   * @param token {@link SegToken}
    */
   public void addToken(SegToken token) {
     int s = token.startOffset;
@@ -101,18 +111,18 @@
   }
 
   /**
-   * Get the number of distinct start positions of Tokens in the SegGraph; each start position may hold several Tokens, so the number of positions differs from the number of Tokens
-   * 
-   * @return
+   * Get the number of startOffsets.
+   *
+   * @return number of startOffsets in the mapping
    */
   public int getStartCount() {
     return tokenListTable.size();
   }
 
   /**
-   * Assemble all Tokens stored in the Map into one list, ordered by ascending start position
+   * Return a {@link List} of all tokens in the map, ordered by startOffset.
    * 
-   * @return
+   * @return {@link List} of all tokens in the map.
    */
   public List toTokenList() {
     List result = new ArrayList();

Modified: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegToken.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegToken.java?rev=790102&r1=790101&r2=790102&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegToken.java (original)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegToken.java Wed Jul  1 10:32:23 2009
@@ -17,6 +17,9 @@
 
 package org.apache.lucene.analysis.cn.smart.hhmm;
 
+/**
+ * SmartChineseAnalyzer internal token
+ */
 public class SegToken {
   public char[] charArray;
 
@@ -51,13 +54,6 @@
   // + endOffset + ")/w(" + weight + ")t(" + wordType + ")";
   // }
 
-  /**
-   * Two Tokens are equal if and only if their start positions are equal, since then their content in the original sentence is the same,
-   * while pos and weight may each have several dictionary entries, representable one-to-many, so a single Token suffices
-   * 
-   * @param t
-   * @return
-   */
   // public boolean equals(RawToken t) {
   // return this.startOffset == t.startOffset
   // && this.endOffset == t.endOffset;

Modified: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegTokenFilter.java?rev=790102&r1=790101&r2=790102&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegTokenFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegTokenFilter.java Wed Jul  1 10:32:23 2009
@@ -20,27 +20,43 @@
 import org.apache.lucene.analysis.cn.smart.Utility;
 import org.apache.lucene.analysis.cn.smart.WordType;
 
+/**
+ * <p>
+ * Filters a {@link SegToken} by converting full-width latin to half-width, then lowercasing latin.
+ * Additionally, all punctuation is converted into {@link Utility#COMMON_DELIMITER}
+ * </p>
+ */
 public class SegTokenFilter {
 
+  /**
+   * Filter an input {@link SegToken}
+   * <p>
+   * Full-width latin will be converted to half-width, then all latin will be lowercased.
+   * All punctuation is converted into {@link Utility#COMMON_DELIMITER}
+   * </p>
+   * 
+   * @param token input {@link SegToken}
+   * @return normalized {@link SegToken}
+   */
   public SegToken filter(SegToken token) {
     switch (token.wordType) {
       case WordType.FULLWIDTH_NUMBER:
-      case WordType.FULLWIDTH_STRING:
+      case WordType.FULLWIDTH_STRING: /* first convert full-width -> half-width */
         for (int i = 0; i < token.charArray.length; i++) {
           if (token.charArray[i] >= 0xFF10)
             token.charArray[i] -= 0xFEE0;
 
-          if (token.charArray[i] >= 0x0041 && token.charArray[i] <= 0x005A)
+          if (token.charArray[i] >= 0x0041 && token.charArray[i] <= 0x005A) /* lowercase latin */
             token.charArray[i] += 0x0020;
         }
         break;
       case WordType.STRING:
         for (int i = 0; i < token.charArray.length; i++) {
-          if (token.charArray[i] >= 0x0041 && token.charArray[i] <= 0x005A)
+          if (token.charArray[i] >= 0x0041 && token.charArray[i] <= 0x005A) /* lowercase latin */
             token.charArray[i] += 0x0020;
         }
         break;
-      case WordType.DELIMITER:
+      case WordType.DELIMITER: /* convert all punctuation to Utility.COMMON_DELIMITER */
         token.charArray = Utility.COMMON_DELIMITER;
         break;
       default:

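The normalization above amounts to two character-range shifts; a self-contained sketch of the same arithmetic (not the filter itself):

    public class NormalizeSketch {
      public static void main(String[] args) {
        char[] s = "ＡＢＣ１２３".toCharArray(); // full-width letters and digits
        for (int i = 0; i < s.length; i++) {
          if (s[i] >= 0xFF10)                    // full-width -> half-width
            s[i] -= 0xFEE0;
          if (s[i] >= 'A' && s[i] <= 'Z')        // lowercase Basic Latin
            s[i] += 0x0020;
        }
        System.out.println(new String(s));       // prints: abc123
      }
    }
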
Modified: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegTokenPair.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegTokenPair.java?rev=790102&r1=790101&r2=790102&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegTokenPair.java (original)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegTokenPair.java Wed Jul  1 10:32:23 2009
@@ -17,15 +17,21 @@
 
 package org.apache.lucene.analysis.cn.smart.hhmm;
 
+/**
+ * A pair of tokens in {@link SegGraph}
+ */
 public class SegTokenPair {
 
   public char[] charArray;
 
   /**
-   * from and to are the index numbers of the Token pair, giving the positions of this TokenPair's two Tokens in the segGraph.
+   * index of the first token in {@link SegGraph}
    */
   public int from;
 
+  /**
+   * index of the second token in {@link SegGraph}
+   */
   public int to;
 
   public double weight;

Modified: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/WordDictionary.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/WordDictionary.java?rev=790102&r1=790101&r2=790102&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/WordDictionary.java (original)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/WordDictionary.java Wed Jul  1 10:32:23 2009
@@ -33,6 +33,10 @@
 import org.apache.lucene.analysis.cn.smart.AnalyzerProfile;
 import org.apache.lucene.analysis.cn.smart.Utility;
 
+/**
+ * SmartChineseAnalyzer Word Dictionary
+ *
+ */
 public class WordDictionary extends AbstractDictionary {
 
   private WordDictionary() {
@@ -41,7 +45,7 @@
   private static WordDictionary singleInstance;
 
   /**
-   * A fairly large prime, ensuring the hash lookup can probe every position
+   * Large prime number for hash function
    */
   public static final int PRIME_INDEX_LENGTH = 12071;
 
@@ -66,6 +70,10 @@
 
   // static Logger log = Logger.getLogger(WordDictionary.class);
 
+  /**
+   * Get the singleton dictionary instance.
+   * @return singleton
+   */
   public synchronized static WordDictionary getInstance() {
     if (singleInstance == null) {
       singleInstance = new WordDictionary();
@@ -82,10 +90,9 @@
   }
 
   /**
-   * Load the dictionary files from the external directory dctFileRoot: first check for a coredict.mem file and, if present, load it directly as a serialized object;
-   * otherwise load the dictionary source file coredict.dct
+   * Attempt to load the dictionary from the provided directory, first trying coredict.mem, falling back to coredict.dct
    * 
-   * @param dctFileName the path of the dictionary file
+   * @param dctFileRoot path to dictionary directory
    */
   public void load(String dctFileRoot) {
     String dctFilePath = dctFileRoot + "/coredict.dct";
@@ -119,9 +126,8 @@
   }
 
   /**
-   * Load the dictionary file from inside the jar; a coredict.mem file must be present on WordDictionary's current class path, to be loaded as a serialized object
+   * Load coredict.mem internally from the jar file.
    * 
-   * @param dctFileName the path of the dictionary file
    * @throws ClassNotFoundException
    * @throws IOException
    */
@@ -171,10 +177,10 @@
   }
 
   /**
-   * Load the dictionary file into the relevant WordDictionary data structures; this only loads, without any merging or modification
+   * Load the datafile into this WordDictionary
    * 
-   * @param dctFilePath
-   * @return
+   * @param dctFilePath path to word dictionary (coredict.mem)
+   * @return number of words read
    * @throws FileNotFoundException
    * @throws IOException
    * @throws UnsupportedEncodingException
@@ -188,13 +194,13 @@
     String tmpword;
     RandomAccessFile dctFile = new RandomAccessFile(dctFilePath, "r");
 
-    // The first Chinese character in the dictionary file is at position 0, the last at 6768
+    // GB2312 characters 0 - 6768
     for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) {
       // if (i == 5231)
       // System.out.println(i);
 
-      dctFile.read(intBuffer);// the original dictionary file was developed in C, so it was written in little-
-      // endian byte order, while Java is big endian; it must be converted
+      dctFile.read(intBuffer);
+      // the dictionary was developed for C, and byte order must be converted to work with Java
       cnt = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN).getInt();
       if (cnt <= 0) {
         wordItem_charArrayTable[i] = null;
@@ -287,8 +293,8 @@
     wordItem_frequencyTable[delimiterIndex] = null;
   }
 
-  /**
-   * This program does no part-of-speech tagging, so the frequencies of the same word under different parts of speech are merged under one word, reducing storage and speeding up search
+  /*
+   * since we aren't doing POS-tagging, merge the frequencies for entries of the same word (with different POS)
    */
   private void mergeSameWords() {
     int i;
@@ -350,12 +356,9 @@
     }
   }
 
-  /**
+  /*
    * Calculate the position character c should occupy in the hash table, then initialize the value at that position in the address list
    * 
-   * @param c
-   * @param j
-   * @return
    */
   private boolean setTableIndex(char c, int j) {
     int index = getAvaliableTableIndex(c);
@@ -390,10 +393,6 @@
       return -1;
   }
 
-  /**
-   * @param c
-   * @return
-   */
   private short getWordItemTableIndex(char c) {
     int hash1 = (int) (hash1(c) % PRIME_INDEX_LENGTH);
     int hash2 = hash2(c) % PRIME_INDEX_LENGTH;
@@ -465,32 +464,33 @@
   }
 
   /**
-   * Whether the word corresponding to charArray appears in the WordDictionary
+   * Returns true if the input word appears in the dictionary
    * 
-   * @param charArray
-   * @return true表示存在,false表示不存在
+   * @param charArray input word
+   * @return true if the word exists
    */
   public boolean isExist(char[] charArray) {
     return findInTable(charArray) != -1;
   }
 
   /**
-   * @see{getPrefixMatch(char[] charArray, int knownStart)}
-   * @param charArray
-   * @return
+   * Find the first word in the dictionary that starts with the supplied prefix
+   * 
+   * @see #getPrefixMatch(char[], int)
+   * @param charArray input prefix
+   * @return index of word, or -1 if not found
    */
   public int getPrefixMatch(char[] charArray) {
     return getPrefixMatch(charArray, 0);
   }
 
   /**
-   * Search the dictionary for words having the word in charArray as a prefix, and return the first matching position. To reduce the search cost,
-   * the start position can be set from prior knowledge; if the start position is unknown, the default is 0
+   * Find the nth word in the dictionary that starts with the supplied prefix
    * 
-   * @see{getPrefixMatch(char[] charArray)}
-   * @param charArray 前缀单词
-   * @param knownStart 已知的起始位置
-   * @return 满足前缀条件的第一个单词的位置
+   * @see #getPrefixMatch(char[])
+   * @param charArray input prefix
+   * @param knownStart relative position in the dictionary to start
+   * @return index of word, or -1 if not found
    */
   public int getPrefixMatch(char[] charArray, int knownStart) {
     short index = getWordItemTableIndex(charArray[0]);
@@ -521,11 +521,10 @@
   }
 
   /**
-   * Get the frequency of the word for idArray; if pos is -1, get the combined frequency over all parts of speech
+   * Get the frequency of a word from the dictionary
    * 
-   * @param charArray the charArray of the input word
-   * @param pos the part of speech; -1 means the frequency over all parts of speech is required
-   * @return the frequency of the word for idArray
+   * @param charArray input word
+   * @return word frequency, or zero if the word is not found
    */
   public int getFrequency(char[] charArray) {
     short hashIndex = getWordItemTableIndex(charArray[0]);
@@ -539,12 +538,11 @@
   }
 
   /**
-   * Determine whether the string in charArray equals the charArray at wordIndex in the dictionary table for charArray[0],
-   * i.e. whether the lookup result for charArray's position is exactly wordIndex
+   * Return true if the dictionary entry at itemIndex for table charArray[0] is charArray
    * 
-   * @param charArray 输入的charArray词组,第一个数表示词典中的索引号
-   * @param itemIndex 位置编号
-   * @return 是否相等
+   * @param charArray input word
+   * @param itemIndex item index for table charArray[0]
+   * @return true if the entry exists
    */
   public boolean isEqual(char[] charArray, int itemIndex) {
     short hashIndex = getWordItemTableIndex(charArray[0]);

Modified: lucene/java/trunk/contrib/analyzers/src/resources/org/apache/lucene/analysis/cn/stopwords.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/resources/org/apache/lucene/analysis/cn/stopwords.txt?rev=790102&r1=790101&r2=790102&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/resources/org/apache/lucene/analysis/cn/stopwords.txt (original)
+++ lucene/java/trunk/contrib/analyzers/src/resources/org/apache/lucene/analysis/cn/stopwords.txt Wed Jul  1 10:32:23 2009
@@ -1,4 +1,4 @@
-////////// Remove all punctuation ////////////////
+////////// Punctuation tokens to remove ////////////////
 ,
 .
 `
@@ -51,8 +51,8 @@
 ［
 ］
 ●
- //中文空格字符
+ //IDEOGRAPHIC SPACE character (Used as a space in Chinese)
 
-//////////////// English stop words ////////////////
+//////////////// English Stop Words ////////////////
 
-//////////////// Chinese stop words ////////////////
+//////////////// Chinese Stop Words ////////////////