You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by to...@apache.org on 2013/02/20 12:51:48 UTC

svn commit: r1448105 - in /lucene/dev/branches/branch_4x: ./ dev-tools/ dev-tools/idea/.idea/ dev-tools/idea/lucene/classification/ dev-tools/scripts/ lucene/ lucene/analysis/ lucene/analysis/icu/src/java/org/apache/lucene/collation/ lucene/backwards/ ...

Author: tommaso
Date: Wed Feb 20 11:51:46 2013
New Revision: 1448105

URL: http://svn.apache.org/r1448105
Log:
LUCENE-4781 - backporting classification module to branch_4x

Added:
    lucene/dev/branches/branch_4x/dev-tools/idea/lucene/classification/
      - copied from r1384220, lucene/dev/trunk/dev-tools/idea/lucene/classification/
    lucene/dev/branches/branch_4x/dev-tools/idea/lucene/classification/classification.iml   (props changed)
      - copied unchanged from r1384220, lucene/dev/trunk/dev-tools/idea/lucene/classification/classification.iml
    lucene/dev/branches/branch_4x/lucene/classification/   (props changed)
      - copied from r1384219, lucene/dev/trunk/lucene/classification/
    lucene/dev/branches/branch_4x/lucene/classification/build.xml   (contents, props changed)
      - copied, changed from r1384219, lucene/dev/trunk/lucene/classification/build.xml
    lucene/dev/branches/branch_4x/lucene/classification/ivy.xml   (props changed)
      - copied unchanged from r1384219, lucene/dev/trunk/lucene/classification/ivy.xml
    lucene/dev/branches/branch_4x/lucene/classification/src/
      - copied from r1384219, lucene/dev/trunk/lucene/classification/src/
    lucene/dev/branches/branch_4x/lucene/classification/src/java/
      - copied from r1384219, lucene/dev/trunk/lucene/classification/src/java/
    lucene/dev/branches/branch_4x/lucene/classification/src/java/org/
      - copied from r1384219, lucene/dev/trunk/lucene/classification/src/java/org/
    lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/
      - copied from r1384219, lucene/dev/trunk/lucene/classification/src/java/org/apache/
    lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/
      - copied from r1384219, lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/
    lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/
      - copied from r1384219, lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/
    lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/ClassificationResult.java
      - copied, changed from r1401338, lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/ClassificationResult.java
    lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/Classifier.java   (contents, props changed)
      - copied, changed from r1384219, lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/Classifier.java
    lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/KNearestNeighborClassifier.java
      - copied, changed from r1401338, lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/KNearestNeighborClassifier.java
    lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java   (contents, props changed)
      - copied, changed from r1384219, lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java
    lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/package.html
      - copied, changed from r1384293, lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/package.html
    lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/utils/
      - copied from r1415060, lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/utils/
    lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/utils/DatasetSplitter.java
      - copied, changed from r1415060, lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/utils/DatasetSplitter.java
    lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/utils/package.html
      - copied unchanged from r1415136, lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/utils/package.html
    lucene/dev/branches/branch_4x/lucene/classification/src/java/overview.html   (props changed)
      - copied unchanged from r1384225, lucene/dev/trunk/lucene/classification/src/java/overview.html
    lucene/dev/branches/branch_4x/lucene/classification/src/test/
      - copied from r1384219, lucene/dev/trunk/lucene/classification/src/test/
    lucene/dev/branches/branch_4x/lucene/classification/src/test/org/
      - copied from r1384219, lucene/dev/trunk/lucene/classification/src/test/org/
    lucene/dev/branches/branch_4x/lucene/classification/src/test/org/apache/
      - copied from r1384219, lucene/dev/trunk/lucene/classification/src/test/org/apache/
    lucene/dev/branches/branch_4x/lucene/classification/src/test/org/apache/lucene/
      - copied from r1384219, lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/
    lucene/dev/branches/branch_4x/lucene/classification/src/test/org/apache/lucene/classification/
      - copied from r1384219, lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/
    lucene/dev/branches/branch_4x/lucene/classification/src/test/org/apache/lucene/classification/ClassificationTestBase.java
      - copied, changed from r1401338, lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/ClassificationTestBase.java
    lucene/dev/branches/branch_4x/lucene/classification/src/test/org/apache/lucene/classification/KNearestNeighborClassifierTest.java
      - copied, changed from r1401338, lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/KNearestNeighborClassifierTest.java
    lucene/dev/branches/branch_4x/lucene/classification/src/test/org/apache/lucene/classification/SimpleNaiveBayesClassifierTest.java   (contents, props changed)
      - copied, changed from r1384219, lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/SimpleNaiveBayesClassifierTest.java
    lucene/dev/branches/branch_4x/lucene/classification/src/test/org/apache/lucene/classification/utils/
      - copied from r1415060, lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/utils/
    lucene/dev/branches/branch_4x/lucene/classification/src/test/org/apache/lucene/classification/utils/DataSplitterTest.java
      - copied, changed from r1415060, lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/utils/DataSplitterTest.java
Modified:
    lucene/dev/branches/branch_4x/   (props changed)
    lucene/dev/branches/branch_4x/dev-tools/   (props changed)
    lucene/dev/branches/branch_4x/dev-tools/idea/.idea/modules.xml
    lucene/dev/branches/branch_4x/dev-tools/idea/.idea/workspace.xml
    lucene/dev/branches/branch_4x/dev-tools/scripts/smokeTestRelease.py
    lucene/dev/branches/branch_4x/lucene/   (props changed)
    lucene/dev/branches/branch_4x/lucene/BUILD.txt   (props changed)
    lucene/dev/branches/branch_4x/lucene/CHANGES.txt   (props changed)
    lucene/dev/branches/branch_4x/lucene/JRE_VERSION_MIGRATION.txt   (props changed)
    lucene/dev/branches/branch_4x/lucene/LICENSE.txt   (props changed)
    lucene/dev/branches/branch_4x/lucene/MIGRATE.txt   (props changed)
    lucene/dev/branches/branch_4x/lucene/NOTICE.txt   (props changed)
    lucene/dev/branches/branch_4x/lucene/README.txt   (props changed)
    lucene/dev/branches/branch_4x/lucene/SYSTEM_REQUIREMENTS.txt   (props changed)
    lucene/dev/branches/branch_4x/lucene/analysis/   (props changed)
    lucene/dev/branches/branch_4x/lucene/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationKeyFilterFactory.java   (props changed)
    lucene/dev/branches/branch_4x/lucene/backwards/   (props changed)
    lucene/dev/branches/branch_4x/lucene/benchmark/   (props changed)
    lucene/dev/branches/branch_4x/lucene/build.xml   (contents, props changed)
    lucene/dev/branches/branch_4x/lucene/codecs/   (props changed)
    lucene/dev/branches/branch_4x/lucene/common-build.xml   (props changed)
    lucene/dev/branches/branch_4x/lucene/core/   (props changed)
    lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java   (props changed)
    lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/index/index.40.cfs.zip   (props changed)
    lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/index/index.40.nocfs.zip   (props changed)
    lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/index/index.40.optimized.cfs.zip   (props changed)
    lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/index/index.40.optimized.nocfs.zip   (props changed)
    lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/search/TestSort.java   (props changed)
    lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/search/TestSortDocValues.java   (props changed)
    lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/search/TestSortRandom.java   (props changed)
    lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/search/TestTopFieldCollector.java   (props changed)
    lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/search/TestTotalHitCountCollector.java   (props changed)
    lucene/dev/branches/branch_4x/lucene/demo/   (props changed)
    lucene/dev/branches/branch_4x/lucene/facet/   (props changed)
    lucene/dev/branches/branch_4x/lucene/grouping/   (props changed)
    lucene/dev/branches/branch_4x/lucene/highlighter/   (props changed)
    lucene/dev/branches/branch_4x/lucene/ivy-settings.xml   (props changed)
    lucene/dev/branches/branch_4x/lucene/join/   (props changed)
    lucene/dev/branches/branch_4x/lucene/licenses/   (props changed)
    lucene/dev/branches/branch_4x/lucene/memory/   (props changed)
    lucene/dev/branches/branch_4x/lucene/misc/   (props changed)
    lucene/dev/branches/branch_4x/lucene/module-build.xml   (contents, props changed)
    lucene/dev/branches/branch_4x/lucene/queries/   (props changed)
    lucene/dev/branches/branch_4x/lucene/queryparser/   (props changed)
    lucene/dev/branches/branch_4x/lucene/sandbox/   (props changed)
    lucene/dev/branches/branch_4x/lucene/site/   (props changed)
    lucene/dev/branches/branch_4x/lucene/spatial/   (props changed)
    lucene/dev/branches/branch_4x/lucene/suggest/   (props changed)
    lucene/dev/branches/branch_4x/lucene/test-framework/   (props changed)
    lucene/dev/branches/branch_4x/lucene/tools/   (props changed)
    lucene/dev/branches/branch_4x/solr/   (props changed)
    lucene/dev/branches/branch_4x/solr/CHANGES.txt   (props changed)
    lucene/dev/branches/branch_4x/solr/LICENSE.txt   (props changed)
    lucene/dev/branches/branch_4x/solr/NOTICE.txt   (props changed)
    lucene/dev/branches/branch_4x/solr/README.txt   (props changed)
    lucene/dev/branches/branch_4x/solr/SYSTEM_REQUIREMENTS.txt   (props changed)
    lucene/dev/branches/branch_4x/solr/build.xml   (props changed)
    lucene/dev/branches/branch_4x/solr/cloud-dev/   (props changed)
    lucene/dev/branches/branch_4x/solr/common-build.xml   (props changed)
    lucene/dev/branches/branch_4x/solr/contrib/   (props changed)
    lucene/dev/branches/branch_4x/solr/core/   (props changed)
    lucene/dev/branches/branch_4x/solr/example/   (props changed)
    lucene/dev/branches/branch_4x/solr/licenses/   (props changed)
    lucene/dev/branches/branch_4x/solr/licenses/httpclient-LICENSE-ASL.txt   (props changed)
    lucene/dev/branches/branch_4x/solr/licenses/httpclient-NOTICE.txt   (props changed)
    lucene/dev/branches/branch_4x/solr/licenses/httpcore-LICENSE-ASL.txt   (props changed)
    lucene/dev/branches/branch_4x/solr/licenses/httpcore-NOTICE.txt   (props changed)
    lucene/dev/branches/branch_4x/solr/licenses/httpmime-LICENSE-ASL.txt   (props changed)
    lucene/dev/branches/branch_4x/solr/licenses/httpmime-NOTICE.txt   (props changed)
    lucene/dev/branches/branch_4x/solr/scripts/   (props changed)
    lucene/dev/branches/branch_4x/solr/site/   (props changed)
    lucene/dev/branches/branch_4x/solr/solrj/   (props changed)
    lucene/dev/branches/branch_4x/solr/test-framework/   (props changed)
    lucene/dev/branches/branch_4x/solr/testlogging.properties   (props changed)
    lucene/dev/branches/branch_4x/solr/webapp/   (props changed)

Modified: lucene/dev/branches/branch_4x/dev-tools/idea/.idea/modules.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/dev-tools/idea/.idea/modules.xml?rev=1448105&r1=1448104&r2=1448105&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/dev-tools/idea/.idea/modules.xml (original)
+++ lucene/dev/branches/branch_4x/dev-tools/idea/.idea/modules.xml Wed Feb 20 11:51:46 2013
@@ -18,6 +18,7 @@
       <module filepath="$PROJECT_DIR$/lucene/analysis/uima/analysis-uima.iml" />
       <module filepath="$PROJECT_DIR$/lucene/benchmark/src/benchmark.iml" />
       <module filepath="$PROJECT_DIR$/lucene/benchmark/conf/benchmark-conf.iml" />
+      <module filepath="$PROJECT_DIR$/lucene/classification/classification.iml" />
       <module filepath="$PROJECT_DIR$/lucene/codecs/src/java/codecs.iml" />
       <module filepath="$PROJECT_DIR$/lucene/codecs/src/test/codecs-tests.iml" />
       <module filepath="$PROJECT_DIR$/lucene/codecs/src/resources/codecs-resources.iml" />

Modified: lucene/dev/branches/branch_4x/dev-tools/idea/.idea/workspace.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/dev-tools/idea/.idea/workspace.xml?rev=1448105&r1=1448104&r2=1448105&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/dev-tools/idea/.idea/workspace.xml (original)
+++ lucene/dev/branches/branch_4x/dev-tools/idea/.idea/workspace.xml Wed Feb 20 11:51:46 2013
@@ -74,6 +74,13 @@
       <option name="VM_PARAMETERS" value="-ea -DtempDir=temp" />
       <option name="TEST_SEARCH_SCOPE"><value defaultName="singleModule" /></option>
     </configuration>
+    <configuration default="false" name="Module classification" type="JUnit" factoryName="JUnit">
+      <module name="classification" />
+      <option name="TEST_OBJECT" value="package" />
+      <option name="WORKING_DIRECTORY" value="file://$PROJECT_DIR$/lucene/build/classification" />
+      <option name="VM_PARAMETERS" value="-ea -DtempDir=temp" />
+      <option name="TEST_SEARCH_SCOPE"><value defaultName="singleModule" /></option>
+    </configuration>
     <configuration default="false" name="Module codecs" type="JUnit" factoryName="JUnit">
       <module name="codecs-tests" />
       <option name="TEST_OBJECT" value="package" />

Modified: lucene/dev/branches/branch_4x/dev-tools/scripts/smokeTestRelease.py
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/dev-tools/scripts/smokeTestRelease.py?rev=1448105&r1=1448104&r2=1448105&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/dev-tools/scripts/smokeTestRelease.py (original)
+++ lucene/dev/branches/branch_4x/dev-tools/scripts/smokeTestRelease.py Wed Feb 20 11:51:46 2013
@@ -617,7 +617,7 @@ def verifyUnpacked(project, artifact, un
 
   if project == 'lucene':
     # TODO: clean this up to not be a list of modules that we must maintain
-    extras = ('analysis', 'benchmark', 'codecs', 'core', 'demo', 'docs', 'facet', 'grouping', 'highlighter', 'join', 'memory', 'misc', 'queries', 'queryparser', 'sandbox', 'spatial', 'suggest', 'test-framework', 'licenses')
+    extras = ('analysis', 'benchmark', 'classification', 'codecs', 'core', 'demo', 'docs', 'facet', 'grouping', 'highlighter', 'join', 'memory', 'misc', 'queries', 'queryparser', 'sandbox', 'spatial', 'suggest', 'test-framework', 'licenses')
     if isSrc:
       extras += ('build.xml', 'common-build.xml', 'module-build.xml', 'ivy-settings.xml', 'backwards', 'tools', 'site')
   else:

Modified: lucene/dev/branches/branch_4x/lucene/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/build.xml?rev=1448105&r1=1448104&r2=1448105&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/build.xml (original)
+++ lucene/dev/branches/branch_4x/lucene/build.xml Wed Feb 20 11:51:46 2013
@@ -293,6 +293,7 @@
     <check-missing-javadocs dir="build/docs/core/org/apache/lucene/search/similarities" level="method"/>
     <check-missing-javadocs dir="build/docs/core/org/apache/lucene/index" level="method"/>
     <check-missing-javadocs dir="build/docs/core/org/apache/lucene/codecs" level="method"/>
+    <check-missing-javadocs dir="build/docs/classification" level="method"/>
   </target>
   
   <target name="-ecj-javadoc-lint" depends="compile,compile-test,-ecj-resolve">

Copied: lucene/dev/branches/branch_4x/lucene/classification/build.xml (from r1384219, lucene/dev/trunk/lucene/classification/build.xml)
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/classification/build.xml?p2=lucene/dev/branches/branch_4x/lucene/classification/build.xml&p1=lucene/dev/trunk/lucene/classification/build.xml&r1=1384219&r2=1448105&rev=1448105&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/classification/build.xml (original)
+++ lucene/dev/branches/branch_4x/lucene/classification/build.xml Wed Feb 20 11:51:46 2013
@@ -23,4 +23,33 @@
   </description>
 
   <import file="../module-build.xml"/>
+
+  <path id="classpath">
+    <path refid="base.classpath"/>
+    <pathelement path="${lucene-core.jar}"/>
+    <pathelement path="${queries.jar}"/>
+    <pathelement path="${project.classpath}"/>
+    <pathelement location="${build.dir}/classes/java" />
+  </path>
+
+  <path id="test.classpath">
+    <pathelement path="${analyzers-common.jar}"/>
+    <pathelement location="${test-framework.jar}"/>
+    <pathelement location="${codecs.jar}"/>
+    <path refid="test.base.classpath"/>
+  </path>
+
+  <target name="dist-maven" depends="dist-maven-src-java"/>
+  <target name="compile-core" depends="jar-queries,jar-analyzers-common,common.compile-core" />
+
+  <target name="jar-core" depends="common.jar-core" />
+
+  <target name="javadocs" depends="javadocs-queries,compile-core">
+    <invoke-module-javadoc>
+      <links>
+        <link href="../queries"/>
+      </links>
+    </invoke-module-javadoc>
+  </target>
+
 </project>

Copied: lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/ClassificationResult.java (from r1401338, lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/ClassificationResult.java)
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/ClassificationResult.java?p2=lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/ClassificationResult.java&p1=lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/ClassificationResult.java&r1=1401338&r2=1448105&rev=1448105&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/ClassificationResult.java (original)
+++ lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/ClassificationResult.java Wed Feb 20 11:51:46 2013
@@ -1,5 +1,3 @@
-package org.apache.lucene.classification;
-
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
@@ -16,24 +14,39 @@ package org.apache.lucene.classification
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+package org.apache.lucene.classification;
 
 /**
- * The result of a call to {@link Classifier#assignClass(String)} holding an assigned class and a score.
+ * The result of a call to {@link Classifier#assignClass(String)} holding an assigned class of type <code>T</code> and a score.
+ * @lucene.experimental
  */
-public class ClassificationResult {
+public class ClassificationResult<T> {
 
-  private String assignedClass;
+  private T assignedClass;
   private double score;
 
-  public ClassificationResult(String assignedClass, double score) {
+  /**
+   * Constructor
+   * @param assignedClass the class <code>T</code> assigned by a {@link Classifier}
+   * @param score the score for the assignedClass as a <code>double</code>
+   */
+  public ClassificationResult(T assignedClass, double score) {
     this.assignedClass = assignedClass;
     this.score = score;
   }
 
-  public String getAssignedClass() {
+  /**
+   * retrieve the result class
+   * @return a <code>T</code> representing an assigned class
+   */
+  public T getAssignedClass() {
     return assignedClass;
   }
 
+  /**
+   * retrieve the result score
+   * @return a <code>double</code> representing a result score
+   */
   public double getScore() {
     return score;
   }

Copied: lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/Classifier.java (from r1384219, lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/Classifier.java)
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/Classifier.java?p2=lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/Classifier.java&p1=lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/Classifier.java&r1=1384219&r2=1448105&rev=1448105&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/Classifier.java (original)
+++ lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/Classifier.java Wed Feb 20 11:51:46 2013
@@ -1,5 +1,3 @@
-package org.apache.lucene.classification;
-
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
@@ -16,6 +14,7 @@ package org.apache.lucene.classification
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+package org.apache.lucene.classification;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.index.AtomicReader;
@@ -23,17 +22,19 @@ import org.apache.lucene.index.AtomicRea
 import java.io.IOException;
 
 /**
- * A classifier, see <code>http://en.wikipedia.org/wiki/Classifier_(mathematics)</code>
+ * A classifier, see <code>http://en.wikipedia.org/wiki/Classifier_(mathematics)</code>, which assigns classes of type
+ * <code>T</code>
+ * @lucene.experimental
  */
-public interface Classifier {
+public interface Classifier<T> {
 
   /**
-   * Assign a class to the given text String
+   * Assign a class (with score) to the given text String
    * @param text a String containing text to be classified
-   * @return a String representing a class
+   * @return a {@link ClassificationResult} holding assigned class of type <code>T</code> and score
    * @throws IOException
    */
-  public String assignClass(String text) throws IOException;
+  public ClassificationResult<T> assignClass(String text) throws IOException;
 
   /**
    * Train the classifier using the underlying Lucene index

Copied: lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/KNearestNeighborClassifier.java (from r1401338, lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/KNearestNeighborClassifier.java)
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/KNearestNeighborClassifier.java?p2=lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/KNearestNeighborClassifier.java&p1=lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/KNearestNeighborClassifier.java&r1=1401338&r2=1448105&rev=1448105&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/KNearestNeighborClassifier.java (original)
+++ lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/KNearestNeighborClassifier.java Wed Feb 20 11:51:46 2013
@@ -1,5 +1,3 @@
-package org.apache.lucene.classification;
-
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
@@ -16,6 +14,7 @@ package org.apache.lucene.classification
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+package org.apache.lucene.classification;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.index.AtomicReader;
@@ -24,6 +23,7 @@ import org.apache.lucene.search.IndexSea
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.ScoreDoc;
 import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.util.BytesRef;
 
 import java.io.IOException;
 import java.io.StringReader;
@@ -33,8 +33,10 @@ import java.util.Map;
 /**
  * A k-Nearest Neighbor classifier (see <code>http://en.wikipedia.org/wiki/K-nearest_neighbors</code>) based
  * on {@link MoreLikeThis}
+ *
+ * @lucene.experimental
  */
-public class KNearestNeighborClassifier implements Classifier {
+public class KNearestNeighborClassifier implements Classifier<BytesRef> {
 
   private MoreLikeThis mlt;
   private String textFieldName;
@@ -42,40 +44,55 @@ public class KNearestNeighborClassifier 
   private IndexSearcher indexSearcher;
   private int k;
 
+  /**
+   * Create a {@link Classifier} using kNN algorithm
+   *
+   * @param k the number of neighbors to analyze as an <code>int</code>
+   */
   public KNearestNeighborClassifier(int k) {
     this.k = k;
   }
 
+  /**
+   * {@inheritDoc}
+   */
   @Override
-  public ClassificationResult assignClass(String text) throws IOException {
+  public ClassificationResult<BytesRef> assignClass(String text) throws IOException {
     Query q = mlt.like(new StringReader(text), textFieldName);
-    TopDocs docs = indexSearcher.search(q, k);
+    TopDocs topDocs = indexSearcher.search(q, k);
+    return selectClassFromNeighbors(topDocs);
+  }
 
+  private ClassificationResult<BytesRef> selectClassFromNeighbors(TopDocs topDocs) throws IOException {
     // TODO : improve the nearest neighbor selection
-    Map<String, Integer> classCounts = new HashMap<String, Integer>();
-    for (ScoreDoc scoreDoc : docs.scoreDocs) {
-      String cl = indexSearcher.doc(scoreDoc.doc).getField(classFieldName).stringValue();
-      Integer count = classCounts.get(cl);
-      if (count != null) {
-        classCounts.put(cl, count + 1);
-      }
-      else {
-        classCounts.put(cl, 1);
+    Map<BytesRef, Integer> classCounts = new HashMap<BytesRef, Integer>();
+    for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
+      BytesRef cl = new BytesRef(indexSearcher.doc(scoreDoc.doc).getField(classFieldName).stringValue());
+      if (cl != null) {
+        Integer count = classCounts.get(cl);
+        if (count != null) {
+          classCounts.put(cl, count + 1);
+        } else {
+          classCounts.put(cl, 1);
+        }
       }
     }
-    int max = 0;
-    String assignedClass = null;
-    for (String cl : classCounts.keySet()) {
+    double max = 0;
+    BytesRef assignedClass = new BytesRef();
+    for (BytesRef cl : classCounts.keySet()) {
       Integer count = classCounts.get(cl);
       if (count > max) {
         max = count;
-        assignedClass = cl;
+        assignedClass = cl.clone();
       }
     }
-    double score = 1; // TODO : derive score from query
-    return new ClassificationResult(assignedClass, score);
+    double score = max / (double) k;
+    return new ClassificationResult<BytesRef>(assignedClass, score);
   }
 
+  /**
+   * {@inheritDoc}
+   */
   @Override
   public void train(AtomicReader atomicReader, String textFieldName, String classFieldName, Analyzer analyzer) throws IOException {
     this.textFieldName = textFieldName;

Copied: lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java (from r1384219, lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java)
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java?p2=lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java&p1=lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java&r1=1384219&r2=1448105&rev=1448105&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java (original)
+++ lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java Wed Feb 20 11:51:46 2013
@@ -1,5 +1,3 @@
-package org.apache.lucene.classification;
-
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
@@ -16,6 +14,7 @@ package org.apache.lucene.classification
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+package org.apache.lucene.classification;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
@@ -29,6 +28,7 @@ import org.apache.lucene.search.BooleanC
 import org.apache.lucene.search.BooleanQuery;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TotalHitCountCollector;
 import org.apache.lucene.util.BytesRef;
 
 import java.io.IOException;
@@ -38,8 +38,10 @@ import java.util.LinkedList;
 
 /**
  * A simplistic Lucene based NaiveBayes classifier, see <code>http://en.wikipedia.org/wiki/Naive_Bayes_classifier</code>
+ *
+ * @lucene.experimental
  */
-public class SimpleNaiveBayesClassifier implements Classifier {
+public class SimpleNaiveBayesClassifier implements Classifier<BytesRef> {
 
   private AtomicReader atomicReader;
   private String textFieldName;
@@ -48,6 +50,18 @@ public class SimpleNaiveBayesClassifier 
   private Analyzer analyzer;
   private IndexSearcher indexSearcher;
 
+  /**
+   * Creates a new NaiveBayes classifier.
+   * Note that you must call {@link #train(AtomicReader, String, String, Analyzer) train()} before you can
+   * classify any documents.
+   */
+  public SimpleNaiveBayesClassifier() {
+  }
+
+  /**
+   * {@inheritDoc}
+   */
+  @Override
   public void train(AtomicReader atomicReader, String textFieldName, String classFieldName, Analyzer analyzer)
       throws IOException {
     this.atomicReader = atomicReader;
@@ -71,34 +85,37 @@ public class SimpleNaiveBayesClassifier 
     return result.toArray(new String[result.size()]);
   }
 
-  public String assignClass(String inputDocument) throws IOException {
+  /**
+   * {@inheritDoc}
+   */
+  @Override
+  public ClassificationResult<BytesRef> assignClass(String inputDocument) throws IOException {
     if (atomicReader == null) {
       throw new RuntimeException("need to train the classifier first");
     }
-    Double max = 0d;
-    String foundClass = null;
+    double max = 0d;
+    BytesRef foundClass = new BytesRef();
 
     Terms terms = MultiFields.getTerms(atomicReader, classFieldName);
     TermsEnum termsEnum = terms.iterator(null);
-    BytesRef t = termsEnum.next();
-    while (t != null) {
-      String classValue = t.utf8ToString();
+    BytesRef next;
+    String[] tokenizedDoc = tokenizeDoc(inputDocument);
+    while ((next = termsEnum.next()) != null) {
       // TODO : turn it to be in log scale
-      Double clVal = calculatePrior(classValue) * calculateLikelihood(inputDocument, classValue);
+      double clVal = calculatePrior(next) * calculateLikelihood(tokenizedDoc, next);
       if (clVal > max) {
         max = clVal;
-        foundClass = classValue;
+        foundClass = BytesRef.deepCopyOf(next); // deep copy: TermsEnum may reuse next's bytes and clone() is shallow
       }
-      t = termsEnum.next();
     }
-    return foundClass;
+    return new ClassificationResult<BytesRef>(foundClass, max);
   }
 
 
-  private Double calculateLikelihood(String document, String c) throws IOException {
+  private double calculateLikelihood(String[] tokenizedDoc, BytesRef c) throws IOException {
     // for each word
-    Double result = 1d;
-    for (String word : tokenizeDoc(document)) {
+    double result = 1d;
+    for (String word : tokenizedDoc) {
       // search with text:word AND class:c
       int hits = getWordFreqForClass(word, c);
 
@@ -117,26 +134,28 @@ public class SimpleNaiveBayesClassifier 
     return result;
   }
 
-  private double getTextTermFreqForClass(String c) throws IOException {
+  private double getTextTermFreqForClass(BytesRef c) throws IOException {
     Terms terms = MultiFields.getTerms(atomicReader, textFieldName);
     long numPostings = terms.getSumDocFreq(); // number of term/doc pairs
     double avgNumberOfUniqueTerms = numPostings / (double) terms.getDocCount(); // avg # of unique terms per doc
-    int docsWithC = atomicReader.docFreq(classFieldName, new BytesRef(c));
+    int docsWithC = atomicReader.docFreq(new Term(classFieldName, c));
     return avgNumberOfUniqueTerms * docsWithC; // avg # of unique terms in text field per doc * # docs with c
   }
 
-  private int getWordFreqForClass(String word, String c) throws IOException {
+  private int getWordFreqForClass(String word, BytesRef c) throws IOException {
     BooleanQuery booleanQuery = new BooleanQuery();
     booleanQuery.add(new BooleanClause(new TermQuery(new Term(textFieldName, word)), BooleanClause.Occur.MUST));
     booleanQuery.add(new BooleanClause(new TermQuery(new Term(classFieldName, c)), BooleanClause.Occur.MUST));
-    return indexSearcher.search(booleanQuery, 1).totalHits;
+    TotalHitCountCollector totalHitCountCollector = new TotalHitCountCollector();
+    indexSearcher.search(booleanQuery, totalHitCountCollector);
+    return totalHitCountCollector.getTotalHits();
   }
 
-  private Double calculatePrior(String currentClass) throws IOException {
+  private double calculatePrior(BytesRef currentClass) throws IOException {
     return (double) docCount(currentClass) / docsWithClassSize;
   }
 
-  private int docCount(String countedClass) throws IOException {
+  private int docCount(BytesRef countedClass) throws IOException {
     return atomicReader.docFreq(new Term(classFieldName, countedClass));
   }
 }

Copied: lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/package.html (from r1384293, lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/package.html)
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/package.html?p2=lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/package.html&p1=lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/package.html&r1=1384293&r2=1448105&rev=1448105&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/package.html (original)
+++ lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/package.html Wed Feb 20 11:51:46 2013
@@ -18,6 +18,6 @@
 <body>
 Uses already seen data (the indexed documents) to classify new documents.
 Currently only contains a (simplistic) Lucene based Naive Bayes classifier 
-but more implementations will be added in the future.
+and a k-Nearest Neighbor classifier.
 </body>
 </html>

Copied: lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/utils/DatasetSplitter.java (from r1415060, lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/utils/DatasetSplitter.java)
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/utils/DatasetSplitter.java?p2=lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/utils/DatasetSplitter.java&p1=lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/utils/DatasetSplitter.java&r1=1415060&r2=1448105&rev=1448105&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/utils/DatasetSplitter.java (original)
+++ lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/utils/DatasetSplitter.java Wed Feb 20 11:51:46 2013
@@ -25,7 +25,7 @@ import org.apache.lucene.document.TextFi
 import org.apache.lucene.index.AtomicReader;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.IndexWriterConfig;
-import org.apache.lucene.index.StorableField;
+import org.apache.lucene.index.IndexableField;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.MatchAllDocsQuery;
 import org.apache.lucene.search.ScoreDoc;
@@ -43,20 +43,37 @@ public class DatasetSplitter {
   private double crossValidationRatio;
   private double testRatio;
 
+  /**
+   * Create a {@link DatasetSplitter} by giving the relative sizes of the test and cross validation indexes
+   *
+   * @param testRatio            the ratio of the original index to be used for the test index as a <code>double</code> between 0.0 and 1.0
+   * @param crossValidationRatio the ratio of the original index to be used for the cross validation index as a <code>double</code> between 0.0 and 1.0
+   */
   public DatasetSplitter(double testRatio, double crossValidationRatio) {
     this.crossValidationRatio = crossValidationRatio;
     this.testRatio = testRatio;
   }
 
+  /**
+   * Split a given index into 3 indexes for training, test and cross validation tasks respectively
+   *
+   * @param originalIndex        an {@link AtomicReader} on the source index
+   * @param trainingIndex        a {@link Directory} used to write the training index
+   * @param testIndex            a {@link Directory} used to write the test index
+   * @param crossValidationIndex a {@link Directory} used to write the cross validation index
+   * @param analyzer             {@link Analyzer} used to create the new docs
+   * @param fieldNames           names of fields that need to be put in the new indexes or <code>null</code> if all should be used
+   * @throws IOException if any writing operation fails on any of the indexes
+   */
   public void split(AtomicReader originalIndex, Directory trainingIndex, Directory testIndex, Directory crossValidationIndex,
                     Analyzer analyzer, String... fieldNames) throws IOException {
 
     // TODO : check that the passed fields are stored in the original index
 
     // create IWs for train / test / cv IDXs
-    IndexWriter testWriter = new IndexWriter(testIndex, new IndexWriterConfig(Version.LUCENE_50, analyzer));
-    IndexWriter cvWriter = new IndexWriter(crossValidationIndex, new IndexWriterConfig(Version.LUCENE_50, analyzer));
-    IndexWriter trainingWriter = new IndexWriter(trainingIndex, new IndexWriterConfig(Version.LUCENE_50, analyzer));
+    IndexWriter testWriter = new IndexWriter(testIndex, new IndexWriterConfig(Version.LUCENE_42, analyzer));
+    IndexWriter cvWriter = new IndexWriter(crossValidationIndex, new IndexWriterConfig(Version.LUCENE_42, analyzer));
+    IndexWriter trainingWriter = new IndexWriter(trainingIndex, new IndexWriterConfig(Version.LUCENE_42, analyzer));
 
     try {
       int size = originalIndex.maxDoc();
@@ -82,17 +99,14 @@ public class DatasetSplitter {
             doc.add(new Field(fieldName, originalIndex.document(scoreDoc.doc).getField(fieldName).stringValue(), ft));
           }
         } else {
-          for (StorableField storableField : originalIndex.document(scoreDoc.doc).getFields()) {
-            if (storableField.readerValue()!= null){
+          for (IndexableField storableField : originalIndex.document(scoreDoc.doc).getFields()) {
+            if (storableField.readerValue() != null) {
               doc.add(new Field(storableField.name(), storableField.readerValue(), ft));
-            }
-            else if (storableField.binaryValue()!= null){
+            } else if (storableField.binaryValue() != null) {
               doc.add(new Field(storableField.name(), storableField.binaryValue(), ft));
-            }
-            else if (storableField.stringValue()!= null){
+            } else if (storableField.stringValue() != null) {
               doc.add(new Field(storableField.name(), storableField.stringValue(), ft));
-            }
-            else if (storableField.numericValue()!= null){
+            } else if (storableField.numericValue() != null) {
               doc.add(new Field(storableField.name(), storableField.numericValue().toString(), ft));
             }
           }
@@ -101,19 +115,19 @@ public class DatasetSplitter {
         // add it to one of the IDXs
         if (b % 2 == 0 && testWriter.maxDoc() < size * testRatio) {
           testWriter.addDocument(doc);
-          testWriter.commit();
         } else if (cvWriter.maxDoc() < size * crossValidationRatio) {
           cvWriter.addDocument(doc);
-          cvWriter.commit();
         } else {
           trainingWriter.addDocument(doc);
-          trainingWriter.commit();
         }
         b++;
       }
     } catch (Exception e) {
       throw new IOException(e);
     } finally {
+      testWriter.commit();
+      cvWriter.commit();
+      trainingWriter.commit();
       // close IWs
       testWriter.close();
       cvWriter.close();

Copied: lucene/dev/branches/branch_4x/lucene/classification/src/test/org/apache/lucene/classification/ClassificationTestBase.java (from r1401338, lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/ClassificationTestBase.java)
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/classification/src/test/org/apache/lucene/classification/ClassificationTestBase.java?p2=lucene/dev/branches/branch_4x/lucene/classification/src/test/org/apache/lucene/classification/ClassificationTestBase.java&p1=lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/ClassificationTestBase.java&r1=1401338&r2=1448105&rev=1448105&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/ClassificationTestBase.java (original)
+++ lucene/dev/branches/branch_4x/lucene/classification/src/test/org/apache/lucene/classification/ClassificationTestBase.java Wed Feb 20 11:51:46 2013
@@ -1,5 +1,3 @@
-package org.apache.lucene.classification;
-
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
@@ -16,14 +14,17 @@ package org.apache.lucene.classification
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+package org.apache.lucene.classification;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
+import org.apache.lucene.document.FieldType;
 import org.apache.lucene.document.TextField;
 import org.apache.lucene.index.RandomIndexWriter;
 import org.apache.lucene.index.SlowCompositeReaderWrapper;
 import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.LuceneTestCase;
 import org.junit.After;
 import org.junit.Before;
@@ -31,7 +32,7 @@ import org.junit.Before;
 /**
  * Base class for testing {@link Classifier}s
  */
-public class ClassificationTestBase extends LuceneTestCase {
+public abstract class ClassificationTestBase extends LuceneTestCase {
 
   private RandomIndexWriter indexWriter;
   private String textFieldName;
@@ -54,15 +55,17 @@ public class ClassificationTestBase exte
     dir.close();
   }
 
-  protected void checkCorrectClassification(Classifier classifier, Analyzer analyzer) throws Exception {
+
+  protected void checkCorrectClassification(Classifier<BytesRef> classifier, Analyzer analyzer) throws Exception {
     SlowCompositeReaderWrapper compositeReaderWrapper = null;
     try {
       populateIndex(analyzer);
       compositeReaderWrapper = new SlowCompositeReaderWrapper(indexWriter.getReader());
       classifier.train(compositeReaderWrapper, textFieldName, classFieldName, analyzer);
       String newText = "Much is made of what the likes of Facebook, Google and Apple know about users. Truth is, Amazon may know more.";
-      ClassificationResult classificationResult = classifier.assignClass(newText);
-      assertEquals("technology", classificationResult.getAssignedClass());
+      ClassificationResult<BytesRef> classificationResult = classifier.assignClass(newText);
+      assertNotNull(classificationResult.getAssignedClass());
+      assertEquals(new BytesRef("technology"), classificationResult.getAssignedClass());
       assertTrue(classificationResult.getScore() > 0);
     } finally {
       if (compositeReaderWrapper != null)
@@ -72,52 +75,57 @@ public class ClassificationTestBase exte
 
   private void populateIndex(Analyzer analyzer) throws Exception {
 
+    FieldType ft = new FieldType(TextField.TYPE_STORED);
+    ft.setStoreTermVectors(true);
+    ft.setStoreTermVectorOffsets(true);
+    ft.setStoreTermVectorPositions(true);
+
     Document doc = new Document();
-    doc.add(new TextField(textFieldName, "The traveling press secretary for Mitt Romney lost his cool and cursed at reporters " +
+    doc.add(new Field(textFieldName, "The traveling press secretary for Mitt Romney lost his cool and cursed at reporters " +
         "who attempted to ask questions of the Republican presidential candidate in a public plaza near the Tomb of " +
-        "the Unknown Soldier in Warsaw Tuesday.", Field.Store.YES));
-    doc.add(new TextField(classFieldName, "politics", Field.Store.YES));
+        "the Unknown Soldier in Warsaw Tuesday.", ft));
+    doc.add(new Field(classFieldName, "politics", ft));
 
     indexWriter.addDocument(doc, analyzer);
 
     doc = new Document();
-    doc.add(new TextField(textFieldName, "Mitt Romney seeks to assure Israel and Iran, as well as Jewish voters in the United" +
-        " States, that he will be tougher against Iran's nuclear ambitions than President Barack Obama.", Field.Store.YES));
-    doc.add(new TextField(classFieldName, "politics", Field.Store.YES));
+    doc.add(new Field(textFieldName, "Mitt Romney seeks to assure Israel and Iran, as well as Jewish voters in the United" +
+        " States, that he will be tougher against Iran's nuclear ambitions than President Barack Obama.", ft));
+    doc.add(new Field(classFieldName, "politics", ft));
     indexWriter.addDocument(doc, analyzer);
 
     doc = new Document();
-    doc.add(new TextField(textFieldName, "And there's a threshold question that he has to answer for the American people and " +
+    doc.add(new Field(textFieldName, "And there's a threshold question that he has to answer for the American people and " +
         "that's whether he is prepared to be commander-in-chief,\" she continued. \"As we look to the past events, we " +
-        "know that this raises some questions about his preparedness and we'll see how the rest of his trip goes.\"", Field.Store.YES));
-    doc.add(new TextField(classFieldName, "politics", Field.Store.YES));
+        "know that this raises some questions about his preparedness and we'll see how the rest of his trip goes.\"", ft));
+    doc.add(new Field(classFieldName, "politics", ft));
     indexWriter.addDocument(doc, analyzer);
 
     doc = new Document();
-    doc.add(new TextField(textFieldName, "Still, when it comes to gun policy, many congressional Democrats have \"decided to " +
+    doc.add(new Field(textFieldName, "Still, when it comes to gun policy, many congressional Democrats have \"decided to " +
         "keep quiet and not go there,\" said Alan Lizotte, dean and professor at the State University of New York at " +
-        "Albany's School of Criminal Justice.", Field.Store.YES));
-    doc.add(new TextField(classFieldName, "politics", Field.Store.YES));
+        "Albany's School of Criminal Justice.", ft));
+    doc.add(new Field(classFieldName, "politics", ft));
     indexWriter.addDocument(doc, analyzer);
 
     doc = new Document();
-    doc.add(new TextField(textFieldName, "Standing amongst the thousands of people at the state Capitol, Jorstad, director of " +
+    doc.add(new Field(textFieldName, "Standing amongst the thousands of people at the state Capitol, Jorstad, director of " +
         "technology at the University of Wisconsin-La Crosse, documented the historic moment and shared it with the " +
-        "world through the Internet.", Field.Store.YES));
-    doc.add(new TextField(classFieldName, "technology", Field.Store.YES));
+        "world through the Internet.", ft));
+    doc.add(new Field(classFieldName, "technology", ft));
     indexWriter.addDocument(doc, analyzer);
 
     doc = new Document();
-    doc.add(new TextField(textFieldName, "So, about all those experts and analysts who've spent the past year or so saying " +
-        "Facebook was going to make a phone. A new expert has stepped forward to say it's not going to happen.", Field.Store.YES));
-    doc.add(new TextField(classFieldName, "technology", Field.Store.YES));
+    doc.add(new Field(textFieldName, "So, about all those experts and analysts who've spent the past year or so saying " +
+        "Facebook was going to make a phone. A new expert has stepped forward to say it's not going to happen.", ft));
+    doc.add(new Field(classFieldName, "technology", ft));
     indexWriter.addDocument(doc, analyzer);
 
     doc = new Document();
-    doc.add(new TextField(textFieldName, "More than 400 million people trust Google with their e-mail, and 50 million store files" +
+    doc.add(new Field(textFieldName, "More than 400 million people trust Google with their e-mail, and 50 million store files" +
         " in the cloud using the Dropbox service. People manage their bank accounts, pay bills, trade stocks and " +
-        "generally transfer or store huge volumes of personal data online.", Field.Store.YES));
-    doc.add(new TextField(classFieldName, "technology", Field.Store.YES));
+        "generally transfer or store huge volumes of personal data online.", ft));
+    doc.add(new Field(classFieldName, "technology", ft));
     indexWriter.addDocument(doc, analyzer);
 
     indexWriter.commit();

Copied: lucene/dev/branches/branch_4x/lucene/classification/src/test/org/apache/lucene/classification/KNearestNeighborClassifierTest.java (from r1401338, lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/KNearestNeighborClassifierTest.java)
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/classification/src/test/org/apache/lucene/classification/KNearestNeighborClassifierTest.java?p2=lucene/dev/branches/branch_4x/lucene/classification/src/test/org/apache/lucene/classification/KNearestNeighborClassifierTest.java&p1=lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/KNearestNeighborClassifierTest.java&r1=1401338&r2=1448105&rev=1448105&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/KNearestNeighborClassifierTest.java (original)
+++ lucene/dev/branches/branch_4x/lucene/classification/src/test/org/apache/lucene/classification/KNearestNeighborClassifierTest.java Wed Feb 20 11:51:46 2013
@@ -1,5 +1,3 @@
-package org.apache.lucene.classification;
-
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
@@ -16,6 +14,7 @@ package org.apache.lucene.classification
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+package org.apache.lucene.classification;
 
 import org.apache.lucene.analysis.MockAnalyzer;
 import org.junit.Test;
@@ -27,7 +26,7 @@ public class KNearestNeighborClassifierT
 
   @Test
   public void testBasicUsage() throws Exception {
-     checkCorrectClassification(new KNearestNeighborClassifier(1), new MockAnalyzer(random()));
+    checkCorrectClassification(new KNearestNeighborClassifier(1), new MockAnalyzer(random()));
   }
 
 }

Copied: lucene/dev/branches/branch_4x/lucene/classification/src/test/org/apache/lucene/classification/SimpleNaiveBayesClassifierTest.java (from r1384219, lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/SimpleNaiveBayesClassifierTest.java)
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/classification/src/test/org/apache/lucene/classification/SimpleNaiveBayesClassifierTest.java?p2=lucene/dev/branches/branch_4x/lucene/classification/src/test/org/apache/lucene/classification/SimpleNaiveBayesClassifierTest.java&p1=lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/SimpleNaiveBayesClassifierTest.java&r1=1384219&r2=1448105&rev=1448105&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/SimpleNaiveBayesClassifierTest.java (original)
+++ lucene/dev/branches/branch_4x/lucene/classification/src/test/org/apache/lucene/classification/SimpleNaiveBayesClassifierTest.java Wed Feb 20 11:51:46 2013
@@ -1,5 +1,3 @@
-package org.apache.lucene.classification;
-
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
@@ -16,115 +14,36 @@ package org.apache.lucene.classification
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+package org.apache.lucene.classification;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.MockAnalyzer;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.TextField;
-import org.apache.lucene.index.RandomIndexWriter;
-import org.apache.lucene.index.SlowCompositeReaderWrapper;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.util.LuceneTestCase;
-import org.junit.After;
-import org.junit.Before;
+import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
 import org.junit.Test;
 
+import java.io.Reader;
+
 /**
  * Testcase for {@link SimpleNaiveBayesClassifier}
  */
-public class SimpleNaiveBayesClassifierTest extends LuceneTestCase {
-
-  private RandomIndexWriter indexWriter;
-  private String textFieldName;
-  private String classFieldName;
-  private Analyzer analyzer;
-  private Directory dir;
-
-  @Before
-  public void setUp() throws Exception {
-    super.setUp();
-    analyzer = new MockAnalyzer(random());
-    dir = newDirectory();
-    indexWriter = new RandomIndexWriter(random(), dir);
-    textFieldName = "text";
-    classFieldName = "cat";
-  }
-
-  @After
-  public void tearDown() throws Exception {
-    super.tearDown();
-    indexWriter.close();
-    dir.close();
-  }
+public class SimpleNaiveBayesClassifierTest extends ClassificationTestBase {
 
   @Test
   public void testBasicUsage() throws Exception {
-    SlowCompositeReaderWrapper compositeReaderWrapper = null;
-    try {
-      populateIndex();
-      SimpleNaiveBayesClassifier simpleNaiveBayesClassifier = new SimpleNaiveBayesClassifier();
-      compositeReaderWrapper = new SlowCompositeReaderWrapper(indexWriter.getReader());
-      simpleNaiveBayesClassifier.train(compositeReaderWrapper, textFieldName, classFieldName, analyzer);
-      String newText = "Much is made of what the likes of Facebook, Google and Apple know about users. Truth is, Amazon may know more. ";
-      assertEquals("technology", simpleNaiveBayesClassifier.assignClass(newText));
-    } finally {
-      if (compositeReaderWrapper != null)
-        compositeReaderWrapper.close();
-    }
+    checkCorrectClassification(new SimpleNaiveBayesClassifier(), new MockAnalyzer(random()));
   }
 
-  private void populateIndex() throws Exception {
-
-    Document doc = new Document();
-    doc.add(new TextField(textFieldName, "The traveling press secretary for Mitt Romney lost his cool and cursed at reporters " +
-        "who attempted to ask questions of the Republican presidential candidate in a public plaza near the Tomb of " +
-        "the Unknown Soldier in Warsaw Tuesday.", Field.Store.YES));
-    doc.add(new TextField(classFieldName, "politics", Field.Store.YES));
-
-    indexWriter.addDocument(doc, analyzer);
-
-    doc = new Document();
-    doc.add(new TextField(textFieldName, "Mitt Romney seeks to assure Israel and Iran, as well as Jewish voters in the United" +
-        " States, that he will be tougher against Iran's nuclear ambitions than President Barack Obama.", Field.Store.YES));
-    doc.add(new TextField(classFieldName, "politics", Field.Store.YES));
-    indexWriter.addDocument(doc, analyzer);
-
-    doc = new Document();
-    doc.add(new TextField(textFieldName, "And there's a threshold question that he has to answer for the American people and " +
-        "that's whether he is prepared to be commander-in-chief,\" she continued. \"As we look to the past events, we " +
-        "know that this raises some questions about his preparedness and we'll see how the rest of his trip goes.\"", Field.Store.YES));
-    doc.add(new TextField(classFieldName, "politics", Field.Store.YES));
-    indexWriter.addDocument(doc, analyzer);
-
-    doc = new Document();
-    doc.add(new TextField(textFieldName, "Still, when it comes to gun policy, many congressional Democrats have \"decided to " +
-        "keep quiet and not go there,\" said Alan Lizotte, dean and professor at the State University of New York at " +
-        "Albany's School of Criminal Justice.", Field.Store.YES));
-    doc.add(new TextField(classFieldName, "politics", Field.Store.YES));
-    indexWriter.addDocument(doc, analyzer);
-
-    doc = new Document();
-    doc.add(new TextField(textFieldName, "Standing amongst the thousands of people at the state Capitol, Jorstad, director of " +
-        "technology at the University of Wisconsin-La Crosse, documented the historic moment and shared it with the " +
-        "world through the Internet.", Field.Store.YES));
-    doc.add(new TextField(classFieldName, "technology", Field.Store.YES));
-    indexWriter.addDocument(doc, analyzer);
-
-    doc = new Document();
-    doc.add(new TextField(textFieldName, "So, about all those experts and analysts who've spent the past year or so saying " +
-        "Facebook was going to make a phone. A new expert has stepped forward to say it's not going to happen.", Field.Store.YES));
-    doc.add(new TextField(classFieldName, "technology", Field.Store.YES));
-    indexWriter.addDocument(doc, analyzer);
-
-    doc = new Document();
-    doc.add(new TextField(textFieldName, "More than 400 million people trust Google with their e-mail, and 50 million store files" +
-        " in the cloud using the Dropbox service. People manage their bank accounts, pay bills, trade stocks and " +
-        "generally transfer or store huge volumes of personal data online.", Field.Store.YES));
-    doc.add(new TextField(classFieldName, "technology", Field.Store.YES));
-    indexWriter.addDocument(doc, analyzer);
+  @Test
+  public void testNGramUsage() throws Exception {
+    checkCorrectClassification(new SimpleNaiveBayesClassifier(), new NGramAnalyzer());
+  }
 
-    indexWriter.commit();
+  private class NGramAnalyzer extends Analyzer {
+    @Override
+    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+      return new TokenStreamComponents(new EdgeNGramTokenizer(reader, EdgeNGramTokenizer.Side.BACK,
+          10, 20));
+    }
   }
 
 }

Copied: lucene/dev/branches/branch_4x/lucene/classification/src/test/org/apache/lucene/classification/utils/DataSplitterTest.java (from r1415060, lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/utils/DataSplitterTest.java)
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/classification/src/test/org/apache/lucene/classification/utils/DataSplitterTest.java?p2=lucene/dev/branches/branch_4x/lucene/classification/src/test/org/apache/lucene/classification/utils/DataSplitterTest.java&p1=lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/utils/DataSplitterTest.java&r1=1415060&r2=1448105&rev=1448105&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/utils/DataSplitterTest.java (original)
+++ lucene/dev/branches/branch_4x/lucene/classification/src/test/org/apache/lucene/classification/utils/DataSplitterTest.java Wed Feb 20 11:51:46 2013
@@ -55,7 +55,7 @@ public class DataSplitterTest extends Lu
   public void setUp() throws Exception {
     super.setUp();
     dir = newDirectory();
-    indexWriter = new RandomIndexWriter(random(), dir);
+    indexWriter = new RandomIndexWriter(random(), dir, new MockAnalyzer(random()));
 
     FieldType ft = new FieldType(TextField.TYPE_STORED);
     ft.setStoreTermVectors(true);
@@ -91,7 +91,7 @@ public class DataSplitterTest extends Lu
 
   @Test
   public void testSplitOnAllFields() throws Exception {
-    assertSplit(originalIndex, 0.1, 0.1, null);
+    assertSplit(originalIndex, 0.1, 0.1);
   }
 
 

Modified: lucene/dev/branches/branch_4x/lucene/module-build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/module-build.xml?rev=1448105&r1=1448104&r2=1448105&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/module-build.xml (original)
+++ lucene/dev/branches/branch_4x/lucene/module-build.xml Wed Feb 20 11:51:46 2013
@@ -176,6 +176,28 @@
     </ant>
     <property name="queries-javadocs.uptodate" value="true"/>
   </target>
+
+  <property name="classification.jar" value="${common.dir}/build/classification/lucene-classification-${version}.jar"/>
+  <target name="check-classification-uptodate" unless="classification.uptodate">
+    <module-uptodate name="classification" jarfile="${classification.jar}" property="classification.uptodate"/>
+  </target>
+  <target name="jar-classification" unless="classification.uptodate" depends="check-classification-uptodate">
+    <ant dir="${common.dir}/classification" target="jar-core" inheritAll="false">
+      <propertyset refid="uptodate.and.compiled.properties"/>
+    </ant>
+    <property name="classification.uptodate" value="true"/>
+  </target>
+
+  <property name="classification-javadoc.jar" value="${common.dir}/build/classification/lucene-classification-${version}-javadoc.jar"/>
+  <target name="check-classification-javadocs-uptodate" unless="classification-javadocs.uptodate">
+    <module-uptodate name="classification" jarfile="${classification-javadoc.jar}" property="classification-javadocs.uptodate"/>
+  </target>
+  <target name="javadocs-classification" unless="classification-javadocs.uptodate" depends="check-classification-javadocs-uptodate">
+    <ant dir="${common.dir}/classification" target="javadocs" inheritAll="false">
+      <propertyset refid="uptodate.and.compiled.properties"/>
+    </ant>
+    <property name="classification-javadocs.uptodate" value="true"/>
+  </target>
   
   <property name="facet.jar" value="${common.dir}/build/facet/lucene-facet-${version}.jar"/>
   <target name="check-facet-uptodate" unless="facet.uptodate">