You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by to...@apache.org on 2013/02/20 12:51:48 UTC
svn commit: r1448105 - in /lucene/dev/branches/branch_4x: ./ dev-tools/
dev-tools/idea/.idea/ dev-tools/idea/lucene/classification/
dev-tools/scripts/ lucene/ lucene/analysis/
lucene/analysis/icu/src/java/org/apache/lucene/collation/ lucene/backwards/
...
Author: tommaso
Date: Wed Feb 20 11:51:46 2013
New Revision: 1448105
URL: http://svn.apache.org/r1448105
Log:
LUCENE-4781 - backporting classification module to branch_4x
Added:
lucene/dev/branches/branch_4x/dev-tools/idea/lucene/classification/
- copied from r1384220, lucene/dev/trunk/dev-tools/idea/lucene/classification/
lucene/dev/branches/branch_4x/dev-tools/idea/lucene/classification/classification.iml (props changed)
- copied unchanged from r1384220, lucene/dev/trunk/dev-tools/idea/lucene/classification/classification.iml
lucene/dev/branches/branch_4x/lucene/classification/ (props changed)
- copied from r1384219, lucene/dev/trunk/lucene/classification/
lucene/dev/branches/branch_4x/lucene/classification/build.xml (contents, props changed)
- copied, changed from r1384219, lucene/dev/trunk/lucene/classification/build.xml
lucene/dev/branches/branch_4x/lucene/classification/ivy.xml (props changed)
- copied unchanged from r1384219, lucene/dev/trunk/lucene/classification/ivy.xml
lucene/dev/branches/branch_4x/lucene/classification/src/
- copied from r1384219, lucene/dev/trunk/lucene/classification/src/
lucene/dev/branches/branch_4x/lucene/classification/src/java/
- copied from r1384219, lucene/dev/trunk/lucene/classification/src/java/
lucene/dev/branches/branch_4x/lucene/classification/src/java/org/
- copied from r1384219, lucene/dev/trunk/lucene/classification/src/java/org/
lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/
- copied from r1384219, lucene/dev/trunk/lucene/classification/src/java/org/apache/
lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/
- copied from r1384219, lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/
lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/
- copied from r1384219, lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/
lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/ClassificationResult.java
- copied, changed from r1401338, lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/ClassificationResult.java
lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/Classifier.java (contents, props changed)
- copied, changed from r1384219, lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/Classifier.java
lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/KNearestNeighborClassifier.java
- copied, changed from r1401338, lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/KNearestNeighborClassifier.java
lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java (contents, props changed)
- copied, changed from r1384219, lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java
lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/package.html
- copied, changed from r1384293, lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/package.html
lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/utils/
- copied from r1415060, lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/utils/
lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/utils/DatasetSplitter.java
- copied, changed from r1415060, lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/utils/DatasetSplitter.java
lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/utils/package.html
- copied unchanged from r1415136, lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/utils/package.html
lucene/dev/branches/branch_4x/lucene/classification/src/java/overview.html (props changed)
- copied unchanged from r1384225, lucene/dev/trunk/lucene/classification/src/java/overview.html
lucene/dev/branches/branch_4x/lucene/classification/src/test/
- copied from r1384219, lucene/dev/trunk/lucene/classification/src/test/
lucene/dev/branches/branch_4x/lucene/classification/src/test/org/
- copied from r1384219, lucene/dev/trunk/lucene/classification/src/test/org/
lucene/dev/branches/branch_4x/lucene/classification/src/test/org/apache/
- copied from r1384219, lucene/dev/trunk/lucene/classification/src/test/org/apache/
lucene/dev/branches/branch_4x/lucene/classification/src/test/org/apache/lucene/
- copied from r1384219, lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/
lucene/dev/branches/branch_4x/lucene/classification/src/test/org/apache/lucene/classification/
- copied from r1384219, lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/
lucene/dev/branches/branch_4x/lucene/classification/src/test/org/apache/lucene/classification/ClassificationTestBase.java
- copied, changed from r1401338, lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/ClassificationTestBase.java
lucene/dev/branches/branch_4x/lucene/classification/src/test/org/apache/lucene/classification/KNearestNeighborClassifierTest.java
- copied, changed from r1401338, lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/KNearestNeighborClassifierTest.java
lucene/dev/branches/branch_4x/lucene/classification/src/test/org/apache/lucene/classification/SimpleNaiveBayesClassifierTest.java (contents, props changed)
- copied, changed from r1384219, lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/SimpleNaiveBayesClassifierTest.java
lucene/dev/branches/branch_4x/lucene/classification/src/test/org/apache/lucene/classification/utils/
- copied from r1415060, lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/utils/
lucene/dev/branches/branch_4x/lucene/classification/src/test/org/apache/lucene/classification/utils/DataSplitterTest.java
- copied, changed from r1415060, lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/utils/DataSplitterTest.java
Modified:
lucene/dev/branches/branch_4x/ (props changed)
lucene/dev/branches/branch_4x/dev-tools/ (props changed)
lucene/dev/branches/branch_4x/dev-tools/idea/.idea/modules.xml
lucene/dev/branches/branch_4x/dev-tools/idea/.idea/workspace.xml
lucene/dev/branches/branch_4x/dev-tools/scripts/smokeTestRelease.py
lucene/dev/branches/branch_4x/lucene/ (props changed)
lucene/dev/branches/branch_4x/lucene/BUILD.txt (props changed)
lucene/dev/branches/branch_4x/lucene/CHANGES.txt (props changed)
lucene/dev/branches/branch_4x/lucene/JRE_VERSION_MIGRATION.txt (props changed)
lucene/dev/branches/branch_4x/lucene/LICENSE.txt (props changed)
lucene/dev/branches/branch_4x/lucene/MIGRATE.txt (props changed)
lucene/dev/branches/branch_4x/lucene/NOTICE.txt (props changed)
lucene/dev/branches/branch_4x/lucene/README.txt (props changed)
lucene/dev/branches/branch_4x/lucene/SYSTEM_REQUIREMENTS.txt (props changed)
lucene/dev/branches/branch_4x/lucene/analysis/ (props changed)
lucene/dev/branches/branch_4x/lucene/analysis/icu/src/java/org/apache/lucene/collation/ICUCollationKeyFilterFactory.java (props changed)
lucene/dev/branches/branch_4x/lucene/backwards/ (props changed)
lucene/dev/branches/branch_4x/lucene/benchmark/ (props changed)
lucene/dev/branches/branch_4x/lucene/build.xml (contents, props changed)
lucene/dev/branches/branch_4x/lucene/codecs/ (props changed)
lucene/dev/branches/branch_4x/lucene/common-build.xml (props changed)
lucene/dev/branches/branch_4x/lucene/core/ (props changed)
lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java (props changed)
lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/index/index.40.cfs.zip (props changed)
lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/index/index.40.nocfs.zip (props changed)
lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/index/index.40.optimized.cfs.zip (props changed)
lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/index/index.40.optimized.nocfs.zip (props changed)
lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/search/TestSort.java (props changed)
lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/search/TestSortDocValues.java (props changed)
lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/search/TestSortRandom.java (props changed)
lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/search/TestTopFieldCollector.java (props changed)
lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/search/TestTotalHitCountCollector.java (props changed)
lucene/dev/branches/branch_4x/lucene/demo/ (props changed)
lucene/dev/branches/branch_4x/lucene/facet/ (props changed)
lucene/dev/branches/branch_4x/lucene/grouping/ (props changed)
lucene/dev/branches/branch_4x/lucene/highlighter/ (props changed)
lucene/dev/branches/branch_4x/lucene/ivy-settings.xml (props changed)
lucene/dev/branches/branch_4x/lucene/join/ (props changed)
lucene/dev/branches/branch_4x/lucene/licenses/ (props changed)
lucene/dev/branches/branch_4x/lucene/memory/ (props changed)
lucene/dev/branches/branch_4x/lucene/misc/ (props changed)
lucene/dev/branches/branch_4x/lucene/module-build.xml (contents, props changed)
lucene/dev/branches/branch_4x/lucene/queries/ (props changed)
lucene/dev/branches/branch_4x/lucene/queryparser/ (props changed)
lucene/dev/branches/branch_4x/lucene/sandbox/ (props changed)
lucene/dev/branches/branch_4x/lucene/site/ (props changed)
lucene/dev/branches/branch_4x/lucene/spatial/ (props changed)
lucene/dev/branches/branch_4x/lucene/suggest/ (props changed)
lucene/dev/branches/branch_4x/lucene/test-framework/ (props changed)
lucene/dev/branches/branch_4x/lucene/tools/ (props changed)
lucene/dev/branches/branch_4x/solr/ (props changed)
lucene/dev/branches/branch_4x/solr/CHANGES.txt (props changed)
lucene/dev/branches/branch_4x/solr/LICENSE.txt (props changed)
lucene/dev/branches/branch_4x/solr/NOTICE.txt (props changed)
lucene/dev/branches/branch_4x/solr/README.txt (props changed)
lucene/dev/branches/branch_4x/solr/SYSTEM_REQUIREMENTS.txt (props changed)
lucene/dev/branches/branch_4x/solr/build.xml (props changed)
lucene/dev/branches/branch_4x/solr/cloud-dev/ (props changed)
lucene/dev/branches/branch_4x/solr/common-build.xml (props changed)
lucene/dev/branches/branch_4x/solr/contrib/ (props changed)
lucene/dev/branches/branch_4x/solr/core/ (props changed)
lucene/dev/branches/branch_4x/solr/example/ (props changed)
lucene/dev/branches/branch_4x/solr/licenses/ (props changed)
lucene/dev/branches/branch_4x/solr/licenses/httpclient-LICENSE-ASL.txt (props changed)
lucene/dev/branches/branch_4x/solr/licenses/httpclient-NOTICE.txt (props changed)
lucene/dev/branches/branch_4x/solr/licenses/httpcore-LICENSE-ASL.txt (props changed)
lucene/dev/branches/branch_4x/solr/licenses/httpcore-NOTICE.txt (props changed)
lucene/dev/branches/branch_4x/solr/licenses/httpmime-LICENSE-ASL.txt (props changed)
lucene/dev/branches/branch_4x/solr/licenses/httpmime-NOTICE.txt (props changed)
lucene/dev/branches/branch_4x/solr/scripts/ (props changed)
lucene/dev/branches/branch_4x/solr/site/ (props changed)
lucene/dev/branches/branch_4x/solr/solrj/ (props changed)
lucene/dev/branches/branch_4x/solr/test-framework/ (props changed)
lucene/dev/branches/branch_4x/solr/testlogging.properties (props changed)
lucene/dev/branches/branch_4x/solr/webapp/ (props changed)
Modified: lucene/dev/branches/branch_4x/dev-tools/idea/.idea/modules.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/dev-tools/idea/.idea/modules.xml?rev=1448105&r1=1448104&r2=1448105&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/dev-tools/idea/.idea/modules.xml (original)
+++ lucene/dev/branches/branch_4x/dev-tools/idea/.idea/modules.xml Wed Feb 20 11:51:46 2013
@@ -18,6 +18,7 @@
<module filepath="$PROJECT_DIR$/lucene/analysis/uima/analysis-uima.iml" />
<module filepath="$PROJECT_DIR$/lucene/benchmark/src/benchmark.iml" />
<module filepath="$PROJECT_DIR$/lucene/benchmark/conf/benchmark-conf.iml" />
+ <module filepath="$PROJECT_DIR$/lucene/classification/classification.iml" />
<module filepath="$PROJECT_DIR$/lucene/codecs/src/java/codecs.iml" />
<module filepath="$PROJECT_DIR$/lucene/codecs/src/test/codecs-tests.iml" />
<module filepath="$PROJECT_DIR$/lucene/codecs/src/resources/codecs-resources.iml" />
Modified: lucene/dev/branches/branch_4x/dev-tools/idea/.idea/workspace.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/dev-tools/idea/.idea/workspace.xml?rev=1448105&r1=1448104&r2=1448105&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/dev-tools/idea/.idea/workspace.xml (original)
+++ lucene/dev/branches/branch_4x/dev-tools/idea/.idea/workspace.xml Wed Feb 20 11:51:46 2013
@@ -74,6 +74,13 @@
<option name="VM_PARAMETERS" value="-ea -DtempDir=temp" />
<option name="TEST_SEARCH_SCOPE"><value defaultName="singleModule" /></option>
</configuration>
+ <configuration default="false" name="Module classification" type="JUnit" factoryName="JUnit">
+ <module name="classification" />
+ <option name="TEST_OBJECT" value="package" />
+ <option name="WORKING_DIRECTORY" value="file://$PROJECT_DIR$/lucene/build/classification" />
+ <option name="VM_PARAMETERS" value="-ea -DtempDir=temp" />
+ <option name="TEST_SEARCH_SCOPE"><value defaultName="singleModule" /></option>
+ </configuration>
<configuration default="false" name="Module codecs" type="JUnit" factoryName="JUnit">
<module name="codecs-tests" />
<option name="TEST_OBJECT" value="package" />
Modified: lucene/dev/branches/branch_4x/dev-tools/scripts/smokeTestRelease.py
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/dev-tools/scripts/smokeTestRelease.py?rev=1448105&r1=1448104&r2=1448105&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/dev-tools/scripts/smokeTestRelease.py (original)
+++ lucene/dev/branches/branch_4x/dev-tools/scripts/smokeTestRelease.py Wed Feb 20 11:51:46 2013
@@ -617,7 +617,7 @@ def verifyUnpacked(project, artifact, un
if project == 'lucene':
# TODO: clean this up to not be a list of modules that we must maintain
- extras = ('analysis', 'benchmark', 'codecs', 'core', 'demo', 'docs', 'facet', 'grouping', 'highlighter', 'join', 'memory', 'misc', 'queries', 'queryparser', 'sandbox', 'spatial', 'suggest', 'test-framework', 'licenses')
+ extras = ('analysis', 'benchmark', 'classification', 'codecs', 'core', 'demo', 'docs', 'facet', 'grouping', 'highlighter', 'join', 'memory', 'misc', 'queries', 'queryparser', 'sandbox', 'spatial', 'suggest', 'test-framework', 'licenses')
if isSrc:
extras += ('build.xml', 'common-build.xml', 'module-build.xml', 'ivy-settings.xml', 'backwards', 'tools', 'site')
else:
Modified: lucene/dev/branches/branch_4x/lucene/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/build.xml?rev=1448105&r1=1448104&r2=1448105&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/build.xml (original)
+++ lucene/dev/branches/branch_4x/lucene/build.xml Wed Feb 20 11:51:46 2013
@@ -293,6 +293,7 @@
<check-missing-javadocs dir="build/docs/core/org/apache/lucene/search/similarities" level="method"/>
<check-missing-javadocs dir="build/docs/core/org/apache/lucene/index" level="method"/>
<check-missing-javadocs dir="build/docs/core/org/apache/lucene/codecs" level="method"/>
+ <check-missing-javadocs dir="build/docs/classification" level="method"/>
</target>
<target name="-ecj-javadoc-lint" depends="compile,compile-test,-ecj-resolve">
Copied: lucene/dev/branches/branch_4x/lucene/classification/build.xml (from r1384219, lucene/dev/trunk/lucene/classification/build.xml)
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/classification/build.xml?p2=lucene/dev/branches/branch_4x/lucene/classification/build.xml&p1=lucene/dev/trunk/lucene/classification/build.xml&r1=1384219&r2=1448105&rev=1448105&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/classification/build.xml (original)
+++ lucene/dev/branches/branch_4x/lucene/classification/build.xml Wed Feb 20 11:51:46 2013
@@ -23,4 +23,33 @@
</description>
<import file="../module-build.xml"/>
+
+ <path id="classpath">
+ <path refid="base.classpath"/>
+ <pathelement path="${lucene-core.jar}"/>
+ <pathelement path="${queries.jar}"/>
+ <pathelement path="${project.classpath}"/>
+ <pathelement location="${build.dir}/classes/java" />
+ </path>
+
+ <path id="test.classpath">
+ <pathelement path="${analyzers-common.jar}"/>
+ <pathelement location="${test-framework.jar}"/>
+ <pathelement location="${codecs.jar}"/>
+ <path refid="test.base.classpath"/>
+ </path>
+
+ <target name="dist-maven" depends="dist-maven-src-java"/>
+ <target name="compile-core" depends="jar-queries,jar-analyzers-common,common.compile-core" />
+
+ <target name="jar-core" depends="common.jar-core" />
+
+ <target name="javadocs" depends="javadocs-queries,compile-core">
+ <invoke-module-javadoc>
+ <links>
+ <link href="../queries"/>
+ </links>
+ </invoke-module-javadoc>
+ </target>
+
</project>
Copied: lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/ClassificationResult.java (from r1401338, lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/ClassificationResult.java)
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/ClassificationResult.java?p2=lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/ClassificationResult.java&p1=lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/ClassificationResult.java&r1=1401338&r2=1448105&rev=1448105&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/ClassificationResult.java (original)
+++ lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/ClassificationResult.java Wed Feb 20 11:51:46 2013
@@ -1,5 +1,3 @@
-package org.apache.lucene.classification;
-
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@@ -16,24 +14,39 @@ package org.apache.lucene.classification
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+package org.apache.lucene.classification;
/**
- * The result of a call to {@link Classifier#assignClass(String)} holding an assigned class and a score.
+ * The result of a call to {@link Classifier#assignClass(String)} holding an assigned class of type <code>T</code> and a score.
+ * @lucene.experimental
*/
-public class ClassificationResult {
+public class ClassificationResult<T> {
- private String assignedClass;
+ private T assignedClass;
private double score;
- public ClassificationResult(String assignedClass, double score) {
+ /**
+ * Constructor
+ * @param assignedClass the class <code>T</code> assigned by a {@link Classifier}
+ * @param score the score for the assignedClass as a <code>double</code>
+ */
+ public ClassificationResult(T assignedClass, double score) {
this.assignedClass = assignedClass;
this.score = score;
}
- public String getAssignedClass() {
+ /**
+ * retrieve the result class
+ * @return a <code>T</code> representing an assigned class
+ */
+ public T getAssignedClass() {
return assignedClass;
}
+ /**
+ * retrieve the result score
+ * @return a <code>double</code> representing a result score
+ */
public double getScore() {
return score;
}
Copied: lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/Classifier.java (from r1384219, lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/Classifier.java)
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/Classifier.java?p2=lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/Classifier.java&p1=lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/Classifier.java&r1=1384219&r2=1448105&rev=1448105&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/Classifier.java (original)
+++ lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/Classifier.java Wed Feb 20 11:51:46 2013
@@ -1,5 +1,3 @@
-package org.apache.lucene.classification;
-
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@@ -16,6 +14,7 @@ package org.apache.lucene.classification
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+package org.apache.lucene.classification;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.AtomicReader;
@@ -23,17 +22,19 @@ import org.apache.lucene.index.AtomicRea
import java.io.IOException;
/**
- * A classifier, see <code>http://en.wikipedia.org/wiki/Classifier_(mathematics)</code>
+ * A classifier, see <code>http://en.wikipedia.org/wiki/Classifier_(mathematics)</code>, which assigns classes of type
+ * <code>T</code>
+ * @lucene.experimental
*/
-public interface Classifier {
+public interface Classifier<T> {
/**
- * Assign a class to the given text String
+ * Assign a class (with score) to the given text String
* @param text a String containing text to be classified
- * @return a String representing a class
+ * @return a {@link ClassificationResult} holding assigned class of type <code>T</code> and score
* @throws IOException
*/
- public String assignClass(String text) throws IOException;
+ public ClassificationResult<T> assignClass(String text) throws IOException;
/**
* Train the classifier using the underlying Lucene index
Copied: lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/KNearestNeighborClassifier.java (from r1401338, lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/KNearestNeighborClassifier.java)
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/KNearestNeighborClassifier.java?p2=lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/KNearestNeighborClassifier.java&p1=lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/KNearestNeighborClassifier.java&r1=1401338&r2=1448105&rev=1448105&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/KNearestNeighborClassifier.java (original)
+++ lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/KNearestNeighborClassifier.java Wed Feb 20 11:51:46 2013
@@ -1,5 +1,3 @@
-package org.apache.lucene.classification;
-
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@@ -16,6 +14,7 @@ package org.apache.lucene.classification
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+package org.apache.lucene.classification;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.AtomicReader;
@@ -24,6 +23,7 @@ import org.apache.lucene.search.IndexSea
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.util.BytesRef;
import java.io.IOException;
import java.io.StringReader;
@@ -33,8 +33,10 @@ import java.util.Map;
/**
* A k-Nearest Neighbor classifier (see <code>http://en.wikipedia.org/wiki/K-nearest_neighbors</code>) based
* on {@link MoreLikeThis}
+ *
+ * @lucene.experimental
*/
-public class KNearestNeighborClassifier implements Classifier {
+public class KNearestNeighborClassifier implements Classifier<BytesRef> {
private MoreLikeThis mlt;
private String textFieldName;
@@ -42,40 +44,55 @@ public class KNearestNeighborClassifier
private IndexSearcher indexSearcher;
private int k;
+ /**
+ * Create a {@link Classifier} using kNN algorithm
+ *
+ * @param k the number of neighbors to analyze as an <code>int</code>
+ */
public KNearestNeighborClassifier(int k) {
this.k = k;
}
+ /**
+ * {@inheritDoc}
+ */
@Override
- public ClassificationResult assignClass(String text) throws IOException {
+ public ClassificationResult<BytesRef> assignClass(String text) throws IOException {
Query q = mlt.like(new StringReader(text), textFieldName);
- TopDocs docs = indexSearcher.search(q, k);
+ TopDocs topDocs = indexSearcher.search(q, k);
+ return selectClassFromNeighbors(topDocs);
+ }
+ private ClassificationResult<BytesRef> selectClassFromNeighbors(TopDocs topDocs) throws IOException {
// TODO : improve the nearest neighbor selection
- Map<String, Integer> classCounts = new HashMap<String, Integer>();
- for (ScoreDoc scoreDoc : docs.scoreDocs) {
- String cl = indexSearcher.doc(scoreDoc.doc).getField(classFieldName).stringValue();
- Integer count = classCounts.get(cl);
- if (count != null) {
- classCounts.put(cl, count + 1);
- }
- else {
- classCounts.put(cl, 1);
+ Map<BytesRef, Integer> classCounts = new HashMap<BytesRef, Integer>();
+ for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
+ BytesRef cl = new BytesRef(indexSearcher.doc(scoreDoc.doc).getField(classFieldName).stringValue());
+ if (cl != null) {
+ Integer count = classCounts.get(cl);
+ if (count != null) {
+ classCounts.put(cl, count + 1);
+ } else {
+ classCounts.put(cl, 1);
+ }
}
}
- int max = 0;
- String assignedClass = null;
- for (String cl : classCounts.keySet()) {
+ double max = 0;
+ BytesRef assignedClass = new BytesRef();
+ for (BytesRef cl : classCounts.keySet()) {
Integer count = classCounts.get(cl);
if (count > max) {
max = count;
- assignedClass = cl;
+ assignedClass = cl.clone();
}
}
- double score = 1; // TODO : derive score from query
- return new ClassificationResult(assignedClass, score);
+ double score = max / (double) k;
+ return new ClassificationResult<BytesRef>(assignedClass, score);
}
+ /**
+ * {@inheritDoc}
+ */
@Override
public void train(AtomicReader atomicReader, String textFieldName, String classFieldName, Analyzer analyzer) throws IOException {
this.textFieldName = textFieldName;
Copied: lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java (from r1384219, lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java)
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java?p2=lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java&p1=lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java&r1=1384219&r2=1448105&rev=1448105&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java (original)
+++ lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java Wed Feb 20 11:51:46 2013
@@ -1,5 +1,3 @@
-package org.apache.lucene.classification;
-
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@@ -16,6 +14,7 @@ package org.apache.lucene.classification
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+package org.apache.lucene.classification;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
@@ -29,6 +28,7 @@ import org.apache.lucene.search.BooleanC
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TotalHitCountCollector;
import org.apache.lucene.util.BytesRef;
import java.io.IOException;
@@ -38,8 +38,10 @@ import java.util.LinkedList;
/**
* A simplistic Lucene based NaiveBayes classifier, see <code>http://en.wikipedia.org/wiki/Naive_Bayes_classifier</code>
+ *
+ * @lucene.experimental
*/
-public class SimpleNaiveBayesClassifier implements Classifier {
+public class SimpleNaiveBayesClassifier implements Classifier<BytesRef> {
private AtomicReader atomicReader;
private String textFieldName;
@@ -48,6 +50,18 @@ public class SimpleNaiveBayesClassifier
private Analyzer analyzer;
private IndexSearcher indexSearcher;
+ /**
+ * Creates a new NaiveBayes classifier.
+ * Note that you must call {@link #train(AtomicReader, String, String, Analyzer) train()} before you can
+ * classify any documents.
+ */
+ public SimpleNaiveBayesClassifier() {
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ @Override
public void train(AtomicReader atomicReader, String textFieldName, String classFieldName, Analyzer analyzer)
throws IOException {
this.atomicReader = atomicReader;
@@ -71,34 +85,37 @@ public class SimpleNaiveBayesClassifier
return result.toArray(new String[result.size()]);
}
- public String assignClass(String inputDocument) throws IOException {
+ /**
+ * {@inheritDoc}
+ */
+ @Override
+ public ClassificationResult<BytesRef> assignClass(String inputDocument) throws IOException {
if (atomicReader == null) {
throw new RuntimeException("need to train the classifier first");
}
- Double max = 0d;
- String foundClass = null;
+ double max = 0d;
+ BytesRef foundClass = new BytesRef();
Terms terms = MultiFields.getTerms(atomicReader, classFieldName);
TermsEnum termsEnum = terms.iterator(null);
- BytesRef t = termsEnum.next();
- while (t != null) {
- String classValue = t.utf8ToString();
+ BytesRef next;
+ String[] tokenizedDoc = tokenizeDoc(inputDocument);
+ while ((next = termsEnum.next()) != null) {
// TODO : turn it to be in log scale
- Double clVal = calculatePrior(classValue) * calculateLikelihood(inputDocument, classValue);
+ double clVal = calculatePrior(next) * calculateLikelihood(tokenizedDoc, next);
if (clVal > max) {
max = clVal;
- foundClass = classValue;
+ foundClass = next.clone();
}
- t = termsEnum.next();
}
- return foundClass;
+ return new ClassificationResult<BytesRef>(foundClass, max);
}
- private Double calculateLikelihood(String document, String c) throws IOException {
+ private double calculateLikelihood(String[] tokenizedDoc, BytesRef c) throws IOException {
// for each word
- Double result = 1d;
- for (String word : tokenizeDoc(document)) {
+ double result = 1d;
+ for (String word : tokenizedDoc) {
// search with text:word AND class:c
int hits = getWordFreqForClass(word, c);
@@ -117,26 +134,28 @@ public class SimpleNaiveBayesClassifier
return result;
}
- private double getTextTermFreqForClass(String c) throws IOException {
+ private double getTextTermFreqForClass(BytesRef c) throws IOException {
Terms terms = MultiFields.getTerms(atomicReader, textFieldName);
long numPostings = terms.getSumDocFreq(); // number of term/doc pairs
double avgNumberOfUniqueTerms = numPostings / (double) terms.getDocCount(); // avg # of unique terms per doc
- int docsWithC = atomicReader.docFreq(classFieldName, new BytesRef(c));
+ int docsWithC = atomicReader.docFreq(new Term(classFieldName, c));
return avgNumberOfUniqueTerms * docsWithC; // avg # of unique terms in text field per doc * # docs with c
}
- private int getWordFreqForClass(String word, String c) throws IOException {
+ private int getWordFreqForClass(String word, BytesRef c) throws IOException {
BooleanQuery booleanQuery = new BooleanQuery();
booleanQuery.add(new BooleanClause(new TermQuery(new Term(textFieldName, word)), BooleanClause.Occur.MUST));
booleanQuery.add(new BooleanClause(new TermQuery(new Term(classFieldName, c)), BooleanClause.Occur.MUST));
- return indexSearcher.search(booleanQuery, 1).totalHits;
+ TotalHitCountCollector totalHitCountCollector = new TotalHitCountCollector();
+ indexSearcher.search(booleanQuery, totalHitCountCollector);
+ return totalHitCountCollector.getTotalHits();
}
- private Double calculatePrior(String currentClass) throws IOException {
+ private double calculatePrior(BytesRef currentClass) throws IOException {
return (double) docCount(currentClass) / docsWithClassSize;
}
- private int docCount(String countedClass) throws IOException {
+ private int docCount(BytesRef countedClass) throws IOException {
return atomicReader.docFreq(new Term(classFieldName, countedClass));
}
}
Copied: lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/package.html (from r1384293, lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/package.html)
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/package.html?p2=lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/package.html&p1=lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/package.html&r1=1384293&r2=1448105&rev=1448105&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/package.html (original)
+++ lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/package.html Wed Feb 20 11:51:46 2013
@@ -18,6 +18,6 @@
<body>
Uses already seen data (the indexed documents) to classify new documents.
Currently only contains a (simplistic) Lucene based Naive Bayes classifier
-but more implementations will be added in the future.
+and a k-Nearest Neighbor classifier.
</body>
</html>
Copied: lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/utils/DatasetSplitter.java (from r1415060, lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/utils/DatasetSplitter.java)
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/utils/DatasetSplitter.java?p2=lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/utils/DatasetSplitter.java&p1=lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/utils/DatasetSplitter.java&r1=1415060&r2=1448105&rev=1448105&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/utils/DatasetSplitter.java (original)
+++ lucene/dev/branches/branch_4x/lucene/classification/src/java/org/apache/lucene/classification/utils/DatasetSplitter.java Wed Feb 20 11:51:46 2013
@@ -25,7 +25,7 @@ import org.apache.lucene.document.TextFi
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
-import org.apache.lucene.index.StorableField;
+import org.apache.lucene.index.IndexableField;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.ScoreDoc;
@@ -43,20 +43,37 @@ public class DatasetSplitter {
private double crossValidationRatio;
private double testRatio;
+ /**
+ * Create a {@link DatasetSplitter} by giving the sizes of the test and cross validation IDXs
+ *
+ * @param testRatio the ratio of the original index to be used for the test IDX as a <code>double</code> between 0.0 and 1.0
+ * @param crossValidationRatio the ratio of the original index to be used for the c.v. IDX as a <code>double</code> between 0.0 and 1.0
+ */
public DatasetSplitter(double testRatio, double crossValidationRatio) {
this.crossValidationRatio = crossValidationRatio;
this.testRatio = testRatio;
}
+ /**
+ * Split a given index into 3 indexes for training, test and cross validation tasks respectively
+ *
+ * @param originalIndex an {@link AtomicReader} on the source index
+ * @param trainingIndex a {@link Directory} used to write the training index
+ * @param testIndex a {@link Directory} used to write the test index
+ * @param crossValidationIndex a {@link Directory} used to write the cross validation index
+ * @param analyzer {@link Analyzer} used to create the new docs
+ * @param fieldNames names of fields that need to be put in the new indexes or <code>null</code> if all should be used
+ * @throws IOException if any writing operation fails on any of the indexes
+ */
public void split(AtomicReader originalIndex, Directory trainingIndex, Directory testIndex, Directory crossValidationIndex,
Analyzer analyzer, String... fieldNames) throws IOException {
// TODO : check that the passed fields are stored in the original index
// create IWs for train / test / cv IDXs
- IndexWriter testWriter = new IndexWriter(testIndex, new IndexWriterConfig(Version.LUCENE_50, analyzer));
- IndexWriter cvWriter = new IndexWriter(crossValidationIndex, new IndexWriterConfig(Version.LUCENE_50, analyzer));
- IndexWriter trainingWriter = new IndexWriter(trainingIndex, new IndexWriterConfig(Version.LUCENE_50, analyzer));
+ IndexWriter testWriter = new IndexWriter(testIndex, new IndexWriterConfig(Version.LUCENE_42, analyzer));
+ IndexWriter cvWriter = new IndexWriter(crossValidationIndex, new IndexWriterConfig(Version.LUCENE_42, analyzer));
+ IndexWriter trainingWriter = new IndexWriter(trainingIndex, new IndexWriterConfig(Version.LUCENE_42, analyzer));
try {
int size = originalIndex.maxDoc();
@@ -82,17 +99,14 @@ public class DatasetSplitter {
doc.add(new Field(fieldName, originalIndex.document(scoreDoc.doc).getField(fieldName).stringValue(), ft));
}
} else {
- for (StorableField storableField : originalIndex.document(scoreDoc.doc).getFields()) {
- if (storableField.readerValue()!= null){
+ for (IndexableField storableField : originalIndex.document(scoreDoc.doc).getFields()) {
+ if (storableField.readerValue() != null) {
doc.add(new Field(storableField.name(), storableField.readerValue(), ft));
- }
- else if (storableField.binaryValue()!= null){
+ } else if (storableField.binaryValue() != null) {
doc.add(new Field(storableField.name(), storableField.binaryValue(), ft));
- }
- else if (storableField.stringValue()!= null){
+ } else if (storableField.stringValue() != null) {
doc.add(new Field(storableField.name(), storableField.stringValue(), ft));
- }
- else if (storableField.numericValue()!= null){
+ } else if (storableField.numericValue() != null) {
doc.add(new Field(storableField.name(), storableField.numericValue().toString(), ft));
}
}
@@ -101,19 +115,19 @@ public class DatasetSplitter {
// add it to one of the IDXs
if (b % 2 == 0 && testWriter.maxDoc() < size * testRatio) {
testWriter.addDocument(doc);
- testWriter.commit();
} else if (cvWriter.maxDoc() < size * crossValidationRatio) {
cvWriter.addDocument(doc);
- cvWriter.commit();
} else {
trainingWriter.addDocument(doc);
- trainingWriter.commit();
}
b++;
}
} catch (Exception e) {
throw new IOException(e);
} finally {
+ testWriter.commit();
+ cvWriter.commit();
+ trainingWriter.commit();
// close IWs
testWriter.close();
cvWriter.close();
Copied: lucene/dev/branches/branch_4x/lucene/classification/src/test/org/apache/lucene/classification/ClassificationTestBase.java (from r1401338, lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/ClassificationTestBase.java)
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/classification/src/test/org/apache/lucene/classification/ClassificationTestBase.java?p2=lucene/dev/branches/branch_4x/lucene/classification/src/test/org/apache/lucene/classification/ClassificationTestBase.java&p1=lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/ClassificationTestBase.java&r1=1401338&r2=1448105&rev=1448105&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/ClassificationTestBase.java (original)
+++ lucene/dev/branches/branch_4x/lucene/classification/src/test/org/apache/lucene/classification/ClassificationTestBase.java Wed Feb 20 11:51:46 2013
@@ -1,5 +1,3 @@
-package org.apache.lucene.classification;
-
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@@ -16,14 +14,17 @@ package org.apache.lucene.classification
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+package org.apache.lucene.classification;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
+import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.SlowCompositeReaderWrapper;
import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.After;
import org.junit.Before;
@@ -31,7 +32,7 @@ import org.junit.Before;
/**
* Base class for testing {@link Classifier}s
*/
-public class ClassificationTestBase extends LuceneTestCase {
+public abstract class ClassificationTestBase extends LuceneTestCase {
private RandomIndexWriter indexWriter;
private String textFieldName;
@@ -54,15 +55,17 @@ public class ClassificationTestBase exte
dir.close();
}
- protected void checkCorrectClassification(Classifier classifier, Analyzer analyzer) throws Exception {
+
+ protected void checkCorrectClassification(Classifier<BytesRef> classifier, Analyzer analyzer) throws Exception {
SlowCompositeReaderWrapper compositeReaderWrapper = null;
try {
populateIndex(analyzer);
compositeReaderWrapper = new SlowCompositeReaderWrapper(indexWriter.getReader());
classifier.train(compositeReaderWrapper, textFieldName, classFieldName, analyzer);
String newText = "Much is made of what the likes of Facebook, Google and Apple know about users. Truth is, Amazon may know more.";
- ClassificationResult classificationResult = classifier.assignClass(newText);
- assertEquals("technology", classificationResult.getAssignedClass());
+ ClassificationResult<BytesRef> classificationResult = classifier.assignClass(newText);
+ assertNotNull(classificationResult.getAssignedClass());
+ assertEquals(new BytesRef("technology"), classificationResult.getAssignedClass());
assertTrue(classificationResult.getScore() > 0);
} finally {
if (compositeReaderWrapper != null)
@@ -72,52 +75,57 @@ public class ClassificationTestBase exte
private void populateIndex(Analyzer analyzer) throws Exception {
+ FieldType ft = new FieldType(TextField.TYPE_STORED);
+ ft.setStoreTermVectors(true);
+ ft.setStoreTermVectorOffsets(true);
+ ft.setStoreTermVectorPositions(true);
+
Document doc = new Document();
- doc.add(new TextField(textFieldName, "The traveling press secretary for Mitt Romney lost his cool and cursed at reporters " +
+ doc.add(new Field(textFieldName, "The traveling press secretary for Mitt Romney lost his cool and cursed at reporters " +
"who attempted to ask questions of the Republican presidential candidate in a public plaza near the Tomb of " +
- "the Unknown Soldier in Warsaw Tuesday.", Field.Store.YES));
- doc.add(new TextField(classFieldName, "politics", Field.Store.YES));
+ "the Unknown Soldier in Warsaw Tuesday.", ft));
+ doc.add(new Field(classFieldName, "politics", ft));
indexWriter.addDocument(doc, analyzer);
doc = new Document();
- doc.add(new TextField(textFieldName, "Mitt Romney seeks to assure Israel and Iran, as well as Jewish voters in the United" +
- " States, that he will be tougher against Iran's nuclear ambitions than President Barack Obama.", Field.Store.YES));
- doc.add(new TextField(classFieldName, "politics", Field.Store.YES));
+ doc.add(new Field(textFieldName, "Mitt Romney seeks to assure Israel and Iran, as well as Jewish voters in the United" +
+ " States, that he will be tougher against Iran's nuclear ambitions than President Barack Obama.", ft));
+ doc.add(new Field(classFieldName, "politics", ft));
indexWriter.addDocument(doc, analyzer);
doc = new Document();
- doc.add(new TextField(textFieldName, "And there's a threshold question that he has to answer for the American people and " +
+ doc.add(new Field(textFieldName, "And there's a threshold question that he has to answer for the American people and " +
"that's whether he is prepared to be commander-in-chief,\" she continued. \"As we look to the past events, we " +
- "know that this raises some questions about his preparedness and we'll see how the rest of his trip goes.\"", Field.Store.YES));
- doc.add(new TextField(classFieldName, "politics", Field.Store.YES));
+ "know that this raises some questions about his preparedness and we'll see how the rest of his trip goes.\"", ft));
+ doc.add(new Field(classFieldName, "politics", ft));
indexWriter.addDocument(doc, analyzer);
doc = new Document();
- doc.add(new TextField(textFieldName, "Still, when it comes to gun policy, many congressional Democrats have \"decided to " +
+ doc.add(new Field(textFieldName, "Still, when it comes to gun policy, many congressional Democrats have \"decided to " +
"keep quiet and not go there,\" said Alan Lizotte, dean and professor at the State University of New York at " +
- "Albany's School of Criminal Justice.", Field.Store.YES));
- doc.add(new TextField(classFieldName, "politics", Field.Store.YES));
+ "Albany's School of Criminal Justice.", ft));
+ doc.add(new Field(classFieldName, "politics", ft));
indexWriter.addDocument(doc, analyzer);
doc = new Document();
- doc.add(new TextField(textFieldName, "Standing amongst the thousands of people at the state Capitol, Jorstad, director of " +
+ doc.add(new Field(textFieldName, "Standing amongst the thousands of people at the state Capitol, Jorstad, director of " +
"technology at the University of Wisconsin-La Crosse, documented the historic moment and shared it with the " +
- "world through the Internet.", Field.Store.YES));
- doc.add(new TextField(classFieldName, "technology", Field.Store.YES));
+ "world through the Internet.", ft));
+ doc.add(new Field(classFieldName, "technology", ft));
indexWriter.addDocument(doc, analyzer);
doc = new Document();
- doc.add(new TextField(textFieldName, "So, about all those experts and analysts who've spent the past year or so saying " +
- "Facebook was going to make a phone. A new expert has stepped forward to say it's not going to happen.", Field.Store.YES));
- doc.add(new TextField(classFieldName, "technology", Field.Store.YES));
+ doc.add(new Field(textFieldName, "So, about all those experts and analysts who've spent the past year or so saying " +
+ "Facebook was going to make a phone. A new expert has stepped forward to say it's not going to happen.", ft));
+ doc.add(new Field(classFieldName, "technology", ft));
indexWriter.addDocument(doc, analyzer);
doc = new Document();
- doc.add(new TextField(textFieldName, "More than 400 million people trust Google with their e-mail, and 50 million store files" +
+ doc.add(new Field(textFieldName, "More than 400 million people trust Google with their e-mail, and 50 million store files" +
" in the cloud using the Dropbox service. People manage their bank accounts, pay bills, trade stocks and " +
- "generally transfer or store huge volumes of personal data online.", Field.Store.YES));
- doc.add(new TextField(classFieldName, "technology", Field.Store.YES));
+ "generally transfer or store huge volumes of personal data online.", ft));
+ doc.add(new Field(classFieldName, "technology", ft));
indexWriter.addDocument(doc, analyzer);
indexWriter.commit();
Copied: lucene/dev/branches/branch_4x/lucene/classification/src/test/org/apache/lucene/classification/KNearestNeighborClassifierTest.java (from r1401338, lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/KNearestNeighborClassifierTest.java)
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/classification/src/test/org/apache/lucene/classification/KNearestNeighborClassifierTest.java?p2=lucene/dev/branches/branch_4x/lucene/classification/src/test/org/apache/lucene/classification/KNearestNeighborClassifierTest.java&p1=lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/KNearestNeighborClassifierTest.java&r1=1401338&r2=1448105&rev=1448105&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/KNearestNeighborClassifierTest.java (original)
+++ lucene/dev/branches/branch_4x/lucene/classification/src/test/org/apache/lucene/classification/KNearestNeighborClassifierTest.java Wed Feb 20 11:51:46 2013
@@ -1,5 +1,3 @@
-package org.apache.lucene.classification;
-
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@@ -16,6 +14,7 @@ package org.apache.lucene.classification
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+package org.apache.lucene.classification;
import org.apache.lucene.analysis.MockAnalyzer;
import org.junit.Test;
@@ -27,7 +26,7 @@ public class KNearestNeighborClassifierT
@Test
public void testBasicUsage() throws Exception {
- checkCorrectClassification(new KNearestNeighborClassifier(1), new MockAnalyzer(random()));
+ checkCorrectClassification(new KNearestNeighborClassifier(1), new MockAnalyzer(random()));
}
}
Copied: lucene/dev/branches/branch_4x/lucene/classification/src/test/org/apache/lucene/classification/SimpleNaiveBayesClassifierTest.java (from r1384219, lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/SimpleNaiveBayesClassifierTest.java)
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/classification/src/test/org/apache/lucene/classification/SimpleNaiveBayesClassifierTest.java?p2=lucene/dev/branches/branch_4x/lucene/classification/src/test/org/apache/lucene/classification/SimpleNaiveBayesClassifierTest.java&p1=lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/SimpleNaiveBayesClassifierTest.java&r1=1384219&r2=1448105&rev=1448105&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/SimpleNaiveBayesClassifierTest.java (original)
+++ lucene/dev/branches/branch_4x/lucene/classification/src/test/org/apache/lucene/classification/SimpleNaiveBayesClassifierTest.java Wed Feb 20 11:51:46 2013
@@ -1,5 +1,3 @@
-package org.apache.lucene.classification;
-
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@@ -16,115 +14,36 @@ package org.apache.lucene.classification
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+package org.apache.lucene.classification;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.TextField;
-import org.apache.lucene.index.RandomIndexWriter;
-import org.apache.lucene.index.SlowCompositeReaderWrapper;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.util.LuceneTestCase;
-import org.junit.After;
-import org.junit.Before;
+import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
import org.junit.Test;
+import java.io.Reader;
+
/**
* Testcase for {@link SimpleNaiveBayesClassifier}
*/
-public class SimpleNaiveBayesClassifierTest extends LuceneTestCase {
-
- private RandomIndexWriter indexWriter;
- private String textFieldName;
- private String classFieldName;
- private Analyzer analyzer;
- private Directory dir;
-
- @Before
- public void setUp() throws Exception {
- super.setUp();
- analyzer = new MockAnalyzer(random());
- dir = newDirectory();
- indexWriter = new RandomIndexWriter(random(), dir);
- textFieldName = "text";
- classFieldName = "cat";
- }
-
- @After
- public void tearDown() throws Exception {
- super.tearDown();
- indexWriter.close();
- dir.close();
- }
+public class SimpleNaiveBayesClassifierTest extends ClassificationTestBase {
@Test
public void testBasicUsage() throws Exception {
- SlowCompositeReaderWrapper compositeReaderWrapper = null;
- try {
- populateIndex();
- SimpleNaiveBayesClassifier simpleNaiveBayesClassifier = new SimpleNaiveBayesClassifier();
- compositeReaderWrapper = new SlowCompositeReaderWrapper(indexWriter.getReader());
- simpleNaiveBayesClassifier.train(compositeReaderWrapper, textFieldName, classFieldName, analyzer);
- String newText = "Much is made of what the likes of Facebook, Google and Apple know about users. Truth is, Amazon may know more. ";
- assertEquals("technology", simpleNaiveBayesClassifier.assignClass(newText));
- } finally {
- if (compositeReaderWrapper != null)
- compositeReaderWrapper.close();
- }
+ checkCorrectClassification(new SimpleNaiveBayesClassifier(), new MockAnalyzer(random()));
}
- private void populateIndex() throws Exception {
-
- Document doc = new Document();
- doc.add(new TextField(textFieldName, "The traveling press secretary for Mitt Romney lost his cool and cursed at reporters " +
- "who attempted to ask questions of the Republican presidential candidate in a public plaza near the Tomb of " +
- "the Unknown Soldier in Warsaw Tuesday.", Field.Store.YES));
- doc.add(new TextField(classFieldName, "politics", Field.Store.YES));
-
- indexWriter.addDocument(doc, analyzer);
-
- doc = new Document();
- doc.add(new TextField(textFieldName, "Mitt Romney seeks to assure Israel and Iran, as well as Jewish voters in the United" +
- " States, that he will be tougher against Iran's nuclear ambitions than President Barack Obama.", Field.Store.YES));
- doc.add(new TextField(classFieldName, "politics", Field.Store.YES));
- indexWriter.addDocument(doc, analyzer);
-
- doc = new Document();
- doc.add(new TextField(textFieldName, "And there's a threshold question that he has to answer for the American people and " +
- "that's whether he is prepared to be commander-in-chief,\" she continued. \"As we look to the past events, we " +
- "know that this raises some questions about his preparedness and we'll see how the rest of his trip goes.\"", Field.Store.YES));
- doc.add(new TextField(classFieldName, "politics", Field.Store.YES));
- indexWriter.addDocument(doc, analyzer);
-
- doc = new Document();
- doc.add(new TextField(textFieldName, "Still, when it comes to gun policy, many congressional Democrats have \"decided to " +
- "keep quiet and not go there,\" said Alan Lizotte, dean and professor at the State University of New York at " +
- "Albany's School of Criminal Justice.", Field.Store.YES));
- doc.add(new TextField(classFieldName, "politics", Field.Store.YES));
- indexWriter.addDocument(doc, analyzer);
-
- doc = new Document();
- doc.add(new TextField(textFieldName, "Standing amongst the thousands of people at the state Capitol, Jorstad, director of " +
- "technology at the University of Wisconsin-La Crosse, documented the historic moment and shared it with the " +
- "world through the Internet.", Field.Store.YES));
- doc.add(new TextField(classFieldName, "technology", Field.Store.YES));
- indexWriter.addDocument(doc, analyzer);
-
- doc = new Document();
- doc.add(new TextField(textFieldName, "So, about all those experts and analysts who've spent the past year or so saying " +
- "Facebook was going to make a phone. A new expert has stepped forward to say it's not going to happen.", Field.Store.YES));
- doc.add(new TextField(classFieldName, "technology", Field.Store.YES));
- indexWriter.addDocument(doc, analyzer);
-
- doc = new Document();
- doc.add(new TextField(textFieldName, "More than 400 million people trust Google with their e-mail, and 50 million store files" +
- " in the cloud using the Dropbox service. People manage their bank accounts, pay bills, trade stocks and " +
- "generally transfer or store huge volumes of personal data online.", Field.Store.YES));
- doc.add(new TextField(classFieldName, "technology", Field.Store.YES));
- indexWriter.addDocument(doc, analyzer);
+ @Test
+ public void testNGramUsage() throws Exception {
+ checkCorrectClassification(new SimpleNaiveBayesClassifier(), new NGramAnalyzer());
+ }
- indexWriter.commit();
+ private class NGramAnalyzer extends Analyzer {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ return new TokenStreamComponents(new EdgeNGramTokenizer(reader, EdgeNGramTokenizer.Side.BACK,
+ 10, 20));
+ }
}
}
Copied: lucene/dev/branches/branch_4x/lucene/classification/src/test/org/apache/lucene/classification/utils/DataSplitterTest.java (from r1415060, lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/utils/DataSplitterTest.java)
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/classification/src/test/org/apache/lucene/classification/utils/DataSplitterTest.java?p2=lucene/dev/branches/branch_4x/lucene/classification/src/test/org/apache/lucene/classification/utils/DataSplitterTest.java&p1=lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/utils/DataSplitterTest.java&r1=1415060&r2=1448105&rev=1448105&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/utils/DataSplitterTest.java (original)
+++ lucene/dev/branches/branch_4x/lucene/classification/src/test/org/apache/lucene/classification/utils/DataSplitterTest.java Wed Feb 20 11:51:46 2013
@@ -55,7 +55,7 @@ public class DataSplitterTest extends Lu
public void setUp() throws Exception {
super.setUp();
dir = newDirectory();
- indexWriter = new RandomIndexWriter(random(), dir);
+ indexWriter = new RandomIndexWriter(random(), dir, new MockAnalyzer(random()));
FieldType ft = new FieldType(TextField.TYPE_STORED);
ft.setStoreTermVectors(true);
@@ -91,7 +91,7 @@ public class DataSplitterTest extends Lu
@Test
public void testSplitOnAllFields() throws Exception {
- assertSplit(originalIndex, 0.1, 0.1, null);
+ assertSplit(originalIndex, 0.1, 0.1);
}
Modified: lucene/dev/branches/branch_4x/lucene/module-build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/module-build.xml?rev=1448105&r1=1448104&r2=1448105&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/module-build.xml (original)
+++ lucene/dev/branches/branch_4x/lucene/module-build.xml Wed Feb 20 11:51:46 2013
@@ -176,6 +176,28 @@
</ant>
<property name="queries-javadocs.uptodate" value="true"/>
</target>
+
+ <property name="classification.jar" value="${common.dir}/build/classification/lucene-classification-${version}.jar"/>
+ <target name="check-classification-uptodate" unless="classification.uptodate">
+ <module-uptodate name="classification" jarfile="${classification.jar}" property="classification.uptodate"/>
+ </target>
+ <target name="jar-classification" unless="classification.uptodate" depends="check-classification-uptodate">
+ <ant dir="${common.dir}/classification" target="jar-core" inheritAll="false">
+ <propertyset refid="uptodate.and.compiled.properties"/>
+ </ant>
+ <property name="classification.uptodate" value="true"/>
+ </target>
+
+ <property name="classification-javadoc.jar" value="${common.dir}/build/classification/lucene-classification-${version}-javadoc.jar"/>
+ <target name="check-classification-javadocs-uptodate" unless="classification-javadocs.uptodate">
+ <module-uptodate name="classification" jarfile="${classification-javadoc.jar}" property="classification-javadocs.uptodate"/>
+ </target>
+ <target name="javadocs-classification" unless="classification-javadocs.uptodate" depends="check-classification-javadocs-uptodate">
+ <ant dir="${common.dir}/classification" target="javadocs" inheritAll="false">
+ <propertyset refid="uptodate.and.compiled.properties"/>
+ </ant>
+ <property name="classification-javadocs.uptodate" value="true"/>
+ </target>
<property name="facet.jar" value="${common.dir}/build/facet/lucene-facet-${version}.jar"/>
<target name="check-facet-uptodate" unless="facet.uptodate">