You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by to...@apache.org on 2012/10/23 18:18:42 UTC
svn commit: r1401338 - in /lucene/dev/trunk/lucene/classification: ./
src/java/org/apache/lucene/classification/
src/test/org/apache/lucene/classification/
Author: tommaso
Date: Tue Oct 23 16:18:42 2012
New Revision: 1401338
URL: http://svn.apache.org/viewvc?rev=1401338&view=rev
Log:
[LUCENE-4345] - adding k-nearestneighbor classifier (based on mlt), improving testing by abstracting a basetest class, added ngram test for simplenaivebayes, changed build.xml to incorporate queries and analyzer-common deps
Added:
lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/ClassificationResult.java (with props)
lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/KNearestNeighborClassifier.java (with props)
lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/ClassificationTestBase.java (with props)
lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/KNearestNeighborClassifierTest.java (with props)
Modified:
lucene/dev/trunk/lucene/classification/build.xml
lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/Classifier.java
lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java
lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/package.html
lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/SimpleNaiveBayesClassifierTest.java
Modified: lucene/dev/trunk/lucene/classification/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/classification/build.xml?rev=1401338&r1=1401337&r2=1401338&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/classification/build.xml (original)
+++ lucene/dev/trunk/lucene/classification/build.xml Tue Oct 23 16:18:42 2012
@@ -23,4 +23,22 @@
</description>
<import file="../module-build.xml"/>
+
+ <path id="base.classpath">
+ <pathelement location="${common.dir}/build/core/classes/java"/>
+ <pathelement path="${queries.jar}"/>
+ <pathelement path="${project.classpath}"/>
+ </path>
+
+ <path id="test.classpath">
+ <pathelement path="${analyzers-common.jar}"/>
+ <pathelement location="${common.dir}/build/test-framework/classes/java"/>
+ <pathelement location="${common.dir}/build/codecs/classes/java"/>
+ <path refid="classpath"/>
+ <path refid="junit-path"/>
+ <pathelement location="${build.dir}/classes/java"/>
+ </path>
+
+ <target name="compile-core" depends="jar-queries,jar-analyzers-common,common.compile-core" />
+
</project>
Added: lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/ClassificationResult.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/ClassificationResult.java?rev=1401338&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/ClassificationResult.java (added)
+++ lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/ClassificationResult.java Tue Oct 23 16:18:42 2012
@@ -0,0 +1,40 @@
+package org.apache.lucene.classification;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * The result of a call to {@link Classifier#assignClass(String)} holding an assigned class and a score.
+ * Instances are immutable: both values are fixed by the constructor.
+ */
+public class ClassificationResult {
+
+  private final String assignedClass;
+  private final double score;
+
+  /**
+   * Creates a result pairing a class with the score a classifier gave it.
+   *
+   * @param assignedClass the class assigned to the classified text; may be <code>null</code>
+   *                      when the classifier could not pick any class
+   * @param score         the score attached to the assigned class
+   */
+  public ClassificationResult(String assignedClass, double score) {
+    this.assignedClass = assignedClass;
+    this.score = score;
+  }
+
+  /**
+   * Returns the class assigned by the classifier.
+   *
+   * @return the class given to the constructor, possibly <code>null</code>
+   */
+  public String getAssignedClass() {
+    return assignedClass;
+  }
+
+  /**
+   * Returns the score of the assigned class.
+   *
+   * @return the score given to the constructor
+   */
+  public double getScore() {
+    return score;
+  }
+}
Modified: lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/Classifier.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/Classifier.java?rev=1401338&r1=1401337&r2=1401338&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/Classifier.java (original)
+++ lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/Classifier.java Tue Oct 23 16:18:42 2012
@@ -29,12 +29,12 @@ import java.io.IOException;
public interface Classifier {
/**
- * Assign a class to the given text String
+ * Assign a class (with score) to the given text String
* @param text a String containing text to be classified
- * @return a String representing a class
+ * @return a {@link ClassificationResult} holding assigned class and score
* @throws IOException If there is a low-level I/O error.
*/
- public String assignClass(String text) throws IOException;
+ public ClassificationResult assignClass(String text) throws IOException;
/**
* Train the classifier using the underlying Lucene index
Added: lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/KNearestNeighborClassifier.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/KNearestNeighborClassifier.java?rev=1401338&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/KNearestNeighborClassifier.java (added)
+++ lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/KNearestNeighborClassifier.java Tue Oct 23 16:18:42 2012
@@ -0,0 +1,88 @@
+package org.apache.lucene.classification;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.index.AtomicReader;
+import org.apache.lucene.queries.mlt.MoreLikeThis;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.TopDocs;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * A k-Nearest Neighbor classifier (see <code>http://en.wikipedia.org/wiki/K-nearest_neighbors</code>) based
+ * on {@link MoreLikeThis}.
+ * <p>
+ * {@link #train(AtomicReader, String, String, Analyzer)} has to be called before
+ * {@link #assignClass(String)}.
+ */
+public class KNearestNeighborClassifier implements Classifier {
+
+  private final int k;
+  private MoreLikeThis mlt;
+  private String textFieldName;
+  private String classFieldName;
+  private IndexSearcher indexSearcher;
+
+  /**
+   * Creates a classifier that lets the top <code>k</code> hits vote on the class.
+   *
+   * @param k the number of hits requested from the index for each classification
+   */
+  public KNearestNeighborClassifier(int k) {
+    this.k = k;
+  }
+
+  /**
+   * Assigns to the given text the class that occurs most often among the top <code>k</code>
+   * documents returned by a {@link MoreLikeThis} query built from that text.
+   *
+   * @param text a String containing text to be classified
+   * @return a {@link ClassificationResult} whose class is the most frequent one among the hits
+   *         (<code>null</code> if no hit carries a class value; ties are resolved by map
+   *         iteration order) and whose score is currently always 1
+   * @throws IOException      If there is a low-level I/O error.
+   * @throws RuntimeException if the classifier has not been trained yet
+   */
+  @Override
+  public ClassificationResult assignClass(String text) throws IOException {
+    // fail the same way SimpleNaiveBayesClassifier does instead of with a bare NullPointerException
+    if (mlt == null || indexSearcher == null) {
+      throw new RuntimeException("need to train the classifier first");
+    }
+    Query q = mlt.like(new StringReader(text), textFieldName);
+    TopDocs docs = indexSearcher.search(q, k);
+
+    // TODO : improve the nearest neighbor selection
+    Map<String, Integer> classCounts = new HashMap<String, Integer>();
+    for (ScoreDoc scoreDoc : docs.scoreDocs) {
+      String cl = indexSearcher.doc(scoreDoc.doc).get(classFieldName);
+      if (cl == null) {
+        // a neighbor without a class value cannot take part in the vote
+        continue;
+      }
+      Integer count = classCounts.get(cl);
+      classCounts.put(cl, count == null ? 1 : count + 1);
+    }
+    int max = 0;
+    String assignedClass = null;
+    for (Map.Entry<String, Integer> entry : classCounts.entrySet()) {
+      int count = entry.getValue();
+      if (count > max) {
+        max = count;
+        assignedClass = entry.getKey();
+      }
+    }
+    double score = 1; // TODO : derive score from query
+    return new ClassificationResult(assignedClass, score);
+  }
+
+  /**
+   * Prepares the classifier: remembers the field names, builds a {@link MoreLikeThis} over the
+   * reader restricted to the text field and using the given analyzer, and opens an
+   * {@link IndexSearcher} on the same reader.
+   *
+   * @param atomicReader   the reader to use to access the Lucene index
+   * @param textFieldName  the name of the field holding the text to compare against
+   * @param classFieldName the name of the field holding the class of each document
+   * @param analyzer       the analyzer used to tokenize the text to be classified
+   * @throws IOException If there is a low-level I/O error.
+   */
+  @Override
+  public void train(AtomicReader atomicReader, String textFieldName, String classFieldName, Analyzer analyzer) throws IOException {
+    this.textFieldName = textFieldName;
+    this.classFieldName = classFieldName;
+    mlt = new MoreLikeThis(atomicReader);
+    mlt.setAnalyzer(analyzer);
+    mlt.setFieldNames(new String[]{textFieldName});
+    indexSearcher = new IndexSearcher(atomicReader);
+  }
+}
Modified: lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java?rev=1401338&r1=1401337&r2=1401338&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java (original)
+++ lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java Tue Oct 23 16:18:42 2012
@@ -80,7 +80,7 @@ public class SimpleNaiveBayesClassifier
return result.toArray(new String[result.size()]);
}
- public String assignClass(String inputDocument) throws IOException {
+ public ClassificationResult assignClass(String inputDocument) throws IOException {
if (atomicReader == null) {
throw new RuntimeException("need to train the classifier first");
}
@@ -98,7 +98,7 @@ public class SimpleNaiveBayesClassifier
foundClass = next.utf8ToString();
}
}
- return foundClass;
+ return new ClassificationResult(foundClass, max);
}
Modified: lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/package.html
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/package.html?rev=1401338&r1=1401337&r2=1401338&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/package.html (original)
+++ lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/package.html Tue Oct 23 16:18:42 2012
@@ -18,6 +18,6 @@
<body>
Uses already seen data (the indexed documents) to classify new documents.
Currently only contains a (simplistic) Lucene based Naive Bayes classifier
-but more implementations will be added in the future.
+and a k-Nearest Neighbor classifier
</body>
</html>
Added: lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/ClassificationTestBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/ClassificationTestBase.java?rev=1401338&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/ClassificationTestBase.java (added)
+++ lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/ClassificationTestBase.java Tue Oct 23 16:18:42 2012
@@ -0,0 +1,125 @@
+package org.apache.lucene.classification;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.index.SlowCompositeReaderWrapper;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.LuceneTestCase;
+import org.junit.After;
+import org.junit.Before;
+
+/**
+ * Base class for testing {@link Classifier}s: it owns a small index of "politics" and
+ * "technology" documents and checks that a classifier trained on it labels a
+ * technology-related sentence correctly.
+ */
+public class ClassificationTestBase extends LuceneTestCase {
+
+  private RandomIndexWriter indexWriter;
+  private String textFieldName;
+  private String classFieldName;
+  private Directory dir;
+
+  @Before
+  public void setUp() throws Exception {
+    super.setUp();
+    textFieldName = "text";
+    classFieldName = "cat";
+    dir = newDirectory();
+    indexWriter = new RandomIndexWriter(random(), dir);
+  }
+
+  @After
+  public void tearDown() throws Exception {
+    super.tearDown();
+    indexWriter.close();
+    dir.close();
+  }
+
+  /**
+   * Fills the index, trains the given classifier on it and asserts that an unseen
+   * technology-related sentence is assigned the "technology" class with a positive score.
+   *
+   * @param classifier the classifier under test
+   * @param analyzer   the analyzer used both for indexing and for training
+   */
+  protected void checkCorrectClassification(Classifier classifier, Analyzer analyzer) throws Exception {
+    SlowCompositeReaderWrapper reader = null;
+    try {
+      populateIndex(analyzer);
+      reader = new SlowCompositeReaderWrapper(indexWriter.getReader());
+      classifier.train(reader, textFieldName, classFieldName, analyzer);
+      ClassificationResult result = classifier.assignClass(
+          "Much is made of what the likes of Facebook, Google and Apple know about users. Truth is, Amazon may know more.");
+      assertEquals("technology", result.getAssignedClass());
+      assertTrue(result.getScore() > 0);
+    } finally {
+      // the reader stays null when indexing or opening it failed
+      if (reader != null) {
+        reader.close();
+      }
+    }
+  }
+
+  /**
+   * Indexes four "politics" documents followed by three "technology" documents and commits.
+   */
+  private void populateIndex(Analyzer analyzer) throws Exception {
+    addDocument(analyzer, "politics",
+        "The traveling press secretary for Mitt Romney lost his cool and cursed at reporters " +
+        "who attempted to ask questions of the Republican presidential candidate in a public plaza near the Tomb of " +
+        "the Unknown Soldier in Warsaw Tuesday.");
+
+    addDocument(analyzer, "politics",
+        "Mitt Romney seeks to assure Israel and Iran, as well as Jewish voters in the United" +
+        " States, that he will be tougher against Iran's nuclear ambitions than President Barack Obama.");
+
+    addDocument(analyzer, "politics",
+        "And there's a threshold question that he has to answer for the American people and " +
+        "that's whether he is prepared to be commander-in-chief,\" she continued. \"As we look to the past events, we " +
+        "know that this raises some questions about his preparedness and we'll see how the rest of his trip goes.\"");
+
+    addDocument(analyzer, "politics",
+        "Still, when it comes to gun policy, many congressional Democrats have \"decided to " +
+        "keep quiet and not go there,\" said Alan Lizotte, dean and professor at the State University of New York at " +
+        "Albany's School of Criminal Justice.");
+
+    addDocument(analyzer, "technology",
+        "Standing amongst the thousands of people at the state Capitol, Jorstad, director of " +
+        "technology at the University of Wisconsin-La Crosse, documented the historic moment and shared it with the " +
+        "world through the Internet.");
+
+    addDocument(analyzer, "technology",
+        "So, about all those experts and analysts who've spent the past year or so saying " +
+        "Facebook was going to make a phone. A new expert has stepped forward to say it's not going to happen.");
+
+    addDocument(analyzer, "technology",
+        "More than 400 million people trust Google with their e-mail, and 50 million store files" +
+        " in the cloud using the Dropbox service. People manage their bank accounts, pay bills, trade stocks and " +
+        "generally transfer or store huge volumes of personal data online.");
+
+    indexWriter.commit();
+  }
+
+  /**
+   * Adds one document made of a stored text field and a stored class field.
+   */
+  private void addDocument(Analyzer analyzer, String category, String text) throws Exception {
+    Document doc = new Document();
+    doc.add(new TextField(textFieldName, text, Field.Store.YES));
+    doc.add(new TextField(classFieldName, category, Field.Store.YES));
+    indexWriter.addDocument(doc, analyzer);
+  }
+}
Added: lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/KNearestNeighborClassifierTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/KNearestNeighborClassifierTest.java?rev=1401338&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/KNearestNeighborClassifierTest.java (added)
+++ lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/KNearestNeighborClassifierTest.java Tue Oct 23 16:18:42 2012
@@ -0,0 +1,33 @@
+package org.apache.lucene.classification;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.junit.Test;
+
+/**
+ * Testcase for {@link KNearestNeighborClassifier}
+ */
+public class KNearestNeighborClassifierTest extends ClassificationTestBase {
+
+ @Test
+ public void testBasicUsage() throws Exception {
+ checkCorrectClassification(new KNearestNeighborClassifier(1), new MockAnalyzer(random()));
+ }
+
+}
Modified: lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/SimpleNaiveBayesClassifierTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/SimpleNaiveBayesClassifierTest.java?rev=1401338&r1=1401337&r2=1401338&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/SimpleNaiveBayesClassifierTest.java (original)
+++ lucene/dev/trunk/lucene/classification/src/test/org/apache/lucene/classification/SimpleNaiveBayesClassifierTest.java Tue Oct 23 16:18:42 2012
@@ -19,112 +19,32 @@ package org.apache.lucene.classification
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.TextField;
-import org.apache.lucene.index.RandomIndexWriter;
-import org.apache.lucene.index.SlowCompositeReaderWrapper;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.util.LuceneTestCase;
-import org.junit.After;
-import org.junit.Before;
+import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
import org.junit.Test;
+import java.io.Reader;
+
/**
* Testcase for {@link SimpleNaiveBayesClassifier}
*/
-public class SimpleNaiveBayesClassifierTest extends LuceneTestCase {
-
- private RandomIndexWriter indexWriter;
- private String textFieldName;
- private String classFieldName;
- private Analyzer analyzer;
- private Directory dir;
-
- @Before
- public void setUp() throws Exception {
- super.setUp();
- analyzer = new MockAnalyzer(random());
- dir = newDirectory();
- indexWriter = new RandomIndexWriter(random(), dir);
- textFieldName = "text";
- classFieldName = "cat";
- }
-
- @After
- public void tearDown() throws Exception {
- super.tearDown();
- indexWriter.close();
- dir.close();
- }
+public class SimpleNaiveBayesClassifierTest extends ClassificationTestBase {
@Test
public void testBasicUsage() throws Exception {
- SlowCompositeReaderWrapper compositeReaderWrapper = null;
- try {
- populateIndex();
- SimpleNaiveBayesClassifier simpleNaiveBayesClassifier = new SimpleNaiveBayesClassifier();
- compositeReaderWrapper = new SlowCompositeReaderWrapper(indexWriter.getReader());
- simpleNaiveBayesClassifier.train(compositeReaderWrapper, textFieldName, classFieldName, analyzer);
- String newText = "Much is made of what the likes of Facebook, Google and Apple know about users. Truth is, Amazon may know more. ";
- assertEquals("technology", simpleNaiveBayesClassifier.assignClass(newText));
- } finally {
- if (compositeReaderWrapper != null)
- compositeReaderWrapper.close();
- }
+ checkCorrectClassification(new SimpleNaiveBayesClassifier(), new MockAnalyzer(random()));
}
- private void populateIndex() throws Exception {
-
- Document doc = new Document();
- doc.add(new TextField(textFieldName, "The traveling press secretary for Mitt Romney lost his cool and cursed at reporters " +
- "who attempted to ask questions of the Republican presidential candidate in a public plaza near the Tomb of " +
- "the Unknown Soldier in Warsaw Tuesday.", Field.Store.YES));
- doc.add(new TextField(classFieldName, "politics", Field.Store.YES));
-
- indexWriter.addDocument(doc, analyzer);
-
- doc = new Document();
- doc.add(new TextField(textFieldName, "Mitt Romney seeks to assure Israel and Iran, as well as Jewish voters in the United" +
- " States, that he will be tougher against Iran's nuclear ambitions than President Barack Obama.", Field.Store.YES));
- doc.add(new TextField(classFieldName, "politics", Field.Store.YES));
- indexWriter.addDocument(doc, analyzer);
-
- doc = new Document();
- doc.add(new TextField(textFieldName, "And there's a threshold question that he has to answer for the American people and " +
- "that's whether he is prepared to be commander-in-chief,\" she continued. \"As we look to the past events, we " +
- "know that this raises some questions about his preparedness and we'll see how the rest of his trip goes.\"", Field.Store.YES));
- doc.add(new TextField(classFieldName, "politics", Field.Store.YES));
- indexWriter.addDocument(doc, analyzer);
-
- doc = new Document();
- doc.add(new TextField(textFieldName, "Still, when it comes to gun policy, many congressional Democrats have \"decided to " +
- "keep quiet and not go there,\" said Alan Lizotte, dean and professor at the State University of New York at " +
- "Albany's School of Criminal Justice.", Field.Store.YES));
- doc.add(new TextField(classFieldName, "politics", Field.Store.YES));
- indexWriter.addDocument(doc, analyzer);
-
- doc = new Document();
- doc.add(new TextField(textFieldName, "Standing amongst the thousands of people at the state Capitol, Jorstad, director of " +
- "technology at the University of Wisconsin-La Crosse, documented the historic moment and shared it with the " +
- "world through the Internet.", Field.Store.YES));
- doc.add(new TextField(classFieldName, "technology", Field.Store.YES));
- indexWriter.addDocument(doc, analyzer);
-
- doc = new Document();
- doc.add(new TextField(textFieldName, "So, about all those experts and analysts who've spent the past year or so saying " +
- "Facebook was going to make a phone. A new expert has stepped forward to say it's not going to happen.", Field.Store.YES));
- doc.add(new TextField(classFieldName, "technology", Field.Store.YES));
- indexWriter.addDocument(doc, analyzer);
-
- doc = new Document();
- doc.add(new TextField(textFieldName, "More than 400 million people trust Google with their e-mail, and 50 million store files" +
- " in the cloud using the Dropbox service. People manage their bank accounts, pay bills, trade stocks and " +
- "generally transfer or store huge volumes of personal data online.", Field.Store.YES));
- doc.add(new TextField(classFieldName, "technology", Field.Store.YES));
- indexWriter.addDocument(doc, analyzer);
+ @Test
+ public void testNGramUsage() throws Exception {
+ checkCorrectClassification(new SimpleNaiveBayesClassifier(), new NGramAnalyzer());
+ }
- indexWriter.commit();
+ private class NGramAnalyzer extends Analyzer {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+ return new TokenStreamComponents(new EdgeNGramTokenizer(reader, EdgeNGramTokenizer.Side.BACK,
+ 10, 20));
+ }
}
}