You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by to...@apache.org on 2014/08/22 10:02:16 UTC
svn commit: r1619699 -
/lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java
Author: tommaso
Date: Fri Aug 22 08:02:15 2014
New Revision: 1619699
URL: http://svn.apache.org/r1619699
Log:
LUCENE-5699 - fixed javadoc
Modified:
lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java
Modified: lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java?rev=1619699&r1=1619698&r2=1619699&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java (original)
+++ lucene/dev/trunk/lucene/classification/src/java/org/apache/lucene/classification/SimpleNaiveBayesClassifier.java Fri Aug 22 08:02:15 2014
@@ -47,11 +47,34 @@ import org.apache.lucene.util.BytesRef;
*/
public class SimpleNaiveBayesClassifier implements Classifier<BytesRef> {
+ /**
+ * {@link org.apache.lucene.index.AtomicReader} used to access the index
+ */
protected AtomicReader atomicReader;
+
+ /**
+ * names of the fields to be used as input text
+ */
protected String[] textFieldNames;
+
+ /**
+ * name of the field to be used as a class / category output
+ */
protected String classFieldName;
+
+ /**
+ * {@link org.apache.lucene.analysis.Analyzer} to be used for tokenizing unseen input text
+ */
protected Analyzer analyzer;
+
+ /**
+ * {@link org.apache.lucene.search.IndexSearcher} to run searches on the index for retrieving frequencies
+ */
protected IndexSearcher indexSearcher;
+
+ /**
+ * {@link org.apache.lucene.search.Query} used to optionally filter the set of documents used for classification
+ */
protected Query query;
/**
@@ -172,6 +195,12 @@ public class SimpleNaiveBayesClassifier
return returnList;
}
+ /**
+ * count the number of documents in the index having at least one value for the 'class' field
+ *
+ * @return the no. of documents having a value for the 'class' field
+ * @throws IOException if accessing the index fails
+ */
protected int countDocsWithClass() throws IOException {
int docCount = MultiFields.getTerms(this.atomicReader, this.classFieldName).getDocCount();
if (docCount == -1) { // in case codec doesn't support getDocCount
@@ -188,6 +217,13 @@ public class SimpleNaiveBayesClassifier
return docCount;
}
+ /**
+ * tokenize a <code>String</code> on this classifier's text fields and analyzer
+ *
+ * @param doc the <code>String</code> representing an input text (to be classified)
+ * @return a <code>String</code> array of the resulting tokens
+ * @throws IOException
+ */
protected String[] tokenizeDoc(String doc) throws IOException {
Collection<String> result = new LinkedList<>();
for (String textFieldName : textFieldNames) {