You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@lucene.apache.org by "Cuong Hoang (JIRA)" <ji...@apache.org> on 2008/04/04 03:29:24 UTC
[jira] Commented: (LUCENE-1039) Bayesian classifiers using Lucene as data store

    [ https://issues.apache.org/jira/browse/LUCENE-1039?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=12585368#action_12585368 ] 

Cuong Hoang commented on LUCENE-1039:
-------------------------------------

>>Each document must only contain one token in the class field

Does that mean each document in the training set can only belong to one class? 

I tried to run the test case but got a NullPointerException:

TestClassifier
org.apache.lucene.classifier.TestClassifier
test(org.apache.lucene.classifier.TestClassifier)
java.lang.NullPointerException
	at org.apache.lucene.index.MultiTermDocs.doc(MultiReader.java:356)
	at org.apache.lucene.classifier.BayesianClassifier.classFeatureFrequency(BayesianClassifier.java:92)
	at org.apache.lucene.classifier.BayesianClassifier.weightedFeatureClassProbability(BayesianClassifier.java:137)
	at org.apache.lucene.classifier.NaiveBayesClassifier.featuresClassProbability(NaiveBayesClassifier.java:54)
	at org.apache.lucene.classifier.NaiveBayesClassifier.classify(NaiveBayesClassifier.java:72)
	at org.apache.lucene.classifier.BayesianClassifier.classify(BayesianClassifier.java:70)
	at org.apache.lucene.classifier.BayesianClassifier.classify(BayesianClassifier.java:62)
	at org.apache.lucene.classifier.TestClassifier.testClassifier(TestClassifier.java:110)
	at org.apache.lucene.classifier.TestClassifier.test(TestClassifier.java:101)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:39)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:25)
	at java.lang.reflect.Method.invoke(Method.java:597)
	at junit.framework.TestCase.runTest(TestCase.java:154)
	at junit.framework.TestCase.runBare(TestCase.java:127)
	at junit.framework.TestResult$1.protect(TestResult.java:106)
	at junit.framework.TestResult.runProtected(TestResult.java:124)
	at junit.framework.TestResult.run(TestResult.java:109)
	at junit.framework.TestCase.run(TestCase.java:118)
	at junit.framework.TestSuite.runTest(TestSuite.java:208)
	at junit.framework.TestSuite.run(TestSuite.java:203)
	at org.eclipse.jdt.internal.junit.runner.junit3.JUnit3TestReference.run(JUnit3TestReference.java:130)
	at org.eclipse.jdt.internal.junit.runner.TestExecution.run(TestExecution.java:38)
	at org.eclipse.jdt.internal.junit.runner.RemoteTestRunner.runTests(RemoteTestRunner.java:460)
	at org.eclipse.jdt.internal.junit.runner.RemoteTestRunner.runTests(RemoteTestRunner.java:673)
	at org.eclipse.jdt.internal.junit.runner.RemoteTestRunner.run(RemoteTestRunner.java:386)
	at org.eclipse.jdt.internal.junit.runner.RemoteTestRunner.main(RemoteTestRunner.java:196)
 

> Bayesian classifiers using Lucene as data store
> -----------------------------------------------
>
>                 Key: LUCENE-1039
>                 URL: https://issues.apache.org/jira/browse/LUCENE-1039
>             Project: Lucene - Java
>          Issue Type: New Feature
>            Reporter: Karl Wettin
>            Priority: Minor
>         Attachments: LUCENE-1039.txt
>
>
> Bayesian classifiers using Lucene as data store. Based on the Naive Bayes and Fisher's method algorithms, as described by Toby Segaran in "Programming Collective Intelligence", ISBN 978-0-596-52932-1.
> Have fun.
> The Javadocs are sparse, but the TestCase shows how to use it:
> {code:java}
> public class TestClassifier extends TestCase {
>   public void test() throws Exception {
>     InstanceFactory instanceFactory = new InstanceFactory() {
>       public Document factory(String text, String _class) {
>         Document doc = new Document();
>         doc.add(new Field("class", _class, Field.Store.YES, Field.Index.NO_NORMS));
>         doc.add(new Field("text", text, Field.Store.YES, Field.Index.NO, Field.TermVector.NO));
>         doc.add(new Field("text/ngrams/start", text, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.YES));
>         doc.add(new Field("text/ngrams/inner", text, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.YES));
>         doc.add(new Field("text/ngrams/end", text, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.YES));
>         return doc;
>       }
>       Analyzer analyzer = new Analyzer() {
>         private int minGram = 2;
>         private int maxGram = 3;
>         public TokenStream tokenStream(String fieldName, Reader reader) {
>           TokenStream ts = new StandardTokenizer(reader);
>           ts = new LowerCaseFilter(ts);
>           if (fieldName.endsWith("/ngrams/start")) {
>             ts = new EdgeNGramTokenFilter(ts, EdgeNGramTokenFilter.Side.FRONT, minGram, maxGram);
>           } else if (fieldName.endsWith("/ngrams/inner")) {
>             ts = new NGramTokenFilter(ts, minGram, maxGram);
>           } else if (fieldName.endsWith("/ngrams/end")) {
>             ts = new EdgeNGramTokenFilter(ts, EdgeNGramTokenFilter.Side.BACK, minGram, maxGram);
>           }
>           return ts;
>         }
>       };
>       public Analyzer getAnalyzer() {
>         return analyzer;
>       }
>     };
>     Directory dir = new RAMDirectory();
>     new IndexWriter(dir, null, true).close();
>     Instances instances = new Instances(dir, instanceFactory, "class");
>     instances.addInstance("hello world", "en");
>     instances.addInstance("hallå världen", "sv");
>     instances.addInstance("this is london calling", "en");
>     instances.addInstance("detta är london som ringer", "sv");
>     instances.addInstance("john has a long mustache", "en");
>     instances.addInstance("john har en lång mustache", "sv");
>     instances.addInstance("all work and no play makes jack a dull boy", "en");
>     instances.addInstance("att bara arbeta och aldrig leka gör jack en trist gosse", "sv");
>     instances.addInstance("shrimp sandwich", "en");
>     instances.addInstance("räksmörgås", "sv");
>     instances.addInstance("it's now or never", "en");
>     instances.addInstance("det är nu eller aldrig", "sv");
>     instances.addInstance("to tie up at a landing-stage", "en");
>     instances.addInstance("att angöra en brygga", "sv");
>     instances.addInstance("it's now time for the children's television shows", "en");
>     instances.addInstance("nu är det dags för barnprogram", "sv");
>     instances.flush();
>     testClassifier(instances, new NaiveBayesClassifier());
>     testClassifier(instances, new FishersMethodClassifier());
>     instances.close();
>   }
>   private void testClassifier(Instances instances, BayesianClassifier classifier) throws IOException {
>     assertEquals("sv", classifier.classify(instances, "detta blir ett test")[0].getClassification());
>     assertEquals("en", classifier.classify(instances, "this will be a test")[0].getClassification());
>     // test training data instances. all ought to match!
>     for (int documentNumber = 0; documentNumber < instances.getIndexReader().maxDoc(); documentNumber++) {
>       if (!instances.getIndexReader().isDeleted(documentNumber)) {
>         Map<Term, Double> features = instances.extractFeatures(instances.getIndexReader(), documentNumber, classifier.isNormalized());
>         Document document = instances.getIndexReader().document(documentNumber);
>         assertEquals(document.get("class"), classifier.classify(instances, features)[0].getClassification());
>       }
>     }
>   }
> {code}

-- 
This message is automatically generated by JIRA.
-
You can reply to this email to add a comment to the issue online.


---------------------------------------------------------------------
To unsubscribe, e-mail: java-dev-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-dev-help@lucene.apache.org