You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@lucene.apache.org by "Karl Wettin (JIRA)" <ji...@apache.org> on 2008/08/24 14:23:44 UTC
[jira] Closed: (LUCENE-1039) Bayesian classifiers using Lucene as
data store
[ https://issues.apache.org/jira/browse/LUCENE-1039?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Karl Wettin closed LUCENE-1039.
-------------------------------
Resolution: Later
Assignee: Karl Wettin
I close this issue due to uncertainty about intellectual property rights, pending an answer from Toby. I've tried to contact him several times via numerous media without response :(
> Bayesian classifiers using Lucene as data store
> -----------------------------------------------
>
> Key: LUCENE-1039
> URL: https://issues.apache.org/jira/browse/LUCENE-1039
> Project: Lucene - Java
> Issue Type: New Feature
> Reporter: Karl Wettin
> Assignee: Karl Wettin
> Priority: Minor
> Attachments: LUCENE-1039.txt
>
>
> Bayesian classifiers using Lucene as data store. Based on the Naive Bayes and Fisher method algorithms as described by Toby Segaran in "Programming Collective Intelligence", ISBN 978-0-596-52932-1.
> Have fun.
> Poor Javadoc, but the TestCase shows how to use it:
> {code:java}
> public class TestClassifier extends TestCase {
> public void test() throws Exception {
> InstanceFactory instanceFactory = new InstanceFactory() {
> public Document factory(String text, String _class) {
> Document doc = new Document();
> doc.add(new Field("class", _class, Field.Store.YES, Field.Index.NO_NORMS));
> doc.add(new Field("text", text, Field.Store.YES, Field.Index.NO, Field.TermVector.NO));
> doc.add(new Field("text/ngrams/start", text, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.YES));
> doc.add(new Field("text/ngrams/inner", text, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.YES));
> doc.add(new Field("text/ngrams/end", text, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.YES));
> return doc;
> }
> Analyzer analyzer = new Analyzer() {
> private int minGram = 2;
> private int maxGram = 3;
> public TokenStream tokenStream(String fieldName, Reader reader) {
> TokenStream ts = new StandardTokenizer(reader);
> ts = new LowerCaseFilter(ts);
> if (fieldName.endsWith("/ngrams/start")) {
> ts = new EdgeNGramTokenFilter(ts, EdgeNGramTokenFilter.Side.FRONT, minGram, maxGram);
> } else if (fieldName.endsWith("/ngrams/inner")) {
> ts = new NGramTokenFilter(ts, minGram, maxGram);
> } else if (fieldName.endsWith("/ngrams/end")) {
> ts = new EdgeNGramTokenFilter(ts, EdgeNGramTokenFilter.Side.BACK, minGram, maxGram);
> }
> return ts;
> }
> };
> public Analyzer getAnalyzer() {
> return analyzer;
> }
> };
> Directory dir = new RAMDirectory();
> new IndexWriter(dir, null, true).close();
> Instances instances = new Instances(dir, instanceFactory, "class");
> instances.addInstance("hello world", "en");
> instances.addInstance("hallå världen", "sv");
> instances.addInstance("this is london calling", "en");
> instances.addInstance("detta är london som ringer", "sv");
> instances.addInstance("john has a long mustache", "en");
> instances.addInstance("john har en lång mustache", "sv");
> instances.addInstance("all work and no play makes jack a dull boy", "en");
> instances.addInstance("att bara arbeta och aldrig leka gör jack en trist gosse", "sv");
> instances.addInstance("shrimp sandwich", "en");
> instances.addInstance("räksmörgås", "sv");
> instances.addInstance("it's now or never", "en");
> instances.addInstance("det är nu eller aldrig", "sv");
> instances.addInstance("to tie up at a landing-stage", "en");
> instances.addInstance("att angöra en brygga", "sv");
> instances.addInstance("it's now time for the children's television shows", "en");
> instances.addInstance("nu är det dags för barnprogram", "sv");
> instances.flush();
> testClassifier(instances, new NaiveBayesClassifier());
> testClassifier(instances, new FishersMethodClassifier());
> instances.close();
> }
> private void testClassifier(Instances instances, BayesianClassifier classifier) throws IOException {
> assertEquals("sv", classifier.classify(instances, "detta blir ett test")[0].getClassification());
> assertEquals("en", classifier.classify(instances, "this will be a test")[0].getClassification());
> // test training data instances. all ought to match!
> for (int documentNumber = 0; documentNumber < instances.getIndexReader().maxDoc(); documentNumber++) {
> if (!instances.getIndexReader().isDeleted(documentNumber)) {
> Map<Term, Double> features = instances.extractFeatures(instances.getIndexReader(), documentNumber, classifier.isNormalized());
> Document document = instances.getIndexReader().document(documentNumber);
> assertEquals(document.get("class"), classifier.classify(instances, features)[0].getClassification());
> }
> }
> }
> {code}
--
This message is automatically generated by JIRA.
-
You can reply to this email to add a comment to the issue online.
---------------------------------------------------------------------
To unsubscribe, e-mail: java-dev-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-dev-help@lucene.apache.org