You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2013/06/05 09:23:15 UTC

svn commit: r1489728 [1/3] - in /stanbol/trunk/enhancement-engines/topic/engine: ./ src/main/java/org/apache/stanbol/enhancer/engine/topic/ src/main/java/org/apache/stanbol/enhancer/topic/ src/main/java/org/apache/stanbol/enhancer/topic/training/ src/m...

Author: rwesten
Date: Wed Jun  5 07:23:15 2013
New Revision: 1489728

URL: http://svn.apache.org/r1489728
Log:
The Topic Engine now uses the ManagedSolrServer to initialise SolrCores; the same is true for the TrainingSet. Note that this means SolrCore configurations are now loaded via the DataFileProvider infrastructure, which allows users to load custom and/or pre-trained models copied to the 'stanbol/datafiles' folder - STANBOL-1087; updated the default schemas used by the TopicEngine and TrainingSet to Solr 4 - STANBOL-1086; added a default SolrCore configuration for the Topic Engine that supports n-grams - STANBOL-1089; removed all configuration properties used to configure the names of Solr fields from the Felix Webconsole dialog and added constants for their default values. Configuring those properties is still supported via OSGI configuration files - STANBOL-1090

Added:
    stanbol/trunk/enhancement-engines/topic/engine/src/main/resources/shingle-topic-model/
    stanbol/trunk/enhancement-engines/topic/engine/src/main/resources/shingle-topic-model/conf/
    stanbol/trunk/enhancement-engines/topic/engine/src/main/resources/shingle-topic-model/conf/schema.xml
    stanbol/trunk/enhancement-engines/topic/engine/src/main/resources/shingle-topic-model/conf/solrconfig.xml
Modified:
    stanbol/trunk/enhancement-engines/topic/engine/pom.xml
    stanbol/trunk/enhancement-engines/topic/engine/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java
    stanbol/trunk/enhancement-engines/topic/engine/src/main/java/org/apache/stanbol/enhancer/topic/ConfiguredSolrCoreTracker.java
    stanbol/trunk/enhancement-engines/topic/engine/src/main/java/org/apache/stanbol/enhancer/topic/training/SolrTrainingSet.java
    stanbol/trunk/enhancement-engines/topic/engine/src/main/resources/OSGI-INF/metatype/metatype.properties
    stanbol/trunk/enhancement-engines/topic/engine/src/main/resources/default-topic-model/conf/schema.xml
    stanbol/trunk/enhancement-engines/topic/engine/src/main/resources/default-topic-model/conf/solrconfig.xml
    stanbol/trunk/enhancement-engines/topic/engine/src/main/resources/default-topic-trainingset/conf/solrconfig.xml
    stanbol/trunk/enhancement-engines/topic/engine/src/test/java/org/apache/stanbol/enhancer/engine/topic/TopicEngineTest.java

Modified: stanbol/trunk/enhancement-engines/topic/engine/pom.xml
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/topic/engine/pom.xml?rev=1489728&r1=1489727&r2=1489728&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/topic/engine/pom.xml (original)
+++ stanbol/trunk/enhancement-engines/topic/engine/pom.xml Wed Jun  5 07:23:15 2013
@@ -74,7 +74,7 @@
             <Private-Package>
               org.apache.stanbol.enhancer.engine.topic
             </Private-Package>
-            <Install-Path>install-config</Install-Path>
+            <!--  Install-Path>install-config</Install-Path -->
             <Data-Files>data-files</Data-Files>
             <Data-Files-Priority>-100</Data-Files-Priority>
           </instructions>

Modified: stanbol/trunk/enhancement-engines/topic/engine/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/topic/engine/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java?rev=1489728&r1=1489727&r2=1489728&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/topic/engine/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java (original)
+++ stanbol/trunk/enhancement-engines/topic/engine/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java Wed Jun  5 07:23:15 2013
@@ -25,6 +25,7 @@ import java.util.Collections;
 import java.util.Date;
 import java.util.Dictionary;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.Hashtable;
 import java.util.Iterator;
 import java.util.LinkedHashSet;
@@ -142,70 +143,118 @@ import org.slf4j.LoggerFactory;
 @Component(metatype = true, immediate = true, configurationFactory = true, policy = ConfigurationPolicy.REQUIRE)
 @Service
 @Properties(value = {
-                     @Property(name = EnhancementEngine.PROPERTY_NAME),
-                     @Property(name = TopicClassificationEngine.ORDER, intValue = 100),
-                     @Property(name = TopicClassificationEngine.SOLR_CORE),
-                     @Property(name = TopicClassificationEngine.LANGUAGES),
-                     @Property(name = TopicClassificationEngine.SIMILARTITY_FIELD, value = "classifier_features"),
-                     @Property(name = TopicClassificationEngine.CONCEPT_URI_FIELD, value = "concept"),
-                     @Property(name = TopicClassificationEngine.PRIMARY_TOPIC_URI_FIELD, value = "primary_topic"),
-                     @Property(name = TopicClassificationEngine.BROADER_FIELD, value = "broader"),
-                     @Property(name = TopicClassificationEngine.MODEL_UPDATE_DATE_FIELD, value = "last_update_dt"),
-                     @Property(name = TopicClassificationEngine.PRECISION_FIELD, value = "precision"),
-                     @Property(name = TopicClassificationEngine.RECALL_FIELD, value = "recall"),
-                     @Property(name = TopicClassificationEngine.ENTRY_ID_FIELD, value = "entry_id"),
-                     @Property(name = TopicClassificationEngine.MODEL_ENTRY_ID_FIELD, value = "model_entry_id"),
-                     @Property(name = TopicClassificationEngine.ENTRY_TYPE_FIELD, value = "entry_type"),
-                     @Property(name = TopicClassificationEngine.MODEL_EVALUATION_DATE_FIELD, value = "last_evaluation_dt"),
-                     @Property(name = TopicClassificationEngine.FALSE_NEGATIVES_FIELD, value = "false_negatives"),
-                     @Property(name = TopicClassificationEngine.FALSE_POSITIVES_FIELD, value = "false_positives"),
-                     @Property(name = TopicClassificationEngine.POSITIVE_SUPPORT_FIELD, value = "positive_support"),
-                     @Property(name = TopicClassificationEngine.NEGATIVE_SUPPORT_FIELD, value = "negative_support"),
-                     @Property(name = TopicClassificationEngine.TRAINING_SET_ID),
-                     @Property(name = Constants.SERVICE_RANKING, intValue = 0)})
+        @Property(name = EnhancementEngine.PROPERTY_NAME),
+        @Property(name = TopicClassificationEngine.SOLR_CORE),
+        @Property(name = TopicClassificationEngine.SOLR_CORE_CONFIG,
+            value = TopicClassificationEngine.DEFAULT_SOLR_CORE_CONFIG),
+        @Property(name = TopicClassificationEngine.LANGUAGES),
+// These properties can still be set via a configuration file, but as most users
+// will not use them they are excluded from the configuration form.
+//        @Property(name = TopicClassificationEngine.SIMILARTITY_FIELD, value = TopicClassificationEngine.DEFAULT_SIMILARTITY_FIELD),
+//        @Property(name = TopicClassificationEngine.CONCEPT_URI_FIELD, value = TopicClassificationEngine.DEFAULT_CONCEPT_URI_FIELD),
+//        @Property(name = TopicClassificationEngine.PRIMARY_TOPIC_URI_FIELD, value = TopicClassificationEngine.DEFAULT_PRIMARY_TOPIC_URI_FIELD),
+//        @Property(name = TopicClassificationEngine.BROADER_FIELD, value = TopicClassificationEngine.DEFAULT_BROADER_FIELD),
+//        @Property(name = TopicClassificationEngine.MODEL_UPDATE_DATE_FIELD, value = TopicClassificationEngine.DEFAULT_MODEL_UPDATE_DATE_FIELD),
+//        @Property(name = TopicClassificationEngine.PRECISION_FIELD, value = TopicClassificationEngine.DEFAULT_PRECISION_FIELD),
+//        @Property(name = TopicClassificationEngine.RECALL_FIELD, value = TopicClassificationEngine.DEFAULT_RECALL_FIELD),
+//        @Property(name = TopicClassificationEngine.ENTRY_ID_FIELD, value = TopicClassificationEngine.DEFAULT_ENTRY_ID_FIELD),
+//        @Property(name = TopicClassificationEngine.MODEL_ENTRY_ID_FIELD, value = TopicClassificationEngine.DEFAULT_MODEL_ENTRY_ID_FIELD),
+//        @Property(name = TopicClassificationEngine.ENTRY_TYPE_FIELD, value = TopicClassificationEngine.DEFAULT_ENTRY_TYPE_FIELD),
+//        @Property(name = TopicClassificationEngine.MODEL_EVALUATION_DATE_FIELD, value = TopicClassificationEngine.DEFAULT_MODEL_EVALUATION_DATE_FIELD),
+//        @Property(name = TopicClassificationEngine.FALSE_NEGATIVES_FIELD, value = TopicClassificationEngine.DEFAULT_FALSE_NEGATIVES_FIELD),
+//        @Property(name = TopicClassificationEngine.FALSE_POSITIVES_FIELD, value = TopicClassificationEngine.DEFAULT_FALSE_POSITIVES_FIELD),
+//        @Property(name = TopicClassificationEngine.POSITIVE_SUPPORT_FIELD, value = TopicClassificationEngine.DEFAULT_POSITIVE_SUPPORT_FIELD),
+//        @Property(name = TopicClassificationEngine.NEGATIVE_SUPPORT_FIELD, value = TopicClassificationEngine.DEFAULT_NEGATIVE_SUPPORT_FIELD),
+//        @Property(name = TopicClassificationEngine.ORDER, intValue = TopicClassificationEngine.DEFAULT_ENGINE_ORDER),
+        @Property(name = TopicClassificationEngine.TRAINING_SET_ID),
+        @Property(name = Constants.SERVICE_RANKING, intValue = 0)})
 public class TopicClassificationEngine extends ConfiguredSolrCoreTracker implements EnhancementEngine,
         ServiceProperties, TopicClassifier {
 
+    public static final String DEFAULT_SOLR_CORE_CONFIG = "default-topic-model.solrindex.zip";
+
     public static final String MODEL_ENTRY = "model";
 
     public static final String METADATA_ENTRY = "metadata";
-
+    /**
+     * The reference to the SolrCore used by the TopicClassificationEngine.
+     * The default is the engine name with the suffix '-model'. This also supports
+     * the {server-name}:{index-name} syntax.
+     */
     public static final String SOLR_CORE = "org.apache.stanbol.enhancer.engine.topic.solrCore";
+    /**
+     * The name of the Solr Index archive (default: "default-topic-model.solrindex.zip").
+     * The file is loaded by using the DataFileProvider infrastructure. The archive may
+     * also include a pre-trained model.
+     */
+    public static final String SOLR_CORE_CONFIG = "org.apache.stanbol.enhancer.engine.topic.solrCoreConfig";
 
     public static final String LANGUAGES = "org.apache.stanbol.enhancer.engine.topic.languages";
 
     public static final String ORDER = "org.apache.stanbol.enhancer.engine.topic.order";
+    
+    public static final Integer DEFAULT_ENGINE_ORDER = ServiceProperties.ORDERING_CONTENT_EXTRACTION;
 
     public static final String ENTRY_ID_FIELD = "org.apache.stanbol.enhancer.engine.topic.entryIdField";
+    
+    public static final String DEFAULT_ENTRY_ID_FIELD = "entry_id";
 
     public static final String ENTRY_TYPE_FIELD = "org.apache.stanbol.enhancer.engine.topic.entryTypeField";
+    
+    public static final String DEFAULT_ENTRY_TYPE_FIELD = "entry_type";
 
     public static final String SIMILARTITY_FIELD = "org.apache.stanbol.enhancer.engine.topic.similarityField";
+    
+    public static final String DEFAULT_SIMILARTITY_FIELD = "classifier_features";
 
     public static final String CONCEPT_URI_FIELD = "org.apache.stanbol.enhancer.engine.topic.conceptUriField";
+    
+    public static final String DEFAULT_CONCEPT_URI_FIELD = "concept";
 
     public static final String BROADER_FIELD = "org.apache.stanbol.enhancer.engine.topic.broaderField";
+    
+    public static final String DEFAULT_BROADER_FIELD = "broader";
 
     public static final String PRIMARY_TOPIC_URI_FIELD = "org.apache.stanbol.enhancer.engine.topic.primaryTopicField";
+    
+    public static final String DEFAULT_PRIMARY_TOPIC_URI_FIELD = "primary_topic";
 
     public static final String MODEL_UPDATE_DATE_FIELD = "org.apache.stanbol.enhancer.engine.topic.modelUpdateDateField";
 
+    public static final String DEFAULT_MODEL_UPDATE_DATE_FIELD = "last_update_dt";
+    
     public static final String MODEL_EVALUATION_DATE_FIELD = "org.apache.stanbol.enhancer.engine.topic.modelEvaluationDateField";
+    
+    public static final String DEFAULT_MODEL_EVALUATION_DATE_FIELD = "last_evaluation_dt";
 
     public static final String MODEL_ENTRY_ID_FIELD = "org.apache.stanbol.enhancer.engine.topic.modelEntryIdField";
+    
+    public static final String DEFAULT_MODEL_ENTRY_ID_FIELD = "model_entry_id";
 
     public static final String PRECISION_FIELD = "org.apache.stanbol.enhancer.engine.topic.precisionField";
+    
+    public static final String DEFAULT_PRECISION_FIELD = "precision";
 
     public static final String RECALL_FIELD = "org.apache.stanbol.enhancer.engine.topic.recallField";
+    
+    public static final String DEFAULT_RECALL_FIELD = "recall";
 
     public static final String FALSE_POSITIVES_FIELD = "org.apache.stanbol.enhancer.engine.topic.falsePositivesField";
+    
+    public static final String DEFAULT_FALSE_POSITIVES_FIELD = "false_positives";
 
     public static final String FALSE_NEGATIVES_FIELD = "org.apache.stanbol.enhancer.engine.topic.falseNegativesField";
+    
+    public static final String DEFAULT_FALSE_NEGATIVES_FIELD = "false_negatives";
 
     public static final String POSITIVE_SUPPORT_FIELD = "org.apache.stanbol.enhancer.engine.topic.positiveSupportField";
 
+    public static final String DEFAULT_POSITIVE_SUPPORT_FIELD = "positive_support";
+    
     public static final String NEGATIVE_SUPPORT_FIELD = "org.apache.stanbol.enhancer.engine.topic.negativeSupportField";
 
+    public static final String DEFAULT_NEGATIVE_SUPPORT_FIELD = "negative_support";
+    
     public static final String TRAINING_SET_ID = "org.apache.stanbol.enhancer.engine.topic.trainingSetId";
 
     private static final Logger log = LoggerFactory.getLogger(TopicClassificationEngine.class);
@@ -250,6 +299,7 @@ public class TopicClassificationEngine e
     protected String engineName;
 
     protected List<String> acceptedLanguages;
+    private Set<String> acceptedLanguageSet;
 
     protected Integer order = ORDERING_EXTRACTION_ENHANCEMENT;
 
@@ -337,7 +387,6 @@ public class TopicClassificationEngine e
     protected void activate(ComponentContext context, Dictionary<String,Object> config) throws ConfigurationException,
                                                                                        InvalidSyntaxException {
         this.context = context;
-        indexArchiveName = "default-topic-model";
         configure(config);
 
         // if training set is not null, track it
@@ -376,36 +425,52 @@ public class TopicClassificationEngine e
 
     public void configure(Dictionary<String,Object> config) throws ConfigurationException {
         engineName = getRequiredStringParam(config, EnhancementEngine.PROPERTY_NAME);
-        entryIdField = getRequiredStringParam(config, ENTRY_ID_FIELD);
-        modelEntryIdField = getRequiredStringParam(config, MODEL_ENTRY_ID_FIELD);
-        conceptUriField = getRequiredStringParam(config, CONCEPT_URI_FIELD);
-        entryTypeField = getRequiredStringParam(config, ENTRY_TYPE_FIELD);
-        similarityField = getRequiredStringParam(config, SIMILARTITY_FIELD);
+        entryIdField = getRequiredStringParam(config, ENTRY_ID_FIELD, DEFAULT_ENTRY_ID_FIELD);
+        modelEntryIdField = getRequiredStringParam(config, MODEL_ENTRY_ID_FIELD, DEFAULT_MODEL_ENTRY_ID_FIELD);
+        conceptUriField = getRequiredStringParam(config, CONCEPT_URI_FIELD, DEFAULT_CONCEPT_URI_FIELD);
+        entryTypeField = getRequiredStringParam(config, ENTRY_TYPE_FIELD, DEFAULT_ENTRY_TYPE_FIELD);
+        similarityField = getRequiredStringParam(config, SIMILARTITY_FIELD, DEFAULT_SIMILARTITY_FIELD);
         acceptedLanguages = getStringListParan(config, LANGUAGES);
-        precisionField = getRequiredStringParam(config, PRECISION_FIELD);
-        recallField = getRequiredStringParam(config, RECALL_FIELD);
-        modelUpdateDateField = getRequiredStringParam(config, MODEL_UPDATE_DATE_FIELD);
-        modelEvaluationDateField = getRequiredStringParam(config, MODEL_EVALUATION_DATE_FIELD);
-        falsePositivesField = getRequiredStringParam(config, FALSE_POSITIVES_FIELD);
-        falseNegativesField = getRequiredStringParam(config, FALSE_NEGATIVES_FIELD);
-        positiveSupportField = getRequiredStringParam(config, POSITIVE_SUPPORT_FIELD);
-        negativeSupportField = getRequiredStringParam(config, NEGATIVE_SUPPORT_FIELD);
-        configureSolrCore(config, SOLR_CORE, engineName + "-model");
+        acceptedLanguageSet = new HashSet<String>(acceptedLanguages);
+        precisionField = getRequiredStringParam(config, PRECISION_FIELD, DEFAULT_PRECISION_FIELD);
+        recallField = getRequiredStringParam(config, RECALL_FIELD, DEFAULT_RECALL_FIELD);
+        modelUpdateDateField = getRequiredStringParam(config, MODEL_UPDATE_DATE_FIELD, DEFAULT_MODEL_UPDATE_DATE_FIELD);
+        modelEvaluationDateField = getRequiredStringParam(config, MODEL_EVALUATION_DATE_FIELD, DEFAULT_MODEL_EVALUATION_DATE_FIELD);
+        falsePositivesField = getRequiredStringParam(config, FALSE_POSITIVES_FIELD, DEFAULT_FALSE_POSITIVES_FIELD);
+        falseNegativesField = getRequiredStringParam(config, FALSE_NEGATIVES_FIELD, DEFAULT_FALSE_NEGATIVES_FIELD);
+        positiveSupportField = getRequiredStringParam(config, POSITIVE_SUPPORT_FIELD, DEFAULT_POSITIVE_SUPPORT_FIELD);
+        negativeSupportField = getRequiredStringParam(config, NEGATIVE_SUPPORT_FIELD, DEFAULT_NEGATIVE_SUPPORT_FIELD);
+        configureSolrCore(config, SOLR_CORE, engineName + "-model",SOLR_CORE_CONFIG);
 
         // optional fields, can be null
-        broaderField = (String) config.get(BROADER_FIELD);
-        primaryTopicUriField = (String) config.get(PRIMARY_TOPIC_URI_FIELD);
+        broaderField = getRequiredStringParam(config, BROADER_FIELD, DEFAULT_BROADER_FIELD);
+        primaryTopicUriField = getRequiredStringParam(config, PRIMARY_TOPIC_URI_FIELD, DEFAULT_PRIMARY_TOPIC_URI_FIELD);
         trainingSetId = (String) config.get(TRAINING_SET_ID);
         Object orderParamValue = config.get(ORDER);
-        if (orderParamValue != null) {
-            order = (Integer) orderParamValue;
+        if (orderParamValue instanceof Number) {
+            order = ((Number) orderParamValue).intValue();
+        } else if(orderParamValue != null){
+            try {
+                Integer.parseInt(orderParamValue.toString());
+            }catch (NumberFormatException e) {
+                throw new ConfigurationException(ORDER, "The configured EnhancementEngine "
+                    + "order MUST BE an Intever value!",e);
+            }
+        } else {
+            order = DEFAULT_ENGINE_ORDER;
         }
     }
 
     @Override
     public int canEnhance(ContentItem ci) throws EngineException {
         if (ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES) != null && getActiveSolrServer() != null) {
-            return ENHANCE_SYNCHRONOUS;
+            String language = EnhancementEngineHelper.getLanguage(ci);
+            if(acceptedLanguageSet.isEmpty() || acceptedLanguageSet.contains(language) ||
+                    acceptedLanguageSet.contains("")){
+                return ENHANCE_SYNCHRONOUS;
+            } else {
+                return CANNOT_ENHANCE;
+            }
         } else {
             return CANNOT_ENHANCE;
         }
@@ -421,6 +486,12 @@ public class TopicClassificationEngine e
                             + "') -> this indicates that canEnhance was"
                             + "NOT called and indicates a bug in the used EnhancementJobManager!");
         }
+        String language = EnhancementEngineHelper.getLanguage(ci);
+        if(!(acceptedLanguageSet.isEmpty() || acceptedLanguageSet.contains(language) ||
+                acceptedLanguageSet.contains(""))){
+            throw new IllegalStateException("The language '"+language+"' of the ContentItem is not configured as "
+                +" active for this Engine (active: "+acceptedLanguageSet+").");
+        }
         String text;
         try {
             text = ContentItemHelper.getText(contentPart.getValue());
@@ -551,7 +622,7 @@ public class TopicClassificationEngine e
         List<TopicSuggestion> suggestedTopics = new ArrayList<TopicSuggestion>(MAX_SUGGESTIONS * 3);
         SolrServer solrServer = getActiveSolrServer();
         SolrQuery query = new SolrQuery();
-        query.setQueryType("/" + MoreLikeThisParams.MLT);
+        query.setRequestHandler("/" + MoreLikeThisParams.MLT);
         query.setFilterQueries(entryTypeField + ":" + MODEL_ENTRY);
         query.set(MoreLikeThisParams.MATCH_INCLUDE, false);
         query.set(MoreLikeThisParams.MIN_DOC_FREQ, 1);
@@ -823,7 +894,15 @@ public class TopicClassificationEngine e
             return trainingSet;
         }
         if (trainingSetTracker != null) {
-            return (TrainingSet) trainingSetTracker.getService();
+            TrainingSet trainingsSet = (TrainingSet) trainingSetTracker.getService();
+            if(trainingsSet == null){
+                for(int i=0; i < 5 && trainingsSet == null; i++){
+                    try {
+                        trainingsSet = (TrainingSet) trainingSetTracker.waitForService(1000);
+                    } catch (InterruptedException e) {/*ignore*/}
+                }
+            }
+            return trainingsSet;
         }
         return null;
     }
@@ -1023,13 +1102,14 @@ public class TopicClassificationEngine e
         cvFoldCount = foldCount;
     }
 
-    protected Dictionary<String,Object> getCanonicalConfiguration(Object server) {
+    protected Dictionary<String,Object> getCanonicalConfiguration(Object server, Object coreConfig) {
         Hashtable<String,Object> config = new Hashtable<String,Object>();
         config.put(EnhancementEngine.PROPERTY_NAME, engineName + "-evaluation");
         config.put(TopicClassificationEngine.ENTRY_ID_FIELD, "entry_id");
         config.put(TopicClassificationEngine.ENTRY_TYPE_FIELD, "entry_type");
         config.put(TopicClassificationEngine.MODEL_ENTRY_ID_FIELD, "model_entry_id");
         config.put(TopicClassificationEngine.SOLR_CORE, server);
+        config.put(TopicClassificationEngine.SOLR_CORE_CONFIG, coreConfig);
         config.put(TopicClassificationEngine.CONCEPT_URI_FIELD, "concept");
         config.put(TopicClassificationEngine.PRIMARY_TOPIC_URI_FIELD, "primary_topic");
         config.put(TopicClassificationEngine.SIMILARTITY_FIELD, "classifier_features");
@@ -1085,12 +1165,8 @@ public class TopicClassificationEngine e
         return updatedTopics;
     }
 
-    protected int performCVFold(int cvFoldIndex,
-                                int cvFoldCount,
-                                int cvIterations,
-                                boolean incremental) throws ConfigurationException,
-                                                    TrainingSetException,
-                                                    ClassifierException {
+    protected int performCVFold(int cvFoldIndex, int cvFoldCount, int cvIterations, boolean incremental)
+            throws ConfigurationException, TrainingSetException, ClassifierException {
 
         cvIterations = cvIterations <= 0 ? cvFoldCount : cvFoldCount;
         log.info(String.format("Performing evaluation %d-fold CV iteration %d/%d on classifier %s",
@@ -1102,7 +1178,9 @@ public class TopicClassificationEngine e
                 // OSGi setup: the evaluation server will be generated automatically using the
                 // managedSolrServer
                 classifier.bindManagedSolrServer(managedSolrServer);
-                classifier.activate(context, getCanonicalConfiguration(engineName + "-evaluation"));
+                classifier.activate(context, getCanonicalConfiguration(
+                    engineName + "-evaluation", //TODO: maybe we should use the SolrCoreName instead
+                    solrCoreConfig));
             } else {
                 if(__evaluationServer == null){
                     __evaluationServerDir = new File(embeddedSolrServerDir,engineName + "-evaluation");
@@ -1112,7 +1190,7 @@ public class TopicClassificationEngine e
                     __evaluationServer = EmbeddedSolrHelper.makeEmbeddedSolrServer(__evaluationServerDir,
                         "evaluationclassifierserver", "default-topic-model", "default-topic-model");
                 }
-                classifier.configure(getCanonicalConfiguration(__evaluationServer));
+                classifier.configure(getCanonicalConfiguration(__evaluationServer,solrCoreConfig));
             }
         } catch (Exception e) {
             throw new ClassifierException(e);

Modified: stanbol/trunk/enhancement-engines/topic/engine/src/main/java/org/apache/stanbol/enhancer/topic/ConfiguredSolrCoreTracker.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/topic/engine/src/main/java/org/apache/stanbol/enhancer/topic/ConfiguredSolrCoreTracker.java?rev=1489728&r1=1489727&r2=1489728&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/topic/engine/src/main/java/org/apache/stanbol/enhancer/topic/ConfiguredSolrCoreTracker.java (original)
+++ stanbol/trunk/enhancement-engines/topic/engine/src/main/java/org/apache/stanbol/enhancer/topic/ConfiguredSolrCoreTracker.java Wed Jun  5 07:23:15 2013
@@ -17,20 +17,22 @@
 package org.apache.stanbol.enhancer.topic;
 
 import java.io.IOException;
-import java.net.URL;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Dictionary;
 import java.util.List;
 
-import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
 import org.apache.solr.client.solrj.SolrServer;
 import org.apache.stanbol.commons.solr.IndexReference;
 import org.apache.stanbol.commons.solr.RegisteredSolrServerTracker;
 import org.apache.stanbol.commons.solr.managed.IndexMetadata;
+import org.apache.stanbol.commons.solr.managed.ManagedIndexState;
 import org.apache.stanbol.commons.solr.managed.ManagedSolrServer;
+import org.apache.stanbol.enhancer.engine.topic.TopicClassificationEngine;
 import org.osgi.service.cm.ConfigurationException;
 import org.osgi.service.component.ComponentContext;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 import org.xml.sax.SAXException;
 
 /**
@@ -38,6 +40,8 @@ import org.xml.sax.SAXException;
  */
 public abstract class ConfiguredSolrCoreTracker {
 
+    protected final Logger log = LoggerFactory.getLogger(getClass());
+    
     protected ManagedSolrServer managedSolrServer;
 
     protected String solrCoreId;
@@ -49,7 +53,9 @@ public abstract class ConfiguredSolrCore
 
     protected ComponentContext context;
 
-    protected String indexArchiveName;
+    protected String solrCoreConfig;
+
+    //protected String indexArchiveName;
 
     abstract public void configure(Dictionary<String,Object> config) throws ConfigurationException;
 
@@ -93,7 +99,22 @@ public abstract class ConfiguredSolrCore
      *         tracker.
      */
     public SolrServer getActiveSolrServer() {
-        SolrServer result = solrServer != null ? solrServer : indexTracker.getService();
+        SolrServer result;
+        if(solrServer != null){
+            result = solrServer;
+        } else {
+            result = indexTracker.getService();
+            if(result == null){
+                //try to wait for the server (mainly because the evaluation
+                //server is created on demand and will need some time to be
+                //initialised).
+                for(int i = 0; i < 5 && result == null; i++){
+                    try {
+                        result = (SolrServer) indexTracker.waitForService(1000);
+                    } catch (InterruptedException e) {/* ignore */ }
+                }
+            }
+        }
         if (result == null) {
             if (solrCoreId != null) {
                 throw new RuntimeException("No Solr Core registered with id: " + solrCoreId);
@@ -105,27 +126,32 @@ public abstract class ConfiguredSolrCore
     }
 
     protected void configureSolrCore(Dictionary<String,Object> config,
-                                     String solrCoreProperty,
-                                     String defaultCoreId) throws ConfigurationException {
+            String solrCoreProperty, String defaultCoreId,
+            String solrCoreConfigProperty) 
+                    throws ConfigurationException {
         Object solrCoreInfo = config.get(solrCoreProperty);
         if (solrCoreInfo instanceof SolrServer) {
             // Bind a fixed Solr server client instead of doing dynamic OSGi lookup using the service tracker.
             // This can be useful both for unit-testing .
             solrServer = (SolrServer) config.get(solrCoreProperty);
+            solrCoreConfig = TopicClassificationEngine.DEFAULT_SOLR_CORE_CONFIG;
         } else {
-            if (solrCoreInfo != null && !solrCoreInfo.toString().trim().isEmpty()) {
-                this.solrCoreId = solrCoreInfo.toString();
-            } else {
-                this.solrCoreId = defaultCoreId;
-            }
             if (context == null) {
                 throw new ConfigurationException(solrCoreProperty,
                         solrCoreProperty + " should be a SolrServer instance for using"
                                 + " the engine without any OSGi context. Got: " + solrCoreId);
             }
+            if (solrCoreInfo != null && !solrCoreInfo.toString().trim().isEmpty()) {
+                this.solrCoreId = solrCoreInfo.toString().trim();
+            } else {
+                this.solrCoreId = defaultCoreId;
+            }
+            solrCoreConfig = getRequiredStringParam(config, solrCoreConfigProperty, 
+                this.solrCoreId + ".solrindex.zip");
             try {
                 IndexReference indexReference = IndexReference.parse(solrCoreId);
-                indexReference = checkInitSolrIndex(indexReference);
+                //String configName = getRequiredStringParam(config, SOLR_CONFIG, defaultValue)
+                indexReference = checkInitSolrIndex(indexReference, solrCoreConfig);
                 // track the solr core OSGi updates
                 indexTracker = new RegisteredSolrServerTracker(context.getBundleContext(), indexReference);
                 indexTracker.open();
@@ -134,30 +160,61 @@ public abstract class ConfiguredSolrCore
             }
         }
     }
-
-    protected IndexReference checkInitSolrIndex(IndexReference indexReference) throws IOException,
-                                                                              ConfigurationException,
-                                                                              SAXException {
+    /**
+     * Checks if the SolrIndex is available and, if not, tries to initialise it.
+     * @param indexReference the SolrCore reference
+     * @param solrCoreConfig the name of the SolrIndex configuration ({name}.solrindex.zip)
+     * @return the reference of the managed index if it was checked/initialised, otherwise the parsed indexReference
+     * @throws IOException
+     * @throws ConfigurationException
+     * @throws SAXException
+     */
+    protected IndexReference checkInitSolrIndex(IndexReference indexReference, String solrCoreConfig) 
+            throws IOException, ConfigurationException, SAXException {
         // if the solr core is managed, check that the index is properly activated
         if (managedSolrServer != null && indexReference.checkServer(managedSolrServer.getServerName())
-            && context != null) {
+            && context != null && solrCoreConfig != null) {
+            log.info(" > check/init index {} on ManagedSolrServer {}", indexReference, managedSolrServer.getServerName());
             String indexName = indexReference.getIndex();
-            IndexMetadata indexMetadata = managedSolrServer.getIndexMetadata(indexName);
-            if (indexMetadata == null) {
-                // TODO: debug the DataFileProvider init race conditions instead
-                // indexMetadata = managedSolrServer.createSolrIndex(indexName, indexArchiveName, null);
-                URL archiveUrl = context.getBundleContext().getBundle()
-                        .getEntry("/data-files/" + indexArchiveName + ".solrindex.zip");
-                if (archiveUrl == null) {
-                    throw new ConfigurationException(solrCoreId, "Could not find index archive for "
-                                                                 + indexArchiveName);
+            final IndexMetadata indexMetadata;
+            ManagedIndexState indexState = managedSolrServer.getIndexState(indexName);
+            if(indexState == null){
+                if(solrCoreConfig.indexOf(".solrindex.") < 0){ //if the suffix is missing
+                    solrCoreConfig = solrCoreConfig + ".solrindex.zip"; //append it
+                }
+                log.info("Create SolrCore {} (config: {}) on ManagedSolrServer {} ...",
+                    new Object[]{indexName,solrCoreConfig,managedSolrServer.getServerName()});
+                indexMetadata = managedSolrServer.createSolrIndex(indexName, 
+                    solrCoreConfig, null);
+                if(indexMetadata != null)
+                log.info("  ... created {}", indexMetadata.getIndexReference());
+            } else {
+                indexMetadata = managedSolrServer.getIndexMetadata(indexName);
+                if(indexState != ManagedIndexState.ACTIVE){
+                    log.info("  ... activate {}", indexMetadata.getIndexReference());
+                    managedSolrServer.activateIndex(indexName);
+                } else {
+                    log.info("  ... index {} already active", indexMetadata.getIndexReference());
                 }
-                ZipArchiveInputStream zis = new ZipArchiveInputStream(archiveUrl.openStream());
-                indexMetadata = managedSolrServer.updateIndex(indexName, zis, indexArchiveName);
-            }
-            if (!indexMetadata.isActive()) {
-                managedSolrServer.activateIndex(indexName);
             }
+//            IndexMetadata indexMetadata = managedSolrServer.getIndexMetadata(indexName);
+//            if (indexMetadata == null) {
+//                // TODO: debug the DataFileProvider init race conditions instead
+//                // indexMetadata = managedSolrServer.createSolrIndex(indexName, indexArchiveName, null);
+//                dfp.getInputStream(context.getBundleContext().getBundle().getSymbolicName(), 
+//                    indexArchiveName + ".solrindex.zip", null);
+//                URL archiveUrl = context.getBundleContext().getBundle()
+//                        .getEntry("/data-files/" + indexArchiveName + ".solrindex.zip");
+//                if (archiveUrl == null) {
+//                    throw new ConfigurationException(solrCoreId, "Could not find index archive for "
+//                                                                 + indexArchiveName);
+//                }
+//                ZipArchiveInputStream zis = new ZipArchiveInputStream(archiveUrl.openStream());
+//                indexMetadata = managedSolrServer.updateIndex(indexName, zis, indexArchiveName);
+//            }
+//            if (!indexMetadata.isActive()) {
+//                managedSolrServer.activateIndex(indexName);
+//            }
             indexReference = indexMetadata.getIndexReference();
         }
         return indexReference;

Modified: stanbol/trunk/enhancement-engines/topic/engine/src/main/java/org/apache/stanbol/enhancer/topic/training/SolrTrainingSet.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/topic/engine/src/main/java/org/apache/stanbol/enhancer/topic/training/SolrTrainingSet.java?rev=1489728&r1=1489727&r2=1489728&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/topic/engine/src/main/java/org/apache/stanbol/enhancer/topic/training/SolrTrainingSet.java (original)
+++ stanbol/trunk/enhancement-engines/topic/engine/src/main/java/org/apache/stanbol/enhancer/topic/training/SolrTrainingSet.java Wed Jun  5 07:23:15 2013
@@ -43,6 +43,7 @@ import org.apache.solr.client.solrj.util
 import org.apache.solr.common.SolrDocument;
 import org.apache.solr.common.SolrInputDocument;
 import org.apache.stanbol.commons.solr.managed.ManagedSolrServer;
+import org.apache.stanbol.commons.stanboltools.datafileprovider.DataFileProvider;
 import org.apache.stanbol.enhancer.topic.Batch;
 import org.apache.stanbol.enhancer.topic.ConfiguredSolrCoreTracker;
 import org.apache.stanbol.enhancer.topic.UTCTimeStamper;
@@ -59,24 +60,38 @@ import org.slf4j.LoggerFactory;
 @Component(metatype = true, immediate = true, configurationFactory = true, policy = ConfigurationPolicy.REQUIRE)
 @Service
 @Properties(value = {@Property(name = SolrTrainingSet.TRAINING_SET_NAME),
-                     @Property(name = SolrTrainingSet.SOLR_CORE),
-                     @Property(name = SolrTrainingSet.EXAMPLE_ID_FIELD, value = "id"),
-                     @Property(name = SolrTrainingSet.EXAMPLE_TEXT_FIELD, value = "text"),
-                     @Property(name = SolrTrainingSet.TOPICS_URI_FIELD, value = "topics"),
-                     @Property(name = SolrTrainingSet.MODIFICATION_DATE_FIELD, value = "modification_dt")})
+        @Property(name = SolrTrainingSet.SOLR_CORE),
+        @Property(name = SolrTrainingSet.SOLR_CORE_CONFIG, value = SolrTrainingSet.DEFAULT_SOLR_CORE_CONFIG)
+//        @Property(name = SolrTrainingSet.EXAMPLE_ID_FIELD, value = SolrTrainingSet.DEFAULT_EXAMPLE_ID_FIELD),
+//        @Property(name = SolrTrainingSet.EXAMPLE_TEXT_FIELD, value = SolrTrainingSet.DEFAULT_EXAMPLE_TEXT_FIELD),
+//        @Property(name = SolrTrainingSet.TOPICS_URI_FIELD, value = SolrTrainingSet.DEFAULT_TOPICS_URI_FIELD),
+//        @Property(name = SolrTrainingSet.MODIFICATION_DATE_FIELD, value = SolrTrainingSet.DEFAULT_MODIFICATION_DATE_FIELD)
+})
 public class SolrTrainingSet extends ConfiguredSolrCoreTracker implements TrainingSet {
 
     public static final String TRAINING_SET_NAME = "org.apache.stanbol.enhancer.topic.trainingset.id";
 
     public static final String SOLR_CORE = "org.apache.stanbol.enhancer.engine.topic.solrCore";
 
+    public static final String SOLR_CORE_CONFIG = "org.apache.stanbol.enhancer.engine.topic.solrCoreConfig";
+    
+    public static final String DEFAULT_SOLR_CORE_CONFIG = "default-topic-trainingset.solrindex.zip";
+
     public static final String TOPICS_URI_FIELD = "org.apache.stanbol.enhancer.engine.topic.topicsUriField";
+    
+    public static final String DEFAULT_TOPICS_URI_FIELD = "topics";
 
     public static final String EXAMPLE_ID_FIELD = "org.apache.stanbol.enhancer.engine.topic.exampleIdField";
+    
+    public static final String DEFAULT_EXAMPLE_ID_FIELD = "id";
 
     public static final String EXAMPLE_TEXT_FIELD = "org.apache.stanbol.enhancer.engine.topic.exampleTextField";
+    
+    public static final String DEFAULT_EXAMPLE_TEXT_FIELD = "text";
 
     public static final String MODIFICATION_DATE_FIELD = "org.apache.stanbol.enhancer.engine.topic.modificiationDateField";
+    
+    public static final String DEFAULT_MODIFICATION_DATE_FIELD = "modification_dt";
 
     @SuppressWarnings("unused")
     private static final Logger log = LoggerFactory.getLogger(SolrTrainingSet.class);
@@ -96,14 +111,13 @@ public class SolrTrainingSet extends Con
 
     @Reference(cardinality = ReferenceCardinality.OPTIONAL_UNARY, bind = "bindManagedSolrServer", unbind = "unbindManagedSolrServer", strategy = ReferenceStrategy.EVENT, policy = ReferencePolicy.DYNAMIC)
     protected ManagedSolrServer managedSolrServer;
-
+    
     public String getName() {
         return trainingSetId;
     }
 
     @Activate
     protected void activate(ComponentContext context) throws ConfigurationException, InvalidSyntaxException {
-        indexArchiveName = "default-topic-trainingset";
         @SuppressWarnings("unchecked")
         Dictionary<String,Object> config = context.getProperties();
         this.context = context;
@@ -120,11 +134,11 @@ public class SolrTrainingSet extends Con
     @Override
     public void configure(Dictionary<String,Object> config) throws ConfigurationException {
         trainingSetId = getRequiredStringParam(config, TRAINING_SET_NAME);
-        exampleIdField = getRequiredStringParam(config, EXAMPLE_ID_FIELD);
-        exampleTextField = getRequiredStringParam(config, EXAMPLE_TEXT_FIELD);
-        topicUrisField = getRequiredStringParam(config, TOPICS_URI_FIELD);
-        modificationDateField = getRequiredStringParam(config, MODIFICATION_DATE_FIELD);
-        configureSolrCore(config, SOLR_CORE, trainingSetId);
+        exampleIdField = getRequiredStringParam(config, EXAMPLE_ID_FIELD, DEFAULT_EXAMPLE_ID_FIELD);
+        exampleTextField = getRequiredStringParam(config, EXAMPLE_TEXT_FIELD, DEFAULT_EXAMPLE_TEXT_FIELD);
+        topicUrisField = getRequiredStringParam(config, TOPICS_URI_FIELD, DEFAULT_TOPICS_URI_FIELD);
+        modificationDateField = getRequiredStringParam(config, MODIFICATION_DATE_FIELD, DEFAULT_MODIFICATION_DATE_FIELD);
+        configureSolrCore(config, SOLR_CORE, trainingSetId, SOLR_CORE_CONFIG);
     }
 
     public static ConfiguredSolrCoreTracker fromParameters(Dictionary<String,Object> config) throws ConfigurationException {

Modified: stanbol/trunk/enhancement-engines/topic/engine/src/main/resources/OSGI-INF/metatype/metatype.properties
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/topic/engine/src/main/resources/OSGI-INF/metatype/metatype.properties?rev=1489728&r1=1489727&r2=1489728&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/topic/engine/src/main/resources/OSGI-INF/metatype/metatype.properties (original)
+++ stanbol/trunk/enhancement-engines/topic/engine/src/main/resources/OSGI-INF/metatype/metatype.properties Wed Jun  5 07:23:15 2013
@@ -42,9 +42,27 @@ same value can be executed in parallel.
 
 #org.apache.stanbol.enhancer.engine.topic.languages
 org.apache.stanbol.enhancer.engine.topic.languages.name=Languages
+org.apache.stanbol.enhancer.engine.topic.languages.description=The list of \
+supported languages (default: all)
 
 #org.apache.stanbol.enhancer.engine.topic.solrCore
 org.apache.stanbol.enhancer.engine.topic.solrCore.name=Solr Core
+org.apache.stanbol.enhancer.engine.topic.solrCore.description=The name of the \
+Solr Core (default: '{engine-name}-model'). This also supports the \
+'{server-name}:{core-name}' syntax.
+
+#org.apache.stanbol.enhancer.engine.topic.solrCoreConfig
+org.apache.stanbol.enhancer.engine.topic.solrCoreConfig.name=Solr Core Config
+org.apache.stanbol.enhancer.engine.topic.solrCoreConfig.description=The name \
+of the Solr Core Configuration used by the Topic Classification instance. \
+The file with this name is loaded via the DataFileProvider. It may also \
+contain a pre-trained model.
+
+#org.apache.stanbol.enhancer.engine.topic.trainingSetId
+org.apache.stanbol.enhancer.engine.topic.trainingSetId.name=Training Set
+org.apache.stanbol.enhancer.engine.topic.trainingSetId.description=The name of \
+the Training Set used for this Topic Classification engine. If not specified the \
+model of this Engine will be read-only.
 
 #org.apache.stanbol.enhancer.engine.topic.entryIdField
 org.apache.stanbol.enhancer.engine.topic.entryIdField.name=ID Field
@@ -93,3 +111,32 @@ org.apache.stanbol.enhancer.engine.topic
 
 #org.apache.stanbol.enhancer.engine.topic.negativeSupportField
 org.apache.stanbol.enhancer.engine.topic.negativeSupportField.name=Negative Support Field
+
+org.apache.stanbol.enhancer.engine.topic.conceptUriField.name=Concept URI Field
+
+org.apache.stanbol.enhancer.engine.topic.primaryTopicField.name=Primary Topic Field
+
+# Configuration Properties for the Solr Training Set
+# org.apache.stanbol.enhancer.topic.training.SolrTrainingSet
+org.apache.stanbol.enhancer.topic.training.SolrTrainingSet.name=Apache Stanbol \
+Enhancer: Solr based Topic Classifier TrainingSet
+org.apache.stanbol.enhancer.topic.training.SolrTrainingSet.description=Solr \
+based implementation of a TrainingSet for Topic Classifiers
+
+
+org.apache.stanbol.enhancer.topic.trainingset.id.name=Training Set Name
+org.apache.stanbol.enhancer.engine.topic.exampleIdField.name=Document ID Field
+org.apache.stanbol.enhancer.engine.topic.exampleIdField.description=The Solr \
+field name used to store the ID of the training document
+org.apache.stanbol.enhancer.engine.topic.exampleTextField.name=Document Text Field
+org.apache.stanbol.enhancer.engine.topic.exampleTextField.description=The Solr \
+field name used to store the text of the training document
+org.apache.stanbol.enhancer.engine.topic.topicsUriField.name=Topic URI Field
+org.apache.stanbol.enhancer.engine.topic.topicsUriField.description=The Solr \
+field name used to store the URIs of Concepts the training document is assigned to
+org.apache.stanbol.enhancer.engine.topic.modificiationDateField.name=Modification Date Field
+org.apache.stanbol.enhancer.engine.topic.modificiationDateField.description=The Solr \
+field name used to store the last change to the training document
+
+
+

Modified: stanbol/trunk/enhancement-engines/topic/engine/src/main/resources/default-topic-model/conf/schema.xml
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/topic/engine/src/main/resources/default-topic-model/conf/schema.xml?rev=1489728&r1=1489727&r2=1489728&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/topic/engine/src/main/resources/default-topic-model/conf/schema.xml (original)
+++ stanbol/trunk/enhancement-engines/topic/engine/src/main/resources/default-topic-model/conf/schema.xml Wed Jun  5 07:23:15 2013
@@ -9,7 +9,7 @@
   License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
   OF ANY KIND, either express or implied. See the License for the specific 
   language governing permissions and limitations under the License. -->
-<schema name="example" version="1.3">
+<schema name="default-topic-model" version="1.3">
   <types>
     <fieldType name="uuid" class="solr.UUIDField" indexed="true" />