You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2016/02/23 10:43:03 UTC
svn commit: r1731820 - in /stanbol/trunk/enhancement-engines/lucenefstlinking: ./ src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/ src/main/resources/config/ src/test/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/

Author: rwesten
Date: Tue Feb 23 09:43:03 2016
New Revision: 1731820

URL: http://svn.apache.org/viewvc?rev=1731820&view=rev
Log:
merged implementations for STANBOL-1447 and STANBOL-1448 to trunk

Added:
    stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/resources/config/
      - copied from r1731818, stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/resources/config/
Modified:
    stanbol/trunk/enhancement-engines/lucenefstlinking/pom.xml
    stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/CorpusCreationTask.java
    stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/CorpusInfo.java
    stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java
    stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java
    stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/IndexConfiguration.java
    stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/TaggingSession.java
    stanbol/trunk/enhancement-engines/lucenefstlinking/src/test/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineTest.java

Modified: stanbol/trunk/enhancement-engines/lucenefstlinking/pom.xml
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/pom.xml?rev=1731820&r1=1731819&r2=1731820&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/lucenefstlinking/pom.xml (original)
+++ stanbol/trunk/enhancement-engines/lucenefstlinking/pom.xml Tue Feb 23 09:43:03 2016
@@ -73,6 +73,8 @@
             <Embed-Dependency>
               solr-text-tagger
             </Embed-Dependency>
+            <!-- we install a logger configuration to set TaggerFstCorpus loggings to ERROR -->
+            <Install-Path>config</Install-Path>
           </instructions>
         </configuration>
       </plugin>

Modified: stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/CorpusCreationTask.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/CorpusCreationTask.java?rev=1731820&r1=1731819&r2=1731820&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/CorpusCreationTask.java (original)
+++ stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/CorpusCreationTask.java Tue Feb 23 09:43:03 2016
@@ -39,13 +39,12 @@ import org.slf4j.LoggerFactory;
  * @author Rupert Westenthaler
  *
  */
-public class CorpusCreationTask implements Runnable{
+public class CorpusCreationTask implements Callable<TaggerFstCorpus>{
 
     private final Logger log = LoggerFactory.getLogger(CorpusCreationTask.class);
     
     private final CorpusInfo fstInfo;
     private final IndexConfiguration indexConfig;
-    private final long enqueued;
     
     public CorpusCreationTask(IndexConfiguration indexConfig, CorpusInfo fstInfo){
         if(indexConfig == null || fstInfo == null){
@@ -53,22 +52,20 @@ public class CorpusCreationTask implemen
         }
         this.indexConfig = indexConfig;
         this.fstInfo = fstInfo;
-        this.enqueued = fstInfo.enqueue();
     }
     
     @Override
-    public void run() {
+    public TaggerFstCorpus call() {
         if(!indexConfig.isActive()){
-            return; //task cancelled
-        }
-        //check if the FST corpus was enqueued a 2nd time
-        if(enqueued != fstInfo.getEnqueued()){
-            return;
+            String msg = "Index Configuration already deactivated";
+            fstInfo.setError(msg);
+            throw new IllegalStateException(msg);
         }
         SolrCore core = indexConfig.getIndex();
         if(core.isClosed()){
-            log.warn("Unable to build {} becuase SolrCore {} is closed!",fstInfo,core.getName());
-            return;
+            String msg = "Unable to build " + fstInfo + " becuase SolrCore " + core.getName() + " is closed!";
+            fstInfo.setError(msg);
+            throw new IllegalStateException(msg);
         }
         final TaggerFstCorpus corpus;
         RefCounted<SolrIndexSearcher> searcherRef = core.getSearcher();
@@ -85,6 +82,14 @@ public class CorpusCreationTask implemen
                         fstInfo.partialMatches,1,100);
                 }
             });
+            if(indexConfig.isActive()){
+                //set the created corpus to the FST Info
+                fstInfo.setCorpus(corpus);
+            } else { //index configuration no longer active ... ignore the built FST
+                log.warn("Index Config for "+ fstInfo + "was deactivated while building FST. "
+                        + "Built FST will be ignored.");
+            }
+            return corpus;
         } catch (PrivilegedActionException pae) {
             Exception e = pae.getException();
             if(e instanceof IOException){ //IO Exception while loading the file
@@ -96,31 +101,6 @@ public class CorpusCreationTask implemen
         } finally {
             searcherRef.decref(); //ensure that we dereference the searcher
         }
-        if(indexConfig.isActive()){
-            //set the created corpus to the FST Info
-            fstInfo.setCorpus(enqueued, corpus);
-            try { //STANBOL-1177: save FST models in AccessController.doPrivileged(..)
-                AccessController.doPrivileged(new PrivilegedExceptionAction<Object>() {
-                    public Object run() throws IOException {
-                        if(fstInfo.fst.exists()){
-                            if(!FileUtils.deleteQuietly(fstInfo.fst)){
-                                log.warn("Unable to delete existing FST file for {}", fstInfo);
-                            }
-                        }
-                        corpus.save(fstInfo.fst);
-                        return null; //not used
-                    }
-                });
-            } catch (PrivilegedActionException pae) {
-                Exception e = pae.getException();
-                if(e instanceof IOException){ //IO Exception while loading the file
-                    log.warn("Unable to store FST corpus " + fstInfo + " to "
-                            + fstInfo.fst.getAbsolutePath() + "!", e);
-                } else { //Runtime exception
-                    throw RuntimeException.class.cast(e);
-                }
-            }
-        } //else index configuration no longer active ... ignore the built FST
     }
     
     @Override

Modified: stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/CorpusInfo.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/CorpusInfo.java?rev=1731820&r1=1731819&r2=1731820&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/CorpusInfo.java (original)
+++ stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/CorpusInfo.java Tue Feb 23 09:43:03 2016
@@ -26,6 +26,9 @@ import java.security.PrivilegedActionExc
 import java.security.PrivilegedExceptionAction;
 import java.text.SimpleDateFormat;
 import java.util.Date;
+import java.util.concurrent.Future;
+import java.util.concurrent.locks.ReadWriteLock;
+import java.util.concurrent.locks.ReentrantReadWriteLock;
 
 import org.apache.commons.io.FileUtils;
 import org.apache.commons.lang.ObjectUtils;
@@ -80,9 +83,11 @@ public class CorpusInfo {
     
     public final Analyzer taggingAnalyzer;
     
+    protected final ReadWriteLock corpusLock = new ReentrantReadWriteLock();
+    
     protected Reference<TaggerFstCorpus> taggerCorpusRef;
     
-    protected long enqueued = -1;
+    private Future<TaggerFstCorpus> enqueuedCorpus;
     /**
      * Allows to store an error message encountered while loading/creating the
      * FST corpus.
@@ -97,6 +102,7 @@ public class CorpusInfo {
      */
     private boolean creationError = false;
     
+    
     /** 
      * @param language
      * @param indexField
@@ -118,14 +124,10 @@ public class CorpusInfo {
      * Allows to set an error occurring during the creation of 
      * @param message
      */
-    protected void setError(long enqueued, String message){
+    protected void setError(String message){
         this.errorMessage = message;
-        if(message != null){
-            this.creationError = true;
-        }
-        if(this.enqueued == enqueued){
-            this.enqueued = -1;
-        }
+        this.creationError = true; //set unconditionally, also for a null message
+        setCorpus(null); //drops the corpus reference and clears the enqueued Future
     }
     public boolean isFstFile(){
         return fst != null && fst.isFile();
@@ -151,67 +153,127 @@ public class CorpusInfo {
      * @param enqueued the version of the corpus
      * @param corpus the corpus
      */
-    protected final void setCorpus(long enqueued, TaggerFstCorpus corpus) {
-        if(taggerCorpusRef != null){
-            taggerCorpusRef.clear();
-            taggerCorpusRef = null;
+    protected final void setCorpus(final TaggerFstCorpus corpus) {
+        corpusLock.writeLock().lock();
+        try {
+            enqueuedCorpus = null; //clear the future ref
+            if(taggerCorpusRef != null){
+                taggerCorpusRef.clear();
+                taggerCorpusRef = null;
+            }
+            if(corpus != null){
+                //reset any error
+                this.errorMessage = null; 
+                this.creationError = false;
+                //we set the corpus as a weak reference. This allows the
+                //GC to free the corpus earlier.
+                //This is done, because here the corpus was just built and not
+                //yet requested. So we want those to be GCed earlier.
+                taggerCorpusRef = new WeakReference<TaggerFstCorpus>(corpus);
+            }
+        } finally {
+            corpusLock.writeLock().unlock();
         }
+        //Store the newly built FST corpus to disc. A read level lock is sufficient
+        //for this.
+        //NOTE: the WeakReference to the corpus can only be GC'ed after we
+        //      have written the corpus to disc, as we still have a reference
+        //      to corpus!
         if(corpus != null){
-            //reset any error
-            this.errorMessage = null; 
-            this.creationError = false;
-            //we set the corpus as a weak reference. This allows the
-            //GC to free the corpus earlier.
-            //This is done, because here the corpus was just built and not
-            //yet requested. So we want those to be GCed earlier.
-            taggerCorpusRef = new WeakReference<TaggerFstCorpus>(corpus);
-        }
-        //check if the set version is the most current one
-        if(enqueued == this.enqueued){ //if so
-            this.enqueued = -1; //mark this one as up-to-date
+            try {
+                corpusLock.readLock().lock();
+                try { //STANBOL-1177: save FST models in AccessController.doPrivileged(..)
+                    AccessController.doPrivileged(new PrivilegedExceptionAction<Object>() {
+                        public Object run() throws IOException {
+                            if(fst.exists()){
+                                if(!FileUtils.deleteQuietly(fst)){
+                                    log.warn("Unable to delete existing FST file for {}", fst);
+                                }
+                            }
+                            corpus.save(fst);
+                            return null; //not used
+                        }
+                    });
+                } finally {
+                    corpusLock.readLock().unlock();
+                }
+            } catch (PrivilegedActionException pae) {
+                Exception e = pae.getException();
+                if(e instanceof IOException){ //IO Exception while loading the file
+                    log.warn("Unable to store FST corpus to "
+                            + fst.getAbsolutePath() + "!", e);
+                    //if we can not save the FST corpus we replace the WeakReference
+                    //with a SoftReference to avoid frequent rebuilding of the corpus
+                    corpusLock.writeLock().lock();
+                    try {
+                        if(taggerCorpusRef instanceof WeakReference<?>){
+                            taggerCorpusRef.clear();
+                            taggerCorpusRef = new SoftReference<TaggerFstCorpus>(corpus);
+                        }
+                    } finally {
+                        corpusLock.writeLock().unlock(); //release the write lock (was lock(): never released)
+                    }
+                } else { //Runtime exception
+                    throw RuntimeException.class.cast(e);
+                }
+            }
         }
     }
 
     public TaggerFstCorpus getCorpus() {
-        TaggerFstCorpus corpus = taggerCorpusRef == null ? null : taggerCorpusRef.get();
-        if(corpus != null){
-            //on first usage replace a WeakReference with a SoftReference
-            if(taggerCorpusRef instanceof WeakReference<?>){
-                log.debug(" ... convert Weak to Soft Reference for Corpus {}", fst);
-                taggerCorpusRef.clear();
-                taggerCorpusRef = new SoftReference<TaggerFstCorpus>(corpus);
+        TaggerFstCorpus corpus;
+        corpusLock.readLock().lock();
+        try {
+            corpus = taggerCorpusRef == null ? null : taggerCorpusRef.get();
+            if(corpus != null){
+                //on first usage replace a WeakReference with a SoftReference
+                if(taggerCorpusRef instanceof WeakReference<?>){
+                    log.debug(" ... convert Weak to Soft Reference for Corpus {}", fst);
+                    taggerCorpusRef.clear();
+                    taggerCorpusRef = new SoftReference<TaggerFstCorpus>(corpus);
+                }
+            } else if(taggerCorpusRef != null){
+                taggerCorpusRef = null; //reset to null as the reference was taken
             }
-        } else if(taggerCorpusRef != null){
-            taggerCorpusRef = null; //reset to null as the reference was taken
+        } finally {
+            corpusLock.readLock().unlock();
         }
         if(corpus == null) {
             log.info(" ... load FST corpus {}",fst);
+            corpusLock.writeLock().lock();
             try { //STANBOL-1177: load FST models in AccessController.doPrivileged(..)
-                corpus = AccessController.doPrivileged(new PrivilegedExceptionAction<TaggerFstCorpus>() {
-                    public TaggerFstCorpus run() throws IOException {
-                        if(fst.exists() && //if the file exists AND the file was not yet failing to load 
-                                //OR the file is newer as the last version failing to load
-                                (!fstFileError || FileUtils.isFileNewer(fst, fstDate))){
-                            TaggerFstCorpus corpus = TaggerFstCorpus.load(fst);
-                            if(corpus != null){
-                                //I need to set fstDate here, because I can not
-                                //access lastModified() outside doPrivileged
-                                fstDate = new Date(fst.lastModified());
-                                if(log.isInfoEnabled()){
-                                    log.info(" ... loaded FST (date: {})", 
-                                        SimpleDateFormat.getDateTimeInstance().format(fstDate));
+                corpus = taggerCorpusRef == null ? null : taggerCorpusRef.get();
+                if(corpus == null){ //corpus not loaded while waiting for the write lock
+                    corpus = AccessController.doPrivileged(new PrivilegedExceptionAction<TaggerFstCorpus>() {
+                        public TaggerFstCorpus run() throws IOException {
+                            if(fst.exists() && //if the file exists AND the file was not yet failing to load 
+                                    //OR the file is newer as the last version failing to load
+                                    (!fstFileError || FileUtils.isFileNewer(fst, fstDate))){
+                                TaggerFstCorpus corpus = TaggerFstCorpus.load(fst);
+                                if(corpus != null){
+                                    //I need to set fstDate here, because I can not
+                                    //access lastModified() outside doPrivileged
+                                    fstDate = new Date(fst.lastModified());
+                                    if(log.isInfoEnabled()){
+                                        log.info(" ... loaded FST (date: {})", 
+                                            SimpleDateFormat.getDateTimeInstance().format(fstDate));
+                                    }
+                                } else {
+                                    log.warn(" ... no corpus loaded from {}",fst);
                                 }
+                                return corpus;
                             } else {
-                                log.warn(" ... no corpus loaded from {}",fst);
+                                log.warn(" ... unable to load FST from {} (exists: {}, fileError {})",
+                                    new Object[]{fst, fst.exists(),fstFileError});
+                                return null;
                             }
-                            return corpus;
-                        } else {
-                            log.warn(" ... unable to load FST from {} (exists: {}, fileError {})",
-                                new Object[]{fst, fst.exists(),fstFileError});
-                            return null;
                         }
-                    }
-                });
+                    });
+                    if(corpus != null){
+                        fstFileError = false;
+                        taggerCorpusRef = new SoftReference<TaggerFstCorpus>(corpus);
+                    } //else not loaded from file
+                } //else corpus was loaded while waiting for the write lock
             } catch (PrivilegedActionException pae) {
                 Exception e = pae.getException();
                 if(e instanceof IOException){ //IO Exception while loading the file
@@ -223,28 +285,25 @@ public class CorpusInfo {
                 } else { //Runtime exception
                     throw RuntimeException.class.cast(e);
                 }
+            } finally {
+                corpusLock.writeLock().unlock();
             }
-            if(corpus != null){
-                fstFileError = false;
-                taggerCorpusRef = new SoftReference<TaggerFstCorpus>(corpus);
-            } //else not loaded from file
         }
         return corpus;
     }
     /**
-     * Called when a {@link CorpusInfo} object is enqueued for runtime generation.
-     * This is used to prevent multiple FST generation in cases where the
-     * FstInfo is enqueued a 2nd time before the first one was processed.
-     * @return the {@link System#currentTimeMillis() current time} when calling
-     * this method.
-     */
-    protected long enqueue(){
-        enqueued = System.currentTimeMillis();
-        return enqueued;
+     * Called after the corpus was enqueued for rebuilding
+     */
+    protected void enqueued(Future<TaggerFstCorpus> enqueued){
+        this.enqueuedCorpus = enqueued;
     }
-    
-    protected long getEnqueued(){
-        return enqueued;
+    /**
+     * Allows to get the {@link Future} of an ongoing {@link CorpusCreationTask}.
+     * @return returns a {@link Future} that allows to wait for a corpus that is
+     * currently being built. 
+     */
+    public Future<TaggerFstCorpus> getEnqueued(){
+        return enqueuedCorpus;
     }
     
     /**
@@ -255,7 +314,7 @@ public class CorpusInfo {
      * @return <code>true</code> if the FST corpus is enqueued for (re)generation.
      */
     public boolean isEnqueued(){
-        return enqueued > 0;
+        return enqueuedCorpus != null; //a pending Future means a (re)build is enqueued
     }
     
     

Modified: stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java?rev=1731820&r1=1731819&r2=1731820&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java (original)
+++ stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java Tue Feb 23 09:43:03 2016
@@ -28,7 +28,6 @@ import static org.apache.stanbol.enhance
 
 import java.io.IOException;
 import java.util.ArrayList;
-import java.util.Arrays;
 import java.util.Collection;
 import java.util.Collections;
 import java.util.HashSet;
@@ -164,12 +163,17 @@ public class FstLinkingEngine implements
             return CANNOT_ENHANCE;
         }
         //(2) check if we have a FST model for the language
-        if(indexConfig.getCorpus(language) == null &&  //for the language
-        		indexConfig.getDefaultCorpus() == null){ //a default model
-            log.debug("Engine {} ignores ContentItem {} becuase no FST modles for language {} "
-            		+ "are available", new Object[] {getName(), ci.getUri(), language});
-                return CANNOT_ENHANCE;
-        }
+        //NOTE: as of STANBOL-1448 the index configuration is Solr index version
+        //      dependent. This means that we can not use information from the
+        //      current IndexConfiguration to check if we have an FST model for
+        //      the language of the requested document. That information might
+        //      already be outdated.
+//        if(indexConfig.getCorpus(language) == null &&  //for the language
+//        		indexConfig.getDefaultCorpus() == null){ //a default model
+//            log.debug("Engine {} ignores ContentItem {} becuase no FST modles for language {} "
+//            		+ "are available", new Object[] {getName(), ci.getUri(), language});
+//                return CANNOT_ENHANCE;
+//        }
         // we need a detected language, the AnalyzedText contentPart with
         // Tokens.
         AnalysedText at = AnalysedTextUtils.getAnalysedText(ci);

Modified: stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java?rev=1731820&r1=1731819&r2=1731820&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java (original)
+++ stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java Tue Feb 23 09:43:03 2016
@@ -46,6 +46,7 @@ import java.util.Map;
 import java.util.Set;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
 
 import org.apache.clerezza.rdf.core.Literal;
 import org.apache.clerezza.rdf.core.Resource;
@@ -79,6 +80,7 @@ import org.apache.stanbol.enhancer.nlp.n
 import org.apache.stanbol.enhancer.nlp.utils.LanguageConfiguration;
 import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
 import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
+import org.opensextant.solrtexttagger.TaggerFstCorpus;
 import org.osgi.framework.BundleContext;
 import org.osgi.framework.Constants;
 import org.osgi.framework.InvalidSyntaxException;
@@ -760,7 +762,7 @@ public class FstLinkingEngineComponent {
                 }
                 //File fstDir = new File(dataDir,"fst");
                 //now collect the FST configuration
-                indexConfig = new IndexConfiguration(fstConfig, core, fieldEncoding);
+                indexConfig = new IndexConfiguration(fstConfig, core, fieldEncoding, entityLinkerConfig.getDefaultLanguage());
                 indexConfig.setTypeField(solrTypeField);
                 indexConfig.setRankingField(solrRankingField);
                 //set fields parsed in the activate method
@@ -778,48 +780,40 @@ public class FstLinkingEngineComponent {
                 if(skipAltTokensConfig != null){
                     indexConfig.setSkipAltTokens(skipAltTokensConfig);
                 }
-                //create a new searcher for creating FSTs
-                if(!indexConfig.activate()){
-                    log.warn("Processing of the FST configuration was not successfull "
-                        + "for any language. See WARN level loggings for more details!");
-                    log.warn("  ... FstLinkingEnigne wiht name {} will be registered but"
-                        + "be inactive as there seam to be no data for linking available" 
-                        + "in the SolrCore {} (dir: {})", 
-                        new Object []{engineName, core.getName(), 
-                                core.getCoreDescriptor().getInstanceDir()});
-                } else { //some FST corpora initialised
-                    if(log.isInfoEnabled()){ //log the initialised languages
-                        Set<String> langSet = new HashSet<String>(indexConfig.getCorpusLanguages());
-                        if(langSet.remove(null)){ //replace the null for the default language
-                            langSet.add(""); //with an empty string
-                        }
-                        String[] langArray = langSet.toArray(new String[langSet.size()]);
-                        Arrays.sort(langArray,String.CASE_INSENSITIVE_ORDER);
-                        log.info(" ... initialised FST corpora for languages {}",
-                            Arrays.toString(langArray));
+                //activate the index configuration
+                try {
+                    //this will init the FST directory if necessary so we might run
+                    //into IOExceptions
+                    indexConfig.activate(); 
+                } catch (IOException e) {
+                    throw new RuntimeException("Unable to activate Index for FST Linking Engine '"
+                        + engineName +"' (solrCore: "+ core.getName() + ", instanceDir: "
+                        + core.getCoreDescriptor().getInstanceDir() +")!", e);
+                }
+                if(log.isInfoEnabled()){ //log the initialised languages
+                    Set<String> langSet = new HashSet<String>(indexConfig.getCorpusLanguages());
+                    if(langSet.remove(null)){ //replace the null for the default language
+                        langSet.add(""); //with an empty string
                     }
+                    String[] langArray = langSet.toArray(new String[langSet.size()]);
+                    Arrays.sort(langArray,String.CASE_INSENSITIVE_ORDER);
+                    log.info(" ... initialised FST corpora for languages {}",
+                        Arrays.toString(langArray));
                 }
                 //check if we need to create some FST files
                 for(CorpusInfo fstInfo : indexConfig.getCorpora()){
                     //check if the fst does not exist and the fstInfo allows creation
                     if(!fstInfo.fst.exists() && fstInfo.allowCreation){
                         //create a task on the FST corpus creation service
-                        fstCreatorService.execute(new CorpusCreationTask(indexConfig, fstInfo));
+                        fstInfo.corpusLock.writeLock().lock();
+                        try {
+                            Future<TaggerFstCorpus> enqueued = fstCreatorService.submit(new CorpusCreationTask(indexConfig, fstInfo));
+                            fstInfo.enqueued(enqueued);
+                        } finally {
+                            fstInfo.corpusLock.writeLock().unlock();
+                        }
                     }
                 }
-                //set the default linking corpora
-                String defaultLanguage = entityLinkerConfig.getDefaultLanguage();
-                if(defaultLanguage == null){
-                    defaultLanguage = ""; //FST uses an empty string for the default
-                }
-                CorpusInfo defaultCoprous = indexConfig.getCorpus(defaultLanguage);
-                if(defaultCoprous != null){
-                    log.info(" ... set '{}' as default FST Corpus: {}", defaultCoprous.language, defaultCoprous);
-                    indexConfig.setDefaultCorpus(defaultCoprous);
-                } else {
-                    log.info("  ... no corpus for default language {} available", defaultCoprous);
-                }
-                //create the new configuration
                 
                 //set the newly configured instances to the fields
                 this.indexConfig = indexConfig;

Modified: stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/IndexConfiguration.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/IndexConfiguration.java?rev=1731820&r1=1731819&r2=1731820&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/IndexConfiguration.java (original)
+++ stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/IndexConfiguration.java Tue Feb 23 09:43:03 2016
@@ -18,6 +18,7 @@ package org.apache.stanbol.enhancer.engi
 
 import java.io.File;
 import java.io.IOException;
+import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Collections;
 import java.util.HashMap;
@@ -26,6 +27,8 @@ import java.util.Iterator;
 import java.util.Map;
 import java.util.Set;
 import java.util.concurrent.ExecutorService;
+import java.util.concurrent.locks.ReadWriteLock;
+import java.util.concurrent.locks.ReentrantReadWriteLock;
 
 import org.apache.clerezza.rdf.core.Literal;
 import org.apache.clerezza.rdf.core.Resource;
@@ -39,6 +42,7 @@ import org.apache.lucene.document.Docume
 import org.apache.lucene.index.AtomicReader;
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.search.IndexSearcher;
 import org.apache.solr.core.SolrCore;
 import org.apache.solr.schema.FieldType;
 import org.apache.solr.schema.IndexSchema;
@@ -78,9 +82,13 @@ public class IndexConfiguration {
     private String rankingField;
 
     /**
+     * Used to sync access to {@link #corpusInfos}
+     */
+    private ReadWriteLock corpusInfoLock = new ReentrantReadWriteLock();
+    /**
      * FST corpus configuration
      */
-    private Map<String,CorpusInfo> corpusInfos = new HashMap<String,CorpusInfo>();
+    private Map<String,CorpusInfo> corpusInfos;
     /**
      * {@link ExecutorService} used to create {@link TaggerFstCorpus} instances
      * at runtime.
@@ -97,15 +105,20 @@ public class IndexConfiguration {
      */
     private EntityCacheManager entityCacheManager;
     
+    private final LanguageConfiguration fstConfig;
+    
     /**
-     * The FST corpus used for linking regardless of the language of the
-     * document
+     * If runtime generation is enabled by default (Note: explicitly configured
+     * languages might override this)
      */
-    private CorpusInfo defaultFstCorpus;
-   
-    private final LanguageConfiguration fstConfig;
+    private final boolean runtimeGeneration;
 
+    /**
+     * used to track if this index configuration is active
+     */
     private boolean active = false;
+    
+    private long indexVersion = -1;
 
     private File fstDirectory;
     
@@ -122,6 +135,11 @@ public class IndexConfiguration {
      * tokens should cause an {@link UnsupportedTokenException}.
      */
     private boolean skipAltTokens;
+
+    /**
+     * The default language
+     */
+    private String defaultLanguage;
     /**
      * If alternate tokens (<code>posInc == 0</code>) can be skipped or if such
      * tokens should cause an {@link UnsupportedTokenException}.
@@ -206,7 +224,7 @@ public class IndexConfiguration {
      */
     public static final String PARAM_FST = "fst";
     
-    public IndexConfiguration(LanguageConfiguration fstConfig, SolrCore index, FieldEncodingEnum fieldEncoding){
+    public IndexConfiguration(LanguageConfiguration fstConfig, SolrCore index, FieldEncodingEnum fieldEncoding, String defaultLanguage){
         if(fstConfig == null){
             throw new IllegalArgumentException("The parsed FST configuration MUST NOT be NULL!");
         }
@@ -214,6 +232,14 @@ public class IndexConfiguration {
         if(index == null || index.isClosed()){
             throw new IllegalArgumentException("The parsed SolrCore MUST NOT be NULL nore closed!");
         }
+        //check if we have runtime generation
+        String allowCreationString = fstConfig.getDefaultParameters().get(IndexConfiguration.PARAM_RUNTIME_GENERATION);
+        if(allowCreationString == null){
+            runtimeGeneration = IndexConfiguration.DEFAULT_RUNTIME_GENERATION;
+        } else {
+            runtimeGeneration = Boolean.parseBoolean(allowCreationString);
+        }
+        
         this.index = index;
         if(fieldEncoding == null){
             fieldEncoding = FieldEncodingEnum.None;
@@ -226,23 +252,16 @@ public class IndexConfiguration {
         } else {
             this.skipAltTokens = false;
         }
+        this.defaultLanguage = defaultLanguage == null ? "" : defaultLanguage;
     }
     
-    public CorpusInfo setDefaultCorpus(CorpusInfo corpus){
-        CorpusInfo oldDefault = defaultFstCorpus;
-        if(corpus != null){
-            this.defaultFstCorpus = corpus;
-        } else {
-            this.defaultFstCorpus = null;
-        }
-        return oldDefault;
-    }
-    
-    protected CorpusInfo addCorpus(CorpusInfo corpus){
+    /**
+     * Assumed to be called in a write lock on {@link #corpusInfoLock}
+     * @param corpus
+     */
+    private void addCorpusInfo(CorpusInfo corpus){
         if(corpus != null){
-            return corpusInfos.put(corpus.language, corpus);
-        } else {
-            return null;
+            corpusInfos.put(corpus.language, corpus);
         }
     }
     
@@ -303,6 +322,13 @@ public class IndexConfiguration {
             FieldEncodingEnum.encodeFloat(rankingField, fieldEncoding);
     }
     /**
+     * The version of the {@link #getIndex()} this configuration was built for.
+     * @return the index version this configuration was built for.
+     */
+    public long getVersion() {
+        return indexVersion;
+    }
+    /**
      * Returns the CorpusInfo for the parsed language. If the language has an
      * extension (e.g. en-US) it first tires to load the corpus for the exact
      * match and falls back to the main lanugage (en) if such a corpus does not
@@ -311,32 +337,52 @@ public class IndexConfiguration {
      * @return the corpus information or <code>null</code> if not present
      */
     public CorpusInfo getCorpus(String language) {
-        CorpusInfo langCorpusInfo =  corpusInfos.get(language);
-        if(langCorpusInfo == null && language.indexOf('-') > 0){
-        	String rootLang = language.substring(0,language.indexOf('-'));
-        	log.debug(" - no FST corpus for {}. Fallback to {}", language,rootLang);
-        	langCorpusInfo =  corpusInfos.get(rootLang);
+        corpusInfoLock.readLock().lock();
+        try {
+            CorpusInfo langCorpusInfo =  corpusInfos.get(language);
+            if(langCorpusInfo == null && language.indexOf('-') > 0){
+            	String rootLang = language.substring(0,language.indexOf('-'));
+            	log.debug(" - no FST corpus for {}. Fallback to {}", language,rootLang);
+            	langCorpusInfo =  corpusInfos.get(rootLang);
+            }
+            return langCorpusInfo;
+        } finally {
+            corpusInfoLock.readLock().unlock();
         }
-        return langCorpusInfo;
     }
     /**
      * Getter for the languages of all configured FST corpora
-     * @return the languages of all configured FST corpora
+     * @return a read-only copy of the languages of all configured FST corpora
      */
     public Set<String> getCorpusLanguages(){
-        return Collections.unmodifiableSet(corpusInfos.keySet());
+        return Collections.unmodifiableSet(new HashSet<String>(corpusInfos.keySet()));
     }
     /**
      * Read-only collection of all {@link CorpusInfo}s defined for this
      * configuration.
-     * @return
+     * @return a read only copy of the current {@link CorpusInfo}s
      */
     public Collection<CorpusInfo> getCorpora(){
-        return Collections.unmodifiableCollection(corpusInfos.values());
+        corpusInfoLock.readLock().lock();
+        try {
+            return Collections.unmodifiableCollection(new ArrayList<CorpusInfo>(corpusInfos.values()));
+        } finally {
+            corpusInfoLock.readLock().unlock();
+        }
     }
     
+    /**
+     * The {@link CorpusInfo} for the default language
+     * @return the default corpus or <code>null</code> if no corpus is available
+     * for the default language
+     */
     public CorpusInfo getDefaultCorpus() {
-        return defaultFstCorpus;
+        corpusInfoLock.readLock().lock();
+        try {
+            return corpusInfos.get(defaultLanguage);
+        } finally {
+            corpusInfoLock.readLock().unlock();
+        }
     }
 
     public void setExecutorService(ExecutorService executorService) {
@@ -400,252 +446,357 @@ public class IndexConfiguration {
     }
     
     /**
-     * If this {@link IndexConfiguration} is still active
+     * If this {@link IndexConfiguration} is still in sync with the version
+     * of the {@link #getIndex() SolrCore}. This will return true if 
+     * <code>{@link #isRuntimeGeneration()} == false </code>
      * @return <code>true</code> if still active. Otherwise <code>false</code>
      */
-    public boolean isActive(){
-        return active;
+    public boolean isCurrent(){
+        if(!runtimeGeneration){
+            return true;
+        } else {
+            RefCounted<SolrIndexSearcher> searcherRef = index.getSearcher();
+            try {
+                long version = searcherRef.get().getIndexReader().getVersion();
+                return indexVersion == version;
+            } finally {
+                searcherRef.decref();
+            }
+        }
     }
-    /**
-     * Activated this indexing configuration by inspecting the {@link SolrCore}
-     * based on the provided configuration 
-     * @return
-     */
-    public boolean activate() {
-        active = true;
+
+    private long getIndexVersion(){
         RefCounted<SolrIndexSearcher> searcherRef = index.getSearcher();
         try {
-            return processFstConfig(searcherRef.get().getAtomicReader());
-        }catch (RuntimeException e) { //in case of any excpetion
-            throw e; //re-throw 
-        } catch (IOException e) {
-            throw new IllegalStateException("Unable to activate IndexConfiguration", e);
+            return getIndexVersion(searcherRef.get());
         } finally {
-            searcherRef.decref(); //decrease the count on the searcher
+            searcherRef.decref();
         }
     }
+    
+    private long getIndexVersion(SolrIndexSearcher searcher){
+            return searcher.getIndexReader().getVersion();
+    }
+    
+    public boolean isRuntimeGeneration() {
+        return runtimeGeneration;
+    }
+    
+    public boolean isActive() {
+        return active;
+    }
+    
     /**
-     * This method combines the {@link #fstConfig} with the data present in the
-     * {@link SolrCore}.
-     * @param indexReader The {@link AtomicReader} has access to the actual
-     * fields present in the {@link SolrCore}. It is used to compare field
-     * configurations in the {@link #fstConfig} with fields present in the solr
-     * index.
-     * @return if any FST configuration was successfully processed
+     * Activates this indexing configuration by inspecting the {@link SolrCore}
+     * based on the provided configuration.
+     * @throws IOException if the FST directory can not be initialised
      */
-    private boolean processFstConfig(AtomicReader indexReader) throws IOException {
-        if(index == null){
+    public void activate() throws IOException {
+        active = true;
+        if(index == null){ //do we have an SolrCore
             throw new IllegalArgumentException("No SolrCore set for this configuration");
         }
-        if(fstDirectory == null){
+        //if no fstDirectory is configured
+        if(fstDirectory == null){ //use the default
             fstDirectory = new File(index.getDataDir(),"fst");
         }
-        log.debug("> process FST config for {} (FST dir: {})", index.getName(),
-            fstDirectory.getAbsolutePath());
-        //init the fstDirectory
+        //init the fstDirectory (may throw IOException)
         if(fstDirectory.isFile()){
             throw new IOException("Default FST directory exists and "
                     + "is a File. Use #setFstDirectory() to set different one");
         } else if(!fstDirectory.exists()){
             FileUtils.forceMkdir(fstDirectory);
         }
-        IndexSchema schema = index.getLatestSchema();
-        boolean foundCorpus = false;
-        //(0) get basic parameters of the default configuration
-        log.debug(" - default config");
-        Map<String,String> defaultParams = fstConfig.getDefaultParameters();
-        String fstName = defaultParams.get(IndexConfiguration.PARAM_FST);
-        String indexField = defaultParams.get(IndexConfiguration.PARAM_FIELD);
-        String storeField = defaultParams.get(IndexConfiguration.PARAM_STORE_FIELD);
-        if(storeField == null){ 
-            //apply indexField as default if indexField is NOT NULL
-            storeField = indexField;
-        }
-        if(indexField == null){ //apply the defaults if null
-            indexField = IndexConfiguration.DEFAULT_FIELD;
-        }
-        if(fstName == null){ //use default
-            fstName = getDefaultFstFileName(indexField);
+        //acquire the initial index configuration
+        update();
+    }
+    
+    /**
+     * Updates the configuration based on the  current version of the 
+     * {@link #getIndex()}. If the SolrCore was not updated this will do
+     * nothing.
+     */
+    public void update(){
+        RefCounted<SolrIndexSearcher> searcherRef = index.getSearcher();
+        try {
+            update(getIndexVersion(searcherRef.get()), searcherRef.get());
+        } finally {
+            searcherRef.decref(); //decrease the count on the searcher
         }
-        final boolean allowCreation;
-        String allowCreationString = defaultParams.get(IndexConfiguration.PARAM_RUNTIME_GENERATION);
-        if(allowCreationString == null){
-            allowCreation = IndexConfiguration.DEFAULT_RUNTIME_GENERATION;
-        } else {
-            allowCreation = Boolean.parseBoolean(allowCreationString);
+    }
+    /**
+     * Version of {@link #update()} to be used in cases where the indexVersion
+     * and a Solr searcher are already available in the calling method
+     * @param indexVersion
+     * @param searcher
+     */
+    protected void update(long indexVersion, SolrIndexSearcher searcher){
+        assert searcher != null;
+        assert searcher.getCore().equals(index);
+        processFstConfig(indexVersion, searcher.getAtomicReader());
+    }
+    
+    /**
+     * This method combines the {@link #fstConfig} with the data present in the
+     * {@link SolrCore}. 
+     * <p>
+     * As information for fields are only available when a
+     * field was actually used by a document stored in the index one needs to
+     * inspect the index after every change. 
+     * <p>
+     * An empty Solr index will result in
+     * an empty {@link #corpusInfos} map. The first document with a value
+     * for the English field will cause a {@link CorpusInfo} for the English
+     * language to be created. As soon as the last document with a label for
+     * a given language is deleted the {@link CorpusInfo} for that language
+     * will also disappear.
+     * @param indexVersion the current version of the {@link #index} to process
+     * the FST config for.
+     * <p>
+     * This method acquires a write lock on {@link #corpusInfoLock} while it
+     * inspects the Solr index
+     * @param indexReader The {@link AtomicReader} has access to the actual
+     * fields present in the {@link SolrCore}. It is used to compare field
+     * configurations in the {@link #fstConfig} with fields present in the Solr
+     * {@link #index}.
+     * @return If any {@link CorpusInfo FST configuration} was found while
+     * inspecting the Solr {@link #index}
+     */
+    private boolean processFstConfig(long indexVersion, AtomicReader indexReader) {
+        //first check if the Solr index was updated
+        corpusInfoLock.readLock().lock();
+        try {
+            if(indexVersion == this.indexVersion){ //no update?
+                return !corpusInfos.isEmpty(); //nothing to do
+            }
+        } finally {
+            corpusInfoLock.readLock().unlock();
         }
-        //This are all fields actually present in the index (distinguished with
-        //those defined in the schema). This also includes actual instances of
-        //dynamic field definition in the schema.
-        FieldInfos fieldInfos = indexReader.getFieldInfos(); //we need this twice
+        log.debug("> {} FST config for {} (FST dir: {})", 
+            corpusInfos == null ? "create" : "update",
+            index.getName(), fstDirectory.getAbsolutePath());
         
-        //(1) in case the fstConfig uses a wildcard we need to search for
-        //    languages present in the SolrIndex. For that we use the indexReader
-        //    to get the FieldInfos and match them against FST files in the FST
-        //    directory and FieldType definitions in the schema of the SolrCore
-        //NOTE: this needs only do be done if wildcards are enabled in the fstConfig
-        if(fstConfig.useWildcard()){ 
-            //(1.a) search for present FST files in the FST directory
-            Map<String,File> presentFstFiles = new HashMap<String,File>();
-            WildcardFileFilter fstFilter = new WildcardFileFilter(
-                fstName+".*.fst");
-            Iterator<File> fstFiles = FileUtils.iterateFiles(fstDirectory, fstFilter, null);
-            while(fstFiles.hasNext()){
-                File fstFile = fstFiles.next();
-                String fstFileName = fstFile.getName();
-                //files are named such as "{name}.{lang}.fst"
-                String language = FilenameUtils.getExtension(
-                    FilenameUtils.getBaseName(fstFileName));
-                presentFstFiles.put(language, fstFile);
+        boolean foundCorpus = false;
+
+        corpusInfoLock.writeLock().lock();
+        try {
+            this.indexVersion = indexVersion;
+            IndexSchema schema = index.getLatestSchema();
+            Map<String,CorpusInfo> corpusInfosCopy;
+            if(corpusInfos == null){ //first call
+                corpusInfos = new HashMap<String,CorpusInfo>(); //init the field
+                corpusInfosCopy = new HashMap<String,CorpusInfo>();
+            } else {
+                corpusInfosCopy = new HashMap<String,CorpusInfo>(corpusInfos);
+                corpusInfos.clear(); //clear the old data
             }
-            //(1.b) iterate over the fields in the Solr index and search for 
-            //      matches against the configured indexField name
-            String fieldWildcard = FieldEncodingEnum.encodeLanguage(indexField,
-                fieldEncoding, "*");
-            for(FieldInfo fieldInfo : fieldInfos){
-                //try to match the field names against the wildcard
-                if(FilenameUtils.wildcardMatch(fieldInfo.name, fieldWildcard)){
-                    //for matches parse the language from the field name
-                    String language = FieldEncodingEnum.parseLanguage(
-                        fieldInfo.name, fieldEncoding, indexField);
-                    if(language != null && //successfully parsed language
-                            //is current language is enabled? 
-                            fstConfig.isLanguage(language) &&
-                            //is there no explicit configuration for this language?
-                            !fstConfig.getExplicitlyIncluded().contains(language)){
-                        //generate the FST file name
-                        StringBuilder fstFileName = new StringBuilder(fstName);
-                        if(!language.isEmpty()){
-                            fstFileName.append('.').append(language);
-                        }
-                        fstFileName.append(".fst");
-                        File fstFile = new File(fstDirectory,fstFileName.toString());
-                        //get the FieldType of the field from the Solr schema
-                        FieldType fieldType = schema.getFieldTypeNoEx(fieldInfo.name);
-                        if(fieldType != null){ //if the fieldType is present
-                            if(allowCreation || fstFile.isFile()){ //and FST is present or can be created
-                                //we need also to check if the stored field with
-                                //the labels is present
-                                //get the stored Field and check if it is present!
-                                String storeFieldName;
-                                if(storeField == null){ //storeField == indexField
-                                    storeFieldName = fieldInfo.name;
-                                } else { // check that the storeField is present in the index
-                                    storeFieldName = FieldEncodingEnum.encodeLanguage(
-                                        storeField, fieldEncoding, language);
-                                    FieldInfo storedFieldInfos = fieldInfos.fieldInfo(storeFieldName);
-                                    if(storedFieldInfos == null){
-                                        log.warn(" ... ignore language {} because Stored Field {} "
-                                                + "for IndexField {} does not exist! ", new Object[]{
-                                                language,storeFieldName,fieldInfo.name});
-                                        storeFieldName = null;
+            //(0) get basic parameters of the default configuration
+            log.debug(" - default config");
+            Map<String,String> defaultParams = fstConfig.getDefaultParameters();
+            String fstName = defaultParams.get(IndexConfiguration.PARAM_FST);
+            String indexField = defaultParams.get(IndexConfiguration.PARAM_FIELD);
+            String storeField = defaultParams.get(IndexConfiguration.PARAM_STORE_FIELD);
+            if(storeField == null){ 
+                //apply indexField as default if indexField is NOT NULL
+                storeField = indexField;
+            }
+            if(indexField == null){ //apply the defaults if null
+                indexField = IndexConfiguration.DEFAULT_FIELD;
+            }
+            if(fstName == null){ //use default
+                fstName = getDefaultFstFileName(indexField);
+            }
+            //This are all fields actually present in the index (distinguished with
+            //those defined in the schema). This also includes actual instances of
+            //dynamic field definition in the schema.
+            FieldInfos fieldInfos = indexReader.getFieldInfos(); //we need this twice
+            
+            //(1) in case the fstConfig uses a wildcard we need to search for
+            //    languages present in the SolrIndex. For that we use the indexReader
+            //    to get the FieldInfos and match them against FST files in the FST
+            //    directory and FieldType definitions in the schema of the SolrCore
+            //NOTE: this needs only do be done if wildcards are enabled in the fstConfig
+            if(fstConfig.useWildcard()){ 
+                //(1.a) search for present FST files in the FST directory
+                Map<String,File> presentFstFiles = new HashMap<String,File>();
+                WildcardFileFilter fstFilter = new WildcardFileFilter(
+                    fstName+".*.fst");
+                Iterator<File> fstFiles = FileUtils.iterateFiles(fstDirectory, fstFilter, null);
+                while(fstFiles.hasNext()){
+                    File fstFile = fstFiles.next();
+                    String fstFileName = fstFile.getName();
+                    //files are named such as "{name}.{lang}.fst"
+                    String language = FilenameUtils.getExtension(
+                        FilenameUtils.getBaseName(fstFileName));
+                    presentFstFiles.put(language, fstFile);
+                }
+                //(1.b) iterate over the fields in the Solr index and search for 
+                //      matches against the configured indexField name
+                String fieldWildcard = FieldEncodingEnum.encodeLanguage(indexField,
+                    fieldEncoding, "*");
+                for(FieldInfo fieldInfo : fieldInfos){
+                    //try to match the field names against the wildcard
+                    if(FilenameUtils.wildcardMatch(fieldInfo.name, fieldWildcard)){
+                        //for matches parse the language from the field name
+                        String language = FieldEncodingEnum.parseLanguage(
+                            fieldInfo.name, fieldEncoding, indexField);
+                        if(language != null && //successfully parsed language
+                                //is current language is enabled? 
+                                fstConfig.isLanguage(language) &&
+                                //is there no explicit configuration for this language?
+                                !fstConfig.getExplicitlyIncluded().contains(language)){
+                            //generate the FST file name
+                            StringBuilder fstFileName = new StringBuilder(fstName);
+                            if(!language.isEmpty()){
+                                fstFileName.append('.').append(language);
+                            }
+                            fstFileName.append(".fst");
+                            File fstFile = new File(fstDirectory,fstFileName.toString());
+                            //get the FieldType of the field from the Solr schema
+                            FieldType fieldType = schema.getFieldTypeNoEx(fieldInfo.name);
+                            if(fieldType != null){ //if the fieldType is present
+                                if(runtimeGeneration || fstFile.isFile()){ //and FST is present or can be created
+                                    //we need also to check if the stored field with
+                                    //the labels is present
+                                    //get the stored Field and check if it is present!
+                                    String storeFieldName;
+                                    if(storeField == null){ //storeField == indexField
+                                        storeFieldName = fieldInfo.name;
+                                    } else { // check that the storeField is present in the index
+                                        storeFieldName = FieldEncodingEnum.encodeLanguage(
+                                            storeField, fieldEncoding, language);
+                                        FieldInfo storedFieldInfos = fieldInfos.fieldInfo(storeFieldName);
+                                        if(storedFieldInfos == null){
+                                            log.debug(" ... ignore language {} because Stored Field {} "
+                                                    + "for IndexField {} does not exist! ", new Object[]{
+                                                    language,storeFieldName,fieldInfo.name});
+                                            storeFieldName = null;
+                                        }
+                                        
                                     }
-                                    
-                                }
-                                if(storeFieldName != null){ // == valid configuration
-                                    CorpusInfo fstInfo = new CorpusInfo(language, 
-                                        fieldInfo.name, storeFieldName,  
-                                        fieldType, fstFile, allowCreation);
-                                    log.debug(" ... init {} ", fstInfo);
-                                    addCorpus(fstInfo);
-                                    foundCorpus = true;
+                                    if(storeFieldName != null){ // == valid configuration
+                                        CorpusInfo fstInfo = corpusInfosCopy.get(language);
+                                        if(fstInfo == null || //new one
+                                                !fstInfo.indexedField.equals(fieldInfo.name) || //index field compatible
+                                                !fstInfo.storedField.equals(storeFieldName)){ //store field compatible
+                                            CorpusInfo newFstInfo = new CorpusInfo(language, 
+                                                fieldInfo.name, storeFieldName,  
+                                                fieldType, fstFile, runtimeGeneration);
+                                            log.debug(" ... {} {} ", fstInfo == null ? "create" : "update", newFstInfo);
+                                            addCorpusInfo(newFstInfo);
+                                            corpusInfosCopy.put(language, newFstInfo);
+                                        } else { //no change in the SolrIndex ... use the existing CorpusInfo
+                                            addCorpusInfo(fstInfo);
+                                        }
+                                        foundCorpus = true;
+                                    }
+                                } else {
+                                    log.debug(" ... ignore language {} (field: {}) because "
+                                        + "FST file '{}' does not exist and runtime creation "
+                                        + "is deactivated!",new Object[]{ language,
+                                                fieldInfo.name, fstFile.getAbsolutePath()});
                                 }
                             } else {
-                                log.warn(" ... ignore language {} (field: {}) because "
-                                    + "FST file '{}' does not exist and runtime creation "
-                                    + "is deactivated!",new Object[]{ language,
-                                            fieldInfo.name, fstFile.getAbsolutePath()});
+                                log.debug(" ... ignore language {} becuase unknown fieldtype "
+                                    + "for SolrFied {}",language,fieldInfo.name);
                             }
-                        } else {
-                            log.warn(" ... ignore language {} becuase unknown fieldtype "
-                                + "for SolrFied {}",language,fieldInfo.name);
+                        } //else the field matched the wildcard, but has not passed the
+                        //encoding test.
+                    } //Solr field does not match the field definition in the config
+                } // end iterate over all fields in the SolrIndex
+            } //else Wildcard not enabled in the fstConfig
+            
+            //(2) process explicit configuration for configured languages
+            for(String language : fstConfig.getExplicitlyIncluded()){
+                //(2.a) get the language specific config (with fallback to default)
+                Map<String,String> config = fstConfig.getParameters(language);
+                String langIndexField = config.get(IndexConfiguration.PARAM_FIELD);
+                String langStoreField = config.get(IndexConfiguration.PARAM_STORE_FIELD);
+                String langFstFileName = config.get(IndexConfiguration.PARAM_FST);
+                final boolean langAllowCreation;
+                final String langAllowCreationString = config.get(IndexConfiguration.PARAM_RUNTIME_GENERATION);
+                if(langIndexField != null){
+                    //also consider explicit field names as default for the fst name
+                    if(langFstFileName == null){
+                        StringBuilder fileName = new StringBuilder(
+                            getDefaultFstFileName(langIndexField));
+                        if(!language.isEmpty()){
+                            fileName.append('.').append(language);
                         }
-                    } //else the field matched the wildcard, but has not passed the
-                    //encoding test.
-                } //Solr field does not match the field definition in the config
-            } // end iterate over all fields in the SolrIndex
-        } //else Wildcard not enabled in the fstConfig
-        
-        //(2) process explicit configuration for configured languages
-        for(String language : fstConfig.getExplicitlyIncluded()){
-            //(2.a) get the language specific config (with fallback to default)
-            Map<String,String> config = fstConfig.getLanguageParams(language);
-            String langIndexField = config.get(IndexConfiguration.PARAM_FIELD);
-            String langStoreField = config.get(IndexConfiguration.PARAM_STORE_FIELD);
-            String langFstFileName = config.get(IndexConfiguration.PARAM_FST);
-            final boolean langAllowCreation;
-            final String langAllowCreationString = config.get(IndexConfiguration.PARAM_RUNTIME_GENERATION);
-            if(langIndexField != null){
-                //also consider explicit field names as default for the fst name
-                if(langFstFileName == null){
-                    StringBuilder fileName = new StringBuilder(
-                        getDefaultFstFileName(langIndexField));
-                    if(!language.isEmpty()){
-                        fileName.append('.').append(language);
+                        fileName.append(".fst");
+                        langFstFileName = fileName.toString();
                     }
-                    fileName.append(".fst");
-                    langFstFileName = fileName.toString();
+                } else {
+                    langIndexField = indexField;
                 }
-            } else {
-                langIndexField = indexField;
-            }
-            if(langStoreField == null){ //fallbacks
-                if(storeField != null){ //first to default store field
-                    langStoreField = storeField;
-                } else { //else to the lang index field
-                    langStoreField = langIndexField;
+                if(langStoreField == null){ //fallbacks
+                    if(storeField != null){ //first to default store field
+                        langStoreField = storeField;
+                    } else { //else to the lang index field
+                        langStoreField = langIndexField;
+                    }
                 }
-            }
-            if(langFstFileName == null){ //no fstFileName config
-                // ... use the default
-                langFstFileName = new StringBuilder(fstName).append('.')
-                        .append(language).append(".fst").toString(); 
-            }
-            if(langAllowCreationString != null){
-                langAllowCreation = Boolean.parseBoolean(langAllowCreationString);
-            } else {
-                langAllowCreation = allowCreation;
-            }
-            //(2.b) check if the Solr field is present
-            String encodedLangIndexField = FieldEncodingEnum.encodeLanguage(
-                langIndexField, fieldEncoding, language);
-            String encodedLangStoreField = FieldEncodingEnum.encodeLanguage(
-                langStoreField, fieldEncoding, language);
-            FieldInfo langIndexFieldInfo = fieldInfos.fieldInfo(encodedLangIndexField);
-            if(langIndexFieldInfo != null){
-                FieldInfo langStoreFieldInfo = fieldInfos.fieldInfo(encodedLangStoreField);
-                if(langStoreFieldInfo != null){
-                    FieldType fieldType = schema.getFieldTypeNoEx(langIndexFieldInfo.name);
-                    if(fieldType != null){
-                        //(2.c) check the FST file
-                        File langFstFile = new File(fstDirectory,langFstFileName);
-                        if(langFstFile.isFile() || langAllowCreation){
-                            CorpusInfo langFstInfo = new CorpusInfo(language, 
-                                encodedLangIndexField,encodedLangStoreField,
-                                fieldType, langFstFile, langAllowCreation);
-                            log.debug("   ... add {} for explicitly configured language", langFstInfo);
-                            addCorpus(langFstInfo);
-                            foundCorpus = true;
+                if(langFstFileName == null){ //no fstFileName config
+                    // ... use the default
+                    langFstFileName = new StringBuilder(fstName).append('.')
+                            .append(language).append(".fst").toString(); 
+                }
+                if(langAllowCreationString != null){
+                    langAllowCreation = Boolean.parseBoolean(langAllowCreationString);
+                } else {
+                    langAllowCreation = runtimeGeneration;
+                }
+                //(2.b) check if the Solr field is present
+                String encodedLangIndexField = FieldEncodingEnum.encodeLanguage(
+                    langIndexField, fieldEncoding, language);
+                String encodedLangStoreField = FieldEncodingEnum.encodeLanguage(
+                    langStoreField, fieldEncoding, language);
+                FieldInfo langIndexFieldInfo = fieldInfos.fieldInfo(encodedLangIndexField);
+                if(langIndexFieldInfo != null){
+                    FieldInfo langStoreFieldInfo = fieldInfos.fieldInfo(encodedLangStoreField);
+                    if(langStoreFieldInfo != null){
+                        FieldType fieldType = schema.getFieldTypeNoEx(langIndexFieldInfo.name);
+                        if(fieldType != null){
+                            //(2.c) check the FST file
+                            File langFstFile = new File(fstDirectory,langFstFileName);
+                            if(langFstFile.isFile() || langAllowCreation){
+                                CorpusInfo langFstInfo = corpusInfosCopy.get(language);
+                                if(langFstInfo == null || //new one
+                                        !langFstInfo.indexedField.equals(encodedLangIndexField) || //index field compatible
+                                        !langFstInfo.storedField.equals(encodedLangStoreField)){ //store field compatible
+                                    CorpusInfo newLangFstInfo = new CorpusInfo(language, 
+                                        encodedLangIndexField,encodedLangStoreField,
+                                        fieldType, langFstFile, langAllowCreation);
+                                    log.debug("   ... {} {} for explicitly configured language", 
+                                        langFstInfo == null ? "create" : "update", newLangFstInfo);
+                                    addCorpusInfo(newLangFstInfo);
+                                } else { //we can use the existing instance
+                                    addCorpusInfo(langFstInfo);
+                                }
+                                foundCorpus = true;
+                            } else {
+                                log.debug(" ... ignore explicitly configured language {} (field: {}) because "
+                                        + "FST file '{}' does not exist and runtime creation "
+                                        + "is deactivated!",new Object[]{ language,
+                                                langIndexFieldInfo.name, langFstFile.getAbsolutePath()});
+                            }
                         } else {
-                            log.warn(" ... ignore language {} (field: {}) because "
-                                    + "FST file '{}' does not exist and runtime creation "
-                                    + "is deactivated!",new Object[]{ language,
-                                            langIndexFieldInfo.name, langFstFile.getAbsolutePath()});
+                            log.debug(" ... ignore explicitly configured language {} becuase unknown fieldtype "
+                                    + "for SolrFied {}", language, langIndexFieldInfo.name);
                         }
                     } else {
-                        log.warn(" ... ignore language {} becuase unknown fieldtype "
-                                + "for SolrFied {}", language, langIndexFieldInfo.name);
+                        log.debug(" ... ignore explicitly configured language {} because configured stored Field {} "
+                                + "for IndexField {} does not exist! ", new Object[]{
+                                language,langStoreField,langIndexFieldInfo.name});
                     }
                 } else {
-                    log.warn(" ... ignore language {} because configured stored Field {} "
-                            + "for IndexField {} does not exist! ", new Object[]{
-                            language,langStoreField,langIndexFieldInfo.name});
+                    log.debug(" ... ignore explicitly configured language {} because configured field {} (encoded: {}) "
+                        + "is not present in the SolrIndex!", new Object[]{
+                                language, langIndexField, encodedLangIndexField });
                 }
-            } else {
-                log.warn(" ... ignore language {} because configured field {} (encoded: {}) "
-                    + "is not present in the SolrIndex!", new Object[]{
-                            language, langIndexField, encodedLangIndexField });
             }
+        } finally {
+            corpusInfoLock.writeLock().unlock();
         }
         return foundCorpus;
     }
@@ -684,4 +835,5 @@ public class IndexConfiguration {
         this.skipAltTokens = skipAltTokens;
         
     }
+
 }

Modified: stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/TaggingSession.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/TaggingSession.java?rev=1731820&r1=1731819&r2=1731820&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/TaggingSession.java (original)
+++ stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/TaggingSession.java Tue Feb 23 09:43:03 2016
@@ -27,6 +27,11 @@ import java.util.Map;
 import java.util.NavigableMap;
 import java.util.Set;
 import java.util.TreeMap;
+import java.util.concurrent.CancellationException;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
 
 import org.apache.clerezza.rdf.core.Language;
 import org.apache.clerezza.rdf.core.Literal;
@@ -134,6 +139,12 @@ public class TaggingSession implements C
         SolrIndexSearcher searcher = searcherRef.get();
         DirectoryReader indexReader = searcher.getIndexReader();
         indexVersion = Long.valueOf(indexReader.getVersion());
+        //check if the IndexConfiguration is up to date with the version of the index
+        long confVersion = config.getVersion();
+        if(confVersion != indexVersion){
+            log.debug("> update IndexConfiguration (from: {} | to: {}",confVersion, indexVersion);
+            config.update(indexVersion, searcher);
+        }
         
         //get the corpusInfo
         CorpusInfo langCorpusInfo = config.getCorpus(language);
@@ -338,58 +349,106 @@ public class TaggingSession implements C
      */
     private TaggerFstCorpus obtainFstCorpus(Long indexVersion, CorpusInfo fstInfo) throws CorpusException {
         TaggerFstCorpus fstCorpus;
-        synchronized (fstInfo) { // one at a time
-            fstCorpus = fstInfo.getCorpus(); 
-            if (fstCorpus == null) {
-                if (fstInfo.isEnqueued()) {
-                    throw new CorpusException("The FST corpus for language '"
-                            + fstInfo.language + "' is enqueued for creation, but not yet "
-                            + "available. Try at a  later point in time", null);
+        fstCorpus = fstInfo.getCorpus(); 
+        Future<TaggerFstCorpus> enqueuedCorpus = null;
+        if (fstCorpus == null) {
+            if (!fstInfo.allowCreation && fstInfo.isFstCreationError()) {
+                throw new CorpusException(fstInfo.getErrorMessage(), null);
+            }
+            fstInfo.corpusLock.readLock().lock();
+            try {
+                enqueuedCorpus = fstInfo.getEnqueued();
+            } finally {
+                fstInfo.corpusLock.readLock().unlock();
+            }
+            if(enqueuedCorpus == null && //not enqueued
+                    fstInfo.allowCreation){ 
+                log.debug(" - enqueue creation of {}", fstInfo);
+                enqueuedCorpus = enqueue(fstInfo);
+            }
+            if(enqueuedCorpus == null){
+                throw new CorpusException("Unable to abtain Fst Corpus for " + fstInfo
+                    + "(message: " + fstInfo.getErrorMessage() + ")!", null);
+            }
+        } else { //fstCorpus != null
+            //check if the current FST corpus is up to date with the Solr index
+            if(indexVersion != null && indexVersion.longValue() != fstCorpus.getIndexVersion()){
+                log.debug(" - FST corpus for language '{}' is outdated", fstInfo.language);
+                fstInfo.corpusLock.readLock().lock();
+                try {
+                    enqueuedCorpus = fstInfo.getEnqueued();
+                } finally {
+                    fstInfo.corpusLock.readLock().unlock();
                 }
-                if (fstInfo.isFstCreationError()) {
-                    throw new CorpusException(fstInfo.getErrorMessage(), null);
+                if(enqueuedCorpus == null && //not already enqueued
+                        fstInfo.allowCreation && config.getExecutorService() != null){
+                    log.debug(" - enqueue creation of {}", fstInfo);
+                    enqueuedCorpus = enqueue(fstInfo);
+                } else {
+                    log.warn("Unable to update outdated FST corpus for language '{}' "
+                            + "because runtimeCreation is {} and ExecutorServic "
+                            + "is {} available!", new Object[]{fstInfo.language,
+                            fstInfo.allowCreation ? "enabled" : "disabled" ,
+                            config.getExecutorService() == null ? "not" : ""});
+                    log.warn("  ... please adapt the Engine configuration for up "
+                        + "to date FST corpora!");
                 }
-                if (fstInfo.isFstFileError() && fstInfo.allowCreation) {
-                    //try to recreate the FST corpus
-                    if(config.getExecutorService() != null){
-                        // TODO: this code should get moved to a CorpusManager class
-                        config.getExecutorService().execute(
-                            new CorpusCreationTask(config, fstInfo));
-                        throw new CorpusException("The FST corpus for language '"
-                                + fstInfo.language + "' was invalid and is now "
-                                + "enqueued for re-creation. Retry at a  later "
-                                + "point in time.", null);
-                    } else {
-                        throw new CorpusException(fstInfo.getErrorMessage(), null);
-                    }
+            } else { //FST corpus is up to date with the current Solr index version
+                log.debug("FST corpus for language '{}' is up to date", fstInfo.language);
+            }
+        }
+        //TODO: maybe make this configurable
+        int waitTime = fstCorpus == null ? 30 : 10; 
+        if(enqueuedCorpus != null){ //we needed to build a new corpus
+            try {
+                log.debug(" - will wait max {}sec for creation of {}", waitTime, fstInfo);
+                fstCorpus = enqueuedCorpus.get(waitTime, TimeUnit.SECONDS);
+            } catch (InterruptedException e) {
+                Thread.currentThread().interrupt(); //recover interrupted state
+            } catch (ExecutionException e) {
+                log.warn("Unable to update outdated FST corpus " + fstInfo 
+                    + " (message: " + fstInfo.getErrorMessage() + ")",e); 
+            } catch (TimeoutException e) {
+                if(fstCorpus != null){
+                    log.debug("unable to build FST corpus for {} in time ({}sec). Will use "
+                        + "previouse version ",fstInfo, waitTime);
+                } else {
+                    throw new CorpusException("Unable to build Fst Corpus for " + fstInfo
+                        + "within " + waitTime+ "sec! Try again later.", null);
                 }
-            } else { //fstCorpus != null
-                if(indexVersion != null && indexVersion.longValue() != fstCorpus.getIndexVersion()){
-                    log.info("FST corpus for language '{}' is outdated ...", fstInfo.language);
-                    if(fstInfo.isEnqueued()){
-                        log.info("  ... already sheduled for recreation. "
-                            + "Use outaded corpus for tagging");
-                    } else if(fstInfo.allowCreation && config.getExecutorService() != null){
-                        log.info("  ... initialise recreation");
-                        config.getExecutorService().execute(
-                            new CorpusCreationTask(config, fstInfo));
-                    } else {
-                        log.warn("Unable to update outdated FST corpus for language '{}' "
-                                + "because runtimeCreation is {} and ExecutorServic "
-                                + "is {} available!", new Object[]{fstInfo.language,
-                                fstInfo.allowCreation ? "enabled" : "disabled" ,
-                                config.getExecutorService() == null ? "not" : ""});
-                        log.warn("  ... please adapt the Engine configuration for up "
-                            + "to date FST corpora!");
-                    }
-                } else { //FST corpus is up to date with the current Solr index version
-                    log.debug("FST corpus for language '{}' is up to date", fstInfo.language);
+            } catch (CancellationException e) {
+                if(fstCorpus != null){
+                    log.debug("building of  FST corpus for {} was cancelled. Will use "
+                        + "previouse version.",fstInfo);
+                } else {
+                    throw new CorpusException("Building of FST Corpus " + fstInfo 
+                        + "was cancelled!", null);
                 }
             }
         }
         return fstCorpus;
     }
     /**
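+     * @param fstInfo the corpus info whose FST corpus creation is enqueued, unless a creation is already pending
+     * @return the Future of the pending creation task (the already enqueued one, or the newly submitted one)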
+     */
+    private Future<TaggerFstCorpus> enqueue(CorpusInfo fstInfo) {
+        Future<TaggerFstCorpus> enqueuedCorpus;
+        fstInfo.corpusLock.writeLock().lock();
+        try {
+            enqueuedCorpus = fstInfo.getEnqueued(); //check again in write lock
+            if(enqueuedCorpus == null){
+                //enqueue for re-creation
+                enqueuedCorpus = config.getExecutorService().submit(
+                    new CorpusCreationTask(config, fstInfo));
+                fstInfo.enqueued(enqueuedCorpus);;
+            }
+        } finally {
+            fstInfo.corpusLock.writeLock().unlock();
+        }
+        return enqueuedCorpus;
+    }
+    /**
      * The current version of the SolrIndex as reported by the {@link IndexReader}
      * used by this TaggingSession.
      * @return the current version of the SolrIndex.

Modified: stanbol/trunk/enhancement-engines/lucenefstlinking/src/test/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineTest.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/test/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineTest.java?rev=1731820&r1=1731819&r2=1731820&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/lucenefstlinking/src/test/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineTest.java (original)
+++ stanbol/trunk/enhancement-engines/lucenefstlinking/src/test/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineTest.java Tue Feb 23 09:43:03 2016
@@ -196,7 +196,7 @@ public class FstLinkingEngineTest {
         //setup the index configuration
         LanguageConfiguration langConf = new LanguageConfiguration("not.used", 
             new String[]{"en;field=dbpedia-ont:surfaceForm;generate=true"});
-        fstConfig = new IndexConfiguration(langConf, core, FieldEncodingEnum.SolrYard);
+        fstConfig = new IndexConfiguration(langConf, core, FieldEncodingEnum.SolrYard,"");
         fstConfig.setExecutorService(Executors.newFixedThreadPool(1));
         fstConfig.setTypeField("rdf:type");
         fstConfig.setRankingField("entityhub:entityRank");