You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2016/02/23 10:43:03 UTC
svn commit: r1731820 - in
/stanbol/trunk/enhancement-engines/lucenefstlinking: ./
src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/
src/main/resources/config/
src/test/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/
Author: rwesten
Date: Tue Feb 23 09:43:03 2016
New Revision: 1731820
URL: http://svn.apache.org/viewvc?rev=1731820&view=rev
Log:
merged implementations for STANBOL-1447 and STANBOL-1448 to trunk
Added:
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/resources/config/
- copied from r1731818, stanbol/branches/release-0.12/enhancement-engines/lucenefstlinking/src/main/resources/config/
Modified:
stanbol/trunk/enhancement-engines/lucenefstlinking/pom.xml
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/CorpusCreationTask.java
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/CorpusInfo.java
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/IndexConfiguration.java
stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/TaggingSession.java
stanbol/trunk/enhancement-engines/lucenefstlinking/src/test/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineTest.java
Modified: stanbol/trunk/enhancement-engines/lucenefstlinking/pom.xml
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/pom.xml?rev=1731820&r1=1731819&r2=1731820&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/lucenefstlinking/pom.xml (original)
+++ stanbol/trunk/enhancement-engines/lucenefstlinking/pom.xml Tue Feb 23 09:43:03 2016
@@ -73,6 +73,8 @@
<Embed-Dependency>
solr-text-tagger
</Embed-Dependency>
+ <!-- we install a logger configuration to set TaggerFstCorpus loggings to ERROR -->
+ <Install-Path>config</Install-Path>
</instructions>
</configuration>
</plugin>
Modified: stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/CorpusCreationTask.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/CorpusCreationTask.java?rev=1731820&r1=1731819&r2=1731820&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/CorpusCreationTask.java (original)
+++ stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/CorpusCreationTask.java Tue Feb 23 09:43:03 2016
@@ -39,13 +39,12 @@ import org.slf4j.LoggerFactory;
* @author Rupert Westenthaler
*
*/
-public class CorpusCreationTask implements Runnable{
+public class CorpusCreationTask implements Callable<TaggerFstCorpus>{
private final Logger log = LoggerFactory.getLogger(CorpusCreationTask.class);
private final CorpusInfo fstInfo;
private final IndexConfiguration indexConfig;
- private final long enqueued;
public CorpusCreationTask(IndexConfiguration indexConfig, CorpusInfo fstInfo){
if(indexConfig == null || fstInfo == null){
@@ -53,22 +52,20 @@ public class CorpusCreationTask implemen
}
this.indexConfig = indexConfig;
this.fstInfo = fstInfo;
- this.enqueued = fstInfo.enqueue();
}
@Override
- public void run() {
+ public TaggerFstCorpus call() {
if(!indexConfig.isActive()){
- return; //task cancelled
- }
- //check if the FST corpus was enqueued a 2nd time
- if(enqueued != fstInfo.getEnqueued()){
- return;
+ String msg = "Index Configuration already deactivated";
+ fstInfo.setError(msg);
+ throw new IllegalStateException(msg);
}
SolrCore core = indexConfig.getIndex();
if(core.isClosed()){
- log.warn("Unable to build {} becuase SolrCore {} is closed!",fstInfo,core.getName());
- return;
+ String msg = "Unable to build " + fstInfo + " becuase SolrCore " + core.getName() + " is closed!";
+ fstInfo.setError(msg);
+ throw new IllegalStateException(msg);
}
final TaggerFstCorpus corpus;
RefCounted<SolrIndexSearcher> searcherRef = core.getSearcher();
@@ -85,6 +82,14 @@ public class CorpusCreationTask implemen
fstInfo.partialMatches,1,100);
}
});
+ if(indexConfig.isActive()){
+ //set the created corpus to the FST Info
+ fstInfo.setCorpus(corpus);
+ } else { //index configuration no longer active ... ignore the built FST
+ log.warn("Index Config for "+ fstInfo + "was deactivated while building FST. "
+ + "Built FST will be ignored.");
+ }
+ return corpus;
} catch (PrivilegedActionException pae) {
Exception e = pae.getException();
if(e instanceof IOException){ //IO Exception while loading the file
@@ -96,31 +101,6 @@ public class CorpusCreationTask implemen
} finally {
searcherRef.decref(); //ensure that we dereference the searcher
}
- if(indexConfig.isActive()){
- //set the created corpus to the FST Info
- fstInfo.setCorpus(enqueued, corpus);
- try { //STANBOL-1177: save FST models in AccessController.doPrivileged(..)
- AccessController.doPrivileged(new PrivilegedExceptionAction<Object>() {
- public Object run() throws IOException {
- if(fstInfo.fst.exists()){
- if(!FileUtils.deleteQuietly(fstInfo.fst)){
- log.warn("Unable to delete existing FST file for {}", fstInfo);
- }
- }
- corpus.save(fstInfo.fst);
- return null; //not used
- }
- });
- } catch (PrivilegedActionException pae) {
- Exception e = pae.getException();
- if(e instanceof IOException){ //IO Exception while loading the file
- log.warn("Unable to store FST corpus " + fstInfo + " to "
- + fstInfo.fst.getAbsolutePath() + "!", e);
- } else { //Runtime exception
- throw RuntimeException.class.cast(e);
- }
- }
- } //else index configuration no longer active ... ignore the built FST
}
@Override
Modified: stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/CorpusInfo.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/CorpusInfo.java?rev=1731820&r1=1731819&r2=1731820&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/CorpusInfo.java (original)
+++ stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/CorpusInfo.java Tue Feb 23 09:43:03 2016
@@ -26,6 +26,9 @@ import java.security.PrivilegedActionExc
import java.security.PrivilegedExceptionAction;
import java.text.SimpleDateFormat;
import java.util.Date;
+import java.util.concurrent.Future;
+import java.util.concurrent.locks.ReadWriteLock;
+import java.util.concurrent.locks.ReentrantReadWriteLock;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.ObjectUtils;
@@ -80,9 +83,11 @@ public class CorpusInfo {
public final Analyzer taggingAnalyzer;
+ protected final ReadWriteLock corpusLock = new ReentrantReadWriteLock();
+
protected Reference<TaggerFstCorpus> taggerCorpusRef;
- protected long enqueued = -1;
+ private Future<TaggerFstCorpus> enqueuedCorpus;
/**
* Allows to store an error message encountered while loading/creating the
* FST corpus.
@@ -97,6 +102,7 @@ public class CorpusInfo {
*/
private boolean creationError = false;
+
/**
* @param language
* @param indexField
@@ -118,14 +124,10 @@ public class CorpusInfo {
* Allows to set an error occurring during the creation of
* @param message
*/
- protected void setError(long enqueued, String message){
+ protected void setError(String message){
this.errorMessage = message;
- if(message != null){
- this.creationError = true;
- }
- if(this.enqueued == enqueued){
- this.enqueued = -1;
- }
+ this.creationError = true;
+ setCorpus(null);
}
public boolean isFstFile(){
return fst != null && fst.isFile();
@@ -151,67 +153,127 @@ public class CorpusInfo {
* @param enqueued the version of the corpus
* @param corpus the corpus
*/
- protected final void setCorpus(long enqueued, TaggerFstCorpus corpus) {
- if(taggerCorpusRef != null){
- taggerCorpusRef.clear();
- taggerCorpusRef = null;
+ protected final void setCorpus(final TaggerFstCorpus corpus) {
+ corpusLock.writeLock().lock();
+ try {
+ enqueuedCorpus = null; //clear the future ref
+ if(taggerCorpusRef != null){
+ taggerCorpusRef.clear();
+ taggerCorpusRef = null;
+ }
+ if(corpus != null){
+ //reset any error
+ this.errorMessage = null;
+ this.creationError = false;
+ //we set the corpus as a weak reference. This allows the
+ //GC to free the corpus earlier.
+ //This is done, because here the corpus was just built and not
+ //yet requested. So we want those to be GCed earlier.
+ taggerCorpusRef = new WeakReference<TaggerFstCorpus>(corpus);
+ }
+ } finally {
+ corpusLock.writeLock().unlock();
}
+ //Store the newly built FST corpus to disc. A read level lock is sufficient
+ //for this.
+ //NOTE: the WeakReference to the corpus can only be GC'ed after we
+ // have written the corpus to disc, as we still have a reference
+ // to corpus!
if(corpus != null){
- //reset any error
- this.errorMessage = null;
- this.creationError = false;
- //we set the corpus as a weak reference. This allows the
- //GC to free the corpus earlier.
- //This is done, because here the corpus was just built and not
- //yet requested. So we want those to be GCed earlier.
- taggerCorpusRef = new WeakReference<TaggerFstCorpus>(corpus);
- }
- //check if the set version is the most current one
- if(enqueued == this.enqueued){ //if so
- this.enqueued = -1; //mark this one as up-to-date
+ try {
+ corpusLock.readLock().lock();
+ try { //STANBOL-1177: save FST models in AccessController.doPrivileged(..)
+ AccessController.doPrivileged(new PrivilegedExceptionAction<Object>() {
+ public Object run() throws IOException {
+ if(fst.exists()){
+ if(!FileUtils.deleteQuietly(fst)){
+ log.warn("Unable to delete existing FST file for {}", fst);
+ }
+ }
+ corpus.save(fst);
+ return null; //not used
+ }
+ });
+ } finally {
+ corpusLock.readLock().unlock();
+ }
+ } catch (PrivilegedActionException pae) {
+ Exception e = pae.getException();
+ if(e instanceof IOException){ //IO Exception while loading the file
+ log.warn("Unable to store FST corpus to "
+ + fst.getAbsolutePath() + "!", e);
+ //if we can not save the FST corpus we replace the WeakReference
+ //with a SoftReference to avoid frequent rebuilding of the corpus
+ corpusLock.writeLock().lock();
+ try {
+ if(taggerCorpusRef instanceof WeakReference<?>){
+ taggerCorpusRef.clear();
+ taggerCorpusRef = new SoftReference<TaggerFstCorpus>(corpus);
+ }
+ } finally {
+ corpusLock.writeLock().unlock(); //release (not re-acquire) the write lock taken above
+ }
+ } else { //Runtime exception
+ throw RuntimeException.class.cast(e);
+ }
+ }
}
}
public TaggerFstCorpus getCorpus() {
- TaggerFstCorpus corpus = taggerCorpusRef == null ? null : taggerCorpusRef.get();
- if(corpus != null){
- //on first usage replace a WeakReference with a SoftReference
- if(taggerCorpusRef instanceof WeakReference<?>){
- log.debug(" ... convert Weak to Soft Reference for Corpus {}", fst);
- taggerCorpusRef.clear();
- taggerCorpusRef = new SoftReference<TaggerFstCorpus>(corpus);
+ TaggerFstCorpus corpus;
+ corpusLock.readLock().lock();
+ try {
+ corpus = taggerCorpusRef == null ? null : taggerCorpusRef.get();
+ if(corpus != null){
+ //on first usage replace a WeakReference with a SoftReference
+ if(taggerCorpusRef instanceof WeakReference<?>){
+ log.debug(" ... convert Weak to Soft Reference for Corpus {}", fst);
+ taggerCorpusRef.clear();
+ taggerCorpusRef = new SoftReference<TaggerFstCorpus>(corpus);
+ }
+ } else if(taggerCorpusRef != null){
+ taggerCorpusRef = null; //reset to null as the reference was taken
}
- } else if(taggerCorpusRef != null){
- taggerCorpusRef = null; //reset to null as the reference was taken
+ } finally {
+ corpusLock.readLock().unlock();
}
if(corpus == null) {
log.info(" ... load FST corpus {}",fst);
+ corpusLock.writeLock().lock();
try { //STANBOL-1177: load FST models in AccessController.doPrivileged(..)
- corpus = AccessController.doPrivileged(new PrivilegedExceptionAction<TaggerFstCorpus>() {
- public TaggerFstCorpus run() throws IOException {
- if(fst.exists() && //if the file exists AND the file was not yet failing to load
- //OR the file is newer as the last version failing to load
- (!fstFileError || FileUtils.isFileNewer(fst, fstDate))){
- TaggerFstCorpus corpus = TaggerFstCorpus.load(fst);
- if(corpus != null){
- //I need to set fstDate here, because I can not
- //access lastModified() outside doPrivileged
- fstDate = new Date(fst.lastModified());
- if(log.isInfoEnabled()){
- log.info(" ... loaded FST (date: {})",
- SimpleDateFormat.getDateTimeInstance().format(fstDate));
+ corpus = taggerCorpusRef == null ? null : taggerCorpusRef.get();
+ if(corpus == null){ //corpus not loaded while waiting for the write lock
+ corpus = AccessController.doPrivileged(new PrivilegedExceptionAction<TaggerFstCorpus>() {
+ public TaggerFstCorpus run() throws IOException {
+ if(fst.exists() && //if the file exists AND the file was not yet failing to load
+ //OR the file is newer as the last version failing to load
+ (!fstFileError || FileUtils.isFileNewer(fst, fstDate))){
+ TaggerFstCorpus corpus = TaggerFstCorpus.load(fst);
+ if(corpus != null){
+ //I need to set fstDate here, because I can not
+ //access lastModified() outside doPrivileged
+ fstDate = new Date(fst.lastModified());
+ if(log.isInfoEnabled()){
+ log.info(" ... loaded FST (date: {})",
+ SimpleDateFormat.getDateTimeInstance().format(fstDate));
+ }
+ } else {
+ log.warn(" ... no corpus loaded from {}",fst);
}
+ return corpus;
} else {
- log.warn(" ... no corpus loaded from {}",fst);
+ log.warn(" ... unable to load FST from {} (exists: {}, fileError {})",
+ new Object[]{fst, fst.exists(),fstFileError});
+ return null;
}
- return corpus;
- } else {
- log.warn(" ... unable to load FST from {} (exists: {}, fileError {})",
- new Object[]{fst, fst.exists(),fstFileError});
- return null;
}
- }
- });
+ });
+ if(corpus != null){
+ fstFileError = false;
+ taggerCorpusRef = new SoftReference<TaggerFstCorpus>(corpus);
+ } //else not loaded from file
+ } //else corpus was loaded while waiting for the write lock
} catch (PrivilegedActionException pae) {
Exception e = pae.getException();
if(e instanceof IOException){ //IO Exception while loading the file
@@ -223,28 +285,25 @@ public class CorpusInfo {
} else { //Runtime exception
throw RuntimeException.class.cast(e);
}
+ } finally {
+ corpusLock.writeLock().unlock();
}
- if(corpus != null){
- fstFileError = false;
- taggerCorpusRef = new SoftReference<TaggerFstCorpus>(corpus);
- } //else not loaded from file
}
return corpus;
}
/**
- * Called when a {@link CorpusInfo} object is enqueued for runtime generation.
- * This is used to prevent multiple FST generation in cases where the
- * FstInfo is enqueued a 2nd time before the first one was processed.
- * @return the {@link System#currentTimeMillis() current time} when calling
- * this method.
- */
- protected long enqueue(){
- enqueued = System.currentTimeMillis();
- return enqueued;
+ * Called after the corpus was enqueued for rebuilding
+ */
+ protected void enqueued(Future<TaggerFstCorpus> enqueued){
+ this.enqueuedCorpus = enqueued;
}
-
- protected long getEnqueued(){
- return enqueued;
+ /**
+ * Allows to get the {@link Future} of an ongoing {@link CorpusCreationTask}.
+ * @return a {@link Future} that allows waiting for a corpus that is
+ * currently being built.
+ */
+ public Future<TaggerFstCorpus> getEnqueued(){
+ return enqueuedCorpus;
}
/**
@@ -255,7 +314,7 @@ public class CorpusInfo {
* @return <code>true</code> if the FST corpus is enqueued for (re)generation.
*/
public boolean isEnqueued(){
- return enqueued > 0;
+ return enqueuedCorpus != null; //a pending creation Future (not a held corpus reference) marks an enqueued (re)build
}
Modified: stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java?rev=1731820&r1=1731819&r2=1731820&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java (original)
+++ stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngine.java Tue Feb 23 09:43:03 2016
@@ -28,7 +28,6 @@ import static org.apache.stanbol.enhance
import java.io.IOException;
import java.util.ArrayList;
-import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
@@ -164,12 +163,17 @@ public class FstLinkingEngine implements
return CANNOT_ENHANCE;
}
//(2) check if we have a FST model for the language
- if(indexConfig.getCorpus(language) == null && //for the language
- indexConfig.getDefaultCorpus() == null){ //a default model
- log.debug("Engine {} ignores ContentItem {} becuase no FST modles for language {} "
- + "are available", new Object[] {getName(), ci.getUri(), language});
- return CANNOT_ENHANCE;
- }
+ //NOTE: as of STANBOL-1448 the index configuration is Solr index version
+ // dependent. This means that we can not use information of the
+ // current IndexConfiguration to check if we have an FST model for
+ // the language of the requested document. That information might
+ // already be outdated.
+// if(indexConfig.getCorpus(language) == null && //for the language
+// indexConfig.getDefaultCorpus() == null){ //a default model
+// log.debug("Engine {} ignores ContentItem {} becuase no FST modles for language {} "
+// + "are available", new Object[] {getName(), ci.getUri(), language});
+// return CANNOT_ENHANCE;
+// }
// we need a detected language, the AnalyzedText contentPart with
// Tokens.
AnalysedText at = AnalysedTextUtils.getAnalysedText(ci);
Modified: stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java?rev=1731820&r1=1731819&r2=1731820&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java (original)
+++ stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineComponent.java Tue Feb 23 09:43:03 2016
@@ -46,6 +46,7 @@ import java.util.Map;
import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
import org.apache.clerezza.rdf.core.Literal;
import org.apache.clerezza.rdf.core.Resource;
@@ -79,6 +80,7 @@ import org.apache.stanbol.enhancer.nlp.n
import org.apache.stanbol.enhancer.nlp.utils.LanguageConfiguration;
import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
+import org.opensextant.solrtexttagger.TaggerFstCorpus;
import org.osgi.framework.BundleContext;
import org.osgi.framework.Constants;
import org.osgi.framework.InvalidSyntaxException;
@@ -760,7 +762,7 @@ public class FstLinkingEngineComponent {
}
//File fstDir = new File(dataDir,"fst");
//now collect the FST configuration
- indexConfig = new IndexConfiguration(fstConfig, core, fieldEncoding);
+ indexConfig = new IndexConfiguration(fstConfig, core, fieldEncoding, entityLinkerConfig.getDefaultLanguage());
indexConfig.setTypeField(solrTypeField);
indexConfig.setRankingField(solrRankingField);
//set fields parsed in the activate method
@@ -778,48 +780,40 @@ public class FstLinkingEngineComponent {
if(skipAltTokensConfig != null){
indexConfig.setSkipAltTokens(skipAltTokensConfig);
}
- //create a new searcher for creating FSTs
- if(!indexConfig.activate()){
- log.warn("Processing of the FST configuration was not successfull "
- + "for any language. See WARN level loggings for more details!");
- log.warn(" ... FstLinkingEnigne wiht name {} will be registered but"
- + "be inactive as there seam to be no data for linking available"
- + "in the SolrCore {} (dir: {})",
- new Object []{engineName, core.getName(),
- core.getCoreDescriptor().getInstanceDir()});
- } else { //some FST corpora initialised
- if(log.isInfoEnabled()){ //log the initialised languages
- Set<String> langSet = new HashSet<String>(indexConfig.getCorpusLanguages());
- if(langSet.remove(null)){ //replace the null for the default language
- langSet.add(""); //with an empty string
- }
- String[] langArray = langSet.toArray(new String[langSet.size()]);
- Arrays.sort(langArray,String.CASE_INSENSITIVE_ORDER);
- log.info(" ... initialised FST corpora for languages {}",
- Arrays.toString(langArray));
+ //activate the index configuration
+ try {
+ //this will init the FST directory if necessary so we might run
+ //into IOExceptions
+ indexConfig.activate();
+ } catch (IOException e) {
+ throw new RuntimeException("Unable to activate Index for FST Linking Engine '"
+ + engineName +"' (solrCore: "+ core.getName() + ", instanceDir: "
+ + core.getCoreDescriptor().getInstanceDir() +")!", e);
+ }
+ if(log.isInfoEnabled()){ //log the initialised languages
+ Set<String> langSet = new HashSet<String>(indexConfig.getCorpusLanguages());
+ if(langSet.remove(null)){ //replace the null for the default language
+ langSet.add(""); //with an empty string
}
+ String[] langArray = langSet.toArray(new String[langSet.size()]);
+ Arrays.sort(langArray,String.CASE_INSENSITIVE_ORDER);
+ log.info(" ... initialised FST corpora for languages {}",
+ Arrays.toString(langArray));
}
//check if we need to create some FST files
for(CorpusInfo fstInfo : indexConfig.getCorpora()){
//check if the fst does not exist and the fstInfo allows creation
if(!fstInfo.fst.exists() && fstInfo.allowCreation){
//create a task on the FST corpus creation service
- fstCreatorService.execute(new CorpusCreationTask(indexConfig, fstInfo));
+ fstInfo.corpusLock.writeLock().lock();
+ try {
+ Future<TaggerFstCorpus> enqueued = fstCreatorService.submit(new CorpusCreationTask(indexConfig, fstInfo));
+ fstInfo.enqueued(enqueued);
+ } finally {
+ fstInfo.corpusLock.writeLock().unlock();
+ }
}
}
- //set the default linking corpora
- String defaultLanguage = entityLinkerConfig.getDefaultLanguage();
- if(defaultLanguage == null){
- defaultLanguage = ""; //FST uses an empty string for the default
- }
- CorpusInfo defaultCoprous = indexConfig.getCorpus(defaultLanguage);
- if(defaultCoprous != null){
- log.info(" ... set '{}' as default FST Corpus: {}", defaultCoprous.language, defaultCoprous);
- indexConfig.setDefaultCorpus(defaultCoprous);
- } else {
- log.info(" ... no corpus for default language {} available", defaultCoprous);
- }
- //create the new configuration
//set the newly configured instances to the fields
this.indexConfig = indexConfig;
Modified: stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/IndexConfiguration.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/IndexConfiguration.java?rev=1731820&r1=1731819&r2=1731820&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/IndexConfiguration.java (original)
+++ stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/IndexConfiguration.java Tue Feb 23 09:43:03 2016
@@ -18,6 +18,7 @@ package org.apache.stanbol.enhancer.engi
import java.io.File;
import java.io.IOException;
+import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
@@ -26,6 +27,8 @@ import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ExecutorService;
+import java.util.concurrent.locks.ReadWriteLock;
+import java.util.concurrent.locks.ReentrantReadWriteLock;
import org.apache.clerezza.rdf.core.Literal;
import org.apache.clerezza.rdf.core.Resource;
@@ -39,6 +42,7 @@ import org.apache.lucene.document.Docume
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.search.IndexSearcher;
import org.apache.solr.core.SolrCore;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.IndexSchema;
@@ -78,9 +82,13 @@ public class IndexConfiguration {
private String rankingField;
/**
+ * Used to sync access to {@link #corpusInfos}
+ */
+ private ReadWriteLock corpusInfoLock = new ReentrantReadWriteLock();
+ /**
* FST corpus configuration
*/
- private Map<String,CorpusInfo> corpusInfos = new HashMap<String,CorpusInfo>();
+ private Map<String,CorpusInfo> corpusInfos;
/**
* {@link ExecutorService} used to create {@link TaggerFstCorpus} instances
* at runtime.
@@ -97,15 +105,20 @@ public class IndexConfiguration {
*/
private EntityCacheManager entityCacheManager;
+ private final LanguageConfiguration fstConfig;
+
/**
- * The FST corpus used for linking regardless of the language of the
- * document
+ * If runtime generation is enabled by default (Note: explicitly configured
+ * languages might override this)
*/
- private CorpusInfo defaultFstCorpus;
-
- private final LanguageConfiguration fstConfig;
+ private final boolean runtimeGeneration;
+ /**
+ * used to track if this index configuration is active
+ */
private boolean active = false;
+
+ private long indexVersion = -1;
private File fstDirectory;
@@ -122,6 +135,11 @@ public class IndexConfiguration {
* tokens should cause an {@link UnsupportedTokenException}.
*/
private boolean skipAltTokens;
+
+ /**
+ * The default language
+ */
+ private String defaultLanguage;
/**
* If alternate tokens (<code>posInc == 0</code>) can be skipped or if such
* tokens should cause an {@link UnsupportedTokenException}.
@@ -206,7 +224,7 @@ public class IndexConfiguration {
*/
public static final String PARAM_FST = "fst";
- public IndexConfiguration(LanguageConfiguration fstConfig, SolrCore index, FieldEncodingEnum fieldEncoding){
+ public IndexConfiguration(LanguageConfiguration fstConfig, SolrCore index, FieldEncodingEnum fieldEncoding, String defaultLanguage){
if(fstConfig == null){
throw new IllegalArgumentException("The parsed FST configuration MUST NOT be NULL!");
}
@@ -214,6 +232,14 @@ public class IndexConfiguration {
if(index == null || index.isClosed()){
throw new IllegalArgumentException("The parsed SolrCore MUST NOT be NULL nore closed!");
}
+ //check if we have runtime generation
+ String allowCreationString = fstConfig.getDefaultParameters().get(IndexConfiguration.PARAM_RUNTIME_GENERATION);
+ if(allowCreationString == null){
+ runtimeGeneration = IndexConfiguration.DEFAULT_RUNTIME_GENERATION;
+ } else {
+ runtimeGeneration = Boolean.parseBoolean(allowCreationString);
+ }
+
this.index = index;
if(fieldEncoding == null){
fieldEncoding = FieldEncodingEnum.None;
@@ -226,23 +252,16 @@ public class IndexConfiguration {
} else {
this.skipAltTokens = false;
}
+ this.defaultLanguage = defaultLanguage == null ? "" : defaultLanguage;
}
- public CorpusInfo setDefaultCorpus(CorpusInfo corpus){
- CorpusInfo oldDefault = defaultFstCorpus;
- if(corpus != null){
- this.defaultFstCorpus = corpus;
- } else {
- this.defaultFstCorpus = null;
- }
- return oldDefault;
- }
-
- protected CorpusInfo addCorpus(CorpusInfo corpus){
+ /**
+ * Assumed to be called in a write lock on {@link #corpusInfoLock}
+ * @param corpus
+ */
+ private void addCorpusInfo(CorpusInfo corpus){
if(corpus != null){
- return corpusInfos.put(corpus.language, corpus);
- } else {
- return null;
+ corpusInfos.put(corpus.language, corpus);
}
}
@@ -303,6 +322,13 @@ public class IndexConfiguration {
FieldEncodingEnum.encodeFloat(rankingField, fieldEncoding);
}
/**
+ * The version of the {@link #getIndex()} this configuration was built for.
+ * @return the index version this configuration was built for.
+ */
+ public long getVersion() {
+ return indexVersion;
+ }
+ /**
* Returns the CorpusInfo for the parsed language. If the language has an
* extension (e.g. en-US) it first tires to load the corpus for the exact
* match and falls back to the main lanugage (en) if such a corpus does not
@@ -311,32 +337,52 @@ public class IndexConfiguration {
* @return the corpus information or <code>null</code> if not present
*/
public CorpusInfo getCorpus(String language) {
- CorpusInfo langCorpusInfo = corpusInfos.get(language);
- if(langCorpusInfo == null && language.indexOf('-') > 0){
- String rootLang = language.substring(0,language.indexOf('-'));
- log.debug(" - no FST corpus for {}. Fallback to {}", language,rootLang);
- langCorpusInfo = corpusInfos.get(rootLang);
+ corpusInfoLock.readLock().lock();
+ try {
+ CorpusInfo langCorpusInfo = corpusInfos.get(language);
+ if(langCorpusInfo == null && language.indexOf('-') > 0){
+ String rootLang = language.substring(0,language.indexOf('-'));
+ log.debug(" - no FST corpus for {}. Fallback to {}", language,rootLang);
+ langCorpusInfo = corpusInfos.get(rootLang);
+ }
+ return langCorpusInfo;
+ } finally {
+ corpusInfoLock.readLock().unlock();
}
- return langCorpusInfo;
}
/**
* Getter for the languages of all configured FST corpora
- * @return the languages of all configured FST corpora
+ * @return a read-only copy of the languages of all configured FST corpora
*/
public Set<String> getCorpusLanguages(){
- return Collections.unmodifiableSet(corpusInfos.keySet());
+ return Collections.unmodifiableSet(new HashSet<String>(corpusInfos.keySet()));
}
/**
* Read-only collection of all {@link CorpusInfo}s defined for this
* configuration.
- * @return
+ * @return a read only copy of the current {@link CorpusInfo}s
*/
public Collection<CorpusInfo> getCorpora(){
- return Collections.unmodifiableCollection(corpusInfos.values());
+ corpusInfoLock.readLock().lock();
+ try {
+ return Collections.unmodifiableCollection(new ArrayList<CorpusInfo>(corpusInfos.values()));
+ } finally {
+ corpusInfoLock.readLock().unlock();
+ }
}
+ /**
+ * The {@link CorpusInfo} for the default language
+ * @return the default corpus or <code>null</code> if no corpus is available
+ * for the default language
+ */
public CorpusInfo getDefaultCorpus() {
- return defaultFstCorpus;
+ corpusInfoLock.readLock().lock();
+ try {
+ return corpusInfos.get(defaultLanguage);
+ } finally {
+ corpusInfoLock.readLock().unlock();
+ }
}
public void setExecutorService(ExecutorService executorService) {
@@ -400,252 +446,357 @@ public class IndexConfiguration {
}
/**
- * If this {@link IndexConfiguration} is still active
+ * If this {@link IndexConfiguration} is still in sync with the version
+ * of the {@link #getIndex() SolrCore}. This will return true if
+ * <code>{@link #isRuntimeGeneration()} == false </code>
* @return <code>true</code> if still active. Otherwise <code>false</code>
*/
- public boolean isActive(){
- return active;
+ public boolean isCurrent(){
+ if(!runtimeGeneration){
+ return true;
+ } else {
+ RefCounted<SolrIndexSearcher> searcherRef = index.getSearcher();
+ try {
+ long version = searcherRef.get().getIndexReader().getVersion();
+ return indexVersion == version;
+ } finally {
+ searcherRef.decref();
+ }
+ }
}
- /**
- * Activated this indexing configuration by inspecting the {@link SolrCore}
- * based on the provided configuration
- * @return
- */
- public boolean activate() {
- active = true;
+
+ private long getIndexVersion(){
RefCounted<SolrIndexSearcher> searcherRef = index.getSearcher();
try {
- return processFstConfig(searcherRef.get().getAtomicReader());
- }catch (RuntimeException e) { //in case of any excpetion
- throw e; //re-throw
- } catch (IOException e) {
- throw new IllegalStateException("Unable to activate IndexConfiguration", e);
+ return getIndexVersion(searcherRef.get());
} finally {
- searcherRef.decref(); //decrease the count on the searcher
+ searcherRef.decref();
}
}
+
+ private long getIndexVersion(SolrIndexSearcher searcher){
+ return searcher.getIndexReader().getVersion();
+ }
+
+ public boolean isRuntimeGeneration() {
+ return runtimeGeneration;
+ }
+
+ public boolean isActive() {
+ return active;
+ }
+
/**
- * This method combines the {@link #fstConfig} with the data present in the
- * {@link SolrCore}.
- * @param indexReader The {@link AtomicReader} has access to the actual
- * fields present in the {@link SolrCore}. It is used to compare field
- * configurations in the {@link #fstConfig} with fields present in the solr
- * index.
- * @return if any FST configuration was successfully processed
+ * Activates this indexing configuration by inspecting the {@link SolrCore}
+ * based on the provided configuration
+ * @throws IOException if the FST directory is a file or cannot be created
*/
- private boolean processFstConfig(AtomicReader indexReader) throws IOException {
- if(index == null){
+ public void activate() throws IOException {
+ active = true;
+ if(index == null){ //do we have an SolrCore
throw new IllegalArgumentException("No SolrCore set for this configuration");
}
- if(fstDirectory == null){
+ //if no fstDirectory is configured
+ if(fstDirectory == null){ //use the default
fstDirectory = new File(index.getDataDir(),"fst");
}
- log.debug("> process FST config for {} (FST dir: {})", index.getName(),
- fstDirectory.getAbsolutePath());
- //init the fstDirectory
+ //init the fstDirectory (may throw IOException)
if(fstDirectory.isFile()){
throw new IOException("Default FST directory exists and "
+ "is a File. Use #setFstDirectory() to set different one");
} else if(!fstDirectory.exists()){
FileUtils.forceMkdir(fstDirectory);
}
- IndexSchema schema = index.getLatestSchema();
- boolean foundCorpus = false;
- //(0) get basic parameters of the default configuration
- log.debug(" - default config");
- Map<String,String> defaultParams = fstConfig.getDefaultParameters();
- String fstName = defaultParams.get(IndexConfiguration.PARAM_FST);
- String indexField = defaultParams.get(IndexConfiguration.PARAM_FIELD);
- String storeField = defaultParams.get(IndexConfiguration.PARAM_STORE_FIELD);
- if(storeField == null){
- //apply indexField as default if indexField is NOT NULL
- storeField = indexField;
- }
- if(indexField == null){ //apply the defaults if null
- indexField = IndexConfiguration.DEFAULT_FIELD;
- }
- if(fstName == null){ //use default
- fstName = getDefaultFstFileName(indexField);
+ //acquire the initial index configuration
+ update();
+ }
+
+ /**
+ * Updates the configuration based on the current version of the
+ * {@link #getIndex()}. If the SolrCore was not updated this will do
+ * nothing.
+ */
+ public void update(){
+ RefCounted<SolrIndexSearcher> searcherRef = index.getSearcher();
+ try {
+ update(getIndexVersion(searcherRef.get()), searcherRef.get());
+ } finally {
+ searcherRef.decref(); //decrease the count on the searcher
}
- final boolean allowCreation;
- String allowCreationString = defaultParams.get(IndexConfiguration.PARAM_RUNTIME_GENERATION);
- if(allowCreationString == null){
- allowCreation = IndexConfiguration.DEFAULT_RUNTIME_GENERATION;
- } else {
- allowCreation = Boolean.parseBoolean(allowCreationString);
+ }
+ /**
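+ * Version of {@link #update()} to be used in cases where the indexVersion
+ * and a Solr searcher are already available in the calling method
+ * @param indexVersion the version of the index as seen by the given searcher
+ * @param searcher the searcher of the {@link #getIndex() SolrCore} (must not be <code>null</code>)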
+ */
+ protected void update(long indexVersion, SolrIndexSearcher searcher){
+ assert searcher != null;
+ assert searcher.getCore().equals(index);
+ processFstConfig(indexVersion, searcher.getAtomicReader());
+ }
+
+ /**
+ * This method combines the {@link #fstConfig} with the data present in the
+ * {@link SolrCore}.
+ * <p>
+ * As information for fields are only available when a
+ * field was actually used by a document stored in the index one needs to
+ * inspect the index after every change.
+ * <p>
+ * An empty Solr index will result in
+ * an empty {@link #corpusInfos} map. The first document with a value
+ * for the English field will cause a {@link CorpusInfo} for the English
+ * language to be created. As soon as the last document with a label for
+ * a given language is deleted the {@link CorpusInfo} for that language
+ * will also disappear.
+ * @param indexVersion the current version of the {@link #index} to process
+ * the FST config for.
+ * <p>
+ * This method acquires a write lock on {@link #corpusInfoLock} while it
+ * inspects the Solr index
+ * @param indexReader The {@link AtomicReader} has access to the actual
+ * fields present in the {@link SolrCore}. It is used to compare field
+ * configurations in the {@link #fstConfig} with fields present in the Solr
+ * {@link #index}.
+ * @return If any {@link CorpusInfo FST configurations} were found while
+ * inspecting the Solr {@link #index}
+ */
+ private boolean processFstConfig(long indexVersion, AtomicReader indexReader) {
+ //first check if the Solr index was updated
+ corpusInfoLock.readLock().lock();
+ try {
+ if(indexVersion == this.indexVersion){ //no update?
+ return !corpusInfos.isEmpty(); //nothing to do
+ }
+ } finally {
+ corpusInfoLock.readLock().unlock();
}
- //This are all fields actually present in the index (distinguished with
- //those defined in the schema). This also includes actual instances of
- //dynamic field definition in the schema.
- FieldInfos fieldInfos = indexReader.getFieldInfos(); //we need this twice
+ log.debug("> {} FST config for {} (FST dir: {})",
+ corpusInfos == null ? "create" : "update",
+ index.getName(), fstDirectory.getAbsolutePath());
- //(1) in case the fstConfig uses a wildcard we need to search for
- // languages present in the SolrIndex. For that we use the indexReader
- // to get the FieldInfos and match them against FST files in the FST
- // directory and FieldType definitions in the schema of the SolrCore
- //NOTE: this needs only do be done if wildcards are enabled in the fstConfig
- if(fstConfig.useWildcard()){
- //(1.a) search for present FST files in the FST directory
- Map<String,File> presentFstFiles = new HashMap<String,File>();
- WildcardFileFilter fstFilter = new WildcardFileFilter(
- fstName+".*.fst");
- Iterator<File> fstFiles = FileUtils.iterateFiles(fstDirectory, fstFilter, null);
- while(fstFiles.hasNext()){
- File fstFile = fstFiles.next();
- String fstFileName = fstFile.getName();
- //files are named such as "{name}.{lang}.fst"
- String language = FilenameUtils.getExtension(
- FilenameUtils.getBaseName(fstFileName));
- presentFstFiles.put(language, fstFile);
+ boolean foundCorpus = false;
+
+ corpusInfoLock.writeLock().lock();
+ try {
+ this.indexVersion = indexVersion;
+ IndexSchema schema = index.getLatestSchema();
+ Map<String,CorpusInfo> corpusInfosCopy;
+ if(corpusInfos == null){ //first call
+ corpusInfos = new HashMap<String,CorpusInfo>(); //init the field
+ corpusInfosCopy = new HashMap<String,CorpusInfo>();
+ } else {
+ corpusInfosCopy = new HashMap<String,CorpusInfo>(corpusInfos);
+ corpusInfos.clear(); //clear the old data
}
- //(1.b) iterate over the fields in the Solr index and search for
- // matches against the configured indexField name
- String fieldWildcard = FieldEncodingEnum.encodeLanguage(indexField,
- fieldEncoding, "*");
- for(FieldInfo fieldInfo : fieldInfos){
- //try to match the field names against the wildcard
- if(FilenameUtils.wildcardMatch(fieldInfo.name, fieldWildcard)){
- //for matches parse the language from the field name
- String language = FieldEncodingEnum.parseLanguage(
- fieldInfo.name, fieldEncoding, indexField);
- if(language != null && //successfully parsed language
- //is current language is enabled?
- fstConfig.isLanguage(language) &&
- //is there no explicit configuration for this language?
- !fstConfig.getExplicitlyIncluded().contains(language)){
- //generate the FST file name
- StringBuilder fstFileName = new StringBuilder(fstName);
- if(!language.isEmpty()){
- fstFileName.append('.').append(language);
- }
- fstFileName.append(".fst");
- File fstFile = new File(fstDirectory,fstFileName.toString());
- //get the FieldType of the field from the Solr schema
- FieldType fieldType = schema.getFieldTypeNoEx(fieldInfo.name);
- if(fieldType != null){ //if the fieldType is present
- if(allowCreation || fstFile.isFile()){ //and FST is present or can be created
- //we need also to check if the stored field with
- //the labels is present
- //get the stored Field and check if it is present!
- String storeFieldName;
- if(storeField == null){ //storeField == indexField
- storeFieldName = fieldInfo.name;
- } else { // check that the storeField is present in the index
- storeFieldName = FieldEncodingEnum.encodeLanguage(
- storeField, fieldEncoding, language);
- FieldInfo storedFieldInfos = fieldInfos.fieldInfo(storeFieldName);
- if(storedFieldInfos == null){
- log.warn(" ... ignore language {} because Stored Field {} "
- + "for IndexField {} does not exist! ", new Object[]{
- language,storeFieldName,fieldInfo.name});
- storeFieldName = null;
+ //(0) get basic parameters of the default configuration
+ log.debug(" - default config");
+ Map<String,String> defaultParams = fstConfig.getDefaultParameters();
+ String fstName = defaultParams.get(IndexConfiguration.PARAM_FST);
+ String indexField = defaultParams.get(IndexConfiguration.PARAM_FIELD);
+ String storeField = defaultParams.get(IndexConfiguration.PARAM_STORE_FIELD);
+ if(storeField == null){
+ //apply indexField as default if indexField is NOT NULL
+ storeField = indexField;
+ }
+ if(indexField == null){ //apply the defaults if null
+ indexField = IndexConfiguration.DEFAULT_FIELD;
+ }
+ if(fstName == null){ //use default
+ fstName = getDefaultFstFileName(indexField);
+ }
+ //These are all fields actually present in the index (as opposed to
+ //those defined in the schema). This also includes actual instances of
+ //dynamic field definitions in the schema.
+ FieldInfos fieldInfos = indexReader.getFieldInfos(); //we need this twice
+
+ //(1) in case the fstConfig uses a wildcard we need to search for
+ // languages present in the SolrIndex. For that we use the indexReader
+ // to get the FieldInfos and match them against FST files in the FST
+ // directory and FieldType definitions in the schema of the SolrCore
+ //NOTE: this only needs to be done if wildcards are enabled in the fstConfig
+ if(fstConfig.useWildcard()){
+ //(1.a) search for present FST files in the FST directory
+ Map<String,File> presentFstFiles = new HashMap<String,File>();
+ WildcardFileFilter fstFilter = new WildcardFileFilter(
+ fstName+".*.fst");
+ Iterator<File> fstFiles = FileUtils.iterateFiles(fstDirectory, fstFilter, null);
+ while(fstFiles.hasNext()){
+ File fstFile = fstFiles.next();
+ String fstFileName = fstFile.getName();
+ //files are named such as "{name}.{lang}.fst"
+ String language = FilenameUtils.getExtension(
+ FilenameUtils.getBaseName(fstFileName));
+ presentFstFiles.put(language, fstFile);
+ }
+ //(1.b) iterate over the fields in the Solr index and search for
+ // matches against the configured indexField name
+ String fieldWildcard = FieldEncodingEnum.encodeLanguage(indexField,
+ fieldEncoding, "*");
+ for(FieldInfo fieldInfo : fieldInfos){
+ //try to match the field names against the wildcard
+ if(FilenameUtils.wildcardMatch(fieldInfo.name, fieldWildcard)){
+ //for matches parse the language from the field name
+ String language = FieldEncodingEnum.parseLanguage(
+ fieldInfo.name, fieldEncoding, indexField);
+ if(language != null && //successfully parsed language
+ //is current language is enabled?
+ fstConfig.isLanguage(language) &&
+ //is there no explicit configuration for this language?
+ !fstConfig.getExplicitlyIncluded().contains(language)){
+ //generate the FST file name
+ StringBuilder fstFileName = new StringBuilder(fstName);
+ if(!language.isEmpty()){
+ fstFileName.append('.').append(language);
+ }
+ fstFileName.append(".fst");
+ File fstFile = new File(fstDirectory,fstFileName.toString());
+ //get the FieldType of the field from the Solr schema
+ FieldType fieldType = schema.getFieldTypeNoEx(fieldInfo.name);
+ if(fieldType != null){ //if the fieldType is present
+ if(runtimeGeneration || fstFile.isFile()){ //and FST is present or can be created
+ //we need also to check if the stored field with
+ //the labels is present
+ //get the stored Field and check if it is present!
+ String storeFieldName;
+ if(storeField == null){ //storeField == indexField
+ storeFieldName = fieldInfo.name;
+ } else { // check that the storeField is present in the index
+ storeFieldName = FieldEncodingEnum.encodeLanguage(
+ storeField, fieldEncoding, language);
+ FieldInfo storedFieldInfos = fieldInfos.fieldInfo(storeFieldName);
+ if(storedFieldInfos == null){
+ log.debug(" ... ignore language {} because Stored Field {} "
+ + "for IndexField {} does not exist! ", new Object[]{
+ language,storeFieldName,fieldInfo.name});
+ storeFieldName = null;
+ }
+
}
-
- }
- if(storeFieldName != null){ // == valid configuration
- CorpusInfo fstInfo = new CorpusInfo(language,
- fieldInfo.name, storeFieldName,
- fieldType, fstFile, allowCreation);
- log.debug(" ... init {} ", fstInfo);
- addCorpus(fstInfo);
- foundCorpus = true;
+ if(storeFieldName != null){ // == valid configuration
+ CorpusInfo fstInfo = corpusInfosCopy.get(language);
+ if(fstInfo == null || //new one
+ !fstInfo.indexedField.equals(fieldInfo.name) || //index field compatible
+ !fstInfo.storedField.equals(storeFieldName)){ //store field compatible
+ CorpusInfo newFstInfo = new CorpusInfo(language,
+ fieldInfo.name, storeFieldName,
+ fieldType, fstFile, runtimeGeneration);
+ log.debug(" ... {} {} ", fstInfo == null ? "create" : "update", newFstInfo);
+ addCorpusInfo(newFstInfo);
+ corpusInfosCopy.put(language, newFstInfo);
+ } else { //no change in the SolrIndex ... use the exsisting CorpusInfo
+ addCorpusInfo(fstInfo);
+ }
+ foundCorpus = true;
+ }
+ } else {
+ log.debug(" ... ignore language {} (field: {}) because "
+ + "FST file '{}' does not exist and runtime creation "
+ + "is deactivated!",new Object[]{ language,
+ fieldInfo.name, fstFile.getAbsolutePath()});
}
} else {
- log.warn(" ... ignore language {} (field: {}) because "
- + "FST file '{}' does not exist and runtime creation "
- + "is deactivated!",new Object[]{ language,
- fieldInfo.name, fstFile.getAbsolutePath()});
+ log.debug(" ... ignore language {} becuase unknown fieldtype "
+ + "for SolrFied {}",language,fieldInfo.name);
}
- } else {
- log.warn(" ... ignore language {} becuase unknown fieldtype "
- + "for SolrFied {}",language,fieldInfo.name);
+ } //else the field matched the wildcard, but has not passed the
+ //encoding test.
+ } //Solr field does not match the field definition in the config
+ } // end iterate over all fields in the SolrIndex
+ } //else Wildcard not enabled in the fstConfig
+
+ //(2) process explicit configuration for configured languages
+ for(String language : fstConfig.getExplicitlyIncluded()){
+ //(2.a) get the language specific config (with fallback to default)
+ Map<String,String> config = fstConfig.getParameters(language);
+ String langIndexField = config.get(IndexConfiguration.PARAM_FIELD);
+ String langStoreField = config.get(IndexConfiguration.PARAM_STORE_FIELD);
+ String langFstFileName = config.get(IndexConfiguration.PARAM_FST);
+ final boolean langAllowCreation;
+ final String langAllowCreationString = config.get(IndexConfiguration.PARAM_RUNTIME_GENERATION);
+ if(langIndexField != null){
+ //also consider explicit field names as default for the fst name
+ if(langFstFileName == null){
+ StringBuilder fileName = new StringBuilder(
+ getDefaultFstFileName(langIndexField));
+ if(!language.isEmpty()){
+ fileName.append('.').append(language);
}
- } //else the field matched the wildcard, but has not passed the
- //encoding test.
- } //Solr field does not match the field definition in the config
- } // end iterate over all fields in the SolrIndex
- } //else Wildcard not enabled in the fstConfig
-
- //(2) process explicit configuration for configured languages
- for(String language : fstConfig.getExplicitlyIncluded()){
- //(2.a) get the language specific config (with fallback to default)
- Map<String,String> config = fstConfig.getLanguageParams(language);
- String langIndexField = config.get(IndexConfiguration.PARAM_FIELD);
- String langStoreField = config.get(IndexConfiguration.PARAM_STORE_FIELD);
- String langFstFileName = config.get(IndexConfiguration.PARAM_FST);
- final boolean langAllowCreation;
- final String langAllowCreationString = config.get(IndexConfiguration.PARAM_RUNTIME_GENERATION);
- if(langIndexField != null){
- //also consider explicit field names as default for the fst name
- if(langFstFileName == null){
- StringBuilder fileName = new StringBuilder(
- getDefaultFstFileName(langIndexField));
- if(!language.isEmpty()){
- fileName.append('.').append(language);
+ fileName.append(".fst");
+ langFstFileName = fileName.toString();
}
- fileName.append(".fst");
- langFstFileName = fileName.toString();
+ } else {
+ langIndexField = indexField;
}
- } else {
- langIndexField = indexField;
- }
- if(langStoreField == null){ //fallbacks
- if(storeField != null){ //first to default store field
- langStoreField = storeField;
- } else { //else to the lang index field
- langStoreField = langIndexField;
+ if(langStoreField == null){ //fallbacks
+ if(storeField != null){ //first to default store field
+ langStoreField = storeField;
+ } else { //else to the lang index field
+ langStoreField = langIndexField;
+ }
}
- }
- if(langFstFileName == null){ //no fstFileName config
- // ... use the default
- langFstFileName = new StringBuilder(fstName).append('.')
- .append(language).append(".fst").toString();
- }
- if(langAllowCreationString != null){
- langAllowCreation = Boolean.parseBoolean(langAllowCreationString);
- } else {
- langAllowCreation = allowCreation;
- }
- //(2.b) check if the Solr field is present
- String encodedLangIndexField = FieldEncodingEnum.encodeLanguage(
- langIndexField, fieldEncoding, language);
- String encodedLangStoreField = FieldEncodingEnum.encodeLanguage(
- langStoreField, fieldEncoding, language);
- FieldInfo langIndexFieldInfo = fieldInfos.fieldInfo(encodedLangIndexField);
- if(langIndexFieldInfo != null){
- FieldInfo langStoreFieldInfo = fieldInfos.fieldInfo(encodedLangStoreField);
- if(langStoreFieldInfo != null){
- FieldType fieldType = schema.getFieldTypeNoEx(langIndexFieldInfo.name);
- if(fieldType != null){
- //(2.c) check the FST file
- File langFstFile = new File(fstDirectory,langFstFileName);
- if(langFstFile.isFile() || langAllowCreation){
- CorpusInfo langFstInfo = new CorpusInfo(language,
- encodedLangIndexField,encodedLangStoreField,
- fieldType, langFstFile, langAllowCreation);
- log.debug(" ... add {} for explicitly configured language", langFstInfo);
- addCorpus(langFstInfo);
- foundCorpus = true;
+ if(langFstFileName == null){ //no fstFileName config
+ // ... use the default
+ langFstFileName = new StringBuilder(fstName).append('.')
+ .append(language).append(".fst").toString();
+ }
+ if(langAllowCreationString != null){
+ langAllowCreation = Boolean.parseBoolean(langAllowCreationString);
+ } else {
+ langAllowCreation = runtimeGeneration;
+ }
+ //(2.b) check if the Solr field is present
+ String encodedLangIndexField = FieldEncodingEnum.encodeLanguage(
+ langIndexField, fieldEncoding, language);
+ String encodedLangStoreField = FieldEncodingEnum.encodeLanguage(
+ langStoreField, fieldEncoding, language);
+ FieldInfo langIndexFieldInfo = fieldInfos.fieldInfo(encodedLangIndexField);
+ if(langIndexFieldInfo != null){
+ FieldInfo langStoreFieldInfo = fieldInfos.fieldInfo(encodedLangStoreField);
+ if(langStoreFieldInfo != null){
+ FieldType fieldType = schema.getFieldTypeNoEx(langIndexFieldInfo.name);
+ if(fieldType != null){
+ //(2.c) check the FST file
+ File langFstFile = new File(fstDirectory,langFstFileName);
+ if(langFstFile.isFile() || langAllowCreation){
+ CorpusInfo langFstInfo = corpusInfosCopy.get(language);
+ if(langFstInfo == null || //new one
+ !langFstInfo.indexedField.equals(encodedLangIndexField) || //index field compatible
+ !langFstInfo.storedField.equals(encodedLangStoreField)){ //store field compatible
+ CorpusInfo newLangFstInfo = new CorpusInfo(language,
+ encodedLangIndexField,encodedLangStoreField,
+ fieldType, langFstFile, langAllowCreation);
+ log.debug(" ... {} {} for explicitly configured language",
+ langFstInfo == null ? "create" : "update", newLangFstInfo);
+ addCorpusInfo(newLangFstInfo);
+ } else { //we can use the existing instance
+ addCorpusInfo(langFstInfo);
+ }
+ foundCorpus = true;
+ } else {
+ log.debug(" ... ignore explicitly configured language {} (field: {}) because "
+ + "FST file '{}' does not exist and runtime creation "
+ + "is deactivated!",new Object[]{ language,
+ langIndexFieldInfo.name, langFstFile.getAbsolutePath()});
+ }
} else {
- log.warn(" ... ignore language {} (field: {}) because "
- + "FST file '{}' does not exist and runtime creation "
- + "is deactivated!",new Object[]{ language,
- langIndexFieldInfo.name, langFstFile.getAbsolutePath()});
+ log.debug(" ... ignore explicitly configured language {} becuase unknown fieldtype "
+ + "for SolrFied {}", language, langIndexFieldInfo.name);
}
} else {
- log.warn(" ... ignore language {} becuase unknown fieldtype "
- + "for SolrFied {}", language, langIndexFieldInfo.name);
+ log.debug(" ... ignore explicitly configured language {} because configured stored Field {} "
+ + "for IndexField {} does not exist! ", new Object[]{
+ language,langStoreField,langIndexFieldInfo.name});
}
} else {
- log.warn(" ... ignore language {} because configured stored Field {} "
- + "for IndexField {} does not exist! ", new Object[]{
- language,langStoreField,langIndexFieldInfo.name});
+ log.debug(" ... ignore explicitly configured language {} because configured field {} (encoded: {}) "
+ + "is not present in the SolrIndex!", new Object[]{
+ language, langIndexField, encodedLangIndexField });
}
- } else {
- log.warn(" ... ignore language {} because configured field {} (encoded: {}) "
- + "is not present in the SolrIndex!", new Object[]{
- language, langIndexField, encodedLangIndexField });
}
+ } finally {
+ corpusInfoLock.writeLock().unlock();
}
return foundCorpus;
}
@@ -684,4 +835,5 @@ public class IndexConfiguration {
this.skipAltTokens = skipAltTokens;
}
+
}
Modified: stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/TaggingSession.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/TaggingSession.java?rev=1731820&r1=1731819&r2=1731820&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/TaggingSession.java (original)
+++ stanbol/trunk/enhancement-engines/lucenefstlinking/src/main/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/TaggingSession.java Tue Feb 23 09:43:03 2016
@@ -27,6 +27,11 @@ import java.util.Map;
import java.util.NavigableMap;
import java.util.Set;
import java.util.TreeMap;
+import java.util.concurrent.CancellationException;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
import org.apache.clerezza.rdf.core.Language;
import org.apache.clerezza.rdf.core.Literal;
@@ -134,6 +139,12 @@ public class TaggingSession implements C
SolrIndexSearcher searcher = searcherRef.get();
DirectoryReader indexReader = searcher.getIndexReader();
indexVersion = Long.valueOf(indexReader.getVersion());
+ //check if the IndexConfiguration is up to date with the version of the index
+ long confVersion = config.getVersion();
+ if(confVersion != indexVersion){
+ log.debug("> update IndexConfiguration (from: {} | to: {}",confVersion, indexVersion);
+ config.update(indexVersion, searcher);
+ }
//get the corpusInfo
CorpusInfo langCorpusInfo = config.getCorpus(language);
@@ -338,58 +349,106 @@ public class TaggingSession implements C
*/
private TaggerFstCorpus obtainFstCorpus(Long indexVersion, CorpusInfo fstInfo) throws CorpusException {
TaggerFstCorpus fstCorpus;
- synchronized (fstInfo) { // one at a time
- fstCorpus = fstInfo.getCorpus();
- if (fstCorpus == null) {
- if (fstInfo.isEnqueued()) {
- throw new CorpusException("The FST corpus for language '"
- + fstInfo.language + "' is enqueued for creation, but not yet "
- + "available. Try at a later point in time", null);
+ fstCorpus = fstInfo.getCorpus();
+ Future<TaggerFstCorpus> enqueuedCorpus = null;
+ if (fstCorpus == null) {
+ if (!fstInfo.allowCreation && fstInfo.isFstCreationError()) {
+ throw new CorpusException(fstInfo.getErrorMessage(), null);
+ }
+ fstInfo.corpusLock.readLock().lock();
+ try {
+ enqueuedCorpus = fstInfo.getEnqueued();
+ } finally {
+ fstInfo.corpusLock.readLock().unlock();
+ }
+ if(enqueuedCorpus == null && //not enqueued
+ fstInfo.allowCreation){
+ log.debug(" - enqueue creation of {}", fstInfo);
+ enqueuedCorpus = enqueue(fstInfo);
+ }
+ if(enqueuedCorpus == null){
+ throw new CorpusException("Unable to abtain Fst Corpus for " + fstInfo
+ + "(message: " + fstInfo.getErrorMessage() + ")!", null);
+ }
+ } else { //fstCorpus != null
+ //check if the current FST corpus is up to date with the Solr index
+ if(indexVersion != null && indexVersion.longValue() != fstCorpus.getIndexVersion()){
+ log.debug(" - FST corpus for language '{}' is outdated", fstInfo.language);
+ fstInfo.corpusLock.readLock().lock();
+ try {
+ enqueuedCorpus = fstInfo.getEnqueued();
+ } finally {
+ fstInfo.corpusLock.readLock().unlock();
}
- if (fstInfo.isFstCreationError()) {
- throw new CorpusException(fstInfo.getErrorMessage(), null);
+ if(enqueuedCorpus == null && //not already enqueued
+ fstInfo.allowCreation && config.getExecutorService() != null){
+ log.debug(" - enqueue creation of {}", fstInfo);
+ enqueuedCorpus = enqueue(fstInfo);
+ } else {
+ log.warn("Unable to update outdated FST corpus for language '{}' "
+ + "because runtimeCreation is {} and ExecutorServic "
+ + "is {} available!", new Object[]{fstInfo.language,
+ fstInfo.allowCreation ? "enabled" : "disabled" ,
+ config.getExecutorService() == null ? "not" : ""});
+ log.warn(" ... please adapt the Engine configuration for up "
+ + "to date FST corpora!");
}
- if (fstInfo.isFstFileError() && fstInfo.allowCreation) {
- //try to recreate the FST corpus
- if(config.getExecutorService() != null){
- // TODO: this code should get moved to a CorpusManager class
- config.getExecutorService().execute(
- new CorpusCreationTask(config, fstInfo));
- throw new CorpusException("The FST corpus for language '"
- + fstInfo.language + "' was invalid and is now "
- + "enqueued for re-creation. Retry at a later "
- + "point in time.", null);
- } else {
- throw new CorpusException(fstInfo.getErrorMessage(), null);
- }
+ } else { //FST corpus is up to date with the current Solr index version
+ log.debug("FST corpus for language '{}' is up to date", fstInfo.language);
+ }
+ }
+ //TODO: maybe make this configurable
+ int waitTime = fstCorpus == null ? 30 : 10;
+ if(enqueuedCorpus != null){ //we needed to build a new corpus
+ try {
+ log.debug(" - will wait max {}sec for creation of {}", waitTime, fstInfo);
+ fstCorpus = enqueuedCorpus.get(waitTime, TimeUnit.SECONDS);
+ } catch (InterruptedException e) {
+ Thread.currentThread().interrupt(); //recover interrupted state
+ } catch (ExecutionException e) {
+ log.warn("Unable to update outdated FST corpus " + fstInfo
+ + " (message: " + fstInfo.getErrorMessage() + ")",e);
+ } catch (TimeoutException e) {
+ if(fstCorpus != null){
+ log.debug("unable to build FST corpus for {} in time ({}sec). Will use "
+ + "previouse version ",fstInfo, waitTime);
+ } else {
+ throw new CorpusException("Unable to build Fst Corpus for " + fstInfo
+ + "within " + waitTime+ "sec! Try again later.", null);
}
- } else { //fstCorpus != null
- if(indexVersion != null && indexVersion.longValue() != fstCorpus.getIndexVersion()){
- log.info("FST corpus for language '{}' is outdated ...", fstInfo.language);
- if(fstInfo.isEnqueued()){
- log.info(" ... already sheduled for recreation. "
- + "Use outaded corpus for tagging");
- } else if(fstInfo.allowCreation && config.getExecutorService() != null){
- log.info(" ... initialise recreation");
- config.getExecutorService().execute(
- new CorpusCreationTask(config, fstInfo));
- } else {
- log.warn("Unable to update outdated FST corpus for language '{}' "
- + "because runtimeCreation is {} and ExecutorServic "
- + "is {} available!", new Object[]{fstInfo.language,
- fstInfo.allowCreation ? "enabled" : "disabled" ,
- config.getExecutorService() == null ? "not" : ""});
- log.warn(" ... please adapt the Engine configuration for up "
- + "to date FST corpora!");
- }
- } else { //FST corpus is up to date with the current Solr index version
- log.debug("FST corpus for language '{}' is up to date", fstInfo.language);
+ } catch (CancellationException e) {
+ if(fstCorpus != null){
+ log.debug("building of FST corpus for {} was cancelled. Will use "
+ + "previouse version.",fstInfo);
+ } else {
+ throw new CorpusException("Building of FST Corpus " + fstInfo
+ + "was cancelled!", null);
}
}
}
return fstCorpus;
}
/**
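+ * @param fstInfo the corpus to enqueue a {@link CorpusCreationTask} for
+ * @return the already enqueued or the newly submitted creation task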
+ */
+ private Future<TaggerFstCorpus> enqueue(CorpusInfo fstInfo) {
+ Future<TaggerFstCorpus> enqueuedCorpus;
+ fstInfo.corpusLock.writeLock().lock();
+ try {
+ enqueuedCorpus = fstInfo.getEnqueued(); //check again in write lock
+ if(enqueuedCorpus == null){
+ //enqueue for re-creation
+ enqueuedCorpus = config.getExecutorService().submit(
+ new CorpusCreationTask(config, fstInfo));
+ fstInfo.enqueued(enqueuedCorpus);;
+ }
+ } finally {
+ fstInfo.corpusLock.writeLock().unlock();
+ }
+ return enqueuedCorpus;
+ }
+ /**
* The current version of the SolrIndex as reported by the {@link IndexReader}
* used by this TaggingSession.
* @return the current version of the SolrIndex.
Modified: stanbol/trunk/enhancement-engines/lucenefstlinking/src/test/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineTest.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/lucenefstlinking/src/test/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineTest.java?rev=1731820&r1=1731819&r2=1731820&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/lucenefstlinking/src/test/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineTest.java (original)
+++ stanbol/trunk/enhancement-engines/lucenefstlinking/src/test/java/org/apache/stanbol/enhancer/engines/lucenefstlinking/FstLinkingEngineTest.java Tue Feb 23 09:43:03 2016
@@ -196,7 +196,7 @@ public class FstLinkingEngineTest {
//setup the index configuration
LanguageConfiguration langConf = new LanguageConfiguration("not.used",
new String[]{"en;field=dbpedia-ont:surfaceForm;generate=true"});
- fstConfig = new IndexConfiguration(langConf, core, FieldEncodingEnum.SolrYard);
+ fstConfig = new IndexConfiguration(langConf, core, FieldEncodingEnum.SolrYard,"");
fstConfig.setExecutorService(Executors.newFixedThreadPool(1));
fstConfig.setTypeField("rdf:type");
fstConfig.setRankingField("entityhub:entityRank");