You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2013/10/09 14:45:16 UTC

svn commit: r1530587 - in /stanbol/trunk: commons/solr/managed/src/main/java/org/apache/stanbol/commons/solr/managed/standalone/ entityhub/indexing/destination/solryard/ entityhub/indexing/destination/solryard/src/main/java/org/apache/stanbol/entityhub...

Author: rwesten
Date: Wed Oct  9 12:45:15 2013
New Revision: 1530587

URL: http://svn.apache.org/r1530587
Log:
STANBOL-1167: implementation of the FST model builder for the SolrYard Indexing Destination

Added:
    stanbol/trunk/entityhub/indexing/destination/solryard/src/main/java/org/apache/stanbol/entityhub/indexing/destination/solryard/fst/
    stanbol/trunk/entityhub/indexing/destination/solryard/src/main/java/org/apache/stanbol/entityhub/indexing/destination/solryard/fst/CorpusCreationInfo.java   (with props)
    stanbol/trunk/entityhub/indexing/destination/solryard/src/main/java/org/apache/stanbol/entityhub/indexing/destination/solryard/fst/CorpusCreationTask.java   (with props)
    stanbol/trunk/entityhub/indexing/destination/solryard/src/main/java/org/apache/stanbol/entityhub/indexing/destination/solryard/fst/FstConfig.java   (with props)
    stanbol/trunk/entityhub/indexing/destination/solryard/src/main/java/org/apache/stanbol/entityhub/indexing/destination/solryard/fst/FstModelGenerator.java   (with props)
Modified:
    stanbol/trunk/commons/solr/managed/src/main/java/org/apache/stanbol/commons/solr/managed/standalone/StandaloneEmbeddedSolrServerProvider.java
    stanbol/trunk/entityhub/indexing/destination/solryard/pom.xml
    stanbol/trunk/entityhub/indexing/destination/solryard/src/main/java/org/apache/stanbol/entityhub/indexing/destination/solryard/SolrYardIndexingDestination.java

Modified: stanbol/trunk/commons/solr/managed/src/main/java/org/apache/stanbol/commons/solr/managed/standalone/StandaloneEmbeddedSolrServerProvider.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/commons/solr/managed/src/main/java/org/apache/stanbol/commons/solr/managed/standalone/StandaloneEmbeddedSolrServerProvider.java?rev=1530587&r1=1530586&r2=1530587&view=diff
==============================================================================
--- stanbol/trunk/commons/solr/managed/src/main/java/org/apache/stanbol/commons/solr/managed/standalone/StandaloneEmbeddedSolrServerProvider.java (original)
+++ stanbol/trunk/commons/solr/managed/src/main/java/org/apache/stanbol/commons/solr/managed/standalone/StandaloneEmbeddedSolrServerProvider.java Wed Oct  9 12:45:15 2013
@@ -76,7 +76,7 @@ public class StandaloneEmbeddedSolrServe
      * by using the configName on the ManagedSolrServer referenced by 
      * {@link IndexReference#getServer()}
      */
-    public SolrServer getSolrServer(IndexReference indexRef, String configName){
+    public EmbeddedSolrServer getSolrServer(IndexReference indexRef, String configName){
         if(indexRef == null){
             throw new IllegalArgumentException("The parsed InexReference MUST NOT be NULL!");
         }

Modified: stanbol/trunk/entityhub/indexing/destination/solryard/pom.xml
URL: http://svn.apache.org/viewvc/stanbol/trunk/entityhub/indexing/destination/solryard/pom.xml?rev=1530587&r1=1530586&r2=1530587&view=diff
==============================================================================
--- stanbol/trunk/entityhub/indexing/destination/solryard/pom.xml (original)
+++ stanbol/trunk/entityhub/indexing/destination/solryard/pom.xml Wed Oct  9 12:45:15 2013
@@ -114,6 +114,12 @@
       <artifactId>org.apache.stanbol.entityhub.yard.solr</artifactId>
       <version>0.12.0-SNAPSHOT</version>
     </dependency>
+    <!-- FST model generation -->
+    <dependency>
+      <groupId>org.opensextant</groupId>
+      <artifactId>solr-text-tagger</artifactId>
+      <version>1.2</version>
+    </dependency>
 
     <dependency>
       <groupId>commons-io</groupId>

Modified: stanbol/trunk/entityhub/indexing/destination/solryard/src/main/java/org/apache/stanbol/entityhub/indexing/destination/solryard/SolrYardIndexingDestination.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/entityhub/indexing/destination/solryard/src/main/java/org/apache/stanbol/entityhub/indexing/destination/solryard/SolrYardIndexingDestination.java?rev=1530587&r1=1530586&r2=1530587&view=diff
==============================================================================
--- stanbol/trunk/entityhub/indexing/destination/solryard/src/main/java/org/apache/stanbol/entityhub/indexing/destination/solryard/SolrYardIndexingDestination.java (original)
+++ stanbol/trunk/entityhub/indexing/destination/solryard/src/main/java/org/apache/stanbol/entityhub/indexing/destination/solryard/SolrYardIndexingDestination.java Wed Oct  9 12:45:15 2013
@@ -24,25 +24,44 @@ import static org.apache.stanbol.entityh
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.FileOutputStream;
+import java.io.FileReader;
 import java.io.IOException;
+import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Dictionary;
 import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
 import java.util.Map;
 import java.util.Properties;
 import java.util.Map.Entry;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
 import java.util.zip.ZipEntry;
 import java.util.zip.ZipOutputStream;
 
 import org.apache.commons.io.FileUtils;
 import org.apache.commons.io.FilenameUtils;
 import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang.StringUtils;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.search.IndexSearcher;
 import org.apache.solr.client.solrj.SolrServer;
+import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer;
+import org.apache.solr.core.SolrCore;
+import org.apache.solr.schema.IndexSchema;
+import org.apache.solr.search.SolrIndexSearcher;
+import org.apache.solr.util.RefCounted;
 import org.apache.stanbol.entityhub.core.mapping.FieldMappingUtils;
 import org.apache.stanbol.entityhub.core.site.CacheUtils;
 import org.apache.stanbol.entityhub.indexing.core.IndexingDestination;
 import org.apache.stanbol.entityhub.indexing.core.config.IndexingConfig;
 import org.apache.stanbol.entityhub.indexing.core.destination.OsgiConfigurationUtil;
+import org.apache.stanbol.entityhub.indexing.destination.solryard.fst.CorpusCreationInfo;
+import org.apache.stanbol.entityhub.indexing.destination.solryard.fst.CorpusCreationTask;
+import org.apache.stanbol.entityhub.indexing.destination.solryard.fst.FstConfig;
 import org.apache.stanbol.entityhub.servicesapi.mapping.FieldMapper;
 import org.apache.stanbol.entityhub.servicesapi.mapping.FieldMapping;
 import org.apache.stanbol.entityhub.servicesapi.model.rdf.RdfResourceEnum;
@@ -140,6 +159,18 @@ public class SolrYardIndexingDestination
     public static final boolean DEFAULT_SYNCHRONIZED_STATE = true;
     
     /**
+     * The name of the properties file containing the FST configuration.<p>
+     * If not present no FST models will be created in the {@link #finalise()}
+     * state.
+     */
+    public static final String FST_CONF = "fstConf";
+    /**
+     * The number of Threads used to concurrently build FST models
+     */
+    public static final String FST_THREADS = "fstThreads";
+    
+    private static final int DEFAULT_FST_THREADS = 4;
+    /**
      * The location of the SolrIndex. This MUST BE an absolute Path in case it 
      * refers to a directory of the local file system and <code>null</code> in
      * case an external SolrServer is used.
@@ -179,6 +210,25 @@ public class SolrYardIndexingDestination
     private Collection<FieldMapping> indexFieldConfiguration;
 
     private IndexingConfig indexingConfig;
+
+    /*
+     * Fields required for the FST model creation 
+     */
+    /**
+     * The SolrCore used by the {@link #solrYard}
+     */
+    private SolrCore core;
+    /**
+     * The FST configurations. Parsed in the {@link #setConfiguration(Map)}
+     * and initialised during {@link #initialise()}. <code>null</code> if no
+     * {@link #FST_CONF} is set.
+     */
+    private List<FstConfig> fstConfigs;
+    /**
+     * The number of threads used to build FST models. 
+     * Set in {@link #setConfiguration(Map)}
+     */
+    private int fstThreads;
     
     /**
      * This Constructor relays on a subsequent call to 
@@ -427,12 +477,69 @@ public class SolrYardIndexingDestination
                 } catch (Exception e) {
                     //throw exception for any invalid entry!
                     throw new IllegalArgumentException(String.format(
-                        "Unable to parse Field Boost entry from field {} and boost {}",
+                        "Unable to parse Field Boost entry from field %s and boost %s",
                         entry.getKey(),entry.getValue()),e);
                 }
             }
             solrYardConfig.setFieldBoosts(fieldBoosts);
         }
+        //read the FST config
+        value = config.get(FST_CONF);
+        if(value != null && !StringUtils.isBlank(value.toString())){
+            File fstConfigFile = indexingConfig.getConfigFile(value.toString());
+            if(!fstConfigFile.isFile()){
+                throw new IllegalArgumentException(String.format(
+                    "Unable to find configured FST configuration file %s",
+                    fstConfigFile));
+            }
+            Collection<String> lines;
+            try {
+                lines = FileUtils.readLines(fstConfigFile, "UTF-8");
+            } catch (IOException e) {
+                throw new IllegalArgumentException(String.format(
+                    "Unable to read FST configuration file %s",
+                    fstConfigFile),e);
+            }
+            fstConfigs = new ArrayList<FstConfig>();
+            for(String line : lines){
+                line = line.trim();
+                if(!line.isEmpty() && line.charAt(0) != '#'){
+                    String[] fields = new String[] {null,null};
+                    int index = -1;
+                    for(String part : line.split("=|;")){
+                        if(index >= 0){
+                            fields[index] = part;
+                            index = -1;
+                        } else if("index".equalsIgnoreCase(part)){
+                            index = 0;
+                        } else if("store".equalsIgnoreCase(part)){
+                            index = 1;
+                        }
+                    }
+                    if(fields[0] == null){
+                        throw new IllegalArgumentException("Invalid FST configuration "
+                            + "line: "+line +". Param 'index={field}' is required "
+                            + "(syntax: 'index={field};store={field}', 'store is optional'')!");
+                    }
+                    fstConfigs.add(new FstConfig(fields[0], fields[1]));
+                }
+            }
+        }
+        value = config.get(FST_THREADS);
+        if(value instanceof Number){
+            fstThreads = ((Number)value).intValue();
+        } else if(value != null){
+            try {
+                fstThreads = Integer.parseInt(value.toString());
+            }catch (NumberFormatException e) {
+                throw new IllegalArgumentException("Unable to parse the FST thread number from "
+                    +value.toString(), e);
+            }
+        }
+        if(fstThreads <= 0){
+            fstThreads = DEFAULT_FST_THREADS;
+        }
+        
     }
     /**
      * Creates a {@link SolrYardConfig} and initialised it to used single Yard
@@ -461,7 +568,7 @@ public class SolrYardIndexingDestination
         //parameters and initialise the member variables. This method performs 
         //the the actual initialisation of the SolrYard!
         //copy a custom configuration (if present)
-        SolrServer server;
+        EmbeddedSolrServer server;
         IndexReference solrServerRef = IndexReference.parse(solrYardConfig.getSolrServerLocation());
         if(solrIndexConfig != null){ //can only be != null if also solrIndexLocation
             //copy the configuration
@@ -485,6 +592,7 @@ public class SolrYardIndexingDestination
         }
         log.info("   ... create SolrYard");
         this.solrYard = new SolrYard(server,solrYardConfig,indexingConfig.getNamespacePrefixService());
+        this.core = server.getCoreContainer().getCore(solrServerRef.getIndex());
     }
 
     @Override
@@ -505,13 +613,81 @@ public class SolrYardIndexingDestination
         } catch (YardException e) {
             log.error("Unable to store FieldMapperConfiguration to the Store!",e);
         }
+        log.info(" ... optimize SolrCore");
         try {
             solrYard.optimize();
         } catch (YardException e) {
             log.error("Unable to optimize SolrIndex after indexing! IndexArchive will not be optimized ...",e);
         }
+        //build the FST models
+        if(fstConfigs != null){
+            //(1) FST config initialisation
+            log.info(" ... init FST configuration(s)");
+            IndexSchema schema = core.getLatestSchema();
+            File fstDir = new File(new File(core.getDataDir()),"fst");
+            if(!fstDir.isDirectory()){
+                try {
+                    FileUtils.forceMkdir(fstDir);
+                } catch (IOException e) {
+                    throw new IllegalStateException("Unable to create Directory "
+                        + fstDir.getAbsolutePath() + "for storing the FST models "
+                        + "of SolrCore "+core.getName());
+                }
+            }
+            RefCounted<SolrIndexSearcher> searcherRef = core.getSearcher();
+            try {
+                for(FstConfig fstConfig : fstConfigs){
+                    fstConfig.setFstDirectory(fstDir); //set the FST directory
+                    log.info("> FST config {}", fstConfig);
+                    fstConfig.buildConfig(schema, searcherRef.get().getAtomicReader());
+                    for(CorpusCreationInfo corpus : fstConfig.getCorpusCreationInfos()){
+                        log.info("  - {}",corpus);
+                    }
+                }
+            } finally {
+                searcherRef.decref();
+            }
+
+            List<Future<?>> fstCreationTasks = new ArrayList<Future<?>>();
+            ExecutorService es = Executors.newFixedThreadPool(fstThreads);
+            log.info(" ... build FST models ");
+            for(FstConfig config : fstConfigs){
+                for(final CorpusCreationInfo corpus : config.getCorpusCreationInfos()){
+                    fstCreationTasks.add(es.submit(new CorpusCreationTask(core, corpus)));
+                }
+            }
+            //now wait for the completion of the tasks
+            Iterator<Future<?>> taskIt = fstCreationTasks.iterator();
+            while(taskIt.hasNext()){
+                Future<?> task = taskIt.next();
+                try {
+                    task.get(); //wait until ready
+                    taskIt.remove();
+                } catch (ExecutionException e) {
+                    log.error("Exception while building FST models for SolrCore "
+                            + core.getName(),e);
+                } catch (InterruptedException e) {
+                    log.error("Interupped while building FST models for SolrCore "
+                            + core.getName(),e);
+                    Thread.currentThread().interrupt();
+                    
+                }
+            }
+            if(!fstCreationTasks.isEmpty()){
+                log.warn("Unable to build {} FST models for SolrCore {}",
+                    fstCreationTasks.size(), core.getName());
+            } else {
+                log.info("All FST modles for SolrCore {} build successfully!",
+                    core.getName());
+            }
+        } //no FST modles to build
+        
+        //all Solr specific stuff is now ready
+        log.info(" ... close SolrCore");
         solrYard.close();
+        
         //zip the index and copy it over to distribution
+        log.info(" ... build Solr index archive");
         if(solrArchive != null){
             try {
                 writeSolrIndexArchive();

Added: stanbol/trunk/entityhub/indexing/destination/solryard/src/main/java/org/apache/stanbol/entityhub/indexing/destination/solryard/fst/CorpusCreationInfo.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/entityhub/indexing/destination/solryard/src/main/java/org/apache/stanbol/entityhub/indexing/destination/solryard/fst/CorpusCreationInfo.java?rev=1530587&view=auto
==============================================================================
--- stanbol/trunk/entityhub/indexing/destination/solryard/src/main/java/org/apache/stanbol/entityhub/indexing/destination/solryard/fst/CorpusCreationInfo.java (added)
+++ stanbol/trunk/entityhub/indexing/destination/solryard/src/main/java/org/apache/stanbol/entityhub/indexing/destination/solryard/fst/CorpusCreationInfo.java Wed Oct  9 12:45:15 2013
@@ -0,0 +1,99 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements.  See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License.  You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.stanbol.entityhub.indexing.destination.solryard.fst;
+
+import java.io.File;
+
+import org.apache.commons.lang.ObjectUtils;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.solr.schema.FieldType;
+
+/**
+ * Holds the information required to build an FST corpus for a given language.
+ * All fields are final and assigned by the constructor.
+ * @author Rupert Westenthaler
+ *
+ */
+public class CorpusCreationInfo {
+
+    /**
+     * The language
+     */
+    public final String language;
+    /**
+     * The Corpus FST
+     */
+    public final File fst;
+    /**
+     * The Solr field used for FST indexing (already encoded)
+     */
+    public final String indexedField;
+    /**
+     * The Solr stored field holding the labels indexed in the FST corpus 
+     */
+    public final String storedField;
+    /**
+     * TODO: partial matches are currently deactivated
+     */
+    public final boolean partialMatches = false;
+    /**
+     * The Solr {@link Analyzer} used for the field
+     */
+    public final Analyzer analyzer;
+    
+    /** 
+     * Creates the creation info for a single FST corpus.
+     * @param language the language of the corpus
+     * @param indexField the Solr field used for FST indexing
+     * @param storeField the Solr stored field holding the labels
+     * @param fieldType the Solr field type of the indexed field. Its
+     * {@link FieldType#getAnalyzer() analyzer} is used as {@link #analyzer}
+     * @param fst the file of the FST corpus
+     */
+    protected CorpusCreationInfo(String language, String indexField, String storeField, FieldType fieldType, File fst){
+        this.language = language;
+        this.indexedField = indexField;
+        this.storedField = storeField;
+        this.fst = fst;
+        this.analyzer = fieldType.getAnalyzer();
+    }
+    
+    /**
+     * Human readable summary: language, field(s), FST file name and if the
+     * FST file currently exists.
+     */
+    @Override
+    public String toString() {
+        StringBuilder sb = new StringBuilder("FST Info[language: ").append(language);
+        if(indexedField.equals(storedField)){
+            sb.append(" | field: ").append(indexedField);
+        } else {
+            sb.append(" | fields(index:").append(indexedField).append(", stored:")
+                .append(storedField).append(')');
+        }
+        sb.append(" | file: ").append(fst.getName())
+            .append("(exists: ").append(fst.isFile()).append(')')
+            .append("]");
+        return sb.toString();
+    }
+    
+    /**
+     * Based on the {@link #indexedField} only. This is consistent with
+     * {@link #equals(Object)} as equal instances share the same indexed field.
+     */
+    @Override
+    public int hashCode() {
+        return indexedField.hashCode();
+    }
+    
+    /**
+     * Two instances are equal if they share the same {@link #indexedField},
+     * {@link #storedField} and {@link #language}.
+     */
+    @Override
+    public boolean equals(Object obj) {
+        if(this == obj){
+            return true;
+        }
+        if(!(obj instanceof CorpusCreationInfo)){
+            return false;
+        }
+        CorpusCreationInfo other = (CorpusCreationInfo)obj;
+        //NOTE: the language of the OTHER instance needs to be compared with
+        //the language of this one (null-safe as the language might be null)
+        return other.indexedField.equals(indexedField) &&
+                other.storedField.equals(storedField) &&
+                ObjectUtils.equals(other.language, language);
+    }
+}

Propchange: stanbol/trunk/entityhub/indexing/destination/solryard/src/main/java/org/apache/stanbol/entityhub/indexing/destination/solryard/fst/CorpusCreationInfo.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: stanbol/trunk/entityhub/indexing/destination/solryard/src/main/java/org/apache/stanbol/entityhub/indexing/destination/solryard/fst/CorpusCreationTask.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/entityhub/indexing/destination/solryard/src/main/java/org/apache/stanbol/entityhub/indexing/destination/solryard/fst/CorpusCreationTask.java?rev=1530587&view=auto
==============================================================================
--- stanbol/trunk/entityhub/indexing/destination/solryard/src/main/java/org/apache/stanbol/entityhub/indexing/destination/solryard/fst/CorpusCreationTask.java (added)
+++ stanbol/trunk/entityhub/indexing/destination/solryard/src/main/java/org/apache/stanbol/entityhub/indexing/destination/solryard/fst/CorpusCreationTask.java Wed Oct  9 12:45:15 2013
@@ -0,0 +1,89 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements.  See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License.  You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.stanbol.entityhub.indexing.destination.solryard.fst;
+
+import java.io.IOException;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ExecutorService;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.lucene.index.IndexReader;
+import org.apache.solr.core.SolrCore;
+import org.apache.solr.search.SolrIndexSearcher;
+import org.apache.solr.util.RefCounted;
+import org.opensextant.solrtexttagger.TaggerFstCorpus;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Builds a single FST corpus for a SolrCore. The creation is implemented as
+ * a {@link Runnable} so that callers can decide - by the configuration of the
+ * {@link ExecutorService} the task is submitted to - how corpora are built
+ * (e.g. how many can be built at a time).
+ * @author Rupert Westenthaler
+ *
+ */
+public class CorpusCreationTask implements Runnable{
+
+    private final Logger log = LoggerFactory.getLogger(CorpusCreationTask.class);
+    
+    /** Describes the corpus (fields, analyzer, target file) to build */
+    CorpusCreationInfo corpusInfo;
+    /** The SolrCore the corpus is built from */
+    SolrCore core;
+    
+    /**
+     * @param core the SolrCore providing the index data
+     * @param corpus the description of the FST corpus to build
+     */
+    public CorpusCreationTask(SolrCore core, CorpusCreationInfo corpus){
+        this.core = core;
+        this.corpusInfo = corpus;
+    }
+    
+    /**
+     * Builds the {@link TaggerFstCorpus} from the current searcher of the
+     * {@link #core} and saves it to the file of the {@link #corpusInfo}.
+     * @throws IllegalStateException if reading from the Solr index fails.
+     * Failures while saving the FST file are only logged.
+     */
+    @Override
+    public void run() {
+        final TaggerFstCorpus fstCorpus;
+        final RefCounted<SolrIndexSearcher> ref = core.getSearcher();
+        try {
+            final SolrIndexSearcher indexSearcher = ref.get();
+            //use the AtomicReader, because TaggerFstCorpus will need it
+            //anyways. This avoids creating another SlowCompositeReaderWrapper.
+            final IndexReader atomicReader = indexSearcher.getAtomicReader();
+            log.info(" ... build {}", corpusInfo);
+            //NOTE(review): the trailing '1,200' are presumably the min/max
+            //phrase length - confirm against the TaggerFstCorpus constructor
+            fstCorpus = new TaggerFstCorpus(atomicReader, 
+                indexSearcher.getIndexReader().getVersion(), null, 
+                corpusInfo.indexedField, corpusInfo.storedField, 
+                corpusInfo.analyzer, corpusInfo.partialMatches,1,200);
+        } catch (IOException e) {
+            final String message = "Unable to read Information to build "
+                    + corpusInfo + " from SolrIndex '" + core.getName() + "'!";
+            throw new IllegalStateException(message, e);
+        } finally {
+            ref.decref(); //always release the reference to the searcher
+        }
+        //remove an already existing FST file before writing the new one
+        if(corpusInfo.fst.exists() && !FileUtils.deleteQuietly(corpusInfo.fst)){
+            log.warn("Unable to delete existing FST fiel for {}",corpusInfo);
+        }
+        try {
+            fstCorpus.save(corpusInfo.fst);
+        } catch (IOException e) {
+            log.warn("Unable to store FST corpus " + corpusInfo + " to "
+                    + corpusInfo.fst.getAbsolutePath() + "!", e);
+        }
+    }
+    
+    @Override
+    public String toString() {
+        return "Task: building " + corpusInfo + " for SolrCore " + core.getName();
+    }
+
+}

Propchange: stanbol/trunk/entityhub/indexing/destination/solryard/src/main/java/org/apache/stanbol/entityhub/indexing/destination/solryard/fst/CorpusCreationTask.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: stanbol/trunk/entityhub/indexing/destination/solryard/src/main/java/org/apache/stanbol/entityhub/indexing/destination/solryard/fst/FstConfig.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/entityhub/indexing/destination/solryard/src/main/java/org/apache/stanbol/entityhub/indexing/destination/solryard/fst/FstConfig.java?rev=1530587&view=auto
==============================================================================
--- stanbol/trunk/entityhub/indexing/destination/solryard/src/main/java/org/apache/stanbol/entityhub/indexing/destination/solryard/fst/FstConfig.java (added)
+++ stanbol/trunk/entityhub/indexing/destination/solryard/src/main/java/org/apache/stanbol/entityhub/indexing/destination/solryard/fst/FstConfig.java Wed Oct  9 12:45:15 2013
@@ -0,0 +1,186 @@
+package org.apache.stanbol.entityhub.indexing.destination.solryard.fst;
+
+import java.io.File;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.commons.io.FilenameUtils;
+import org.apache.commons.lang.StringUtils;
+import org.apache.lucene.index.AtomicReader;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.FieldInfos;
+import org.apache.solr.schema.FieldType;
+import org.apache.solr.schema.IndexSchema;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Configuration for the FST models built for a single index field. The
+ * language specific {@link CorpusCreationInfo}s are initialised by
+ * {@link #buildConfig(IndexSchema, AtomicReader)} based on the fields present
+ * in the Solr index.
+ */
+public class FstConfig {
+    
+    protected final Logger log = LoggerFactory.getLogger(FstConfig.class);
+    
+    /** The (not language encoded) name of the field used for FST indexing */
+    private final String indexField;
+    /** The (not language encoded) name of the stored field. Never <code>null</code> */
+    private final String storeField;
+    /** The '<code>{name}</code>' part of the FST file names */
+    private final String fstName;
+    private File fstDirectory;
+    /**
+     * FST corpus configuration (language -&gt; corpus creation info)
+     */
+    private Map<String,CorpusCreationInfo> corpusInfos = new HashMap<String,CorpusCreationInfo>();
+
+    /**
+     * Creates a configuration that uses the index field also as stored field
+     * @param indexField the field used for FST indexing
+     */
+    public FstConfig(String indexField){
+        this(indexField,null);
+    }
+    
+    /**
+     * @param indexField the field used for FST indexing
+     * @param storeField the stored field holding the labels. If 
+     * <code>null</code> the indexField is used
+     */
+    public FstConfig(String indexField, String storeField){
+        this.indexField = indexField;
+        this.storeField = storeField == null ? indexField : storeField;
+        this.fstName = getFstFileName(indexField);
+    }
+
+    public void setFstDirectory(File fstDirectory) {
+        this.fstDirectory = fstDirectory;
+    }
+    
+    public File getFstDirectory() {
+        return fstDirectory;
+    }
+    
+    /**
+     * Registers the parsed corpus for its language
+     * @param corpus the corpus or <code>null</code> (ignored)
+     * @return the corpus previously registered for the same language or
+     * <code>null</code> if none
+     */
+    protected final CorpusCreationInfo addCorpus(CorpusCreationInfo corpus){
+        if(corpus != null){
+            return corpusInfos.put(corpus.language, corpus);
+        } else {
+            return null;
+        }
+    }
+    
+    public Collection<CorpusCreationInfo> getCorpusCreationInfos(){
+        return Collections.unmodifiableCollection(corpusInfos.values());
+    }
+    
+    public CorpusCreationInfo getCorpusCreationInfo(String language){
+        return corpusInfos.get(language);
+    }
+    public boolean isLanguage(String language){
+        return corpusInfos.containsKey(language);
+    }
+    
+    public Set<String> getLanguages(){
+        return Collections.unmodifiableSet(corpusInfos.keySet());
+    }
+    /**
+     * Inspects the SolrCore to get defined languages for the configured
+     * {@link #indexField} and {@link #storeField}. Initialises the
+     * {@link #getCorpusCreationInfos()}
+     * @param schema the schema of the SolrCore
+     * @param indexReader the index reader of the SolrCore
+     */
+    public void buildConfig(IndexSchema schema, AtomicReader indexReader){
+        FieldInfos fieldInfos = indexReader.getFieldInfos(); //iterated and used for lookups
+        String fieldWildcard = encodeLanguage(indexField,"*");
+        for(FieldInfo fieldInfo : fieldInfos){
+            //try to match the field names against the wildcard
+            if(!FilenameUtils.wildcardMatch(fieldInfo.name, fieldWildcard)){
+                continue; //Solr field does not match the field definition in the config
+            }
+            //for matches parse the language from the field name
+            String language = parseLanguage(fieldInfo.name, indexField);
+            if(language == null){
+                continue; //matched the wildcard, but has not passed the encoding test
+            }
+            //get the FieldType of the field from the Solr schema
+            FieldType fieldType = schema.getFieldTypeNoEx(fieldInfo.name);
+            if(fieldType == null){
+                log.warn(" ... ignore language {} becuase unknown fieldtype "
+                    + "for SolrFied {}",language,fieldInfo.name);
+                continue;
+            }
+            //check that the stored field with the labels is present in the 
+            //index. NOTE: the storeField is never null as the constructor 
+            //falls back to the indexField.
+            String storeFieldName = encodeLanguage(storeField, language);
+            if(fieldInfos.fieldInfo(storeFieldName) == null){
+                log.warn(" ... ignore language {} because Stored Field {} "
+                        + "for IndexField {} does not exist! ", new Object[]{
+                        language,storeFieldName,fieldInfo.name});
+                continue;
+            }
+            //valid configuration: generate the FST file name
+            StringBuilder fstFileName = new StringBuilder(fstName);
+            if(!language.isEmpty()){
+                fstFileName.append('.').append(language);
+            }
+            fstFileName.append(".fst");
+            File fstFile = new File(fstDirectory,fstFileName.toString());
+            CorpusCreationInfo fstInfo = new CorpusCreationInfo(language, 
+                fieldInfo.name, storeFieldName,  
+                fieldType, fstFile);
+            log.debug(" ... init {} ", fstInfo);
+            addCorpus(fstInfo);
+        } // end iterate over all fields in the SolrIndex        
+    }
+    
+    /**
+     * Encodes the field and language as '<code>@{language}/{field}/</code>'
+     * @param field the field name
+     * @param language the language
+     * @return the encoded field name
+     */
+    protected static String encodeLanguage(String field, String language){
+        StringBuilder sb = new StringBuilder();
+        sb.append('@').append(language).append('/');
+        sb.append(field).append('/');
+        return sb.toString();
+
+    }
+    
+    /**
+     * Parses the language from a field name expected to use the
+     * '<code>@{lang}/{field}/</code>' encoding.
+     * @param value the encoded field name
+     * @param field the (not encoded) field name
+     * @return the language or <code>null</code> if the value does not match
+     */
+    protected static String parseLanguage(String value, String field){
+        int atIndex = value.indexOf('@');
+        int slashIndex = value.indexOf('/');
+        //expect @{lang}/{field}/
+        if(value.indexOf(field, slashIndex) != value.length()-1-field.length()){
+            return null; //no match
+        }
+        if(atIndex == 0 && slashIndex > 0){
+            return value.substring(1,slashIndex);
+        } else {
+            return null;//no match
+        }
+    }
+    
+    /**
+     * Getter for the default FST file name based on the configured field
+     * name. This method returns the '<code>{name}</code>' part of the
+     * '<code>{name}.{lang}.fst</code>' name. Every code point that is not
+     * a letter or digit is replaced by a single '<code>_</code>'.
+     * @param fstFieldName the field name.
+     * @return the '<code>{name}</code>' part of the'<code>{name}.{lang}.fst</code>' name
+     */
+    protected static String getFstFileName(final String fstFieldName) {
+        String fstName;
+        if(!StringUtils.isAlphanumeric(fstFieldName)) {
+            StringBuilder escaped = new StringBuilder(fstFieldName.length());
+            //iterate over code points (not chars) so that the low surrogate
+            //of a supplementary character is not processed a second time
+            for(int i = 0; i < fstFieldName.length();){
+                int codepoint = fstFieldName.codePointAt(i);
+                if(Character.isLetterOrDigit(codepoint)){
+                    escaped.appendCodePoint(codepoint);
+                } else {
+                    escaped.append('_');
+                }
+                i += Character.charCount(codepoint);
+            }
+            fstName = escaped.toString();
+        } else {
+            fstName = fstFieldName;
+        }
+        return fstName;
+    }    
+    
+    @Override
+    public String toString() {
+        return new StringBuilder("FSTConfig[index: ").append(indexField)
+                .append(" | store: ").append(storeField).append(']').toString();
+    }
+}

Propchange: stanbol/trunk/entityhub/indexing/destination/solryard/src/main/java/org/apache/stanbol/entityhub/indexing/destination/solryard/fst/FstConfig.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: stanbol/trunk/entityhub/indexing/destination/solryard/src/main/java/org/apache/stanbol/entityhub/indexing/destination/solryard/fst/FstModelGenerator.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/entityhub/indexing/destination/solryard/src/main/java/org/apache/stanbol/entityhub/indexing/destination/solryard/fst/FstModelGenerator.java?rev=1530587&view=auto
==============================================================================
--- stanbol/trunk/entityhub/indexing/destination/solryard/src/main/java/org/apache/stanbol/entityhub/indexing/destination/solryard/fst/FstModelGenerator.java (added)
+++ stanbol/trunk/entityhub/indexing/destination/solryard/src/main/java/org/apache/stanbol/entityhub/indexing/destination/solryard/fst/FstModelGenerator.java Wed Oct  9 12:45:15 2013
@@ -0,0 +1,15 @@
+package org.apache.stanbol.entityhub.indexing.destination.solryard.fst;
+
+import org.apache.solr.core.SolrCore;
+
+/**
+ * Holds the {@link SolrCore} and the {@link FstConfig} an FST model is
+ * generated for. In this revision the class only stores both references;
+ * it does not define any generation logic.
+ */
+public class FstModelGenerator {
+
+    /** The SolrCore providing the index data */
+    private final SolrCore core;
+    /** The FST configuration */
+    private final FstConfig config;
+
+    /**
+     * @param core the SolrCore
+     * @param config the FST configuration
+     */
+    protected FstModelGenerator(SolrCore core, FstConfig config) {
+        this.config = config;
+        this.core = core;
+    }
+    
+}

Propchange: stanbol/trunk/entityhub/indexing/destination/solryard/src/main/java/org/apache/stanbol/entityhub/indexing/destination/solryard/fst/FstModelGenerator.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain