You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2013/10/09 14:45:16 UTC
svn commit: r1530587 - in /stanbol/trunk:
commons/solr/managed/src/main/java/org/apache/stanbol/commons/solr/managed/standalone/
entityhub/indexing/destination/solryard/
entityhub/indexing/destination/solryard/src/main/java/org/apache/stanbol/entityhub...
Author: rwesten
Date: Wed Oct 9 12:45:15 2013
New Revision: 1530587
URL: http://svn.apache.org/r1530587
Log:
STANBOL-1167: implementation of the FST model builder for the SolrYard Indexing Destination
Added:
stanbol/trunk/entityhub/indexing/destination/solryard/src/main/java/org/apache/stanbol/entityhub/indexing/destination/solryard/fst/
stanbol/trunk/entityhub/indexing/destination/solryard/src/main/java/org/apache/stanbol/entityhub/indexing/destination/solryard/fst/CorpusCreationInfo.java (with props)
stanbol/trunk/entityhub/indexing/destination/solryard/src/main/java/org/apache/stanbol/entityhub/indexing/destination/solryard/fst/CorpusCreationTask.java (with props)
stanbol/trunk/entityhub/indexing/destination/solryard/src/main/java/org/apache/stanbol/entityhub/indexing/destination/solryard/fst/FstConfig.java (with props)
stanbol/trunk/entityhub/indexing/destination/solryard/src/main/java/org/apache/stanbol/entityhub/indexing/destination/solryard/fst/FstModelGenerator.java (with props)
Modified:
stanbol/trunk/commons/solr/managed/src/main/java/org/apache/stanbol/commons/solr/managed/standalone/StandaloneEmbeddedSolrServerProvider.java
stanbol/trunk/entityhub/indexing/destination/solryard/pom.xml
stanbol/trunk/entityhub/indexing/destination/solryard/src/main/java/org/apache/stanbol/entityhub/indexing/destination/solryard/SolrYardIndexingDestination.java
Modified: stanbol/trunk/commons/solr/managed/src/main/java/org/apache/stanbol/commons/solr/managed/standalone/StandaloneEmbeddedSolrServerProvider.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/commons/solr/managed/src/main/java/org/apache/stanbol/commons/solr/managed/standalone/StandaloneEmbeddedSolrServerProvider.java?rev=1530587&r1=1530586&r2=1530587&view=diff
==============================================================================
--- stanbol/trunk/commons/solr/managed/src/main/java/org/apache/stanbol/commons/solr/managed/standalone/StandaloneEmbeddedSolrServerProvider.java (original)
+++ stanbol/trunk/commons/solr/managed/src/main/java/org/apache/stanbol/commons/solr/managed/standalone/StandaloneEmbeddedSolrServerProvider.java Wed Oct 9 12:45:15 2013
@@ -76,7 +76,7 @@ public class StandaloneEmbeddedSolrServe
* by using the configName on the ManagedSolrServer referenced by
* {@link IndexReference#getServer()}
*/
- public SolrServer getSolrServer(IndexReference indexRef, String configName){
+ public EmbeddedSolrServer getSolrServer(IndexReference indexRef, String configName){
if(indexRef == null){
throw new IllegalArgumentException("The parsed InexReference MUST NOT be NULL!");
}
Modified: stanbol/trunk/entityhub/indexing/destination/solryard/pom.xml
URL: http://svn.apache.org/viewvc/stanbol/trunk/entityhub/indexing/destination/solryard/pom.xml?rev=1530587&r1=1530586&r2=1530587&view=diff
==============================================================================
--- stanbol/trunk/entityhub/indexing/destination/solryard/pom.xml (original)
+++ stanbol/trunk/entityhub/indexing/destination/solryard/pom.xml Wed Oct 9 12:45:15 2013
@@ -114,6 +114,12 @@
<artifactId>org.apache.stanbol.entityhub.yard.solr</artifactId>
<version>0.12.0-SNAPSHOT</version>
</dependency>
+ <!-- FST model generation -->
+ <dependency>
+ <groupId>org.opensextant</groupId>
+ <artifactId>solr-text-tagger</artifactId>
+ <version>1.2</version>
+ </dependency>
<dependency>
<groupId>commons-io</groupId>
Modified: stanbol/trunk/entityhub/indexing/destination/solryard/src/main/java/org/apache/stanbol/entityhub/indexing/destination/solryard/SolrYardIndexingDestination.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/entityhub/indexing/destination/solryard/src/main/java/org/apache/stanbol/entityhub/indexing/destination/solryard/SolrYardIndexingDestination.java?rev=1530587&r1=1530586&r2=1530587&view=diff
==============================================================================
--- stanbol/trunk/entityhub/indexing/destination/solryard/src/main/java/org/apache/stanbol/entityhub/indexing/destination/solryard/SolrYardIndexingDestination.java (original)
+++ stanbol/trunk/entityhub/indexing/destination/solryard/src/main/java/org/apache/stanbol/entityhub/indexing/destination/solryard/SolrYardIndexingDestination.java Wed Oct 9 12:45:15 2013
@@ -24,25 +24,44 @@ import static org.apache.stanbol.entityh
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
+import java.io.FileReader;
import java.io.IOException;
+import java.util.ArrayList;
import java.util.Collection;
import java.util.Dictionary;
import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Map.Entry;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang.StringUtils;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.search.IndexSearcher;
import org.apache.solr.client.solrj.SolrServer;
+import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer;
+import org.apache.solr.core.SolrCore;
+import org.apache.solr.schema.IndexSchema;
+import org.apache.solr.search.SolrIndexSearcher;
+import org.apache.solr.util.RefCounted;
import org.apache.stanbol.entityhub.core.mapping.FieldMappingUtils;
import org.apache.stanbol.entityhub.core.site.CacheUtils;
import org.apache.stanbol.entityhub.indexing.core.IndexingDestination;
import org.apache.stanbol.entityhub.indexing.core.config.IndexingConfig;
import org.apache.stanbol.entityhub.indexing.core.destination.OsgiConfigurationUtil;
+import org.apache.stanbol.entityhub.indexing.destination.solryard.fst.CorpusCreationInfo;
+import org.apache.stanbol.entityhub.indexing.destination.solryard.fst.CorpusCreationTask;
+import org.apache.stanbol.entityhub.indexing.destination.solryard.fst.FstConfig;
import org.apache.stanbol.entityhub.servicesapi.mapping.FieldMapper;
import org.apache.stanbol.entityhub.servicesapi.mapping.FieldMapping;
import org.apache.stanbol.entityhub.servicesapi.model.rdf.RdfResourceEnum;
@@ -140,6 +159,18 @@ public class SolrYardIndexingDestination
public static final boolean DEFAULT_SYNCHRONIZED_STATE = true;
/**
+ * The name of the properties file containing the FST configuration.<p>
+ * If not present no FST models will be created in the {@link #finalise()}
+ * state.
+ */
+ public static final String FST_CONF = "fstConf";
+ /**
+ * The number of Threads used to concurrently build FST models
+ */
+ public static final String FST_THREADS = "fstThreads";
+
+ private static final int DEFAULT_FST_THREADS = 4;
+ /**
* The location of the SolrIndex. This MUST BE an absolute Path in case it
* refers to a directory of the local file system and <code>null</code> in
* case an external SolrServer is used.
@@ -179,6 +210,25 @@ public class SolrYardIndexingDestination
private Collection<FieldMapping> indexFieldConfiguration;
private IndexingConfig indexingConfig;
+
+ /*
+ * Fields required for the FST model creation
+ */
+ /**
+ * The SolrCore used by the {@link #solrYard}
+ */
+ private SolrCore core;
+ /**
+ * The FST configurations. Parsed in the {@link #setConfiguration(Map)}
+ * and initialised during {@link #initialise()}. <code>null</code> if no
+ * {@link #FST_CONF} is set.
+ */
+ private List<FstConfig> fstConfigs;
+ /**
+ * The number of threads used to build FST models.
+ * Set in {@link #setConfiguration(Map)}
+ */
+ private int fstThreads;
/**
* This Constructor relays on a subsequent call to
@@ -427,12 +477,69 @@ public class SolrYardIndexingDestination
} catch (Exception e) {
//throw exception for any invalid entry!
throw new IllegalArgumentException(String.format(
- "Unable to parse Field Boost entry from field {} and boost {}",
+ "Unable to parse Field Boost entry from field %s and boost %s",
entry.getKey(),entry.getValue()),e);
}
}
solrYardConfig.setFieldBoosts(fieldBoosts);
}
+ //read the FST config
+ value = config.get(FST_CONF);
+ if(value != null && !StringUtils.isBlank(value.toString())){
+ File fstConfigFile = indexingConfig.getConfigFile(value.toString());
+ if(!fstConfigFile.isFile()){
+ throw new IllegalArgumentException(String.format(
+ "Unable to find configured FST configuration file %s",
+ fstConfigFile));
+ }
+ Collection<String> lines;
+ try {
+ lines = FileUtils.readLines(fstConfigFile, "UTF-8");
+ } catch (IOException e) {
+ throw new IllegalArgumentException(String.format(
+ "Unable to read FST configuration file %s",
+ fstConfigFile),e);
+ }
+ fstConfigs = new ArrayList<FstConfig>();
+ for(String line : lines){
+ line = line.trim();
+ if(!line.isEmpty() && line.charAt(0) != '#'){
+ String[] fields = new String[] {null,null};
+ int index = -1;
+ for(String part : line.split("=|;")){
+ if(index >= 0){
+ fields[index] = part;
+ index = -1;
+ } else if("index".equalsIgnoreCase(part)){
+ index = 0;
+ } else if("store".equalsIgnoreCase(part)){
+ index = 1;
+ }
+ }
+ if(fields[0] == null){
+ throw new IllegalArgumentException("Invalid FST configuration "
+ + "line: "+line +". Param 'index={field}' is required "
+ + "(syntax: 'index={field};store={field}', 'store is optional'')!");
+ }
+ fstConfigs.add(new FstConfig(fields[0], fields[1]));
+ }
+ }
+ }
+ value = config.get(FST_THREADS);
+ if(value instanceof Number){
+ fstThreads = ((Number)value).intValue();
+ } else if(value != null){
+ try {
+ fstThreads = Integer.parseInt(value.toString());
+ }catch (NumberFormatException e) {
+ throw new IllegalArgumentException("Unable to parse the FST thread number from "
+ +value.toString(), e);
+ }
+ }
+ if(fstThreads <= 0){
+ fstThreads = DEFAULT_FST_THREADS;
+ }
+
}
/**
* Creates a {@link SolrYardConfig} and initialised it to used single Yard
@@ -461,7 +568,7 @@ public class SolrYardIndexingDestination
//parameters and initialise the member variables. This method performs
//the the actual initialisation of the SolrYard!
//copy a custom configuration (if present)
- SolrServer server;
+ EmbeddedSolrServer server;
IndexReference solrServerRef = IndexReference.parse(solrYardConfig.getSolrServerLocation());
if(solrIndexConfig != null){ //can only be != null if also solrIndexLocation
//copy the configuration
@@ -485,6 +592,7 @@ public class SolrYardIndexingDestination
}
log.info(" ... create SolrYard");
this.solrYard = new SolrYard(server,solrYardConfig,indexingConfig.getNamespacePrefixService());
+ this.core = server.getCoreContainer().getCore(solrServerRef.getIndex());
}
@Override
@@ -505,13 +613,81 @@ public class SolrYardIndexingDestination
} catch (YardException e) {
log.error("Unable to store FieldMapperConfiguration to the Store!",e);
}
+ log.info(" ... optimize SolrCore");
try {
solrYard.optimize();
} catch (YardException e) {
log.error("Unable to optimize SolrIndex after indexing! IndexArchive will not be optimized ...",e);
}
+ //build the FST models
+ if(fstConfigs != null){
+ //(1) FST config initialisation
+ log.info(" ... init FST configuration(s)");
+ IndexSchema schema = core.getLatestSchema();
+ File fstDir = new File(new File(core.getDataDir()),"fst");
+ if(!fstDir.isDirectory()){
+ try {
+ FileUtils.forceMkdir(fstDir);
+ } catch (IOException e) {
+ throw new IllegalStateException("Unable to create Directory "
+ + fstDir.getAbsolutePath() + "for storing the FST models "
+ + "of SolrCore "+core.getName());
+ }
+ }
+ RefCounted<SolrIndexSearcher> searcherRef = core.getSearcher();
+ try {
+ for(FstConfig fstConfig : fstConfigs){
+ fstConfig.setFstDirectory(fstDir); //set the FST directory
+ log.info("> FST config {}", fstConfig);
+ fstConfig.buildConfig(schema, searcherRef.get().getAtomicReader());
+ for(CorpusCreationInfo corpus : fstConfig.getCorpusCreationInfos()){
+ log.info(" - {}",corpus);
+ }
+ }
+ } finally {
+ searcherRef.decref();
+ }
+
+ List<Future<?>> fstCreationTasks = new ArrayList<Future<?>>();
+ ExecutorService es = Executors.newFixedThreadPool(fstThreads);
+ log.info(" ... build FST models ");
+ for(FstConfig config : fstConfigs){
+ for(final CorpusCreationInfo corpus : config.getCorpusCreationInfos()){
+ fstCreationTasks.add(es.submit(new CorpusCreationTask(core, corpus)));
+ }
+ }
+ //now wait for the completion of the tasks
+ Iterator<Future<?>> taskIt = fstCreationTasks.iterator();
+ while(taskIt.hasNext()){
+ Future<?> task = taskIt.next();
+ try {
+ task.get(); //wait until ready
+ taskIt.remove();
+ } catch (ExecutionException e) {
+ log.error("Exception while building FST models for SolrCore "
+ + core.getName(),e);
+ } catch (InterruptedException e) {
+ log.error("Interupped while building FST models for SolrCore "
+ + core.getName(),e);
+ Thread.currentThread().interrupt();
+
+ }
+ }
+ if(!fstCreationTasks.isEmpty()){
+ log.warn("Unable to build {} FST models for SolrCore {}",
+ fstCreationTasks.size(), core.getName());
+ } else {
+ log.info("All FST modles for SolrCore {} build successfully!",
+ core.getName());
+ }
+ } //no FST modles to build
+
+ //all Solr specific stuff is now ready
+ log.info(" ... close SolrCore");
solrYard.close();
+
//zip the index and copy it over to distribution
+ log.info(" ... build Solr index archive");
if(solrArchive != null){
try {
writeSolrIndexArchive();
Added: stanbol/trunk/entityhub/indexing/destination/solryard/src/main/java/org/apache/stanbol/entityhub/indexing/destination/solryard/fst/CorpusCreationInfo.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/entityhub/indexing/destination/solryard/src/main/java/org/apache/stanbol/entityhub/indexing/destination/solryard/fst/CorpusCreationInfo.java?rev=1530587&view=auto
==============================================================================
--- stanbol/trunk/entityhub/indexing/destination/solryard/src/main/java/org/apache/stanbol/entityhub/indexing/destination/solryard/fst/CorpusCreationInfo.java (added)
+++ stanbol/trunk/entityhub/indexing/destination/solryard/src/main/java/org/apache/stanbol/entityhub/indexing/destination/solryard/fst/CorpusCreationInfo.java Wed Oct 9 12:45:15 2013
@@ -0,0 +1,99 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.stanbol.entityhub.indexing.destination.solryard.fst;
+
+import java.io.File;
+
+import org.apache.commons.lang.ObjectUtils;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.solr.schema.FieldType;
+
+/**
+ * Holds the information required to build an FST corpus for a given language.
+ * <p>
+ * All fields are assigned once in the constructor and exposed as public
+ * final members. Equality is defined by the language, the indexed field and
+ * the stored field.
+ * @author Rupert Westenthaler
+ *
+ */
+public class CorpusCreationInfo {
+
+    /**
+     * The language
+     */
+    public final String language;
+    /**
+     * The Corpus FST
+     */
+    public final File fst;
+    /**
+     * The Solr field used for FST indexing (already encoded)
+     */
+    public final String indexedField;
+    /**
+     * The Solr stored field holding the labels indexed in the FST corpus
+     */
+    public final String storedField;
+    /**
+     * TODO: partial matches are currently deactivated
+     */
+    public final boolean partialMatches = false;
+    /**
+     * The Solr {@link Analyzer} used for the field
+     */
+    public final Analyzer analyzer;
+
+    /**
+     * Creates the info for a single FST corpus.
+     * @param language the language of the corpus
+     * @param indexField the (encoded) Solr field used for FST indexing
+     * @param storeField the (encoded) Solr field holding the stored labels
+     * @param fieldType the Solr field type; its {@link FieldType#getAnalyzer()
+     * analyzer} is used as {@link #analyzer}. MUST NOT be <code>null</code>
+     * @param fst the file of the FST corpus
+     */
+    protected CorpusCreationInfo(String language, String indexField, String storeField, FieldType fieldType, File fst){
+        this.language = language;
+        this.indexedField = indexField;
+        this.storedField = storeField;
+        this.fst = fst;
+        this.analyzer = fieldType.getAnalyzer();
+    }
+
+    @Override
+    public String toString() {
+        StringBuilder sb = new StringBuilder("FST Info[language: ").append(language);
+        if(indexedField.equals(storedField)){
+            sb.append(" | field: ").append(indexedField);
+        } else {
+            sb.append(" | fields(index:").append(indexedField).append(", stored:")
+                .append(storedField).append(')');
+        }
+        sb.append(" | file: ").append(fst.getName())
+            .append("(exists: ").append(fst.isFile()).append(')')
+            .append("]");
+        return sb.toString();
+    }
+
+    /**
+     * Based on the {@link #indexedField} only. This is consistent with
+     * {@link #equals(Object)}, as equal instances share the same indexed field.
+     */
+    @Override
+    public int hashCode() {
+        return indexedField.hashCode();
+    }
+
+    /**
+     * Two infos are equal if they have the same {@link #indexedField},
+     * {@link #storedField} and {@link #language}.
+     */
+    @Override
+    public boolean equals(Object obj) {
+        if(this == obj){
+            return true;
+        }
+        if(!(obj instanceof CorpusCreationInfo)){
+            return false;
+        }
+        CorpusCreationInfo other = (CorpusCreationInfo)obj;
+        //NOTE: the language needs to be compared with the one of the other
+        //instance (and not with itself)
+        return other.indexedField.equals(indexedField) &&
+            other.storedField.equals(storedField) &&
+            ObjectUtils.equals(language, other.language);
+    }
+}
Propchange: stanbol/trunk/entityhub/indexing/destination/solryard/src/main/java/org/apache/stanbol/entityhub/indexing/destination/solryard/fst/CorpusCreationInfo.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: stanbol/trunk/entityhub/indexing/destination/solryard/src/main/java/org/apache/stanbol/entityhub/indexing/destination/solryard/fst/CorpusCreationTask.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/entityhub/indexing/destination/solryard/src/main/java/org/apache/stanbol/entityhub/indexing/destination/solryard/fst/CorpusCreationTask.java?rev=1530587&view=auto
==============================================================================
--- stanbol/trunk/entityhub/indexing/destination/solryard/src/main/java/org/apache/stanbol/entityhub/indexing/destination/solryard/fst/CorpusCreationTask.java (added)
+++ stanbol/trunk/entityhub/indexing/destination/solryard/src/main/java/org/apache/stanbol/entityhub/indexing/destination/solryard/fst/CorpusCreationTask.java Wed Oct 9 12:45:15 2013
@@ -0,0 +1,89 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.stanbol.entityhub.indexing.destination.solryard.fst;
+
+import java.io.IOException;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ExecutorService;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.lucene.index.IndexReader;
+import org.apache.solr.core.SolrCore;
+import org.apache.solr.search.SolrIndexSearcher;
+import org.apache.solr.util.RefCounted;
+import org.opensextant.solrtexttagger.TaggerFstCorpus;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Task that builds the FST corpus described by a {@link CorpusCreationInfo}
+ * from the index of a {@link SolrCore} and saves it to
+ * {@link CorpusCreationInfo#fst}.
+ * <p>
+ * The task is a {@link Runnable} (it does not return a result as a
+ * {@link Callable} would). Running it via an {@link ExecutorService} allows
+ * users to control how corpora are built (e.g. how many can be built at a
+ * time).
+ * <p>
+ * Both a failure to read the index and a failure to store the FST file are
+ * reported as {@link IllegalStateException}, so that the Future returned by
+ * the executor reflects whether the FST file was actually written.
+ * @author Rupert Westenthaler
+ *
+ */
+public class CorpusCreationTask implements Runnable{
+
+    private static final Logger log = LoggerFactory.getLogger(CorpusCreationTask.class);
+
+    /**
+     * Describes the corpus (fields, analyzer, target file) to build
+     */
+    CorpusCreationInfo corpusInfo;
+    /**
+     * The SolrCore providing the index the corpus is built from
+     */
+    SolrCore core;
+
+    /**
+     * @param core the SolrCore to read the index from
+     * @param corpus the information about the corpus to build
+     */
+    public CorpusCreationTask(SolrCore core, CorpusCreationInfo corpus){
+        this.core = core;
+        this.corpusInfo = corpus;
+    }
+
+    /**
+     * Builds the corpus from the current searcher of the core, deletes an
+     * already existing FST file and saves the new corpus.
+     * @throws IllegalStateException if reading the index or storing the FST
+     * file fails
+     */
+    @Override
+    public void run() {
+        TaggerFstCorpus corpus = null;
+        RefCounted<SolrIndexSearcher> searcherRef = core.getSearcher();
+        try {
+            SolrIndexSearcher searcher = searcherRef.get();
+            //we do get the AtomicReader, because TaggerFstCorpus will need it
+            //anyways. This prevents to create another SlowCompositeReaderWrapper.
+            IndexReader reader = searcher.getAtomicReader();
+            log.info(" ... build {}", corpusInfo);
+            corpus = new TaggerFstCorpus(reader, searcher.getIndexReader().getVersion(),
+                null, corpusInfo.indexedField, corpusInfo.storedField, corpusInfo.analyzer,
+                corpusInfo.partialMatches,1,200);
+        } catch (IOException e) {
+            throw new IllegalStateException("Unable to read Information to build "
+                + corpusInfo + " from SolrIndex '" + core.getName() + "'!", e);
+        } finally {
+            searcherRef.decref(); //ensure that we dereference the searcher
+        }
+        if(corpusInfo.fst.exists()){
+            if(!FileUtils.deleteQuietly(corpusInfo.fst)){
+                log.warn("Unable to delete existing FST fiel for {}",corpusInfo);
+            }
+        }
+        try {
+            corpus.save(corpusInfo.fst);
+        } catch (IOException e) {
+            //the FST file is the only result of this task. So a failed save
+            //needs to fail the task instead of only being logged
+            throw new IllegalStateException("Unable to store FST corpus " + corpusInfo + " to "
+                + corpusInfo.fst.getAbsolutePath() + "!", e);
+        }
+    }
+
+    @Override
+    public String toString() {
+        return new StringBuilder("Task: building ").append(corpusInfo)
+            .append(" for SolrCore ").append(core.getName()).toString();
+    }
+
+}
Propchange: stanbol/trunk/entityhub/indexing/destination/solryard/src/main/java/org/apache/stanbol/entityhub/indexing/destination/solryard/fst/CorpusCreationTask.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: stanbol/trunk/entityhub/indexing/destination/solryard/src/main/java/org/apache/stanbol/entityhub/indexing/destination/solryard/fst/FstConfig.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/entityhub/indexing/destination/solryard/src/main/java/org/apache/stanbol/entityhub/indexing/destination/solryard/fst/FstConfig.java?rev=1530587&view=auto
==============================================================================
--- stanbol/trunk/entityhub/indexing/destination/solryard/src/main/java/org/apache/stanbol/entityhub/indexing/destination/solryard/fst/FstConfig.java (added)
+++ stanbol/trunk/entityhub/indexing/destination/solryard/src/main/java/org/apache/stanbol/entityhub/indexing/destination/solryard/fst/FstConfig.java Wed Oct 9 12:45:15 2013
@@ -0,0 +1,186 @@
+package org.apache.stanbol.entityhub.indexing.destination.solryard.fst;
+
+import java.io.File;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.commons.io.FilenameUtils;
+import org.apache.commons.lang.StringUtils;
+import org.apache.lucene.index.AtomicReader;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.FieldInfos;
+import org.apache.solr.schema.FieldType;
+import org.apache.solr.schema.IndexSchema;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Configuration for the FST corpora built for one indexed field.
+ * <p>
+ * An instance is created for an index field and an optional store field
+ * (defaults to the index field). After the {@link #setFstDirectory(File)
+ * FST directory} is set, {@link #buildConfig(IndexSchema, AtomicReader)}
+ * inspects the fields of the index and registers one
+ * {@link CorpusCreationInfo} per language found for the index field.
+ */
+public class FstConfig {
+
+    protected final Logger log = LoggerFactory.getLogger(FstConfig.class);
+
+    private final String indexField;
+    /**
+     * The store field. Never <code>null</code>, as it defaults to the
+     * {@link #indexField}
+     */
+    private final String storeField;
+    private final String fstName;
+    private File fstDirectory;
+    /**
+     * FST corpus configuration (language -> corpus info)
+     */
+    private Map<String,CorpusCreationInfo> corpusInfos = new HashMap<String,CorpusCreationInfo>();
+
+    /**
+     * Creates a configuration that uses the index field also as store field.
+     * @param indexField the index field. MUST NOT be <code>null</code>
+     */
+    public FstConfig(String indexField){
+        this(indexField,null);
+    }
+
+    /**
+     * @param indexField the index field. MUST NOT be <code>null</code>
+     * @param storeField the store field or <code>null</code> to use the
+     * index field
+     * @throws IllegalArgumentException if the index field is <code>null</code>
+     */
+    public FstConfig(String indexField, String storeField){
+        if(indexField == null){
+            throw new IllegalArgumentException("The parsed index field MUST NOT be NULL!");
+        }
+        this.indexField = indexField;
+        this.storeField = storeField == null ? indexField : storeField;
+        this.fstName = getFstFileName(indexField);
+    }
+
+    /**
+     * Setter for the directory the FST files are located in. Needs to be
+     * set before {@link #buildConfig(IndexSchema, AtomicReader)} is called.
+     */
+    public void setFstDirectory(File fstDirectory) {
+        this.fstDirectory = fstDirectory;
+    }
+
+    public File getFstDirectory() {
+        return fstDirectory;
+    }
+
+    /**
+     * Registers the parsed corpus for its language.
+     * @param corpus the corpus or <code>null</code> (ignored)
+     * @return the corpus previously registered for the same language or
+     * <code>null</code> if none
+     */
+    protected final CorpusCreationInfo addCorpus(CorpusCreationInfo corpus){
+        if(corpus != null){
+            return corpusInfos.put(corpus.language, corpus);
+        } else {
+            return null;
+        }
+    }
+
+    /**
+     * Read-only view over the registered corpus infos
+     */
+    public Collection<CorpusCreationInfo> getCorpusCreationInfos(){
+        return Collections.unmodifiableCollection(corpusInfos.values());
+    }
+
+    public CorpusCreationInfo getCorpusCreationInfo(String language){
+        return corpusInfos.get(language);
+    }
+    public boolean isLanguage(String language){
+        return corpusInfos.containsKey(language);
+    }
+
+    /**
+     * Read-only view over the languages with a registered corpus info
+     */
+    public Set<String> getLanguages(){
+        return Collections.unmodifiableSet(corpusInfos.keySet());
+    }
+    /**
+     * Inspects the SolrCore to get defined languages for the configured
+     * {@link #indexField} and {@link #storeField}. Initialises the
+     * {@link #getCorpusCreationInfos()}
+     * @param schema the schema of the SolrCore
+     * @param indexReader the index reader of the SolrCore
+     */
+    public void buildConfig(IndexSchema schema, AtomicReader indexReader){
+        FieldInfos fieldInfos = indexReader.getFieldInfos(); //we need this twice
+        String fieldWildcard = encodeLanguage(indexField,"*");
+        for(FieldInfo fieldInfo : fieldInfos){
+            //try to match the field names against the wildcard
+            if(FilenameUtils.wildcardMatch(fieldInfo.name, fieldWildcard)){
+                //for matches parse the language from the field name
+                String language = parseLanguage(fieldInfo.name, indexField);
+                if(language != null){
+                    //generate the FST file name
+                    StringBuilder fstFileName = new StringBuilder(fstName);
+                    if(!language.isEmpty()){
+                        fstFileName.append('.').append(language);
+                    }
+                    fstFileName.append(".fst");
+                    File fstFile = new File(fstDirectory,fstFileName.toString());
+                    //get the FieldType of the field from the Solr schema
+                    FieldType fieldType = schema.getFieldTypeNoEx(fieldInfo.name);
+                    if(fieldType != null){ //if the fieldType is present
+                        //we need also to check if the stored field with
+                        //the labels is present. NOTE that the storeField is
+                        //never null (it defaults to the indexField)
+                        String storeFieldName = encodeLanguage(storeField, language);
+                        FieldInfo storedFieldInfos = fieldInfos.fieldInfo(storeFieldName);
+                        if(storedFieldInfos == null){
+                            log.warn(" ... ignore language {} because Stored Field {} "
+                                    + "for IndexField {} does not exist! ", new Object[]{
+                                    language,storeFieldName,fieldInfo.name});
+                        } else { // == valid configuration
+                            CorpusCreationInfo fstInfo = new CorpusCreationInfo(language,
+                                fieldInfo.name, storeFieldName,
+                                fieldType, fstFile);
+                            log.debug(" ... init {} ", fstInfo);
+                            addCorpus(fstInfo);
+                        }
+                    } else {
+                        log.warn(" ... ignore language {} becuase unknown fieldtype "
+                                + "for SolrFied {}",language,fieldInfo.name);
+                    }
+                } //else the field matched the wildcard, but has not passed the
+                  //encoding test.
+            } //Solr field does not match the field definition in the config
+        } // end iterate over all fields in the SolrIndex
+    }
+
+    /**
+     * Encodes the language and field as '<code>@{language}/{field}/</code>'
+     * @param field the field
+     * @param language the language (or a wildcard)
+     * @return the encoded field name
+     */
+    protected static String encodeLanguage(String field, String language){
+        StringBuilder sb = new StringBuilder();
+        sb.append('@').append(language).append('/');
+        sb.append(field).append('/');
+        return sb.toString();
+
+    }
+
+    /**
+     * Parses the language from a value expected to use the
+     * '<code>@{lang}/{field}/</code>' encoding.
+     * @param value the encoded field name
+     * @param field the expected field
+     * @return the language (may be empty) or <code>null</code> if the value
+     * does not match
+     */
+    protected static String parseLanguage(String value, String field){
+        int atIndex = value.indexOf('@');
+        int slashIndex = value.indexOf('/');
+        //expect @{lang}/{field}/
+        if(value.indexOf(field, slashIndex) != value.length()-1-field.length()){
+            return null; //no match
+        }
+        if(atIndex == 0 && slashIndex > 0){
+            return value.substring(1,slashIndex);
+        } else {
+            return null;//no match
+        }
+    }
+
+    /**
+     * Getter for the default FST file name based on the configured field
+     * name. This method returns the '<code>{name}</code>' part of the
+     * '<code>{name}.{lang}.fst</code>' name.
+     * @param fstFieldName the field name.
+     * @return the '<code>{name}</code>' part of the'<code>{name}.{lang}.fst</code>' name
+     */
+    protected static String getFstFileName(final String fstFieldName) {
+        String fstName;
+        if(!StringUtils.isAlphanumeric(fstFieldName)) {
+            StringBuilder escaped = new StringBuilder(fstFieldName.length());
+            //NOTE(review): the index is advanced by one char (not by one
+            //codepoint). So for a supplementary character also its low
+            //surrogate is visited and replaced by '_'. Kept unchanged as the
+            //generated names presumably need to match the names expected by
+            //the component reading the FST files - TODO confirm
+            for(int i = 0; i < fstFieldName.length();i++){
+                int codepoint = fstFieldName.codePointAt(i);
+                if(Character.isLetterOrDigit(codepoint)){
+                    escaped.appendCodePoint(codepoint);
+                } else {
+                    escaped.append('_');
+                }
+            }
+            fstName = escaped.toString();
+        } else {
+            fstName = fstFieldName;
+        }
+        return fstName;
+    }
+
+    @Override
+    public String toString() {
+        return new StringBuilder("FSTConfig[index: ").append(indexField)
+            .append(" | store: ").append(storeField).append(']').toString();
+    }
+}
Propchange: stanbol/trunk/entityhub/indexing/destination/solryard/src/main/java/org/apache/stanbol/entityhub/indexing/destination/solryard/fst/FstConfig.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: stanbol/trunk/entityhub/indexing/destination/solryard/src/main/java/org/apache/stanbol/entityhub/indexing/destination/solryard/fst/FstModelGenerator.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/entityhub/indexing/destination/solryard/src/main/java/org/apache/stanbol/entityhub/indexing/destination/solryard/fst/FstModelGenerator.java?rev=1530587&view=auto
==============================================================================
--- stanbol/trunk/entityhub/indexing/destination/solryard/src/main/java/org/apache/stanbol/entityhub/indexing/destination/solryard/fst/FstModelGenerator.java (added)
+++ stanbol/trunk/entityhub/indexing/destination/solryard/src/main/java/org/apache/stanbol/entityhub/indexing/destination/solryard/fst/FstModelGenerator.java Wed Oct 9 12:45:15 2013
@@ -0,0 +1,15 @@
+package org.apache.stanbol.entityhub.indexing.destination.solryard.fst;
+
+import org.apache.solr.core.SolrCore;
+
+/**
+ * Holder for a {@link SolrCore} and the {@link FstConfig} to process.
+ * <p>
+ * NOTE(review): this class currently only stores the two parsed values and
+ * does not define any behaviour. Presumably it is intended for generating
+ * FST models - TODO confirm.
+ */
+public class FstModelGenerator {
+
+    /**
+     * The FST configuration
+     */
+    private final FstConfig config;
+    /**
+     * The SolrCore
+     */
+    private final SolrCore core;
+
+    /**
+     * @param core the SolrCore
+     * @param config the FST configuration
+     */
+    protected FstModelGenerator(SolrCore core, FstConfig config) {
+        this.config = config;
+        this.core = core;
+    }
+
+}
Propchange: stanbol/trunk/entityhub/indexing/destination/solryard/src/main/java/org/apache/stanbol/entityhub/indexing/destination/solryard/fst/FstModelGenerator.java
------------------------------------------------------------------------------
svn:mime-type = text/plain