You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sa...@apache.org on 2011/05/22 23:45:45 UTC
svn commit: r1126234 [23/28] - in /lucene/dev/branches/solr2452: ./
dev-tools/eclipse/ dev-tools/idea/ dev-tools/idea/.idea/
dev-tools/idea/lucene/ dev-tools/idea/lucene/contrib/ant/
dev-tools/idea/lucene/contrib/db/bdb-je/ dev-tools/idea/lucene/contri...
Modified: lucene/dev/branches/solr2452/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr2452/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java?rev=1126234&r1=1126233&r2=1126234&view=diff
==============================================================================
--- lucene/dev/branches/solr2452/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java (original)
+++ lucene/dev/branches/solr2452/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java Sun May 22 21:45:19 2011
@@ -18,9 +18,11 @@ package org.apache.solr.handler.clusteri
*/
import java.io.IOException;
+import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
+import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
@@ -37,6 +39,7 @@ import org.apache.solr.common.params.Sol
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.core.SolrCore;
+import org.apache.solr.core.SolrResourceLoader;
import org.apache.solr.handler.clustering.SearchClusteringEngine;
import org.apache.solr.handler.component.HighlightComponent;
import org.apache.solr.highlight.SolrHighlighter;
@@ -52,9 +55,17 @@ import org.carrot2.core.ControllerFactor
import org.carrot2.core.Document;
import org.carrot2.core.IClusteringAlgorithm;
import org.carrot2.core.attribute.AttributeNames;
+import org.carrot2.text.linguistic.DefaultLexicalDataFactoryDescriptor;
+import org.carrot2.text.preprocessing.pipeline.BasicPreprocessingPipelineDescriptor;
+import org.carrot2.util.resource.ClassLoaderLocator;
+import org.carrot2.util.resource.IResource;
+import org.carrot2.util.resource.IResourceLocator;
+import org.carrot2.util.resource.ResourceLookup;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
/**
@@ -64,19 +75,33 @@ import com.google.common.collect.Sets;
*
* @link http://project.carrot2.org
*/
-@SuppressWarnings("unchecked")
public class CarrotClusteringEngine extends SearchClusteringEngine {
- private transient static Logger log = LoggerFactory
+ private transient static Logger log = LoggerFactory
.getLogger(CarrotClusteringEngine.class);
+ /**
+ * The subdirectory in Solr config dir to read customized Carrot2 resources from.
+ */
+ private static final String CARROT_RESOURCES_PREFIX = "clustering/carrot2";
+
+ /**
+ * Name of Carrot2 document's field containing Solr document's identifier.
+ */
+ private static final String SOLR_DOCUMENT_ID = "solrId";
+
+ /**
+ * Name of Solr document's field containing the document's identifier. To avoid
+ * repeating the content of documents in clusters on output, each cluster contains
+ * identifiers of documents it contains.
+ */
+ private String idFieldName;
+
/**
* Carrot2 controller that manages instances of clustering algorithms
*/
private Controller controller = ControllerFactory.createPooling();
private Class<? extends IClusteringAlgorithm> clusteringAlgorithmClass;
- private String idFieldName;
-
@Override
@Deprecated
public Object cluster(Query query, DocList docList, SolrQueryRequest sreq) {
@@ -101,6 +126,10 @@ public class CarrotClusteringEngine exte
attributes.put(AttributeNames.DOCUMENTS, documents);
attributes.put(AttributeNames.QUERY, query.toString());
+ // Pass the fields on which clustering runs to the
+ // SolrStopwordsCarrot2LexicalDataFactory
+ attributes.put("solrFieldNames", getFieldsForClustering(sreq));
+
// Pass extra overriding attributes from the request, if any
extractCarrotAttributes(sreq.getParams(), attributes);
@@ -113,22 +142,68 @@ public class CarrotClusteringEngine exte
}
}
- @Override
+ @Override
+ @SuppressWarnings({ "unchecked", "rawtypes" })
public String init(NamedList config, final SolrCore core) {
String result = super.init(config, core);
- SolrParams initParams = SolrParams.toSolrParams(config);
+ final SolrParams initParams = SolrParams.toSolrParams(config);
// Initialize Carrot2 controller. Pass initialization attributes, if any.
HashMap<String, Object> initAttributes = new HashMap<String, Object>();
extractCarrotAttributes(initParams, initAttributes);
-
- // Customize the language model factory. The implementation we provide here
- // is included in the code base of Solr, so that it's possible to refactor
- // the Lucene APIs the factory relies on if needed.
- initAttributes.put("PreprocessingPipeline.languageModelFactory",
- LuceneLanguageModelFactory.class);
- this.controller.init(initAttributes);
+ // Customize the stemmer and tokenizer factories. The implementations we provide here
+ // are included in the code base of Solr, so that it's possible to refactor
+ // the Lucene APIs the factories rely on if needed.
+ // Additionally, we set a custom lexical resource factory for Carrot2 that
+ // will use both Carrot2 default stop words as well as stop words from
+ // the StopFilter defined on the field.
+ BasicPreprocessingPipelineDescriptor.attributeBuilder(initAttributes)
+ .stemmerFactory(LuceneCarrot2StemmerFactory.class)
+ .tokenizerFactory(LuceneCarrot2TokenizerFactory.class)
+ .lexicalDataFactory(SolrStopwordsCarrot2LexicalDataFactory.class);
+
+ // Pass the schema to SolrStopwordsCarrot2LexicalDataFactory.
+ initAttributes.put("solrIndexSchema", core.getSchema());
+
+ // Customize Carrot2's resource lookup to first look for resources
+ // using Solr's resource loader. If that fails, try loading from the classpath.
+ DefaultLexicalDataFactoryDescriptor.attributeBuilder(initAttributes)
+ .resourceLookup(new ResourceLookup(new IResourceLocator() {
+ @Override
+ public IResource[] getAll(final String resource) {
+ final SolrResourceLoader resourceLoader = core.getResourceLoader();
+ final String carrot2ResourcesDir = resourceLoader.getConfigDir()
+ + initParams.get(CarrotParams.LEXICAL_RESOURCES_DIR, CARROT_RESOURCES_PREFIX);
+ try {
+ log.debug("Looking for " + resource + " in "
+ + carrot2ResourcesDir);
+ final InputStream resourceStream = resourceLoader
+ .openResource(carrot2ResourcesDir + "/" + resource);
+
+ log.info(resource + " loaded from " + carrot2ResourcesDir);
+ final IResource foundResource = new IResource() {
+ @Override
+ public InputStream open() throws IOException {
+ return resourceStream;
+ }
+ };
+ return new IResource[] { foundResource };
+ } catch (RuntimeException e) {
+ // No way to distinguish if the resource was found but failed
+ // to load or wasn't found at all, so we simply fall back
+ // to Carrot2 defaults here by returning an empty locations array.
+ log.debug(resource + " not found in " + carrot2ResourcesDir
+ + ". Using the default " + resource + " from Carrot JAR.");
+ return new IResource[] {};
+ }
+ }
+ },
+
+ // Using the class loader directly because this time we want to omit the prefix
+ new ClassLoaderLocator(core.getResourceLoader().getClassLoader())));
+
+ this.controller.init(initAttributes);
this.idFieldName = core.getSchema().getUniqueKeyField().getName();
// Make sure the requested Carrot2 clustering algorithm class is available
@@ -148,17 +223,29 @@ public class CarrotClusteringEngine exte
protected Set<String> getFieldsToLoad(SolrQueryRequest sreq){
SolrParams solrParams = sreq.getParams();
- // Names of fields to deliver content for clustering
- String urlField = solrParams.get(CarrotParams.URL_FIELD_NAME, "url");
+ HashSet<String> fields = Sets.newHashSet(getFieldsForClustering(sreq));
+ fields.add(idFieldName);
+ fields.add(solrParams.get(CarrotParams.URL_FIELD_NAME, "url"));
+ return fields;
+ }
+
+ /**
+ * Returns the names of fields that will be delivering the actual
+ * content for clustering. Currently, there are two such fields: document
+ * title and document content.
+ */
+ private Set<String> getFieldsForClustering(SolrQueryRequest sreq) {
+ SolrParams solrParams = sreq.getParams();
+
String titleField = solrParams.get(CarrotParams.TITLE_FIELD_NAME, "title");
String snippetField = solrParams.get(CarrotParams.SNIPPET_FIELD_NAME, titleField);
if (StringUtils.isBlank(snippetField)) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, CarrotParams.SNIPPET_FIELD_NAME
+ " must not be blank.");
}
- return Sets.newHashSet(urlField, titleField, snippetField, idFieldName);
- }
-
+ return Sets.newHashSet(titleField, snippetField);
+ }
+
/**
* Prepares Carrot2 documents for clustering.
*/
@@ -180,7 +267,7 @@ public class CarrotClusteringEngine exte
if (produceSummary == true) {
highlighter = HighlightComponent.getHighlighter(core);
if (highlighter != null){
- Map args = new HashMap();
+ Map<String, Object> args = Maps.newHashMap();
snippetFieldAry = new String[]{snippetField};
args.put(HighlightParams.FIELDS, snippetFieldAry);
args.put(HighlightParams.HIGHLIGHT, "true");
@@ -214,11 +301,12 @@ public class CarrotClusteringEngine exte
if (produceSummary && docIds != null) {
docsHolder[0] = docIds.get(sdoc).intValue();
DocList docAsList = new DocSlice(0, 1, docsHolder, scores, 1, 1.0f);
- NamedList highlights = highlighter.doHighlighting(docAsList, theQuery, req, snippetFieldAry);
+ NamedList<Object> highlights = highlighter.doHighlighting(docAsList, theQuery, req, snippetFieldAry);
if (highlights != null && highlights.size() == 1) {//should only be one value given our setup
//should only be one document with one field
- NamedList tmp = (NamedList) highlights.getVal(0);
- String [] highlt = (String[]) tmp.get(snippetField);
+ @SuppressWarnings("unchecked")
+ NamedList<String []> tmp = (NamedList<String[]>) highlights.getVal(0);
+ String [] highlt = tmp.get(snippetField);
if (highlt != null && highlt.length == 1) {
snippet = highlt[0];
}
@@ -226,27 +314,13 @@ public class CarrotClusteringEngine exte
}
Document carrotDocument = new Document(getValue(sdoc, titleField),
snippet, (String)sdoc.getFieldValue(urlField));
- carrotDocument.setField("solrId", sdoc.getFieldValue(idFieldName));
+ carrotDocument.setField(SOLR_DOCUMENT_ID, sdoc.getFieldValue(idFieldName));
result.add(carrotDocument);
}
return result;
}
- @Deprecated
- protected String getValue(org.apache.lucene.document.Document doc,
- String field) {
- StringBuilder result = new StringBuilder();
- String[] vals = doc.getValues(field);
- for (int i = 0; i < vals.length; i++) {
- // Join multiple values with a period so that Carrot2 does not pick up
- // phrases that cross field value boundaries (in most cases it would
- // create useless phrases).
- result.append(vals[i]).append(" . ");
- }
- return result.toString().trim();
- }
-
protected String getValue(SolrDocument sdoc, String field) {
StringBuilder result = new StringBuilder();
Collection<Object> vals = sdoc.getFieldValues(field);
@@ -261,9 +335,9 @@ public class CarrotClusteringEngine exte
return result.toString().trim();
}
- private List clustersToNamedList(List<Cluster> carrotClusters,
+ private List<NamedList<Object>> clustersToNamedList(List<Cluster> carrotClusters,
SolrParams solrParams) {
- List result = new ArrayList();
+ List<NamedList<Object>> result = Lists.newArrayList();
clustersToNamedList(carrotClusters, result, solrParams.getBool(
CarrotParams.OUTPUT_SUB_CLUSTERS, true), solrParams.getInt(
CarrotParams.NUM_DESCRIPTIONS, Integer.MAX_VALUE));
@@ -271,25 +345,40 @@ public class CarrotClusteringEngine exte
}
private void clustersToNamedList(List<Cluster> outputClusters,
- List parent, boolean outputSubClusters, int maxLabels) {
+ List<NamedList<Object>> parent, boolean outputSubClusters, int maxLabels) {
for (Cluster outCluster : outputClusters) {
- NamedList cluster = new SimpleOrderedMap();
+ NamedList<Object> cluster = new SimpleOrderedMap<Object>();
parent.add(cluster);
+ // Add labels
List<String> labels = outCluster.getPhrases();
- if (labels.size() > maxLabels)
+ if (labels.size() > maxLabels) {
labels = labels.subList(0, maxLabels);
+ }
cluster.add("labels", labels);
+ // Add cluster score
+ final Double score = outCluster.getScore();
+ if (score != null) {
+ cluster.add("score", score);
+ }
+
+ // Add other topics marker
+ if (outCluster.isOtherTopics()) {
+ cluster.add("other-topics", outCluster.isOtherTopics());
+ }
+
+ // Add documents
List<Document> docs = outputSubClusters ? outCluster.getDocuments() : outCluster.getAllDocuments();
- List docList = new ArrayList();
+ List<Object> docList = Lists.newArrayList();
cluster.add("docs", docList);
for (Document doc : docs) {
- docList.add(doc.getField("solrId"));
+ docList.add(doc.getField(SOLR_DOCUMENT_ID));
}
- if (outputSubClusters) {
- List subclusters = new ArrayList();
+ // Add subclusters
+ if (outputSubClusters && !outCluster.getSubclusters().isEmpty()) {
+ List<NamedList<Object>> subclusters = Lists.newArrayList();
cluster.add("clusters", subclusters);
clustersToNamedList(outCluster.getSubclusters(), subclusters,
outputSubClusters, maxLabels);
Modified: lucene/dev/branches/solr2452/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotParams.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr2452/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotParams.java?rev=1126234&r1=1126233&r2=1126234&view=diff
==============================================================================
--- lucene/dev/branches/solr2452/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotParams.java (original)
+++ lucene/dev/branches/solr2452/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotParams.java Sun May 22 21:45:19 2011
@@ -35,6 +35,8 @@ public interface CarrotParams {
String OUTPUT_SUB_CLUSTERS = CARROT_PREFIX + "outputSubClusters";
String SUMMARY_FRAGSIZE = CARROT_PREFIX + "fragzise";
+ String LEXICAL_RESOURCES_DIR = CARROT_PREFIX + "lexicalResourcesDir";
+
public static final Set<String> CARROT_PARAM_NAMES = ImmutableSet.of(
ALGORITHM, TITLE_FIELD_NAME, URL_FIELD_NAME, SNIPPET_FIELD_NAME,
PRODUCE_SUMMARY, NUM_DESCRIPTIONS, OUTPUT_SUB_CLUSTERS, SUMMARY_FRAGSIZE);
Modified: lucene/dev/branches/solr2452/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr2452/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java?rev=1126234&r1=1126233&r2=1126234&view=diff
==============================================================================
--- lucene/dev/branches/solr2452/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java (original)
+++ lucene/dev/branches/solr2452/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java Sun May 22 21:45:19 2011
@@ -17,6 +17,11 @@ package org.apache.solr.handler.clusteri
* limitations under the License.
*/
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
import org.apache.lucene.index.Term;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.Query;
@@ -37,15 +42,11 @@ import org.apache.solr.util.SolrPluginUt
import org.carrot2.util.attribute.AttributeUtils;
import org.junit.Test;
-import java.io.IOException;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
+import com.google.common.collect.ImmutableList;
/**
*
*/
-@SuppressWarnings("unchecked")
public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {
@Test
public void testCarrotLingo() throws Exception {
@@ -74,7 +75,7 @@ public class CarrotClusteringEngineTest
@Test
public void testWithoutSubclusters() throws Exception {
- checkClusters(checkEngine(getClusteringEngine("mock"), this.numberOfDocs),
+ checkClusters(checkEngine(getClusteringEngine("mock"), AbstractClusteringTestCase.numberOfDocs),
1, 1, 0);
}
@@ -82,7 +83,7 @@ public class CarrotClusteringEngineTest
public void testWithSubclusters() throws Exception {
ModifiableSolrParams params = new ModifiableSolrParams();
params.set(CarrotParams.OUTPUT_SUB_CLUSTERS, true);
- checkClusters(checkEngine(getClusteringEngine("mock"), this.numberOfDocs), 1, 1, 2);
+ checkClusters(checkEngine(getClusteringEngine("mock"), AbstractClusteringTestCase.numberOfDocs), 1, 1, 2);
}
@Test
@@ -90,19 +91,107 @@ public class CarrotClusteringEngineTest
ModifiableSolrParams params = new ModifiableSolrParams();
params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "labels"), 5);
params.set(CarrotParams.NUM_DESCRIPTIONS, 3);
- checkClusters(checkEngine(getClusteringEngine("mock"), this.numberOfDocs,
+ checkClusters(checkEngine(getClusteringEngine("mock"), AbstractClusteringTestCase.numberOfDocs,
params), 1, 3, 0);
}
@Test
+ public void testClusterScores() throws Exception {
+ ModifiableSolrParams params = new ModifiableSolrParams();
+ params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "depth"), 1);
+ List<NamedList<Object>> clusters = checkEngine(getClusteringEngine("mock"),
+ AbstractClusteringTestCase.numberOfDocs, params);
+ int i = 1;
+ for (NamedList<Object> cluster : clusters) {
+ final Double score = getScore(cluster);
+ assertNotNull(score);
+ assertEquals(0.25 * i++, score, 0);
+ }
+ }
+
+ @Test
+ public void testOtherTopics() throws Exception {
+ ModifiableSolrParams params = new ModifiableSolrParams();
+ params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "depth"), 1);
+ params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "otherTopicsModulo"), 2);
+ List<NamedList<Object>> clusters = checkEngine(getClusteringEngine("mock"),
+ AbstractClusteringTestCase.numberOfDocs, params);
+ int i = 1;
+ for (NamedList<Object> cluster : clusters) {
+ assertEquals(i++ % 2 == 0 ? true : null, isOtherTopics(cluster));
+ }
+ }
+
+ @Test
public void testCarrotAttributePassing() throws Exception {
ModifiableSolrParams params = new ModifiableSolrParams();
params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "depth"), 1);
params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "labels"), 3);
- checkClusters(checkEngine(getClusteringEngine("mock"), this.numberOfDocs,
+ checkClusters(checkEngine(getClusteringEngine("mock"), AbstractClusteringTestCase.numberOfDocs,
params), 1, 3, 0);
}
+ @Test
+ public void testLexicalResourcesFromSolrConfigDefaultDir() throws Exception {
+ checkLexicalResourcesFromSolrConfig("lexical-resource-check",
+ "online,customsolrstopword,customsolrstoplabel");
+ }
+
+ @Test
+ public void testLexicalResourcesFromSolrConfigCustomDir() throws Exception {
+ checkLexicalResourcesFromSolrConfig("lexical-resource-check-custom-resource-dir",
+ "online,customsolrstopwordcustomdir,customsolrstoplabelcustomdir");
+ }
+
+ private void checkLexicalResourcesFromSolrConfig(String engineName, String wordsToCheck)
+ throws IOException {
+ ModifiableSolrParams params = new ModifiableSolrParams();
+ params.set("merge-resources", false);
+ params.set(AttributeUtils.getKey(
+ LexicalResourcesCheckClusteringAlgorithm.class, "wordsToCheck"),
+ wordsToCheck);
+
+ // "customsolrstopword" is in stopwords.en, "customsolrstoplabel" is in
+ // stoplabels.en, so we're expecting only one cluster with label "online".
+ final List<NamedList<Object>> clusters = checkEngine(
+ getClusteringEngine(engineName), 1, params);
+ assertEquals(getLabels(clusters.get(0)), ImmutableList.of("online"));
+ }
+
+ @Test
+ public void solrStopWordsUsedInCarrot2Clustering() throws Exception {
+ ModifiableSolrParams params = new ModifiableSolrParams();
+ params.set("merge-resources", false);
+ params.set(AttributeUtils.getKey(
+ LexicalResourcesCheckClusteringAlgorithm.class, "wordsToCheck"),
+ "online,solrownstopword");
+
+ // "solrownstopword" is in stopwords.txt, so we're expecting
+ // only one cluster with label "online".
+ final List<NamedList<Object>> clusters = checkEngine(
+ getClusteringEngine("lexical-resource-check"), 1, params);
+ assertEquals(getLabels(clusters.get(0)), ImmutableList.of("online"));
+ }
+
+ @Test
+ public void solrStopWordsNotDefinedOnAFieldForClustering() throws Exception {
+ ModifiableSolrParams params = new ModifiableSolrParams();
+ // Force string fields to be used for clustering. Does not make sense
+ // in a real world, but does the job in the test.
+ params.set(CarrotParams.TITLE_FIELD_NAME, "url");
+ params.set(CarrotParams.SNIPPET_FIELD_NAME, "url");
+ params.set("merge-resources", false);
+ params.set(AttributeUtils.getKey(
+ LexicalResourcesCheckClusteringAlgorithm.class, "wordsToCheck"),
+ "online,solrownstopword");
+
+ final List<NamedList<Object>> clusters = checkEngine(
+ getClusteringEngine("lexical-resource-check"), 2, params);
+ assertEquals(ImmutableList.of("online"), getLabels(clusters.get(0)));
+ assertEquals(ImmutableList.of("solrownstopword"),
+ getLabels(clusters.get(1)));
+ }
+
private CarrotClusteringEngine getClusteringEngine(String engineName) {
ClusteringComponent comp = (ClusteringComponent) h.getCore()
.getSearchComponent("clustering");
@@ -114,18 +203,18 @@ public class CarrotClusteringEngineTest
return engine;
}
- private List checkEngine(CarrotClusteringEngine engine,
+ private List<NamedList<Object>> checkEngine(CarrotClusteringEngine engine,
int expectedNumClusters) throws IOException {
return checkEngine(engine, numberOfDocs, expectedNumClusters, new MatchAllDocsQuery(), new ModifiableSolrParams());
}
- private List checkEngine(CarrotClusteringEngine engine,
+ private List<NamedList<Object>> checkEngine(CarrotClusteringEngine engine,
int expectedNumClusters, SolrParams clusteringParams) throws IOException {
return checkEngine(engine, numberOfDocs, expectedNumClusters, new MatchAllDocsQuery(), clusteringParams);
}
- private List checkEngine(CarrotClusteringEngine engine, int expectedNumDocs,
+ private List<NamedList<Object>> checkEngine(CarrotClusteringEngine engine, int expectedNumDocs,
int expectedNumClusters, Query query, SolrParams clusteringParams) throws IOException {
// Get all documents to cluster
RefCounted<SolrIndexSearcher> ref = h.getCore().getSearcher();
@@ -145,7 +234,9 @@ public class CarrotClusteringEngineTest
LocalSolrQueryRequest req = new LocalSolrQueryRequest(h.getCore(), solrParams);
Map<SolrDocument,Integer> docIds = new HashMap<SolrDocument, Integer>(docList.size());
SolrDocumentList solrDocList = SolrPluginUtils.docListToSolrDocumentList( docList, searcher, engine.getFieldsToLoad(req), docIds );
- List results = (List)engine.cluster(query, solrDocList, docIds, req);
+
+ @SuppressWarnings("unchecked")
+ List<NamedList<Object>> results = (List<NamedList<Object>>) engine.cluster(query, solrDocList, docIds, req);
req.close();
assertEquals("number of clusters: " + results, expectedNumClusters, results.size());
checkClusters(results, false);
@@ -155,51 +246,74 @@ public class CarrotClusteringEngineTest
}
}
- private void checkClusters(List results, int expectedDocCount,
+ private void checkClusters(List<NamedList<Object>> results, int expectedDocCount,
int expectedLabelCount, int expectedSubclusterCount) {
for (int i = 0; i < results.size(); i++) {
- NamedList cluster = (NamedList) results.get(i);
+ NamedList<Object> cluster = results.get(i);
checkCluster(cluster, expectedDocCount, expectedLabelCount,
expectedSubclusterCount);
}
}
- private void checkClusters(List results, boolean hasSubclusters) {
+ private void checkClusters(List<NamedList<Object>> results, boolean hasSubclusters) {
for (int i = 0; i < results.size(); i++) {
- checkCluster((NamedList) results.get(i), hasSubclusters);
+ checkCluster(results.get(i), hasSubclusters);
}
}
- private void checkCluster(NamedList cluster, boolean hasSubclusters) {
- List docs = (List) cluster.get("docs");
+ private void checkCluster(NamedList<Object> cluster, boolean hasSubclusters) {
+ List<Object> docs = getDocs(cluster);
assertNotNull("docs is null and it shouldn't be", docs);
for (int j = 0; j < docs.size(); j++) {
String id = (String) docs.get(j);
assertNotNull("id is null and it shouldn't be", id);
}
- List labels = (List) cluster.get("labels");
+ List<String> labels = getLabels(cluster);
assertNotNull("labels is null but it shouldn't be", labels);
if (hasSubclusters) {
- List subclusters = (List) cluster.get("clusters");
+ List<NamedList<Object>> subclusters = getSubclusters(cluster);
assertNotNull("subclusters is null but it shouldn't be", subclusters);
}
}
- private void checkCluster(NamedList cluster, int expectedDocCount,
+ private void checkCluster(NamedList<Object> cluster, int expectedDocCount,
int expectedLabelCount, int expectedSubclusterCount) {
checkCluster(cluster, expectedSubclusterCount > 0);
assertEquals("number of docs in cluster", expectedDocCount,
- ((List) cluster.get("docs")).size());
+ getDocs(cluster).size());
assertEquals("number of labels in cluster", expectedLabelCount,
- ((List) cluster.get("labels")).size());
+ getLabels(cluster).size());
if (expectedSubclusterCount > 0) {
- List subclusters = (List) cluster.get("clusters");
+ List<NamedList<Object>> subclusters = getSubclusters(cluster);
assertEquals("numClusters", expectedSubclusterCount, subclusters.size());
assertEquals("number of subclusters in cluster",
expectedSubclusterCount, subclusters.size());
}
}
+
+ @SuppressWarnings("unchecked")
+ private List<NamedList<Object>> getSubclusters(NamedList<Object> cluster) {
+ return (List<NamedList<Object>>) cluster.get("clusters");
+ }
+
+ @SuppressWarnings("unchecked")
+ private List<String> getLabels(NamedList<Object> cluster) {
+ return (List<String>) cluster.get("labels");
+ }
+
+ private Double getScore(NamedList<Object> cluster) {
+ return (Double) cluster.get("score");
+ }
+
+ private Boolean isOtherTopics(NamedList<Object> cluster) {
+ return (Boolean)cluster.get("other-topics");
+ }
+
+ @SuppressWarnings("unchecked")
+ private List<Object> getDocs(NamedList<Object> cluster) {
+ return (List<Object>) cluster.get("docs");
+ }
}
Modified: lucene/dev/branches/solr2452/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/MockClusteringAlgorithm.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr2452/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/MockClusteringAlgorithm.java?rev=1126234&r1=1126233&r2=1126234&view=diff
==============================================================================
--- lucene/dev/branches/solr2452/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/MockClusteringAlgorithm.java (original)
+++ lucene/dev/branches/solr2452/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/MockClusteringAlgorithm.java Sun May 22 21:45:19 2011
@@ -49,6 +49,11 @@ public class MockClusteringAlgorithm ext
@IntRange(min = 1, max = 5)
private int labels = 1;
+ @Input
+ @Processing
+ @Attribute
+ private int otherTopicsModulo = 0;
+
@Override
public void process() throws ProcessingException {
clusters = Lists.newArrayList();
@@ -59,21 +64,26 @@ public class MockClusteringAlgorithm ext
int documentIndex = 1;
for (Document document : documents) {
StringBuilder label = new StringBuilder("Cluster " + documentIndex);
- Cluster cluster = createCluster(label.toString(), document);
+ Cluster cluster = createCluster(label.toString(), documentIndex, document);
clusters.add(cluster);
for (int i = 1; i <= depth; i++) {
label.append(".");
label.append(i);
- Cluster newCluster = createCluster(label.toString(), document);
- cluster.addSubclusters(createCluster(label.toString(), document), newCluster);
+ Cluster newCluster = createCluster(label.toString(), documentIndex, document);
+ cluster.addSubclusters(createCluster(label.toString(), documentIndex, document), newCluster);
cluster = newCluster;
}
documentIndex++;
}
}
- private Cluster createCluster(String labelBase, Document... documents) {
+ private Cluster createCluster(String labelBase, int documentIndex, Document... documents) {
Cluster cluster = new Cluster();
+ cluster.setScore(documentIndex * 0.25);
+ if (otherTopicsModulo != 0 && documentIndex % otherTopicsModulo == 0)
+ {
+ cluster.setOtherTopics(true);
+ }
for (int i = 0; i < labels; i++) {
cluster.addPhrases(labelBase + "#" + (i + 1));
}
Modified: lucene/dev/branches/solr2452/solr/contrib/clustering/src/test/resources/solr-clustering/conf/solrconfig.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr2452/solr/contrib/clustering/src/test/resources/solr-clustering/conf/solrconfig.xml?rev=1126234&r1=1126233&r2=1126234&view=diff
==============================================================================
--- lucene/dev/branches/solr2452/solr/contrib/clustering/src/test/resources/solr-clustering/conf/solrconfig.xml (original)
+++ lucene/dev/branches/solr2452/solr/contrib/clustering/src/test/resources/solr-clustering/conf/solrconfig.xml Sun May 22 21:45:19 2011
@@ -396,6 +396,15 @@
<str name="name">mock</str>
<str name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.MockClusteringAlgorithm</str>
</lst>
+ <lst name="engine">
+ <str name="name">lexical-resource-check</str>
+ <str name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.LexicalResourcesCheckClusteringAlgorithm</str>
+ </lst>
+ <lst name="engine">
+ <str name="name">lexical-resource-check-custom-resource-dir</str>
+ <str name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.LexicalResourcesCheckClusteringAlgorithm</str>
+ <str name="carrot.lexicalResourcesDir">clustering/custom</str>
+ </lst>
</searchComponent>
<searchComponent class="org.apache.solr.handler.clustering.ClusteringComponent" name="doc-clustering">
Modified: lucene/dev/branches/solr2452/solr/contrib/clustering/src/test/resources/solr-clustering/conf/stopwords.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr2452/solr/contrib/clustering/src/test/resources/solr-clustering/conf/stopwords.txt?rev=1126234&r1=1126233&r2=1126234&view=diff
==============================================================================
--- lucene/dev/branches/solr2452/solr/contrib/clustering/src/test/resources/solr-clustering/conf/stopwords.txt (original)
+++ lucene/dev/branches/solr2452/solr/contrib/clustering/src/test/resources/solr-clustering/conf/stopwords.txt Sun May 22 21:45:19 2011
@@ -55,4 +55,5 @@ to
was
will
with
+solrownstopword
Modified: lucene/dev/branches/solr2452/solr/contrib/contrib-build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr2452/solr/contrib/contrib-build.xml?rev=1126234&r1=1126233&r2=1126234&view=diff
==============================================================================
--- lucene/dev/branches/solr2452/solr/contrib/contrib-build.xml (original)
+++ lucene/dev/branches/solr2452/solr/contrib/contrib-build.xml Sun May 22 21:45:19 2011
@@ -20,7 +20,7 @@
<project name="solr-contrib-build">
<!-- TODO: adjust build.dir/dist.dir appropriately when a contrib project is run individually -->
<property name="build.dir" location="../../build/contrib/${ant.project.name}"/>
- <property name="dist.dir" location="../../dist/"/>
+ <property name="dist.dir" location="../../dist"/>
<import file="../common-build.xml"/>
@@ -39,7 +39,7 @@
<!-- set jarfile only, if the target jar file has no generic name -->
<attribute name="jarfile" default="${common-solr.dir}/build/contrib/solr-@{name}/apache-solr-@{name}-${version}.jar"/>
<sequential>
- <!--<echo message="Checking '@{jarfile}' against source folder '${common.dir}/contrib/@{name}/src/java'"/>-->
+ <!--<echo message="Checking '@{jarfile}' against source folder '${common.dir}/contrib/@{name}/src/main/java'"/>-->
<property name="@{classpath.property}" location="@{jarfile}"/>
<uptodate property="@{property}" targetfile="@{jarfile}">
<srcfiles dir="../@{name}/src/main/java" includes="**/*.java"/>
Modified: lucene/dev/branches/solr2452/solr/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/DataImportHandler.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr2452/solr/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/DataImportHandler.java?rev=1126234&r1=1126233&r2=1126234&view=diff
==============================================================================
--- lucene/dev/branches/solr2452/solr/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/DataImportHandler.java (original)
+++ lucene/dev/branches/solr2452/solr/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/DataImportHandler.java Sun May 22 21:45:19 2011
@@ -194,7 +194,7 @@ public class DataImportHandler extends R
IMPORT_CMD.equals(command)) {
UpdateRequestProcessorChain processorChain =
- req.getCore().getUpdateProcessingChain(params.get(UpdateParams.UPDATE_PROCESSOR));
+ req.getCore().getUpdateProcessingChain(params.get(UpdateParams.UPDATE_CHAIN));
UpdateRequestProcessor processor = processorChain.createProcessor(req, rsp);
SolrResourceLoader loader = req.getCore().getResourceLoader();
SolrWriter sw = getSolrWriter(processor, loader, requestParams, req);
Modified: lucene/dev/branches/solr2452/solr/contrib/extraction/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr2452/solr/contrib/extraction/CHANGES.txt?rev=1126234&r1=1126233&r2=1126234&view=diff
==============================================================================
--- lucene/dev/branches/solr2452/solr/contrib/extraction/CHANGES.txt (original)
+++ lucene/dev/branches/solr2452/solr/contrib/extraction/CHANGES.txt Sun May 22 21:45:19 2011
@@ -22,7 +22,7 @@ to your Solr Home lib directory. See ht
Current Version: Tika 0.8 (released 11/07/2010)
-$Id:$
+$Id$
================== Release 4.0-dev ==================
@@ -30,7 +30,8 @@ $Id:$
================== Release 3.2-dev ==================
-(No Changes)
+* SOLR-2480: Add ignoreTikaException flag so that users can ignore TikaException but index
+ meta data. (Shinichiro Abe, koji)
================== Release 3.1-dev ==================
Modified: lucene/dev/branches/solr2452/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr2452/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java?rev=1126234&r1=1126233&r2=1126234&view=diff
==============================================================================
--- lucene/dev/branches/solr2452/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java (original)
+++ lucene/dev/branches/solr2452/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java Sun May 22 21:45:19 2011
@@ -16,20 +16,27 @@
*/
package org.apache.solr.handler.extraction;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.StringWriter;
+import java.util.Locale;
+
import org.apache.commons.io.IOUtils;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.params.UpdateParams;
import org.apache.solr.common.util.ContentStream;
import org.apache.solr.common.util.NamedList;
+import org.apache.solr.handler.ContentStreamLoader;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.update.AddUpdateCommand;
import org.apache.solr.update.processor.UpdateRequestProcessor;
-import org.apache.solr.handler.ContentStreamLoader;
import org.apache.tika.config.TikaConfig;
+import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
@@ -37,26 +44,24 @@ import org.apache.tika.sax.XHTMLContentH
import org.apache.tika.sax.xpath.Matcher;
import org.apache.tika.sax.xpath.MatchingContentHandler;
import org.apache.tika.sax.xpath.XPathParser;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.mime.MediaType;
-import org.apache.xml.serialize.OutputFormat;
import org.apache.xml.serialize.BaseMarkupSerializer;
-import org.apache.xml.serialize.XMLSerializer;
+import org.apache.xml.serialize.OutputFormat;
import org.apache.xml.serialize.TextSerializer;
+import org.apache.xml.serialize.XMLSerializer;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.StringWriter;
-import java.util.Locale;
-
/**
* The class responsible for loading extracted content into Solr.
*
**/
public class ExtractingDocumentLoader extends ContentStreamLoader {
+
+ private static final Logger log = LoggerFactory.getLogger(ExtractingDocumentLoader.class);
+
/**
* Extract Only supported format
*/
@@ -74,6 +79,7 @@ public class ExtractingDocumentLoader ex
final IndexSchema schema;
final SolrParams params;
final UpdateRequestProcessor processor;
+ final boolean ignoreTikaException;
protected AutoDetectParser autoDetectParser;
private final AddUpdateCommand templateAdd;
@@ -95,6 +101,8 @@ public class ExtractingDocumentLoader ex
//this is lightweight
autoDetectParser = new AutoDetectParser(config);
this.factory = factory;
+
+ ignoreTikaException = params.getBool(ExtractingParams.IGNORE_TIKA_EXCEPTION, false);
}
@@ -180,9 +188,17 @@ public class ExtractingDocumentLoader ex
parsingHandler = new MatchingContentHandler(handler, matcher);
} //else leave it as is
- //potentially use a wrapper handler for parsing, but we still need the SolrContentHandler for getting the document.
- ParseContext context = new ParseContext();//TODO: should we design a way to pass in parse context?
- parser.parse(inputStream, parsingHandler, metadata, context);
+ try{
+ //potentially use a wrapper handler for parsing, but we still need the SolrContentHandler for getting the document.
+ ParseContext context = new ParseContext();//TODO: should we design a way to pass in parse context?
+ parser.parse(inputStream, parsingHandler, metadata, context);
+ } catch (TikaException e) {
+ if(ignoreTikaException)
+ log.warn(new StringBuilder("skip extracting text due to ").append(e.getLocalizedMessage())
+ .append(". metadata=").append(metadata.toString()).toString());
+ else
+ throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
+ }
if (extractOnly == false) {
addDoc(handler);
} else {
@@ -202,8 +218,6 @@ public class ExtractingDocumentLoader ex
}
} catch (SAXException e) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
- } catch (TikaException e) {
- throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
} finally {
IOUtils.closeQuietly(inputStream);
}
Modified: lucene/dev/branches/solr2452/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingParams.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr2452/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingParams.java?rev=1126234&r1=1126233&r2=1126234&view=diff
==============================================================================
--- lucene/dev/branches/solr2452/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingParams.java (original)
+++ lucene/dev/branches/solr2452/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingParams.java Sun May 22 21:45:19 2011
@@ -28,6 +28,11 @@ public interface ExtractingParams {
*/
public static final String LOWERNAMES = "lowernames";
+ /**
+ * If true, ignore TikaException (give up extracting the text but still index the metadata)
+ */
+ public static final String IGNORE_TIKA_EXCEPTION = "ignoreTikaException";
+
/**
* The param prefix for mapping Tika metadata to Solr fields.
Modified: lucene/dev/branches/solr2452/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/SolrContentHandler.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr2452/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/SolrContentHandler.java?rev=1126234&r1=1126233&r2=1126234&view=diff
==============================================================================
--- lucene/dev/branches/solr2452/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/SolrContentHandler.java (original)
+++ lucene/dev/branches/solr2452/solr/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/SolrContentHandler.java Sun May 22 21:45:19 2011
@@ -47,23 +47,23 @@ import java.util.*;
*/
public class SolrContentHandler extends DefaultHandler implements ExtractingParams {
private transient static Logger log = LoggerFactory.getLogger(SolrContentHandler.class);
- private SolrInputDocument document;
+ protected SolrInputDocument document;
- private Collection<String> dateFormats = DateUtil.DEFAULT_DATE_FORMATS;
+ protected Collection<String> dateFormats = DateUtil.DEFAULT_DATE_FORMATS;
- private Metadata metadata;
- private SolrParams params;
- private StringBuilder catchAllBuilder = new StringBuilder(2048);
- private IndexSchema schema;
- private Map<String, StringBuilder> fieldBuilders = Collections.emptyMap();
+ protected Metadata metadata;
+ protected SolrParams params;
+ protected StringBuilder catchAllBuilder = new StringBuilder(2048);
+ protected IndexSchema schema;
+ protected Map<String, StringBuilder> fieldBuilders = Collections.emptyMap();
private LinkedList<StringBuilder> bldrStack = new LinkedList<StringBuilder>();
- private boolean captureAttribs;
- private boolean lowerNames;
- private String contentFieldName = "content";
+ protected boolean captureAttribs;
+ protected boolean lowerNames;
+ protected String contentFieldName = "content";
- private String unknownFieldPrefix = "";
- private String defaultField = "";
+ protected String unknownFieldPrefix = "";
+ protected String defaultField = "";
public SolrContentHandler(Metadata metadata, SolrParams params, IndexSchema schema) {
this(metadata, params, schema, DateUtil.DEFAULT_DATE_FORMATS);
@@ -99,46 +99,82 @@ public class SolrContentHandler extends
* The base implementation adds the metadata as fields, allowing for potential remapping.
*
* @return The {@link org.apache.solr.common.SolrInputDocument}.
+ *
+ * @see #addMetadata()
+ * @see #addCapturedContent()
+ * @see #addContent()
+ * @see #addLiterals()
*/
public SolrInputDocument newDocument() {
float boost = 1.0f;
//handle the metadata extracted from the document
- for (String name : metadata.names()) {
- String[] vals = metadata.getValues(name);
- addField(name, null, vals);
- }
+ addMetadata();
//handle the literals from the params
- Iterator<String> paramNames = params.getParameterNamesIterator();
- while (paramNames.hasNext()) {
- String pname = paramNames.next();
- if (!pname.startsWith(LITERALS_PREFIX)) continue;
-
- String name = pname.substring(LITERALS_PREFIX.length());
- addField(name, null, params.getParams(pname));
- }
+ addLiterals();
//add in the content
- addField(contentFieldName, catchAllBuilder.toString(), null);
+ addContent();
//add in the captured content
+ addCapturedContent();
+
+ if (log.isDebugEnabled()) {
+ log.debug("Doc: {}", document);
+ }
+ return document;
+ }
+
+ /**
+ * Add the per field captured content to the Solr Document. Default implementation uses the
+ * {@link #fieldBuilders} info
+ */
+ protected void addCapturedContent() {
for (Map.Entry<String, StringBuilder> entry : fieldBuilders.entrySet()) {
if (entry.getValue().length() > 0) {
addField(entry.getKey(), entry.getValue().toString(), null);
}
}
- if (log.isDebugEnabled()) {
- log.debug("Doc: " + document);
+ }
+
+ /**
+ * Add in the catch all content to the field. Default impl. uses the {@link #contentFieldName}
+ * and the {@link #catchAllBuilder}
+ */
+ protected void addContent() {
+ addField(contentFieldName, catchAllBuilder.toString(), null);
+ }
+
+ /**
+ * Add in the literals to the document using the {@link #params} and the {@link #LITERALS_PREFIX}.
+ */
+ protected void addLiterals() {
+ Iterator<String> paramNames = params.getParameterNamesIterator();
+ while (paramNames.hasNext()) {
+ String pname = paramNames.next();
+ if (!pname.startsWith(LITERALS_PREFIX)) continue;
+
+ String name = pname.substring(LITERALS_PREFIX.length());
+ addField(name, null, params.getParams(pname));
+ }
+ }
+
+ /**
+ * Add in any metadata using {@link #metadata} as the source.
+ */
+ protected void addMetadata() {
+ for (String name : metadata.names()) {
+ String[] vals = metadata.getValues(name);
+ addField(name, null, vals);
}
- return document;
}
// Naming rules:
// 1) optionally map names to nicenames (lowercase+underscores)
// 2) execute "map" commands
// 3) if resulting field is unknown, map it to a common prefix
- private void addField(String fname, String fval, String[] vals) {
+ protected void addField(String fname, String fval, String[] vals) {
if (lowerNames) {
StringBuilder sb = new StringBuilder();
for (int i=0; i<fname.length(); i++) {
Modified: lucene/dev/branches/solr2452/solr/contrib/uima/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr2452/solr/contrib/uima/CHANGES.txt?rev=1126234&r1=1126233&r2=1126234&view=diff
==============================================================================
--- lucene/dev/branches/solr2452/solr/contrib/uima/CHANGES.txt (original)
+++ lucene/dev/branches/solr2452/solr/contrib/uima/CHANGES.txt Sun May 22 21:45:19 2011
@@ -21,11 +21,33 @@ $Id$
================== 3.2.0-dev ==================
+Upgrading from Solr 3.1
+----------------------
+
+* <uimaConfig/> just beneath <config> ... </config> is no longer supported.
+ It should be moved into the UIMAUpdateRequestProcessorFactory settings.
+ See contrib/uima/README.txt for more details. (SOLR-2436)
+
+New Features
+----------------------
+
+* SOLR-2503: extend mapping function to map feature value to dynamicField. (koji)
+
+* SOLR-2512: add ignoreErrors flag so that users can ignore exceptions in AE.
+ (Tommaso Teofili, koji)
+
Test Cases:
+----------------------
+
+* SOLR-2387: add mock annotators for improved testing,
+ (Tommaso Teofili via rmuir)
+
+Other Changes
+----------------------
- * SOLR-2387: add mock annotators for improved testing,
- (Tommaso Teofili via rmuir)
+* SOLR-2436: move uimaConfig to under the uima's update processor in solrconfig.xml.
+ (Tommaso Teofili, koji)
-================== 3.1.0-dev ==================
+================== 3.1.0 ==================
Initial Release
Modified: lucene/dev/branches/solr2452/solr/contrib/uima/README.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr2452/solr/contrib/uima/README.txt?rev=1126234&r1=1126233&r2=1126234&view=diff
==============================================================================
--- lucene/dev/branches/solr2452/solr/contrib/uima/README.txt (original)
+++ lucene/dev/branches/solr2452/solr/contrib/uima/README.txt Sun May 22 21:45:19 2011
@@ -3,38 +3,74 @@ Getting Started
To start using Solr UIMA Metadata Extraction Library you should go through the following configuration steps:
1. copy generated solr-uima jar and its libs (under contrib/uima/lib) inside a Solr libraries directory.
+ or set <lib/> tags in solrconfig.xml appropriately to point to those jar files.
+
+ <lib dir="../../contrib/uima/lib" />
+ <lib dir="../../dist/" regex="apache-solr-uima-\d.*\.jar" />
2. modify your schema.xml adding the fields you want to hold metadata, specifying proper values for type, indexed, stored and multiValued options:
-3. for example you could specify the following
+ for example you could specify the following
+
<field name="language" type="string" indexed="true" stored="true" required="false"/>
<field name="concept" type="string" indexed="true" stored="true" multiValued="true" required="false"/>
<field name="sentence" type="text" indexed="true" stored="true" multiValued="true" required="false" />
-4. modify your solrconfig.xml adding the following snippet:
- <uimaConfig>
- <runtimeParameters>
- <keyword_apikey>VALID_ALCHEMYAPI_KEY</keyword_apikey>
- <concept_apikey>VALID_ALCHEMYAPI_KEY</concept_apikey>
- <lang_apikey>VALID_ALCHEMYAPI_KEY</lang_apikey>
- <cat_apikey>VALID_ALCHEMYAPI_KEY</cat_apikey>
- <entities_apikey>VALID_ALCHEMYAPI_KEY</entities_apikey>
- <oc_licenseID>VALID_OPENCALAIS_KEY</oc_licenseID>
- </runtimeParameters>
- <analysisEngine>/org/apache/uima/desc/OverridingParamsExtServicesAE.xml</analysisEngine>
- <analyzeFields merge="false">text</analyzeFields>
- <fieldMapping>
- <type name="org.apache.uima.alchemy.ts.concept.ConceptFS">
- <map feature="text" field="concept"/>
- </type>
- <type name="org.apache.uima.alchemy.ts.language.LanguageFS">
- <map feature="language" field="language"/>
- </type>
- <type name="org.apache.uima.SentenceAnnotation">
- <map feature="coveredText" field="sentence"/>
- </type>
- </fieldMapping>
- </uimaConfig>
+3. modify your solrconfig.xml adding the following snippet:
+
+ <updateRequestProcessorChain name="uima">
+ <processor class="org.apache.solr.uima.processor.UIMAUpdateRequestProcessorFactory">
+ <lst name="uimaConfig">
+ <lst name="runtimeParameters">
+ <str name="keyword_apikey">VALID_ALCHEMYAPI_KEY</str>
+ <str name="concept_apikey">VALID_ALCHEMYAPI_KEY</str>
+ <str name="lang_apikey">VALID_ALCHEMYAPI_KEY</str>
+ <str name="cat_apikey">VALID_ALCHEMYAPI_KEY</str>
+ <str name="entities_apikey">VALID_ALCHEMYAPI_KEY</str>
+ <str name="oc_licenseID">VALID_OPENCALAIS_KEY</str>
+ </lst>
+ <str name="analysisEngine">/org/apache/uima/desc/OverridingParamsExtServicesAE.xml</str>
+ <!-- Set to true if you want to continue indexing even if text processing fails.
+ Default is false. That is, Solr throws a RuntimeException and
+ none of the documents in your session are indexed. -->
+ <bool name="ignoreErrors">true</bool>
+ <!-- This is optional. It is used for logging when text processing fails.
+ Usually set to the uniqueKey field name. -->
+ <str name="logField">id</str>
+ <lst name="analyzeFields">
+ <bool name="merge">false</bool>
+ <arr name="fields">
+ <str>text</str>
+ </arr>
+ </lst>
+ <lst name="fieldMappings">
+ <lst name="type">
+ <str name="name">org.apache.uima.alchemy.ts.concept.ConceptFS</str>
+ <lst name="mapping">
+ <str name="feature">text</str>
+ <str name="field">concept</str>
+ </lst>
+ </lst>
+ <lst name="type">
+ <str name="name">org.apache.uima.alchemy.ts.language.LanguageFS</str>
+ <lst name="mapping">
+ <str name="feature">language</str>
+ <str name="field">language</str>
+ </lst>
+ </lst>
+ <lst name="type">
+ <str name="name">org.apache.uima.SentenceAnnotation</str>
+ <lst name="mapping">
+ <str name="feature">coveredText</str>
+ <str name="field">sentence</str>
+ </lst>
+ </lst>
+ </lst>
+ </lst>
+ </processor>
+ <processor class="solr.LogUpdateProcessorFactory" />
+ <processor class="solr.RunUpdateProcessorFactory" />
+ </updateRequestProcessorChain>
where VALID_ALCHEMYAPI_KEY is your AlchemyAPI Access Key. You need to register for an AlchemyAPI Access
Key to use the AlchemyAPI services: http://www.alchemyapi.com/api/register.html
@@ -42,21 +78,14 @@ To start using Solr UIMA Metadata Extrac
where VALID_OPENCALAIS_KEY is your Calais Service Key. You need to register for a Calais Service
Key to use the Calais services: http://www.opencalais.com/apikey
-5. the analysisEngine tag must contain an AE descriptor inside the specified path in the classpath
+ the analysisEngine must contain an AE descriptor inside the specified path in the classpath
-6. the analyzeFields tag must contain the input fields that need to be analyzed by UIMA,
+ the analyzeFields must contain the input fields that need to be analyzed by UIMA,
if merge=true then their content will be merged and analyzed only once
-7. field mapping describes which features of which types should go in a field
-
-8. define in your solrconfig.xml an UpdateRequestProcessorChain as following:
- <updateRequestProcessorChain name="uima">
- <processor class="org.apache.solr.uima.processor.UIMAUpdateRequestProcessorFactory"/>
- <processor class="solr.LogUpdateProcessorFactory" />
- <processor class="solr.RunUpdateProcessorFactory" />
- </updateRequestProcessorChain>
+ field mapping describes which features of which types should go in a field
-9. in your solrconfig.xml replace the existing default (<requestHandler name="/update"...) or create a new UpdateRequestHandler with the following:
+4. in your solrconfig.xml replace the existing default (<requestHandler name="/update"...) or create a new UpdateRequestHandler with the following:
<requestHandler name="/update" class="solr.XmlUpdateRequestHandler">
<lst name="defaults">
<str name="update.processor">uima</str>
Modified: lucene/dev/branches/solr2452/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/SolrUIMAConfiguration.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr2452/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/SolrUIMAConfiguration.java?rev=1126234&r1=1126233&r2=1126234&view=diff
==============================================================================
--- lucene/dev/branches/solr2452/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/SolrUIMAConfiguration.java (original)
+++ lucene/dev/branches/solr2452/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/SolrUIMAConfiguration.java Sun May 22 21:45:19 2011
@@ -21,7 +21,7 @@ import java.util.Map;
/**
* Configuration holding all the configurable parameters for calling UIMA inside Solr
- *
+ *
* @version $Id$
*/
public class SolrUIMAConfiguration {
@@ -30,20 +30,26 @@ public class SolrUIMAConfiguration {
private boolean fieldsMerging;
- private Map<String, Map<String, String>> typesFeaturesFieldsMapping;
+ private Map<String, Map<String, MapField>> typesFeaturesFieldsMapping;
private String aePath;
private Map<String, Object> runtimeParameters;
+ private boolean ignoreErrors;
+
+ private String logField;
+
public SolrUIMAConfiguration(String aePath, String[] fieldsToAnalyze, boolean fieldsMerging,
- Map<String, Map<String, String>> typesFeaturesFieldsMapping,
- Map<String, Object> runtimeParameters) {
+ Map<String, Map<String, MapField>> typesFeaturesFieldsMapping,
+ Map<String, Object> runtimeParameters, boolean ignoreErrors, String logField) {
this.aePath = aePath;
this.fieldsToAnalyze = fieldsToAnalyze;
this.fieldsMerging = fieldsMerging;
this.runtimeParameters = runtimeParameters;
this.typesFeaturesFieldsMapping = typesFeaturesFieldsMapping;
+ this.ignoreErrors = ignoreErrors;
+ this.logField = logField;
}
public String[] getFieldsToAnalyze() {
@@ -54,7 +60,7 @@ public class SolrUIMAConfiguration {
return fieldsMerging;
}
- public Map<String, Map<String, String>> getTypesFeaturesFieldsMapping() {
+ public Map<String, Map<String, MapField>> getTypesFeaturesFieldsMapping() {
return typesFeaturesFieldsMapping;
}
@@ -66,4 +72,46 @@ public class SolrUIMAConfiguration {
return runtimeParameters;
}
+ public boolean isIgnoreErrors() {
+ return ignoreErrors;
+ }
+
+ public String getLogField(){
+ return logField;
+ }
+
+ static final class MapField {
+
+ private String fieldName, fieldNameFeature;
+ private boolean prefix; // valid if dynamicField == true
+ // false: *_s, true: s_*
+
+ MapField(String fieldName, String fieldNameFeature){
+ this.fieldName = fieldName;
+ this.fieldNameFeature = fieldNameFeature;
+ if(fieldNameFeature != null){
+ if(fieldName.startsWith("*")){
+ prefix = false;
+ this.fieldName = fieldName.substring(1);
+ }
+ else if(fieldName.endsWith("*")){
+ prefix = true;
+ this.fieldName = fieldName.substring(0, fieldName.length() - 1);
+ }
+ else
+ throw new RuntimeException("static field name cannot be used for dynamicField");
+ }
+ }
+
+ String getFieldNameFeature(){
+ return fieldNameFeature;
+ }
+
+ String getFieldName(String featureValue){
+ if(fieldNameFeature != null){
+ return prefix ? fieldName + featureValue : featureValue + fieldName;
+ }
+ return fieldName;
+ }
+ }
}
Modified: lucene/dev/branches/solr2452/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/SolrUIMAConfigurationReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr2452/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/SolrUIMAConfigurationReader.java?rev=1126234&r1=1126233&r2=1126234&view=diff
==============================================================================
--- lucene/dev/branches/solr2452/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/SolrUIMAConfigurationReader.java (original)
+++ lucene/dev/branches/solr2452/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/SolrUIMAConfigurationReader.java Sun May 22 21:45:19 2011
@@ -18,11 +18,11 @@ package org.apache.solr.uima.processor;
*/
import java.util.HashMap;
+import java.util.List;
import java.util.Map;
-import org.apache.solr.core.SolrConfig;
-import org.w3c.dom.Node;
-import org.w3c.dom.NodeList;
+import org.apache.solr.common.util.NamedList;
+import org.apache.solr.uima.processor.SolrUIMAConfiguration.MapField;
/**
* Read configuration for Solr-UIMA integration
@@ -32,94 +32,86 @@ import org.w3c.dom.NodeList;
*/
public class SolrUIMAConfigurationReader {
- private static final String AE_RUNTIME_PARAMETERS_NODE_PATH = "/config/uimaConfig/runtimeParameters";
+ private NamedList<Object> args;
- private static final String FIELD_MAPPING_NODE_PATH = "/config/uimaConfig/fieldMapping";
-
- private static final String ANALYZE_FIELDS_NODE_PATH = "/config/uimaConfig/analyzeFields";
-
- private static final String ANALYSIS_ENGINE_NODE_PATH = "/config/uimaConfig/analysisEngine";
-
- private SolrConfig solrConfig;
-
- public SolrUIMAConfigurationReader(SolrConfig solrConfig) {
- this.solrConfig = solrConfig;
+ public SolrUIMAConfigurationReader(NamedList<Object> args) {
+ this.args = args;
}
public SolrUIMAConfiguration readSolrUIMAConfiguration() {
return new SolrUIMAConfiguration(readAEPath(), readFieldsToAnalyze(), readFieldsMerging(),
- readTypesFeaturesFieldsMapping(), readAEOverridingParameters());
+ readTypesFeaturesFieldsMapping(), readAEOverridingParameters(), readIgnoreErrors(),
+ readLogField());
}
private String readAEPath() {
- return solrConfig.getNode(ANALYSIS_ENGINE_NODE_PATH, true).getTextContent();
+ return (String) args.get("analysisEngine");
}
+ @SuppressWarnings("rawtypes")
+ private NamedList getAnalyzeFields() {
+ return (NamedList) args.get("analyzeFields");
+ }
+
+ @SuppressWarnings("unchecked")
private String[] readFieldsToAnalyze() {
- Node analyzeFieldsNode = solrConfig.getNode(ANALYZE_FIELDS_NODE_PATH, true);
- return analyzeFieldsNode.getTextContent().split(",");
+ List<String> fields = (List<String>) getAnalyzeFields().get("fields");
+ return fields.toArray(new String[fields.size()]);
}
private boolean readFieldsMerging() {
- Node analyzeFieldsNode = solrConfig.getNode(ANALYZE_FIELDS_NODE_PATH, true);
- Node mergeNode = analyzeFieldsNode.getAttributes().getNamedItem("merge");
- return Boolean.valueOf(mergeNode.getNodeValue());
+ return (Boolean) getAnalyzeFields().get("merge");
}
- private Map<String, Map<String, String>> readTypesFeaturesFieldsMapping() {
- Map<String, Map<String, String>> map = new HashMap<String, Map<String, String>>();
+ @SuppressWarnings("rawtypes")
+ private Map<String, Map<String, MapField>> readTypesFeaturesFieldsMapping() {
+ Map<String, Map<String, MapField>> map = new HashMap<String, Map<String, MapField>>();
- Node fieldMappingNode = solrConfig.getNode(FIELD_MAPPING_NODE_PATH, true);
+ NamedList fieldMappings = (NamedList) args.get("fieldMappings");
/* iterate over UIMA types */
- if (fieldMappingNode.hasChildNodes()) {
- NodeList typeNodes = fieldMappingNode.getChildNodes();
- for (int i = 0; i < typeNodes.getLength(); i++) {
- /* <type> node */
- Node typeNode = typeNodes.item(i);
- if (typeNode.getNodeType() != Node.TEXT_NODE) {
- Node typeNameAttribute = typeNode.getAttributes().getNamedItem("name");
- /* get a UIMA typename */
- String typeName = typeNameAttribute.getNodeValue();
- /* create entry for UIMA type */
- map.put(typeName, new HashMap<String, String>());
- if (typeNode.hasChildNodes()) {
- /* iterate over features */
- NodeList featuresNodeList = typeNode.getChildNodes();
- for (int j = 0; j < featuresNodeList.getLength(); j++) {
- Node mappingNode = featuresNodeList.item(j);
- if (mappingNode.getNodeType() != Node.TEXT_NODE) {
- /* get field name */
- Node fieldNameNode = mappingNode.getAttributes().getNamedItem("field");
- String mappedFieldName = fieldNameNode.getNodeValue();
- /* get feature name */
- Node featureNameNode = mappingNode.getAttributes().getNamedItem("feature");
- String featureName = featureNameNode.getNodeValue();
- /* map the feature to the field for the specified type */
- map.get(typeName).put(featureName, mappedFieldName);
- }
- }
- }
+ for (int i = 0; i < fieldMappings.size(); i++) {
+ NamedList type = (NamedList) fieldMappings.get("type", i);
+ String typeName = (String)type.get("name");
+
+ Map<String, MapField> subMap = new HashMap<String, MapField>();
+ /* iterate over mapping definitions */
+ for(int j = 0; j < type.size() - 1; j++){
+ NamedList mapping = (NamedList) type.get("mapping", j + 1);
+ String featureName = (String) mapping.get("feature");
+ String fieldNameFeature = null;
+ String mappedFieldName = (String) mapping.get("field");
+ if(mappedFieldName == null){
+ fieldNameFeature = (String) mapping.get("fieldNameFeature");
+ mappedFieldName = (String) mapping.get("dynamicField");
}
+ if(mappedFieldName == null)
+ throw new RuntimeException("either of field or dynamicField should be defined for feature " + featureName);
+ MapField mapField = new MapField(mappedFieldName, fieldNameFeature);
+ subMap.put(featureName, mapField);
}
+ map.put(typeName, subMap);
}
return map;
}
+ @SuppressWarnings("rawtypes")
private Map<String, Object> readAEOverridingParameters() {
Map<String, Object> runtimeParameters = new HashMap<String, Object>();
- Node uimaConfigNode = solrConfig.getNode(AE_RUNTIME_PARAMETERS_NODE_PATH, true);
-
- if (uimaConfigNode.hasChildNodes()) {
- NodeList overridingNodes = uimaConfigNode.getChildNodes();
- for (int i = 0; i < overridingNodes.getLength(); i++) {
- Node overridingNode = overridingNodes.item(i);
- if (overridingNode.getNodeType() != Node.TEXT_NODE && overridingNode.getNodeType() != Node.COMMENT_NODE) {
- runtimeParameters.put(overridingNode.getNodeName(), overridingNode.getTextContent());
- }
- }
+ NamedList runtimeParams = (NamedList) args.get("runtimeParameters");
+ for (int i = 0; i < runtimeParams.size(); i++) {
+ String name = runtimeParams.getName(i);
+ Object value = runtimeParams.getVal(i);
+ runtimeParameters.put(name, value);
}
-
return runtimeParameters;
}
+ private boolean readIgnoreErrors() {
+ Object ignoreErrors = args.get("ignoreErrors");
+ return ignoreErrors == null ? false : (Boolean)ignoreErrors;
+ }
+
+ private String readLogField() {
+ return (String)args.get("logField");
+ }
}
Modified: lucene/dev/branches/solr2452/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/UIMAToSolrMapper.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr2452/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/UIMAToSolrMapper.java?rev=1126234&r1=1126233&r2=1126234&view=diff
==============================================================================
--- lucene/dev/branches/solr2452/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/UIMAToSolrMapper.java (original)
+++ lucene/dev/branches/solr2452/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/UIMAToSolrMapper.java Sun May 22 21:45:19 2011
@@ -20,6 +20,7 @@ package org.apache.solr.uima.processor;
import java.util.Map;
import org.apache.solr.common.SolrInputDocument;
+import org.apache.solr.uima.processor.SolrUIMAConfiguration.MapField;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.cas.FeatureStructure;
import org.apache.uima.cas.Type;
@@ -53,7 +54,7 @@ public class UIMAToSolrMapper {
* name of UIMA type to map
* @param featureFieldsmapping
*/
- public void map(String typeName, Map<String, String> featureFieldsmapping) {
+ public void map(String typeName, Map<String, MapField> featureFieldsmapping) {
try {
FeatureStructure fsMock = (FeatureStructure) Class.forName(typeName).getConstructor(
JCas.class).newInstance(cas);
@@ -62,7 +63,11 @@ public class UIMAToSolrMapper {
.hasNext();) {
FeatureStructure fs = iterator.next();
for (String featureName : featureFieldsmapping.keySet()) {
- String fieldName = featureFieldsmapping.get(featureName);
+ MapField mapField = featureFieldsmapping.get(featureName);
+ String fieldNameFeature = mapField.getFieldNameFeature();
+ String fieldNameFeatureValue = fieldNameFeature == null ? null :
+ fs.getFeatureValueAsString(type.getFeatureByBaseName(fieldNameFeature));
+ String fieldName = mapField.getFieldName(fieldNameFeatureValue);
log.info(new StringBuffer("mapping ").append(typeName).append("@").append(featureName)
.append(" to ").append(fieldName).toString());
String featureValue = null;
Modified: lucene/dev/branches/solr2452/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/UIMAUpdateRequestProcessor.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr2452/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/UIMAUpdateRequestProcessor.java?rev=1126234&r1=1126233&r2=1126234&view=diff
==============================================================================
--- lucene/dev/branches/solr2452/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/UIMAUpdateRequestProcessor.java (original)
+++ lucene/dev/branches/solr2452/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/UIMAUpdateRequestProcessor.java Sun May 22 21:45:19 2011
@@ -20,8 +20,11 @@ package org.apache.solr.uima.processor;
import java.io.IOException;
import java.util.Map;
+import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
+import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.core.SolrCore;
+import org.apache.solr.uima.processor.SolrUIMAConfiguration.MapField;
import org.apache.solr.uima.processor.ae.AEProvider;
import org.apache.solr.uima.processor.ae.AEProviderFactory;
import org.apache.solr.update.AddUpdateCommand;
@@ -34,43 +37,45 @@ import org.apache.uima.resource.Resource
/**
* Update document(s) to be indexed with UIMA extracted information
- *
+ *
* @version $Id$
*/
public class UIMAUpdateRequestProcessor extends UpdateRequestProcessor {
- private SolrUIMAConfiguration solrUIMAConfiguration;
+ SolrUIMAConfiguration solrUIMAConfiguration;
private AEProvider aeProvider;
- public UIMAUpdateRequestProcessor(UpdateRequestProcessor next, SolrCore solrCore) {
+ public UIMAUpdateRequestProcessor(UpdateRequestProcessor next, SolrCore solrCore,
+ SolrUIMAConfiguration config) {
super(next);
- initialize(solrCore);
+ initialize(solrCore, config);
}
- private void initialize(SolrCore solrCore) {
- SolrUIMAConfigurationReader uimaConfigurationReader = new SolrUIMAConfigurationReader(solrCore
- .getSolrConfig());
- solrUIMAConfiguration = uimaConfigurationReader.readSolrUIMAConfiguration();
+ private void initialize(SolrCore solrCore, SolrUIMAConfiguration config) {
+ solrUIMAConfiguration = config;
aeProvider = AEProviderFactory.getInstance().getAEProvider(solrCore.getName(),
solrUIMAConfiguration.getAePath(), solrUIMAConfiguration.getRuntimeParameters());
}
@Override
public void processAdd(AddUpdateCommand cmd) throws IOException {
+ String text = null;
try {
/* get Solr document */
SolrInputDocument solrInputDocument = cmd.getSolrInputDocument();
/* get the fields to analyze */
- for (String text : getTextsToAnalyze(solrInputDocument)) {
+ String[] texts = getTextsToAnalyze(solrInputDocument);
+ for (int i = 0; i < texts.length; i++) {
+ text = texts[i];
if (text != null && !"".equals(text)) {
/* process the text value */
JCas jcas = processText(text);
UIMAToSolrMapper uimaToSolrMapper = new UIMAToSolrMapper(solrInputDocument, jcas);
/* get field mapping from config */
- Map<String, Map<String, String>> typesAndFeaturesFieldsMap = solrUIMAConfiguration
+ Map<String, Map<String, MapField>> typesAndFeaturesFieldsMap = solrUIMAConfiguration
.getTypesFeaturesFieldsMapping();
/* map type features on fields */
for (String typeFQN : typesAndFeaturesFieldsMap.keySet()) {
@@ -79,7 +84,21 @@ public class UIMAUpdateRequestProcessor
}
}
} catch (UIMAException e) {
- throw new RuntimeException(e);
+ String logField = solrUIMAConfiguration.getLogField();
+ String optionalFieldInfo = logField == null ? "." :
+ new StringBuilder(". ").append(logField).append("=")
+ .append((String)cmd.getSolrInputDocument().getField(logField).getValue())
+ .append(", ").toString();
+ if (solrUIMAConfiguration.isIgnoreErrors())
+ log.warn(new StringBuilder("skip the text processing due to ")
+ .append(e.getLocalizedMessage()).append(optionalFieldInfo)
+ .append(" text=\"").append(text.substring(0, 100)).append("...\"").toString());
+ else{
+ throw new SolrException(ErrorCode.SERVER_ERROR,
+ new StringBuilder("processing error: ")
+ .append(e.getLocalizedMessage()).append(optionalFieldInfo)
+ .append(" text=\"").append(text.substring(0, 100)).append("...\"").toString(), e);
+ }
}
super.processAdd(cmd);
}
Modified: lucene/dev/branches/solr2452/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/UIMAUpdateRequestProcessorFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr2452/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/UIMAUpdateRequestProcessorFactory.java?rev=1126234&r1=1126233&r2=1126234&view=diff
==============================================================================
--- lucene/dev/branches/solr2452/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/UIMAUpdateRequestProcessorFactory.java (original)
+++ lucene/dev/branches/solr2452/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/UIMAUpdateRequestProcessorFactory.java Sun May 22 21:45:19 2011
@@ -17,6 +17,7 @@ package org.apache.solr.uima.processor;
* limitations under the License.
*/
+import org.apache.solr.common.util.NamedList;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.update.processor.UpdateRequestProcessor;
@@ -29,10 +30,19 @@ import org.apache.solr.update.processor.
*/
public class UIMAUpdateRequestProcessorFactory extends UpdateRequestProcessorFactory {
+ private NamedList<Object> args;
+
+ @SuppressWarnings("unchecked")
+ @Override
+ public void init(@SuppressWarnings("rawtypes") NamedList args) {
+ this.args = (NamedList<Object>) args.get("uimaConfig");
+ }
+
@Override
public UpdateRequestProcessor getInstance(SolrQueryRequest req, SolrQueryResponse rsp,
UpdateRequestProcessor next) {
- return new UIMAUpdateRequestProcessor(next, req.getCore());
+ return new UIMAUpdateRequestProcessor(next, req.getCore(),
+ new SolrUIMAConfigurationReader(args).readSolrUIMAConfiguration());
}
}
Modified: lucene/dev/branches/solr2452/solr/contrib/uima/src/main/resources/solr/conf/aggregate-uima-config.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr2452/solr/contrib/uima/src/main/resources/solr/conf/aggregate-uima-config.xml?rev=1126234&r1=1126233&r2=1126234&view=diff
==============================================================================
--- lucene/dev/branches/solr2452/solr/contrib/uima/src/main/resources/solr/conf/aggregate-uima-config.xml (original)
+++ lucene/dev/branches/solr2452/solr/contrib/uima/src/main/resources/solr/conf/aggregate-uima-config.xml Sun May 22 21:45:19 2011
@@ -15,19 +15,34 @@
limitations under the License.
-->
-<uimaConfig>
- <runtimeParameters>
- <keyword_apikey>VALID_ALCHEMYAPI_KEY</keyword_apikey>
- <concept_apikey>VALID_ALCHEMYAPI_KEY</concept_apikey>
- <lang_apikey>VALID_ALCHEMYAPI_KEY</lang_apikey>
- <cat_apikey>VALID_ALCHEMYAPI_KEY</cat_apikey>
- <oc_licenseID>VALID_OPENCALAIS_KEY</oc_licenseID>
- </runtimeParameters>
- <analysisEngine>/org/apache/uima/desc/OverridingParamsExtServicesAE.xml</analysisEngine>
- <analyzeFields merge="false">text,title</analyzeFields>
- <fieldMapping>
- <type name="org.apache.uima.jcas.tcas.Annotation">
- <map feature="coveredText" field="tag"/>
- </type>
- </fieldMapping>
-</uimaConfig>
\ No newline at end of file
+ <updateRequestProcessorChain name="uima">
+ <processor class="org.apache.solr.uima.processor.UIMAUpdateRequestProcessorFactory">
+ <lst name="uimaConfig">
+ <lst name="runtimeParameters">
+ <str name="keyword_apikey">VALID_ALCHEMYAPI_KEY</str>
+ <str name="concept_apikey">VALID_ALCHEMYAPI_KEY</str>
+ <str name="lang_apikey">VALID_ALCHEMYAPI_KEY</str>
+ <str name="cat_apikey">VALID_ALCHEMYAPI_KEY</str>
+ <str name="entities_apikey">VALID_ALCHEMYAPI_KEY</str>
+ <str name="oc_licenseID">VALID_OPENCALAIS_KEY</str>
+ </lst>
+ <str name="analysisEngine">/org/apache/uima/desc/OverridingParamsExtServicesAE.xml</str>
+ <lst name="analyzeFields">
+ <bool name="merge">false</bool>
+ <arr name="fields">
+ <str>text</str>
+ <str>title</str>
+ </arr>
+ </lst>
+ <lst name="fieldMappings">
+ <lst name="mapping">
+ <str name="type">org.apache.uima.jcas.tcas.Annotation</str>
+ <str name="feature">convertText</str>
+ <str name="field">tag</str>
+ </lst>
+ </lst>
+ </lst>
+ </processor>
+ <processor class="solr.LogUpdateProcessorFactory" />
+ <processor class="solr.RunUpdateProcessorFactory" />
+ </updateRequestProcessorChain>
Modified: lucene/dev/branches/solr2452/solr/contrib/uima/src/test/java/org/apache/solr/uima/processor/UIMAUpdateRequestProcessorTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr2452/solr/contrib/uima/src/test/java/org/apache/solr/uima/processor/UIMAUpdateRequestProcessorTest.java?rev=1126234&r1=1126233&r2=1126234&view=diff
==============================================================================
--- lucene/dev/branches/solr2452/solr/contrib/uima/src/test/java/org/apache/solr/uima/processor/UIMAUpdateRequestProcessorTest.java (original)
+++ lucene/dev/branches/solr2452/solr/contrib/uima/src/test/java/org/apache/solr/uima/processor/UIMAUpdateRequestProcessorTest.java Sun May 22 21:45:19 2011
@@ -33,6 +33,8 @@ import org.apache.solr.core.SolrCore;
import org.apache.solr.handler.XmlUpdateRequestHandler;
import org.apache.solr.request.SolrQueryRequestBase;
import org.apache.solr.response.SolrQueryResponse;
+import org.apache.solr.uima.processor.SolrUIMAConfiguration.MapField;
+import org.apache.solr.update.processor.UpdateRequestProcessor;
import org.apache.solr.update.processor.UpdateRequestProcessorChain;
import org.junit.Before;
import org.junit.BeforeClass;
@@ -66,12 +68,32 @@ public class UIMAUpdateRequestProcessorT
UIMAUpdateRequestProcessorFactory factory = (UIMAUpdateRequestProcessorFactory) chained
.getFactories()[0];
assertNotNull(factory);
+ UpdateRequestProcessor processor = factory.getInstance(req(), null, null);
+ assertTrue(processor instanceof UIMAUpdateRequestProcessor);
+ }
+
+ @Test
+ public void testMultiMap() {
+ SolrCore core = h.getCore();
+ UpdateRequestProcessorChain chained = core.getUpdateProcessingChain("uima-multi-map");
+ assertNotNull(chained);
+ UIMAUpdateRequestProcessorFactory factory = (UIMAUpdateRequestProcessorFactory) chained
+ .getFactories()[0];
+ assertNotNull(factory);
+ UpdateRequestProcessor processor = factory.getInstance(req(), null, null);
+ assertTrue(processor instanceof UIMAUpdateRequestProcessor);
+ SolrUIMAConfiguration conf = ((UIMAUpdateRequestProcessor)processor).solrUIMAConfiguration;
+ Map<String, Map<String, MapField>> map = conf.getTypesFeaturesFieldsMapping();
+ Map<String, MapField> subMap = map.get("a-type-which-can-have-multiple-features");
+ assertEquals(2, subMap.size());
+ assertEquals("1", subMap.get("A").getFieldName(null));
+ assertEquals("2", subMap.get("B").getFieldName(null));
}
@Test
public void testProcessing() throws Exception {
- addDoc(adoc(
+ addDoc("uima", adoc(
"id",
"2312312321312",
"text",
@@ -83,19 +105,19 @@ public class UIMAUpdateRequestProcessorT
assertU(commit());
assertQ(req("sentence:*"), "//*[@numFound='1']");
assertQ(req("sentiment:*"), "//*[@numFound='0']");
- assertQ(req("entity:Prague"), "//*[@numFound='1']");
+ assertQ(req("OTHER_sm:Prague"), "//*[@numFound='1']");
}
@Test
public void testTwoUpdates() throws Exception {
- addDoc(adoc("id", "1", "text", "The Apache Software Foundation is happy to announce "
+ addDoc("uima", adoc("id", "1", "text", "The Apache Software Foundation is happy to announce "
+ "BarCampApache Sydney, Australia, the first ASF-backed event in the Southern "
+ "Hemisphere!"));
assertU(commit());
assertQ(req("sentence:*"), "//*[@numFound='1']");
- addDoc(adoc("id", "2", "text", "Taking place 11th December 2010 at the University "
+ addDoc("uima", adoc("id", "2", "text", "Taking place 11th December 2010 at the University "
+ "of Sydney's Darlington Centre, the BarCampApache \"unconference\" will be"
+ " attendee-driven, facilitated by members of the Apache community and will "
+ "focus on the Apache..."));
@@ -103,12 +125,44 @@ public class UIMAUpdateRequestProcessorT
assertQ(req("sentence:*"), "//*[@numFound='2']");
assertQ(req("sentiment:positive"), "//*[@numFound='1']");
- assertQ(req("entity:Apache"), "//*[@numFound='2']");
+ assertQ(req("ORGANIZATION_sm:Apache"), "//*[@numFound='2']");
+ }
+
+ @Test
+ public void testErrorHandling() throws Exception {
+
+ try{
+ addDoc("uima-not-ignoreErrors", adoc(
+ "id",
+ "2312312321312",
+ "text",
+ "SpellCheckComponent got improvement related to recent Lucene changes. \n "
+ + "Add support for specifying Spelling SuggestWord Comparator to Lucene spell "
+ + "checkers for SpellCheckComponent. Issue SOLR-2053 is already fixed, patch is"
+ + " attached if you need it, but it is also committed to trunk and 3_x branch."
+ + " Last Lucene European Conference has been held in Prague."));
+ fail("exception shouldn't be ignored");
+ }
+ catch(RuntimeException expected){}
+ assertU(commit());
+ assertQ(req("*:*"), "//*[@numFound='0']");
+
+ addDoc("uima-ignoreErrors", adoc(
+ "id",
+ "2312312321312",
+ "text",
+ "SpellCheckComponent got improvement related to recent Lucene changes. \n "
+ + "Add support for specifying Spelling SuggestWord Comparator to Lucene spell "
+ + "checkers for SpellCheckComponent. Issue SOLR-2053 is already fixed, patch is"
+ + " attached if you need it, but it is also committed to trunk and 3_x branch."
+ + " Last Lucene European Conference has been held in Prague."));
+ assertU(commit());
+ assertQ(req("*:*"), "//*[@numFound='1']");
}
- private void addDoc(String doc) throws Exception {
+ private void addDoc(String chain, String doc) throws Exception {
Map<String, String[]> params = new HashMap<String, String[]>();
- params.put(UpdateParams.UPDATE_PROCESSOR, new String[] { "uima" });
+ params.put(UpdateParams.UPDATE_CHAIN, new String[] { chain });
MultiMapSolrParams mmparams = new MultiMapSolrParams(params);
SolrQueryRequestBase req = new SolrQueryRequestBase(h.getCore(), (SolrParams) mmparams) {
};
Modified: lucene/dev/branches/solr2452/solr/contrib/uima/src/test/java/org/apache/solr/uima/processor/an/DummyEntityAnnotator.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/solr2452/solr/contrib/uima/src/test/java/org/apache/solr/uima/processor/an/DummyEntityAnnotator.java?rev=1126234&r1=1126233&r2=1126234&view=diff
==============================================================================
--- lucene/dev/branches/solr2452/solr/contrib/uima/src/test/java/org/apache/solr/uima/processor/an/DummyEntityAnnotator.java (original)
+++ lucene/dev/branches/solr2452/solr/contrib/uima/src/test/java/org/apache/solr/uima/processor/an/DummyEntityAnnotator.java Sun May 22 21:45:19 2011
@@ -34,6 +34,12 @@ public class DummyEntityAnnotator extend
EntityAnnotation entityAnnotation = new EntityAnnotation(jcas);
entityAnnotation.setBegin(annotation.getBegin());
entityAnnotation.setEnd(annotation.getEnd());
+ String entityString = annotation.getCoveredText();
+ entityAnnotation.setEntity(entityString);
+ String name = "OTHER"; // "OTHER" makes no sense. In practice, "PERSON", "COUNTRY", "E-MAIL", etc.
+ if(entityString.equals("Apache"))
+ name = "ORGANIZATION";
+ entityAnnotation.setName(name);
entityAnnotation.addToIndexes();
}
}