You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ja...@apache.org on 2012/06/27 14:05:57 UTC
svn commit: r1354455 - in /lucene/dev/trunk/solr: ./
contrib/extraction/src/java/org/apache/solr/handler/extraction/
contrib/extraction/src/test/org/apache/solr/handler/extraction/
Author: janhoy
Date: Wed Jun 27 12:05:55 2012
New Revision: 1354455
URL: http://svn.apache.org/viewvc?rev=1354455&view=rev
Log:
SOLR-1856: In Solr Cell, literals should override Tika-parsed values
Modified:
lucene/dev/trunk/solr/CHANGES.txt
lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingParams.java
lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandler.java
lucene/dev/trunk/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java
Modified: lucene/dev/trunk/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/CHANGES.txt?rev=1354455&r1=1354454&r2=1354455&view=diff
==============================================================================
--- lucene/dev/trunk/solr/CHANGES.txt (original)
+++ lucene/dev/trunk/solr/CHANGES.txt Wed Jun 27 12:05:55 2012
@@ -371,6 +371,10 @@ New Features
* SOLR-3542: Add WeightedFragListBuilder for FVH and set it to default fragListBuilder
in example solrconfig.xml. (Sebastian Lutze, koji)
+* SOLR-1856: In Solr Cell, literals should override Tika-parsed values.
+ Patch adds a param "literalsOverride" which defaults to true, but can be set
+ to "false" to let Tika-parsed values be appended to literal values (Chris Harris, janhoy)
+
Optimizations
----------------------
Modified: lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingParams.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingParams.java?rev=1354455&r1=1354454&r2=1354455&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingParams.java (original)
+++ lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingParams.java Wed Jun 27 12:05:55 2012
@@ -96,6 +96,10 @@ public interface ExtractingParams {
*/
public static final String CAPTURE_ATTRIBUTES = "captureAttr";
+ /**
+ * Literal field values will by default override other values such as metadata and content. Set this to false to revert to pre-4.0 behaviour.
+ */
+ public static final String LITERALS_OVERRIDE = "literalsOverride";
/**
* Capture the specified fields (and everything included below it that isn't captured by some other capture field) separately from the default. This is different
Modified: lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandler.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandler.java?rev=1354455&r1=1354454&r2=1354455&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandler.java (original)
+++ lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandler.java Wed Jun 27 12:05:55 2012
@@ -66,6 +66,9 @@ public class SolrContentHandler extends
protected String unknownFieldPrefix = "";
protected String defaultField = "";
+ private boolean literalsOverride;
+ private Set<String> literalFieldNames;
+
public SolrContentHandler(Metadata metadata, SolrParams params, IndexSchema schema) {
this(metadata, params, schema, DateUtil.DEFAULT_DATE_FORMATS);
}
@@ -81,6 +84,7 @@ public class SolrContentHandler extends
this.lowerNames = params.getBool(LOWERNAMES, false);
this.captureAttribs = params.getBool(CAPTURE_ATTRIBUTES, false);
+ this.literalsOverride = params.getBool(LITERALS_OVERRIDE, true);
this.unknownFieldPrefix = params.get(UNKNOWN_FIELD_PREFIX, "");
this.defaultField = params.get(DEFAULT_FIELD, "");
String[] captureFields = params.getParams(CAPTURE_ELEMENTS);
@@ -107,13 +111,11 @@ public class SolrContentHandler extends
* @see #addLiterals()
*/
public SolrInputDocument newDocument() {
- float boost = 1.0f;
- //handle the metadata extracted from the document
- addMetadata();
-
- //handle the literals from the params
+ //handle the literals from the params. NOTE: This MUST be called before the others in order for literals to override other values
addLiterals();
+ //handle the metadata extracted from the document
+ addMetadata();
//add in the content
addContent();
@@ -134,8 +136,10 @@ public class SolrContentHandler extends
protected void addCapturedContent() {
for (Map.Entry<String, StringBuilder> entry : fieldBuilders.entrySet()) {
if (entry.getValue().length() > 0) {
- addField(entry.getKey(), entry.getValue().toString(), null);
- }
+ String fieldName = entry.getKey();
+ if (literalsOverride && literalFieldNames.contains(fieldName))
+ continue;
+ addField(fieldName, entry.getValue().toString(), null); }
}
}
@@ -144,6 +148,8 @@ public class SolrContentHandler extends
* and the {@link #catchAllBuilder}
*/
protected void addContent() {
+ if (literalsOverride && literalFieldNames.contains(contentFieldName))
+ return;
addField(contentFieldName, catchAllBuilder.toString(), null);
}
@@ -152,12 +158,14 @@ public class SolrContentHandler extends
*/
protected void addLiterals() {
Iterator<String> paramNames = params.getParameterNamesIterator();
+ literalFieldNames = new HashSet<String>();
while (paramNames.hasNext()) {
String pname = paramNames.next();
if (!pname.startsWith(LITERALS_PREFIX)) continue;
String name = pname.substring(LITERALS_PREFIX.length());
addField(name, null, params.getParams(pname));
+ literalFieldNames.add(name);
}
}
@@ -166,6 +174,8 @@ public class SolrContentHandler extends
*/
protected void addMetadata() {
for (String name : metadata.names()) {
+ if (literalsOverride && literalFieldNames.contains(name))
+ continue;
String[] vals = metadata.getValues(name);
addField(name, null, vals);
}
Modified: lucene/dev/trunk/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java?rev=1354455&r1=1354454&r2=1354455&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java (original)
+++ lucene/dev/trunk/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java Wed Jun 27 12:05:55 2012
@@ -444,7 +444,71 @@ public class ExtractingRequestHandlerTes
}
catch(Exception expected){}
}
-
+
+ public void testLiteralsOverride() throws Exception {
+ ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
+ assertTrue("handler is null and it shouldn't be", handler != null);
+
+ assertQ(req("*:*"), "//*[@numFound='0']");
+
+ // Here Tika should parse out a title for this document:
+ loadLocal("extraction/solr-word.pdf",
+ "fmap.created", "extractedDate",
+ "fmap.producer", "extractedProducer",
+ "fmap.creator", "extractedCreator",
+ "fmap.Keywords", "extractedKeywords",
+ "fmap.Author", "extractedAuthor",
+ "literal.id", "three",
+ "fmap.content", "extractedContent",
+ "fmap.language", "extractedLanguage",
+ "fmap.Creation-Date", "extractedDate",
+ "fmap.AAPL:Keywords", "ignored_a",
+ "fmap.xmpTPg:NPages", "ignored_a",
+ "fmap.Last-Modified", "extractedDate");
+
+ // Here the literal value should override the Tika-parsed title:
+ loadLocal("extraction/solr-word.pdf",
+ "literal.title", "wolf-man",
+ "fmap.created", "extractedDate",
+ "fmap.producer", "extractedProducer",
+ "fmap.creator", "extractedCreator",
+ "fmap.Keywords", "extractedKeywords",
+ "fmap.Author", "extractedAuthor",
+ "literal.id", "four",
+ "fmap.content", "extractedContent",
+ "fmap.language", "extractedLanguage",
+ "fmap.Creation-Date", "extractedDate",
+ "fmap.AAPL:Keywords", "ignored_a",
+ "fmap.xmpTPg:NPages", "ignored_a",
+ "fmap.Last-Modified", "extractedDate");
+
+ // Here we mimic the old behaviour where literals are added, not overridden
+ loadLocal("extraction/solr-word.pdf",
+ "literalsOverride", "false",
+ // Trick - we first map the metadata-title to an ignored field before we replace with literal title
+ "fmap.title", "ignored_a",
+ "literal.title", "old-behaviour",
+ "literal.extractedKeywords", "literalkeyword",
+ "fmap.created", "extractedDate",
+ "fmap.producer", "extractedProducer",
+ "fmap.creator", "extractedCreator",
+ "fmap.Keywords", "extractedKeywords",
+ "fmap.Author", "extractedAuthor",
+ "literal.id", "five",
+ "fmap.content", "extractedContent",
+ "fmap.language", "extractedLanguage",
+ "fmap.Creation-Date", "extractedDate",
+ "fmap.AAPL:Keywords", "ignored_a",
+ "fmap.xmpTPg:NPages", "ignored_a",
+ "fmap.Last-Modified", "extractedDate");
+
+ assertU(commit());
+
+ assertQ(req("title:solr-word"), "//*[@numFound='1']");
+ assertQ(req("title:wolf-man"), "//*[@numFound='1']");
+ assertQ(req("extractedKeywords:(solr AND word AND pdf AND literalkeyword)"), "//*[@numFound='1']");
+ }
+
SolrQueryResponse loadLocal(String filename, String... args) throws Exception {
LocalSolrQueryRequest req = (LocalSolrQueryRequest) req(args);
try {