You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ja...@apache.org on 2012/06/27 14:05:57 UTC

svn commit: r1354455 - in /lucene/dev/trunk/solr: ./ contrib/extraction/src/java/org/apache/solr/handler/extraction/ contrib/extraction/src/test/org/apache/solr/handler/extraction/

Author: janhoy
Date: Wed Jun 27 12:05:55 2012
New Revision: 1354455

URL: http://svn.apache.org/viewvc?rev=1354455&view=rev
Log:
SOLR-1856: In Solr Cell, literals should override Tika-parsed values

Modified:
    lucene/dev/trunk/solr/CHANGES.txt
    lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingParams.java
    lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandler.java
    lucene/dev/trunk/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java

Modified: lucene/dev/trunk/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/CHANGES.txt?rev=1354455&r1=1354454&r2=1354455&view=diff
==============================================================================
--- lucene/dev/trunk/solr/CHANGES.txt (original)
+++ lucene/dev/trunk/solr/CHANGES.txt Wed Jun 27 12:05:55 2012
@@ -371,6 +371,10 @@ New Features
 * SOLR-3542: Add WeightedFragListBuilder for FVH and set it to default fragListBuilder
   in example solrconfig.xml. (Sebastian Lutze, koji)
 
+* SOLR-1856: In Solr Cell, literals should override Tika-parsed values.
+  Patch adds a param "literalsOverride" which defaults to true, but can be set 
+  to "false" to let Tika-parsed values be appended to literal values (Chris Harris, janhoy)
+
 Optimizations
 ----------------------
 

Modified: lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingParams.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingParams.java?rev=1354455&r1=1354454&r2=1354455&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingParams.java (original)
+++ lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingParams.java Wed Jun 27 12:05:55 2012
@@ -96,6 +96,10 @@ public interface ExtractingParams {
    */
   public static final String CAPTURE_ATTRIBUTES = "captureAttr";
 
+  /**
+   * Literal field values override other values for the same field (such as Tika-extracted metadata and content) by default. Set this to false to revert to the pre-4.0 behaviour, where extracted values are appended to the literal values.
+   */
+  public static final String LITERALS_OVERRIDE = "literalsOverride";
 
   /**
    * Capture the specified fields (and everything included below it that isn't capture by some other capture field) separately from the default.  This is different

Modified: lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandler.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandler.java?rev=1354455&r1=1354454&r2=1354455&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandler.java (original)
+++ lucene/dev/trunk/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/SolrContentHandler.java Wed Jun 27 12:05:55 2012
@@ -66,6 +66,9 @@ public class SolrContentHandler extends 
   protected String unknownFieldPrefix = "";
   protected String defaultField = "";
 
+  private boolean literalsOverride;
+  private Set<String> literalFieldNames;
+  
   public SolrContentHandler(Metadata metadata, SolrParams params, IndexSchema schema) {
     this(metadata, params, schema, DateUtil.DEFAULT_DATE_FORMATS);
   }
@@ -81,6 +84,7 @@ public class SolrContentHandler extends 
 
     this.lowerNames = params.getBool(LOWERNAMES, false);
     this.captureAttribs = params.getBool(CAPTURE_ATTRIBUTES, false);
+    this.literalsOverride = params.getBool(LITERALS_OVERRIDE, true);
     this.unknownFieldPrefix = params.get(UNKNOWN_FIELD_PREFIX, "");
     this.defaultField = params.get(DEFAULT_FIELD, "");
     String[] captureFields = params.getParams(CAPTURE_ELEMENTS);
@@ -107,13 +111,11 @@ public class SolrContentHandler extends 
    * @see #addLiterals()
    */
   public SolrInputDocument newDocument() {
-    float boost = 1.0f;
-    //handle the metadata extracted from the document
-    addMetadata();
-
-    //handle the literals from the params
+    //handle the literals from the params. NOTE: This MUST be called before the others in order for literals to override other values
     addLiterals();
 
+    //handle the metadata extracted from the document
+    addMetadata();
 
     //add in the content
     addContent();
@@ -134,8 +136,10 @@ public class SolrContentHandler extends 
   protected void addCapturedContent() {
     for (Map.Entry<String, StringBuilder> entry : fieldBuilders.entrySet()) {
       if (entry.getValue().length() > 0) {
-        addField(entry.getKey(), entry.getValue().toString(), null);
-      }
+        String fieldName = entry.getKey();
+        if (literalsOverride && literalFieldNames.contains(fieldName))
+          continue;
+        addField(fieldName, entry.getValue().toString(), null);      }
     }
   }
 
@@ -144,6 +148,8 @@ public class SolrContentHandler extends 
    * and the {@link #catchAllBuilder}
    */
   protected void addContent() {
+    if (literalsOverride && literalFieldNames.contains(contentFieldName))
+      return;
     addField(contentFieldName, catchAllBuilder.toString(), null);
   }
 
@@ -152,12 +158,14 @@ public class SolrContentHandler extends 
    */
   protected void addLiterals() {
     Iterator<String> paramNames = params.getParameterNamesIterator();
+    literalFieldNames = new HashSet<String>();
     while (paramNames.hasNext()) {
       String pname = paramNames.next();
       if (!pname.startsWith(LITERALS_PREFIX)) continue;
 
       String name = pname.substring(LITERALS_PREFIX.length());
       addField(name, null, params.getParams(pname));
+      literalFieldNames.add(name);
     }
   }
 
@@ -166,6 +174,8 @@ public class SolrContentHandler extends 
    */
   protected void addMetadata() {
     for (String name : metadata.names()) {
+      if (literalsOverride && literalFieldNames.contains(name))
+        continue;
       String[] vals = metadata.getValues(name);
       addField(name, null, vals);
     }

Modified: lucene/dev/trunk/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java?rev=1354455&r1=1354454&r2=1354455&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java (original)
+++ lucene/dev/trunk/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java Wed Jun 27 12:05:55 2012
@@ -444,7 +444,71 @@ public class ExtractingRequestHandlerTes
     }
     catch(Exception expected){}
   }
-  
+
+  public void testLiteralsOverride() throws Exception {
+    ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
+    assertTrue("handler is null and it shouldn't be", handler != null);
+ 
+    assertQ(req("*:*"), "//*[@numFound='0']");
+
+    // Here Tika should parse out a title for this document:
+    loadLocal("extraction/solr-word.pdf", 
+            "fmap.created", "extractedDate", 
+            "fmap.producer", "extractedProducer",
+            "fmap.creator", "extractedCreator", 
+            "fmap.Keywords", "extractedKeywords",
+            "fmap.Author", "extractedAuthor",
+            "literal.id", "three",
+            "fmap.content", "extractedContent",
+            "fmap.language", "extractedLanguage",
+            "fmap.Creation-Date", "extractedDate",
+            "fmap.AAPL:Keywords", "ignored_a",
+            "fmap.xmpTPg:NPages", "ignored_a",
+            "fmap.Last-Modified", "extractedDate");
+
+    // Here the literal value should override the Tika-parsed title:
+    loadLocal("extraction/solr-word.pdf",
+            "literal.title", "wolf-man",
+            "fmap.created", "extractedDate",
+            "fmap.producer", "extractedProducer",
+            "fmap.creator", "extractedCreator",
+            "fmap.Keywords", "extractedKeywords",
+            "fmap.Author", "extractedAuthor",
+            "literal.id", "four",
+            "fmap.content", "extractedContent",
+            "fmap.language", "extractedLanguage",
+            "fmap.Creation-Date", "extractedDate",
+            "fmap.AAPL:Keywords", "ignored_a",
+            "fmap.xmpTPg:NPages", "ignored_a",
+            "fmap.Last-Modified", "extractedDate");
+
+    // Here we mimic the old behaviour where literals are added, not overridden
+    loadLocal("extraction/solr-word.pdf",
+            "literalsOverride", "false",
+            // Trick - we first map the metadata-title to an ignored field before we replace with literal title
+            "fmap.title", "ignored_a",
+            "literal.title", "old-behaviour",
+            "literal.extractedKeywords", "literalkeyword",
+            "fmap.created", "extractedDate",
+            "fmap.producer", "extractedProducer",
+            "fmap.creator", "extractedCreator",
+            "fmap.Keywords", "extractedKeywords",
+            "fmap.Author", "extractedAuthor",
+            "literal.id", "five",
+            "fmap.content", "extractedContent",
+            "fmap.language", "extractedLanguage",
+            "fmap.Creation-Date", "extractedDate",
+            "fmap.AAPL:Keywords", "ignored_a",
+            "fmap.xmpTPg:NPages", "ignored_a",
+            "fmap.Last-Modified", "extractedDate");
+
+    assertU(commit());
+
+    assertQ(req("title:solr-word"), "//*[@numFound='1']");
+    assertQ(req("title:wolf-man"), "//*[@numFound='1']");
+    assertQ(req("extractedKeywords:(solr AND word AND pdf AND literalkeyword)"), "//*[@numFound='1']");
+  }
+
   SolrQueryResponse loadLocal(String filename, String... args) throws Exception {
     LocalSolrQueryRequest req = (LocalSolrQueryRequest) req(args);
     try {