You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sh...@apache.org on 2015/03/09 01:16:25 UTC

svn commit: r1665099 - in /lucene/dev/trunk/solr: ./ contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/ contrib/dataimporthandler-extras/src/test-files/dihextras/ contrib/dataimporthandler-extras/src/test/org/apache/solr/handler/dataimport/

Author: shalin
Date: Mon Mar  9 00:16:25 2015
New Revision: 1665099

URL: http://svn.apache.org/r1665099
Log:
SOLR-7189: Allow DIH to extract content from embedded documents via Tika

Added:
    lucene/dev/trunk/solr/contrib/dataimporthandler-extras/src/test-files/dihextras/test_recursive_embedded.docx   (with props)
Modified:
    lucene/dev/trunk/solr/CHANGES.txt
    lucene/dev/trunk/solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/TikaEntityProcessor.java
    lucene/dev/trunk/solr/contrib/dataimporthandler-extras/src/test/org/apache/solr/handler/dataimport/TestTikaEntityProcessor.java

Modified: lucene/dev/trunk/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/CHANGES.txt?rev=1665099&r1=1665098&r2=1665099&view=diff
==============================================================================
--- lucene/dev/trunk/solr/CHANGES.txt (original)
+++ lucene/dev/trunk/solr/CHANGES.txt Mon Mar  9 00:16:25 2015
@@ -144,6 +144,9 @@ New Features
 * SOLR-6359: Allow number of logs and records kept by UpdateLog to be configured
   (Ramkumar Aiyengar)
 
+* SOLR-7189: Allow DIH to extract content from embedded documents via Tika.
+  (Tim Allison via shalin)
+
 Bug Fixes
 ----------------------
 

Modified: lucene/dev/trunk/solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/TikaEntityProcessor.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/TikaEntityProcessor.java?rev=1665099&r1=1665098&r2=1665099&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/TikaEntityProcessor.java (original)
+++ lucene/dev/trunk/solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/TikaEntityProcessor.java Mon Mar  9 00:16:25 2015
@@ -63,6 +63,7 @@ public class TikaEntityProcessor extends
   private static final Logger LOG = LoggerFactory.getLogger(TikaEntityProcessor.class);
   private String format = "text";
   private boolean done = false;
+  private boolean extractEmbedded = false;
   private String parser;
   static final String AUTO_PARSER = "org.apache.tika.parser.AutoDetectParser";
   private String htmlMapper;
@@ -92,6 +93,10 @@ public class TikaEntityProcessor extends
       wrapAndThrow (SEVERE, e,"Unable to load Tika Config");
     }
 
+    String extractEmbeddedString = context.getResolvedEntityAttribute("extractEmbedded");
+    if ("true".equals(extractEmbeddedString)) {
+      extractEmbedded = true;
+    }
     format = context.getResolvedEntityAttribute("format");
     if(format == null)
       format = "text";
@@ -143,6 +148,9 @@ public class TikaEntityProcessor extends
         if ("identity".equals(htmlMapper)){
           context.set(HtmlMapper.class, IdentityHtmlMapper.INSTANCE);
         }
+        if (extractEmbedded) {
+          context.set(Parser.class, tikaParser);
+        }
         tikaParser.parse(is, contentHandler, metadata , context);
     } catch (Exception e) {
       if(SKIP.equals(onError)) {

Added: lucene/dev/trunk/solr/contrib/dataimporthandler-extras/src/test-files/dihextras/test_recursive_embedded.docx
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/dataimporthandler-extras/src/test-files/dihextras/test_recursive_embedded.docx?rev=1665099&view=auto
==============================================================================
Binary file - no diff available.

Modified: lucene/dev/trunk/solr/contrib/dataimporthandler-extras/src/test/org/apache/solr/handler/dataimport/TestTikaEntityProcessor.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/dataimporthandler-extras/src/test/org/apache/solr/handler/dataimport/TestTikaEntityProcessor.java?rev=1665099&r1=1665098&r2=1665099&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/dataimporthandler-extras/src/test/org/apache/solr/handler/dataimport/TestTikaEntityProcessor.java (original)
+++ lucene/dev/trunk/solr/contrib/dataimporthandler-extras/src/test/org/apache/solr/handler/dataimport/TestTikaEntityProcessor.java Mon Mar  9 00:16:25 2015
@@ -18,18 +18,7 @@ package org.apache.solr.handler.dataimpo
 
 import org.junit.BeforeClass;
 import org.junit.Test;
-import org.w3c.dom.Document;
-import org.w3c.dom.Element;
-import org.xml.sax.InputSource;
-
-import javax.xml.parsers.DocumentBuilder;
-import javax.xml.parsers.DocumentBuilderFactory;
-import javax.xml.transform.Transformer;
-import javax.xml.transform.TransformerFactory;
-import javax.xml.transform.dom.DOMSource;
-import javax.xml.transform.stream.StreamResult;
-import java.io.StringReader;
-import java.io.StringWriter;
+
 import java.util.Locale;
 
 /**Testcase for TikaEntityProcessor
@@ -85,6 +74,16 @@ public class TestTikaEntityProcessor ext
       , "//str[@name='text'][contains(.,'class=\"classAttribute\"')]" //attributes are lower-cased
   };
 
+  private String[] testsEmbedded = {
+      "//*[@numFound='1']",
+      "//str[@name='text'][contains(.,'When in the Course')]"
+  };
+
+  private String[] testsIgnoreEmbedded = {
+      "//*[@numFound='1']",
+      "//str[@name='text'][not(contains(.,'When in the Course'))]"
+  };
+
   @BeforeClass
   public static void beforeClass() throws Exception {
     assumeFalse("This test fails on UNIX with Turkish default locale (https://issues.apache.org/jira/browse/SOLR-6387)",
@@ -136,4 +135,39 @@ public class TestTikaEntityProcessor ext
             "</dataConfig>";
 
   }
+
+  @Test
+  public void testEmbeddedDocsLegacy() throws Exception {
+    //test legacy behavior: ignore embedded docs
+    runFullImport(conf);
+    assertQ(req("*:*"), testsIgnoreEmbedded);
+  }
+
+  @Test
+  public void testEmbeddedDocsTrue() throws Exception {
+    runFullImport(getConfigEmbedded(true));
+    assertQ(req("*:*"), testsEmbedded);
+  }
+
+  @Test
+  public void testEmbeddedDocsFalse() throws Exception {
+    runFullImport(getConfigEmbedded(false));
+    assertQ(req("*:*"), testsIgnoreEmbedded);
+  }
+
+  private String getConfigEmbedded(boolean extractEmbedded) {
+    return
+        "<dataConfig>" +
+            "  <dataSource type=\"BinFileDataSource\"/>" +
+            "  <document>" +
+            "    <entity name=\"Tika\" processor=\"TikaEntityProcessor\" url=\"" +
+                    getFile("dihextras/test_recursive_embedded.docx").getAbsolutePath() + "\" " +
+            "       extractEmbedded=\""+extractEmbedded+"\">" +
+            "      <field column=\"Author\" meta=\"true\" name=\"author\"/>" +
+            "      <field column=\"title\" meta=\"true\" name=\"title\"/>" +
+            "      <field column=\"text\"/>" +
+            "     </entity>" +
+            "  </document>" +
+            "</dataConfig>";
+  }
 }