You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sh...@apache.org on 2015/03/09 01:19:30 UTC
svn commit: r1665100 - in /lucene/dev/branches/branch_5x: ./ solr/
solr/contrib/
solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/
solr/contrib/dataimporthandler-extras/src/test-files/dihextras/
solr/contrib/dataimporthandler-extras/src/test/org/apache/solr/handler/dataimport/
Author: shalin
Date: Mon Mar 9 00:19:29 2015
New Revision: 1665100
URL: http://svn.apache.org/r1665100
Log:
SOLR-7189: Allow DIH to extract content from embedded documents via Tika
Added:
lucene/dev/branches/branch_5x/solr/contrib/dataimporthandler-extras/src/test-files/dihextras/test_recursive_embedded.docx
- copied unchanged from r1665099, lucene/dev/trunk/solr/contrib/dataimporthandler-extras/src/test-files/dihextras/test_recursive_embedded.docx
Modified:
lucene/dev/branches/branch_5x/ (props changed)
lucene/dev/branches/branch_5x/solr/ (props changed)
lucene/dev/branches/branch_5x/solr/CHANGES.txt (contents, props changed)
lucene/dev/branches/branch_5x/solr/contrib/ (props changed)
lucene/dev/branches/branch_5x/solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/TikaEntityProcessor.java
lucene/dev/branches/branch_5x/solr/contrib/dataimporthandler-extras/src/test/org/apache/solr/handler/dataimport/TestTikaEntityProcessor.java
Modified: lucene/dev/branches/branch_5x/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/solr/CHANGES.txt?rev=1665100&r1=1665099&r2=1665100&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/solr/CHANGES.txt (original)
+++ lucene/dev/branches/branch_5x/solr/CHANGES.txt Mon Mar 9 00:19:29 2015
@@ -90,6 +90,9 @@ New Features
* SOLR-6359: Allow number of logs and records kept by UpdateLog to be configured
(Ramkumar Aiyengar)
+* SOLR-7189: Allow DIH to extract content from embedded documents via Tika.
+ (Tim Allison via shalin)
+
Bug Fixes
----------------------
Modified: lucene/dev/branches/branch_5x/solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/TikaEntityProcessor.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/TikaEntityProcessor.java?rev=1665100&r1=1665099&r2=1665100&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/TikaEntityProcessor.java (original)
+++ lucene/dev/branches/branch_5x/solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/TikaEntityProcessor.java Mon Mar 9 00:19:29 2015
@@ -63,6 +63,7 @@ public class TikaEntityProcessor extends
private static final Logger LOG = LoggerFactory.getLogger(TikaEntityProcessor.class);
private String format = "text";
private boolean done = false;
+ private boolean extractEmbedded = false;
private String parser;
static final String AUTO_PARSER = "org.apache.tika.parser.AutoDetectParser";
private String htmlMapper;
@@ -92,6 +93,10 @@ public class TikaEntityProcessor extends
wrapAndThrow (SEVERE, e,"Unable to load Tika Config");
}
+ String extractEmbeddedString = context.getResolvedEntityAttribute("extractEmbedded");
+ if ("true".equals(extractEmbeddedString)) {
+ extractEmbedded = true;
+ }
format = context.getResolvedEntityAttribute("format");
if(format == null)
format = "text";
@@ -143,6 +148,9 @@ public class TikaEntityProcessor extends
if ("identity".equals(htmlMapper)){
context.set(HtmlMapper.class, IdentityHtmlMapper.INSTANCE);
}
+ if (extractEmbedded) {
+ context.set(Parser.class, tikaParser);
+ }
tikaParser.parse(is, contentHandler, metadata , context);
} catch (Exception e) {
if(SKIP.equals(onError)) {
Modified: lucene/dev/branches/branch_5x/solr/contrib/dataimporthandler-extras/src/test/org/apache/solr/handler/dataimport/TestTikaEntityProcessor.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/solr/contrib/dataimporthandler-extras/src/test/org/apache/solr/handler/dataimport/TestTikaEntityProcessor.java?rev=1665100&r1=1665099&r2=1665100&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/solr/contrib/dataimporthandler-extras/src/test/org/apache/solr/handler/dataimport/TestTikaEntityProcessor.java (original)
+++ lucene/dev/branches/branch_5x/solr/contrib/dataimporthandler-extras/src/test/org/apache/solr/handler/dataimport/TestTikaEntityProcessor.java Mon Mar 9 00:19:29 2015
@@ -18,18 +18,7 @@ package org.apache.solr.handler.dataimpo
import org.junit.BeforeClass;
import org.junit.Test;
-import org.w3c.dom.Document;
-import org.w3c.dom.Element;
-import org.xml.sax.InputSource;
-
-import javax.xml.parsers.DocumentBuilder;
-import javax.xml.parsers.DocumentBuilderFactory;
-import javax.xml.transform.Transformer;
-import javax.xml.transform.TransformerFactory;
-import javax.xml.transform.dom.DOMSource;
-import javax.xml.transform.stream.StreamResult;
-import java.io.StringReader;
-import java.io.StringWriter;
+
import java.util.Locale;
/**Testcase for TikaEntityProcessor
@@ -85,6 +74,16 @@ public class TestTikaEntityProcessor ext
, "//str[@name='text'][contains(.,'class=\"classAttribute\"')]" //attributes are lower-cased
};
+ private String[] testsEmbedded = {
+ "//*[@numFound='1']",
+ "//str[@name='text'][contains(.,'When in the Course')]"
+ };
+
+ private String[] testsIgnoreEmbedded = {
+ "//*[@numFound='1']",
+ "//str[@name='text'][not(contains(.,'When in the Course'))]"
+ };
+
@BeforeClass
public static void beforeClass() throws Exception {
assumeFalse("This test fails on UNIX with Turkish default locale (https://issues.apache.org/jira/browse/SOLR-6387)",
@@ -136,4 +135,39 @@ public class TestTikaEntityProcessor ext
"</dataConfig>";
}
+
+ @Test
+ public void testEmbeddedDocsLegacy() throws Exception {
+ //test legacy behavior: ignore embedded docs
+ runFullImport(conf);
+ assertQ(req("*:*"), testsIgnoreEmbedded);
+ }
+
+ @Test
+ public void testEmbeddedDocsTrue() throws Exception {
+ runFullImport(getConfigEmbedded(true));
+ assertQ(req("*:*"), testsEmbedded);
+ }
+
+ @Test
+ public void testEmbeddedDocsFalse() throws Exception {
+ runFullImport(getConfigEmbedded(false));
+ assertQ(req("*:*"), testsIgnoreEmbedded);
+ }
+
+ private String getConfigEmbedded(boolean extractEmbedded) {
+ return
+ "<dataConfig>" +
+ " <dataSource type=\"BinFileDataSource\"/>" +
+ " <document>" +
+ " <entity name=\"Tika\" processor=\"TikaEntityProcessor\" url=\"" +
+ getFile("dihextras/test_recursive_embedded.docx").getAbsolutePath() + "\" " +
+ " extractEmbedded=\""+extractEmbedded+"\">" +
+ " <field column=\"Author\" meta=\"true\" name=\"author\"/>" +
+ " <field column=\"title\" meta=\"true\" name=\"title\"/>" +
+ " <field column=\"text\"/>" +
+ " </entity>" +
+ " </document>" +
+ "</dataConfig>";
+ }
}