You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sh...@apache.org on 2013/04/09 06:41:34 UTC
svn commit: r1465880 - in /lucene/dev/branches/branch_4x: ./ solr/
solr/contrib/
solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/
solr/contrib/dataimporthandler-extras/src/test-files/dihextras/
solr/contrib/dataimporth...
Author: shalin
Date: Tue Apr 9 04:41:34 2013
New Revision: 1465880
URL: http://svn.apache.org/r1465880
Log:
SOLR-4530: DIH: Provide configuration to use Tika's IdentityHtmlMapper
Added:
lucene/dev/branches/branch_4x/solr/contrib/dataimporthandler-extras/src/test-files/dihextras/structured.html
- copied unchanged from r1465879, lucene/dev/trunk/solr/contrib/dataimporthandler-extras/src/test-files/dihextras/structured.html
Modified:
lucene/dev/branches/branch_4x/ (props changed)
lucene/dev/branches/branch_4x/solr/ (props changed)
lucene/dev/branches/branch_4x/solr/CHANGES.txt (contents, props changed)
lucene/dev/branches/branch_4x/solr/contrib/ (props changed)
lucene/dev/branches/branch_4x/solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/TikaEntityProcessor.java
lucene/dev/branches/branch_4x/solr/contrib/dataimporthandler-extras/src/test/org/apache/solr/handler/dataimport/TestTikaEntityProcessor.java
Modified: lucene/dev/branches/branch_4x/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/solr/CHANGES.txt?rev=1465880&r1=1465879&r2=1465880&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/solr/CHANGES.txt (original)
+++ lucene/dev/branches/branch_4x/solr/CHANGES.txt Tue Apr 9 04:41:34 2013
@@ -114,6 +114,9 @@ New Features
* SOLR-3755: A new collections api to add additional shards dynamically by splitting
existing shards. (yonik, Anshum Gupta, shalin)
+* SOLR-4530: DIH: Provide configuration to use Tika's IdentityHtmlMapper
+ (Alexandre Rafalovitch via shalin)
+
Bug Fixes
----------------------
Modified: lucene/dev/branches/branch_4x/solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/TikaEntityProcessor.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/TikaEntityProcessor.java?rev=1465880&r1=1465879&r2=1465880&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/TikaEntityProcessor.java (original)
+++ lucene/dev/branches/branch_4x/solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/TikaEntityProcessor.java Tue Apr 9 04:41:34 2013
@@ -22,6 +22,8 @@ import org.apache.tika.metadata.Metadata
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.html.HtmlMapper;
+import org.apache.tika.parser.html.IdentityHtmlMapper;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.ContentHandlerDecorator;
import org.apache.tika.sax.XHTMLContentHandler;
@@ -63,7 +65,7 @@ public class TikaEntityProcessor extends
private boolean done = false;
private String parser;
static final String AUTO_PARSER = "org.apache.tika.parser.AutoDetectParser";
-
+ private String htmlMapper;
@Override
protected void firstInit(Context context) {
@@ -88,6 +90,13 @@ public class TikaEntityProcessor extends
format = "text";
if (!"html".equals(format) && !"xml".equals(format) && !"text".equals(format)&& !"none".equals(format) )
throw new DataImportHandlerException(SEVERE, "'format' can be one of text|html|xml|none");
+
+ htmlMapper = context.getResolvedEntityAttribute("htmlMapper");
+ if (htmlMapper == null)
+ htmlMapper = "default";
+ if (!"default".equals(htmlMapper) && !"identity".equals(htmlMapper))
+ throw new DataImportHandlerException(SEVERE, "'htmlMapper', if present, must be 'default' or 'identity'");
+
parser = context.getResolvedEntityAttribute("parser");
if(parser == null) {
parser = AUTO_PARSER;
@@ -124,7 +133,11 @@ public class TikaEntityProcessor extends
tikaParser = context.getSolrCore().getResourceLoader().newInstance(parser, Parser.class);
}
try {
- tikaParser.parse(is, contentHandler, metadata , new ParseContext());
+ ParseContext context = new ParseContext();
+ if ("identity".equals(htmlMapper)){
+ context.set(HtmlMapper.class, IdentityHtmlMapper.INSTANCE);
+ }
+ tikaParser.parse(is, contentHandler, metadata , context);
} catch (Exception e) {
wrapAndThrow(SEVERE, e, "Unable to read content");
}
Modified: lucene/dev/branches/branch_4x/solr/contrib/dataimporthandler-extras/src/test/org/apache/solr/handler/dataimport/TestTikaEntityProcessor.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/solr/contrib/dataimporthandler-extras/src/test/org/apache/solr/handler/dataimport/TestTikaEntityProcessor.java?rev=1465880&r1=1465879&r2=1465880&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/solr/contrib/dataimporthandler-extras/src/test/org/apache/solr/handler/dataimport/TestTikaEntityProcessor.java (original)
+++ lucene/dev/branches/branch_4x/solr/contrib/dataimporthandler-extras/src/test/org/apache/solr/handler/dataimport/TestTikaEntityProcessor.java Tue Apr 9 04:41:34 2013
@@ -55,6 +55,21 @@ public class TestTikaEntityProcessor ext
,"//str[@name='text']"
};
+ private String[] testsHTMLDefault = {
+ "//*[@numFound='1']"
+ , "//str[@name='text'][contains(.,'Basic div')]"
+ , "//str[@name='text'][contains(.,'<h1>')]"
+ , "//str[@name='text'][not(contains(.,'<div>'))]" //default mapper lower-cases elements as it maps
+ , "//str[@name='text'][not(contains(.,'<DIV>'))]"
+ };
+
+ private String[] testsHTMLIdentity = {
+ "//*[@numFound='1']"
+ , "//str[@name='text'][contains(.,'Basic div')]"
+ , "//str[@name='text'][contains(.,'<h1>')]"
+ , "//str[@name='text'][contains(.,'<div>')]"
+ , "//str[@name='text'][contains(.,'class=\"classAttribute\"')]" //attributes are lower-cased
+ };
@BeforeClass
public static void beforeClass() throws Exception {
@@ -67,4 +82,36 @@ public class TestTikaEntityProcessor ext
assertQ(req("*:*"), tests );
}
+ @Test
+ public void testTikaHTMLMapperEmpty() throws Exception {
+ runFullImport(getConfigHTML(null));
+ assertQ(req("*:*"), testsHTMLDefault);
+ }
+
+ @Test
+ public void testTikaHTMLMapperDefault() throws Exception {
+ runFullImport(getConfigHTML("default"));
+ assertQ(req("*:*"), testsHTMLDefault);
+ }
+
+ @Test
+ public void testTikaHTMLMapperIdentity() throws Exception {
+ runFullImport(getConfigHTML("identity"));
+ assertQ(req("*:*"), testsHTMLIdentity);
+ }
+
+ private String getConfigHTML(String htmlMapper) {
+ return
+ "<dataConfig>" +
+ " <dataSource type='BinFileDataSource'/>" +
+ " <document>" +
+ " <entity name='Tika' format='xml' processor='TikaEntityProcessor' " +
+ " url='" + getFile("dihextras/structured.html").getAbsolutePath() + "' " +
+ ((htmlMapper == null) ? "" : (" htmlMapper='" + htmlMapper + "'")) + ">" +
+ " <field column='text'/>" +
+ " </entity>" +
+ " </document>" +
+ "</dataConfig>";
+
+ }
}