You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by sh...@apache.org on 2013/04/09 06:41:34 UTC

svn commit: r1465880 - in /lucene/dev/branches/branch_4x: ./ solr/ solr/contrib/ solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/ solr/contrib/dataimporthandler-extras/src/test-files/dihextras/ solr/contrib/dataimporthandler-extras/src/test/org/apache/solr/handler/dataimport/

Author: shalin
Date: Tue Apr  9 04:41:34 2013
New Revision: 1465880

URL: http://svn.apache.org/r1465880
Log:
SOLR-4530: DIH: Provide configuration to use Tika's IdentityHtmlMapper

Added:
    lucene/dev/branches/branch_4x/solr/contrib/dataimporthandler-extras/src/test-files/dihextras/structured.html
      - copied unchanged from r1465879, lucene/dev/trunk/solr/contrib/dataimporthandler-extras/src/test-files/dihextras/structured.html
Modified:
    lucene/dev/branches/branch_4x/   (props changed)
    lucene/dev/branches/branch_4x/solr/   (props changed)
    lucene/dev/branches/branch_4x/solr/CHANGES.txt   (contents, props changed)
    lucene/dev/branches/branch_4x/solr/contrib/   (props changed)
    lucene/dev/branches/branch_4x/solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/TikaEntityProcessor.java
    lucene/dev/branches/branch_4x/solr/contrib/dataimporthandler-extras/src/test/org/apache/solr/handler/dataimport/TestTikaEntityProcessor.java

Modified: lucene/dev/branches/branch_4x/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/solr/CHANGES.txt?rev=1465880&r1=1465879&r2=1465880&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/solr/CHANGES.txt (original)
+++ lucene/dev/branches/branch_4x/solr/CHANGES.txt Tue Apr  9 04:41:34 2013
@@ -114,6 +114,9 @@ New Features
 * SOLR-3755: A new collections api to add additional shards dynamically by splitting
   existing shards. (yonik, Anshum Gupta, shalin)
 
+* SOLR-4530: DIH: Provide configuration to use Tika's IdentityHtmlMapper
+  (Alexandre Rafalovitch via shalin)
+
 Bug Fixes
 ----------------------
 

Modified: lucene/dev/branches/branch_4x/solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/TikaEntityProcessor.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/TikaEntityProcessor.java?rev=1465880&r1=1465879&r2=1465880&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/TikaEntityProcessor.java (original)
+++ lucene/dev/branches/branch_4x/solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/TikaEntityProcessor.java Tue Apr  9 04:41:34 2013
@@ -22,6 +22,8 @@ import org.apache.tika.metadata.Metadata
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.html.HtmlMapper;
+import org.apache.tika.parser.html.IdentityHtmlMapper;
 import org.apache.tika.sax.BodyContentHandler;
 import org.apache.tika.sax.ContentHandlerDecorator;
 import org.apache.tika.sax.XHTMLContentHandler;
@@ -63,7 +65,7 @@ public class TikaEntityProcessor extends
   private boolean done = false;
   private String parser;
   static final String AUTO_PARSER = "org.apache.tika.parser.AutoDetectParser";
-
+  private String htmlMapper;
 
   @Override
   protected void firstInit(Context context) {
@@ -88,6 +90,13 @@ public class TikaEntityProcessor extends
       format = "text";
     if (!"html".equals(format) && !"xml".equals(format) && !"text".equals(format)&& !"none".equals(format) )
       throw new DataImportHandlerException(SEVERE, "'format' can be one of text|html|xml|none");
+
+    htmlMapper = context.getResolvedEntityAttribute("htmlMapper");
+    if (htmlMapper == null)
+      htmlMapper = "default";
+    if (!"default".equals(htmlMapper) && !"identity".equals(htmlMapper))
+      throw new DataImportHandlerException(SEVERE, "'htmlMapper', if present, must be 'default' or 'identity'");
+
     parser = context.getResolvedEntityAttribute("parser");
     if(parser == null) {
       parser = AUTO_PARSER;
@@ -124,7 +133,11 @@ public class TikaEntityProcessor extends
       tikaParser = context.getSolrCore().getResourceLoader().newInstance(parser, Parser.class);
     }
     try {
-      tikaParser.parse(is, contentHandler, metadata , new ParseContext());
+        ParseContext context = new ParseContext();
+        if ("identity".equals(htmlMapper)){
+          context.set(HtmlMapper.class, IdentityHtmlMapper.INSTANCE);
+        }
+        tikaParser.parse(is, contentHandler, metadata , context);
     } catch (Exception e) {
       wrapAndThrow(SEVERE, e, "Unable to read content");
     }

Modified: lucene/dev/branches/branch_4x/solr/contrib/dataimporthandler-extras/src/test/org/apache/solr/handler/dataimport/TestTikaEntityProcessor.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/solr/contrib/dataimporthandler-extras/src/test/org/apache/solr/handler/dataimport/TestTikaEntityProcessor.java?rev=1465880&r1=1465879&r2=1465880&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/solr/contrib/dataimporthandler-extras/src/test/org/apache/solr/handler/dataimport/TestTikaEntityProcessor.java (original)
+++ lucene/dev/branches/branch_4x/solr/contrib/dataimporthandler-extras/src/test/org/apache/solr/handler/dataimport/TestTikaEntityProcessor.java Tue Apr  9 04:41:34 2013
@@ -55,6 +55,21 @@ public class TestTikaEntityProcessor ext
       ,"//str[@name='text']"
   };
 
+  private String[] testsHTMLDefault = {
+      "//*[@numFound='1']"
+      , "//str[@name='text'][contains(.,'Basic div')]"
+      , "//str[@name='text'][contains(.,'<h1>')]"
+      , "//str[@name='text'][not(contains(.,'<div>'))]" //default mapper lower-cases elements as it maps
+      , "//str[@name='text'][not(contains(.,'<DIV>'))]"
+  };
+
+  private String[] testsHTMLIdentity = {
+      "//*[@numFound='1']"
+      , "//str[@name='text'][contains(.,'Basic div')]"
+      , "//str[@name='text'][contains(.,'<h1>')]"
+      , "//str[@name='text'][contains(.,'<div>')]"
+      , "//str[@name='text'][contains(.,'class=\"classAttribute\"')]" //attributes are lower-cased
+  };
 
   @BeforeClass
   public static void beforeClass() throws Exception {
@@ -67,4 +82,36 @@ public class TestTikaEntityProcessor ext
     assertQ(req("*:*"), tests );
   }
 
+  @Test
+  public void testTikaHTMLMapperEmpty() throws Exception {
+    runFullImport(getConfigHTML(null));
+    assertQ(req("*:*"), testsHTMLDefault);
+  }
+
+  @Test
+  public void testTikaHTMLMapperDefault() throws Exception {
+    runFullImport(getConfigHTML("default"));
+    assertQ(req("*:*"), testsHTMLDefault);
+  }
+
+  @Test
+  public void testTikaHTMLMapperIdentity() throws Exception {
+    runFullImport(getConfigHTML("identity"));
+    assertQ(req("*:*"), testsHTMLIdentity);
+  }
+
+  private String getConfigHTML(String htmlMapper) {
+    return
+        "<dataConfig>" +
+            "  <dataSource type='BinFileDataSource'/>" +
+            "  <document>" +
+            "    <entity name='Tika' format='xml' processor='TikaEntityProcessor' " +
+            "       url='" + getFile("dihextras/structured.html").getAbsolutePath() + "' " +
+            ((htmlMapper == null) ? "" : (" htmlMapper='" + htmlMapper + "'")) + ">" +
+            "      <field column='text'/>" +
+            "     </entity>" +
+            "  </document>" +
+            "</dataConfig>";
+
+  }
 }