You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2009/12/16 02:45:18 UTC

svn commit: r891100 - in /lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html: DefaultHtmlMapper.java HtmlParser.java

Author: jukka
Date: Wed Dec 16 01:45:18 2009
New Revision: 891100

URL: http://svn.apache.org/viewvc?rev=891100&view=rev
Log:
TIKA-347: Make HtmlParser customizable through ParseContext

Extract the default HTML mapping rules to a top-level class for easier customization.

Added:
    lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java
Modified:
    lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java

Added: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java?rev=891100&view=auto
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java (added)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java Wed Dec 16 01:45:18 2009
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html;
+
+/**
+ * The default HTML mapping rules in Tika.
+ *
+ * @since Apache Tika 0.6
+ */
+public class DefaultHtmlMapper implements HtmlMapper {
+
+    /** Maps an upper-case HTML element name to a safe XHTML name, or null. */
+    public String mapSafeElement(String name) {
+        // Based on http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd
+        if ("H1".equals(name)) return "h1";
+        if ("H2".equals(name)) return "h2";
+        if ("H3".equals(name)) return "h3";
+        if ("H4".equals(name)) return "h4";
+        if ("H5".equals(name)) return "h5";
+        if ("H6".equals(name)) return "h6";
+
+        if ("P".equals(name)) return "p";
+        if ("PRE".equals(name)) return "pre";
+        if ("BLOCKQUOTE".equals(name)) return "blockquote";
+
+        if ("UL".equals(name)) return "ul";
+        if ("OL".equals(name)) return "ol";
+        if ("MENU".equals(name)) return "ul"; // MENU is rewritten to ul
+        if ("LI".equals(name)) return "li";
+        if ("DL".equals(name)) return "dl";
+        if ("DT".equals(name)) return "dt";
+        if ("DD".equals(name)) return "dd";
+
+        if ("TABLE".equals(name)) return "table";
+        if ("THEAD".equals(name)) return "thead";
+        if ("TBODY".equals(name)) return "tbody";
+        if ("TR".equals(name)) return "tr";
+        if ("TH".equals(name)) return "th";
+        if ("TD".equals(name)) return "td";
+
+        if ("ADDRESS".equals(name)) return "address";
+
+        return null; // no literal matched (also the result for a null name)
+    }
+
+    /** Returns true only for the upper-case element names STYLE and SCRIPT. */
+    public boolean isDiscardElement(String name) {
+        return "STYLE".equals(name) || "SCRIPT".equals(name);
+    }
+}

Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=891100&r1=891099&r2=891100&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java Wed Dec 16 01:45:18 2009
@@ -43,6 +43,11 @@
  */
 public class HtmlParser implements Parser {
 
+    /**
+     * The default HTML mapping.
+     */
+    private static final HtmlMapper mapper = new DefaultHtmlMapper();
+
     // Use the widest, most common charset as our default.
     private static final String DEFAULT_CHARSET = "windows-1252";
     private static final int META_TAG_BUFFER_SIZE = 4096;
@@ -190,43 +195,15 @@
      * <p>
      * Subclasses can override this method to customize the default mapping.
      *
+     * @deprecated Use the {@link HtmlMapper} mechanism to customize
+     *             the HTML mapping. This method will be removed in Tika 1.0.
      * @since Apache Tika 0.5
      * @param name HTML element name (upper case)
      * @return XHTML element name (lower case), or
      *         <code>null</code> if the element is unsafe 
      */
     protected String mapSafeElement(String name) {
-        // Based on http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd
-
-        if ("H1".equals(name)) return "h1";
-        if ("H2".equals(name)) return "h2";
-        if ("H3".equals(name)) return "h3";
-        if ("H4".equals(name)) return "h4";
-        if ("H5".equals(name)) return "h5";
-        if ("H6".equals(name)) return "h6";
-
-        if ("P".equals(name)) return "p";
-        if ("PRE".equals(name)) return "pre";
-        if ("BLOCKQUOTE".equals(name)) return "blockquote";
-
-        if ("UL".equals(name)) return "ul";
-        if ("OL".equals(name)) return "ol";
-        if ("MENU".equals(name)) return "ul";
-        if ("LI".equals(name)) return "li";
-        if ("DL".equals(name)) return "dl";
-        if ("DT".equals(name)) return "dt";
-        if ("DD".equals(name)) return "dd";
-
-        if ("TABLE".equals(name)) return "table";
-        if ("THEAD".equals(name)) return "thead";
-        if ("TBODY".equals(name)) return "tbody";
-        if ("TR".equals(name)) return "tr";
-        if ("TH".equals(name)) return "th";
-        if ("TD".equals(name)) return "td";
-
-        if ("ADDRESS".equals(name)) return "address";
-
-        return null;
+        return mapper.mapSafeElement(name);
     }
 
     /**
@@ -234,6 +211,8 @@
      * discarded instead of including it in the parse output. Subclasses
      * can override this method to customize the set of discarded elements.
      *
+     * @deprecated Use the {@link HtmlMapper} mechanism to customize
+     *             the HTML mapping. This method will be removed in Tika 1.0.
      * @since Apache Tika 0.5
      * @param name HTML element name (upper case)
      * @return <code>true</code> if content inside the named element
@@ -248,8 +227,9 @@
      * protected HtmlParser methods. Making HtmlParser implement HtmlMapper
      * directly would require those methods to be public, which would break
      * backwards compatibility with subclasses.
-     * <p>
-     * TODO: Cleanup in Tika 1.0
+     *
+     * @deprecated Use the {@link HtmlMapper} mechanism to customize
+     *             the HTML mapping. This class will be removed in Tika 1.0.
      */
     private class HtmlParserMapper implements HtmlMapper {
         public String mapSafeElement(String name) {