You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2009/12/16 02:45:18 UTC
svn commit: r891100 - in
/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html:
DefaultHtmlMapper.java HtmlParser.java
Author: jukka
Date: Wed Dec 16 01:45:18 2009
New Revision: 891100
URL: http://svn.apache.org/viewvc?rev=891100&view=rev
Log:
TIKA-347: Make HtmlParser customizable through ParseContext
Extract the default HTML mapping rules to a top-level class for easier customization.
Added:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
Added: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java?rev=891100&view=auto
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java (added)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java Wed Dec 16 01:45:18 2009
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html;
+
+/**
+ * The default HTML mapping rules in Tika: listed upper-case HTML element names
+ * map to lower-case XHTML names, and STYLE/SCRIPT are reported as discardable.
+ * @since Apache Tika 0.6
+ */
+public class DefaultHtmlMapper implements HtmlMapper {
+ // Returns the lower-case XHTML name for a listed upper-case HTML name, else null.
+ public String mapSafeElement(String name) {
+ // Based on http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd
+ // Matching is exact (case-sensitive) against upper-case names. Headings:
+ if ("H1".equals(name)) return "h1";
+ if ("H2".equals(name)) return "h2";
+ if ("H3".equals(name)) return "h3";
+ if ("H4".equals(name)) return "h4";
+ if ("H5".equals(name)) return "h5";
+ if ("H6".equals(name)) return "h6";
+ // Paragraph-level text blocks:
+ if ("P".equals(name)) return "p";
+ if ("PRE".equals(name)) return "pre";
+ if ("BLOCKQUOTE".equals(name)) return "blockquote";
+ // Lists; note that MENU is mapped to "ul" rather than to a "menu" element:
+ if ("UL".equals(name)) return "ul";
+ if ("OL".equals(name)) return "ol";
+ if ("MENU".equals(name)) return "ul";
+ if ("LI".equals(name)) return "li";
+ if ("DL".equals(name)) return "dl";
+ if ("DT".equals(name)) return "dt";
+ if ("DD".equals(name)) return "dd";
+ // Tables:
+ if ("TABLE".equals(name)) return "table";
+ if ("THEAD".equals(name)) return "thead";
+ if ("TBODY".equals(name)) return "tbody";
+ if ("TR".equals(name)) return "tr";
+ if ("TH".equals(name)) return "th";
+ if ("TD".equals(name)) return "td";
+ // Address block:
+ if ("ADDRESS".equals(name)) return "address";
+ // Any name not listed above (including a null name) has no mapping.
+ return null;
+ }
+ // Returns true only for the exact upper-case names STYLE and SCRIPT.
+ public boolean isDiscardElement(String name) {
+ return "STYLE".equals(name) || "SCRIPT".equals(name);
+ }
+
+}
Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=891100&r1=891099&r2=891100&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java Wed Dec 16 01:45:18 2009
@@ -43,6 +43,11 @@
*/
public class HtmlParser implements Parser {
+ /**
+ * The default HTML mapping.
+ */
+ private static final HtmlMapper mapper = new DefaultHtmlMapper();
+
// Use the widest, most common charset as our default.
private static final String DEFAULT_CHARSET = "windows-1252";
private static final int META_TAG_BUFFER_SIZE = 4096;
@@ -190,43 +195,15 @@
* <p>
* Subclasses can override this method to customize the default mapping.
*
+ * @deprecated Use the {@link HtmlMapper} mechanism to customize
+ * the HTML mapping. This method will be removed in Tika 1.0.
* @since Apache Tika 0.5
* @param name HTML element name (upper case)
* @return XHTML element name (lower case), or
* <code>null</code> if the element is unsafe
*/
protected String mapSafeElement(String name) {
- // Based on http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd
-
- if ("H1".equals(name)) return "h1";
- if ("H2".equals(name)) return "h2";
- if ("H3".equals(name)) return "h3";
- if ("H4".equals(name)) return "h4";
- if ("H5".equals(name)) return "h5";
- if ("H6".equals(name)) return "h6";
-
- if ("P".equals(name)) return "p";
- if ("PRE".equals(name)) return "pre";
- if ("BLOCKQUOTE".equals(name)) return "blockquote";
-
- if ("UL".equals(name)) return "ul";
- if ("OL".equals(name)) return "ol";
- if ("MENU".equals(name)) return "ul";
- if ("LI".equals(name)) return "li";
- if ("DL".equals(name)) return "dl";
- if ("DT".equals(name)) return "dt";
- if ("DD".equals(name)) return "dd";
-
- if ("TABLE".equals(name)) return "table";
- if ("THEAD".equals(name)) return "thead";
- if ("TBODY".equals(name)) return "tbody";
- if ("TR".equals(name)) return "tr";
- if ("TH".equals(name)) return "th";
- if ("TD".equals(name)) return "td";
-
- if ("ADDRESS".equals(name)) return "address";
-
- return null;
+ return mapper.mapSafeElement(name);
}
/**
@@ -234,6 +211,8 @@
* discarded instead of including it in the parse output. Subclasses
* can override this method to customize the set of discarded elements.
*
+ * @deprecated Use the {@link HtmlMapper} mechanism to customize
+ * the HTML mapping. This method will be removed in Tika 1.0.
* @since Apache Tika 0.5
* @param name HTML element name (upper case)
* @return <code>true</code> if content inside the named element
@@ -248,8 +227,9 @@
* protected HtmlParser methods. Making HtmlParser implement HtmlMapper
* directly would require those methods to be public, which would break
* backwards compatibility with subclasses.
- * <p>
- * TODO: Cleanup in Tika 1.0
+ *
+ * @deprecated Use the {@link HtmlMapper} mechanism to customize
+ * the HTML mapping. This class will be removed in Tika 1.0.
*/
private class HtmlParserMapper implements HtmlMapper {
public String mapSafeElement(String name) {