You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ma...@apache.org on 2010/05/31 01:49:05 UTC
svn commit: r949635 - in /tika/trunk: ./
tika-core/src/main/java/org/apache/tika/sax/
tika-parsers/src/main/java/org/apache/tika/parser/html/
tika-parsers/src/test/java/org/apache/tika/parser/html/
Author: mattmann
Date: Sun May 30 23:49:05 2010
New Revision: 949635
URL: http://svn.apache.org/viewvc?rev=949635&view=rev
Log:
- fix for TIKA-379 Html elements and attributes not available in XHTML representation
Modified:
tika/trunk/CHANGES.txt
tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlMapper.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/IdentityHtmlMapper.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=949635&r1=949634&r2=949635&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Sun May 30 23:49:05 2010
@@ -4,6 +4,8 @@ Release 0.8 - Current Development
The most notable changes in Tika 0.8 over previous releases are:
+ * An approach for plumbing through XHTML attributes was added (TIKA-379)
+
* Media type hierarchy information is now taken into account when
selecting the best parser for a given input document. (TIKA-298)
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java?rev=949635&r1=949634&r2=949635&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java Sun May 30 23:49:05 2010
@@ -188,6 +188,11 @@ public class XHTMLContentHandler extends
startElement(XHTML, name, name, attributes);
}
+ /**
+ * Starts an element with the given name and attributes. Delegates to the
+ * four-argument <code>startElement</code>, passing the <code>XHTML</code>
+ * constant as the first argument and <code>name</code> for both of the
+ * name arguments.
+ *
+ * @param name element name
+ * @param attributes attributes passed through to the delegated call
+ * @throws SAXException if the delegated call throws it
+ */
+ public void startElement(String name, AttributesImpl attributes)
+ throws SAXException {
+ startElement(XHTML, name, name, attributes);
+ }
+
public void endElement(String name) throws SAXException {
endElement(XHTML, name, name);
}
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java?rev=949635&r1=949634&r2=949635&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java Sun May 30 23:49:05 2010
@@ -1,69 +1,75 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.html;
-
-/**
- * The default HTML mapping rules in Tika.
- *
- * @since Apache Tika 0.6
- */
-public class DefaultHtmlMapper implements HtmlMapper {
-
- /**
- * @since Apache Tika 0.8
- */
- public static final HtmlMapper INSTANCE = new DefaultHtmlMapper();
-
- public String mapSafeElement(String name) {
- // Based on http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd
-
- if ("H1".equals(name)) return "h1";
- if ("H2".equals(name)) return "h2";
- if ("H3".equals(name)) return "h3";
- if ("H4".equals(name)) return "h4";
- if ("H5".equals(name)) return "h5";
- if ("H6".equals(name)) return "h6";
-
- if ("P".equals(name)) return "p";
- if ("PRE".equals(name)) return "pre";
- if ("BLOCKQUOTE".equals(name)) return "blockquote";
-
- if ("UL".equals(name)) return "ul";
- if ("OL".equals(name)) return "ol";
- if ("MENU".equals(name)) return "ul";
- if ("LI".equals(name)) return "li";
- if ("DL".equals(name)) return "dl";
- if ("DT".equals(name)) return "dt";
- if ("DD".equals(name)) return "dd";
-
- if ("TABLE".equals(name)) return "table";
- if ("THEAD".equals(name)) return "thead";
- if ("TBODY".equals(name)) return "tbody";
- if ("TR".equals(name)) return "tr";
- if ("TH".equals(name)) return "th";
- if ("TD".equals(name)) return "td";
-
- if ("ADDRESS".equals(name)) return "address";
-
- return null;
- }
-
- public boolean isDiscardElement(String name) {
- return "STYLE".equals(name) || "SCRIPT".equals(name);
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html;
+
+/**
+ * The default HTML mapping rules in Tika.
+ *
+ * @since Apache Tika 0.6
+ */
+public class DefaultHtmlMapper implements HtmlMapper {
+
+ /**
+ * Shared instance of the default mapper.
+ *
+ * @since Apache Tika 0.8
+ */
+ public static final HtmlMapper INSTANCE = new DefaultHtmlMapper();
+
+ /**
+ * Maps an upper-case HTML element name to its lower-case XHTML
+ * equivalent. Only the headings, paragraph-like blocks, lists, tables
+ * and ADDRESS listed below are recognised; MENU is mapped to
+ * <code>ul</code>.
+ *
+ * @param name HTML element name (upper case)
+ * @return XHTML element name, or <code>null</code> if the name is not
+ * one of the recognised elements (including when it is
+ * <code>null</code>)
+ */
+ public String mapSafeElement(String name) {
+ // Based on http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd
+
+ if ("H1".equals(name)) return "h1";
+ if ("H2".equals(name)) return "h2";
+ if ("H3".equals(name)) return "h3";
+ if ("H4".equals(name)) return "h4";
+ if ("H5".equals(name)) return "h5";
+ if ("H6".equals(name)) return "h6";
+
+ if ("P".equals(name)) return "p";
+ if ("PRE".equals(name)) return "pre";
+ if ("BLOCKQUOTE".equals(name)) return "blockquote";
+
+ if ("UL".equals(name)) return "ul";
+ if ("OL".equals(name)) return "ol";
+ if ("MENU".equals(name)) return "ul";
+ if ("LI".equals(name)) return "li";
+ if ("DL".equals(name)) return "dl";
+ if ("DT".equals(name)) return "dt";
+ if ("DD".equals(name)) return "dd";
+
+ if ("TABLE".equals(name)) return "table";
+ if ("THEAD".equals(name)) return "thead";
+ if ("TBODY".equals(name)) return "tbody";
+ if ("TR".equals(name)) return "tr";
+ if ("TH".equals(name)) return "th";
+ if ("TD".equals(name)) return "td";
+
+ if ("ADDRESS".equals(name)) return "address";
+
+ return null;
+ }
+
+ /**
+ * Maps an attribute name to its safe equivalent. This implementation
+ * returns <code>null</code> for every element and attribute, i.e. it
+ * treats no attribute as safe. Assumes that the element name
+ * is valid and normalised.
+ *
+ * @param elementName normalised element name (not inspected here)
+ * @param attributeName attribute name (not inspected here)
+ * @return always <code>null</code>
+ */
+ public String mapSafeAttribute(String elementName, String attributeName) {
+ return null;
+ }
+
+ /**
+ * Reports whether the content of the named element should be discarded.
+ *
+ * @param name HTML element name (upper case)
+ * @return <code>true</code> only for <code>STYLE</code> and
+ * <code>SCRIPT</code>
+ */
+ public boolean isDiscardElement(String name) {
+ return "STYLE".equals(name) || "SCRIPT".equals(name);
+ }
+
+}
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java?rev=949635&r1=949634&r2=949635&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java Sun May 30 23:49:05 2010
@@ -25,6 +25,7 @@ import org.apache.tika.sax.XHTMLContentH
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
class HtmlHandler extends TextContentHandler {
@@ -89,23 +90,40 @@ class HtmlHandler extends TextContentHan
metadata.set(
atts.getValue("http-equiv"),
atts.getValue("content"));
+ xhtml.startElement(uri, local, "meta", atts);
}
if (atts.getValue("name") != null) {
metadata.set(
atts.getValue("name"),
atts.getValue("content"));
+ xhtml.startElement(uri, local, "meta", atts);
}
} else if ("BASE".equals(name) && atts.getValue("href") != null) {
metadata.set(
Metadata.CONTENT_LOCATION,
resolve(atts.getValue("href").trim()));
+ xhtml.startElement(uri, local, "base", atts);
+ } else if ("LINK".equals(name) && atts.getValue("href") != null) {
+ xhtml.startElement(uri, local, "link", atts);
}
}
if (bodyLevel > 0 && discardLevel == 0) {
String safe = mapper.mapSafeElement(name);
if (safe != null) {
- xhtml.startElement(safe);
+ // check if there are any attributes to process
+ if (atts.getLength()==0) xhtml.startElement(safe);
+ else {
+ AttributesImpl newAttributes = new AttributesImpl(atts);
+ for (int att=0;att<newAttributes.getLength();att++){
+ String normAttrName = mapper.mapSafeAttribute(safe, newAttributes.getLocalName(att));
+ if (normAttrName==null){
+ newAttributes.removeAttribute(att);
+ att--;
+ }
+ }
+ xhtml.startElement(safe, newAttributes);
+ }
} else if ("A".equals(name)) {
String href = atts.getValue("href");
if (href != null) {
@@ -127,6 +145,15 @@ class HtmlHandler extends TextContentHan
@Override
public void endElement(
String uri, String local, String name) throws SAXException {
+ if (bodyLevel == 0 && discardLevel == 0) {
+ if ("LINK".equals(name)) {
+ xhtml.endElement("link");
+ } else if ("BASE".equals(name)) {
+ xhtml.endElement("base");
+ } else if ("META".equals(name)) {
+ xhtml.endElement("meta");
+ }
+ }
if (bodyLevel > 0 && discardLevel == 0) {
String safe = mapper.mapSafeElement(name);
if (safe != null) {
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlMapper.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlMapper.java?rev=949635&r1=949634&r2=949635&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlMapper.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlMapper.java Sun May 30 23:49:05 2010
@@ -1,54 +1,69 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.html;
-
-/**
- * HTML mapper used to make incoming HTML documents easier to handle by
- * Tika clients. The {@link HtmlParser} looks up an optional HTML mapper from
- * the parse context and uses it to map parsed HTML to "safe" XHTML. A client
- * that wants to customize this mapping can place a custom HtmlMapper instance
- * into the parse context.
- *
- * @since Apache Tika 0.6
- */
-public interface HtmlMapper {
-
- /**
- * Maps "safe" HTML element names to semantic XHTML equivalents. If the
- * given element is unknown or deemed unsafe for inclusion in the parse
- * output, then this method returns <code>null</code> and the element
- * will be ignored but the content inside it is still processed. See
- * the {@link #isDiscardElement(String)} method for a way to discard
- * the entire contents of an element.
- *
- * @param name HTML element name (upper case)
- * @return XHTML element name (lower case), or
- * <code>null</code> if the element is unsafe
- */
- String mapSafeElement(String name);
-
- /**
- * Checks whether all content within the given HTML element should be
- * discarded instead of including it in the parse output.
- *
- * @param name HTML element name (upper case)
- * @return <code>true</code> if content inside the named element
- * should be ignored, <code>false</code> otherwise
- */
- boolean isDiscardElement(String name);
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html;
+
+/**
+ * HTML mapper used to make incoming HTML documents easier to handle by
+ * Tika clients. The {@link HtmlParser} looks up an optional HTML mapper from
+ * the parse context and uses it to map parsed HTML to "safe" XHTML. A client
+ * that wants to customize this mapping can place a custom HtmlMapper instance
+ * into the parse context.
+ *
+ * @since Apache Tika 0.6
+ */
+public interface HtmlMapper {
+
+ /**
+ * Maps "safe" HTML element names to semantic XHTML equivalents. If the
+ * given element is unknown or deemed unsafe for inclusion in the parse
+ * output, then this method returns <code>null</code> and the element
+ * will be ignored but the content inside it is still processed. See
+ * the {@link #isDiscardElement(String)} method for a way to discard
+ * the entire contents of an element.
+ *
+ * @param name HTML element name (upper case)
+ * @return XHTML element name (lower case), or
+ * <code>null</code> if the element is unsafe
+ */
+ String mapSafeElement(String name);
+
+ /**
+ * Checks whether all content within the given HTML element should be
+ * discarded instead of including it in the parse output.
+ *
+ * @param name HTML element name (upper case)
+ * @return <code>true</code> if content inside the named element
+ * should be ignored, <code>false</code> otherwise
+ */
+ boolean isDiscardElement(String name);
+
+
+ /**
+ * Maps "safe" HTML attribute names to semantic XHTML equivalents. If the
+ * given attribute is unknown or deemed unsafe for inclusion in the parse
+ * output, then this method returns <code>null</code> and the attribute
+ * will be ignored. This method assumes that the element name
+ * is valid and normalised.
+ *
+ * @param elementName HTML element name (lower case)
+ * @param attributeName HTML attribute name (lower case)
+ * @return XHTML attribute name (lower case), or
+ * <code>null</code> if the attribute is unsafe
+ * @since Apache Tika 0.8
+ */
+ String mapSafeAttribute(String elementName, String attributeName);
+
+}
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=949635&r1=949634&r2=949635&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java Sun May 30 23:49:05 2010
@@ -234,6 +234,14 @@ public class HtmlParser implements Parse
}
/**
+ * Maps an attribute name by delegating to the
+ * <code>mapSafeAttribute</code> method of
+ * {@link DefaultHtmlMapper#INSTANCE}.
+ *
+ * @param elementName element name, passed through unchanged
+ * @param attributeName attribute name, passed through unchanged
+ * @return whatever the default mapper returns for the given names
+ * @deprecated Use the {@link HtmlMapper} mechanism to customize
+ * the HTML mapping. This method will be removed in Tika 1.0.
+ **/
+ public String mapSafeAttribute(String elementName, String attributeName) {
+ return DefaultHtmlMapper.INSTANCE.mapSafeAttribute(elementName,attributeName) ;
+ }
+
+ /**
* Adapter class that maintains backwards compatibility with the
* protected HtmlParser methods. Making HtmlParser implement HtmlMapper
* directly would require those methods to be public, which would break
@@ -249,6 +257,9 @@ public class HtmlParser implements Parse
public boolean isDiscardElement(String name) {
return HtmlParser.this.isDiscardElement(name);
}
+ // Forwards to the enclosing HtmlParser instance's mapSafeAttribute,
+ // mirroring the other adapter methods of this class.
+ public String mapSafeAttribute(String elementName, String attributeName){
+ return HtmlParser.this.mapSafeAttribute(elementName,attributeName);
+ }
}
}
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/IdentityHtmlMapper.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/IdentityHtmlMapper.java?rev=949635&r1=949634&r2=949635&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/IdentityHtmlMapper.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/IdentityHtmlMapper.java Sun May 30 23:49:05 2010
@@ -17,9 +17,9 @@
package org.apache.tika.parser.html;
/**
- * Alternative HTML mapping rules that pass the input HTML
- * as-is without any modifications.
- *
+ * Alternative HTML mapping rules that pass the input HTML as-is without any
+ * modifications.
+ *
* @since Apache Tika 0.8
*/
public class IdentityHtmlMapper implements HtmlMapper {
@@ -30,6 +30,10 @@ public class IdentityHtmlMapper implemen
return false;
}
+ /**
+ * Passes the attribute through, lower-casing its name. The conversion
+ * uses a fixed locale so that the result does not depend on the JVM's
+ * default locale (under a Turkish default locale, for example, "TITLE"
+ * would otherwise not become "title").
+ *
+ * @param elementName element name (not inspected here)
+ * @param attributeName attribute name; must not be <code>null</code>
+ * @return the lower-cased attribute name
+ */
+ public String mapSafeAttribute(String elementName, String attributeName) {
+ return attributeName.toLowerCase(java.util.Locale.ENGLISH);
+ }
+
public String mapSafeElement(String name) {
return name;
}
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java?rev=949635&r1=949634&r2=949635&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java Sun May 30 23:49:05 2010
@@ -194,7 +194,7 @@ public class HtmlParserTest extends Test
@Override
public void startElement(
String u, String l, String name, Attributes atts) {
- if (atts.getValue("", "href") != null) {
+ // Only anchors count as links here: HtmlHandler now also emits
+ // link and base elements, which may carry an href attribute too.
+ if (name.equals("a") && atts.getValue("", "href") != null) {
links.add(atts.getValue("", "href"));
}
}