You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ma...@apache.org on 2010/05/31 01:49:05 UTC

svn commit: r949635 - in /tika/trunk: ./ tika-core/src/main/java/org/apache/tika/sax/ tika-parsers/src/main/java/org/apache/tika/parser/html/ tika-parsers/src/test/java/org/apache/tika/parser/html/

Author: mattmann
Date: Sun May 30 23:49:05 2010
New Revision: 949635

URL: http://svn.apache.org/viewvc?rev=949635&view=rev
Log:
- fix for TIKA-379 Html elements and attributes not available in XHTML representation

Modified:
    tika/trunk/CHANGES.txt
    tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlMapper.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/IdentityHtmlMapper.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java

Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=949635&r1=949634&r2=949635&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Sun May 30 23:49:05 2010
@@ -4,6 +4,8 @@ Release 0.8 - Current Development
 
 The most notable changes in Tika 0.8 over previous releases are:
 
+ * An approach for plumbing through XHTML attributes was added (TIKA-379)
+
  * Media type hierarchy information is now taken into account when
    selecting the best parser for a given input document. (TIKA-298)
 

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java?rev=949635&r1=949634&r2=949635&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java Sun May 30 23:49:05 2010
@@ -188,6 +188,11 @@ public class XHTMLContentHandler extends
         startElement(XHTML, name, name, attributes);
     }
 
+    public void startElement(String name, AttributesImpl attributes)
+            throws SAXException {
+        startElement(XHTML, name, name, attributes);
+    }
+
     public void endElement(String name) throws SAXException {
         endElement(XHTML, name, name);
     }

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java?rev=949635&r1=949634&r2=949635&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java Sun May 30 23:49:05 2010
@@ -1,69 +1,75 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.html;
-
-/**
- * The default HTML mapping rules in Tika.
- *
- * @since Apache Tika 0.6
- */
-public class DefaultHtmlMapper implements HtmlMapper {
-
-    /**
-     * @since Apache Tika 0.8
-     */
-    public static final HtmlMapper INSTANCE = new DefaultHtmlMapper();
-
-    public String mapSafeElement(String name) {
-        // Based on http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd
-
-        if ("H1".equals(name)) return "h1";
-        if ("H2".equals(name)) return "h2";
-        if ("H3".equals(name)) return "h3";
-        if ("H4".equals(name)) return "h4";
-        if ("H5".equals(name)) return "h5";
-        if ("H6".equals(name)) return "h6";
-
-        if ("P".equals(name)) return "p";
-        if ("PRE".equals(name)) return "pre";
-        if ("BLOCKQUOTE".equals(name)) return "blockquote";
-
-        if ("UL".equals(name)) return "ul";
-        if ("OL".equals(name)) return "ol";
-        if ("MENU".equals(name)) return "ul";
-        if ("LI".equals(name)) return "li";
-        if ("DL".equals(name)) return "dl";
-        if ("DT".equals(name)) return "dt";
-        if ("DD".equals(name)) return "dd";
-
-        if ("TABLE".equals(name)) return "table";
-        if ("THEAD".equals(name)) return "thead";
-        if ("TBODY".equals(name)) return "tbody";
-        if ("TR".equals(name)) return "tr";
-        if ("TH".equals(name)) return "th";
-        if ("TD".equals(name)) return "td";
-
-        if ("ADDRESS".equals(name)) return "address";
-
-        return null;
-    }
-
-    public boolean isDiscardElement(String name) {
-        return "STYLE".equals(name) || "SCRIPT".equals(name);
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html;
+
+/**
+ * The default HTML mapping rules in Tika.
+ *
+ * @since Apache Tika 0.6
+ */
+public class DefaultHtmlMapper implements HtmlMapper {
+
+    /**
+     * @since Apache Tika 0.8
+     */
+    public static final HtmlMapper INSTANCE = new DefaultHtmlMapper();
+
+    public String mapSafeElement(String name) {
+        // Based on http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd
+
+        if ("H1".equals(name)) return "h1";
+        if ("H2".equals(name)) return "h2";
+        if ("H3".equals(name)) return "h3";
+        if ("H4".equals(name)) return "h4";
+        if ("H5".equals(name)) return "h5";
+        if ("H6".equals(name)) return "h6";
+
+        if ("P".equals(name)) return "p";
+        if ("PRE".equals(name)) return "pre";
+        if ("BLOCKQUOTE".equals(name)) return "blockquote";
+
+        if ("UL".equals(name)) return "ul";
+        if ("OL".equals(name)) return "ol";
+        if ("MENU".equals(name)) return "ul";
+        if ("LI".equals(name)) return "li";
+        if ("DL".equals(name)) return "dl";
+        if ("DT".equals(name)) return "dt";
+        if ("DD".equals(name)) return "dd";
+
+        if ("TABLE".equals(name)) return "table";
+        if ("THEAD".equals(name)) return "thead";
+        if ("TBODY".equals(name)) return "tbody";
+        if ("TR".equals(name)) return "tr";
+        if ("TH".equals(name)) return "th";
+        if ("TD".equals(name)) return "td";
+
+        if ("ADDRESS".equals(name)) return "address";
+
+        return null;
+    }
+
+    /** Maps an attribute name to its safe XHTML form. The default mapper
+     * returns null for every attribute. Assumes the element name is valid and normalised. **/
+    public String mapSafeAttribute(String elementName, String attributeName) {
+        return null;
+    }    
+    
+    public boolean isDiscardElement(String name) {
+        return "STYLE".equals(name) || "SCRIPT".equals(name);
+    }
+
+}

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java?rev=949635&r1=949634&r2=949635&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java Sun May 30 23:49:05 2010
@@ -25,6 +25,7 @@ import org.apache.tika.sax.XHTMLContentH
 import org.xml.sax.Attributes;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
 
 class HtmlHandler extends TextContentHandler {
 
@@ -89,23 +90,40 @@ class HtmlHandler extends TextContentHan
                     metadata.set(
                             atts.getValue("http-equiv"),
                             atts.getValue("content"));
+                    xhtml.startElement(uri, local, "meta", atts);
                 }
                 if (atts.getValue("name") != null) {
                     metadata.set(
                             atts.getValue("name"),
                             atts.getValue("content"));
+                    xhtml.startElement(uri, local, "meta", atts);
                 }
             } else if ("BASE".equals(name) && atts.getValue("href") != null) {
                 metadata.set(
                         Metadata.CONTENT_LOCATION,
                         resolve(atts.getValue("href").trim()));
+                xhtml.startElement(uri, local, "base", atts);
+            } else if ("LINK".equals(name) && atts.getValue("href") != null) {
+                xhtml.startElement(uri, local, "link", atts);
             }
         }
 
         if (bodyLevel > 0 && discardLevel == 0) {
             String safe = mapper.mapSafeElement(name);
             if (safe != null) {
-                xhtml.startElement(safe);
+                // check if there are any attributes to process
+                if (atts.getLength()==0) xhtml.startElement(safe);
+                else {
+                    AttributesImpl newAttributes = new AttributesImpl(atts);
+                    for (int att=0;att<newAttributes.getLength();att++){
+                        String normAttrName = mapper.mapSafeAttribute(safe, newAttributes.getLocalName(att));
+                        if (normAttrName==null){
+                            newAttributes.removeAttribute(att);
+                            att--;
+                        }
+                    }
+                    xhtml.startElement(safe, newAttributes);
+                }
             } else if ("A".equals(name)) {
                 String href = atts.getValue("href");
                 if (href != null) {
@@ -127,6 +145,15 @@ class HtmlHandler extends TextContentHan
     @Override
     public void endElement(
             String uri, String local, String name) throws SAXException {
+        if (bodyLevel == 0 && discardLevel == 0) {
+            if ("LINK".equals(name)) {
+                xhtml.endElement("link");
+            } else if ("BASE".equals(name)) {
+                xhtml.endElement("base");
+            } else if ("META".equals(name)) {
+                xhtml.endElement("meta");
+            }
+        }
         if (bodyLevel > 0 && discardLevel == 0) {
             String safe = mapper.mapSafeElement(name);
             if (safe != null) {

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlMapper.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlMapper.java?rev=949635&r1=949634&r2=949635&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlMapper.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlMapper.java Sun May 30 23:49:05 2010
@@ -1,54 +1,69 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.html;
-
-/**
- * HTML mapper used to make incoming HTML documents easier to handle by
- * Tika clients. The {@link HtmlParser} looks up an optional HTML mapper from
- * the parse context and uses it to map parsed HTML to "safe" XHTML. A client
- * that wants to customize this mapping can place a custom HtmlMapper instance
- * into the parse context.
- *
- * @since Apache Tika 0.6
- */
-public interface HtmlMapper {
-
-    /**
-     * Maps "safe" HTML element names to semantic XHTML equivalents. If the
-     * given element is unknown or deemed unsafe for inclusion in the parse
-     * output, then this method returns <code>null</code> and the element
-     * will be ignored but the content inside it is still processed. See
-     * the {@link #isDiscardElement(String)} method for a way to discard
-     * the entire contents of an element.
-     *
-     * @param name HTML element name (upper case)
-     * @return XHTML element name (lower case), or
-     *         <code>null</code> if the element is unsafe 
-     */
-    String mapSafeElement(String name);
-
-    /**
-     * Checks whether all content within the given HTML element should be
-     * discarded instead of including it in the parse output.
-     *
-     * @param name HTML element name (upper case)
-     * @return <code>true</code> if content inside the named element
-     *         should be ignored, <code>false</code> otherwise
-     */
-    boolean isDiscardElement(String name);
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html;
+
+/**
+ * HTML mapper used to make incoming HTML documents easier to handle by
+ * Tika clients. The {@link HtmlParser} looks up an optional HTML mapper from
+ * the parse context and uses it to map parsed HTML to "safe" XHTML. A client
+ * that wants to customize this mapping can place a custom HtmlMapper instance
+ * into the parse context.
+ *
+ * @since Apache Tika 0.6
+ */
+public interface HtmlMapper {
+
+    /**
+     * Maps "safe" HTML element names to semantic XHTML equivalents. If the
+     * given element is unknown or deemed unsafe for inclusion in the parse
+     * output, then this method returns <code>null</code> and the element
+     * will be ignored but the content inside it is still processed. See
+     * the {@link #isDiscardElement(String)} method for a way to discard
+     * the entire contents of an element.
+     *
+     * @param name HTML element name (upper case)
+     * @return XHTML element name (lower case), or
+     *         <code>null</code> if the element is unsafe 
+     */
+    String mapSafeElement(String name);
+
+    /**
+     * Checks whether all content within the given HTML element should be
+     * discarded instead of including it in the parse output.
+     *
+     * @param name HTML element name (upper case)
+     * @return <code>true</code> if content inside the named element
+     *         should be ignored, <code>false</code> otherwise
+     */
+    boolean isDiscardElement(String name);
+    
+    
+    /**
+     * Maps "safe" HTML attribute names to semantic XHTML equivalents. If the
+     * given attribute is unknown or deemed unsafe for inclusion in the parse
+     * output, then this method returns <code>null</code> and the attribute
+     * will be ignored. This method assumes that the element name 
+     * is valid and normalised.
+     *
+     * @param elementName HTML element name (lower case)
+     * @param attributeName HTML attribute name (lower case)
+     * @return XHTML attribute name (lower case), or
+     *         <code>null</code> if the attribute is unsafe
+     */
+    String mapSafeAttribute(String elementName, String attributeName);
+
+}

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=949635&r1=949634&r2=949635&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java Sun May 30 23:49:05 2010
@@ -234,6 +234,14 @@ public class HtmlParser implements Parse
     }
 
     /**
+    * @deprecated Use the {@link HtmlMapper} mechanism to customize
+    *             the HTML mapping. This method will be removed in Tika 1.0.
+    **/
+    public String mapSafeAttribute(String elementName, String attributeName) {
+        return DefaultHtmlMapper.INSTANCE.mapSafeAttribute(elementName,attributeName) ;
+    }    
+    
+    /**
      * Adapter class that maintains backwards compatibility with the
      * protected HtmlParser methods. Making HtmlParser implement HtmlMapper
      * directly would require those methods to be public, which would break
@@ -249,6 +257,9 @@ public class HtmlParser implements Parse
         public boolean isDiscardElement(String name) {
             return HtmlParser.this.isDiscardElement(name);
         }
+        public String mapSafeAttribute(String elementName, String attributeName){
+            return HtmlParser.this.mapSafeAttribute(elementName,attributeName);
+        }
     }
 
 }

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/IdentityHtmlMapper.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/IdentityHtmlMapper.java?rev=949635&r1=949634&r2=949635&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/IdentityHtmlMapper.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/IdentityHtmlMapper.java Sun May 30 23:49:05 2010
@@ -17,9 +17,9 @@
 package org.apache.tika.parser.html;
 
 /**
- * Alternative HTML mapping rules that pass the input HTML
- * as-is without any modifications.
- *
+ * Alternative HTML mapping rules that pass the input HTML as-is without any
+ * modifications.
+ * 
  * @since Apache Tika 0.8
  */
 public class IdentityHtmlMapper implements HtmlMapper {
@@ -30,6 +30,10 @@ public class IdentityHtmlMapper implemen
         return false;
     }
 
+    public String mapSafeAttribute(String elementName, String attributeName) {
+        return attributeName.toLowerCase();
+    }
+
     public String mapSafeElement(String name) {
         return name;
     }

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java?rev=949635&r1=949634&r2=949635&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java Sun May 30 23:49:05 2010
@@ -194,7 +194,7 @@ public class HtmlParserTest extends Test
                     @Override
                     public void startElement(
                             String u, String l, String name, Attributes atts) {
-                        if (atts.getValue("", "href") != null) {
+                        if (name.equals("a") && atts.getValue("", "href") != null) {
                             links.add(atts.getValue("", "href"));
                         }
                     }