You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2015/04/11 02:46:47 UTC

svn commit: r1672805 - in /tika/trunk: CHANGES.txt tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java

Author: tallison
Date: Sat Apr 11 00:46:46 2015
New Revision: 1672805

URL: http://svn.apache.org/r1672805
Log:
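TIKA-1519: add charset information to the Content-Type metadata for the non-HTML formats, too: XHTML (application/xhtml+xml), WAP XHTML (application/vnd.wap.xhtml+xml) and x-asp (application/x-asp)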

Modified:
    tika/trunk/CHANGES.txt
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java

Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1672805&r1=1672804&r2=1672805&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Sat Apr 11 00:46:46 2015
@@ -1,8 +1,11 @@
 Release 1.9 - ???
+
+
+Release 1.8 - 4/7/2015
+
   * Upgrade to com.drewnoakes' metadata-extractor to 2.0 and
     add parser for webp metadata (TIKA-1594).
 
-Release 1.8 - 4/7/2015
   * Duration extracted from MP3s with no ID3 tags (TIKA-1589).
 
   * Upgraded to PDFBox 1.8.9 (TIKA-1575).
@@ -76,7 +79,7 @@ Release 1.8 - 4/7/2015
   * Increased the speed of language identification by 
     a factor of two -- contributed by Toke Eskildsen (TIKA-1549).
 
-  * Added parser for Sqlite3 db files. Beware: the org.xerial 
+  * Added parser for Sqlite3 db files. BEWARE: the org.xerial 
     dependency includes native libs. Some users may need to 
     exclude this dependency or configure it specially for 
     their environment (TIKA-1511).

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=1672805&r1=1672804&r2=1672805&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java Sat Apr 11 00:46:46 2015
@@ -47,12 +47,16 @@ public class HtmlParser extends Abstract
     /** Serial version UID */
     private static final long serialVersionUID = 7895315240498733128L;
 
+    private static final MediaType XHTML = MediaType.application("xhtml+xml");
+    private static final MediaType WAP_XHTML = MediaType.application("vnd.wap.xhtml+xml");
+    private static final MediaType X_ASP = MediaType.application("x-asp");
+
     private static final Set<MediaType> SUPPORTED_TYPES =
         Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
                 MediaType.text("html"),
-                MediaType.application("xhtml+xml"),
-                MediaType.application("vnd.wap.xhtml+xml"),
-                MediaType.application("x-asp"))));
+                XHTML,
+                WAP_XHTML,
+                X_ASP)));
 
     private static final ServiceLoader LOADER =
             new ServiceLoader(HtmlParser.class.getClassLoader());
@@ -62,6 +66,7 @@ public class HtmlParser extends Abstract
      */
     private static final Schema HTML_SCHEMA = new HTMLSchema();
 
+
     public Set<MediaType> getSupportedTypes(ParseContext context) {
         return SUPPORTED_TYPES;
     }
@@ -77,9 +82,18 @@ public class HtmlParser extends Abstract
         try {
             Charset charset = reader.getCharset();
             String previous = metadata.get(Metadata.CONTENT_TYPE);
+            MediaType contentType = null;
             if (previous == null || previous.startsWith("text/html")) {
-                MediaType type = new MediaType(MediaType.TEXT_HTML, charset);
-                metadata.set(Metadata.CONTENT_TYPE, type.toString());
+                contentType = new MediaType(MediaType.TEXT_HTML, charset);
+            } else if (previous.startsWith("application/xhtml+xml")) {
+                contentType = new MediaType(XHTML, charset);
+            } else if (previous.startsWith("application/vnd.wap.xhtml+xml")) {
+                contentType = new MediaType(WAP_XHTML, charset);
+            } else if (previous.startsWith("application/x-asp")) {
+                contentType = new MediaType(X_ASP, charset);
+            }
+            if (contentType != null) {
+                metadata.set(Metadata.CONTENT_TYPE, contentType.toString());
             }
             // deprecated, see TIKA-431
             metadata.set(Metadata.CONTENT_ENCODING, charset.name());
@@ -153,7 +167,7 @@ public class HtmlParser extends Abstract
     *             the HTML mapping. This method will be removed in Tika 1.0.
     **/
     public String mapSafeAttribute(String elementName, String attributeName) {
-        return DefaultHtmlMapper.INSTANCE.mapSafeAttribute(elementName,attributeName) ;
+        return DefaultHtmlMapper.INSTANCE.mapSafeAttribute(elementName, attributeName) ;
     }    
     
     /**

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java?rev=1672805&r1=1672804&r2=1672805&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java Sat Apr 11 00:46:46 2015
@@ -26,7 +26,6 @@ import javax.xml.transform.OutputKeys;
 import javax.xml.transform.sax.SAXTransformerFactory;
 import javax.xml.transform.sax.TransformerHandler;
 import javax.xml.transform.stream.StreamResult;
-
 import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.InputStream;
@@ -35,12 +34,14 @@ import java.io.Writer;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.regex.Pattern;
+
 import org.apache.tika.Tika;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.IOUtils;
 import org.apache.tika.metadata.Geographic;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.sax.BodyContentHandler;
 import org.apache.tika.sax.LinkContentHandler;
@@ -134,7 +135,8 @@ public class HtmlParserTest {
         String content = new Tika().parseToString(
                 HtmlParserTest.class.getResourceAsStream(path), metadata);
 
-        assertEquals("application/xhtml+xml", metadata.get(Metadata.CONTENT_TYPE));
+        //can't specify charset because default differs between OS's
+        assertTrue(metadata.get(Metadata.CONTENT_TYPE).startsWith("application/xhtml+xml; charset="));
         assertEquals("XHTML test document", metadata.get(TikaCoreProperties.TITLE));
 
         assertEquals("Tika Developers", metadata.get("Author"));
@@ -149,7 +151,7 @@ public class HtmlParserTest {
         ContentHandler handler = new BodyContentHandler();
         new HtmlParser().parse(
                 new ByteArrayInputStream(new byte[0]),
-                handler,  new Metadata(), new ParseContext());
+                handler, new Metadata(), new ParseContext());
         assertEquals("", handler.toString());
     }
 
@@ -261,9 +263,9 @@ public class HtmlParserTest {
             + "<title>the name is \u00e1ndre</title>"
             + "</head><body></body></html>";
         Metadata metadata = new Metadata();
-        new HtmlParser().parse (
+        new HtmlParser().parse(
                 new ByteArrayInputStream(test.getBytes("ISO-8859-1")),
-                new BodyContentHandler(),  metadata, new ParseContext());
+                new BodyContentHandler(), metadata, new ParseContext());
         assertEquals("ISO-8859-1", metadata.get(Metadata.CONTENT_ENCODING));
     }
 
@@ -352,9 +354,9 @@ public class HtmlParserTest {
         String test = "<html><title>Simple Content</title><body></body></html>";
         Metadata metadata = new Metadata();
         metadata.add(Metadata.CONTENT_LANGUAGE, "en");
-        new HtmlParser().parse (
+        new HtmlParser().parse(
                 new ByteArrayInputStream(test.getBytes(IOUtils.UTF_8)),
-                new BodyContentHandler(),  metadata, new ParseContext());
+                new BodyContentHandler(), metadata, new ParseContext());
 
         assertEquals("en", metadata.get(Metadata.CONTENT_LANGUAGE));
     }
@@ -867,9 +869,9 @@ public class HtmlParserTest {
             + "<title>hello</title>"
             + "</head><body></body></html>";
         Metadata metadata = new Metadata();
-        new HtmlParser().parse (
+        new HtmlParser().parse(
                 new ByteArrayInputStream(test1.getBytes("ISO-8859-1")),
-                new BodyContentHandler(),  metadata, new ParseContext());
+                new BodyContentHandler(), metadata, new ParseContext());
         assertEquals("some description", metadata.get("og:description"));
         assertTrue(metadata.isMultiValued("og:image"));
     }
@@ -993,7 +995,7 @@ public class HtmlParserTest {
         // The text occurs at line 24 (if lines start at 0) or 25 (if lines start at 1).
         assertEquals(24, textPosition[line]);
         // The column reported seems fuzzy, just test it is close enough.
-        assertTrue(Math.abs(textPosition[col]-47) < 10);
+        assertTrue(Math.abs(textPosition[col] - 47) < 10);
     }
     
     
@@ -1009,9 +1011,9 @@ public class HtmlParserTest {
         		+ "<title>TitleToIgnore</title></body></html>";
         Metadata metadata = new Metadata();
         
-        new HtmlParser().parse (
+        new HtmlParser().parse(
                 new ByteArrayInputStream(test.getBytes(IOUtils.UTF_8)),
-                new BodyContentHandler(),  metadata, new ParseContext());
+                new BodyContentHandler(), metadata, new ParseContext());
 
         //Expecting first title to be set in meta data and second one to be ignored.
         assertEquals("Simple Content", metadata.get(TikaCoreProperties.TITLE));
@@ -1051,6 +1053,38 @@ public class HtmlParserTest {
                 new BodyContentHandler(), metadata, new ParseContext());
         assertEquals("application/ms-word", metadata.get(TikaCoreProperties.CONTENT_TYPE_HINT));
         assertEquals("text/html; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
+    }
+
+    @Test
+    public void testXHTMLWithMisleading() throws Exception {
+        //first test an acceptable XHTML header with http-equiv tags
+        String test = "<?xml version=\"1.0\" ?>"+
+                "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n" +
+                "<html xmlns=\"http://www.w3.org/1999/xhtml\">\n" +
+                "<head>\n" +
+                "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=iso-8859-1\" />\n" +
+                "<title>title</title></head><body>body</body></html>";
+        Metadata metadata = new Metadata();
+        new AutoDetectParser().parse(
+                new ByteArrayInputStream(test.getBytes(IOUtils.UTF_8)),
+                new BodyContentHandler(), metadata, new ParseContext());
+
+        assertEquals("text/html; charset=iso-8859-1", metadata.get(TikaCoreProperties.CONTENT_TYPE_HINT));
+        assertEquals("application/xhtml+xml; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
+
+        test = "<?xml version=\"1.0\" ?>"+
+                "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n" +
+                "<html xmlns=\"http://www.w3.org/1999/xhtml\">\n" +
+                "<head>\n" +
+                "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=iso-NUMBER_SEVEN\" />\n" +
+                "<title>title</title></head><body>body</body></html>";
+        metadata = new Metadata();
+        new AutoDetectParser().parse(
+                new ByteArrayInputStream(test.getBytes(IOUtils.UTF_8)),
+                new BodyContentHandler(), metadata, new ParseContext());
+
+        assertEquals("text/html; charset=iso-NUMBER_SEVEN", metadata.get(TikaCoreProperties.CONTENT_TYPE_HINT));
+        assertEquals("application/xhtml+xml; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
 
     }
 }