You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by bo...@apache.org on 2016/01/06 04:50:57 UTC

svn commit: r1723223 [29/32] - in /tika/branches/2.x: tika-core/src/test/resources/META-INF/ tika-core/src/test/resources/META-INF/services/ tika-parser-modules/ tika-parser-modules/tika-advanced-module/ tika-parser-modules/tika-advanced-module/src/ ti...

Added: tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,101 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xml;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+import java.io.InputStream;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.helpers.DefaultHandler;
+
+public class DcXMLParserTest extends TikaTest {
+
+    @Test
+    public void testXMLParserAsciiChars() throws Exception {
+        try (InputStream input = DcXMLParserTest.class.getResourceAsStream(
+                "/test-documents/testXML.xml")) {
+            Metadata metadata = new Metadata();
+            ContentHandler handler = new BodyContentHandler();
+            new DcXMLParser().parse(input, handler, metadata);
+
+            assertEquals(
+                    "application/xml",
+                    metadata.get(Metadata.CONTENT_TYPE));
+            assertEquals("Tika test document", metadata.get(TikaCoreProperties.TITLE));
+            assertEquals("Rida Benjelloun", metadata.get(TikaCoreProperties.CREATOR));
+
+            // The file contains 5 dc:subject tags, which come through as
+            //  a multi-valued Tika Metadata entry in file order
+            assertEquals(true, metadata.isMultiValued(TikaCoreProperties.KEYWORDS));
+            assertEquals(5, metadata.getValues(TikaCoreProperties.KEYWORDS).length);
+            assertEquals("Java", metadata.getValues(TikaCoreProperties.KEYWORDS)[0]);
+            assertEquals("XML", metadata.getValues(TikaCoreProperties.KEYWORDS)[1]);
+            assertEquals("XSLT", metadata.getValues(TikaCoreProperties.KEYWORDS)[2]);
+            assertEquals("JDOM", metadata.getValues(TikaCoreProperties.KEYWORDS)[3]);
+            assertEquals("Indexation", metadata.getValues(TikaCoreProperties.KEYWORDS)[4]);
+            assertEquals(true, metadata.isMultiValued(Metadata.SUBJECT));
+            assertEquals(5, metadata.getValues(Metadata.SUBJECT).length);
+            assertEquals("Java", metadata.getValues(Metadata.SUBJECT)[0]);
+            assertEquals("XML", metadata.getValues(Metadata.SUBJECT)[1]);
+            assertEquals("XSLT", metadata.getValues(Metadata.SUBJECT)[2]);
+            assertEquals("JDOM", metadata.getValues(Metadata.SUBJECT)[3]);
+            assertEquals("Indexation", metadata.getValues(Metadata.SUBJECT)[4]);
+
+            assertEquals(
+                    "Framework d\'indexation des documents XML, HTML, PDF etc..",
+                    metadata.get(TikaCoreProperties.DESCRIPTION));
+            assertEquals(
+                    "http://www.apache.org",
+                    metadata.get(TikaCoreProperties.IDENTIFIER));
+            assertEquals("test", metadata.get(TikaCoreProperties.TYPE));
+            assertEquals("application/msword", metadata.get(TikaCoreProperties.FORMAT));
+            assertEquals("Fr", metadata.get(TikaCoreProperties.LANGUAGE));
+            assertTrue(metadata.get(TikaCoreProperties.RIGHTS).contains("testing chars"));
+
+            String content = handler.toString();
+            assertContains("Tika test document", content);
+
+            assertEquals("2000-12-01T00:00:00.000Z", metadata.get(TikaCoreProperties.CREATED));
+        }
+    }
+    
+    @Test
+    public void testXMLParserNonAsciiChars() throws Exception {
+        try (InputStream input = DcXMLParserTest.class.getResourceAsStream("/test-documents/testXML.xml")) {
+            Metadata metadata = new Metadata();
+            new DcXMLParser().parse(input, new DefaultHandler(), metadata);
+
+            final String expected = "Archim\u00E8de et Lius \u00E0 Ch\u00E2teauneuf testing chars en \u00E9t\u00E9";
+            assertEquals(expected, metadata.get(TikaCoreProperties.RIGHTS));
+        }
+    }
+
+    // TIKA-1048
+    @Test
+    public void testNoSpaces() throws Exception {
+      String text = getXML("testXML2.xml").xml;
+      assertFalse(text.contains("testSubject"));
+    }
+}

Added: tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/xml/EmptyAndDuplicateElementsXMLParserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/xml/EmptyAndDuplicateElementsXMLParserTest.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/xml/EmptyAndDuplicateElementsXMLParserTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/xml/EmptyAndDuplicateElementsXMLParserTest.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,126 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xml;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.InputStream;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.TeeContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+public class EmptyAndDuplicateElementsXMLParserTest extends TikaTest {
+    
+    private Property FIRST_NAME = Property.internalTextBag(
+            "custom" + Metadata.NAMESPACE_PREFIX_DELIMITER + "FirstName");
+    private Property LAST_NAME = Property.internalTextBag(
+            "custom" + Metadata.NAMESPACE_PREFIX_DELIMITER + "LastName");
+
+    @Test
+    public void testDefaultBehavior() throws Exception {
+        try (InputStream input = EmptyAndDuplicateElementsXMLParserTest.class.getResourceAsStream(
+                "/test-documents/testXML3.xml")) {
+            Metadata metadata = new Metadata();
+            ContentHandler handler = new BodyContentHandler();
+            new DefaultCustomXMLTestParser().parse(input, handler, metadata, new ParseContext());
+
+            assertEquals(4, metadata.getValues(FIRST_NAME).length);
+            assertEquals(2, metadata.getValues(LAST_NAME).length);
+
+            assertEquals("John", metadata.getValues(FIRST_NAME)[0]);
+            assertEquals("Smith", metadata.getValues(LAST_NAME)[0]);
+
+            assertEquals("Jane", metadata.getValues(FIRST_NAME)[1]);
+            assertEquals("Doe", metadata.getValues(LAST_NAME)[1]);
+
+            // We didn't know Bob's last name, but now we don't know an entry existed
+            assertEquals("Bob", metadata.getValues(FIRST_NAME)[2]);
+
+            // We don't know Kate's last name because it was a duplicate
+            assertEquals("Kate", metadata.getValues(FIRST_NAME)[3]);
+        }
+    }
+    
+    @Test
+    public void testEmptiesAndRepeats() throws Exception {
+        try (InputStream input = EmptyAndDuplicateElementsXMLParserTest.class.getResourceAsStream(
+                "/test-documents/testXML3.xml")) {
+            Metadata metadata = new Metadata();
+            ContentHandler handler = new BodyContentHandler();
+            new AllowEmptiesAndDuplicatesCustomXMLTestParser().parse(input, handler, metadata, new ParseContext());
+
+            assertEquals(4, metadata.getValues(FIRST_NAME).length);
+            assertEquals(4, metadata.getValues(LAST_NAME).length);
+
+            assertEquals("John", metadata.getValues(FIRST_NAME)[0]);
+            assertEquals("Smith", metadata.getValues(LAST_NAME)[0]);
+
+            assertEquals("Jane", metadata.getValues(FIRST_NAME)[1]);
+            assertEquals("Doe", metadata.getValues(LAST_NAME)[1]);
+
+            assertEquals("Bob", metadata.getValues(FIRST_NAME)[2]);
+            assertEquals("", metadata.getValues(LAST_NAME)[2]);
+
+            assertEquals("Kate", metadata.getValues(FIRST_NAME)[3]);
+            assertEquals("Smith", metadata.getValues(LAST_NAME)[3]);
+        }
+    }
+    
+    private class DefaultCustomXMLTestParser extends XMLParser {
+    
+        private static final long serialVersionUID = 2458579047014545931L;
+
+        protected ElementMetadataHandler getCustomElementHandler(Metadata metadata, Property tikaProperty, String localPart) {
+            return new ElementMetadataHandler(
+                    "http://custom",
+                    localPart,
+                    metadata,
+                    tikaProperty);
+        }
+        
+        protected ContentHandler getContentHandler(
+                ContentHandler handler, Metadata metadata, ParseContext context) {
+            return new TeeContentHandler(
+                    super.getContentHandler(handler, metadata, context),
+                    getCustomElementHandler(metadata, FIRST_NAME, "FirstName"),
+                    getCustomElementHandler(metadata, LAST_NAME, "LastName"));
+        }
+    }
+    
+    private class AllowEmptiesAndDuplicatesCustomXMLTestParser extends DefaultCustomXMLTestParser {
+        
+        private static final long serialVersionUID = 3735646809954466229L;
+
+        protected ElementMetadataHandler getCustomElementHandler(Metadata metadata, Property tikaProperty, String localPart) {
+            return new ElementMetadataHandler(
+                    "http://custom",
+                    localPart,
+                    metadata,
+                    tikaProperty,
+                    true,
+                    true);
+        }
+    }
+    
+    
+}

Added: tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/xml/FictionBookParserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/xml/FictionBookParserTest.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/xml/FictionBookParserTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/xml/FictionBookParserTest.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xml;
+
+import static org.apache.tika.TikaTest.assertContains;
+import static org.junit.Assert.assertEquals;
+
+import java.io.InputStream;
+
+import org.apache.tika.TikaTest.TrackingHandler;
+import org.apache.tika.extractor.ContainerExtractor;
+import org.apache.tika.extractor.ParserContainerExtractor;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+public class FictionBookParserTest {
+  
+    @Test
+    public void testFB2() throws Exception {
+        try (InputStream input = FictionBookParserTest.class.getResourceAsStream("/test-documents/test.fb2")) {
+            Metadata metadata = new Metadata();
+            ContentHandler handler = new BodyContentHandler();
+            new FictionBookParser().parse(input, handler, metadata, new ParseContext());
+            String content = handler.toString();
+
+            assertContains("1812", content);
+        }
+    }
+
+    @Test
+    public void testEmbedded() throws Exception {
+        try (InputStream input = FictionBookParserTest.class.getResourceAsStream("/test-documents/test.fb2")) {
+            ContainerExtractor extractor = new ParserContainerExtractor();
+            TikaInputStream stream = TikaInputStream.get(input);
+
+            assertEquals(true, extractor.isSupported(stream));
+
+            // Process it
+            TrackingHandler handler = new TrackingHandler();
+            extractor.extract(stream, null, handler);
+
+            assertEquals(2, handler.filenames.size());
+        }
+    }
+}

Added: tika/branches/2.x/tika-parser-modules/tika-web-module/pom.xml
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-web-module/pom.xml?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-web-module/pom.xml (added)
+++ tika/branches/2.x/tika-parser-modules/tika-web-module/pom.xml Wed Jan  6 03:50:50 2016
@@ -0,0 +1,102 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
+  license agreements. See the NOTICE file distributed with this work for additional 
+  information regarding copyright ownership. The ASF licenses this file to 
+  you under the Apache License, Version 2.0 (the "License"); you may not use 
+  this file except in compliance with the License. You may obtain a copy of 
+  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
+  by applicable law or agreed to in writing, software distributed under the 
+  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
+  OF ANY KIND, either express or implied. See the License for the specific 
+  language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+  <parent>
+    <groupId>org.apache.tika</groupId>
+    <artifactId>tika-parser-modules</artifactId>
+    <version>2.0-SNAPSHOT</version>
+  </parent>
+
+  <artifactId>tika-web-module</artifactId>
+  <name>Apache Tika Web Module</name>
+  <url>http://tika.apache.org/</url>
+  
+  <properties>
+    <mime4j.version>0.7.2</mime4j.version>
+  </properties>
+  
+  <dependencies>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-core</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-core</artifactId>
+      <version>${project.version}</version>
+      <type>test-jar</type>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.ccil.cowan.tagsoup</groupId>
+      <artifactId>tagsoup</artifactId>
+      <version>1.2.1</version>
+    </dependency>
+    <dependency>
+      <groupId>de.l3s.boilerpipe</groupId>
+      <artifactId>boilerpipe</artifactId>
+      <version>1.1.0</version>
+    </dependency>
+    <dependency>
+      <groupId>rome</groupId>
+      <artifactId>rome</artifactId>
+      <version>1.0</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.james</groupId>
+      <artifactId>apache-mime4j-core</artifactId>
+      <version>${mime4j.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.james</groupId>
+      <artifactId>apache-mime4j-dom</artifactId>
+      <version>${mime4j.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>com.pff</groupId>
+      <artifactId>java-libpst</artifactId>
+      <version>0.8.1</version>
+    </dependency>
+    <dependency>
+      <groupId>commons-io</groupId>
+      <artifactId>commons-io</artifactId>
+      <version>${commons.io.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-text-module</artifactId>
+      <version>${project.version}</version>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-package-module</artifactId>
+      <version>${project.version}</version>
+      <scope>test</scope>
+    </dependency>
+  </dependencies>
+  
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-dependency-plugin</artifactId>
+      </plugin>
+    </plugins>
+  </build>
+
+</project>
\ No newline at end of file

Added: tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/feed/FeedParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/feed/FeedParser.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/feed/FeedParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/feed/FeedParser.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,127 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.feed;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+
+import com.sun.syndication.feed.synd.SyndContent;
+import com.sun.syndication.feed.synd.SyndEntry;
+import com.sun.syndication.feed.synd.SyndFeed;
+import com.sun.syndication.io.FeedException;
+import com.sun.syndication.io.SyndFeedInput;
+
+/**
+ * Feed parser.
+ * <p>
+ * Uses Rome for parsing the feeds. A feed description is put in a paragraph
+ * with its link and title in an anchor.
+ */
+public class FeedParser extends AbstractParser {
+
+    /** Serial version UID */
+    private static final long serialVersionUID = -3785361933034525186L;
+
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+                    MediaType.application("rss+xml"),
+                    MediaType.application("atom+xml"))));
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        // set the encoding?
+        try {
+            SyndFeed feed = new SyndFeedInput().build(
+                    new InputSource(new CloseShieldInputStream(stream)));
+
+            String title = stripTags(feed.getTitleEx());
+            String description = stripTags(feed.getDescriptionEx());
+
+            metadata.set(TikaCoreProperties.TITLE, title);
+            metadata.set(TikaCoreProperties.DESCRIPTION, description);
+            // store the other fields in the metadata
+
+            XHTMLContentHandler xhtml =
+                new XHTMLContentHandler(handler, metadata);
+            xhtml.startDocument();
+
+            xhtml.element("h1", title);
+            xhtml.element("p", description);
+
+            xhtml.startElement("ul");
+            for (Object e : feed.getEntries()) {
+                SyndEntry entry = (SyndEntry) e;
+                String link = entry.getLink();
+                if (link != null) {
+                    xhtml.startElement("li");
+                    xhtml.startElement("a", "href", link);
+                    xhtml.characters(stripTags(entry.getTitleEx()));
+                    xhtml.endElement("a");
+                    SyndContent content = entry.getDescription();
+                    if (content != null) {
+                        xhtml.newline();
+                        xhtml.characters(stripTags(content));
+                    }
+                    xhtml.endElement("li");
+                }
+            }
+            xhtml.endElement("ul");
+
+            xhtml.endDocument();
+        } catch (FeedException e) {
+            throw new TikaException("RSS parse error", e);
+        }
+
+    }
+
+    private static String stripTags(SyndContent c) {
+        if (c == null)
+            return "";
+
+        String value = c.getValue();
+
+        String[] parts = value.split("<[^>]*>");
+        StringBuffer buf = new StringBuffer();
+
+        for (String part : parts)
+            buf.append(part);
+
+        return buf.toString().trim();
+    }
+}

Added: tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,347 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html;
+
+import java.io.Writer;
+import java.util.ArrayList;
+import java.util.BitSet;
+import java.util.List;
+import java.util.Locale;
+
+import de.l3s.boilerpipe.BoilerpipeExtractor;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.extractors.ArticleExtractor;
+import de.l3s.boilerpipe.extractors.DefaultExtractor;
+import de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.WriteOutContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+/**
+ * Uses the <a href="http://code.google.com/p/boilerpipe/">boilerpipe</a>
+ * library to automatically extract the main content from a web page.
+ * <p/>
+ * Use this as a {@link ContentHandler} object passed to
+ * {@link HtmlParser#parse(java.io.InputStream, ContentHandler, Metadata, org.apache.tika.parser.ParseContext)}
+ */
+public class BoilerpipeContentHandler extends BoilerpipeHTMLContentHandler {
+
+    /**
+     * The newline character that gets inserted after block elements.
+     */
+    private static final char[] NL = new char[]{'\n'};
+    private ContentHandler delegate;
+    private BoilerpipeExtractor extractor;
+    private boolean includeMarkup;
+    private boolean inHeader;
+    private boolean inFooter;
+    private int headerCharOffset;
+    private List<RecordedElement> elements;
+    private TextDocument td;
+    /**
+     * Creates a new boilerpipe-based content extractor, using the
+     * {@link DefaultExtractor} extraction rules and "delegate" as the content handler.
+     *
+     * @param delegate The {@link ContentHandler} object
+     */
+    public BoilerpipeContentHandler(ContentHandler delegate) {
+        this(delegate, DefaultExtractor.INSTANCE);
+    }
+
+    /**
+     * Creates a content handler that writes XHTML body character events to
+     * the given writer.
+     *
+     * @param writer writer
+     */
+    public BoilerpipeContentHandler(Writer writer) {
+        this(new WriteOutContentHandler(writer));
+    }
+
+    /**
+     * Creates a new boilerpipe-based content extractor, using the given
+     * extraction rules. The extracted main content will be passed to the
+     * <delegate> content handler.
+     *
+     * @param delegate  The {@link ContentHandler} object
+     * @param extractor Extraction rules to use, e.g. {@link ArticleExtractor}
+     */
+    public BoilerpipeContentHandler(ContentHandler delegate, BoilerpipeExtractor extractor) {
+        this.td = null;
+        this.delegate = delegate;
+        this.extractor = extractor;
+    }
+
+    public boolean isIncludeMarkup() {
+        return includeMarkup;
+    }
+
+    public void setIncludeMarkup(boolean includeMarkup) {
+        this.includeMarkup = includeMarkup;
+    }
+
+    /**
+     * Retrieves the built TextDocument
+     *
+     * @return TextDocument
+     */
+    public TextDocument getTextDocument() {
+        return td;
+    }
+
+    @Override
+    public void startDocument() throws SAXException {
+        super.startDocument();
+
+        delegate.startDocument();
+
+        inHeader = true;
+        inFooter = false;
+        headerCharOffset = 0;
+
+        if (includeMarkup) {
+            elements = new ArrayList<RecordedElement>();
+        }
+    }
+
+    @Override
+    public void startPrefixMapping(String prefix, String uri) throws SAXException {
+        super.startPrefixMapping(prefix, uri);
+        delegate.startPrefixMapping(prefix, uri);
+    }
+
+    ;
+
+    @Override
+    public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
+        super.startElement(uri, localName, qName, atts);
+
+        if (inHeader) {
+            delegate.startElement(uri, localName, qName, atts);
+        } else if (inFooter) {
+            // Do nothing
+        } else if (includeMarkup) {
+            elements.add(new RecordedElement(uri, localName, qName, atts));
+        } else {
+            // This happens for the <body> element, if we're not doing markup.
+            delegate.startElement(uri, localName, qName, atts);
+        }
+    }
+
+    ;
+
+    @Override
+    public void characters(char[] chars, int offset, int length) throws SAXException {
+        super.characters(chars, offset, length);
+
+        if (inHeader) {
+            delegate.characters(chars, offset, length);
+            headerCharOffset++;
+        } else if (inFooter) {
+            // Do nothing
+        } else if (includeMarkup) {
+            RecordedElement element = elements.get(elements.size() - 1);
+
+            char[] characters = new char[length];
+            System.arraycopy(chars, offset, characters, 0, length);
+            element.getCharacters().add(characters);
+        }
+    }
+
+    ;
+
+    @Override
+    public void endElement(String uri, String localName, String qName) throws SAXException {
+        super.endElement(uri, localName, qName);
+
+        if (inHeader) {
+            delegate.endElement(uri, localName, qName);
+            inHeader = !localName.equals("head");
+        } else if (inFooter) {
+            // Do nothing
+        } else if (localName.equals("body")) {
+            inFooter = true;
+        } else if (includeMarkup) {
+            // Add the end element, and the continuation from the previous element
+            elements.add(new RecordedElement(uri, localName, qName));
+            elements.add(new RecordedElement());
+        }
+    }
+
+    ;
+
+    @Override
+    public void endDocument() throws SAXException {
+        super.endDocument();
+
+        td = toTextDocument();
+        try {
+            extractor.process(td);
+        } catch (BoilerpipeProcessingException e) {
+            throw new SAXException(e);
+        }
+
+        Attributes emptyAttrs = new AttributesImpl();
+
+        // At this point we have all the information we need to either emit N paragraphs
+        // of plain text (if not including markup), or we have to replay our recorded elements
+        // and only emit character runs that passed the boilerpipe filters.
+        if (includeMarkup) {
+            BitSet validCharacterRuns = new BitSet();
+            for (TextBlock block : td.getTextBlocks()) {
+                if (block.isContent()) {
+                    BitSet bs = block.getContainedTextElements();
+                    if (bs != null) {
+                        validCharacterRuns.or(bs);
+                    }
+                }
+            }
+
+            // Now have bits set for all valid character runs. Replay our recorded elements,
+            // but only emit character runs flagged as valid.
+            int curCharsIndex = headerCharOffset;
+
+            for (RecordedElement element : elements) {
+                switch (element.getElementType()) {
+                    case START:
+                        delegate.startElement(element.getUri(), element.getLocalName(), element.getQName(), element.getAttrs());
+                        // Fall through
+
+                    case CONTINUE:
+                        // Now emit characters that are valid. Note that boilerpipe pre-increments the character index, so
+                        // we have to follow suit.
+                        for (char[] chars : element.getCharacters()) {
+                            curCharsIndex++;
+
+                            if (validCharacterRuns.get(curCharsIndex)) {
+                                delegate.characters(chars, 0, chars.length);
+
+                                // https://issues.apache.org/jira/browse/TIKA-961
+                                if (!Character.isWhitespace(chars[chars.length - 1])) {
+                                    // Only add whitespace for certain elements
+                                    if (XHTMLContentHandler.ENDLINE.contains(element.getLocalName())) {
+                                        delegate.ignorableWhitespace(NL, 0, NL.length);
+                                    }
+                                }
+                            }
+                        }
+                        break;
+
+                    case END:
+                        delegate.endElement(element.getUri(), element.getLocalName(), element.getQName());
+                        break;
+
+                    default:
+                        throw new RuntimeException("Unhandled element type: " + element.getElementType());
+                }
+
+
+            }
+        } else {
+            for (TextBlock block : td.getTextBlocks()) {
+                if (block.isContent()) {
+                    delegate.startElement(XHTMLContentHandler.XHTML, "p", "p", emptyAttrs);
+                    char[] chars = block.getText().toCharArray();
+                    delegate.characters(chars, 0, chars.length);
+                    delegate.endElement(XHTMLContentHandler.XHTML, "p", "p");
+                    delegate.ignorableWhitespace(NL, 0, NL.length);
+                }
+            }
+        }
+
+        delegate.endElement(XHTMLContentHandler.XHTML, "body", "body");
+        delegate.endElement(XHTMLContentHandler.XHTML, "html", "html");
+
+        // We defer ending any prefix mapping until here, which is why we don't pass this
+        // through to the delegate in an overridden method.
+        delegate.endPrefixMapping("");
+
+        delegate.endDocument();
+    }
+
+    ;
+
+    private static class RecordedElement {
+        private String uri;
+        private String localName;
+        private String qName;
+        private Attributes attrs;
+        private List<char[]> characters;
+        private ElementType elementType;
+        public RecordedElement(String uri, String localName, String qName, Attributes attrs) {
+            this(uri, localName, qName, attrs, ElementType.START);
+        }
+
+        public RecordedElement(String uri, String localName, String qName) {
+            this(uri, localName, qName, null, ElementType.END);
+        }
+
+        public RecordedElement() {
+            this(null, null, null, null, ElementType.CONTINUE);
+        }
+
+        protected RecordedElement(String uri, String localName, String qName, Attributes attrs, RecordedElement.ElementType elementType) {
+            this.uri = uri;
+            this.localName = localName;
+            this.qName = qName;
+            this.attrs = attrs;
+            this.elementType = elementType;
+            this.characters = new ArrayList<char[]>();
+        }
+
+        @Override
+        public String toString() {
+            return String.format(Locale.ROOT, "<%s> of type %s", localName, elementType);
+        }
+
+        public String getUri() {
+            return uri;
+        }
+
+        public String getLocalName() {
+            return localName;
+        }
+
+        public String getQName() {
+            return qName;
+        }
+
+        public Attributes getAttrs() {
+            return attrs;
+        }
+
+        public List<char[]> getCharacters() {
+            return characters;
+        }
+
+        public RecordedElement.ElementType getElementType() {
+            return elementType;
+        }
+
+        public enum ElementType {
+            START,
+            END,
+            CONTINUE
+        }
+    }
+}

Added: tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,137 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html;
+
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * The default HTML mapping rules in Tika.
+ *
+ * @since Apache Tika 0.6
+ */
+@SuppressWarnings("serial")
+public class DefaultHtmlMapper implements HtmlMapper {
+
+    /**
+     * @since Apache Tika 0.8
+     */
+    public static final HtmlMapper INSTANCE = new DefaultHtmlMapper();
+    // Based on http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd
+    private static final Map<String, String> SAFE_ELEMENTS = new HashMap<String, String>() {{
+        put("H1", "h1");
+        put("H2", "h2");
+        put("H3", "h3");
+        put("H4", "h4");
+        put("H5", "h5");
+        put("H6", "h6");
+
+        put("P", "p");
+        put("PRE", "pre");
+        put("BLOCKQUOTE", "blockquote");
+        put("Q", "q");
+
+        put("UL", "ul");
+        put("OL", "ol");
+        put("MENU", "ul");
+        put("LI", "li");
+        put("DL", "dl");
+        put("DT", "dt");
+        put("DD", "dd");
+
+        put("TABLE", "table");
+        put("THEAD", "thead");
+        put("TBODY", "tbody");
+        put("TR", "tr");
+        put("TH", "th");
+        put("TD", "td");
+
+        put("ADDRESS", "address");
+
+        // TIKA-460 - add anchors
+        put("A", "a");
+
+        // TIKA-463 - add additional elements that contain URLs (and their sub-elements)
+        put("MAP", "map");
+        put("AREA", "area");
+        put("IMG", "img");
+        put("FRAMESET", "frameset");
+        put("FRAME", "frame");
+        put("IFRAME", "iframe");
+        put("OBJECT", "object");
+        put("PARAM", "param");
+        put("INS", "ins");
+        put("DEL", "del");
+    }};
+    private static final Set<String> DISCARDABLE_ELEMENTS = new HashSet<String>() {{
+        add("STYLE");
+        add("SCRIPT");
+    }};
+    // For information on tags & attributes, see:
+    // http://www.w3.org/TR/2002/REC-xhtml1-20020801/dtds.html#a_dtd_XHTML-1.0-Strict
+    // http://www.w3schools.com/TAGS/
+    private static final Map<String, Set<String>> SAFE_ATTRIBUTES = new HashMap<String, Set<String>>() {{
+        put("a", attrSet("charset", "type", "name", "href", "hreflang", "rel", "rev", "shape", "coords"));
+        put("img", attrSet("src", "alt", "longdesc", "height", "width", "usemap", "ismap"));
+        put("frame", attrSet("longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling"));
+        put("iframe", attrSet("longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width"));
+        put("link", attrSet("charset", "href", "hreflang", "type", "rel", "rev", "media"));
+        put("map", attrSet("id", "class", "style", "title", "name"));
+        put("area", attrSet("shape", "coords", "href", "nohref", "alt"));
+        put("object", attrSet("declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height",
+                "width", "usemap", "name", "tabindex", "align", "border", "hspace", "vspace"));
+        put("param", attrSet("id", "name", "value", "valuetype", "type"));
+        put("blockquote", attrSet("cite"));
+        put("ins", attrSet("cite", "datetime"));
+        put("del", attrSet("cite", "datetime"));
+        put("q", attrSet("cite"));
+
+        // TODO - fill out this set. Include core, i18n, etc sets where appropriate.
+    }};
+
+    private static Set<String> attrSet(String... attrs) {
+        Set<String> result = new HashSet<String>();
+        for (String attr : attrs) {
+            result.add(attr);
+        }
+        return result;
+    }
+
+    public String mapSafeElement(String name) {
+        return SAFE_ELEMENTS.get(name);
+    }
+
+    /**
+     * Normalizes an attribute name. Assumes that the element name
+     * is valid and normalized
+     */
+    public String mapSafeAttribute(String elementName, String attributeName) {
+        Set<String> safeAttrs = SAFE_ATTRIBUTES.get(elementName);
+        if ((safeAttrs != null) && safeAttrs.contains(attributeName)) {
+            return attributeName;
+        } else {
+            return null;
+        }
+    }
+
+    public boolean isDiscardElement(String name) {
+        return DISCARDABLE_ELEMENTS.contains(name);
+    }
+
+}

Added: tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,109 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.ByteBuffer;
+import java.nio.charset.Charset;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.tika.detect.EncodingDetector;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.utils.CharsetUtils;
+
+/**
+ * Character encoding detector for determining the character encoding of a
+ * HTML document based on the potential charset parameter found in a
+ * Content-Type http-equiv meta tag somewhere near the beginning. Especially
+ * useful for determining the type among multiple closely related encodings
+ * (ISO-8859-*) for which other types of encoding detection are unreliable.
+ *
+ * @since Apache Tika 1.2
+ */
+public class HtmlEncodingDetector implements EncodingDetector {
+
+    // TIKA-357 - use bigger buffer for meta tag sniffing (was 4K)
+    private static final int META_TAG_BUFFER_SIZE = 8192;
+
+
+    private static final Pattern HTTP_META_PATTERN = Pattern.compile(
+            "(?is)<\\s*meta\\s+([^<>]+)"
+    );
+
+    //this should match both the older:
+    //<meta http-equiv="content-type" content="text/html; charset=xyz"/>
+    //and 
+    //html5 <meta charset="xyz">
+    //See http://webdesign.about.com/od/metatags/qt/meta-charset.htm
+    //for the noisiness that one might encounter in charset attrs.
+    //Chose to go with strict ([-_:\\.a-z0-9]+) to match encodings
+    //following http://docs.oracle.com/javase/7/docs/api/java/nio/charset/Charset.html
+    //For a more general "not" matcher, try:
+    //("(?is)charset\\s*=\\s*['\\\"]?\\s*([^<>\\s'\\\";]+)")
+    private static final Pattern FLEXIBLE_CHARSET_ATTR_PATTERN = Pattern.compile(
+            ("(?is)charset\\s*=\\s*(?:['\\\"]\\s*)?([-_:\\.a-z0-9]+)")
+    );
+
+    private static final Charset ASCII = Charset.forName("US-ASCII");
+
+    public Charset detect(InputStream input, Metadata metadata)
+            throws IOException {
+        if (input == null) {
+            return null;
+        }
+
+        // Read enough of the text stream to capture possible meta tags
+        input.mark(META_TAG_BUFFER_SIZE);
+        byte[] buffer = new byte[META_TAG_BUFFER_SIZE];
+        int n = 0;
+        int m = input.read(buffer);
+        while (m != -1 && n < buffer.length) {
+            n += m;
+            m = input.read(buffer, n, buffer.length - n);
+        }
+        input.reset();
+
+        // Interpret the head as ASCII and try to spot a meta tag with
+        // a possible character encoding hint
+
+        String head = ASCII.decode(ByteBuffer.wrap(buffer, 0, n)).toString();
+
+        Matcher equiv = HTTP_META_PATTERN.matcher(head);
+        Matcher charsetMatcher = FLEXIBLE_CHARSET_ATTR_PATTERN.matcher("");
+        //iterate through meta tags
+        while (equiv.find()) {
+            String attrs = equiv.group(1);
+            charsetMatcher.reset(attrs);
+            //iterate through charset= and return the first match
+            //that is valid
+            while (charsetMatcher.find()) {
+                String candCharset = charsetMatcher.group(1);
+                if (CharsetUtils.isSupported(candCharset)) {
+                    try {
+                        return CharsetUtils.forName(candCharset);
+                    } catch (Exception e) {
+                        //ignore
+                    }
+                }
+            }
+        }
+        return null;
+    }
+
+}

Added: tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,309 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html;
+
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Locale;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.sax.TextContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+class HtmlHandler extends TextContentHandler {
+
+    // List of attributes that need to be resolved.
+    private static final Set<String> URI_ATTRIBUTES =
+            new HashSet<String>(Arrays.asList("src", "href", "longdesc", "cite"));
+    private static final Pattern ICBM =
+            Pattern.compile("\\s*(-?\\d+\\.\\d+)[,\\s]+(-?\\d+\\.\\d+)\\s*");
+    private final HtmlMapper mapper;
+    private final XHTMLContentHandler xhtml;
+    private final Metadata metadata;
+    private final StringBuilder title = new StringBuilder();
+    private int bodyLevel = 0;
+    private int discardLevel = 0;
+    private int titleLevel = 0;
+    private boolean isTitleSetToMetadata = false;
+
+    private HtmlHandler(
+            HtmlMapper mapper, XHTMLContentHandler xhtml, Metadata metadata) {
+        super(xhtml);
+        this.mapper = mapper;
+        this.xhtml = xhtml;
+        this.metadata = metadata;
+
+        // Try to determine the default base URL, if one has not been given
+        if (metadata.get(Metadata.CONTENT_LOCATION) == null) {
+            String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
+            if (name != null) {
+                name = name.trim();
+                try {
+                    new URL(name); // test URL format
+                    metadata.set(Metadata.CONTENT_LOCATION, name);
+                } catch (MalformedURLException e) {
+                    // The resource name is not a valid URL, ignore it
+                }
+            }
+        }
+    }
+
+    public HtmlHandler(
+            HtmlMapper mapper, ContentHandler handler, Metadata metadata) {
+        this(mapper, new XHTMLContentHandler(handler, metadata), metadata);
+    }
+
+    @Override
+    public void startElement(
+            String uri, String local, String name, Attributes atts)
+            throws SAXException {
+        if ("TITLE".equals(name) || titleLevel > 0) {
+            titleLevel++;
+        }
+        if ("BODY".equals(name) || ("FRAMESET".equals(name)) || bodyLevel > 0) {
+            bodyLevel++;
+        }
+        if (mapper.isDiscardElement(name) || discardLevel > 0) {
+            discardLevel++;
+        }
+
+        if (bodyLevel == 0 && discardLevel == 0) {
+            if ("META".equals(name) && atts.getValue("content") != null) {
+                // TIKA-478: For cases where we have either a name or
+                // "http-equiv", assume that XHTMLContentHandler will emit
+                // these in the <head>, thus passing them through safely.
+                if (atts.getValue("http-equiv") != null) {
+                    addHtmlMetadata(
+                            atts.getValue("http-equiv"),
+                            atts.getValue("content"));
+                } else if (atts.getValue("name") != null) {
+                    // Record the meta tag in the metadata
+                    addHtmlMetadata(
+                            atts.getValue("name"),
+                            atts.getValue("content"));
+                } else if (atts.getValue("property") != null) {
+                    // TIKA-983: Handle <meta property="og:xxx" content="yyy" /> tags
+                    metadata.add(
+                            atts.getValue("property"),
+                            atts.getValue("content"));
+                }
+            } else if ("BASE".equals(name) && atts.getValue("href") != null) {
+                startElementWithSafeAttributes("base", atts);
+                xhtml.endElement("base");
+                metadata.set(
+                        Metadata.CONTENT_LOCATION,
+                        resolve(atts.getValue("href")));
+            } else if ("LINK".equals(name)) {
+                startElementWithSafeAttributes("link", atts);
+                xhtml.endElement("link");
+            }
+        }
+
+        if (bodyLevel > 0 && discardLevel == 0) {
+            String safe = mapper.mapSafeElement(name);
+            if (safe != null) {
+                startElementWithSafeAttributes(safe, atts);
+            }
+        }
+
+        title.setLength(0);
+    }
+
+    /**
+     * Adds a metadata setting from the HTML <head/> to the Tika metadata
+     * object. The name and value are normalized where possible.
+     */
+    private void addHtmlMetadata(String name, String value) {
+        if (name == null || value == null) {
+            // ignore
+        } else if (name.equalsIgnoreCase("ICBM")) {
+            Matcher m = ICBM.matcher(value);
+            if (m.matches()) {
+                metadata.set("ICBM", m.group(1) + ", " + m.group(2));
+                metadata.set(Metadata.LATITUDE, m.group(1));
+                metadata.set(Metadata.LONGITUDE, m.group(2));
+            } else {
+                metadata.set("ICBM", value);
+            }
+        } else if (name.equalsIgnoreCase(Metadata.CONTENT_TYPE)) {
+            //don't overwrite Metadata.CONTENT_TYPE!
+            MediaType type = MediaType.parse(value);
+            if (type != null) {
+                metadata.set(TikaCoreProperties.CONTENT_TYPE_HINT, type.toString());
+            } else {
+                metadata.set(TikaCoreProperties.CONTENT_TYPE_HINT, value);
+            }
+        } else {
+            metadata.add(name, value);
+        }
+    }
+
+    private void startElementWithSafeAttributes(String name, Attributes atts) throws SAXException {
+        if (atts.getLength() == 0) {
+            xhtml.startElement(name);
+            return;
+        }
+
+        boolean isObject = name.equals("object");
+        String codebase = null;
+        if (isObject) {
+            codebase = atts.getValue("", "codebase");
+            if (codebase != null) {
+                codebase = resolve(codebase);
+            } else {
+                codebase = metadata.get(Metadata.CONTENT_LOCATION);
+            }
+        }
+
+        AttributesImpl newAttributes = new AttributesImpl(atts);
+        for (int att = 0; att < newAttributes.getLength(); att++) {
+            String attrName = newAttributes.getLocalName(att);
+            String normAttrName = mapper.mapSafeAttribute(name, attrName);
+            if (normAttrName == null) {
+                newAttributes.removeAttribute(att);
+                att--;
+            } else {
+                // We have a remapped attribute name, so set it as it might have changed.
+                newAttributes.setLocalName(att, normAttrName);
+
+                // And resolve relative links. Eventually this should be pushed
+                // into the HtmlMapper code.
+                if (URI_ATTRIBUTES.contains(normAttrName)) {
+                    newAttributes.setValue(att, resolve(newAttributes.getValue(att)));
+                } else if (isObject && "codebase".equals(normAttrName)) {
+                    newAttributes.setValue(att, codebase);
+                } else if (isObject
+                        && ("data".equals(normAttrName)
+                        || "classid".equals(normAttrName))) {
+                    newAttributes.setValue(
+                            att,
+                            resolve(codebase, newAttributes.getValue(att)));
+                }
+            }
+        }
+
+        if ("img".equals(name) && newAttributes.getValue("", "alt") == null) {
+            newAttributes.addAttribute("", "alt", "alt", "CDATA", "");
+        }
+
+        xhtml.startElement(name, newAttributes);
+    }
+
+    @Override
+    public void endElement(
+            String uri, String local, String name) throws SAXException {
+        if (bodyLevel > 0 && discardLevel == 0) {
+            String safe = mapper.mapSafeElement(name);
+            if (safe != null) {
+                xhtml.endElement(safe);
+            } else if (XHTMLContentHandler.ENDLINE.contains(
+                    name.toLowerCase(Locale.ENGLISH))) {
+                // TIKA-343: Replace closing block tags (and <br/>) with a
+                // newline unless the HtmlMapper above has already mapped
+                // them to something else
+                xhtml.newline();
+            }
+        }
+
+        if (titleLevel > 0) {
+            titleLevel--;
+            if (titleLevel == 0 && !isTitleSetToMetadata) {
+                metadata.set(TikaCoreProperties.TITLE, title.toString().trim());
+                isTitleSetToMetadata = true;
+            }
+        }
+        if (bodyLevel > 0) {
+            bodyLevel--;
+        }
+        if (discardLevel > 0) {
+            discardLevel--;
+        }
+    }
+
+    @Override
+    public void characters(char[] ch, int start, int length)
+            throws SAXException {
+        if (titleLevel > 0 && bodyLevel == 0) {
+            title.append(ch, start, length);
+        }
+        if (bodyLevel > 0 && discardLevel == 0) {
+            super.characters(ch, start, length);
+        }
+    }
+
+    @Override
+    public void ignorableWhitespace(char[] ch, int start, int length)
+            throws SAXException {
+        if (bodyLevel > 0 && discardLevel == 0) {
+            super.ignorableWhitespace(ch, start, length);
+        }
+    }
+
+    private String resolve(String url) {
+        return resolve(metadata.get(Metadata.CONTENT_LOCATION), url);
+    }
+
+    private String resolve(String base, String url) {
+        url = url.trim();
+
+        // Return the URL as-is if no base URL is available or if the URL
+        // matches a common non-hierarchical or pseudo URI prefix
+        String lower = url.toLowerCase(Locale.ENGLISH);
+        if (base == null
+                || lower.startsWith("urn:")
+                || lower.startsWith("mailto:")
+                || lower.startsWith("tel:")
+                || lower.startsWith("data:")
+                || lower.startsWith("javascript:")
+                || lower.startsWith("about:")) {
+            return url;
+        }
+
+        try {
+            URL baseURL = new URL(base.trim());
+
+            // We need to handle one special case, where the relativeUrl is
+            // just a query string (like "?pid=1"), and the baseUrl doesn't
+            // end with a '/'. In that case, the URL class removes the last
+            // portion of the path, which we don't want.
+            String path = baseURL.getPath();
+            if (url.startsWith("?") && path.length() > 0 && !path.endsWith("/")) {
+                return new URL(
+                        baseURL.getProtocol(),
+                        baseURL.getHost(), baseURL.getPort(),
+                        baseURL.getPath() + url).toExternalForm();
+            } else {
+                return new URL(baseURL, url).toExternalForm();
+            }
+        } catch (MalformedURLException e) {
+            // Unknown or broken format; just return the URL as received.
+            return url;
+        }
+    }
+
+}

Added: tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/HtmlMapper.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/HtmlMapper.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/HtmlMapper.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/HtmlMapper.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html;
+
+/**
+ * HTML mapper used to make incoming HTML documents easier to handle by
+ * Tika clients. The {@link HtmlParser} looks up an optional HTML mapper from
+ * the parse context and uses it to map parsed HTML to "safe" XHTML. A client
+ * that wants to customize this mapping can place a custom HtmlMapper instance
+ * into the parse context.
+ *
+ * @since Apache Tika 0.6
+ */
+public interface HtmlMapper {
+
+    /**
+     * Maps "safe" HTML element names to semantic XHTML equivalents. If the
+     * given element is unknown or deemed unsafe for inclusion in the parse
+     * output, then this method returns <code>null</code> and the element
+     * will be ignored but the content inside it is still processed. See
+     * the {@link #isDiscardElement(String)} method for a way to discard
+     * the entire contents of an element.
+     *
+     * @param name HTML element name (upper case)
+     * @return XHTML element name (lower case), or
+     * <code>null</code> if the element is unsafe
+     */
+    String mapSafeElement(String name);
+
+    /**
+     * Checks whether all content within the given HTML element should be
+     * discarded instead of including it in the parse output.
+     *
+     * @param name HTML element name (upper case)
+     * @return <code>true</code> if content inside the named element
+     * should be ignored, <code>false</code> otherwise
+     */
+    boolean isDiscardElement(String name);
+
+
+    /**
+     * Maps "safe" HTML attribute names to semantic XHTML equivalents. If the
+     * given attribute is unknown or deemed unsafe for inclusion in the parse
+     * output, then this method returns <code>null</code> and the attribute
+     * will be ignored. This method assumes that the element name
+     * is valid and normalised.
+     *
+     * @param elementName   HTML element name (lower case)
+     * @param attributeName HTML attribute name (lower case)
+     * @return XHTML attribute name (lower case), or
+     * <code>null</code> if the element is unsafe
+     */
+    String mapSafeAttribute(String elementName, String attributeName);
+
+}

Added: tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/HtmlParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/HtmlParser.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,194 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.Charset;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.config.ServiceLoader;
+import org.apache.tika.detect.AutoDetectReader;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.ccil.cowan.tagsoup.HTMLSchema;
+import org.ccil.cowan.tagsoup.Schema;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * HTML parser. Uses TagSoup to turn the input document to HTML SAX events,
+ * and post-processes the events to produce XHTML and metadata expected by
+ * Tika clients.
+ */
+public class HtmlParser extends AbstractParser {
+
+    /**
+     * Serial version UID
+     */
+    private static final long serialVersionUID = 7895315240498733128L;
+
+    private static final MediaType XHTML = MediaType.application("xhtml+xml");
+    private static final MediaType WAP_XHTML = MediaType.application("vnd.wap.xhtml+xml");
+    private static final MediaType X_ASP = MediaType.application("x-asp");
+
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+                    MediaType.text("html"),
+                    XHTML,
+                    WAP_XHTML,
+                    X_ASP)));
+
+    private static final ServiceLoader LOADER =
+            new ServiceLoader(HtmlParser.class.getClassLoader());
+
+    /**
+     * HTML schema singleton used to amortise the heavy instantiation time.
+     */
+    private static final Schema HTML_SCHEMA = new HTMLSchema();
+
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        // Automatically detect the character encoding
+        try (AutoDetectReader reader = new AutoDetectReader(new CloseShieldInputStream(stream),
+                metadata,context.get(ServiceLoader.class, LOADER))) {
+            Charset charset = reader.getCharset();
+            String previous = metadata.get(Metadata.CONTENT_TYPE);
+            MediaType contentType = null;
+            if (previous == null || previous.startsWith("text/html")) {
+                contentType = new MediaType(MediaType.TEXT_HTML, charset);
+            } else if (previous.startsWith("application/xhtml+xml")) {
+                contentType = new MediaType(XHTML, charset);
+            } else if (previous.startsWith("application/vnd.wap.xhtml+xml")) {
+                contentType = new MediaType(WAP_XHTML, charset);
+            } else if (previous.startsWith("application/x-asp")) {
+                contentType = new MediaType(X_ASP, charset);
+            }
+            if (contentType != null) {
+                metadata.set(Metadata.CONTENT_TYPE, contentType.toString());
+            }
+            // deprecated, see TIKA-431
+            metadata.set(Metadata.CONTENT_ENCODING, charset.name());
+
+            // Get the HTML mapper from the parse context
+            HtmlMapper mapper =
+                    context.get(HtmlMapper.class, new HtmlParserMapper());
+
+            // Parse the HTML document
+            org.ccil.cowan.tagsoup.Parser parser =
+                    new org.ccil.cowan.tagsoup.Parser();
+
+            // Use schema from context or default
+            Schema schema = context.get(Schema.class, HTML_SCHEMA);
+
+            // TIKA-528: Reuse share schema to avoid heavy instantiation
+            parser.setProperty(
+                    org.ccil.cowan.tagsoup.Parser.schemaProperty, schema);
+            // TIKA-599: Shared schema is thread-safe only if bogons are ignored
+            parser.setFeature(
+                    org.ccil.cowan.tagsoup.Parser.ignoreBogonsFeature, true);
+
+            parser.setContentHandler(new XHTMLDowngradeHandler(
+                    new HtmlHandler(mapper, handler, metadata)));
+
+            parser.parse(reader.asInputSource());
+        }
+    }
+
+    /**
+     * Maps "safe" HTML element names to semantic XHTML equivalents. If the
+     * given element is unknown or deemed unsafe for inclusion in the parse
+     * output, then this method returns <code>null</code> and the element
+     * will be ignored but the content inside it is still processed. See
+     * the {@link #isDiscardElement(String)} method for a way to discard
+     * the entire contents of an element.
+     * <p/>
+     * Subclasses can override this method to customize the default mapping.
+     *
+     * @param name HTML element name (upper case)
+     * @return XHTML element name (lower case), or
+     * <code>null</code> if the element is unsafe
+     * @since Apache Tika 0.5
+     * @deprecated Use the {@link HtmlMapper} mechanism to customize
+     * the HTML mapping. This method will be removed in Tika 1.0.
+     */
+    protected String mapSafeElement(String name) {
+        return DefaultHtmlMapper.INSTANCE.mapSafeElement(name);
+    }
+
+    /**
+     * Checks whether all content within the given HTML element should be
+     * discarded instead of including it in the parse output. Subclasses
+     * can override this method to customize the set of discarded elements.
+     *
+     * @param name HTML element name (upper case)
+     * @return <code>true</code> if content inside the named element
+     * should be ignored, <code>false</code> otherwise
+     * @since Apache Tika 0.5
+     * @deprecated Use the {@link HtmlMapper} mechanism to customize
+     * the HTML mapping. This method will be removed in Tika 1.0.
+     */
+    protected boolean isDiscardElement(String name) {
+        return DefaultHtmlMapper.INSTANCE.isDiscardElement(name);
+    }
+
+    /**
+     * @deprecated Use the {@link HtmlMapper} mechanism to customize
+     * the HTML mapping. This method will be removed in Tika 1.0.
+     */
+    public String mapSafeAttribute(String elementName, String attributeName) {
+        return DefaultHtmlMapper.INSTANCE.mapSafeAttribute(elementName, attributeName);
+    }
+
+    /**
+     * Adapter class that maintains backwards compatibility with the
+     * protected HtmlParser methods. Making HtmlParser implement HtmlMapper
+     * directly would require those methods to be public, which would break
+     * backwards compatibility with subclasses.
+     *
+     * @deprecated Use the {@link HtmlMapper} mechanism to customize
+     * the HTML mapping. This class will be removed in Tika 1.0.
+     */
+    private class HtmlParserMapper implements HtmlMapper {
+        public String mapSafeElement(String name) {
+            return HtmlParser.this.mapSafeElement(name);
+        }
+
+        public boolean isDiscardElement(String name) {
+            return HtmlParser.this.isDiscardElement(name);
+        }
+
+        public String mapSafeAttribute(String elementName, String attributeName) {
+            return HtmlParser.this.mapSafeAttribute(elementName, attributeName);
+        }
+    }
+
+}

Added: tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/IdentityHtmlMapper.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/IdentityHtmlMapper.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/IdentityHtmlMapper.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/IdentityHtmlMapper.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html;
+
+import java.util.Locale;
+
+/**
+ * Alternative HTML mapping rules that pass the input HTML as-is without any
+ * modifications.
+ *
+ * @since Apache Tika 0.8
+ */
+public class IdentityHtmlMapper implements HtmlMapper {
+
+    public static final HtmlMapper INSTANCE = new IdentityHtmlMapper();
+
+    public boolean isDiscardElement(String name) {
+        return false;
+    }
+
+    public String mapSafeAttribute(String elementName, String attributeName) {
+        return attributeName.toLowerCase(Locale.ENGLISH);
+    }
+
+    public String mapSafeElement(String name) {
+        return name.toLowerCase(Locale.ENGLISH);
+    }
+
+}

Added: tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/XHTMLDowngradeHandler.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/XHTMLDowngradeHandler.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/XHTMLDowngradeHandler.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/XHTMLDowngradeHandler.java Wed Jan  6 03:50:50 2016
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html;
+
+import javax.xml.XMLConstants;
+import java.util.Locale;
+
+import org.apache.tika.sax.ContentHandlerDecorator;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+/**
+ * Content handler decorator that downgrades XHTML elements to
+ * old-style HTML elements before passing them on to the decorated
+ * content handler. This downgrading consists of dropping all namespaces
+ * (and namespaced attributes) and uppercasing all element names.
+ * Used by the {@link HtmlParser} to make all incoming HTML look the same.
+ */
+class XHTMLDowngradeHandler extends ContentHandlerDecorator {
+
+    public XHTMLDowngradeHandler(ContentHandler handler) {
+        super(handler);
+    }
+
+    @Override
+    public void startElement(
+            String uri, String localName, String name, Attributes atts)
+            throws SAXException {
+        String upper = localName.toUpperCase(Locale.ENGLISH);
+
+        AttributesImpl attributes = new AttributesImpl();
+        for (int i = 0; i < atts.getLength(); i++) {
+            String auri = atts.getURI(i);
+            String local = atts.getLocalName(i);
+            String qname = atts.getQName(i);
+            if (XMLConstants.NULL_NS_URI.equals(auri)
+                    && !local.equals(XMLConstants.XMLNS_ATTRIBUTE)
+                    && !qname.startsWith(XMLConstants.XMLNS_ATTRIBUTE + ":")) {
+                attributes.addAttribute(
+                        auri, local, qname, atts.getType(i), atts.getValue(i));
+            }
+        }
+
+        super.startElement(XMLConstants.NULL_NS_URI, upper, upper, attributes);
+    }
+
+    @Override
+    public void endElement(String uri, String localName, String name)
+            throws SAXException {
+        String upper = localName.toUpperCase(Locale.ENGLISH);
+        super.endElement(XMLConstants.NULL_NS_URI, upper, upper);
+    }
+
+    @Override
+    public void startPrefixMapping(String prefix, String uri) {
+    }
+
+    @Override
+    public void endPrefixMapping(String prefix) {
+    }
+
+}