You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by bo...@apache.org on 2016/01/06 04:50:57 UTC
svn commit: r1723223 [29/32] - in /tika/branches/2.x:
tika-core/src/test/resources/META-INF/
tika-core/src/test/resources/META-INF/services/ tika-parser-modules/
tika-parser-modules/tika-advanced-module/
tika-parser-modules/tika-advanced-module/src/ ti...
Added: tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,101 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xml;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+import java.io.InputStream;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.helpers.DefaultHandler;
+
+public class DcXMLParserTest extends TikaTest {
+
+ @Test
+ public void testXMLParserAsciiChars() throws Exception {
+ try (InputStream input = DcXMLParserTest.class.getResourceAsStream(
+ "/test-documents/testXML.xml")) {
+ Metadata metadata = new Metadata();
+ ContentHandler handler = new BodyContentHandler();
+ new DcXMLParser().parse(input, handler, metadata);
+
+ assertEquals(
+ "application/xml",
+ metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("Tika test document", metadata.get(TikaCoreProperties.TITLE));
+ assertEquals("Rida Benjelloun", metadata.get(TikaCoreProperties.CREATOR));
+
+ // The file contains 5 dc:subject tags, which come through as
+ // a multi-valued Tika Metadata entry in file order
+ assertEquals(true, metadata.isMultiValued(TikaCoreProperties.KEYWORDS));
+ assertEquals(5, metadata.getValues(TikaCoreProperties.KEYWORDS).length);
+ assertEquals("Java", metadata.getValues(TikaCoreProperties.KEYWORDS)[0]);
+ assertEquals("XML", metadata.getValues(TikaCoreProperties.KEYWORDS)[1]);
+ assertEquals("XSLT", metadata.getValues(TikaCoreProperties.KEYWORDS)[2]);
+ assertEquals("JDOM", metadata.getValues(TikaCoreProperties.KEYWORDS)[3]);
+ assertEquals("Indexation", metadata.getValues(TikaCoreProperties.KEYWORDS)[4]);
+ assertEquals(true, metadata.isMultiValued(Metadata.SUBJECT));
+ assertEquals(5, metadata.getValues(Metadata.SUBJECT).length);
+ assertEquals("Java", metadata.getValues(Metadata.SUBJECT)[0]);
+ assertEquals("XML", metadata.getValues(Metadata.SUBJECT)[1]);
+ assertEquals("XSLT", metadata.getValues(Metadata.SUBJECT)[2]);
+ assertEquals("JDOM", metadata.getValues(Metadata.SUBJECT)[3]);
+ assertEquals("Indexation", metadata.getValues(Metadata.SUBJECT)[4]);
+
+ assertEquals(
+ "Framework d\'indexation des documents XML, HTML, PDF etc..",
+ metadata.get(TikaCoreProperties.DESCRIPTION));
+ assertEquals(
+ "http://www.apache.org",
+ metadata.get(TikaCoreProperties.IDENTIFIER));
+ assertEquals("test", metadata.get(TikaCoreProperties.TYPE));
+ assertEquals("application/msword", metadata.get(TikaCoreProperties.FORMAT));
+ assertEquals("Fr", metadata.get(TikaCoreProperties.LANGUAGE));
+ assertTrue(metadata.get(TikaCoreProperties.RIGHTS).contains("testing chars"));
+
+ String content = handler.toString();
+ assertContains("Tika test document", content);
+
+ assertEquals("2000-12-01T00:00:00.000Z", metadata.get(TikaCoreProperties.CREATED));
+ }
+ }
+
+ @Test
+ public void testXMLParserNonAsciiChars() throws Exception {
+ try (InputStream input = DcXMLParserTest.class.getResourceAsStream("/test-documents/testXML.xml")) {
+ Metadata metadata = new Metadata();
+ new DcXMLParser().parse(input, new DefaultHandler(), metadata);
+
+ final String expected = "Archim\u00E8de et Lius \u00E0 Ch\u00E2teauneuf testing chars en \u00E9t\u00E9";
+ assertEquals(expected, metadata.get(TikaCoreProperties.RIGHTS));
+ }
+ }
+
+ // TIKA-1048
+ @Test
+ public void testNoSpaces() throws Exception {
+ String text = getXML("testXML2.xml").xml;
+ assertFalse(text.contains("testSubject"));
+ }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/xml/EmptyAndDuplicateElementsXMLParserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/xml/EmptyAndDuplicateElementsXMLParserTest.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/xml/EmptyAndDuplicateElementsXMLParserTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/xml/EmptyAndDuplicateElementsXMLParserTest.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,126 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xml;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.InputStream;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.TeeContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+public class EmptyAndDuplicateElementsXMLParserTest extends TikaTest {
+
+ private Property FIRST_NAME = Property.internalTextBag(
+ "custom" + Metadata.NAMESPACE_PREFIX_DELIMITER + "FirstName");
+ private Property LAST_NAME = Property.internalTextBag(
+ "custom" + Metadata.NAMESPACE_PREFIX_DELIMITER + "LastName");
+
+ @Test
+ public void testDefaultBehavior() throws Exception {
+ try (InputStream input = EmptyAndDuplicateElementsXMLParserTest.class.getResourceAsStream(
+ "/test-documents/testXML3.xml")) {
+ Metadata metadata = new Metadata();
+ ContentHandler handler = new BodyContentHandler();
+ new DefaultCustomXMLTestParser().parse(input, handler, metadata, new ParseContext());
+
+ assertEquals(4, metadata.getValues(FIRST_NAME).length);
+ assertEquals(2, metadata.getValues(LAST_NAME).length);
+
+ assertEquals("John", metadata.getValues(FIRST_NAME)[0]);
+ assertEquals("Smith", metadata.getValues(LAST_NAME)[0]);
+
+ assertEquals("Jane", metadata.getValues(FIRST_NAME)[1]);
+ assertEquals("Doe", metadata.getValues(LAST_NAME)[1]);
+
+ // We didn't know Bob's last name, but now we don't know an entry existed
+ assertEquals("Bob", metadata.getValues(FIRST_NAME)[2]);
+
+ // We don't know Kate's last name because it was a duplicate
+ assertEquals("Kate", metadata.getValues(FIRST_NAME)[3]);
+ }
+ }
+
+ @Test
+ public void testEmptiesAndRepeats() throws Exception {
+ try (InputStream input = EmptyAndDuplicateElementsXMLParserTest.class.getResourceAsStream(
+ "/test-documents/testXML3.xml")) {
+ Metadata metadata = new Metadata();
+ ContentHandler handler = new BodyContentHandler();
+ new AllowEmptiesAndDuplicatesCustomXMLTestParser().parse(input, handler, metadata, new ParseContext());
+
+ assertEquals(4, metadata.getValues(FIRST_NAME).length);
+ assertEquals(4, metadata.getValues(LAST_NAME).length);
+
+ assertEquals("John", metadata.getValues(FIRST_NAME)[0]);
+ assertEquals("Smith", metadata.getValues(LAST_NAME)[0]);
+
+ assertEquals("Jane", metadata.getValues(FIRST_NAME)[1]);
+ assertEquals("Doe", metadata.getValues(LAST_NAME)[1]);
+
+ assertEquals("Bob", metadata.getValues(FIRST_NAME)[2]);
+ assertEquals("", metadata.getValues(LAST_NAME)[2]);
+
+ assertEquals("Kate", metadata.getValues(FIRST_NAME)[3]);
+ assertEquals("Smith", metadata.getValues(LAST_NAME)[3]);
+ }
+ }
+
+ private class DefaultCustomXMLTestParser extends XMLParser {
+
+ private static final long serialVersionUID = 2458579047014545931L;
+
+ protected ElementMetadataHandler getCustomElementHandler(Metadata metadata, Property tikaProperty, String localPart) {
+ return new ElementMetadataHandler(
+ "http://custom",
+ localPart,
+ metadata,
+ tikaProperty);
+ }
+
+ protected ContentHandler getContentHandler(
+ ContentHandler handler, Metadata metadata, ParseContext context) {
+ return new TeeContentHandler(
+ super.getContentHandler(handler, metadata, context),
+ getCustomElementHandler(metadata, FIRST_NAME, "FirstName"),
+ getCustomElementHandler(metadata, LAST_NAME, "LastName"));
+ }
+ }
+
+ private class AllowEmptiesAndDuplicatesCustomXMLTestParser extends DefaultCustomXMLTestParser {
+
+ private static final long serialVersionUID = 3735646809954466229L;
+
+ protected ElementMetadataHandler getCustomElementHandler(Metadata metadata, Property tikaProperty, String localPart) {
+ return new ElementMetadataHandler(
+ "http://custom",
+ localPart,
+ metadata,
+ tikaProperty,
+ true,
+ true);
+ }
+ }
+
+
+}
Added: tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/xml/FictionBookParserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/xml/FictionBookParserTest.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/xml/FictionBookParserTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-text-module/src/test/java/org/apache/tika/parser/xml/FictionBookParserTest.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xml;
+
+import static org.apache.tika.TikaTest.assertContains;
+import static org.junit.Assert.assertEquals;
+
+import java.io.InputStream;
+
+import org.apache.tika.TikaTest.TrackingHandler;
+import org.apache.tika.extractor.ContainerExtractor;
+import org.apache.tika.extractor.ParserContainerExtractor;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+public class FictionBookParserTest {
+
+ @Test
+ public void testFB2() throws Exception {
+ try (InputStream input = FictionBookParserTest.class.getResourceAsStream("/test-documents/test.fb2")) {
+ Metadata metadata = new Metadata();
+ ContentHandler handler = new BodyContentHandler();
+ new FictionBookParser().parse(input, handler, metadata, new ParseContext());
+ String content = handler.toString();
+
+ assertContains("1812", content);
+ }
+ }
+
+ @Test
+ public void testEmbedded() throws Exception {
+ try (InputStream input = FictionBookParserTest.class.getResourceAsStream("/test-documents/test.fb2")) {
+ ContainerExtractor extractor = new ParserContainerExtractor();
+ TikaInputStream stream = TikaInputStream.get(input);
+
+ assertEquals(true, extractor.isSupported(stream));
+
+ // Process it
+ TrackingHandler handler = new TrackingHandler();
+ extractor.extract(stream, null, handler);
+
+ assertEquals(2, handler.filenames.size());
+ }
+ }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-web-module/pom.xml
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-web-module/pom.xml?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-web-module/pom.xml (added)
+++ tika/branches/2.x/tika-parser-modules/tika-web-module/pom.xml Wed Jan 6 03:50:50 2016
@@ -0,0 +1,102 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
+ license agreements. See the NOTICE file distributed with this work for additional
+ information regarding copyright ownership. The ASF licenses this file to
+ you under the Apache License, Version 2.0 (the "License"); you may not use
+ this file except in compliance with the License. You may obtain a copy of
+ the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
+ by applicable law or agreed to in writing, software distributed under the
+ License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+ OF ANY KIND, either express or implied. See the License for the specific
+ language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parser-modules</artifactId>
+ <version>2.0-SNAPSHOT</version>
+ </parent>
+
+ <artifactId>tika-web-module</artifactId>
+ <name>Apache Tika Web Module</name>
+ <url>http://tika.apache.org/</url>
+
+ <properties>
+ <mime4j.version>0.7.2</mime4j.version>
+ </properties>
+
+ <dependencies>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>${project.version}</version>
+ <type>test-jar</type>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.ccil.cowan.tagsoup</groupId>
+ <artifactId>tagsoup</artifactId>
+ <version>1.2.1</version>
+ </dependency>
+ <dependency>
+ <groupId>de.l3s.boilerpipe</groupId>
+ <artifactId>boilerpipe</artifactId>
+ <version>1.1.0</version>
+ </dependency>
+ <dependency>
+ <groupId>rome</groupId>
+ <artifactId>rome</artifactId>
+ <version>1.0</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.james</groupId>
+ <artifactId>apache-mime4j-core</artifactId>
+ <version>${mime4j.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.james</groupId>
+ <artifactId>apache-mime4j-dom</artifactId>
+ <version>${mime4j.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>com.pff</groupId>
+ <artifactId>java-libpst</artifactId>
+ <version>0.8.1</version>
+ </dependency>
+ <dependency>
+ <groupId>commons-io</groupId>
+ <artifactId>commons-io</artifactId>
+ <version>${commons.io.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-text-module</artifactId>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-package-module</artifactId>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ </dependency>
+ </dependencies>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-dependency-plugin</artifactId>
+ </plugin>
+ </plugins>
+ </build>
+
+</project>
\ No newline at end of file
Added: tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/feed/FeedParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/feed/FeedParser.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/feed/FeedParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/feed/FeedParser.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,127 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.feed;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+
+import com.sun.syndication.feed.synd.SyndContent;
+import com.sun.syndication.feed.synd.SyndEntry;
+import com.sun.syndication.feed.synd.SyndFeed;
+import com.sun.syndication.io.FeedException;
+import com.sun.syndication.io.SyndFeedInput;
+
+/**
+ * Feed parser.
+ * <p>
+ * Uses Rome for parsing the feeds. A feed description is put in a paragraph
+ * with its link and title in an anchor.
+ */
+public class FeedParser extends AbstractParser {
+
+    /** Serial version UID */
+    private static final long serialVersionUID = -3785361933034525186L;
+
+    /** Media types handled by this parser: application/rss+xml and application/atom+xml. */
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+                    MediaType.application("rss+xml"),
+                    MediaType.application("atom+xml"))));
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Parses the feed with Rome, stores its title and description in the metadata and
+     * writes them, plus one linked list item per entry that has a link, as XHTML.
+     * @throws TikaException if Rome reports a FeedException
+     */
+    @Override
+    public void parse(
+            InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        // set the encoding?
+        try {
+            // Rome only sees a close-shielded wrapper, never the caller's stream itself
+            SyndFeed feed = new SyndFeedInput().build(
+                    new InputSource(new CloseShieldInputStream(stream)));
+
+            String title = stripTags(feed.getTitleEx());
+            String description = stripTags(feed.getDescriptionEx());
+
+            metadata.set(TikaCoreProperties.TITLE, title);
+            metadata.set(TikaCoreProperties.DESCRIPTION, description);
+            // store the other fields in the metadata
+
+            XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+            xhtml.startDocument();
+
+            xhtml.element("h1", title);
+            xhtml.element("p", description);
+
+            xhtml.startElement("ul");
+            for (Object e : feed.getEntries()) {
+                SyndEntry entry = (SyndEntry) e;
+                String link = entry.getLink();
+                if (link != null) { // entries without a link are not written
+                    xhtml.startElement("li");
+                    xhtml.startElement("a", "href", link);
+                    xhtml.characters(stripTags(entry.getTitleEx()));
+                    xhtml.endElement("a");
+                    SyndContent content = entry.getDescription();
+                    if (content != null) {
+                        xhtml.newline();
+                        xhtml.characters(stripTags(content));
+                    }
+                    xhtml.endElement("li");
+                }
+            }
+            xhtml.endElement("ul");
+
+            xhtml.endDocument();
+        } catch (FeedException e) {
+            throw new TikaException("RSS parse error", e);
+        }
+    }
+
+    /** Returns the content's value with every {@code <...>} tag removed and trimmed; "" if content or value is null. */
+    private static String stripTags(SyndContent c) {
+        String value = (c == null) ? null : c.getValue();
+        if (value == null) { // nothing to strip; previously a null value failed on split()
+            return "";
+        }
+        return value.replaceAll("<[^>]*>", "").trim();
+    }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,347 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html;
+
+import java.io.Writer;
+import java.util.ArrayList;
+import java.util.BitSet;
+import java.util.List;
+import java.util.Locale;
+
+import de.l3s.boilerpipe.BoilerpipeExtractor;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.extractors.ArticleExtractor;
+import de.l3s.boilerpipe.extractors.DefaultExtractor;
+import de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.WriteOutContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+/**
+ * Uses the <a href="http://code.google.com/p/boilerpipe/">boilerpipe</a>
+ * library to automatically extract the main content from a web page.
+ * <p>
+ * Use this as a {@link ContentHandler} object passed to
+ * {@link HtmlParser#parse(java.io.InputStream, ContentHandler, Metadata, org.apache.tika.parser.ParseContext)}
+ */
+public class BoilerpipeContentHandler extends BoilerpipeHTMLContentHandler {
+
+    /**
+     * The newline character that gets inserted after block elements.
+     */
+    private static final char[] NL = new char[]{'\n'};
+    private ContentHandler delegate; // receives the forwarded and filtered output events
+    private BoilerpipeExtractor extractor; // applied to the text document in endDocument()
+    private boolean includeMarkup; // replay recorded elements instead of emitting plain paragraphs
+    private boolean inHeader; // set by startDocument(), cleared when the "head" element ends
+    private boolean inFooter; // set when the "body" element ends
+    private int headerCharOffset; // number of characters() calls seen while inHeader
+    private List<RecordedElement> elements; // body events kept for replay (includeMarkup only)
+    private TextDocument td; // built in endDocument(); null before that
+    /**
+     * Creates a new boilerpipe-based content extractor, using the
+     * {@link DefaultExtractor} extraction rules and "delegate" as the content handler.
+     *
+     * @param delegate The {@link ContentHandler} object
+     */
+    public BoilerpipeContentHandler(ContentHandler delegate) {
+        this(delegate, DefaultExtractor.INSTANCE);
+    }
+
+    /**
+     * Creates a content handler that writes XHTML body character events to
+     * the given writer.
+     *
+     * @param writer writer
+     */
+    public BoilerpipeContentHandler(Writer writer) {
+        this(new WriteOutContentHandler(writer));
+    }
+
+    /**
+     * Creates a new boilerpipe-based content extractor, using the given
+     * extraction rules. The extracted main content will be passed to the
+     * {@code delegate} content handler.
+     *
+     * @param delegate The {@link ContentHandler} object
+     * @param extractor Extraction rules to use, e.g. {@link ArticleExtractor}
+     */
+    public BoilerpipeContentHandler(ContentHandler delegate, BoilerpipeExtractor extractor) {
+        this.td = null;
+        this.delegate = delegate;
+        this.extractor = extractor;
+    }
+
+    /** @return whether recorded markup is replayed instead of plain paragraphs being emitted */
+    public boolean isIncludeMarkup() {
+        return includeMarkup;
+    }
+
+    /**
+     * Selects markup replay instead of plain paragraph output. Call this before
+     * {@link #startDocument()}: the list that records elements is only created there.
+     */
+    public void setIncludeMarkup(boolean includeMarkup) {
+        this.includeMarkup = includeMarkup;
+    }
+
+    /**
+     * Retrieves the built TextDocument
+     *
+     * @return TextDocument
+     */
+    public TextDocument getTextDocument() {
+        return td;
+    }
+
+    @Override
+    public void startDocument() throws SAXException {
+        super.startDocument();
+
+        delegate.startDocument();
+
+        inHeader = true;
+        inFooter = false;
+        headerCharOffset = 0;
+
+        if (includeMarkup) {
+            elements = new ArrayList<RecordedElement>();
+        }
+    }
+
+    @Override
+    public void startPrefixMapping(String prefix, String uri) throws SAXException {
+        super.startPrefixMapping(prefix, uri);
+        delegate.startPrefixMapping(prefix, uri);
+    }
+
+    @Override
+    public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
+        super.startElement(uri, localName, qName, atts);
+
+        if (inHeader) {
+            delegate.startElement(uri, localName, qName, atts);
+        } else if (inFooter) {
+            // Do nothing
+        } else if (includeMarkup) {
+            elements.add(new RecordedElement(uri, localName, qName, atts));
+        } else {
+            // This happens for the <body> element, if we're not doing markup.
+            delegate.startElement(uri, localName, qName, atts);
+        }
+    }
+
+    /** Forwards header text to the delegate; with includeMarkup, body text is copied into the last recorded element. */
+    @Override
+    public void characters(char[] chars, int offset, int length) throws SAXException {
+        super.characters(chars, offset, length);
+
+        if (inHeader) {
+            delegate.characters(chars, offset, length);
+            headerCharOffset++;
+        } else if (inFooter) {
+            // Do nothing
+        } else if (includeMarkup) {
+            RecordedElement element = elements.get(elements.size() - 1);
+
+            char[] characters = new char[length];
+            System.arraycopy(chars, offset, characters, 0, length);
+            element.getCharacters().add(characters);
+        }
+    }
+
+    /** Forwards header end tags, enters the footer when "body" ends, and otherwise records the end tag if includeMarkup. */
+    @Override
+    public void endElement(String uri, String localName, String qName) throws SAXException {
+        super.endElement(uri, localName, qName);
+
+        if (inHeader) {
+            delegate.endElement(uri, localName, qName);
+            inHeader = !localName.equals("head");
+        } else if (inFooter) {
+            // Do nothing
+        } else if (localName.equals("body")) {
+            inFooter = true;
+        } else if (includeMarkup) {
+            // Add the end element, and the continuation from the previous element
+            elements.add(new RecordedElement(uri, localName, qName));
+            elements.add(new RecordedElement());
+        }
+    }
+
+    /** Runs the extractor, then emits either the recorded markup with only the accepted character
+     *  runs or one paragraph per content block, and finally closes body, html and the document. */
+    @Override
+    public void endDocument() throws SAXException {
+        super.endDocument();
+
+        td = toTextDocument();
+        try {
+            extractor.process(td);
+        } catch (BoilerpipeProcessingException e) {
+            throw new SAXException(e);
+        }
+
+        Attributes emptyAttrs = new AttributesImpl();
+
+        // At this point we have all the information we need to either emit N paragraphs
+        // of plain text (if not including markup), or we have to replay our recorded elements
+        // and only emit character runs that passed the boilerpipe filters.
+        if (includeMarkup) {
+            BitSet validCharacterRuns = new BitSet();
+            for (TextBlock block : td.getTextBlocks()) {
+                if (block.isContent()) {
+                    BitSet bs = block.getContainedTextElements();
+                    if (bs != null) {
+                        validCharacterRuns.or(bs);
+                    }
+                }
+            }
+
+            // Now have bits set for all valid character runs. Replay our recorded elements,
+            // but only emit character runs flagged as valid.
+            int curCharsIndex = headerCharOffset;
+
+            for (RecordedElement element : elements) {
+                switch (element.getElementType()) {
+                    case START:
+                        delegate.startElement(element.getUri(), element.getLocalName(), element.getQName(), element.getAttrs());
+                        // Fall through
+
+                    case CONTINUE:
+                        // Now emit characters that are valid. Note that boilerpipe pre-increments the character index, so
+                        // we have to follow suit.
+                        for (char[] chars : element.getCharacters()) {
+                            curCharsIndex++;
+
+                            if (validCharacterRuns.get(curCharsIndex)) {
+                                delegate.characters(chars, 0, chars.length);
+
+                                // https://issues.apache.org/jira/browse/TIKA-961
+                                if (!Character.isWhitespace(chars[chars.length - 1])) {
+                                    // Only add whitespace for certain elements
+                                    if (XHTMLContentHandler.ENDLINE.contains(element.getLocalName())) {
+                                        delegate.ignorableWhitespace(NL, 0, NL.length);
+                                    }
+                                }
+                            }
+                        }
+                        break;
+
+                    case END:
+                        delegate.endElement(element.getUri(), element.getLocalName(), element.getQName());
+                        break;
+
+                    default:
+                        throw new RuntimeException("Unhandled element type: " + element.getElementType());
+                }
+            }
+        } else {
+            for (TextBlock block : td.getTextBlocks()) {
+                if (block.isContent()) {
+                    delegate.startElement(XHTMLContentHandler.XHTML, "p", "p", emptyAttrs);
+                    char[] chars = block.getText().toCharArray();
+                    delegate.characters(chars, 0, chars.length);
+                    delegate.endElement(XHTMLContentHandler.XHTML, "p", "p");
+                    delegate.ignorableWhitespace(NL, 0, NL.length);
+                }
+            }
+        }
+
+        delegate.endElement(XHTMLContentHandler.XHTML, "body", "body");
+        delegate.endElement(XHTMLContentHandler.XHTML, "html", "html");
+
+        // We defer ending any prefix mapping until here, which is why we don't pass this
+        // through to the delegate in an overridden method.
+        delegate.endPrefixMapping("");
+
+        delegate.endDocument();
+    }
+
+    /** One recorded body event: a start tag, an end tag, or a continuation that only carries character runs. */
+    private static class RecordedElement {
+        private String uri;
+        private String localName;
+        private String qName;
+        private Attributes attrs;
+        private List<char[]> characters;
+        private ElementType elementType;
+        public RecordedElement(String uri, String localName, String qName, Attributes attrs) {
+            this(uri, localName, qName, attrs, ElementType.START);
+        }
+
+        public RecordedElement(String uri, String localName, String qName) {
+            this(uri, localName, qName, null, ElementType.END);
+        }
+
+        public RecordedElement() {
+            this(null, null, null, null, ElementType.CONTINUE);
+        }
+
+        protected RecordedElement(String uri, String localName, String qName, Attributes attrs, RecordedElement.ElementType elementType) {
+            this.uri = uri;
+            this.localName = localName;
+            this.qName = qName;
+            // Keep a private snapshot: the attributes are replayed in endDocument(), long after the
+            // startElement() callback that supplied them has returned.
+            this.attrs = (attrs == null) ? null : new AttributesImpl(attrs);
+            this.elementType = elementType;
+            this.characters = new ArrayList<char[]>();
+        }
+
+        @Override
+        public String toString() {
+            return String.format(Locale.ROOT, "<%s> of type %s", localName, elementType);
+        }
+
+        public String getUri() {
+            return uri;
+        }
+
+        public String getLocalName() {
+            return localName;
+        }
+
+        public String getQName() {
+            return qName;
+        }
+
+        public Attributes getAttrs() {
+            return attrs;
+        }
+
+        public List<char[]> getCharacters() {
+            return characters;
+        }
+
+        public RecordedElement.ElementType getElementType() {
+            return elementType;
+        }
+
+        public enum ElementType {
+            START,
+            END,
+            CONTINUE
+        }
+    }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,137 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html;
+
+ import java.util.Collections;
+ import java.util.HashMap;
+ import java.util.HashSet;
+ import java.util.Map;
+ import java.util.Set;
+
+/**
+ * The default HTML mapping rules in Tika.
+ *
+ * @since Apache Tika 0.6
+ */
+@SuppressWarnings("serial")
+public class DefaultHtmlMapper implements HtmlMapper {
+
+ /**
+ * @since Apache Tika 0.8
+ */
+ public static final HtmlMapper INSTANCE = new DefaultHtmlMapper();
+ // Based on http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd
+ private static final Map<String, String> SAFE_ELEMENTS = new HashMap<String, String>() {{
+ put("H1", "h1");
+ put("H2", "h2");
+ put("H3", "h3");
+ put("H4", "h4");
+ put("H5", "h5");
+ put("H6", "h6");
+
+ put("P", "p");
+ put("PRE", "pre");
+ put("BLOCKQUOTE", "blockquote");
+ put("Q", "q");
+
+ put("UL", "ul");
+ put("OL", "ol");
+ put("MENU", "ul");
+ put("LI", "li");
+ put("DL", "dl");
+ put("DT", "dt");
+ put("DD", "dd");
+
+ put("TABLE", "table");
+ put("THEAD", "thead");
+ put("TBODY", "tbody");
+ put("TR", "tr");
+ put("TH", "th");
+ put("TD", "td");
+
+ put("ADDRESS", "address");
+
+ // TIKA-460 - add anchors
+ put("A", "a");
+
+ // TIKA-463 - add additional elements that contain URLs (and their sub-elements)
+ put("MAP", "map");
+ put("AREA", "area");
+ put("IMG", "img");
+ put("FRAMESET", "frameset");
+ put("FRAME", "frame");
+ put("IFRAME", "iframe");
+ put("OBJECT", "object");
+ put("PARAM", "param");
+ put("INS", "ins");
+ put("DEL", "del");
+ }};
+ private static final Set<String> DISCARDABLE_ELEMENTS = new HashSet<String>() {{
+ add("STYLE");
+ add("SCRIPT");
+ }};
+ // For information on tags & attributes, see:
+ // http://www.w3.org/TR/2002/REC-xhtml1-20020801/dtds.html#a_dtd_XHTML-1.0-Strict
+ // http://www.w3schools.com/TAGS/
+ private static final Map<String, Set<String>> SAFE_ATTRIBUTES = new HashMap<String, Set<String>>() {{
+ put("a", attrSet("charset", "type", "name", "href", "hreflang", "rel", "rev", "shape", "coords"));
+ put("img", attrSet("src", "alt", "longdesc", "height", "width", "usemap", "ismap"));
+ put("frame", attrSet("longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling"));
+ put("iframe", attrSet("longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width"));
+ put("link", attrSet("charset", "href", "hreflang", "type", "rel", "rev", "media"));
+ put("map", attrSet("id", "class", "style", "title", "name"));
+ put("area", attrSet("shape", "coords", "href", "nohref", "alt"));
+ put("object", attrSet("declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height",
+ "width", "usemap", "name", "tabindex", "align", "border", "hspace", "vspace"));
+ put("param", attrSet("id", "name", "value", "valuetype", "type"));
+ put("blockquote", attrSet("cite"));
+ put("ins", attrSet("cite", "datetime"));
+ put("del", attrSet("cite", "datetime"));
+ put("q", attrSet("cite"));
+
+ // TODO - fill out this set. Include core, i18n, etc sets where appropriate.
+ }};
+
+ private static Set<String> attrSet(String... attrs) {
+ Set<String> result = new HashSet<String>();
+ for (String attr : attrs) {
+ result.add(attr);
+ }
+ return result;
+ }
+
+ public String mapSafeElement(String name) {
+ return SAFE_ELEMENTS.get(name);
+ }
+
+ /**
+ * Normalizes an attribute name. Assumes that the element name
+ * is valid and normalized
+ */
+ public String mapSafeAttribute(String elementName, String attributeName) {
+ Set<String> safeAttrs = SAFE_ATTRIBUTES.get(elementName);
+ if ((safeAttrs != null) && safeAttrs.contains(attributeName)) {
+ return attributeName;
+ } else {
+ return null;
+ }
+ }
+
+ public boolean isDiscardElement(String name) {
+ return DISCARDABLE_ELEMENTS.contains(name);
+ }
+
+}
Added: tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,109 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.ByteBuffer;
+import java.nio.charset.Charset;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.tika.detect.EncodingDetector;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.utils.CharsetUtils;
+
+/**
+ * Character encoding detector for determining the character encoding of a
+ * HTML document based on the potential charset parameter found in a
+ * Content-Type http-equiv meta tag somewhere near the beginning. Especially
+ * useful for determining the type among multiple closely related encodings
+ * (ISO-8859-*) for which other types of encoding detection are unreliable.
+ *
+ * @since Apache Tika 1.2
+ */
+public class HtmlEncodingDetector implements EncodingDetector {
+
+ // TIKA-357 - use bigger buffer for meta tag sniffing (was 4K)
+ private static final int META_TAG_BUFFER_SIZE = 8192;
+
+
+ private static final Pattern HTTP_META_PATTERN = Pattern.compile(
+ "(?is)<\\s*meta\\s+([^<>]+)"
+ );
+
+ //this should match both the older:
+ //<meta http-equiv="content-type" content="text/html; charset=xyz"/>
+ //and
+ //html5 <meta charset="xyz">
+ //See http://webdesign.about.com/od/metatags/qt/meta-charset.htm
+ //for the noisiness that one might encounter in charset attrs.
+ //Chose to go with strict ([-_:\\.a-z0-9]+) to match encodings
+ //following http://docs.oracle.com/javase/7/docs/api/java/nio/charset/Charset.html
+ //For a more general "not" matcher, try:
+ //("(?is)charset\\s*=\\s*['\\\"]?\\s*([^<>\\s'\\\";]+)")
+ private static final Pattern FLEXIBLE_CHARSET_ATTR_PATTERN = Pattern.compile(
+ ("(?is)charset\\s*=\\s*(?:['\\\"]\\s*)?([-_:\\.a-z0-9]+)")
+ );
+
+ private static final Charset ASCII = Charset.forName("US-ASCII");
+
+ public Charset detect(InputStream input, Metadata metadata)
+ throws IOException {
+ if (input == null) {
+ return null;
+ }
+
+ // Read enough of the text stream to capture possible meta tags
+ input.mark(META_TAG_BUFFER_SIZE);
+ byte[] buffer = new byte[META_TAG_BUFFER_SIZE];
+ int n = 0;
+ int m = input.read(buffer);
+ while (m != -1 && n < buffer.length) {
+ n += m;
+ m = input.read(buffer, n, buffer.length - n);
+ }
+ input.reset();
+
+ // Interpret the head as ASCII and try to spot a meta tag with
+ // a possible character encoding hint
+
+ String head = ASCII.decode(ByteBuffer.wrap(buffer, 0, n)).toString();
+
+ Matcher equiv = HTTP_META_PATTERN.matcher(head);
+ Matcher charsetMatcher = FLEXIBLE_CHARSET_ATTR_PATTERN.matcher("");
+ //iterate through meta tags
+ while (equiv.find()) {
+ String attrs = equiv.group(1);
+ charsetMatcher.reset(attrs);
+ //iterate through charset= and return the first match
+ //that is valid
+ while (charsetMatcher.find()) {
+ String candCharset = charsetMatcher.group(1);
+ if (CharsetUtils.isSupported(candCharset)) {
+ try {
+ return CharsetUtils.forName(candCharset);
+ } catch (Exception e) {
+ //ignore
+ }
+ }
+ }
+ }
+ return null;
+ }
+
+}
Added: tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,309 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html;
+
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Locale;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.sax.TextContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+class HtmlHandler extends TextContentHandler {
+
+ // List of attributes that need to be resolved.
+ private static final Set<String> URI_ATTRIBUTES =
+ new HashSet<String>(Arrays.asList("src", "href", "longdesc", "cite"));
+ private static final Pattern ICBM =
+ Pattern.compile("\\s*(-?\\d+\\.\\d+)[,\\s]+(-?\\d+\\.\\d+)\\s*");
+ private final HtmlMapper mapper;
+ private final XHTMLContentHandler xhtml;
+ private final Metadata metadata;
+ private final StringBuilder title = new StringBuilder();
+ private int bodyLevel = 0;
+ private int discardLevel = 0;
+ private int titleLevel = 0;
+ private boolean isTitleSetToMetadata = false;
+
+ /**
+  * Stores the collaborators and, when the metadata has no content location
+  * yet, adopts the trimmed resource name as content location if it parses
+  * as a URL.
+  */
+ private HtmlHandler(
+         HtmlMapper mapper, XHTMLContentHandler xhtml, Metadata metadata) {
+     super(xhtml);
+     this.mapper = mapper;
+     this.xhtml = xhtml;
+     this.metadata = metadata;
+
+     // A base URL that was given explicitly is left alone
+     if (metadata.get(Metadata.CONTENT_LOCATION) != null) {
+         return;
+     }
+     String resourceName = metadata.get(Metadata.RESOURCE_NAME_KEY);
+     if (resourceName == null) {
+         return;
+     }
+     String candidate = resourceName.trim();
+     try {
+         new URL(candidate); // test URL format
+         metadata.set(Metadata.CONTENT_LOCATION, candidate);
+     } catch (MalformedURLException e) {
+         // The resource name is not a valid URL, ignore it
+     }
+ }
+
+ /**
+ * Creates a handler that wraps the given downstream handler in a new
+ * {@link XHTMLContentHandler} and passes it to the private constructor.
+ */
+ public HtmlHandler(
+ HtmlMapper mapper, ContentHandler handler, Metadata metadata) {
+ this(mapper, new XHTMLContentHandler(handler, metadata), metadata);
+ }
+
+ @Override
+ public void startElement(
+ String uri, String local, String name, Attributes atts)
+ throws SAXException {
+ if ("TITLE".equals(name) || titleLevel > 0) {
+ titleLevel++;
+ }
+ if ("BODY".equals(name) || ("FRAMESET".equals(name)) || bodyLevel > 0) {
+ bodyLevel++;
+ }
+ if (mapper.isDiscardElement(name) || discardLevel > 0) {
+ discardLevel++;
+ }
+
+ if (bodyLevel == 0 && discardLevel == 0) {
+ if ("META".equals(name) && atts.getValue("content") != null) {
+ // TIKA-478: For cases where we have either a name or
+ // "http-equiv", assume that XHTMLContentHandler will emit
+ // these in the <head>, thus passing them through safely.
+ if (atts.getValue("http-equiv") != null) {
+ addHtmlMetadata(
+ atts.getValue("http-equiv"),
+ atts.getValue("content"));
+ } else if (atts.getValue("name") != null) {
+ // Record the meta tag in the metadata
+ addHtmlMetadata(
+ atts.getValue("name"),
+ atts.getValue("content"));
+ } else if (atts.getValue("property") != null) {
+ // TIKA-983: Handle <meta property="og:xxx" content="yyy" /> tags
+ metadata.add(
+ atts.getValue("property"),
+ atts.getValue("content"));
+ }
+ } else if ("BASE".equals(name) && atts.getValue("href") != null) {
+ startElementWithSafeAttributes("base", atts);
+ xhtml.endElement("base");
+ metadata.set(
+ Metadata.CONTENT_LOCATION,
+ resolve(atts.getValue("href")));
+ } else if ("LINK".equals(name)) {
+ startElementWithSafeAttributes("link", atts);
+ xhtml.endElement("link");
+ }
+ }
+
+ if (bodyLevel > 0 && discardLevel == 0) {
+ String safe = mapper.mapSafeElement(name);
+ if (safe != null) {
+ startElementWithSafeAttributes(safe, atts);
+ }
+ }
+
+ title.setLength(0);
+ }
+
+ /**
+  * Copies one name/value pair from the HTML head into the Tika metadata.
+  * ICBM coordinates and the content type get special treatment; any other
+  * pair is added as-is. A <code>null</code> name or value is ignored.
+  */
+ private void addHtmlMetadata(String name, String value) {
+     if (name == null || value == null) {
+         return;
+     }
+
+     if (name.equalsIgnoreCase("ICBM")) {
+         Matcher coords = ICBM.matcher(value);
+         if (!coords.matches()) {
+             // Not a recognizable coordinate pair; keep the raw value
+             metadata.set("ICBM", value);
+             return;
+         }
+         String latitude = coords.group(1);
+         String longitude = coords.group(2);
+         metadata.set("ICBM", latitude + ", " + longitude);
+         metadata.set(Metadata.LATITUDE, latitude);
+         metadata.set(Metadata.LONGITUDE, longitude);
+         return;
+     }
+
+     if (name.equalsIgnoreCase(Metadata.CONTENT_TYPE)) {
+         // Stored as a hint only, so Metadata.CONTENT_TYPE is not overwritten
+         MediaType parsed = MediaType.parse(value);
+         String hint = (parsed != null) ? parsed.toString() : value;
+         metadata.set(TikaCoreProperties.CONTENT_TYPE_HINT, hint);
+         return;
+     }
+
+     metadata.add(name, value);
+ }
+
+ /**
+ * Starts the named XHTML element with only those attributes that the
+ * mapper accepts. URI-valued attributes are resolved against the current
+ * base, and an img element that has attributes but no alt gets an empty one.
+ *
+ * @param name element name to emit; it is passed to the mapper as the element name
+ * @param atts attributes of the incoming element
+ */
+ private void startElementWithSafeAttributes(String name, Attributes atts) throws SAXException {
+ // No attributes: nothing to filter. Note that this also skips the
+ // alt handling below, so an attribute-less img gets no synthetic alt.
+ if (atts.getLength() == 0) {
+ xhtml.startElement(name);
+ return;
+ }
+
+ // For object elements the codebase is the base for data/classid; it
+ // falls back to the document's content location, which may be null.
+ boolean isObject = name.equals("object");
+ String codebase = null;
+ if (isObject) {
+ codebase = atts.getValue("", "codebase");
+ if (codebase != null) {
+ codebase = resolve(codebase);
+ } else {
+ codebase = metadata.get(Metadata.CONTENT_LOCATION);
+ }
+ }
+
+ // Filter a copy in place; the caller's attributes are not modified
+ AttributesImpl newAttributes = new AttributesImpl(atts);
+ for (int att = 0; att < newAttributes.getLength(); att++) {
+ String attrName = newAttributes.getLocalName(att);
+ String normAttrName = mapper.mapSafeAttribute(name, attrName);
+ if (normAttrName == null) {
+ // Removal shifts the following attributes down by one, so step
+ // the index back to revisit the same position
+ newAttributes.removeAttribute(att);
+ att--;
+ } else {
+ // We have a remapped attribute name, so set it as it might have changed.
+ // Only the local name is updated here; the qName is left as copied.
+ newAttributes.setLocalName(att, normAttrName);
+
+ // And resolve relative links. Eventually this should be pushed
+ // into the HtmlMapper code.
+ if (URI_ATTRIBUTES.contains(normAttrName)) {
+ newAttributes.setValue(att, resolve(newAttributes.getValue(att)));
+ } else if (isObject && "codebase".equals(normAttrName)) {
+ newAttributes.setValue(att, codebase);
+ } else if (isObject
+ && ("data".equals(normAttrName)
+ || "classid".equals(normAttrName))) {
+ newAttributes.setValue(
+ att,
+ resolve(codebase, newAttributes.getValue(att)));
+ }
+ }
+ }
+
+ // Checked after filtering, so an alt rejected by the mapper is replaced too
+ if ("img".equals(name) && newAttributes.getValue("", "alt") == null) {
+ newAttributes.addAttribute("", "alt", "alt", "CDATA", "");
+ }
+
+ xhtml.startElement(name, newAttributes);
+ }
+
+ @Override
+ public void endElement(
+ String uri, String local, String name) throws SAXException {
+ if (bodyLevel > 0 && discardLevel == 0) {
+ String safe = mapper.mapSafeElement(name);
+ if (safe != null) {
+ xhtml.endElement(safe);
+ } else if (XHTMLContentHandler.ENDLINE.contains(
+ name.toLowerCase(Locale.ENGLISH))) {
+ // TIKA-343: Replace closing block tags (and <br/>) with a
+ // newline unless the HtmlMapper above has already mapped
+ // them to something else
+ xhtml.newline();
+ }
+ }
+
+ if (titleLevel > 0) {
+ titleLevel--;
+ if (titleLevel == 0 && !isTitleSetToMetadata) {
+ metadata.set(TikaCoreProperties.TITLE, title.toString().trim());
+ isTitleSetToMetadata = true;
+ }
+ }
+ if (bodyLevel > 0) {
+ bodyLevel--;
+ }
+ if (discardLevel > 0) {
+ discardLevel--;
+ }
+ }
+
+ /**
+  * Collects title text while outside the body; inside the body the text is
+  * forwarded unless it belongs to discarded content.
+  */
+ @Override
+ public void characters(char[] ch, int start, int length)
+         throws SAXException {
+     if (bodyLevel == 0) {
+         if (titleLevel > 0) {
+             title.append(ch, start, length);
+         }
+     } else if (discardLevel == 0) {
+         super.characters(ch, start, length);
+     }
+ }
+
+ /**
+  * Forwards ignorable whitespace only while inside the body and outside
+  * discarded content.
+  */
+ @Override
+ public void ignorableWhitespace(char[] ch, int start, int length)
+         throws SAXException {
+     if (bodyLevel <= 0 || discardLevel != 0) {
+         return;
+     }
+     super.ignorableWhitespace(ch, start, length);
+ }
+
+ /**
+ * Resolves the URL against the content location currently stored in the
+ * metadata, which may be <code>null</code>.
+ */
+ private String resolve(String url) {
+ return resolve(metadata.get(Metadata.CONTENT_LOCATION), url);
+ }
+
+ /**
+  * Resolves a possibly relative URL against a base URL.
+  *
+  * @param base base URL, or <code>null</code> if none is known
+  * @param url  URL to resolve; surrounding whitespace is trimmed
+  * @return the resolved URL in external form, or the trimmed input when no
+  *         base is available, the URL uses a non-hierarchical or pseudo
+  *         scheme, or either URL is malformed
+  */
+ private String resolve(String base, String url) {
+     String target = url.trim();
+
+     // Common non-hierarchical or pseudo URI prefixes are passed through
+     String lowered = target.toLowerCase(Locale.ENGLISH);
+     boolean passThrough = false;
+     String[] prefixes = {
+         "urn:", "mailto:", "tel:", "data:", "javascript:", "about:"
+     };
+     for (String prefix : prefixes) {
+         if (lowered.startsWith(prefix)) {
+             passThrough = true;
+             break;
+         }
+     }
+     if (base == null || passThrough) {
+         return target;
+     }
+
+     try {
+         URL baseURL = new URL(base.trim());
+         String basePath = baseURL.getPath();
+
+         // Special case: the relative URL is just a query string (like
+         // "?pid=1") and the base path doesn't end with a '/'. The URL
+         // class would remove the last portion of the path, which we
+         // don't want, so the query is appended to the full base path.
+         boolean queryOnly = target.startsWith("?")
+                 && basePath.length() > 0
+                 && !basePath.endsWith("/");
+         URL resolved;
+         if (queryOnly) {
+             resolved = new URL(
+                     baseURL.getProtocol(),
+                     baseURL.getHost(), baseURL.getPort(),
+                     basePath + target);
+         } else {
+             resolved = new URL(baseURL, target);
+         }
+         return resolved.toExternalForm();
+     } catch (MalformedURLException e) {
+         // Unknown or broken format; just return the URL as received.
+         return target;
+     }
+ }
+
+}
Added: tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/HtmlMapper.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/HtmlMapper.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/HtmlMapper.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/HtmlMapper.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html;
+
+/**
+ * HTML mapper used to make incoming HTML documents easier to handle by
+ * Tika clients. The {@link HtmlParser} looks up an optional HTML mapper from
+ * the parse context and uses it to map parsed HTML to "safe" XHTML. A client
+ * that wants to customize this mapping can place a custom HtmlMapper instance
+ * into the parse context.
+ *
+ * @since Apache Tika 0.6
+ */
+public interface HtmlMapper {
+
+ /**
+ * Maps "safe" HTML element names to semantic XHTML equivalents. If the
+ * given element is unknown or deemed unsafe for inclusion in the parse
+ * output, then this method returns <code>null</code> and the element
+ * will be ignored but the content inside it is still processed. See
+ * the {@link #isDiscardElement(String)} method for a way to discard
+ * the entire contents of an element.
+ *
+ * @param name HTML element name (upper case)
+ * @return XHTML element name (lower case), or
+ * <code>null</code> if the element is unsafe
+ */
+ String mapSafeElement(String name);
+
+ /**
+ * Checks whether all content within the given HTML element should be
+ * discarded instead of including it in the parse output.
+ *
+ * @param name HTML element name (upper case)
+ * @return <code>true</code> if content inside the named element
+ * should be ignored, <code>false</code> otherwise
+ */
+ boolean isDiscardElement(String name);
+
+
+ /**
+ * Maps "safe" HTML attribute names to semantic XHTML equivalents. If the
+ * given attribute is unknown or deemed unsafe for inclusion in the parse
+ * output, then this method returns <code>null</code> and the attribute
+ * will be ignored. This method assumes that the element name
+ * is valid and normalised.
+ *
+ * @param elementName HTML element name (lower case)
+ * @param attributeName HTML attribute name (lower case)
+ * @return XHTML attribute name (lower case), or
+ * <code>null</code> if the element is unsafe
+ */
+ String mapSafeAttribute(String elementName, String attributeName);
+
+}
Added: tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/HtmlParser.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/HtmlParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/HtmlParser.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,194 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.Charset;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.config.ServiceLoader;
+import org.apache.tika.detect.AutoDetectReader;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.ccil.cowan.tagsoup.HTMLSchema;
+import org.ccil.cowan.tagsoup.Schema;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * HTML parser. Uses TagSoup to turn the input document to HTML SAX events,
+ * and post-processes the events to produce XHTML and metadata expected by
+ * Tika clients.
+ */
+public class HtmlParser extends AbstractParser {
+
+ /**
+ * Serial version UID
+ */
+ private static final long serialVersionUID = 7895315240498733128L;
+
+ private static final MediaType XHTML = MediaType.application("xhtml+xml");
+ private static final MediaType WAP_XHTML = MediaType.application("vnd.wap.xhtml+xml");
+ private static final MediaType X_ASP = MediaType.application("x-asp");
+
+ private static final Set<MediaType> SUPPORTED_TYPES =
+ Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+ MediaType.text("html"),
+ XHTML,
+ WAP_XHTML,
+ X_ASP)));
+
+ private static final ServiceLoader LOADER =
+ new ServiceLoader(HtmlParser.class.getClassLoader());
+
+ /**
+ * HTML schema singleton used to amortise the heavy instantiation time.
+ */
+ private static final Schema HTML_SCHEMA = new HTMLSchema();
+
+
+ /**
+ * Returns the fixed, unmodifiable set of media types held in
+ * <code>SUPPORTED_TYPES</code>. The parse context is not consulted.
+ */
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return SUPPORTED_TYPES;
+ }
+
+ /**
+  * Parses an HTML document. The character encoding is detected first and
+  * recorded in the metadata, then TagSoup turns the text into SAX events
+  * that are post-processed by {@link HtmlHandler}. The given stream is
+  * shielded from being closed by this method.
+  *
+  * @param stream   document stream
+  * @param handler  receiver of the resulting XHTML SAX events
+  * @param metadata document metadata, updated with content type and encoding
+  * @param context  parse context; may supply a ServiceLoader, an
+  *                 {@link HtmlMapper} and a TagSoup Schema
+  */
+ public void parse(
+         InputStream stream, ContentHandler handler,
+         Metadata metadata, ParseContext context)
+         throws IOException, SAXException, TikaException {
+     // Automatically detect the character encoding
+     try (AutoDetectReader reader = new AutoDetectReader(
+             new CloseShieldInputStream(stream),
+             metadata, context.get(ServiceLoader.class, LOADER))) {
+         Charset charset = reader.getCharset();
+
+         // Keep a recognized incoming content type and attach the detected
+         // charset to it; an unrecognized type is left untouched.
+         String declared = metadata.get(Metadata.CONTENT_TYPE);
+         MediaType baseType = null;
+         if (declared == null || declared.startsWith("text/html")) {
+             baseType = MediaType.TEXT_HTML;
+         } else if (declared.startsWith("application/xhtml+xml")) {
+             baseType = XHTML;
+         } else if (declared.startsWith("application/vnd.wap.xhtml+xml")) {
+             baseType = WAP_XHTML;
+         } else if (declared.startsWith("application/x-asp")) {
+             baseType = X_ASP;
+         }
+         if (baseType != null) {
+             MediaType withCharset = new MediaType(baseType, charset);
+             metadata.set(Metadata.CONTENT_TYPE, withCharset.toString());
+         }
+         // deprecated, see TIKA-431
+         metadata.set(Metadata.CONTENT_ENCODING, charset.name());
+
+         // Mapper and schema come from the parse context when present
+         HtmlMapper mapper =
+                 context.get(HtmlMapper.class, new HtmlParserMapper());
+         Schema schema = context.get(Schema.class, HTML_SCHEMA);
+
+         org.ccil.cowan.tagsoup.Parser tagSoup =
+                 new org.ccil.cowan.tagsoup.Parser();
+         // TIKA-528: Reuse share schema to avoid heavy instantiation
+         tagSoup.setProperty(
+                 org.ccil.cowan.tagsoup.Parser.schemaProperty, schema);
+         // TIKA-599: Shared schema is thread-safe only if bogons are ignored
+         tagSoup.setFeature(
+                 org.ccil.cowan.tagsoup.Parser.ignoreBogonsFeature, true);
+
+         HtmlHandler htmlHandler = new HtmlHandler(mapper, handler, metadata);
+         tagSoup.setContentHandler(new XHTMLDowngradeHandler(htmlHandler));
+
+         tagSoup.parse(reader.asInputSource());
+     }
+ }
+
+ /**
+ * Maps "safe" HTML element names to semantic XHTML equivalents. If the
+ * given element is unknown or deemed unsafe for inclusion in the parse
+ * output, then this method returns <code>null</code> and the element
+ * will be ignored but the content inside it is still processed. See
+ * the {@link #isDiscardElement(String)} method for a way to discard
+ * the entire contents of an element.
+ * <p>
+ * This implementation delegates to {@link DefaultHtmlMapper#INSTANCE}.
+ * Subclasses can override this method to customize the default mapping.
+ *
+ * @param name HTML element name (upper case)
+ * @return XHTML element name (lower case), or
+ * <code>null</code> if the element is unsafe
+ * @since Apache Tika 0.5
+ * @deprecated Use the {@link HtmlMapper} mechanism to customize
+ * the HTML mapping. This method will be removed in Tika 1.0.
+ */
+ protected String mapSafeElement(String name) {
+ return DefaultHtmlMapper.INSTANCE.mapSafeElement(name);
+ }
+
+ /**
+ * Checks whether all content within the given HTML element should be
+ * discarded instead of including it in the parse output. This
+ * implementation delegates to {@link DefaultHtmlMapper#INSTANCE}. Subclasses
+ * can override this method to customize the set of discarded elements.
+ *
+ * @param name HTML element name (upper case)
+ * @return <code>true</code> if content inside the named element
+ * should be ignored, <code>false</code> otherwise
+ * @since Apache Tika 0.5
+ * @deprecated Use the {@link HtmlMapper} mechanism to customize
+ * the HTML mapping. This method will be removed in Tika 1.0.
+ */
+ protected boolean isDiscardElement(String name) {
+ return DefaultHtmlMapper.INSTANCE.isDiscardElement(name);
+ }
+
+ /**
+ * Maps an HTML attribute name to the attribute name used in the output by
+ * delegating to {@link DefaultHtmlMapper#INSTANCE}.
+ *
+ * @param elementName element name the attribute belongs to
+ * @param attributeName attribute name to map
+ * @return the value returned by the default mapper for this pair
+ * @deprecated Use the {@link HtmlMapper} mechanism to customize
+ * the HTML mapping. This method will be removed in Tika 1.0.
+ */
+ public String mapSafeAttribute(String elementName, String attributeName) {
+ return DefaultHtmlMapper.INSTANCE.mapSafeAttribute(elementName, attributeName);
+ }
+
+ /**
+ * Adapter class that maintains backwards compatibility with the
+ * protected HtmlParser methods. Making HtmlParser implement HtmlMapper
+ * directly would require those methods to be public, which would break
+ * backwards compatibility with subclasses.
+ * <p>
+ * Each method forwards to the method of the same name on the enclosing
+ * HtmlParser instance, so overrides in HtmlParser subclasses take effect.
+ *
+ * @deprecated Use the {@link HtmlMapper} mechanism to customize
+ * the HTML mapping. This class will be removed in Tika 1.0.
+ */
+ private class HtmlParserMapper implements HtmlMapper {
+ // Forwards to the enclosing parser's (possibly overridden) method
+ public String mapSafeElement(String name) {
+ return HtmlParser.this.mapSafeElement(name);
+ }
+
+ // Forwards to the enclosing parser's (possibly overridden) method
+ public boolean isDiscardElement(String name) {
+ return HtmlParser.this.isDiscardElement(name);
+ }
+
+ // Forwards to the enclosing parser's (possibly overridden) method
+ public String mapSafeAttribute(String elementName, String attributeName) {
+ return HtmlParser.this.mapSafeAttribute(elementName, attributeName);
+ }
+ }
+
+}
Added: tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/IdentityHtmlMapper.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/IdentityHtmlMapper.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/IdentityHtmlMapper.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/IdentityHtmlMapper.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html;
+
+import java.util.Locale;
+
+/**
+ * Alternative HTML mapping rules that pass the input HTML through without
+ * discarding any element or attribute. The only change applied is that
+ * element and attribute names are converted to lower case.
+ *
+ * @since Apache Tika 0.8
+ */
+public class IdentityHtmlMapper implements HtmlMapper {
+
+    /** Shared instance of this mapper. */
+    public static final HtmlMapper INSTANCE = new IdentityHtmlMapper();
+
+    /**
+     * Never discards an element.
+     *
+     * @param name element name (ignored)
+     * @return always {@code false}
+     */
+    @Override
+    public boolean isDiscardElement(String name) {
+        return false;
+    }
+
+    /**
+     * Keeps every attribute of every element.
+     *
+     * @param elementName name of the owning element (ignored)
+     * @param attributeName attribute name; must not be {@code null}
+     * @return the attribute name lower-cased using {@link Locale#ENGLISH}
+     */
+    @Override
+    public String mapSafeAttribute(String elementName, String attributeName) {
+        return attributeName.toLowerCase(Locale.ENGLISH);
+    }
+
+    /**
+     * Keeps every element.
+     *
+     * @param name element name; must not be {@code null}
+     * @return the element name lower-cased using {@link Locale#ENGLISH}
+     */
+    @Override
+    public String mapSafeElement(String name) {
+        return name.toLowerCase(Locale.ENGLISH);
+    }
+
+}
Added: tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/XHTMLDowngradeHandler.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/XHTMLDowngradeHandler.java?rev=1723223&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/XHTMLDowngradeHandler.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-web-module/src/main/java/org/apache/tika/parser/html/XHTMLDowngradeHandler.java Wed Jan 6 03:50:50 2016
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html;
+
+import javax.xml.XMLConstants;
+import java.util.Locale;
+
+import org.apache.tika.sax.ContentHandlerDecorator;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+/**
+ * Decorator that rewrites XHTML SAX events into namespace-free, old-style
+ * HTML events before forwarding them to the wrapped content handler.
+ * Element names are upper-cased and reported in the null namespace,
+ * attributes that belong to a namespace or declare one are dropped, and
+ * prefix mapping events are swallowed.
+ * Used by the {@link HtmlParser} to make all incoming HTML look the same.
+ */
+class XHTMLDowngradeHandler extends ContentHandlerDecorator {
+
+    /** Qualified-name prefix that marks a namespace declaration attribute. */
+    private static final String XMLNS_PREFIX = XMLConstants.XMLNS_ATTRIBUTE + ":";
+
+    public XHTMLDowngradeHandler(ContentHandler handler) {
+        super(handler);
+    }
+
+    @Override
+    public void startElement(
+            String uri, String localName, String name, Attributes atts)
+            throws SAXException {
+        // The name is converted first, then the attributes are filtered.
+        String downgraded = downgrade(localName);
+        super.startElement(
+                XMLConstants.NULL_NS_URI, downgraded, downgraded,
+                plainAttributes(atts));
+    }
+
+    @Override
+    public void endElement(String uri, String localName, String name)
+            throws SAXException {
+        String downgraded = downgrade(localName);
+        super.endElement(XMLConstants.NULL_NS_URI, downgraded, downgraded);
+    }
+
+    @Override
+    public void startPrefixMapping(String prefix, String uri) {
+        // Intentionally empty: namespace prefix mappings are not forwarded.
+    }
+
+    @Override
+    public void endPrefixMapping(String prefix) {
+        // Intentionally empty: namespace prefix mappings are not forwarded.
+    }
+
+    /** Returns the upper-case (English locale) form of a local name. */
+    private static String downgrade(String localName) {
+        return localName.toUpperCase(Locale.ENGLISH);
+    }
+
+    /**
+     * Copies the attributes that are in the null namespace and are not
+     * namespace declarations ({@code xmlns} or {@code xmlns:*}).
+     */
+    private static AttributesImpl plainAttributes(Attributes atts) {
+        AttributesImpl kept = new AttributesImpl();
+        for (int index = 0; index < atts.getLength(); index++) {
+            String attrUri = atts.getURI(index);
+            String attrLocal = atts.getLocalName(index);
+            String attrQName = atts.getQName(index);
+            if (!XMLConstants.NULL_NS_URI.equals(attrUri)) {
+                continue; // attribute belongs to a namespace
+            }
+            if (attrLocal.equals(XMLConstants.XMLNS_ATTRIBUTE)
+                    || attrQName.startsWith(XMLNS_PREFIX)) {
+                continue; // attribute is a namespace declaration
+            }
+            kept.addAttribute(
+                    attrUri, attrLocal, attrQName,
+                    atts.getType(index), atts.getValue(index));
+        }
+        return kept;
+    }
+
+}