You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/06/29 11:11:10 UTC

[04/39] tika git commit: Convert new lines from windows to unix

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java
index f43fdc0..4d5cc46 100644
--- a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java
+++ b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java
@@ -1,347 +1,347 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.html;
-
-import java.io.Writer;
-import java.util.ArrayList;
-import java.util.BitSet;
-import java.util.List;
-import java.util.Locale;
-
-import de.l3s.boilerpipe.BoilerpipeExtractor;
-import de.l3s.boilerpipe.BoilerpipeProcessingException;
-import de.l3s.boilerpipe.document.TextBlock;
-import de.l3s.boilerpipe.document.TextDocument;
-import de.l3s.boilerpipe.extractors.ArticleExtractor;
-import de.l3s.boilerpipe.extractors.DefaultExtractor;
-import de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.sax.WriteOutContentHandler;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.Attributes;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.AttributesImpl;
-
-/**
- * Uses the <a href="http://code.google.com/p/boilerpipe/">boilerpipe</a>
- * library to automatically extract the main content from a web page.
- * <p/>
- * Use this as a {@link ContentHandler} object passed to
- * {@link HtmlParser#parse(java.io.InputStream, ContentHandler, Metadata, org.apache.tika.parser.ParseContext)}
- */
-public class BoilerpipeContentHandler extends BoilerpipeHTMLContentHandler {
-
-    /**
-     * The newline character that gets inserted after block elements.
-     */
-    private static final char[] NL = new char[]{'\n'};
-    private ContentHandler delegate;
-    private BoilerpipeExtractor extractor;
-    private boolean includeMarkup;
-    private boolean inHeader;
-    private boolean inFooter;
-    private int headerCharOffset;
-    private List<RecordedElement> elements;
-    private TextDocument td;
-    /**
-     * Creates a new boilerpipe-based content extractor, using the
-     * {@link DefaultExtractor} extraction rules and "delegate" as the content handler.
-     *
-     * @param delegate The {@link ContentHandler} object
-     */
-    public BoilerpipeContentHandler(ContentHandler delegate) {
-        this(delegate, DefaultExtractor.INSTANCE);
-    }
-
-    /**
-     * Creates a content handler that writes XHTML body character events to
-     * the given writer.
-     *
-     * @param writer writer
-     */
-    public BoilerpipeContentHandler(Writer writer) {
-        this(new WriteOutContentHandler(writer));
-    }
-
-    /**
-     * Creates a new boilerpipe-based content extractor, using the given
-     * extraction rules. The extracted main content will be passed to the
-     * <delegate> content handler.
-     *
-     * @param delegate  The {@link ContentHandler} object
-     * @param extractor Extraction rules to use, e.g. {@link ArticleExtractor}
-     */
-    public BoilerpipeContentHandler(ContentHandler delegate, BoilerpipeExtractor extractor) {
-        this.td = null;
-        this.delegate = delegate;
-        this.extractor = extractor;
-    }
-
-    public boolean isIncludeMarkup() {
-        return includeMarkup;
-    }
-
-    public void setIncludeMarkup(boolean includeMarkup) {
-        this.includeMarkup = includeMarkup;
-    }
-
-    /**
-     * Retrieves the built TextDocument
-     *
-     * @return TextDocument
-     */
-    public TextDocument getTextDocument() {
-        return td;
-    }
-
-    @Override
-    public void startDocument() throws SAXException {
-        super.startDocument();
-
-        delegate.startDocument();
-
-        inHeader = true;
-        inFooter = false;
-        headerCharOffset = 0;
-
-        if (includeMarkup) {
-            elements = new ArrayList<RecordedElement>();
-        }
-    }
-
-    @Override
-    public void startPrefixMapping(String prefix, String uri) throws SAXException {
-        super.startPrefixMapping(prefix, uri);
-        delegate.startPrefixMapping(prefix, uri);
-    }
-
-    ;
-
-    @Override
-    public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
-        super.startElement(uri, localName, qName, atts);
-
-        if (inHeader) {
-            delegate.startElement(uri, localName, qName, atts);
-        } else if (inFooter) {
-            // Do nothing
-        } else if (includeMarkup) {
-            elements.add(new RecordedElement(uri, localName, qName, atts));
-        } else {
-            // This happens for the <body> element, if we're not doing markup.
-            delegate.startElement(uri, localName, qName, atts);
-        }
-    }
-
-    ;
-
-    @Override
-    public void characters(char[] chars, int offset, int length) throws SAXException {
-        super.characters(chars, offset, length);
-
-        if (inHeader) {
-            delegate.characters(chars, offset, length);
-            headerCharOffset++;
-        } else if (inFooter) {
-            // Do nothing
-        } else if (includeMarkup) {
-            RecordedElement element = elements.get(elements.size() - 1);
-
-            char[] characters = new char[length];
-            System.arraycopy(chars, offset, characters, 0, length);
-            element.getCharacters().add(characters);
-        }
-    }
-
-    ;
-
-    @Override
-    public void endElement(String uri, String localName, String qName) throws SAXException {
-        super.endElement(uri, localName, qName);
-
-        if (inHeader) {
-            delegate.endElement(uri, localName, qName);
-            inHeader = !localName.equals("head");
-        } else if (inFooter) {
-            // Do nothing
-        } else if (localName.equals("body")) {
-            inFooter = true;
-        } else if (includeMarkup) {
-            // Add the end element, and the continuation from the previous element
-            elements.add(new RecordedElement(uri, localName, qName));
-            elements.add(new RecordedElement());
-        }
-    }
-
-    ;
-
-    @Override
-    public void endDocument() throws SAXException {
-        super.endDocument();
-
-        td = toTextDocument();
-        try {
-            extractor.process(td);
-        } catch (BoilerpipeProcessingException e) {
-            throw new SAXException(e);
-        }
-
-        Attributes emptyAttrs = new AttributesImpl();
-
-        // At this point we have all the information we need to either emit N paragraphs
-        // of plain text (if not including markup), or we have to replay our recorded elements
-        // and only emit character runs that passed the boilerpipe filters.
-        if (includeMarkup) {
-            BitSet validCharacterRuns = new BitSet();
-            for (TextBlock block : td.getTextBlocks()) {
-                if (block.isContent()) {
-                    BitSet bs = block.getContainedTextElements();
-                    if (bs != null) {
-                        validCharacterRuns.or(bs);
-                    }
-                }
-            }
-
-            // Now have bits set for all valid character runs. Replay our recorded elements,
-            // but only emit character runs flagged as valid.
-            int curCharsIndex = headerCharOffset;
-
-            for (RecordedElement element : elements) {
-                switch (element.getElementType()) {
-                    case START:
-                        delegate.startElement(element.getUri(), element.getLocalName(), element.getQName(), element.getAttrs());
-                        // Fall through
-
-                    case CONTINUE:
-                        // Now emit characters that are valid. Note that boilerpipe pre-increments the character index, so
-                        // we have to follow suit.
-                        for (char[] chars : element.getCharacters()) {
-                            curCharsIndex++;
-
-                            if (validCharacterRuns.get(curCharsIndex)) {
-                                delegate.characters(chars, 0, chars.length);
-
-                                // https://issues.apache.org/jira/browse/TIKA-961
-                                if (!Character.isWhitespace(chars[chars.length - 1])) {
-                                    // Only add whitespace for certain elements
-                                    if (XHTMLContentHandler.ENDLINE.contains(element.getLocalName())) {
-                                        delegate.ignorableWhitespace(NL, 0, NL.length);
-                                    }
-                                }
-                            }
-                        }
-                        break;
-
-                    case END:
-                        delegate.endElement(element.getUri(), element.getLocalName(), element.getQName());
-                        break;
-
-                    default:
-                        throw new RuntimeException("Unhandled element type: " + element.getElementType());
-                }
-
-
-            }
-        } else {
-            for (TextBlock block : td.getTextBlocks()) {
-                if (block.isContent()) {
-                    delegate.startElement(XHTMLContentHandler.XHTML, "p", "p", emptyAttrs);
-                    char[] chars = block.getText().toCharArray();
-                    delegate.characters(chars, 0, chars.length);
-                    delegate.endElement(XHTMLContentHandler.XHTML, "p", "p");
-                    delegate.ignorableWhitespace(NL, 0, NL.length);
-                }
-            }
-        }
-
-        delegate.endElement(XHTMLContentHandler.XHTML, "body", "body");
-        delegate.endElement(XHTMLContentHandler.XHTML, "html", "html");
-
-        // We defer ending any prefix mapping until here, which is why we don't pass this
-        // through to the delegate in an overridden method.
-        delegate.endPrefixMapping("");
-
-        delegate.endDocument();
-    }
-
-    ;
-
-    private static class RecordedElement {
-        private String uri;
-        private String localName;
-        private String qName;
-        private Attributes attrs;
-        private List<char[]> characters;
-        private ElementType elementType;
-        public RecordedElement(String uri, String localName, String qName, Attributes attrs) {
-            this(uri, localName, qName, attrs, ElementType.START);
-        }
-
-        public RecordedElement(String uri, String localName, String qName) {
-            this(uri, localName, qName, null, ElementType.END);
-        }
-
-        public RecordedElement() {
-            this(null, null, null, null, ElementType.CONTINUE);
-        }
-
-        protected RecordedElement(String uri, String localName, String qName, Attributes attrs, RecordedElement.ElementType elementType) {
-            this.uri = uri;
-            this.localName = localName;
-            this.qName = qName;
-            this.attrs = attrs;
-            this.elementType = elementType;
-            this.characters = new ArrayList<char[]>();
-        }
-
-        @Override
-        public String toString() {
-            return String.format(Locale.ROOT, "<%s> of type %s", localName, elementType);
-        }
-
-        public String getUri() {
-            return uri;
-        }
-
-        public String getLocalName() {
-            return localName;
-        }
-
-        public String getQName() {
-            return qName;
-        }
-
-        public Attributes getAttrs() {
-            return attrs;
-        }
-
-        public List<char[]> getCharacters() {
-            return characters;
-        }
-
-        public RecordedElement.ElementType getElementType() {
-            return elementType;
-        }
-
-        public enum ElementType {
-            START,
-            END,
-            CONTINUE
-        }
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html;
+
+import java.io.Writer;
+import java.util.ArrayList;
+import java.util.BitSet;
+import java.util.List;
+import java.util.Locale;
+
+import de.l3s.boilerpipe.BoilerpipeExtractor;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.extractors.ArticleExtractor;
+import de.l3s.boilerpipe.extractors.DefaultExtractor;
+import de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.WriteOutContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+/**
+ * Uses the <a href="http://code.google.com/p/boilerpipe/">boilerpipe</a>
+ * library to automatically extract the main content from a web page.
+ * <p/>
+ * Use this as a {@link ContentHandler} object passed to
+ * {@link HtmlParser#parse(java.io.InputStream, ContentHandler, Metadata, org.apache.tika.parser.ParseContext)}
+ */
+public class BoilerpipeContentHandler extends BoilerpipeHTMLContentHandler {
+
+    /**
+     * The newline character that gets inserted after block elements.
+     */
+    private static final char[] NL = new char[]{'\n'};
+    private ContentHandler delegate;
+    private BoilerpipeExtractor extractor;
+    private boolean includeMarkup;
+    private boolean inHeader;
+    private boolean inFooter;
+    private int headerCharOffset;
+    private List<RecordedElement> elements;
+    private TextDocument td;
+    /**
+     * Creates a new boilerpipe-based content extractor, using the
+     * {@link DefaultExtractor} extraction rules and "delegate" as the content handler.
+     *
+     * @param delegate The {@link ContentHandler} object
+     */
+    public BoilerpipeContentHandler(ContentHandler delegate) {
+        this(delegate, DefaultExtractor.INSTANCE);
+    }
+
+    /**
+     * Creates a content handler that writes XHTML body character events to
+     * the given writer.
+     *
+     * @param writer writer
+     */
+    public BoilerpipeContentHandler(Writer writer) {
+        this(new WriteOutContentHandler(writer));
+    }
+
+    /**
+     * Creates a new boilerpipe-based content extractor, using the given
+     * extraction rules. The extracted main content will be passed to the
+     * <delegate> content handler.
+     *
+     * @param delegate  The {@link ContentHandler} object
+     * @param extractor Extraction rules to use, e.g. {@link ArticleExtractor}
+     */
+    public BoilerpipeContentHandler(ContentHandler delegate, BoilerpipeExtractor extractor) {
+        this.td = null;
+        this.delegate = delegate;
+        this.extractor = extractor;
+    }
+
+    public boolean isIncludeMarkup() {
+        return includeMarkup;
+    }
+
+    public void setIncludeMarkup(boolean includeMarkup) {
+        this.includeMarkup = includeMarkup;
+    }
+
+    /**
+     * Retrieves the built TextDocument
+     *
+     * @return TextDocument
+     */
+    public TextDocument getTextDocument() {
+        return td;
+    }
+
+    @Override
+    public void startDocument() throws SAXException {
+        super.startDocument();
+
+        delegate.startDocument();
+
+        inHeader = true;
+        inFooter = false;
+        headerCharOffset = 0;
+
+        if (includeMarkup) {
+            elements = new ArrayList<RecordedElement>();
+        }
+    }
+
+    @Override
+    public void startPrefixMapping(String prefix, String uri) throws SAXException {
+        super.startPrefixMapping(prefix, uri);
+        delegate.startPrefixMapping(prefix, uri);
+    }
+
+    ;
+
+    @Override
+    public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
+        super.startElement(uri, localName, qName, atts);
+
+        if (inHeader) {
+            delegate.startElement(uri, localName, qName, atts);
+        } else if (inFooter) {
+            // Do nothing
+        } else if (includeMarkup) {
+            elements.add(new RecordedElement(uri, localName, qName, atts));
+        } else {
+            // This happens for the <body> element, if we're not doing markup.
+            delegate.startElement(uri, localName, qName, atts);
+        }
+    }
+
+    ;
+
+    @Override
+    public void characters(char[] chars, int offset, int length) throws SAXException {
+        super.characters(chars, offset, length);
+
+        if (inHeader) {
+            delegate.characters(chars, offset, length);
+            headerCharOffset++;
+        } else if (inFooter) {
+            // Do nothing
+        } else if (includeMarkup) {
+            RecordedElement element = elements.get(elements.size() - 1);
+
+            char[] characters = new char[length];
+            System.arraycopy(chars, offset, characters, 0, length);
+            element.getCharacters().add(characters);
+        }
+    }
+
+    ;
+
+    @Override
+    public void endElement(String uri, String localName, String qName) throws SAXException {
+        super.endElement(uri, localName, qName);
+
+        if (inHeader) {
+            delegate.endElement(uri, localName, qName);
+            inHeader = !localName.equals("head");
+        } else if (inFooter) {
+            // Do nothing
+        } else if (localName.equals("body")) {
+            inFooter = true;
+        } else if (includeMarkup) {
+            // Add the end element, and the continuation from the previous element
+            elements.add(new RecordedElement(uri, localName, qName));
+            elements.add(new RecordedElement());
+        }
+    }
+
+    ;
+
+    @Override
+    public void endDocument() throws SAXException {
+        super.endDocument();
+
+        td = toTextDocument();
+        try {
+            extractor.process(td);
+        } catch (BoilerpipeProcessingException e) {
+            throw new SAXException(e);
+        }
+
+        Attributes emptyAttrs = new AttributesImpl();
+
+        // At this point we have all the information we need to either emit N paragraphs
+        // of plain text (if not including markup), or we have to replay our recorded elements
+        // and only emit character runs that passed the boilerpipe filters.
+        if (includeMarkup) {
+            BitSet validCharacterRuns = new BitSet();
+            for (TextBlock block : td.getTextBlocks()) {
+                if (block.isContent()) {
+                    BitSet bs = block.getContainedTextElements();
+                    if (bs != null) {
+                        validCharacterRuns.or(bs);
+                    }
+                }
+            }
+
+            // Now have bits set for all valid character runs. Replay our recorded elements,
+            // but only emit character runs flagged as valid.
+            int curCharsIndex = headerCharOffset;
+
+            for (RecordedElement element : elements) {
+                switch (element.getElementType()) {
+                    case START:
+                        delegate.startElement(element.getUri(), element.getLocalName(), element.getQName(), element.getAttrs());
+                        // Fall through
+
+                    case CONTINUE:
+                        // Now emit characters that are valid. Note that boilerpipe pre-increments the character index, so
+                        // we have to follow suit.
+                        for (char[] chars : element.getCharacters()) {
+                            curCharsIndex++;
+
+                            if (validCharacterRuns.get(curCharsIndex)) {
+                                delegate.characters(chars, 0, chars.length);
+
+                                // https://issues.apache.org/jira/browse/TIKA-961
+                                if (!Character.isWhitespace(chars[chars.length - 1])) {
+                                    // Only add whitespace for certain elements
+                                    if (XHTMLContentHandler.ENDLINE.contains(element.getLocalName())) {
+                                        delegate.ignorableWhitespace(NL, 0, NL.length);
+                                    }
+                                }
+                            }
+                        }
+                        break;
+
+                    case END:
+                        delegate.endElement(element.getUri(), element.getLocalName(), element.getQName());
+                        break;
+
+                    default:
+                        throw new RuntimeException("Unhandled element type: " + element.getElementType());
+                }
+
+
+            }
+        } else {
+            for (TextBlock block : td.getTextBlocks()) {
+                if (block.isContent()) {
+                    delegate.startElement(XHTMLContentHandler.XHTML, "p", "p", emptyAttrs);
+                    char[] chars = block.getText().toCharArray();
+                    delegate.characters(chars, 0, chars.length);
+                    delegate.endElement(XHTMLContentHandler.XHTML, "p", "p");
+                    delegate.ignorableWhitespace(NL, 0, NL.length);
+                }
+            }
+        }
+
+        delegate.endElement(XHTMLContentHandler.XHTML, "body", "body");
+        delegate.endElement(XHTMLContentHandler.XHTML, "html", "html");
+
+        // We defer ending any prefix mapping until here, which is why we don't pass this
+        // through to the delegate in an overridden method.
+        delegate.endPrefixMapping("");
+
+        delegate.endDocument();
+    }
+
+    ;
+
+    private static class RecordedElement {
+        private String uri;
+        private String localName;
+        private String qName;
+        private Attributes attrs;
+        private List<char[]> characters;
+        private ElementType elementType;
+        public RecordedElement(String uri, String localName, String qName, Attributes attrs) {
+            this(uri, localName, qName, attrs, ElementType.START);
+        }
+
+        public RecordedElement(String uri, String localName, String qName) {
+            this(uri, localName, qName, null, ElementType.END);
+        }
+
+        public RecordedElement() {
+            this(null, null, null, null, ElementType.CONTINUE);
+        }
+
+        protected RecordedElement(String uri, String localName, String qName, Attributes attrs, RecordedElement.ElementType elementType) {
+            this.uri = uri;
+            this.localName = localName;
+            this.qName = qName;
+            this.attrs = attrs;
+            this.elementType = elementType;
+            this.characters = new ArrayList<char[]>();
+        }
+
+        @Override
+        public String toString() {
+            return String.format(Locale.ROOT, "<%s> of type %s", localName, elementType);
+        }
+
+        public String getUri() {
+            return uri;
+        }
+
+        public String getLocalName() {
+            return localName;
+        }
+
+        public String getQName() {
+            return qName;
+        }
+
+        public Attributes getAttrs() {
+            return attrs;
+        }
+
+        public List<char[]> getCharacters() {
+            return characters;
+        }
+
+        public RecordedElement.ElementType getElementType() {
+            return elementType;
+        }
+
+        public enum ElementType {
+            START,
+            END,
+            CONTINUE
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java
index 0cef05f..4217ac5 100644
--- a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java
+++ b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java
@@ -1,137 +1,137 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.html;
-
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Map;
-import java.util.Set;
-
-/**
- * The default HTML mapping rules in Tika.
- *
- * @since Apache Tika 0.6
- */
-@SuppressWarnings("serial")
-public class DefaultHtmlMapper implements HtmlMapper {
-
-    /**
-     * @since Apache Tika 0.8
-     */
-    public static final HtmlMapper INSTANCE = new DefaultHtmlMapper();
-    // Based on http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd
-    private static final Map<String, String> SAFE_ELEMENTS = new HashMap<String, String>() {{
-        put("H1", "h1");
-        put("H2", "h2");
-        put("H3", "h3");
-        put("H4", "h4");
-        put("H5", "h5");
-        put("H6", "h6");
-
-        put("P", "p");
-        put("PRE", "pre");
-        put("BLOCKQUOTE", "blockquote");
-        put("Q", "q");
-
-        put("UL", "ul");
-        put("OL", "ol");
-        put("MENU", "ul");
-        put("LI", "li");
-        put("DL", "dl");
-        put("DT", "dt");
-        put("DD", "dd");
-
-        put("TABLE", "table");
-        put("THEAD", "thead");
-        put("TBODY", "tbody");
-        put("TR", "tr");
-        put("TH", "th");
-        put("TD", "td");
-
-        put("ADDRESS", "address");
-
-        // TIKA-460 - add anchors
-        put("A", "a");
-
-        // TIKA-463 - add additional elements that contain URLs (and their sub-elements)
-        put("MAP", "map");
-        put("AREA", "area");
-        put("IMG", "img");
-        put("FRAMESET", "frameset");
-        put("FRAME", "frame");
-        put("IFRAME", "iframe");
-        put("OBJECT", "object");
-        put("PARAM", "param");
-        put("INS", "ins");
-        put("DEL", "del");
-    }};
-    private static final Set<String> DISCARDABLE_ELEMENTS = new HashSet<String>() {{
-        add("STYLE");
-        add("SCRIPT");
-    }};
-    // For information on tags & attributes, see:
-    // http://www.w3.org/TR/2002/REC-xhtml1-20020801/dtds.html#a_dtd_XHTML-1.0-Strict
-    // http://www.w3schools.com/TAGS/
-    private static final Map<String, Set<String>> SAFE_ATTRIBUTES = new HashMap<String, Set<String>>() {{
-        put("a", attrSet("charset", "type", "name", "href", "hreflang", "rel", "rev", "shape", "coords"));
-        put("img", attrSet("src", "alt", "longdesc", "height", "width", "usemap", "ismap"));
-        put("frame", attrSet("longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling"));
-        put("iframe", attrSet("longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width"));
-        put("link", attrSet("charset", "href", "hreflang", "type", "rel", "rev", "media"));
-        put("map", attrSet("id", "class", "style", "title", "name"));
-        put("area", attrSet("shape", "coords", "href", "nohref", "alt"));
-        put("object", attrSet("declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height",
-                "width", "usemap", "name", "tabindex", "align", "border", "hspace", "vspace"));
-        put("param", attrSet("id", "name", "value", "valuetype", "type"));
-        put("blockquote", attrSet("cite"));
-        put("ins", attrSet("cite", "datetime"));
-        put("del", attrSet("cite", "datetime"));
-        put("q", attrSet("cite"));
-
-        // TODO - fill out this set. Include core, i18n, etc sets where appropriate.
-    }};
-
-    private static Set<String> attrSet(String... attrs) {
-        Set<String> result = new HashSet<String>();
-        for (String attr : attrs) {
-            result.add(attr);
-        }
-        return result;
-    }
-
-    public String mapSafeElement(String name) {
-        return SAFE_ELEMENTS.get(name);
-    }
-
-    /**
-     * Normalizes an attribute name. Assumes that the element name
-     * is valid and normalized
-     */
-    public String mapSafeAttribute(String elementName, String attributeName) {
-        Set<String> safeAttrs = SAFE_ATTRIBUTES.get(elementName);
-        if ((safeAttrs != null) && safeAttrs.contains(attributeName)) {
-            return attributeName;
-        } else {
-            return null;
-        }
-    }
-
-    public boolean isDiscardElement(String name) {
-        return DISCARDABLE_ELEMENTS.contains(name);
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html;
+
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * The default HTML mapping rules in Tika.
+ *
+ * @since Apache Tika 0.6
+ */
+@SuppressWarnings("serial")
+public class DefaultHtmlMapper implements HtmlMapper {
+
+    /**
+     * @since Apache Tika 0.8
+     */
+    public static final HtmlMapper INSTANCE = new DefaultHtmlMapper();
+    // Based on http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd
+    private static final Map<String, String> SAFE_ELEMENTS = new HashMap<String, String>() {{
+        put("H1", "h1");
+        put("H2", "h2");
+        put("H3", "h3");
+        put("H4", "h4");
+        put("H5", "h5");
+        put("H6", "h6");
+
+        put("P", "p");
+        put("PRE", "pre");
+        put("BLOCKQUOTE", "blockquote");
+        put("Q", "q");
+
+        put("UL", "ul");
+        put("OL", "ol");
+        put("MENU", "ul");
+        put("LI", "li");
+        put("DL", "dl");
+        put("DT", "dt");
+        put("DD", "dd");
+
+        put("TABLE", "table");
+        put("THEAD", "thead");
+        put("TBODY", "tbody");
+        put("TR", "tr");
+        put("TH", "th");
+        put("TD", "td");
+
+        put("ADDRESS", "address");
+
+        // TIKA-460 - add anchors
+        put("A", "a");
+
+        // TIKA-463 - add additional elements that contain URLs (and their sub-elements)
+        put("MAP", "map");
+        put("AREA", "area");
+        put("IMG", "img");
+        put("FRAMESET", "frameset");
+        put("FRAME", "frame");
+        put("IFRAME", "iframe");
+        put("OBJECT", "object");
+        put("PARAM", "param");
+        put("INS", "ins");
+        put("DEL", "del");
+    }};
+    private static final Set<String> DISCARDABLE_ELEMENTS = new HashSet<String>() {{
+        add("STYLE");
+        add("SCRIPT");
+    }};
+    // For information on tags & attributes, see:
+    // http://www.w3.org/TR/2002/REC-xhtml1-20020801/dtds.html#a_dtd_XHTML-1.0-Strict
+    // http://www.w3schools.com/TAGS/
+    private static final Map<String, Set<String>> SAFE_ATTRIBUTES = new HashMap<String, Set<String>>() {{
+        put("a", attrSet("charset", "type", "name", "href", "hreflang", "rel", "rev", "shape", "coords"));
+        put("img", attrSet("src", "alt", "longdesc", "height", "width", "usemap", "ismap"));
+        put("frame", attrSet("longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling"));
+        put("iframe", attrSet("longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width"));
+        put("link", attrSet("charset", "href", "hreflang", "type", "rel", "rev", "media"));
+        put("map", attrSet("id", "class", "style", "title", "name"));
+        put("area", attrSet("shape", "coords", "href", "nohref", "alt"));
+        put("object", attrSet("declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height",
+                "width", "usemap", "name", "tabindex", "align", "border", "hspace", "vspace"));
+        put("param", attrSet("id", "name", "value", "valuetype", "type"));
+        put("blockquote", attrSet("cite"));
+        put("ins", attrSet("cite", "datetime"));
+        put("del", attrSet("cite", "datetime"));
+        put("q", attrSet("cite"));
+
+        // TODO - fill out this set. Include core, i18n, etc sets where appropriate.
+    }};
+
+    private static Set<String> attrSet(String... attrs) {
+        Set<String> result = new HashSet<String>();
+        for (String attr : attrs) {
+            result.add(attr);
+        }
+        return result;
+    }
+
+    public String mapSafeElement(String name) {
+        return SAFE_ELEMENTS.get(name);
+    }
+
+    /**
+     * Normalizes an attribute name. Assumes that the element name
+     * is valid and normalized
+     */
+    public String mapSafeAttribute(String elementName, String attributeName) {
+        Set<String> safeAttrs = SAFE_ATTRIBUTES.get(elementName);
+        if ((safeAttrs != null) && safeAttrs.contains(attributeName)) {
+            return attributeName;
+        } else {
+            return null;
+        }
+    }
+
+    public boolean isDiscardElement(String name) {
+        return DISCARDABLE_ELEMENTS.contains(name);
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
index c5bbc7a..d5dfaa6 100644
--- a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
+++ b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
@@ -1,309 +1,309 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.html;
-
-import java.net.MalformedURLException;
-import java.net.URL;
-import java.util.Arrays;
-import java.util.HashSet;
-import java.util.Locale;
-import java.util.Set;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.sax.TextContentHandler;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.Attributes;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.AttributesImpl;
-
-class HtmlHandler extends TextContentHandler {
-
-    // List of attributes that need to be resolved.
-    private static final Set<String> URI_ATTRIBUTES =
-            new HashSet<String>(Arrays.asList("src", "href", "longdesc", "cite"));
-    private static final Pattern ICBM =
-            Pattern.compile("\\s*(-?\\d+\\.\\d+)[,\\s]+(-?\\d+\\.\\d+)\\s*");
-    private final HtmlMapper mapper;
-    private final XHTMLContentHandler xhtml;
-    private final Metadata metadata;
-    private final StringBuilder title = new StringBuilder();
-    private int bodyLevel = 0;
-    private int discardLevel = 0;
-    private int titleLevel = 0;
-    private boolean isTitleSetToMetadata = false;
-
-    private HtmlHandler(
-            HtmlMapper mapper, XHTMLContentHandler xhtml, Metadata metadata) {
-        super(xhtml);
-        this.mapper = mapper;
-        this.xhtml = xhtml;
-        this.metadata = metadata;
-
-        // Try to determine the default base URL, if one has not been given
-        if (metadata.get(Metadata.CONTENT_LOCATION) == null) {
-            String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
-            if (name != null) {
-                name = name.trim();
-                try {
-                    new URL(name); // test URL format
-                    metadata.set(Metadata.CONTENT_LOCATION, name);
-                } catch (MalformedURLException e) {
-                    // The resource name is not a valid URL, ignore it
-                }
-            }
-        }
-    }
-
-    public HtmlHandler(
-            HtmlMapper mapper, ContentHandler handler, Metadata metadata) {
-        this(mapper, new XHTMLContentHandler(handler, metadata), metadata);
-    }
-
-    @Override
-    public void startElement(
-            String uri, String local, String name, Attributes atts)
-            throws SAXException {
-        if ("TITLE".equals(name) || titleLevel > 0) {
-            titleLevel++;
-        }
-        if ("BODY".equals(name) || ("FRAMESET".equals(name)) || bodyLevel > 0) {
-            bodyLevel++;
-        }
-        if (mapper.isDiscardElement(name) || discardLevel > 0) {
-            discardLevel++;
-        }
-
-        if (bodyLevel == 0 && discardLevel == 0) {
-            if ("META".equals(name) && atts.getValue("content") != null) {
-                // TIKA-478: For cases where we have either a name or
-                // "http-equiv", assume that XHTMLContentHandler will emit
-                // these in the <head>, thus passing them through safely.
-                if (atts.getValue("http-equiv") != null) {
-                    addHtmlMetadata(
-                            atts.getValue("http-equiv"),
-                            atts.getValue("content"));
-                } else if (atts.getValue("name") != null) {
-                    // Record the meta tag in the metadata
-                    addHtmlMetadata(
-                            atts.getValue("name"),
-                            atts.getValue("content"));
-                } else if (atts.getValue("property") != null) {
-                    // TIKA-983: Handle <meta property="og:xxx" content="yyy" /> tags
-                    metadata.add(
-                            atts.getValue("property"),
-                            atts.getValue("content"));
-                }
-            } else if ("BASE".equals(name) && atts.getValue("href") != null) {
-                startElementWithSafeAttributes("base", atts);
-                xhtml.endElement("base");
-                metadata.set(
-                        Metadata.CONTENT_LOCATION,
-                        resolve(atts.getValue("href")));
-            } else if ("LINK".equals(name)) {
-                startElementWithSafeAttributes("link", atts);
-                xhtml.endElement("link");
-            }
-        }
-
-        if (bodyLevel > 0 && discardLevel == 0) {
-            String safe = mapper.mapSafeElement(name);
-            if (safe != null) {
-                startElementWithSafeAttributes(safe, atts);
-            }
-        }
-
-        title.setLength(0);
-    }
-
-    /**
-     * Adds a metadata setting from the HTML <head/> to the Tika metadata
-     * object. The name and value are normalized where possible.
-     */
-    private void addHtmlMetadata(String name, String value) {
-        if (name == null || value == null) {
-            // ignore
-        } else if (name.equalsIgnoreCase("ICBM")) {
-            Matcher m = ICBM.matcher(value);
-            if (m.matches()) {
-                metadata.set("ICBM", m.group(1) + ", " + m.group(2));
-                metadata.set(Metadata.LATITUDE, m.group(1));
-                metadata.set(Metadata.LONGITUDE, m.group(2));
-            } else {
-                metadata.set("ICBM", value);
-            }
-        } else if (name.equalsIgnoreCase(Metadata.CONTENT_TYPE)) {
-            //don't overwrite Metadata.CONTENT_TYPE!
-            MediaType type = MediaType.parse(value);
-            if (type != null) {
-                metadata.set(TikaCoreProperties.CONTENT_TYPE_HINT, type.toString());
-            } else {
-                metadata.set(TikaCoreProperties.CONTENT_TYPE_HINT, value);
-            }
-        } else {
-            metadata.add(name, value);
-        }
-    }
-
-    private void startElementWithSafeAttributes(String name, Attributes atts) throws SAXException {
-        if (atts.getLength() == 0) {
-            xhtml.startElement(name);
-            return;
-        }
-
-        boolean isObject = name.equals("object");
-        String codebase = null;
-        if (isObject) {
-            codebase = atts.getValue("", "codebase");
-            if (codebase != null) {
-                codebase = resolve(codebase);
-            } else {
-                codebase = metadata.get(Metadata.CONTENT_LOCATION);
-            }
-        }
-
-        AttributesImpl newAttributes = new AttributesImpl(atts);
-        for (int att = 0; att < newAttributes.getLength(); att++) {
-            String attrName = newAttributes.getLocalName(att);
-            String normAttrName = mapper.mapSafeAttribute(name, attrName);
-            if (normAttrName == null) {
-                newAttributes.removeAttribute(att);
-                att--;
-            } else {
-                // We have a remapped attribute name, so set it as it might have changed.
-                newAttributes.setLocalName(att, normAttrName);
-
-                // And resolve relative links. Eventually this should be pushed
-                // into the HtmlMapper code.
-                if (URI_ATTRIBUTES.contains(normAttrName)) {
-                    newAttributes.setValue(att, resolve(newAttributes.getValue(att)));
-                } else if (isObject && "codebase".equals(normAttrName)) {
-                    newAttributes.setValue(att, codebase);
-                } else if (isObject
-                        && ("data".equals(normAttrName)
-                        || "classid".equals(normAttrName))) {
-                    newAttributes.setValue(
-                            att,
-                            resolve(codebase, newAttributes.getValue(att)));
-                }
-            }
-        }
-
-        if ("img".equals(name) && newAttributes.getValue("", "alt") == null) {
-            newAttributes.addAttribute("", "alt", "alt", "CDATA", "");
-        }
-
-        xhtml.startElement(name, newAttributes);
-    }
-
-    @Override
-    public void endElement(
-            String uri, String local, String name) throws SAXException {
-        if (bodyLevel > 0 && discardLevel == 0) {
-            String safe = mapper.mapSafeElement(name);
-            if (safe != null) {
-                xhtml.endElement(safe);
-            } else if (XHTMLContentHandler.ENDLINE.contains(
-                    name.toLowerCase(Locale.ENGLISH))) {
-                // TIKA-343: Replace closing block tags (and <br/>) with a
-                // newline unless the HtmlMapper above has already mapped
-                // them to something else
-                xhtml.newline();
-            }
-        }
-
-        if (titleLevel > 0) {
-            titleLevel--;
-            if (titleLevel == 0 && !isTitleSetToMetadata) {
-                metadata.set(TikaCoreProperties.TITLE, title.toString().trim());
-                isTitleSetToMetadata = true;
-            }
-        }
-        if (bodyLevel > 0) {
-            bodyLevel--;
-        }
-        if (discardLevel > 0) {
-            discardLevel--;
-        }
-    }
-
-    @Override
-    public void characters(char[] ch, int start, int length)
-            throws SAXException {
-        if (titleLevel > 0 && bodyLevel == 0) {
-            title.append(ch, start, length);
-        }
-        if (bodyLevel > 0 && discardLevel == 0) {
-            super.characters(ch, start, length);
-        }
-    }
-
-    @Override
-    public void ignorableWhitespace(char[] ch, int start, int length)
-            throws SAXException {
-        if (bodyLevel > 0 && discardLevel == 0) {
-            super.ignorableWhitespace(ch, start, length);
-        }
-    }
-
-    private String resolve(String url) {
-        return resolve(metadata.get(Metadata.CONTENT_LOCATION), url);
-    }
-
-    private String resolve(String base, String url) {
-        url = url.trim();
-
-        // Return the URL as-is if no base URL is available or if the URL
-        // matches a common non-hierarchical or pseudo URI prefix
-        String lower = url.toLowerCase(Locale.ENGLISH);
-        if (base == null
-                || lower.startsWith("urn:")
-                || lower.startsWith("mailto:")
-                || lower.startsWith("tel:")
-                || lower.startsWith("data:")
-                || lower.startsWith("javascript:")
-                || lower.startsWith("about:")) {
-            return url;
-        }
-
-        try {
-            URL baseURL = new URL(base.trim());
-
-            // We need to handle one special case, where the relativeUrl is
-            // just a query string (like "?pid=1"), and the baseUrl doesn't
-            // end with a '/'. In that case, the URL class removes the last
-            // portion of the path, which we don't want.
-            String path = baseURL.getPath();
-            if (url.startsWith("?") && path.length() > 0 && !path.endsWith("/")) {
-                return new URL(
-                        baseURL.getProtocol(),
-                        baseURL.getHost(), baseURL.getPort(),
-                        baseURL.getPath() + url).toExternalForm();
-            } else {
-                return new URL(baseURL, url).toExternalForm();
-            }
-        } catch (MalformedURLException e) {
-            // Unknown or broken format; just return the URL as received.
-            return url;
-        }
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html;
+
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Locale;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.sax.TextContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+class HtmlHandler extends TextContentHandler {
+
+    // List of attributes that need to be resolved.
+    private static final Set<String> URI_ATTRIBUTES =
+            new HashSet<String>(Arrays.asList("src", "href", "longdesc", "cite"));
+    private static final Pattern ICBM =
+            Pattern.compile("\\s*(-?\\d+\\.\\d+)[,\\s]+(-?\\d+\\.\\d+)\\s*");
+    private final HtmlMapper mapper;
+    private final XHTMLContentHandler xhtml;
+    private final Metadata metadata;
+    private final StringBuilder title = new StringBuilder();
+    private int bodyLevel = 0;
+    private int discardLevel = 0;
+    private int titleLevel = 0;
+    private boolean isTitleSetToMetadata = false;
+
+    private HtmlHandler(
+            HtmlMapper mapper, XHTMLContentHandler xhtml, Metadata metadata) {
+        super(xhtml);
+        this.mapper = mapper;
+        this.xhtml = xhtml;
+        this.metadata = metadata;
+
+        // Try to determine the default base URL, if one has not been given
+        if (metadata.get(Metadata.CONTENT_LOCATION) == null) {
+            String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
+            if (name != null) {
+                name = name.trim();
+                try {
+                    new URL(name); // test URL format
+                    metadata.set(Metadata.CONTENT_LOCATION, name);
+                } catch (MalformedURLException e) {
+                    // The resource name is not a valid URL, ignore it
+                }
+            }
+        }
+    }
+
+    public HtmlHandler(
+            HtmlMapper mapper, ContentHandler handler, Metadata metadata) {
+        this(mapper, new XHTMLContentHandler(handler, metadata), metadata);
+    }
+
+    @Override
+    public void startElement(
+            String uri, String local, String name, Attributes atts)
+            throws SAXException {
+        if ("TITLE".equals(name) || titleLevel > 0) {
+            titleLevel++;
+        }
+        if ("BODY".equals(name) || ("FRAMESET".equals(name)) || bodyLevel > 0) {
+            bodyLevel++;
+        }
+        if (mapper.isDiscardElement(name) || discardLevel > 0) {
+            discardLevel++;
+        }
+
+        if (bodyLevel == 0 && discardLevel == 0) {
+            if ("META".equals(name) && atts.getValue("content") != null) {
+                // TIKA-478: For cases where we have either a name or
+                // "http-equiv", assume that XHTMLContentHandler will emit
+                // these in the <head>, thus passing them through safely.
+                if (atts.getValue("http-equiv") != null) {
+                    addHtmlMetadata(
+                            atts.getValue("http-equiv"),
+                            atts.getValue("content"));
+                } else if (atts.getValue("name") != null) {
+                    // Record the meta tag in the metadata
+                    addHtmlMetadata(
+                            atts.getValue("name"),
+                            atts.getValue("content"));
+                } else if (atts.getValue("property") != null) {
+                    // TIKA-983: Handle <meta property="og:xxx" content="yyy" /> tags
+                    metadata.add(
+                            atts.getValue("property"),
+                            atts.getValue("content"));
+                }
+            } else if ("BASE".equals(name) && atts.getValue("href") != null) {
+                startElementWithSafeAttributes("base", atts);
+                xhtml.endElement("base");
+                metadata.set(
+                        Metadata.CONTENT_LOCATION,
+                        resolve(atts.getValue("href")));
+            } else if ("LINK".equals(name)) {
+                startElementWithSafeAttributes("link", atts);
+                xhtml.endElement("link");
+            }
+        }
+
+        if (bodyLevel > 0 && discardLevel == 0) {
+            String safe = mapper.mapSafeElement(name);
+            if (safe != null) {
+                startElementWithSafeAttributes(safe, atts);
+            }
+        }
+
+        title.setLength(0);
+    }
+
+    /**
+     * Adds a metadata setting from the HTML <head/> to the Tika metadata
+     * object. The name and value are normalized where possible.
+     */
+    private void addHtmlMetadata(String name, String value) {
+        if (name == null || value == null) {
+            // ignore
+        } else if (name.equalsIgnoreCase("ICBM")) {
+            Matcher m = ICBM.matcher(value);
+            if (m.matches()) {
+                metadata.set("ICBM", m.group(1) + ", " + m.group(2));
+                metadata.set(Metadata.LATITUDE, m.group(1));
+                metadata.set(Metadata.LONGITUDE, m.group(2));
+            } else {
+                metadata.set("ICBM", value);
+            }
+        } else if (name.equalsIgnoreCase(Metadata.CONTENT_TYPE)) {
+            //don't overwrite Metadata.CONTENT_TYPE!
+            MediaType type = MediaType.parse(value);
+            if (type != null) {
+                metadata.set(TikaCoreProperties.CONTENT_TYPE_HINT, type.toString());
+            } else {
+                metadata.set(TikaCoreProperties.CONTENT_TYPE_HINT, value);
+            }
+        } else {
+            metadata.add(name, value);
+        }
+    }
+
+    private void startElementWithSafeAttributes(String name, Attributes atts) throws SAXException {
+        if (atts.getLength() == 0) {
+            xhtml.startElement(name);
+            return;
+        }
+
+        boolean isObject = name.equals("object");
+        String codebase = null;
+        if (isObject) {
+            codebase = atts.getValue("", "codebase");
+            if (codebase != null) {
+                codebase = resolve(codebase);
+            } else {
+                codebase = metadata.get(Metadata.CONTENT_LOCATION);
+            }
+        }
+
+        AttributesImpl newAttributes = new AttributesImpl(atts);
+        for (int att = 0; att < newAttributes.getLength(); att++) {
+            String attrName = newAttributes.getLocalName(att);
+            String normAttrName = mapper.mapSafeAttribute(name, attrName);
+            if (normAttrName == null) {
+                newAttributes.removeAttribute(att);
+                att--;
+            } else {
+                // We have a remapped attribute name, so set it as it might have changed.
+                newAttributes.setLocalName(att, normAttrName);
+
+                // And resolve relative links. Eventually this should be pushed
+                // into the HtmlMapper code.
+                if (URI_ATTRIBUTES.contains(normAttrName)) {
+                    newAttributes.setValue(att, resolve(newAttributes.getValue(att)));
+                } else if (isObject && "codebase".equals(normAttrName)) {
+                    newAttributes.setValue(att, codebase);
+                } else if (isObject
+                        && ("data".equals(normAttrName)
+                        || "classid".equals(normAttrName))) {
+                    newAttributes.setValue(
+                            att,
+                            resolve(codebase, newAttributes.getValue(att)));
+                }
+            }
+        }
+
+        if ("img".equals(name) && newAttributes.getValue("", "alt") == null) {
+            newAttributes.addAttribute("", "alt", "alt", "CDATA", "");
+        }
+
+        xhtml.startElement(name, newAttributes);
+    }
+
+    @Override
+    public void endElement(
+            String uri, String local, String name) throws SAXException {
+        if (bodyLevel > 0 && discardLevel == 0) {
+            String safe = mapper.mapSafeElement(name);
+            if (safe != null) {
+                xhtml.endElement(safe);
+            } else if (XHTMLContentHandler.ENDLINE.contains(
+                    name.toLowerCase(Locale.ENGLISH))) {
+                // TIKA-343: Replace closing block tags (and <br/>) with a
+                // newline unless the HtmlMapper above has already mapped
+                // them to something else
+                xhtml.newline();
+            }
+        }
+
+        if (titleLevel > 0) {
+            titleLevel--;
+            if (titleLevel == 0 && !isTitleSetToMetadata) {
+                metadata.set(TikaCoreProperties.TITLE, title.toString().trim());
+                isTitleSetToMetadata = true;
+            }
+        }
+        if (bodyLevel > 0) {
+            bodyLevel--;
+        }
+        if (discardLevel > 0) {
+            discardLevel--;
+        }
+    }
+
+    @Override
+    public void characters(char[] ch, int start, int length)
+            throws SAXException {
+        if (titleLevel > 0 && bodyLevel == 0) {
+            title.append(ch, start, length);
+        }
+        if (bodyLevel > 0 && discardLevel == 0) {
+            super.characters(ch, start, length);
+        }
+    }
+
+    @Override
+    public void ignorableWhitespace(char[] ch, int start, int length)
+            throws SAXException {
+        if (bodyLevel > 0 && discardLevel == 0) {
+            super.ignorableWhitespace(ch, start, length);
+        }
+    }
+
+    private String resolve(String url) {
+        return resolve(metadata.get(Metadata.CONTENT_LOCATION), url);
+    }
+
+    private String resolve(String base, String url) {
+        url = url.trim();
+
+        // Return the URL as-is if no base URL is available or if the URL
+        // matches a common non-hierarchical or pseudo URI prefix
+        String lower = url.toLowerCase(Locale.ENGLISH);
+        if (base == null
+                || lower.startsWith("urn:")
+                || lower.startsWith("mailto:")
+                || lower.startsWith("tel:")
+                || lower.startsWith("data:")
+                || lower.startsWith("javascript:")
+                || lower.startsWith("about:")) {
+            return url;
+        }
+
+        try {
+            URL baseURL = new URL(base.trim());
+
+            // We need to handle one special case, where the relativeUrl is
+            // just a query string (like "?pid=1"), and the baseUrl doesn't
+            // end with a '/'. In that case, the URL class removes the last
+            // portion of the path, which we don't want.
+            String path = baseURL.getPath();
+            if (url.startsWith("?") && path.length() > 0 && !path.endsWith("/")) {
+                return new URL(
+                        baseURL.getProtocol(),
+                        baseURL.getHost(), baseURL.getPort(),
+                        baseURL.getPath() + url).toExternalForm();
+            } else {
+                return new URL(baseURL, url).toExternalForm();
+            }
+        } catch (MalformedURLException e) {
+            // Unknown or broken format; just return the URL as received.
+            return url;
+        }
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/HtmlMapper.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/HtmlMapper.java b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/HtmlMapper.java
index 947d26a..1ca7434 100644
--- a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/HtmlMapper.java
+++ b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/HtmlMapper.java
@@ -1,69 +1,69 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.html;
-
-/**
- * HTML mapper used to make incoming HTML documents easier to handle by
- * Tika clients. The {@link HtmlParser} looks up an optional HTML mapper from
- * the parse context and uses it to map parsed HTML to "safe" XHTML. A client
- * that wants to customize this mapping can place a custom HtmlMapper instance
- * into the parse context.
- *
- * @since Apache Tika 0.6
- */
-public interface HtmlMapper {
-
-    /**
-     * Maps "safe" HTML element names to semantic XHTML equivalents. If the
-     * given element is unknown or deemed unsafe for inclusion in the parse
-     * output, then this method returns <code>null</code> and the element
-     * will be ignored but the content inside it is still processed. See
-     * the {@link #isDiscardElement(String)} method for a way to discard
-     * the entire contents of an element.
-     *
-     * @param name HTML element name (upper case)
-     * @return XHTML element name (lower case), or
-     * <code>null</code> if the element is unsafe
-     */
-    String mapSafeElement(String name);
-
-    /**
-     * Checks whether all content within the given HTML element should be
-     * discarded instead of including it in the parse output.
-     *
-     * @param name HTML element name (upper case)
-     * @return <code>true</code> if content inside the named element
-     * should be ignored, <code>false</code> otherwise
-     */
-    boolean isDiscardElement(String name);
-
-
-    /**
-     * Maps "safe" HTML attribute names to semantic XHTML equivalents. If the
-     * given attribute is unknown or deemed unsafe for inclusion in the parse
-     * output, then this method returns <code>null</code> and the attribute
-     * will be ignored. This method assumes that the element name
-     * is valid and normalised.
-     *
-     * @param elementName   HTML element name (lower case)
-     * @param attributeName HTML attribute name (lower case)
-     * @return XHTML attribute name (lower case), or
-     * <code>null</code> if the element is unsafe
-     */
-    String mapSafeAttribute(String elementName, String attributeName);
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html;
+
+/**
+ * HTML mapper used to make incoming HTML documents easier to handle by
+ * Tika clients. The {@link HtmlParser} looks up an optional HTML mapper from
+ * the parse context (keyed by {@code HtmlMapper.class}) and uses it to map
+ * parsed HTML to "safe" XHTML. A client that wants to customize this mapping
+ * can place a custom HtmlMapper instance into the parse context.
+ *
+ * @since Apache Tika 0.6
+ */
+public interface HtmlMapper {
+
+    /**
+     * Maps "safe" HTML element names to semantic XHTML equivalents. If the
+     * given element is unknown or deemed unsafe for inclusion in the parse
+     * output, then this method returns <code>null</code> and the element
+     * will be ignored but the content inside it is still processed. See
+     * the {@link #isDiscardElement(String)} method for a way to discard
+     * the entire contents of an element.
+     *
+     * @param name HTML element name (upper case)
+     * @return XHTML element name (lower case), or
+     * <code>null</code> if the element is unknown or unsafe
+     */
+    String mapSafeElement(String name);
+
+    /**
+     * Checks whether all content within the given HTML element should be
+     * discarded instead of including it in the parse output.
+     *
+     * @param name HTML element name (upper case)
+     * @return <code>true</code> if content inside the named element
+     * should be ignored, <code>false</code> otherwise
+     */
+    boolean isDiscardElement(String name);
+
+
+    /**
+     * Maps "safe" HTML attribute names to semantic XHTML equivalents. If the
+     * given attribute is unknown or deemed unsafe for inclusion in the parse
+     * output, then this method returns <code>null</code> and the attribute
+     * will be ignored. This method assumes that the element name
+     * is valid and normalised.
+     *
+     * @param elementName   HTML element name (lower case)
+     * @param attributeName HTML attribute name (lower case)
+     * @return XHTML attribute name (lower case), or
+     * <code>null</code> if the attribute is unknown or unsafe
+     */
+    String mapSafeAttribute(String elementName, String attributeName);
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/HtmlParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/HtmlParser.java b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/HtmlParser.java
index 7d6f021..a9a8aa0 100644
--- a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/HtmlParser.java
+++ b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/HtmlParser.java
@@ -1,194 +1,194 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.html;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.nio.charset.Charset;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.HashSet;
-import java.util.Set;
-
-import org.apache.commons.io.input.CloseShieldInputStream;
-import org.apache.tika.config.ServiceLoader;
-import org.apache.tika.detect.AutoDetectReader;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.ccil.cowan.tagsoup.HTMLSchema;
-import org.ccil.cowan.tagsoup.Schema;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * HTML parser. Uses TagSoup to turn the input document to HTML SAX events,
- * and post-processes the events to produce XHTML and metadata expected by
- * Tika clients.
- */
-public class HtmlParser extends AbstractParser {
-
-    /**
-     * Serial version UID
-     */
-    private static final long serialVersionUID = 7895315240498733128L;
-
-    private static final MediaType XHTML = MediaType.application("xhtml+xml");
-    private static final MediaType WAP_XHTML = MediaType.application("vnd.wap.xhtml+xml");
-    private static final MediaType X_ASP = MediaType.application("x-asp");
-
-    private static final Set<MediaType> SUPPORTED_TYPES =
-            Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
-                    MediaType.text("html"),
-                    XHTML,
-                    WAP_XHTML,
-                    X_ASP)));
-
-    private static final ServiceLoader LOADER =
-            new ServiceLoader(HtmlParser.class.getClassLoader());
-
-    /**
-     * HTML schema singleton used to amortise the heavy instantiation time.
-     */
-    private static final Schema HTML_SCHEMA = new HTMLSchema();
-
-
-    public Set<MediaType> getSupportedTypes(ParseContext context) {
-        return SUPPORTED_TYPES;
-    }
-
-    public void parse(
-            InputStream stream, ContentHandler handler,
-            Metadata metadata, ParseContext context)
-            throws IOException, SAXException, TikaException {
-        // Automatically detect the character encoding
-        try (AutoDetectReader reader = new AutoDetectReader(new CloseShieldInputStream(stream),
-                metadata,context.get(ServiceLoader.class, LOADER))) {
-            Charset charset = reader.getCharset();
-            String previous = metadata.get(Metadata.CONTENT_TYPE);
-            MediaType contentType = null;
-            if (previous == null || previous.startsWith("text/html")) {
-                contentType = new MediaType(MediaType.TEXT_HTML, charset);
-            } else if (previous.startsWith("application/xhtml+xml")) {
-                contentType = new MediaType(XHTML, charset);
-            } else if (previous.startsWith("application/vnd.wap.xhtml+xml")) {
-                contentType = new MediaType(WAP_XHTML, charset);
-            } else if (previous.startsWith("application/x-asp")) {
-                contentType = new MediaType(X_ASP, charset);
-            }
-            if (contentType != null) {
-                metadata.set(Metadata.CONTENT_TYPE, contentType.toString());
-            }
-            // deprecated, see TIKA-431
-            metadata.set(Metadata.CONTENT_ENCODING, charset.name());
-
-            // Get the HTML mapper from the parse context
-            HtmlMapper mapper =
-                    context.get(HtmlMapper.class, new HtmlParserMapper());
-
-            // Parse the HTML document
-            org.ccil.cowan.tagsoup.Parser parser =
-                    new org.ccil.cowan.tagsoup.Parser();
-
-            // Use schema from context or default
-            Schema schema = context.get(Schema.class, HTML_SCHEMA);
-
-            // TIKA-528: Reuse share schema to avoid heavy instantiation
-            parser.setProperty(
-                    org.ccil.cowan.tagsoup.Parser.schemaProperty, schema);
-            // TIKA-599: Shared schema is thread-safe only if bogons are ignored
-            parser.setFeature(
-                    org.ccil.cowan.tagsoup.Parser.ignoreBogonsFeature, true);
-
-            parser.setContentHandler(new XHTMLDowngradeHandler(
-                    new HtmlHandler(mapper, handler, metadata)));
-
-            parser.parse(reader.asInputSource());
-        }
-    }
-
-    /**
-     * Maps "safe" HTML element names to semantic XHTML equivalents. If the
-     * given element is unknown or deemed unsafe for inclusion in the parse
-     * output, then this method returns <code>null</code> and the element
-     * will be ignored but the content inside it is still processed. See
-     * the {@link #isDiscardElement(String)} method for a way to discard
-     * the entire contents of an element.
-     * <p/>
-     * Subclasses can override this method to customize the default mapping.
-     *
-     * @param name HTML element name (upper case)
-     * @return XHTML element name (lower case), or
-     * <code>null</code> if the element is unsafe
-     * @since Apache Tika 0.5
-     * @deprecated Use the {@link HtmlMapper} mechanism to customize
-     * the HTML mapping. This method will be removed in Tika 1.0.
-     */
-    protected String mapSafeElement(String name) {
-        return DefaultHtmlMapper.INSTANCE.mapSafeElement(name);
-    }
-
-    /**
-     * Checks whether all content within the given HTML element should be
-     * discarded instead of including it in the parse output. Subclasses
-     * can override this method to customize the set of discarded elements.
-     *
-     * @param name HTML element name (upper case)
-     * @return <code>true</code> if content inside the named element
-     * should be ignored, <code>false</code> otherwise
-     * @since Apache Tika 0.5
-     * @deprecated Use the {@link HtmlMapper} mechanism to customize
-     * the HTML mapping. This method will be removed in Tika 1.0.
-     */
-    protected boolean isDiscardElement(String name) {
-        return DefaultHtmlMapper.INSTANCE.isDiscardElement(name);
-    }
-
-    /**
-     * @deprecated Use the {@link HtmlMapper} mechanism to customize
-     * the HTML mapping. This method will be removed in Tika 1.0.
-     */
-    public String mapSafeAttribute(String elementName, String attributeName) {
-        return DefaultHtmlMapper.INSTANCE.mapSafeAttribute(elementName, attributeName);
-    }
-
-    /**
-     * Adapter class that maintains backwards compatibility with the
-     * protected HtmlParser methods. Making HtmlParser implement HtmlMapper
-     * directly would require those methods to be public, which would break
-     * backwards compatibility with subclasses.
-     *
-     * @deprecated Use the {@link HtmlMapper} mechanism to customize
-     * the HTML mapping. This class will be removed in Tika 1.0.
-     */
-    private class HtmlParserMapper implements HtmlMapper {
-        public String mapSafeElement(String name) {
-            return HtmlParser.this.mapSafeElement(name);
-        }
-
-        public boolean isDiscardElement(String name) {
-            return HtmlParser.this.isDiscardElement(name);
-        }
-
-        public String mapSafeAttribute(String elementName, String attributeName) {
-            return HtmlParser.this.mapSafeAttribute(elementName, attributeName);
-        }
-    }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.Charset;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.config.ServiceLoader;
+import org.apache.tika.detect.AutoDetectReader;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.ccil.cowan.tagsoup.HTMLSchema;
+import org.ccil.cowan.tagsoup.Schema;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * HTML parser. Uses TagSoup to turn the input document into HTML SAX events, and post-processes
+ * them to produce the XHTML and metadata expected by Tika clients. The {@link HtmlMapper} and
+ * TagSoup {@link Schema} are taken from the {@link ParseContext} when present, else defaulted.
+ */
+public class HtmlParser extends AbstractParser {
+
+    /**
+     * Serial version UID
+     */
+    private static final long serialVersionUID = 7895315240498733128L;
+
+    private static final MediaType XHTML = MediaType.application("xhtml+xml");
+    private static final MediaType WAP_XHTML = MediaType.application("vnd.wap.xhtml+xml");
+    private static final MediaType X_ASP = MediaType.application("x-asp");
+
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+                    MediaType.text("html"),
+                    XHTML,
+                    WAP_XHTML,
+                    X_ASP)));
+
+    private static final ServiceLoader LOADER =
+            new ServiceLoader(HtmlParser.class.getClassLoader());
+
+    /**
+     * Default HTML schema, shared as a singleton to amortise its heavy instantiation time.
+     */
+    private static final Schema HTML_SCHEMA = new HTMLSchema();
+
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        // Automatically detect the character encoding (presumably the shield keeps the caller's stream open)
+        try (AutoDetectReader reader = new AutoDetectReader(new CloseShieldInputStream(stream),
+                metadata,context.get(ServiceLoader.class, LOADER))) {
+            Charset charset = reader.getCharset();
+            String previous = metadata.get(Metadata.CONTENT_TYPE);
+            MediaType contentType = null;
+            if (previous == null || previous.startsWith("text/html")) {
+                contentType = new MediaType(MediaType.TEXT_HTML, charset);
+            } else if (previous.startsWith("application/xhtml+xml")) {
+                contentType = new MediaType(XHTML, charset);
+            } else if (previous.startsWith("application/vnd.wap.xhtml+xml")) {
+                contentType = new MediaType(WAP_XHTML, charset);
+            } else if (previous.startsWith("application/x-asp")) {
+                contentType = new MediaType(X_ASP, charset);
+            }
+            if (contentType != null) {
+                metadata.set(Metadata.CONTENT_TYPE, contentType.toString());
+            }
+            // deprecated way of reporting the detected charset, see TIKA-431
+            metadata.set(Metadata.CONTENT_ENCODING, charset.name());
+
+            // Get the HTML mapper from the parse context, defaulting to the backwards-compatible adapter
+            HtmlMapper mapper =
+                    context.get(HtmlMapper.class, new HtmlParserMapper());
+
+            // Set up the TagSoup parser for the HTML document
+            org.ccil.cowan.tagsoup.Parser parser =
+                    new org.ccil.cowan.tagsoup.Parser();
+
+            // Use schema from context or default
+            Schema schema = context.get(Schema.class, HTML_SCHEMA);
+
+            // TIKA-528: Reuse shared schema to avoid heavy instantiation
+            parser.setProperty(
+                    org.ccil.cowan.tagsoup.Parser.schemaProperty, schema);
+            // TIKA-599: Shared schema is thread-safe only if bogons are ignored
+            parser.setFeature(
+                    org.ccil.cowan.tagsoup.Parser.ignoreBogonsFeature, true);
+
+            parser.setContentHandler(new XHTMLDowngradeHandler(
+                    new HtmlHandler(mapper, handler, metadata)));
+
+            parser.parse(reader.asInputSource());
+        }
+    }
+
+    /**
+     * Maps "safe" HTML element names to semantic XHTML equivalents. If the
+     * given element is unknown or deemed unsafe for inclusion in the parse
+     * output, then this method returns <code>null</code> and the element
+     * will be ignored but the content inside it is still processed. See
+     * the {@link #isDiscardElement(String)} method for a way to discard
+     * the entire contents of an element.
+     * <p>
+     * Subclasses can override this method to customize the default mapping.
+     *
+     * @param name HTML element name (upper case)
+     * @return XHTML element name (lower case), or
+     * <code>null</code> if the element is unknown or unsafe
+     * @since Apache Tika 0.5
+     * @deprecated Use the {@link HtmlMapper} mechanism to customize
+     * the HTML mapping. This method will be removed in Tika 1.0.
+     */
+    protected String mapSafeElement(String name) {
+        return DefaultHtmlMapper.INSTANCE.mapSafeElement(name);
+    }
+
+    /**
+     * Checks whether all content within the given HTML element should be
+     * discarded instead of including it in the parse output. Subclasses
+     * can override this method to customize the set of discarded elements.
+     *
+     * @param name HTML element name (upper case)
+     * @return <code>true</code> if content inside the named element
+     * should be ignored, <code>false</code> otherwise
+     * @since Apache Tika 0.5
+     * @deprecated Use the {@link HtmlMapper} mechanism to customize
+     * the HTML mapping. This method will be removed in Tika 1.0.
+     */
+    protected boolean isDiscardElement(String name) {
+        return DefaultHtmlMapper.INSTANCE.isDiscardElement(name);
+    }
+
+    /**
+     * Maps an attribute name by delegating to {@link DefaultHtmlMapper#INSTANCE}.
+     * @deprecated Use the {@link HtmlMapper} mechanism instead. This method will be removed in Tika 1.0.
+     */
+    public String mapSafeAttribute(String elementName, String attributeName) {
+        return DefaultHtmlMapper.INSTANCE.mapSafeAttribute(elementName, attributeName);
+    }
+
+    /**
+     * Adapter class that maintains backwards compatibility with the
+     * protected HtmlParser methods. Making HtmlParser implement HtmlMapper
+     * directly would require those methods to be public, which would break
+     * backwards compatibility with subclasses.
+     *
+     * @deprecated Use the {@link HtmlMapper} mechanism to customize
+     * the HTML mapping. This class will be removed in Tika 1.0.
+     */
+    private class HtmlParserMapper implements HtmlMapper {
+        public String mapSafeElement(String name) {
+            return HtmlParser.this.mapSafeElement(name);
+        }
+
+        public boolean isDiscardElement(String name) {
+            return HtmlParser.this.isDiscardElement(name);
+        }
+
+        public String mapSafeAttribute(String elementName, String attributeName) {
+            return HtmlParser.this.mapSafeAttribute(elementName, attributeName);
+        }
+    }
+
+}