You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/06/29 11:11:10 UTC
[04/39] tika git commit: Convert new lines from windows to unix
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java
index f43fdc0..4d5cc46 100644
--- a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java
+++ b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/BoilerpipeContentHandler.java
@@ -1,347 +1,347 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.html;
-
-import java.io.Writer;
-import java.util.ArrayList;
-import java.util.BitSet;
-import java.util.List;
-import java.util.Locale;
-
-import de.l3s.boilerpipe.BoilerpipeExtractor;
-import de.l3s.boilerpipe.BoilerpipeProcessingException;
-import de.l3s.boilerpipe.document.TextBlock;
-import de.l3s.boilerpipe.document.TextDocument;
-import de.l3s.boilerpipe.extractors.ArticleExtractor;
-import de.l3s.boilerpipe.extractors.DefaultExtractor;
-import de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.sax.WriteOutContentHandler;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.Attributes;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.AttributesImpl;
-
-/**
- * Uses the <a href="http://code.google.com/p/boilerpipe/">boilerpipe</a>
- * library to automatically extract the main content from a web page.
- * <p/>
- * Use this as a {@link ContentHandler} object passed to
- * {@link HtmlParser#parse(java.io.InputStream, ContentHandler, Metadata, org.apache.tika.parser.ParseContext)}
- */
-public class BoilerpipeContentHandler extends BoilerpipeHTMLContentHandler {
-
- /**
- * The newline character that gets inserted after block elements.
- */
- private static final char[] NL = new char[]{'\n'};
- private ContentHandler delegate;
- private BoilerpipeExtractor extractor;
- private boolean includeMarkup;
- private boolean inHeader;
- private boolean inFooter;
- private int headerCharOffset;
- private List<RecordedElement> elements;
- private TextDocument td;
- /**
- * Creates a new boilerpipe-based content extractor, using the
- * {@link DefaultExtractor} extraction rules and "delegate" as the content handler.
- *
- * @param delegate The {@link ContentHandler} object
- */
- public BoilerpipeContentHandler(ContentHandler delegate) {
- this(delegate, DefaultExtractor.INSTANCE);
- }
-
- /**
- * Creates a content handler that writes XHTML body character events to
- * the given writer.
- *
- * @param writer writer
- */
- public BoilerpipeContentHandler(Writer writer) {
- this(new WriteOutContentHandler(writer));
- }
-
- /**
- * Creates a new boilerpipe-based content extractor, using the given
- * extraction rules. The extracted main content will be passed to the
- * <delegate> content handler.
- *
- * @param delegate The {@link ContentHandler} object
- * @param extractor Extraction rules to use, e.g. {@link ArticleExtractor}
- */
- public BoilerpipeContentHandler(ContentHandler delegate, BoilerpipeExtractor extractor) {
- this.td = null;
- this.delegate = delegate;
- this.extractor = extractor;
- }
-
- public boolean isIncludeMarkup() {
- return includeMarkup;
- }
-
- public void setIncludeMarkup(boolean includeMarkup) {
- this.includeMarkup = includeMarkup;
- }
-
- /**
- * Retrieves the built TextDocument
- *
- * @return TextDocument
- */
- public TextDocument getTextDocument() {
- return td;
- }
-
- @Override
- public void startDocument() throws SAXException {
- super.startDocument();
-
- delegate.startDocument();
-
- inHeader = true;
- inFooter = false;
- headerCharOffset = 0;
-
- if (includeMarkup) {
- elements = new ArrayList<RecordedElement>();
- }
- }
-
- @Override
- public void startPrefixMapping(String prefix, String uri) throws SAXException {
- super.startPrefixMapping(prefix, uri);
- delegate.startPrefixMapping(prefix, uri);
- }
-
- ;
-
- @Override
- public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
- super.startElement(uri, localName, qName, atts);
-
- if (inHeader) {
- delegate.startElement(uri, localName, qName, atts);
- } else if (inFooter) {
- // Do nothing
- } else if (includeMarkup) {
- elements.add(new RecordedElement(uri, localName, qName, atts));
- } else {
- // This happens for the <body> element, if we're not doing markup.
- delegate.startElement(uri, localName, qName, atts);
- }
- }
-
- ;
-
- @Override
- public void characters(char[] chars, int offset, int length) throws SAXException {
- super.characters(chars, offset, length);
-
- if (inHeader) {
- delegate.characters(chars, offset, length);
- headerCharOffset++;
- } else if (inFooter) {
- // Do nothing
- } else if (includeMarkup) {
- RecordedElement element = elements.get(elements.size() - 1);
-
- char[] characters = new char[length];
- System.arraycopy(chars, offset, characters, 0, length);
- element.getCharacters().add(characters);
- }
- }
-
- ;
-
- @Override
- public void endElement(String uri, String localName, String qName) throws SAXException {
- super.endElement(uri, localName, qName);
-
- if (inHeader) {
- delegate.endElement(uri, localName, qName);
- inHeader = !localName.equals("head");
- } else if (inFooter) {
- // Do nothing
- } else if (localName.equals("body")) {
- inFooter = true;
- } else if (includeMarkup) {
- // Add the end element, and the continuation from the previous element
- elements.add(new RecordedElement(uri, localName, qName));
- elements.add(new RecordedElement());
- }
- }
-
- ;
-
- @Override
- public void endDocument() throws SAXException {
- super.endDocument();
-
- td = toTextDocument();
- try {
- extractor.process(td);
- } catch (BoilerpipeProcessingException e) {
- throw new SAXException(e);
- }
-
- Attributes emptyAttrs = new AttributesImpl();
-
- // At this point we have all the information we need to either emit N paragraphs
- // of plain text (if not including markup), or we have to replay our recorded elements
- // and only emit character runs that passed the boilerpipe filters.
- if (includeMarkup) {
- BitSet validCharacterRuns = new BitSet();
- for (TextBlock block : td.getTextBlocks()) {
- if (block.isContent()) {
- BitSet bs = block.getContainedTextElements();
- if (bs != null) {
- validCharacterRuns.or(bs);
- }
- }
- }
-
- // Now have bits set for all valid character runs. Replay our recorded elements,
- // but only emit character runs flagged as valid.
- int curCharsIndex = headerCharOffset;
-
- for (RecordedElement element : elements) {
- switch (element.getElementType()) {
- case START:
- delegate.startElement(element.getUri(), element.getLocalName(), element.getQName(), element.getAttrs());
- // Fall through
-
- case CONTINUE:
- // Now emit characters that are valid. Note that boilerpipe pre-increments the character index, so
- // we have to follow suit.
- for (char[] chars : element.getCharacters()) {
- curCharsIndex++;
-
- if (validCharacterRuns.get(curCharsIndex)) {
- delegate.characters(chars, 0, chars.length);
-
- // https://issues.apache.org/jira/browse/TIKA-961
- if (!Character.isWhitespace(chars[chars.length - 1])) {
- // Only add whitespace for certain elements
- if (XHTMLContentHandler.ENDLINE.contains(element.getLocalName())) {
- delegate.ignorableWhitespace(NL, 0, NL.length);
- }
- }
- }
- }
- break;
-
- case END:
- delegate.endElement(element.getUri(), element.getLocalName(), element.getQName());
- break;
-
- default:
- throw new RuntimeException("Unhandled element type: " + element.getElementType());
- }
-
-
- }
- } else {
- for (TextBlock block : td.getTextBlocks()) {
- if (block.isContent()) {
- delegate.startElement(XHTMLContentHandler.XHTML, "p", "p", emptyAttrs);
- char[] chars = block.getText().toCharArray();
- delegate.characters(chars, 0, chars.length);
- delegate.endElement(XHTMLContentHandler.XHTML, "p", "p");
- delegate.ignorableWhitespace(NL, 0, NL.length);
- }
- }
- }
-
- delegate.endElement(XHTMLContentHandler.XHTML, "body", "body");
- delegate.endElement(XHTMLContentHandler.XHTML, "html", "html");
-
- // We defer ending any prefix mapping until here, which is why we don't pass this
- // through to the delegate in an overridden method.
- delegate.endPrefixMapping("");
-
- delegate.endDocument();
- }
-
- ;
-
- private static class RecordedElement {
- private String uri;
- private String localName;
- private String qName;
- private Attributes attrs;
- private List<char[]> characters;
- private ElementType elementType;
- public RecordedElement(String uri, String localName, String qName, Attributes attrs) {
- this(uri, localName, qName, attrs, ElementType.START);
- }
-
- public RecordedElement(String uri, String localName, String qName) {
- this(uri, localName, qName, null, ElementType.END);
- }
-
- public RecordedElement() {
- this(null, null, null, null, ElementType.CONTINUE);
- }
-
- protected RecordedElement(String uri, String localName, String qName, Attributes attrs, RecordedElement.ElementType elementType) {
- this.uri = uri;
- this.localName = localName;
- this.qName = qName;
- this.attrs = attrs;
- this.elementType = elementType;
- this.characters = new ArrayList<char[]>();
- }
-
- @Override
- public String toString() {
- return String.format(Locale.ROOT, "<%s> of type %s", localName, elementType);
- }
-
- public String getUri() {
- return uri;
- }
-
- public String getLocalName() {
- return localName;
- }
-
- public String getQName() {
- return qName;
- }
-
- public Attributes getAttrs() {
- return attrs;
- }
-
- public List<char[]> getCharacters() {
- return characters;
- }
-
- public RecordedElement.ElementType getElementType() {
- return elementType;
- }
-
- public enum ElementType {
- START,
- END,
- CONTINUE
- }
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html;
+
+import java.io.Writer;
+import java.util.ArrayList;
+import java.util.BitSet;
+import java.util.List;
+import java.util.Locale;
+
+import de.l3s.boilerpipe.BoilerpipeExtractor;
+import de.l3s.boilerpipe.BoilerpipeProcessingException;
+import de.l3s.boilerpipe.document.TextBlock;
+import de.l3s.boilerpipe.document.TextDocument;
+import de.l3s.boilerpipe.extractors.ArticleExtractor;
+import de.l3s.boilerpipe.extractors.DefaultExtractor;
+import de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.WriteOutContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+/**
+ * Uses the <a href="http://code.google.com/p/boilerpipe/">boilerpipe</a>
+ * library to automatically extract the main content from a web page.
+ * <p/>
+ * Use this as a {@link ContentHandler} object passed to
+ * {@link HtmlParser#parse(java.io.InputStream, ContentHandler, Metadata, org.apache.tika.parser.ParseContext)}
+ */
+public class BoilerpipeContentHandler extends BoilerpipeHTMLContentHandler {
+
+ /**
+ * The newline character that gets inserted after block elements.
+ */
+ private static final char[] NL = new char[]{'\n'};
+ private ContentHandler delegate;
+ private BoilerpipeExtractor extractor;
+ private boolean includeMarkup;
+ private boolean inHeader;
+ private boolean inFooter;
+ private int headerCharOffset;
+ private List<RecordedElement> elements;
+ private TextDocument td;
+ /**
+ * Creates a new boilerpipe-based content extractor, using the
+ * {@link DefaultExtractor} extraction rules and "delegate" as the content handler.
+ *
+ * @param delegate The {@link ContentHandler} object
+ */
+ public BoilerpipeContentHandler(ContentHandler delegate) {
+ this(delegate, DefaultExtractor.INSTANCE);
+ }
+
+ /**
+ * Creates a content handler that writes XHTML body character events to
+ * the given writer.
+ *
+ * @param writer writer
+ */
+ public BoilerpipeContentHandler(Writer writer) {
+ this(new WriteOutContentHandler(writer));
+ }
+
+ /**
+ * Creates a new boilerpipe-based content extractor, using the given
+ * extraction rules. The extracted main content will be passed to the
+ * <delegate> content handler.
+ *
+ * @param delegate The {@link ContentHandler} object
+ * @param extractor Extraction rules to use, e.g. {@link ArticleExtractor}
+ */
+ public BoilerpipeContentHandler(ContentHandler delegate, BoilerpipeExtractor extractor) {
+ this.td = null;
+ this.delegate = delegate;
+ this.extractor = extractor;
+ }
+
+ public boolean isIncludeMarkup() {
+ return includeMarkup;
+ }
+
+ public void setIncludeMarkup(boolean includeMarkup) {
+ this.includeMarkup = includeMarkup;
+ }
+
+ /**
+ * Retrieves the built TextDocument
+ *
+ * @return TextDocument
+ */
+ public TextDocument getTextDocument() {
+ return td;
+ }
+
+ @Override
+ public void startDocument() throws SAXException {
+ super.startDocument();
+
+ delegate.startDocument();
+
+ inHeader = true;
+ inFooter = false;
+ headerCharOffset = 0;
+
+ if (includeMarkup) {
+ elements = new ArrayList<RecordedElement>();
+ }
+ }
+
+ @Override
+ public void startPrefixMapping(String prefix, String uri) throws SAXException {
+ super.startPrefixMapping(prefix, uri);
+ delegate.startPrefixMapping(prefix, uri);
+ }
+
+ ;
+
+ @Override
+ public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
+ super.startElement(uri, localName, qName, atts);
+
+ if (inHeader) {
+ delegate.startElement(uri, localName, qName, atts);
+ } else if (inFooter) {
+ // Do nothing
+ } else if (includeMarkup) {
+ elements.add(new RecordedElement(uri, localName, qName, atts));
+ } else {
+ // This happens for the <body> element, if we're not doing markup.
+ delegate.startElement(uri, localName, qName, atts);
+ }
+ }
+
+ ;
+
+ @Override
+ public void characters(char[] chars, int offset, int length) throws SAXException {
+ super.characters(chars, offset, length);
+
+ if (inHeader) {
+ delegate.characters(chars, offset, length);
+ headerCharOffset++;
+ } else if (inFooter) {
+ // Do nothing
+ } else if (includeMarkup) {
+ RecordedElement element = elements.get(elements.size() - 1);
+
+ char[] characters = new char[length];
+ System.arraycopy(chars, offset, characters, 0, length);
+ element.getCharacters().add(characters);
+ }
+ }
+
+ ;
+
+ @Override
+ public void endElement(String uri, String localName, String qName) throws SAXException {
+ super.endElement(uri, localName, qName);
+
+ if (inHeader) {
+ delegate.endElement(uri, localName, qName);
+ inHeader = !localName.equals("head");
+ } else if (inFooter) {
+ // Do nothing
+ } else if (localName.equals("body")) {
+ inFooter = true;
+ } else if (includeMarkup) {
+ // Add the end element, and the continuation from the previous element
+ elements.add(new RecordedElement(uri, localName, qName));
+ elements.add(new RecordedElement());
+ }
+ }
+
+ ;
+
+ @Override
+ public void endDocument() throws SAXException {
+ super.endDocument();
+
+ td = toTextDocument();
+ try {
+ extractor.process(td);
+ } catch (BoilerpipeProcessingException e) {
+ throw new SAXException(e);
+ }
+
+ Attributes emptyAttrs = new AttributesImpl();
+
+ // At this point we have all the information we need to either emit N paragraphs
+ // of plain text (if not including markup), or we have to replay our recorded elements
+ // and only emit character runs that passed the boilerpipe filters.
+ if (includeMarkup) {
+ BitSet validCharacterRuns = new BitSet();
+ for (TextBlock block : td.getTextBlocks()) {
+ if (block.isContent()) {
+ BitSet bs = block.getContainedTextElements();
+ if (bs != null) {
+ validCharacterRuns.or(bs);
+ }
+ }
+ }
+
+ // Now have bits set for all valid character runs. Replay our recorded elements,
+ // but only emit character runs flagged as valid.
+ int curCharsIndex = headerCharOffset;
+
+ for (RecordedElement element : elements) {
+ switch (element.getElementType()) {
+ case START:
+ delegate.startElement(element.getUri(), element.getLocalName(), element.getQName(), element.getAttrs());
+ // Fall through
+
+ case CONTINUE:
+ // Now emit characters that are valid. Note that boilerpipe pre-increments the character index, so
+ // we have to follow suit.
+ for (char[] chars : element.getCharacters()) {
+ curCharsIndex++;
+
+ if (validCharacterRuns.get(curCharsIndex)) {
+ delegate.characters(chars, 0, chars.length);
+
+ // https://issues.apache.org/jira/browse/TIKA-961
+ if (!Character.isWhitespace(chars[chars.length - 1])) {
+ // Only add whitespace for certain elements
+ if (XHTMLContentHandler.ENDLINE.contains(element.getLocalName())) {
+ delegate.ignorableWhitespace(NL, 0, NL.length);
+ }
+ }
+ }
+ }
+ break;
+
+ case END:
+ delegate.endElement(element.getUri(), element.getLocalName(), element.getQName());
+ break;
+
+ default:
+ throw new RuntimeException("Unhandled element type: " + element.getElementType());
+ }
+
+
+ }
+ } else {
+ for (TextBlock block : td.getTextBlocks()) {
+ if (block.isContent()) {
+ delegate.startElement(XHTMLContentHandler.XHTML, "p", "p", emptyAttrs);
+ char[] chars = block.getText().toCharArray();
+ delegate.characters(chars, 0, chars.length);
+ delegate.endElement(XHTMLContentHandler.XHTML, "p", "p");
+ delegate.ignorableWhitespace(NL, 0, NL.length);
+ }
+ }
+ }
+
+ delegate.endElement(XHTMLContentHandler.XHTML, "body", "body");
+ delegate.endElement(XHTMLContentHandler.XHTML, "html", "html");
+
+ // We defer ending any prefix mapping until here, which is why we don't pass this
+ // through to the delegate in an overridden method.
+ delegate.endPrefixMapping("");
+
+ delegate.endDocument();
+ }
+
+ ;
+
+ private static class RecordedElement {
+ private String uri;
+ private String localName;
+ private String qName;
+ private Attributes attrs;
+ private List<char[]> characters;
+ private ElementType elementType;
+ public RecordedElement(String uri, String localName, String qName, Attributes attrs) {
+ this(uri, localName, qName, attrs, ElementType.START);
+ }
+
+ public RecordedElement(String uri, String localName, String qName) {
+ this(uri, localName, qName, null, ElementType.END);
+ }
+
+ public RecordedElement() {
+ this(null, null, null, null, ElementType.CONTINUE);
+ }
+
+ protected RecordedElement(String uri, String localName, String qName, Attributes attrs, RecordedElement.ElementType elementType) {
+ this.uri = uri;
+ this.localName = localName;
+ this.qName = qName;
+ this.attrs = attrs;
+ this.elementType = elementType;
+ this.characters = new ArrayList<char[]>();
+ }
+
+ @Override
+ public String toString() {
+ return String.format(Locale.ROOT, "<%s> of type %s", localName, elementType);
+ }
+
+ public String getUri() {
+ return uri;
+ }
+
+ public String getLocalName() {
+ return localName;
+ }
+
+ public String getQName() {
+ return qName;
+ }
+
+ public Attributes getAttrs() {
+ return attrs;
+ }
+
+ public List<char[]> getCharacters() {
+ return characters;
+ }
+
+ public RecordedElement.ElementType getElementType() {
+ return elementType;
+ }
+
+ public enum ElementType {
+ START,
+ END,
+ CONTINUE
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java
index 0cef05f..4217ac5 100644
--- a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java
+++ b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/DefaultHtmlMapper.java
@@ -1,137 +1,137 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.html;
-
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Map;
-import java.util.Set;
-
-/**
- * The default HTML mapping rules in Tika.
- *
- * @since Apache Tika 0.6
- */
-@SuppressWarnings("serial")
-public class DefaultHtmlMapper implements HtmlMapper {
-
- /**
- * @since Apache Tika 0.8
- */
- public static final HtmlMapper INSTANCE = new DefaultHtmlMapper();
- // Based on http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd
- private static final Map<String, String> SAFE_ELEMENTS = new HashMap<String, String>() {{
- put("H1", "h1");
- put("H2", "h2");
- put("H3", "h3");
- put("H4", "h4");
- put("H5", "h5");
- put("H6", "h6");
-
- put("P", "p");
- put("PRE", "pre");
- put("BLOCKQUOTE", "blockquote");
- put("Q", "q");
-
- put("UL", "ul");
- put("OL", "ol");
- put("MENU", "ul");
- put("LI", "li");
- put("DL", "dl");
- put("DT", "dt");
- put("DD", "dd");
-
- put("TABLE", "table");
- put("THEAD", "thead");
- put("TBODY", "tbody");
- put("TR", "tr");
- put("TH", "th");
- put("TD", "td");
-
- put("ADDRESS", "address");
-
- // TIKA-460 - add anchors
- put("A", "a");
-
- // TIKA-463 - add additional elements that contain URLs (and their sub-elements)
- put("MAP", "map");
- put("AREA", "area");
- put("IMG", "img");
- put("FRAMESET", "frameset");
- put("FRAME", "frame");
- put("IFRAME", "iframe");
- put("OBJECT", "object");
- put("PARAM", "param");
- put("INS", "ins");
- put("DEL", "del");
- }};
- private static final Set<String> DISCARDABLE_ELEMENTS = new HashSet<String>() {{
- add("STYLE");
- add("SCRIPT");
- }};
- // For information on tags & attributes, see:
- // http://www.w3.org/TR/2002/REC-xhtml1-20020801/dtds.html#a_dtd_XHTML-1.0-Strict
- // http://www.w3schools.com/TAGS/
- private static final Map<String, Set<String>> SAFE_ATTRIBUTES = new HashMap<String, Set<String>>() {{
- put("a", attrSet("charset", "type", "name", "href", "hreflang", "rel", "rev", "shape", "coords"));
- put("img", attrSet("src", "alt", "longdesc", "height", "width", "usemap", "ismap"));
- put("frame", attrSet("longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling"));
- put("iframe", attrSet("longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width"));
- put("link", attrSet("charset", "href", "hreflang", "type", "rel", "rev", "media"));
- put("map", attrSet("id", "class", "style", "title", "name"));
- put("area", attrSet("shape", "coords", "href", "nohref", "alt"));
- put("object", attrSet("declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height",
- "width", "usemap", "name", "tabindex", "align", "border", "hspace", "vspace"));
- put("param", attrSet("id", "name", "value", "valuetype", "type"));
- put("blockquote", attrSet("cite"));
- put("ins", attrSet("cite", "datetime"));
- put("del", attrSet("cite", "datetime"));
- put("q", attrSet("cite"));
-
- // TODO - fill out this set. Include core, i18n, etc sets where appropriate.
- }};
-
- private static Set<String> attrSet(String... attrs) {
- Set<String> result = new HashSet<String>();
- for (String attr : attrs) {
- result.add(attr);
- }
- return result;
- }
-
- public String mapSafeElement(String name) {
- return SAFE_ELEMENTS.get(name);
- }
-
- /**
- * Normalizes an attribute name. Assumes that the element name
- * is valid and normalized
- */
- public String mapSafeAttribute(String elementName, String attributeName) {
- Set<String> safeAttrs = SAFE_ATTRIBUTES.get(elementName);
- if ((safeAttrs != null) && safeAttrs.contains(attributeName)) {
- return attributeName;
- } else {
- return null;
- }
- }
-
- public boolean isDiscardElement(String name) {
- return DISCARDABLE_ELEMENTS.contains(name);
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html;
+
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * The default HTML mapping rules in Tika.
+ *
+ * @since Apache Tika 0.6
+ */
+@SuppressWarnings("serial")
+public class DefaultHtmlMapper implements HtmlMapper {
+
+ /**
+ * @since Apache Tika 0.8
+ */
+ public static final HtmlMapper INSTANCE = new DefaultHtmlMapper();
+ // Based on http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd
+ private static final Map<String, String> SAFE_ELEMENTS = new HashMap<String, String>() {{
+ put("H1", "h1");
+ put("H2", "h2");
+ put("H3", "h3");
+ put("H4", "h4");
+ put("H5", "h5");
+ put("H6", "h6");
+
+ put("P", "p");
+ put("PRE", "pre");
+ put("BLOCKQUOTE", "blockquote");
+ put("Q", "q");
+
+ put("UL", "ul");
+ put("OL", "ol");
+ put("MENU", "ul");
+ put("LI", "li");
+ put("DL", "dl");
+ put("DT", "dt");
+ put("DD", "dd");
+
+ put("TABLE", "table");
+ put("THEAD", "thead");
+ put("TBODY", "tbody");
+ put("TR", "tr");
+ put("TH", "th");
+ put("TD", "td");
+
+ put("ADDRESS", "address");
+
+ // TIKA-460 - add anchors
+ put("A", "a");
+
+ // TIKA-463 - add additional elements that contain URLs (and their sub-elements)
+ put("MAP", "map");
+ put("AREA", "area");
+ put("IMG", "img");
+ put("FRAMESET", "frameset");
+ put("FRAME", "frame");
+ put("IFRAME", "iframe");
+ put("OBJECT", "object");
+ put("PARAM", "param");
+ put("INS", "ins");
+ put("DEL", "del");
+ }};
+ private static final Set<String> DISCARDABLE_ELEMENTS = new HashSet<String>() {{
+ add("STYLE");
+ add("SCRIPT");
+ }};
+ // For information on tags & attributes, see:
+ // http://www.w3.org/TR/2002/REC-xhtml1-20020801/dtds.html#a_dtd_XHTML-1.0-Strict
+ // http://www.w3schools.com/TAGS/
+ private static final Map<String, Set<String>> SAFE_ATTRIBUTES = new HashMap<String, Set<String>>() {{
+ put("a", attrSet("charset", "type", "name", "href", "hreflang", "rel", "rev", "shape", "coords"));
+ put("img", attrSet("src", "alt", "longdesc", "height", "width", "usemap", "ismap"));
+ put("frame", attrSet("longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling"));
+ put("iframe", attrSet("longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width"));
+ put("link", attrSet("charset", "href", "hreflang", "type", "rel", "rev", "media"));
+ put("map", attrSet("id", "class", "style", "title", "name"));
+ put("area", attrSet("shape", "coords", "href", "nohref", "alt"));
+ put("object", attrSet("declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height",
+ "width", "usemap", "name", "tabindex", "align", "border", "hspace", "vspace"));
+ put("param", attrSet("id", "name", "value", "valuetype", "type"));
+ put("blockquote", attrSet("cite"));
+ put("ins", attrSet("cite", "datetime"));
+ put("del", attrSet("cite", "datetime"));
+ put("q", attrSet("cite"));
+
+ // TODO - fill out this set. Include core, i18n, etc sets where appropriate.
+ }};
+
+ private static Set<String> attrSet(String... attrs) {
+ Set<String> result = new HashSet<String>();
+ for (String attr : attrs) {
+ result.add(attr);
+ }
+ return result;
+ }
+
+ public String mapSafeElement(String name) {
+ return SAFE_ELEMENTS.get(name);
+ }
+
+ /**
+ * Normalizes an attribute name. Assumes that the element name
+ * is valid and normalized
+ */
+ public String mapSafeAttribute(String elementName, String attributeName) {
+ Set<String> safeAttrs = SAFE_ATTRIBUTES.get(elementName);
+ if ((safeAttrs != null) && safeAttrs.contains(attributeName)) {
+ return attributeName;
+ } else {
+ return null;
+ }
+ }
+
+ public boolean isDiscardElement(String name) {
+ return DISCARDABLE_ELEMENTS.contains(name);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
index c5bbc7a..d5dfaa6 100644
--- a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
+++ b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
@@ -1,309 +1,309 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.html;
-
-import java.net.MalformedURLException;
-import java.net.URL;
-import java.util.Arrays;
-import java.util.HashSet;
-import java.util.Locale;
-import java.util.Set;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.sax.TextContentHandler;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.Attributes;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.AttributesImpl;
-
-class HtmlHandler extends TextContentHandler {
-
- // List of attributes that need to be resolved.
- private static final Set<String> URI_ATTRIBUTES =
- new HashSet<String>(Arrays.asList("src", "href", "longdesc", "cite"));
- private static final Pattern ICBM =
- Pattern.compile("\\s*(-?\\d+\\.\\d+)[,\\s]+(-?\\d+\\.\\d+)\\s*");
- private final HtmlMapper mapper;
- private final XHTMLContentHandler xhtml;
- private final Metadata metadata;
- private final StringBuilder title = new StringBuilder();
- private int bodyLevel = 0;
- private int discardLevel = 0;
- private int titleLevel = 0;
- private boolean isTitleSetToMetadata = false;
-
- private HtmlHandler(
- HtmlMapper mapper, XHTMLContentHandler xhtml, Metadata metadata) {
- super(xhtml);
- this.mapper = mapper;
- this.xhtml = xhtml;
- this.metadata = metadata;
-
- // Try to determine the default base URL, if one has not been given
- if (metadata.get(Metadata.CONTENT_LOCATION) == null) {
- String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
- if (name != null) {
- name = name.trim();
- try {
- new URL(name); // test URL format
- metadata.set(Metadata.CONTENT_LOCATION, name);
- } catch (MalformedURLException e) {
- // The resource name is not a valid URL, ignore it
- }
- }
- }
- }
-
- public HtmlHandler(
- HtmlMapper mapper, ContentHandler handler, Metadata metadata) {
- this(mapper, new XHTMLContentHandler(handler, metadata), metadata);
- }
-
- @Override
- public void startElement(
- String uri, String local, String name, Attributes atts)
- throws SAXException {
- if ("TITLE".equals(name) || titleLevel > 0) {
- titleLevel++;
- }
- if ("BODY".equals(name) || ("FRAMESET".equals(name)) || bodyLevel > 0) {
- bodyLevel++;
- }
- if (mapper.isDiscardElement(name) || discardLevel > 0) {
- discardLevel++;
- }
-
- if (bodyLevel == 0 && discardLevel == 0) {
- if ("META".equals(name) && atts.getValue("content") != null) {
- // TIKA-478: For cases where we have either a name or
- // "http-equiv", assume that XHTMLContentHandler will emit
- // these in the <head>, thus passing them through safely.
- if (atts.getValue("http-equiv") != null) {
- addHtmlMetadata(
- atts.getValue("http-equiv"),
- atts.getValue("content"));
- } else if (atts.getValue("name") != null) {
- // Record the meta tag in the metadata
- addHtmlMetadata(
- atts.getValue("name"),
- atts.getValue("content"));
- } else if (atts.getValue("property") != null) {
- // TIKA-983: Handle <meta property="og:xxx" content="yyy" /> tags
- metadata.add(
- atts.getValue("property"),
- atts.getValue("content"));
- }
- } else if ("BASE".equals(name) && atts.getValue("href") != null) {
- startElementWithSafeAttributes("base", atts);
- xhtml.endElement("base");
- metadata.set(
- Metadata.CONTENT_LOCATION,
- resolve(atts.getValue("href")));
- } else if ("LINK".equals(name)) {
- startElementWithSafeAttributes("link", atts);
- xhtml.endElement("link");
- }
- }
-
- if (bodyLevel > 0 && discardLevel == 0) {
- String safe = mapper.mapSafeElement(name);
- if (safe != null) {
- startElementWithSafeAttributes(safe, atts);
- }
- }
-
- title.setLength(0);
- }
-
- /**
- * Adds a metadata setting from the HTML <head/> to the Tika metadata
- * object. The name and value are normalized where possible.
- */
- private void addHtmlMetadata(String name, String value) {
- if (name == null || value == null) {
- // ignore
- } else if (name.equalsIgnoreCase("ICBM")) {
- Matcher m = ICBM.matcher(value);
- if (m.matches()) {
- metadata.set("ICBM", m.group(1) + ", " + m.group(2));
- metadata.set(Metadata.LATITUDE, m.group(1));
- metadata.set(Metadata.LONGITUDE, m.group(2));
- } else {
- metadata.set("ICBM", value);
- }
- } else if (name.equalsIgnoreCase(Metadata.CONTENT_TYPE)) {
- //don't overwrite Metadata.CONTENT_TYPE!
- MediaType type = MediaType.parse(value);
- if (type != null) {
- metadata.set(TikaCoreProperties.CONTENT_TYPE_HINT, type.toString());
- } else {
- metadata.set(TikaCoreProperties.CONTENT_TYPE_HINT, value);
- }
- } else {
- metadata.add(name, value);
- }
- }
-
- private void startElementWithSafeAttributes(String name, Attributes atts) throws SAXException {
- if (atts.getLength() == 0) {
- xhtml.startElement(name);
- return;
- }
-
- boolean isObject = name.equals("object");
- String codebase = null;
- if (isObject) {
- codebase = atts.getValue("", "codebase");
- if (codebase != null) {
- codebase = resolve(codebase);
- } else {
- codebase = metadata.get(Metadata.CONTENT_LOCATION);
- }
- }
-
- AttributesImpl newAttributes = new AttributesImpl(atts);
- for (int att = 0; att < newAttributes.getLength(); att++) {
- String attrName = newAttributes.getLocalName(att);
- String normAttrName = mapper.mapSafeAttribute(name, attrName);
- if (normAttrName == null) {
- newAttributes.removeAttribute(att);
- att--;
- } else {
- // We have a remapped attribute name, so set it as it might have changed.
- newAttributes.setLocalName(att, normAttrName);
-
- // And resolve relative links. Eventually this should be pushed
- // into the HtmlMapper code.
- if (URI_ATTRIBUTES.contains(normAttrName)) {
- newAttributes.setValue(att, resolve(newAttributes.getValue(att)));
- } else if (isObject && "codebase".equals(normAttrName)) {
- newAttributes.setValue(att, codebase);
- } else if (isObject
- && ("data".equals(normAttrName)
- || "classid".equals(normAttrName))) {
- newAttributes.setValue(
- att,
- resolve(codebase, newAttributes.getValue(att)));
- }
- }
- }
-
- if ("img".equals(name) && newAttributes.getValue("", "alt") == null) {
- newAttributes.addAttribute("", "alt", "alt", "CDATA", "");
- }
-
- xhtml.startElement(name, newAttributes);
- }
-
- @Override
- public void endElement(
- String uri, String local, String name) throws SAXException {
- if (bodyLevel > 0 && discardLevel == 0) {
- String safe = mapper.mapSafeElement(name);
- if (safe != null) {
- xhtml.endElement(safe);
- } else if (XHTMLContentHandler.ENDLINE.contains(
- name.toLowerCase(Locale.ENGLISH))) {
- // TIKA-343: Replace closing block tags (and <br/>) with a
- // newline unless the HtmlMapper above has already mapped
- // them to something else
- xhtml.newline();
- }
- }
-
- if (titleLevel > 0) {
- titleLevel--;
- if (titleLevel == 0 && !isTitleSetToMetadata) {
- metadata.set(TikaCoreProperties.TITLE, title.toString().trim());
- isTitleSetToMetadata = true;
- }
- }
- if (bodyLevel > 0) {
- bodyLevel--;
- }
- if (discardLevel > 0) {
- discardLevel--;
- }
- }
-
- @Override
- public void characters(char[] ch, int start, int length)
- throws SAXException {
- if (titleLevel > 0 && bodyLevel == 0) {
- title.append(ch, start, length);
- }
- if (bodyLevel > 0 && discardLevel == 0) {
- super.characters(ch, start, length);
- }
- }
-
- @Override
- public void ignorableWhitespace(char[] ch, int start, int length)
- throws SAXException {
- if (bodyLevel > 0 && discardLevel == 0) {
- super.ignorableWhitespace(ch, start, length);
- }
- }
-
- private String resolve(String url) {
- return resolve(metadata.get(Metadata.CONTENT_LOCATION), url);
- }
-
- private String resolve(String base, String url) {
- url = url.trim();
-
- // Return the URL as-is if no base URL is available or if the URL
- // matches a common non-hierarchical or pseudo URI prefix
- String lower = url.toLowerCase(Locale.ENGLISH);
- if (base == null
- || lower.startsWith("urn:")
- || lower.startsWith("mailto:")
- || lower.startsWith("tel:")
- || lower.startsWith("data:")
- || lower.startsWith("javascript:")
- || lower.startsWith("about:")) {
- return url;
- }
-
- try {
- URL baseURL = new URL(base.trim());
-
- // We need to handle one special case, where the relativeUrl is
- // just a query string (like "?pid=1"), and the baseUrl doesn't
- // end with a '/'. In that case, the URL class removes the last
- // portion of the path, which we don't want.
- String path = baseURL.getPath();
- if (url.startsWith("?") && path.length() > 0 && !path.endsWith("/")) {
- return new URL(
- baseURL.getProtocol(),
- baseURL.getHost(), baseURL.getPort(),
- baseURL.getPath() + url).toExternalForm();
- } else {
- return new URL(baseURL, url).toExternalForm();
- }
- } catch (MalformedURLException e) {
- // Unknown or broken format; just return the URL as received.
- return url;
- }
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html;
+
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Locale;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.sax.TextContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+class HtmlHandler extends TextContentHandler {
+
+ // List of attributes that need to be resolved.
+ private static final Set<String> URI_ATTRIBUTES =
+ new HashSet<String>(Arrays.asList("src", "href", "longdesc", "cite"));
+ private static final Pattern ICBM =
+ Pattern.compile("\\s*(-?\\d+\\.\\d+)[,\\s]+(-?\\d+\\.\\d+)\\s*");
+ private final HtmlMapper mapper;
+ private final XHTMLContentHandler xhtml;
+ private final Metadata metadata;
+ private final StringBuilder title = new StringBuilder();
+ private int bodyLevel = 0;
+ private int discardLevel = 0;
+ private int titleLevel = 0;
+ private boolean isTitleSetToMetadata = false;
+
+ private HtmlHandler(
+ HtmlMapper mapper, XHTMLContentHandler xhtml, Metadata metadata) {
+ super(xhtml);
+ this.mapper = mapper;
+ this.xhtml = xhtml;
+ this.metadata = metadata;
+
+ // Try to determine the default base URL, if one has not been given
+ if (metadata.get(Metadata.CONTENT_LOCATION) == null) {
+ String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
+ if (name != null) {
+ name = name.trim();
+ try {
+ new URL(name); // test URL format
+ metadata.set(Metadata.CONTENT_LOCATION, name);
+ } catch (MalformedURLException e) {
+ // The resource name is not a valid URL, ignore it
+ }
+ }
+ }
+ }
+
+ public HtmlHandler(
+ HtmlMapper mapper, ContentHandler handler, Metadata metadata) {
+ this(mapper, new XHTMLContentHandler(handler, metadata), metadata);
+ }
+
+ @Override
+ public void startElement(
+ String uri, String local, String name, Attributes atts)
+ throws SAXException {
+ if ("TITLE".equals(name) || titleLevel > 0) {
+ titleLevel++;
+ }
+ if ("BODY".equals(name) || ("FRAMESET".equals(name)) || bodyLevel > 0) {
+ bodyLevel++;
+ }
+ if (mapper.isDiscardElement(name) || discardLevel > 0) {
+ discardLevel++;
+ }
+
+ if (bodyLevel == 0 && discardLevel == 0) {
+ if ("META".equals(name) && atts.getValue("content") != null) {
+ // TIKA-478: For cases where we have either a name or
+ // "http-equiv", assume that XHTMLContentHandler will emit
+ // these in the <head>, thus passing them through safely.
+ if (atts.getValue("http-equiv") != null) {
+ addHtmlMetadata(
+ atts.getValue("http-equiv"),
+ atts.getValue("content"));
+ } else if (atts.getValue("name") != null) {
+ // Record the meta tag in the metadata
+ addHtmlMetadata(
+ atts.getValue("name"),
+ atts.getValue("content"));
+ } else if (atts.getValue("property") != null) {
+ // TIKA-983: Handle <meta property="og:xxx" content="yyy" /> tags
+ metadata.add(
+ atts.getValue("property"),
+ atts.getValue("content"));
+ }
+ } else if ("BASE".equals(name) && atts.getValue("href") != null) {
+ startElementWithSafeAttributes("base", atts);
+ xhtml.endElement("base");
+ metadata.set(
+ Metadata.CONTENT_LOCATION,
+ resolve(atts.getValue("href")));
+ } else if ("LINK".equals(name)) {
+ startElementWithSafeAttributes("link", atts);
+ xhtml.endElement("link");
+ }
+ }
+
+ if (bodyLevel > 0 && discardLevel == 0) {
+ String safe = mapper.mapSafeElement(name);
+ if (safe != null) {
+ startElementWithSafeAttributes(safe, atts);
+ }
+ }
+
+ title.setLength(0);
+ }
+
+ /**
+ * Adds a metadata setting from the HTML <head/> to the Tika metadata
+ * object. The name and value are normalized where possible.
+ */
+ private void addHtmlMetadata(String name, String value) {
+ if (name == null || value == null) {
+ // ignore
+ } else if (name.equalsIgnoreCase("ICBM")) {
+ Matcher m = ICBM.matcher(value);
+ if (m.matches()) {
+ metadata.set("ICBM", m.group(1) + ", " + m.group(2));
+ metadata.set(Metadata.LATITUDE, m.group(1));
+ metadata.set(Metadata.LONGITUDE, m.group(2));
+ } else {
+ metadata.set("ICBM", value);
+ }
+ } else if (name.equalsIgnoreCase(Metadata.CONTENT_TYPE)) {
+ //don't overwrite Metadata.CONTENT_TYPE!
+ MediaType type = MediaType.parse(value);
+ if (type != null) {
+ metadata.set(TikaCoreProperties.CONTENT_TYPE_HINT, type.toString());
+ } else {
+ metadata.set(TikaCoreProperties.CONTENT_TYPE_HINT, value);
+ }
+ } else {
+ metadata.add(name, value);
+ }
+ }
+
+ private void startElementWithSafeAttributes(String name, Attributes atts) throws SAXException {
+ if (atts.getLength() == 0) {
+ xhtml.startElement(name);
+ return;
+ }
+
+ boolean isObject = name.equals("object");
+ String codebase = null;
+ if (isObject) {
+ codebase = atts.getValue("", "codebase");
+ if (codebase != null) {
+ codebase = resolve(codebase);
+ } else {
+ codebase = metadata.get(Metadata.CONTENT_LOCATION);
+ }
+ }
+
+ AttributesImpl newAttributes = new AttributesImpl(atts);
+ for (int att = 0; att < newAttributes.getLength(); att++) {
+ String attrName = newAttributes.getLocalName(att);
+ String normAttrName = mapper.mapSafeAttribute(name, attrName);
+ if (normAttrName == null) {
+ newAttributes.removeAttribute(att);
+ att--;
+ } else {
+ // We have a remapped attribute name, so set it as it might have changed.
+ newAttributes.setLocalName(att, normAttrName);
+
+ // And resolve relative links. Eventually this should be pushed
+ // into the HtmlMapper code.
+ if (URI_ATTRIBUTES.contains(normAttrName)) {
+ newAttributes.setValue(att, resolve(newAttributes.getValue(att)));
+ } else if (isObject && "codebase".equals(normAttrName)) {
+ newAttributes.setValue(att, codebase);
+ } else if (isObject
+ && ("data".equals(normAttrName)
+ || "classid".equals(normAttrName))) {
+ newAttributes.setValue(
+ att,
+ resolve(codebase, newAttributes.getValue(att)));
+ }
+ }
+ }
+
+ if ("img".equals(name) && newAttributes.getValue("", "alt") == null) {
+ newAttributes.addAttribute("", "alt", "alt", "CDATA", "");
+ }
+
+ xhtml.startElement(name, newAttributes);
+ }
+
+ @Override
+ public void endElement(
+ String uri, String local, String name) throws SAXException {
+ if (bodyLevel > 0 && discardLevel == 0) {
+ String safe = mapper.mapSafeElement(name);
+ if (safe != null) {
+ xhtml.endElement(safe);
+ } else if (XHTMLContentHandler.ENDLINE.contains(
+ name.toLowerCase(Locale.ENGLISH))) {
+ // TIKA-343: Replace closing block tags (and <br/>) with a
+ // newline unless the HtmlMapper above has already mapped
+ // them to something else
+ xhtml.newline();
+ }
+ }
+
+ if (titleLevel > 0) {
+ titleLevel--;
+ if (titleLevel == 0 && !isTitleSetToMetadata) {
+ metadata.set(TikaCoreProperties.TITLE, title.toString().trim());
+ isTitleSetToMetadata = true;
+ }
+ }
+ if (bodyLevel > 0) {
+ bodyLevel--;
+ }
+ if (discardLevel > 0) {
+ discardLevel--;
+ }
+ }
+
+ @Override
+ public void characters(char[] ch, int start, int length)
+ throws SAXException {
+ if (titleLevel > 0 && bodyLevel == 0) {
+ title.append(ch, start, length);
+ }
+ if (bodyLevel > 0 && discardLevel == 0) {
+ super.characters(ch, start, length);
+ }
+ }
+
+ @Override
+ public void ignorableWhitespace(char[] ch, int start, int length)
+ throws SAXException {
+ if (bodyLevel > 0 && discardLevel == 0) {
+ super.ignorableWhitespace(ch, start, length);
+ }
+ }
+
+ private String resolve(String url) {
+ return resolve(metadata.get(Metadata.CONTENT_LOCATION), url);
+ }
+
+ private String resolve(String base, String url) {
+ url = url.trim();
+
+ // Return the URL as-is if no base URL is available or if the URL
+ // matches a common non-hierarchical or pseudo URI prefix
+ String lower = url.toLowerCase(Locale.ENGLISH);
+ if (base == null
+ || lower.startsWith("urn:")
+ || lower.startsWith("mailto:")
+ || lower.startsWith("tel:")
+ || lower.startsWith("data:")
+ || lower.startsWith("javascript:")
+ || lower.startsWith("about:")) {
+ return url;
+ }
+
+ try {
+ URL baseURL = new URL(base.trim());
+
+ // We need to handle one special case, where the relativeUrl is
+ // just a query string (like "?pid=1"), and the baseUrl doesn't
+ // end with a '/'. In that case, the URL class removes the last
+ // portion of the path, which we don't want.
+ String path = baseURL.getPath();
+ if (url.startsWith("?") && path.length() > 0 && !path.endsWith("/")) {
+ return new URL(
+ baseURL.getProtocol(),
+ baseURL.getHost(), baseURL.getPort(),
+ baseURL.getPath() + url).toExternalForm();
+ } else {
+ return new URL(baseURL, url).toExternalForm();
+ }
+ } catch (MalformedURLException e) {
+ // Unknown or broken format; just return the URL as received.
+ return url;
+ }
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/HtmlMapper.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/HtmlMapper.java b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/HtmlMapper.java
index 947d26a..1ca7434 100644
--- a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/HtmlMapper.java
+++ b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/HtmlMapper.java
@@ -1,69 +1,69 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.html;
-
-/**
- * HTML mapper used to make incoming HTML documents easier to handle by
- * Tika clients. The {@link HtmlParser} looks up an optional HTML mapper from
- * the parse context and uses it to map parsed HTML to "safe" XHTML. A client
- * that wants to customize this mapping can place a custom HtmlMapper instance
- * into the parse context.
- *
- * @since Apache Tika 0.6
- */
-public interface HtmlMapper {
-
- /**
- * Maps "safe" HTML element names to semantic XHTML equivalents. If the
- * given element is unknown or deemed unsafe for inclusion in the parse
- * output, then this method returns <code>null</code> and the element
- * will be ignored but the content inside it is still processed. See
- * the {@link #isDiscardElement(String)} method for a way to discard
- * the entire contents of an element.
- *
- * @param name HTML element name (upper case)
- * @return XHTML element name (lower case), or
- * <code>null</code> if the element is unsafe
- */
- String mapSafeElement(String name);
-
- /**
- * Checks whether all content within the given HTML element should be
- * discarded instead of including it in the parse output.
- *
- * @param name HTML element name (upper case)
- * @return <code>true</code> if content inside the named element
- * should be ignored, <code>false</code> otherwise
- */
- boolean isDiscardElement(String name);
-
-
- /**
- * Maps "safe" HTML attribute names to semantic XHTML equivalents. If the
- * given attribute is unknown or deemed unsafe for inclusion in the parse
- * output, then this method returns <code>null</code> and the attribute
- * will be ignored. This method assumes that the element name
- * is valid and normalised.
- *
- * @param elementName HTML element name (lower case)
- * @param attributeName HTML attribute name (lower case)
- * @return XHTML attribute name (lower case), or
- * <code>null</code> if the element is unsafe
- */
- String mapSafeAttribute(String elementName, String attributeName);
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html;
+
+/**
+ * HTML mapper used to make incoming HTML documents easier to handle by
+ * Tika clients. The {@link HtmlParser} looks up an optional HTML mapper from
+ * the parse context and uses it to map parsed HTML to "safe" XHTML. A client
+ * that wants to customize this mapping can place a custom HtmlMapper instance
+ * into the parse context.
+ *
+ * @since Apache Tika 0.6
+ */
+public interface HtmlMapper {
+
+ /**
+ * Maps "safe" HTML element names to semantic XHTML equivalents. If the
+ * given element is unknown or deemed unsafe for inclusion in the parse
+ * output, then this method returns <code>null</code> and the element
+ * will be ignored but the content inside it is still processed. See
+ * the {@link #isDiscardElement(String)} method for a way to discard
+ * the entire contents of an element.
+ *
+ * @param name HTML element name (upper case)
+ * @return XHTML element name (lower case), or
+ * <code>null</code> if the element is unsafe
+ */
+ String mapSafeElement(String name);
+
+ /**
+ * Checks whether all content within the given HTML element should be
+ * discarded instead of including it in the parse output.
+ *
+ * @param name HTML element name (upper case)
+ * @return <code>true</code> if content inside the named element
+ * should be ignored, <code>false</code> otherwise
+ */
+ boolean isDiscardElement(String name);
+
+
+ /**
+ * Maps "safe" HTML attribute names to semantic XHTML equivalents. If the
+ * given attribute is unknown or deemed unsafe for inclusion in the parse
+ * output, then this method returns <code>null</code> and the attribute
+ * will be ignored. This method assumes that the element name
+ * is valid and normalised.
+ *
+ * @param elementName HTML element name (lower case)
+ * @param attributeName HTML attribute name (lower case)
+ * @return XHTML attribute name (lower case), or
+ * <code>null</code> if the element is unsafe
+ */
+ String mapSafeAttribute(String elementName, String attributeName);
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/HtmlParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/HtmlParser.java b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/HtmlParser.java
index 7d6f021..a9a8aa0 100644
--- a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/HtmlParser.java
+++ b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/html/HtmlParser.java
@@ -1,194 +1,194 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.html;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.nio.charset.Charset;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.HashSet;
-import java.util.Set;
-
-import org.apache.commons.io.input.CloseShieldInputStream;
-import org.apache.tika.config.ServiceLoader;
-import org.apache.tika.detect.AutoDetectReader;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.ccil.cowan.tagsoup.HTMLSchema;
-import org.ccil.cowan.tagsoup.Schema;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * HTML parser. Uses TagSoup to turn the input document to HTML SAX events,
- * and post-processes the events to produce XHTML and metadata expected by
- * Tika clients.
- */
-public class HtmlParser extends AbstractParser {
-
- /**
- * Serial version UID
- */
- private static final long serialVersionUID = 7895315240498733128L;
-
- private static final MediaType XHTML = MediaType.application("xhtml+xml");
- private static final MediaType WAP_XHTML = MediaType.application("vnd.wap.xhtml+xml");
- private static final MediaType X_ASP = MediaType.application("x-asp");
-
- private static final Set<MediaType> SUPPORTED_TYPES =
- Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
- MediaType.text("html"),
- XHTML,
- WAP_XHTML,
- X_ASP)));
-
- private static final ServiceLoader LOADER =
- new ServiceLoader(HtmlParser.class.getClassLoader());
-
- /**
- * HTML schema singleton used to amortise the heavy instantiation time.
- */
- private static final Schema HTML_SCHEMA = new HTMLSchema();
-
-
- public Set<MediaType> getSupportedTypes(ParseContext context) {
- return SUPPORTED_TYPES;
- }
-
- public void parse(
- InputStream stream, ContentHandler handler,
- Metadata metadata, ParseContext context)
- throws IOException, SAXException, TikaException {
- // Automatically detect the character encoding
- try (AutoDetectReader reader = new AutoDetectReader(new CloseShieldInputStream(stream),
- metadata,context.get(ServiceLoader.class, LOADER))) {
- Charset charset = reader.getCharset();
- String previous = metadata.get(Metadata.CONTENT_TYPE);
- MediaType contentType = null;
- if (previous == null || previous.startsWith("text/html")) {
- contentType = new MediaType(MediaType.TEXT_HTML, charset);
- } else if (previous.startsWith("application/xhtml+xml")) {
- contentType = new MediaType(XHTML, charset);
- } else if (previous.startsWith("application/vnd.wap.xhtml+xml")) {
- contentType = new MediaType(WAP_XHTML, charset);
- } else if (previous.startsWith("application/x-asp")) {
- contentType = new MediaType(X_ASP, charset);
- }
- if (contentType != null) {
- metadata.set(Metadata.CONTENT_TYPE, contentType.toString());
- }
- // deprecated, see TIKA-431
- metadata.set(Metadata.CONTENT_ENCODING, charset.name());
-
- // Get the HTML mapper from the parse context
- HtmlMapper mapper =
- context.get(HtmlMapper.class, new HtmlParserMapper());
-
- // Parse the HTML document
- org.ccil.cowan.tagsoup.Parser parser =
- new org.ccil.cowan.tagsoup.Parser();
-
- // Use schema from context or default
- Schema schema = context.get(Schema.class, HTML_SCHEMA);
-
- // TIKA-528: Reuse share schema to avoid heavy instantiation
- parser.setProperty(
- org.ccil.cowan.tagsoup.Parser.schemaProperty, schema);
- // TIKA-599: Shared schema is thread-safe only if bogons are ignored
- parser.setFeature(
- org.ccil.cowan.tagsoup.Parser.ignoreBogonsFeature, true);
-
- parser.setContentHandler(new XHTMLDowngradeHandler(
- new HtmlHandler(mapper, handler, metadata)));
-
- parser.parse(reader.asInputSource());
- }
- }
-
- /**
- * Maps "safe" HTML element names to semantic XHTML equivalents. If the
- * given element is unknown or deemed unsafe for inclusion in the parse
- * output, then this method returns <code>null</code> and the element
- * will be ignored but the content inside it is still processed. See
- * the {@link #isDiscardElement(String)} method for a way to discard
- * the entire contents of an element.
- * <p/>
- * Subclasses can override this method to customize the default mapping.
- *
- * @param name HTML element name (upper case)
- * @return XHTML element name (lower case), or
- * <code>null</code> if the element is unsafe
- * @since Apache Tika 0.5
- * @deprecated Use the {@link HtmlMapper} mechanism to customize
- * the HTML mapping. This method will be removed in Tika 1.0.
- */
- protected String mapSafeElement(String name) {
- return DefaultHtmlMapper.INSTANCE.mapSafeElement(name);
- }
-
- /**
- * Checks whether all content within the given HTML element should be
- * discarded instead of including it in the parse output. Subclasses
- * can override this method to customize the set of discarded elements.
- *
- * @param name HTML element name (upper case)
- * @return <code>true</code> if content inside the named element
- * should be ignored, <code>false</code> otherwise
- * @since Apache Tika 0.5
- * @deprecated Use the {@link HtmlMapper} mechanism to customize
- * the HTML mapping. This method will be removed in Tika 1.0.
- */
- protected boolean isDiscardElement(String name) {
- return DefaultHtmlMapper.INSTANCE.isDiscardElement(name);
- }
-
- /**
- * @deprecated Use the {@link HtmlMapper} mechanism to customize
- * the HTML mapping. This method will be removed in Tika 1.0.
- */
- public String mapSafeAttribute(String elementName, String attributeName) {
- return DefaultHtmlMapper.INSTANCE.mapSafeAttribute(elementName, attributeName);
- }
-
- /**
- * Adapter class that maintains backwards compatibility with the
- * protected HtmlParser methods. Making HtmlParser implement HtmlMapper
- * directly would require those methods to be public, which would break
- * backwards compatibility with subclasses.
- *
- * @deprecated Use the {@link HtmlMapper} mechanism to customize
- * the HTML mapping. This class will be removed in Tika 1.0.
- */
- private class HtmlParserMapper implements HtmlMapper {
- public String mapSafeElement(String name) {
- return HtmlParser.this.mapSafeElement(name);
- }
-
- public boolean isDiscardElement(String name) {
- return HtmlParser.this.isDiscardElement(name);
- }
-
- public String mapSafeAttribute(String elementName, String attributeName) {
- return HtmlParser.this.mapSafeAttribute(elementName, attributeName);
- }
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.Charset;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.config.ServiceLoader;
+import org.apache.tika.detect.AutoDetectReader;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.ccil.cowan.tagsoup.HTMLSchema;
+import org.ccil.cowan.tagsoup.Schema;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * HTML parser. Uses TagSoup to turn the input document to HTML SAX events,
+ * and post-processes the events to produce XHTML and metadata expected by
+ * Tika clients.
+ */
+public class HtmlParser extends AbstractParser {
+
+ /**
+ * Serial version UID
+ */
+ private static final long serialVersionUID = 7895315240498733128L;
+
+ private static final MediaType XHTML = MediaType.application("xhtml+xml");
+ private static final MediaType WAP_XHTML = MediaType.application("vnd.wap.xhtml+xml");
+ private static final MediaType X_ASP = MediaType.application("x-asp");
+
+ private static final Set<MediaType> SUPPORTED_TYPES =
+ Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+ MediaType.text("html"),
+ XHTML,
+ WAP_XHTML,
+ X_ASP)));
+
+ private static final ServiceLoader LOADER =
+ new ServiceLoader(HtmlParser.class.getClassLoader());
+
+ /**
+ * HTML schema singleton used to amortise the heavy instantiation time.
+ */
+ private static final Schema HTML_SCHEMA = new HTMLSchema();
+
+
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return SUPPORTED_TYPES;
+ }
+
+ public void parse(
+ InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+ // Automatically detect the character encoding
+ try (AutoDetectReader reader = new AutoDetectReader(new CloseShieldInputStream(stream),
+ metadata,context.get(ServiceLoader.class, LOADER))) {
+ Charset charset = reader.getCharset();
+ String previous = metadata.get(Metadata.CONTENT_TYPE);
+ MediaType contentType = null;
+ if (previous == null || previous.startsWith("text/html")) {
+ contentType = new MediaType(MediaType.TEXT_HTML, charset);
+ } else if (previous.startsWith("application/xhtml+xml")) {
+ contentType = new MediaType(XHTML, charset);
+ } else if (previous.startsWith("application/vnd.wap.xhtml+xml")) {
+ contentType = new MediaType(WAP_XHTML, charset);
+ } else if (previous.startsWith("application/x-asp")) {
+ contentType = new MediaType(X_ASP, charset);
+ }
+ if (contentType != null) {
+ metadata.set(Metadata.CONTENT_TYPE, contentType.toString());
+ }
+ // deprecated, see TIKA-431
+ metadata.set(Metadata.CONTENT_ENCODING, charset.name());
+
+ // Get the HTML mapper from the parse context
+ HtmlMapper mapper =
+ context.get(HtmlMapper.class, new HtmlParserMapper());
+
+ // Parse the HTML document
+ org.ccil.cowan.tagsoup.Parser parser =
+ new org.ccil.cowan.tagsoup.Parser();
+
+ // Use schema from context or default
+ Schema schema = context.get(Schema.class, HTML_SCHEMA);
+
+ // TIKA-528: Reuse share schema to avoid heavy instantiation
+ parser.setProperty(
+ org.ccil.cowan.tagsoup.Parser.schemaProperty, schema);
+ // TIKA-599: Shared schema is thread-safe only if bogons are ignored
+ parser.setFeature(
+ org.ccil.cowan.tagsoup.Parser.ignoreBogonsFeature, true);
+
+ parser.setContentHandler(new XHTMLDowngradeHandler(
+ new HtmlHandler(mapper, handler, metadata)));
+
+ parser.parse(reader.asInputSource());
+ }
+ }
+
+ /**
+ * Maps "safe" HTML element names to semantic XHTML equivalents. If the
+ * given element is unknown or deemed unsafe for inclusion in the parse
+ * output, then this method returns <code>null</code> and the element
+ * will be ignored but the content inside it is still processed. See
+ * the {@link #isDiscardElement(String)} method for a way to discard
+ * the entire contents of an element.
+ * <p/>
+ * Subclasses can override this method to customize the default mapping.
+ *
+ * @param name HTML element name (upper case)
+ * @return XHTML element name (lower case), or
+ * <code>null</code> if the element is unsafe
+ * @since Apache Tika 0.5
+ * @deprecated Use the {@link HtmlMapper} mechanism to customize
+ * the HTML mapping. This method will be removed in Tika 1.0.
+ */
+ protected String mapSafeElement(String name) {
+ return DefaultHtmlMapper.INSTANCE.mapSafeElement(name);
+ }
+
+ /**
+ * Checks whether all content within the given HTML element should be
+ * discarded instead of including it in the parse output. Subclasses
+ * can override this method to customize the set of discarded elements.
+ *
+ * @param name HTML element name (upper case)
+ * @return <code>true</code> if content inside the named element
+ * should be ignored, <code>false</code> otherwise
+ * @since Apache Tika 0.5
+ * @deprecated Use the {@link HtmlMapper} mechanism to customize
+ * the HTML mapping. This method will be removed in Tika 1.0.
+ */
+ protected boolean isDiscardElement(String name) {
+ return DefaultHtmlMapper.INSTANCE.isDiscardElement(name);
+ }
+
+ /**
+ * @deprecated Use the {@link HtmlMapper} mechanism to customize
+ * the HTML mapping. This method will be removed in Tika 1.0.
+ */
+ public String mapSafeAttribute(String elementName, String attributeName) {
+ return DefaultHtmlMapper.INSTANCE.mapSafeAttribute(elementName, attributeName);
+ }
+
+ /**
+ * Adapter class that maintains backwards compatibility with the
+ * protected HtmlParser methods. Making HtmlParser implement HtmlMapper
+ * directly would require those methods to be public, which would break
+ * backwards compatibility with subclasses.
+ *
+ * @deprecated Use the {@link HtmlMapper} mechanism to customize
+ * the HTML mapping. This class will be removed in Tika 1.0.
+ */
+ private class HtmlParserMapper implements HtmlMapper {
+ public String mapSafeElement(String name) {
+ return HtmlParser.this.mapSafeElement(name);
+ }
+
+ public boolean isDiscardElement(String name) {
+ return HtmlParser.this.isDiscardElement(name);
+ }
+
+ public String mapSafeAttribute(String elementName, String attributeName) {
+ return HtmlParser.this.mapSafeAttribute(elementName, attributeName);
+ }
+ }
+
+}