You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/02/02 22:06:37 UTC
[tika] branch main updated: TIKA-3963 (#935)
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 047c88655 TIKA-3963 (#935)
047c88655 is described below
commit 047c8865532cbc7e46d58b945f32f957ec18f1bb
Author: Tim Allison <ta...@apache.org>
AuthorDate: Thu Feb 2 17:06:27 2023 -0500
TIKA-3963 (#935)
* TIKA-3963 -- normalize author, title, subject, description and keywords to their dc: or other equivalent
---
CHANGES.txt | 6 +++
.../org/apache/tika/parser/html/HtmlHandler.java | 49 +++++++++++++++++++---
.../apache/tika/parser/html/HtmlParserTest.java | 31 ++++++++++++++
.../test-documents/testHTML_metadata.html | 16 +++++++
.../testHTML_metadata_two_titles.html | 17 ++++++++
5 files changed, 113 insertions(+), 6 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 9561838e4..fcb3355d8 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,3 +1,9 @@
+Release 2.7.1 - ???
+
+ * Normalize author, title, subject, description and keywords meta headers to
+ their Dublin Core or other equivalent properties in the HTMLParser (TIKA-3963).
+
+
Release 2.7.0 - 1/31/2023
* Add SVG detection for svg files that lack the xml header (TIKA-3308).
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
index 9b79ef75d..ae2de95d5 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
@@ -22,9 +22,11 @@ import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
+import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
+import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@@ -39,11 +41,14 @@ import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.metadata.HTML;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.TextContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.StringUtils;
class HtmlHandler extends TextContentHandler {
@@ -52,6 +57,16 @@ class HtmlHandler extends TextContentHandler {
new HashSet<>(Arrays.asList("src", "href", "longdesc", "cite"));
private static final Pattern ICBM =
Pattern.compile("\\s*(-?\\d+\\.\\d+)[,\\s]+(-?\\d+\\.\\d+)\\s*");
+
+ private static final Map<String, Property> META_HEADER_MAPPINGS = new HashMap<>();
+
+ static {
+ META_HEADER_MAPPINGS.put("author", TikaCoreProperties.CREATOR);
+ META_HEADER_MAPPINGS.put("title", TikaCoreProperties.TITLE);
+ META_HEADER_MAPPINGS.put("subject", TikaCoreProperties.SUBJECT);
+ META_HEADER_MAPPINGS.put("keywords", Office.KEYWORDS);
+ META_HEADER_MAPPINGS.put("description", TikaCoreProperties.DESCRIPTION);
+ }
private static final Attributes EMPTY_ATTS = new AttributesImpl();
private final HtmlMapper mapper;
private final XHTMLContentHandler xhtml;
@@ -179,9 +194,14 @@ class HtmlHandler extends TextContentHandler {
* object. The name and value are normalized where possible.
*/
private void addHtmlMetadata(String name, String value) {
- if (name == null || value == null) {
- // ignore
- } else if (name.equalsIgnoreCase("ICBM")) {
+ //note that "name" derives from attributes, so, unlike element names,
+ //it is not uppercased by the XHTMLDowngradeHandler
+
+ if (StringUtils.isBlank(name) || StringUtils.isBlank(value)) {
+ return;
+ }
+
+ if (name.equalsIgnoreCase("ICBM")) {
Matcher m = ICBM.matcher(value);
if (m.matches()) {
metadata.set("ICBM", m.group(1) + ", " + m.group(2));
@@ -190,7 +210,10 @@ class HtmlHandler extends TextContentHandler {
} else {
metadata.set("ICBM", value);
}
- } else if (name.equalsIgnoreCase(Metadata.CONTENT_TYPE)) {
+ return;
+ }
+
+ if (name.equalsIgnoreCase(Metadata.CONTENT_TYPE)) {
//don't overwrite Metadata.CONTENT_TYPE!
MediaType type = MediaType.parse(value);
if (type != null) {
@@ -198,9 +221,23 @@ class HtmlHandler extends TextContentHandler {
} else {
metadata.set(TikaCoreProperties.CONTENT_TYPE_HINT, value);
}
- } else {
- metadata.add(name, value);
+ return;
+ }
+
+ String lcName = name.toLowerCase(Locale.US);
+ if (META_HEADER_MAPPINGS.containsKey(lcName)) {
+ Property property = META_HEADER_MAPPINGS.get(lcName);
+ if (property.equals(TikaCoreProperties.TITLE) && isTitleSetToMetadata) {
+ //prefer the title element if it is already set
+ //do nothing
+ } else if (property.isMultiValuePermitted()) {
+ metadata.add(property, value);
+ } else {
+ metadata.set(property, value);
+ }
}
+ //TODO -- we should prefix these raw names to avoid collisions
+ metadata.add(name, value);
}
private void startElementWithSafeAttributes(String name, Attributes atts) throws SAXException {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
index 093e2af34..408c85092 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
@@ -72,6 +72,7 @@ import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Geographic;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
@@ -1238,6 +1239,36 @@ public class HtmlParserTest extends TikaTest {
}
}
+ @Test
+ public void testMetadataMapping() throws Exception {
+ List<Metadata> metadataList = getRecursiveMetadata("testHTML_metadata.html");
+ Metadata m = metadataList.get(0);
+ assertEquals("Free Web tutorials", m.get(TikaCoreProperties.DESCRIPTION));
+ assertEquals("Free Web tutorials", m.get("description"));
+
+ assertEquals("HTML,CSS,XML,JavaScript", m.get(TikaCoreProperties.SUBJECT));
+ assertEquals("HTML,CSS,XML,JavaScript", m.get("keywords"));
+
+ assertEquals("HTML,CSS,XML,JavaScript", m.get(Office.KEYWORDS));
+ assertEquals("HTML,CSS,XML,JavaScript", m.get(Office.KEYWORDS));
+
+ assertEquals("OldMetaTitle", m.get(TikaCoreProperties.TITLE));
+ assertEquals("OldMetaTitle", m.get("title"));
+
+ assertEquals("John Doe", m.get(TikaCoreProperties.CREATOR));
+ assertEquals("John Doe", m.get("author"));
+ }
+
+ @Test
+ public void testPreferenceForTitleElement() throws Exception {
+ //this tests that the <title> element is preferred over a <meta name="title"> header
+ List<Metadata> metadataList = getRecursiveMetadata("testHTML_metadata_two_titles.html");
+ Metadata m = metadataList.get(0);
+
+ assertEquals("ActualTitle", m.get(TikaCoreProperties.TITLE));
+ assertEquals("OldMetaTitle", m.get("title"));
+ }
+
private class EncodingDetectorRunner implements Callable<String> {
final static String DONE = "done";
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/resources/test-documents/testHTML_metadata.html b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/resources/test-documents/testHTML_metadata.html
new file mode 100644
index 000000000..c879441fa
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/resources/test-documents/testHTML_metadata.html
@@ -0,0 +1,16 @@
+<!DOCTYPE html>
+<html>
+<head>
+ <meta charset="UTF-8">
+ <meta name="description" content="Free Web tutorials">
+ <meta name="keywords" content="HTML,CSS,XML,JavaScript">
+ <meta name="author" content="John Doe">
+ <meta name="title" content="OldMetaTitle">
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
+</head>
+<body>
+
+<p>All meta information goes in the head section...</p>
+
+</body>
+</html>
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/resources/test-documents/testHTML_metadata_two_titles.html b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/resources/test-documents/testHTML_metadata_two_titles.html
new file mode 100644
index 000000000..654047f15
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/resources/test-documents/testHTML_metadata_two_titles.html
@@ -0,0 +1,17 @@
+<!DOCTYPE html>
+<html>
+<head>
+ <meta charset="UTF-8">
+ <title>ActualTitle</title>
+ <meta name="description" content="Free Web tutorials">
+ <meta name="keywords" content="HTML,CSS,XML,JavaScript">
+ <meta name="title" content="OldMetaTitle">
+ <meta name="author" content="John Doe">
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
+</head>
+<body>
+
+<p>All meta information goes in the head section...</p>
+
+</body>
+</html>