You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/02/02 20:11:17 UTC

[tika] 01/01: TIKA-3963 -- normalize author, title, subject, description and keywords to their dc: or other equivalents

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-3963
in repository https://gitbox.apache.org/repos/asf/tika.git

commit d5cbfc700732f74a446d40b0d75ad8dc705abc4b
Author: tallison <ta...@apache.org>
AuthorDate: Thu Feb 2 15:11:01 2023 -0500

    TIKA-3963 -- normalize author, title, subject, description and keywords to their dc: or other equivalents
---
 CHANGES.txt                                        |  6 +++
 .../org/apache/tika/parser/html/HtmlHandler.java   | 46 +++++++++++++++++++---
 .../apache/tika/parser/html/HtmlParserTest.java    | 22 +++++++++++
 .../test-documents/testHTML_metadata.html          | 16 ++++++++
 4 files changed, 84 insertions(+), 6 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 9561838e4..fcb3355d8 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,3 +1,9 @@
+Release 2.7.1 - ???
+
+   * Normalize author, title, subject, description and keywords to their
+     Dublin Core or other equivalent properties in the HTMLParser (TIKA-3963).
+
+
 Release 2.7.0 - 1/31/2023
 
    * Add SVG detection for svg files that lack the xml header (TIKA-3308).
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
index 9b79ef75d..d92a90941 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
@@ -22,9 +22,11 @@ import java.net.MalformedURLException;
 import java.net.URL;
 import java.nio.charset.StandardCharsets;
 import java.util.Arrays;
+import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Locale;
+import java.util.Map;
 import java.util.Set;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
@@ -39,11 +41,14 @@ import org.apache.tika.extractor.EmbeddedDocumentExtractor;
 import org.apache.tika.extractor.EmbeddedDocumentUtil;
 import org.apache.tika.metadata.HTML;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.Property;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.sax.TextContentHandler;
 import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.StringUtils;
 
 class HtmlHandler extends TextContentHandler {
 
@@ -52,6 +57,16 @@ class HtmlHandler extends TextContentHandler {
             new HashSet<>(Arrays.asList("src", "href", "longdesc", "cite"));
     private static final Pattern ICBM =
             Pattern.compile("\\s*(-?\\d+\\.\\d+)[,\\s]+(-?\\d+\\.\\d+)\\s*");
+
+    private static final Map<String, Property> META_HEADER_MAPPINGS = new HashMap<>();
+
+    static {
+        META_HEADER_MAPPINGS.put("author", TikaCoreProperties.CREATOR);
+        META_HEADER_MAPPINGS.put("title", TikaCoreProperties.TITLE);
+        META_HEADER_MAPPINGS.put("subject", TikaCoreProperties.SUBJECT);
+        META_HEADER_MAPPINGS.put("keywords", Office.KEYWORDS);
+        META_HEADER_MAPPINGS.put("description", TikaCoreProperties.DESCRIPTION);
+    }
     private static final Attributes EMPTY_ATTS = new AttributesImpl();
     private final HtmlMapper mapper;
     private final XHTMLContentHandler xhtml;
@@ -179,9 +194,14 @@ class HtmlHandler extends TextContentHandler {
      * object. The name and value are normalized where possible.
      */
     private void addHtmlMetadata(String name, String value) {
-        if (name == null || value == null) {
-            // ignore
-        } else if (name.equalsIgnoreCase("ICBM")) {
+        //note that "name" derives from attributes and is not uppercased
+        //like the elements by the XHTMLDowngradeHandler
+
+        if (StringUtils.isBlank(name) || StringUtils.isBlank(value)) {
+            return;
+        }
+
+        if (name.equalsIgnoreCase("ICBM")) {
             Matcher m = ICBM.matcher(value);
             if (m.matches()) {
                 metadata.set("ICBM", m.group(1) + ", " + m.group(2));
@@ -190,7 +210,10 @@ class HtmlHandler extends TextContentHandler {
             } else {
                 metadata.set("ICBM", value);
             }
-        } else if (name.equalsIgnoreCase(Metadata.CONTENT_TYPE)) {
+            return;
+        }
+
+        if (name.equalsIgnoreCase(Metadata.CONTENT_TYPE)) {
             //don't overwrite Metadata.CONTENT_TYPE!
             MediaType type = MediaType.parse(value);
             if (type != null) {
@@ -198,9 +221,20 @@ class HtmlHandler extends TextContentHandler {
             } else {
                 metadata.set(TikaCoreProperties.CONTENT_TYPE_HINT, value);
             }
-        } else {
-            metadata.add(name, value);
+            return;
+        }
+
+        String lcName = name.toLowerCase(Locale.US);
+        if (META_HEADER_MAPPINGS.containsKey(lcName)) {
+            Property property = META_HEADER_MAPPINGS.get(lcName);
+            if (property.isMultiValuePermitted()) {
+                metadata.add(property, value);
+            } else {
+                metadata.set(property, value);
+            }
         }
+        //TODO -- we should prefix these raw names to avoid collisions
+        metadata.add(name, value);
     }
 
     private void startElementWithSafeAttributes(String name, Attributes atts) throws SAXException {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
index 093e2af34..3ba4f9287 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
@@ -72,6 +72,7 @@ import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Geographic;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.ParseContext;
@@ -1238,6 +1239,27 @@ public class HtmlParserTest extends TikaTest {
         }
     }
 
+    @Test
+    public void testMetadataMapping() throws Exception {
+        List<Metadata> metadataList = getRecursiveMetadata("testHTML_metadata.html");
+        Metadata m = metadataList.get(0);
+        assertEquals("Free Web tutorials", m.get(TikaCoreProperties.DESCRIPTION));
+        assertEquals("Free Web tutorials", m.get("description"));
+
+        assertEquals("HTML,CSS,XML,JavaScript", m.get(TikaCoreProperties.SUBJECT));
+        assertEquals("HTML,CSS,XML,JavaScript", m.get("keywords"));
+
+        assertEquals("HTML,CSS,XML,JavaScript", m.get(Office.KEYWORDS));
+        assertEquals("HTML,CSS,XML,JavaScript", m.get(Office.KEYWORDS));
+
+        assertEquals("OldMetaTitle", m.get(TikaCoreProperties.TITLE));
+        assertEquals("OldMetaTitle", m.get("title"));
+
+        assertEquals("John Doe", m.get(TikaCoreProperties.CREATOR));
+        assertEquals("John Doe", m.get("author"));
+
+    }
+
     private class EncodingDetectorRunner implements Callable<String> {
 
         final static String DONE = "done";
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/resources/test-documents/testHTML_metadata.html b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/resources/test-documents/testHTML_metadata.html
new file mode 100644
index 000000000..c879441fa
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/resources/test-documents/testHTML_metadata.html
@@ -0,0 +1,16 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <meta charset="UTF-8">
+    <meta name="description" content="Free Web tutorials">
+    <meta name="keywords" content="HTML,CSS,XML,JavaScript">
+    <meta name="author" content="John Doe">
+    <meta name="title" content="OldMetaTitle">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+</head>
+<body>
+
+<p>All meta information goes in the head section...</p>
+
+</body>
+</html>