You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/02/02 20:11:16 UTC

[tika] branch TIKA-3963 created (now d5cbfc700)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch TIKA-3963
in repository https://gitbox.apache.org/repos/asf/tika.git


      at d5cbfc700 TIKA-3963 -- normalize author, title, subject, description and keywords to their dc: or other equivalents

This branch includes the following new commits:

     new d5cbfc700 TIKA-3963 -- normalize author, title, subject, description and keywords to their dc: or other equivalents

The 1 revision listed above as "new" is entirely new to this
repository and will be described in a separate email.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.



[tika] 01/01: TIKA-3963 -- normalize author, title, subject, description and keywords to their dc: or other equivalents

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-3963
in repository https://gitbox.apache.org/repos/asf/tika.git

commit d5cbfc700732f74a446d40b0d75ad8dc705abc4b
Author: tallison <ta...@apache.org>
AuthorDate: Thu Feb 2 15:11:01 2023 -0500

    TIKA-3963 -- normalize author, title, subject, description and keywords to their dc: or other equivalents
---
 CHANGES.txt                                        |  6 +++
 .../org/apache/tika/parser/html/HtmlHandler.java   | 46 +++++++++++++++++++---
 .../apache/tika/parser/html/HtmlParserTest.java    | 22 +++++++++++
 .../test-documents/testHTML_metadata.html          | 16 ++++++++
 4 files changed, 84 insertions(+), 6 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 9561838e4..fcb3355d8 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,3 +1,9 @@
+Release 2.7.1 - ???
+
+   * Normalize author, title, subject, description and keywords to their
+     Dublin Core or other equivalent properties in the HTMLParser (TIKA-3963).
+
+
 Release 2.7.0 - 1/31/2023
 
    * Add SVG detection for svg files that lack the xml header (TIKA-3308).
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
index 9b79ef75d..d92a90941 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
@@ -22,9 +22,11 @@ import java.net.MalformedURLException;
 import java.net.URL;
 import java.nio.charset.StandardCharsets;
 import java.util.Arrays;
+import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Locale;
+import java.util.Map;
 import java.util.Set;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
@@ -39,11 +41,14 @@ import org.apache.tika.extractor.EmbeddedDocumentExtractor;
 import org.apache.tika.extractor.EmbeddedDocumentUtil;
 import org.apache.tika.metadata.HTML;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.Property;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.sax.TextContentHandler;
 import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.StringUtils;
 
 class HtmlHandler extends TextContentHandler {
 
@@ -52,6 +57,16 @@ class HtmlHandler extends TextContentHandler {
             new HashSet<>(Arrays.asList("src", "href", "longdesc", "cite"));
     private static final Pattern ICBM =
             Pattern.compile("\\s*(-?\\d+\\.\\d+)[,\\s]+(-?\\d+\\.\\d+)\\s*");
+
+    private static final Map<String, Property> META_HEADER_MAPPINGS = new HashMap<>();
+    //keys are lower-cased meta names; addHtmlMetadata lower-cases the name before the lookup
+    static {
+        META_HEADER_MAPPINGS.put("author", TikaCoreProperties.CREATOR);
+        META_HEADER_MAPPINGS.put("title", TikaCoreProperties.TITLE);
+        META_HEADER_MAPPINGS.put("subject", TikaCoreProperties.SUBJECT);
+        META_HEADER_MAPPINGS.put("keywords", Office.KEYWORDS);
+        META_HEADER_MAPPINGS.put("description", TikaCoreProperties.DESCRIPTION);
+    }
     private static final Attributes EMPTY_ATTS = new AttributesImpl();
     private final HtmlMapper mapper;
     private final XHTMLContentHandler xhtml;
@@ -179,9 +194,14 @@ class HtmlHandler extends TextContentHandler {
      * object. The name and value are normalized where possible.
      */
     private void addHtmlMetadata(String name, String value) {
-        if (name == null || value == null) {
-            // ignore
-        } else if (name.equalsIgnoreCase("ICBM")) {
+        //note that "name" derives from attributes and is not uppercased
+        //like the elements by the XHTMLDowngradeHandler
+
+        if (StringUtils.isBlank(name) || StringUtils.isBlank(value)) {
+            return;
+        }
+
+        if (name.equalsIgnoreCase("ICBM")) {
             Matcher m = ICBM.matcher(value);
             if (m.matches()) {
                 metadata.set("ICBM", m.group(1) + ", " + m.group(2));
@@ -190,7 +210,10 @@ class HtmlHandler extends TextContentHandler {
             } else {
                 metadata.set("ICBM", value);
             }
-        } else if (name.equalsIgnoreCase(Metadata.CONTENT_TYPE)) {
+            return;
+        }
+
+        if (name.equalsIgnoreCase(Metadata.CONTENT_TYPE)) {
             //don't overwrite Metadata.CONTENT_TYPE!
             MediaType type = MediaType.parse(value);
             if (type != null) {
@@ -198,9 +221,20 @@ class HtmlHandler extends TextContentHandler {
             } else {
                 metadata.set(TikaCoreProperties.CONTENT_TYPE_HINT, value);
             }
-        } else {
-            metadata.add(name, value);
+            return;
+        }
+
+        String lcName = name.toLowerCase(Locale.US);
+        if (META_HEADER_MAPPINGS.containsKey(lcName)) {
+            Property property = META_HEADER_MAPPINGS.get(lcName);
+            if (property.isMultiValuePermitted()) {
+                metadata.add(property, value);
+            } else {
+                metadata.set(property, value);
+            }
         }
+        //TODO -- we should prefix these raw names to avoid collisions
+        metadata.add(name, value);
     }
 
     private void startElementWithSafeAttributes(String name, Attributes atts) throws SAXException {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
index 093e2af34..3ba4f9287 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
@@ -72,6 +72,7 @@ import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Geographic;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.ParseContext;
@@ -1238,6 +1239,27 @@ public class HtmlParserTest extends TikaTest {
         }
     }
 
+    /** Named meta tags are exposed under both their normalized property and raw name. */
+    @Test
+    public void testMetadataMapping() throws Exception {
+        List<Metadata> metadataList = getRecursiveMetadata("testHTML_metadata.html");
+        Metadata m = metadataList.get(0);
+        assertEquals("Free Web tutorials", m.get(TikaCoreProperties.DESCRIPTION));
+        assertEquals("Free Web tutorials", m.get("description"));
+
+        // "keywords" is mapped to Office.KEYWORDS; the test document has no
+        // "subject" meta tag, so dc:subject is presumably filled via that mapping
+        assertEquals("HTML,CSS,XML,JavaScript", m.get(Office.KEYWORDS));
+        assertEquals("HTML,CSS,XML,JavaScript", m.get(TikaCoreProperties.SUBJECT));
+        assertEquals("HTML,CSS,XML,JavaScript", m.get("keywords"));
+
+        assertEquals("OldMetaTitle", m.get(TikaCoreProperties.TITLE));
+        assertEquals("OldMetaTitle", m.get("title"));
+
+        assertEquals("John Doe", m.get(TikaCoreProperties.CREATOR));
+        assertEquals("John Doe", m.get("author"));
+    }
+
     private class EncodingDetectorRunner implements Callable<String> {
 
         final static String DONE = "done";
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/resources/test-documents/testHTML_metadata.html b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/resources/test-documents/testHTML_metadata.html
new file mode 100644
index 000000000..c879441fa
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/resources/test-documents/testHTML_metadata.html
@@ -0,0 +1,16 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <meta charset="UTF-8">
+    <meta name="description" content="Free Web tutorials">
+    <meta name="keywords" content="HTML,CSS,XML,JavaScript">
+    <meta name="author" content="John Doe">
+    <meta name="title" content="OldMetaTitle">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+</head>
+<body>
+
+<p>All meta information goes in the head section...</p>
+
+</body>
+</html>