You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by dm...@apache.org on 2013/05/13 19:51:49 UTC

svn commit: r1481990 - in /tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/html/HtmlHandler.java test/java/org/apache/tika/parser/html/HtmlParserTest.java

Author: dmeikle
Date: Mon May 13 17:51:49 2013
New Revision: 1481990

URL: http://svn.apache.org/r1481990
Log:
Patch by Markus Jelsma for TIKA-992 to allow OpenGraph meta tags to have multiple values.

Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java?rev=1481990&r1=1481989&r2=1481990&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java Mon May 13 17:51:49 2013
@@ -112,7 +112,7 @@ class HtmlHandler extends TextContentHan
                             atts.getValue("content"));
                 } else if (atts.getValue("property") != null) {
                     // TIKA-983: Handle <meta property="og:xxx" content="yyy" /> tags
-                    addHtmlMetadata(
+                    metadata.add(
                             atts.getValue("property"),
                             atts.getValue("content"));
                 }

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java?rev=1481990&r1=1481989&r2=1481990&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java Mon May 13 17:51:49 2013
@@ -80,7 +80,7 @@ public class HtmlParserTest extends Test
                 "Title : Test Indexation Html", metadata.get(TikaCoreProperties.TITLE));
         assertEquals("Tika Developers", metadata.get("Author"));
         assertEquals("5", metadata.get("refresh"));
-        
+
         assertEquals("51.2312", metadata.get(Geographic.LATITUDE));
         assertEquals("-5.1987", metadata.get(Geographic.LONGITUDE));
 
@@ -408,21 +408,20 @@ public class HtmlParserTest extends Test
      */
     public void testBoilerplateRemoval() throws Exception {
         String path = "/test-documents/boilerplate.html";
-        
+
         Metadata metadata = new Metadata();
         BodyContentHandler handler = new BodyContentHandler();
         new HtmlParser().parse(
                 HtmlParserTest.class.getResourceAsStream(path),
                 new BoilerpipeContentHandler(handler),  metadata, new ParseContext());
-        
+
         String content = handler.toString();
         assertTrue(content.startsWith("This is the real meat"));
         assertTrue(content.endsWith("This is the end of the text.\n"));
         assertFalse(content.contains("boilerplate"));
         assertFalse(content.contains("footer"));
     }
-    
-    
+
     /**
      * Test case for TIKA-478. Don't emit <head> sub-elements inside of <body>.
      * @see <a href="https://issues.apache.org/jira/browse/TIKA-478">TIKA-478</a>
@@ -439,22 +438,22 @@ public class HtmlParserTest extends Test
                 makeHtmlTransformer(sw), new Metadata(), new ParseContext());
 
         String result = sw.toString();
-        
+
         // Title element in <head> section
         assertTrue(Pattern.matches("(?s)<html.*<head>.*<title>Title</title>.*</head>.*$", result));
 
         // No meta elements in body
         assertFalse(Pattern.matches("(?s).*<body>.*<meta. *</body>.*$", result));
-        
+
         // meta elements should show up in <head> section
         assertTrue(Pattern.matches("(?s)<html.*<head>.*<meta .*</head>.*$", result));
-        
+
         // No link elements in body
         assertFalse(Pattern.matches("(?s).*<body>.*<link .*</body>.*$", result));
-        
+
         // link element should be in <head> section
         assertTrue(Pattern.matches("(?s)<html.*<head>.*<link .*</head>.*$", result));
-        
+
         // There should be ending elements.
         assertTrue(Pattern.matches("(?s).*</body>.*</html>$", result));
 
@@ -475,7 +474,7 @@ public class HtmlParserTest extends Test
                 makeHtmlTransformer(sw), new Metadata(), new ParseContext());
 
         String result = sw.toString();
-        
+
         // <img> tag should exist, with fully resolved URL
         assertTrue(Pattern.matches("(?s).*src=\"http://domain.com/image.jpg\".*$", result));
     }
@@ -495,7 +494,7 @@ public class HtmlParserTest extends Test
                 makeHtmlTransformer(sw), new Metadata(), new ParseContext());
 
         String result = sw.toString();
-        
+
         // <frame> tag should exist, with fully resolved URL
         assertTrue(Pattern.matches("(?s).*<frame .* src=\"http://domain.com/frame.html\"/>.*$", result));
     }
@@ -516,7 +515,7 @@ public class HtmlParserTest extends Test
                 makeHtmlTransformer(sw), new Metadata(), new ParseContext());
 
         String result = sw.toString();
-        
+
         // <iframe> tag should exist, with fully resolved URL
         assertTrue(Pattern.matches("(?s).*<iframe .* src=\"http://domain.com/framed.html\".*$", result));
     }
@@ -538,7 +537,7 @@ public class HtmlParserTest extends Test
                 makeHtmlTransformer(sw), new Metadata(), new ParseContext());
 
         String result = sw.toString();
-        
+
         // <map> tag should exist, with <area> tag with fully resolved URL
         assertTrue(Pattern.matches("(?s).*<map .*<area .* href=\"http://domain.com/map.html\".*</map>.*$", result));
     }
@@ -560,7 +559,7 @@ public class HtmlParserTest extends Test
                 makeHtmlTransformer(sw), new Metadata(), new ParseContext());
 
         String result = sw.toString();
-        
+
         // <object> tag should exist with fully resolved URLs
         assertTrue(
               "<object> tag not correctly found in:\n" + result,
@@ -578,7 +577,7 @@ public class HtmlParserTest extends Test
         Metadata metadata = new Metadata();
         metadata.add("Content-Type", "text/html; charset=utf-8");
         metadata.add("Language", null);
-        
+
         StringWriter sw = new StringWriter();
         new HtmlParser().parse(
                 new ByteArrayInputStream(test.getBytes("UTF-8")),
@@ -606,10 +605,10 @@ public class HtmlParserTest extends Test
                 makeHtmlTransformer(sw1), new Metadata(), new ParseContext());
 
         String result = sw1.toString();
-        
+
         // <frame> tag should exist, with fully resolved URL
         assertTrue(Pattern.matches("(?s).*<frame .* src=\"http://domain.com/frame.html\"/>.*$", result));
-        
+
         // <body> tag should not exist.
         assertFalse(Pattern.matches("(?s).*<body>.*$", result));
 
@@ -627,7 +626,7 @@ public class HtmlParserTest extends Test
                 makeHtmlTransformer(sw2), new Metadata(), new ParseContext());
 
         result = sw2.toString();
-        
+
         // <frame> tags should exist, with relative URL (no base element specified)
         assertTrue(Pattern.matches("(?s).*<frame .* src=\"top.html\"/>.*$", result));
         assertTrue(Pattern.matches("(?s).*<frame .* src=\"left.html\"/>.*$", result));
@@ -645,22 +644,22 @@ public class HtmlParserTest extends Test
      */
     public void testBoilerplateDelegation() throws Exception {
         String path = "/test-documents/boilerplate.html";
-        
+
         Metadata metadata = new Metadata();
         StringWriter sw = new StringWriter();
         new HtmlParser().parse(
                 HtmlParserTest.class.getResourceAsStream(path),
                 makeHtmlTransformer(sw),  metadata, new ParseContext());
-        
+
         String content = sw.toString();
-        
+
         // Should have <html>, <head>, <title>, <body> elements
         assertTrue(Pattern.matches("(?s).*<html xmlns=\"http://www.w3.org/1999/xhtml\">.*</html>.*$", content));
         assertTrue(Pattern.matches("(?s).*<head>.*</head>.*$", content));
         assertTrue(Pattern.matches("(?s).*<title>Title</title>.*$", content));
         assertTrue(Pattern.matches("(?s).*<body>.*</body>.*$", content));
     }
-    
+
     /**
      * Test case for TIKA-481. Verify href in <link> is resolved.
      * @see <a href="https://issues.apache.org/jira/browse/TIKA-481">TIKA-481</a>
@@ -677,16 +676,16 @@ public class HtmlParserTest extends Test
                 makeHtmlTransformer(sw), new Metadata(), new ParseContext());
 
         String result = sw.toString();
-        
+
         // <link> tag should exist in <head>, with fully resolved URL
         assertTrue(Pattern.matches("(?s).*<head>.*<link rel=\"next\" href=\"http://domain.com/next.html\"/>.*</head>.*$", result));
     }
-    
+
 
     /**
      * Create ContentHandler that transforms SAX events into textual HTML output,
      * and writes it out to <writer> - typically this is a StringWriter.
-     * 
+     *
      * @param writer Where to write resulting HTML text.
      * @return ContentHandler suitable for passing to parse() methods.
      * @throws Exception
@@ -700,24 +699,24 @@ public class HtmlParserTest extends Test
         handler.setResult(new StreamResult(writer));
         return handler;
     }
-    
+
     /**
      * Test case for TIKA-564. Support returning markup from BoilerpipeContentHandler.
      * @see <a href="https://issues.apache.org/jira/browse/TIKA-564">TIKA-564</a>
      */
     public void testBoilerplateWithMarkup() throws Exception {
         String path = "/test-documents/boilerplate.html";
-        
+
         Metadata metadata = new Metadata();
         StringWriter sw = new StringWriter();
         ContentHandler ch = makeHtmlTransformer(sw);
         BoilerpipeContentHandler bpch = new BoilerpipeContentHandler(ch);
         bpch.setIncludeMarkup(true);
-        
+
         new HtmlParser().parse(
                 HtmlParserTest.class.getResourceAsStream(path),
                 bpch,  metadata, new ParseContext());
-        
+
         String content = sw.toString();
         assertTrue("Has empty table elements", content.contains("<body><table><tr><td><table><tr><td>"));
         assertTrue("Has empty a element", content.contains("<a shape=\"rect\" href=\"Main.php\"/>"));
@@ -741,7 +740,7 @@ public class HtmlParserTest extends Test
     /**
      * Test case for TIKA-869
      * IdentityHtmlMapper needs to lower-case tag names.
-     * 
+     *
      * @see <a href="https://issues.apache.org/jira/browse/TIKA-869">TIKA-869</a>
      */
     public void testIdentityMapper() throws Exception {
@@ -756,16 +755,16 @@ public class HtmlParserTest extends Test
         new HtmlParser().parse (
                 new ByteArrayInputStream(html.getBytes("UTF-8")),
                 makeHtmlTransformer(sw),  metadata, parseContext);
-        
+
         String result = sw.toString();
         // Make sure we don't get <body><BODY/></body>
         assertTrue(Pattern.matches("(?s).*<body/>.*$", result));
     }
-    
+
     /**
      * Test case for TIKA-889
      * XHTMLContentHandler wont emit newline when html element matches ENDLINE set.
-     * 
+     *
      * @see <a href="https://issues.apache.org/jira/browse/TIKA-889">TIKA-889</a>
      */
     public void testNewlineAndIndent() throws Exception {
@@ -776,22 +775,24 @@ public class HtmlParserTest extends Test
         new HtmlParser().parse(
                 new ByteArrayInputStream(html.getBytes("UTF-8")),
                 handler,  new Metadata(), new ParseContext());
-        
+
         // Make sure we get <tab>, "one", newline, newline
         String result = handler.toString();
-        
+
         assertTrue(Pattern.matches("\tone\n\n", result));
     }
 
     /**
      * Test case for TIKA-983:  HTML parser should add Open Graph meta tag data to Metadata returned by parser
-     * 
+     *
      * @see <a href="https://issues.apache.org/jira/browse/TIKA-983">TIKA-983</a>
      */
     public void testOpenGraphMetadata() throws Exception {
         String test1 =
             "<html><head><meta property=\"og:description\""
             + " content=\"some description\" />"
+            + "<meta property=\"og:image\" content=\"http://example.com/image1.jpg\" />"
+            + "<meta property=\"og:image\" content=\"http://example.com/image2.jpg\" />"
             + "<title>hello</title>"
             + "</head><body></body></html>";
         Metadata metadata = new Metadata();
@@ -799,7 +800,7 @@ public class HtmlParserTest extends Test
                 new ByteArrayInputStream(test1.getBytes("ISO-8859-1")),
                 new BodyContentHandler(),  metadata, new ParseContext());
         assertEquals("some description", metadata.get("og:description"));
-
+        assertTrue(metadata.isMultiValued("og:image"));
     }
 
     // TIKA-1011