You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by dm...@apache.org on 2013/05/13 19:51:49 UTC
svn commit: r1481990 - in /tika/trunk/tika-parsers/src:
main/java/org/apache/tika/parser/html/HtmlHandler.java
test/java/org/apache/tika/parser/html/HtmlParserTest.java
Author: dmeikle
Date: Mon May 13 17:51:49 2013
New Revision: 1481990
URL: http://svn.apache.org/r1481990
Log:
Patch by Markus Jelsma for TIKA-992 to allow OpenGraph meta tags to have multiple values.
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java?rev=1481990&r1=1481989&r2=1481990&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java Mon May 13 17:51:49 2013
@@ -112,7 +112,7 @@ class HtmlHandler extends TextContentHan
atts.getValue("content"));
} else if (atts.getValue("property") != null) {
// TIKA-983: Handle <meta property="og:xxx" content="yyy" /> tags
- addHtmlMetadata(
+ metadata.add(
atts.getValue("property"),
atts.getValue("content"));
}
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java?rev=1481990&r1=1481989&r2=1481990&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java Mon May 13 17:51:49 2013
@@ -80,7 +80,7 @@ public class HtmlParserTest extends Test
"Title : Test Indexation Html", metadata.get(TikaCoreProperties.TITLE));
assertEquals("Tika Developers", metadata.get("Author"));
assertEquals("5", metadata.get("refresh"));
-
+
assertEquals("51.2312", metadata.get(Geographic.LATITUDE));
assertEquals("-5.1987", metadata.get(Geographic.LONGITUDE));
@@ -408,21 +408,20 @@ public class HtmlParserTest extends Test
*/
public void testBoilerplateRemoval() throws Exception {
String path = "/test-documents/boilerplate.html";
-
+
Metadata metadata = new Metadata();
BodyContentHandler handler = new BodyContentHandler();
new HtmlParser().parse(
HtmlParserTest.class.getResourceAsStream(path),
new BoilerpipeContentHandler(handler), metadata, new ParseContext());
-
+
String content = handler.toString();
assertTrue(content.startsWith("This is the real meat"));
assertTrue(content.endsWith("This is the end of the text.\n"));
assertFalse(content.contains("boilerplate"));
assertFalse(content.contains("footer"));
}
-
-
+
/**
* Test case for TIKA-478. Don't emit <head> sub-elements inside of <body>.
* @see <a href="https://issues.apache.org/jira/browse/TIKA-478">TIKA-478</a>
@@ -439,22 +438,22 @@ public class HtmlParserTest extends Test
makeHtmlTransformer(sw), new Metadata(), new ParseContext());
String result = sw.toString();
-
+
// Title element in <head> section
assertTrue(Pattern.matches("(?s)<html.*<head>.*<title>Title</title>.*</head>.*$", result));
// No meta elements in body
assertFalse(Pattern.matches("(?s).*<body>.*<meta. *</body>.*$", result));
-
+
// meta elements should show up in <head> section
assertTrue(Pattern.matches("(?s)<html.*<head>.*<meta .*</head>.*$", result));
-
+
// No link elements in body
assertFalse(Pattern.matches("(?s).*<body>.*<link .*</body>.*$", result));
-
+
// link element should be in <head> section
assertTrue(Pattern.matches("(?s)<html.*<head>.*<link .*</head>.*$", result));
-
+
// There should be ending elements.
assertTrue(Pattern.matches("(?s).*</body>.*</html>$", result));
@@ -475,7 +474,7 @@ public class HtmlParserTest extends Test
makeHtmlTransformer(sw), new Metadata(), new ParseContext());
String result = sw.toString();
-
+
// <img> tag should exist, with fully resolved URL
assertTrue(Pattern.matches("(?s).*src=\"http://domain.com/image.jpg\".*$", result));
}
@@ -495,7 +494,7 @@ public class HtmlParserTest extends Test
makeHtmlTransformer(sw), new Metadata(), new ParseContext());
String result = sw.toString();
-
+
// <frame> tag should exist, with fully resolved URL
assertTrue(Pattern.matches("(?s).*<frame .* src=\"http://domain.com/frame.html\"/>.*$", result));
}
@@ -516,7 +515,7 @@ public class HtmlParserTest extends Test
makeHtmlTransformer(sw), new Metadata(), new ParseContext());
String result = sw.toString();
-
+
// <iframe> tag should exist, with fully resolved URL
assertTrue(Pattern.matches("(?s).*<iframe .* src=\"http://domain.com/framed.html\".*$", result));
}
@@ -538,7 +537,7 @@ public class HtmlParserTest extends Test
makeHtmlTransformer(sw), new Metadata(), new ParseContext());
String result = sw.toString();
-
+
// <map> tag should exist, with <area> tag with fully resolved URL
assertTrue(Pattern.matches("(?s).*<map .*<area .* href=\"http://domain.com/map.html\".*</map>.*$", result));
}
@@ -560,7 +559,7 @@ public class HtmlParserTest extends Test
makeHtmlTransformer(sw), new Metadata(), new ParseContext());
String result = sw.toString();
-
+
// <object> tag should exist with fully resolved URLs
assertTrue(
"<object> tag not correctly found in:\n" + result,
@@ -578,7 +577,7 @@ public class HtmlParserTest extends Test
Metadata metadata = new Metadata();
metadata.add("Content-Type", "text/html; charset=utf-8");
metadata.add("Language", null);
-
+
StringWriter sw = new StringWriter();
new HtmlParser().parse(
new ByteArrayInputStream(test.getBytes("UTF-8")),
@@ -606,10 +605,10 @@ public class HtmlParserTest extends Test
makeHtmlTransformer(sw1), new Metadata(), new ParseContext());
String result = sw1.toString();
-
+
// <frame> tag should exist, with fully resolved URL
assertTrue(Pattern.matches("(?s).*<frame .* src=\"http://domain.com/frame.html\"/>.*$", result));
-
+
// <body> tag should not exist.
assertFalse(Pattern.matches("(?s).*<body>.*$", result));
@@ -627,7 +626,7 @@ public class HtmlParserTest extends Test
makeHtmlTransformer(sw2), new Metadata(), new ParseContext());
result = sw2.toString();
-
+
// <frame> tags should exist, with relative URL (no base element specified)
assertTrue(Pattern.matches("(?s).*<frame .* src=\"top.html\"/>.*$", result));
assertTrue(Pattern.matches("(?s).*<frame .* src=\"left.html\"/>.*$", result));
@@ -645,22 +644,22 @@ public class HtmlParserTest extends Test
*/
public void testBoilerplateDelegation() throws Exception {
String path = "/test-documents/boilerplate.html";
-
+
Metadata metadata = new Metadata();
StringWriter sw = new StringWriter();
new HtmlParser().parse(
HtmlParserTest.class.getResourceAsStream(path),
makeHtmlTransformer(sw), metadata, new ParseContext());
-
+
String content = sw.toString();
-
+
// Should have <html>, <head>, <title>, <body> elements
assertTrue(Pattern.matches("(?s).*<html xmlns=\"http://www.w3.org/1999/xhtml\">.*</html>.*$", content));
assertTrue(Pattern.matches("(?s).*<head>.*</head>.*$", content));
assertTrue(Pattern.matches("(?s).*<title>Title</title>.*$", content));
assertTrue(Pattern.matches("(?s).*<body>.*</body>.*$", content));
}
-
+
/**
* Test case for TIKA-481. Verify href in <link> is resolved.
* @see <a href="https://issues.apache.org/jira/browse/TIKA-481">TIKA-481</a>
@@ -677,16 +676,16 @@ public class HtmlParserTest extends Test
makeHtmlTransformer(sw), new Metadata(), new ParseContext());
String result = sw.toString();
-
+
// <link> tag should exist in <head>, with fully resolved URL
assertTrue(Pattern.matches("(?s).*<head>.*<link rel=\"next\" href=\"http://domain.com/next.html\"/>.*</head>.*$", result));
}
-
+
/**
* Create ContentHandler that transforms SAX events into textual HTML output,
* and writes it out to <writer> - typically this is a StringWriter.
- *
+ *
* @param writer Where to write resulting HTML text.
* @return ContentHandler suitable for passing to parse() methods.
* @throws Exception
@@ -700,24 +699,24 @@ public class HtmlParserTest extends Test
handler.setResult(new StreamResult(writer));
return handler;
}
-
+
/**
* Test case for TIKA-564. Support returning markup from BoilerpipeContentHandler.
* @see <a href="https://issues.apache.org/jira/browse/TIKA-564">TIKA-564</a>
*/
public void testBoilerplateWithMarkup() throws Exception {
String path = "/test-documents/boilerplate.html";
-
+
Metadata metadata = new Metadata();
StringWriter sw = new StringWriter();
ContentHandler ch = makeHtmlTransformer(sw);
BoilerpipeContentHandler bpch = new BoilerpipeContentHandler(ch);
bpch.setIncludeMarkup(true);
-
+
new HtmlParser().parse(
HtmlParserTest.class.getResourceAsStream(path),
bpch, metadata, new ParseContext());
-
+
String content = sw.toString();
assertTrue("Has empty table elements", content.contains("<body><table><tr><td><table><tr><td>"));
assertTrue("Has empty a element", content.contains("<a shape=\"rect\" href=\"Main.php\"/>"));
@@ -741,7 +740,7 @@ public class HtmlParserTest extends Test
/**
* Test case for TIKA-869
* IdentityHtmlMapper needs to lower-case tag names.
- *
+ *
* @see <a href="https://issues.apache.org/jira/browse/TIKA-869">TIKA-869</a>
*/
public void testIdentityMapper() throws Exception {
@@ -756,16 +755,16 @@ public class HtmlParserTest extends Test
new HtmlParser().parse (
new ByteArrayInputStream(html.getBytes("UTF-8")),
makeHtmlTransformer(sw), metadata, parseContext);
-
+
String result = sw.toString();
// Make sure we don't get <body><BODY/></body>
assertTrue(Pattern.matches("(?s).*<body/>.*$", result));
}
-
+
/**
* Test case for TIKA-889
* XHTMLContentHandler wont emit newline when html element matches ENDLINE set.
- *
+ *
* @see <a href="https://issues.apache.org/jira/browse/TIKA-889">TIKA-889</a>
*/
public void testNewlineAndIndent() throws Exception {
@@ -776,22 +775,24 @@ public class HtmlParserTest extends Test
new HtmlParser().parse(
new ByteArrayInputStream(html.getBytes("UTF-8")),
handler, new Metadata(), new ParseContext());
-
+
// Make sure we get <tab>, "one", newline, newline
String result = handler.toString();
-
+
assertTrue(Pattern.matches("\tone\n\n", result));
}
/**
* Test case for TIKA-983: HTML parser should add Open Graph meta tag data to Metadata returned by parser
- *
+ *
* @see <a href="https://issues.apache.org/jira/browse/TIKA-983">TIKA-983</a>
*/
public void testOpenGraphMetadata() throws Exception {
String test1 =
"<html><head><meta property=\"og:description\""
+ " content=\"some description\" />"
+ + "<meta property=\"og:image\" content=\"http://example.com/image1.jpg\" />"
+ + "<meta property=\"og:image\" content=\"http://example.com/image2.jpg\" />"
+ "<title>hello</title>"
+ "</head><body></body></html>";
Metadata metadata = new Metadata();
@@ -799,7 +800,7 @@ public class HtmlParserTest extends Test
new ByteArrayInputStream(test1.getBytes("ISO-8859-1")),
new BodyContentHandler(), metadata, new ParseContext());
assertEquals("some description", metadata.get("og:description"));
-
+ assertTrue(metadata.isMultiValued("og:image"));
}
// TIKA-1011