You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/11/08 02:20:03 UTC
tika git commit: TIKA-1896 -- add test files and unit tests, no fix yet
Repository: tika
Updated Branches:
refs/heads/master aadccbf97 -> 7b45c7ceb
TIKA-1896 -- add test files and unit tests, no fix yet
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/7b45c7ce
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/7b45c7ce
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/7b45c7ce
Branch: refs/heads/master
Commit: 7b45c7ceb0830cb33a04571da87dd86a817d4138
Parents: aadccbf
Author: tballison <ta...@mitre.org>
Authored: Mon Nov 7 21:19:56 2016 -0500
Committer: tballison <ta...@mitre.org>
Committed: Mon Nov 7 21:19:56 2016 -0500
----------------------------------------------------------------------
.../apache/tika/parser/html/HtmlParserTest.java | 17 ++++++++++++++++-
.../test-documents/testHTMLBadScript.html | 9 +++++++++
.../test-documents/testHTMLGoodScript.html | 9 +++++++++
3 files changed, 34 insertions(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/7b45c7ce/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
index 41efcc0..75744ca 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
@@ -127,7 +127,7 @@ public class HtmlParserTest extends TikaTest {
}
@Test
- @Ignore("The file 'testXHTML_utf8.html' is not available fo testing")
+ @Ignore("The file 'testXHTML_utf8.html' is not available for testing")
public void XtestParseUTF8() throws IOException, SAXException, TikaException {
String path = "/test-documents/testXHTML_utf8.html";
Metadata metadata = new Metadata();
@@ -1219,6 +1219,21 @@ public class HtmlParserTest extends TikaTest {
}
@Test
+ @Ignore("until we fix TIKA-1896")
+ public void testBadScript() throws Exception {
+ String xml = getXML("testHTMLBadScript.html").xml;
+ assertContains("This is a test", xml);
+ assertNotContained("cool", xml);
+ }
+
+ @Test
+ public void testGoodScript() throws Exception {
+ String xml = getXML("testHTMLGoodScript.html").xml;
+ assertContains("This is a test", xml);
+ assertNotContained("cool", xml);
+ }
+
+ @Test
public void testMultiThreadingEncodingDetection() throws Exception {
List<EncodingDetector> detectors = new ArrayList<>();
ServiceLoader loader =
http://git-wip-us.apache.org/repos/asf/tika/blob/7b45c7ce/tika-parsers/src/test/resources/test-documents/testHTMLBadScript.html
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/testHTMLBadScript.html b/tika-parsers/src/test/resources/test-documents/testHTMLBadScript.html
new file mode 100644
index 0000000..2c61f4f
--- /dev/null
+++ b/tika-parsers/src/test/resources/test-documents/testHTMLBadScript.html
@@ -0,0 +1,9 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html>
+<head>
+ <script lang="javascript">cool script</script language>
+</head>
+<body>
+<p>This is a test.</p>
+</body>
+</html>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/7b45c7ce/tika-parsers/src/test/resources/test-documents/testHTMLGoodScript.html
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/testHTMLGoodScript.html b/tika-parsers/src/test/resources/test-documents/testHTMLGoodScript.html
new file mode 100644
index 0000000..f37eb98
--- /dev/null
+++ b/tika-parsers/src/test/resources/test-documents/testHTMLGoodScript.html
@@ -0,0 +1,9 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html>
+<head>
+ <script lang="javascript">cool script</script>
+</head>
+<body>
+<p>This is a test.</p>
+</body>
+</html>
\ No newline at end of file