You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/03/22 02:19:13 UTC
[01/13] tika git commit: TIKA-1855 -- first pass. Need to turn back
on the forbidden-apis testCheck. More clean up remains.
Repository: tika
Updated Branches:
refs/heads/2.x cf9632388 -> e1498edbb
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-test-resources/src/test/resources/test-documents/testlargerbuffer.html
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/testlargerbuffer.html b/tika-test-resources/src/test/resources/test-documents/testlargerbuffer.html
new file mode 100644
index 0000000..545addd
--- /dev/null
+++ b/tika-test-resources/src/test/resources/test-documents/testlargerbuffer.html
@@ -0,0 +1,827 @@
+<script language="javascript">
+
+function addToList(from,to)
+{
+ if(from.selectedIndex >= 0) {
+ isPresent = false;
+ var options=to.getElementsByTagName("option");
+
+ if(from.item(from.selectedIndex).value == "0") {
+ for(i=to.options.length-1; i>= 0; i--) {
+ to.removeChild(options[i]);
+ }
+ }
+ for (i=0; i< to.options.length; i++)
+ {
+ if(options[i].value == from.item(from.selectedIndex).value || options[i].value == "0")
+ isPresent = true;
+ }
+ if(!isPresent) {
+ var oOption = document.createElement("option");;
+ to.appendChild(oOption);
+ oOption.value = from.item(from.selectedIndex).value;
+ oOption.text = from.item(from.selectedIndex).text;
+ }
+ }
+}
+
+function delFromList(to)
+{
+ if(to.selectedIndex >= 0) {
+ var options=to.getElementsByTagName("option");
+ to.removeChild(options[to.selectedIndex]);
+ }
+}
+
+function fillListToGet(form, to)
+{
+ var options=to.getElementsByTagName("option");
+ for (i=0; i< to.options.length; i++)
+ {
+ form.action += "&"+to.name+"="+options[i].value;
+ }
+}
+
+</script>
+<script language="javascript">
+
+function addToList(from,to)
+{
+ if(from.selectedIndex >= 0) {
+ isPresent = false;
+ var options=to.getElementsByTagName("option");
+
+ if(from.item(from.selectedIndex).value == "0") {
+ for(i=to.options.length-1; i>= 0; i--) {
+ to.removeChild(options[i]);
+ }
+ }
+ for (i=0; i< to.options.length; i++)
+ {
+ if(options[i].value == from.item(from.selectedIndex).value || options[i].value == "0")
+ isPresent = true;
+ }
+ if(!isPresent) {
+ var oOption = document.createElement("option");;
+ to.appendChild(oOption);
+ oOption.value = from.item(from.selectedIndex).value;
+ oOption.text = from.item(from.selectedIndex).text;
+ }
+ }
+}
+
+function delFromList(to)
+{
+ if(to.selectedIndex >= 0) {
+ var options=to.getElementsByTagName("option");
+ to.removeChild(options[to.selectedIndex]);
+ }
+}
+
+function fillListToGet(form, to)
+{
+ var options=to.getElementsByTagName("option");
+ for (i=0; i< to.options.length; i++)
+ {
+ form.action += "&"+to.name+"="+options[i].value;
+ }
+}
+
+function fillOtherGet(form)
+{
+ if (document.all.price_from != "") {
+ form.action += "&price_from="+document.all.price_from.value;
+ }
+ if (document.all.price_to != "") {
+ form.action += "&price_to="+document.all.price_to.value;
+ }
+ if (document.all.square_from != "") {
+ form.action += "&square_from="+document.all.square_from.value;
+ }
+ if (document.all.square_to != "") {
+ form.action += "&square_to="+document.all.square_to.value;
+ }
+ if (document.all.MKAD != "") {
+ form.action += "&MKAD="+document.all.MKAD.value;
+ }
+}
+
+</script>
+<script language="javascript">
+
+function addToList(from,to)
+{
+ if(from.selectedIndex >= 0) {
+ isPresent = false;
+ var options=to.getElementsByTagName("option");
+
+ if(from.item(from.selectedIndex).value == "0") {
+ for(i=to.options.length-1; i>= 0; i--) {
+ to.removeChild(options[i]);
+ }
+ }
+ for (i=0; i< to.options.length; i++)
+ {
+ if(options[i].value == from.item(from.selectedIndex).value || options[i].value == "0")
+ isPresent = true;
+ }
+ if(!isPresent) {
+ var oOption = document.createElement("option");
+ to.appendChild(oOption);
+ oOption.value = from.item(from.selectedIndex).value;
+ oOption.text = from.item(from.selectedIndex).text;
+ }
+ }
+}
+
+function delFromList(to)
+{
+ if(to.selectedIndex >= 0) {
+ var options=to.getElementsByTagName("option");
+ to.removeChild(options[to.selectedIndex]);
+ }
+}
+
+function fillListToGet(form, to)
+{
+ var options=to.getElementsByTagName("option");
+ for (i=0; i< to.options.length; i++)
+ {
+ form.action += "&"+to.name+"="+options[i].value;
+ }
+}
+
+function fillOtherGet(form)
+{
+ if (document.all.price_from != "") {
+ form.action += "&price_from="+document.all.price_from.value;
+ }
+ if (document.all.price_to != "") {
+ form.action += "&price_to="+document.all.price_to.value;
+ }
+ if (document.all.square_from != "") {
+ form.action += "&square_from="+document.all.square_from.value;
+ }
+ if (document.all.square_to != "") {
+ form.action += "&square_to="+document.all.square_to.value;
+ }
+ if (document.all.MKAD != "") {
+ form.action += "&MKAD="+document.all.MKAD.value;
+ }
+}
+
+</script>
+
+<html>
+<head>
+<title>������ �������, ����� ��������, ������ ������ � ������. ������������ ������������. ������ �������� "���������-������������"
+
+</title>
+<link rel="SHORTCUT ICON" href="/favicon.ico" />
+<meta http-equiv="Content-Type" content="text/html; charset=windows-1251">
+<meta http-equiv="Content-Language" content="ru">
+<meta name="Keywords" content="��������� ������������, ������, �����, �����, ����, ��������, �������, ���������, �����, �������, �������, �������, ���, ������, �������, ������������, ����������, �������, ������������, ������, ������, ����">
+<meta name="Description" content="��������� ������������ "��������� ������������", "������� ����" ������. ������ � ������� ������������ � ����� ������������ � ������ � �����������: �������, ���������, ����������������, �������� � ������ ������� ���������, ��������, �������, ��������, ����, ����. ������ �������, ������, ���������. ����� ��������. ����� ������. ������ ����.">
+<meta http-equiv="description" content="��������� ������������ "��������� ������������", "������� ����" ������. ������ � ������� ������������ � ����� ������������ � ������ � �����������: �������, ���������, ����������������, �������� � ������ ������� ���������, ��������, �������, ��������, ����, ����. ������ �������, ������, ���������. ����� ��������. ����� ������. ������ ����.">
+<meta name="revisit" content="7 days">
+<meta name='yandex-verification' content='77a043af80883202' />
+
+<link rel="stylesheet" href="continent.css" type="text/css">
+</head>
+<body bgcolor="#FFFFFF" text="#000000" leftmargin="0" topmargin="0" marginwidth="0" marginheight="0">
+<table width="100%" border="0" cellspacing="0" cellpadding="0" height="100%">
+ <tr>
+ <td height="10">
+ <noindex><table width="100%" border="0" cellspacing="0" cellpadding="0">
+ <tr>
+ <td><a title="������ ������� ������� ������" href="/default.asp"><img src="imgs/logo2.gif" Alt="������ ������� �������, ������, ������, �������, ���������" width="205" height="68" style="margin-top:13px; margin-bottom:3px; margin-left:13px;" border=0></a></td>
+ <td align=center valign=bottom>
+
+ <a href='http://office.realty-guide.ru/rot/?key=289' target=_blank><img src='/imgs/banners/ban32.gif' border=0 width=500 height=75></a>
+
+ </td>
+ </tr>
+ </table></noindex>
+ </td>
+ </tr>
+ <tr>
+ <td valign="top" height="100%">
+ <table width="100%" border="0" cellspacing="0" cellpadding="0" height="100%">
+ <tr>
+ <td width="228" bgcolor="#546154" valign="top" align=center>
+ <table width="100%" border="0" cellspacing="0" cellpadding="0" height=402>
+ <tr>
+ <td height="147" background="imgs/hd_bg2.gif" valign="top"><img src="imgs/h_fl.jpg" width="202" height="136" style="margin-top: 10px; margin-left: 14px;" alt="������ ������� �������, ������, ������, �������, ���������"></td>
+ </tr>
+ <tr>
+ <td height="255" valign="top">
+ <OBJECT classid="clsid:D27CDB6E-AE6D-11cf-96B8-444553540000"
+ codebase="http://download.macromedia.com/pub/shockwave/cabs/flash/swflash.cab#version=6,0,0,0"
+ WIDTH="228" HEIGHT="250" id="menu10" ALIGN="">
+ <PARAM NAME=movie VALUE="menu10.swf"> <PARAM NAME=quality VALUE=high> <PARAM NAME=bgcolor VALUE=#525E52> <EMBED src="menu10.swf" quality=high bgcolor=#525E52 WIDTH="228" HEIGHT="250" NAME="menu10" ALIGN=""
+ TYPE="application/x-shockwave-flash" PLUGINSPAGE="http://www.macromedia.com/go/getflashplayer"></EMBED>
+</OBJECT>
+ </td>
+ </tr>
+ </table>
+<a href="/kommvip.asp"><img width=169 height=114 src="/imgs/vipbanner3.gif" border=0 alt="� ������ ������� �� ������ ������������ �� ������������� ��� �������� �� ������ ������������ ������������: ������ ������, �������, ���������, ����, ����������, ��������� ���������� ���������� � �.�., ������������ ������� ��������� ������������ ������� � ���������� ���������-������������"></a>
+<br>
+<br>
+<a href="/arendavip.asp"><img width=169 height=114 src="/imgs/vipbanner_arenda.jpg" border=0 alt="� ������ ������� �� ������ ������������ �� ������������ ������������� �� ������ �����: ������ �������, ������ ���������, ������ ���, ������ ����� � �.�., ������������ ������� ��������� ������������ ������� � ���������� ���������-������������"></a>
+<br>
+<br>
+<noindex><a target=_blank title="���������� ������� ������" href="http://www.lagunadom.ru"><img width=169 height=114 src="/ban/ban_169_114.gif" border=0 alt="���������� ������� ������"></a></noindex>
+<br>
+<br>
+<br>
+<br>
+<noindex><!--a target=_blank title="������������� ����, ���������� ����, ����, ������������� ���, ���������� ��� - ��������-������� ���������� ����" href="http://www.nyelki.ru"><img width=169 height=94 src="/imgs/banner.jpg" border=0 alt="������������� ����, ���������� ����, ����, ������������� ���, ���������� ��� - ��������-������� ���������� ����"></a>
+<br>
+<br>
+<br>
+<br-->
+<!-- Yandex.Metrika -->
+<script src="//mc.yandex.ru/resource/watch.js" type="text/javascript"></script>
+<script type="text/javascript">
+try { var yaCounter177293 = new Ya.Metrika(177293); } catch(e){}
+</script>
+<noscript><div style="position: absolute;"><img src="//mc.yandex.ru/watch/177293" alt="" /></div></noscript>
+<!-- Yandex.Metrika -->
+<!--Rating@Mail.ru COUNTER--><script language="JavaScript" type="text/javascript"><!--
+d=document;var a='';a+=';r='+escape(d.referrer)
+js=10//--></script><script language="JavaScript1.1" type="text/javascript"><!--
+a+=';j='+navigator.javaEnabled()
+js=11//--></script><script language="JavaScript1.2" type="text/javascript"><!--
+s=screen;a+=';s='+s.width+'*'+s.height
+a+=';d='+(s.colorDepth?s.colorDepth:s.pixelDepth)
+js=12//--></script><script language="JavaScript1.3" type="text/javascript"><!--
+js=13//--></script><script language="JavaScript" type="text/javascript"><!--
+d.write('<a target=_blank href="http://top.mail.ru/jump?from=782596"'+
+' target=_top><img src="http://top.list.ru/counter'+
+'?id=782596;t=54;js='+js+a+';rand='+Math.random()+
+'" alt="�������@Mail.ru"'+' border=0 height=31 width=88/><\/a>')
+if(11<js)d.write('<'+'!-- ')//--></script><noscript><a
+target=_blank href="http://top.mail.ru/jump?from=782596"><img
+src="http://top.list.ru/counter?js=na;id=782596;t=54"
+border=0 height=31 width=88
+alt="�������@Mail.ru"/></a></noscript><script language="JavaScript" type="text/javascript"><!--
+if(11<js)d.write('--'+'>')//--></script><!--/COUNTER--></noindex>
+<br>
+ <br><br>
+ </td>
+ <td valign="top" bgcolor="#546154" height="100%">
+ <table width="100%" border="0" cellspacing="0" cellpadding="0" height="100%">
+ <tr>
+ <td height="4" background="imgs/hd_bg1.gif" align="right" valign="top" style="padding-right:13px; font-size:4px;"> </td>
+ </tr>
+ <tr>
+ <td valign="top" style="padding-right:13px;" height="20" align=right background="imgs/hd_bg1n.gif">
+ <table border=0 cellspacing=0 cellpadding=0 height=20>
+ <tr>
+
+ <td><img src="/imgs/tabl1_p.gif" height=20></td>
+ <td valign=bottom background="/imgs/tabl2_p.gif"><div style="padding-bottom:2px;"><a style="color:#000000; text-decoration:none;" href="/basket.asp">�������</a></td>
+ <td><img src="/imgs/tablr_pa.gif" height=20></td>
+ <td valign=bottom background="/imgs/tabl2_a.gif"><div style="padding-bottom:2px; font-weight:bold; text-transform:uppercase;">���������-������������</div></div></td>
+ <td><img src="/imgs/tabl3_a.gif" height=20></td>
+
+ </tr>
+ </table>
+ </td>
+ </tr>
+ <tr>
+ <td valign="top" style="padding-bottom:13px;padding-right:13px;" height="100%">
+
+<style>
+a:link { color: #000000; text-decoration: none;}
+a:visited { color: #000000; text-decoration: none;}
+a:active { color: #000000; text-decoration: none;}
+a:hover { color: #1FB21F; text-decoration: underline;}
+h2 { margin:0px; padding:0px; font-weight: normal; font-size: 8pt; text-decoration:none;}
+</style>
+<table width="100%" border="0" cellspacing="0" cellpadding="0" bgcolor=#FFFFFF>
+<tr>
+ <td valign=top align=left><img src="/imgs/fp2.gif" width=37 height=31></td>
+ <td valign=top align=right><img src="/imgs/fp1.gif" width=257 height=24></td>
+</tr>
+</table>
+<table width="100%" border="0" cellspacing="5" cellpadding="0" bgcolor=#FFFFFF>
+<tr>
+ <td valign=top>
+ <table width="100%" border="0" cellspacing="1" cellpadding="0" bgcolor=#FFFFFF>
+ <tr>
+ <td width=12 valign=top><img src="/imgs/fp_li2.gif" width=8 height=15></td>
+ <td>
+ <h1>���������-������������:</h1>
+<p style="font-family:Times New Roman; font-size:14px; margin-top:10px; margin-bottom:0px;"><b>��������� ������������ "���������-������������"</b>, �������� � 1999 ����, ������������ ����� ������� �������������� �� ����� ������������ �. ������, ������� ������������ ������������ ������������ � �������������� ������ � ������� �������.</p>
+<p style="font-family:Times New Roman; font-size:14px; margin-top:10px; margin-bottom:0px;">�������� "<b>���������-������������</b>" ���������� ���������� ������ ������ ���� ������������ �������� "������ � ������ � �����������" �� �������� ������� � ������������ ��������������� ���������������� ����� � ������.</p>
+<p style="font-family:Times New Roman; font-size:14px; margin-top:10px; margin-bottom:0px;">�� ���������� ��������� <b>����������� ������</b>:</p>
+<ul style="font-family:Times New Roman; font-size:14px; margin-top:10px; margin-bottom:0px;">
+<li><b>������ � ������� ������������ ������������ � ������ � �����������</b>: �������, ���������, ����������������, �������� � ������ ������� ���������.
+<li><b>�������, ������ ������ � ������-�������</b>, ������ ������ �����, ������ ����� ��� ��������.
+<li><b>������ � ������� ����� ������������ � ������</b>: ��������, �������.
+<li><b>������ � ������� ���������� ������������ � �����������</b>: ��������, ����, ����.
+<li><b>����������� ������������� ������ �� ������ � �����-������� ����� � ������� ���������</b>.
+<li><b>���������� � ����������� �������������������� ����������</b>.
+<li><b>������������� ���������� �������������</b>.
+</ul>
+<br>
+ </td>
+ </tr>
+ </table>
+ <table width="100%" border="0" cellspacing="0" style="padding-left:12px;" cellpadding="0" bgcolor=#FFFFFF>
+ <tr>
+ <td valign=top width=50%><h1 style="color:red">������ ����� ������������</h1></td>
+ <td valign=top width=50%><h1 style="color:red">������ ������������ ������������</h1></td>
+ </tr>
+ <tr>
+ <td valign=top><br><h1>������ ������� � ������</h1></td>
+ <td valign=top><br><h1>������ ������ � ���������</h1></td>
+ </tr>
+ <tr>
+ <td valign=top>
+ <table width="100%" border="0" cellspacing="5" cellpadding="0" bgcolor=#FFFFFF>
+ <tr>
+ <td valign=top width=60><img width=60 height=60 alt="������ ������� � ������" border=0 class=img1 src="/imgs/fp_i1.jpg"></td>
+ <td valign=top class=fp_small>����� �������� � ������ ���� ��������� ������������ ������� ������ � �������. 150 ����������� ����� �������� ���������. ���� �� ������ ������� ����������� ������ ���.</td>
+ </tr>
+ <tr>
+ <td colspan=2>
+ <a title="������ �������" href="arenda_all.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����������� �� ������ �������</h2></a></div>
+ <a title="�������� � ������" href="arenda_dball.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����� ������� � ������</h2></a></div>
+ </td>
+ </tr>
+ </table>
+ </td>
+ <td valign=top>
+ <table width="100%" border="0" cellspacing="5" cellpadding="0" bgcolor=#FFFFFF>
+ <tr>
+ <td valign=top width=60><img width=60 height=60 alt="������ ������ � ���������" border=0 class=img1 src="/imgs/fp_i2.jpg"></td>
+ <td valign=top class=fp_small>������ ������. ����� ������� ��������� � ������. ����� 2000 ��������� ������ � ������. 100 ����� ����������� ����� ���� ������ ����. ���� �� ������ ������ ����������� ��������. ���� ������.</td>
+ </tr>
+ <tr>
+ <td colspan=2>
+ <a title="������ ������" href="komm.asp?kommtype_id=1&kommtype_id=8"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����������� �� ������ ������</h2></a></div>
+ <a title="����� � ������" href="komm_db.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����� ������ � ������</h2></a></div>
+ </td>
+ </tr>
+ </table>
+ </td>
+ </tr>
+ <tr>
+ <td valign=top><br><h1>������ ������ � ������</h1></td>
+ <td valign=top><br><h1>������ ������� � ������</h1></td>
+ </tr>
+ <tr>
+ <td valign=top>
+ <table width="100%" border="0" cellspacing="5" cellpadding="0" bgcolor=#FFFFFF>
+ <tr>
+ <td valign=top width=60><img width=60 height=60 alt="������ ������ � ������" border=0 class=img1 src="/imgs/fp_i3.jpg"></td>
+ <td valign=top class=fp_small>������ ������ � ����� ������ ������ �� 1 ���� � �������� �������. � ��� ����� ����� ��������� � ������ ������� � ������������ ��������. ������ ����� �������? ������ �������� ������!</td>
+ </tr>
+ <tr>
+ <td colspan=2>
+ <a title="������ ������" href="arenda_all.asp?roomamount=-1"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����������� �� ������ ������</h2></a></div>
+ <a title="������� � ������" href="arenda_dball.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����� ������ � ������</h2></a></div>
+ </td>
+ </tr>
+ </table>
+ </td>
+ <td valign=top>
+ <table width="100%" border="0" cellspacing="5" cellpadding="0" bgcolor=#FFFFFF>
+ <tr>
+ <td valign=top width=60><img width=60 height=60 alt="������ ������� � ������" border=0 class=img1 src="/imgs/fp_i4.jpg"></td>
+ <td valign=top class=fp_small>����� ����� � ������ ��� �����������. �� ����� ����� �� ������ ����� ��������� ����������� �� ������ ��������� ��������� � ��������. ���� �������.</td>
+ </tr>
+ <tr>
+ <td colspan=2>
+ <a title="������ �������" href="komm.asp?kommtype_id=2"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����������� �� ������ �������</h2></a></div>
+ <a title="������ � ������" href="komm_db.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����� ������� � ������</h2></a></div>
+ </td>
+ </tr>
+ </table>
+ </td>
+ </tr>
+ <tr>
+ <td valign=top><br><h1>������ ������� �������. ���� �������.</h1></td>
+ <td valign=top><br><h1>������ ���������������� ���������</h1></td>
+ </tr>
+ <tr>
+ <td valign=top>
+ <table width="100%" border="0" cellspacing="5" cellpadding="0" bgcolor=#FFFFFF>
+ <tr>
+ <td valign=top width=60><img width=60 height=60 alt="������ ������� �������" border=0 class=img1 src="/imgs/fp_i5.jpg"></td>
+ <td valign=top class=fp_small>��� ���, ��� ����� ����� ������� �������� ��� �������� � ������. � ��� �� ����� ����� 1000 �������� ������� ������������ � ������. ���� �������. ����� ������� ��������? �����������, �� �������.</td>
+ </tr>
+ <tr>
+ <td colspan=2>
+ <a title="������ �������" href="arenda_all.asp?elit=1"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����������� �� ������ ������� �������</h2></a></div>
+ <a title="�������� � ������" href="arenda_dball.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����� ������� ������� � ������</h2></a></div>
+ </td>
+ </tr>
+ </table>
+ </td>
+ <td valign=top>
+ <table width="100%" border="0" cellspacing="5" cellpadding="0" bgcolor=#FFFFFF>
+ <tr>
+ <td valign=top width=60><img width=60 height=60 alt="������ ���������������� ���������" border=0 class=img1 src="/imgs/fp_i6.jpg"></td>
+ <td valign=top class=fp_small>��� ���, ��� ����� ����� ��� ����� ������������ � ������ ��� �����������. � ��� �� ���� �� ������ ����� ������� ����� ��������� ��� ������������ . ���� ���������.</td>
+ </tr>
+ <tr>
+ <td colspan=2>
+ <a title="������ �����������" href="komm.asp?kommtype_id=4"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����������� �� ������ �����������</h2></a></div>
+ <a title="������������ � ������" href="komm_db.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����� ����������� � ������</h2></a></div>
+ </td>
+ </tr>
+ </table>
+ </td>
+ </tr>
+ <tr>
+ <td valign=top><br><h1>���������� ������ �������</h1></td>
+ <td valign=top><br><h1>������ ���������</h1></td>
+ </tr>
+ <tr>
+ <td valign=top>
+ <table width="100%" border="0" cellspacing="5" cellpadding="0" bgcolor=#FFFFFF>
+ <tr>
+ <td valign=top width=60><img width=60 height=60 alt="���������� ������ �������" border=0 class=img1 src="/imgs/fp_i7.jpg"></td>
+ <td valign=top class=fp_small>������ �������, ��������������� � ������ �������� ���������, ��������� ������������ ���������� ����� �������� � ������ ���������. ���� ������� � ���������� ������.</td>
+ </tr>
+ <tr>
+ <td colspan=2>
+ <a title="������ �������" href="arendaday_results.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����������� ���������� ������ �������</h2></a></div>
+ <a title="�������� � ������" href="arendaday_db.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����� ������� � ���������� ������</h2></a></div>
+ </td>
+ </tr>
+ </table>
+ </td>
+ <td valign=top>
+ <table width="100%" border="0" cellspacing="5" cellpadding="0" bgcolor=#FFFFFF>
+ <tr>
+ <td valign=top width=60><img width=60 height=60 alt="������ ���������" border=0 class=img1 src="/imgs/fp_i8.jpg"></td>
+ <td valign=top class=fp_small>��� ���, ��� ����� ����� ��� ����� �������. �� ���������� ������� ����� �������� ��������� � �������� � �������� ������� ������. ���� ���������.</td>
+ </tr>
+ <tr>
+ <td colspan=2>
+ <a title="������ ���������" href="komm.asp?kommtype_id=3"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����������� �� ������ ���������</h2></a></div>
+ <a title="�������� � ������" href="komm_db.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����� ��������� � ������</h2></a></div>
+ </td>
+ </tr>
+ </table>
+ </td>
+ </tr>
+ <tr>
+ <td valign=top><br><h1>������ ��������� � ��� � �����������</h1></td>
+ <td valign=top><br><h1>������ ��������� ��� ��������� � ����</h1></td>
+ </tr>
+ <tr>
+ <td valign=top>
+ <table width="100%" border="0" cellspacing="5" cellpadding="0" bgcolor=#FFFFFF>
+ <tr>
+ <td valign=top width=60><img width=60 height=60 alt="������ ��������� � ���" border=0 class=img1 src="/imgs/fp_i9.jpg"></td>
+ <td valign=top class=fp_small>���, ���� ���������� ������ �������� ��� ����������� ���� � �����������, ��������� ������������ ���������� ������� ����� ���������� ������������ . ����� ��� ����� ������� � ���� ��� ������. ����.</td>
+ </tr>
+ <tr>
+ <td colspan=2>
+ <a title="������ ��������� ���" href="arenda_cottage.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����������� ������ ���������, ���</h2></a></div>
+ <a title="�������� ���� � ������" href="cottage_db.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����� ���������, ���, ����� � ������</h2></a></div>
+ </td>
+ </tr>
+ </table>
+ </td>
+ <td valign=top>
+ <table width="100%" border="0" cellspacing="5" cellpadding="0" bgcolor=#FFFFFF>
+ <tr>
+ <td valign=top width=60><img width=60 height=60 alt="������ ���������� � ����" border=0 class=img1 src="/imgs/fp_i10.jpg"></td>
+ <td valign=top class=fp_small>������ ����� ��������� ��� ��������, ��� ��� ����. �� ���� ����� �� ������ ����� ����������� �� ������ ������������ ������������ ��� ������������ ������� � ����. ����� ��� ����� ��������, ����, ��� � ������.</td>
+ </tr>
+ <tr>
+ <td colspan=2>
+ <a title="������ ���������� ����" href="komm.asp?kommtype_id=5&kommtype_id=6"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����������� �� ������ ����������, ����</h2></a></div>
+ <a title="��������� � ���� � ������" href="komm_db.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����� ���������� � ���� � ������</h2></a></div>
+ </td>
+ </tr>
+ </table>
+ </td>
+ </tr>
+ <tr>
+ <td valign=top><br><h1>���������� ������ ��������� � �����������</h1></td>
+ <td valign=top><br><h1>������ ��������� ���������� ����������</h1></td>
+ </tr>
+ <tr>
+ <td valign=top>
+ <table width="100%" border="0" cellspacing="5" cellpadding="0" bgcolor=#FFFFFF>
+ <tr>
+ <td valign=top width=60><img width=60 height=60 alt="���������� ������ ���������" border=0 class=img1 src="/imgs/fp_i13.jpg"></td>
+ <td valign=top class=fp_small>�� ������ �������� �������� ��� ��������� � ���������� ����? ���� ��������� ������������ ���������� ����� ������� ���������. ���� ���������.</td>
+ </tr>
+ <tr>
+ <td colspan=2>
+ <a title="������ ���������" href="arenda_cottageday.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����������� ���������� ������ ���������</h2></a></div>
+ <a title="�������� � ������" href="cottageday_db.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����� ��������� � ���������� ������</h2></a></div>
+ </td>
+ </tr>
+ </table>
+ </td>
+ <td valign=top>
+ <table width="100%" border="0" cellspacing="5" cellpadding="0" bgcolor=#FFFFFF>
+ <tr>
+ <td valign=top width=60><img width=60 height=60 alt="������ ��������� ���������� ����������" border=0 class=img1 src="/imgs/fp_i15.jpg"></td>
+ <td valign=top class=fp_small>����� ��������� ���������� ����������. ������� ����������� ����� ��� � ������. ���� �� ������������ ������������ ����������� ���������. ����.</td>
+ </tr>
+ <tr>
+ <td colspan=2>
+ <a title="������ ���������" href="komm.asp?kommtype_id=7"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����������� �� ������ ��������� ���������� ����������</h2></a></div>
+ <a title="��������� � ������" href="komm_db.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����� ��������� ���������� ���������� � ������</h2></a></div>
+ </td>
+ </tr>
+ </table>
+ </td>
+ </tr>
+ <tr>
+ <td colspan=2 valign=top><br><h1 style="color:red">������� ������������ ������������</h1></td>
+ </tr>
+ <tr>
+ <td colspan=2 align=center valign=top>
+ <table width="100%" border="0" cellspacing="5" cellpadding="0" bgcolor=#FFFFFF>
+ <tr>
+ <td valign=top width=60><img width=60 height=60 alt="������� ������������ ������������" border=0 class=img1 src="/imgs/fp_i14.jpg"></td>
+ <td valign=top class=fp_small>���� �� ������ ������ ��������� ��� �������: ����, �������, �����, ������������, �� �� ������ ������������ � ������������� �� ������� ������������ ������������ ��� ������� ���� ������ �� ������� ��������� � ������. ����-������� ������������ �� �������.</td>
+ </tr>
+ <tr>
+ <td colspan=2>
+ <a title="������� ������" href="kommP.asp?kommtype_id=1"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����������� �� ������� ������</h2></a></div>
+ <a title="������� �������" href="kommP.asp?kommtype_id=2"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����������� �� ������� �������</h2></a></div>
+ <a title="������� ���������" href="kommP.asp?kommtype_id=3"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����������� �� ������� ���������</h2></a></div>
+ <a title="������� ����������" href="kommP.asp?kommtype_id=5"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����������� �� ������� ����������</h2></a></div>
+ <a title="������� ����" href="kommP.asp?kommtype_id=6"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����������� �� ������� ����</h2></a></div>
+ <a title="������� �����������" href="kommP.asp?kommtype_id=4"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����������� �� ������� ���������������� ���������</h2></a></div>
+ <a title="������� ���������" href="kommP.asp?kommtype_id=7"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����������� �� ������� ���</h2></a></div>
+ </td>
+ </tr>
+ </table>
+ </td>
+ </tr>
+ </table>
+ <table width="100%" border="0" cellspacing="1" cellpadding="0" bgcolor=#FFFFFF>
+ <tr>
+ <td width=12 valign=top><img src="/imgs/fp_li2.gif" width=8 height=15></td>
+ <td>
+ <h1>������� ������������. ������:</h1>
+ <br>
+
+ <li><a href="/news.asp?id=69&curr=1"><h2>��������� �� ������ - ������� �������� ������������</h2></a>
+
+ <li><a href="/news.asp?id=68&curr=1"><h2>������ ��������!</h2></a>
+
+ <li><a href="/news.asp?id=67&curr=1"><h2>��� ������ ����������, ���� ����� ����������� �������� �������� �����?</h2></a>
+
+ <li><a href="/news.asp?id=66&curr=1"><h2>5 �������� ����� ������� �������� � ������</h2></a>
+
+ <li><a href="/news.asp?id=65&curr=1"><h2>���� ������� � ����������: ���� �������� � ���?</h2></a>
+
+ <li><a title="������� ������������" href="news.asp"><h2><b>������ ������� ������������...</b></h2></a>
+ <br>
+ </td>
+ </tr>
+ </table>
+ </td>
+ <td width=5> </td>
+ <td valign=top width=300>
+ <h1><img src="/imgs/fp_li2.gif" width=8 height=15> ����������� <font color=red>��� ��������</font>:</h1>
+ <br>
+
+
+<table cellspacing=0 cellpadding=0 border=0 width=100%>
+<tr>
+ <td width=1 bgcolor=#CFCFCF><img src='imgs/blank.gif' width=1 height=1></td>
+ <td valign=top width=100% bgcolor=white>
+ <table cellspacing=1 cellpadding=3 border=0 width=100% style='margin-top:5px;'>
+ <tr>
+ <td width=1 valign=middle><img src='imgs/knop1.gif'></td>
+ <td class=text bgcolor=white valign=middle>
+ <a href="/arendaview_komm.asp?anketa_id=148110" class=menubig><b>������ ������</b></a>
+ </td>
+ </tr>
+ </table>
+ <table width=100% cellspacing=0 cellpadding=3 border=0>
+ <tr>
+ <td width=128 valign=top align=right nowrap>
+ <a href="/arendaview_komm.asp?anketa_id=148110"><img style="margin-right:11px; margin-left:5px;" width=9 height=100 src="/imgs/bez.gif" border=0><img class=img1 width=100 height=100 src="/imgs/Photos5/s_k_67491.jpg" border=0 alt="������ ������"></a>
+ </td>
+ <td valign=top nowrap style='padding-left:6px;'>
+ <a href='/arendaview_komm.asp?anketa_id=148110' style="color:#000000; text-decoration:none; font-size:11px;"><p class='viprow0'><b>���������� �.</b></p><p class='viprow'>7 �� �� ����</p><p class='viprow'>2100 - 2500 ��.�</b></p><p class='viprow'><b style='font-size:11px;'>130 $/��.�./���</b></p></a>
+ </td>
+ </tr>
+ </table>
+
+ </td>
+</tr>
+<tr><td colspan=2 bgcolor=#CFCFCF><img src='imgs/blank.gif' width=1 height=1></td></tr>
+<tr><td colspan=2><img src='imgs/shadow1.gif'></td></tr>
+</table>
+ <br>
+
+
+<table cellspacing=0 cellpadding=0 border=0 width=300>
+<tr>
+ <td width=1 bgcolor=#CFCFCF><img src='imgs/blank.gif' width=1 height=1></td>
+ <td valign=top width=100% bgcolor=white>
+ <table cellspacing=1 cellpadding=3 border=0 width=100% style='margin-top:5px;'>
+ <tr>
+ <td valign=top width=1 valign=middle><img src='imgs/knop1.gif'></td>
+ <td class=text bgcolor=white valign=middle>
+ <a href="/arendaview_kommp.asp?anketa_id=167792" class=menubig><b>������� �����</b></a>
+ </td>
+ </tr>
+ </table>
+ <table width=100% cellspacing=0 cellpadding=3 border=0>
+ <tr>
+ <td width=128 valign=top align=right nowrap>
+ <a href="/arendaview_kommp.asp?anketa_id=167792"><img style="margin-right:11px; margin-left:5px;" width=9 height=100 src="/imgs/bez.gif" border=0><img class=img1 width=100 height=100 src="/imgs/Photos5/s_kp_96026.jpg" border=0 alt="������� �����"></a>
+ </td>
+ <td valign=top nowrap style='padding-left:6px;'>
+ <a href='/arendaview_kommp.asp?anketa_id=167792' style="color:#000000; text-decoration:none; font-size:11px;"><p class='viprow0'><b>�. ���������� �������</b></p><p class='viprow'>918 ��.�</b></p><p class='viprow'><b style='font-size:11px;'>7 344 000 $</b></p></a>
+ </td>
+ </tr>
+ </table>
+
+ </td>
+</tr>
+<tr><td colspan=2 bgcolor=#CFCFCF><img src='imgs/blank.gif' width=1 height=1></td></tr>
+<tr><td colspan=2><img src='imgs/shadow1.gif'></td></tr>
+</table>
+ <br>
+
+<table cellspacing=0 cellpadding=0 border=0 width=100%>
+<tr>
+ <td width=1 bgcolor=#CFCFCF><img src='imgs/blank.gif' width=1 height=1></td>
+ <td valign=top width=100% bgcolor=white>
+ <table cellspacing=1 cellpadding=3 border=0 width=100% style='margin-top:5px;'>
+ <tr>
+ <td valign=top width=1 valign=middle><img src='imgs/knop1.gif'></td>
+ <td class=text bgcolor=white valign=middle>
+ <a href="/arendaview_all.asp?anketa_id=160328" class=menubig><b>������ 2-����. ��������</b></a>
+ </td>
+ </tr>
+ </table>
+ <table width=100% cellspacing=0 cellpadding=3 border=0>
+ <tr>
+ <td width=128 valign=top align=right nowrap>
+ <a href="/arendaview_all.asp?anketa_id=160328"><img style="margin-right:11px; margin-left:5px;" width=9 height=100 src="/imgs/bez.gif" border=0><img class=img1 width=100 height=100 src="/imgs/Photos1/s_ae_45253.jpg" border=0 alt="������ 2-����. ��������"></a>
+ </td>
+ <td valign=top nowrap style='padding-left:6px;'>
+ <a href='/arendaview_all.asp?anketa_id=160328' style="color:#000000; text-decoration:none; font-size:11px;"><p class='viprow0'><b>�. ��������</b></p><p class='viprow'>10 ����� ������ �� �����</p><p class='viprow'>��. ��������, ��� 25</p><p class='viprow'>����� ������� 60 ��.�</b></p><p class='viprow'><b style='font-size:11px;'>100 000 $/���</b></p></a>
+ </td>
+ </tr>
+ </table>
+
+ </td>
+</tr>
+<tr><td colspan=2 bgcolor=#CFCFCF><img src='imgs/blank.gif' width=1 height=1></td></tr>
+<tr><td colspan=2><img src='imgs/shadow1.gif'></td></tr>
+</table>
+<br>
+
+
+<table cellspacing=0 cellpadding=0 border=0 width=100%>
+<tr>
+ <td width=1 bgcolor=#CFCFCF><img src='imgs/blank.gif' width=1 height=1></td>
+ <td valign=top width=100% bgcolor=white>
+ <table cellspacing=1 cellpadding=3 border=0 width=100% style='margin-top:5px;'>
+ <tr>
+ <td valign=top width=1 valign=middle><img src='imgs/knop1.gif'></td>
+ <td class=text bgcolor=white valign=middle>
+ <a href="/arendaview_cottage.asp?anketa_id=1761" class=menubig><b>������ ��������</b></a>
+ </td>
+ </tr>
+ </table>
+ <table width=100% cellspacing=0 cellpadding=3 border=0>
+ <tr>
+ <td width=128 valign=top align=right nowrap>
+ <a href="/arendaview_cottage.asp?anketa_id=1761"><img style="margin-right:11px; margin-left:5px;" width=9 height=100 src="/imgs/bez.gif" border=0><img class=img1 width=100 height=100 src="/imgs/Photos61/vipcot1761.jpg" border=0 alt="������ ��������"></a>
+ </td>
+ <td valign=top nowrap style='padding-left:6px;'>
+ <a href='/arendaview_cottage.asp?anketa_id=1761' style="color:#000000; text-decoration:none; font-size:11px;"><p class='viprow0'><b>������������ �.</b></p><p class='viprow'>15 �� �� ����</p><p class='viprow'>520 ��.�</b></p><p class='viprow'><b style='font-size:11px;'>465 000 $/���</b></p></a>
+ </td>
+ </tr>
+ </table>
+
+ </td>
+</tr>
+<tr><td colspan=2 bgcolor=#CFCFCF><img src='imgs/blank.gif' width=1 height=1></td></tr>
+<tr><td colspan=2><img src='imgs/shadow1.gif'></td></tr>
+</table>
+<br>
+
+ <div class=ns><a title="������ ������������" href="kommvip.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ��� ����������� �� ������ ������������ ������������ ��� ��������</h2></a></div>
+ <div class=ns><a title="������ ������������ " href="kommvipp.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ��� ����������� �� ������� ������������ ������������ ��� ��������</h2></a></div>
+ <div class=ns><a title="������ �������" href="arendavip.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ��� ����������� �� ������ ������� ��� ��������</h2></a></div>
+ <div class=ns><a title="������ ������� " href="arendacotvip.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ��� ����������� �� ������ ���������, ���, ����� ��� ��������</h2></a></div>
+ <br>
+ <h1><img src="/imgs/fp_li2.gif" width=8 height=15> �������� ������:</h1>
+ <p style="font-family:Times New Roman; font-size:12px; margin-top:10px; margin-bottom:0px;">
+ <b>����������</b> ������������, �������� ������� ��� ����� � ������, �� ������� ������ ������� ��� ����� ��������� �� �������� ��������. �� ������ �������� ������ ��� ��������� � ���� �� ��������.
+<br><i>������������ �� �������� ������ � ������� ������������ ���������</i>.
+</p>
+ <br>
+ <div class=ns><a target=_blank title="����� �������� " href="form_1.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����� ��������</h2></a></div>
+ <div class=ns><a target=_blank title="����� ������� " href="form_1.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����� �������</h2></a></div>
+ <div class=ns><a target=_blank title="����� ������� ��������" href="form_1.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����� ������� ��������</h2></a></div>
+ <div class=ns><a target=_blank title="����� �������� ���������" href="form_1day.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����� �������� ���������</h2></a></div>
+ <div class=ns><a target=_blank title="����� ��������, ����, ����" href="form_5s.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����� �������, ���, ����</h2></a></div>
+ <div class=ns><a target=_blank title="����� �����" href="form_1off.asp?kommtypeid=1"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����� ����</h2></a></div>
+ <div class=ns><a target=_blank title="����� ������ � ������������" href="form_1off.asp?kommtypeid=2&kommtypeid=4"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����� �����, ������������</h2></a></div>
+ <div class=ns nowrap><a target=_blank title="����� ��������" href="form_1off.asp?kommtypeid=3&kommtypeid=5&kommtypeid=6&kommtypeid=7"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����� �������, ��������, ����</h2></a></div>
+ <div class=ns><a target=_blank title="������� ����" href="form_6s.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ������� �������, ����, ���</h2></a></div>
+ <div class=ns><a target=_blank title="������� �����" href="form_1off.asp?kommtypeid=1&own_type=1"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ������� ����</h2></a></div>
+ <div class=ns><a target=_blank title="������� ������" href="form_1off.asp?kommtypeid=2&kommtypeid=4&own_type=1"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ������� �����, ������������</h2></a></div>
+ <div class=ns><a target=_blank title="������� ���������" href="form_1off.asp?kommtypeid=1&kommtypeid=2&kommtypeid=3&kommtypeid=4&kommtypeid=5&kommtypeid=6&kommtypeid=7&own_type=1"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ������� ������� ���������</h2></a></div>
+ <br>
+ <p style="font-family:Times New Roman; font-size:12px; margin-top:10px; margin-bottom:0px;">
+ <b>��������.</b> ���� �� ������ ����� � ������ ��������, �������, ����, �����, �������... ���� ���� ��� ���������� ������� ������������, �������� ������ � �� ������� ��� ����� ��� ������ ��������� �������, ������� � � ����������� �����. �� ������ ������������ � ������������� �� ������ � ������� ������������ �� ����� �����. ���� ����������� ���������. ��� ������������ ���������
��� �������������� ��������� ������ �������������.
+<br><i>��������! �� �� ����� ����������, �� ��������� �������������� �����, �������� ��������� � ������� ���������, ������ ������������ �� ����� ������.</i>
+ </p>
+ <br>
+ <div class=ns><a target=_blank title="����� �������� " href="form_3.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����� ��������</h2></a></div>
+ <div class=ns><a target=_blank title="����� ������� " href="form_3.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����� �������</h2></a></div>
+ <div class=ns><a target=_blank title="����� ������� �������� " href="form_3.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����� ������� ��������</h2></a></div>
+ <div class=ns><a target=_blank title="����� �������� ���������" href="form_3day.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����� �������� ���������</h2></a></div>
+ <div class=ns><a target=_blank title="����� ����" href="form_5.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����� �������, ���, ����</h2></a></div>
+ <div class=ns><a target=_blank title="����� ���� " href="form_3off.asp?kommtypeid=1"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����� ����</h2></a></div>
+ <div class=ns><a target=_blank title="����� ����� " href="form_3off.asp?kommtypeid=2&kommtypeid=4"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����� �����, ������������</h2></a></div>
+ <div class=ns><a target=_blank title="����� ������� " href="form_3off.asp?kommtypeid=3&kommtypeid=5&kommtypeid=6&kommtypeid=7"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����� �������, ��������, ����</h2></a></div>
+ <div class=ns><a target=_blank title="������ �������, ���� " href="form_6.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ������ �������, ����, ���</h2></a></div>
+ <div class=ns><a target=_blank title="������ ���� " href="form_3off.asp?kommtypeid=1&own_type=1"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ������ ����</h2></a></div>
+ <div class=ns><a target=_blank title="������ ����� " href="form_3off.asp?kommtypeid=2&kommtypeid=4&own_type=1"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ������ �����, ������������</h2></a></div>
+ <div class=ns><a target=_blank title="������ ��������� " href="form_3off.asp?kommtypeid=1&kommtypeid=2&kommtypeid=3&kommtypeid=4&kommtypeid=5&kommtypeid=6&kommtypeid=7&own_type=1"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ������ ������� ���������</h2></a></div>
+ <br>
+ <h1><img src="/imgs/fp_li2.gif" width=8 height=15> ���������� ����� ���������� �� ������ ������������:</h1>
+ <br>
+ <div class=ns><a title="������ �������" href="freetables.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ������ ������� � ������</h2></a></div>
+ <div class=ns><a title="������ ���������" href="freetables_komm.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ������ ������� ���������</h2></a></div>
+ <div class=ns><a title="������ ��������� ���" href="freetables_cott.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ������ ���������, ���, �����</h2></a></div>
+ <br>
+ <h1><img src="/imgs/fp_li2.gif" width=8 height=15> ��������:</h1>
+ <br>
+ <div class=ns><a href="vakansii.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����� �� ������ ������������ ������������</h2></a></div>
+ <div class=ns><a href="vakansii.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����� �� ������ ������� � ������</h2></a></div>
+ <br>
+ </td>
+</tr>
+</table>
+<table width="100%" border="0" cellspacing="5" cellpadding="0" bgcolor=#FFFFFF>
+<tr>
+ <td bgcolor="#FFFFFF" colspan=2 valign="top" style="padding-top: 5px; padding-right: 5px; padding-bottom: 5px; padding-left: 7px">
+ <br>
+<div align=center>
+<a style="font-size:10px;" href="/default.asp">�������</a> ::
+<a style="font-size:10px;" href="/arenda_results.asp">������ �����</a> ::
+<a style="font-size:10px;" href="/prodaga.asp">�������/������� �����</a> ::
+<a style="font-size:10px;" href="/komm.asp">������������ ������������</a> ::
+<a style="font-size:10px;" href="/nedvvrossii.asp">������������ � ������</a> ::
+<a style="font-size:10px;" href="/docs.asp">���������� ����������</a> ::
+<a style="font-size:10px;" href="/zemuchastki.asp">��������� �������</a> ::
+<a style="font-size:10px;" href="/vakansii.asp">��������</a> ::
+<a style="font-size:10px;" href="/questions.asp">������� ��������</a> ::
+<a style="font-size:10px;" href="/info.asp">���������� ����������</a> ::
+<a style="font-size:10px;" href="/freetables.asp">����� ���������� �� ������������</a> ::
+<a style="font-size:10px;" href="/links.asp">������� ������</a> ::
+<a style="font-size:10px;" href="/kontakty.asp">��������</a>
+</div>
+
+ </td>
+</tr>
+</table>
+</td>
+ </tr>
+ </table>
+ </td>
+ </tr>
+ </table>
+ </td>
+ </tr>
+ <tr>
+ <td height="20" style="padding-left:13px; padding-right:13px;">
+<table width="100%" border="0" cellspacing="0" cellpadding="0">
+ <tr>
+ <td class=copy>
+ © 2001 � 2009 <a title="�������� ������������" href="/">��������� ������������</a> "���������-������������", "������� ����" - ������ �������, ������ ������, ������ ���������.<br>
+ ���.: +7 495 737-7019 ����: +7 495 231-7755 E-mail: <a href="mailto:info1@makler.su" style="color:black">info1@makler.su</a><br>
+</td>
+ </tr>
+</table>
+ </td>
+ </tr>
+</table><script type="text/javascript">
+var gaJsHost = (("https:" == document.location.protocol) ? "https://ssl." : "http://www.");
+document.write(unescape("%3Cscript src='" + gaJsHost + "google-analytics.com/ga.js' type='text/javascript'%3E%3C/script%3E"));
+</script>
+<script type="text/javascript">
+try {
+var pageTracker = _gat._getTracker("UA-8971199-1");
+pageTracker._trackPageview();
+} catch(err) {}</script></body>
+</html>
+
+
[10/13] tika git commit: TIKA-1855 -- first pass. Need to turn back
on the forbidden-apis testCheck. More clean up remains.
Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-app/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
----------------------------------------------------------------------
diff --git a/tika-app/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java b/tika-app/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
new file mode 100644
index 0000000..0ed428e
--- /dev/null
+++ b/tika-app/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
@@ -0,0 +1,312 @@
+package org.apache.tika.parser;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import static org.apache.tika.TikaTest.assertContains;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
+
+import java.io.InputStream;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaMetadataKeys;
+import org.apache.tika.parser.digesting.CommonsDigester;
+import org.apache.tika.sax.BasicContentHandlerFactory;
+import org.apache.tika.sax.ContentHandlerFactory;
+import org.junit.Test;
+import org.xml.sax.helpers.DefaultHandler;
+
+public class RecursiveParserWrapperTest {
+
+ @Test
+ public void testBasicXML() throws Exception {
+ List<Metadata> list = getMetadata(new Metadata(),
+ new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1));
+ Metadata container = list.get(0);
+ String content = container.get(RecursiveParserWrapper.TIKA_CONTENT);
+ //not much differentiates html from xml in this test file
+ assertTrue(content.indexOf("<p class=\"header\" />") > -1);
+ }
+
+ @Test
+ public void testBasicHTML() throws Exception {
+ List<Metadata> list = getMetadata(new Metadata(),
+ new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.HTML, -1));
+ Metadata container = list.get(0);
+ String content = container.get(RecursiveParserWrapper.TIKA_CONTENT);
+ //not much differentiates html from xml in this test file
+ assertTrue(content.indexOf("<p class=\"header\"></p>") > -1);
+ }
+
+ @Test
+ public void testBasicText() throws Exception {
+ List<Metadata> list = getMetadata(new Metadata(),
+ new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));
+ Metadata container = list.get(0);
+ String content = container.get(RecursiveParserWrapper.TIKA_CONTENT);
+ assertTrue(content.indexOf("<p ") < 0);
+ assertTrue(content.indexOf("embed_0") > -1);
+ }
+
+ @Test
+ public void testIgnoreContent() throws Exception {
+ List<Metadata> list = getMetadata(new Metadata(),
+ new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1));
+ Metadata container = list.get(0);
+ String content = container.get(RecursiveParserWrapper.TIKA_CONTENT);
+ assertNull(content);
+ }
+
+
+ @Test
+ public void testCharLimit() throws Exception {
+ ParseContext context = new ParseContext();
+ Metadata metadata = new Metadata();
+
+ Parser wrapped = new AutoDetectParser();
+ RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped,
+ new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, 60));
+ InputStream stream = RecursiveParserWrapperTest.class.getResourceAsStream(
+ "/test-documents/test_recursive_embedded.docx");
+ wrapper.parse(stream, new DefaultHandler(), metadata, context);
+ List<Metadata> list = wrapper.getMetadata();
+
+ assertEquals(5, list.size());
+
+ int wlr = 0;
+ for (Metadata m : list) {
+ String limitReached = m.get(RecursiveParserWrapper.WRITE_LIMIT_REACHED);
+ if (limitReached != null && limitReached.equals("true")) {
+ wlr++;
+ }
+ }
+ assertEquals(1, wlr);
+
+ }
+
+ @Test
+ public void testMaxEmbedded() throws Exception {
+ int maxEmbedded = 4;
+ int totalNoLimit = 12;//including outer container file
+ ParseContext context = new ParseContext();
+ Metadata metadata = new Metadata();
+ String limitReached = null;
+
+ Parser wrapped = new AutoDetectParser();
+ RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped,
+ new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));
+
+ InputStream stream = RecursiveParserWrapperTest.class.getResourceAsStream(
+ "/test-documents/test_recursive_embedded.docx");
+ wrapper.parse(stream, new DefaultHandler(), metadata, context);
+ List<Metadata> list = wrapper.getMetadata();
+ //test default
+ assertEquals(totalNoLimit, list.size());
+
+ limitReached = list.get(0).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_LIMIT_REACHED);
+ assertNull(limitReached);
+
+
+ wrapper.reset();
+ stream.close();
+
+ //test setting value
+ metadata = new Metadata();
+ stream = RecursiveParserWrapperTest.class.getResourceAsStream(
+ "/test-documents/test_recursive_embedded.docx");
+ wrapper.setMaxEmbeddedResources(maxEmbedded);
+ wrapper.parse(stream, new DefaultHandler(), metadata, context);
+ list = wrapper.getMetadata();
+
+ //add 1 for outer container file
+ assertEquals(maxEmbedded + 1, list.size());
+
+ limitReached = list.get(0).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_LIMIT_REACHED);
+ assertEquals("true", limitReached);
+
+ wrapper.reset();
+ stream.close();
+
+ //test setting value < 0
+ metadata = new Metadata();
+ stream = RecursiveParserWrapperTest.class.getResourceAsStream(
+ "/test-documents/test_recursive_embedded.docx");
+
+ wrapper.setMaxEmbeddedResources(-2);
+ wrapper.parse(stream, new DefaultHandler(), metadata, context);
+ assertEquals(totalNoLimit, list.size());
+ limitReached = list.get(0).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_LIMIT_REACHED);
+ assertNull(limitReached);
+ }
+
+ @Test
+ public void testEmbeddedResourcePath() throws Exception {
+
+ Set<String> targets = new HashSet<String>();
+ targets.add("/embed1.zip");
+ targets.add("/embed1.zip/embed2.zip");
+ targets.add("/embed1.zip/embed2.zip/embed3.zip");
+ targets.add("/embed1.zip/embed2.zip/embed3.zip/embed4.zip");
+ targets.add("/embed1.zip/embed2.zip/embed3.zip/embed4.zip/embed4.txt");
+ targets.add("/embed1.zip/embed2.zip/embed3.zip/embed3.txt");
+ targets.add("/embed1.zip/embed2.zip/embed2a.txt");
+ targets.add("/embed1.zip/embed2.zip/embed2b.txt");
+ targets.add("/embed1.zip/embed1b.txt");
+ targets.add("/embed1.zip/embed1a.txt");
+ targets.add("/image1.emf");
+
+ Metadata metadata = new Metadata();
+ metadata.set(Metadata.RESOURCE_NAME_KEY, "test_recursive_embedded.docx");
+ List<Metadata> list = getMetadata(metadata,
+ new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1));
+ Metadata container = list.get(0);
+ String content = container.get(RecursiveParserWrapper.TIKA_CONTENT);
+ assertTrue(content.indexOf("<p class=\"header\" />") > -1);
+
+ Set<String> seen = new HashSet<String>();
+ for (Metadata m : list) {
+ String path = m.get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH);
+ if (path != null) {
+ seen.add(path);
+ }
+ }
+ assertEquals(targets, seen);
+ }
+
+ @Test
+ public void testEmbeddedNPE() throws Exception {
+ Metadata metadata = new Metadata();
+ metadata.set(Metadata.RESOURCE_NAME_KEY, "test_recursive_embedded_npe.docx");
+ List<Metadata> list = getMetadata(metadata,
+ new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));
+ //default behavior (user doesn't specify whether or not to catch embedded exceptions
+ //is to catch the exception
+ assertEquals(13, list.size());
+ Metadata mockNPEMetadata = list.get(10);
+ assertContains("java.lang.NullPointerException", mockNPEMetadata.get(RecursiveParserWrapper.EMBEDDED_EXCEPTION));
+
+ metadata = new Metadata();
+ metadata.set(Metadata.RESOURCE_NAME_KEY, "test_recursive_embedded_npe.docx");
+ list = getMetadata(metadata,
+ new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1),
+ false, null);
+
+ //Composite parser swallows caught TikaExceptions, IOExceptions and SAXExceptions
+ //and just doesn't bother to report that there was an exception.
+ assertEquals(12, list.size());
+ }
+
+ @Test
+ public void testPrimaryExcWEmbedded() throws Exception {
+ //if embedded content is handled and then
+ //the parser hits an exception in the container document,
+ //that the first element of the returned list is the container document
+ //and the second is the embedded content
+ Metadata metadata = new Metadata();
+ metadata.set(Metadata.RESOURCE_NAME_KEY, "embedded_then_npe.xml");
+
+ ParseContext context = new ParseContext();
+ Parser wrapped = new AutoDetectParser();
+ RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped,
+ new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1), true);
+ String path = "/test-documents/mock/embedded_then_npe.xml";
+
+ InputStream stream = null;
+ boolean npe = false;
+ try {
+ stream = RecursiveParserWrapperTest.class.getResourceAsStream(
+ path);
+ wrapper.parse(stream, new DefaultHandler(), metadata, context);
+ } catch (TikaException e) {
+ if (e.getCause().getClass().equals(NullPointerException.class)) {
+ npe = true;
+ }
+ } finally {
+ IOUtils.closeQuietly(stream);
+ }
+ assertTrue("npe", npe);
+
+ List<Metadata> metadataList = wrapper.getMetadata();
+ assertEquals(2, metadataList.size());
+ Metadata outerMetadata = metadataList.get(0);
+ Metadata embeddedMetadata = metadataList.get(1);
+ assertContains("main_content", outerMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
+ assertEquals("embedded_then_npe.xml", outerMetadata.get(TikaMetadataKeys.RESOURCE_NAME_KEY));
+ assertEquals("Nikolai Lobachevsky", outerMetadata.get("author"));
+
+ assertContains("some_embedded_content", embeddedMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
+ assertEquals("embed1.xml", embeddedMetadata.get(TikaMetadataKeys.RESOURCE_NAME_KEY));
+ assertEquals("embeddedAuthor", embeddedMetadata.get("author"));
+ }
+
+ @Test
+ public void testDigesters() throws Exception {
+ Metadata metadata = new Metadata();
+ metadata.set(Metadata.RESOURCE_NAME_KEY, "test_recursive_embedded.docx");
+ List<Metadata> list = getMetadata(metadata,
+ new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1),
+ true, new CommonsDigester(100000, CommonsDigester.DigestAlgorithm.MD5));
+ int i = 0;
+ Metadata m0 = list.get(0);
+ Metadata m6 = list.get(6);
+ String md5Key = "X-TIKA:digest:MD5";
+ assertEquals("59f626e09a8c16ab6dbc2800c685f772", list.get(0).get(md5Key));
+ assertEquals("ccdf3882e7e4c2454e28884db9b0a54d", list.get(6).get(md5Key));
+ assertEquals("a869bf6432ebd14e19fc79416274e0c9", list.get(7).get(md5Key));
+ }
+
+ private List<Metadata> getMetadata(Metadata metadata, ContentHandlerFactory contentHandlerFactory,
+ boolean catchEmbeddedExceptions,
+ DigestingParser.Digester digester) throws Exception {
+ ParseContext context = new ParseContext();
+ Parser wrapped = new AutoDetectParser();
+ if (digester != null) {
+ wrapped = new DigestingParser(wrapped, digester);
+ }
+ RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped,
+ contentHandlerFactory, catchEmbeddedExceptions);
+ String path = metadata.get(Metadata.RESOURCE_NAME_KEY);
+ if (path == null) {
+ path = "/test-documents/test_recursive_embedded.docx";
+ } else {
+ path = "/test-documents/" + path;
+ }
+ InputStream stream = null;
+ try {
+ stream = TikaInputStream.get(RecursiveParserWrapperTest.class.getResource(path).toURI());
+ wrapper.parse(stream, new DefaultHandler(), metadata, context);
+ } finally {
+ IOUtils.closeQuietly(stream);
+ }
+ return wrapper.getMetadata();
+
+ }
+
+ private List<Metadata> getMetadata(Metadata metadata, ContentHandlerFactory contentHandlerFactory)
+ throws Exception {
+ return getMetadata(metadata, contentHandlerFactory, true, null);
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-app/src/test/java/org/apache/tika/parser/TestParsers.java
----------------------------------------------------------------------
diff --git a/tika-app/src/test/java/org/apache/tika/parser/TestParsers.java b/tika-app/src/test/java/org/apache/tika/parser/TestParsers.java
new file mode 100644
index 0000000..cde3e78
--- /dev/null
+++ b/tika-app/src/test/java/org/apache/tika/parser/TestParsers.java
@@ -0,0 +1,133 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+import org.apache.tika.Tika;
+import org.apache.tika.TikaTest;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.junit.Before;
+import org.junit.Test;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * Junit test class for Tika {@link Parser}s.
+ */
+public class TestParsers extends TikaTest {
+
+ private TikaConfig tc;
+
+ private Tika tika;
+
+ @Before
+ public void setUp() throws Exception {
+ tc = TikaConfig.getDefaultConfig();
+ tika = new Tika(tc);
+ }
+
+ @Test
+ public void testWORDExtraction() throws Exception {
+
+ Path tmpFile = getTestDocumentAsTempFile("testWORD.doc");
+ Parser parser = tika.getParser();
+ Metadata metadata = new Metadata();
+ try (InputStream stream = Files.newInputStream(tmpFile)) {
+ parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
+ } finally {
+ Files.delete(tmpFile);
+ }
+ assertEquals("Sample Word Document", metadata.get(TikaCoreProperties.TITLE));
+ }
+
+ @Test
+ public void testEXCELExtraction() throws Exception {
+ final String expected = "Numbers and their Squares";
+ Path tmpFile = getTestDocumentAsTempFile("testEXCEL.xls");
+ try {
+ String s1 = tika.parseToString(tmpFile);
+ assertTrue("Text does not contain '" + expected + "'", s1
+ .contains(expected));
+ Parser parser = tika.getParser();
+ Metadata metadata = new Metadata();
+ try (InputStream stream = Files.newInputStream(tmpFile)) {
+ parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
+ }
+ assertEquals("Simple Excel document", metadata.get(TikaCoreProperties.TITLE));
+ } finally {
+ Files.delete(tmpFile);
+ }
+ }
+
+ @Test
+ public void testOptionalHyphen() throws Exception {
+ String[] extensions =
+ new String[] { "ppt", "pptx", "doc", "docx", "rtf", "pdf"};
+ for (String extension : extensions) {
+ Path tmpFile = getTestDocumentAsTempFile("testOptionalHyphen." + extension);
+ String content = null;
+ try {
+ content = tika.parseToString(tmpFile);
+ } finally {
+ Files.delete(tmpFile);
+ }
+ assertTrue("optional hyphen was not handled for '" + extension + "' file type: " + content,
+ content.contains("optionalhyphen") ||
+ content.contains("optional\u00adhyphen") || // soft hyphen
+ content.contains("optional\u200bhyphen") || // zero width space
+ content.contains("optional\u2027")); // hyphenation point
+
+ }
+ }
+
+ @Test
+ public void testComment() throws Exception {
+ final String[] extensions = new String[] {"ppt", "pptx", "doc",
+ "docx", "xls", "xlsx", "pdf", "rtf"};
+ for(String extension : extensions) {
+ verifyComment(extension, "testComment");
+ }
+ }
+
+ private void verifyComment(String extension, String fileName) throws Exception {
+ TemporaryResources tmp = new TemporaryResources();
+
+ String content = null;
+ Path tmpFile = null;
+ try {
+ tmpFile = getTestDocumentAsTempFile(fileName + "." + extension);
+ content = tika.parseToString(tmpFile);
+ } finally {
+ if (tmpFile != null) {
+ Files.delete(tmpFile);
+ }
+ }
+ assertTrue(extension + ": content=" + content + " did not extract text",
+ content.contains("Here is some text"));
+ assertTrue(extension + ": content=" + content + " did not extract comment",
+ content.contains("Here is a comment"));
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-app/src/test/java/org/apache/tika/parser/fork/ForkParserIntegrationTest.java
----------------------------------------------------------------------
diff --git a/tika-app/src/test/java/org/apache/tika/parser/fork/ForkParserIntegrationTest.java b/tika-app/src/test/java/org/apache/tika/parser/fork/ForkParserIntegrationTest.java
new file mode 100644
index 0000000..54c1427
--- /dev/null
+++ b/tika-app/src/test/java/org/apache/tika/parser/fork/ForkParserIntegrationTest.java
@@ -0,0 +1,268 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.fork;
+
+import static org.apache.tika.TikaTest.assertContains;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.fail;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.NotSerializableException;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.tika.Tika;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.fork.ForkParser;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Test that the ForkParser correctly behaves when
+ * wired in to the regular Parsers and their test data
+ */
+public class ForkParserIntegrationTest {
+
+ private Tika tika = new Tika(); // TODO Use TikaConfig instead, when it works
+
+ /**
+ * Simple text parsing
+ */
+ @Test
+ public void testForkedTextParsing() throws Exception {
+ ForkParser parser = new ForkParser(
+ ForkParserIntegrationTest.class.getClassLoader(),
+ tika.getParser());
+
+ try {
+ ContentHandler output = new BodyContentHandler();
+ InputStream stream = ForkParserIntegrationTest.class.getResourceAsStream(
+ "/test-documents/testTXT.txt");
+ ParseContext context = new ParseContext();
+ parser.parse(stream, output, new Metadata(), context);
+
+ String content = output.toString();
+ assertContains("Test d'indexation", content);
+ assertContains("http://www.apache.org", content);
+ } finally {
+ parser.close();
+ }
+ }
+
+ /**
+ * This error has a message and an equals() implementation as to be able
+ * to match it against the serialized version of itself.
+ */
+ static class AnError extends Error {
+ private static final long serialVersionUID = -6197267350768803348L;
+ private String message;
+ AnError(String message) {
+ super(message);
+ this.message = message;
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (this == o) return true;
+ if (o == null || getClass() != o.getClass()) return false;
+
+ AnError anError = (AnError) o;
+
+ if (!message.equals(anError.message)) return false;
+
+ return true;
+ }
+
+ @Override
+ public int hashCode() {
+ return message.hashCode();
+ }
+ }
+
+ /**
+ * This error isn't serializable on the server, so can't be sent back
+ * to the Fork Client once it has occured
+ */
+ static class WontBeSerializedError extends RuntimeException {
+ private static final long serialVersionUID = 1L;
+
+ WontBeSerializedError(String message) {
+ super(message);
+ }
+
+ private void writeObject(java.io.ObjectOutputStream out) {
+ RuntimeException e = new RuntimeException("Bang!");
+ boolean found = false;
+ for (StackTraceElement ste : e.getStackTrace()) {
+ if (ste.getClassName().equals(ForkParser.class.getName())) {
+ found = true;
+ break;
+ }
+ }
+ if (!found) {
+ throw e;
+ }
+ }
+ }
+
+ static class BrokenParser implements Parser {
+ private static final long serialVersionUID = 995871497930817839L;
+ public Error err = new AnError("Simulated fail");
+ public RuntimeException re = null;
+
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return new HashSet<MediaType>(Arrays.asList(MediaType.TEXT_PLAIN));
+ }
+
+ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
+ if (re != null) throw re;
+ throw err;
+ }
+ }
+
+ /**
+ * TIKA-831 Parsers throwing errors should be caught and
+ * properly reported
+ */
+ @Test
+ public void testParsingErrorInForkedParserShouldBeReported() throws Exception {
+ BrokenParser brokenParser = new BrokenParser();
+ Parser parser = new ForkParser(ForkParser.class.getClassLoader(), brokenParser);
+ InputStream stream = getClass().getResourceAsStream("/test-documents/testTXT.txt");
+
+ // With a serializable error, we'll get that back
+ try {
+ ContentHandler output = new BodyContentHandler();
+ ParseContext context = new ParseContext();
+ parser.parse(stream, output, new Metadata(), context);
+ fail("Expected TikaException caused by Error");
+ } catch (TikaException e) {
+ assertEquals(brokenParser.err, e.getCause());
+ }
+
+ // With a non serializable one, we'll get something else
+ // TODO Fix this test
+ brokenParser = new BrokenParser();
+ brokenParser.re= new WontBeSerializedError("Can't Serialize");
+ parser = new ForkParser(ForkParser.class.getClassLoader(), brokenParser);
+// try {
+// ContentHandler output = new BodyContentHandler();
+// ParseContext context = new ParseContext();
+// parser.parse(stream, output, new Metadata(), context);
+// fail("Expected TikaException caused by Error");
+// } catch (TikaException e) {
+// assertEquals(TikaException.class, e.getCause().getClass());
+// assertEquals("Bang!", e.getCause().getMessage());
+// }
+ }
+
+ /**
+ * If we supply a non serializable object on the ParseContext,
+ * check we get a helpful exception back
+ */
+ @Test
+ public void testParserHandlingOfNonSerializable() throws Exception {
+ ForkParser parser = new ForkParser(
+ ForkParserIntegrationTest.class.getClassLoader(),
+ tika.getParser());
+
+ ParseContext context = new ParseContext();
+ context.set(Detector.class, new Detector() {
+ public MediaType detect(InputStream input, Metadata metadata) {
+ return MediaType.OCTET_STREAM;
+ }
+ });
+
+ try {
+ ContentHandler output = new BodyContentHandler();
+ InputStream stream = ForkParserIntegrationTest.class.getResourceAsStream(
+ "/test-documents/testTXT.txt");
+ parser.parse(stream, output, new Metadata(), context);
+ fail("Should have blown up with a non serializable ParseContext");
+ } catch(TikaException e) {
+ // Check the right details
+ assertNotNull(e.getCause());
+ assertEquals(NotSerializableException.class, e.getCause().getClass());
+ assertEquals("Unable to serialize ParseContext to pass to the Forked Parser", e.getMessage());
+ } finally {
+ parser.close();
+ }
+ }
+
+ /**
+ * TIKA-832
+ */
+ @Test
+ public void testAttachingADebuggerOnTheForkedParserShouldWork()
+ throws Exception {
+ ParseContext context = new ParseContext();
+ context.set(Parser.class, tika.getParser());
+
+ ForkParser parser = new ForkParser(
+ ForkParserIntegrationTest.class.getClassLoader(),
+ tika.getParser());
+ parser.setJavaCommand(Arrays.asList("java", "-Xmx32m", "-Xdebug",
+ "-Xrunjdwp:transport=dt_socket,address=54321,server=y,suspend=n"));
+ try {
+ ContentHandler body = new BodyContentHandler();
+ InputStream stream = ForkParserIntegrationTest.class.getResourceAsStream(
+ "/test-documents/testTXT.txt");
+ parser.parse(stream, body, new Metadata(), context);
+ String content = body.toString();
+ assertContains("Test d'indexation", content);
+ assertContains("http://www.apache.org", content);
+ } finally {
+ parser.close();
+ }
+ }
+
+ /**
+ * TIKA-808 - Ensure that parsing of our test PDFs work under
+ * the Fork Parser, to ensure that complex parsing behaves
+ */
+ @Test
+ public void testForkedPDFParsing() throws Exception {
+ ForkParser parser = new ForkParser(
+ ForkParserIntegrationTest.class.getClassLoader(),
+ tika.getParser());
+ try {
+ ContentHandler output = new BodyContentHandler();
+ InputStream stream = ForkParserIntegrationTest.class.getResourceAsStream(
+ "/test-documents/testPDF.pdf");
+ ParseContext context = new ParseContext();
+ parser.parse(stream, output, new Metadata(), context);
+
+ String content = output.toString();
+ assertContains("Apache Tika", content);
+ assertContains("Tika - Content Analysis Toolkit", content);
+ assertContains("incubator", content);
+ assertContains("Apache Software Foundation", content);
+ } finally {
+ parser.close();
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-app/src/test/java/org/apache/tika/parser/mock/MockParserTest.java
----------------------------------------------------------------------
diff --git a/tika-app/src/test/java/org/apache/tika/parser/mock/MockParserTest.java b/tika-app/src/test/java/org/apache/tika/parser/mock/MockParserTest.java
new file mode 100644
index 0000000..52af12b
--- /dev/null
+++ b/tika-app/src/test/java/org/apache/tika/parser/mock/MockParserTest.java
@@ -0,0 +1,251 @@
+package org.apache.tika.parser.mock;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.PrintStream;
+import java.util.Date;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.IOUtils;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.Parser;
+import org.junit.Test;
+
+/**
+ * Somewhat bizarrely, we can't put the test of this test resource in tika-test-resources
+ * or else it will be called by every module that uses it. Um, Yossarian!!!
+ */
+public class MockParserTest extends TikaTest {
+ private final static String M = "/test-documents/mock/";
+ private final static Parser PARSER = new AutoDetectParser();
+
+ @Override
+ public XMLResult getXML(String path, Metadata m) throws Exception {
+ //note that this is specific to MockParserTest with addition of M to the path!
+ InputStream is = getResourceAsStream(M+path);
+ try {
+ return super.getXML(is, PARSER, m);
+ } finally {
+ IOUtils.closeQuietly(is);
+ }
+ }
+
+ @Test
+ public void testExample() throws Exception {
+ Metadata m = new Metadata();
+ PrintStream out = System.out;
+ PrintStream err = System.err;
+ ByteArrayOutputStream outBos = new ByteArrayOutputStream();
+ ByteArrayOutputStream errBos = new ByteArrayOutputStream();
+ PrintStream tmpOut = new PrintStream(outBos, true, UTF_8.toString());
+ PrintStream tmpErr = new PrintStream(errBos, true, UTF_8.toString());
+ System.setOut(tmpOut);
+ System.setErr(tmpErr);
+ try {
+ assertThrowable("example.xml", m, IOException.class, "not another IOException");
+ assertMockParser(m);
+ } finally {
+ System.setOut(out);
+ System.setErr(err);
+ }
+ String outString = new String(outBos.toByteArray(), UTF_8);
+ assertContains("writing to System.out", outString);
+
+ String errString = new String(errBos.toByteArray(), UTF_8);
+ assertContains("writing to System.err", errString);
+
+ }
+
+ @Test
+ public void testNothingBad() throws Exception {
+ Metadata m = new Metadata();
+ String content = getXML("nothing_bad.xml", m).xml;
+ assertEquals("Geoffrey Chaucer", m.get("author"));
+ assertContains("<p>And bathed every veyne in swich licour,</p>", content);
+ assertMockParser(m);
+ }
+
+ @Test
+ public void testNullPointer() throws Exception {
+ Metadata m = new Metadata();
+ assertThrowable("null_pointer.xml", m, NullPointerException.class, "null pointer message");
+ assertMockParser(m);
+ }
+
+ @Test
+ public void testNullPointerNoMsg() throws Exception {
+ Metadata m = new Metadata();
+ assertThrowable("null_pointer_no_msg.xml", m, NullPointerException.class, null);
+ assertMockParser(m);
+ }
+
+
+ @Test
+ public void testSleep() throws Exception {
+ long start = new Date().getTime();
+ Metadata m = new Metadata();
+ String content = getXML("sleep.xml", m).xml;
+ assertMockParser(m);
+ long elapsed = new Date().getTime()-start;
+ //should sleep for at least 3000
+ boolean enoughTimeHasElapsed = elapsed > 2000;
+ assertTrue("not enough time has not elapsed: "+elapsed, enoughTimeHasElapsed);
+ assertMockParser(m);
+ }
+
+ @Test
+ public void testHeavyHang() throws Exception {
+ long start = new Date().getTime();
+ Metadata m = new Metadata();
+
+ String content = getXML("heavy_hang.xml", m).xml;
+ assertMockParser(m);
+ long elapsed = new Date().getTime()-start;
+ //should sleep for at least 3000
+ boolean enoughTimeHasElapsed = elapsed > 2000;
+ assertTrue("not enough time has elapsed: "+elapsed, enoughTimeHasElapsed);
+ assertMockParser(m);
+ }
+
+ @Test
+ public void testFakeOOM() throws Exception {
+ Metadata m = new Metadata();
+ assertThrowable("fake_oom.xml", m, OutOfMemoryError.class, "not another oom");
+ assertMockParser(m);
+ }
+
+ @Test
+ public void testRealOOM() throws Exception {
+ //Note: we're not actually testing the diff between fake and real oom
+ //i.e. by creating child process and setting different -Xmx or
+ //memory profiling.
+ Metadata m = new Metadata();
+ assertThrowable("real_oom.xml", m, OutOfMemoryError.class, "Java heap space");
+ assertMockParser(m);
+ }
+
+ @Test
+ public void testInterruptibleSleep() {
+ //Without static initialization of the parser, it can take ~1 second after t.start()
+ //before the parser actually calls parse. This is
+ //just the time it takes to instantiate and call AutoDetectParser, do the detection, etc.
+ //This is not thread creation overhead.
+ ParserRunnable r = new ParserRunnable("sleep_interruptible.xml");
+ Thread t = new Thread(r);
+ t.start();
+ long start = new Date().getTime();
+ try {
+ Thread.sleep(1000);
+ } catch (InterruptedException e) {
+ //swallow
+ }
+
+ t.interrupt();
+
+ try {
+ t.join(10000);
+ } catch (InterruptedException e) {
+ //swallow
+ }
+ long elapsed = new Date().getTime()-start;
+ boolean shortEnough = elapsed < 2000;//the xml file specifies 3000
+ assertTrue("elapsed (" + elapsed + " millis) was not short enough", shortEnough);
+ }
+
+ @Test
+ public void testNonInterruptibleSleep() {
+ ParserRunnable r = new ParserRunnable("sleep_not_interruptible.xml");
+ Thread t = new Thread(r);
+ t.start();
+ long start = new Date().getTime();
+ try {
+ //make sure that the thread has actually started
+ Thread.sleep(1000);
+ } catch (InterruptedException e) {
+ //swallow
+ }
+ t.interrupt();
+ try {
+ t.join(20000);
+ } catch (InterruptedException e) {
+ //swallow
+ }
+ long elapsed = new Date().getTime()-start;
+ boolean longEnough = elapsed > 3000;//the xml file specifies 3000, this sleeps 1000
+ assertTrue("elapsed ("+elapsed+" millis) was not long enough", longEnough);
+ }
+
+ private class ParserRunnable implements Runnable {
+ private final String path;
+ ParserRunnable(String path) {
+ this.path = path;
+ }
+ @Override
+ public void run() {
+ Metadata m = new Metadata();
+ try {
+ getXML(path, m);
+ } catch (Exception e) {
+ throw new RuntimeException(e);
+ } finally {
+ assertMockParser(m);
+ }
+ }
+ }
+
+ private void assertThrowable(String path, Metadata m, Class<? extends Throwable> expected, String message) {
+
+ try {
+ getXML(path, m);
+ } catch (Throwable t) {
+ //if this is a throwable wrapped in a TikaException, use the cause
+ if (t instanceof TikaException && t.getCause() != null) {
+ t = t.getCause();
+ }
+ if (! (t.getClass().isAssignableFrom(expected))){
+ fail(t.getClass() +" is not assignable from "+expected);
+ }
+ if (message != null) {
+ assertEquals(message, t.getMessage());
+ }
+ }
+ }
+
+ private void assertMockParser(Metadata m) {
+ String[] parsers = m.getValues("X-Parsed-By");
+ //make sure that it was actually parsed by mock.
+ boolean parsedByMock = false;
+ for (String parser : parsers) {
+ if (parser.equals("org.apache.tika.parser.mock.MockParser")) {
+ parsedByMock = true;
+ break;
+ }
+ }
+ assertTrue("mock parser should have been called", parsedByMock);
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-app/src/test/java/org/apache/tika/parser/pkg/PackageTest.java
----------------------------------------------------------------------
diff --git a/tika-app/src/test/java/org/apache/tika/parser/pkg/PackageTest.java b/tika-app/src/test/java/org/apache/tika/parser/pkg/PackageTest.java
new file mode 100644
index 0000000..c47a348
--- /dev/null
+++ b/tika-app/src/test/java/org/apache/tika/parser/pkg/PackageTest.java
@@ -0,0 +1,335 @@
+package org.apache.tika.parser.pkg;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.InputStream;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Before;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+public class PackageTest extends TikaTest {
+
+ private static final MediaType TYPE_7ZIP = MediaType.application("x-7z-compressed");
+
+ private ParseContext recursingContext;
+ private Parser autoDetectParser;
+
+ @Before
+ public void setUp() throws Exception {
+
+ autoDetectParser = new AutoDetectParser();
+ recursingContext = new ParseContext();
+ recursingContext.set(Parser.class, autoDetectParser);
+ }
+
+ @Test
+ public void testZlibParsing() throws Exception {
+ Parser parser = new AutoDetectParser(); // Should auto-detect!
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ try (InputStream stream = PackageTest.class.getResourceAsStream(
+ "/test-documents/testTXT.zlib")) {
+ parser.parse(stream, handler, metadata, recursingContext);
+ }
+
+ assertEquals("application/zlib", metadata.get(Metadata.CONTENT_TYPE));
+ String content = handler.toString();
+ assertContains("Test d'indexation de Txt", content);
+ assertContains("http://www.apache.org", content);
+ }
+
+
+ @Test
+ public void testArParsing() throws Exception {
+ Parser parser = new AutoDetectParser();
+
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ try (InputStream stream = PackageTest.class.getResourceAsStream(
+ "/test-documents/testARofText.ar")) {
+ parser.parse(stream, handler, metadata, recursingContext);
+ }
+
+ assertEquals("application/x-archive",
+ metadata.get(Metadata.CONTENT_TYPE));
+ String content = handler.toString();
+ assertContains("testTXT.txt", content);
+ assertContains("Test d'indexation de Txt", content);
+ assertContains("http://www.apache.org", content);
+
+ try (InputStream stream = PackageTest.class.getResourceAsStream(
+ "/test-documents/testARofSND.ar")) {
+ parser.parse(stream, handler, metadata, recursingContext);
+ }
+
+ assertEquals("application/x-archive",
+ metadata.get(Metadata.CONTENT_TYPE));
+ content = handler.toString();
+ assertContains("testAU.au", content);
+ }
+
+ @Test
+ public void testBzip2Parsing() throws Exception {
+ Parser parser = new AutoDetectParser(); // Should auto-detect!
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ try (InputStream stream = PackageTest.class.getResourceAsStream(
+ "/test-documents/test-documents.tbz2")) {
+ parser.parse(stream, handler, metadata, recursingContext);
+ }
+
+ assertEquals("application/x-bzip2", metadata.get(Metadata.CONTENT_TYPE));
+ String content = handler.toString();
+ assertContains("test-documents/testEXCEL.xls", content);
+ assertContains("Sample Excel Worksheet", content);
+ assertContains("test-documents/testHTML.html", content);
+ assertContains("Test Indexation Html", content);
+ assertContains("test-documents/testOpenOffice2.odt", content);
+ assertContains("This is a sample Open Office document", content);
+ assertContains("test-documents/testPDF.pdf", content);
+ assertContains("Apache Tika", content);
+ assertContains("test-documents/testPPT.ppt", content);
+ assertContains("Sample Powerpoint Slide", content);
+ assertContains("test-documents/testRTF.rtf", content);
+ assertContains("indexation Word", content);
+ assertContains("test-documents/testTXT.txt", content);
+ assertContains("Test d'indexation de Txt", content);
+ assertContains("test-documents/testWORD.doc", content);
+ assertContains("This is a sample Microsoft Word Document", content);
+ assertContains("test-documents/testXML.xml", content);
+ assertContains("Rida Benjelloun", content);
+ }
+
+ @Test
+ public void testCompressParsing() throws Exception {
+ Parser parser = new AutoDetectParser(); // Should auto-detect!
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ InputStream stream = PackageTest.class.getResourceAsStream(
+ "/test-documents/test-documents.tar.Z");
+ try {
+ parser.parse(stream, handler, metadata, recursingContext);
+ } finally {
+ stream.close();
+ }
+
+ assertEquals("application/x-compress", metadata.get(Metadata.CONTENT_TYPE));
+ String content = handler.toString();
+ assertContains("test-documents/testEXCEL.xls", content);
+ assertContains("Sample Excel Worksheet", content);
+ assertContains("test-documents/testHTML.html", content);
+ assertContains("Test Indexation Html", content);
+ assertContains("test-documents/testOpenOffice2.odt", content);
+ assertContains("This is a sample Open Office document", content);
+ assertContains("test-documents/testPDF.pdf", content);
+ assertContains("Apache Tika", content);
+ assertContains("test-documents/testPPT.ppt", content);
+ assertContains("Sample Powerpoint Slide", content);
+ assertContains("test-documents/testRTF.rtf", content);
+ assertContains("indexation Word", content);
+ assertContains("test-documents/testTXT.txt", content);
+ assertContains("Test d'indexation de Txt", content);
+ assertContains("test-documents/testWORD.doc", content);
+ assertContains("This is a sample Microsoft Word Document", content);
+ assertContains("test-documents/testXML.xml", content);
+ assertContains("Rida Benjelloun", content);
+ }
+
+ @Test
+ public void testGzipParsing() throws Exception {
+ Parser parser = new AutoDetectParser(); // Should auto-detect!
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ try (InputStream stream = PackageTest.class.getResourceAsStream(
+ "/test-documents/test-documents.tgz")) {
+ parser.parse(stream, handler, metadata, recursingContext);
+ }
+
+ assertEquals("application/gzip", metadata.get(Metadata.CONTENT_TYPE));
+ String content = handler.toString();
+ assertContains("test-documents/testEXCEL.xls", content);
+ assertContains("Sample Excel Worksheet", content);
+ assertContains("test-documents/testHTML.html", content);
+ assertContains("Test Indexation Html", content);
+ assertContains("test-documents/testOpenOffice2.odt", content);
+ assertContains("This is a sample Open Office document", content);
+ assertContains("test-documents/testPDF.pdf", content);
+ assertContains("Apache Tika", content);
+ assertContains("test-documents/testPPT.ppt", content);
+ assertContains("Sample Powerpoint Slide", content);
+ assertContains("test-documents/testRTF.rtf", content);
+ assertContains("indexation Word", content);
+ assertContains("test-documents/testTXT.txt", content);
+ assertContains("Test d'indexation de Txt", content);
+ assertContains("test-documents/testWORD.doc", content);
+ assertContains("This is a sample Microsoft Word Document", content);
+ assertContains("test-documents/testXML.xml", content);
+ assertContains("Rida Benjelloun", content);
+ }
+
+ @Test
+ public void testRarParsing() throws Exception {
+ Parser parser = new AutoDetectParser(); // Should auto-detect!
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ try (InputStream stream = PackageTest.class.getResourceAsStream(
+ "/test-documents/test-documents.rar")) {
+ parser.parse(stream, handler, metadata, recursingContext);
+ }
+
+ assertEquals("application/x-rar-compressed", metadata.get(Metadata.CONTENT_TYPE));
+ String content = handler.toString();
+ assertContains("test-documents/testEXCEL.xls", content);
+ assertContains("Sample Excel Worksheet", content);
+ assertContains("test-documents/testHTML.html", content);
+ assertContains("Test Indexation Html", content);
+ assertContains("test-documents/testOpenOffice2.odt", content);
+ assertContains("This is a sample Open Office document", content);
+ assertContains("test-documents/testPDF.pdf", content);
+ assertContains("Apache Tika", content);
+ assertContains("test-documents/testPPT.ppt", content);
+ assertContains("Sample Powerpoint Slide", content);
+ assertContains("test-documents/testRTF.rtf", content);
+ assertContains("indexation Word", content);
+ assertContains("test-documents/testTXT.txt", content);
+ assertContains("Test d'indexation de Txt", content);
+ assertContains("test-documents/testWORD.doc", content);
+ assertContains("This is a sample Microsoft Word Document", content);
+ assertContains("test-documents/testXML.xml", content);
+ assertContains("Rida Benjelloun", content);
+ }
+
+ @Test
+ public void test7ZParsing() throws Exception {
+ Parser parser = new AutoDetectParser(); // Should auto-detect!
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ // Ensure 7zip is a parsable format
+ assertTrue("No 7zip parser found",
+ parser.getSupportedTypes(recursingContext).contains(TYPE_7ZIP));
+
+ // Parse
+ try (InputStream stream = PackageTest.class.getResourceAsStream(
+ "/test-documents/test-documents.7z")) {
+ parser.parse(stream, handler, metadata, recursingContext);
+ }
+
+ assertEquals(TYPE_7ZIP.toString(), metadata.get(Metadata.CONTENT_TYPE));
+ String content = handler.toString();
+ assertContains("test-documents/testEXCEL.xls", content);
+ assertContains("Sample Excel Worksheet", content);
+ assertContains("test-documents/testHTML.html", content);
+ assertContains("Test Indexation Html", content);
+ assertContains("test-documents/testOpenOffice2.odt", content);
+ assertContains("This is a sample Open Office document", content);
+ assertContains("test-documents/testPDF.pdf", content);
+ assertContains("Apache Tika", content);
+ assertContains("test-documents/testPPT.ppt", content);
+ assertContains("Sample Powerpoint Slide", content);
+ assertContains("test-documents/testRTF.rtf", content);
+ assertContains("indexation Word", content);
+ assertContains("test-documents/testTXT.txt", content);
+ assertContains("Test d'indexation de Txt", content);
+ assertContains("test-documents/testWORD.doc", content);
+ assertContains("This is a sample Microsoft Word Document", content);
+ assertContains("test-documents/testXML.xml", content);
+ assertContains("Rida Benjelloun", content);
+ }
+ @Test
+ public void testTarParsing() throws Exception {
+ Parser parser = new AutoDetectParser(); // Should auto-detect!
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ try (InputStream stream = PackageTest.class.getResourceAsStream(
+ "/test-documents/test-documents.tar")) {
+ parser.parse(stream, handler, metadata, recursingContext);
+ }
+
+ assertEquals("application/x-tar", metadata.get(Metadata.CONTENT_TYPE));
+ String content = handler.toString();
+ assertContains("test-documents/testEXCEL.xls", content);
+ assertContains("Sample Excel Worksheet", content);
+ assertContains("test-documents/testHTML.html", content);
+ assertContains("Test Indexation Html", content);
+ assertContains("test-documents/testOpenOffice2.odt", content);
+ assertContains("This is a sample Open Office document", content);
+ assertContains("test-documents/testPDF.pdf", content);
+ assertContains("Apache Tika", content);
+ assertContains("test-documents/testPPT.ppt", content);
+ assertContains("Sample Powerpoint Slide", content);
+ assertContains("test-documents/testRTF.rtf", content);
+ assertContains("indexation Word", content);
+ assertContains("test-documents/testTXT.txt", content);
+ assertContains("Test d'indexation de Txt", content);
+ assertContains("test-documents/testWORD.doc", content);
+ assertContains("This is a sample Microsoft Word Document", content);
+ assertContains("test-documents/testXML.xml", content);
+ assertContains("Rida Benjelloun", content);
+ }
+
+ @Test
+ public void testZipParsing() throws Exception {
+ Parser parser = new AutoDetectParser(); // Should auto-detect!
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ try (InputStream stream = PackageTest.class.getResourceAsStream(
+ "/test-documents/test-documents.zip")) {
+ parser.parse(stream, handler, metadata, recursingContext);
+ }
+
+ assertEquals("application/zip", metadata.get(Metadata.CONTENT_TYPE));
+ String content = handler.toString();
+ assertContains("testEXCEL.xls", content);
+ assertContains("Sample Excel Worksheet", content);
+ assertContains("testHTML.html", content);
+ assertContains("Test Indexation Html", content);
+ assertContains("testOpenOffice2.odt", content);
+ assertContains("This is a sample Open Office document", content);
+ assertContains("testPDF.pdf", content);
+ assertContains("Apache Tika", content);
+ assertContains("testPPT.ppt", content);
+ assertContains("Sample Powerpoint Slide", content);
+ assertContains("testRTF.rtf", content);
+ assertContains("indexation Word", content);
+ assertContains("testTXT.txt", content);
+ assertContains("Test d'indexation de Txt", content);
+ assertContains("testWORD.doc", content);
+ assertContains("This is a sample Microsoft Word Document", content);
+ assertContains("testXML.xml", content);
+ assertContains("Rida Benjelloun", content);
+ }
+
+ @Test
+ public void testSvgzParsing() throws Exception {
+ Parser parser = new AutoDetectParser(); // Should auto-detect!
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ try (InputStream stream = PackageTest.class.getResourceAsStream(
+ "/test-documents/testSVG.svgz")) {
+ parser.parse(stream, handler, metadata, recursingContext);
+ }
+
+ assertEquals("application/gzip", metadata.get(Metadata.CONTENT_TYPE));
+ String content = handler.toString();
+ assertContains("Test SVG image", content);
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-app/src/test/java/org/apache/tika/sax/PhoneExtractingContentHandlerTest.java
----------------------------------------------------------------------
diff --git a/tika-app/src/test/java/org/apache/tika/sax/PhoneExtractingContentHandlerTest.java b/tika-app/src/test/java/org/apache/tika/sax/PhoneExtractingContentHandlerTest.java
new file mode 100644
index 0000000..eff076b
--- /dev/null
+++ b/tika-app/src/test/java/org/apache/tika/sax/PhoneExtractingContentHandlerTest.java
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.sax;
+
+import static org.apache.tika.TikaTest.assertContains;
+
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.junit.Test;
+
+/**
+ * Test class for the {@link PhoneExtractingContentHandler}
+ * class. This demonstrates how to parse a document and retrieve any phone numbers
+ * found within.
+ *
+ * The phone numbers are added to a multivalued Metadata object under the key, "phonenumbers".
+ * You can get an array of phone numbers by calling metadata.getValues("phonenumbers").
+ */
+public class PhoneExtractingContentHandlerTest {
+ @Test
+ public void testExtractPhoneNumbers() throws Exception {
+ Parser parser = new AutoDetectParser();
+ Metadata metadata = new Metadata();
+ // The PhoneExtractingContentHandler will examine any characters for phone numbers before passing them
+ // to the underlying Handler.
+ PhoneExtractingContentHandler handler = new PhoneExtractingContentHandler(new BodyContentHandler(), metadata);
+ try (InputStream stream = PhoneExtractingContentHandlerTest.class.getResourceAsStream("/test-documents/testPhoneNumberExtractor.odt")) {
+ parser.parse(stream, handler, metadata, new ParseContext());
+ }
+ String[] phoneNumbers = metadata.getValues("phonenumbers");
+ assertContains("9498888888", phoneNumbers[0]);
+ assertContains("9497777777", phoneNumbers[1]);
+ assertContains("9496666666", phoneNumbers[2]);
+ assertContains("9495555555", phoneNumbers[3]);
+ assertContains("4193404645", phoneNumbers[4]);
+ assertContains("9044687081", phoneNumbers[5]);
+ assertContains("2604094811", phoneNumbers[6]);
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-app/src/test/java/org/apache/tika/utils/ServiceLoaderUtilsTest.java
----------------------------------------------------------------------
diff --git a/tika-app/src/test/java/org/apache/tika/utils/ServiceLoaderUtilsTest.java b/tika-app/src/test/java/org/apache/tika/utils/ServiceLoaderUtilsTest.java
new file mode 100644
index 0000000..62660c8
--- /dev/null
+++ b/tika-app/src/test/java/org/apache/tika/utils/ServiceLoaderUtilsTest.java
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.utils;
+
+import static org.junit.Assert.assertNotEquals;
+import static org.junit.Assert.assertTrue;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.parser.DefaultParser;
+import org.apache.tika.parser.Parser;
+import org.junit.Test;
+
+public class ServiceLoaderUtilsTest extends TikaTest {
+ @Test
+ public void testOrdering() throws Exception {
+ //make sure that non Tika parsers come last
+ //which means that they'll overwrite Tika parsers and
+ //be preferred.
+ DefaultParser defaultParser = new DefaultParser();
+ int vorbisIndex = -1;
+ int fictIndex = -1;
+ int dcxmlIndex = -1;
+ int i = 0;
+ for (Parser p : defaultParser.getAllComponentParsers()) {
+ if ("class org.gagravarr.tika.VorbisParser".equals(p.getClass().toString())) {
+ vorbisIndex = i;
+ }
+ if ("class org.apache.tika.parser.xml.FictionBookParser".equals(p.getClass().toString())) {
+ fictIndex = i;
+ }
+ if ("class org.apache.tika.parser.xml.DcXMLParser".equals(p.getClass().toString())) {
+ dcxmlIndex = i;
+ }
+ i++;
+ }
+
+ assertNotEquals(vorbisIndex, fictIndex);
+ assertNotEquals(fictIndex, dcxmlIndex);
+ assertTrue(vorbisIndex > fictIndex);
+ assertTrue(fictIndex > dcxmlIndex);
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-core/pom.xml
----------------------------------------------------------------------
diff --git a/tika-core/pom.xml b/tika-core/pom.xml
index e63f101..2c61616 100644
--- a/tika-core/pom.xml
+++ b/tika-core/pom.xml
@@ -33,8 +33,17 @@
<packaging>bundle</packaging>
<name>Apache Tika core</name>
<url>http://tika.apache.org/</url>
+ <properties>
+ <!-- NOTE: sync codec version with POI -->
+ <codec.version>1.10</codec.version>
+ </properties>
<dependencies>
+ <dependency>
+ <groupId>commons-codec</groupId>
+ <artifactId>commons-codec</artifactId>
+ <version>${codec.version}</version>
+ </dependency>
<!-- Optional OSGi dependencies, used only when running within OSGi -->
<dependency>
<groupId>org.osgi</groupId>
@@ -60,6 +69,13 @@
<artifactId>junit</artifactId>
<scope>test</scope>
</dependency>
+ <dependency>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-test-resources</artifactId>
+ <version>${project.version}</version>
+ <type>test-jar</type>
+ <scope>test</scope>
+ </dependency>
<dependency>
<groupId>org.ops4j.pax.exam</groupId>
<artifactId>pax-exam-junit4</artifactId>
@@ -108,6 +124,9 @@
<Bundle-DocURL>${project.url}</Bundle-DocURL>
<Bundle-Activator>org.apache.tika.config.TikaActivator</Bundle-Activator>
<Bundle-ActivationPolicy>lazy</Bundle-ActivationPolicy>
+ <Embed-Dependency>
+ commons-codec
+ </Embed-Dependency>
</instructions>
</configuration>
</plugin>
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-core/src/main/java/org/apache/tika/parser/digesting/CommonsDigester.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/parser/digesting/CommonsDigester.java b/tika-core/src/main/java/org/apache/tika/parser/digesting/CommonsDigester.java
new file mode 100644
index 0000000..e7b2405
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/parser/digesting/CommonsDigester.java
@@ -0,0 +1,295 @@
+package org.apache.tika.parser.digesting;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Locale;
+
+import org.apache.commons.codec.digest.DigestUtils;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.DigestingParser;
+import org.apache.tika.parser.ParseContext;
+
+/**
+ * Implementation of {@link org.apache.tika.parser.DigestingParser.Digester}
+ * that relies on commons.codec.digest.DigestUtils to calculate digest hashes.
+ * <p>
+ * This digester tries to use the regular mark/reset protocol on the InputStream.
+ * However, this wraps an internal BoundedInputStream, and if the InputStream
+ * is not fully read, then this will reset the stream and
+ * spool the InputStream to disk (via TikaInputStream) and then digest the file.
+ * <p>
+ * If a TikaInputStream is passed in and it has an underlying file that is longer
+ * than the {@link #markLimit}, then this digester digests the file directly.
+ *
+ */
+public class CommonsDigester implements DigestingParser.Digester {
+
+ public enum DigestAlgorithm {
+ //those currently available in commons.digest
+ MD2,
+ MD5,
+ SHA1,
+ SHA256,
+ SHA384,
+ SHA512;
+
+ String getMetadataKey() {
+ return TikaCoreProperties.TIKA_META_PREFIX+
+ "digest"+Metadata.NAMESPACE_PREFIX_DELIMITER+this.toString();
+ }
+ }
+
+ private final List<DigestAlgorithm> algorithms = new ArrayList<DigestAlgorithm>();
+ private final int markLimit;
+
+ public CommonsDigester(int markLimit, DigestAlgorithm... algorithms) {
+ Collections.addAll(this.algorithms, algorithms);
+ if (markLimit < 0) {
+ throw new IllegalArgumentException("markLimit must be >= 0");
+ }
+ this.markLimit = markLimit;
+ }
+
+ @Override
+ public void digest(InputStream is, Metadata m, ParseContext parseContext) throws IOException {
+ InputStream tis = TikaInputStream.get(is);
+ long sz = -1;
+ if (((TikaInputStream)tis).hasFile()) {
+ sz = ((TikaInputStream)tis).getLength();
+ }
+ //if the file is definitely a file,
+ //and its size is greater than its mark limit,
+ //just digest the underlying file.
+ if (sz > markLimit) {
+ digestFile(((TikaInputStream)tis).getFile(), m);
+ return;
+ }
+
+ //try the usual mark/reset stuff.
+ //however, if you actually hit the bound,
+ //then stop and spool to file via TikaInputStream
+ SimpleBoundedInputStream bis = new SimpleBoundedInputStream(markLimit, tis);
+ boolean finishedStream = false;
+ for (DigestAlgorithm algorithm : algorithms) {
+ bis.mark(markLimit + 1);
+ finishedStream = digestEach(algorithm, bis, m);
+ bis.reset();
+ if (!finishedStream) {
+ break;
+ }
+ }
+ if (!finishedStream) {
+ digestFile(((TikaInputStream)tis).getFile(), m);
+ }
+ }
+
+ private void digestFile(File f, Metadata m) throws IOException {
+ for (DigestAlgorithm algorithm : algorithms) {
+ try (InputStream is = new FileInputStream(f)) {
+ digestEach(algorithm, is, m);
+ }
+ }
+ }
+
+ /**
+ *
+ * @param algorithm algo to use
+ * @param is input stream to read from
+ * @param metadata metadata for reporting the digest
+ * @return whether or not this finished the input stream
+ * @throws IOException
+ */
+ private boolean digestEach(DigestAlgorithm algorithm,
+ InputStream is, Metadata metadata) throws IOException {
+ String digest = null;
+ try {
+ switch (algorithm) {
+ case MD2:
+ digest = DigestUtils.md2Hex(is);
+ break;
+ case MD5:
+ digest = DigestUtils.md5Hex(is);
+ break;
+ case SHA1:
+ digest = DigestUtils.sha1Hex(is);
+ break;
+ case SHA256:
+ digest = DigestUtils.sha256Hex(is);
+ break;
+ case SHA384:
+ digest = DigestUtils.sha384Hex(is);
+ break;
+ case SHA512:
+ digest = DigestUtils.sha512Hex(is);
+ break;
+ default:
+ throw new IllegalArgumentException("Sorry, not aware of algorithm: " + algorithm.toString());
+ }
+ } catch (IOException e) {
+ e.printStackTrace();
+ //swallow, or should we throw this?
+ }
+ if (is instanceof SimpleBoundedInputStream) {
+ if (((SimpleBoundedInputStream)is).hasHitBound()) {
+ return false;
+ }
+ }
+ metadata.set(algorithm.getMetadataKey(), digest);
+ return true;
+ }
+
+ /**
+ *
+ * @param s comma-delimited (no space) list of algorithms to use: md5,sha256
+ * @return
+ */
+ public static DigestAlgorithm[] parse(String s) {
+ assert(s != null);
+
+ List<DigestAlgorithm> ret = new ArrayList<DigestAlgorithm>();
+ for (String algoString : s.split(",")) {
+ String uc = algoString.toUpperCase(Locale.ROOT);
+ if (uc.equals(DigestAlgorithm.MD2.toString())) {
+ ret.add(DigestAlgorithm.MD2);
+ } else if (uc.equals(DigestAlgorithm.MD5.toString())) {
+ ret.add(DigestAlgorithm.MD5);
+ } else if (uc.equals(DigestAlgorithm.SHA1.toString())) {
+ ret.add(DigestAlgorithm.SHA1);
+ } else if (uc.equals(DigestAlgorithm.SHA256.toString())) {
+ ret.add(DigestAlgorithm.SHA256);
+ } else if (uc.equals(DigestAlgorithm.SHA384.toString())) {
+ ret.add(DigestAlgorithm.SHA384);
+ } else if (uc.equals(DigestAlgorithm.SHA512.toString())) {
+ ret.add(DigestAlgorithm.SHA512);
+ } else {
+ StringBuilder sb = new StringBuilder();
+ int i = 0;
+ for (DigestAlgorithm algo : DigestAlgorithm.values()) {
+ if (i++ > 0) {
+ sb.append(", ");
+ }
+ sb.append(algo.toString());
+ }
+ throw new IllegalArgumentException("Couldn't match " + s + " with any of: " + sb.toString());
+ }
+ }
+ return ret.toArray(new DigestAlgorithm[ret.size()]);
+ }
+
+ /**
+ * Very slight modification of Commons' BoundedInputStream
+ * so that we can figure out if this hit the bound or not.
+ */
+ private class SimpleBoundedInputStream extends InputStream {
+ private final static int EOF = -1;
+ private final long max;
+ private final InputStream in;
+ private long pos;
+ boolean hitBound = false;
+
+ private SimpleBoundedInputStream(long max, InputStream in) {
+ this.max = max;
+ this.in = in;
+ }
+
+ @Override
+ public int read() throws IOException {
+ if (max >= 0 && pos >= max) {
+ hitBound = true;
+ return EOF;
+ }
+ final int result = in.read();
+ pos++;
+ return result;
+ }
+
+ /**
+ * Invokes the delegate's <code>read(byte[])</code> method.
+ * @param b the buffer to read the bytes into
+ * @return the number of bytes read or -1 if the end of stream or
+ * the limit has been reached.
+ * @throws IOException if an I/O error occurs
+ */
+ @Override
+ public int read(final byte[] b) throws IOException {
+ return this.read(b, 0, b.length);
+ }
+
+ /**
+ * Invokes the delegate's <code>read(byte[], int, int)</code> method.
+ * @param b the buffer to read the bytes into
+ * @param off The start offset
+ * @param len The number of bytes to read
+ * @return the number of bytes read or -1 if the end of stream or
+ * the limit has been reached.
+ * @throws IOException if an I/O error occurs
+ */
+ @Override
+ public int read(final byte[] b, final int off, final int len) throws IOException {
+ if (max>=0 && pos>=max) {
+ return EOF;
+ }
+ final long maxRead = max>=0 ? Math.min(len, max-pos) : len;
+ final int bytesRead = in.read(b, off, (int)maxRead);
+
+ if (bytesRead==EOF) {
+ return EOF;
+ }
+
+ pos+=bytesRead;
+ return bytesRead;
+ }
+
+ /**
+ * Invokes the delegate's <code>skip(long)</code> method.
+ * @param n the number of bytes to skip
+ * @return the actual number of bytes skipped
+ * @throws IOException if an I/O error occurs
+ */
+ @Override
+ public long skip(final long n) throws IOException {
+ final long toSkip = max>=0 ? Math.min(n, max-pos) : n;
+ final long skippedBytes = in.skip(toSkip);
+ pos+=skippedBytes;
+ return skippedBytes;
+ }
+
+ @Override
+ public void reset() throws IOException {
+ in.reset();
+ }
+
+ @Override
+ public void mark(int readLimit) {
+ in.mark(readLimit);
+ }
+
+ public boolean hasHitBound() {
+ return hitBound;
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-core/src/test/java/org/apache/tika/TikaTest.java
----------------------------------------------------------------------
diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java
index 2c6f21f..1edf91c 100644
--- a/tika-core/src/test/java/org/apache/tika/TikaTest.java
+++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java
@@ -26,6 +26,9 @@ import java.io.IOException;
import java.io.InputStream;
import java.net.URISyntaxException;
import java.net.URL;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardCopyOption;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
@@ -74,6 +77,25 @@ public abstract class TikaTest {
}
}
+
+ /**
+ * Copies test file from "test-documents" to a temp file.
+ * Consumers are responsible for deleting the temp file after use.
+ *
+ * @param name
+ * @return
+ * @throws IOException
+ */
+ public Path getTestDocumentAsTempFile(String name) throws IOException{
+ Path tmp = Files.createTempFile("tika-test", "");
+ Files.copy(getResourceAsStream("/test-documents/"+name), tmp, StandardCopyOption.REPLACE_EXISTING);
+ return tmp;
+ }
+
+ public InputStream getTestDocumentAsStream(String name) {
+ return TikaInputStream.get(getResourceAsStream("/test-documents/"+name));
+ }
+
public InputStream getResourceAsStream(String name) {
InputStream stream = this.getClass().getResourceAsStream(name);
if (stream == null) {
@@ -106,36 +128,50 @@ public abstract class TikaTest {
}
}
+ protected XMLResult getXML(String filePath, Parser parser, Metadata metadata, ParseContext context) throws Exception {
+ return getXML(getTestDocumentAsStream(filePath), parser, metadata, context);
+ }
+
protected XMLResult getXML(String filePath, Parser parser, Metadata metadata) throws Exception {
- return getXML(getResourceAsStream("/test-documents/" + filePath), parser, metadata);
+ return getXML(getTestDocumentAsStream(filePath), parser, metadata);
}
protected XMLResult getXML(String filePath, Metadata metadata) throws Exception {
- return getXML(getResourceAsStream("/test-documents/" + filePath), new AutoDetectParser(), metadata);
+ Parser parser = new AutoDetectParser();
+ ParseContext context = new ParseContext();
+ context.set(Parser.class, parser);
+
+ return getXML(getTestDocumentAsStream(filePath), parser, metadata, context);
+ }
+
+ protected XMLResult getXML(String filePath, Parser parser) throws Exception {
+ //send in empty parse context so that only outer parser is used
+ return getXML(getTestDocumentAsStream(filePath), parser, new Metadata(), new ParseContext());
}
protected XMLResult getXML(String filePath) throws Exception {
- return getXML(getResourceAsStream("/test-documents/" + filePath), new AutoDetectParser(), new Metadata());
+ return getXML(filePath, new Metadata());
}
protected XMLResult getXML(InputStream input, Parser parser, Metadata metadata) throws Exception {
- ParseContext context = new ParseContext();
- context.set(Parser.class, parser);
-
- try {
- ContentHandler handler = new ToXMLContentHandler();
- parser.parse(input, handler, metadata, context);
- return new XMLResult(handler.toString(), metadata);
- } finally {
- input.close();
- }
- }
+ return getXML(input, parser, metadata, new ParseContext());
+ }
- /**
- * Basic text extraction.
- * <p>
- * Tries to close input stream after processing.
- */
+ protected XMLResult getXML(InputStream input, Parser parser, Metadata metadata, ParseContext context) throws Exception {
+ try {
+ ContentHandler handler = new ToXMLContentHandler();
+ parser.parse(input, handler, metadata, context);
+ return new XMLResult(handler.toString(), metadata);
+ } finally {
+ input.close();
+ }
+ }
+
+ /**
+ * Basic text extraction.
+ * <p>
+ * Tries to close input stream after processing.
+ */
public String getText(InputStream is, Parser parser, ParseContext context, Metadata metadata) throws Exception{
ContentHandler handler = new BodyContentHandler(1000000);
try {
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-core/src/test/java/org/apache/tika/detect/MimeDetectionWithNNTest.java
----------------------------------------------------------------------
diff --git a/tika-core/src/test/java/org/apache/tika/detect/MimeDetectionWithNNTest.java b/tika-core/src/test/java/org/apache/tika/detect/MimeDetectionWithNNTest.java
index c815607..d2f3b40 100644
--- a/tika-core/src/test/java/org/apache/tika/detect/MimeDetectionWithNNTest.java
+++ b/tika-core/src/test/java/org/apache/tika/detect/MimeDetectionWithNNTest.java
@@ -22,13 +22,13 @@ import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
+import org.apache.tika.TikaTest;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
-import org.apache.tika.mime.MimeDetectionTest;
import org.junit.Before;
import org.junit.Test;
-public class MimeDetectionWithNNTest {
+public class MimeDetectionWithNNTest extends TikaTest {
private Detector detector;
@@ -88,13 +88,13 @@ public class MimeDetectionWithNNTest {
private void testUrl(String expected, String url, String file)
throws IOException {
- InputStream in = MimeDetectionTest.class.getResourceAsStream(file);
+ InputStream in = getTestDocumentAsStream(file);
testStream(expected, url, in);
}
private void testFile(String expected, String filename) throws IOException {
- InputStream in = MimeDetectionTest.class.getResourceAsStream(filename);
+ InputStream in = getTestDocumentAsStream(filename);
testStream(expected, filename, in);
}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java
----------------------------------------------------------------------
diff --git a/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java b/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java
index 1f986da..31df3ec 100644
--- a/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java
+++ b/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java
@@ -27,12 +27,13 @@ import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
+import org.apache.tika.TikaTest;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
import org.junit.Before;
import org.junit.Test;
-public class MimeDetectionTest {
+public class MimeDetectionTest extends TikaTest {
private MimeTypes mimeTypes;
@@ -136,12 +137,12 @@ public class MimeDetectionTest {
}
private void testUrl(String expected, String url, String file) throws IOException{
- InputStream in = getClass().getResourceAsStream(file);
+ InputStream in = getTestDocumentAsStream(file);
testStream(expected, url, in);
}
private void testFile(String expected, String filename) throws IOException {
- InputStream in = getClass().getResourceAsStream(filename);
+ InputStream in = getTestDocumentAsStream(filename);
testStream(expected, filename, in);
}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTest.java
----------------------------------------------------------------------
diff --git a/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTest.java b/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTest.java
index 35c75b7..415961f 100644
--- a/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTest.java
+++ b/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTest.java
@@ -27,11 +27,12 @@ import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
+import org.apache.tika.TikaTest;
import org.apache.tika.metadata.Metadata;
import org.junit.Before;
import org.junit.Test;
-public class ProbabilisticMimeDetectionTest {
+public class ProbabilisticMimeDetectionTest extends TikaTest {
private ProbabilisticMimeDetectionSelector proDetector;
@@ -130,12 +131,12 @@ public class ProbabilisticMimeDetectionTest {
private void testUrl(String expected, String url, String file)
throws IOException {
- InputStream in = getClass().getResourceAsStream(file);
+ InputStream in = getTestDocumentAsStream(file);
testStream(expected, url, in);
}
private void testFile(String expected, String filename) throws IOException {
- InputStream in = getClass().getResourceAsStream(filename);
+ InputStream in = getTestDocumentAsStream(filename);
testStream(expected, filename, in);
}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTestWithTika.java
----------------------------------------------------------------------
diff --git a/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTestWithTika.java b/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTestWithTika.java
index 5605300..a6dc7f3 100644
--- a/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTestWithTika.java
+++ b/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTestWithTika.java
@@ -29,6 +29,7 @@ import java.io.InputStream;
import java.net.URL;
import org.apache.tika.Tika;
+import org.apache.tika.TikaTest;
import org.apache.tika.config.ServiceLoader;
import org.apache.tika.detect.DefaultProbDetector;
import org.apache.tika.metadata.Metadata;
@@ -36,7 +37,7 @@ import org.apache.tika.mime.ProbabilisticMimeDetectionSelector.Builder;
import org.junit.Before;
import org.junit.Test;
-public class ProbabilisticMimeDetectionTestWithTika {
+public class ProbabilisticMimeDetectionTestWithTika extends TikaTest {
private ProbabilisticMimeDetectionSelector proSelector;
private MediaTypeRegistry registry;
@@ -151,12 +152,12 @@ public class ProbabilisticMimeDetectionTestWithTika {
private void testUrl(String expected, String url, String file)
throws IOException {
- InputStream in = getClass().getResourceAsStream(file);
+ InputStream in = getTestDocumentAsStream(file);
testStream(expected, url, in);
}
private void testFile(String expected, String filename) throws IOException {
- InputStream in = getClass().getResourceAsStream(filename);
+ InputStream in = getTestDocumentAsStream(filename);
testStream(expected, filename, in);
}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-core/src/test/java/org/apache/tika/osgi/BundleIT.java
----------------------------------------------------------------------
diff --git a/tika-core/src/test/java/org/apache/tika/osgi/BundleIT.java b/tika-core/src/test/java/org/apache/tika/osgi/BundleIT.java
index f3397d9..696d5e6 100644
--- a/tika-core/src/test/java/org/apache/tika/osgi/BundleIT.java
+++ b/tika-core/src/test/java/org/apache/tika/osgi/BundleIT.java
@@ -18,27 +18,17 @@ package org.apache.tika.osgi;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
-import static org.junit.Assert.assertNotNull;
import static org.ops4j.pax.exam.CoreOptions.bundle;
import static org.ops4j.pax.exam.CoreOptions.junitBundles;
import static org.ops4j.pax.exam.CoreOptions.options;
-import static org.ops4j.pax.exam.CoreOptions.mavenBundle;
import javax.inject.Inject;
-
import java.io.File;
import java.io.IOException;
-import java.io.InputStream;
-import java.io.StringWriter;
-import java.io.Writer;
import java.net.URISyntaxException;
import java.util.Set;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.osgi.TikaService;
import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.BodyContentHandler;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.ops4j.pax.exam.Configuration;
@@ -48,7 +38,6 @@ import org.ops4j.pax.exam.spi.reactors.ExamReactorStrategy;
import org.ops4j.pax.exam.spi.reactors.PerMethod;
import org.osgi.framework.Bundle;
import org.osgi.framework.BundleContext;
-import org.xml.sax.ContentHandler;
@RunWith(PaxExam.class)
@ExamReactorStrategy(PerMethod.class)
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-core/src/test/resources/org/apache/tika/mime/GLDAS_CLM10SUBP_3H.A19790202.0000.001.grb
----------------------------------------------------------------------
diff --git a/tika-core/src/test/resources/org/apache/tika/mime/GLDAS_CLM10SUBP_3H.A19790202.0000.001.grb b/tika-core/src/test/resources/org/apache/tika/mime/GLDAS_CLM10SUBP_3H.A19790202.0000.001.grb
deleted file mode 100644
index 0bffdca..0000000
Binary files a/tika-core/src/test/resources/org/apache/tika/mime/GLDAS_CLM10SUBP_3H.A19790202.0000.001.grb and /dev/null differ
[05/13] tika git commit: TIKA-1855 -- first pass. Need to turn back
on the forbidden-apis testCheck. More clean up remains.
Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
deleted file mode 100644
index c3d13b7..0000000
--- a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
+++ /dev/null
@@ -1,1047 +0,0 @@
-/*
-* Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.mime;
-
-// Junit imports
-import static java.nio.charset.StandardCharsets.UTF_16BE;
-import static java.nio.charset.StandardCharsets.UTF_16LE;
-import static java.nio.charset.StandardCharsets.UTF_8;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.assertNotSame;
-
-import java.io.ByteArrayInputStream;
-import java.io.File;
-import java.io.IOException;
-import java.io.InputStream;
-import java.net.URL;
-
-import org.apache.tika.Tika;
-import org.apache.tika.config.TikaConfig;
-import org.apache.tika.metadata.Metadata;
-import org.junit.Before;
-import org.junit.Test;
-
-/**
- *
- * Test Suite for the {@link MimeTypes} repository.
- *
- */
-public class TestMimeTypes {
-
- private Tika tika;
-
- private MimeTypes repo;
-
- private URL u;
-
- private static final File f = new File("/a/b/c/x.pdf");
-
- @Before
- public void setUp() throws Exception{
- TikaConfig config = TikaConfig.getDefaultConfig();
- repo = config.getMimeRepository();
- tika = new Tika(config);
- u = new URL("http://mydomain.com/x.pdf?x=y");
- }
-
- @Test
- public void testCaseSensitivity() {
- String type = tika.detect("test.PDF");
- assertNotNull(type);
- assertEquals(type, tika.detect("test.pdf"));
- assertEquals(type, tika.detect("test.PdF"));
- assertEquals(type, tika.detect("test.pdF"));
- }
-
- @Test
- public void testLoadMimeTypes() throws MimeTypeException {
- assertNotNull(repo.forName("application/octet-stream"));
- assertNotNull(repo.forName("text/x-tex"));
- }
-
- /**
- * Tests MIME type determination based solely on the URL's extension.
- */
- @Test
- public void testGuessMimeTypes() throws Exception {
- assertTypeByName("application/pdf", "x.pdf");
- assertEquals("application/pdf", tika.detect(u.toExternalForm()));
- assertEquals("application/pdf", tika.detect(f.getPath()));
- assertTypeByName("text/plain", "x.txt");
- assertTypeByName("text/html", "x.htm");
- assertTypeByName("text/html", "x.html");
- assertTypeByName("application/xhtml+xml", "x.xhtml");
- assertTypeByName("application/xml", "x.xml");
- assertTypeByName("application/zip", "x.zip");
- assertTypeByName("application/vnd.oasis.opendocument.text", "x.odt");
- assertTypeByName("application/octet-stream", "x.unknown");
-
- // Test for the MS Office media types and file extensions listed in
- // http://blogs.msdn.com/vsofficedeveloper/pages/Office-2007-Open-XML-MIME-Types.aspx
- assertTypeByName("application/msword", "x.doc");
- assertTypeByName("application/msword", "x.dot");
- assertTypeByName("application/vnd.openxmlformats-officedocument.wordprocessingml.document", "x.docx");
- assertTypeByName("application/vnd.openxmlformats-officedocument.wordprocessingml.template", "x.dotx");
- assertTypeByName("application/vnd.ms-word.document.macroenabled.12", "x.docm");
- assertTypeByName("application/vnd.ms-word.template.macroenabled.12", "x.dotm");
- assertTypeByName("application/vnd.ms-excel", "x.xls");
- assertTypeByName("application/vnd.ms-excel", "x.xlt");
- assertTypeByName("application/vnd.ms-excel", "x.xla");
- assertTypeByName("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "x.xlsx");
- assertTypeByName("application/vnd.openxmlformats-officedocument.spreadsheetml.template", "x.xltx");
- assertTypeByName("application/vnd.ms-excel.sheet.macroenabled.12", "x.xlsm");
- assertTypeByName("application/vnd.ms-excel.template.macroenabled.12", "x.xltm");
- assertTypeByName("application/vnd.ms-excel.addin.macroenabled.12", "x.xlam");
- assertTypeByName("application/vnd.ms-excel.sheet.binary.macroenabled.12", "x.xlsb");
- assertTypeByName("application/vnd.ms-powerpoint", "x.ppt");
- assertTypeByName("application/vnd.ms-powerpoint", "x.pot");
- assertTypeByName("application/vnd.ms-powerpoint", "x.pps");
- assertTypeByName("application/vnd.ms-powerpoint", "x.ppa");
- assertTypeByName("application/vnd.openxmlformats-officedocument.presentationml.presentation", "x.pptx");
- assertTypeByName("application/vnd.openxmlformats-officedocument.presentationml.template", "x.potx");
- assertTypeByName("application/vnd.openxmlformats-officedocument.presentationml.slideshow", "x.ppsx");
- assertTypeByName("application/vnd.ms-powerpoint.addin.macroenabled.12", "x.ppam");
- assertTypeByName("application/vnd.ms-powerpoint.presentation.macroenabled.12", "x.pptm");
- assertTypeByName("application/vnd.ms-powerpoint.template.macroenabled.12", "x.potm");
- assertTypeByName("application/vnd.ms-powerpoint.slideshow.macroenabled.12", "x.ppsm");
- }
-
- /**
- * Note - detecting container formats by mime magic is very very
- * iffy, as we can't be sure where things will end up.
- * People really ought to use the container aware detection...
- */
- @Test
- public void testOLE2Detection() throws Exception {
- // These have the properties block near the start, so our mime
- // magic will spot them
- assertTypeByData("application/vnd.ms-excel", "testEXCEL.xls");
-
- // This one quite legitimately doesn't have its properties block
- // as one of the first couple of entries
- // As such, our mime magic can't figure it out...
- assertTypeByData("application/x-tika-msoffice", "testWORD.doc");
- assertTypeByData("application/x-tika-msoffice", "testPPT.ppt");
-
-
- // By name + data:
-
- // Those we got right to start with are fine
- assertTypeByNameAndData("application/vnd.ms-excel","testEXCEL.xls");
-
- // And the name lets us specialise the generic OOXML
- // ones to their actual type
- assertTypeByNameAndData("application/vnd.ms-powerpoint", "testPPT.ppt");
- assertTypeByNameAndData("application/msword", "testWORD.doc");
- }
-
- /**
- * Files generated by Works 7.0 Spreadsheet application use the OLE2
- * structure and resemble Excel files (they contain a "Workbook"). They are
- * not Excel though. They are distinguished from Excel files with an
- * additional top-level entry in below the root of the POI filesystem.
- *
- * @throws Exception
- */
- @Test
- public void testWorksSpreadsheetDetection() throws Exception {
- assertTypeDetection("testWORKSSpreadsheet7.0.xlr",
- // with name-only, everything should be all right
- "application/x-tika-msworks-spreadsheet",
- // this is possible due to MimeTypes guessing the type
- // based on the WksSSWorkBook near the beginning of the
- // file
- "application/x-tika-msworks-spreadsheet",
- // this is right, the magic-based detection works, there is
- // no need for the name-based detection to refine it
- "application/x-tika-msworks-spreadsheet");
- }
-
- @Test
- public void testStarOfficeDetection() throws Exception {
- assertTypeDetection("testVORCalcTemplate.vor",
- "application/x-staroffice-template",
- "application/vnd.stardivision.calc",
- "application/vnd.stardivision.calc");
- assertTypeDetection("testVORDrawTemplate.vor",
- "application/x-staroffice-template",
- "application/vnd.stardivision.draw",
- "application/vnd.stardivision.draw");
- assertTypeDetection("testVORImpressTemplate.vor",
- "application/x-staroffice-template",
- "application/vnd.stardivision.impress",
- "application/vnd.stardivision.impress");
- assertTypeDetection("testVORWriterTemplate.vor",
- "application/x-staroffice-template",
- "application/vnd.stardivision.writer",
- "application/vnd.stardivision.writer");
-
- assertTypeDetection("testStarOffice-5.2-calc.sdc",
- "application/vnd.stardivision.calc",
- "application/vnd.stardivision.calc",
- "application/vnd.stardivision.calc");
- assertTypeDetection("testStarOffice-5.2-draw.sda",
- "application/vnd.stardivision.draw",
- "application/vnd.stardivision.draw",
- "application/vnd.stardivision.draw");
- assertTypeDetection("testStarOffice-5.2-impress.sdd",
- "application/vnd.stardivision.impress",
- "application/vnd.stardivision.impress",
- "application/vnd.stardivision.impress");
- assertTypeDetection("testStarOffice-5.2-writer.sdw",
- "application/vnd.stardivision.writer",
- "application/vnd.stardivision.writer",
- "application/vnd.stardivision.writer");
- }
-
- /**
- * Files generated by Works Word Processor versions 3.0 and 4.0 use the
- * OLE2 structure. They don't resemble Word though.
- *
- * @throws Exception
- */
- @Test
- public void testOldWorksWordProcessorDetection() throws Exception {
- assertTypeDetection(
- "testWORKSWordProcessor3.0.wps",
- // .wps is just like any other works extension
- "application/vnd.ms-works",
- // this is due to MatOST substring
- "application/vnd.ms-works",
- // magic-based detection works, no need to refine it
- "application/vnd.ms-works");
-
- // files in version 4.0 are no different from those in version 3.0
- assertTypeDetection(
- "testWORKSWordProcessor4.0.wps",
- "application/vnd.ms-works",
- "application/vnd.ms-works",
- "application/vnd.ms-works");
- }
-
- /**
- * Files from Excel 2 through 4 are based on the BIFF record
- * structure, but without a wrapping OLE2 structure.
- * Excel 5 and Excel 95+ work on OLE2
- */
- @Test
- public void testOldExcel() throws Exception {
- // With just a name, we'll think everything's a new Excel file
- assertTypeByName("application/vnd.ms-excel","testEXCEL_4.xls");
- assertTypeByName("application/vnd.ms-excel","testEXCEL_5.xls");
- assertTypeByName("application/vnd.ms-excel","testEXCEL_95.xls");
-
- // With data, we can work out if it's old or new style
- assertTypeByData("application/vnd.ms-excel.sheet.4","testEXCEL_4.xls");
- assertTypeByData("application/x-tika-msoffice","testEXCEL_5.xls");
- assertTypeByData("application/x-tika-msoffice","testEXCEL_95.xls");
-
- assertTypeByNameAndData("application/vnd.ms-excel.sheet.4","testEXCEL_4.xls");
- assertTypeByNameAndData("application/vnd.ms-excel","testEXCEL_5.xls");
- assertTypeByNameAndData("application/vnd.ms-excel","testEXCEL_95.xls");
- }
-
- /**
- * Note - detecting container formats by mime magic is very very
- * iffy, as we can't be sure where things will end up.
- * People really ought to use the container aware detection...
- */
- @Test
- public void testOoxmlDetection() throws Exception {
- // These two do luckily have [Content_Types].xml near the start,
- // so our mime magic will spot them
- assertTypeByData("application/x-tika-ooxml", "testEXCEL.xlsx");
- assertTypeByData("application/x-tika-ooxml", "testPPT.pptx");
-
- // This one quite legitimately doesn't have its [Content_Types].xml
- // file as one of the first couple of entries
- // As such, our mime magic can't figure it out...
- assertTypeByData("application/zip", "testWORD.docx");
-
- // If we give the filename as well as the data, we can
- // specialise the ooxml generic one to the correct type
- assertTypeByNameAndData("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "testEXCEL.xlsx");
- assertTypeByNameAndData("application/vnd.openxmlformats-officedocument.presentationml.presentation", "testPPT.pptx");
- assertTypeByNameAndData("application/vnd.openxmlformats-officedocument.wordprocessingml.document", "testWORD.docx");
-
- // Test a few of the less usual ones
- assertTypeByNameAndData("application/vnd.ms-excel.sheet.binary.macroenabled.12","testEXCEL.xlsb");
- assertTypeByNameAndData("application/vnd.ms-powerpoint.presentation.macroenabled.12", "testPPT.pptm");
- assertTypeByNameAndData("application/vnd.ms-powerpoint.template.macroenabled.12", "testPPT.potm");
- assertTypeByNameAndData("application/vnd.ms-powerpoint.slideshow.macroenabled.12", "testPPT.ppsm");
- }
-
- /**
- * Note - container based formats, needs container detection
- * to be properly correct
- */
- @Test
- public void testVisioDetection() throws Exception {
- // By Name, should get it right
- assertTypeByName("application/vnd.visio", "testVISIO.vsd");
- assertTypeByName("application/vnd.ms-visio.drawing.macroenabled.12", "testVISIO.vsdm");
- assertTypeByName("application/vnd.ms-visio.drawing", "testVISIO.vsdx");
- assertTypeByName("application/vnd.ms-visio.stencil.macroenabled.12", "testVISIO.vssm");
- assertTypeByName("application/vnd.ms-visio.stencil", "testVISIO.vssx");
- assertTypeByName("application/vnd.ms-visio.template.macroenabled.12", "testVISIO.vstm");
- assertTypeByName("application/vnd.ms-visio.template", "testVISIO.vstx");
-
- // By Name and Data, should get it right
- assertTypeByNameAndData("application/vnd.visio", "testVISIO.vsd");
- assertTypeByNameAndData("application/vnd.ms-visio.drawing.macroenabled.12", "testVISIO.vsdm");
- assertTypeByNameAndData("application/vnd.ms-visio.drawing", "testVISIO.vsdx");
- assertTypeByNameAndData("application/vnd.ms-visio.stencil.macroenabled.12", "testVISIO.vssm");
- assertTypeByNameAndData("application/vnd.ms-visio.stencil", "testVISIO.vssx");
- assertTypeByNameAndData("application/vnd.ms-visio.template.macroenabled.12", "testVISIO.vstm");
- assertTypeByNameAndData("application/vnd.ms-visio.template", "testVISIO.vstx");
-
- // By Data only, will get the container parent
- assertTypeByData("application/x-tika-msoffice", "testVISIO.vsd");
- assertTypeByData("application/x-tika-ooxml", "testVISIO.vsdm");
- assertTypeByData("application/x-tika-ooxml", "testVISIO.vsdx");
- assertTypeByData("application/x-tika-ooxml", "testVISIO.vssm");
- assertTypeByData("application/x-tika-ooxml", "testVISIO.vssx");
- assertTypeByData("application/x-tika-ooxml", "testVISIO.vstm");
- assertTypeByData("application/x-tika-ooxml", "testVISIO.vstx");
- }
-
- /**
- * Note - detecting container formats by mime magic is very very
- * iffy, as we can't be sure where things will end up.
- * People really ought to use the container aware detection...
- */
- @Test
- public void testIWorkDetection() throws Exception {
- // By name is easy
- assertTypeByName("application/vnd.apple.keynote", "testKeynote.key");
- assertTypeByName("application/vnd.apple.numbers", "testNumbers.numbers");
- assertTypeByName("application/vnd.apple.pages", "testPages.pages");
-
- // We can't do it by data, as we'd need to unpack
- // the zip file to check the XML
- assertTypeByData("application/zip", "testKeynote.key");
-
- assertTypeByNameAndData("application/vnd.apple.keynote", "testKeynote.key");
- assertTypeByNameAndData("application/vnd.apple.numbers", "testNumbers.numbers");
- assertTypeByNameAndData("application/vnd.apple.pages", "testPages.pages");
- }
-
- @Test
- public void testArchiveDetection() throws Exception {
- assertTypeByName("application/x-archive", "test.ar");
- assertTypeByName("application/zip", "test.zip");
- assertTypeByName("application/x-tar", "test.tar");
- assertTypeByName("application/gzip", "test.tgz"); // See GZIP, not tar contents of it
- assertTypeByName("application/x-cpio", "test.cpio");
-
- // TODO Add an example .deb and .udeb, then check these
-
- // Check the mime magic patterns for them work too
- assertTypeByData("application/x-archive", "testARofText.ar");
- assertTypeByData("application/x-archive", "testARofSND.ar");
- assertTypeByData("application/zip", "test-documents.zip");
- assertTypeByData("application/x-gtar", "test-documents.tar"); // GNU TAR
- assertTypeByData("application/gzip", "test-documents.tgz"); // See GZIP, not tar contents of it
- assertTypeByData("application/x-cpio", "test-documents.cpio");
-
- // For spanned zip files, the .zip file doesn't have the header, it's the other parts
- assertTypeByData("application/octet-stream", "test-documents-spanned.zip");
- assertTypeByData("application/zip", "test-documents-spanned.z01");
- }
-
- @Test
- public void testFeedsDetection() throws Exception {
- assertType("application/rss+xml", "rsstest.rss");
- assertType("application/atom+xml", "testATOM.atom");
- assertTypeByData("application/rss+xml", "rsstest.rss");
- assertTypeByName("application/rss+xml", "rsstest.rss");
- assertTypeByData("application/atom+xml", "testATOM.atom");
- assertTypeByName("application/atom+xml", "testATOM.atom");
- }
-
- @Test
- public void testFitsDetection() throws Exception {
- // FITS image created using imagemagick convert of testJPEG.jpg
- assertType("application/fits", "testFITS.fits");
- assertTypeByData("application/fits", "testFITS.fits");
- assertTypeByName("application/fits", "testFITS.fits");
- }
-
- @Test
- public void testJpegDetection() throws Exception {
- assertType("image/jpeg", "testJPEG.jpg");
- assertTypeByData("image/jpeg", "testJPEG.jpg");
- assertTypeByName("image/jpeg", "x.jpg");
- assertTypeByName("image/jpeg", "x.JPG");
- assertTypeByName("image/jpeg", "x.jpeg");
- assertTypeByName("image/jpeg", "x.JPEG");
- assertTypeByName("image/jpeg", "x.jpe");
- assertTypeByName("image/jpeg", "x.jif");
- assertTypeByName("image/jpeg", "x.jfif");
- assertTypeByName("image/jpeg", "x.jfi");
-
- assertType("image/jp2", "testJPEG.jp2");
- assertTypeByData("image/jp2", "testJPEG.jp2");
- assertTypeByName("image/jp2", "x.jp2");
- }
-
- @Test
- public void testBpgDetection() throws Exception {
- assertType("image/x-bpg", "testBPG.bpg");
- assertTypeByData("image/x-bpg", "testBPG.bpg");
- assertTypeByData("image/x-bpg", "testBPG_commented.bpg");
- assertTypeByName("image/x-bpg", "x.bpg");
- }
-
- @Test
- public void testTiffDetection() throws Exception {
- assertType("image/tiff", "testTIFF.tif");
- assertTypeByData("image/tiff", "testTIFF.tif");
- assertTypeByName("image/tiff", "x.tiff");
- assertTypeByName("image/tiff", "x.tif");
- assertTypeByName("image/tiff", "x.TIF");
- }
-
- @Test
- public void testGifDetection() throws Exception {
- assertType("image/gif", "testGIF.gif");
- assertTypeByData("image/gif", "testGIF.gif");
- assertTypeByName("image/gif", "x.gif");
- assertTypeByName("image/gif", "x.GIF");
- }
-
- @Test
- public void testPngDetection() throws Exception {
- assertType("image/png", "testPNG.png");
- assertTypeByData("image/png", "testPNG.png");
- assertTypeByName("image/png", "x.png");
- assertTypeByName("image/png", "x.PNG");
- }
-
- @Test
- public void testWEBPDetection() throws Exception {
- assertType("image/webp", "testWEBP.webp");
- assertTypeByData("image/webp", "testWEBP.webp");
- assertTypeByName("image/webp", "x.webp");
- assertTypeByName("image/webp", "x.WEBP");
- }
-
- @Test
- public void testBmpDetection() throws Exception {
- assertType("image/x-ms-bmp", "testBMP.bmp");
- assertTypeByData("image/x-ms-bmp", "testBMP.bmp");
- assertTypeByName("image/x-ms-bmp", "x.bmp");
- assertTypeByName("image/x-ms-bmp", "x.BMP");
- assertTypeByName("image/x-ms-bmp", "x.dib");
- assertTypeByName("image/x-ms-bmp", "x.DIB");
- //false positive check -- contains part of BMP signature
- assertType("text/plain", "testBMPfp.txt");
- }
-
- @Test
- public void testPnmDetection() throws Exception {
- assertType("image/x-portable-bitmap", "testPBM.pbm");
- assertType("image/x-portable-graymap", "testPGM.pgm");
- assertType("image/x-portable-pixmap", "testPPM.ppm");
- assertTypeByData("image/x-portable-bitmap", "testPBM.pbm");
- assertTypeByData("image/x-portable-graymap", "testPGM.pgm");
- assertTypeByData("image/x-portable-pixmap", "testPPM.ppm");
- assertTypeByName("image/x-portable-anymap", "x.pnm");
- assertTypeByName("image/x-portable-anymap", "x.PNM");
- assertTypeByName("image/x-portable-bitmap", "x.pbm");
- assertTypeByName("image/x-portable-bitmap", "x.PBM");
- assertTypeByName("image/x-portable-graymap", "x.pgm");
- assertTypeByName("image/x-portable-graymap", "x.PGM");
- assertTypeByName("image/x-portable-pixmap", "x.ppm");
- assertTypeByName("image/x-portable-pixmap", "x.PPM");
- }
-
- @Test
- public void testPictDetection() throws Exception {
- assertType("image/x-pict", "testPICT.pct");
- assertTypeByData("image/x-pict", "testPICT.pct");
- assertTypeByName("image/x-pict", "x.pic");
- assertTypeByName("image/x-pict", "x.PCT");
- }
-
- @Test
- public void testCgmDetection() throws Exception {
- // TODO: Need a test image file
- assertTypeByName("image/cgm", "x.cgm");
- assertTypeByName("image/cgm", "x.CGM");
- }
-
- @Test
- public void testRdfXmlDetection() throws Exception {
- assertTypeByName("application/rdf+xml", "x.rdf");
- assertTypeByName("application/rdf+xml", "x.owl");
- }
-
- @Test
- public void testSvgDetection() throws Exception {
- assertType("image/svg+xml", "testSVG.svg");
- assertTypeByData("image/svg+xml", "testSVG.svg");
- assertTypeByName("image/svg+xml", "x.svg");
- assertTypeByName("image/svg+xml", "x.SVG");
-
- // Should *.svgz be svg or gzip
- assertType("application/gzip", "testSVG.svgz");
- assertTypeByData("application/gzip", "testSVG.svgz");
- assertTypeByName("image/svg+xml", "x.svgz");
- assertTypeByName("image/svg+xml", "x.SVGZ");
- }
-
- @Test
- public void testPdfDetection() throws Exception {
- // PDF extension by name is enough
- assertTypeByName("application/pdf", "x.pdf");
- assertTypeByName("application/pdf", "x.PDF");
-
- // For normal PDFs, can get by name or data or both
- assertType("application/pdf", "testPDF.pdf");
- assertTypeByData("application/pdf", "testPDF.pdf");
-
- // PDF with a BoM works both ways too
- assertType("application/pdf", "testPDF_bom.pdf");
- assertTypeByData("application/pdf", "testPDF_bom.pdf");
- }
-
- @Test
- public void testSwfDetection() throws Exception {
- assertTypeByName("application/x-shockwave-flash", "x.swf");
- assertTypeByName("application/x-shockwave-flash", "x.SWF");
- assertTypeByName("application/x-shockwave-flash", "test1.swf");
- assertTypeByName("application/x-shockwave-flash", "test2.swf");
- assertTypeByName("application/x-shockwave-flash", "test3.swf");
- }
-
- @Test
- public void testDwgDetection() throws Exception {
- assertTypeByName("image/vnd.dwg", "x.dwg");
- assertTypeByData("image/vnd.dwg", "testDWG2004.dwg");
- assertTypeByData("image/vnd.dwg", "testDWG2007.dwg");
- assertTypeByData("image/vnd.dwg", "testDWG2010.dwg");
- }
-
- @Test
- public void testprtDetection() throws Exception {
- assertTypeByName("application/x-prt", "x.prt");
- assertTypeByData("application/x-prt", "testCADKEY.prt");
- }
-
- /**
- * Formats which are based on plain text
- */
- @Test
- public void testTextBasedFormatsDetection() throws Exception {
- assertTypeByName("text/plain", "testTXT.txt");
- assertType( "text/plain", "testTXT.txt");
-
- assertTypeByName("text/css", "testCSS.css");
- assertType( "text/css", "testCSS.css");
-
- assertTypeByName("text/csv", "testCSV.csv");
- assertType( "text/csv", "testCSV.csv");
-
- assertTypeByName("text/html", "testHTML.html");
- assertType( "text/html", "testHTML.html");
-
- assertTypeByName("application/javascript", "testJS.js");
- assertType( "application/javascript", "testJS.js");
- }
-
- @Test
- public void testJavaDetection() throws Exception {
- // TODO Classloader doesn't seem to find the .class file in test-documents
- //assertTypeDetection("AutoDetectParser.class", "application/java-vm");
-
- // OSX Native Extension
- assertTypeDetection("testJNILIB.jnilib", "application/x-java-jnilib");
- }
-
- @Test
- public void testXmlAndHtmlDetection() throws Exception {
- assertTypeByData("application/xml", "<?xml version=\"1.0\" encoding=\"UTF-8\"?><records><record/></records>"
- .getBytes(UTF_8));
- assertTypeByData("application/xml", "\uFEFF<?xml version=\"1.0\" encoding=\"UTF-16\"?><records><record/></records>"
- .getBytes(UTF_16LE));
- assertTypeByData("application/xml", "\uFEFF<?xml version=\"1.0\" encoding=\"UTF-16\"?><records><record/></records>"
- .getBytes(UTF_16BE));
- assertTypeByData("application/xml", "<!-- XML without processing instructions --><records><record/></records>"
- .getBytes(UTF_8));
- assertTypeByData("text/html", "<html><body>HTML</body></html>"
- .getBytes(UTF_8));
- assertTypeByData("text/html", "<!-- HTML comment --><html><body>HTML</body></html>"
- .getBytes(UTF_8));
- }
-
- @Test
- public void testWmfDetection() throws Exception {
- assertTypeByName("application/x-msmetafile", "x.wmf");
- assertTypeByData("application/x-msmetafile", "testWMF.wmf");
- assertTypeByName("application/x-msmetafile", "x.WMF");
-
- assertTypeByName("application/x-emf", "x.emf");
- assertTypeByData("application/x-emf","testEMF.emf");
- assertTypeByName("application/x-emf", "x.EMF");
- // TODO: Need a test wmz file
- assertTypeByName("application/x-ms-wmz", "x.wmz");
- assertTypeByName("application/x-ms-wmz", "x.WMZ");
- // TODO: Need a test emz file
- assertTypeByName("application/gzip", "x.emz");
- assertTypeByName("application/gzip", "x.EMZ");
- }
-
- @Test
- public void testPsDetection() throws Exception {
- // TODO: Need a test postscript file
- assertTypeByName("application/postscript", "x.ps");
- assertTypeByName("application/postscript", "x.PS");
- assertTypeByName("application/postscript", "x.eps");
- assertTypeByName("application/postscript", "x.epsf");
- assertTypeByName("application/postscript", "x.epsi");
- }
-
- @Test
- public void testMicrosoftMultiMediaDetection() throws Exception {
- assertTypeByName("video/x-ms-asf", "x.asf");
- assertTypeByName("video/x-ms-wmv", "x.wmv");
- assertTypeByName("audio/x-ms-wma", "x.wma");
-
- assertTypeByData("video/x-ms-asf", "testASF.asf");
- assertTypeByData("video/x-ms-wmv", "testWMV.wmv");
- assertTypeByData("audio/x-ms-wma", "testWMA.wma");
- }
-
- /**
- * All 3 DITA types are in theory handled by the same mimetype,
- * but we specialise them
- */
- @Test
- public void testDITADetection() throws Exception {
- assertTypeByName("application/dita+xml; format=topic", "test.dita");
- assertTypeByName("application/dita+xml; format=map", "test.ditamap");
- assertTypeByName("application/dita+xml; format=val", "test.ditaval");
-
- assertTypeByData("application/dita+xml; format=task", "testDITA.dita");
- assertTypeByData("application/dita+xml; format=concept", "testDITA2.dita");
- assertTypeByData("application/dita+xml; format=map", "testDITA.ditamap");
-
- assertTypeByNameAndData("application/dita+xml; format=task", "testDITA.dita");
- assertTypeByNameAndData("application/dita+xml; format=concept", "testDITA2.dita");
- assertTypeByNameAndData("application/dita+xml; format=map", "testDITA.ditamap");
-
- // These are all children of the official type
- assertEquals("application/dita+xml",
- repo.getMediaTypeRegistry().getSupertype(getTypeByNameAndData("testDITA.ditamap")).toString());
- assertEquals("application/dita+xml",
- repo.getMediaTypeRegistry().getSupertype(getTypeByNameAndData("testDITA.dita")).toString());
- // Concept inherits from topic
- assertEquals("application/dita+xml; format=topic",
- repo.getMediaTypeRegistry().getSupertype(getTypeByNameAndData("testDITA2.dita")).toString());
- }
-
- /**
- * @since TIKA-194
- */
- @Test
- public void testJavaRegex() throws Exception{
- MimeType testType = new MimeType(MediaType.parse("foo/bar"));
- this.repo.add(testType);
- assertNotNull(repo.forName("foo/bar"));
- String pattern = "rtg_sst_grb_0\\.5\\.\\d{8}";
- this.repo.addPattern(testType, pattern, true);
- String testFileName = "rtg_sst_grb_0.5.12345678";
- assertEquals("foo/bar", tika.detect(testFileName));
-
- MimeType testType2 = new MimeType(MediaType.parse("foo/bar2"));
- this.repo.add(testType2);
- assertNotNull(repo.forName("foo/bar2"));
- this.repo.addPattern(testType2, pattern, false);
- assertNotSame("foo/bar2", tika.detect(testFileName));
- }
-
- @Test
- public void testRawDetection() throws Exception {
- assertTypeByName("image/x-raw-adobe", "x.dng");
- assertTypeByName("image/x-raw-adobe", "x.DNG");
- assertTypeByName("image/x-raw-hasselblad", "x.3fr");
- assertTypeByName("image/x-raw-fuji", "x.raf");
- assertTypeByName("image/x-raw-canon", "x.crw");
- assertTypeByName("image/x-raw-canon", "x.cr2");
- assertTypeByName("image/x-raw-kodak", "x.k25");
- assertTypeByName("image/x-raw-kodak", "x.kdc");
- assertTypeByName("image/x-raw-kodak", "x.dcs");
- assertTypeByName("image/x-raw-kodak", "x.drf");
- assertTypeByName("image/x-raw-minolta", "x.mrw");
- assertTypeByName("image/x-raw-nikon", "x.nef");
- assertTypeByName("image/x-raw-nikon", "x.nrw");
- assertTypeByName("image/x-raw-olympus", "x.orf");
- assertTypeByName("image/x-raw-pentax", "x.ptx");
- assertTypeByName("image/x-raw-pentax", "x.pef");
- assertTypeByName("image/x-raw-sony", "x.arw");
- assertTypeByName("image/x-raw-sony", "x.srf");
- assertTypeByName("image/x-raw-sony", "x.sr2");
- assertTypeByName("image/x-raw-sigma", "x.x3f");
- assertTypeByName("image/x-raw-epson", "x.erf");
- assertTypeByName("image/x-raw-mamiya", "x.mef");
- assertTypeByName("image/x-raw-leaf", "x.mos");
- assertTypeByName("image/x-raw-panasonic", "x.raw");
- assertTypeByName("image/x-raw-panasonic", "x.rw2");
- assertTypeByName("image/x-raw-phaseone", "x.iiq");
- assertTypeByName("image/x-raw-red", "x.r3d");
- assertTypeByName("image/x-raw-imacon", "x.fff");
- assertTypeByName("image/x-raw-logitech", "x.pxn");
- assertTypeByName("image/x-raw-casio", "x.bay");
- assertTypeByName("image/x-raw-rawzor", "x.rwz");
- }
-
- /**
- * Tests that we correctly detect the font types
- */
- @Test
- public void testFontDetection() throws Exception {
- assertTypeByName("application/x-font-adobe-metric", "x.afm");
- assertTypeByData("application/x-font-adobe-metric", "testAFM.afm");
-
- assertTypeByName("application/x-font-printer-metric", "x.pfm");
- // TODO Get a sample .pfm file
- assertTypeByData(
- "application/x-font-printer-metric",
- new byte[] {0x00, 0x01, 256-0xb1, 0x0a, 0x00, 0x00, 0x43, 0x6f,
- 0x70, 0x79, 0x72, 0x69, 0x67, 0x68, 0x74, 0x20}
- );
-
- assertTypeByName("application/x-font-type1", "x.pfa");
- // TODO Get a sample .pfa file
- assertTypeByData(
- "application/x-font-type1",
- new byte[] {0x25, 0x21, 0x50, 0x53, 0x2d, 0x41, 0x64, 0x6f,
- 0x62, 0x65, 0x46, 0x6f, 0x6e, 0x74, 0x2d, 0x31,
- 0x2e, 0x30, 0x20, 0x20, 0x2d, 0x2a, 0x2d, 0x20}
- );
-
- assertTypeByName("application/x-font-type1", "x.pfb");
- // TODO Get a sample .pfm file
- assertTypeByData(
- "application/x-font-type1",
- new byte[] {-0x80, 0x01, 0x09, 0x05, 0x00, 0x00, 0x25, 0x21,
- 0x50, 0x53, 0x2d, 0x41, 0x64, 0x6f, 0x62, 0x65,
- 0x46, 0x6f, 0x6e, 0x74, 0x2d, 0x31, 0x2e, 0x30 }
- );
- }
-
- /**
- * Tests MimeTypes.getMimeType(URL), which examines both the byte header
- * and, if necessary, the URL's extension.
- */
- @Test
- public void testMimeDeterminationForTestDocuments() throws Exception {
- assertType("text/html", "testHTML.html");
- assertType("application/zip", "test-documents.zip");
-
- assertType("text/html", "testHTML_utf8.html");
- assertType(
- "application/vnd.oasis.opendocument.text",
- "testOpenOffice2.odt");
- assertType("application/pdf", "testPDF.pdf");
- assertType("application/rtf", "testRTF.rtf");
- assertType("text/plain", "testTXT.txt");
- assertType("application/xml", "testXML.xml");
- assertType("audio/basic", "testAU.au");
- assertType("audio/x-aiff", "testAIFF.aif");
- assertType("audio/x-wav", "testWAV.wav");
- assertType("audio/midi", "testMID.mid");
- assertType("application/x-msaccess", "testACCESS.mdb");
- assertType("application/x-font-ttf", "testTrueType3.ttf");
- }
-
- @Test
- public void test7ZipDetection() throws Exception {
- assertTypeByName("application/x-7z-compressed","test-documents.7z");
- assertTypeByData("application/x-7z-compressed","test-documents.7z");
- assertTypeByNameAndData("application/x-7z-compressed", "test-documents.7z");
- }
-
- @Test
- public void testWebArchiveDetection() throws Exception {
- assertTypeByName("application/x-webarchive","x.webarchive");
- assertTypeByData("application/x-bplist","testWEBARCHIVE.webarchive");
- assertTypeByNameAndData("application/x-webarchive", "testWEBARCHIVE.webarchive");
- }
-
- /**
- * KML, and KMZ (zipped KML)
- */
- @Test
- public void testKMLZDetection() throws Exception {
- assertTypeByName("application/vnd.google-earth.kml+xml","testKML.kml");
- assertTypeByData("application/vnd.google-earth.kml+xml","testKML.kml");
- assertTypeByNameAndData("application/vnd.google-earth.kml+xml", "testKML.kml");
-
- assertTypeByName("application/vnd.google-earth.kmz","testKMZ.kmz");
- assertTypeByNameAndData("application/vnd.google-earth.kmz", "testKMZ.kmz");
-
- // By data only, mimetype magic only gets us to a .zip
- // We need to use the Zip Aware detector to get the full type
- assertTypeByData("application/zip","testKMZ.kmz");
- }
-
- @Test
- public void testCreativeSuite() throws IOException {
- assertTypeDetection("testINDD.indd", "application/x-adobe-indesign");
- assertTypeDetection("testPSD.psd", "image/vnd.adobe.photoshop");
- }
-
- @Test
- public void testAMR() throws IOException {
- // AMR matches on name, data or both
- assertTypeDetection("testAMR.amr", "audio/amr");
-
- // AMR-WB subtype shares extension, so needs data to identify
- assertTypeDetection("testAMR-WB.amr", "audio/amr", "audio/amr-wb", "audio/amr-wb");
-
- // Ditto for the AMR-WB+ subtype, which we don't have a sample file of yet
- //assertTypeDetection("testAMR-WB+.amr", "audio/amr", "audio/amr-wb+", "audio/amr-wb+");
- }
-
- @Test
- public void testEmail() throws IOException {
- // EMLX
- assertTypeDetection("testEMLX.emlx", "message/x-emlx");
-
- // Groupwise
- assertTypeDetection("testGroupWiseEml.eml", "message/rfc822");
-
- // Lotus
- assertTypeDetection("testLotusEml.eml", "message/rfc822");
-
- // Thunderbird - doesn't currently work by name
- assertTypeByNameAndData("message/rfc822", "testThunderbirdEml.eml");
- }
-
- @Test
- public void testAxCrypt() throws Exception {
- // test-TXT.txt encrypted with a key of "tika"
- assertTypeDetection("testTXT-tika.axx", "application/x-axcrypt");
- }
-
- @Test
- public void testWindowsEXE() throws Exception {
- assertTypeByName("application/x-msdownload", "x.dll");
- assertTypeByName("application/x-ms-installer", "x.msi");
- assertTypeByName("application/x-dosexec", "x.exe");
-
- assertTypeByData("application/x-msdownload; format=pe", "testTinyPE.exe");
- assertTypeByNameAndData("application/x-msdownload; format=pe", "testTinyPE.exe");
-
- // A jar file with part of a PE header, but not a full one
- // should still be detected as a zip or jar (without/with name)
- assertTypeByData("application/zip", "testJAR_with_PEHDR.jar");
- assertTypeByNameAndData("application/java-archive", "testJAR_with_PEHDR.jar");
- }
-
- @Test
- public void testMatroskaDetection() throws Exception {
- assertType("video/x-matroska", "testMKV.mkv");
- // TODO: Need custom detector data detection, see TIKA-1180
- assertTypeByData("application/x-matroska", "testMKV.mkv");
- assertTypeByNameAndData("video/x-matroska", "testMKV.mkv");
- assertTypeByName("video/x-matroska", "x.mkv");
- assertTypeByName("video/x-matroska", "x.MKV");
- assertTypeByName("audio/x-matroska", "x.mka");
- assertTypeByName("audio/x-matroska", "x.MKA");
- }
-
- @Test
- public void testWebMDetection() throws Exception {
- assertType("video/webm", "testWEBM.webm");
- // TODO: Need custom detector data detection, see TIKA-1180
- assertTypeByData("application/x-matroska", "testWEBM.webm");
- assertTypeByNameAndData("video/webm", "testWEBM.webm");
- assertTypeByName("video/webm", "x.webm");
- assertTypeByName("video/webm", "x.WEBM");
- }
-
- /** Test getMimeType(byte[]) */
- @Test
- public void testGetMimeType_byteArray() throws IOException {
- // Plain text detection
- assertText(new byte[] { (byte) 0xFF, (byte) 0xFE });
- assertText(new byte[] { (byte) 0xFF, (byte) 0xFE });
- assertText(new byte[] { (byte) 0xEF, (byte) 0xBB, (byte) 0xBF });
- assertText(new byte[] { 'a', 'b', 'c' });
- assertText(new byte[] { '\t', '\r', '\n', 0x0C, 0x1B });
- assertNotText(new byte[] { '\t', '\r', '\n', 0x0E, 0x1C });
- }
-
- @Test
- public void testBerkeleyDB() throws IOException {
- assertTypeByData(
- "application/x-berkeley-db; format=btree; version=2",
- "testBDB_btree_2.db");
- assertTypeByData(
- "application/x-berkeley-db; format=btree; version=3",
- "testBDB_btree_3.db");
- assertTypeByData(
- "application/x-berkeley-db; format=btree; version=4",
- "testBDB_btree_4.db");
- // V4 and V5 share the same btree format
- assertTypeByData(
- "application/x-berkeley-db; format=btree; version=4",
- "testBDB_btree_5.db");
-
- assertTypeByData(
- "application/x-berkeley-db; format=hash; version=2",
- "testBDB_hash_2.db");
- assertTypeByData(
- "application/x-berkeley-db; format=hash; version=3",
- "testBDB_hash_3.db");
- assertTypeByData(
- "application/x-berkeley-db; format=hash; version=4",
- "testBDB_hash_4.db");
- assertTypeByData(
- "application/x-berkeley-db; format=hash; version=5",
- "testBDB_hash_5.db");
- }
-
- /**
- * CBOR typically contains HTML
- */
- @Test
- public void testCBOR() throws IOException {
- assertTypeByNameAndData("application/cbor", "NUTCH-1997.cbor");
- assertTypeByData("application/cbor", "NUTCH-1997.cbor");
- }
-
- @Test
- public void testZLIB() throws IOException {
- // ZLIB encoded versions of testTXT.txt
- assertTypeByData("application/zlib", "testTXT.zlib");
- assertTypeByData("application/zlib", "testTXT.zlib0");
- assertTypeByData("application/zlib", "testTXT.zlib5");
- assertTypeByData("application/zlib", "testTXT.zlib9");
- }
-
- @Test
- public void testTextFormats() throws Exception {
- assertType("application/x-bibtex-text-file", "testBIBTEX.bib");
- assertTypeByData("application/x-bibtex-text-file", "testBIBTEX.bib");
- }
-
- @Test
- public void testCodeFormats() throws Exception {
- assertType("text/x-csrc", "testC.c");
- assertType("text/x-chdr", "testH.h");
- assertTypeByData("text/x-csrc", "testC.c");
- assertTypeByData("text/x-chdr", "testH.h");
-
- assertTypeByName("text/x-java-source", "testJAVA.java");
- assertType("text/x-java-properties", "testJAVAPROPS.properties");
-
- assertType("text/x-matlab", "testMATLAB.m");
- assertType("text/x-matlab", "testMATLAB_wtsgaus.m");
- assertType("text/x-matlab", "testMATLAB_barcast.m");
- assertTypeByData("text/x-matlab", "testMATLAB.m");
- assertTypeByData("text/x-matlab", "testMATLAB_wtsgaus.m");
- assertTypeByData("text/x-matlab", "testMATLAB_barcast.m");
- }
-
- @Test
- public void testWebVTT() throws Exception {
- assertType("text/vtt", "testWebVTT.vtt");
- assertTypeByData("text/vtt", "testWebVTT.vtt");
- }
-
- private void assertText(byte[] prefix) throws IOException {
- assertMagic("text/plain", prefix);
- }
-
- private void assertNotText(byte[] prefix) throws IOException {
- assertMagic("application/octet-stream", prefix);
- }
-
- private void assertMagic(String expected, byte[] prefix) throws IOException {
- MediaType type =
- repo.detect(new ByteArrayInputStream(prefix), new Metadata());
- assertNotNull(type);
- assertEquals(expected, type.toString());
- }
-
- private void assertType(String expected, String filename) throws Exception {
- try (InputStream stream = TestMimeTypes.class.getResourceAsStream(
- "/test-documents/" + filename)) {
- assertNotNull("Test file not found: " + filename, stream);
- Metadata metadata = new Metadata();
- metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
- assertEquals(expected, repo.detect(stream, metadata).toString());
- }
- }
-
- private void assertTypeByName(String expected, String filename)
- throws IOException {
- Metadata metadata = new Metadata();
- metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
- assertEquals(expected, repo.detect(null, metadata).toString());
- }
-
- private void assertTypeByData(String expected, String filename)
- throws IOException {
- try (InputStream stream = TestMimeTypes.class.getResourceAsStream(
- "/test-documents/" + filename)) {
- assertNotNull("Test file not found: " + filename, stream);
- Metadata metadata = new Metadata();
- assertEquals(expected, repo.detect(stream, metadata).toString());
- }
- }
-
- private void assertTypeByData(String expected, byte[] data)
- throws IOException {
- try (InputStream stream = new ByteArrayInputStream(data)) {
- Metadata metadata = new Metadata();
- assertEquals(expected, repo.detect(stream, metadata).toString());
- }
- }
-
- private void assertTypeDetection(String filename, String type)
- throws IOException {
- assertTypeDetection(filename, type, type, type);
- }
-
- private void assertTypeDetection(String filename, String byName, String byData,
- String byNameAndData) throws IOException {
- assertTypeByName(byName, filename);
- assertTypeByData(byData, filename);
- assertTypeByNameAndData(byNameAndData, filename);
- }
-
- private void assertTypeByNameAndData(String expected, String filename)
- throws IOException {
- assertEquals(expected, getTypeByNameAndData(filename).toString());
- }
-
- private MediaType getTypeByNameAndData(String filename) throws IOException {
- try (InputStream stream = TestMimeTypes.class.getResourceAsStream(
- "/test-documents/" + filename)) {
- assertNotNull("Test document not found: " + filename, stream);
- Metadata metadata = new Metadata();
- metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
- return repo.detect(stream, metadata);
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
deleted file mode 100644
index 91b054e..0000000
--- a/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
+++ /dev/null
@@ -1,459 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser;
-
-import static java.nio.charset.StandardCharsets.UTF_8;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.assertTrue;
-import static org.junit.Assert.fail;
-
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.HashSet;
-import java.util.Set;
-import java.util.zip.ZipEntry;
-import java.util.zip.ZipOutputStream;
-
-import org.apache.tika.config.TikaConfig;
-import org.apache.tika.detect.Detector;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.metadata.XMPDM;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.sax.BodyContentHandler;
-import org.gagravarr.tika.FlacParser;
-import org.gagravarr.tika.OpusParser;
-import org.gagravarr.tika.VorbisParser;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
-
-public class AutoDetectParserTest {
- private TikaConfig tika = TikaConfig.getDefaultConfig();
-
- // Easy to read constants for the MIME types:
- private static final String RAW = "application/octet-stream";
- private static final String EXCEL = "application/vnd.ms-excel";
- private static final String HTML = "text/html; charset=ISO-8859-1";
- private static final String PDF = "application/pdf";
- private static final String POWERPOINT = "application/vnd.ms-powerpoint";
- private static final String KEYNOTE = "application/vnd.apple.keynote";
- private static final String PAGES = "application/vnd.apple.pages";
- private static final String NUMBERS = "application/vnd.apple.numbers";
- private static final String CHM = "application/vnd.ms-htmlhelp";
- private static final String RTF = "application/rtf";
- private static final String PLAINTEXT = "text/plain; charset=ISO-8859-1";
- private static final String UTF8TEXT = "text/plain; charset=UTF-8";
- private static final String WORD = "application/msword";
- private static final String XML = "application/xml";
- private static final String RSS = "application/rss+xml";
- private static final String BMP = "image/x-ms-bmp";
- private static final String GIF = "image/gif";
- private static final String JPEG = "image/jpeg";
- private static final String PNG = "image/png";
- private static final String OGG_VORBIS = "audio/vorbis";
- private static final String OGG_OPUS = "audio/opus";
- private static final String OGG_FLAC = "audio/x-oggflac";
- private static final String FLAC_NATIVE= "audio/x-flac";
- private static final String OPENOFFICE
- = "application/vnd.oasis.opendocument.text";
-
-
- /**
- * This is where a single test is done.
- * @param tp the parameters encapsulated in a TestParams instance
- * @throws IOException
- */
- private void assertAutoDetect(TestParams tp) throws Exception {
- try (InputStream input = AutoDetectParserTest.class.getResourceAsStream(tp.resourceRealName)) {
- if (input == null) {
- fail("Could not open stream from specified resource: "
- + tp.resourceRealName);
- }
- Metadata metadata = new Metadata();
- metadata.set(Metadata.RESOURCE_NAME_KEY, tp.resourceStatedName);
- metadata.set(Metadata.CONTENT_TYPE, tp.statedType);
- ContentHandler handler = new BodyContentHandler();
- new AutoDetectParser(tika).parse(input, handler, metadata);
-
- assertEquals("Bad content type: " + tp,
- tp.realType, metadata.get(Metadata.CONTENT_TYPE));
-
- if (tp.expectedContentFragment != null) {
- assertTrue("Expected content not found: " + tp,
- handler.toString().contains(tp.expectedContentFragment));
- }
- }
- }
-
- /**
- * Convenience method -- its sole purpose of existence is to make the
- * call to it more readable than it would be if a TestParams instance
- * would need to be instantiated there.
- *
- * @param resourceRealName real name of resource
- * @param resourceStatedName stated name -- will a bad name fool us?
- * @param realType - the real MIME type
- * @param statedType - stated MIME type - will a wrong one fool us?
- * @param expectedContentFragment - something expected in the text
- * @throws Exception
- */
- private void assertAutoDetect(String resourceRealName,
- String resourceStatedName,
- String realType,
- String statedType,
- String expectedContentFragment)
- throws Exception {
-
- assertAutoDetect(new TestParams(resourceRealName, resourceStatedName,
- realType, statedType, expectedContentFragment));
- }
-
- private void assertAutoDetect(
- String resource, String type, String content) throws Exception {
-
- resource = "/test-documents/" + resource;
-
- // TODO !!!! The disabled tests below should work!
- // The correct MIME type should be determined regardless of the
- // stated type (ContentType hint) and the stated URL name.
-
-
- // Try different combinations of correct and incorrect arguments:
- final String wrongMimeType = RAW;
- assertAutoDetect(resource, resource, type, type, content);
- assertAutoDetect(resource, resource, type, null, content);
- assertAutoDetect(resource, resource, type, wrongMimeType, content);
-
- assertAutoDetect(resource, null, type, type, content);
- assertAutoDetect(resource, null, type, null, content);
- assertAutoDetect(resource, null, type, wrongMimeType, content);
-
- final String badResource = "a.xyz";
- assertAutoDetect(resource, badResource, type, type, content);
- assertAutoDetect(resource, badResource, type, null, content);
- assertAutoDetect(resource, badResource, type, wrongMimeType, content);
- }
-
- @Test
- public void testKeynote() throws Exception {
- assertAutoDetect("testKeynote.key", KEYNOTE, "A sample presentation");
- }
-
- @Test
- public void testPages() throws Exception {
- assertAutoDetect("testPages.pages", PAGES, "Sample pages document");
- }
-
- @Test
- public void testNumbers() throws Exception {
- assertAutoDetect("testNumbers.numbers", NUMBERS, "Checking Account: 300545668");
- }
-
- @Test
- public void testChm() throws Exception {
- assertAutoDetect("testChm.chm", CHM, "If you do not specify a window type or a window name, the main window is used.");
- }
-
- @Test
- public void testEpub() throws Exception {
- assertAutoDetect(
- "testEPUB.epub", "application/epub+zip",
- "The previous headings were subchapters");
- }
-
- @Test
- public void testExcel() throws Exception {
- assertAutoDetect("testEXCEL.xls", EXCEL, "Sample Excel Worksheet");
- }
-
- @Test
- public void testHTML() throws Exception {
- assertAutoDetect("testHTML.html", HTML, "Test Indexation Html");
- }
-
- @Test
- public void testOpenOffice() throws Exception {
- assertAutoDetect("testOpenOffice2.odt", OPENOFFICE,
- "This is a sample Open Office document");
- }
-
- @Test
- public void testPDF() throws Exception {
- assertAutoDetect("testPDF.pdf", PDF, "Content Analysis Toolkit");
-
- }
-
- @Test
- public void testPowerpoint() throws Exception {
- assertAutoDetect("testPPT.ppt", POWERPOINT, "Sample Powerpoint Slide");
- }
-
- @Test
- public void testRdfXml() throws Exception {
- assertAutoDetect("testRDF.rdf", "application/rdf+xml", "");
- }
-
- @Test
- public void testRTF() throws Exception {
- assertAutoDetect("testRTF.rtf", RTF, "indexation Word");
- }
-
- @Test
- public void testText() throws Exception {
- assertAutoDetect("testTXT.txt", PLAINTEXT, "indexation de Txt");
- }
-
- @Test
- public void testTextNonASCIIUTF8() throws Exception {
- assertAutoDetect("testTXTNonASCIIUTF8.txt", UTF8TEXT, "The quick brown fox jumps over the lazy dog");
- }
-
- @Test
- public void testWord() throws Exception {
- assertAutoDetect("testWORD.doc", WORD, "Sample Word Document");
- }
-
- @Test
- public void testXML() throws Exception {
- assertAutoDetect("testXML.xml", XML, "Lius");
- }
-
- @Test
- public void testRss() throws Exception {
- assertAutoDetect("/test-documents/rsstest.rss", "feed", RSS, "application/rss+xml", "Sample RSS File for Junit test");
- }
-
- @Test
- public void testImages() throws Exception {
- assertAutoDetect("testBMP.bmp", BMP, null);
- assertAutoDetect("testGIF.gif", GIF, null);
- assertAutoDetect("testJPEG.jpg", JPEG, null);
- assertAutoDetect("testPNG.png", PNG, null);
- }
-
- /**
- * Make sure that zip bomb attacks are prevented.
- *
- * @see <a href="https://issues.apache.org/jira/browse/TIKA-216">TIKA-216</a>
- */
- @Test
- public void testZipBombPrevention() throws Exception {
- try (InputStream tgz = AutoDetectParserTest.class.getResourceAsStream(
- "/test-documents/TIKA-216.tgz")) {
- Metadata metadata = new Metadata();
- ContentHandler handler = new BodyContentHandler(-1);
- new AutoDetectParser(tika).parse(tgz, handler, metadata);
- fail("Zip bomb was not detected");
- } catch (TikaException e) {
- // expected
- }
- }
-
- /**
- * Make sure XML parse errors don't trigger ZIP bomb detection.
- *
- * @see <a href="https://issues.apache.org/jira/browse/TIKA-1322">TIKA-1322</a>
- */
- @Test
- public void testNoBombDetectedForInvalidXml() throws Exception {
- // create zip with ten empty / invalid XML files, 1.xml .. 10.xml
- ByteArrayOutputStream baos = new ByteArrayOutputStream();
- ZipOutputStream zos = new ZipOutputStream(baos);
- for (int i = 1; i <= 10; i++) {
- zos.putNextEntry(new ZipEntry(i + ".xml"));
- zos.closeEntry();
- }
- zos.finish();
- zos.close();
- new AutoDetectParser(tika).parse(new ByteArrayInputStream(baos.toByteArray()), new BodyContentHandler(-1),
- new Metadata());
- }
-
- /**
- * Test to ensure that the Ogg Audio parsers (Vorbis, Opus, Flac etc)
- * have been correctly included, and are available
- */
- @SuppressWarnings("deprecation")
- @Test
- public void testOggFlacAudio() throws Exception {
- // The three test files should all have similar test data
- String[] testFiles = new String[] {
- "testVORBIS.ogg", "testFLAC.flac", "testFLAC.oga",
- "testOPUS.opus"
- };
- MediaType[] mediaTypes = new MediaType[] {
- MediaType.parse(OGG_VORBIS), MediaType.parse(FLAC_NATIVE),
- MediaType.parse(OGG_FLAC), MediaType.parse(OGG_OPUS)
- };
-
- // Check we can load the parsers, and they claim to do the right things
- VorbisParser vParser = new VorbisParser();
- assertNotNull("Parser not found for " + mediaTypes[0],
- vParser.getSupportedTypes(new ParseContext()));
-
- FlacParser fParser = new FlacParser();
- assertNotNull("Parser not found for " + mediaTypes[1],
- fParser.getSupportedTypes(new ParseContext()));
- assertNotNull("Parser not found for " + mediaTypes[2],
- fParser.getSupportedTypes(new ParseContext()));
-
- OpusParser oParser = new OpusParser();
- assertNotNull("Parser not found for " + mediaTypes[3],
- oParser.getSupportedTypes(new ParseContext()));
-
- // Check we found the parser
- CompositeParser parser = (CompositeParser)tika.getParser();
- for (MediaType mt : mediaTypes) {
- assertNotNull("Parser not found for " + mt, parser.getParsers().get(mt) );
- }
-
- // Have each file parsed, and check
- for (int i=0; i<testFiles.length; i++) {
- String file = testFiles[i];
- try (InputStream input = AutoDetectParserTest.class.getResourceAsStream(
- "/test-documents/" + file)) {
- if (input == null) {
- fail("Could not find test file " + file);
- }
- Metadata metadata = new Metadata();
- ContentHandler handler = new BodyContentHandler();
- new AutoDetectParser(tika).parse(input, handler, metadata);
-
- assertEquals("Incorrect content type for " + file,
- mediaTypes[i].toString(), metadata.get(Metadata.CONTENT_TYPE));
-
- // Check some of the common metadata
- // Old style metadata
- assertEquals("Test Artist", metadata.get(Metadata.AUTHOR));
- assertEquals("Test Title", metadata.get(Metadata.TITLE));
- // New style metadata
- assertEquals("Test Artist", metadata.get(TikaCoreProperties.CREATOR));
- assertEquals("Test Title", metadata.get(TikaCoreProperties.TITLE));
-
- // Check some of the XMPDM metadata
- if (!file.endsWith(".opus")) {
- assertEquals("Test Album", metadata.get(XMPDM.ALBUM));
- }
- assertEquals("Test Artist", metadata.get(XMPDM.ARTIST));
- assertEquals("Stereo", metadata.get(XMPDM.AUDIO_CHANNEL_TYPE));
- assertEquals("44100", metadata.get(XMPDM.AUDIO_SAMPLE_RATE));
-
- // Check some of the text
- String content = handler.toString();
- assertTrue(content.contains("Test Title"));
- assertTrue(content.contains("Test Artist"));
- }
- }
- }
-
- /**
- * Test case for TIKA-514. Provide constructor for AutoDetectParser that has explicit
- * list of supported parsers.
- * @see <a href="https://issues.apache.org/jira/browse/TIKA-514">TIKA-514</a>
- */
- @Test
- public void testSpecificParserList() throws Exception {
- AutoDetectParser parser = new AutoDetectParser(new MyDetector(), new MyParser());
-
- InputStream is = new ByteArrayInputStream("test".getBytes(UTF_8));
- Metadata metadata = new Metadata();
- parser.parse(is, new BodyContentHandler(), metadata, new ParseContext());
-
- assertEquals("value", metadata.get("MyParser"));
- }
-
- private static final MediaType MY_MEDIA_TYPE = new MediaType("application", "x-myparser");
-
- /**
- * A test detector which always returns the type supported
- * by the test parser
- */
- @SuppressWarnings("serial")
- private static class MyDetector implements Detector {
- public MediaType detect(InputStream input, Metadata metadata) throws IOException {
- return MY_MEDIA_TYPE;
- }
- }
-
- @SuppressWarnings("serial")
- private static class MyParser extends AbstractParser {
- public Set<MediaType> getSupportedTypes(ParseContext context) {
- Set<MediaType> supportedTypes = new HashSet<MediaType>();
- supportedTypes.add(MY_MEDIA_TYPE);
- return supportedTypes;
- }
-
- public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) {
- metadata.add("MyParser", "value");
- }
-
- }
-
- /**
- * Minimal class to encapsulate all parameters -- the main reason for
- * its existence is to aid in debugging via its toString() method.
- *
- * Getters and setters intentionally not provided.
- */
- private static class TestParams {
-
- public String resourceRealName;
- public String resourceStatedName;
- public String realType;
- public String statedType;
- public String expectedContentFragment;
-
-
- private TestParams(String resourceRealName,
- String resourceStatedName,
- String realType,
- String statedType,
- String expectedContentFragment) {
- this.resourceRealName = resourceRealName;
- this.resourceStatedName = resourceStatedName;
- this.realType = realType;
- this.statedType = statedType;
- this.expectedContentFragment = expectedContentFragment;
- }
-
-
- /**
- * Produces a string like the following:
- *
- * <pre>
- * Test parameters:
- * resourceRealName = /test-documents/testEXCEL.xls
- * resourceStatedName = null
- * realType = application/vnd.ms-excel
- * statedType = null
- * expectedContentFragment = Sample Excel Worksheet
- * </pre>
- */
- public String toString() {
- return "Test parameters:\n"
- + " resourceRealName = " + resourceRealName + "\n"
- + " resourceStatedName = " + resourceStatedName + "\n"
- + " realType = " + realType + "\n"
- + " statedType = " + statedType + "\n"
- + " expectedContentFragment = " + expectedContentFragment + "\n";
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parsers/src/test/java/org/apache/tika/parser/DigestingParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/DigestingParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/DigestingParserTest.java
deleted file mode 100644
index 68edfc2..0000000
--- a/tika-parsers/src/test/java/org/apache/tika/parser/DigestingParserTest.java
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNull;
-import static org.junit.Assert.assertTrue;
-
-import java.io.InputStream;
-import java.util.HashMap;
-import java.util.Map;
-
-import org.apache.tika.TikaTest;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.utils.CommonsDigester;
-import org.junit.Test;
-
-
-public class DigestingParserTest extends TikaTest {
-
- private final static String P = TikaCoreProperties.TIKA_META_PREFIX+
- "digest"+Metadata.NAMESPACE_PREFIX_DELIMITER;
-
- private final int UNLIMITED = 1000000;//well, not really, but longer than input file
- private final Parser p = new AutoDetectParser();
-
- @Test
- public void testBasic() throws Exception {
- Map<CommonsDigester.DigestAlgorithm, String> expected =
- new HashMap<CommonsDigester.DigestAlgorithm, String>();
-
- expected.put(CommonsDigester.DigestAlgorithm.MD2,"d768c8e27b0b52c6eaabfaa7122d1d4f");
- expected.put(CommonsDigester.DigestAlgorithm.MD5,"59f626e09a8c16ab6dbc2800c685f772");
- expected.put(CommonsDigester.DigestAlgorithm.SHA1,"7a1f001d163ac90d8ea54c050faf5a38079788a6");
- expected.put(CommonsDigester.DigestAlgorithm.SHA256,"c4b7fab030a8b6a9d6691f6699ac8e6f" +
- "82bc53764a0f1430d134ae3b70c32654");
- expected.put(CommonsDigester.DigestAlgorithm.SHA384,"ebe368b9326fef44408290724d187553"+
- "8b8a6923fdf251ddab72c6e4b5d54160" +
- "9db917ba4260d1767995a844d8d654df");
- expected.put(CommonsDigester.DigestAlgorithm.SHA512,"ee46d973ee1852c018580c242955974d"+
- "da4c21f36b54d7acd06fcf68e974663b"+
- "fed1d256875be58d22beacf178154cc3"+
- "a1178cb73443deaa53aa0840324708bb");
-
- //test each one
- for (CommonsDigester.DigestAlgorithm algo : CommonsDigester.DigestAlgorithm.values()) {
- Metadata m = new Metadata();
- XMLResult xml = getXML("test_recursive_embedded.docx",
- new DigestingParser(p, new CommonsDigester(UNLIMITED, algo)), m);
- assertEquals(algo.toString(), expected.get(algo), m.get(P + algo.toString()));
- }
-
-
- //test comma separated
- CommonsDigester.DigestAlgorithm[] algos = CommonsDigester.parse("md5,sha256,sha384,sha512");
- Metadata m = new Metadata();
- XMLResult xml = getXML("test_recursive_embedded.docx",
- new DigestingParser(p, new CommonsDigester(UNLIMITED, algos)), m);
- for (CommonsDigester.DigestAlgorithm algo : new CommonsDigester.DigestAlgorithm[]{
- CommonsDigester.DigestAlgorithm.MD5,
- CommonsDigester.DigestAlgorithm.SHA256,
- CommonsDigester.DigestAlgorithm.SHA384,
- CommonsDigester.DigestAlgorithm.SHA512}) {
- assertEquals(algo.toString(), expected.get(algo), m.get(P + algo.toString()));
- }
-
- assertNull(m.get(P+CommonsDigester.DigestAlgorithm.MD2.toString()));
- assertNull(m.get(P+CommonsDigester.DigestAlgorithm.SHA1.toString()));
-
- }
-
- @Test
- public void testLimitedRead() throws Exception {
- CommonsDigester.DigestAlgorithm algo = CommonsDigester.DigestAlgorithm.MD5;
- int limit = 100;
- byte[] bytes = new byte[limit];
- InputStream is = getResourceAsStream("/test-documents/test_recursive_embedded.docx");
- is.read(bytes, 0, limit);
- is.close();
- Metadata m = new Metadata();
- try {
- XMLResult xml = getXML(TikaInputStream.get(bytes),
- new DigestingParser(p, new CommonsDigester(100, algo)), m);
- } catch (TikaException e) {
- //thrown because this is just a file fragment
- assertContains("Unexpected RuntimeException from org.apache.tika.parser.microsoft.ooxml.OOXMLParser",
- e.getMessage());
- }
- String expectedMD5 = m.get(P+"MD5");
-
- m = new Metadata();
- XMLResult xml = getXML("test_recursive_embedded.docx",
- new DigestingParser(p, new CommonsDigester(100, algo)), m);
- assertEquals(expectedMD5, m.get(P+"MD5"));
- }
-
- @Test
- public void testReset() throws Exception {
- String expectedMD5 = "1643c2cef21e36720c54f4f6cb3349d0";
- Metadata m = new Metadata();
- XMLResult xml = getXML("test_recursive_embedded.docx",
- new DigestingParser(p, new CommonsDigester(100, CommonsDigester.DigestAlgorithm.MD5)), m);
- assertEquals(expectedMD5, m.get(P+"MD5"));
- }
-
- @Test
- public void testNegativeMaxMarkLength() throws Exception {
- Metadata m = new Metadata();
- boolean ex = false;
- try {
- XMLResult xml = getXML("test_recursive_embedded.docx",
- new DigestingParser(p, new CommonsDigester(-1, CommonsDigester.DigestAlgorithm.MD5)), m);
- } catch (IllegalArgumentException e) {
- ex = true;
- }
- assertTrue("Exception not thrown", ex);
- }
-
-}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parsers/src/test/java/org/apache/tika/parser/ParsingReaderTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/ParsingReaderTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/ParsingReaderTest.java
deleted file mode 100644
index 2fcd1c3..0000000
--- a/tika-parsers/src/test/java/org/apache/tika/parser/ParsingReaderTest.java
+++ /dev/null
@@ -1,104 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser;
-
-import java.io.ByteArrayInputStream;
-import java.io.InputStream;
-import java.io.Reader;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.junit.Test;
-
-import static java.nio.charset.StandardCharsets.UTF_8;
-import static org.junit.Assert.assertEquals;
-
-public class ParsingReaderTest {
-
- @Test
- public void testPlainText() throws Exception {
- String data = "test content";
- InputStream stream = new ByteArrayInputStream(data.getBytes(UTF_8));
- Reader reader = new ParsingReader(stream, "test.txt");
- assertEquals('t', reader.read());
- assertEquals('e', reader.read());
- assertEquals('s', reader.read());
- assertEquals('t', reader.read());
- assertEquals(' ', reader.read());
- assertEquals('c', reader.read());
- assertEquals('o', reader.read());
- assertEquals('n', reader.read());
- assertEquals('t', reader.read());
- assertEquals('e', reader.read());
- assertEquals('n', reader.read());
- assertEquals('t', reader.read());
- assertEquals('\n', reader.read());
- assertEquals(-1, reader.read());
- reader.close();
- assertEquals(-1, stream.read());
- }
-
- @Test
- public void testXML() throws Exception {
- String data = "<p>test <span>content</span></p>";
- InputStream stream = new ByteArrayInputStream(data.getBytes(UTF_8));
- Reader reader = new ParsingReader(stream, "test.xml");
- assertEquals(' ', (char) reader.read());
- assertEquals('t', (char) reader.read());
- assertEquals('e', (char) reader.read());
- assertEquals('s', (char) reader.read());
- assertEquals('t', (char) reader.read());
- assertEquals(' ', (char) reader.read());
- assertEquals(' ', (char) reader.read());
- assertEquals('c', (char) reader.read());
- assertEquals('o', (char) reader.read());
- assertEquals('n', (char) reader.read());
- assertEquals('t', (char) reader.read());
- assertEquals('e', (char) reader.read());
- assertEquals('n', (char) reader.read());
- assertEquals('t', (char) reader.read());
- assertEquals('\n', (char) reader.read());
- assertEquals(-1, reader.read());
- reader.close();
- assertEquals(-1, stream.read());
- }
-
- /**
- * Test case for TIKA-203
- *
- * @see <a href="https://issues.apache.org/jira/browse/TIKA-203">TIKA-203</a>
- */
- @Test
- public void testMetadata() throws Exception {
- Metadata metadata = new Metadata();
- InputStream stream = ParsingReaderTest.class.getResourceAsStream(
- "/test-documents/testEXCEL.xls");
- try (Reader reader = new ParsingReader(
- new AutoDetectParser(), stream, metadata, new ParseContext())) {
- // Metadata should already be available
- assertEquals("Simple Excel document", metadata.get(TikaCoreProperties.TITLE));
- // Check that the internal buffering isn't broken
- assertEquals('F', (char) reader.read());
- assertEquals('e', (char) reader.read());
- assertEquals('u', (char) reader.read());
- assertEquals('i', (char) reader.read());
- assertEquals('l', (char) reader.read());
- assertEquals('1', (char) reader.read());
- }
- }
-
-}
[06/13] tika git commit: TIKA-1855 -- first pass. Need to turn back
on the forbidden-apis testCheck. More clean up remains.
Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parsers/src/main/appended-resources/META-INF/LICENSE
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/appended-resources/META-INF/LICENSE b/tika-parsers/src/main/appended-resources/META-INF/LICENSE
deleted file mode 100644
index bd54624..0000000
--- a/tika-parsers/src/main/appended-resources/META-INF/LICENSE
+++ /dev/null
@@ -1,94 +0,0 @@
-APACHE TIKA SUBCOMPONENTS
-
-Apache Tika includes a number of subcomponents with separate copyright notices
-and license terms. Your use of these subcomponents is subject to the terms and
-conditions of the following licenses.
-
-Charset detection code from ICU4J (http://site.icu-project.org/)
-
- Copyright (c) 1995-2009 International Business Machines Corporation
- and others
-
- All rights reserved.
-
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, and/or sell copies of the Software, and to permit persons
- to whom the Software is furnished to do so, provided that the above
- copyright notice(s) and this permission notice appear in all copies
- of the Software and that both the above copyright notice(s) and this
- permission notice appear in supporting documentation.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
- IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE
- BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES,
- OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
- WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
- ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
- SOFTWARE.
-
- Except as contained in this notice, the name of a copyright holder shall
- not be used in advertising or otherwise to promote the sale, use or other
- dealings in this Software without prior written authorization of the
- copyright holder.
-
-
-JUnRAR (https://github.com/edmund-wagner/junrar/)
-
- JUnRAR is based on the UnRAR tool, and covered by the same license
- It was formerly available from http://java-unrar.svn.sourceforge.net/
-
- ****** ***** ****** UnRAR - free utility for RAR archives
- ** ** ** ** ** ** ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- ****** ******* ****** License for use and distribution of
- ** ** ** ** ** ** ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- ** ** ** ** ** ** FREE portable version
- ~~~~~~~~~~~~~~~~~~~~~
-
- The source code of UnRAR utility is freeware. This means:
-
- 1. All copyrights to RAR and the utility UnRAR are exclusively
- owned by the author - Alexander Roshal.
-
- 2. The UnRAR sources may be used in any software to handle RAR
- archives without limitations free of charge, but cannot be used
- to re-create the RAR compression algorithm, which is proprietary.
- Distribution of modified UnRAR sources in separate form or as a
- part of other software is permitted, provided that it is clearly
- stated in the documentation and source comments that the code may
- not be used to develop a RAR (WinRAR) compatible archiver.
-
- 3. The UnRAR utility may be freely distributed. It is allowed
- to distribute UnRAR inside of other software packages.
-
- 4. THE RAR ARCHIVER AND THE UnRAR UTILITY ARE DISTRIBUTED "AS IS".
- NO WARRANTY OF ANY KIND IS EXPRESSED OR IMPLIED. YOU USE AT
- YOUR OWN RISK. THE AUTHOR WILL NOT BE LIABLE FOR DATA LOSS,
- DAMAGES, LOSS OF PROFITS OR ANY OTHER KIND OF LOSS WHILE USING
- OR MISUSING THIS SOFTWARE.
-
- 5. Installing and using the UnRAR utility signifies acceptance of
- these terms and conditions of the license.
-
- 6. If you don't agree with terms of the license you must remove
- UnRAR files from your storage devices and cease to use the
- utility.
-
- Thank you for your interest in RAR and UnRAR. Alexander L. Roshal
-
-Sqlite (included in the "provided" org.xerial's sqlite-jdbc)
- Sqlite is in the Public Domain. For details
- see: https://www.sqlite.org/copyright.html
-
-Two photos in test-documents (testWebp_Alpha_Lossy.webp and testWebp_Alpha_Lossless.webp)
- are in the public domain. These files were retrieved from:
- https://github.com/drewnoakes/metadata-extractor-images/tree/master/webp
- These photos are also available here:
- https://developers.google.com/speed/webp/gallery2#webp_links
- Credits for the photo:
- "Free Stock Photo in High Resolution - Yellow Rose 3 - Flowers"
- Image Author: Jon Sullivan
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parsers/src/main/java/org/apache/tika/parser/internal/Activator.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/internal/Activator.java b/tika-parsers/src/main/java/org/apache/tika/parser/internal/Activator.java
deleted file mode 100644
index a884d3a..0000000
--- a/tika-parsers/src/main/java/org/apache/tika/parser/internal/Activator.java
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.internal;
-
-import java.util.Properties;
-
-import org.apache.tika.detect.DefaultDetector;
-import org.apache.tika.detect.Detector;
-import org.apache.tika.parser.DefaultParser;
-import org.apache.tika.parser.Parser;
-import org.osgi.framework.BundleActivator;
-import org.osgi.framework.BundleContext;
-import org.osgi.framework.ServiceRegistration;
-
-public class Activator implements BundleActivator {
-
- private ServiceRegistration detectorService;
-
- private ServiceRegistration parserService;
-
- @Override
- public void start(BundleContext context) throws Exception {
- detectorService = context.registerService(
- Detector.class.getName(),
- new DefaultDetector(Activator.class.getClassLoader()),
- new Properties());
- Parser parser = new DefaultParser(Activator.class.getClassLoader());
- parserService = context.registerService(
- Parser.class.getName(),
- parser,
- new Properties());
- }
-
- @Override
- public void stop(BundleContext context) throws Exception {
- parserService.unregister();
- detectorService.unregister();
- }
-
-}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parsers/src/main/java/org/apache/tika/parser/utils/CommonsDigester.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/utils/CommonsDigester.java b/tika-parsers/src/main/java/org/apache/tika/parser/utils/CommonsDigester.java
deleted file mode 100644
index a064156..0000000
--- a/tika-parsers/src/main/java/org/apache/tika/parser/utils/CommonsDigester.java
+++ /dev/null
@@ -1,299 +0,0 @@
-package org.apache.tika.parser.utils;
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
-import java.util.Locale;
-
-import org.apache.commons.codec.digest.DigestUtils;
-import org.apache.commons.io.IOUtils;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.DigestingParser;
-import org.apache.tika.parser.ParseContext;
-
-/**
- * Implementation of {@link org.apache.tika.parser.DigestingParser.Digester}
- * that relies on commons.codec.digest.DigestUtils to calculate digest hashes.
- * <p>
- * This digester tries to use the regular mark/reset protocol on the InputStream.
- * However, this wraps an internal BoundedInputStream, and if the InputStream
- * is not fully read, then this will reset the stream and
- * spool the InputStream to disk (via TikaInputStream) and then digest the file.
- * <p>
- * If a TikaInputStream is passed in and it has an underlying file that is longer
- * than the {@link #markLimit}, then this digester digests the file directly.
- *
- */
-public class CommonsDigester implements DigestingParser.Digester {
-
- public enum DigestAlgorithm {
- //those currently available in commons.digest
- MD2,
- MD5,
- SHA1,
- SHA256,
- SHA384,
- SHA512;
-
- String getMetadataKey() {
- return TikaCoreProperties.TIKA_META_PREFIX+
- "digest"+Metadata.NAMESPACE_PREFIX_DELIMITER+this.toString();
- }
- }
-
- private final List<DigestAlgorithm> algorithms = new ArrayList<DigestAlgorithm>();
- private final int markLimit;
-
- public CommonsDigester(int markLimit, DigestAlgorithm... algorithms) {
- Collections.addAll(this.algorithms, algorithms);
- if (markLimit < 0) {
- throw new IllegalArgumentException("markLimit must be >= 0");
- }
- this.markLimit = markLimit;
- }
-
- @Override
- public void digest(InputStream is, Metadata m, ParseContext parseContext) throws IOException {
- InputStream tis = TikaInputStream.get(is);
- long sz = -1;
- if (((TikaInputStream)tis).hasFile()) {
- sz = ((TikaInputStream)tis).getLength();
- }
- //if the file is definitely a file,
- //and its size is greater than its mark limit,
- //just digest the underlying file.
- if (sz > markLimit) {
- digestFile(((TikaInputStream)tis).getFile(), m);
- return;
- }
-
- //try the usual mark/reset stuff.
- //however, if you actually hit the bound,
- //then stop and spool to file via TikaInputStream
- SimpleBoundedInputStream bis = new SimpleBoundedInputStream(markLimit, tis);
- boolean finishedStream = false;
- for (DigestAlgorithm algorithm : algorithms) {
- bis.mark(markLimit + 1);
- finishedStream = digestEach(algorithm, bis, m);
- bis.reset();
- if (!finishedStream) {
- break;
- }
- }
- if (!finishedStream) {
- digestFile(((TikaInputStream)tis).getFile(), m);
- }
- }
-
- private void digestFile(File f, Metadata m) throws IOException {
- for (DigestAlgorithm algorithm : algorithms) {
- InputStream is = new FileInputStream(f);
- try {
- digestEach(algorithm, is, m);
- } finally {
- IOUtils.closeQuietly(is);
- }
- }
- }
-
- /**
- *
- * @param algorithm algo to use
- * @param is input stream to read from
- * @param metadata metadata for reporting the digest
- * @return whether or not this finished the input stream
- * @throws IOException
- */
- private boolean digestEach(DigestAlgorithm algorithm,
- InputStream is, Metadata metadata) throws IOException {
- String digest = null;
- try {
- switch (algorithm) {
- case MD2:
- digest = DigestUtils.md2Hex(is);
- break;
- case MD5:
- digest = DigestUtils.md5Hex(is);
- break;
- case SHA1:
- digest = DigestUtils.sha1Hex(is);
- break;
- case SHA256:
- digest = DigestUtils.sha256Hex(is);
- break;
- case SHA384:
- digest = DigestUtils.sha384Hex(is);
- break;
- case SHA512:
- digest = DigestUtils.sha512Hex(is);
- break;
- default:
- throw new IllegalArgumentException("Sorry, not aware of algorithm: " + algorithm.toString());
- }
- } catch (IOException e) {
- e.printStackTrace();
- //swallow, or should we throw this?
- }
- if (is instanceof SimpleBoundedInputStream) {
- if (((SimpleBoundedInputStream)is).hasHitBound()) {
- return false;
- }
- }
- metadata.set(algorithm.getMetadataKey(), digest);
- return true;
- }
-
- /**
- *
- * @param s comma-delimited (no space) list of algorithms to use: md5,sha256
- * @return
- */
- public static DigestAlgorithm[] parse(String s) {
- assert(s != null);
-
- List<DigestAlgorithm> ret = new ArrayList<DigestAlgorithm>();
- for (String algoString : s.split(",")) {
- String uc = algoString.toUpperCase(Locale.ROOT);
- if (uc.equals(DigestAlgorithm.MD2.toString())) {
- ret.add(DigestAlgorithm.MD2);
- } else if (uc.equals(DigestAlgorithm.MD5.toString())) {
- ret.add(DigestAlgorithm.MD5);
- } else if (uc.equals(DigestAlgorithm.SHA1.toString())) {
- ret.add(DigestAlgorithm.SHA1);
- } else if (uc.equals(DigestAlgorithm.SHA256.toString())) {
- ret.add(DigestAlgorithm.SHA256);
- } else if (uc.equals(DigestAlgorithm.SHA384.toString())) {
- ret.add(DigestAlgorithm.SHA384);
- } else if (uc.equals(DigestAlgorithm.SHA512.toString())) {
- ret.add(DigestAlgorithm.SHA512);
- } else {
- StringBuilder sb = new StringBuilder();
- int i = 0;
- for (DigestAlgorithm algo : DigestAlgorithm.values()) {
- if (i++ > 0) {
- sb.append(", ");
- }
- sb.append(algo.toString());
- }
- throw new IllegalArgumentException("Couldn't match " + s + " with any of: " + sb.toString());
- }
- }
- return ret.toArray(new DigestAlgorithm[ret.size()]);
- }
-
- /**
- * Very slight modification of Commons' BoundedInputStream
- * so that we can figure out if this hit the bound or not.
- */
- private class SimpleBoundedInputStream extends InputStream {
- private final static int EOF = -1;
- private final long max;
- private final InputStream in;
- private long pos;
- boolean hitBound = false;
-
- private SimpleBoundedInputStream(long max, InputStream in) {
- this.max = max;
- this.in = in;
- }
-
- @Override
- public int read() throws IOException {
- if (max >= 0 && pos >= max) {
- hitBound = true;
- return EOF;
- }
- final int result = in.read();
- pos++;
- return result;
- }
-
- /**
- * Invokes the delegate's <code>read(byte[])</code> method.
- * @param b the buffer to read the bytes into
- * @return the number of bytes read or -1 if the end of stream or
- * the limit has been reached.
- * @throws IOException if an I/O error occurs
- */
- @Override
- public int read(final byte[] b) throws IOException {
- return this.read(b, 0, b.length);
- }
-
- /**
- * Invokes the delegate's <code>read(byte[], int, int)</code> method.
- * @param b the buffer to read the bytes into
- * @param off The start offset
- * @param len The number of bytes to read
- * @return the number of bytes read or -1 if the end of stream or
- * the limit has been reached.
- * @throws IOException if an I/O error occurs
- */
- @Override
- public int read(final byte[] b, final int off, final int len) throws IOException {
- if (max>=0 && pos>=max) {
- return EOF;
- }
- final long maxRead = max>=0 ? Math.min(len, max-pos) : len;
- final int bytesRead = in.read(b, off, (int)maxRead);
-
- if (bytesRead==EOF) {
- return EOF;
- }
-
- pos+=bytesRead;
- return bytesRead;
- }
-
- /**
- * Invokes the delegate's <code>skip(long)</code> method.
- * @param n the number of bytes to skip
- * @return the actual number of bytes skipped
- * @throws IOException if an I/O error occurs
- */
- @Override
- public long skip(final long n) throws IOException {
- final long toSkip = max>=0 ? Math.min(n, max-pos) : n;
- final long skippedBytes = in.skip(toSkip);
- pos+=skippedBytes;
- return skippedBytes;
- }
-
- @Override
- public void reset() throws IOException {
- in.reset();
- }
-
- @Override
- public void mark(int readLimit) {
- in.mark(readLimit);
- }
-
- public boolean hasHitBound() {
- return hitBound;
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parsers/src/test/java/org/apache/tika/TestParsers.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/TestParsers.java b/tika-parsers/src/test/java/org/apache/tika/TestParsers.java
deleted file mode 100644
index ddd671d..0000000
--- a/tika-parsers/src/test/java/org/apache/tika/TestParsers.java
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
-
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.InputStream;
-
-import org.apache.tika.config.TikaConfig;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.junit.Before;
-import org.junit.Test;
-import org.xml.sax.helpers.DefaultHandler;
-
-/**
- * Junit test class for Tika {@link Parser}s.
- */
-public class TestParsers extends TikaTest {
-
- private TikaConfig tc;
-
- private Tika tika;
-
- @Before
- public void setUp() throws Exception {
- tc = TikaConfig.getDefaultConfig();
- tika = new Tika(tc);
- }
-
- @Test
- public void testWORDxtraction() throws Exception {
- File file = getResourceAsFile("/test-documents/testWORD.doc");
- Parser parser = tika.getParser();
- Metadata metadata = new Metadata();
- try (InputStream stream = new FileInputStream(file)) {
- parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
- }
- assertEquals("Sample Word Document", metadata.get(TikaCoreProperties.TITLE));
- }
-
- @Test
- public void testEXCELExtraction() throws Exception {
- final String expected = "Numbers and their Squares";
- File file = getResourceAsFile("/test-documents/testEXCEL.xls");
- String s1 = tika.parseToString(file);
- assertTrue("Text does not contain '" + expected + "'", s1
- .contains(expected));
- Parser parser = tika.getParser();
- Metadata metadata = new Metadata();
- try (InputStream stream = new FileInputStream(file)) {
- parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
- }
- assertEquals("Simple Excel document", metadata.get(TikaCoreProperties.TITLE));
- }
-
- @Test
- public void testOptionalHyphen() throws Exception {
- String[] extensions =
- new String[] { "ppt", "pptx", "doc", "docx", "rtf", "pdf"};
- for (String extension : extensions) {
- File file = getResourceAsFile("/test-documents/testOptionalHyphen." + extension);
- String content = tika.parseToString(file);
- assertTrue("optional hyphen was not handled for '" + extension + "' file type: " + content,
- content.contains("optionalhyphen") ||
- content.contains("optional\u00adhyphen") || // soft hyphen
- content.contains("optional\u200bhyphen") || // zero width space
- content.contains("optional\u2027")); // hyphenation point
-
- }
- }
-
- private void verifyComment(String extension, String fileName) throws Exception {
- File file = getResourceAsFile("/test-documents/" + fileName + "." + extension);
- String content = tika.parseToString(file);
- assertTrue(extension + ": content=" + content + " did not extract text",
- content.contains("Here is some text"));
- assertTrue(extension + ": content=" + content + " did not extract comment",
- content.contains("Here is a comment"));
- }
-
- @Test
- public void testComment() throws Exception {
- final String[] extensions = new String[] {"ppt", "pptx", "doc",
- "docx", "xls", "xlsx", "pdf", "rtf"};
- for(String extension : extensions) {
- verifyComment(extension, "testComment");
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parsers/src/test/java/org/apache/tika/config/TikaDetectorConfigTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/config/TikaDetectorConfigTest.java b/tika-parsers/src/test/java/org/apache/tika/config/TikaDetectorConfigTest.java
deleted file mode 100644
index 2125888..0000000
--- a/tika-parsers/src/test/java/org/apache/tika/config/TikaDetectorConfigTest.java
+++ /dev/null
@@ -1,143 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.config;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.assertTrue;
-import static org.junit.Assert.fail;
-
-import org.apache.tika.detect.CompositeDetector;
-import org.apache.tika.detect.DefaultDetector;
-import org.apache.tika.detect.Detector;
-import org.apache.tika.detect.EmptyDetector;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.mbox.OutlookPSTParser;
-import org.apache.tika.parser.microsoft.POIFSContainerDetector;
-import org.apache.tika.parser.pkg.ZipContainerDetector;
-import org.junit.Test;
-
-/**
- * Junit test class for {@link TikaConfig}, which cover things
- * that {@link TikaConfigTest} can't do due to a need for the
- * full set of detectors
- */
-public class TikaDetectorConfigTest extends AbstractTikaConfigTest {
- @Test
- public void testDetectorExcludeFromDefault() throws Exception {
- TikaConfig config = getConfig("TIKA-1702-detector-blacklist.xml");
- assertNotNull(config.getParser());
- assertNotNull(config.getDetector());
- CompositeDetector detector = (CompositeDetector)config.getDetector();
-
- // Should be wrapping two detectors
- assertEquals(2, detector.getDetectors().size());
-
-
- // First should be DefaultDetector, second Empty, that order
- assertEquals(DefaultDetector.class, detector.getDetectors().get(0).getClass());
- assertEquals(EmptyDetector.class, detector.getDetectors().get(1).getClass());
-
-
- // Get the DefaultDetector from the config
- DefaultDetector confDetector = (DefaultDetector)detector.getDetectors().get(0);
-
- // Get a fresh "default" DefaultParser
- DefaultDetector normDetector = new DefaultDetector(config.getMimeRepository());
-
-
- // The default one will offer the Zip and POIFS detectors
- assertDetectors(normDetector, true, true);
-
-
- // The one from the config won't, as we excluded those
- assertDetectors(confDetector, false, false);
- }
-
- /**
- * TIKA-1708 - If the Zip detector is disabled, either explicitly,
- * or via giving a list of detectors that it isn't part of, ensure
- * that detection of PST files still works
- */
- @Test
- public void testPSTDetectionWithoutZipDetector() throws Exception {
- // Check the one with an exclude
- TikaConfig configWX = getConfig("TIKA-1708-detector-default.xml");
- assertNotNull(configWX.getParser());
- assertNotNull(configWX.getDetector());
- CompositeDetector detectorWX = (CompositeDetector)configWX.getDetector();
-
- // Check it has the POIFS one, but not the zip one
- assertDetectors(detectorWX, true, false);
-
-
- // Check the one with an explicit list
- TikaConfig configCL = getConfig("TIKA-1708-detector-composite.xml");
- assertNotNull(configCL.getParser());
- assertNotNull(configCL.getDetector());
- CompositeDetector detectorCL = (CompositeDetector)configCL.getDetector();
- assertEquals(2, detectorCL.getDetectors().size());
-
- // Check it also has the POIFS one, but not the zip one
- assertDetectors(detectorCL, true, false);
-
-
- // Check that both detectors have a mimetypes with entries
- assertTrue("Not enough mime types: " + configWX.getMediaTypeRegistry().getTypes().size(),
- configWX.getMediaTypeRegistry().getTypes().size() > 100);
- assertTrue("Not enough mime types: " + configCL.getMediaTypeRegistry().getTypes().size(),
- configCL.getMediaTypeRegistry().getTypes().size() > 100);
-
-
- // Now check they detect PST files correctly
- TikaInputStream stream = TikaInputStream.get(
- getResourceAsFile("/test-documents/testPST.pst"));
- assertEquals(
- OutlookPSTParser.MS_OUTLOOK_PST_MIMETYPE,
- detectorWX.detect(stream, new Metadata())
- );
- assertEquals(
- OutlookPSTParser.MS_OUTLOOK_PST_MIMETYPE,
- detectorCL.detect(stream, new Metadata())
- );
- }
-
- private void assertDetectors(CompositeDetector detector, boolean shouldHavePOIFS,
- boolean shouldHaveZip) {
- boolean hasZip = false;
- boolean hasPOIFS = false;
- for (Detector d : detector.getDetectors()) {
- if (d instanceof ZipContainerDetector) {
- if (shouldHaveZip) {
- hasZip = true;
- } else {
- fail("Shouldn't have the ZipContainerDetector from config");
- }
- }
- if (d instanceof POIFSContainerDetector) {
- if (shouldHavePOIFS) {
- hasPOIFS = true;
- } else {
- fail("Shouldn't have the POIFSContainerDetector from config");
- }
- }
- }
- if (shouldHavePOIFS) assertTrue("Should have the POIFSContainerDetector", hasPOIFS);
- if (shouldHaveZip) assertTrue("Should have the ZipContainerDetector", hasZip);
- }
-}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parsers/src/test/java/org/apache/tika/config/TikaParserConfigTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/config/TikaParserConfigTest.java b/tika-parsers/src/test/java/org/apache/tika/config/TikaParserConfigTest.java
deleted file mode 100644
index 2acd358..0000000
--- a/tika-parsers/src/test/java/org/apache/tika/config/TikaParserConfigTest.java
+++ /dev/null
@@ -1,157 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.config;
-
-import static org.apache.tika.TikaTest.assertContains;
-import static org.apache.tika.TikaTest.assertNotContained;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.assertTrue;
-import static org.junit.Assert.fail;
-
-import java.util.List;
-
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.CompositeParser;
-import org.apache.tika.parser.DefaultParser;
-import org.apache.tika.parser.EmptyParser;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.parser.ParserDecorator;
-import org.apache.tika.parser.executable.ExecutableParser;
-import org.apache.tika.parser.xml.XMLParser;
-import org.junit.Test;
-
-/**
- * Junit test class for {@link TikaConfig}, which cover things
- * that {@link TikaConfigTest} can't do due to a need for the
- * full set of parsers
- */
-public class TikaParserConfigTest extends AbstractTikaConfigTest {
- @Test
- public void testMimeExcludeInclude() throws Exception {
- TikaConfig config = getConfig("TIKA-1558-blacklist.xml");
- assertNotNull(config.getParser());
- assertNotNull(config.getDetector());
- Parser parser = config.getParser();
-
- MediaType PDF = MediaType.application("pdf");
- MediaType JPEG = MediaType.image("jpeg");
-
-
- // Has two parsers
- assertEquals(CompositeParser.class, parser.getClass());
- CompositeParser cParser = (CompositeParser)parser;
- assertEquals(2, cParser.getAllComponentParsers().size());
-
- // Both are decorated
- assertTrue(cParser.getAllComponentParsers().get(0) instanceof ParserDecorator);
- assertTrue(cParser.getAllComponentParsers().get(1) instanceof ParserDecorator);
- ParserDecorator p0 = (ParserDecorator)cParser.getAllComponentParsers().get(0);
- ParserDecorator p1 = (ParserDecorator)cParser.getAllComponentParsers().get(1);
-
-
- // DefaultParser will be wrapped with excludes
- assertEquals(DefaultParser.class, p0.getWrappedParser().getClass());
-
- assertNotContained(PDF, p0.getSupportedTypes(context));
- assertContains(PDF, p0.getWrappedParser().getSupportedTypes(context));
- assertNotContained(JPEG, p0.getSupportedTypes(context));
- assertContains(JPEG, p0.getWrappedParser().getSupportedTypes(context));
-
-
- // Will have an empty parser for PDF
- assertEquals(EmptyParser.class, p1.getWrappedParser().getClass());
- assertEquals(1, p1.getSupportedTypes(context).size());
- assertContains(PDF, p1.getSupportedTypes(context));
- assertNotContained(PDF, p1.getWrappedParser().getSupportedTypes(context));
- }
-
- @Test
- public void testParserExcludeFromDefault() throws Exception {
- TikaConfig config = getConfig("TIKA-1558-blacklist.xml");
- assertNotNull(config.getParser());
- assertNotNull(config.getDetector());
- CompositeParser parser = (CompositeParser)config.getParser();
-
- MediaType PE_EXE = MediaType.application("x-msdownload");
- MediaType ELF = MediaType.application("x-elf");
-
-
- // Get the DefaultParser from the config
- ParserDecorator confWrappedParser = (ParserDecorator)parser.getParsers().get(MediaType.APPLICATION_XML);
- assertNotNull(confWrappedParser);
- DefaultParser confParser = (DefaultParser)confWrappedParser.getWrappedParser();
-
- // Get a fresh "default" DefaultParser
- DefaultParser normParser = new DefaultParser(config.getMediaTypeRegistry());
-
-
- // The default one will offer the Executable Parser
- assertContains(PE_EXE, normParser.getSupportedTypes(context));
- assertContains(ELF, normParser.getSupportedTypes(context));
-
- boolean hasExec = false;
- for (Parser p : normParser.getParsers().values()) {
- if (p instanceof ExecutableParser) {
- hasExec = true;
- break;
- }
- }
- assertTrue(hasExec);
-
-
- // The one from the config won't
- assertNotContained(PE_EXE, confParser.getSupportedTypes(context));
- assertNotContained(ELF, confParser.getSupportedTypes(context));
-
- for (Parser p : confParser.getParsers().values()) {
- if (p instanceof ExecutableParser)
- fail("Shouldn't have the Executable Parser from config");
- }
- }
- /**
- * TIKA-1558 It should be possible to exclude Parsers from being picked up by
- * DefaultParser.
- */
- @Test
- public void defaultParserBlacklist() throws Exception {
- TikaConfig config = new TikaConfig();
- assertNotNull(config.getParser());
- assertNotNull(config.getDetector());
- CompositeParser cp = (CompositeParser) config.getParser();
- List<Parser> parsers = cp.getAllComponentParsers();
-
- boolean hasXML = false;
- for (Parser p : parsers) {
- if (p instanceof XMLParser) {
- hasXML = true;
- break;
- }
- }
- assertTrue("Default config should include an XMLParser.", hasXML);
-
- // This custom TikaConfig should exclude XMLParser and all of its subclasses.
- config = getConfig("TIKA-1558-blacklistsub.xml");
- cp = (CompositeParser) config.getParser();
- parsers = cp.getAllComponentParsers();
-
- for (Parser p : parsers) {
- if (p instanceof XMLParser)
- fail("Custom config should not include an XMLParser (" + p.getClass() + ").");
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parsers/src/test/java/org/apache/tika/config/TikaTranslatorConfigTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/config/TikaTranslatorConfigTest.java b/tika-parsers/src/test/java/org/apache/tika/config/TikaTranslatorConfigTest.java
deleted file mode 100644
index 71af206..0000000
--- a/tika-parsers/src/test/java/org/apache/tika/config/TikaTranslatorConfigTest.java
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.config;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotNull;
-
-import org.apache.tika.language.translate.DefaultTranslator;
-import org.apache.tika.language.translate.EmptyTranslator;
-import org.junit.Test;
-
-/**
- * Junit test class for {@link TikaConfig}, which cover things
- * that {@link TikaConfigTest} can't do due to a need for the
- * full set of translators
- */
-public class TikaTranslatorConfigTest extends AbstractTikaConfigTest {
- @Test
- public void testDefaultBehaviour() throws Exception {
- TikaConfig config = TikaConfig.getDefaultConfig();
- assertNotNull(config.getTranslator());
- assertEquals(DefaultTranslator.class, config.getTranslator().getClass());
- }
-
- @Test
- public void testRequestsDefault() throws Exception {
- TikaConfig config = getConfig("TIKA-1702-translator-default.xml");
- assertNotNull(config.getParser());
- assertNotNull(config.getDetector());
- assertNotNull(config.getTranslator());
-
- assertEquals(DefaultTranslator.class, config.getTranslator().getClass());
- }
-
- @Test
- public void testRequestsEmpty() throws Exception {
- TikaConfig config = getConfig("TIKA-1702-translator-empty.xml");
- assertNotNull(config.getParser());
- assertNotNull(config.getDetector());
- assertNotNull(config.getTranslator());
-
- assertEquals(EmptyTranslator.class, config.getTranslator().getClass());
- }
-
- /**
- * Currently, Translators don't support Composites, so
- * if multiple translators are given, only the first wins
- */
- @Test
- public void testRequestsMultiple() throws Exception {
- TikaConfig config = getConfig("TIKA-1702-translator-empty-default.xml");
- assertNotNull(config.getParser());
- assertNotNull(config.getDetector());
- assertNotNull(config.getTranslator());
-
- assertEquals(EmptyTranslator.class, config.getTranslator().getClass());
- }
-}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java b/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
deleted file mode 100644
index 5787408..0000000
--- a/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
+++ /dev/null
@@ -1,410 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.detect;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNull;
-import static org.junit.Assert.assertTrue;
-
-import java.io.File;
-import java.io.FilenameFilter;
-import java.io.IOException;
-import java.io.InputStream;
-
-import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
-import org.apache.tika.config.TikaConfig;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.mime.MimeTypes;
-import org.junit.Test;
-
-/**
- * Junit test class for {@link ContainerAwareDetector}
- */
-public class TestContainerAwareDetector {
- private final TikaConfig tikaConfig = TikaConfig.getDefaultConfig();
- private final MimeTypes mimeTypes = tikaConfig.getMimeRepository();
- private final Detector detector = new DefaultDetector(mimeTypes);
-
- private void assertTypeByData(String file, String type) throws Exception {
- assertTypeByNameAndData(file, null, type);
- }
- private void assertTypeByNameAndData(String file, String type) throws Exception {
- assertTypeByNameAndData(file, file, type);
- }
- private void assertType(String file, String byData, String byNameAndData) throws Exception {
- assertTypeByData(file, byData);
- assertTypeByNameAndData(file, byNameAndData);
- }
- private void assertTypeByNameAndData(String dataFile, String name, String type) throws Exception {
- assertTypeByNameAndData(dataFile, name, type, null);
- }
- private void assertTypeByNameAndData(String dataFile, String name, String typeFromDetector, String typeFromMagic) throws Exception {
- try (TikaInputStream stream = TikaInputStream.get(
- TestContainerAwareDetector.class.getResource("/test-documents/" + dataFile))) {
- Metadata m = new Metadata();
- if (name != null)
- m.add(Metadata.RESOURCE_NAME_KEY, name);
-
- // Mime Magic version is likely to be less precise
- if (typeFromMagic != null) {
- assertEquals(
- MediaType.parse(typeFromMagic),
- mimeTypes.detect(stream, m));
- }
-
- // All being well, the detector should get it perfect
- assertEquals(
- MediaType.parse(typeFromDetector),
- detector.detect(stream, m));
- }
- }
-
- @Test
- public void testDetectOLE2() throws Exception {
- // Microsoft office types known by POI
- assertTypeByData("testEXCEL.xls", "application/vnd.ms-excel");
- assertTypeByData("testWORD.doc", "application/msword");
- assertTypeByData("testPPT.ppt", "application/vnd.ms-powerpoint");
-
- assertTypeByData("test-outlook.msg", "application/vnd.ms-outlook");
- assertTypeByData("test-outlook2003.msg", "application/vnd.ms-outlook");
- assertTypeByData("testVISIO.vsd", "application/vnd.visio");
- assertTypeByData("testPUBLISHER.pub", "application/x-mspublisher");
- assertTypeByData("testWORKS.wps", "application/vnd.ms-works");
- assertTypeByData("testWORKS2000.wps", "application/vnd.ms-works");
-
- // older Works Word Processor files can't be recognized
- // they were created with Works Word Processor 7.0 (hence the text inside)
- // and exported to the older formats with the "Save As" feature
- assertTypeByData("testWORKSWordProcessor3.0.wps","application/vnd.ms-works");
- assertTypeByData("testWORKSWordProcessor4.0.wps","application/vnd.ms-works");
- assertTypeByData("testWORKSSpreadsheet7.0.xlr", "application/x-tika-msworks-spreadsheet");
- assertTypeByData("testPROJECT2003.mpp", "application/vnd.ms-project");
- assertTypeByData("testPROJECT2007.mpp", "application/vnd.ms-project");
-
- // Excel95 can be detected by not parsed
- assertTypeByData("testEXCEL_95.xls", "application/vnd.ms-excel");
-
- // Try some ones that POI doesn't handle, that are still OLE2 based
- assertTypeByData("testCOREL.shw", "application/x-corelpresentations");
- assertTypeByData("testQUATTRO.qpw", "application/x-quattro-pro");
- assertTypeByData("testQUATTRO.wb3", "application/x-quattro-pro");
-
- assertTypeByData("testHWP_5.0.hwp", "application/x-hwp-v5");
-
-
- // With the filename and data
- assertTypeByNameAndData("testEXCEL.xls", "application/vnd.ms-excel");
- assertTypeByNameAndData("testWORD.doc", "application/msword");
- assertTypeByNameAndData("testPPT.ppt", "application/vnd.ms-powerpoint");
-
- // With the wrong filename supplied, data will trump filename
- assertTypeByNameAndData("testEXCEL.xls", "notWord.doc", "application/vnd.ms-excel");
- assertTypeByNameAndData("testWORD.doc", "notExcel.xls", "application/msword");
- assertTypeByNameAndData("testPPT.ppt", "notWord.doc", "application/vnd.ms-powerpoint");
-
- // With a filename of a totally different type, data will trump filename
- assertTypeByNameAndData("testEXCEL.xls", "notPDF.pdf", "application/vnd.ms-excel");
- assertTypeByNameAndData("testEXCEL.xls", "notPNG.png", "application/vnd.ms-excel");
- }
-
- /**
- * There is no way to distinguish "proper" StarOffice files from templates.
- * All templates have the same extension but their actual type depends on
- * the magic. Our current MimeTypes class doesn't allow us to use the same
- * glob pattern in more than one mimetype.
- *
- * @throws Exception
- */
- @Test
- public void testDetectStarOfficeFiles() throws Exception {
- assertType("testStarOffice-5.2-calc.sdc",
- "application/vnd.stardivision.calc",
- "application/vnd.stardivision.calc");
- assertType("testVORCalcTemplate.vor",
- "application/vnd.stardivision.calc",
- "application/vnd.stardivision.calc");
- assertType("testStarOffice-5.2-draw.sda",
- "application/vnd.stardivision.draw",
- "application/vnd.stardivision.draw");
- assertType("testVORDrawTemplate.vor",
- "application/vnd.stardivision.draw",
- "application/vnd.stardivision.draw");
- assertType("testStarOffice-5.2-impress.sdd",
- "application/vnd.stardivision.impress",
- "application/vnd.stardivision.impress");
- assertType("testVORImpressTemplate.vor",
- "application/vnd.stardivision.impress",
- "application/vnd.stardivision.impress");
- assertType("testStarOffice-5.2-writer.sdw",
- "application/vnd.stardivision.writer",
- "application/vnd.stardivision.writer");
- assertType("testVORWriterTemplate.vor",
- "application/vnd.stardivision.writer",
- "application/vnd.stardivision.writer");
-
- }
-
- @Test
- public void testOpenContainer() throws Exception {
- try (TikaInputStream stream = TikaInputStream.get(
- TestContainerAwareDetector.class.getResource("/test-documents/testPPT.ppt"))) {
- assertNull(stream.getOpenContainer());
- assertEquals(
- MediaType.parse("application/vnd.ms-powerpoint"),
- detector.detect(stream, new Metadata()));
- assertTrue(stream.getOpenContainer() instanceof NPOIFSFileSystem);
- }
- }
-
- /**
- * EPub uses a similar mimetype entry to OpenDocument for storing
- * the mimetype within the parent zip file
- */
- @Test
- public void testDetectEPub() throws Exception {
- assertTypeByData("testEPUB.epub", "application/epub+zip");
- assertTypeByData("testiBooks.ibooks", "application/x-ibooks+zip");
- }
-
- @Test
- public void testDetectLotusNotesEml() throws Exception {
- // Lotus .eml files aren't guaranteed to have any of the magic
- // matches as the first line, but should have X-Notes-Item and Message-ID
- assertTypeByData("testLotusEml.eml", "message/rfc822");
- }
-
- @Test
- public void testDetectODF() throws Exception {
- assertTypeByData("testODFwithOOo3.odt", "application/vnd.oasis.opendocument.text");
- assertTypeByData("testOpenOffice2.odf", "application/vnd.oasis.opendocument.formula");
- }
-
- @Test
- public void testDetectOOXML() throws Exception {
- assertTypeByData("testEXCEL.xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
- assertTypeByData("testWORD.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document");
- assertTypeByData("testPPT.pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation");
-
- // Check some of the less common OOXML types
- assertTypeByData("testPPT.pptm", "application/vnd.ms-powerpoint.presentation.macroenabled.12");
- assertTypeByData("testPPT.ppsx", "application/vnd.openxmlformats-officedocument.presentationml.slideshow");
- assertTypeByData("testPPT.ppsm", "application/vnd.ms-powerpoint.slideshow.macroEnabled.12");
- assertTypeByData("testDOTM.dotm", "application/vnd.ms-word.template.macroEnabled.12");
- assertTypeByData("testEXCEL.strict.xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
- assertTypeByData("testPPT.xps", "application/vnd.ms-xpsdocument");
-
- assertTypeByData("testVISIO.vsdm", "application/vnd.ms-visio.drawing.macroenabled.12");
- assertTypeByData("testVISIO.vsdx", "application/vnd.ms-visio.drawing");
- assertTypeByData("testVISIO.vssm", "application/vnd.ms-visio.stencil.macroenabled.12");
- assertTypeByData("testVISIO.vssx", "application/vnd.ms-visio.stencil");
- assertTypeByData("testVISIO.vstm", "application/vnd.ms-visio.template.macroenabled.12");
- assertTypeByData("testVISIO.vstx", "application/vnd.ms-visio.template");
-
- // .xlsb is an OOXML file containing the binary parts, and not
- // an OLE2 file as you might initially expect!
- assertTypeByData("testEXCEL.xlsb", "application/vnd.ms-excel.sheet.binary.macroEnabled.12");
-
- // With the filename and data
- assertTypeByNameAndData("testEXCEL.xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
- assertTypeByNameAndData("testWORD.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document");
- assertTypeByNameAndData("testPPT.pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation");
-
- // With the wrong filename supplied, data will trump filename
- assertTypeByNameAndData("testEXCEL.xlsx", "notWord.docx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
- assertTypeByNameAndData("testWORD.docx", "notExcel.xlsx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document");
- assertTypeByNameAndData("testPPT.pptx", "notWord.docx", "application/vnd.openxmlformats-officedocument.presentationml.presentation");
-
- // With an incorrect filename of a different container type, data trumps filename
- assertTypeByNameAndData("testEXCEL.xlsx", "notOldExcel.xls", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
- }
-
- /**
- * Password Protected OLE2 files are fairly straightforward to detect, as they
- * have the same structure as regular OLE2 files. (Core streams may be encrypted
- * however)
- */
- @Test
- public void testDetectProtectedOLE2() throws Exception {
- assertTypeByData("testEXCEL_protected_passtika.xls", "application/vnd.ms-excel");
- assertTypeByData("testWORD_protected_passtika.doc", "application/msword");
- assertTypeByData("testPPT_protected_passtika.ppt", "application/vnd.ms-powerpoint");
- assertTypeByNameAndData("testEXCEL_protected_passtika.xls", "application/vnd.ms-excel");
- assertTypeByNameAndData("testWORD_protected_passtika.doc", "application/msword");
- assertTypeByNameAndData("testPPT_protected_passtika.ppt", "application/vnd.ms-powerpoint");
- }
-
- /**
- * Password Protected OOXML files are much more tricky beasts to work with.
- * They have a very different structure to regular OOXML files, and instead
- * of being ZIP based they are actually an OLE2 file which contains the
- * OOXML structure within an encrypted stream.
- * This makes detecting them much harder...
- */
- @Test
- public void testDetectProtectedOOXML() throws Exception {
- // Encrypted Microsoft Office OOXML files have OLE magic but
- // special streams, so we can tell they're Protected OOXML
- assertTypeByData("testEXCEL_protected_passtika.xlsx",
- "application/x-tika-ooxml-protected");
- assertTypeByData("testWORD_protected_passtika.docx",
- "application/x-tika-ooxml-protected");
- assertTypeByData("testPPT_protected_passtika.pptx",
- "application/x-tika-ooxml-protected");
-
- // At the moment, we can't use the name to specialise
- // See discussions on TIKA-790 for details
- assertTypeByNameAndData("testEXCEL_protected_passtika.xlsx",
- "application/x-tika-ooxml-protected");
- assertTypeByNameAndData("testWORD_protected_passtika.docx",
- "application/x-tika-ooxml-protected");
- assertTypeByNameAndData("testPPT_protected_passtika.pptx",
- "application/x-tika-ooxml-protected");
- }
-
- /**
- * Check that temporary files created by Tika are removed after
- * closing TikaInputStream.
- */
- @Test
- public void testRemovalTempfiles() throws Exception {
- assertRemovalTempfiles("testWORD.docx");
- assertRemovalTempfiles("test-documents.zip");
- }
-
- private int countTemporaryFiles() {
- return new File(System.getProperty("java.io.tmpdir")).listFiles(
- new FilenameFilter() {
- public boolean accept(File dir, String name) {
- return name.startsWith("apache-tika-");
- }
- }).length;
- }
-
- private void assertRemovalTempfiles(String fileName) throws Exception {
- int numberOfTempFiles = countTemporaryFiles();
-
- try (TikaInputStream stream = TikaInputStream.get(
- TestContainerAwareDetector.class.getResource("/test-documents/" + fileName))) {
- detector.detect(stream, new Metadata());
- }
-
- assertEquals(numberOfTempFiles, countTemporaryFiles());
- }
-
- @Test
- public void testDetectIWork() throws Exception {
- assertTypeByData("testKeynote.key", "application/vnd.apple.keynote");
- assertTypeByData("testNumbers.numbers", "application/vnd.apple.numbers");
- assertTypeByData("testPages.pages", "application/vnd.apple.pages");
- }
-
- @Test
- public void testDetectKMZ() throws Exception {
- assertTypeByData("testKMZ.kmz", "application/vnd.google-earth.kmz");
- }
-
- @Test
- public void testDetectIPA() throws Exception {
- assertTypeByNameAndData("testIPA.ipa", "application/x-itunes-ipa");
- assertTypeByData("testIPA.ipa", "application/x-itunes-ipa");
- }
-
- @Test
- public void testASiC() throws Exception {
- assertTypeByData("testASiCE.asice", "application/vnd.etsi.asic-e+zip");
- assertTypeByData("testASiCS.asics", "application/vnd.etsi.asic-s+zip");
- assertTypeByNameAndData("testASiCE.asice", "application/vnd.etsi.asic-e+zip");
- assertTypeByNameAndData("testASiCS.asics", "application/vnd.etsi.asic-s+zip");
- }
-
- @Test
- public void testDetectZip() throws Exception {
- assertTypeByData("test-documents.zip", "application/zip");
- assertTypeByData("test-zip-of-zip.zip", "application/zip");
-
- // JAR based formats
- assertTypeByData("testJAR.jar", "application/java-archive");
- assertTypeByData("testWAR.war", "application/x-tika-java-web-archive");
- assertTypeByData("testEAR.ear", "application/x-tika-java-enterprise-archive");
- assertTypeByData("testAPK.apk", "application/vnd.android.package-archive");
-
- // JAR with HTML files in it
- assertTypeByNameAndData("testJAR_with_HTML.jar", "testJAR_with_HTML.jar",
- "application/java-archive", "application/java-archive");
- }
-
- private TikaInputStream getTruncatedFile(String name, int n)
- throws IOException {
- try (InputStream input = TestContainerAwareDetector.class.getResourceAsStream(
- "/test-documents/" + name)) {
- byte[] bytes = new byte[n];
- int m = 0;
- while (m < bytes.length) {
- int i = input.read(bytes, m, bytes.length - m);
- if (i != -1) {
- m += i;
- } else {
- throw new IOException("Unexpected end of stream");
- }
- }
- return TikaInputStream.get(bytes);
- }
- }
-
- @Test
- public void testTruncatedFiles() throws Exception {
- // First up a truncated OOXML (zip) file
-
- // With only the data supplied, the best we can do is the container
- Metadata m = new Metadata();
- try (TikaInputStream xlsx = getTruncatedFile("testEXCEL.xlsx", 300)) {
- assertEquals(
- MediaType.application("x-tika-ooxml"),
- detector.detect(xlsx, m));
- }
-
- // With truncated data + filename, we can use the filename to specialise
- m = new Metadata();
- m.add(Metadata.RESOURCE_NAME_KEY, "testEXCEL.xlsx");
- try (TikaInputStream xlsx = getTruncatedFile("testEXCEL.xlsx", 300)) {
- assertEquals(
- MediaType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet"),
- detector.detect(xlsx, m));
- }
-
- // Now a truncated OLE2 file
- m = new Metadata();
- try (TikaInputStream xls = getTruncatedFile("testEXCEL.xls", 400)) {
- assertEquals(
- MediaType.application("x-tika-msoffice"),
- detector.detect(xls, m));
- }
-
- // Finally a truncated OLE2 file, with a filename available
- m = new Metadata();
- m.add(Metadata.RESOURCE_NAME_KEY, "testEXCEL.xls");
- try (TikaInputStream xls = getTruncatedFile("testEXCEL.xls", 400)) {
- assertEquals(
- MediaType.application("vnd.ms-excel"),
- detector.detect(xls, m));
- }
- }
-
-}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parsers/src/test/java/org/apache/tika/embedder/ExternalEmbedderTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/embedder/ExternalEmbedderTest.java b/tika-parsers/src/test/java/org/apache/tika/embedder/ExternalEmbedderTest.java
deleted file mode 100644
index e988aff..0000000
--- a/tika-parsers/src/test/java/org/apache/tika/embedder/ExternalEmbedderTest.java
+++ /dev/null
@@ -1,292 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.embedder;
-
-import static java.nio.charset.StandardCharsets.UTF_8;
-import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.assertTrue;
-import static org.junit.Assert.fail;
-
-import java.io.ByteArrayOutputStream;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileNotFoundException;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStreamWriter;
-import java.net.URISyntaxException;
-import java.net.URL;
-import java.text.DateFormat;
-import java.text.SimpleDateFormat;
-import java.util.Date;
-import java.util.HashMap;
-import java.util.Locale;
-import java.util.Map;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.TemporaryResources;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Property;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.parser.txt.TXTParser;
-import org.apache.tika.sax.BodyContentHandler;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * Unit test for {@link ExternalEmbedder}s.
- */
-public class ExternalEmbedderTest {
-
- protected static final DateFormat EXPECTED_METADATA_DATE_FORMATTER =
- new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss", Locale.ROOT);
- protected static final String DEFAULT_CHARSET = UTF_8.name();
- private static final String COMMAND_METADATA_ARGUMENT_DESCRIPTION = "dc:description";
- private static final String TEST_TXT_PATH = "/test-documents/testTXT.txt";
-
- private TemporaryResources tmp = new TemporaryResources();
-
- /**
- * Gets the expected returned metadata value for the given field
- *
- * @param fieldName
- * @return a prefix added to the field name
- */
- protected String getExpectedMetadataValueString(String fieldName, Date timestamp) {
- return this.getClass().getSimpleName() + " embedded " + fieldName +
- " on " + EXPECTED_METADATA_DATE_FORMATTER.format(timestamp);
- }
-
- /**
- * Gets the tika <code>Metadata</code> object containing data to be
- * embedded.
- *
- * @return the populated tika metadata object
- */
- protected Metadata getMetadataToEmbed(Date timestamp) {
- Metadata metadata = new Metadata();
- metadata.add(TikaCoreProperties.DESCRIPTION,
- getExpectedMetadataValueString(TikaCoreProperties.DESCRIPTION.toString(), timestamp));
- return metadata;
- }
-
- /**
- * Gets the <code>Embedder</code> to test.
- *
- * @return the embedder under test
- */
- protected Embedder getEmbedder() {
- ExternalEmbedder embedder = new ExternalEmbedder();
- Map<Property, String[]> metadataCommandArguments = new HashMap<Property, String[]>(1);
- metadataCommandArguments.put(TikaCoreProperties.DESCRIPTION,
- new String[] { COMMAND_METADATA_ARGUMENT_DESCRIPTION });
- embedder.setMetadataCommandArguments(metadataCommandArguments);
- return embedder;
- }
-
- /**
- * Gets the source input stream through standard Java resource loaders
- * before metadata has been embedded.
- *
- * @return a fresh input stream
- */
- protected InputStream getSourceStandardInputStream() {
- return this.getClass().getResourceAsStream(TEST_TXT_PATH);
- }
-
- /**
- * Gets the source input stream via {@link TikaInputStream}
- * before metadata has been embedded.
- *
- * @return a fresh input stream
- * @throws FileNotFoundException
- */
- protected InputStream getSourceTikaInputStream() throws FileNotFoundException {
- return TikaInputStream.get(getSourceInputFile());
- }
-
- /**
- * Gets the source input file through standard Java resource loaders
- * before metadata has been embedded.
- *
- * @return a fresh input stream
- * @throws FileNotFoundException
- */
- protected File getSourceInputFile() throws FileNotFoundException {
- URL origUrl = this.getClass().getResource(TEST_TXT_PATH);
- if (origUrl == null) {
- throw new FileNotFoundException("could not load " + TEST_TXT_PATH);
- }
- try {
- return new File(origUrl.toURI());
- } catch (URISyntaxException e) {
- throw new FileNotFoundException(e.getMessage());
- }
- }
-
- /**
- * Gets the parser to use to verify the result of the embed operation.
- *
- * @return the parser to read embedded metadata
- */
- protected Parser getParser() {
- return new TXTParser();
- }
-
- /**
- * Whether or not the final result of reading the now embedded metadata is
- * expected in the output of the external tool
- *
- * @return whether or not results are expected in command line output
- */
- protected boolean getIsMetadataExpectedInOutput() {
- return true;
- }
-
- /**
- * Tests embedding metadata then reading metadata to verify the results.
- *
- * @param isResultExpectedInOutput whether or not results are expected in command line output
- */
- protected void embedInTempFile(InputStream sourceInputStream, boolean isResultExpectedInOutput) {
- Embedder embedder = getEmbedder();
-
- // TODO Move this check to ExternalEmbedder
- String os = System.getProperty("os.name", "");
- if (os.contains("Windows")) {
- // Skip test on Windows
- return;
- }
-
- Date timestamp = new Date();
- Metadata metadataToEmbed = getMetadataToEmbed(timestamp);
-
- try {
- File tempOutputFile = tmp.createTemporaryFile();
- FileOutputStream tempFileOutputStream = new FileOutputStream(tempOutputFile);
-
- // Embed the metadata into a copy of the original output stream
- embedder.embed(metadataToEmbed, sourceInputStream, tempFileOutputStream, null);
-
- ParseContext context = new ParseContext();
- Parser parser = getParser();
- context.set(Parser.class, parser);
-
- // Setup the extracting content handler
- ByteArrayOutputStream result = new ByteArrayOutputStream();
- OutputStreamWriter outputWriter = new OutputStreamWriter(result,DEFAULT_CHARSET);
- ContentHandler handler = new BodyContentHandler(outputWriter);
-
- // Create a new metadata object to read the new metadata into
- Metadata embeddedMetadata = new Metadata();
-
- // Setup a re-read of the now embeded temp file
- FileInputStream embeddedFileInputStream = new FileInputStream(tempOutputFile);
-
- parser.parse(embeddedFileInputStream, handler, embeddedMetadata,
- context);
-
- tmp.dispose();
-
- String outputString = null;
- if (isResultExpectedInOutput) {
- outputString = result.toString(DEFAULT_CHARSET);
- } else {
- assertTrue("no metadata found", embeddedMetadata.size() > 0);
- }
-
- // Check each metadata property for the expected value
- for (String metadataName : metadataToEmbed.names()) {
- if (metadataToEmbed.get(metadataName) != null) {
- String expectedValue = metadataToEmbed.get(metadataName);
- boolean foundExpectedValue = false;
- if (isResultExpectedInOutput) {
- // just check that the entire output contains the expected string
- foundExpectedValue = outputString.contains(expectedValue);
- } else {
- if (embeddedMetadata.isMultiValued(metadataName)) {
- for (String embeddedValue : embeddedMetadata.getValues(metadataName)) {
- if (embeddedValue != null) {
- if (embeddedValue.contains(expectedValue)) {
- foundExpectedValue = true;
- break;
- }
- }
- }
- } else {
- String embeddedValue = embeddedMetadata.get(metadataName);
- assertNotNull("expected metadata for "
- + metadataName + " not found",
- embeddedValue);
- foundExpectedValue = embeddedValue.contains(expectedValue);
- }
- }
- assertTrue(
- "result did not contain expected appended metadata "
- + metadataName + "="
- + expectedValue,
- foundExpectedValue);
- }
- }
- } catch (IOException e) {
- fail(e.getMessage());
- } catch (TikaException e) {
- fail(e.getMessage());
- } catch (SAXException e) {
- fail(e.getMessage());
- }
- }
-
- protected void checkSourceFileExists() {
- String message = "the original input file was deleted";
- try {
- File origInputFile = getSourceInputFile();
- assertNotNull(message, origInputFile);
- assertTrue(message, origInputFile.exists());
- } catch (FileNotFoundException e) {
- fail(message + ": " + e.getMessage());
- }
- }
-
- /**
- * Tests embedding using an input stream obtained via {@link ExternalEmbedderTest#getSourceStandardInputStream()}
- *
- * @throws IOException
- */
- @Test
- public void testEmbedStandardInputStream() throws IOException {
- embedInTempFile(getSourceStandardInputStream(), getIsMetadataExpectedInOutput());
- checkSourceFileExists();
- }
-
- /**
- * Tests embedding using an input stream obtained via {@link ExternalEmbedderTest#getSourceTikaInputStream()}
- *
- * @throws IOException
- */
- @Test
- public void testEmbedTikaInputStream() throws IOException {
- embedInTempFile(getSourceTikaInputStream(), getIsMetadataExpectedInOutput());
- checkSourceFileExists();
- }
-
-}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parsers/src/test/java/org/apache/tika/mime/MimeTypeTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/mime/MimeTypeTest.java b/tika-parsers/src/test/java/org/apache/tika/mime/MimeTypeTest.java
deleted file mode 100644
index 7987630..0000000
--- a/tika-parsers/src/test/java/org/apache/tika/mime/MimeTypeTest.java
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.mime;
-
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertTrue;
-import static org.junit.Assert.fail;
-
-import org.junit.Before;
-import org.junit.Test;
-
-public class MimeTypeTest {
-
- private MimeTypes types;
- private MimeType text;
-
- @Before
- public void setUp() throws MimeTypeException {
- types = new MimeTypes();
- text = types.forName("text/plain");
- }
-
- /** Test MimeType constructor */
- @Test
- public void testConstrctor() {
- // Missing name
- try {
- new MimeType(null);
- fail("Expected IllegalArgumentException");
- } catch (IllegalArgumentException e) {
- // expected result
- }
- }
-
- @Test
- public void testIsValidName() {
- assertTrue(MimeType.isValid("application/octet-stream"));
- assertTrue(MimeType.isValid("text/plain"));
- assertTrue(MimeType.isValid("foo/bar"));
- assertTrue(MimeType.isValid("a/b"));
-
- assertFalse(MimeType.isValid("application"));
- assertFalse(MimeType.isValid("application/"));
- assertFalse(MimeType.isValid("/"));
- assertFalse(MimeType.isValid("/octet-stream"));
- assertFalse(MimeType.isValid("application//octet-stream"));
- assertFalse(MimeType.isValid("application/octet=stream"));
- assertFalse(MimeType.isValid("application/\u00f6ctet-stream"));
- assertFalse(MimeType.isValid("text/plain;"));
- assertFalse(MimeType.isValid("text/plain; charset=UTF-8"));
- try {
- MimeType.isValid(null);
- fail("Expected IllegalArgumentException");
- } catch (IllegalArgumentException e) {
- // expected result
- }
- }
-
- /** Test MimeType setDescription() */
- @Test
- public void testSetEmptyValues() {
- try {
- text.setDescription(null);
- fail("Expected IllegalArgumentException");
- } catch (IllegalArgumentException e) {
- // expected result
- }
-
- try {
- text.setAcronym(null);
- fail("Expected IllegalArgumentException");
- } catch (IllegalArgumentException e) {
- // expected result
- }
-
- try {
- text.addLink(null);
- fail("Expected IllegalArgumentException");
- } catch (IllegalArgumentException e) {
- // expected result
- }
-
- try {
- text.setUniformTypeIdentifier(null);
- fail("Expected IllegalArgumentException");
- } catch (IllegalArgumentException e) {
- // expected result
- }
- }
-
-}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parsers/src/test/java/org/apache/tika/mime/MimeTypesTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/mime/MimeTypesTest.java b/tika-parsers/src/test/java/org/apache/tika/mime/MimeTypesTest.java
deleted file mode 100644
index be8a575..0000000
--- a/tika-parsers/src/test/java/org/apache/tika/mime/MimeTypesTest.java
+++ /dev/null
@@ -1,122 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.mime;
-
-import static org.apache.tika.mime.MediaType.OCTET_STREAM;
-import static org.apache.tika.mime.MediaType.TEXT_PLAIN;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.assertNull;
-import static org.junit.Assert.assertTrue;
-import static org.junit.Assert.fail;
-
-import org.junit.Before;
-import org.junit.Test;
-
-public class MimeTypesTest {
-
- private MimeTypes types;
-
- private MediaTypeRegistry registry;
-
- private MimeType binary;
-
- private MimeType text;
-
- private MimeType html;
-
- @Before
- public void setUp() throws MimeTypeException {
- types = new MimeTypes();
- registry = types.getMediaTypeRegistry();
- binary = types.forName("application/octet-stream");
- text = types.forName("text/plain");
- types.addAlias(text, MediaType.parse("text/x-plain"));
- html = types.forName("text/html");
- types.setSuperType(html, TEXT_PLAIN);
- }
-
- @Test
- public void testForName() throws MimeTypeException {
- assertEquals(text, types.forName("text/plain"));
- assertEquals(text, types.forName("TEXT/PLAIN"));
-
- try {
- types.forName("invalid");
- fail("MimeTypeException not thrown on invalid type name");
- } catch (MimeTypeException e) {
- // expected
- }
- }
-
- @Test
- public void testRegisteredMimes() throws MimeTypeException {
- String dummy = "text/xxxxx";
- assertEquals(text, types.getRegisteredMimeType("text/plain"));
- assertNull(types.getRegisteredMimeType(dummy));
- assertNotNull(types.forName(dummy));
- assertEquals(dummy, types.forName("text/xxxxx").getType().toString());
- assertEquals(dummy, types.getRegisteredMimeType("text/xxxxx").getType().toString());
-
- try {
- types.forName("invalid");
- fail("MimeTypeException not thrown on invalid type name");
- } catch (MimeTypeException e) {
- // expected
- }
- }
-
- @Test
- public void testSuperType() throws MimeTypeException {
- assertNull(registry.getSupertype(OCTET_STREAM));
- assertEquals(OCTET_STREAM, registry.getSupertype(TEXT_PLAIN));
- assertEquals(TEXT_PLAIN, registry.getSupertype(html.getType()));
- }
-
- @Test
- public void testIsDescendantOf() {
- assertFalse(registry.isSpecializationOf(OCTET_STREAM, OCTET_STREAM));
- assertFalse(registry.isSpecializationOf(TEXT_PLAIN, TEXT_PLAIN));
- assertFalse(registry.isSpecializationOf(html.getType(), html.getType()));
-
- assertTrue(registry.isSpecializationOf(html.getType(), OCTET_STREAM));
- assertFalse(registry.isSpecializationOf(OCTET_STREAM, html.getType()));
-
- assertTrue(registry.isSpecializationOf(html.getType(), TEXT_PLAIN));
- assertFalse(registry.isSpecializationOf(TEXT_PLAIN, html.getType()));
-
- assertTrue(registry.isSpecializationOf(TEXT_PLAIN, OCTET_STREAM));
- assertFalse(registry.isSpecializationOf(OCTET_STREAM, TEXT_PLAIN));
- }
-
- @Test
- public void testCompareTo() {
- assertTrue(binary.compareTo(binary) == 0);
- assertTrue(binary.compareTo(text) != 0);
- assertTrue(binary.compareTo(html) != 0);
-
- assertTrue(text.compareTo(binary) != 0);
- assertTrue(text.compareTo(text) == 0);
- assertTrue(text.compareTo(html) != 0);
-
- assertTrue(html.compareTo(binary) != 0);
- assertTrue(html.compareTo(text) != 0);
- assertTrue(html.compareTo(html) == 0);
- }
-
-}
[07/13] tika git commit: TIKA-1855 -- first pass. Need to turn back
on the forbidden-apis testCheck. More clean up remains.
Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parser-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parser-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index a8bfaed..20f8760 100644
--- a/tika-parser-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parser-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -60,6 +60,7 @@ import org.apache.tika.sax.ContentHandlerDecorator;
import org.apache.tika.sax.ToXMLContentHandler;
import org.junit.AfterClass;
import org.junit.BeforeClass;
+import org.junit.Ignore;
import org.junit.Test;
import org.xml.sax.ContentHandler;
@@ -105,74 +106,55 @@ public class PDFParserTest extends TikaTest {
@Test
public void testPdfParsing() throws Exception {
- Parser parser = new AutoDetectParser(); // Should auto-detect!
- Metadata metadata = new Metadata();
-
- InputStream stream = PDFParserTest.class.getResourceAsStream(
- "/test-documents/testPDF.pdf");
-
- String content = getText(stream, parser, metadata);
-
- assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("Bertrand Delacr\u00e9taz", metadata.get(TikaCoreProperties.CREATOR));
- assertEquals("Bertrand Delacr\u00e9taz", metadata.get(Metadata.AUTHOR));
- assertEquals("Firefox", metadata.get(TikaCoreProperties.CREATOR_TOOL));
- assertEquals("Apache Tika - Apache Tika", metadata.get(TikaCoreProperties.TITLE));
+ XMLResult r = getXML("testPDF.pdf");
+ assertEquals("application/pdf", r.metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("Bertrand Delacr\u00e9taz", r.metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals("Bertrand Delacr\u00e9taz", r.metadata.get(Metadata.AUTHOR));
+ assertEquals("Firefox", r.metadata.get(TikaCoreProperties.CREATOR_TOOL));
+ assertEquals("Apache Tika - Apache Tika", r.metadata.get(TikaCoreProperties.TITLE));
// Can't reliably test dates yet - see TIKA-451
// assertEquals("Sat Sep 15 10:02:31 BST 2007", metadata.get(Metadata.CREATION_DATE));
// assertEquals("Sat Sep 15 10:02:31 BST 2007", metadata.get(Metadata.LAST_MODIFIED));
- assertContains("Apache Tika", content);
- assertContains("Tika - Content Analysis Toolkit", content);
- assertContains("incubator", content);
- assertContains("Apache Software Foundation", content);
+ assertContains("Apache Tika", r.xml);
+ assertContains("Tika - Content Analysis Toolkit", r.xml);
+ assertContains("incubator", r.xml);
+ assertContains("Apache Software Foundation", r.xml);
// testing how the end of one paragraph is separated from start of the next one
- assertTrue("should have word boundary after headline",
- !content.contains("ToolkitApache"));
- assertTrue("should have word boundary between paragraphs",
- !content.contains("libraries.Apache"));
+
+ // should have word boundary after headline
+ assertNotContained("ToolkitApache", r.xml);
+ // should have word boundary between paragraphs
+ assertNotContained("libraries.Apache", r.xml);
}
@Test
public void testPdfParsingMetadataOnly() throws Exception {
- Parser parser = new AutoDetectParser(); // Should auto-detect!
- Metadata metadata = new Metadata();
-
- try (InputStream stream = PDFParserTest.class.getResourceAsStream(
- "/test-documents/testPDF.pdf")) {
- parser.parse(stream, null, metadata, new ParseContext());
- }
-
- assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("Bertrand Delacr\u00e9taz", metadata.get(TikaCoreProperties.CREATOR));
- assertEquals("Firefox", metadata.get(TikaCoreProperties.CREATOR_TOOL));
- assertEquals("Apache Tika - Apache Tika", metadata.get(TikaCoreProperties.TITLE));
+ XMLResult r = getXML("testPDF.pdf");
+ assertEquals("application/pdf", r.metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("Bertrand Delacr\u00e9taz", r.metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals("Firefox", r.metadata.get(TikaCoreProperties.CREATOR_TOOL));
+ assertEquals("Apache Tika - Apache Tika", r.metadata.get(TikaCoreProperties.TITLE));
}
@Test
public void testCustomMetadata() throws Exception {
- Parser parser = new AutoDetectParser(); // Should auto-detect!
- Metadata metadata = new Metadata();
-
- InputStream stream = PDFParserTest.class.getResourceAsStream(
- "/test-documents/testPDF-custommetadata.pdf");
-
- String content = getText(stream, parser, metadata);
+ XMLResult r = getXML("testPDF-custommetadata.pdf");
- assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("Document author", metadata.get(TikaCoreProperties.CREATOR));
- assertEquals("Document author", metadata.get(Metadata.AUTHOR));
- assertEquals("Document title", metadata.get(TikaCoreProperties.TITLE));
+ assertEquals("application/pdf", r.metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("Document author", r.metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals("Document author", r.metadata.get(Metadata.AUTHOR));
+ assertEquals("Document title", r.metadata.get(TikaCoreProperties.TITLE));
- assertEquals("Custom Value", metadata.get("Custom Property"));
+ assertEquals("Custom Value", r.metadata.get("Custom Property"));
- assertEquals("Array Entry 1", metadata.get("Custom Array"));
- assertEquals(2, metadata.getValues("Custom Array").length);
- assertEquals("Array Entry 1", metadata.getValues("Custom Array")[0]);
- assertEquals("Array Entry 2", metadata.getValues("Custom Array")[1]);
+ assertEquals("Array Entry 1", r.metadata.get("Custom Array"));
+ assertEquals(2, r.metadata.getValues("Custom Array").length);
+ assertEquals("Array Entry 1", r.metadata.getValues("Custom Array")[0]);
+ assertEquals("Array Entry 2", r.metadata.getValues("Custom Array")[1]);
- assertContains("Hello World!", content);
+ assertContains("Hello World!", r.xml);
}
/**
@@ -182,16 +164,9 @@ public class PDFParserTest extends TikaTest {
*/
@Test
public void testProtectedPDF() throws Exception {
- Parser parser = new AutoDetectParser(); // Should auto-detect!
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
- ParseContext context = new ParseContext();
-
- try (InputStream stream = PDFParserTest.class.getResourceAsStream(
- "/test-documents/testPDF_protected.pdf")) {
- parser.parse(stream, handler, metadata, context);
- }
+ XMLResult r = getXML("testPDF_protected.pdf");
+ Metadata metadata = r.metadata;
assertEquals("true", metadata.get("pdf:encrypted"));
assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
assertEquals("The Bank of England", metadata.get(TikaCoreProperties.CREATOR));
@@ -200,27 +175,23 @@ public class PDFParserTest extends TikaTest {
assertEquals("Speeches by Andrew G Haldane", metadata.get(Metadata.SUBJECT));
assertEquals("Rethinking the Financial Network, Speech by Andrew G Haldane, Executive Director, Financial Stability delivered at the Financial Student Association, Amsterdam on 28 April 2009", metadata.get(TikaCoreProperties.TITLE));
- String content = handler.toString();
+ String content = r.xml;
assertContains("RETHINKING THE FINANCIAL NETWORK", content);
assertContains("On 16 November 2002", content);
assertContains("In many important respects", content);
// Try again with an explicit empty password
- handler = new BodyContentHandler();
metadata = new Metadata();
- context = new ParseContext();
+ ParseContext context = new ParseContext();
context.set(PasswordProvider.class, new PasswordProvider() {
public String getPassword(Metadata metadata) {
return "";
}
});
-
- try (InputStream stream = PDFParserTest.class.getResourceAsStream(
- "/test-documents/testPDF_protected.pdf")) {
- parser.parse(stream, handler, metadata, context);
- }
+ r = getXML("testPDF_protected.pdf", new AutoDetectParser(), metadata, context);
+ metadata = r.metadata;
assertEquals("true", metadata.get("pdf:encrypted"));
assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
@@ -234,7 +205,6 @@ public class PDFParserTest extends TikaTest {
assertContains("In many important respects", content);
//now test wrong password
- handler = new BodyContentHandler();
metadata = new Metadata();
context = new ParseContext();
context.set(PasswordProvider.class, new PasswordProvider() {
@@ -244,23 +214,20 @@ public class PDFParserTest extends TikaTest {
});
boolean ex = false;
- try (InputStream stream = PDFParserTest.class.getResourceAsStream(
- "/test-documents/testPDF_protected.pdf")) {
- parser.parse(stream, handler, metadata, context);
+ try {
+ r = getXML("testPDF_protected.pdf", new AutoDetectParser(), metadata, context);
} catch (EncryptedDocumentException e) {
ex = true;
}
- content = handler.toString();
+ content = r.xml;
assertTrue("encryption exception", ex);
assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
assertEquals("true", metadata.get("pdf:encrypted"));
//pdf:encrypted, X-Parsed-By and Content-Type
assertEquals("very little metadata should be parsed", 3, metadata.names().length);
- assertEquals(0, content.length());
//now test wrong password with non sequential parser
- handler = new BodyContentHandler();
metadata = new Metadata();
context = new ParseContext();
context.set(PasswordProvider.class, new PasswordProvider() {
@@ -272,22 +239,21 @@ public class PDFParserTest extends TikaTest {
config.setUseNonSequentialParser(true);
context.set(PDFParserConfig.class, config);
- ;
ex = false;
- try (InputStream stream = PDFParserTest.class.getResourceAsStream(
- "/test-documents/testPDF_protected.pdf")) {
- parser.parse(stream, handler, metadata, context);
+ try {
+ r = getXML("testPDF_protected.pdf", new AutoDetectParser(), metadata, context);
} catch (EncryptedDocumentException e) {
ex = true;
}
- content = handler.toString();
+
+ content = r.xml;
assertTrue("encryption exception", ex);
assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
assertEquals("true", metadata.get("pdf:encrypted"));
//pdf:encrypted, X-Parsed-By and Content-Type
assertEquals("very little metadata should be parsed", 3, metadata.names().length);
- assertEquals(0, content.length());
+
}
@Test
@@ -619,6 +585,7 @@ public class PDFParserTest extends TikaTest {
* TODO: more testing
*/
@Test
+ @Ignore("this will be going away as soon as we upgrade to 2.0")
public void testSequentialParser() throws Exception {
Parser sequentialParser = new AutoDetectParser();
@@ -745,13 +712,13 @@ public class PDFParserTest extends TikaTest {
//The current test doc does not contain any content in the signature area.
//This just tests that a RuntimeException is not thrown.
//TODO: find a better test file for this issue.
- String xml = getXML("/testPDF_acroform3.pdf").xml;
+ String xml = getXML("testPDF_acroform3.pdf").xml;
assertTrue("found", (xml.contains("<li>aTextField: TIKA-1226</li>")));
}
@Test // TIKA-1228, TIKA-1268
public void testEmbeddedFilesInChildren() throws Exception {
- String xml = getXML("/testPDF_childAttachments.pdf").xml;
+ String xml = getXML("testPDF_childAttachments.pdf").xml;
//"regressiveness" exists only in Unit10.doc not in the container pdf document
assertTrue(xml.contains("regressiveness"));
@@ -785,7 +752,7 @@ public class PDFParserTest extends TikaTest {
@Test
public void testEmbeddedFilesInAnnotations() throws Exception {
- String xml = getXML("/testPDFFileEmbInAnnotation.pdf").xml;
+ String xml = getXML("testPDFFileEmbInAnnotation.pdf").xml;
assertTrue(xml.contains("This is a Excel"));
}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/isatab/ISArchiveParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/isatab/ISArchiveParser.java b/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/isatab/ISArchiveParser.java
index 4398999..aa70106 100644
--- a/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/isatab/ISArchiveParser.java
+++ b/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/isatab/ISArchiveParser.java
@@ -114,7 +114,7 @@ public class ISArchiveParser implements Parser {
InputStream stream = TikaInputStream.get(new File(this.location + investigation));
ISATabUtils.parseInvestigation(stream, xhtml, metadata, context, this.studyFileName);
-
+ stream.close();
xhtml.element("h1", "INVESTIGATION " + metadata.get("Investigation Identifier"));
}
@@ -130,6 +130,7 @@ public class ISArchiveParser implements Parser {
xhtml.element("h3", "ASSAY " + assayFileName);
InputStream stream = TikaInputStream.get(new File(this.location + assayFileName));
ISATabUtils.parseAssay(stream, xhtml, metadata, context);
+ stream.close();
xhtml.endElement("div");
}
}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/netcdf/NetCDFParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/netcdf/NetCDFParser.java b/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/netcdf/NetCDFParser.java
index 6a63eb4..ddbca81 100644
--- a/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/netcdf/NetCDFParser.java
+++ b/tika-parser-modules/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/netcdf/NetCDFParser.java
@@ -21,11 +21,10 @@ package org.apache.tika.parser.netcdf;
import java.io.IOException;
import java.io.InputStream;
import java.util.Collections;
-import java.util.Set;
import java.util.List;
+import java.util.Set;
import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
@@ -37,11 +36,10 @@ import org.apache.tika.parser.Parser;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
-
import ucar.nc2.Attribute;
+import ucar.nc2.Dimension;
import ucar.nc2.NetcdfFile;
import ucar.nc2.Variable;
-import ucar.nc2.Dimension;
/**
* A {@link Parser} for <a
@@ -82,9 +80,10 @@ public class NetCDFParser extends AbstractParser {
Metadata metadata, ParseContext context) throws IOException,
SAXException, TikaException {
- TikaInputStream tis = TikaInputStream.get(stream, new TemporaryResources());
+ TikaInputStream tis = TikaInputStream.get(stream);
+ NetcdfFile ncFile = null;
try {
- NetcdfFile ncFile = NetcdfFile.open(tis.getFile().getAbsolutePath());
+ ncFile = NetcdfFile.open(tis.getFile().getAbsolutePath());
metadata.set("File-Type-Description", ncFile.getFileTypeDescription());
// first parse out the set of global attributes
for (Attribute attr : ncFile.getGlobalAttributes()) {
@@ -129,9 +128,13 @@ public class NetCDFParser extends AbstractParser {
xhtml.endElement("ul");
xhtml.endDocument();
-
+ ncFile.close();
} catch (IOException e) {
throw new TikaException("NetCDF parse error", e);
+ } finally {
+ if (ncFile != null) {
+ ncFile.close();
+ }
}
}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/dif/DIFParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/dif/DIFParserTest.java b/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/dif/DIFParserTest.java
index ef31abc..373da0d 100644
--- a/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/dif/DIFParserTest.java
+++ b/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/dif/DIFParserTest.java
@@ -18,37 +18,22 @@ package org.apache.tika.parser.dif;
import static org.junit.Assert.assertEquals;
-import java.io.InputStream;
-
import org.apache.tika.TikaTest;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.BodyContentHandler;
import org.junit.Test;
-import org.xml.sax.ContentHandler;
public class DIFParserTest extends TikaTest {
@Test
public void testDifMetadata() throws Exception {
- Parser parser = new DIFParser();
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
- try (InputStream stream = DIFParser.class.getResourceAsStream(
- "/test-documents/Zamora2010.dif")) {
- parser.parse(stream, handler, metadata, new ParseContext());
- }
-
- assertEquals(metadata.get("DIF-Entry_ID"),"00794186-48f9-11e3-9dcb-00c0f03d5b7c");
- assertEquals(metadata.get("DIF-Metadata_Name"),"ACADIS IDN DIF");
+ XMLResult r = getXML("Zamora2010.dif", new DIFParser());
+ assertEquals(r.metadata.get("DIF-Entry_ID"),"00794186-48f9-11e3-9dcb-00c0f03d5b7c");
+ assertEquals(r.metadata.get("DIF-Metadata_Name"),"ACADIS IDN DIF");
- String content = handler.toString();
+ String content = r.xml;
assertContains("Title: Zamora 2010 Using Sediment Geochemistry", content);
- assertContains("Southernmost_Latitude : 78.833", content);
- assertContains("Northernmost_Latitude : 79.016", content);
- assertContains("Westernmost_Longitude : 11.64", content);
- assertContains("Easternmost_Longitude : 13.34", content);
+ assertContains("Southernmost_Latitude : </td><td>78.833", content);
+ assertContains("Northernmost_Latitude : </td><td>79.016", content);
+ assertContains("Westernmost_Longitude : </td><td>11.64", content);
+ assertContains("Easternmost_Longitude : </td><td>13.34", content);
}
}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/envi/EnviHeaderParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/envi/EnviHeaderParserTest.java b/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/envi/EnviHeaderParserTest.java
index 3603280..0bf67fb 100644
--- a/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/envi/EnviHeaderParserTest.java
+++ b/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/envi/EnviHeaderParserTest.java
@@ -17,44 +17,26 @@
package org.apache.tika.parser.envi;
-import static org.apache.tika.TikaTest.assertContains;
-import static org.junit.Assert.assertNotNull;
-
-import java.io.InputStream;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.ToXMLContentHandler;
+import org.apache.tika.TikaTest;
import org.junit.Test;
/**
* Test cases to exercise the {@link EnviHeaderParser}.
*/
-public class EnviHeaderParserTest {
+public class EnviHeaderParserTest extends TikaTest {
@Test
public void testParseGlobalMetadata() throws Exception {
if (System.getProperty("java.version").startsWith("1.5")) {
return;
}
- Parser parser = new EnviHeaderParser();
- ToXMLContentHandler handler = new ToXMLContentHandler();
- Metadata metadata = new Metadata();
-
- try (InputStream stream = EnviHeaderParser.class.getResourceAsStream(
- "/test-documents/envi_test_header.hdr")) {
- assertNotNull("Test ENVI file not found", stream);
- parser.parse(stream, handler, metadata, new ParseContext());
- }
-
+ XMLResult r = getXML("envi_test_header.hdr", new EnviHeaderParser());
// Check content of test file
- String content = handler.toString();
- assertContains("<body><p>ENVI</p>", content);
- assertContains("<p>samples = 2400</p>", content);
- assertContains("<p>lines = 2400</p>", content);
- assertContains("<p>map info = {Sinusoidal, 1.5000, 1.5000, -10007091.3643, 5559289.2856, 4.6331271653e+02, 4.6331271653e+02, , units=Meters}</p>", content);
- assertContains("content=\"application/envi.hdr\"", content);
- assertContains("projection info = {16, 6371007.2, 0.000000, 0.0, 0.0, Sinusoidal, units=Meters}", content);
+ assertContains("<body><p>ENVI</p>", r.xml);
+ assertContains("<p>samples = 2400</p>", r.xml);
+ assertContains("<p>lines = 2400</p>", r.xml);
+ assertContains("<p>map info = {Sinusoidal, 1.5000, 1.5000, -10007091.3643, 5559289.2856, 4.6331271653e+02, 4.6331271653e+02, , units=Meters}</p>", r.xml);
+ assertContains("content=\"application/envi.hdr\"", r.xml);
+ assertContains("projection info = {16, 6371007.2, 0.000000, 0.0, 0.0, Sinusoidal, units=Meters}", r.xml);
}
}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/gdal/TestGDALParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/gdal/TestGDALParser.java b/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/gdal/TestGDALParser.java
index cf37989..5d4c58c 100644
--- a/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/gdal/TestGDALParser.java
+++ b/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/gdal/TestGDALParser.java
@@ -49,7 +49,7 @@ public class TestGDALParser extends TikaTest {
}
@Test
- public void testParseBasicInfo() {
+ public void testParseBasicInfo() throws Exception {
assumeTrue(canRun());
final String expectedDriver = "netCDF/Network Common Data Format";
final String expectedUpperRight = "512.0, 0.0";
@@ -59,18 +59,9 @@ public class TestGDALParser extends TikaTest {
final String expectedCoordinateSystem = "`'";
final String expectedSize = "512, 512";
- GDALParser parser = new GDALParser();
- InputStream stream = TestGDALParser.class
- .getResourceAsStream("/test-documents/sresa1b_ncar_ccsm3_0_run1_200001.nc");
- Metadata met = new Metadata();
- BodyContentHandler handler = new BodyContentHandler();
- try {
- parser.parse(stream, handler, met, new ParseContext());
- } catch (Exception e) {
- e.printStackTrace();
- fail(e.getMessage());
- }
+ XMLResult r = getXML("sresa1b_ncar_ccsm3_0_run1_200001.nc", new GDALParser());
+ Metadata met = r.metadata;
assertNotNull(met);
assertNotNull(met.get("Driver"));
assertEquals(expectedDriver, met.get("Driver"));
@@ -91,7 +82,7 @@ public class TestGDALParser extends TikaTest {
}
@Test
- public void testParseMetadata() {
+ public void testParseMetadata() throws Exception {
assumeTrue(canRun());
final String expectedNcInst = "NCAR (National Center for Atmospheric Research, Boulder, CO, USA)";
final String expectedModelNameEnglish = "NCAR CCSM";
@@ -102,14 +93,10 @@ public class TestGDALParser extends TikaTest {
final String expectedSub8Name = "\":ua";
final String expectedSub8Desc = "[1x17x128x256] eastward_wind (32-bit floating-point)";
- GDALParser parser = new GDALParser();
- InputStream stream = TestGDALParser.class
- .getResourceAsStream("/test-documents/sresa1b_ncar_ccsm3_0_run1_200001.nc");
- Metadata met = new Metadata();
- BodyContentHandler handler = new BodyContentHandler();
- try {
- parser.parse(stream, handler, met, new ParseContext());
- assertNotNull(met);
+ XMLResult r = getXML("sresa1b_ncar_ccsm3_0_run1_200001.nc");
+ Metadata met = r.metadata;
+
+ assertNotNull(met);
assertNotNull(met.get("NC_GLOBAL#institution"));
assertEquals(expectedNcInst, met.get("NC_GLOBAL#institution"));
assertNotNull(met.get("NC_GLOBAL#model_name_english"));
@@ -129,14 +116,11 @@ public class TestGDALParser extends TikaTest {
assertTrue(met.get("SUBDATASET_8_NAME").endsWith(expectedSub8Name));
assertNotNull(met.get("SUBDATASET_8_DESC"));
assertEquals(expectedSub8Desc, met.get("SUBDATASET_8_DESC"));
- } catch (Exception e) {
- e.printStackTrace();
- fail(e.getMessage());
- }
}
@Test
public void testParseFITS() {
+ //TODO: fix this...add spooling to tmp file to TikaTest
String fitsFilename = "/test-documents/WFPC2u5780205r_c0fx.fits";
assumeTrue(canRun());
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/geo/topic/GeoParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/geo/topic/GeoParserTest.java b/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/geo/topic/GeoParserTest.java
index 0d6fb74..0fbe7b3 100644
--- a/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/geo/topic/GeoParserTest.java
+++ b/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/geo/topic/GeoParserTest.java
@@ -21,25 +21,30 @@ import static java.nio.charset.StandardCharsets.UTF_8;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull;
-import org.junit.Test;
+
import java.io.ByteArrayInputStream;
import java.io.IOException;
-import java.io.InputStream;
import java.io.UnsupportedEncodingException;
+import org.apache.tika.TikaTest;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
import org.xml.sax.SAXException;
-public class GeoParserTest {
+public class GeoParserTest extends TikaTest {
private Parser geoparser = new GeoParser();
@Test
- public void testFunctions() throws UnsupportedEncodingException,
- IOException, SAXException, TikaException {
+ public void testFunctions() throws Exception {
+
+ /* if it's not available no tests to run */
+ if (!((GeoParser) geoparser).isAvailable())
+ return;
+
String text = "The millennial-scale cooling trend that followed the HTM coincides with the decrease in China "
+ "summer insolation driven by slow changes in Earth's orbit. Despite the nearly linear forcing, the transition from the HTM to "
+ "the Little Ice Age (1500-1900 AD) was neither gradual nor uniform. To understand how feedbacks and perturbations result in rapid changes, "
@@ -53,13 +58,7 @@ public class GeoParserTest {
GeoParserConfig config = new GeoParserConfig();
context.set(GeoParserConfig.class, config);
- InputStream s = new ByteArrayInputStream(text.getBytes(UTF_8));
- /* if it's not available no tests to run */
- if (!((GeoParser) geoparser).isAvailable())
- return;
-
- geoparser.parse(s, new BodyContentHandler(), metadata, context);
-
+ XMLResult r = getXML(new ByteArrayInputStream(text.getBytes(UTF_8)), geoparser, metadata, context);
assertNotNull(metadata.get("Geographic_NAME"));
assertNotNull(metadata.get("Geographic_LONGITUDE"));
assertNotNull(metadata.get("Geographic_LATITUDE"));
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/geoinfo/GeographicInformationParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/geoinfo/GeographicInformationParserTest.java b/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/geoinfo/GeographicInformationParserTest.java
index acd0cb2..442b080 100644
--- a/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/geoinfo/GeographicInformationParserTest.java
+++ b/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/geoinfo/GeographicInformationParserTest.java
@@ -17,45 +17,29 @@
package org.apache.tika.parser.geoinfo;
+import static org.junit.Assert.assertEquals;
+
+import org.apache.tika.TikaTest;
import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.parser.geoinfo.GeographicInformationParser;
-import org.apache.tika.sax.BodyContentHandler;
import org.junit.Test;
-import org.xml.sax.ContentHandler;
-import java.io.*;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
-public class GeographicInformationParserTest {
+public class GeographicInformationParserTest extends TikaTest {
@Test
- public void testISO19139() throws Exception{
- String path ="/test-documents/sampleFile.iso19139";
-
- Metadata metadata = new Metadata();
- Parser parser=new org.apache.tika.parser.geoinfo.GeographicInformationParser();
- ContentHandler contentHandler=new BodyContentHandler();
- ParseContext parseContext=new ParseContext();
-
- InputStream inputStream = GeographicInformationParser.class.getResourceAsStream(path);
-
- parser.parse(inputStream, contentHandler, metadata, parseContext);
-
- assertEquals("text/iso19139+xml", metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("UTF-8", metadata.get("CharacterSet"));
- assertEquals("https", metadata.get("TransferOptionsOnlineProtocol "));
- assertEquals("browser", metadata.get("TransferOptionsOnlineProfile "));
- assertEquals("Barrow Atqasuk ARCSS Plant", metadata.get("TransferOptionsOnlineName "));
-
- String content = contentHandler.toString();
- assertTrue(content.contains("Barrow Atqasuk ARCSS Plant"));
- assertTrue(content.contains("GeographicElementWestBoundLatitude -157.24"));
- assertTrue(content.contains("GeographicElementEastBoundLatitude -156.4"));
- assertTrue(content.contains("GeographicElementNorthBoundLatitude 71.18"));
- assertTrue(content.contains("GeographicElementSouthBoundLatitude 70.27"));
+ public void testISO19139() throws Exception {
+ XMLResult r = getXML("sampleFile.iso19139", new GeographicInformationParser());
+ assertEquals("text/iso19139+xml", r.metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("UTF-8", r.metadata.get("CharacterSet"));
+ assertEquals("https", r.metadata.get("TransferOptionsOnlineProtocol "));
+ assertEquals("browser", r.metadata.get("TransferOptionsOnlineProfile "));
+ assertEquals("Barrow Atqasuk ARCSS Plant", r.metadata.get("TransferOptionsOnlineName "));
+
+ assertContains("Barrow Atqasuk ARCSS Plant", r.xml);
+ assertContains("<td>GeographicElementWestBoundLatitude</td>\t<td>-157.24</td>", r.xml);
+ assertContains("<td>GeographicElementEastBoundLatitude</td>\t<td>-156.4</td>", r.xml);
+ assertContains("<td>GeographicElementNorthBoundLatitude</td>\t<td>71.18</td>", r.xml);
+ assertContains("<td>GeographicElementSouthBoundLatitude</td>\t<td>70.27</td>", r.xml);
}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/grib/GribParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/grib/GribParserTest.java b/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/grib/GribParserTest.java
index 6ccf6af..622d511 100644
--- a/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/grib/GribParserTest.java
+++ b/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/grib/GribParserTest.java
@@ -18,36 +18,24 @@
package org.apache.tika.parser.grib;
//JDK imports
-import static org.junit.Assert.*;
-import java.io.InputStream;
+import static org.junit.Assert.assertNotNull;
-//TIKA imports
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.TikaTest;
import org.junit.Test;
-import org.xml.sax.ContentHandler;
-import java.io.File;
+
+//TIKA imports
/**
* Test cases to exercise the {@link org.apache.tika.parser.grib.GribParser}.
*/
-public class GribParserTest {
+public class GribParserTest extends TikaTest {
@Test
public void testParseGlobalMetadata() throws Exception {
- Parser parser = new GribParser();
- Metadata metadata = new Metadata();
- ContentHandler handler = new BodyContentHandler();
- try (InputStream stream = GribParser.class.getResourceAsStream("/test-documents/gdas1.forecmwf.2014062612.grib2")) {
- parser.parse(stream, handler, metadata, new ParseContext());
- }
- assertNotNull(metadata);
- String content = handler.toString();
- assertTrue(content.contains("dimensions:"));
- assertTrue(content.contains("variables:"));
+ XMLResult r = getXML("gdas1.forecmwf.2014062612.grib2", new GribParser());
+ assertNotNull(r.metadata);
+ assertContains("dimensions:", r.xml);
+ assertContains("variables:", r.xml);
}
}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/hdf/HDFParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/hdf/HDFParserTest.java b/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/hdf/HDFParserTest.java
index 9bda875..1ee4dc7 100644
--- a/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/hdf/HDFParserTest.java
+++ b/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/hdf/HDFParserTest.java
@@ -17,39 +17,27 @@
package org.apache.tika.parser.hdf;
//JDK imports
+
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
-import java.io.InputStream;
-
-
-
+import org.apache.tika.TikaTest;
+import org.junit.Test;
//TIKA imports
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.parser.hdf.HDFParser;
-import org.apache.tika.sax.BodyContentHandler;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
/**
*
* Test suite for the {@link HDFParser}.
*
*/
-public class HDFParserTest {
+public class HDFParserTest extends TikaTest {
@Test
public void testParseGlobalMetadata() throws Exception {
if(System.getProperty("java.version").startsWith("1.5")) {
return;
}
- Parser parser = new HDFParser();
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
/*
* this is a publicly available HDF5 file from the MLS mission:
*
@@ -57,12 +45,10 @@ public class HDFParserTest {
* ftp://acdisc.gsfc.nasa.gov/data/s4pa///Aura_MLS_Level2/ML2O3.002//2009
* /MLS-Aura_L2GP-O3_v02-23-c01_2009d122.he5
*/
- try (InputStream stream = HDFParser.class.getResourceAsStream("/test-documents/test.he5")) {
- parser.parse(stream, handler, metadata, new ParseContext());
- }
- assertNotNull(metadata);
- assertEquals("5", metadata.get("GranuleMonth"));
+ XMLResult r = getXML("test.he5", new HDFParser());
+ assertNotNull(r.metadata);
+ assertEquals("5", r.metadata.get("GranuleMonth"));
}
@Test
@@ -70,23 +56,17 @@ public class HDFParserTest {
if(System.getProperty("java.version").startsWith("1.5")) {
return;
}
- Parser parser = new HDFParser();
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
/*
* this is a publicly available HDF4 file from the HD4 examples:
*
* http://www.hdfgroup.org/training/hdf4_chunking/Chunkit/bin/input54kmdata.hdf
*/
- try (InputStream stream = HDFParser.class.getResourceAsStream("/test-documents/test.hdf")) {
- parser.parse(stream, handler, metadata, new ParseContext());
- }
-
- assertNotNull(metadata);
- assertEquals("Direct read of HDF4 file through CDM library", metadata.get("_History"));
- assertEquals("Ascending", metadata.get("Pass"));
+ XMLResult r = getXML("test.hdf", new HDFParser());
+ assertNotNull(r.metadata);
+ assertEquals("Direct read of HDF4 file through CDM library", r.metadata.get("_History"));
+ assertEquals("Ascending", r.metadata.get("Pass"));
assertEquals("Hierarchical Data Format, version 4",
- metadata.get("File-Type-Description"));
+ r.metadata.get("File-Type-Description"));
}
}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/isatab/ISArchiveParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/isatab/ISArchiveParserTest.java b/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/isatab/ISArchiveParserTest.java
index ce4299c..fcc71f5 100644
--- a/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/isatab/ISArchiveParserTest.java
+++ b/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/isatab/ISArchiveParserTest.java
@@ -17,44 +17,72 @@
package org.apache.tika.parser.isatab;
-import static org.junit.Assert.*;
+import static org.junit.Assert.assertEquals;
-import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.AutoDetectParser;
-import org.apache.tika.parser.ParseContext;
+import org.apache.tika.TikaTest;
import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.BodyContentHandler;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
import org.junit.Test;
-import org.xml.sax.ContentHandler;
-public class ISArchiveParserTest {
+public class ISArchiveParserTest extends TikaTest {
+
+ static Path tmpDir;
+ final static String ISA_SUBDIR = "testISATab_BII-I-1";
+ final static String[] ISA_FILES = {
+ "a_bii-s-2_metabolite profiling_NMR spectroscopy.txt",
+ "a_metabolome.txt",
+ "a_microarray.txt",
+ "a_proteome.txt",
+ "a_transcriptome.txt",
+ "i_investigation.txt"
+ };
+
+ @BeforeClass
+ public static void createTempDir() throws Exception {
+ tmpDir = Files.createTempDirectory(ISA_SUBDIR);
+ for (String isaFile : ISA_FILES) {
+ String isaPath = "test-documents/"+ISA_SUBDIR+"/"+isaFile;
+ Files.copy(ISArchiveParserTest.class.getClassLoader().getResourceAsStream(isaPath),
+ tmpDir.resolve(isaFile));
+ }
+ }
+ @AfterClass
+ public static void deleteTempDir() throws Exception {
+ for (String isaFile : ISA_FILES) {
+ Path p = tmpDir.resolve(isaFile);
+ Files.delete(p);
+ }
+ Files.delete(tmpDir);
+ }
@Test
public void testParseArchive() throws Exception {
- String path = "/test-documents/testISATab_BII-I-1/s_BII-S-1.txt";
-
- Parser parser = new ISArchiveParser(ISArchiveParserTest.class.getResource("/test-documents/testISATab_BII-I-1/").toURI().getPath());
- //Parser parser = new AutoDetectParser();
-
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
- ParseContext context = new ParseContext();
- try (InputStream stream = ISArchiveParserTest.class.getResourceAsStream(path)) {
- parser.parse(stream, handler, metadata, context);
- }
-
+
+ Parser parser = new ISArchiveParser(tmpDir.toString());
+ XMLResult r = getXML(ISA_SUBDIR+"/s_BII-S-1.txt",
+ parser);
+
// INVESTIGATION
- assertEquals("Invalid Investigation Identifier", "BII-I-1", metadata.get("Investigation Identifier"));
- assertEquals("Invalid Investigation Title", "Growth control of the eukaryote cell: a systems biology study in yeast", metadata.get("Investigation Title"));
+ assertEquals("Invalid Investigation Identifier", "BII-I-1",
+ r.metadata.get("Investigation Identifier"));
+ assertEquals("Invalid Investigation Title",
+ "Growth control of the eukaryote cell: a systems biology study in yeast",
+ r.metadata.get("Investigation Title"));
// INVESTIGATION PUBLICATIONS
- assertEquals("Invalid Investigation PubMed ID", "17439666", metadata.get("Investigation PubMed ID"));
- assertEquals("Invalid Investigation Publication DOI", "doi:10.1186/jbiol54", metadata.get("Investigation Publication DOI"));
+ assertEquals("Invalid Investigation PubMed ID", "17439666",
+ r.metadata.get("Investigation PubMed ID"));
+ assertEquals("Invalid Investigation Publication DOI", "doi:10.1186/jbiol54",
+ r.metadata.get("Investigation Publication DOI"));
// INVESTIGATION CONTACTS
- assertEquals("Invalid Investigation Person Last Name", "Oliver", metadata.get("Investigation Person Last Name"));
- assertEquals("Invalid Investigation Person First Name", "Stephen", metadata.get("Investigation Person First Name"));
+ assertEquals("Invalid Investigation Person Last Name", "Oliver",
+ r.metadata.get("Investigation Person Last Name"));
+ assertEquals("Invalid Investigation Person First Name", "Stephen",
+ r.metadata.get("Investigation Person First Name"));
}
}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/mat/MatParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/mat/MatParserTest.java b/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/mat/MatParserTest.java
index 0b31fea..aee5d62 100644
--- a/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/mat/MatParserTest.java
+++ b/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/mat/MatParserTest.java
@@ -16,65 +16,39 @@
*/
package org.apache.tika.parser.mat;
-import static org.apache.tika.TikaTest.assertContains;
import static org.junit.Assert.assertEquals;
-import java.io.InputStream;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.AutoDetectParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.ToXMLContentHandler;
+import org.apache.tika.TikaTest;
import org.junit.Test;
/**
* Test cases to exercise the {@link MatParser}.
*/
-public class MatParserTest {
+public class MatParserTest extends TikaTest {
@Test
public void testParser() throws Exception {
- AutoDetectParser parser = new AutoDetectParser();
- ToXMLContentHandler handler = new ToXMLContentHandler();
- Metadata metadata = new Metadata();
- String path = "/test-documents/breidamerkurjokull_radar_profiles_2009.mat";
-
- try (InputStream stream = MatParser.class.getResourceAsStream(path)) {
- parser.parse(stream, handler, metadata, new ParseContext());
- }
+ XMLResult r = getXML("breidamerkurjokull_radar_profiles_2009.mat");
// Check Metadata
- assertEquals("PCWIN64", metadata.get("platform"));
- assertEquals("MATLAB 5.0 MAT-file", metadata.get("fileType"));
- assertEquals("IM", metadata.get("endian"));
- assertEquals("Thu Feb 21 15:52:49 2013", metadata.get("createdOn"));
+ assertEquals("PCWIN64", r.metadata.get("platform"));
+ assertEquals("MATLAB 5.0 MAT-file", r.metadata.get("fileType"));
+ assertEquals("IM", r.metadata.get("endian"));
+ assertEquals("Thu Feb 21 15:52:49 2013", r.metadata.get("createdOn"));
// Check Content
- String content = handler.toString();
-
- assertContains("<li>[1x909 double array]</li>", content);
- assertContains("<p>c1:[1x1 struct array]</p>", content);
- assertContains("<li>[1024x1 double array]</li>", content);
- assertContains("<p>b1:[1x1 struct array]</p>", content);
- assertContains("<p>a1:[1x1 struct array]</p>", content);
- assertContains("<li>[1024x1261 double array]</li>", content);
- assertContains("<li>[1x1 double array]</li>", content);
- assertContains("</body></html>", content);
+ assertContains("<li>[1x909 double array]</li>", r.xml);
+ assertContains("<p>c1:[1x1 struct array]</p>", r.xml);
+ assertContains("<li>[1024x1 double array]</li>", r.xml);
+ assertContains("<p>b1:[1x1 struct array]</p>", r.xml);
+ assertContains("<p>a1:[1x1 struct array]</p>", r.xml);
+ assertContains("<li>[1024x1261 double array]</li>", r.xml);
+ assertContains("<li>[1x1 double array]</li>", r.xml);
+ assertContains("</body></html>", r.xml);
}
@Test
public void testParserForText() throws Exception {
- Parser parser = new MatParser();
- ToXMLContentHandler handler = new ToXMLContentHandler();
- Metadata metadata = new Metadata();
- String path = "/test-documents/test_mat_text.mat";
-
- try (InputStream stream = MatParser.class.getResourceAsStream(path)) {
- parser.parse(stream, handler, metadata, new ParseContext());
- }
-
- // Check Content
- String content = handler.toString();
- assertContains("<p>double:[2x2 double array]</p>", content);
+ XMLResult r = getXML("test_mat_text.mat", new MatParser());
+ assertContains("<p>double:[2x2 double array]</p>", r.xml);
}
}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/netcdf/NetCDFParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/netcdf/NetCDFParserTest.java b/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/netcdf/NetCDFParserTest.java
index 3cc1df8..7d0f2e8 100644
--- a/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/netcdf/NetCDFParserTest.java
+++ b/tika-parser-modules/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/netcdf/NetCDFParserTest.java
@@ -17,54 +17,42 @@
package org.apache.tika.parser.netcdf;
//JDK imports
-import java.io.InputStream;
-//TIKA imports
+import static org.junit.Assert.assertEquals;
+
+import org.apache.tika.TikaTest;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.BodyContentHandler;
import org.junit.Test;
-import org.xml.sax.ContentHandler;
-import static org.apache.tika.TikaTest.assertContains;
-import static org.junit.Assert.assertEquals;
+//TIKA imports
/**
* Test cases to exercise the {@link NetCDFParser}.
*/
-public class NetCDFParserTest {
+public class NetCDFParserTest extends TikaTest {
@Test
public void testParseGlobalMetadata() throws Exception {
- Parser parser = new NetCDFParser();
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
- try (InputStream stream = NetCDFParser.class
- .getResourceAsStream("/test-documents/sresa1b_ncar_ccsm3_0_run1_200001.nc")) {
- parser.parse(stream, handler, metadata, new ParseContext());
- }
- assertEquals(metadata.get(TikaCoreProperties.TITLE),
+ XMLResult r = getXML("sresa1b_ncar_ccsm3_0_run1_200001.nc", new NetCDFParser());
+ assertEquals(r.metadata.get(TikaCoreProperties.TITLE),
"model output prepared for IPCC AR4");
- assertEquals(metadata.get(Metadata.CONTACT), "ccsm@ucar.edu");
- assertEquals(metadata.get(Metadata.PROJECT_ID),
+ assertEquals(r.metadata.get(Metadata.CONTACT), "ccsm@ucar.edu");
+ assertEquals(r.metadata.get(Metadata.PROJECT_ID),
"IPCC Fourth Assessment");
- assertEquals(metadata.get(Metadata.CONVENTIONS), "CF-1.0");
- assertEquals(metadata.get(Metadata.REALIZATION), "1");
- assertEquals(metadata.get(Metadata.EXPERIMENT_ID),
+ assertEquals(r.metadata.get(Metadata.CONVENTIONS), "CF-1.0");
+ assertEquals(r.metadata.get(Metadata.REALIZATION), "1");
+ assertEquals(r.metadata.get(Metadata.EXPERIMENT_ID),
"720 ppm stabilization experiment (SRESA1B)");
- assertEquals(metadata.get("File-Type-Description"),
+ assertEquals(r.metadata.get("File-Type-Description"),
"NetCDF-3/CDM");
- String content = handler.toString();
- assertContains("long_name = \"Surface area\"", content);
- assertContains("float area(lat=128, lon=256)", content);
- assertContains("float lat(lat=128)", content);
- assertContains("double lat_bnds(lat=128, bnds=2)", content);
- assertContains("double lon_bnds(lon=256, bnds=2)", content);
+ assertContains("long_name = \"Surface area\"", r.xml);
+ assertContains("float area(lat=128, lon=256)", r.xml);
+ assertContains("float lat(lat=128)", r.xml);
+ assertContains("double lat_bnds(lat=128, bnds=2)", r.xml);
+ assertContains("double lon_bnds(lon=256, bnds=2)", r.xml);
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/strings/StringsParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/strings/StringsParserTest.java b/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/strings/StringsParserTest.java
index 5f197d2..1c5b2db 100644
--- a/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/strings/StringsParserTest.java
+++ b/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/strings/StringsParserTest.java
@@ -14,21 +14,19 @@
package org.apache.tika.parser.strings;
import static org.apache.tika.parser.strings.StringsParser.getStringsProg;
-import static org.junit.Assert.*;
+import static org.junit.Assert.assertTrue;
import static org.junit.Assume.assumeTrue;
-import java.io.InputStream;
import java.util.Arrays;
+import org.apache.tika.TikaTest;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.external.ExternalParser;
-import org.apache.tika.sax.BodyContentHandler;
import org.junit.Test;
-import org.xml.sax.ContentHandler;
-public class StringsParserTest {
+public class StringsParserTest extends TikaTest {
public static boolean canRun() {
StringsConfig config = new StringsConfig();
String[] checkCmd = {config.getStringsPath() + getStringsProg(), "--version"};
@@ -40,7 +38,7 @@ public class StringsParserTest {
public void testParse() throws Exception {
assumeTrue(canRun());
- String resource = "/test-documents/testOCTET_header.dbase3";
+ String resource = "testOCTET_header.dbase3";
String[] content = { "CLASSNO", "TITLE", "ITEMNO", "LISTNO", "LISTDATE" };
@@ -50,22 +48,15 @@ public class StringsParserTest {
FileConfig fileConfig = new FileConfig();
Parser parser = new StringsParser();
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
ParseContext context = new ParseContext();
context.set(StringsConfig.class, stringsConfig);
context.set(FileConfig.class, fileConfig);
-
- try (InputStream stream = StringsParserTest.class.getResourceAsStream(resource)) {
- parser.parse(stream, handler, metadata, context);
- } catch (Exception e) {
- e.printStackTrace();
- }
+ Metadata metadata = new Metadata();
+ XMLResult r = getXML(resource, parser, metadata, context);
// Content
for (String word : content) {
- assertTrue(handler.toString().contains(word));
+ assertTrue(r.xml.contains(word));
}
// Metadata
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java b/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java
index 050ef15..9064597 100644
--- a/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java
+++ b/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java
@@ -23,13 +23,14 @@ import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
+import org.apache.tika.TikaTest;
import org.junit.Test;
-public class CharsetDetectorTest {
+public class CharsetDetectorTest extends TikaTest {
@Test
public void testTagDropper() throws IOException {
- try (InputStream in = CharsetDetectorTest.class.getResourceAsStream("/test-documents/resume.html")) {
+ try (InputStream in = getTestDocumentAsStream("resume.html")) {
CharsetDetector detector = new CharsetDetector();
detector.enableInputFilter(true);
detector.setText(in);
@@ -52,7 +53,7 @@ public class CharsetDetectorTest {
@Test
public void testEmptyOrNullDeclaredCharset() throws IOException {
- try (InputStream in = CharsetDetectorTest.class.getResourceAsStream("/test-documents/resume.html")) {
+ try (InputStream in = getTestDocumentAsStream("resume.html")) {
CharsetDetector detector = new CharsetDetector();
Reader reader = detector.getReader(in, null);
assertTrue(reader.ready());
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java b/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
index 3de5eac..6d1c99a 100644
--- a/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
+++ b/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
@@ -18,13 +18,13 @@ package org.apache.tika.parser.txt;
import static java.nio.charset.StandardCharsets.ISO_8859_1;
import static java.nio.charset.StandardCharsets.UTF_8;
-import static org.apache.tika.TikaTest.assertContains;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNull;
import java.io.ByteArrayInputStream;
import java.io.StringWriter;
+import org.apache.tika.TikaTest;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
@@ -35,7 +35,7 @@ import org.junit.Test;
import org.xml.sax.ContentHandler;
import org.xml.sax.helpers.DefaultHandler;
-public class TXTParserTest {
+public class TXTParserTest extends TikaTest {
private Parser parser = new TXTParser();
@@ -233,39 +233,21 @@ public class TXTParserTest {
@Test
public void testCP866() throws Exception {
- Metadata metadata = new Metadata();
- StringWriter writer = new StringWriter();
- parser.parse(
- TXTParserTest.class.getResourceAsStream("/test-documents/russian.cp866.txt"),
- new WriteOutContentHandler(writer),
- metadata,
- new ParseContext());
-
- assertEquals("text/plain; charset=IBM866", metadata.get(Metadata.CONTENT_TYPE));
+ XMLResult r = getXML("russian.cp866.txt", parser);
+ assertEquals("text/plain; charset=IBM866", r.metadata.get(Metadata.CONTENT_TYPE));
}
@Test
public void testEBCDIC_CP500() throws Exception {
- Metadata metadata = new Metadata();
- StringWriter writer = new StringWriter();
- parser.parse(
- TXTParserTest.class.getResourceAsStream("/test-documents/english.cp500.txt"),
- new WriteOutContentHandler(writer),
- metadata,
- new ParseContext());
-
- assertEquals("text/plain; charset=IBM500", metadata.get(Metadata.CONTENT_TYPE));
+ XMLResult r = getXML("english.cp500.txt", parser);
+ assertEquals("text/plain; charset=IBM500", r.metadata.get(Metadata.CONTENT_TYPE));
// Additional check that it isn't too eager on short blocks of text
- metadata = new Metadata();
- writer = new StringWriter();
- parser.parse(
+ r = getXML(
new ByteArrayInputStream("<html><body>hello world</body></html>".getBytes(ISO_8859_1)),
- new WriteOutContentHandler(writer),
- metadata,
- new ParseContext());
+ parser, new Metadata());
- assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("text/plain; charset=ISO-8859-1", r.metadata.get(Metadata.CONTENT_TYPE));
}
/**
@@ -276,20 +258,17 @@ public class TXTParserTest {
@Test
public void testCharsetDetectionWithShortSnipet() throws Exception {
final String text = "Hello, World!";
-
- Metadata metadata = new Metadata();
- parser.parse(
- new ByteArrayInputStream(text.getBytes(UTF_8)),
- new BodyContentHandler(), metadata, new ParseContext());
- assertEquals("text/plain; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
+ XMLResult r = getXML(
+ new ByteArrayInputStream(text.getBytes(UTF_8)), parser, new Metadata());
+ assertEquals("text/plain; charset=ISO-8859-1", r.metadata.get(Metadata.CONTENT_TYPE));
// Now verify that if we tell the parser the encoding is UTF-8, that's what
// we get back (see TIKA-868)
- metadata.set(Metadata.CONTENT_TYPE, "application/binary; charset=UTF-8");
+ r.metadata.set(Metadata.CONTENT_TYPE, "application/binary; charset=UTF-8");
parser.parse(
new ByteArrayInputStream(text.getBytes(UTF_8)),
- new BodyContentHandler(), metadata, new ParseContext());
- assertEquals("text/plain; charset=UTF-8", metadata.get(Metadata.CONTENT_TYPE));
+ new BodyContentHandler(), r.metadata, new ParseContext());
+ assertEquals("text/plain; charset=UTF-8", r.metadata.get(Metadata.CONTENT_TYPE));
}
}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java b/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java
index 22094f4..665151d 100644
--- a/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java
+++ b/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java
@@ -20,26 +20,17 @@ import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
-import java.io.InputStream;
-
import org.apache.tika.TikaTest;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.sax.BodyContentHandler;
import org.junit.Test;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.helpers.DefaultHandler;
public class DcXMLParserTest extends TikaTest {
@Test
public void testXMLParserAsciiChars() throws Exception {
- try (InputStream input = DcXMLParserTest.class.getResourceAsStream(
- "/test-documents/testXML.xml")) {
- Metadata metadata = new Metadata();
- ContentHandler handler = new BodyContentHandler();
- new DcXMLParser().parse(input, handler, metadata);
-
+ XMLResult result = getXML("testXML.xml", new DcXMLParser());
+ Metadata metadata = result.metadata;
assertEquals(
"application/xml",
metadata.get(Metadata.CONTENT_TYPE));
@@ -74,22 +65,17 @@ public class DcXMLParserTest extends TikaTest {
assertEquals("Fr", metadata.get(TikaCoreProperties.LANGUAGE));
assertTrue(metadata.get(TikaCoreProperties.RIGHTS).contains("testing chars"));
- String content = handler.toString();
- assertContains("Tika test document", content);
+ assertContains("Tika test document", result.xml);
assertEquals("2000-12-01T00:00:00.000Z", metadata.get(TikaCoreProperties.CREATED));
- }
+
}
@Test
public void testXMLParserNonAsciiChars() throws Exception {
- try (InputStream input = DcXMLParserTest.class.getResourceAsStream("/test-documents/testXML.xml")) {
- Metadata metadata = new Metadata();
- new DcXMLParser().parse(input, new DefaultHandler(), metadata);
-
- final String expected = "Archim\u00E8de et Lius \u00E0 Ch\u00E2teauneuf testing chars en \u00E9t\u00E9";
- assertEquals(expected, metadata.get(TikaCoreProperties.RIGHTS));
- }
+ XMLResult r = getXML("testXML.xml", new DcXMLParser());
+ final String expected = "Archim\u00E8de et Lius \u00E0 Ch\u00E2teauneuf testing chars en \u00E9t\u00E9";
+ assertEquals(expected, r.metadata.get(TikaCoreProperties.RIGHTS));
}
// TIKA-1048
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/xml/EmptyAndDuplicateElementsXMLParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/xml/EmptyAndDuplicateElementsXMLParserTest.java b/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/xml/EmptyAndDuplicateElementsXMLParserTest.java
index 20227a6..536f9d7 100644
--- a/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/xml/EmptyAndDuplicateElementsXMLParserTest.java
+++ b/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/xml/EmptyAndDuplicateElementsXMLParserTest.java
@@ -18,13 +18,10 @@ package org.apache.tika.parser.xml;
import static org.junit.Assert.assertEquals;
-import java.io.InputStream;
-
import org.apache.tika.TikaTest;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.TeeContentHandler;
import org.junit.Test;
import org.xml.sax.ContentHandler;
@@ -38,52 +35,45 @@ public class EmptyAndDuplicateElementsXMLParserTest extends TikaTest {
@Test
public void testDefaultBehavior() throws Exception {
- try (InputStream input = EmptyAndDuplicateElementsXMLParserTest.class.getResourceAsStream(
- "/test-documents/testXML3.xml")) {
- Metadata metadata = new Metadata();
- ContentHandler handler = new BodyContentHandler();
- new DefaultCustomXMLTestParser().parse(input, handler, metadata, new ParseContext());
+ XMLResult r = getXML("testXML3.xml", new DefaultCustomXMLTestParser());
+ Metadata metadata = r.metadata;
- assertEquals(4, metadata.getValues(FIRST_NAME).length);
- assertEquals(2, metadata.getValues(LAST_NAME).length);
+ assertEquals(4, metadata.getValues(FIRST_NAME).length);
+ assertEquals(2, metadata.getValues(LAST_NAME).length);
- assertEquals("John", metadata.getValues(FIRST_NAME)[0]);
- assertEquals("Smith", metadata.getValues(LAST_NAME)[0]);
+ assertEquals("John", metadata.getValues(FIRST_NAME)[0]);
+ assertEquals("Smith", metadata.getValues(LAST_NAME)[0]);
- assertEquals("Jane", metadata.getValues(FIRST_NAME)[1]);
- assertEquals("Doe", metadata.getValues(LAST_NAME)[1]);
+ assertEquals("Jane", metadata.getValues(FIRST_NAME)[1]);
+ assertEquals("Doe", metadata.getValues(LAST_NAME)[1]);
- // We didn't know Bob's last name, but now we don't know an entry existed
- assertEquals("Bob", metadata.getValues(FIRST_NAME)[2]);
+ // We didn't know Bob's last name, but now we don't know an entry existed
+ assertEquals("Bob", metadata.getValues(FIRST_NAME)[2]);
- // We don't know Kate's last name because it was a duplicate
- assertEquals("Kate", metadata.getValues(FIRST_NAME)[3]);
- }
+ // We don't know Kate's last name because it was a duplicate
+ assertEquals("Kate", metadata.getValues(FIRST_NAME)[3]);
}
@Test
public void testEmptiesAndRepeats() throws Exception {
- try (InputStream input = EmptyAndDuplicateElementsXMLParserTest.class.getResourceAsStream(
- "/test-documents/testXML3.xml")) {
- Metadata metadata = new Metadata();
- ContentHandler handler = new BodyContentHandler();
- new AllowEmptiesAndDuplicatesCustomXMLTestParser().parse(input, handler, metadata, new ParseContext());
+ XMLResult r = getXML("testXML3.xml", new AllowEmptiesAndDuplicatesCustomXMLTestParser());
+ Metadata metadata = r.metadata;
- assertEquals(4, metadata.getValues(FIRST_NAME).length);
- assertEquals(4, metadata.getValues(LAST_NAME).length);
+ assertEquals(4, metadata.getValues(FIRST_NAME).length);
+ assertEquals(4, metadata.getValues(LAST_NAME).length);
- assertEquals("John", metadata.getValues(FIRST_NAME)[0]);
- assertEquals("Smith", metadata.getValues(LAST_NAME)[0]);
+ assertEquals("John", metadata.getValues(FIRST_NAME)[0]);
+ assertEquals("Smith", metadata.getValues(LAST_NAME)[0]);
- assertEquals("Jane", metadata.getValues(FIRST_NAME)[1]);
- assertEquals("Doe", metadata.getValues(LAST_NAME)[1]);
+ assertEquals("Jane", metadata.getValues(FIRST_NAME)[1]);
+ assertEquals("Doe", metadata.getValues(LAST_NAME)[1]);
- assertEquals("Bob", metadata.getValues(FIRST_NAME)[2]);
- assertEquals("", metadata.getValues(LAST_NAME)[2]);
+ assertEquals("Bob", metadata.getValues(FIRST_NAME)[2]);
+ assertEquals("", metadata.getValues(LAST_NAME)[2]);
+
+ assertEquals("Kate", metadata.getValues(FIRST_NAME)[3]);
+ assertEquals("Smith", metadata.getValues(LAST_NAME)[3]);
- assertEquals("Kate", metadata.getValues(FIRST_NAME)[3]);
- assertEquals("Smith", metadata.getValues(LAST_NAME)[3]);
- }
}
private class DefaultCustomXMLTestParser extends XMLParser {
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/xml/FictionBookParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/xml/FictionBookParserTest.java b/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/xml/FictionBookParserTest.java
index 62454fa..aee7307 100644
--- a/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/xml/FictionBookParserTest.java
+++ b/tika-parser-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/xml/FictionBookParserTest.java
@@ -16,38 +16,29 @@
*/
package org.apache.tika.parser.xml;
-import static org.apache.tika.TikaTest.assertContains;
import static org.junit.Assert.assertEquals;
import java.io.InputStream;
-import org.apache.tika.TikaTest.TrackingHandler;
+import org.apache.tika.TikaTest;
import org.apache.tika.extractor.ContainerExtractor;
import org.apache.tika.extractor.ParserContainerExtractor;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.BodyContentHandler;
import org.junit.Test;
-import org.xml.sax.ContentHandler;
-public class FictionBookParserTest {
+public class FictionBookParserTest extends TikaTest {
@Test
public void testFB2() throws Exception {
- try (InputStream input = FictionBookParserTest.class.getResourceAsStream("/test-documents/test.fb2")) {
- Metadata metadata = new Metadata();
- ContentHandler handler = new BodyContentHandler();
- new FictionBookParser().parse(input, handler, metadata, new ParseContext());
- String content = handler.toString();
-
- assertContains("1812", content);
- }
+ XMLResult r = getXML("test.fb2", new FictionBookParser(), new Metadata(), new ParseContext());
+ assertContains("1812", r.xml);
}
@Test
public void testEmbedded() throws Exception {
- try (InputStream input = FictionBookParserTest.class.getResourceAsStream("/test-documents/test.fb2")) {
+ try (InputStream input = getTestDocumentAsStream("test.fb2")) {
ContainerExtractor extractor = new ParserContainerExtractor();
TikaInputStream stream = TikaInputStream.get(input);
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parsers/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml
deleted file mode 100644
index 67207d2..0000000
--- a/tika-parsers/pom.xml
+++ /dev/null
@@ -1,333 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-
-<!--
- Licensed to the Apache Software Foundation (ASF) under one
- or more contributor license agreements. See the NOTICE file
- distributed with this work for additional information
- regarding copyright ownership. The ASF licenses this file
- to you under the Apache License, Version 2.0 (the
- "License"); you may not use this file except in compliance
- with the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing,
- software distributed under the License is distributed on an
- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- KIND, either express or implied. See the License for the
- specific language governing permissions and limitations
- under the License.
--->
-
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
- <modelVersion>4.0.0</modelVersion>
-
- <parent>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-parent</artifactId>
- <version>2.0-SNAPSHOT</version>
- <relativePath>../tika-parent/pom.xml</relativePath>
- </parent>
-
- <artifactId>tika-parsers</artifactId>
- <packaging>bundle</packaging>
- <name>Apache Tika parsers</name>
- <url>http://tika.apache.org/</url>
-
- <properties>
- <vorbis.version>0.6</vorbis.version>
- </properties>
-
- <dependencies>
- <!-- Optional OSGi dependency, used only when running within OSGi -->
- <dependency>
- <groupId>org.osgi</groupId>
- <artifactId>org.osgi.core</artifactId>
- <version>4.0.0</version>
- <scope>provided</scope>
- <optional>true</optional>
- </dependency>
-
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-core</artifactId>
- <version>${project.version}</version>
- </dependency>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-parser-multimedia-module</artifactId>
- <version>${project.version}</version>
- </dependency>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-parser-advanced-module</artifactId>
- <version>${project.version}</version>
- </dependency>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-parser-cad-module</artifactId>
- <version>${project.version}</version>
- </dependency>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-parser-code-module</artifactId>
- <version>${project.version}</version>
- </dependency>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-parser-crypto-module</artifactId>
- <version>${project.version}</version>
- </dependency>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-parser-database-module</artifactId>
- <version>${project.version}</version>
- </dependency>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-parser-ebook-module</artifactId>
- <version>${project.version}</version>
- </dependency>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-parser-journal-module</artifactId>
- <version>${project.version}</version>
- </dependency>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-parser-office-module</artifactId>
- <version>${project.version}</version>
- </dependency>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-parser-package-module</artifactId>
- <version>${project.version}</version>
- </dependency>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-parser-pdf-module</artifactId>
- <version>${project.version}</version>
- </dependency>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-parser-scientific-module</artifactId>
- <version>${project.version}</version>
- </dependency>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-parser-text-module</artifactId>
- <version>${project.version}</version>
- </dependency>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-parser-web-module</artifactId>
- <version>${project.version}</version>
- </dependency>
-
- <!-- Optional OSGi dependencies, used only when running within OSGi -->
- <dependency>
- <groupId>org.apache.felix</groupId>
- <artifactId>org.apache.felix.scr.annotations</artifactId>
- <scope>provided</scope>
- </dependency>
-
- <!-- Externally Maintained Parsers -->
- <dependency>
- <groupId>org.gagravarr</groupId>
- <artifactId>vorbis-java-tika</artifactId>
- <version>${vorbis.version}</version>
- </dependency>
- <dependency>
- <groupId>org.gagravarr</groupId>
- <artifactId>vorbis-java-core</artifactId>
- <version>${vorbis.version}</version>
- </dependency>
-
- <!-- Test dependencies -->
- <dependency>
- <groupId>junit</groupId>
- <artifactId>junit</artifactId>
- </dependency>
- <dependency>
- <groupId>org.slf4j</groupId>
- <artifactId>slf4j-log4j12</artifactId>
- <scope>test</scope>
- </dependency>
- <dependency>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-core</artifactId>
- <version>${project.version}</version>
- <type>test-jar</type>
- <scope>test</scope>
- </dependency>
- <dependency>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-test-resources</artifactId>
- <version>${project.version}</version>
- <type>test-jar</type>
- <scope>test</scope>
- </dependency>
-
- </dependencies>
-
- <build>
- <plugins>
- <plugin>
- <groupId>org.apache.felix</groupId>
- <artifactId>maven-bundle-plugin</artifactId>
- <extensions>true</extensions>
- <configuration>
- <instructions>
- <Bundle-DocURL>${project.url}</Bundle-DocURL>
- <Bundle-Activator>
- org.apache.tika.parser.internal.Activator
- </Bundle-Activator>
- <Import-Package>
- org.w3c.dom,
- org.apache.tika.*,
- *;resolution:=optional
- </Import-Package>
- </instructions>
- </configuration>
- </plugin>
- <plugin>
- <groupId>org.apache.rat</groupId>
- <artifactId>apache-rat-plugin</artifactId>
- </plugin>
-
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-dependency-plugin</artifactId>
- <version>2.10</version>
- <executions>
- <execution>
- <id>unpack</id>
- <phase>compile</phase>
- <goals>
- <goal>unpack</goal>
- </goals>
- <configuration>
- <artifactItems>
- <artifactItem>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-test-resources</artifactId>
- <version>${project.version}</version>
- <type>test-jar</type>
- <overWrite>true</overWrite>
- <outputDirectory>${project.build.testOutputDirectory}</outputDirectory>
- </artifactItem>
- </artifactItems>
- </configuration>
- </execution>
- </executions>
- </plugin>
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-shade-plugin</artifactId>
- <version>2.4.2</version>
- <executions>
- <execution>
- <phase>package</phase>
- <goals>
- <goal>shade</goal>
- </goals>
- <configuration>
- <createDependencyReducedPom>
- false
- </createDependencyReducedPom>
- <artifactSet>
- <includes>
- <include>org.apache.tika:tika-parser-multimedia-module</include>
- <include>org.apache.tika:tika-parser-advanced-module</include>
- <include>org.apache.tika:tika-parser-cad-module</include>
- <include>org.apache.tika:tika-parser-code-module</include>
- <include>org.apache.tika:tika-parser-crypto-module</include>
- <include>org.apache.tika:tika-parser-database-module</include>
- <include>org.apache.tika:tika-parser-ebook-module</include>
- <include>org.apache.tika:tika-parser-journal-module</include>
- <include>org.apache.tika:tika-parser-office-module</include>
- <include>org.apache.tika:tika-parser-package-module</include>
- <include>org.apache.tika:tika-parser-pdf-module</include>
- <include>org.apache.tika:tika-parser-scientific-module</include>
- <include>org.apache.tika:tika-parser-text-module</include>
- <include>org.apache.tika:tika-parser-web-module</include>
- </includes>
- </artifactSet>
- <transformers>
- <transformer implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
- <resource>META-INF/services/org.apache.tika.detect.Detector</resource>
- </transformer>
- <transformer implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
- <resource>META-INF/services/org.apache.tika.detect.EncodingDetector</resource>
- </transformer>
- <transformer implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
- <resource>META-INF/services/org.apache.tika.parser.Parser</resource>
- </transformer>
- </transformers>
- </configuration>
- </execution>
- </executions>
- </plugin>
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-jar-plugin</artifactId>
- <executions>
- <execution>
- <goals>
- <goal>test-jar</goal>
- </goals>
- </execution>
- </executions>
- </plugin>
- </plugins>
-
- <pluginManagement>
- <plugins>
- <!-- This plugin's configuration is used to store Eclipse m2e -->
- <!-- settings only. It has no influence on the Maven build itself. -->
- <plugin>
- <groupId>org.eclipse.m2e</groupId>
- <artifactId>lifecycle-mapping</artifactId>
- <version>1.0.0</version>
- <configuration>
- <lifecycleMappingMetadata>
- <pluginExecutions>
- <pluginExecution>
- <pluginExecutionFilter>
- <groupId>org.apache.felix</groupId>
- <artifactId>maven-scr-plugin</artifactId>
- <versionRange>[1.7.2,)</versionRange>
- <goals>
- <goal>scr</goal>
- </goals>
- </pluginExecutionFilter>
- <action>
- <execute />
- </action>
- </pluginExecution>
- </pluginExecutions>
- </lifecycleMappingMetadata>
- </configuration>
- </plugin>
- </plugins>
- </pluginManagement>
- </build>
-
- <organization>
- <name>The Apache Software Foundation</name>
- <url>http://www.apache.org</url>
- </organization>
- <scm>
- <url>http://svn.apache.org/viewvc/tika/trunk/tika-parsers</url>
- <connection>scm:svn:http://svn.apache.org/repos/asf/tika/trunk/tika-parsers</connection>
- <developerConnection>scm:svn:https://svn.apache.org/repos/asf/tika/trunk/tika-parsers</developerConnection>
- </scm>
- <issueManagement>
- <system>JIRA</system>
- <url>https://issues.apache.org/jira/browse/TIKA</url>
- </issueManagement>
- <ciManagement>
- <system>Jenkins</system>
- <url>https://builds.apache.org/job/Tika-trunk/</url>
- </ciManagement>
-</project>
[04/13] tika git commit: TIKA-1855 -- first pass. Need to turn back
on the forbidden-apis testCheck. More clean up remains.
Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
deleted file mode 100644
index 4889b38..0000000
--- a/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
+++ /dev/null
@@ -1,312 +0,0 @@
-package org.apache.tika.parser;
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-
-import static org.apache.tika.TikaTest.assertContains;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNull;
-import static org.junit.Assert.assertTrue;
-
-import java.io.InputStream;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Set;
-
-import org.apache.commons.io.IOUtils;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaMetadataKeys;
-import org.apache.tika.parser.utils.CommonsDigester;
-import org.apache.tika.sax.BasicContentHandlerFactory;
-import org.apache.tika.sax.ContentHandlerFactory;
-import org.junit.Test;
-import org.xml.sax.helpers.DefaultHandler;
-
-public class RecursiveParserWrapperTest {
-
- @Test
- public void testBasicXML() throws Exception {
- List<Metadata> list = getMetadata(new Metadata(),
- new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1));
- Metadata container = list.get(0);
- String content = container.get(RecursiveParserWrapper.TIKA_CONTENT);
- //not much differentiates html from xml in this test file
- assertTrue(content.indexOf("<p class=\"header\" />") > -1);
- }
-
- @Test
- public void testBasicHTML() throws Exception {
- List<Metadata> list = getMetadata(new Metadata(),
- new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.HTML, -1));
- Metadata container = list.get(0);
- String content = container.get(RecursiveParserWrapper.TIKA_CONTENT);
- //not much differentiates html from xml in this test file
- assertTrue(content.indexOf("<p class=\"header\"></p>") > -1);
- }
-
- @Test
- public void testBasicText() throws Exception {
- List<Metadata> list = getMetadata(new Metadata(),
- new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));
- Metadata container = list.get(0);
- String content = container.get(RecursiveParserWrapper.TIKA_CONTENT);
- assertTrue(content.indexOf("<p ") < 0);
- assertTrue(content.indexOf("embed_0") > -1);
- }
-
- @Test
- public void testIgnoreContent() throws Exception {
- List<Metadata> list = getMetadata(new Metadata(),
- new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1));
- Metadata container = list.get(0);
- String content = container.get(RecursiveParserWrapper.TIKA_CONTENT);
- assertNull(content);
- }
-
-
- @Test
- public void testCharLimit() throws Exception {
- ParseContext context = new ParseContext();
- Metadata metadata = new Metadata();
-
- Parser wrapped = new AutoDetectParser();
- RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped,
- new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, 60));
- InputStream stream = RecursiveParserWrapperTest.class.getResourceAsStream(
- "/test-documents/test_recursive_embedded.docx");
- wrapper.parse(stream, new DefaultHandler(), metadata, context);
- List<Metadata> list = wrapper.getMetadata();
-
- assertEquals(5, list.size());
-
- int wlr = 0;
- for (Metadata m : list) {
- String limitReached = m.get(RecursiveParserWrapper.WRITE_LIMIT_REACHED);
- if (limitReached != null && limitReached.equals("true")) {
- wlr++;
- }
- }
- assertEquals(1, wlr);
-
- }
-
- @Test
- public void testMaxEmbedded() throws Exception {
- int maxEmbedded = 4;
- int totalNoLimit = 12;//including outer container file
- ParseContext context = new ParseContext();
- Metadata metadata = new Metadata();
- String limitReached = null;
-
- Parser wrapped = new AutoDetectParser();
- RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped,
- new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));
-
- InputStream stream = RecursiveParserWrapperTest.class.getResourceAsStream(
- "/test-documents/test_recursive_embedded.docx");
- wrapper.parse(stream, new DefaultHandler(), metadata, context);
- List<Metadata> list = wrapper.getMetadata();
- //test default
- assertEquals(totalNoLimit, list.size());
-
- limitReached = list.get(0).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_LIMIT_REACHED);
- assertNull(limitReached);
-
-
- wrapper.reset();
- stream.close();
-
- //test setting value
- metadata = new Metadata();
- stream = RecursiveParserWrapperTest.class.getResourceAsStream(
- "/test-documents/test_recursive_embedded.docx");
- wrapper.setMaxEmbeddedResources(maxEmbedded);
- wrapper.parse(stream, new DefaultHandler(), metadata, context);
- list = wrapper.getMetadata();
-
- //add 1 for outer container file
- assertEquals(maxEmbedded + 1, list.size());
-
- limitReached = list.get(0).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_LIMIT_REACHED);
- assertEquals("true", limitReached);
-
- wrapper.reset();
- stream.close();
-
- //test setting value < 0
- metadata = new Metadata();
- stream = RecursiveParserWrapperTest.class.getResourceAsStream(
- "/test-documents/test_recursive_embedded.docx");
-
- wrapper.setMaxEmbeddedResources(-2);
- wrapper.parse(stream, new DefaultHandler(), metadata, context);
- assertEquals(totalNoLimit, list.size());
- limitReached = list.get(0).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_LIMIT_REACHED);
- assertNull(limitReached);
- }
-
- @Test
- public void testEmbeddedResourcePath() throws Exception {
-
- Set<String> targets = new HashSet<String>();
- targets.add("/embed1.zip");
- targets.add("/embed1.zip/embed2.zip");
- targets.add("/embed1.zip/embed2.zip/embed3.zip");
- targets.add("/embed1.zip/embed2.zip/embed3.zip/embed4.zip");
- targets.add("/embed1.zip/embed2.zip/embed3.zip/embed4.zip/embed4.txt");
- targets.add("/embed1.zip/embed2.zip/embed3.zip/embed3.txt");
- targets.add("/embed1.zip/embed2.zip/embed2a.txt");
- targets.add("/embed1.zip/embed2.zip/embed2b.txt");
- targets.add("/embed1.zip/embed1b.txt");
- targets.add("/embed1.zip/embed1a.txt");
- targets.add("/image1.emf");
-
- Metadata metadata = new Metadata();
- metadata.set(Metadata.RESOURCE_NAME_KEY, "test_recursive_embedded.docx");
- List<Metadata> list = getMetadata(metadata,
- new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1));
- Metadata container = list.get(0);
- String content = container.get(RecursiveParserWrapper.TIKA_CONTENT);
- assertTrue(content.indexOf("<p class=\"header\" />") > -1);
-
- Set<String> seen = new HashSet<String>();
- for (Metadata m : list) {
- String path = m.get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH);
- if (path != null) {
- seen.add(path);
- }
- }
- assertEquals(targets, seen);
- }
-
- @Test
- public void testEmbeddedNPE() throws Exception {
- Metadata metadata = new Metadata();
- metadata.set(Metadata.RESOURCE_NAME_KEY, "test_recursive_embedded_npe.docx");
- List<Metadata> list = getMetadata(metadata,
- new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));
- //default behavior (user doesn't specify whether or not to catch embedded exceptions
- //is to catch the exception
- assertEquals(13, list.size());
- Metadata mockNPEMetadata = list.get(10);
- assertContains("java.lang.NullPointerException", mockNPEMetadata.get(RecursiveParserWrapper.EMBEDDED_EXCEPTION));
-
- metadata = new Metadata();
- metadata.set(Metadata.RESOURCE_NAME_KEY, "test_recursive_embedded_npe.docx");
- list = getMetadata(metadata,
- new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1),
- false, null);
-
- //Composite parser swallows caught TikaExceptions, IOExceptions and SAXExceptions
- //and just doesn't bother to report that there was an exception.
- assertEquals(12, list.size());
- }
-
- @Test
- public void testPrimaryExcWEmbedded() throws Exception {
- //if embedded content is handled and then
- //the parser hits an exception in the container document,
- //that the first element of the returned list is the container document
- //and the second is the embedded content
- Metadata metadata = new Metadata();
- metadata.set(Metadata.RESOURCE_NAME_KEY, "embedded_then_npe.xml");
-
- ParseContext context = new ParseContext();
- Parser wrapped = new AutoDetectParser();
- RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped,
- new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1), true);
- String path = "/test-documents/mock/embedded_then_npe.xml";
-
- InputStream stream = null;
- boolean npe = false;
- try {
- stream = RecursiveParserWrapperTest.class.getResourceAsStream(
- path);
- wrapper.parse(stream, new DefaultHandler(), metadata, context);
- } catch (TikaException e) {
- if (e.getCause().getClass().equals(NullPointerException.class)) {
- npe = true;
- }
- } finally {
- IOUtils.closeQuietly(stream);
- }
- assertTrue("npe", npe);
-
- List<Metadata> metadataList = wrapper.getMetadata();
- assertEquals(2, metadataList.size());
- Metadata outerMetadata = metadataList.get(0);
- Metadata embeddedMetadata = metadataList.get(1);
- assertContains("main_content", outerMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
- assertEquals("embedded_then_npe.xml", outerMetadata.get(TikaMetadataKeys.RESOURCE_NAME_KEY));
- assertEquals("Nikolai Lobachevsky", outerMetadata.get("author"));
-
- assertContains("some_embedded_content", embeddedMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
- assertEquals("embed1.xml", embeddedMetadata.get(TikaMetadataKeys.RESOURCE_NAME_KEY));
- assertEquals("embeddedAuthor", embeddedMetadata.get("author"));
- }
-
- @Test
- public void testDigesters() throws Exception {
- Metadata metadata = new Metadata();
- metadata.set(Metadata.RESOURCE_NAME_KEY, "test_recursive_embedded.docx");
- List<Metadata> list = getMetadata(metadata,
- new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1),
- true, new CommonsDigester(100000, CommonsDigester.DigestAlgorithm.MD5));
- int i = 0;
- Metadata m0 = list.get(0);
- Metadata m6 = list.get(6);
- String md5Key = "X-TIKA:digest:MD5";
- assertEquals("59f626e09a8c16ab6dbc2800c685f772", list.get(0).get(md5Key));
- assertEquals("ccdf3882e7e4c2454e28884db9b0a54d", list.get(6).get(md5Key));
- assertEquals("a869bf6432ebd14e19fc79416274e0c9", list.get(7).get(md5Key));
- }
-
- private List<Metadata> getMetadata(Metadata metadata, ContentHandlerFactory contentHandlerFactory,
- boolean catchEmbeddedExceptions,
- DigestingParser.Digester digester) throws Exception {
- ParseContext context = new ParseContext();
- Parser wrapped = new AutoDetectParser();
- if (digester != null) {
- wrapped = new DigestingParser(wrapped, digester);
- }
- RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped,
- contentHandlerFactory, catchEmbeddedExceptions);
- String path = metadata.get(Metadata.RESOURCE_NAME_KEY);
- if (path == null) {
- path = "/test-documents/test_recursive_embedded.docx";
- } else {
- path = "/test-documents/" + path;
- }
- InputStream stream = null;
- try {
- stream = TikaInputStream.get(RecursiveParserWrapperTest.class.getResource(path).toURI());
- wrapper.parse(stream, new DefaultHandler(), metadata, context);
- } finally {
- IOUtils.closeQuietly(stream);
- }
- return wrapper.getMetadata();
-
- }
-
- private List<Metadata> getMetadata(Metadata metadata, ContentHandlerFactory contentHandlerFactory)
- throws Exception {
- return getMetadata(metadata, contentHandlerFactory, true, null);
- }
-}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parsers/src/test/java/org/apache/tika/parser/fork/ForkParserIntegrationTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/fork/ForkParserIntegrationTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/fork/ForkParserIntegrationTest.java
deleted file mode 100644
index 54c1427..0000000
--- a/tika-parsers/src/test/java/org/apache/tika/parser/fork/ForkParserIntegrationTest.java
+++ /dev/null
@@ -1,268 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.fork;
-
-import static org.apache.tika.TikaTest.assertContains;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.fail;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.NotSerializableException;
-import java.util.Arrays;
-import java.util.HashSet;
-import java.util.Set;
-
-import org.apache.tika.Tika;
-import org.apache.tika.detect.Detector;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.fork.ForkParser;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.BodyContentHandler;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * Test that the ForkParser correctly behaves when
- * wired in to the regular Parsers and their test data
- */
-public class ForkParserIntegrationTest {
-
- private Tika tika = new Tika(); // TODO Use TikaConfig instead, when it works
-
- /**
- * Simple text parsing
- */
- @Test
- public void testForkedTextParsing() throws Exception {
- ForkParser parser = new ForkParser(
- ForkParserIntegrationTest.class.getClassLoader(),
- tika.getParser());
-
- try {
- ContentHandler output = new BodyContentHandler();
- InputStream stream = ForkParserIntegrationTest.class.getResourceAsStream(
- "/test-documents/testTXT.txt");
- ParseContext context = new ParseContext();
- parser.parse(stream, output, new Metadata(), context);
-
- String content = output.toString();
- assertContains("Test d'indexation", content);
- assertContains("http://www.apache.org", content);
- } finally {
- parser.close();
- }
- }
-
- /**
- * This error has a message and an equals() implementation as to be able
- * to match it against the serialized version of itself.
- */
- static class AnError extends Error {
- private static final long serialVersionUID = -6197267350768803348L;
- private String message;
- AnError(String message) {
- super(message);
- this.message = message;
- }
-
- @Override
- public boolean equals(Object o) {
- if (this == o) return true;
- if (o == null || getClass() != o.getClass()) return false;
-
- AnError anError = (AnError) o;
-
- if (!message.equals(anError.message)) return false;
-
- return true;
- }
-
- @Override
- public int hashCode() {
- return message.hashCode();
- }
- }
-
- /**
- * This error isn't serializable on the server, so can't be sent back
- * to the Fork Client once it has occured
- */
- static class WontBeSerializedError extends RuntimeException {
- private static final long serialVersionUID = 1L;
-
- WontBeSerializedError(String message) {
- super(message);
- }
-
- private void writeObject(java.io.ObjectOutputStream out) {
- RuntimeException e = new RuntimeException("Bang!");
- boolean found = false;
- for (StackTraceElement ste : e.getStackTrace()) {
- if (ste.getClassName().equals(ForkParser.class.getName())) {
- found = true;
- break;
- }
- }
- if (!found) {
- throw e;
- }
- }
- }
-
- static class BrokenParser implements Parser {
- private static final long serialVersionUID = 995871497930817839L;
- public Error err = new AnError("Simulated fail");
- public RuntimeException re = null;
-
- public Set<MediaType> getSupportedTypes(ParseContext context) {
- return new HashSet<MediaType>(Arrays.asList(MediaType.TEXT_PLAIN));
- }
-
- public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
- if (re != null) throw re;
- throw err;
- }
- }
-
- /**
- * TIKA-831 Parsers throwing errors should be caught and
- * properly reported
- */
- @Test
- public void testParsingErrorInForkedParserShouldBeReported() throws Exception {
- BrokenParser brokenParser = new BrokenParser();
- Parser parser = new ForkParser(ForkParser.class.getClassLoader(), brokenParser);
- InputStream stream = getClass().getResourceAsStream("/test-documents/testTXT.txt");
-
- // With a serializable error, we'll get that back
- try {
- ContentHandler output = new BodyContentHandler();
- ParseContext context = new ParseContext();
- parser.parse(stream, output, new Metadata(), context);
- fail("Expected TikaException caused by Error");
- } catch (TikaException e) {
- assertEquals(brokenParser.err, e.getCause());
- }
-
- // With a non serializable one, we'll get something else
- // TODO Fix this test
- brokenParser = new BrokenParser();
- brokenParser.re= new WontBeSerializedError("Can't Serialize");
- parser = new ForkParser(ForkParser.class.getClassLoader(), brokenParser);
-// try {
-// ContentHandler output = new BodyContentHandler();
-// ParseContext context = new ParseContext();
-// parser.parse(stream, output, new Metadata(), context);
-// fail("Expected TikaException caused by Error");
-// } catch (TikaException e) {
-// assertEquals(TikaException.class, e.getCause().getClass());
-// assertEquals("Bang!", e.getCause().getMessage());
-// }
- }
-
- /**
- * If we supply a non serializable object on the ParseContext,
- * check we get a helpful exception back
- */
- @Test
- public void testParserHandlingOfNonSerializable() throws Exception {
- ForkParser parser = new ForkParser(
- ForkParserIntegrationTest.class.getClassLoader(),
- tika.getParser());
-
- ParseContext context = new ParseContext();
- context.set(Detector.class, new Detector() {
- public MediaType detect(InputStream input, Metadata metadata) {
- return MediaType.OCTET_STREAM;
- }
- });
-
- try {
- ContentHandler output = new BodyContentHandler();
- InputStream stream = ForkParserIntegrationTest.class.getResourceAsStream(
- "/test-documents/testTXT.txt");
- parser.parse(stream, output, new Metadata(), context);
- fail("Should have blown up with a non serializable ParseContext");
- } catch(TikaException e) {
- // Check the right details
- assertNotNull(e.getCause());
- assertEquals(NotSerializableException.class, e.getCause().getClass());
- assertEquals("Unable to serialize ParseContext to pass to the Forked Parser", e.getMessage());
- } finally {
- parser.close();
- }
- }
-
- /**
- * TIKA-832
- */
- @Test
- public void testAttachingADebuggerOnTheForkedParserShouldWork()
- throws Exception {
- ParseContext context = new ParseContext();
- context.set(Parser.class, tika.getParser());
-
- ForkParser parser = new ForkParser(
- ForkParserIntegrationTest.class.getClassLoader(),
- tika.getParser());
- parser.setJavaCommand(Arrays.asList("java", "-Xmx32m", "-Xdebug",
- "-Xrunjdwp:transport=dt_socket,address=54321,server=y,suspend=n"));
- try {
- ContentHandler body = new BodyContentHandler();
- InputStream stream = ForkParserIntegrationTest.class.getResourceAsStream(
- "/test-documents/testTXT.txt");
- parser.parse(stream, body, new Metadata(), context);
- String content = body.toString();
- assertContains("Test d'indexation", content);
- assertContains("http://www.apache.org", content);
- } finally {
- parser.close();
- }
- }
-
- /**
- * TIKA-808 - Ensure that parsing of our test PDFs work under
- * the Fork Parser, to ensure that complex parsing behaves
- */
- @Test
- public void testForkedPDFParsing() throws Exception {
- ForkParser parser = new ForkParser(
- ForkParserIntegrationTest.class.getClassLoader(),
- tika.getParser());
- try {
- ContentHandler output = new BodyContentHandler();
- InputStream stream = ForkParserIntegrationTest.class.getResourceAsStream(
- "/test-documents/testPDF.pdf");
- ParseContext context = new ParseContext();
- parser.parse(stream, output, new Metadata(), context);
-
- String content = output.toString();
- assertContains("Apache Tika", content);
- assertContains("Tika - Content Analysis Toolkit", content);
- assertContains("incubator", content);
- assertContains("Apache Software Foundation", content);
- } finally {
- parser.close();
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parsers/src/test/java/org/apache/tika/parser/mock/MockParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/mock/MockParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/mock/MockParserTest.java
deleted file mode 100644
index d222e68..0000000
--- a/tika-parsers/src/test/java/org/apache/tika/parser/mock/MockParserTest.java
+++ /dev/null
@@ -1,251 +0,0 @@
-package org.apache.tika.parser.mock;
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import static java.nio.charset.StandardCharsets.UTF_8;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
-import static org.junit.Assert.fail;
-
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.PrintStream;
-import java.util.Date;
-
-import org.apache.tika.TikaTest;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.IOUtils;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.AutoDetectParser;
-import org.apache.tika.parser.Parser;
-import org.junit.Test;
-
-/**
- * Somewhat bizarrely, we can't put the test of this test resource in tika-test-resources
- * or else it will be called by every module that uses it. Um, Yossarian!!!
- */
-public class MockParserTest extends TikaTest {
- private final static String M = "/test-documents/mock/";
- private final static Parser PARSER = new AutoDetectParser();
-
- @Override
- public XMLResult getXML(String path, Metadata m) throws Exception {
- //note that this is specific to MockParserTest with addition of M to the path!
- InputStream is = getResourceAsStream(M+path);
- try {
- return super.getXML(is, PARSER, m);
- } finally {
- IOUtils.closeQuietly(is);
- }
- }
-
- @Test
- public void testExample() throws Exception {
- Metadata m = new Metadata();
- PrintStream out = System.out;
- PrintStream err = System.err;
- ByteArrayOutputStream outBos = new ByteArrayOutputStream();
- ByteArrayOutputStream errBos = new ByteArrayOutputStream();
- PrintStream tmpOut = new PrintStream(outBos, true, UTF_8.toString());
- PrintStream tmpErr = new PrintStream(errBos, true, UTF_8.toString());
- System.setOut(tmpOut);
- System.setErr(tmpErr);
- try {
- assertThrowable("example.xml", m, IOException.class, "not another IOException");
- assertMockParser(m);
- } finally {
- System.setOut(out);
- System.setErr(err);
- }
- String outString = new String(outBos.toByteArray(), UTF_8);
- assertContains("writing to System.out", outString);
-
- String errString = new String(errBos.toByteArray(), UTF_8);
- assertContains("writing to System.err", errString);
-
- }
-
- @Test
- public void testNothingBad() throws Exception {
- Metadata m = new Metadata();
- String content = getXML("nothing_bad.xml", m).xml;
- assertEquals("Geoffrey Chaucer", m.get("author"));
- assertContains("<p>And bathed every veyne in swich licour,</p>", content);
- assertMockParser(m);
- }
-
- @Test
- public void testNullPointer() throws Exception {
- Metadata m = new Metadata();
- assertThrowable("null_pointer.xml", m, NullPointerException.class, "another null pointer exception");
- assertMockParser(m);
- }
-
- @Test
- public void testNullPointerNoMsg() throws Exception {
- Metadata m = new Metadata();
- assertThrowable("null_pointer_no_msg.xml", m, NullPointerException.class, null);
- assertMockParser(m);
- }
-
-
- @Test
- public void testSleep() throws Exception {
- long start = new Date().getTime();
- Metadata m = new Metadata();
- String content = getXML("sleep.xml", m).xml;
- assertMockParser(m);
- long elapsed = new Date().getTime()-start;
- //should sleep for at least 3000
- boolean enoughTimeHasElapsed = elapsed > 2000;
- assertTrue("not enough time has not elapsed: "+elapsed, enoughTimeHasElapsed);
- assertMockParser(m);
- }
-
- @Test
- public void testHeavyHang() throws Exception {
- long start = new Date().getTime();
- Metadata m = new Metadata();
-
- String content = getXML("heavy_hang.xml", m).xml;
- assertMockParser(m);
- long elapsed = new Date().getTime()-start;
- //should sleep for at least 3000
- boolean enoughTimeHasElapsed = elapsed > 2000;
- assertTrue("not enough time has elapsed: "+elapsed, enoughTimeHasElapsed);
- assertMockParser(m);
- }
-
- @Test
- public void testFakeOOM() throws Exception {
- Metadata m = new Metadata();
- assertThrowable("fake_oom.xml", m, OutOfMemoryError.class, "not another oom");
- assertMockParser(m);
- }
-
- @Test
- public void testRealOOM() throws Exception {
- //Note: we're not actually testing the diff between fake and real oom
- //i.e. by creating child process and setting different -Xmx or
- //memory profiling.
- Metadata m = new Metadata();
- assertThrowable("real_oom.xml", m, OutOfMemoryError.class, "Java heap space");
- assertMockParser(m);
- }
-
- @Test
- public void testInterruptibleSleep() {
- //Without static initialization of the parser, it can take ~1 second after t.start()
- //before the parser actually calls parse. This is
- //just the time it takes to instantiate and call AutoDetectParser, do the detection, etc.
- //This is not thread creation overhead.
- ParserRunnable r = new ParserRunnable("sleep_interruptible.xml");
- Thread t = new Thread(r);
- t.start();
- long start = new Date().getTime();
- try {
- Thread.sleep(1000);
- } catch (InterruptedException e) {
- //swallow
- }
-
- t.interrupt();
-
- try {
- t.join(10000);
- } catch (InterruptedException e) {
- //swallow
- }
- long elapsed = new Date().getTime()-start;
- boolean shortEnough = elapsed < 2000;//the xml file specifies 3000
- assertTrue("elapsed (" + elapsed + " millis) was not short enough", shortEnough);
- }
-
- @Test
- public void testNonInterruptibleSleep() {
- ParserRunnable r = new ParserRunnable("sleep_not_interruptible.xml");
- Thread t = new Thread(r);
- t.start();
- long start = new Date().getTime();
- try {
- //make sure that the thread has actually started
- Thread.sleep(1000);
- } catch (InterruptedException e) {
- //swallow
- }
- t.interrupt();
- try {
- t.join(20000);
- } catch (InterruptedException e) {
- //swallow
- }
- long elapsed = new Date().getTime()-start;
- boolean longEnough = elapsed > 3000;//the xml file specifies 3000, this sleeps 1000
- assertTrue("elapsed ("+elapsed+" millis) was not long enough", longEnough);
- }
-
- private class ParserRunnable implements Runnable {
- private final String path;
- ParserRunnable(String path) {
- this.path = path;
- }
- @Override
- public void run() {
- Metadata m = new Metadata();
- try {
- getXML(path, m);
- } catch (Exception e) {
- throw new RuntimeException(e);
- } finally {
- assertMockParser(m);
- }
- }
- }
-
- private void assertThrowable(String path, Metadata m, Class<? extends Throwable> expected, String message) {
-
- try {
- getXML(path, m);
- } catch (Throwable t) {
- //if this is a throwable wrapped in a TikaException, use the cause
- if (t instanceof TikaException && t.getCause() != null) {
- t = t.getCause();
- }
- if (! (t.getClass().isAssignableFrom(expected))){
- fail(t.getClass() +" is not assignable from "+expected);
- }
- if (message != null) {
- assertEquals(message, t.getMessage());
- }
- }
- }
-
- private void assertMockParser(Metadata m) {
- String[] parsers = m.getValues("X-Parsed-By");
- //make sure that it was actually parsed by mock.
- boolean parsedByMock = false;
- for (String parser : parsers) {
- if (parser.equals("org.apache.tika.parser.mock.MockParser")) {
- parsedByMock = true;
- break;
- }
- }
- assertTrue("mock parser should have been called", parsedByMock);
- }
-}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parsers/src/test/java/org/apache/tika/parser/pkg/PackageTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pkg/PackageTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pkg/PackageTest.java
deleted file mode 100644
index 26d263b..0000000
--- a/tika-parsers/src/test/java/org/apache/tika/parser/pkg/PackageTest.java
+++ /dev/null
@@ -1,335 +0,0 @@
-package org.apache.tika.parser.pkg;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
-
-import java.io.InputStream;
-
-import org.apache.tika.TikaTest;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AutoDetectParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.BodyContentHandler;
-import org.junit.Before;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
-
-public class PackageTest extends TikaTest {
-
- private static final MediaType TYPE_7ZIP = MediaType.application("x-7z-compressed");
-
- private ParseContext recursingContext;
- private Parser autoDetectParser;
-
- @Before
- public void setUp() throws Exception {
-
- autoDetectParser = new AutoDetectParser();
- recursingContext = new ParseContext();
- recursingContext.set(Parser.class, autoDetectParser);
- }
-
- @Test
- public void testZlibParsing() throws Exception {
- Parser parser = new AutoDetectParser(); // Should auto-detect!
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
- try (InputStream stream = PackageTest.class.getResourceAsStream(
- "/test-documents/testTXT.zlib")) {
- parser.parse(stream, handler, metadata, recursingContext);
- }
-
- assertEquals("application/zlib", metadata.get(Metadata.CONTENT_TYPE));
- String content = handler.toString();
- assertContains("Test d'indexation de Txt", content);
- assertContains("http://www.apache.org", content);
- }
-
-
- @Test
- public void testArParsing() throws Exception {
- Parser parser = new AutoDetectParser();
-
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
- try (InputStream stream = PackageTest.class.getResourceAsStream(
- "/test-documents/testARofText.ar")) {
- parser.parse(stream, handler, metadata, recursingContext);
- }
-
- assertEquals("application/x-archive",
- metadata.get(Metadata.CONTENT_TYPE));
- String content = handler.toString();
- assertContains("testTXT.txt", content);
- assertContains("Test d'indexation de Txt", content);
- assertContains("http://www.apache.org", content);
-
- try (InputStream stream = PackageTest.class.getResourceAsStream(
- "/test-documents/testARofSND.ar")) {
- parser.parse(stream, handler, metadata, recursingContext);
- }
-
- assertEquals("application/x-archive",
- metadata.get(Metadata.CONTENT_TYPE));
- content = handler.toString();
- assertContains("testAU.au", content);
- }
-
- @Test
- public void testBzip2Parsing() throws Exception {
- Parser parser = new AutoDetectParser(); // Should auto-detect!
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
- try (InputStream stream = PackageTest.class.getResourceAsStream(
- "/test-documents/test-documents.tbz2")) {
- parser.parse(stream, handler, metadata, recursingContext);
- }
-
- assertEquals("application/x-bzip2", metadata.get(Metadata.CONTENT_TYPE));
- String content = handler.toString();
- assertContains("test-documents/testEXCEL.xls", content);
- assertContains("Sample Excel Worksheet", content);
- assertContains("test-documents/testHTML.html", content);
- assertContains("Test Indexation Html", content);
- assertContains("test-documents/testOpenOffice2.odt", content);
- assertContains("This is a sample Open Office document", content);
- assertContains("test-documents/testPDF.pdf", content);
- assertContains("Apache Tika", content);
- assertContains("test-documents/testPPT.ppt", content);
- assertContains("Sample Powerpoint Slide", content);
- assertContains("test-documents/testRTF.rtf", content);
- assertContains("indexation Word", content);
- assertContains("test-documents/testTXT.txt", content);
- assertContains("Test d'indexation de Txt", content);
- assertContains("test-documents/testWORD.doc", content);
- assertContains("This is a sample Microsoft Word Document", content);
- assertContains("test-documents/testXML.xml", content);
- assertContains("Rida Benjelloun", content);
- }
-
- @Test
- public void testCompressParsing() throws Exception {
- Parser parser = new AutoDetectParser(); // Should auto-detect!
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
- InputStream stream = PackageTest.class.getResourceAsStream(
- "/test-documents/test-documents.tar.Z");
- try {
- parser.parse(stream, handler, metadata, recursingContext);
- } finally {
- stream.close();
- }
-
- assertEquals("application/x-compress", metadata.get(Metadata.CONTENT_TYPE));
- String content = handler.toString();
- assertContains("test-documents/testEXCEL.xls", content);
- assertContains("Sample Excel Worksheet", content);
- assertContains("test-documents/testHTML.html", content);
- assertContains("Test Indexation Html", content);
- assertContains("test-documents/testOpenOffice2.odt", content);
- assertContains("This is a sample Open Office document", content);
- assertContains("test-documents/testPDF.pdf", content);
- assertContains("Apache Tika", content);
- assertContains("test-documents/testPPT.ppt", content);
- assertContains("Sample Powerpoint Slide", content);
- assertContains("test-documents/testRTF.rtf", content);
- assertContains("indexation Word", content);
- assertContains("test-documents/testTXT.txt", content);
- assertContains("Test d'indexation de Txt", content);
- assertContains("test-documents/testWORD.doc", content);
- assertContains("This is a sample Microsoft Word Document", content);
- assertContains("test-documents/testXML.xml", content);
- assertContains("Rida Benjelloun", content);
- }
-
- @Test
- public void testGzipParsing() throws Exception {
- Parser parser = new AutoDetectParser(); // Should auto-detect!
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
- try (InputStream stream = PackageTest.class.getResourceAsStream(
- "/test-documents/test-documents.tgz")) {
- parser.parse(stream, handler, metadata, recursingContext);
- }
-
- assertEquals("application/gzip", metadata.get(Metadata.CONTENT_TYPE));
- String content = handler.toString();
- assertContains("test-documents/testEXCEL.xls", content);
- assertContains("Sample Excel Worksheet", content);
- assertContains("test-documents/testHTML.html", content);
- assertContains("Test Indexation Html", content);
- assertContains("test-documents/testOpenOffice2.odt", content);
- assertContains("This is a sample Open Office document", content);
- assertContains("test-documents/testPDF.pdf", content);
- assertContains("Apache Tika", content);
- assertContains("test-documents/testPPT.ppt", content);
- assertContains("Sample Powerpoint Slide", content);
- assertContains("test-documents/testRTF.rtf", content);
- assertContains("indexation Word", content);
- assertContains("test-documents/testTXT.txt", content);
- assertContains("Test d'indexation de Txt", content);
- assertContains("test-documents/testWORD.doc", content);
- assertContains("This is a sample Microsoft Word Document", content);
- assertContains("test-documents/testXML.xml", content);
- assertContains("Rida Benjelloun", content);
- }
-
- @Test
- public void testRarParsing() throws Exception {
- Parser parser = new AutoDetectParser(); // Should auto-detect!
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
- try (InputStream stream = PackageTest.class.getResourceAsStream(
- "/test-documents/test-documents.rar")) {
- parser.parse(stream, handler, metadata, recursingContext);
- }
-
- assertEquals("application/x-rar-compressed", metadata.get(Metadata.CONTENT_TYPE));
- String content = handler.toString();
- assertContains("test-documents/testEXCEL.xls", content);
- assertContains("Sample Excel Worksheet", content);
- assertContains("test-documents/testHTML.html", content);
- assertContains("Test Indexation Html", content);
- assertContains("test-documents/testOpenOffice2.odt", content);
- assertContains("This is a sample Open Office document", content);
- assertContains("test-documents/testPDF.pdf", content);
- assertContains("Apache Tika", content);
- assertContains("test-documents/testPPT.ppt", content);
- assertContains("Sample Powerpoint Slide", content);
- assertContains("test-documents/testRTF.rtf", content);
- assertContains("indexation Word", content);
- assertContains("test-documents/testTXT.txt", content);
- assertContains("Test d'indexation de Txt", content);
- assertContains("test-documents/testWORD.doc", content);
- assertContains("This is a sample Microsoft Word Document", content);
- assertContains("test-documents/testXML.xml", content);
- assertContains("Rida Benjelloun", content);
- }
-
- @Test
- public void test7ZParsing() throws Exception {
- Parser parser = new AutoDetectParser(); // Should auto-detect!
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
- // Ensure 7zip is a parsable format
- assertTrue("No 7zip parser found",
- parser.getSupportedTypes(recursingContext).contains(TYPE_7ZIP));
-
- // Parse
- try (InputStream stream = PackageTest.class.getResourceAsStream(
- "/test-documents/test-documents.7z")) {
- parser.parse(stream, handler, metadata, recursingContext);
- }
-
- assertEquals(TYPE_7ZIP.toString(), metadata.get(Metadata.CONTENT_TYPE));
- String content = handler.toString();
- assertContains("test-documents/testEXCEL.xls", content);
- assertContains("Sample Excel Worksheet", content);
- assertContains("test-documents/testHTML.html", content);
- assertContains("Test Indexation Html", content);
- assertContains("test-documents/testOpenOffice2.odt", content);
- assertContains("This is a sample Open Office document", content);
- assertContains("test-documents/testPDF.pdf", content);
- assertContains("Apache Tika", content);
- assertContains("test-documents/testPPT.ppt", content);
- assertContains("Sample Powerpoint Slide", content);
- assertContains("test-documents/testRTF.rtf", content);
- assertContains("indexation Word", content);
- assertContains("test-documents/testTXT.txt", content);
- assertContains("Test d'indexation de Txt", content);
- assertContains("test-documents/testWORD.doc", content);
- assertContains("This is a sample Microsoft Word Document", content);
- assertContains("test-documents/testXML.xml", content);
- assertContains("Rida Benjelloun", content);
- }
- @Test
- public void testTarParsing() throws Exception {
- Parser parser = new AutoDetectParser(); // Should auto-detect!
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
- try (InputStream stream = PackageTest.class.getResourceAsStream(
- "/test-documents/test-documents.tar")) {
- parser.parse(stream, handler, metadata, recursingContext);
- }
-
- assertEquals("application/x-tar", metadata.get(Metadata.CONTENT_TYPE));
- String content = handler.toString();
- assertContains("test-documents/testEXCEL.xls", content);
- assertContains("Sample Excel Worksheet", content);
- assertContains("test-documents/testHTML.html", content);
- assertContains("Test Indexation Html", content);
- assertContains("test-documents/testOpenOffice2.odt", content);
- assertContains("This is a sample Open Office document", content);
- assertContains("test-documents/testPDF.pdf", content);
- assertContains("Apache Tika", content);
- assertContains("test-documents/testPPT.ppt", content);
- assertContains("Sample Powerpoint Slide", content);
- assertContains("test-documents/testRTF.rtf", content);
- assertContains("indexation Word", content);
- assertContains("test-documents/testTXT.txt", content);
- assertContains("Test d'indexation de Txt", content);
- assertContains("test-documents/testWORD.doc", content);
- assertContains("This is a sample Microsoft Word Document", content);
- assertContains("test-documents/testXML.xml", content);
- assertContains("Rida Benjelloun", content);
- }
-
- @Test
- public void testZipParsing() throws Exception {
- Parser parser = new AutoDetectParser(); // Should auto-detect!
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
- try (InputStream stream = PackageTest.class.getResourceAsStream(
- "/test-documents/test-documents.zip")) {
- parser.parse(stream, handler, metadata, recursingContext);
- }
-
- assertEquals("application/zip", metadata.get(Metadata.CONTENT_TYPE));
- String content = handler.toString();
- assertContains("testEXCEL.xls", content);
- assertContains("Sample Excel Worksheet", content);
- assertContains("testHTML.html", content);
- assertContains("Test Indexation Html", content);
- assertContains("testOpenOffice2.odt", content);
- assertContains("This is a sample Open Office document", content);
- assertContains("testPDF.pdf", content);
- assertContains("Apache Tika", content);
- assertContains("testPPT.ppt", content);
- assertContains("Sample Powerpoint Slide", content);
- assertContains("testRTF.rtf", content);
- assertContains("indexation Word", content);
- assertContains("testTXT.txt", content);
- assertContains("Test d'indexation de Txt", content);
- assertContains("testWORD.doc", content);
- assertContains("This is a sample Microsoft Word Document", content);
- assertContains("testXML.xml", content);
- assertContains("Rida Benjelloun", content);
- }
-
- @Test
- public void testSvgzParsing() throws Exception {
- Parser parser = new AutoDetectParser(); // Should auto-detect!
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
- try (InputStream stream = PackageTest.class.getResourceAsStream(
- "/test-documents/testSVG.svgz")) {
- parser.parse(stream, handler, metadata, recursingContext);
- }
-
- assertEquals("application/gzip", metadata.get(Metadata.CONTENT_TYPE));
- String content = handler.toString();
- assertContains("Test SVG image", content);
- }
-}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parsers/src/test/java/org/apache/tika/sax/PhoneExtractingContentHandlerTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/sax/PhoneExtractingContentHandlerTest.java b/tika-parsers/src/test/java/org/apache/tika/sax/PhoneExtractingContentHandlerTest.java
deleted file mode 100644
index d30759a..0000000
--- a/tika-parsers/src/test/java/org/apache/tika/sax/PhoneExtractingContentHandlerTest.java
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.sax;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.AutoDetectParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.junit.Test;
-
-import java.io.InputStream;
-
-import static org.apache.tika.TikaTest.assertContains;
-
-/**
- * Test class for the {@link org.apache.tika.sax.PhoneExtractingContentHandler}
- * class. This demonstrates how to parse a document and retrieve any phone numbers
- * found within.
- *
- * The phone numbers are added to a multivalued Metadata object under the key, "phonenumbers".
- * You can get an array of phone numbers by calling metadata.getValues("phonenumber").
- */
-public class PhoneExtractingContentHandlerTest {
- @Test
- public void testExtractPhoneNumbers() throws Exception {
- Parser parser = new AutoDetectParser();
- Metadata metadata = new Metadata();
- // The PhoneExtractingContentHandler will examine any characters for phone numbers before passing them
- // to the underlying Handler.
- PhoneExtractingContentHandler handler = new PhoneExtractingContentHandler(new BodyContentHandler(), metadata);
- try (InputStream stream = PhoneExtractingContentHandlerTest.class.getResourceAsStream("/test-documents/testPhoneNumberExtractor.odt")) {
- parser.parse(stream, handler, metadata, new ParseContext());
- }
- String[] phoneNumbers = metadata.getValues("phonenumbers");
- assertContains("9498888888", phoneNumbers[0]);
- assertContains("9497777777", phoneNumbers[1]);
- assertContains("9496666666", phoneNumbers[2]);
- assertContains("9495555555", phoneNumbers[3]);
- assertContains("4193404645", phoneNumbers[4]);
- assertContains("9044687081", phoneNumbers[5]);
- assertContains("2604094811", phoneNumbers[6]);
- }
-}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parsers/src/test/java/org/apache/tika/utils/ServiceLoaderUtilsTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/utils/ServiceLoaderUtilsTest.java b/tika-parsers/src/test/java/org/apache/tika/utils/ServiceLoaderUtilsTest.java
deleted file mode 100644
index 62660c8..0000000
--- a/tika-parsers/src/test/java/org/apache/tika/utils/ServiceLoaderUtilsTest.java
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.utils;
-
-import static org.junit.Assert.assertNotEquals;
-import static org.junit.Assert.assertTrue;
-
-import org.apache.tika.TikaTest;
-import org.apache.tika.parser.DefaultParser;
-import org.apache.tika.parser.Parser;
-import org.junit.Test;
-
-public class ServiceLoaderUtilsTest extends TikaTest {
- @Test
- public void testOrdering() throws Exception {
- //make sure that non Tika parsers come last
- //which means that they'll overwrite Tika parsers and
- //be preferred.
- DefaultParser defaultParser = new DefaultParser();
- int vorbisIndex = -1;
- int fictIndex = -1;
- int dcxmlIndex = -1;
- int i = 0;
- for (Parser p : defaultParser.getAllComponentParsers()) {
- if ("class org.gagravarr.tika.VorbisParser".equals(p.getClass().toString())) {
- vorbisIndex = i;
- }
- if ("class org.apache.tika.parser.xml.FictionBookParser".equals(p.getClass().toString())) {
- fictIndex = i;
- }
- if ("class org.apache.tika.parser.xml.DcXMLParser".equals(p.getClass().toString())) {
- dcxmlIndex = i;
- }
- i++;
- }
-
- assertNotEquals(vorbisIndex, fictIndex);
- assertNotEquals(fictIndex, dcxmlIndex);
- assertTrue(vorbisIndex > fictIndex);
- assertTrue(fictIndex > dcxmlIndex);
- }
-}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-server/pom.xml
----------------------------------------------------------------------
diff --git a/tika-server/pom.xml b/tika-server/pom.xml
index fa55b55..7d118cb 100644
--- a/tika-server/pom.xml
+++ b/tika-server/pom.xml
@@ -131,7 +131,13 @@
<type>test-jar</type>
<scope>test</scope>
</dependency>
-
+ <dependency>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-test-resources</artifactId>
+ <version>${project.version}</version>
+ <type>test-jar</type>
+ <scope>test</scope>
+ </dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-server/src/main/java/org/apache/tika/server/TikaServerCli.java
----------------------------------------------------------------------
diff --git a/tika-server/src/main/java/org/apache/tika/server/TikaServerCli.java b/tika-server/src/main/java/org/apache/tika/server/TikaServerCli.java
index 4804398..aafde60 100644
--- a/tika-server/src/main/java/org/apache/tika/server/TikaServerCli.java
+++ b/tika-server/src/main/java/org/apache/tika/server/TikaServerCli.java
@@ -39,7 +39,7 @@ import org.apache.cxf.rs.security.cors.CrossOriginResourceSharingFilter;
import org.apache.tika.Tika;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.parser.DigestingParser;
-import org.apache.tika.parser.utils.CommonsDigester;
+import org.apache.tika.parser.digesting.CommonsDigester;
import org.apache.tika.server.resource.DetectorResource;
import org.apache.tika.server.resource.LanguageResource;
import org.apache.tika.server.resource.MetadataResource;
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-server/src/test/java/org/apache/tika/server/CXFTestBase.java
----------------------------------------------------------------------
diff --git a/tika-server/src/test/java/org/apache/tika/server/CXFTestBase.java b/tika-server/src/test/java/org/apache/tika/server/CXFTestBase.java
index 9dab196..0eb3cc3 100644
--- a/tika-server/src/test/java/org/apache/tika/server/CXFTestBase.java
+++ b/tika-server/src/test/java/org/apache/tika/server/CXFTestBase.java
@@ -18,8 +18,6 @@
package org.apache.tika.server;
import static java.nio.charset.StandardCharsets.UTF_8;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertTrue;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
@@ -41,14 +39,15 @@ import org.apache.cxf.binding.BindingFactoryManager;
import org.apache.cxf.endpoint.Server;
import org.apache.cxf.jaxrs.JAXRSBindingFactory;
import org.apache.cxf.jaxrs.JAXRSServerFactoryBean;
+import org.apache.tika.TikaTest;
import org.apache.tika.config.TikaConfig;
-import org.apache.tika.parser.utils.CommonsDigester;
+import org.apache.tika.parser.digesting.CommonsDigester;
import org.apache.tika.server.resource.TikaResource;
import org.apache.tika.server.resource.UnpackerResource;
import org.junit.After;
import org.junit.Before;
-public abstract class CXFTestBase {
+public abstract class CXFTestBase extends TikaTest {
private final static int DIGESTER_READ_LIMIT = 20*1024*1024;
protected static final String endPoint =
@@ -56,13 +55,6 @@ public abstract class CXFTestBase {
protected Server server;
private TikaConfig tika;
- public static void assertContains(String needle, String haystack) {
- assertTrue(needle + " not found in:\n" + haystack, haystack.contains(needle));
- }
-
- public static void assertNotFound(String needle, String haystack) {
- assertFalse(needle + " unexpectedly found in:\n" + haystack, haystack.contains(needle));
- }
protected static InputStream copy(InputStream in, int remaining) throws IOException {
ByteArrayOutputStream out = new ByteArrayOutputStream();
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-server/src/test/java/org/apache/tika/server/DetectorResourceTest.java
----------------------------------------------------------------------
diff --git a/tika-server/src/test/java/org/apache/tika/server/DetectorResourceTest.java b/tika-server/src/test/java/org/apache/tika/server/DetectorResourceTest.java
index 3d4dc1f..2ec2682 100644
--- a/tika-server/src/test/java/org/apache/tika/server/DetectorResourceTest.java
+++ b/tika-server/src/test/java/org/apache/tika/server/DetectorResourceTest.java
@@ -68,7 +68,7 @@ public class DetectorResourceTest extends CXFTestBase {
.accept("*/*")
.header("Content-Disposition",
"attachment; filename=" + FOO_CSV)
- .put(ClassLoader.getSystemResourceAsStream(FOO_CSV));
+ .put(getTestDocumentAsStream(FOO_CSV));
assertNotNull(response);
String readMime = getStringFromInputStream((InputStream) response
.getEntity());
@@ -85,7 +85,7 @@ public class DetectorResourceTest extends CXFTestBase {
.accept("*/*")
.header("Content-Disposition",
"attachment; filename=" + CDEC_CSV_NO_EXT)
- .put(ClassLoader.getSystemResourceAsStream(CDEC_CSV_NO_EXT));
+ .put(getTestDocumentAsStream(CDEC_CSV_NO_EXT));
assertNotNull(response);
String readMime = getStringFromInputStream((InputStream) response
.getEntity());
@@ -98,7 +98,7 @@ public class DetectorResourceTest extends CXFTestBase {
.accept("*/*")
.header("Content-Disposition",
"attachment; filename=" + CDEC_CSV_NO_EXT + ".csv")
- .put(ClassLoader.getSystemResourceAsStream(CDEC_CSV_NO_EXT));
+ .put(getTestDocumentAsStream(CDEC_CSV_NO_EXT));
assertNotNull(response);
readMime = getStringFromInputStream((InputStream) response.getEntity());
assertEquals("text/csv", readMime);
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-server/src/test/java/org/apache/tika/server/LanguageResourceTest.java
----------------------------------------------------------------------
diff --git a/tika-server/src/test/java/org/apache/tika/server/LanguageResourceTest.java b/tika-server/src/test/java/org/apache/tika/server/LanguageResourceTest.java
index c3ca475..c2c4397 100644
--- a/tika-server/src/test/java/org/apache/tika/server/LanguageResourceTest.java
+++ b/tika-server/src/test/java/org/apache/tika/server/LanguageResourceTest.java
@@ -87,7 +87,7 @@ public class LanguageResourceTest extends CXFTestBase {
String url = endPoint + LANG_STREAM_PATH;
Response response = WebClient.create(url).type("text/plain")
.accept("text/plain")
- .put(ClassLoader.getSystemResourceAsStream("english.txt"));
+ .put(getTestDocumentAsStream("english.txt"));
assertNotNull(response);
String readLang = getStringFromInputStream((InputStream) response
.getEntity());
@@ -99,7 +99,7 @@ public class LanguageResourceTest extends CXFTestBase {
String url = endPoint + LANG_STREAM_PATH;
Response response = WebClient.create(url).type("text/plain")
.accept("text/plain")
- .put(ClassLoader.getSystemResourceAsStream("french.txt"));
+ .put(getTestDocumentAsStream("french.txt"));
assertNotNull(response);
String readLang = getStringFromInputStream((InputStream) response
.getEntity());
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-server/src/test/java/org/apache/tika/server/MetadataResourceTest.java
----------------------------------------------------------------------
diff --git a/tika-server/src/test/java/org/apache/tika/server/MetadataResourceTest.java b/tika-server/src/test/java/org/apache/tika/server/MetadataResourceTest.java
index 7cd5f1d..5e4c0d0 100644
--- a/tika-server/src/test/java/org/apache/tika/server/MetadataResourceTest.java
+++ b/tika-server/src/test/java/org/apache/tika/server/MetadataResourceTest.java
@@ -73,8 +73,7 @@ public class MetadataResourceTest extends CXFTestBase {
.create(endPoint + META_PATH)
.type("application/msword")
.accept("text/csv")
- .put(ClassLoader
- .getSystemResourceAsStream(TikaResourceTest.TEST_DOC));
+ .put(getTestDocumentAsStream(TikaResourceTest.TEST_DOC));
Reader reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
@@ -100,8 +99,7 @@ public class MetadataResourceTest extends CXFTestBase {
.create(endPoint + META_PATH)
.type("application/vnd.ms-excel")
.accept("text/csv")
- .put(ClassLoader
- .getSystemResourceAsStream(TikaResourceTest.TEST_PASSWORD_PROTECTED));
+ .put(getTestDocumentAsStream(TikaResourceTest.TEST_PASSWORD_PROTECTED));
// Won't work, no password given
assertEquals(500, response.getStatus());
@@ -112,7 +110,7 @@ public class MetadataResourceTest extends CXFTestBase {
.type("application/vnd.ms-excel")
.accept("text/csv")
.header("Password", "wrong password")
- .put(ClassLoader.getSystemResourceAsStream(TikaResourceTest.TEST_PASSWORD_PROTECTED));
+ .put(getTestDocumentAsStream(TikaResourceTest.TEST_PASSWORD_PROTECTED));
assertEquals(500, response.getStatus());
@@ -122,7 +120,7 @@ public class MetadataResourceTest extends CXFTestBase {
.type("application/vnd.ms-excel")
.accept("text/csv")
.header("Password", "password")
- .put(ClassLoader.getSystemResourceAsStream(TikaResourceTest.TEST_PASSWORD_PROTECTED));
+ .put(getTestDocumentAsStream(TikaResourceTest.TEST_PASSWORD_PROTECTED));
// Will work
assertEquals(200, response.getStatus());
@@ -149,8 +147,7 @@ public class MetadataResourceTest extends CXFTestBase {
.create(endPoint + META_PATH)
.type("application/msword")
.accept("application/json")
- .put(ClassLoader
- .getSystemResourceAsStream(TikaResourceTest.TEST_DOC));
+ .put(getTestDocumentAsStream(TikaResourceTest.TEST_DOC));
Reader reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
@@ -165,8 +162,7 @@ public class MetadataResourceTest extends CXFTestBase {
.create(endPoint + META_PATH)
.type("application/msword")
.accept("application/rdf+xml")
- .put(ClassLoader
- .getSystemResourceAsStream(TikaResourceTest.TEST_DOC));
+ .put(getTestDocumentAsStream(TikaResourceTest.TEST_DOC));
String result = IOUtils.readStringFromStream((InputStream) response.getEntity());
assertContains("<rdf:li>Maxim Valyanskiy</rdf:li>", result);
@@ -176,14 +172,14 @@ public class MetadataResourceTest extends CXFTestBase {
@Test
public void testGetField_XXX_NotFound() throws Exception {
Response response = WebClient.create(endPoint + META_PATH + "/xxx").type("application/msword")
- .accept(MediaType.APPLICATION_JSON).put(ClassLoader.getSystemResourceAsStream(TikaResourceTest.TEST_DOC));
+ .accept(MediaType.APPLICATION_JSON).put(getTestDocumentAsStream(TikaResourceTest.TEST_DOC));
Assert.assertEquals(Response.Status.NOT_FOUND.getStatusCode(), response.getStatus());
}
@Test
public void testGetField_Author_TEXT_Partial_BAD_REQUEST() throws Exception {
- InputStream stream = ClassLoader.getSystemResourceAsStream(TikaResourceTest.TEST_DOC);
+ InputStream stream = getTestDocumentAsStream(TikaResourceTest.TEST_DOC);
Response response = WebClient.create(endPoint + META_PATH + "/Author").type("application/msword")
.accept(MediaType.TEXT_PLAIN).put(copy(stream, 8000));
@@ -193,7 +189,7 @@ public class MetadataResourceTest extends CXFTestBase {
@Test
public void testGetField_Author_TEXT_Partial_Found() throws Exception {
- InputStream stream = ClassLoader.getSystemResourceAsStream(TikaResourceTest.TEST_DOC);
+ InputStream stream = getTestDocumentAsStream(TikaResourceTest.TEST_DOC);
Response response = WebClient.create(endPoint + META_PATH + "/Author").type("application/msword")
.accept(MediaType.TEXT_PLAIN).put(copy(stream, 12000));
@@ -205,7 +201,7 @@ public class MetadataResourceTest extends CXFTestBase {
@Test
public void testGetField_Author_JSON_Partial_Found() throws Exception {
- InputStream stream = ClassLoader.getSystemResourceAsStream(TikaResourceTest.TEST_DOC);
+ InputStream stream = getTestDocumentAsStream(TikaResourceTest.TEST_DOC);
Response response = WebClient.create(endPoint + META_PATH + "/Author").type("application/msword")
.accept(MediaType.APPLICATION_JSON).put(copy(stream, 12000));
@@ -219,7 +215,7 @@ public class MetadataResourceTest extends CXFTestBase {
@Test
public void testGetField_Author_XMP_Partial_Found() throws Exception {
- InputStream stream = ClassLoader.getSystemResourceAsStream(TikaResourceTest.TEST_DOC);
+ InputStream stream = getTestDocumentAsStream(TikaResourceTest.TEST_DOC);
Response response = WebClient.create(endPoint + META_PATH + "/dc:creator").type("application/msword")
.accept("application/rdf+xml").put(copy(stream, 12000));
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java
----------------------------------------------------------------------
diff --git a/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java b/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java
index 56910a9..9d41ff1 100644
--- a/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java
+++ b/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java
@@ -73,8 +73,7 @@ public class RecursiveMetadataResourceTest extends CXFTestBase {
Response response = WebClient
.create(endPoint + META_PATH)
.accept("application/json")
- .put(ClassLoader
- .getSystemResourceAsStream(TEST_RECURSIVE_DOC));
+ .put(getTestDocumentAsStream(TEST_RECURSIVE_DOC));
Reader reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
List<Metadata> metadataList = JsonMetadataList.fromJson(reader);
@@ -92,8 +91,7 @@ public class RecursiveMetadataResourceTest extends CXFTestBase {
.create(endPoint + META_PATH)
.type("application/vnd.ms-excel")
.accept("application/json")
- .put(ClassLoader
- .getSystemResourceAsStream(TikaResourceTest.TEST_PASSWORD_PROTECTED));
+ .put(getTestDocumentAsStream(TikaResourceTest.TEST_PASSWORD_PROTECTED));
// Won't work, no password given
assertEquals(500, response.getStatus());
@@ -104,7 +102,7 @@ public class RecursiveMetadataResourceTest extends CXFTestBase {
.type("application/vnd.ms-excel")
.accept("application/json")
.header("Password", "password")
- .put(ClassLoader.getSystemResourceAsStream(TikaResourceTest.TEST_PASSWORD_PROTECTED));
+ .put(getTestDocumentAsStream(TikaResourceTest.TEST_PASSWORD_PROTECTED));
// Will work
assertEquals(200, response.getStatus());
@@ -122,8 +120,7 @@ public class RecursiveMetadataResourceTest extends CXFTestBase {
Response response = WebClient
.create(endPoint+META_PATH)
.accept("application/json")
- .put(ClassLoader
- .getSystemResourceAsStream(TEST_RECURSIVE_DOC));
+ .put(getTestDocumentAsStream(TEST_RECURSIVE_DOC));
Reader reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
List<Metadata> metadataList = JsonMetadataList.fromJson(reader);
@@ -135,8 +132,7 @@ public class RecursiveMetadataResourceTest extends CXFTestBase {
response = WebClient
.create(endPoint + META_PATH + SLASH)
.accept("application/json")
- .put(ClassLoader
- .getSystemResourceAsStream(TEST_RECURSIVE_DOC));
+ .put(getTestDocumentAsStream(TEST_RECURSIVE_DOC));
reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
metadataList = JsonMetadataList.fromJson(reader);
assertEquals(12, metadataList.size());
@@ -147,8 +143,7 @@ public class RecursiveMetadataResourceTest extends CXFTestBase {
response = WebClient
.create(endPoint + META_PATH + UNPARSEABLE_PATH)
.accept("application/json")
- .put(ClassLoader
- .getSystemResourceAsStream(TEST_RECURSIVE_DOC));
+ .put(getTestDocumentAsStream(TEST_RECURSIVE_DOC));
reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
metadataList = JsonMetadataList.fromJson(reader);
assertEquals(12, metadataList.size());
@@ -159,8 +154,7 @@ public class RecursiveMetadataResourceTest extends CXFTestBase {
response = WebClient
.create(endPoint + META_PATH + XML_PATH)
.accept("application/json")
- .put(ClassLoader
- .getSystemResourceAsStream(TEST_RECURSIVE_DOC));
+ .put(getTestDocumentAsStream(TEST_RECURSIVE_DOC));
reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
metadataList = JsonMetadataList.fromJson(reader);
assertEquals(12, metadataList.size());
@@ -171,8 +165,7 @@ public class RecursiveMetadataResourceTest extends CXFTestBase {
response = WebClient
.create(endPoint + META_PATH + TEXT_PATH)
.accept("application/json")
- .put(ClassLoader
- .getSystemResourceAsStream(TEST_RECURSIVE_DOC));
+ .put(getTestDocumentAsStream(TEST_RECURSIVE_DOC));
reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
metadataList = JsonMetadataList.fromJson(reader);
assertEquals(12, metadataList.size());
@@ -183,8 +176,7 @@ public class RecursiveMetadataResourceTest extends CXFTestBase {
response = WebClient
.create(endPoint + META_PATH + IGNORE_PATH)
.accept("application/json")
- .put(ClassLoader
- .getSystemResourceAsStream(TEST_RECURSIVE_DOC));
+ .put(getTestDocumentAsStream(TEST_RECURSIVE_DOC));
reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
metadataList = JsonMetadataList.fromJson(reader);
assertEquals(12, metadataList.size());
@@ -198,7 +190,7 @@ public class RecursiveMetadataResourceTest extends CXFTestBase {
Attachment attachmentPart =
new Attachment("myworddocx",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
- ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC));
+ getTestDocumentAsStream(TEST_RECURSIVE_DOC));
WebClient webClient = WebClient.create(endPoint + META_PATH + FORM_PATH);
Response response = webClient.type("multipart/form-data")
@@ -214,7 +206,7 @@ public class RecursiveMetadataResourceTest extends CXFTestBase {
attachmentPart =
new Attachment("myworddocx",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
- ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC));
+ getTestDocumentAsStream(TEST_RECURSIVE_DOC));
webClient = WebClient.create(endPoint + META_PATH + FORM_PATH + UNPARSEABLE_PATH);
response = webClient.type("multipart/form-data")
@@ -230,7 +222,7 @@ public class RecursiveMetadataResourceTest extends CXFTestBase {
attachmentPart =
new Attachment("myworddocx",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
- ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC));
+ getTestDocumentAsStream(TEST_RECURSIVE_DOC));
webClient = WebClient.create(endPoint + META_PATH + FORM_PATH + XML_PATH);
response = webClient.type("multipart/form-data")
@@ -246,7 +238,7 @@ public class RecursiveMetadataResourceTest extends CXFTestBase {
attachmentPart =
new Attachment("myworddocx",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
- ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC));
+ getTestDocumentAsStream(TEST_RECURSIVE_DOC));
webClient = WebClient.create(endPoint + META_PATH + FORM_PATH+TEXT_PATH);
response = webClient.type("multipart/form-data")
@@ -262,7 +254,7 @@ public class RecursiveMetadataResourceTest extends CXFTestBase {
attachmentPart =
new Attachment("myworddocx",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
- ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC));
+ getTestDocumentAsStream(TEST_RECURSIVE_DOC));
webClient = WebClient.create(endPoint + META_PATH +FORM_PATH+IGNORE_PATH);
response = webClient.type("multipart/form-data")
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-server/src/test/java/org/apache/tika/server/StackTraceOffTest.java
----------------------------------------------------------------------
diff --git a/tika-server/src/test/java/org/apache/tika/server/StackTraceOffTest.java b/tika-server/src/test/java/org/apache/tika/server/StackTraceOffTest.java
index bd5fefe..065cf52 100644
--- a/tika-server/src/test/java/org/apache/tika/server/StackTraceOffTest.java
+++ b/tika-server/src/test/java/org/apache/tika/server/StackTraceOffTest.java
@@ -90,7 +90,7 @@ public class StackTraceOffTest extends CXFTestBase {
.accept("*/*")
.header("Content-Disposition",
"attachment; filename=" + TEST_PASSWORD_PROTECTED)
- .put(ClassLoader.getSystemResourceAsStream(TEST_PASSWORD_PROTECTED));
+ .put(getTestDocumentAsStream(TEST_PASSWORD_PROTECTED));
assertNotNull("null response: " + path, response);
assertEquals("unprocessable: " + path, UNPROCESSEABLE, response.getStatus());
String msg = getStringFromInputStream((InputStream) response
@@ -105,7 +105,7 @@ public class StackTraceOffTest extends CXFTestBase {
Response response = WebClient
.create(endPoint + path)
.accept("*/*")
- .put(ClassLoader.getSystemResourceAsStream(TEST_NULL));
+ .put(getTestDocumentAsStream(TEST_NULL));
assertNotNull("null response: " + path, response);
assertEquals("unprocessable: " + path, UNPROCESSEABLE, response.getStatus());
String msg = getStringFromInputStream((InputStream) response
@@ -124,7 +124,7 @@ public class StackTraceOffTest extends CXFTestBase {
.accept("*/*")
.header("Content-Disposition",
"attachment; filename=null_pointer.evil")
- .put(ClassLoader.getSystemResourceAsStream(TEST_NULL));
+ .put(getTestDocumentAsStream(TEST_NULL));
assertNotNull("null response: " + path, response);
assertEquals("bad type: " + path, 415, response.getStatus());
String msg = getStringFromInputStream((InputStream) response
@@ -139,7 +139,7 @@ public class StackTraceOffTest extends CXFTestBase {
//exceptions as the others...
@Test
public void testMeta() throws Exception {
- InputStream stream = ClassLoader.getSystemResourceAsStream(TikaResourceTest.TEST_DOC);
+ InputStream stream = getTestDocumentAsStream(TikaResourceTest.TEST_DOC);
Response response = WebClient.create(endPoint + "/meta" + "/Author").type("application/msword")
.accept(MediaType.TEXT_PLAIN).put(copy(stream, 8000));
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-server/src/test/java/org/apache/tika/server/StackTraceTest.java
----------------------------------------------------------------------
diff --git a/tika-server/src/test/java/org/apache/tika/server/StackTraceTest.java b/tika-server/src/test/java/org/apache/tika/server/StackTraceTest.java
index 410824a..a45d1eb 100644
--- a/tika-server/src/test/java/org/apache/tika/server/StackTraceTest.java
+++ b/tika-server/src/test/java/org/apache/tika/server/StackTraceTest.java
@@ -84,7 +84,7 @@ public class StackTraceTest extends CXFTestBase {
.accept("*/*")
.header("Content-Disposition",
"attachment; filename=" + TEST_PASSWORD_PROTECTED)
- .put(ClassLoader.getSystemResourceAsStream(TEST_PASSWORD_PROTECTED));
+ .put(getTestDocumentAsStream(TEST_PASSWORD_PROTECTED));
assertNotNull("null response: " + path, response);
assertEquals("unprocessable: " + path, UNPROCESSEABLE, response.getStatus());
String msg = getStringFromInputStream((InputStream) response
@@ -100,12 +100,12 @@ public class StackTraceTest extends CXFTestBase {
Response response = WebClient
.create(endPoint + path)
.accept("*/*")
- .put(ClassLoader.getSystemResourceAsStream(TEST_NULL));
+ .put(getTestDocumentAsStream(TEST_NULL));
assertNotNull("null response: " + path, response);
assertEquals("unprocessable: " + path, UNPROCESSEABLE, response.getStatus());
String msg = getStringFromInputStream((InputStream) response
.getEntity());
- assertContains("Caused by: java.lang.NullPointerException: null pointer message",
+ assertContains("Caused by: java.lang.NullPointerException: another null pointer exception",
msg);
}
}
@@ -135,7 +135,7 @@ public class StackTraceTest extends CXFTestBase {
//exceptions as the others...
@Test
public void testMeta() throws Exception {
- InputStream stream = ClassLoader.getSystemResourceAsStream(TikaResourceTest.TEST_DOC);
+ InputStream stream = getTestDocumentAsStream(TikaResourceTest.TEST_DOC);
Response response = WebClient.create(endPoint + "/meta" + "/Author").type("application/msword")
.accept(MediaType.TEXT_PLAIN).put(copy(stream, 8000));
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-server/src/test/java/org/apache/tika/server/TikaParsersTest.java
----------------------------------------------------------------------
diff --git a/tika-server/src/test/java/org/apache/tika/server/TikaParsersTest.java b/tika-server/src/test/java/org/apache/tika/server/TikaParsersTest.java
index e4e60a5..5e5c735 100644
--- a/tika-server/src/test/java/org/apache/tika/server/TikaParsersTest.java
+++ b/tika-server/src/test/java/org/apache/tika/server/TikaParsersTest.java
@@ -79,9 +79,9 @@ public class TikaParsersTest extends CXFTestBase {
assertContains("audio/ogg", text);
} else {
// Shouldn't do
- assertNotFound("text/plain", text);
- assertNotFound("application/pdf", text);
- assertNotFound("audio/ogg", text);
+ assertNotContained("text/plain", text);
+ assertNotContained("application/pdf", text);
+ assertNotContained("audio/ogg", text);
}
}
}
@@ -114,9 +114,9 @@ public class TikaParsersTest extends CXFTestBase {
assertContains("<li>audio/ogg", text);
} else {
// Shouldn't do
- assertNotFound("text/plain", text);
- assertNotFound("application/pdf", text);
- assertNotFound("audio/ogg", text);
+ assertNotContained("text/plain", text);
+ assertNotContained("application/pdf", text);
+ assertNotContained("audio/ogg", text);
}
}
}
[11/13] tika git commit: TIKA-1855 -- first pass. Need to turn back
on the forbidden-apis testCheck. More clean up remains.
Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-app/src/test/java/org/apache/tika/mime/TestMimeTypes.java
----------------------------------------------------------------------
diff --git a/tika-app/src/test/java/org/apache/tika/mime/TestMimeTypes.java b/tika-app/src/test/java/org/apache/tika/mime/TestMimeTypes.java
new file mode 100644
index 0000000..b852de0
--- /dev/null
+++ b/tika-app/src/test/java/org/apache/tika/mime/TestMimeTypes.java
@@ -0,0 +1,1044 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.mime;
+
+// Junit imports
+
+import static java.nio.charset.StandardCharsets.UTF_16BE;
+import static java.nio.charset.StandardCharsets.UTF_16LE;
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNotSame;
+
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+
+import org.apache.tika.Tika;
+import org.apache.tika.TikaTest;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.metadata.Metadata;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ *
+ * Test Suite for the {@link MimeTypes} repository.
+ *
+ */
+public class TestMimeTypes extends TikaTest {
+
+ private Tika tika;
+
+ private MimeTypes repo;
+
+ private URL u;
+
+ private static final File f = new File("/a/b/c/x.pdf");
+
+ @Before
+ public void setUp() throws Exception{
+ TikaConfig config = TikaConfig.getDefaultConfig();
+ repo = config.getMimeRepository();
+ tika = new Tika(config);
+ u = new URL("http://mydomain.com/x.pdf?x=y");
+ }
+
+ @Test
+ public void testCaseSensitivity() {
+ String type = tika.detect("test.PDF");
+ assertNotNull(type);
+ assertEquals(type, tika.detect("test.pdf"));
+ assertEquals(type, tika.detect("test.PdF"));
+ assertEquals(type, tika.detect("test.pdF"));
+ }
+
+ @Test
+ public void testLoadMimeTypes() throws MimeTypeException {
+ assertNotNull(repo.forName("application/octet-stream"));
+ assertNotNull(repo.forName("text/x-tex"));
+ }
+
+ /**
+ * Tests MIME type determination based solely on the URL's extension.
+ */
+ @Test
+ public void testGuessMimeTypes() throws Exception {
+ assertTypeByName("application/pdf", "x.pdf");
+ assertEquals("application/pdf", tika.detect(u.toExternalForm()));
+ assertEquals("application/pdf", tika.detect(f.getPath()));
+ assertTypeByName("text/plain", "x.txt");
+ assertTypeByName("text/html", "x.htm");
+ assertTypeByName("text/html", "x.html");
+ assertTypeByName("application/xhtml+xml", "x.xhtml");
+ assertTypeByName("application/xml", "x.xml");
+ assertTypeByName("application/zip", "x.zip");
+ assertTypeByName("application/vnd.oasis.opendocument.text", "x.odt");
+ assertTypeByName("application/octet-stream", "x.unknown");
+
+ // Test for the MS Office media types and file extensions listed in
+ // http://blogs.msdn.com/vsofficedeveloper/pages/Office-2007-Open-XML-MIME-Types.aspx
+ assertTypeByName("application/msword", "x.doc");
+ assertTypeByName("application/msword", "x.dot");
+ assertTypeByName("application/vnd.openxmlformats-officedocument.wordprocessingml.document", "x.docx");
+ assertTypeByName("application/vnd.openxmlformats-officedocument.wordprocessingml.template", "x.dotx");
+ assertTypeByName("application/vnd.ms-word.document.macroenabled.12", "x.docm");
+ assertTypeByName("application/vnd.ms-word.template.macroenabled.12", "x.dotm");
+ assertTypeByName("application/vnd.ms-excel", "x.xls");
+ assertTypeByName("application/vnd.ms-excel", "x.xlt");
+ assertTypeByName("application/vnd.ms-excel", "x.xla");
+ assertTypeByName("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "x.xlsx");
+ assertTypeByName("application/vnd.openxmlformats-officedocument.spreadsheetml.template", "x.xltx");
+ assertTypeByName("application/vnd.ms-excel.sheet.macroenabled.12", "x.xlsm");
+ assertTypeByName("application/vnd.ms-excel.template.macroenabled.12", "x.xltm");
+ assertTypeByName("application/vnd.ms-excel.addin.macroenabled.12", "x.xlam");
+ assertTypeByName("application/vnd.ms-excel.sheet.binary.macroenabled.12", "x.xlsb");
+ assertTypeByName("application/vnd.ms-powerpoint", "x.ppt");
+ assertTypeByName("application/vnd.ms-powerpoint", "x.pot");
+ assertTypeByName("application/vnd.ms-powerpoint", "x.pps");
+ assertTypeByName("application/vnd.ms-powerpoint", "x.ppa");
+ assertTypeByName("application/vnd.openxmlformats-officedocument.presentationml.presentation", "x.pptx");
+ assertTypeByName("application/vnd.openxmlformats-officedocument.presentationml.template", "x.potx");
+ assertTypeByName("application/vnd.openxmlformats-officedocument.presentationml.slideshow", "x.ppsx");
+ assertTypeByName("application/vnd.ms-powerpoint.addin.macroenabled.12", "x.ppam");
+ assertTypeByName("application/vnd.ms-powerpoint.presentation.macroenabled.12", "x.pptm");
+ assertTypeByName("application/vnd.ms-powerpoint.template.macroenabled.12", "x.potm");
+ assertTypeByName("application/vnd.ms-powerpoint.slideshow.macroenabled.12", "x.ppsm");
+ }
+
+ /**
+ * Note - detecting container formats by mime magic is very very
+ * iffy, as we can't be sure where things will end up.
+ * People really ought to use the container aware detection...
+ */
+ @Test
+ public void testOLE2Detection() throws Exception {
+ // These have the properties block near the start, so our mime
+ // magic will spot them
+ assertTypeByData("application/vnd.ms-excel", "testEXCEL.xls");
+
+ // This one quite legitimately doesn't have its properties block
+ // as one of the first couple of entries
+ // As such, our mime magic can't figure it out...
+ assertTypeByData("application/x-tika-msoffice", "testWORD.doc");
+ assertTypeByData("application/x-tika-msoffice", "testPPT.ppt");
+
+
+ // By name + data:
+
+ // Those we got right to start with are fine
+ assertTypeByNameAndData("application/vnd.ms-excel","testEXCEL.xls");
+
+ // And the name lets us specialise the generic OOXML
+ // ones to their actual type
+ assertTypeByNameAndData("application/vnd.ms-powerpoint", "testPPT.ppt");
+ assertTypeByNameAndData("application/msword", "testWORD.doc");
+ }
+
+ /**
+ * Files generated by Works 7.0 Spreadsheet application use the OLE2
+ * structure and resemble Excel files (they contain a "Workbook"). They are
+ * not Excel though. They are distinguished from Excel files with an
+ * additional top-level entry in below the root of the POI filesystem.
+ *
+ * @throws Exception
+ */
+ @Test
+ public void testWorksSpreadsheetDetection() throws Exception {
+ assertTypeDetection("testWORKSSpreadsheet7.0.xlr",
+ // with name-only, everything should be all right
+ "application/x-tika-msworks-spreadsheet",
+ // this is possible due to MimeTypes guessing the type
+ // based on the WksSSWorkBook near the beginning of the
+ // file
+ "application/x-tika-msworks-spreadsheet",
+ // this is right, the magic-based detection works, there is
+ // no need for the name-based detection to refine it
+ "application/x-tika-msworks-spreadsheet");
+ }
+
+ @Test
+ public void testStarOfficeDetection() throws Exception {
+ assertTypeDetection("testVORCalcTemplate.vor",
+ "application/x-staroffice-template",
+ "application/vnd.stardivision.calc",
+ "application/vnd.stardivision.calc");
+ assertTypeDetection("testVORDrawTemplate.vor",
+ "application/x-staroffice-template",
+ "application/vnd.stardivision.draw",
+ "application/vnd.stardivision.draw");
+ assertTypeDetection("testVORImpressTemplate.vor",
+ "application/x-staroffice-template",
+ "application/vnd.stardivision.impress",
+ "application/vnd.stardivision.impress");
+ assertTypeDetection("testVORWriterTemplate.vor",
+ "application/x-staroffice-template",
+ "application/vnd.stardivision.writer",
+ "application/vnd.stardivision.writer");
+
+ assertTypeDetection("testStarOffice-5.2-calc.sdc",
+ "application/vnd.stardivision.calc",
+ "application/vnd.stardivision.calc",
+ "application/vnd.stardivision.calc");
+ assertTypeDetection("testStarOffice-5.2-draw.sda",
+ "application/vnd.stardivision.draw",
+ "application/vnd.stardivision.draw",
+ "application/vnd.stardivision.draw");
+ assertTypeDetection("testStarOffice-5.2-impress.sdd",
+ "application/vnd.stardivision.impress",
+ "application/vnd.stardivision.impress",
+ "application/vnd.stardivision.impress");
+ assertTypeDetection("testStarOffice-5.2-writer.sdw",
+ "application/vnd.stardivision.writer",
+ "application/vnd.stardivision.writer",
+ "application/vnd.stardivision.writer");
+ }
+
+ /**
+ * Files generated by Works Word Processor versions 3.0 and 4.0 use the
+ * OLE2 structure. They don't resemble Word though.
+ *
+ * @throws Exception
+ */
+ @Test
+ public void testOldWorksWordProcessorDetection() throws Exception {
+ assertTypeDetection(
+ "testWORKSWordProcessor3.0.wps",
+ // .wps is just like any other works extension
+ "application/vnd.ms-works",
+ // this is due to MatOST substring
+ "application/vnd.ms-works",
+ // magic-based detection works, no need to refine it
+ "application/vnd.ms-works");
+
+ // files in version 4.0 are no different from those in version 3.0
+ assertTypeDetection(
+ "testWORKSWordProcessor4.0.wps",
+ "application/vnd.ms-works",
+ "application/vnd.ms-works",
+ "application/vnd.ms-works");
+ }
+
+ /**
+ * Files from Excel 2 through 4 are based on the BIFF record
+ * structure, but without a wrapping OLE2 structure.
+ * Excel 5 and Excel 95+ work on OLE2
+ */
+ @Test
+ public void testOldExcel() throws Exception {
+ // With just a name, we'll think everything's a new Excel file
+ assertTypeByName("application/vnd.ms-excel","testEXCEL_4.xls");
+ assertTypeByName("application/vnd.ms-excel","testEXCEL_5.xls");
+ assertTypeByName("application/vnd.ms-excel","testEXCEL_95.xls");
+
+ // With data, we can work out if it's old or new style
+ assertTypeByData("application/vnd.ms-excel.sheet.4","testEXCEL_4.xls");
+ assertTypeByData("application/x-tika-msoffice","testEXCEL_5.xls");
+ assertTypeByData("application/x-tika-msoffice","testEXCEL_95.xls");
+
+ assertTypeByNameAndData("application/vnd.ms-excel.sheet.4","testEXCEL_4.xls");
+ assertTypeByNameAndData("application/vnd.ms-excel","testEXCEL_5.xls");
+ assertTypeByNameAndData("application/vnd.ms-excel","testEXCEL_95.xls");
+ }
+
+ /**
+ * Note - detecting container formats by mime magic is very very
+ * iffy, as we can't be sure where things will end up.
+ * People really ought to use the container aware detection...
+ */
+ @Test
+ public void testOoxmlDetection() throws Exception {
+ // These two do luckily have [Content_Types].xml near the start,
+ // so our mime magic will spot them
+ assertTypeByData("application/x-tika-ooxml", "testEXCEL.xlsx");
+ assertTypeByData("application/x-tika-ooxml", "testPPT.pptx");
+
+ // This one quite legitimately doesn't have its [Content_Types].xml
+ // file as one of the first couple of entries
+ // As such, our mime magic can't figure it out...
+ assertTypeByData("application/zip", "testWORD.docx");
+
+ // If we give the filename as well as the data, we can
+ // specialise the ooxml generic one to the correct type
+ assertTypeByNameAndData("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "testEXCEL.xlsx");
+ assertTypeByNameAndData("application/vnd.openxmlformats-officedocument.presentationml.presentation", "testPPT.pptx");
+ assertTypeByNameAndData("application/vnd.openxmlformats-officedocument.wordprocessingml.document", "testWORD.docx");
+
+ // Test a few of the less usual ones
+ assertTypeByNameAndData("application/vnd.ms-excel.sheet.binary.macroenabled.12","testEXCEL.xlsb");
+ assertTypeByNameAndData("application/vnd.ms-powerpoint.presentation.macroenabled.12", "testPPT.pptm");
+ assertTypeByNameAndData("application/vnd.ms-powerpoint.template.macroenabled.12", "testPPT.potm");
+ assertTypeByNameAndData("application/vnd.ms-powerpoint.slideshow.macroenabled.12", "testPPT.ppsm");
+ }
+
+ /**
+ * Note - container based formats, needs container detection
+ * to be properly correct
+ */
+ @Test
+ public void testVisioDetection() throws Exception {
+ // By Name, should get it right
+ assertTypeByName("application/vnd.visio", "testVISIO.vsd");
+ assertTypeByName("application/vnd.ms-visio.drawing.macroenabled.12", "testVISIO.vsdm");
+ assertTypeByName("application/vnd.ms-visio.drawing", "testVISIO.vsdx");
+ assertTypeByName("application/vnd.ms-visio.stencil.macroenabled.12", "testVISIO.vssm");
+ assertTypeByName("application/vnd.ms-visio.stencil", "testVISIO.vssx");
+ assertTypeByName("application/vnd.ms-visio.template.macroenabled.12", "testVISIO.vstm");
+ assertTypeByName("application/vnd.ms-visio.template", "testVISIO.vstx");
+
+ // By Name and Data, should get it right
+ assertTypeByNameAndData("application/vnd.visio", "testVISIO.vsd");
+ assertTypeByNameAndData("application/vnd.ms-visio.drawing.macroenabled.12", "testVISIO.vsdm");
+ assertTypeByNameAndData("application/vnd.ms-visio.drawing", "testVISIO.vsdx");
+ assertTypeByNameAndData("application/vnd.ms-visio.stencil.macroenabled.12", "testVISIO.vssm");
+ assertTypeByNameAndData("application/vnd.ms-visio.stencil", "testVISIO.vssx");
+ assertTypeByNameAndData("application/vnd.ms-visio.template.macroenabled.12", "testVISIO.vstm");
+ assertTypeByNameAndData("application/vnd.ms-visio.template", "testVISIO.vstx");
+
+ // By Data only, will get the container parent
+ assertTypeByData("application/x-tika-msoffice", "testVISIO.vsd");
+ assertTypeByData("application/x-tika-ooxml", "testVISIO.vsdm");
+ assertTypeByData("application/x-tika-ooxml", "testVISIO.vsdx");
+ assertTypeByData("application/x-tika-ooxml", "testVISIO.vssm");
+ assertTypeByData("application/x-tika-ooxml", "testVISIO.vssx");
+ assertTypeByData("application/x-tika-ooxml", "testVISIO.vstm");
+ assertTypeByData("application/x-tika-ooxml", "testVISIO.vstx");
+ }
+
+ /**
+ * Note - detecting container formats by mime magic is very very
+ * iffy, as we can't be sure where things will end up.
+ * People really ought to use the container aware detection...
+ */
+ @Test
+ public void testIWorkDetection() throws Exception {
+ // By name is easy
+ assertTypeByName("application/vnd.apple.keynote", "testKeynote.key");
+ assertTypeByName("application/vnd.apple.numbers", "testNumbers.numbers");
+ assertTypeByName("application/vnd.apple.pages", "testPages.pages");
+
+ // We can't do it by data, as we'd need to unpack
+ // the zip file to check the XML
+ assertTypeByData("application/zip", "testKeynote.key");
+
+ assertTypeByNameAndData("application/vnd.apple.keynote", "testKeynote.key");
+ assertTypeByNameAndData("application/vnd.apple.numbers", "testNumbers.numbers");
+ assertTypeByNameAndData("application/vnd.apple.pages", "testPages.pages");
+ }
+
+ @Test
+ public void testArchiveDetection() throws Exception {
+ assertTypeByName("application/x-archive", "test.ar");
+ assertTypeByName("application/zip", "test.zip");
+ assertTypeByName("application/x-tar", "test.tar");
+ assertTypeByName("application/gzip", "test.tgz"); // See GZIP, not tar contents of it
+ assertTypeByName("application/x-cpio", "test.cpio");
+
+ // TODO Add an example .deb and .udeb, then check these
+
+ // Check the mime magic patterns for them work too
+ assertTypeByData("application/x-archive", "testARofText.ar");
+ assertTypeByData("application/x-archive", "testARofSND.ar");
+ assertTypeByData("application/zip", "test-documents.zip");
+ assertTypeByData("application/x-gtar", "test-documents.tar"); // GNU TAR
+ assertTypeByData("application/gzip", "test-documents.tgz"); // See GZIP, not tar contents of it
+ assertTypeByData("application/x-cpio", "test-documents.cpio");
+
+ // For spanned zip files, the .zip file doesn't have the header, it's the other parts
+ assertTypeByData("application/octet-stream", "test-documents-spanned.zip");
+ assertTypeByData("application/zip", "test-documents-spanned.z01");
+ }
+
+ @Test
+ public void testFeedsDetection() throws Exception {
+ assertType("application/rss+xml", "rsstest.rss");
+ assertType("application/atom+xml", "testATOM.atom");
+ assertTypeByData("application/rss+xml", "rsstest.rss");
+ assertTypeByName("application/rss+xml", "rsstest.rss");
+ assertTypeByData("application/atom+xml", "testATOM.atom");
+ assertTypeByName("application/atom+xml", "testATOM.atom");
+ }
+
+ @Test
+ public void testFitsDetection() throws Exception {
+ // FITS image created using imagemagick convert of testJPEG.jpg
+ assertType("application/fits", "testFITS.fits");
+ assertTypeByData("application/fits", "testFITS.fits");
+ assertTypeByName("application/fits", "testFITS.fits");
+ }
+
+ @Test
+ public void testJpegDetection() throws Exception {
+ assertType("image/jpeg", "testJPEG.jpg");
+ assertTypeByData("image/jpeg", "testJPEG.jpg");
+ assertTypeByName("image/jpeg", "x.jpg");
+ assertTypeByName("image/jpeg", "x.JPG");
+ assertTypeByName("image/jpeg", "x.jpeg");
+ assertTypeByName("image/jpeg", "x.JPEG");
+ assertTypeByName("image/jpeg", "x.jpe");
+ assertTypeByName("image/jpeg", "x.jif");
+ assertTypeByName("image/jpeg", "x.jfif");
+ assertTypeByName("image/jpeg", "x.jfi");
+
+ assertType("image/jp2", "testJPEG.jp2");
+ assertTypeByData("image/jp2", "testJPEG.jp2");
+ assertTypeByName("image/jp2", "x.jp2");
+ }
+
+ @Test
+ public void testBpgDetection() throws Exception {
+ assertType("image/x-bpg", "testBPG.bpg");
+ assertTypeByData("image/x-bpg", "testBPG.bpg");
+ assertTypeByData("image/x-bpg", "testBPG_commented.bpg");
+ assertTypeByName("image/x-bpg", "x.bpg");
+ }
+
+ @Test
+ public void testTiffDetection() throws Exception {
+ assertType("image/tiff", "testTIFF.tif");
+ assertTypeByData("image/tiff", "testTIFF.tif");
+ assertTypeByName("image/tiff", "x.tiff");
+ assertTypeByName("image/tiff", "x.tif");
+ assertTypeByName("image/tiff", "x.TIF");
+ }
+
+ @Test
+ public void testGifDetection() throws Exception {
+ assertType("image/gif", "testGIF.gif");
+ assertTypeByData("image/gif", "testGIF.gif");
+ assertTypeByName("image/gif", "x.gif");
+ assertTypeByName("image/gif", "x.GIF");
+ }
+
+ @Test
+ public void testPngDetection() throws Exception {
+ assertType("image/png", "testPNG.png");
+ assertTypeByData("image/png", "testPNG.png");
+ assertTypeByName("image/png", "x.png");
+ assertTypeByName("image/png", "x.PNG");
+ }
+
+ @Test
+ public void testWEBPDetection() throws Exception {
+ assertType("image/webp", "testWEBP.webp");
+ assertTypeByData("image/webp", "testWEBP.webp");
+ assertTypeByName("image/webp", "x.webp");
+ assertTypeByName("image/webp", "x.WEBP");
+ }
+
+ @Test
+ public void testBmpDetection() throws Exception {
+ assertType("image/x-ms-bmp", "testBMP.bmp");
+ assertTypeByData("image/x-ms-bmp", "testBMP.bmp");
+ assertTypeByName("image/x-ms-bmp", "x.bmp");
+ assertTypeByName("image/x-ms-bmp", "x.BMP");
+ assertTypeByName("image/x-ms-bmp", "x.dib");
+ assertTypeByName("image/x-ms-bmp", "x.DIB");
+ //false positive check -- contains part of BMP signature
+ assertType("text/plain", "testBMPfp.txt");
+ }
+
+ @Test
+ public void testPnmDetection() throws Exception {
+ assertType("image/x-portable-bitmap", "testPBM.pbm");
+ assertType("image/x-portable-graymap", "testPGM.pgm");
+ assertType("image/x-portable-pixmap", "testPPM.ppm");
+ assertTypeByData("image/x-portable-bitmap", "testPBM.pbm");
+ assertTypeByData("image/x-portable-graymap", "testPGM.pgm");
+ assertTypeByData("image/x-portable-pixmap", "testPPM.ppm");
+ assertTypeByName("image/x-portable-anymap", "x.pnm");
+ assertTypeByName("image/x-portable-anymap", "x.PNM");
+ assertTypeByName("image/x-portable-bitmap", "x.pbm");
+ assertTypeByName("image/x-portable-bitmap", "x.PBM");
+ assertTypeByName("image/x-portable-graymap", "x.pgm");
+ assertTypeByName("image/x-portable-graymap", "x.PGM");
+ assertTypeByName("image/x-portable-pixmap", "x.ppm");
+ assertTypeByName("image/x-portable-pixmap", "x.PPM");
+ }
+
+ @Test
+ public void testPictDetection() throws Exception {
+ assertType("image/x-pict", "testPICT.pct");
+ assertTypeByData("image/x-pict", "testPICT.pct");
+ assertTypeByName("image/x-pict", "x.pic");
+ assertTypeByName("image/x-pict", "x.PCT");
+ }
+
+ @Test
+ public void testCgmDetection() throws Exception {
+ // TODO: Need a test image file
+ assertTypeByName("image/cgm", "x.cgm");
+ assertTypeByName("image/cgm", "x.CGM");
+ }
+
+ @Test
+ public void testRdfXmlDetection() throws Exception {
+ assertTypeByName("application/rdf+xml", "x.rdf");
+ assertTypeByName("application/rdf+xml", "x.owl");
+ }
+
+ @Test
+ public void testSvgDetection() throws Exception {
+ assertType("image/svg+xml", "testSVG.svg");
+ assertTypeByData("image/svg+xml", "testSVG.svg");
+ assertTypeByName("image/svg+xml", "x.svg");
+ assertTypeByName("image/svg+xml", "x.SVG");
+
+ // Should *.svgz be svg or gzip
+ assertType("application/gzip", "testSVG.svgz");
+ assertTypeByData("application/gzip", "testSVG.svgz");
+ assertTypeByName("image/svg+xml", "x.svgz");
+ assertTypeByName("image/svg+xml", "x.SVGZ");
+ }
+
+ @Test
+ public void testPdfDetection() throws Exception {
+ // PDF extension by name is enough
+ assertTypeByName("application/pdf", "x.pdf");
+ assertTypeByName("application/pdf", "x.PDF");
+
+ // For normal PDFs, can get by name or data or both
+ assertType("application/pdf", "testPDF.pdf");
+ assertTypeByData("application/pdf", "testPDF.pdf");
+
+ // PDF with a BoM works both ways too
+ assertType("application/pdf", "testPDF_bom.pdf");
+ assertTypeByData("application/pdf", "testPDF_bom.pdf");
+ }
+
+ @Test
+ public void testSwfDetection() throws Exception {
+ assertTypeByName("application/x-shockwave-flash", "x.swf");
+ assertTypeByName("application/x-shockwave-flash", "x.SWF");
+ assertTypeByName("application/x-shockwave-flash", "test1.swf");
+ assertTypeByName("application/x-shockwave-flash", "test2.swf");
+ assertTypeByName("application/x-shockwave-flash", "test3.swf");
+ }
+
+ @Test
+ public void testDwgDetection() throws Exception {
+ assertTypeByName("image/vnd.dwg", "x.dwg");
+ assertTypeByData("image/vnd.dwg", "testDWG2004.dwg");
+ assertTypeByData("image/vnd.dwg", "testDWG2007.dwg");
+ assertTypeByData("image/vnd.dwg", "testDWG2010.dwg");
+ }
+
+ @Test
+ public void testprtDetection() throws Exception {
+ assertTypeByName("application/x-prt", "x.prt");
+ assertTypeByData("application/x-prt", "testCADKEY.prt");
+ }
+
+ /**
+ * Formats which are based on plain text
+ */
+ @Test
+ public void testTextBasedFormatsDetection() throws Exception {
+ assertTypeByName("text/plain", "testTXT.txt");
+ assertType( "text/plain", "testTXT.txt");
+
+ assertTypeByName("text/css", "testCSS.css");
+ assertType( "text/css", "testCSS.css");
+
+ assertTypeByName("text/csv", "testCSV.csv");
+ assertType( "text/csv", "testCSV.csv");
+
+ assertTypeByName("text/html", "testHTML.html");
+ assertType( "text/html", "testHTML.html");
+
+ assertTypeByName("application/javascript", "testJS.js");
+ assertType( "application/javascript", "testJS.js");
+ }
+
+ @Test
+ public void testJavaDetection() throws Exception {
+ // TODO Classloader doesn't seem to find the .class file in test-documents
+ //assertTypeDetection("AutoDetectParser.class", "application/java-vm");
+
+ // OSX Native Extension
+ assertTypeDetection("testJNILIB.jnilib", "application/x-java-jnilib");
+ }
+
+ @Test
+ public void testXmlAndHtmlDetection() throws Exception {
+ assertTypeByData("application/xml", "<?xml version=\"1.0\" encoding=\"UTF-8\"?><records><record/></records>"
+ .getBytes(UTF_8));
+ assertTypeByData("application/xml", "\uFEFF<?xml version=\"1.0\" encoding=\"UTF-16\"?><records><record/></records>"
+ .getBytes(UTF_16LE));
+ assertTypeByData("application/xml", "\uFEFF<?xml version=\"1.0\" encoding=\"UTF-16\"?><records><record/></records>"
+ .getBytes(UTF_16BE));
+ assertTypeByData("application/xml", "<!-- XML without processing instructions --><records><record/></records>"
+ .getBytes(UTF_8));
+ assertTypeByData("text/html", "<html><body>HTML</body></html>"
+ .getBytes(UTF_8));
+ assertTypeByData("text/html", "<!-- HTML comment --><html><body>HTML</body></html>"
+ .getBytes(UTF_8));
+ }
+
+ @Test
+ public void testWmfDetection() throws Exception {
+ assertTypeByName("application/x-msmetafile", "x.wmf");
+ assertTypeByData("application/x-msmetafile", "testWMF.wmf");
+ assertTypeByName("application/x-msmetafile", "x.WMF");
+
+ assertTypeByName("application/x-emf", "x.emf");
+ assertTypeByData("application/x-emf","testEMF.emf");
+ assertTypeByName("application/x-emf", "x.EMF");
+ // TODO: Need a test wmz file
+ assertTypeByName("application/x-ms-wmz", "x.wmz");
+ assertTypeByName("application/x-ms-wmz", "x.WMZ");
+ // TODO: Need a test emz file
+ assertTypeByName("application/gzip", "x.emz");
+ assertTypeByName("application/gzip", "x.EMZ");
+ }
+
+ @Test
+ public void testPsDetection() throws Exception {
+ // TODO: Need a test postscript file
+ assertTypeByName("application/postscript", "x.ps");
+ assertTypeByName("application/postscript", "x.PS");
+ assertTypeByName("application/postscript", "x.eps");
+ assertTypeByName("application/postscript", "x.epsf");
+ assertTypeByName("application/postscript", "x.epsi");
+ }
+
+ @Test
+ public void testMicrosoftMultiMediaDetection() throws Exception {
+ assertTypeByName("video/x-ms-asf", "x.asf");
+ assertTypeByName("video/x-ms-wmv", "x.wmv");
+ assertTypeByName("audio/x-ms-wma", "x.wma");
+
+ assertTypeByData("video/x-ms-asf", "testASF.asf");
+ assertTypeByData("video/x-ms-wmv", "testWMV.wmv");
+ assertTypeByData("audio/x-ms-wma", "testWMA.wma");
+ }
+
+ /**
+ * All 3 DITA types are in theory handled by the same mimetype,
+ * but we specialise them
+ */
+ @Test
+ public void testDITADetection() throws Exception {
+ assertTypeByName("application/dita+xml; format=topic", "test.dita");
+ assertTypeByName("application/dita+xml; format=map", "test.ditamap");
+ assertTypeByName("application/dita+xml; format=val", "test.ditaval");
+
+ assertTypeByData("application/dita+xml; format=task", "testDITA.dita");
+ assertTypeByData("application/dita+xml; format=concept", "testDITA2.dita");
+ assertTypeByData("application/dita+xml; format=map", "testDITA.ditamap");
+
+ assertTypeByNameAndData("application/dita+xml; format=task", "testDITA.dita");
+ assertTypeByNameAndData("application/dita+xml; format=concept", "testDITA2.dita");
+ assertTypeByNameAndData("application/dita+xml; format=map", "testDITA.ditamap");
+
+ // These are all children of the official type
+ assertEquals("application/dita+xml",
+ repo.getMediaTypeRegistry().getSupertype(getTypeByNameAndData("testDITA.ditamap")).toString());
+ assertEquals("application/dita+xml",
+ repo.getMediaTypeRegistry().getSupertype(getTypeByNameAndData("testDITA.dita")).toString());
+ // Concept inherits from topic
+ assertEquals("application/dita+xml; format=topic",
+ repo.getMediaTypeRegistry().getSupertype(getTypeByNameAndData("testDITA2.dita")).toString());
+ }
+
+ /**
+ * @since TIKA-194
+ */
+ @Test
+ public void testJavaRegex() throws Exception{
+ MimeType testType = new MimeType(MediaType.parse("foo/bar"));
+ this.repo.add(testType);
+ assertNotNull(repo.forName("foo/bar"));
+ String pattern = "rtg_sst_grb_0\\.5\\.\\d{8}";
+ this.repo.addPattern(testType, pattern, true);
+ String testFileName = "rtg_sst_grb_0.5.12345678";
+ assertEquals("foo/bar", tika.detect(testFileName));
+
+ MimeType testType2 = new MimeType(MediaType.parse("foo/bar2"));
+ this.repo.add(testType2);
+ assertNotNull(repo.forName("foo/bar2"));
+ this.repo.addPattern(testType2, pattern, false);
+ assertNotSame("foo/bar2", tika.detect(testFileName));
+ }
+
+ @Test
+ public void testRawDetection() throws Exception {
+ assertTypeByName("image/x-raw-adobe", "x.dng");
+ assertTypeByName("image/x-raw-adobe", "x.DNG");
+ assertTypeByName("image/x-raw-hasselblad", "x.3fr");
+ assertTypeByName("image/x-raw-fuji", "x.raf");
+ assertTypeByName("image/x-raw-canon", "x.crw");
+ assertTypeByName("image/x-raw-canon", "x.cr2");
+ assertTypeByName("image/x-raw-kodak", "x.k25");
+ assertTypeByName("image/x-raw-kodak", "x.kdc");
+ assertTypeByName("image/x-raw-kodak", "x.dcs");
+ assertTypeByName("image/x-raw-kodak", "x.drf");
+ assertTypeByName("image/x-raw-minolta", "x.mrw");
+ assertTypeByName("image/x-raw-nikon", "x.nef");
+ assertTypeByName("image/x-raw-nikon", "x.nrw");
+ assertTypeByName("image/x-raw-olympus", "x.orf");
+ assertTypeByName("image/x-raw-pentax", "x.ptx");
+ assertTypeByName("image/x-raw-pentax", "x.pef");
+ assertTypeByName("image/x-raw-sony", "x.arw");
+ assertTypeByName("image/x-raw-sony", "x.srf");
+ assertTypeByName("image/x-raw-sony", "x.sr2");
+ assertTypeByName("image/x-raw-sigma", "x.x3f");
+ assertTypeByName("image/x-raw-epson", "x.erf");
+ assertTypeByName("image/x-raw-mamiya", "x.mef");
+ assertTypeByName("image/x-raw-leaf", "x.mos");
+ assertTypeByName("image/x-raw-panasonic", "x.raw");
+ assertTypeByName("image/x-raw-panasonic", "x.rw2");
+ assertTypeByName("image/x-raw-phaseone", "x.iiq");
+ assertTypeByName("image/x-raw-red", "x.r3d");
+ assertTypeByName("image/x-raw-imacon", "x.fff");
+ assertTypeByName("image/x-raw-logitech", "x.pxn");
+ assertTypeByName("image/x-raw-casio", "x.bay");
+ assertTypeByName("image/x-raw-rawzor", "x.rwz");
+ }
+
+ /**
+ * Tests that we correctly detect the font types
+ */
+ @Test
+ public void testFontDetection() throws Exception {
+ assertTypeByName("application/x-font-adobe-metric", "x.afm");
+ assertTypeByData("application/x-font-adobe-metric", "testAFM.afm");
+
+ assertTypeByName("application/x-font-printer-metric", "x.pfm");
+ // TODO Get a sample .pfm file
+ assertTypeByData(
+ "application/x-font-printer-metric",
+ new byte[] {0x00, 0x01, 256-0xb1, 0x0a, 0x00, 0x00, 0x43, 0x6f,
+ 0x70, 0x79, 0x72, 0x69, 0x67, 0x68, 0x74, 0x20}
+ );
+
+ assertTypeByName("application/x-font-type1", "x.pfa");
+ // TODO Get a sample .pfa file
+ assertTypeByData(
+ "application/x-font-type1",
+ new byte[] {0x25, 0x21, 0x50, 0x53, 0x2d, 0x41, 0x64, 0x6f,
+ 0x62, 0x65, 0x46, 0x6f, 0x6e, 0x74, 0x2d, 0x31,
+ 0x2e, 0x30, 0x20, 0x20, 0x2d, 0x2a, 0x2d, 0x20}
+ );
+
+ assertTypeByName("application/x-font-type1", "x.pfb");
+ // TODO Get a sample .pfm file
+ assertTypeByData(
+ "application/x-font-type1",
+ new byte[] {-0x80, 0x01, 0x09, 0x05, 0x00, 0x00, 0x25, 0x21,
+ 0x50, 0x53, 0x2d, 0x41, 0x64, 0x6f, 0x62, 0x65,
+ 0x46, 0x6f, 0x6e, 0x74, 0x2d, 0x31, 0x2e, 0x30 }
+ );
+ }
+
+ /**
+ * Tests MimeTypes.getMimeType(URL), which examines both the byte header
+ * and, if necessary, the URL's extension.
+ */
+ @Test
+ public void testMimeDeterminationForTestDocuments() throws Exception {
+ assertType("text/html", "testHTML.html");
+ assertType("application/zip", "test-documents.zip");
+
+ assertType("text/html", "testHTML_utf8.html");
+ assertType(
+ "application/vnd.oasis.opendocument.text",
+ "testOpenOffice2.odt");
+ assertType("application/pdf", "testPDF.pdf");
+ assertType("application/rtf", "testRTF.rtf");
+ assertType("text/plain", "testTXT.txt");
+ assertType("application/xml", "testXML.xml");
+ assertType("audio/basic", "testAU.au");
+ assertType("audio/x-aiff", "testAIFF.aif");
+ assertType("audio/x-wav", "testWAV.wav");
+ assertType("audio/midi", "testMID.mid");
+ assertType("application/x-msaccess", "testACCESS.mdb");
+ assertType("application/x-font-ttf", "testTrueType3.ttf");
+ }
+
+ @Test
+ public void test7ZipDetection() throws Exception {
+ assertTypeByName("application/x-7z-compressed","test-documents.7z");
+ assertTypeByData("application/x-7z-compressed","test-documents.7z");
+ assertTypeByNameAndData("application/x-7z-compressed", "test-documents.7z");
+ }
+
+ @Test
+ public void testWebArchiveDetection() throws Exception {
+ assertTypeByName("application/x-webarchive","x.webarchive");
+ assertTypeByData("application/x-bplist","testWEBARCHIVE.webarchive");
+ assertTypeByNameAndData("application/x-webarchive", "testWEBARCHIVE.webarchive");
+ }
+
+ /**
+ * KML, and KMZ (zipped KML)
+ */
+ @Test
+ public void testKMLZDetection() throws Exception {
+ assertTypeByName("application/vnd.google-earth.kml+xml","testKML.kml");
+ assertTypeByData("application/vnd.google-earth.kml+xml","testKML.kml");
+ assertTypeByNameAndData("application/vnd.google-earth.kml+xml", "testKML.kml");
+
+ assertTypeByName("application/vnd.google-earth.kmz","testKMZ.kmz");
+ assertTypeByNameAndData("application/vnd.google-earth.kmz", "testKMZ.kmz");
+
+ // By data only, mimetype magic only gets us to a .zip
+ // We need to use the Zip Aware detector to get the full type
+ assertTypeByData("application/zip","testKMZ.kmz");
+ }
+
+ @Test
+ public void testCreativeSuite() throws IOException {
+ assertTypeDetection("testINDD.indd", "application/x-adobe-indesign");
+ assertTypeDetection("testPSD.psd", "image/vnd.adobe.photoshop");
+ }
+
+ @Test
+ public void testAMR() throws IOException {
+ // AMR matches on name, data or both
+ assertTypeDetection("testAMR.amr", "audio/amr");
+
+ // AMR-WB subtype shares extension, so needs data to identify
+ assertTypeDetection("testAMR-WB.amr", "audio/amr", "audio/amr-wb", "audio/amr-wb");
+
+ // Ditto for the AMR-WB+ subtype, which we don't have a sample file of yet
+ //assertTypeDetection("testAMR-WB+.amr", "audio/amr", "audio/amr-wb+", "audio/amr-wb+");
+ }
+
+ @Test
+ public void testEmail() throws IOException {
+ // EMLX
+ assertTypeDetection("testEMLX.emlx", "message/x-emlx");
+
+ // Groupwise
+ assertTypeDetection("testGroupWiseEml.eml", "message/rfc822");
+
+ // Lotus
+ assertTypeDetection("testLotusEml.eml", "message/rfc822");
+
+ // Thunderbird - doesn't currently work by name
+ assertTypeByNameAndData("message/rfc822", "testThunderbirdEml.eml");
+ }
+
+ @Test
+ public void testAxCrypt() throws Exception {
+ // test-TXT.txt encrypted with a key of "tika"
+ assertTypeDetection("testTXT-tika.axx", "application/x-axcrypt");
+ }
+
+ @Test
+ public void testWindowsEXE() throws Exception {
+ assertTypeByName("application/x-msdownload", "x.dll");
+ assertTypeByName("application/x-ms-installer", "x.msi");
+ assertTypeByName("application/x-dosexec", "x.exe");
+
+ assertTypeByData("application/x-msdownload; format=pe", "testTinyPE.exe");
+ assertTypeByNameAndData("application/x-msdownload; format=pe", "testTinyPE.exe");
+
+ // A jar file with part of a PE header, but not a full one
+ // should still be detected as a zip or jar (without/with name)
+ assertTypeByData("application/zip", "testJAR_with_PEHDR.jar");
+ assertTypeByNameAndData("application/java-archive", "testJAR_with_PEHDR.jar");
+ }
+
+ @Test
+ public void testMatroskaDetection() throws Exception {
+ assertType("video/x-matroska", "testMKV.mkv");
+ // TODO: Need custom detector data detection, see TIKA-1180
+ assertTypeByData("application/x-matroska", "testMKV.mkv");
+ assertTypeByNameAndData("video/x-matroska", "testMKV.mkv");
+ assertTypeByName("video/x-matroska", "x.mkv");
+ assertTypeByName("video/x-matroska", "x.MKV");
+ assertTypeByName("audio/x-matroska", "x.mka");
+ assertTypeByName("audio/x-matroska", "x.MKA");
+ }
+
+ @Test
+ public void testWebMDetection() throws Exception {
+ assertType("video/webm", "testWEBM.webm");
+ // TODO: Need custom detector data detection, see TIKA-1180
+ assertTypeByData("application/x-matroska", "testWEBM.webm");
+ assertTypeByNameAndData("video/webm", "testWEBM.webm");
+ assertTypeByName("video/webm", "x.webm");
+ assertTypeByName("video/webm", "x.WEBM");
+ }
+
+ /** Test getMimeType(byte[]) */
+ @Test
+ public void testGetMimeType_byteArray() throws IOException {
+ // Plain text detection
+ assertText(new byte[] { (byte) 0xFF, (byte) 0xFE });
+ assertText(new byte[] { (byte) 0xFF, (byte) 0xFE });
+ assertText(new byte[] { (byte) 0xEF, (byte) 0xBB, (byte) 0xBF });
+ assertText(new byte[] { 'a', 'b', 'c' });
+ assertText(new byte[] { '\t', '\r', '\n', 0x0C, 0x1B });
+ assertNotText(new byte[] { '\t', '\r', '\n', 0x0E, 0x1C });
+ }
+
+ @Test
+ public void testBerkeleyDB() throws IOException {
+ assertTypeByData(
+ "application/x-berkeley-db; format=btree; version=2",
+ "testBDB_btree_2.db");
+ assertTypeByData(
+ "application/x-berkeley-db; format=btree; version=3",
+ "testBDB_btree_3.db");
+ assertTypeByData(
+ "application/x-berkeley-db; format=btree; version=4",
+ "testBDB_btree_4.db");
+ // V4 and V5 share the same btree format
+ assertTypeByData(
+ "application/x-berkeley-db; format=btree; version=4",
+ "testBDB_btree_5.db");
+
+ assertTypeByData(
+ "application/x-berkeley-db; format=hash; version=2",
+ "testBDB_hash_2.db");
+ assertTypeByData(
+ "application/x-berkeley-db; format=hash; version=3",
+ "testBDB_hash_3.db");
+ assertTypeByData(
+ "application/x-berkeley-db; format=hash; version=4",
+ "testBDB_hash_4.db");
+ assertTypeByData(
+ "application/x-berkeley-db; format=hash; version=5",
+ "testBDB_hash_5.db");
+ }
+
+ /**
+ * CBOR typically contains HTML
+ */
+ @Test
+ public void testCBOR() throws IOException {
+ assertTypeByNameAndData("application/cbor", "NUTCH-1997.cbor");
+ assertTypeByData("application/cbor", "NUTCH-1997.cbor");
+ }
+
+ @Test
+ public void testZLIB() throws IOException {
+ // ZLIB encoded versions of testTXT.txt
+ assertTypeByData("application/zlib", "testTXT.zlib");
+ assertTypeByData("application/zlib", "testTXT.zlib0");
+ assertTypeByData("application/zlib", "testTXT.zlib5");
+ assertTypeByData("application/zlib", "testTXT.zlib9");
+ }
+
+ @Test
+ public void testTextFormats() throws Exception {
+ assertType("application/x-bibtex-text-file", "testBIBTEX.bib");
+ assertTypeByData("application/x-bibtex-text-file", "testBIBTEX.bib");
+ }
+
+ @Test
+ public void testCodeFormats() throws Exception {
+ assertType("text/x-csrc", "testC.c");
+ assertType("text/x-chdr", "testH.h");
+ assertTypeByData("text/x-csrc", "testC.c");
+ assertTypeByData("text/x-chdr", "testH.h");
+
+ assertTypeByName("text/x-java-source", "testJAVA.java");
+ assertType("text/x-java-properties", "testJAVAPROPS.properties");
+
+ assertType("text/x-matlab", "testMATLAB.m");
+ assertType("text/x-matlab", "testMATLAB_wtsgaus.m");
+ assertType("text/x-matlab", "testMATLAB_barcast.m");
+ assertTypeByData("text/x-matlab", "testMATLAB.m");
+ assertTypeByData("text/x-matlab", "testMATLAB_wtsgaus.m");
+ assertTypeByData("text/x-matlab", "testMATLAB_barcast.m");
+ }
+
+ @Test
+ public void testWebVTT() throws Exception {
+ assertType("text/vtt", "testWebVTT.vtt");
+ assertTypeByData("text/vtt", "testWebVTT.vtt");
+ }
+
+ private void assertText(byte[] prefix) throws IOException {
+ assertMagic("text/plain", prefix);
+ }
+
+ private void assertNotText(byte[] prefix) throws IOException {
+ assertMagic("application/octet-stream", prefix);
+ }
+
+ private void assertMagic(String expected, byte[] prefix) throws IOException {
+ MediaType type =
+ repo.detect(new ByteArrayInputStream(prefix), new Metadata());
+ assertNotNull(type);
+ assertEquals(expected, type.toString());
+ }
+
+ private void assertType(String expected, String filename) throws Exception {
+ try (InputStream stream = getTestDocumentAsStream(filename)) {
+ Metadata metadata = new Metadata();
+ metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
+ assertEquals(expected, repo.detect(stream, metadata).toString());
+ }
+ }
+
+ private void assertTypeByName(String expected, String filename)
+ throws IOException {
+ Metadata metadata = new Metadata();
+ metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
+ assertEquals(expected, repo.detect(null, metadata).toString());
+ }
+
+ private void assertTypeByData(String expected, String filename)
+ throws IOException {
+ try (InputStream stream = getTestDocumentAsStream(filename)) {
+ Metadata metadata = new Metadata();
+ assertEquals(expected, repo.detect(stream, metadata).toString());
+ }
+ }
+
+ private void assertTypeByData(String expected, byte[] data)
+ throws IOException {
+ try (InputStream stream = new ByteArrayInputStream(data)) {
+ Metadata metadata = new Metadata();
+ assertEquals(expected, repo.detect(stream, metadata).toString());
+ }
+ }
+
+ private void assertTypeDetection(String filename, String type)
+ throws IOException {
+ assertTypeDetection(filename, type, type, type);
+ }
+
+ private void assertTypeDetection(String filename, String byName, String byData,
+ String byNameAndData) throws IOException {
+ assertTypeByName(byName, filename);
+ assertTypeByData(byData, filename);
+ assertTypeByNameAndData(byNameAndData, filename);
+ }
+
+ private void assertTypeByNameAndData(String expected, String filename)
+ throws IOException {
+ assertEquals(expected, getTypeByNameAndData(filename).toString());
+ }
+
+ private MediaType getTypeByNameAndData(String filename) throws IOException {
+ try (InputStream stream = getTestDocumentAsStream(filename)) {
+ assertNotNull("Test document not found: " + filename, stream);
+ Metadata metadata = new Metadata();
+ metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
+ return repo.detect(stream, metadata);
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-app/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
----------------------------------------------------------------------
diff --git a/tika-app/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java b/tika-app/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
new file mode 100644
index 0000000..91b054e
--- /dev/null
+++ b/tika-app/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
@@ -0,0 +1,459 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipOutputStream;
+
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.XMPDM;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.sax.BodyContentHandler;
+import org.gagravarr.tika.FlacParser;
+import org.gagravarr.tika.OpusParser;
+import org.gagravarr.tika.VorbisParser;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+public class AutoDetectParserTest {
+ private TikaConfig tika = TikaConfig.getDefaultConfig();
+
+ // Easy to read constants for the MIME types:
+ private static final String RAW = "application/octet-stream";
+ private static final String EXCEL = "application/vnd.ms-excel";
+ private static final String HTML = "text/html; charset=ISO-8859-1";
+ private static final String PDF = "application/pdf";
+ private static final String POWERPOINT = "application/vnd.ms-powerpoint";
+ private static final String KEYNOTE = "application/vnd.apple.keynote";
+ private static final String PAGES = "application/vnd.apple.pages";
+ private static final String NUMBERS = "application/vnd.apple.numbers";
+ private static final String CHM = "application/vnd.ms-htmlhelp";
+ private static final String RTF = "application/rtf";
+ private static final String PLAINTEXT = "text/plain; charset=ISO-8859-1";
+ private static final String UTF8TEXT = "text/plain; charset=UTF-8";
+ private static final String WORD = "application/msword";
+ private static final String XML = "application/xml";
+ private static final String RSS = "application/rss+xml";
+ private static final String BMP = "image/x-ms-bmp";
+ private static final String GIF = "image/gif";
+ private static final String JPEG = "image/jpeg";
+ private static final String PNG = "image/png";
+ private static final String OGG_VORBIS = "audio/vorbis";
+ private static final String OGG_OPUS = "audio/opus";
+ private static final String OGG_FLAC = "audio/x-oggflac";
+ private static final String FLAC_NATIVE= "audio/x-flac";
+ private static final String OPENOFFICE
+ = "application/vnd.oasis.opendocument.text";
+
+
+ /**
+ * This is where a single test is done.
+ * @param tp the parameters encapsulated in a TestParams instance
+ * @throws IOException
+ */
+ private void assertAutoDetect(TestParams tp) throws Exception {
+ try (InputStream input = AutoDetectParserTest.class.getResourceAsStream(tp.resourceRealName)) {
+ if (input == null) {
+ fail("Could not open stream from specified resource: "
+ + tp.resourceRealName);
+ }
+ Metadata metadata = new Metadata();
+ metadata.set(Metadata.RESOURCE_NAME_KEY, tp.resourceStatedName);
+ metadata.set(Metadata.CONTENT_TYPE, tp.statedType);
+ ContentHandler handler = new BodyContentHandler();
+ new AutoDetectParser(tika).parse(input, handler, metadata);
+
+ assertEquals("Bad content type: " + tp,
+ tp.realType, metadata.get(Metadata.CONTENT_TYPE));
+
+ if (tp.expectedContentFragment != null) {
+ assertTrue("Expected content not found: " + tp,
+ handler.toString().contains(tp.expectedContentFragment));
+ }
+ }
+ }
+
+ /**
+ * Convenience method -- its sole purpose of existence is to make the
+ * call to it more readable than it would be if a TestParams instance
+ * would need to be instantiated there.
+ *
+ * @param resourceRealName real name of resource
+ * @param resourceStatedName stated name -- will a bad name fool us?
+ * @param realType - the real MIME type
+ * @param statedType - stated MIME type - will a wrong one fool us?
+ * @param expectedContentFragment - something expected in the text
+ * @throws Exception
+ */
+ private void assertAutoDetect(String resourceRealName,
+ String resourceStatedName,
+ String realType,
+ String statedType,
+ String expectedContentFragment)
+ throws Exception {
+
+ assertAutoDetect(new TestParams(resourceRealName, resourceStatedName,
+ realType, statedType, expectedContentFragment));
+ }
+
+ private void assertAutoDetect(
+ String resource, String type, String content) throws Exception {
+
+ resource = "/test-documents/" + resource;
+
+ // TODO !!!! The disabled tests below should work!
+ // The correct MIME type should be determined regardless of the
+ // stated type (ContentType hint) and the stated URL name.
+
+
+ // Try different combinations of correct and incorrect arguments:
+ final String wrongMimeType = RAW;
+ assertAutoDetect(resource, resource, type, type, content);
+ assertAutoDetect(resource, resource, type, null, content);
+ assertAutoDetect(resource, resource, type, wrongMimeType, content);
+
+ assertAutoDetect(resource, null, type, type, content);
+ assertAutoDetect(resource, null, type, null, content);
+ assertAutoDetect(resource, null, type, wrongMimeType, content);
+
+ final String badResource = "a.xyz";
+ assertAutoDetect(resource, badResource, type, type, content);
+ assertAutoDetect(resource, badResource, type, null, content);
+ assertAutoDetect(resource, badResource, type, wrongMimeType, content);
+ }
+
+ @Test
+ public void testKeynote() throws Exception {
+ assertAutoDetect("testKeynote.key", KEYNOTE, "A sample presentation");
+ }
+
+ @Test
+ public void testPages() throws Exception {
+ assertAutoDetect("testPages.pages", PAGES, "Sample pages document");
+ }
+
+ @Test
+ public void testNumbers() throws Exception {
+ assertAutoDetect("testNumbers.numbers", NUMBERS, "Checking Account: 300545668");
+ }
+
+ @Test
+ public void testChm() throws Exception {
+ assertAutoDetect("testChm.chm", CHM, "If you do not specify a window type or a window name, the main window is used.");
+ }
+
+ @Test
+ public void testEpub() throws Exception {
+ assertAutoDetect(
+ "testEPUB.epub", "application/epub+zip",
+ "The previous headings were subchapters");
+ }
+
+ @Test
+ public void testExcel() throws Exception {
+ assertAutoDetect("testEXCEL.xls", EXCEL, "Sample Excel Worksheet");
+ }
+
+ @Test
+ public void testHTML() throws Exception {
+ assertAutoDetect("testHTML.html", HTML, "Test Indexation Html");
+ }
+
+ @Test
+ public void testOpenOffice() throws Exception {
+ assertAutoDetect("testOpenOffice2.odt", OPENOFFICE,
+ "This is a sample Open Office document");
+ }
+
+ @Test
+ public void testPDF() throws Exception {
+ assertAutoDetect("testPDF.pdf", PDF, "Content Analysis Toolkit");
+
+ }
+
+ @Test
+ public void testPowerpoint() throws Exception {
+ assertAutoDetect("testPPT.ppt", POWERPOINT, "Sample Powerpoint Slide");
+ }
+
+ @Test
+ public void testRdfXml() throws Exception {
+ assertAutoDetect("testRDF.rdf", "application/rdf+xml", "");
+ }
+
+ @Test
+ public void testRTF() throws Exception {
+ assertAutoDetect("testRTF.rtf", RTF, "indexation Word");
+ }
+
+ @Test
+ public void testText() throws Exception {
+ assertAutoDetect("testTXT.txt", PLAINTEXT, "indexation de Txt");
+ }
+
+ @Test
+ public void testTextNonASCIIUTF8() throws Exception {
+ assertAutoDetect("testTXTNonASCIIUTF8.txt", UTF8TEXT, "The quick brown fox jumps over the lazy dog");
+ }
+
+ @Test
+ public void testWord() throws Exception {
+ assertAutoDetect("testWORD.doc", WORD, "Sample Word Document");
+ }
+
+ @Test
+ public void testXML() throws Exception {
+ assertAutoDetect("testXML.xml", XML, "Lius");
+ }
+
+ @Test
+ public void testRss() throws Exception {
+ assertAutoDetect("/test-documents/rsstest.rss", "feed", RSS, "application/rss+xml", "Sample RSS File for Junit test");
+ }
+
+ @Test
+ public void testImages() throws Exception {
+ assertAutoDetect("testBMP.bmp", BMP, null);
+ assertAutoDetect("testGIF.gif", GIF, null);
+ assertAutoDetect("testJPEG.jpg", JPEG, null);
+ assertAutoDetect("testPNG.png", PNG, null);
+ }
+
+ /**
+ * Make sure that zip bomb attacks are prevented.
+ *
+ * @see <a href="https://issues.apache.org/jira/browse/TIKA-216">TIKA-216</a>
+ */
+ @Test
+ public void testZipBombPrevention() throws Exception {
+ try (InputStream tgz = AutoDetectParserTest.class.getResourceAsStream(
+ "/test-documents/TIKA-216.tgz")) {
+ Metadata metadata = new Metadata();
+ ContentHandler handler = new BodyContentHandler(-1);
+ new AutoDetectParser(tika).parse(tgz, handler, metadata);
+ fail("Zip bomb was not detected");
+ } catch (TikaException e) {
+ // expected
+ }
+ }
+
+ /**
+ * Make sure XML parse errors don't trigger ZIP bomb detection.
+ *
+ * @see <a href="https://issues.apache.org/jira/browse/TIKA-1322">TIKA-1322</a>
+ */
+ @Test
+ public void testNoBombDetectedForInvalidXml() throws Exception {
+ // create zip with ten empty / invalid XML files, 1.xml .. 10.xml
+ ByteArrayOutputStream baos = new ByteArrayOutputStream();
+ ZipOutputStream zos = new ZipOutputStream(baos);
+ for (int i = 1; i <= 10; i++) {
+ zos.putNextEntry(new ZipEntry(i + ".xml"));
+ zos.closeEntry();
+ }
+ zos.finish();
+ zos.close();
+ new AutoDetectParser(tika).parse(new ByteArrayInputStream(baos.toByteArray()), new BodyContentHandler(-1),
+ new Metadata());
+ }
+
+ /**
+ * Test to ensure that the Ogg Audio parsers (Vorbis, Opus, Flac etc)
+ * have been correctly included, and are available
+ */
+ @SuppressWarnings("deprecation")
+ @Test
+ public void testOggFlacAudio() throws Exception {
+ // The three test files should all have similar test data
+ String[] testFiles = new String[] {
+ "testVORBIS.ogg", "testFLAC.flac", "testFLAC.oga",
+ "testOPUS.opus"
+ };
+ MediaType[] mediaTypes = new MediaType[] {
+ MediaType.parse(OGG_VORBIS), MediaType.parse(FLAC_NATIVE),
+ MediaType.parse(OGG_FLAC), MediaType.parse(OGG_OPUS)
+ };
+
+ // Check we can load the parsers, and they claim to do the right things
+ VorbisParser vParser = new VorbisParser();
+ assertNotNull("Parser not found for " + mediaTypes[0],
+ vParser.getSupportedTypes(new ParseContext()));
+
+ FlacParser fParser = new FlacParser();
+ assertNotNull("Parser not found for " + mediaTypes[1],
+ fParser.getSupportedTypes(new ParseContext()));
+ assertNotNull("Parser not found for " + mediaTypes[2],
+ fParser.getSupportedTypes(new ParseContext()));
+
+ OpusParser oParser = new OpusParser();
+ assertNotNull("Parser not found for " + mediaTypes[3],
+ oParser.getSupportedTypes(new ParseContext()));
+
+ // Check we found the parser
+ CompositeParser parser = (CompositeParser)tika.getParser();
+ for (MediaType mt : mediaTypes) {
+ assertNotNull("Parser not found for " + mt, parser.getParsers().get(mt) );
+ }
+
+ // Have each file parsed, and check
+ for (int i=0; i<testFiles.length; i++) {
+ String file = testFiles[i];
+ try (InputStream input = AutoDetectParserTest.class.getResourceAsStream(
+ "/test-documents/" + file)) {
+ if (input == null) {
+ fail("Could not find test file " + file);
+ }
+ Metadata metadata = new Metadata();
+ ContentHandler handler = new BodyContentHandler();
+ new AutoDetectParser(tika).parse(input, handler, metadata);
+
+ assertEquals("Incorrect content type for " + file,
+ mediaTypes[i].toString(), metadata.get(Metadata.CONTENT_TYPE));
+
+ // Check some of the common metadata
+ // Old style metadata
+ assertEquals("Test Artist", metadata.get(Metadata.AUTHOR));
+ assertEquals("Test Title", metadata.get(Metadata.TITLE));
+ // New style metadata
+ assertEquals("Test Artist", metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals("Test Title", metadata.get(TikaCoreProperties.TITLE));
+
+ // Check some of the XMPDM metadata
+ if (!file.endsWith(".opus")) {
+ assertEquals("Test Album", metadata.get(XMPDM.ALBUM));
+ }
+ assertEquals("Test Artist", metadata.get(XMPDM.ARTIST));
+ assertEquals("Stereo", metadata.get(XMPDM.AUDIO_CHANNEL_TYPE));
+ assertEquals("44100", metadata.get(XMPDM.AUDIO_SAMPLE_RATE));
+
+ // Check some of the text
+ String content = handler.toString();
+ assertTrue(content.contains("Test Title"));
+ assertTrue(content.contains("Test Artist"));
+ }
+ }
+ }
+
+ /**
+ * Test case for TIKA-514. Provide constructor for AutoDetectParser that has explicit
+ * list of supported parsers.
+ * @see <a href="https://issues.apache.org/jira/browse/TIKA-514">TIKA-514</a>
+ */
+ @Test
+ public void testSpecificParserList() throws Exception {
+ AutoDetectParser parser = new AutoDetectParser(new MyDetector(), new MyParser());
+
+ InputStream is = new ByteArrayInputStream("test".getBytes(UTF_8));
+ Metadata metadata = new Metadata();
+ parser.parse(is, new BodyContentHandler(), metadata, new ParseContext());
+
+ assertEquals("value", metadata.get("MyParser"));
+ }
+
+ private static final MediaType MY_MEDIA_TYPE = new MediaType("application", "x-myparser");
+
+ /**
+ * A test detector which always returns the type supported
+ * by the test parser
+ */
+ @SuppressWarnings("serial")
+ private static class MyDetector implements Detector {
+ public MediaType detect(InputStream input, Metadata metadata) throws IOException {
+ return MY_MEDIA_TYPE;
+ }
+ }
+
+ @SuppressWarnings("serial")
+ private static class MyParser extends AbstractParser {
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ Set<MediaType> supportedTypes = new HashSet<MediaType>();
+ supportedTypes.add(MY_MEDIA_TYPE);
+ return supportedTypes;
+ }
+
+ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) {
+ metadata.add("MyParser", "value");
+ }
+
+ }
+
+ /**
+ * Minimal class to encapsulate all parameters -- the main reason for
+ * its existence is to aid in debugging via its toString() method.
+ *
+ * Getters and setters intentionally not provided.
+ */
+ private static class TestParams {
+
+ public String resourceRealName;
+ public String resourceStatedName;
+ public String realType;
+ public String statedType;
+ public String expectedContentFragment;
+
+
+ private TestParams(String resourceRealName,
+ String resourceStatedName,
+ String realType,
+ String statedType,
+ String expectedContentFragment) {
+ this.resourceRealName = resourceRealName;
+ this.resourceStatedName = resourceStatedName;
+ this.realType = realType;
+ this.statedType = statedType;
+ this.expectedContentFragment = expectedContentFragment;
+ }
+
+
+ /**
+ * Produces a string like the following:
+ *
+ * <pre>
+ * Test parameters:
+ * resourceRealName = /test-documents/testEXCEL.xls
+ * resourceStatedName = null
+ * realType = application/vnd.ms-excel
+ * statedType = null
+ * expectedContentFragment = Sample Excel Worksheet
+ * </pre>
+ */
+ public String toString() {
+ return "Test parameters:\n"
+ + " resourceRealName = " + resourceRealName + "\n"
+ + " resourceStatedName = " + resourceStatedName + "\n"
+ + " realType = " + realType + "\n"
+ + " statedType = " + statedType + "\n"
+ + " expectedContentFragment = " + expectedContentFragment + "\n";
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-app/src/test/java/org/apache/tika/parser/DigestingParserTest.java
----------------------------------------------------------------------
diff --git a/tika-app/src/test/java/org/apache/tika/parser/DigestingParserTest.java b/tika-app/src/test/java/org/apache/tika/parser/DigestingParserTest.java
new file mode 100644
index 0000000..66323d3
--- /dev/null
+++ b/tika-app/src/test/java/org/apache/tika/parser/DigestingParserTest.java
@@ -0,0 +1,139 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
+
+import java.io.InputStream;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.DigestingParser;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.digesting.CommonsDigester;
+import org.junit.Test;
+
+
+public class DigestingParserTest extends TikaTest {
+
+ private final static String P = TikaCoreProperties.TIKA_META_PREFIX+
+ "digest"+Metadata.NAMESPACE_PREFIX_DELIMITER;
+
+ private final int UNLIMITED = 1000000;//well, not really, but longer than input file
+ private final Parser p = new AutoDetectParser();
+
+ @Test
+ public void testBasic() throws Exception {
+ Map<CommonsDigester.DigestAlgorithm, String> expected =
+ new HashMap<CommonsDigester.DigestAlgorithm, String>();
+
+ expected.put(CommonsDigester.DigestAlgorithm.MD2,"d768c8e27b0b52c6eaabfaa7122d1d4f");
+ expected.put(CommonsDigester.DigestAlgorithm.MD5,"59f626e09a8c16ab6dbc2800c685f772");
+ expected.put(CommonsDigester.DigestAlgorithm.SHA1,"7a1f001d163ac90d8ea54c050faf5a38079788a6");
+ expected.put(CommonsDigester.DigestAlgorithm.SHA256,"c4b7fab030a8b6a9d6691f6699ac8e6f" +
+ "82bc53764a0f1430d134ae3b70c32654");
+ expected.put(CommonsDigester.DigestAlgorithm.SHA384,"ebe368b9326fef44408290724d187553"+
+ "8b8a6923fdf251ddab72c6e4b5d54160" +
+ "9db917ba4260d1767995a844d8d654df");
+ expected.put(CommonsDigester.DigestAlgorithm.SHA512,"ee46d973ee1852c018580c242955974d"+
+ "da4c21f36b54d7acd06fcf68e974663b"+
+ "fed1d256875be58d22beacf178154cc3"+
+ "a1178cb73443deaa53aa0840324708bb");
+
+ //test each one
+ for (CommonsDigester.DigestAlgorithm algo : CommonsDigester.DigestAlgorithm.values()) {
+ Metadata m = new Metadata();
+ XMLResult xml = getXML("test_recursive_embedded.docx",
+ new DigestingParser(p, new CommonsDigester(UNLIMITED, algo)), m);
+ assertEquals(algo.toString(), expected.get(algo), m.get(P + algo.toString()));
+ }
+
+
+ //test comma separated
+ CommonsDigester.DigestAlgorithm[] algos = CommonsDigester.parse("md5,sha256,sha384,sha512");
+ Metadata m = new Metadata();
+ XMLResult xml = getXML("test_recursive_embedded.docx",
+ new DigestingParser(p, new CommonsDigester(UNLIMITED, algos)), m);
+ for (CommonsDigester.DigestAlgorithm algo : new CommonsDigester.DigestAlgorithm[]{
+ CommonsDigester.DigestAlgorithm.MD5,
+ CommonsDigester.DigestAlgorithm.SHA256,
+ CommonsDigester.DigestAlgorithm.SHA384,
+ CommonsDigester.DigestAlgorithm.SHA512}) {
+ assertEquals(algo.toString(), expected.get(algo), m.get(P + algo.toString()));
+ }
+
+ assertNull(m.get(P+CommonsDigester.DigestAlgorithm.MD2.toString()));
+ assertNull(m.get(P+CommonsDigester.DigestAlgorithm.SHA1.toString()));
+
+ }
+
+ @Test
+ public void testLimitedRead() throws Exception {
+ CommonsDigester.DigestAlgorithm algo = CommonsDigester.DigestAlgorithm.MD5;
+ int limit = 100;
+ byte[] bytes = new byte[limit];
+ InputStream is = getResourceAsStream("/test-documents/test_recursive_embedded.docx");
+ is.read(bytes, 0, limit);
+ is.close();
+ Metadata m = new Metadata();
+ try {
+ XMLResult xml = getXML(TikaInputStream.get(bytes),
+ new DigestingParser(p, new CommonsDigester(100, algo)), m);
+ } catch (TikaException e) {
+ //thrown because this is just a file fragment
+ assertContains("Unexpected RuntimeException from org.apache.tika.parser.microsoft.ooxml.OOXMLParser",
+ e.getMessage());
+ }
+ String expectedMD5 = m.get(P+"MD5");
+
+ m = new Metadata();
+ XMLResult xml = getXML("test_recursive_embedded.docx",
+ new DigestingParser(p, new CommonsDigester(100, algo)), m);
+ assertEquals(expectedMD5, m.get(P+"MD5"));
+ }
+
+ @Test
+ public void testReset() throws Exception {
+ String expectedMD5 = "1643c2cef21e36720c54f4f6cb3349d0";
+ Metadata m = new Metadata();
+ XMLResult xml = getXML("test_recursive_embedded.docx",
+ new DigestingParser(p, new CommonsDigester(100, CommonsDigester.DigestAlgorithm.MD5)), m);
+ assertEquals(expectedMD5, m.get(P+"MD5"));
+ }
+
+ @Test
+ public void testNegativeMaxMarkLength() throws Exception {
+ Metadata m = new Metadata();
+ boolean ex = false;
+ try {
+ XMLResult xml = getXML("test_recursive_embedded.docx",
+ new DigestingParser(p, new CommonsDigester(-1, CommonsDigester.DigestAlgorithm.MD5)), m);
+ } catch (IllegalArgumentException e) {
+ ex = true;
+ }
+ assertTrue("Exception not thrown", ex);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-app/src/test/java/org/apache/tika/parser/ParsingReaderTest.java
----------------------------------------------------------------------
diff --git a/tika-app/src/test/java/org/apache/tika/parser/ParsingReaderTest.java b/tika-app/src/test/java/org/apache/tika/parser/ParsingReaderTest.java
new file mode 100644
index 0000000..71c07b7
--- /dev/null
+++ b/tika-app/src/test/java/org/apache/tika/parser/ParsingReaderTest.java
@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.Assert.assertEquals;
+
+import java.io.ByteArrayInputStream;
+import java.io.InputStream;
+import java.io.Reader;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.junit.Test;
+
+public class ParsingReaderTest {
+
+ @Test
+ public void testPlainText() throws Exception {
+ String data = "test content";
+ InputStream stream = new ByteArrayInputStream(data.getBytes(UTF_8));
+ Reader reader = new ParsingReader(stream, "test.txt");
+ assertEquals('t', reader.read());
+ assertEquals('e', reader.read());
+ assertEquals('s', reader.read());
+ assertEquals('t', reader.read());
+ assertEquals(' ', reader.read());
+ assertEquals('c', reader.read());
+ assertEquals('o', reader.read());
+ assertEquals('n', reader.read());
+ assertEquals('t', reader.read());
+ assertEquals('e', reader.read());
+ assertEquals('n', reader.read());
+ assertEquals('t', reader.read());
+ assertEquals('\n', reader.read());
+ assertEquals(-1, reader.read());
+ reader.close();
+ assertEquals(-1, stream.read());
+ }
+
+ @Test
+ public void testXML() throws Exception {
+ String data = "<p>test <span>content</span></p>";
+ InputStream stream = new ByteArrayInputStream(data.getBytes(UTF_8));
+ Reader reader = new ParsingReader(stream, "test.xml");
+ assertEquals(' ', (char) reader.read());
+ assertEquals('t', (char) reader.read());
+ assertEquals('e', (char) reader.read());
+ assertEquals('s', (char) reader.read());
+ assertEquals('t', (char) reader.read());
+ assertEquals(' ', (char) reader.read());
+ assertEquals(' ', (char) reader.read());
+ assertEquals('c', (char) reader.read());
+ assertEquals('o', (char) reader.read());
+ assertEquals('n', (char) reader.read());
+ assertEquals('t', (char) reader.read());
+ assertEquals('e', (char) reader.read());
+ assertEquals('n', (char) reader.read());
+ assertEquals('t', (char) reader.read());
+ assertEquals('\n', (char) reader.read());
+ assertEquals(-1, reader.read());
+ reader.close();
+ assertEquals(-1, stream.read());
+ }
+
+ /**
+ * Test case for TIKA-203
+ *
+ * @see <a href="https://issues.apache.org/jira/browse/TIKA-203">TIKA-203</a>
+ */
+ @Test
+ public void testMetadata() throws Exception {
+ Metadata metadata = new Metadata();
+ InputStream stream = ParsingReaderTest.class.getResourceAsStream(
+ "/test-documents/testEXCEL.xls");
+ try (Reader reader = new ParsingReader(
+ new AutoDetectParser(), stream, metadata, new ParseContext())) {
+ // Metadata should already be available
+ assertEquals("Simple Excel document", metadata.get(TikaCoreProperties.TITLE));
+ // Check that the internal buffering isn't broken
+ assertEquals('F', (char) reader.read());
+ assertEquals('e', (char) reader.read());
+ assertEquals('u', (char) reader.read());
+ assertEquals('i', (char) reader.read());
+ assertEquals('l', (char) reader.read());
+ assertEquals('1', (char) reader.read());
+ }
+ }
+
+}
[09/13] tika git commit: TIKA-1855 -- first pass. Need to turn back
on the forbidden-apis testCheck. More clean up remains.
Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-core/src/test/resources/org/apache/tika/mime/brwNIMS_2014.dif
----------------------------------------------------------------------
diff --git a/tika-core/src/test/resources/org/apache/tika/mime/brwNIMS_2014.dif b/tika-core/src/test/resources/org/apache/tika/mime/brwNIMS_2014.dif
deleted file mode 100644
index e131add..0000000
--- a/tika-core/src/test/resources/org/apache/tika/mime/brwNIMS_2014.dif
+++ /dev/null
@@ -1,56 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
- <DIF xmlns="http://gcmd.gsfc.nasa.gov/Aboutus/xml/dif/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://gcmd.gsfc.nasa.gov/Aboutus/xml/dif/ http://gcmd.gsfc.nasa.gov/Aboutus/xml/dif/dif_v9.8.4.xsd">
- <Entry_ID>02a6301c-3ab3-11e4-8ee7-00c0f03d5b7c</Entry_ID>
- <Entry_Title>Barrow Logger Data NIMS 2014</Entry_Title>
-
- <Parameters>
- <Category>EARTH SCIENCE</Category>
- <Topic>BIOSPHERE</Topic>
- <Term>ECOLOGICAL DYNAMICS</Term>
- </Parameters>
-
-
- <Spatial_Coverage>
- <Southernmost_Latitude>70</Southernmost_Latitude>
- <Northernmost_Latitude>72</Northernmost_Latitude>
- <Westernmost_Longitude>-162</Westernmost_Longitude>
- <Easternmost_Longitude>-150</Easternmost_Longitude>
- </Spatial_Coverage>
-
- <Data_Center>
- <Data_Center_Name>
- <Short_Name>ACADIS</Short_Name>
- <Long_Name>Advanced Cooperative Arctic Data and Information Service</Long_Name>
- </Data_Center_Name>
- <Data_Center_URL>http://www.aoncadis.org/</Data_Center_URL>
- <Personnel>
- <Role>DATA CENTER CONTACT</Role>
- <First_Name>ACADIS</First_Name>
- <Last_Name>User Services</Last_Name>
- <Contact_Address>
- <Address>NCAR/CISL</Address>
- <Address>P.O. Box 3000</Address>
- <City>Boulder</City>
- <Province_or_State>CO</Province_or_State>
- <Postal_Code>80307</Postal_Code>
- <Country>USA</Country>
- </Contact_Address>
- </Personnel>
- </Data_Center>
-
- <Summary>
- <Abstract>Logger records from the Networked Info-mechanical Systems (NIMS), Transect length: ~50m The data was recorded using a CR3000 logger. The sensor trolley was equipped with instruments for recording the distance to vegetation canopy (SR50a Sonic Distance, Campbell Scientific), up- and downwelling short- and longwave radiation (CNR4 net radiometer, Kipp & Zonen), air temperature and surface temperature (SI-111 IR radiometer, Apogee Instruments Inc.) and spectral reflection (Jaz Combo-2, Ocean Optics; GreenSeeker RT100 (505), NTech).</Abstract>
- </Summary>
-
- <Related_URL>
- <URL_Content_Type>
- <Type>GET DATA</Type>
- </URL_Content_Type>
- <URL>http://www.aoncadis.org/dataset/id/02a6301c-3ab3-11e4-8ee7-00c0f03d5b7c.html</URL>
- <Description>Data Center top-level access page for this resource</Description>
- </Related_URL>
-
- <Metadata_Name>ACADIS IDN DIF</Metadata_Name>
- <Metadata_Version>9.8.4</Metadata_Version>
- <Last_DIF_Revision_Date>2015-02-05</Last_DIF_Revision_Date>
- </DIF>
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-core/src/test/resources/org/apache/tika/mime/circles-with-prefix.svg
----------------------------------------------------------------------
diff --git a/tika-core/src/test/resources/org/apache/tika/mime/circles-with-prefix.svg b/tika-core/src/test/resources/org/apache/tika/mime/circles-with-prefix.svg
deleted file mode 100644
index d68ff55..0000000
--- a/tika-core/src/test/resources/org/apache/tika/mime/circles-with-prefix.svg
+++ /dev/null
@@ -1,8 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<svg:svg xmlns:svg="http://www.w3.org/2000/svg" width="12cm" height="12cm">
- <svg:g style="fill-opacity:0.7; stroke:black; stroke-width:0.1cm;">
- <svg:circle cx="6cm" cy="2cm" r="100" style="fill:red;" transform="translate(0,50)" />
- <svg:circle cx="6cm" cy="2cm" r="100" style="fill:blue;" transform="translate(70,150)" />
- <svg:circle cx="6cm" cy="2cm" r="100" style="fill:green;" transform="translate(-70,150)"/>
- </svg:g>
-</svg:svg>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-core/src/test/resources/org/apache/tika/mime/circles.svg
----------------------------------------------------------------------
diff --git a/tika-core/src/test/resources/org/apache/tika/mime/circles.svg b/tika-core/src/test/resources/org/apache/tika/mime/circles.svg
deleted file mode 100644
index 8b71e82..0000000
--- a/tika-core/src/test/resources/org/apache/tika/mime/circles.svg
+++ /dev/null
@@ -1,8 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<svg xmlns="http://www.w3.org/2000/svg" width="12cm" height="12cm">
- <g style="fill-opacity:0.7; stroke:black; stroke-width:0.1cm;">
- <circle cx="6cm" cy="2cm" r="100" style="fill:red;" transform="translate(0,50)" />
- <circle cx="6cm" cy="2cm" r="100" style="fill:blue;" transform="translate(70,150)" />
- <circle cx="6cm" cy="2cm" r="100" style="fill:green;" transform="translate(-70,150)"/>
- </g>
-</svg>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-core/src/test/resources/org/apache/tika/mime/datamatrix.png
----------------------------------------------------------------------
diff --git a/tika-core/src/test/resources/org/apache/tika/mime/datamatrix.png b/tika-core/src/test/resources/org/apache/tika/mime/datamatrix.png
deleted file mode 100644
index 4aa5003..0000000
Binary files a/tika-core/src/test/resources/org/apache/tika/mime/datamatrix.png and /dev/null differ
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-core/src/test/resources/org/apache/tika/mime/gdas1.forecmwf.2014062612.grib2
----------------------------------------------------------------------
diff --git a/tika-core/src/test/resources/org/apache/tika/mime/gdas1.forecmwf.2014062612.grib2 b/tika-core/src/test/resources/org/apache/tika/mime/gdas1.forecmwf.2014062612.grib2
deleted file mode 100644
index 7ab3416..0000000
Binary files a/tika-core/src/test/resources/org/apache/tika/mime/gdas1.forecmwf.2014062612.grib2 and /dev/null differ
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-core/src/test/resources/org/apache/tika/mime/htmlfragment
----------------------------------------------------------------------
diff --git a/tika-core/src/test/resources/org/apache/tika/mime/htmlfragment b/tika-core/src/test/resources/org/apache/tika/mime/htmlfragment
deleted file mode 100644
index bf36d08..0000000
--- a/tika-core/src/test/resources/org/apache/tika/mime/htmlfragment
+++ /dev/null
@@ -1,18 +0,0 @@
-<div id="leftcol">
- <ul>
- <li><a href="/mission/sec/sec.html"> Security and Information Sciences Home ›</a> </li>
- <li><a href="/mission/sec/publications/-publications.html">Publications ›</a> </li>
- <li><a href="/mission/sec/corpora/corpora.html">Corpora ›</a> </li>
- <li><a href="/mission/sec/softwaretools/tools.html">Software Tools ›</a></li>
- <li><a href="/mission/sec/CSO/CSO.html"> Systems and Operations ›</a>
- <ul>
- <li><a href="/mission/sec/publications/-publications.html">Publications ›</a></li>
- <li><a href="/mission/sec/CSO/biographies/CSObios.html">Biographies ›</a></li>
- </ul>
- </li>
- <li><a href="/mission/sec/CST/CST.html"> Systems and Technology ›</a> </li>
- <li><a href="/mission/sec/CSA/CSA.html"> System Assessments ›</a> </li>
- <li><a href="/mission/sec/HLT/HLT.html">Human Language Technology ›</a>
-<li><a href="/mission/sec/computing/computing.html">Computing and Analytics ›</a></li>
- </ul>
-</div>
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-core/src/test/resources/org/apache/tika/mime/plotutils-bin-cgm-v3.cgm
----------------------------------------------------------------------
diff --git a/tika-core/src/test/resources/org/apache/tika/mime/plotutils-bin-cgm-v3.cgm b/tika-core/src/test/resources/org/apache/tika/mime/plotutils-bin-cgm-v3.cgm
deleted file mode 100644
index 450f5ad..0000000
Binary files a/tika-core/src/test/resources/org/apache/tika/mime/plotutils-bin-cgm-v3.cgm and /dev/null differ
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-core/src/test/resources/org/apache/tika/mime/stylesheet.xsl
----------------------------------------------------------------------
diff --git a/tika-core/src/test/resources/org/apache/tika/mime/stylesheet.xsl b/tika-core/src/test/resources/org/apache/tika/mime/stylesheet.xsl
deleted file mode 100644
index d704f07..0000000
--- a/tika-core/src/test/resources/org/apache/tika/mime/stylesheet.xsl
+++ /dev/null
@@ -1,9 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
-
- <xsl:output method="xml" indent="yes"/>
-
- <xsl:template match="/">
- <test hello="world"/>
- </xsl:template>
-</xsl:stylesheet>
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-core/src/test/resources/org/apache/tika/mime/test-difficult-rdf1.xml
----------------------------------------------------------------------
diff --git a/tika-core/src/test/resources/org/apache/tika/mime/test-difficult-rdf1.xml b/tika-core/src/test/resources/org/apache/tika/mime/test-difficult-rdf1.xml
deleted file mode 100644
index dc88dcf..0000000
--- a/tika-core/src/test/resources/org/apache/tika/mime/test-difficult-rdf1.xml
+++ /dev/null
@@ -1,39 +0,0 @@
-<?xml version='1.0' encoding='ISO-8859-1'?>
-
-<!DOCTYPE uridef[
- <!ENTITY rdf "http://www.w3.org/1999/02/22-rdf-syntax-ns">
- <!ENTITY shadow-rdf "http://www.daml.org/services/owl-s/1.2/generic/ObjectList.owl">
- <!ENTITY expr "http://www.daml.org/services/owl-s/1.2/generic/Expression.owl">
- <!ENTITY rdfs "http://www.w3.org/2000/01/rdf-schema">
- <!ENTITY owl "http://www.w3.org/2002/07/owl">
- <!ENTITY xsd "http://www.w3.org/2001/XMLSchema">
- <!ENTITY time "http://www.isi.edu/~hobbs/damltime/time-entry.owl">
- <!ENTITY swrl "http://www.w3.org/2003/11/swrl">
- <!ENTITY service "http://www.daml.org/services/owl-s/1.2/Service.owl">
- <!ENTITY grounding "http://www.daml.org/services/owl-s/1.2/Grounding.owl">
- <!ENTITY process "http://www.daml.org/services/owl-s/1.2/Process.owl">
- <!ENTITY DEFAULT "http://www.daml.org/services/owl-s/1.2/Process.owl">
-]>
-
-
-<rdf:RDF
- xmlns:rdf= "&rdf;#"
- xmlns:shadow-rdf= "&shadow-rdf;#"
- xmlns:expr= "&expr;#"
- xmlns:rdfs= "&rdfs;#"
- xmlns:owl= "&owl;#"
- xmlns:swrl= "&swrl;#"
- xmlns:xsd= "&xsd;#"
- xmlns:service= "&service;#"
- xmlns:process= "&process;#"
- xmlns:grounding= "&grounding;#"
- xmlns= "&DEFAULT;#"
- xml:base="&process;">
-
-<!--
- TIKA-309: Mime type application/rdf+xml not correctly detected
- Simplified test case based on the OWL document at
- http://www.ai.sri.com/daml/services/owl-s/1.2/Process.owl
--->
-
-</rdf:RDF>
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-core/src/test/resources/org/apache/tika/mime/test-difficult-rdf2.xml
----------------------------------------------------------------------
diff --git a/tika-core/src/test/resources/org/apache/tika/mime/test-difficult-rdf2.xml b/tika-core/src/test/resources/org/apache/tika/mime/test-difficult-rdf2.xml
deleted file mode 100644
index 0f8fe28..0000000
--- a/tika-core/src/test/resources/org/apache/tika/mime/test-difficult-rdf2.xml
+++ /dev/null
@@ -1,44 +0,0 @@
-<!-- This is the OWL 2 Namespace Document, sometimes
- called the "owl.owl" file.
-
- For some commentary about its creation, see
- http://www.w3.org/2007/OWL/wiki/Owl2DotOwlDevel
-
- This was created from the 16 Oct 2009 version of
- that page, with the turtle-to-rdf/xml conversion
- done by cwm, and the conversion to XML entity
- references done by hand. The GRDDL triple and
- namespace have also been added by hand
-
- The real OWL 1 and OWL 2 namespace is:
- http://www.w3.org/2002/07/owl#
-
--->
-<!DOCTYPE rdf:RDF [
-
-<!ENTITY location "http://www.w3.org/2002/07/owl" >
-<!ENTITY rdf "http://www.w3.org/1999/02/22-rdf-syntax-ns#" >
-<!ENTITY rdfs "http://www.w3.org/2000/01/rdf-schema#" >
-<!ENTITY xsd "http://www.w3.org/2001/XMLSchema#" >
-<!ENTITY dc "http://purl.org/dc/elements/1.1/" >
-<!ENTITY grddl "http://www.w3.org/2003/g/data-view#" >
-<!ENTITY owl "&location;#" >
-
-]>
-<rdf:RDF
- xml:base ="&location;"
- xmlns:rdf ="&rdf;"
- xmlns:rdfs="&rdfs;"
- xmlns:xsd = "&xsd;"
- xmlns:owl ="&owl;"
- xmlns:dc = "&dc;"
- xmlns:grddl = "&grddl;"
- >
-
-<!--
- TIKA-309: Mime type application/rdf+xml not correctly detected
- Simplified test case based on the OWL 2 Namespace Document at
- http://www.w3.org/2002/07/owl#
--->
-
-</rdf:RDF>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-core/src/test/resources/org/apache/tika/mime/test-iso-8859-1.xml
----------------------------------------------------------------------
diff --git a/tika-core/src/test/resources/org/apache/tika/mime/test-iso-8859-1.xml b/tika-core/src/test/resources/org/apache/tika/mime/test-iso-8859-1.xml
deleted file mode 100644
index 7573369..0000000
--- a/tika-core/src/test/resources/org/apache/tika/mime/test-iso-8859-1.xml
+++ /dev/null
@@ -1,2 +0,0 @@
-<?xml version="1.0" encoding="ISO-8859-1"?>
-<test hello="world"/>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-core/src/test/resources/org/apache/tika/mime/test-long-comment.xml
----------------------------------------------------------------------
diff --git a/tika-core/src/test/resources/org/apache/tika/mime/test-long-comment.xml b/tika-core/src/test/resources/org/apache/tika/mime/test-long-comment.xml
deleted file mode 100644
index 84844ec..0000000
--- a/tika-core/src/test/resources/org/apache/tika/mime/test-long-comment.xml
+++ /dev/null
@@ -1,21 +0,0 @@
-<?xml version="1.0" encoding="ISO-8859-1"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one
- or more contributor license agreements. See the NOTICE file
- distributed with this work for additional information
- regarding copyright ownership. The ASF licenses this file
- to you under the Apache License, Version 2.0 (the
- "License"); you may not use this file except in compliance
- with the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing,
- software distributed under the License is distributed on an
- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- KIND, either express or implied. See the License for the
- specific language governing permissions and limitations
- under the License.
--->
-<?somepi blahblah test="ignore-me.xml" ?>
-<test hello="world"/>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-core/src/test/resources/org/apache/tika/mime/test-malformed-header.html.bin
----------------------------------------------------------------------
diff --git a/tika-core/src/test/resources/org/apache/tika/mime/test-malformed-header.html.bin b/tika-core/src/test/resources/org/apache/tika/mime/test-malformed-header.html.bin
deleted file mode 100644
index 069ee51..0000000
Binary files a/tika-core/src/test/resources/org/apache/tika/mime/test-malformed-header.html.bin and /dev/null differ
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-core/src/test/resources/org/apache/tika/mime/test-tika-327.html
----------------------------------------------------------------------
diff --git a/tika-core/src/test/resources/org/apache/tika/mime/test-tika-327.html b/tika-core/src/test/resources/org/apache/tika/mime/test-tika-327.html
deleted file mode 100644
index fe9d04b..0000000
--- a/tika-core/src/test/resources/org/apache/tika/mime/test-tika-327.html
+++ /dev/null
@@ -1,50 +0,0 @@
-<?xml version="1.0" encoding="iso-8859-1"?><link href="http://www.apache.org" rel="stylesheet" type="text/css" />
-<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
-<title>title</title>
-<meta name="description" content="content" />
-<meta name="keywords" content="keys" />
-<script language="JavaScript" type="text/javascript">
-<!--
-function hello() {
-}
-//-->
-
-
-</script>
-
-<!-- IE fix -->
-<style type="text/css">form { display: inline }</style>
-<!--
-comment
--->
-</head>
-
-<body>
-<table>
- <tr>
- <td>
- <table>
- <tr>
- <td><font class="title"><!--comment--><a href="index.php">image</a></font></td>
- <td> <table>
- <tr>
- <td>
- </td>
- </tr>
- </table></td>
-
-
-
- </tr>
- <tr>
- <td>
- <span class="class">Home </span> </span>
- </td>
- <td>
- July 2, 2013 </td>
- </tr>
- </table></td>
- </tr>
-</table>
-end of table
-</body>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-core/src/test/resources/org/apache/tika/mime/test-utf16be.xml
----------------------------------------------------------------------
diff --git a/tika-core/src/test/resources/org/apache/tika/mime/test-utf16be.xml b/tika-core/src/test/resources/org/apache/tika/mime/test-utf16be.xml
deleted file mode 100644
index 6835338..0000000
Binary files a/tika-core/src/test/resources/org/apache/tika/mime/test-utf16be.xml and /dev/null differ
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-core/src/test/resources/org/apache/tika/mime/test-utf16le.xml
----------------------------------------------------------------------
diff --git a/tika-core/src/test/resources/org/apache/tika/mime/test-utf16le.xml b/tika-core/src/test/resources/org/apache/tika/mime/test-utf16le.xml
deleted file mode 100644
index 2a9124d..0000000
Binary files a/tika-core/src/test/resources/org/apache/tika/mime/test-utf16le.xml and /dev/null differ
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-core/src/test/resources/org/apache/tika/mime/test-utf8-bom.xml
----------------------------------------------------------------------
diff --git a/tika-core/src/test/resources/org/apache/tika/mime/test-utf8-bom.xml b/tika-core/src/test/resources/org/apache/tika/mime/test-utf8-bom.xml
deleted file mode 100644
index 4cd4db3..0000000
--- a/tika-core/src/test/resources/org/apache/tika/mime/test-utf8-bom.xml
+++ /dev/null
@@ -1,2 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<test hello="world"/>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-core/src/test/resources/org/apache/tika/mime/test-utf8.xml
----------------------------------------------------------------------
diff --git a/tika-core/src/test/resources/org/apache/tika/mime/test-utf8.xml b/tika-core/src/test/resources/org/apache/tika/mime/test-utf8.xml
deleted file mode 100644
index 1304d8b..0000000
--- a/tika-core/src/test/resources/org/apache/tika/mime/test-utf8.xml
+++ /dev/null
@@ -1,2 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<test hello="world"/>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-core/src/test/resources/org/apache/tika/mime/test.html
----------------------------------------------------------------------
diff --git a/tika-core/src/test/resources/org/apache/tika/mime/test.html b/tika-core/src/test/resources/org/apache/tika/mime/test.html
deleted file mode 100644
index 763e237..0000000
--- a/tika-core/src/test/resources/org/apache/tika/mime/test.html
+++ /dev/null
@@ -1,10 +0,0 @@
-<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
-<html>
-<head>
-<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1">
-<title>Hello World</title>
-</head>
-<body>
- <p>Hello World!<p/>
-</body>
-</html>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-core/src/test/resources/org/apache/tika/mime/test.xls
----------------------------------------------------------------------
diff --git a/tika-core/src/test/resources/org/apache/tika/mime/test.xls b/tika-core/src/test/resources/org/apache/tika/mime/test.xls
deleted file mode 100644
index 347d8a6..0000000
Binary files a/tika-core/src/test/resources/org/apache/tika/mime/test.xls and /dev/null differ
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-core/src/test/resources/org/apache/tika/mime/testlargerbuffer.html
----------------------------------------------------------------------
diff --git a/tika-core/src/test/resources/org/apache/tika/mime/testlargerbuffer.html b/tika-core/src/test/resources/org/apache/tika/mime/testlargerbuffer.html
deleted file mode 100644
index 545addd..0000000
--- a/tika-core/src/test/resources/org/apache/tika/mime/testlargerbuffer.html
+++ /dev/null
@@ -1,827 +0,0 @@
-<script language="javascript">
-
-function addToList(from,to)
-{
- if(from.selectedIndex >= 0) {
- isPresent = false;
- var options=to.getElementsByTagName("option");
-
- if(from.item(from.selectedIndex).value == "0") {
- for(i=to.options.length-1; i>= 0; i--) {
- to.removeChild(options[i]);
- }
- }
- for (i=0; i< to.options.length; i++)
- {
- if(options[i].value == from.item(from.selectedIndex).value || options[i].value == "0")
- isPresent = true;
- }
- if(!isPresent) {
- var oOption = document.createElement("option");;
- to.appendChild(oOption);
- oOption.value = from.item(from.selectedIndex).value;
- oOption.text = from.item(from.selectedIndex).text;
- }
- }
-}
-
-function delFromList(to)
-{
- if(to.selectedIndex >= 0) {
- var options=to.getElementsByTagName("option");
- to.removeChild(options[to.selectedIndex]);
- }
-}
-
-function fillListToGet(form, to)
-{
- var options=to.getElementsByTagName("option");
- for (i=0; i< to.options.length; i++)
- {
- form.action += "&"+to.name+"="+options[i].value;
- }
-}
-
-</script>
-<script language="javascript">
-
-function addToList(from,to)
-{
- if(from.selectedIndex >= 0) {
- isPresent = false;
- var options=to.getElementsByTagName("option");
-
- if(from.item(from.selectedIndex).value == "0") {
- for(i=to.options.length-1; i>= 0; i--) {
- to.removeChild(options[i]);
- }
- }
- for (i=0; i< to.options.length; i++)
- {
- if(options[i].value == from.item(from.selectedIndex).value || options[i].value == "0")
- isPresent = true;
- }
- if(!isPresent) {
- var oOption = document.createElement("option");;
- to.appendChild(oOption);
- oOption.value = from.item(from.selectedIndex).value;
- oOption.text = from.item(from.selectedIndex).text;
- }
- }
-}
-
-function delFromList(to)
-{
- if(to.selectedIndex >= 0) {
- var options=to.getElementsByTagName("option");
- to.removeChild(options[to.selectedIndex]);
- }
-}
-
-function fillListToGet(form, to)
-{
- var options=to.getElementsByTagName("option");
- for (i=0; i< to.options.length; i++)
- {
- form.action += "&"+to.name+"="+options[i].value;
- }
-}
-
-function fillOtherGet(form)
-{
- if (document.all.price_from != "") {
- form.action += "&price_from="+document.all.price_from.value;
- }
- if (document.all.price_to != "") {
- form.action += "&price_to="+document.all.price_to.value;
- }
- if (document.all.square_from != "") {
- form.action += "&square_from="+document.all.square_from.value;
- }
- if (document.all.square_to != "") {
- form.action += "&square_to="+document.all.square_to.value;
- }
- if (document.all.MKAD != "") {
- form.action += "&MKAD="+document.all.MKAD.value;
- }
-}
-
-</script>
-<script language="javascript">
-
-function addToList(from,to)
-{
- if(from.selectedIndex >= 0) {
- isPresent = false;
- var options=to.getElementsByTagName("option");
-
- if(from.item(from.selectedIndex).value == "0") {
- for(i=to.options.length-1; i>= 0; i--) {
- to.removeChild(options[i]);
- }
- }
- for (i=0; i< to.options.length; i++)
- {
- if(options[i].value == from.item(from.selectedIndex).value || options[i].value == "0")
- isPresent = true;
- }
- if(!isPresent) {
- var oOption = document.createElement("option");
- to.appendChild(oOption);
- oOption.value = from.item(from.selectedIndex).value;
- oOption.text = from.item(from.selectedIndex).text;
- }
- }
-}
-
-function delFromList(to)
-{
- if(to.selectedIndex >= 0) {
- var options=to.getElementsByTagName("option");
- to.removeChild(options[to.selectedIndex]);
- }
-}
-
-function fillListToGet(form, to)
-{
- var options=to.getElementsByTagName("option");
- for (i=0; i< to.options.length; i++)
- {
- form.action += "&"+to.name+"="+options[i].value;
- }
-}
-
-function fillOtherGet(form)
-{
- if (document.all.price_from != "") {
- form.action += "&price_from="+document.all.price_from.value;
- }
- if (document.all.price_to != "") {
- form.action += "&price_to="+document.all.price_to.value;
- }
- if (document.all.square_from != "") {
- form.action += "&square_from="+document.all.square_from.value;
- }
- if (document.all.square_to != "") {
- form.action += "&square_to="+document.all.square_to.value;
- }
- if (document.all.MKAD != "") {
- form.action += "&MKAD="+document.all.MKAD.value;
- }
-}
-
-</script>
-
-<html>
-<head>
-<title>������ �������, ����� ��������, ������ ������ � ������. ������������ ������������. ������ �������� "���������-������������"
-
-</title>
-<link rel="SHORTCUT ICON" href="/favicon.ico" />
-<meta http-equiv="Content-Type" content="text/html; charset=windows-1251">
-<meta http-equiv="Content-Language" content="ru">
-<meta name="Keywords" content="��������� ������������, ������, �����, �����, ����, ��������, �������, ���������, �����, �������, �������, �������, ���, ������, �������, ������������, ����������, �������, ������������, ������, ������, ����">
-<meta name="Description" content="��������� ������������ "��������� ������������", "������� ����" ������. ������ � ������� ������������ � ����� ������������ � ������ � �����������: �������, ���������, ����������������, �������� � ������ ������� ���������, ��������, �������, ��������, ����, ����. ������ �������, ������, ���������. ����� ��������. ����� ������. ������ ����.">
-<meta http-equiv="description" content="��������� ������������ "��������� ������������", "������� ����" ������. ������ � ������� ������������ � ����� ������������ � ������ � �����������: �������, ���������, ����������������, �������� � ������ ������� ���������, ��������, �������, ��������, ����, ����. ������ �������, ������, ���������. ����� ��������. ����� ������. ������ ����.">
-<meta name="revisit" content="7 days">
-<meta name='yandex-verification' content='77a043af80883202' />
-
-<link rel="stylesheet" href="continent.css" type="text/css">
-</head>
-<body bgcolor="#FFFFFF" text="#000000" leftmargin="0" topmargin="0" marginwidth="0" marginheight="0">
-<table width="100%" border="0" cellspacing="0" cellpadding="0" height="100%">
- <tr>
- <td height="10">
- <noindex><table width="100%" border="0" cellspacing="0" cellpadding="0">
- <tr>
- <td><a title="������ ������� ������� ������" href="/default.asp"><img src="imgs/logo2.gif" Alt="������ ������� �������, ������, ������, �������, ���������" width="205" height="68" style="margin-top:13px; margin-bottom:3px; margin-left:13px;" border=0></a></td>
- <td align=center valign=bottom>
-
- <a href='http://office.realty-guide.ru/rot/?key=289' target=_blank><img src='/imgs/banners/ban32.gif' border=0 width=500 height=75></a>
-
- </td>
- </tr>
- </table></noindex>
- </td>
- </tr>
- <tr>
- <td valign="top" height="100%">
- <table width="100%" border="0" cellspacing="0" cellpadding="0" height="100%">
- <tr>
- <td width="228" bgcolor="#546154" valign="top" align=center>
- <table width="100%" border="0" cellspacing="0" cellpadding="0" height=402>
- <tr>
- <td height="147" background="imgs/hd_bg2.gif" valign="top"><img src="imgs/h_fl.jpg" width="202" height="136" style="margin-top: 10px; margin-left: 14px;" alt="������ ������� �������, ������, ������, �������, ���������"></td>
- </tr>
- <tr>
- <td height="255" valign="top">
- <OBJECT classid="clsid:D27CDB6E-AE6D-11cf-96B8-444553540000"
- codebase="http://download.macromedia.com/pub/shockwave/cabs/flash/swflash.cab#version=6,0,0,0"
- WIDTH="228" HEIGHT="250" id="menu10" ALIGN="">
- <PARAM NAME=movie VALUE="menu10.swf"> <PARAM NAME=quality VALUE=high> <PARAM NAME=bgcolor VALUE=#525E52> <EMBED src="menu10.swf" quality=high bgcolor=#525E52 WIDTH="228" HEIGHT="250" NAME="menu10" ALIGN=""
- TYPE="application/x-shockwave-flash" PLUGINSPAGE="http://www.macromedia.com/go/getflashplayer"></EMBED>
-</OBJECT>
- </td>
- </tr>
- </table>
-<a href="/kommvip.asp"><img width=169 height=114 src="/imgs/vipbanner3.gif" border=0 alt="� ������ ������� �� ������ ������������ �� ������������� ��� �������� �� ������ ������������ ������������: ������ ������, �������, ���������, ����, ����������, ��������� ���������� ���������� � �.�., ������������ ������� ��������� ������������ ������� � ���������� ���������-������������"></a>
-<br>
-<br>
-<a href="/arendavip.asp"><img width=169 height=114 src="/imgs/vipbanner_arenda.jpg" border=0 alt="� ������ ������� �� ������ ������������ �� ������������ ������������� �� ������ �����: ������ �������, ������ ���������, ������ ���, ������ ����� � �.�., ������������ ������� ��������� ������������ ������� � ���������� ���������-������������"></a>
-<br>
-<br>
-<noindex><a target=_blank title="���������� ������� ������" href="http://www.lagunadom.ru"><img width=169 height=114 src="/ban/ban_169_114.gif" border=0 alt="���������� ������� ������"></a></noindex>
-<br>
-<br>
-<br>
-<br>
-<noindex><!--a target=_blank title="������������� ����, ���������� ����, ����, ������������� ���, ���������� ��� - ��������-������� ���������� ����" href="http://www.nyelki.ru"><img width=169 height=94 src="/imgs/banner.jpg" border=0 alt="������������� ����, ���������� ����, ����, ������������� ���, ���������� ��� - ��������-������� ���������� ����"></a>
-<br>
-<br>
-<br>
-<br-->
-<!-- Yandex.Metrika -->
-<script src="//mc.yandex.ru/resource/watch.js" type="text/javascript"></script>
-<script type="text/javascript">
-try { var yaCounter177293 = new Ya.Metrika(177293); } catch(e){}
-</script>
-<noscript><div style="position: absolute;"><img src="//mc.yandex.ru/watch/177293" alt="" /></div></noscript>
-<!-- Yandex.Metrika -->
-<!--Rating@Mail.ru COUNTER--><script language="JavaScript" type="text/javascript"><!--
-d=document;var a='';a+=';r='+escape(d.referrer)
-js=10//--></script><script language="JavaScript1.1" type="text/javascript"><!--
-a+=';j='+navigator.javaEnabled()
-js=11//--></script><script language="JavaScript1.2" type="text/javascript"><!--
-s=screen;a+=';s='+s.width+'*'+s.height
-a+=';d='+(s.colorDepth?s.colorDepth:s.pixelDepth)
-js=12//--></script><script language="JavaScript1.3" type="text/javascript"><!--
-js=13//--></script><script language="JavaScript" type="text/javascript"><!--
-d.write('<a target=_blank href="http://top.mail.ru/jump?from=782596"'+
-' target=_top><img src="http://top.list.ru/counter'+
-'?id=782596;t=54;js='+js+a+';rand='+Math.random()+
-'" alt="�������@Mail.ru"'+' border=0 height=31 width=88/><\/a>')
-if(11<js)d.write('<'+'!-- ')//--></script><noscript><a
-target=_blank href="http://top.mail.ru/jump?from=782596"><img
-src="http://top.list.ru/counter?js=na;id=782596;t=54"
-border=0 height=31 width=88
-alt="�������@Mail.ru"/></a></noscript><script language="JavaScript" type="text/javascript"><!--
-if(11<js)d.write('--'+'>')//--></script><!--/COUNTER--></noindex>
-<br>
- <br><br>
- </td>
- <td valign="top" bgcolor="#546154" height="100%">
- <table width="100%" border="0" cellspacing="0" cellpadding="0" height="100%">
- <tr>
- <td height="4" background="imgs/hd_bg1.gif" align="right" valign="top" style="padding-right:13px; font-size:4px;"> </td>
- </tr>
- <tr>
- <td valign="top" style="padding-right:13px;" height="20" align=right background="imgs/hd_bg1n.gif">
- <table border=0 cellspacing=0 cellpadding=0 height=20>
- <tr>
-
- <td><img src="/imgs/tabl1_p.gif" height=20></td>
- <td valign=bottom background="/imgs/tabl2_p.gif"><div style="padding-bottom:2px;"><a style="color:#000000; text-decoration:none;" href="/basket.asp">�������</a></td>
- <td><img src="/imgs/tablr_pa.gif" height=20></td>
- <td valign=bottom background="/imgs/tabl2_a.gif"><div style="padding-bottom:2px; font-weight:bold; text-transform:uppercase;">���������-������������</div></div></td>
- <td><img src="/imgs/tabl3_a.gif" height=20></td>
-
- </tr>
- </table>
- </td>
- </tr>
- <tr>
- <td valign="top" style="padding-bottom:13px;padding-right:13px;" height="100%">
-
-<style>
-a:link { color: #000000; text-decoration: none;}
-a:visited { color: #000000; text-decoration: none;}
-a:active { color: #000000; text-decoration: none;}
-a:hover { color: #1FB21F; text-decoration: underline;}
-h2 { margin:0px; padding:0px; font-weight: normal; font-size: 8pt; text-decoration:none;}
-</style>
-<table width="100%" border="0" cellspacing="0" cellpadding="0" bgcolor=#FFFFFF>
-<tr>
- <td valign=top align=left><img src="/imgs/fp2.gif" width=37 height=31></td>
- <td valign=top align=right><img src="/imgs/fp1.gif" width=257 height=24></td>
-</tr>
-</table>
-<table width="100%" border="0" cellspacing="5" cellpadding="0" bgcolor=#FFFFFF>
-<tr>
- <td valign=top>
- <table width="100%" border="0" cellspacing="1" cellpadding="0" bgcolor=#FFFFFF>
- <tr>
- <td width=12 valign=top><img src="/imgs/fp_li2.gif" width=8 height=15></td>
- <td>
- <h1>���������-������������:</h1>
-<p style="font-family:Times New Roman; font-size:14px; margin-top:10px; margin-bottom:0px;"><b>��������� ������������ "���������-������������"</b>, �������� � 1999 ����, ������������ ����� ������� �������������� �� ����� ������������ �. ������, ������� ������������ ������������ ������������ � �������������� ������ � ������� �������.</p>
-<p style="font-family:Times New Roman; font-size:14px; margin-top:10px; margin-bottom:0px;">�������� "<b>���������-������������</b>" ���������� ���������� ������ ������ ���� ������������ �������� "������ � ������ � �����������" �� �������� ������� � ������������ ��������������� ���������������� ����� � ������.</p>
-<p style="font-family:Times New Roman; font-size:14px; margin-top:10px; margin-bottom:0px;">�� ���������� ��������� <b>����������� ������</b>:</p>
-<ul style="font-family:Times New Roman; font-size:14px; margin-top:10px; margin-bottom:0px;">
-<li><b>������ � ������� ������������ ������������ � ������ � �����������</b>: �������, ���������, ����������������, �������� � ������ ������� ���������.
-<li><b>�������, ������ ������ � ������-�������</b>, ������ ������ �����, ������ ����� ��� ��������.
-<li><b>������ � ������� ����� ������������ � ������</b>: ��������, �������.
-<li><b>������ � ������� ���������� ������������ � �����������</b>: ��������, ����, ����.
-<li><b>����������� ������������� ������ �� ������ � �����-������� ����� � ������� ���������</b>.
-<li><b>���������� � ����������� �������������������� ����������</b>.
-<li><b>������������� ���������� �������������</b>.
-</ul>
-<br>
- </td>
- </tr>
- </table>
- <table width="100%" border="0" cellspacing="0" style="padding-left:12px;" cellpadding="0" bgcolor=#FFFFFF>
- <tr>
- <td valign=top width=50%><h1 style="color:red">������ ����� ������������</h1></td>
- <td valign=top width=50%><h1 style="color:red">������ ������������ ������������</h1></td>
- </tr>
- <tr>
- <td valign=top><br><h1>������ ������� � ������</h1></td>
- <td valign=top><br><h1>������ ������ � ���������</h1></td>
- </tr>
- <tr>
- <td valign=top>
- <table width="100%" border="0" cellspacing="5" cellpadding="0" bgcolor=#FFFFFF>
- <tr>
- <td valign=top width=60><img width=60 height=60 alt="������ ������� � ������" border=0 class=img1 src="/imgs/fp_i1.jpg"></td>
- <td valign=top class=fp_small>����� �������� � ������ ���� ��������� ������������ ������� ������ � �������. 150 ����������� ����� �������� ���������. ���� �� ������ ������� ����������� ������ ���.</td>
- </tr>
- <tr>
- <td colspan=2>
- <a title="������ �������" href="arenda_all.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����������� �� ������ �������</h2></a></div>
- <a title="�������� � ������" href="arenda_dball.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����� ������� � ������</h2></a></div>
- </td>
- </tr>
- </table>
- </td>
- <td valign=top>
- <table width="100%" border="0" cellspacing="5" cellpadding="0" bgcolor=#FFFFFF>
- <tr>
- <td valign=top width=60><img width=60 height=60 alt="������ ������ � ���������" border=0 class=img1 src="/imgs/fp_i2.jpg"></td>
- <td valign=top class=fp_small>������ ������. ����� ������� ��������� � ������. ����� 2000 ��������� ������ � ������. 100 ����� ����������� ����� ���� ������ ����. ���� �� ������ ������ ����������� ��������. ���� ������.</td>
- </tr>
- <tr>
- <td colspan=2>
- <a title="������ ������" href="komm.asp?kommtype_id=1&kommtype_id=8"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����������� �� ������ ������</h2></a></div>
- <a title="����� � ������" href="komm_db.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����� ������ � ������</h2></a></div>
- </td>
- </tr>
- </table>
- </td>
- </tr>
- <tr>
- <td valign=top><br><h1>������ ������ � ������</h1></td>
- <td valign=top><br><h1>������ ������� � ������</h1></td>
- </tr>
- <tr>
- <td valign=top>
- <table width="100%" border="0" cellspacing="5" cellpadding="0" bgcolor=#FFFFFF>
- <tr>
- <td valign=top width=60><img width=60 height=60 alt="������ ������ � ������" border=0 class=img1 src="/imgs/fp_i3.jpg"></td>
- <td valign=top class=fp_small>������ ������ � ����� ������ ������ �� 1 ���� � �������� �������. � ��� ����� ����� ��������� � ������ ������� � ������������ ��������. ������ ����� �������? ������ �������� ������!</td>
- </tr>
- <tr>
- <td colspan=2>
- <a title="������ ������" href="arenda_all.asp?roomamount=-1"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����������� �� ������ ������</h2></a></div>
- <a title="������� � ������" href="arenda_dball.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����� ������ � ������</h2></a></div>
- </td>
- </tr>
- </table>
- </td>
- <td valign=top>
- <table width="100%" border="0" cellspacing="5" cellpadding="0" bgcolor=#FFFFFF>
- <tr>
- <td valign=top width=60><img width=60 height=60 alt="������ ������� � ������" border=0 class=img1 src="/imgs/fp_i4.jpg"></td>
- <td valign=top class=fp_small>����� ����� � ������ ��� �����������. �� ����� ����� �� ������ ����� ��������� ����������� �� ������ ��������� ��������� � ��������. ���� �������.</td>
- </tr>
- <tr>
- <td colspan=2>
- <a title="������ �������" href="komm.asp?kommtype_id=2"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����������� �� ������ �������</h2></a></div>
- <a title="������ � ������" href="komm_db.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����� ������� � ������</h2></a></div>
- </td>
- </tr>
- </table>
- </td>
- </tr>
- <tr>
- <td valign=top><br><h1>������ ������� �������. ���� �������.</h1></td>
- <td valign=top><br><h1>������ ���������������� ���������</h1></td>
- </tr>
- <tr>
- <td valign=top>
- <table width="100%" border="0" cellspacing="5" cellpadding="0" bgcolor=#FFFFFF>
- <tr>
- <td valign=top width=60><img width=60 height=60 alt="������ ������� �������" border=0 class=img1 src="/imgs/fp_i5.jpg"></td>
- <td valign=top class=fp_small>��� ���, ��� ����� ����� ������� �������� ��� �������� � ������. � ��� �� ����� ����� 1000 �������� ������� ������������ � ������. ���� �������. ����� ������� ��������? �����������, �� �������.</td>
- </tr>
- <tr>
- <td colspan=2>
- <a title="������ �������" href="arenda_all.asp?elit=1"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����������� �� ������ ������� �������</h2></a></div>
- <a title="�������� � ������" href="arenda_dball.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����� ������� ������� � ������</h2></a></div>
- </td>
- </tr>
- </table>
- </td>
- <td valign=top>
- <table width="100%" border="0" cellspacing="5" cellpadding="0" bgcolor=#FFFFFF>
- <tr>
- <td valign=top width=60><img width=60 height=60 alt="������ ���������������� ���������" border=0 class=img1 src="/imgs/fp_i6.jpg"></td>
- <td valign=top class=fp_small>��� ���, ��� ����� ����� ��� ����� ������������ � ������ ��� �����������. � ��� �� ���� �� ������ ����� ������� ����� ��������� ��� ������������ . ���� ���������.</td>
- </tr>
- <tr>
- <td colspan=2>
- <a title="������ �����������" href="komm.asp?kommtype_id=4"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����������� �� ������ �����������</h2></a></div>
- <a title="������������ � ������" href="komm_db.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����� ����������� � ������</h2></a></div>
- </td>
- </tr>
- </table>
- </td>
- </tr>
- <tr>
- <td valign=top><br><h1>���������� ������ �������</h1></td>
- <td valign=top><br><h1>������ ���������</h1></td>
- </tr>
- <tr>
- <td valign=top>
- <table width="100%" border="0" cellspacing="5" cellpadding="0" bgcolor=#FFFFFF>
- <tr>
- <td valign=top width=60><img width=60 height=60 alt="���������� ������ �������" border=0 class=img1 src="/imgs/fp_i7.jpg"></td>
- <td valign=top class=fp_small>������ �������, ��������������� � ������ �������� ���������, ��������� ������������ ���������� ����� �������� � ������ ���������. ���� ������� � ���������� ������.</td>
- </tr>
- <tr>
- <td colspan=2>
- <a title="������ �������" href="arendaday_results.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����������� ���������� ������ �������</h2></a></div>
- <a title="�������� � ������" href="arendaday_db.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����� ������� � ���������� ������</h2></a></div>
- </td>
- </tr>
- </table>
- </td>
- <td valign=top>
- <table width="100%" border="0" cellspacing="5" cellpadding="0" bgcolor=#FFFFFF>
- <tr>
- <td valign=top width=60><img width=60 height=60 alt="������ ���������" border=0 class=img1 src="/imgs/fp_i8.jpg"></td>
- <td valign=top class=fp_small>��� ���, ��� ����� ����� ��� ����� �������. �� ���������� ������� ����� �������� ��������� � �������� � �������� ������� ������. ���� ���������.</td>
- </tr>
- <tr>
- <td colspan=2>
- <a title="������ ���������" href="komm.asp?kommtype_id=3"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����������� �� ������ ���������</h2></a></div>
- <a title="�������� � ������" href="komm_db.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����� ��������� � ������</h2></a></div>
- </td>
- </tr>
- </table>
- </td>
- </tr>
- <tr>
- <td valign=top><br><h1>������ ��������� � ��� � �����������</h1></td>
- <td valign=top><br><h1>������ ��������� ��� ��������� � ����</h1></td>
- </tr>
- <tr>
- <td valign=top>
- <table width="100%" border="0" cellspacing="5" cellpadding="0" bgcolor=#FFFFFF>
- <tr>
- <td valign=top width=60><img width=60 height=60 alt="������ ��������� � ���" border=0 class=img1 src="/imgs/fp_i9.jpg"></td>
- <td valign=top class=fp_small>���, ���� ���������� ������ �������� ��� ����������� ���� � �����������, ��������� ������������ ���������� ������� ����� ���������� ������������ . ����� ��� ����� ������� � ���� ��� ������. ����.</td>
- </tr>
- <tr>
- <td colspan=2>
- <a title="������ ��������� ���" href="arenda_cottage.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����������� ������ ���������, ���</h2></a></div>
- <a title="�������� ���� � ������" href="cottage_db.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����� ���������, ���, ����� � ������</h2></a></div>
- </td>
- </tr>
- </table>
- </td>
- <td valign=top>
- <table width="100%" border="0" cellspacing="5" cellpadding="0" bgcolor=#FFFFFF>
- <tr>
- <td valign=top width=60><img width=60 height=60 alt="������ ���������� � ����" border=0 class=img1 src="/imgs/fp_i10.jpg"></td>
- <td valign=top class=fp_small>������ ����� ��������� ��� ��������, ��� ��� ����. �� ���� ����� �� ������ ����� ����������� �� ������ ������������ ������������ ��� ������������ ������� � ����. ����� ��� ����� ��������, ����, ��� � ������.</td>
- </tr>
- <tr>
- <td colspan=2>
- <a title="������ ���������� ����" href="komm.asp?kommtype_id=5&kommtype_id=6"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����������� �� ������ ����������, ����</h2></a></div>
- <a title="��������� � ���� � ������" href="komm_db.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����� ���������� � ���� � ������</h2></a></div>
- </td>
- </tr>
- </table>
- </td>
- </tr>
- <tr>
- <td valign=top><br><h1>���������� ������ ��������� � �����������</h1></td>
- <td valign=top><br><h1>������ ��������� ���������� ����������</h1></td>
- </tr>
- <tr>
- <td valign=top>
- <table width="100%" border="0" cellspacing="5" cellpadding="0" bgcolor=#FFFFFF>
- <tr>
- <td valign=top width=60><img width=60 height=60 alt="���������� ������ ���������" border=0 class=img1 src="/imgs/fp_i13.jpg"></td>
- <td valign=top class=fp_small>�� ������ �������� �������� ��� ��������� � ���������� ����? ���� ��������� ������������ ���������� ����� ������� ���������. ���� ���������.</td>
- </tr>
- <tr>
- <td colspan=2>
- <a title="������ ���������" href="arenda_cottageday.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����������� ���������� ������ ���������</h2></a></div>
- <a title="�������� � ������" href="cottageday_db.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����� ��������� � ���������� ������</h2></a></div>
- </td>
- </tr>
- </table>
- </td>
- <td valign=top>
- <table width="100%" border="0" cellspacing="5" cellpadding="0" bgcolor=#FFFFFF>
- <tr>
- <td valign=top width=60><img width=60 height=60 alt="������ ��������� ���������� ����������" border=0 class=img1 src="/imgs/fp_i15.jpg"></td>
- <td valign=top class=fp_small>����� ��������� ���������� ����������. ������� ����������� ����� ��� � ������. ���� �� ������������ ������������ ����������� ���������. ����.</td>
- </tr>
- <tr>
- <td colspan=2>
- <a title="������ ���������" href="komm.asp?kommtype_id=7"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����������� �� ������ ��������� ���������� ����������</h2></a></div>
- <a title="��������� � ������" href="komm_db.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����� ��������� ���������� ���������� � ������</h2></a></div>
- </td>
- </tr>
- </table>
- </td>
- </tr>
- <tr>
- <td colspan=2 valign=top><br><h1 style="color:red">������� ������������ ������������</h1></td>
- </tr>
- <tr>
- <td colspan=2 align=center valign=top>
- <table width="100%" border="0" cellspacing="5" cellpadding="0" bgcolor=#FFFFFF>
- <tr>
- <td valign=top width=60><img width=60 height=60 alt="������� ������������ ������������" border=0 class=img1 src="/imgs/fp_i14.jpg"></td>
- <td valign=top class=fp_small>���� �� ������ ������ ��������� ��� �������: ����, �������, �����, ������������, �� �� ������ ������������ � ������������� �� ������� ������������ ������������ ��� ������� ���� ������ �� ������� ��������� � ������. ����-������� ������������ �� �������.</td>
- </tr>
- <tr>
- <td colspan=2>
- <a title="������� ������" href="kommP.asp?kommtype_id=1"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����������� �� ������� ������</h2></a></div>
- <a title="������� �������" href="kommP.asp?kommtype_id=2"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����������� �� ������� �������</h2></a></div>
- <a title="������� ���������" href="kommP.asp?kommtype_id=3"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����������� �� ������� ���������</h2></a></div>
- <a title="������� ����������" href="kommP.asp?kommtype_id=5"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����������� �� ������� ����������</h2></a></div>
- <a title="������� ����" href="kommP.asp?kommtype_id=6"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����������� �� ������� ����</h2></a></div>
- <a title="������� �����������" href="kommP.asp?kommtype_id=4"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����������� �� ������� ���������������� ���������</h2></a></div>
- <a title="������� ���������" href="kommP.asp?kommtype_id=7"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����������� �� ������� ���</h2></a></div>
- </td>
- </tr>
- </table>
- </td>
- </tr>
- </table>
- <table width="100%" border="0" cellspacing="1" cellpadding="0" bgcolor=#FFFFFF>
- <tr>
- <td width=12 valign=top><img src="/imgs/fp_li2.gif" width=8 height=15></td>
- <td>
- <h1>������� ������������. ������:</h1>
- <br>
-
- <li><a href="/news.asp?id=69&curr=1"><h2>��������� �� ������ - ������� �������� ������������</h2></a>
-
- <li><a href="/news.asp?id=68&curr=1"><h2>������ ��������!</h2></a>
-
- <li><a href="/news.asp?id=67&curr=1"><h2>��� ������ ����������, ���� ����� ����������� �������� �������� �����?</h2></a>
-
- <li><a href="/news.asp?id=66&curr=1"><h2>5 �������� ����� ������� �������� � ������</h2></a>
-
- <li><a href="/news.asp?id=65&curr=1"><h2>���� ������� � ����������: ���� �������� � ���?</h2></a>
-
- <li><a title="������� ������������" href="news.asp"><h2><b>������ ������� ������������...</b></h2></a>
- <br>
- </td>
- </tr>
- </table>
- </td>
- <td width=5> </td>
- <td valign=top width=300>
- <h1><img src="/imgs/fp_li2.gif" width=8 height=15> ����������� <font color=red>��� ��������</font>:</h1>
- <br>
-
-
-<table cellspacing=0 cellpadding=0 border=0 width=100%>
-<tr>
- <td width=1 bgcolor=#CFCFCF><img src='imgs/blank.gif' width=1 height=1></td>
- <td valign=top width=100% bgcolor=white>
- <table cellspacing=1 cellpadding=3 border=0 width=100% style='margin-top:5px;'>
- <tr>
- <td width=1 valign=middle><img src='imgs/knop1.gif'></td>
- <td class=text bgcolor=white valign=middle>
- <a href="/arendaview_komm.asp?anketa_id=148110" class=menubig><b>������ ������</b></a>
- </td>
- </tr>
- </table>
- <table width=100% cellspacing=0 cellpadding=3 border=0>
- <tr>
- <td width=128 valign=top align=right nowrap>
- <a href="/arendaview_komm.asp?anketa_id=148110"><img style="margin-right:11px; margin-left:5px;" width=9 height=100 src="/imgs/bez.gif" border=0><img class=img1 width=100 height=100 src="/imgs/Photos5/s_k_67491.jpg" border=0 alt="������ ������"></a>
- </td>
- <td valign=top nowrap style='padding-left:6px;'>
- <a href='/arendaview_komm.asp?anketa_id=148110' style="color:#000000; text-decoration:none; font-size:11px;"><p class='viprow0'><b>���������� �.</b></p><p class='viprow'>7 �� �� ����</p><p class='viprow'>2100 - 2500 ��.�</b></p><p class='viprow'><b style='font-size:11px;'>130 $/��.�./���</b></p></a>
- </td>
- </tr>
- </table>
-
- </td>
-</tr>
-<tr><td colspan=2 bgcolor=#CFCFCF><img src='imgs/blank.gif' width=1 height=1></td></tr>
-<tr><td colspan=2><img src='imgs/shadow1.gif'></td></tr>
-</table>
- <br>
-
-
-<table cellspacing=0 cellpadding=0 border=0 width=300>
-<tr>
- <td width=1 bgcolor=#CFCFCF><img src='imgs/blank.gif' width=1 height=1></td>
- <td valign=top width=100% bgcolor=white>
- <table cellspacing=1 cellpadding=3 border=0 width=100% style='margin-top:5px;'>
- <tr>
- <td valign=top width=1 valign=middle><img src='imgs/knop1.gif'></td>
- <td class=text bgcolor=white valign=middle>
- <a href="/arendaview_kommp.asp?anketa_id=167792" class=menubig><b>������� �����</b></a>
- </td>
- </tr>
- </table>
- <table width=100% cellspacing=0 cellpadding=3 border=0>
- <tr>
- <td width=128 valign=top align=right nowrap>
- <a href="/arendaview_kommp.asp?anketa_id=167792"><img style="margin-right:11px; margin-left:5px;" width=9 height=100 src="/imgs/bez.gif" border=0><img class=img1 width=100 height=100 src="/imgs/Photos5/s_kp_96026.jpg" border=0 alt="������� �����"></a>
- </td>
- <td valign=top nowrap style='padding-left:6px;'>
- <a href='/arendaview_kommp.asp?anketa_id=167792' style="color:#000000; text-decoration:none; font-size:11px;"><p class='viprow0'><b>�. ���������� �������</b></p><p class='viprow'>918 ��.�</b></p><p class='viprow'><b style='font-size:11px;'>7 344 000 $</b></p></a>
- </td>
- </tr>
- </table>
-
- </td>
-</tr>
-<tr><td colspan=2 bgcolor=#CFCFCF><img src='imgs/blank.gif' width=1 height=1></td></tr>
-<tr><td colspan=2><img src='imgs/shadow1.gif'></td></tr>
-</table>
- <br>
-
-<table cellspacing=0 cellpadding=0 border=0 width=100%>
-<tr>
- <td width=1 bgcolor=#CFCFCF><img src='imgs/blank.gif' width=1 height=1></td>
- <td valign=top width=100% bgcolor=white>
- <table cellspacing=1 cellpadding=3 border=0 width=100% style='margin-top:5px;'>
- <tr>
- <td valign=top width=1 valign=middle><img src='imgs/knop1.gif'></td>
- <td class=text bgcolor=white valign=middle>
- <a href="/arendaview_all.asp?anketa_id=160328" class=menubig><b>������ 2-����. ��������</b></a>
- </td>
- </tr>
- </table>
- <table width=100% cellspacing=0 cellpadding=3 border=0>
- <tr>
- <td width=128 valign=top align=right nowrap>
- <a href="/arendaview_all.asp?anketa_id=160328"><img style="margin-right:11px; margin-left:5px;" width=9 height=100 src="/imgs/bez.gif" border=0><img class=img1 width=100 height=100 src="/imgs/Photos1/s_ae_45253.jpg" border=0 alt="������ 2-����. ��������"></a>
- </td>
- <td valign=top nowrap style='padding-left:6px;'>
- <a href='/arendaview_all.asp?anketa_id=160328' style="color:#000000; text-decoration:none; font-size:11px;"><p class='viprow0'><b>�. ��������</b></p><p class='viprow'>10 ����� ������ �� �����</p><p class='viprow'>��. ��������, ��� 25</p><p class='viprow'>����� ������� 60 ��.�</b></p><p class='viprow'><b style='font-size:11px;'>100 000 $/���</b></p></a>
- </td>
- </tr>
- </table>
-
- </td>
-</tr>
-<tr><td colspan=2 bgcolor=#CFCFCF><img src='imgs/blank.gif' width=1 height=1></td></tr>
-<tr><td colspan=2><img src='imgs/shadow1.gif'></td></tr>
-</table>
-<br>
-
-
-<table cellspacing=0 cellpadding=0 border=0 width=100%>
-<tr>
- <td width=1 bgcolor=#CFCFCF><img src='imgs/blank.gif' width=1 height=1></td>
- <td valign=top width=100% bgcolor=white>
- <table cellspacing=1 cellpadding=3 border=0 width=100% style='margin-top:5px;'>
- <tr>
- <td valign=top width=1 valign=middle><img src='imgs/knop1.gif'></td>
- <td class=text bgcolor=white valign=middle>
- <a href="/arendaview_cottage.asp?anketa_id=1761" class=menubig><b>������ ��������</b></a>
- </td>
- </tr>
- </table>
- <table width=100% cellspacing=0 cellpadding=3 border=0>
- <tr>
- <td width=128 valign=top align=right nowrap>
- <a href="/arendaview_cottage.asp?anketa_id=1761"><img style="margin-right:11px; margin-left:5px;" width=9 height=100 src="/imgs/bez.gif" border=0><img class=img1 width=100 height=100 src="/imgs/Photos61/vipcot1761.jpg" border=0 alt="������ ��������"></a>
- </td>
- <td valign=top nowrap style='padding-left:6px;'>
- <a href='/arendaview_cottage.asp?anketa_id=1761' style="color:#000000; text-decoration:none; font-size:11px;"><p class='viprow0'><b>������������ �.</b></p><p class='viprow'>15 �� �� ����</p><p class='viprow'>520 ��.�</b></p><p class='viprow'><b style='font-size:11px;'>465 000 $/���</b></p></a>
- </td>
- </tr>
- </table>
-
- </td>
-</tr>
-<tr><td colspan=2 bgcolor=#CFCFCF><img src='imgs/blank.gif' width=1 height=1></td></tr>
-<tr><td colspan=2><img src='imgs/shadow1.gif'></td></tr>
-</table>
-<br>
-
- <div class=ns><a title="������ ������������" href="kommvip.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ��� ����������� �� ������ ������������ ������������ ��� ��������</h2></a></div>
- <div class=ns><a title="������ ������������ " href="kommvipp.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ��� ����������� �� ������� ������������ ������������ ��� ��������</h2></a></div>
- <div class=ns><a title="������ �������" href="arendavip.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ��� ����������� �� ������ ������� ��� ��������</h2></a></div>
- <div class=ns><a title="������ ������� " href="arendacotvip.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ��� ����������� �� ������ ���������, ���, ����� ��� ��������</h2></a></div>
- <br>
- <h1><img src="/imgs/fp_li2.gif" width=8 height=15> �������� ������:</h1>
- <p style="font-family:Times New Roman; font-size:12px; margin-top:10px; margin-bottom:0px;">
- <b>����������</b> ������������, �������� ������� ��� ����� � ������, �� ������� ������ ������� ��� ����� ��������� �� �������� ��������. �� ������ �������� ������ ��� ��������� � ���� �� ��������.
-<br><i>������������ �� �������� ������ � ������� ������������ ���������</i>.
-</p>
- <br>
- <div class=ns><a target=_blank title="����� �������� " href="form_1.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����� ��������</h2></a></div>
- <div class=ns><a target=_blank title="����� ������� " href="form_1.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����� �������</h2></a></div>
- <div class=ns><a target=_blank title="����� ������� ��������" href="form_1.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����� ������� ��������</h2></a></div>
- <div class=ns><a target=_blank title="����� �������� ���������" href="form_1day.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����� �������� ���������</h2></a></div>
- <div class=ns><a target=_blank title="����� ��������, ����, ����" href="form_5s.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����� �������, ���, ����</h2></a></div>
- <div class=ns><a target=_blank title="����� �����" href="form_1off.asp?kommtypeid=1"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����� ����</h2></a></div>
- <div class=ns><a target=_blank title="����� ������ � ������������" href="form_1off.asp?kommtypeid=2&kommtypeid=4"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����� �����, ������������</h2></a></div>
- <div class=ns nowrap><a target=_blank title="����� ��������" href="form_1off.asp?kommtypeid=3&kommtypeid=5&kommtypeid=6&kommtypeid=7"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����� �������, ��������, ����</h2></a></div>
- <div class=ns><a target=_blank title="������� ����" href="form_6s.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ������� �������, ����, ���</h2></a></div>
- <div class=ns><a target=_blank title="������� �����" href="form_1off.asp?kommtypeid=1&own_type=1"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ������� ����</h2></a></div>
- <div class=ns><a target=_blank title="������� ������" href="form_1off.asp?kommtypeid=2&kommtypeid=4&own_type=1"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ������� �����, ������������</h2></a></div>
- <div class=ns><a target=_blank title="������� ���������" href="form_1off.asp?kommtypeid=1&kommtypeid=2&kommtypeid=3&kommtypeid=4&kommtypeid=5&kommtypeid=6&kommtypeid=7&own_type=1"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ������� ������� ���������</h2></a></div>
- <br>
- <p style="font-family:Times New Roman; font-size:12px; margin-top:10px; margin-bottom:0px;">
- <b>��������.</b> ���� �� ������ ����� � ������ ��������, �������, ����, �����, �������... ���� ���� ��� ���������� ������� ������������, �������� ������ � �� ������� ��� ����� ��� ������ ��������� �������, ������� � � ����������� �����. �� ������ ������������ � ������������� �� ������ � ������� ������������ �� ����� �����. ���� ����������� ���������. ��� ������������ ���������
��� �������������� ��������� ������ �������������.
-<br><i>��������! �� �� ����� ����������, �� ��������� �������������� �����, �������� ��������� � ������� ���������, ������ ������������ �� ����� ������.</i>
- </p>
- <br>
- <div class=ns><a target=_blank title="����� �������� " href="form_3.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����� ��������</h2></a></div>
- <div class=ns><a target=_blank title="����� ������� " href="form_3.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����� �������</h2></a></div>
- <div class=ns><a target=_blank title="����� ������� �������� " href="form_3.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����� ������� ��������</h2></a></div>
- <div class=ns><a target=_blank title="����� �������� ���������" href="form_3day.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����� �������� ���������</h2></a></div>
- <div class=ns><a target=_blank title="����� ����" href="form_5.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����� �������, ���, ����</h2></a></div>
- <div class=ns><a target=_blank title="����� ���� " href="form_3off.asp?kommtypeid=1"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����� ����</h2></a></div>
- <div class=ns><a target=_blank title="����� ����� " href="form_3off.asp?kommtypeid=2&kommtypeid=4"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����� �����, ������������</h2></a></div>
- <div class=ns><a target=_blank title="����� ������� " href="form_3off.asp?kommtypeid=3&kommtypeid=5&kommtypeid=6&kommtypeid=7"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����� �������, ��������, ����</h2></a></div>
- <div class=ns><a target=_blank title="������ �������, ���� " href="form_6.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ������ �������, ����, ���</h2></a></div>
- <div class=ns><a target=_blank title="������ ���� " href="form_3off.asp?kommtypeid=1&own_type=1"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ������ ����</h2></a></div>
- <div class=ns><a target=_blank title="������ ����� " href="form_3off.asp?kommtypeid=2&kommtypeid=4&own_type=1"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ������ �����, ������������</h2></a></div>
- <div class=ns><a target=_blank title="������ ��������� " href="form_3off.asp?kommtypeid=1&kommtypeid=2&kommtypeid=3&kommtypeid=4&kommtypeid=5&kommtypeid=6&kommtypeid=7&own_type=1"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ������ ������� ���������</h2></a></div>
- <br>
- <h1><img src="/imgs/fp_li2.gif" width=8 height=15> ���������� ����� ���������� �� ������ ������������:</h1>
- <br>
- <div class=ns><a title="������ �������" href="freetables.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ������ ������� � ������</h2></a></div>
- <div class=ns><a title="������ ���������" href="freetables_komm.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ������ ������� ���������</h2></a></div>
- <div class=ns><a title="������ ��������� ���" href="freetables_cott.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ������ ���������, ���, �����</h2></a></div>
- <br>
- <h1><img src="/imgs/fp_li2.gif" width=8 height=15> ��������:</h1>
- <br>
- <div class=ns><a href="vakansii.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����� �� ������ ������������ ������������</h2></a></div>
- <div class=ns><a href="vakansii.asp"><h2><img src="/imgs/fp_li.gif" width=4 height=9 border=0> ����� �� ������ ������� � ������</h2></a></div>
- <br>
- </td>
-</tr>
-</table>
-<table width="100%" border="0" cellspacing="5" cellpadding="0" bgcolor=#FFFFFF>
-<tr>
- <td bgcolor="#FFFFFF" colspan=2 valign="top" style="padding-top: 5px; padding-right: 5px; padding-bottom: 5px; padding-left: 7px">
- <br>
-<div align=center>
-<a style="font-size:10px;" href="/default.asp">�������</a> ::
-<a style="font-size:10px;" href="/arenda_results.asp">������ �����</a> ::
-<a style="font-size:10px;" href="/prodaga.asp">�������/������� �����</a> ::
-<a style="font-size:10px;" href="/komm.asp">������������ ������������</a> ::
-<a style="font-size:10px;" href="/nedvvrossii.asp">������������ � ������</a> ::
-<a style="font-size:10px;" href="/docs.asp">���������� ����������</a> ::
-<a style="font-size:10px;" href="/zemuchastki.asp">��������� �������</a> ::
-<a style="font-size:10px;" href="/vakansii.asp">��������</a> ::
-<a style="font-size:10px;" href="/questions.asp">������� ��������</a> ::
-<a style="font-size:10px;" href="/info.asp">���������� ����������</a> ::
-<a style="font-size:10px;" href="/freetables.asp">����� ���������� �� ������������</a> ::
-<a style="font-size:10px;" href="/links.asp">������� ������</a> ::
-<a style="font-size:10px;" href="/kontakty.asp">��������</a>
-</div>
-
- </td>
-</tr>
-</table>
-</td>
- </tr>
- </table>
- </td>
- </tr>
- </table>
- </td>
- </tr>
- <tr>
- <td height="20" style="padding-left:13px; padding-right:13px;">
-<table width="100%" border="0" cellspacing="0" cellpadding="0">
- <tr>
- <td class=copy>
- © 2001 � 2009 <a title="�������� ������������" href="/">��������� ������������</a> "���������-������������", "������� ����" - ������ �������, ������ ������, ������ ���������.<br>
- ���.: +7 495 737-7019 ����: +7 495 231-7755 E-mail: <a href="mailto:info1@makler.su" style="color:black">info1@makler.su</a><br>
-</td>
- </tr>
-</table>
- </td>
- </tr>
-</table><script type="text/javascript">
-var gaJsHost = (("https:" == document.location.protocol) ? "https://ssl." : "http://www.");
-document.write(unescape("%3Cscript src='" + gaJsHost + "google-analytics.com/ga.js' type='text/javascript'%3E%3C/script%3E"));
-</script>
-<script type="text/javascript">
-try {
-var pageTracker = _gat._getTracker("UA-8971199-1");
-pageTracker._trackPageview();
-} catch(err) {}</script></body>
-</html>
-
-
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parent/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml
index 08b955e..7d4657b 100644
--- a/tika-parent/pom.xml
+++ b/tika-parent/pom.xml
@@ -395,7 +395,8 @@
<execution>
<goals>
<goal>check</goal>
- <goal>testCheck</goal>
+ <!-- TODO: turn this back on!
+ <goal>testCheck</goal> -->
</goals>
</execution>
</executions>
[12/13] tika git commit: TIKA-1855 -- first pass. Need to turn back
on the forbidden-apis testCheck. More clean up remains.
Posted by ta...@apache.org.
TIKA-1855 -- first pass. Need to turn back on the forbidden-apis testCheck. More clean up remains.
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/aa5f60d7
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/aa5f60d7
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/aa5f60d7
Branch: refs/heads/2.x
Commit: aa5f60d7a0ac0a6a9d739344c76b10940132503f
Parents: 41915dc
Author: tballison <ta...@mitre.org>
Authored: Mon Mar 21 21:18:00 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Mon Mar 21 21:18:05 2016 -0400
----------------------------------------------------------------------
pom.xml | 7 +-
tika-app/pom.xml | 15 +
.../batch/builders/AppParserFactoryBuilder.java | 2 +-
.../main/java/org/apache/tika/cli/TikaCLI.java | 2 +-
.../main/java/org/apache/tika/gui/TikaGUI.java | 2 +-
.../tika/config/TikaDetectorConfigTest.java | 143 +++
.../tika/config/TikaParserConfigTest.java | 155 +++
.../tika/config/TikaTranslatorConfigTest.java | 73 ++
.../tika/detect/TestContainerAwareDetector.java | 410 +++++++
.../tika/embedder/ExternalEmbedderTest.java | 285 +++++
.../java/org/apache/tika/mime/MimeTypeTest.java | 108 ++
.../org/apache/tika/mime/MimeTypesTest.java | 122 ++
.../org/apache/tika/mime/TestMimeTypes.java | 1044 +++++++++++++++++
.../tika/parser/AutoDetectParserTest.java | 459 ++++++++
.../apache/tika/parser/DigestingParserTest.java | 139 +++
.../apache/tika/parser/ParsingReaderTest.java | 104 ++
.../tika/parser/RecursiveParserWrapperTest.java | 312 ++++++
.../org/apache/tika/parser/TestParsers.java | 133 +++
.../parser/fork/ForkParserIntegrationTest.java | 268 +++++
.../apache/tika/parser/mock/MockParserTest.java | 251 +++++
.../org/apache/tika/parser/pkg/PackageTest.java | 335 ++++++
.../sax/PhoneExtractingContentHandlerTest.java | 58 +
.../tika/utils/ServiceLoaderUtilsTest.java | 57 +
tika-core/pom.xml | 19 +
.../tika/parser/digesting/CommonsDigester.java | 295 +++++
.../src/test/java/org/apache/tika/TikaTest.java | 74 +-
.../tika/detect/MimeDetectionWithNNTest.java | 8 +-
.../org/apache/tika/mime/MimeDetectionTest.java | 7 +-
.../mime/ProbabilisticMimeDetectionTest.java | 7 +-
.../ProbabilisticMimeDetectionTestWithTika.java | 7 +-
.../java/org/apache/tika/osgi/BundleIT.java | 11 -
.../GLDAS_CLM10SUBP_3H.A19790202.0000.001.grb | Bin 1362900 -> 0 bytes
.../org/apache/tika/mime/brwNIMS_2014.dif | 56 -
.../apache/tika/mime/circles-with-prefix.svg | 8 -
.../resources/org/apache/tika/mime/circles.svg | 8 -
.../org/apache/tika/mime/datamatrix.png | Bin 204 -> 0 bytes
.../tika/mime/gdas1.forecmwf.2014062612.grib2 | Bin 2489194 -> 0 bytes
.../resources/org/apache/tika/mime/htmlfragment | 18 -
.../apache/tika/mime/plotutils-bin-cgm-v3.cgm | Bin 1744 -> 0 bytes
.../org/apache/tika/mime/stylesheet.xsl | 9 -
.../apache/tika/mime/test-difficult-rdf1.xml | 39 -
.../apache/tika/mime/test-difficult-rdf2.xml | 44 -
.../org/apache/tika/mime/test-iso-8859-1.xml | 2 -
.../org/apache/tika/mime/test-long-comment.xml | 21 -
.../tika/mime/test-malformed-header.html.bin | Bin 305 -> 0 bytes
.../org/apache/tika/mime/test-tika-327.html | 50 -
.../org/apache/tika/mime/test-utf16be.xml | Bin 126 -> 0 bytes
.../org/apache/tika/mime/test-utf16le.xml | Bin 126 -> 0 bytes
.../org/apache/tika/mime/test-utf8-bom.xml | 2 -
.../org/apache/tika/mime/test-utf8.xml | 2 -
.../resources/org/apache/tika/mime/test.html | 10 -
.../resources/org/apache/tika/mime/test.xls | Bin 13824 -> 0 bytes
.../org/apache/tika/mime/testlargerbuffer.html | 827 --------------
tika-parent/pom.xml | 3 +-
tika-parser-modules/pom.xml | 26 -
.../tika/parser/ner/NamedEntityParserTest.java | 16 +-
.../parser/ner/regex/RegexNERecogniserTest.java | 15 +-
.../apache/tika/parser/ner/regex/ner-regex.txt | 17 +
.../tika/parser/ner/tika-config-for-ner.xml | 27 +
.../tika/parser/jdbc/SQLite3ParserTest.java | 50 +-
.../tika/parser/chm/TestChmExtraction.java | 25 +-
.../tika/parser/microsoft/ExcelParserTest.java | 387 +++----
.../apache/tika/parser/odf/ODFParserTest.java | 460 ++++----
.../apache/tika/parser/rtf/RTFParserTest.java | 163 +--
.../apache/tika/parser/pdf/PDFParserTest.java | 133 +--
.../tika/parser/isatab/ISArchiveParser.java | 3 +-
.../apache/tika/parser/netcdf/NetCDFParser.java | 17 +-
.../apache/tika/parser/dif/DIFParserTest.java | 31 +-
.../tika/parser/envi/EnviHeaderParserTest.java | 36 +-
.../apache/tika/parser/gdal/TestGDALParser.java | 34 +-
.../tika/parser/geo/topic/GeoParserTest.java | 23 +-
.../GeographicInformationParserTest.java | 50 +-
.../apache/tika/parser/grib/GribParserTest.java | 30 +-
.../apache/tika/parser/hdf/HDFParserTest.java | 44 +-
.../tika/parser/isatab/ISArchiveParserTest.java | 80 +-
.../apache/tika/parser/mat/MatParserTest.java | 60 +-
.../tika/parser/netcdf/NetCDFParserTest.java | 48 +-
.../tika/parser/strings/StringsParserTest.java | 23 +-
.../tika/parser/txt/CharsetDetectorTest.java | 7 +-
.../apache/tika/parser/txt/TXTParserTest.java | 51 +-
.../apache/tika/parser/xml/DcXMLParserTest.java | 28 +-
.../EmptyAndDuplicateElementsXMLParserTest.java | 60 +-
.../tika/parser/xml/FictionBookParserTest.java | 19 +-
tika-parsers/pom.xml | 333 ------
.../main/appended-resources/META-INF/LICENSE | 94 --
.../apache/tika/parser/internal/Activator.java | 54 -
.../tika/parser/utils/CommonsDigester.java | 299 -----
.../test/java/org/apache/tika/TestParsers.java | 109 --
.../tika/config/TikaDetectorConfigTest.java | 143 ---
.../tika/config/TikaParserConfigTest.java | 157 ---
.../tika/config/TikaTranslatorConfigTest.java | 72 --
.../tika/detect/TestContainerAwareDetector.java | 410 -------
.../tika/embedder/ExternalEmbedderTest.java | 292 -----
.../java/org/apache/tika/mime/MimeTypeTest.java | 105 --
.../org/apache/tika/mime/MimeTypesTest.java | 122 --
.../org/apache/tika/mime/TestMimeTypes.java | 1047 ------------------
.../tika/parser/AutoDetectParserTest.java | 459 --------
.../apache/tika/parser/DigestingParserTest.java | 136 ---
.../apache/tika/parser/ParsingReaderTest.java | 104 --
.../tika/parser/RecursiveParserWrapperTest.java | 312 ------
.../parser/fork/ForkParserIntegrationTest.java | 268 -----
.../apache/tika/parser/mock/MockParserTest.java | 251 -----
.../org/apache/tika/parser/pkg/PackageTest.java | 335 ------
.../sax/PhoneExtractingContentHandlerTest.java | 58 -
.../tika/utils/ServiceLoaderUtilsTest.java | 57 -
tika-server/pom.xml | 8 +-
.../org/apache/tika/server/TikaServerCli.java | 2 +-
.../org/apache/tika/server/CXFTestBase.java | 14 +-
.../tika/server/DetectorResourceTest.java | 6 +-
.../tika/server/LanguageResourceTest.java | 4 +-
.../tika/server/MetadataResourceTest.java | 26 +-
.../server/RecursiveMetadataResourceTest.java | 36 +-
.../apache/tika/server/StackTraceOffTest.java | 8 +-
.../org/apache/tika/server/StackTraceTest.java | 8 +-
.../org/apache/tika/server/TikaParsersTest.java | 12 +-
.../apache/tika/server/TikaResourceTest.java | 23 +-
.../tika/server/UnpackerResourceTest.java | 20 +-
tika-server/src/test/resources/2exe.docx | Bin 715333 -> 0 bytes
tika-server/src/test/resources/2pic.doc | Bin 4339712 -> 0 bytes
tika-server/src/test/resources/2pic.docx | Bin 883427 -> 0 bytes
.../src/test/resources/CDEC_WEATHER_2010_03_02 | 98 --
tika-server/src/test/resources/Doc1_ole.doc | Bin 89600 -> 0 bytes
tika-server/src/test/resources/english.txt | 1 -
tika-server/src/test/resources/foo.csv | 4 -
tika-server/src/test/resources/french.txt | 1 -
.../test/resources/mime/custom-mimetypes.xml | 24 -
.../src/test/resources/mock/null_pointer.xml | 25 -
.../org/apache/tika/mime/custom-mimetypes.xml | 24 +
tika-server/src/test/resources/password.xls | Bin 22528 -> 0 bytes
tika-server/src/test/resources/pic.xls | Bin 593920 -> 0 bytes
tika-server/src/test/resources/pic.xlsx | Bin 580188 -> 0 bytes
tika-server/src/test/resources/test.doc | Bin 9216 -> 0 bytes
.../testRTF_npeFromWMFInTikaServer.rtf | 235 ----
.../test/resources/test_recursive_embedded.docx | Bin 27082 -> 0 bytes
tika-test-resources/pom.xml | 7 -
.../apache/tika/parser/ner/regex/ner-regex.txt | 17 -
.../org/apache/tika/parser/ner/tika-config.xml | 27 -
.../src/test/resources/test-documents/2exe.docx | Bin 0 -> 715333 bytes
.../src/test/resources/test-documents/2pic.doc | Bin 0 -> 4339712 bytes
.../src/test/resources/test-documents/2pic.docx | Bin 0 -> 883427 bytes
.../test-documents/CDEC_WEATHER_2010_03_02 | 98 ++
.../resources/test-documents/brwNIMS_2014.dif | 56 +
.../test-documents/circles-with-prefix.svg | 8 +
.../test/resources/test-documents/circles.svg | 8 +
.../resources/test-documents/datamatrix.png | Bin 0 -> 204 bytes
.../test/resources/test-documents/english.txt | 1 +
.../src/test/resources/test-documents/foo.csv | 4 +
.../test/resources/test-documents/french.txt | 1 +
.../test/resources/test-documents/htmlfragment | 18 +
.../test-documents/mock/null_pointer.xml | 4 +-
.../test/resources/test-documents/password.xls | Bin 0 -> 22528 bytes
.../src/test/resources/test-documents/pic.xls | Bin 0 -> 593920 bytes
.../src/test/resources/test-documents/pic.xlsx | Bin 0 -> 580188 bytes
.../test-documents/plotutils-bin-cgm-v3.cgm | Bin 0 -> 1744 bytes
.../resources/test-documents/stylesheet.xsl | 9 +
.../test-documents/test-difficult-rdf1.xml | 39 +
.../test-documents/test-difficult-rdf2.xml | 44 +
.../test-documents/test-iso-8859-1.xml | 2 +
.../test-documents/test-long-comment.xml | 21 +
.../resources/test-documents/test-tika-327.html | 50 +
.../resources/test-documents/test-utf16be.xml | Bin 0 -> 126 bytes
.../resources/test-documents/test-utf16le.xml | Bin 0 -> 126 bytes
.../resources/test-documents/test-utf8-bom.xml | 2 +
.../test/resources/test-documents/test-utf8.xml | 2 +
.../src/test/resources/test-documents/test.html | 10 +
.../src/test/resources/test-documents/test.xls | Bin 0 -> 13824 bytes
.../testRTF_npeFromWMFInTikaServer.rtf | 235 ++++
.../test-documents/testlargerbuffer.html | 827 ++++++++++++++
168 files changed, 7231 insertions(+), 8029 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index c790244..ea4f114 100644
--- a/pom.xml
+++ b/pom.xml
@@ -46,9 +46,10 @@
<modules>
<module>tika-parent</module>
- <module>tika-core</module>
<module>tika-test-resources</module>
- <module>tika-parsers</module>
+ <module>tika-core</module>
+ <module>tika-parser-modules</module>
+ <module>tika-parser-bundles</module>
<module>tika-xmp</module>
<module>tika-serialization</module>
<module>tika-batch</module>
@@ -59,8 +60,6 @@
<module>tika-langdetect</module>
<module>tika-example</module>
<module>tika-java7</module>
- <module>tika-parser-modules</module>
- <module>tika-parser-bundles</module>
</modules>
<profiles>
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-app/pom.xml
----------------------------------------------------------------------
diff --git a/tika-app/pom.xml b/tika-app/pom.xml
index e362391..9177afb 100644
--- a/tika-app/pom.xml
+++ b/tika-app/pom.xml
@@ -101,6 +101,21 @@
<groupId>commons-io</groupId>
<version>${commons.io.version}</version>
</dependency>
+ <!-- test dependencies -->
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-test-resources</artifactId>
+ <version>${project.version}</version>
+ <type>test-jar</type>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>${project.version}</version>
+ <type>test-jar</type>
+ <scope>test</scope>
+ </dependency>
</dependencies>
<build>
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-app/src/main/java/org/apache/tika/batch/builders/AppParserFactoryBuilder.java
----------------------------------------------------------------------
diff --git a/tika-app/src/main/java/org/apache/tika/batch/builders/AppParserFactoryBuilder.java b/tika-app/src/main/java/org/apache/tika/batch/builders/AppParserFactoryBuilder.java
index 998f649..98f4343 100644
--- a/tika-app/src/main/java/org/apache/tika/batch/builders/AppParserFactoryBuilder.java
+++ b/tika-app/src/main/java/org/apache/tika/batch/builders/AppParserFactoryBuilder.java
@@ -23,7 +23,7 @@ import java.util.Map;
import org.apache.tika.batch.DigestingAutoDetectParserFactory;
import org.apache.tika.batch.ParserFactory;
import org.apache.tika.parser.DigestingParser;
-import org.apache.tika.parser.utils.CommonsDigester;
+import org.apache.tika.parser.digesting.CommonsDigester;
import org.apache.tika.util.ClassLoaderUtil;
import org.apache.tika.util.XMLDOMUtil;
import org.w3c.dom.Node;
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
----------------------------------------------------------------------
diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
index 314599e..a2b91c9 100644
--- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
+++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
@@ -101,7 +101,7 @@ import org.apache.tika.parser.ParserDecorator;
import org.apache.tika.parser.PasswordProvider;
import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.parser.html.BoilerpipeContentHandler;
-import org.apache.tika.parser.utils.CommonsDigester;
+import org.apache.tika.parser.digesting.CommonsDigester;
import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.ContentHandlerFactory;
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java
----------------------------------------------------------------------
diff --git a/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java b/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java
index 5ecc763..1bc9405 100644
--- a/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java
+++ b/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java
@@ -76,7 +76,7 @@ import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.parser.html.BoilerpipeContentHandler;
-import org.apache.tika.parser.utils.CommonsDigester;
+import org.apache.tika.parser.digesting.CommonsDigester;
import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.ContentHandlerDecorator;
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-app/src/test/java/org/apache/tika/config/TikaDetectorConfigTest.java
----------------------------------------------------------------------
diff --git a/tika-app/src/test/java/org/apache/tika/config/TikaDetectorConfigTest.java b/tika-app/src/test/java/org/apache/tika/config/TikaDetectorConfigTest.java
new file mode 100644
index 0000000..132475a
--- /dev/null
+++ b/tika-app/src/test/java/org/apache/tika/config/TikaDetectorConfigTest.java
@@ -0,0 +1,143 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.config;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+import org.apache.tika.detect.CompositeDetector;
+import org.apache.tika.detect.DefaultDetector;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.detect.EmptyDetector;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.mbox.OutlookPSTParser;
+import org.apache.tika.parser.microsoft.POIFSContainerDetector;
+import org.apache.tika.parser.pkg.ZipContainerDetector;
+import org.junit.Test;
+
+/**
+ * Junit test class for {@link TikaConfig}, which cover things
+ * that {@link TikaConfigTest} can't do due to a need for the
+ * full set of detectors
+ */
+public class TikaDetectorConfigTest extends AbstractTikaConfigTest {
+ @Test
+ public void testDetectorExcludeFromDefault() throws Exception {
+ TikaConfig config = getConfig("TIKA-1702-detector-blacklist.xml");
+ assertNotNull(config.getParser());
+ assertNotNull(config.getDetector());
+ CompositeDetector detector = (CompositeDetector)config.getDetector();
+
+ // Should be wrapping two detectors
+ assertEquals(2, detector.getDetectors().size());
+
+
+ // First should be DefaultDetector, second Empty, that order
+ assertEquals(DefaultDetector.class, detector.getDetectors().get(0).getClass());
+ assertEquals(EmptyDetector.class, detector.getDetectors().get(1).getClass());
+
+
+ // Get the DefaultDetector from the config
+ DefaultDetector confDetector = (DefaultDetector)detector.getDetectors().get(0);
+
+ // Get a fresh "default" DefaultParser
+ DefaultDetector normDetector = new DefaultDetector(config.getMimeRepository());
+
+
+ // The default one will offer the Zip and POIFS detectors
+ assertDetectors(normDetector, true, true);
+
+
+ // The one from the config won't, as we excluded those
+ assertDetectors(confDetector, false, false);
+ }
+
+ /**
+ * TIKA-1708 - If the Zip detector is disabled, either explicitly,
+ * or via giving a list of detectors that it isn't part of, ensure
+ * that detection of PST files still works
+ */
+ @Test
+ public void testPSTDetectionWithoutZipDetector() throws Exception {
+ // Check the one with an exclude
+ TikaConfig configWX = getConfig("TIKA-1708-detector-default.xml");
+ assertNotNull(configWX.getParser());
+ assertNotNull(configWX.getDetector());
+ CompositeDetector detectorWX = (CompositeDetector)configWX.getDetector();
+
+ // Check it has the POIFS one, but not the zip one
+ assertDetectors(detectorWX, true, false);
+
+
+ // Check the one with an explicit list
+ TikaConfig configCL = getConfig("TIKA-1708-detector-composite.xml");
+ assertNotNull(configCL.getParser());
+ assertNotNull(configCL.getDetector());
+ CompositeDetector detectorCL = (CompositeDetector)configCL.getDetector();
+ assertEquals(2, detectorCL.getDetectors().size());
+
+ // Check it also has the POIFS one, but not the zip one
+ assertDetectors(detectorCL, true, false);
+
+
+ // Check that both detectors have a mimetypes with entries
+ assertTrue("Not enough mime types: " + configWX.getMediaTypeRegistry().getTypes().size(),
+ configWX.getMediaTypeRegistry().getTypes().size() > 100);
+ assertTrue("Not enough mime types: " + configCL.getMediaTypeRegistry().getTypes().size(),
+ configCL.getMediaTypeRegistry().getTypes().size() > 100);
+
+
+ // Now check they detect PST files correctly
+ TikaInputStream stream = TikaInputStream.cast(
+ getTestDocumentAsStream("testPST.pst"));
+ assertEquals(
+ OutlookPSTParser.MS_OUTLOOK_PST_MIMETYPE,
+ detectorWX.detect(stream, new Metadata())
+ );
+ assertEquals(
+ OutlookPSTParser.MS_OUTLOOK_PST_MIMETYPE,
+ detectorCL.detect(stream, new Metadata())
+ );
+ }
+
+ private void assertDetectors(CompositeDetector detector, boolean shouldHavePOIFS,
+ boolean shouldHaveZip) {
+ boolean hasZip = false;
+ boolean hasPOIFS = false;
+ for (Detector d : detector.getDetectors()) {
+ if (d instanceof ZipContainerDetector) {
+ if (shouldHaveZip) {
+ hasZip = true;
+ } else {
+ fail("Shouldn't have the ZipContainerDetector from config");
+ }
+ }
+ if (d instanceof POIFSContainerDetector) {
+ if (shouldHavePOIFS) {
+ hasPOIFS = true;
+ } else {
+ fail("Shouldn't have the POIFSContainerDetector from config");
+ }
+ }
+ }
+ if (shouldHavePOIFS) assertTrue("Should have the POIFSContainerDetector", hasPOIFS);
+ if (shouldHaveZip) assertTrue("Should have the ZipContainerDetector", hasZip);
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-app/src/test/java/org/apache/tika/config/TikaParserConfigTest.java
----------------------------------------------------------------------
diff --git a/tika-app/src/test/java/org/apache/tika/config/TikaParserConfigTest.java b/tika-app/src/test/java/org/apache/tika/config/TikaParserConfigTest.java
new file mode 100644
index 0000000..817beb4
--- /dev/null
+++ b/tika-app/src/test/java/org/apache/tika/config/TikaParserConfigTest.java
@@ -0,0 +1,155 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.config;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+import java.util.List;
+
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.CompositeParser;
+import org.apache.tika.parser.DefaultParser;
+import org.apache.tika.parser.EmptyParser;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.ParserDecorator;
+import org.apache.tika.parser.executable.ExecutableParser;
+import org.apache.tika.parser.xml.XMLParser;
+import org.junit.Test;
+
+/**
+ * Junit test class for {@link TikaConfig}, which cover things
+ * that {@link TikaConfigTest} can't do due to a need for the
+ * full set of parsers
+ */
+public class TikaParserConfigTest extends AbstractTikaConfigTest {
+ @Test
+ public void testMimeExcludeInclude() throws Exception {
+ TikaConfig config = getConfig("TIKA-1558-blacklist.xml");
+ assertNotNull(config.getParser());
+ assertNotNull(config.getDetector());
+ Parser parser = config.getParser();
+
+ MediaType PDF = MediaType.application("pdf");
+ MediaType JPEG = MediaType.image("jpeg");
+
+
+ // Has two parsers
+ assertEquals(CompositeParser.class, parser.getClass());
+ CompositeParser cParser = (CompositeParser)parser;
+ assertEquals(2, cParser.getAllComponentParsers().size());
+
+ // Both are decorated
+ assertTrue(cParser.getAllComponentParsers().get(0) instanceof ParserDecorator);
+ assertTrue(cParser.getAllComponentParsers().get(1) instanceof ParserDecorator);
+ ParserDecorator p0 = (ParserDecorator)cParser.getAllComponentParsers().get(0);
+ ParserDecorator p1 = (ParserDecorator)cParser.getAllComponentParsers().get(1);
+
+
+ // DefaultParser will be wrapped with excludes
+ assertEquals(DefaultParser.class, p0.getWrappedParser().getClass());
+
+ assertNotContained(PDF, p0.getSupportedTypes(context));
+ assertContains(PDF, p0.getWrappedParser().getSupportedTypes(context));
+ assertNotContained(JPEG, p0.getSupportedTypes(context));
+ assertContains(JPEG, p0.getWrappedParser().getSupportedTypes(context));
+
+
+ // Will have an empty parser for PDF
+ assertEquals(EmptyParser.class, p1.getWrappedParser().getClass());
+ assertEquals(1, p1.getSupportedTypes(context).size());
+ assertContains(PDF, p1.getSupportedTypes(context));
+ assertNotContained(PDF, p1.getWrappedParser().getSupportedTypes(context));
+ }
+
+ @Test
+ public void testParserExcludeFromDefault() throws Exception {
+ TikaConfig config = getConfig("TIKA-1558-blacklist.xml");
+ assertNotNull(config.getParser());
+ assertNotNull(config.getDetector());
+ CompositeParser parser = (CompositeParser)config.getParser();
+
+ MediaType PE_EXE = MediaType.application("x-msdownload");
+ MediaType ELF = MediaType.application("x-elf");
+
+
+ // Get the DefaultParser from the config
+ ParserDecorator confWrappedParser = (ParserDecorator)parser.getParsers().get(MediaType.APPLICATION_XML);
+ assertNotNull(confWrappedParser);
+ DefaultParser confParser = (DefaultParser)confWrappedParser.getWrappedParser();
+
+ // Get a fresh "default" DefaultParser
+ DefaultParser normParser = new DefaultParser(config.getMediaTypeRegistry());
+
+
+ // The default one will offer the Executable Parser
+ assertContains(PE_EXE, normParser.getSupportedTypes(context));
+ assertContains(ELF, normParser.getSupportedTypes(context));
+
+ boolean hasExec = false;
+ for (Parser p : normParser.getParsers().values()) {
+ if (p instanceof ExecutableParser) {
+ hasExec = true;
+ break;
+ }
+ }
+ assertTrue(hasExec);
+
+
+ // The one from the config won't
+ assertNotContained(PE_EXE, confParser.getSupportedTypes(context));
+ assertNotContained(ELF, confParser.getSupportedTypes(context));
+
+ for (Parser p : confParser.getParsers().values()) {
+ if (p instanceof ExecutableParser)
+ fail("Shouldn't have the Executable Parser from config");
+ }
+ }
+ /**
+ * TIKA-1558 It should be possible to exclude Parsers from being picked up by
+ * DefaultParser.
+ */
+ @Test
+ public void defaultParserBlacklist() throws Exception {
+ TikaConfig config = new TikaConfig();
+ assertNotNull(config.getParser());
+ assertNotNull(config.getDetector());
+ CompositeParser cp = (CompositeParser) config.getParser();
+ List<Parser> parsers = cp.getAllComponentParsers();
+
+ boolean hasXML = false;
+ for (Parser p : parsers) {
+ if (p instanceof XMLParser) {
+ hasXML = true;
+ break;
+ }
+ }
+ assertTrue("Default config should include an XMLParser.", hasXML);
+
+ // This custom TikaConfig should exclude XMLParser and all of its subclasses.
+ config = getConfig("TIKA-1558-blacklistsub.xml");
+ cp = (CompositeParser) config.getParser();
+ parsers = cp.getAllComponentParsers();
+
+ for (Parser p : parsers) {
+ if (p instanceof XMLParser)
+ fail("Custom config should not include an XMLParser (" + p.getClass() + ").");
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-app/src/test/java/org/apache/tika/config/TikaTranslatorConfigTest.java
----------------------------------------------------------------------
diff --git a/tika-app/src/test/java/org/apache/tika/config/TikaTranslatorConfigTest.java b/tika-app/src/test/java/org/apache/tika/config/TikaTranslatorConfigTest.java
new file mode 100644
index 0000000..764bbe4
--- /dev/null
+++ b/tika-app/src/test/java/org/apache/tika/config/TikaTranslatorConfigTest.java
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.config;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.language.translate.DefaultTranslator;
+import org.apache.tika.language.translate.EmptyTranslator;
+import org.junit.Test;
+
+/**
+ * Junit test class for {@link TikaConfig}, which cover things
+ * that {@link TikaConfigTest} can't do due to a need for the
+ * full set of translators
+ */
+public class TikaTranslatorConfigTest extends AbstractTikaConfigTest {
+ @Test
+ public void testDefaultBehaviour() throws Exception {
+ TikaConfig config = TikaConfig.getDefaultConfig();
+ assertNotNull(config.getTranslator());
+ assertEquals(DefaultTranslator.class, config.getTranslator().getClass());
+ }
+
+ @Test
+ public void testRequestsDefault() throws Exception {
+ TikaConfig config = getConfig("TIKA-1702-translator-default.xml");
+ assertNotNull(config.getParser());
+ assertNotNull(config.getDetector());
+ assertNotNull(config.getTranslator());
+
+ assertEquals(DefaultTranslator.class, config.getTranslator().getClass());
+ }
+
+ @Test
+ public void testRequestsEmpty() throws Exception {
+ TikaConfig config = getConfig("TIKA-1702-translator-empty.xml");
+ assertNotNull(config.getParser());
+ assertNotNull(config.getDetector());
+ assertNotNull(config.getTranslator());
+
+ assertEquals(EmptyTranslator.class, config.getTranslator().getClass());
+ }
+
+ /**
+ * Currently, Translators don't support Composites, so
+ * if multiple translators are given, only the first wins
+ */
+ @Test
+ public void testRequestsMultiple() throws Exception {
+ TikaConfig config = getConfig("TIKA-1702-translator-empty-default.xml");
+ assertNotNull(config.getParser());
+ assertNotNull(config.getDetector());
+ assertNotNull(config.getTranslator());
+
+ assertEquals(EmptyTranslator.class, config.getTranslator().getClass());
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-app/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
----------------------------------------------------------------------
diff --git a/tika-app/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java b/tika-app/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
new file mode 100644
index 0000000..5787408
--- /dev/null
+++ b/tika-app/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
@@ -0,0 +1,410 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
+
+import java.io.File;
+import java.io.FilenameFilter;
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.mime.MimeTypes;
+import org.junit.Test;
+
+/**
+ * Junit test class for {@link ContainerAwareDetector}
+ */
+public class TestContainerAwareDetector {
+ private final TikaConfig tikaConfig = TikaConfig.getDefaultConfig();
+ private final MimeTypes mimeTypes = tikaConfig.getMimeRepository();
+ private final Detector detector = new DefaultDetector(mimeTypes);
+
+ private void assertTypeByData(String file, String type) throws Exception {
+ assertTypeByNameAndData(file, null, type);
+ }
+ private void assertTypeByNameAndData(String file, String type) throws Exception {
+ assertTypeByNameAndData(file, file, type);
+ }
+ private void assertType(String file, String byData, String byNameAndData) throws Exception {
+ assertTypeByData(file, byData);
+ assertTypeByNameAndData(file, byNameAndData);
+ }
+ private void assertTypeByNameAndData(String dataFile, String name, String type) throws Exception {
+ assertTypeByNameAndData(dataFile, name, type, null);
+ }
+ private void assertTypeByNameAndData(String dataFile, String name, String typeFromDetector, String typeFromMagic) throws Exception {
+ try (TikaInputStream stream = TikaInputStream.get(
+ TestContainerAwareDetector.class.getResource("/test-documents/" + dataFile))) {
+ Metadata m = new Metadata();
+ if (name != null)
+ m.add(Metadata.RESOURCE_NAME_KEY, name);
+
+ // Mime Magic version is likely to be less precise
+ if (typeFromMagic != null) {
+ assertEquals(
+ MediaType.parse(typeFromMagic),
+ mimeTypes.detect(stream, m));
+ }
+
+ // All being well, the detector should get it perfect
+ assertEquals(
+ MediaType.parse(typeFromDetector),
+ detector.detect(stream, m));
+ }
+ }
+
+ @Test
+ public void testDetectOLE2() throws Exception {
+ // Microsoft office types known by POI
+ assertTypeByData("testEXCEL.xls", "application/vnd.ms-excel");
+ assertTypeByData("testWORD.doc", "application/msword");
+ assertTypeByData("testPPT.ppt", "application/vnd.ms-powerpoint");
+
+ assertTypeByData("test-outlook.msg", "application/vnd.ms-outlook");
+ assertTypeByData("test-outlook2003.msg", "application/vnd.ms-outlook");
+ assertTypeByData("testVISIO.vsd", "application/vnd.visio");
+ assertTypeByData("testPUBLISHER.pub", "application/x-mspublisher");
+ assertTypeByData("testWORKS.wps", "application/vnd.ms-works");
+ assertTypeByData("testWORKS2000.wps", "application/vnd.ms-works");
+
+ // older Works Word Processor files can't be recognized
+ // they were created with Works Word Processor 7.0 (hence the text inside)
+ // and exported to the older formats with the "Save As" feature
+ assertTypeByData("testWORKSWordProcessor3.0.wps","application/vnd.ms-works");
+ assertTypeByData("testWORKSWordProcessor4.0.wps","application/vnd.ms-works");
+ assertTypeByData("testWORKSSpreadsheet7.0.xlr", "application/x-tika-msworks-spreadsheet");
+ assertTypeByData("testPROJECT2003.mpp", "application/vnd.ms-project");
+ assertTypeByData("testPROJECT2007.mpp", "application/vnd.ms-project");
+
+ // Excel95 can be detected by not parsed
+ assertTypeByData("testEXCEL_95.xls", "application/vnd.ms-excel");
+
+ // Try some ones that POI doesn't handle, that are still OLE2 based
+ assertTypeByData("testCOREL.shw", "application/x-corelpresentations");
+ assertTypeByData("testQUATTRO.qpw", "application/x-quattro-pro");
+ assertTypeByData("testQUATTRO.wb3", "application/x-quattro-pro");
+
+ assertTypeByData("testHWP_5.0.hwp", "application/x-hwp-v5");
+
+
+ // With the filename and data
+ assertTypeByNameAndData("testEXCEL.xls", "application/vnd.ms-excel");
+ assertTypeByNameAndData("testWORD.doc", "application/msword");
+ assertTypeByNameAndData("testPPT.ppt", "application/vnd.ms-powerpoint");
+
+ // With the wrong filename supplied, data will trump filename
+ assertTypeByNameAndData("testEXCEL.xls", "notWord.doc", "application/vnd.ms-excel");
+ assertTypeByNameAndData("testWORD.doc", "notExcel.xls", "application/msword");
+ assertTypeByNameAndData("testPPT.ppt", "notWord.doc", "application/vnd.ms-powerpoint");
+
+ // With a filename of a totally different type, data will trump filename
+ assertTypeByNameAndData("testEXCEL.xls", "notPDF.pdf", "application/vnd.ms-excel");
+ assertTypeByNameAndData("testEXCEL.xls", "notPNG.png", "application/vnd.ms-excel");
+ }
+
+ /**
+ * There is no way to distinguish "proper" StarOffice files from templates.
+ * All templates have the same extension but their actual type depends on
+ * the magic. Our current MimeTypes class doesn't allow us to use the same
+ * glob pattern in more than one mimetype.
+ *
+ * @throws Exception
+ */
+ @Test
+ public void testDetectStarOfficeFiles() throws Exception {
+ assertType("testStarOffice-5.2-calc.sdc",
+ "application/vnd.stardivision.calc",
+ "application/vnd.stardivision.calc");
+ assertType("testVORCalcTemplate.vor",
+ "application/vnd.stardivision.calc",
+ "application/vnd.stardivision.calc");
+ assertType("testStarOffice-5.2-draw.sda",
+ "application/vnd.stardivision.draw",
+ "application/vnd.stardivision.draw");
+ assertType("testVORDrawTemplate.vor",
+ "application/vnd.stardivision.draw",
+ "application/vnd.stardivision.draw");
+ assertType("testStarOffice-5.2-impress.sdd",
+ "application/vnd.stardivision.impress",
+ "application/vnd.stardivision.impress");
+ assertType("testVORImpressTemplate.vor",
+ "application/vnd.stardivision.impress",
+ "application/vnd.stardivision.impress");
+ assertType("testStarOffice-5.2-writer.sdw",
+ "application/vnd.stardivision.writer",
+ "application/vnd.stardivision.writer");
+ assertType("testVORWriterTemplate.vor",
+ "application/vnd.stardivision.writer",
+ "application/vnd.stardivision.writer");
+
+ }
+
+ @Test
+ public void testOpenContainer() throws Exception {
+ try (TikaInputStream stream = TikaInputStream.get(
+ TestContainerAwareDetector.class.getResource("/test-documents/testPPT.ppt"))) {
+ assertNull(stream.getOpenContainer());
+ assertEquals(
+ MediaType.parse("application/vnd.ms-powerpoint"),
+ detector.detect(stream, new Metadata()));
+ assertTrue(stream.getOpenContainer() instanceof NPOIFSFileSystem);
+ }
+ }
+
+ /**
+ * EPub uses a similar mimetype entry to OpenDocument for storing
+ * the mimetype within the parent zip file
+ */
+ @Test
+ public void testDetectEPub() throws Exception {
+ assertTypeByData("testEPUB.epub", "application/epub+zip");
+ assertTypeByData("testiBooks.ibooks", "application/x-ibooks+zip");
+ }
+
+ @Test
+ public void testDetectLotusNotesEml() throws Exception {
+ // Lotus .eml files aren't guaranteed to have any of the magic
+ // matches as the first line, but should have X-Notes-Item and Message-ID
+ assertTypeByData("testLotusEml.eml", "message/rfc822");
+ }
+
+ @Test
+ public void testDetectODF() throws Exception {
+ assertTypeByData("testODFwithOOo3.odt", "application/vnd.oasis.opendocument.text");
+ assertTypeByData("testOpenOffice2.odf", "application/vnd.oasis.opendocument.formula");
+ }
+
+ @Test
+ public void testDetectOOXML() throws Exception {
+ assertTypeByData("testEXCEL.xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
+ assertTypeByData("testWORD.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document");
+ assertTypeByData("testPPT.pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation");
+
+ // Check some of the less common OOXML types
+ assertTypeByData("testPPT.pptm", "application/vnd.ms-powerpoint.presentation.macroenabled.12");
+ assertTypeByData("testPPT.ppsx", "application/vnd.openxmlformats-officedocument.presentationml.slideshow");
+ assertTypeByData("testPPT.ppsm", "application/vnd.ms-powerpoint.slideshow.macroEnabled.12");
+ assertTypeByData("testDOTM.dotm", "application/vnd.ms-word.template.macroEnabled.12");
+ assertTypeByData("testEXCEL.strict.xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
+ assertTypeByData("testPPT.xps", "application/vnd.ms-xpsdocument");
+
+ assertTypeByData("testVISIO.vsdm", "application/vnd.ms-visio.drawing.macroenabled.12");
+ assertTypeByData("testVISIO.vsdx", "application/vnd.ms-visio.drawing");
+ assertTypeByData("testVISIO.vssm", "application/vnd.ms-visio.stencil.macroenabled.12");
+ assertTypeByData("testVISIO.vssx", "application/vnd.ms-visio.stencil");
+ assertTypeByData("testVISIO.vstm", "application/vnd.ms-visio.template.macroenabled.12");
+ assertTypeByData("testVISIO.vstx", "application/vnd.ms-visio.template");
+
+ // .xlsb is an OOXML file containing the binary parts, and not
+ // an OLE2 file as you might initially expect!
+ assertTypeByData("testEXCEL.xlsb", "application/vnd.ms-excel.sheet.binary.macroEnabled.12");
+
+ // With the filename and data
+ assertTypeByNameAndData("testEXCEL.xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
+ assertTypeByNameAndData("testWORD.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document");
+ assertTypeByNameAndData("testPPT.pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation");
+
+ // With the wrong filename supplied, data will trump filename
+ assertTypeByNameAndData("testEXCEL.xlsx", "notWord.docx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
+ assertTypeByNameAndData("testWORD.docx", "notExcel.xlsx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document");
+ assertTypeByNameAndData("testPPT.pptx", "notWord.docx", "application/vnd.openxmlformats-officedocument.presentationml.presentation");
+
+ // With an incorrect filename of a different container type, data trumps filename
+ assertTypeByNameAndData("testEXCEL.xlsx", "notOldExcel.xls", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
+ }
+
+ /**
+ * Password Protected OLE2 files are fairly straightforward to detect, as they
+ * have the same structure as regular OLE2 files. (Core streams may be encrypted
+ * however)
+ */
+ @Test
+ public void testDetectProtectedOLE2() throws Exception {
+ assertTypeByData("testEXCEL_protected_passtika.xls", "application/vnd.ms-excel");
+ assertTypeByData("testWORD_protected_passtika.doc", "application/msword");
+ assertTypeByData("testPPT_protected_passtika.ppt", "application/vnd.ms-powerpoint");
+ assertTypeByNameAndData("testEXCEL_protected_passtika.xls", "application/vnd.ms-excel");
+ assertTypeByNameAndData("testWORD_protected_passtika.doc", "application/msword");
+ assertTypeByNameAndData("testPPT_protected_passtika.ppt", "application/vnd.ms-powerpoint");
+ }
+
+ /**
+ * Password Protected OOXML files are much more tricky beasts to work with.
+ * They have a very different structure to regular OOXML files, and instead
+ * of being ZIP based they are actually an OLE2 file which contains the
+ * OOXML structure within an encrypted stream.
+ * This makes detecting them much harder...
+ */
+ @Test
+ public void testDetectProtectedOOXML() throws Exception {
+ // Encrypted Microsoft Office OOXML files have OLE magic but
+ // special streams, so we can tell they're Protected OOXML
+ assertTypeByData("testEXCEL_protected_passtika.xlsx",
+ "application/x-tika-ooxml-protected");
+ assertTypeByData("testWORD_protected_passtika.docx",
+ "application/x-tika-ooxml-protected");
+ assertTypeByData("testPPT_protected_passtika.pptx",
+ "application/x-tika-ooxml-protected");
+
+ // At the moment, we can't use the name to specialise
+ // See discussions on TIKA-790 for details
+ assertTypeByNameAndData("testEXCEL_protected_passtika.xlsx",
+ "application/x-tika-ooxml-protected");
+ assertTypeByNameAndData("testWORD_protected_passtika.docx",
+ "application/x-tika-ooxml-protected");
+ assertTypeByNameAndData("testPPT_protected_passtika.pptx",
+ "application/x-tika-ooxml-protected");
+ }
+
+ /**
+ * Check that temporary files created by Tika are removed after
+ * closing TikaInputStream.
+ */
+ @Test
+ public void testRemovalTempfiles() throws Exception {
+ assertRemovalTempfiles("testWORD.docx");
+ assertRemovalTempfiles("test-documents.zip");
+ }
+
+ private int countTemporaryFiles() {
+ return new File(System.getProperty("java.io.tmpdir")).listFiles(
+ new FilenameFilter() {
+ public boolean accept(File dir, String name) {
+ return name.startsWith("apache-tika-");
+ }
+ }).length;
+ }
+
+ private void assertRemovalTempfiles(String fileName) throws Exception {
+ int numberOfTempFiles = countTemporaryFiles();
+
+ try (TikaInputStream stream = TikaInputStream.get(
+ TestContainerAwareDetector.class.getResource("/test-documents/" + fileName))) {
+ detector.detect(stream, new Metadata());
+ }
+
+ assertEquals(numberOfTempFiles, countTemporaryFiles());
+ }
+
+ @Test
+ public void testDetectIWork() throws Exception {
+ assertTypeByData("testKeynote.key", "application/vnd.apple.keynote");
+ assertTypeByData("testNumbers.numbers", "application/vnd.apple.numbers");
+ assertTypeByData("testPages.pages", "application/vnd.apple.pages");
+ }
+
+ @Test
+ public void testDetectKMZ() throws Exception {
+ assertTypeByData("testKMZ.kmz", "application/vnd.google-earth.kmz");
+ }
+
+ @Test
+ public void testDetectIPA() throws Exception {
+ assertTypeByNameAndData("testIPA.ipa", "application/x-itunes-ipa");
+ assertTypeByData("testIPA.ipa", "application/x-itunes-ipa");
+ }
+
+ @Test
+ public void testASiC() throws Exception {
+ assertTypeByData("testASiCE.asice", "application/vnd.etsi.asic-e+zip");
+ assertTypeByData("testASiCS.asics", "application/vnd.etsi.asic-s+zip");
+ assertTypeByNameAndData("testASiCE.asice", "application/vnd.etsi.asic-e+zip");
+ assertTypeByNameAndData("testASiCS.asics", "application/vnd.etsi.asic-s+zip");
+ }
+
+ @Test
+ public void testDetectZip() throws Exception {
+ assertTypeByData("test-documents.zip", "application/zip");
+ assertTypeByData("test-zip-of-zip.zip", "application/zip");
+
+ // JAR based formats
+ assertTypeByData("testJAR.jar", "application/java-archive");
+ assertTypeByData("testWAR.war", "application/x-tika-java-web-archive");
+ assertTypeByData("testEAR.ear", "application/x-tika-java-enterprise-archive");
+ assertTypeByData("testAPK.apk", "application/vnd.android.package-archive");
+
+ // JAR with HTML files in it
+ assertTypeByNameAndData("testJAR_with_HTML.jar", "testJAR_with_HTML.jar",
+ "application/java-archive", "application/java-archive");
+ }
+
+ private TikaInputStream getTruncatedFile(String name, int n)
+ throws IOException {
+ try (InputStream input = TestContainerAwareDetector.class.getResourceAsStream(
+ "/test-documents/" + name)) {
+ byte[] bytes = new byte[n];
+ int m = 0;
+ while (m < bytes.length) {
+ int i = input.read(bytes, m, bytes.length - m);
+ if (i != -1) {
+ m += i;
+ } else {
+ throw new IOException("Unexpected end of stream");
+ }
+ }
+ return TikaInputStream.get(bytes);
+ }
+ }
+
+ @Test
+ public void testTruncatedFiles() throws Exception {
+ // First up a truncated OOXML (zip) file
+
+ // With only the data supplied, the best we can do is the container
+ Metadata m = new Metadata();
+ try (TikaInputStream xlsx = getTruncatedFile("testEXCEL.xlsx", 300)) {
+ assertEquals(
+ MediaType.application("x-tika-ooxml"),
+ detector.detect(xlsx, m));
+ }
+
+ // With truncated data + filename, we can use the filename to specialise
+ m = new Metadata();
+ m.add(Metadata.RESOURCE_NAME_KEY, "testEXCEL.xlsx");
+ try (TikaInputStream xlsx = getTruncatedFile("testEXCEL.xlsx", 300)) {
+ assertEquals(
+ MediaType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet"),
+ detector.detect(xlsx, m));
+ }
+
+ // Now a truncated OLE2 file
+ m = new Metadata();
+ try (TikaInputStream xls = getTruncatedFile("testEXCEL.xls", 400)) {
+ assertEquals(
+ MediaType.application("x-tika-msoffice"),
+ detector.detect(xls, m));
+ }
+
+ // Finally a truncated OLE2 file, with a filename available
+ m = new Metadata();
+ m.add(Metadata.RESOURCE_NAME_KEY, "testEXCEL.xls");
+ try (TikaInputStream xls = getTruncatedFile("testEXCEL.xls", 400)) {
+ assertEquals(
+ MediaType.application("vnd.ms-excel"),
+ detector.detect(xls, m));
+ }
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-app/src/test/java/org/apache/tika/embedder/ExternalEmbedderTest.java
----------------------------------------------------------------------
diff --git a/tika-app/src/test/java/org/apache/tika/embedder/ExternalEmbedderTest.java b/tika-app/src/test/java/org/apache/tika/embedder/ExternalEmbedderTest.java
new file mode 100644
index 0000000..45f68cc
--- /dev/null
+++ b/tika-app/src/test/java/org/apache/tika/embedder/ExternalEmbedderTest.java
@@ -0,0 +1,285 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.embedder;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStreamWriter;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardCopyOption;
+import java.text.DateFormat;
+import java.text.SimpleDateFormat;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.Locale;
+import java.util.Map;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.txt.TXTParser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Unit test for {@link ExternalEmbedder}s.
+ */
+public class ExternalEmbedderTest extends TikaTest {
+
+ static Path TMP_TEST_TXT;
+ protected static final DateFormat EXPECTED_METADATA_DATE_FORMATTER =
+ new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss", Locale.ROOT);
+ protected static final String DEFAULT_CHARSET = UTF_8.name();
+ private static final String COMMAND_METADATA_ARGUMENT_DESCRIPTION = "dc:description";
+ private static final String TEST_TXT_PATH = "test-documents/testTXT.txt";
+
+ private TemporaryResources tmp = new TemporaryResources();
+
+ @BeforeClass
+ public static void copyTestFile() throws Exception {
+ TMP_TEST_TXT = Files.createTempFile("tika-test", "");
+ Files.copy(TikaTest.class.getClassLoader().getResourceAsStream(TEST_TXT_PATH),
+ TMP_TEST_TXT, StandardCopyOption.REPLACE_EXISTING);
+ }
+
+ @AfterClass
+ public static void rmTestFile() throws Exception {
+ Files.delete(TMP_TEST_TXT);
+ }
+
+ /**
+ * Gets the expected returned metadata value for the given field
+ *
+ * @param fieldName
+ * @return a prefix added to the field name
+ */
+ protected String getExpectedMetadataValueString(String fieldName, Date timestamp) {
+ return this.getClass().getSimpleName() + " embedded " + fieldName +
+ " on " + EXPECTED_METADATA_DATE_FORMATTER.format(timestamp);
+ }
+
+ /**
+ * Gets the tika <code>Metadata</code> object containing data to be
+ * embedded.
+ *
+ * @return the populated tika metadata object
+ */
+ protected Metadata getMetadataToEmbed(Date timestamp) {
+ Metadata metadata = new Metadata();
+ metadata.add(TikaCoreProperties.DESCRIPTION,
+ getExpectedMetadataValueString(TikaCoreProperties.DESCRIPTION.toString(), timestamp));
+ return metadata;
+ }
+
+ /**
+ * Gets the <code>Embedder</code> to test.
+ *
+ * @return the embedder under test
+ */
+ protected Embedder getEmbedder() {
+ ExternalEmbedder embedder = new ExternalEmbedder();
+ Map<Property, String[]> metadataCommandArguments = new HashMap<Property, String[]>(1);
+ metadataCommandArguments.put(TikaCoreProperties.DESCRIPTION,
+ new String[] { COMMAND_METADATA_ARGUMENT_DESCRIPTION });
+ embedder.setMetadataCommandArguments(metadataCommandArguments);
+ return embedder;
+ }
+
+ /**
+ * Gets the source input stream through standard Java resource loaders
+ * before metadata has been embedded.
+ *
+ * @return a fresh input stream
+ */
+ protected InputStream getSourceStandardInputStream() {
+ return this.getClass().getResourceAsStream(TEST_TXT_PATH);
+ }
+
+ /**
+ * Gets the source input stream via {@link TikaInputStream}
+ * before metadata has been embedded.
+ *
+ * @return a fresh input stream
+ * @throws FileNotFoundException
+ */
+ protected InputStream getSourceTikaInputStream() throws IOException {
+ return TikaInputStream.get(TMP_TEST_TXT);
+ }
+
+ /**
+ * Gets the parser to use to verify the result of the embed operation.
+ *
+ * @return the parser to read embedded metadata
+ */
+ protected Parser getParser() {
+ return new TXTParser();
+ }
+
+ /**
+ * Whether or not the final result of reading the now embedded metadata is
+ * expected in the output of the external tool
+ *
+ * @return whether or not results are expected in command line output
+ */
+ protected boolean getIsMetadataExpectedInOutput() {
+ return true;
+ }
+
+ /**
+ * Tests embedding metadata then reading metadata to verify the results.
+ *
+ * @param isResultExpectedInOutput whether or not results are expected in command line output
+ */
+ protected void embedInTempFile(InputStream sourceInputStream, boolean isResultExpectedInOutput) {
+ Embedder embedder = getEmbedder();
+
+ // TODO Move this check to ExternalEmbedder
+ String os = System.getProperty("os.name", "");
+ if (os.contains("Windows")) {
+ // Skip test on Windows
+ return;
+ }
+
+ Date timestamp = new Date();
+ Metadata metadataToEmbed = getMetadataToEmbed(timestamp);
+
+ try {
+ File tempOutputFile = tmp.createTemporaryFile();
+ FileOutputStream tempFileOutputStream = new FileOutputStream(tempOutputFile);
+
+ // Embed the metadata into a copy of the original output stream
+ embedder.embed(metadataToEmbed, sourceInputStream, tempFileOutputStream, null);
+
+ ParseContext context = new ParseContext();
+ Parser parser = getParser();
+ context.set(Parser.class, parser);
+
+ // Setup the extracting content handler
+ ByteArrayOutputStream result = new ByteArrayOutputStream();
+ OutputStreamWriter outputWriter = new OutputStreamWriter(result,DEFAULT_CHARSET);
+ ContentHandler handler = new BodyContentHandler(outputWriter);
+
+ // Create a new metadata object to read the new metadata into
+ Metadata embeddedMetadata = new Metadata();
+
+ // Setup a re-read of the now embeded temp file
+ FileInputStream embeddedFileInputStream = new FileInputStream(tempOutputFile);
+
+ parser.parse(embeddedFileInputStream, handler, embeddedMetadata,
+ context);
+
+ tmp.dispose();
+
+ String outputString = null;
+ if (isResultExpectedInOutput) {
+ outputString = result.toString(DEFAULT_CHARSET);
+ } else {
+ assertTrue("no metadata found", embeddedMetadata.size() > 0);
+ }
+
+ // Check each metadata property for the expected value
+ for (String metadataName : metadataToEmbed.names()) {
+ if (metadataToEmbed.get(metadataName) != null) {
+ String expectedValue = metadataToEmbed.get(metadataName);
+ boolean foundExpectedValue = false;
+ if (isResultExpectedInOutput) {
+ // just check that the entire output contains the expected string
+ foundExpectedValue = outputString.contains(expectedValue);
+ } else {
+ if (embeddedMetadata.isMultiValued(metadataName)) {
+ for (String embeddedValue : embeddedMetadata.getValues(metadataName)) {
+ if (embeddedValue != null) {
+ if (embeddedValue.contains(expectedValue)) {
+ foundExpectedValue = true;
+ break;
+ }
+ }
+ }
+ } else {
+ String embeddedValue = embeddedMetadata.get(metadataName);
+ assertNotNull("expected metadata for "
+ + metadataName + " not found",
+ embeddedValue);
+ foundExpectedValue = embeddedValue.contains(expectedValue);
+ }
+ }
+ assertTrue(
+ "result did not contain expected appended metadata "
+ + metadataName + "="
+ + expectedValue,
+ foundExpectedValue);
+ }
+ }
+ } catch (IOException e) {
+ fail(e.getMessage());
+ } catch (TikaException e) {
+ fail(e.getMessage());
+ } catch (SAXException e) {
+ fail(e.getMessage());
+ }
+ }
+
+ protected void checkSourceFileExists() {
+ String message = "the original input file was deleted";
+ assertNotNull(message, TMP_TEST_TXT);
+ assertTrue(message, Files.isRegularFile(TMP_TEST_TXT));
+ }
+
+ /**
+ * Tests embedding using an input stream obtained via {@link ExternalEmbedderTest#getSourceStandardInputStream()}
+ *
+ * @throws IOException
+ */
+ @Test
+ public void testEmbedStandardInputStream() throws IOException {
+ embedInTempFile(getSourceStandardInputStream(), getIsMetadataExpectedInOutput());
+ checkSourceFileExists();
+ }
+
+ /**
+ * Tests embedding using an input stream obtained via {@link ExternalEmbedderTest#getSourceTikaInputStream()}
+ *
+ * @throws IOException
+ */
+ @Test
+ public void testEmbedTikaInputStream() throws IOException {
+ embedInTempFile(getSourceTikaInputStream(), getIsMetadataExpectedInOutput());
+ checkSourceFileExists();
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-app/src/test/java/org/apache/tika/mime/MimeTypeTest.java
----------------------------------------------------------------------
diff --git a/tika-app/src/test/java/org/apache/tika/mime/MimeTypeTest.java b/tika-app/src/test/java/org/apache/tika/mime/MimeTypeTest.java
new file mode 100644
index 0000000..447042b
--- /dev/null
+++ b/tika-app/src/test/java/org/apache/tika/mime/MimeTypeTest.java
@@ -0,0 +1,108 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.mime;
+
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+import org.apache.tika.mime.MimeType;
+import org.apache.tika.mime.MimeTypeException;
+import org.apache.tika.mime.MimeTypes;
+import org.junit.Before;
+import org.junit.Test;
+
+public class MimeTypeTest {
+
+ private MimeTypes types;
+ private MimeType text;
+
+ @Before
+ public void setUp() throws MimeTypeException {
+ types = new MimeTypes();
+ text = types.forName("text/plain");
+ }
+
+ /** Test MimeType constructor */
+ @Test
+ public void testConstrctor() {
+ // Missing name
+ try {
+ new MimeType(null);
+ fail("Expected IllegalArgumentException");
+ } catch (IllegalArgumentException e) {
+ // expected result
+ }
+ }
+
+ @Test
+ public void testIsValidName() {
+ assertTrue(MimeType.isValid("application/octet-stream"));
+ assertTrue(MimeType.isValid("text/plain"));
+ assertTrue(MimeType.isValid("foo/bar"));
+ assertTrue(MimeType.isValid("a/b"));
+
+ assertFalse(MimeType.isValid("application"));
+ assertFalse(MimeType.isValid("application/"));
+ assertFalse(MimeType.isValid("/"));
+ assertFalse(MimeType.isValid("/octet-stream"));
+ assertFalse(MimeType.isValid("application//octet-stream"));
+ assertFalse(MimeType.isValid("application/octet=stream"));
+ assertFalse(MimeType.isValid("application/\u00f6ctet-stream"));
+ assertFalse(MimeType.isValid("text/plain;"));
+ assertFalse(MimeType.isValid("text/plain; charset=UTF-8"));
+ try {
+ MimeType.isValid(null);
+ fail("Expected IllegalArgumentException");
+ } catch (IllegalArgumentException e) {
+ // expected result
+ }
+ }
+
+ /** Test MimeType setDescription() */
+ @Test
+ public void testSetEmptyValues() {
+ try {
+ text.setDescription(null);
+ fail("Expected IllegalArgumentException");
+ } catch (IllegalArgumentException e) {
+ // expected result
+ }
+
+ try {
+ text.setAcronym(null);
+ fail("Expected IllegalArgumentException");
+ } catch (IllegalArgumentException e) {
+ // expected result
+ }
+
+ try {
+ text.addLink(null);
+ fail("Expected IllegalArgumentException");
+ } catch (IllegalArgumentException e) {
+ // expected result
+ }
+
+ try {
+ text.setUniformTypeIdentifier(null);
+ fail("Expected IllegalArgumentException");
+ } catch (IllegalArgumentException e) {
+ // expected result
+ }
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-app/src/test/java/org/apache/tika/mime/MimeTypesTest.java
----------------------------------------------------------------------
diff --git a/tika-app/src/test/java/org/apache/tika/mime/MimeTypesTest.java b/tika-app/src/test/java/org/apache/tika/mime/MimeTypesTest.java
new file mode 100644
index 0000000..be8a575
--- /dev/null
+++ b/tika-app/src/test/java/org/apache/tika/mime/MimeTypesTest.java
@@ -0,0 +1,122 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.mime;
+
+import static org.apache.tika.mime.MediaType.OCTET_STREAM;
+import static org.apache.tika.mime.MediaType.TEXT_PLAIN;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+import org.junit.Before;
+import org.junit.Test;
+
+public class MimeTypesTest {
+
+ private MimeTypes types;
+
+ private MediaTypeRegistry registry;
+
+ private MimeType binary;
+
+ private MimeType text;
+
+ private MimeType html;
+
+ @Before
+ public void setUp() throws MimeTypeException {
+ types = new MimeTypes();
+ registry = types.getMediaTypeRegistry();
+ binary = types.forName("application/octet-stream");
+ text = types.forName("text/plain");
+ types.addAlias(text, MediaType.parse("text/x-plain"));
+ html = types.forName("text/html");
+ types.setSuperType(html, TEXT_PLAIN);
+ }
+
+ @Test
+ public void testForName() throws MimeTypeException {
+ assertEquals(text, types.forName("text/plain"));
+ assertEquals(text, types.forName("TEXT/PLAIN"));
+
+ try {
+ types.forName("invalid");
+ fail("MimeTypeException not thrown on invalid type name");
+ } catch (MimeTypeException e) {
+ // expected
+ }
+ }
+
+ @Test
+ public void testRegisteredMimes() throws MimeTypeException {
+ String dummy = "text/xxxxx";
+ assertEquals(text, types.getRegisteredMimeType("text/plain"));
+ assertNull(types.getRegisteredMimeType(dummy));
+ assertNotNull(types.forName(dummy));
+ assertEquals(dummy, types.forName("text/xxxxx").getType().toString());
+ assertEquals(dummy, types.getRegisteredMimeType("text/xxxxx").getType().toString());
+
+ try {
+ types.forName("invalid");
+ fail("MimeTypeException not thrown on invalid type name");
+ } catch (MimeTypeException e) {
+ // expected
+ }
+ }
+
+ @Test
+ public void testSuperType() throws MimeTypeException {
+ assertNull(registry.getSupertype(OCTET_STREAM));
+ assertEquals(OCTET_STREAM, registry.getSupertype(TEXT_PLAIN));
+ assertEquals(TEXT_PLAIN, registry.getSupertype(html.getType()));
+ }
+
+ @Test
+ public void testIsDescendantOf() {
+ assertFalse(registry.isSpecializationOf(OCTET_STREAM, OCTET_STREAM));
+ assertFalse(registry.isSpecializationOf(TEXT_PLAIN, TEXT_PLAIN));
+ assertFalse(registry.isSpecializationOf(html.getType(), html.getType()));
+
+ assertTrue(registry.isSpecializationOf(html.getType(), OCTET_STREAM));
+ assertFalse(registry.isSpecializationOf(OCTET_STREAM, html.getType()));
+
+ assertTrue(registry.isSpecializationOf(html.getType(), TEXT_PLAIN));
+ assertFalse(registry.isSpecializationOf(TEXT_PLAIN, html.getType()));
+
+ assertTrue(registry.isSpecializationOf(TEXT_PLAIN, OCTET_STREAM));
+ assertFalse(registry.isSpecializationOf(OCTET_STREAM, TEXT_PLAIN));
+ }
+
+ @Test
+ public void testCompareTo() {
+ assertTrue(binary.compareTo(binary) == 0);
+ assertTrue(binary.compareTo(text) != 0);
+ assertTrue(binary.compareTo(html) != 0);
+
+ assertTrue(text.compareTo(binary) != 0);
+ assertTrue(text.compareTo(text) == 0);
+ assertTrue(text.compareTo(html) != 0);
+
+ assertTrue(html.compareTo(binary) != 0);
+ assertTrue(html.compareTo(text) != 0);
+ assertTrue(html.compareTo(html) == 0);
+ }
+
+}
[02/13] tika git commit: TIKA-1855 -- first pass. Need to turn back
on the forbidden-apis testCheck. More clean up remains.
Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-test-resources/src/test/resources/test-documents/brwNIMS_2014.dif
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/brwNIMS_2014.dif b/tika-test-resources/src/test/resources/test-documents/brwNIMS_2014.dif
new file mode 100644
index 0000000..e131add
--- /dev/null
+++ b/tika-test-resources/src/test/resources/test-documents/brwNIMS_2014.dif
@@ -0,0 +1,56 @@
+<?xml version="1.0" encoding="UTF-8"?>
+ <DIF xmlns="http://gcmd.gsfc.nasa.gov/Aboutus/xml/dif/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://gcmd.gsfc.nasa.gov/Aboutus/xml/dif/ http://gcmd.gsfc.nasa.gov/Aboutus/xml/dif/dif_v9.8.4.xsd">
+ <Entry_ID>02a6301c-3ab3-11e4-8ee7-00c0f03d5b7c</Entry_ID>
+ <Entry_Title>Barrow Logger Data NIMS 2014</Entry_Title>
+
+ <Parameters>
+ <Category>EARTH SCIENCE</Category>
+ <Topic>BIOSPHERE</Topic>
+ <Term>ECOLOGICAL DYNAMICS</Term>
+ </Parameters>
+
+
+ <Spatial_Coverage>
+ <Southernmost_Latitude>70</Southernmost_Latitude>
+ <Northernmost_Latitude>72</Northernmost_Latitude>
+ <Westernmost_Longitude>-162</Westernmost_Longitude>
+ <Easternmost_Longitude>-150</Easternmost_Longitude>
+ </Spatial_Coverage>
+
+ <Data_Center>
+ <Data_Center_Name>
+ <Short_Name>ACADIS</Short_Name>
+ <Long_Name>Advanced Cooperative Arctic Data and Information Service</Long_Name>
+ </Data_Center_Name>
+ <Data_Center_URL>http://www.aoncadis.org/</Data_Center_URL>
+ <Personnel>
+ <Role>DATA CENTER CONTACT</Role>
+ <First_Name>ACADIS</First_Name>
+ <Last_Name>User Services</Last_Name>
+ <Contact_Address>
+ <Address>NCAR/CISL</Address>
+ <Address>P.O. Box 3000</Address>
+ <City>Boulder</City>
+ <Province_or_State>CO</Province_or_State>
+ <Postal_Code>80307</Postal_Code>
+ <Country>USA</Country>
+ </Contact_Address>
+ </Personnel>
+ </Data_Center>
+
+ <Summary>
+ <Abstract>Logger records from the Networked Info-mechanical Systems (NIMS), Transect length: ~50m The data was recorded using a CR3000 logger. The sensor trolley was equipped with instruments for recording the distance to vegetation canopy (SR50a Sonic Distance, Campbell Scientific), up- and downwelling short- and longwave radiation (CNR4 net radiometer, Kipp & Zonen), air temperature and surface temperature (SI-111 IR radiometer, Apogee Instruments Inc.) and spectral reflection (Jaz Combo-2, Ocean Optics; GreenSeeker RT100 (505), NTech).</Abstract>
+ </Summary>
+
+ <Related_URL>
+ <URL_Content_Type>
+ <Type>GET DATA</Type>
+ </URL_Content_Type>
+ <URL>http://www.aoncadis.org/dataset/id/02a6301c-3ab3-11e4-8ee7-00c0f03d5b7c.html</URL>
+ <Description>Data Center top-level access page for this resource</Description>
+ </Related_URL>
+
+ <Metadata_Name>ACADIS IDN DIF</Metadata_Name>
+ <Metadata_Version>9.8.4</Metadata_Version>
+ <Last_DIF_Revision_Date>2015-02-05</Last_DIF_Revision_Date>
+ </DIF>
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-test-resources/src/test/resources/test-documents/circles-with-prefix.svg
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/circles-with-prefix.svg b/tika-test-resources/src/test/resources/test-documents/circles-with-prefix.svg
new file mode 100644
index 0000000..d68ff55
--- /dev/null
+++ b/tika-test-resources/src/test/resources/test-documents/circles-with-prefix.svg
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<svg:svg xmlns:svg="http://www.w3.org/2000/svg" width="12cm" height="12cm">
+ <svg:g style="fill-opacity:0.7; stroke:black; stroke-width:0.1cm;">
+ <svg:circle cx="6cm" cy="2cm" r="100" style="fill:red;" transform="translate(0,50)" />
+ <svg:circle cx="6cm" cy="2cm" r="100" style="fill:blue;" transform="translate(70,150)" />
+ <svg:circle cx="6cm" cy="2cm" r="100" style="fill:green;" transform="translate(-70,150)"/>
+ </svg:g>
+</svg:svg>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-test-resources/src/test/resources/test-documents/circles.svg
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/circles.svg b/tika-test-resources/src/test/resources/test-documents/circles.svg
new file mode 100644
index 0000000..8b71e82
--- /dev/null
+++ b/tika-test-resources/src/test/resources/test-documents/circles.svg
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<svg xmlns="http://www.w3.org/2000/svg" width="12cm" height="12cm">
+ <g style="fill-opacity:0.7; stroke:black; stroke-width:0.1cm;">
+ <circle cx="6cm" cy="2cm" r="100" style="fill:red;" transform="translate(0,50)" />
+ <circle cx="6cm" cy="2cm" r="100" style="fill:blue;" transform="translate(70,150)" />
+ <circle cx="6cm" cy="2cm" r="100" style="fill:green;" transform="translate(-70,150)"/>
+ </g>
+</svg>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-test-resources/src/test/resources/test-documents/datamatrix.png
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/datamatrix.png b/tika-test-resources/src/test/resources/test-documents/datamatrix.png
new file mode 100644
index 0000000..4aa5003
Binary files /dev/null and b/tika-test-resources/src/test/resources/test-documents/datamatrix.png differ
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-test-resources/src/test/resources/test-documents/english.txt
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/english.txt b/tika-test-resources/src/test/resources/test-documents/english.txt
new file mode 100644
index 0000000..5e3d20e
--- /dev/null
+++ b/tika-test-resources/src/test/resources/test-documents/english.txt
@@ -0,0 +1 @@
+This is English!
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-test-resources/src/test/resources/test-documents/foo.csv
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/foo.csv b/tika-test-resources/src/test/resources/test-documents/foo.csv
new file mode 100644
index 0000000..0f48f3e
--- /dev/null
+++ b/tika-test-resources/src/test/resources/test-documents/foo.csv
@@ -0,0 +1,4 @@
+foo,bar,baz
+123,"abc def",-987
+456,"qwertyuiop",98765
+789,"qawsedrft",3.14159
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-test-resources/src/test/resources/test-documents/french.txt
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/french.txt b/tika-test-resources/src/test/resources/test-documents/french.txt
new file mode 100644
index 0000000..678e6c2
--- /dev/null
+++ b/tika-test-resources/src/test/resources/test-documents/french.txt
@@ -0,0 +1 @@
+c’est comme ci comme ça
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-test-resources/src/test/resources/test-documents/htmlfragment
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/htmlfragment b/tika-test-resources/src/test/resources/test-documents/htmlfragment
new file mode 100644
index 0000000..bf36d08
--- /dev/null
+++ b/tika-test-resources/src/test/resources/test-documents/htmlfragment
@@ -0,0 +1,18 @@
+<div id="leftcol">
+ <ul>
+ <li><a href="/mission/sec/sec.html"> Security and Information Sciences Home ›</a> </li>
+ <li><a href="/mission/sec/publications/-publications.html">Publications ›</a> </li>
+ <li><a href="/mission/sec/corpora/corpora.html">Corpora ›</a> </li>
+ <li><a href="/mission/sec/softwaretools/tools.html">Software Tools ›</a></li>
+ <li><a href="/mission/sec/CSO/CSO.html"> Systems and Operations ›</a>
+ <ul>
+ <li><a href="/mission/sec/publications/-publications.html">Publications ›</a></li>
+ <li><a href="/mission/sec/CSO/biographies/CSObios.html">Biographies ›</a></li>
+ </ul>
+ </li>
+ <li><a href="/mission/sec/CST/CST.html"> Systems and Technology ›</a> </li>
+ <li><a href="/mission/sec/CSA/CSA.html"> System Assessments ›</a> </li>
+ <li><a href="/mission/sec/HLT/HLT.html">Human Language Technology ›</a>
+<li><a href="/mission/sec/computing/computing.html">Computing and Analytics ›</a></li>
+ </ul>
+</div>
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-test-resources/src/test/resources/test-documents/mock/null_pointer.xml
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/mock/null_pointer.xml b/tika-test-resources/src/test/resources/test-documents/mock/null_pointer.xml
index 4561c3a..f4f857a 100644
--- a/tika-test-resources/src/test/resources/test-documents/mock/null_pointer.xml
+++ b/tika-test-resources/src/test/resources/test-documents/mock/null_pointer.xml
@@ -21,5 +21,5 @@
<mock>
<metadata action="add" name="author">Nikolai Lobachevsky</metadata>
<write element="p">some content</write>
- <throw class="java.lang.NullPointerException">another null pointer exception</throw>
-</mock>
\ No newline at end of file
+ <throw class="java.lang.NullPointerException">null pointer message</throw>
+</mock>
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-test-resources/src/test/resources/test-documents/password.xls
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/password.xls b/tika-test-resources/src/test/resources/test-documents/password.xls
new file mode 100644
index 0000000..a6ad86a
Binary files /dev/null and b/tika-test-resources/src/test/resources/test-documents/password.xls differ
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-test-resources/src/test/resources/test-documents/pic.xls
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/pic.xls b/tika-test-resources/src/test/resources/test-documents/pic.xls
new file mode 100644
index 0000000..6798ae2
Binary files /dev/null and b/tika-test-resources/src/test/resources/test-documents/pic.xls differ
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-test-resources/src/test/resources/test-documents/pic.xlsx
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/pic.xlsx b/tika-test-resources/src/test/resources/test-documents/pic.xlsx
new file mode 100644
index 0000000..9cc155a
Binary files /dev/null and b/tika-test-resources/src/test/resources/test-documents/pic.xlsx differ
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-test-resources/src/test/resources/test-documents/plotutils-bin-cgm-v3.cgm
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/plotutils-bin-cgm-v3.cgm b/tika-test-resources/src/test/resources/test-documents/plotutils-bin-cgm-v3.cgm
new file mode 100644
index 0000000..450f5ad
Binary files /dev/null and b/tika-test-resources/src/test/resources/test-documents/plotutils-bin-cgm-v3.cgm differ
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-test-resources/src/test/resources/test-documents/stylesheet.xsl
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/stylesheet.xsl b/tika-test-resources/src/test/resources/test-documents/stylesheet.xsl
new file mode 100644
index 0000000..d704f07
--- /dev/null
+++ b/tika-test-resources/src/test/resources/test-documents/stylesheet.xsl
@@ -0,0 +1,9 @@
+<?xml version="1.0" encoding="utf-8"?>
+<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+
+ <xsl:output method="xml" indent="yes"/>
+
+ <xsl:template match="/">
+ <test hello="world"/>
+ </xsl:template>
+</xsl:stylesheet>
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-test-resources/src/test/resources/test-documents/test-difficult-rdf1.xml
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/test-difficult-rdf1.xml b/tika-test-resources/src/test/resources/test-documents/test-difficult-rdf1.xml
new file mode 100644
index 0000000..dc88dcf
--- /dev/null
+++ b/tika-test-resources/src/test/resources/test-documents/test-difficult-rdf1.xml
@@ -0,0 +1,39 @@
+<?xml version='1.0' encoding='ISO-8859-1'?>
+
+<!DOCTYPE uridef[
+ <!ENTITY rdf "http://www.w3.org/1999/02/22-rdf-syntax-ns">
+ <!ENTITY shadow-rdf "http://www.daml.org/services/owl-s/1.2/generic/ObjectList.owl">
+ <!ENTITY expr "http://www.daml.org/services/owl-s/1.2/generic/Expression.owl">
+ <!ENTITY rdfs "http://www.w3.org/2000/01/rdf-schema">
+ <!ENTITY owl "http://www.w3.org/2002/07/owl">
+ <!ENTITY xsd "http://www.w3.org/2001/XMLSchema">
+ <!ENTITY time "http://www.isi.edu/~hobbs/damltime/time-entry.owl">
+ <!ENTITY swrl "http://www.w3.org/2003/11/swrl">
+ <!ENTITY service "http://www.daml.org/services/owl-s/1.2/Service.owl">
+ <!ENTITY grounding "http://www.daml.org/services/owl-s/1.2/Grounding.owl">
+ <!ENTITY process "http://www.daml.org/services/owl-s/1.2/Process.owl">
+ <!ENTITY DEFAULT "http://www.daml.org/services/owl-s/1.2/Process.owl">
+]>
+
+
+<rdf:RDF
+ xmlns:rdf= "&rdf;#"
+ xmlns:shadow-rdf= "&shadow-rdf;#"
+ xmlns:expr= "&expr;#"
+ xmlns:rdfs= "&rdfs;#"
+ xmlns:owl= "&owl;#"
+ xmlns:swrl= "&swrl;#"
+ xmlns:xsd= "&xsd;#"
+ xmlns:service= "&service;#"
+ xmlns:process= "&process;#"
+ xmlns:grounding= "&grounding;#"
+ xmlns= "&DEFAULT;#"
+ xml:base="&process;">
+
+<!--
+ TIKA-309: Mime type application/rdf+xml not correctly detected
+ Simplified test case based on the OWL document at
+ http://www.ai.sri.com/daml/services/owl-s/1.2/Process.owl
+-->
+
+</rdf:RDF>
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-test-resources/src/test/resources/test-documents/test-difficult-rdf2.xml
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/test-difficult-rdf2.xml b/tika-test-resources/src/test/resources/test-documents/test-difficult-rdf2.xml
new file mode 100644
index 0000000..0f8fe28
--- /dev/null
+++ b/tika-test-resources/src/test/resources/test-documents/test-difficult-rdf2.xml
@@ -0,0 +1,44 @@
+<!-- This is the OWL 2 Namespace Document, sometimes
+ called the "owl.owl" file.
+
+ For some commentary about its creation, see
+ http://www.w3.org/2007/OWL/wiki/Owl2DotOwlDevel
+
+ This was created from the 16 Oct 2009 version of
+ that page, with the turtle-to-rdf/xml conversion
+ done by cwm, and the conversion to XML entity
+ references done by hand. The GRDDL triple and
+ namespace have also been added by hand
+
+ The real OWL 1 and OWL 2 namespace is:
+ http://www.w3.org/2002/07/owl#
+
+-->
+<!DOCTYPE rdf:RDF [
+
+<!ENTITY location "http://www.w3.org/2002/07/owl" >
+<!ENTITY rdf "http://www.w3.org/1999/02/22-rdf-syntax-ns#" >
+<!ENTITY rdfs "http://www.w3.org/2000/01/rdf-schema#" >
+<!ENTITY xsd "http://www.w3.org/2001/XMLSchema#" >
+<!ENTITY dc "http://purl.org/dc/elements/1.1/" >
+<!ENTITY grddl "http://www.w3.org/2003/g/data-view#" >
+<!ENTITY owl "&location;#" >
+
+]>
+<rdf:RDF
+ xml:base ="&location;"
+ xmlns:rdf ="&rdf;"
+ xmlns:rdfs="&rdfs;"
+ xmlns:xsd = "&xsd;"
+ xmlns:owl ="&owl;"
+ xmlns:dc = "&dc;"
+ xmlns:grddl = "&grddl;"
+ >
+
+<!--
+ TIKA-309: Mime type application/rdf+xml not correctly detected
+ Simplified test case based on the OWL 2 Namespace Document at
+ http://www.w3.org/2002/07/owl#
+-->
+
+</rdf:RDF>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-test-resources/src/test/resources/test-documents/test-iso-8859-1.xml
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/test-iso-8859-1.xml b/tika-test-resources/src/test/resources/test-documents/test-iso-8859-1.xml
new file mode 100644
index 0000000..7573369
--- /dev/null
+++ b/tika-test-resources/src/test/resources/test-documents/test-iso-8859-1.xml
@@ -0,0 +1,2 @@
+<?xml version="1.0" encoding="ISO-8859-1"?>
+<test hello="world"/>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-test-resources/src/test/resources/test-documents/test-long-comment.xml
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/test-long-comment.xml b/tika-test-resources/src/test/resources/test-documents/test-long-comment.xml
new file mode 100644
index 0000000..84844ec
--- /dev/null
+++ b/tika-test-resources/src/test/resources/test-documents/test-long-comment.xml
@@ -0,0 +1,21 @@
+<?xml version="1.0" encoding="ISO-8859-1"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+<?somepi blahblah test="ignore-me.xml" ?>
+<test hello="world"/>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-test-resources/src/test/resources/test-documents/test-tika-327.html
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/test-tika-327.html b/tika-test-resources/src/test/resources/test-documents/test-tika-327.html
new file mode 100644
index 0000000..792a18b
--- /dev/null
+++ b/tika-test-resources/src/test/resources/test-documents/test-tika-327.html
@@ -0,0 +1,50 @@
+<?xml version="1.0" encoding="iso-8859-1"?><link href="http://www.apache.org" rel="stylesheet" type="text/css" />
+<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
+<title>title</title>
+<meta name="description" content="content" />
+<meta name="keywords" content="keys" />
+<script language="JavaScript" type="text/javascript">
+<!--
+function hello() {
+}
+//-->
+
+
+</script>
+
+<!-- IE fix -->
+<style type="text/css">form { display: inline }</style>
+<!--
+comment
+-->
+</head>
+
+<body>
+<table>
+ <tr>
+ <td>
+ <table>
+ <tr>
+ <td><font class="title"><!--comment--><a href="index.php">image</a></font></td>
+ <td> <table>
+ <tr>
+ <td>
+ </td>
+ </tr>
+ </table></td>
+
+
+
+ </tr>
+ <tr>
+ <td>
+ <span class="class">Home </span> </span>
+ </td>
+ <td>
+ July 2, 2013 </td>
+ </tr>
+ </table></td>
+ </tr>
+</table>
+end of table
+</body>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-test-resources/src/test/resources/test-documents/test-utf16be.xml
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/test-utf16be.xml b/tika-test-resources/src/test/resources/test-documents/test-utf16be.xml
new file mode 100644
index 0000000..6835338
Binary files /dev/null and b/tika-test-resources/src/test/resources/test-documents/test-utf16be.xml differ
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-test-resources/src/test/resources/test-documents/test-utf16le.xml
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/test-utf16le.xml b/tika-test-resources/src/test/resources/test-documents/test-utf16le.xml
new file mode 100644
index 0000000..2a9124d
Binary files /dev/null and b/tika-test-resources/src/test/resources/test-documents/test-utf16le.xml differ
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-test-resources/src/test/resources/test-documents/test-utf8-bom.xml
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/test-utf8-bom.xml b/tika-test-resources/src/test/resources/test-documents/test-utf8-bom.xml
new file mode 100644
index 0000000..4cd4db3
--- /dev/null
+++ b/tika-test-resources/src/test/resources/test-documents/test-utf8-bom.xml
@@ -0,0 +1,2 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<test hello="world"/>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-test-resources/src/test/resources/test-documents/test-utf8.xml
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/test-utf8.xml b/tika-test-resources/src/test/resources/test-documents/test-utf8.xml
new file mode 100644
index 0000000..1304d8b
--- /dev/null
+++ b/tika-test-resources/src/test/resources/test-documents/test-utf8.xml
@@ -0,0 +1,2 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<test hello="world"/>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-test-resources/src/test/resources/test-documents/test.html
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/test.html b/tika-test-resources/src/test/resources/test-documents/test.html
new file mode 100644
index 0000000..763e237
--- /dev/null
+++ b/tika-test-resources/src/test/resources/test-documents/test.html
@@ -0,0 +1,10 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html>
+<head>
+<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1">
+<title>Hello World</title>
+</head>
+<body>
+ <p>Hello World!<p/>
+</body>
+</html>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-test-resources/src/test/resources/test-documents/test.xls
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/test.xls b/tika-test-resources/src/test/resources/test-documents/test.xls
new file mode 100644
index 0000000..347d8a6
Binary files /dev/null and b/tika-test-resources/src/test/resources/test-documents/test.xls differ
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-test-resources/src/test/resources/test-documents/testRTF_npeFromWMFInTikaServer.rtf
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/testRTF_npeFromWMFInTikaServer.rtf b/tika-test-resources/src/test/resources/test-documents/testRTF_npeFromWMFInTikaServer.rtf
new file mode 100644
index 0000000..a5870e5
--- /dev/null
+++ b/tika-test-resources/src/test/resources/test-documents/testRTF_npeFromWMFInTikaServer.rtf
@@ -0,0 +1,235 @@
+{\rtf1\adeflang1025\ansi\ansicpg1252\uc1\adeff0\deff0\stshfdbch0\stshfloch0\stshfhich0\stshfbi0\deflang2057\deflangfe2057\themelang2057\themelangfe0\themelangcs0{\fonttbl{\f0\fbidi \froman\fcharset0\fprq2{\*\panose 02020603050405020304}Times New Roman;}{\f1\fbidi \fswiss\fcharset0\fprq2{\*\panose 020b0604020202020204}Arial;}
+{\f34\fbidi \froman\fcharset0\fprq2{\*\panose 02040503050406030204}Cambria Math;}{\f171\fbidi \froman\fcharset0\fprq2{\*\panose 02040602050305030304}Book Antiqua;}
+{\f318\fbidi \fswiss\fcharset0\fprq2{\*\panose 020b0603030504020204}Humnst777 BT{\*\falt Lucida Sans Unicode};}{\flomajor\f31500\fbidi \froman\fcharset0\fprq2{\*\panose 02020603050405020304}Times New Roman;}
+{\fdbmajor\f31501\fbidi \froman\fcharset0\fprq2{\*\panose 02020603050405020304}Times New Roman;}{\fhimajor\f31502\fbidi \froman\fcharset0\fprq2{\*\panose 02040503050406030204}Cambria;}
+{\fbimajor\f31503\fbidi \froman\fcharset0\fprq2{\*\panose 02020603050405020304}Times New Roman;}{\flominor\f31504\fbidi \froman\fcharset0\fprq2{\*\panose 02020603050405020304}Times New Roman;}
+{\fdbminor\f31505\fbidi \froman\fcharset0\fprq2{\*\panose 02020603050405020304}Times New Roman;}{\fhiminor\f31506\fbidi \fswiss\fcharset0\fprq2{\*\panose 020f0502020204030204}Calibri;}
+{\fbiminor\f31507\fbidi \froman\fcharset0\fprq2{\*\panose 02020603050405020304}Times New Roman;}{\f319\fbidi \froman\fcharset238\fprq2 Times New Roman CE;}{\f320\fbidi \froman\fcharset204\fprq2 Times New Roman Cyr;}
+{\f322\fbidi \froman\fcharset161\fprq2 Times New Roman Greek;}{\f323\fbidi \froman\fcharset162\fprq2 Times New Roman Tur;}{\f324\fbidi \froman\fcharset177\fprq2 Times New Roman (Hebrew);}{\f325\fbidi \froman\fcharset178\fprq2 Times New Roman (Arabic);}
+{\f326\fbidi \froman\fcharset186\fprq2 Times New Roman Baltic;}{\f327\fbidi \froman\fcharset163\fprq2 Times New Roman (Vietnamese);}{\f329\fbidi \fswiss\fcharset238\fprq2 Arial CE;}{\f330\fbidi \fswiss\fcharset204\fprq2 Arial Cyr;}
+{\f332\fbidi \fswiss\fcharset161\fprq2 Arial Greek;}{\f333\fbidi \fswiss\fcharset162\fprq2 Arial Tur;}{\f334\fbidi \fswiss\fcharset177\fprq2 Arial (Hebrew);}{\f335\fbidi \fswiss\fcharset178\fprq2 Arial (Arabic);}
+{\f336\fbidi \fswiss\fcharset186\fprq2 Arial Baltic;}{\f337\fbidi \fswiss\fcharset163\fprq2 Arial (Vietnamese);}{\f659\fbidi \froman\fcharset238\fprq2 Cambria Math CE;}{\f660\fbidi \froman\fcharset204\fprq2 Cambria Math Cyr;}
+{\f662\fbidi \froman\fcharset161\fprq2 Cambria Math Greek;}{\f663\fbidi \froman\fcharset162\fprq2 Cambria Math Tur;}{\f666\fbidi \froman\fcharset186\fprq2 Cambria Math Baltic;}{\f667\fbidi \froman\fcharset163\fprq2 Cambria Math (Vietnamese);}
+{\f2029\fbidi \froman\fcharset238\fprq2 Book Antiqua CE;}{\f2030\fbidi \froman\fcharset204\fprq2 Book Antiqua Cyr;}{\f2032\fbidi \froman\fcharset161\fprq2 Book Antiqua Greek;}{\f2033\fbidi \froman\fcharset162\fprq2 Book Antiqua Tur;}
+{\f2036\fbidi \froman\fcharset186\fprq2 Book Antiqua Baltic;}{\flomajor\f31508\fbidi \froman\fcharset238\fprq2 Times New Roman CE;}{\flomajor\f31509\fbidi \froman\fcharset204\fprq2 Times New Roman Cyr;}
+{\flomajor\f31511\fbidi \froman\fcharset161\fprq2 Times New Roman Greek;}{\flomajor\f31512\fbidi \froman\fcharset162\fprq2 Times New Roman Tur;}{\flomajor\f31513\fbidi \froman\fcharset177\fprq2 Times New Roman (Hebrew);}
+{\flomajor\f31514\fbidi \froman\fcharset178\fprq2 Times New Roman (Arabic);}{\flomajor\f31515\fbidi \froman\fcharset186\fprq2 Times New Roman Baltic;}{\flomajor\f31516\fbidi \froman\fcharset163\fprq2 Times New Roman (Vietnamese);}
+{\fdbmajor\f31518\fbidi \froman\fcharset238\fprq2 Times New Roman CE;}{\fdbmajor\f31519\fbidi \froman\fcharset204\fprq2 Times New Roman Cyr;}{\fdbmajor\f31521\fbidi \froman\fcharset161\fprq2 Times New Roman Greek;}
+{\fdbmajor\f31522\fbidi \froman\fcharset162\fprq2 Times New Roman Tur;}{\fdbmajor\f31523\fbidi \froman\fcharset177\fprq2 Times New Roman (Hebrew);}{\fdbmajor\f31524\fbidi \froman\fcharset178\fprq2 Times New Roman (Arabic);}
+{\fdbmajor\f31525\fbidi \froman\fcharset186\fprq2 Times New Roman Baltic;}{\fdbmajor\f31526\fbidi \froman\fcharset163\fprq2 Times New Roman (Vietnamese);}{\fhimajor\f31528\fbidi \froman\fcharset238\fprq2 Cambria CE;}
+{\fhimajor\f31529\fbidi \froman\fcharset204\fprq2 Cambria Cyr;}{\fhimajor\f31531\fbidi \froman\fcharset161\fprq2 Cambria Greek;}{\fhimajor\f31532\fbidi \froman\fcharset162\fprq2 Cambria Tur;}
+{\fhimajor\f31535\fbidi \froman\fcharset186\fprq2 Cambria Baltic;}{\fhimajor\f31536\fbidi \froman\fcharset163\fprq2 Cambria (Vietnamese);}{\fbimajor\f31538\fbidi \froman\fcharset238\fprq2 Times New Roman CE;}
+{\fbimajor\f31539\fbidi \froman\fcharset204\fprq2 Times New Roman Cyr;}{\fbimajor\f31541\fbidi \froman\fcharset161\fprq2 Times New Roman Greek;}{\fbimajor\f31542\fbidi \froman\fcharset162\fprq2 Times New Roman Tur;}
+{\fbimajor\f31543\fbidi \froman\fcharset177\fprq2 Times New Roman (Hebrew);}{\fbimajor\f31544\fbidi \froman\fcharset178\fprq2 Times New Roman (Arabic);}{\fbimajor\f31545\fbidi \froman\fcharset186\fprq2 Times New Roman Baltic;}
+{\fbimajor\f31546\fbidi \froman\fcharset163\fprq2 Times New Roman (Vietnamese);}{\flominor\f31548\fbidi \froman\fcharset238\fprq2 Times New Roman CE;}{\flominor\f31549\fbidi \froman\fcharset204\fprq2 Times New Roman Cyr;}
+{\flominor\f31551\fbidi \froman\fcharset161\fprq2 Times New Roman Greek;}{\flominor\f31552\fbidi \froman\fcharset162\fprq2 Times New Roman Tur;}{\flominor\f31553\fbidi \froman\fcharset177\fprq2 Times New Roman (Hebrew);}
+{\flominor\f31554\fbidi \froman\fcharset178\fprq2 Times New Roman (Arabic);}{\flominor\f31555\fbidi \froman\fcharset186\fprq2 Times New Roman Baltic;}{\flominor\f31556\fbidi \froman\fcharset163\fprq2 Times New Roman (Vietnamese);}
+{\fdbminor\f31558\fbidi \froman\fcharset238\fprq2 Times New Roman CE;}{\fdbminor\f31559\fbidi \froman\fcharset204\fprq2 Times New Roman Cyr;}{\fdbminor\f31561\fbidi \froman\fcharset161\fprq2 Times New Roman Greek;}
+{\fdbminor\f31562\fbidi \froman\fcharset162\fprq2 Times New Roman Tur;}{\fdbminor\f31563\fbidi \froman\fcharset177\fprq2 Times New Roman (Hebrew);}{\fdbminor\f31564\fbidi \froman\fcharset178\fprq2 Times New Roman (Arabic);}
+{\fdbminor\f31565\fbidi \froman\fcharset186\fprq2 Times New Roman Baltic;}{\fdbminor\f31566\fbidi \froman\fcharset163\fprq2 Times New Roman (Vietnamese);}{\fhiminor\f31568\fbidi \fswiss\fcharset238\fprq2 Calibri CE;}
+{\fhiminor\f31569\fbidi \fswiss\fcharset204\fprq2 Calibri Cyr;}{\fhiminor\f31571\fbidi \fswiss\fcharset161\fprq2 Calibri Greek;}{\fhiminor\f31572\fbidi \fswiss\fcharset162\fprq2 Calibri Tur;}
+{\fhiminor\f31575\fbidi \fswiss\fcharset186\fprq2 Calibri Baltic;}{\fhiminor\f31576\fbidi \fswiss\fcharset163\fprq2 Calibri (Vietnamese);}{\fbiminor\f31578\fbidi \froman\fcharset238\fprq2 Times New Roman CE;}
+{\fbiminor\f31579\fbidi \froman\fcharset204\fprq2 Times New Roman Cyr;}{\fbiminor\f31581\fbidi \froman\fcharset161\fprq2 Times New Roman Greek;}{\fbiminor\f31582\fbidi \froman\fcharset162\fprq2 Times New Roman Tur;}
+{\fbiminor\f31583\fbidi \froman\fcharset177\fprq2 Times New Roman (Hebrew);}{\fbiminor\f31584\fbidi \froman\fcharset178\fprq2 Times New Roman (Arabic);}{\fbiminor\f31585\fbidi \froman\fcharset186\fprq2 Times New Roman Baltic;}
+{\fbiminor\f31586\fbidi \froman\fcharset163\fprq2 Times New Roman (Vietnamese);}}{\colortbl;\red0\green0\blue0;\red0\green0\blue255;\red0\green255\blue255;\red0\green255\blue0;\red255\green0\blue255;\red255\green0\blue0;\red255\green255\blue0;
+\red255\green255\blue255;\red0\green0\blue128;\red0\green128\blue128;\red0\green128\blue0;\red128\green0\blue128;\red128\green0\blue0;\red128\green128\blue0;\red128\green128\blue128;\red192\green192\blue192;}{\*\defchp \fs22 }{\*\defpap
+\ql \li0\ri0\sa200\sl276\slmult1\widctlpar\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 }\noqfpromote {\stylesheet{\ql \li0\ri0\widctlpar\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \rtlch\fcs1 \af0\afs24\alang1025
+\ltrch\fcs0 \fs24\lang2057\langfe1033\cgrid\langnp2057\langfenp1033 \snext0 \sqformat \spriority0 Normal;}{\s1\ql \li0\ri0\keepn\widctlpar\tx5670\tx8222\wrapdefault\faauto\outlinelevel0\rin0\lin0\itap0 \rtlch\fcs1 \ab\af1\afs24\alang1025 \ltrch\fcs0
+\b\f1\fs20\lang2057\langfe1033\cgrid\langnp2057\langfenp1033 \sbasedon0 \snext0 \slink15 \sqformat heading 1;}{\s2\ql \li0\ri0\keepn\widctlpar\wrapdefault\aspalpha\aspnum\faauto\outlinelevel1\adjustright\rin0\lin0\itap0 \rtlch\fcs1
+\ab\af1\afs28\alang1025 \ltrch\fcs0 \b\f1\fs28\lang2057\langfe2057\cgrid\langnp2057\langfenp2057 \sbasedon0 \snext0 \slink16 \sqformat heading 2;}{\s4\ql \li5670\ri0\keepn\widctlpar\tx5670\tx7371\wrapdefault\faauto\outlinelevel3\rin0\lin5670\itap0
+\rtlch\fcs1 \ab\af1\afs16\alang1025 \ltrch\fcs0 \b\f1\fs16\lang2057\langfe2057\cgrid\langnp2057\langfenp2057 \sbasedon0 \snext0 \slink17 \sqformat heading 4;}{\*\cs10 \additive \ssemihidden Default Paragraph Font;}{\*
+\ts11\tsrowd\trftsWidthB3\trpaddl108\trpaddr108\trpaddfl3\trpaddft3\trpaddfb3\trpaddfr3\tblind0\tblindtype3\tscellwidthfts0\tsvertalt\tsbrdrt\tsbrdrl\tsbrdrb\tsbrdrr\tsbrdrdgl\tsbrdrdgr\tsbrdrh\tsbrdrv \ql \li0\ri0\sa200\sl276\slmult1
+\widctlpar\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \rtlch\fcs1 \af0\afs22\alang1025 \ltrch\fcs0 \fs22\lang2057\langfe2057\cgrid\langnp2057\langfenp2057 \snext11 \ssemihidden \sunhideused \sqformat Normal Table;}{\*\cs15 \additive
+\rtlch\fcs1 \ab\af31503\afs32 \ltrch\fcs0 \b\fs32\lang0\langfe1033\kerning32\loch\f31502\hich\af31502\dbch\af31501\langnp0\langfenp1033 \sbasedon10 \slink1 \slocked \spriority9 Heading 1 Char;}{\*\cs16 \additive \rtlch\fcs1 \ab\ai\af31503\afs28
+\ltrch\fcs0 \b\i\fs28\lang0\langfe1033\loch\f31502\hich\af31502\dbch\af31501\langnp0\langfenp1033 \sbasedon10 \slink2 \slocked \ssemihidden \spriority9 Heading 2 Char;}{\*\cs17 \additive \rtlch\fcs1 \ab\af31507\afs28 \ltrch\fcs0
+\b\fs28\lang0\langfe1033\loch\f31506\hich\af31506\dbch\af31505\langnp0\langfenp1033 \sbasedon10 \slink4 \slocked \ssemihidden \spriority9 Heading 4 Char;}{\s18\ql \li0\ri0\widctlpar
+\tqc\tx4153\tqr\tx8306\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \rtlch\fcs1 \af0\afs20\alang1025 \ltrch\fcs0 \fs22\lang2057\langfe1033\cgrid\langnp2057\langfenp1033 \sbasedon0 \snext18 \slink19 header;}{\*\cs19 \additive \rtlch\fcs1
+\af0\afs24 \ltrch\fcs0 \fs24\lang0\langfe1033\langnp0\langfenp1033 \sbasedon10 \slink18 \slocked \ssemihidden Header Char;}{\s20\ql \li0\ri0\widctlpar\tx3402\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \rtlch\fcs1
+\ab\af1\afs24\alang1025 \ltrch\fcs0 \b\f1\fs20\lang2057\langfe1033\cgrid\langnp2057\langfenp1033 \sbasedon0 \snext20 \slink21 Body Text;}{\*\cs21 \additive \rtlch\fcs1 \af0\afs24 \ltrch\fcs0 \fs24\lang0\langfe1033\langnp0\langfenp1033
+\sbasedon10 \slink20 \slocked \ssemihidden Body Text Char;}{\s22\ql \li0\ri0\widctlpar\tqc\tx4153\tqr\tx8306\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \rtlch\fcs1 \af0\afs24\alang1025 \ltrch\fcs0
+\fs24\lang2057\langfe1033\cgrid\langnp2057\langfenp1033 \sbasedon0 \snext22 \slink23 \styrsid14506524 footer;}{\*\cs23 \additive \rtlch\fcs1 \af0\afs24 \ltrch\fcs0 \fs24\lang0\langfe1033\langnp0\langfenp1033 \sbasedon10 \slink22 \slocked \ssemihidden
+Footer Char;}}{\*\rsidtbl \rsid69694\rsid615335\rsid817088\rsid1394934\rsid1968554\rsid2362503\rsid2504751\rsid2508965\rsid3497332\rsid3954968\rsid4262707\rsid4459777\rsid4947815\rsid5249973\rsid5375126\rsid5768946\rsid6625584\rsid6695929\rsid7547824
+\rsid7568219\rsid7681002\rsid7756842\rsid8788056\rsid9179382\rsid9185548\rsid9589441\rsid9716173\rsid10108489\rsid10158374\rsid10170376\rsid10447577\rsid10506307\rsid10508481\rsid11937854\rsid12735407\rsid14506524\rsid15223573\rsid15351889\rsid15429861
+\rsid15800823\rsid16209942\rsid16329808\rsid16338741\rsid16531520}{\mmathPr\mmathFont34\mbrkBin0\mbrkBinSub0\msmallFrac0\mdispDef1\mlMargin0\mrMargin0\mdefJc1\mwrapIndent1440\mintLim0\mnaryLim1}{\info{\title Cardiff}{\author A Other}
+{\operator Ian Williams}{\creatim\yr2016\mo2\dy1\hr16\min12}{\revtim\yr2016\mo2\dy1\hr16\min12}{\version2}{\edmins2}{\nofpages1}{\nofwords6}{\nofchars37}{\*\company Cardiff}{\nofcharsws42}{\vern32773}}{\*\xmlnstbl {\xmlns1 http://schemas.microsoft.com/off
+ice/word/2003/wordml}}\paperw11906\paperh16838\margl851\margr851\margt567\margb794\gutter0\ltrsect
+\widowctrl\ftnbj\aenddoc\trackmoves1\trackformatting1\donotembedsysfont1\relyonvml0\donotembedlingdata0\grfdocevents0\validatexml1\showplaceholdtext0\ignoremixedcontent0\saveinvalidxml0\showxmlerrors1\noxlattoyen
+\expshrtn\noultrlspc\dntblnsbdb\nospaceforul\formshade\horzdoc\dgmargin\dghspace120\dgvspace180\dghorigin851\dgvorigin567\dghshow2\dgvshow1
+\jexpand\viewkind1\viewscale100\pgbrdrhead\pgbrdrfoot\splytwnine\ftnlytwnine\htmautsp\nolnhtadjtbl\useltbaln\alntblind\lytcalctblwd\lyttblrtgr\lnbrkrule\nojkernpunct\rsidroot15429861 \fet0{\*\wgrffmtfilter 013f}\ilfomacatclnup0{\*\template
+C:\\PMS\\DOCUMENT\\gplnew.dot}{\*\ftnsep \ltrpar \pard\plain \ltrpar\ql \li0\ri0\widctlpar\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \rtlch\fcs1 \af0\afs24\alang1025 \ltrch\fcs0 \fs24\lang2057\langfe1033\cgrid\langnp2057\langfenp1033
+{\rtlch\fcs1 \af0 \ltrch\fcs0 \insrsid5249973 \chftnsep
+\par }}{\*\ftnsepc \ltrpar \pard\plain \ltrpar\ql \li0\ri0\widctlpar\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \rtlch\fcs1 \af0\afs24\alang1025 \ltrch\fcs0 \fs24\lang2057\langfe1033\cgrid\langnp2057\langfenp1033 {\rtlch\fcs1 \af0
+\ltrch\fcs0 \insrsid5249973 \chftnsepc
+\par }}{\*\aftnsep \ltrpar \pard\plain \ltrpar\ql \li0\ri0\widctlpar\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \rtlch\fcs1 \af0\afs24\alang1025 \ltrch\fcs0 \fs24\lang2057\langfe1033\cgrid\langnp2057\langfenp1033 {\rtlch\fcs1 \af0
+\ltrch\fcs0 \insrsid5249973 \chftnsep
+\par }}{\*\aftnsepc \ltrpar \pard\plain \ltrpar\ql \li0\ri0\widctlpar\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \rtlch\fcs1 \af0\afs24\alang1025 \ltrch\fcs0 \fs24\lang2057\langfe1033\cgrid\langnp2057\langfenp1033 {\rtlch\fcs1 \af0
+\ltrch\fcs0 \insrsid5249973 \chftnsepc
+\par }}\ltrpar \sectd \ltrsect\linex0\headery709\footery709\colsx708\endnhere\sectlinegrid360\sectdefaultcl\sectrsid3497332\sftnbj {\headerl \ltrpar \pard\plain \ltrpar\s18\ql \li0\ri0\widctlpar
+\tqc\tx4153\tqr\tx8306\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \rtlch\fcs1 \af0\afs20\alang1025 \ltrch\fcs0 \fs22\lang2057\langfe1033\cgrid\langnp2057\langfenp1033 {\rtlch\fcs1 \af0 \ltrch\fcs0 \insrsid2504751
+\par }}{\headerr \ltrpar \pard\plain \ltrpar\s18\ql \li0\ri0\widctlpar\tqc\tx4153\tqr\tx8306\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \rtlch\fcs1 \af0\afs20\alang1025 \ltrch\fcs0 \fs22\lang2057\langfe1033\cgrid\langnp2057\langfenp1033 {
+\rtlch\fcs1 \af0 \ltrch\fcs0 \insrsid2504751
+\par }}{\footerl \ltrpar \pard\plain \ltrpar\s22\ql \li0\ri0\widctlpar\tqc\tx4153\tqr\tx8306\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \rtlch\fcs1 \af0\afs24\alang1025 \ltrch\fcs0 \fs24\lang2057\langfe1033\cgrid\langnp2057\langfenp1033 {
+\rtlch\fcs1 \af0 \ltrch\fcs0 \insrsid2504751
+\par }}{\footerr \ltrpar \pard\plain \ltrpar\s22\qr \li-284\ri0\widctlpar\tqc\tx4153\tqr\tx8306\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin-284\itap0\pararsid14506524 \rtlch\fcs1 \af0\afs24\alang1025 \ltrch\fcs0
+\fs24\lang2057\langfe1033\cgrid\langnp2057\langfenp1033 {\rtlch\fcs1 \af0\afs16 \ltrch\fcs0 \f318\fs16\insrsid1968554 {\pict{\*\picprop\shplid1026{\sp{\sn shapeType}{\sv 1}}{\sp{\sn fFlipH}{\sv 0}}
+{\sp{\sn fFlipV}{\sv 0}}{\sp{\sn fillColor}{\sv 3355443}}{\sp{\sn fRecolorFillAsPicture}{\sv 0}}{\sp{\sn fUseShapeAnchor}{\sv 0}}{\sp{\sn fFilled}{\sv 1}}{\sp{\sn fLine}{\sv 0}}{\sp{\sn pctHR}{\sv 0}}{\sp{\sn alignHR}{\sv 1}}{\sp{\sn dxHeightHR}{\sv 20}}
+{\sp{\sn dxWidthHR}{\sv 10943}}{\sp{\sn fLayoutInCell}{\sv 1}}{\sp{\sn fStandardHR}{\sv 1}}{\sp{\sn fNoshadeHR}{\sv 1}}{\sp{\sn fHorizRule}{\sv 1}}{\sp{\sn fLayoutInCell}{\sv 1}}}\picscalex1094\picscaley4\piccropl0\piccropr0\piccropt0\piccropb0
+\picw1764\pich882\picwgoal1000\pichgoal500\wmetafile8\bliptag667904020\blipupi71{\*\blipuid 27cf68149ca99ab95f958a7b62da888e}010009000003dd02000006001202000000001202000026060f001a04574d464301000000000001003e050000000001000000f803000000000000f80300000100
+00006c000000ffffffffffffffff111100002c00000000000000000000003e480000b900000020454d4600000100f80300001d00000003000000000000000000
+000000000000981200009f1a0000ca0000002101000000000000000000000000000023130300f6660400160000000c000000180000000a000000100000000000
+0000000000000900000010000000111100002c000000250000000c0000000e000080250000000c0000000e000080120000000c00000001000000520000007001
+000001000000a4ffffff000000000000000000000000900100000000000004400022430061006c00690062007200690000000000000000000000000000000000
+0000000000000000000000000000000000000000000000000000000000000000000000001900304e19001000000094511900144f1900fa4e5966945119008c4e
+190010000000fc4f190078511900cc4e5966945119008c4e1900200000008a790c5f8c4e19009451190020000000ffffffffdc008900057a0c5fffffffffffff
+0180ffff01809f020180ffffffff00420000000800000008000018f18d1001000000000000005802000025000000372e90010000020f0502020204030204ff02
+00e0ffac004001000000000000009f01000000000000430061006c006900620072000000000020ebf70486d759667a68466dbc008900306c8000c04e19009832
+055f1f00000001000000fc4e1900fc4e1900907b035f1f000000244f1900dc0089006476000800000000250000000c00000001000000250000000c0000000100
+0000250000000c00000001000000180000000c0000000000000254000000540000000000000000000000350000002b000000010000005fcc87405eb387400000
+000057000000010000004c000000040000000000000000000000111100002c00000050000000200000003600000046000000280000001c000000474449430200
+0000ffffffffffffffff111100002c000000000000002100000008000000620000000c0000000100000024000000240000000000803e00000000000000000000
+803e000000000000000002000000270000001800000002000000000000003333330000000000250000000c00000002000000250000000c000000080000805600
+000030000000ffffffffffffffff111100002c00000005000000fefffefffeffad004144ad004144fefffefffeff250000000c00000007000080250000000c00
+000000000080240000002400000000008040000000000000000000008040000000000000000002000000220000000c000000ffffffff46000000140000000800
+00004744494303000000250000000c0000000e000080250000000c0000000e0000800e0000001400000000000000100000001400000004000000030108000500
+00000b0200000000050000000c0205000c02040000002e0118001c000000fb020200010000000000bc02000000000102022253797374656d003f00003f3f0000
+0000000000000000000001003f3f3f3f3f00040000002d010000040000002d01000004000000020101001c000000fb02f5ff0000000000009001000000000440
+002243616c6962726900000000000000000000000000000000000000000000000000040000002d010100040000002d010100040000002d010100050000000902
+000000020d000000320a0a00000001000400000000000e02050020000600030000001e0007000000fc020000333333000000040000002d01020008000000fa02
+050000000000ffffff00040000002d0103000e0000002403050000000000000005000e0205000e0200000000000008000000fa0200000000000000000000040000002d01040007000000fc020000ffffff000000040000002d010500040000002701ffff040000002d010000040000002d010000030000000000}}{
+\rtlch\fcs1 \af0 \ltrch\fcs0 \f318\ul\insrsid5249973
+\par }\pard \ltrpar\s22\ql \li-284\ri0\widctlpar\tx3075\tx3119\tx7655\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin-284\itap0\pararsid16338741 {\rtlch\fcs1 \af0\afs14 \ltrch\fcs0 \b\f318\fs14\insrsid2504751 Example footer}{\rtlch\fcs1 \af0\afs14
+\ltrch\fcs0 \f318\fs14\insrsid5249973
+\par }}{\headerf \ltrpar \pard\plain \ltrpar\s18\ql \li0\ri0\widctlpar\tqc\tx4153\tqr\tx8306\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \rtlch\fcs1 \af0\afs20\alang1025 \ltrch\fcs0 \fs22\lang2057\langfe1033\cgrid\langnp2057\langfenp1033 {
+\rtlch\fcs1 \af0 \ltrch\fcs0 \insrsid2504751
+\par }}{\footerf \ltrpar \pard\plain \ltrpar\s22\ql \li0\ri0\widctlpar\tqc\tx4153\tqr\tx8306\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \rtlch\fcs1 \af0\afs24\alang1025 \ltrch\fcs0 \fs24\lang2057\langfe1033\cgrid\langnp2057\langfenp1033 {
+\rtlch\fcs1 \af0 \ltrch\fcs0 \insrsid2504751
+\par }}{\*\pnseclvl1\pnucrm\pnstart1\pnindent720\pnhang {\pntxta .}}{\*\pnseclvl2\pnucltr\pnstart1\pnindent720\pnhang {\pntxta .}}{\*\pnseclvl3\pndec\pnstart1\pnindent720\pnhang {\pntxta .}}{\*\pnseclvl4\pnlcltr\pnstart1\pnindent720\pnhang {\pntxta )}}
+{\*\pnseclvl5\pndec\pnstart1\pnindent720\pnhang {\pntxtb (}{\pntxta )}}{\*\pnseclvl6\pnlcltr\pnstart1\pnindent720\pnhang {\pntxtb (}{\pntxta )}}{\*\pnseclvl7\pnlcrm\pnstart1\pnindent720\pnhang {\pntxtb (}{\pntxta )}}{\*\pnseclvl8
+\pnlcltr\pnstart1\pnindent720\pnhang {\pntxtb (}{\pntxta )}}{\*\pnseclvl9\pnlcrm\pnstart1\pnindent720\pnhang {\pntxtb (}{\pntxta )}}\ltrrow\trowd \irow0\irowband0\ltrrow
+\ts11\trgaph108\trleft-108\trkeep\trftsWidth1\trftsWidthB3\trftsWidthA3\trpaddl108\trpaddr108\trpaddfl3\trpaddfr3\tblind0\tblindtype3 \clvertalc\clbrdrt\brdrtbl \clbrdrl\brdrtbl \clbrdrb\brdrtbl \clbrdrr\brdrtbl
+\cltxlrtb\clftsWidth3\clwWidth5508\clshdrawnil \cellx5400\clvertalc\clbrdrt\brdrtbl \clbrdrl\brdrtbl \clbrdrb\brdrtbl \clbrdrr\brdrtbl \cltxlrtb\clftsWidth3\clwWidth4912\clshdrawnil \cellx10312\pard\plain \ltrpar\ql \li0\ri0\widctlpar\intbl
+\tx5670\tx8222\wrapdefault\faauto\rin0\lin0 \rtlch\fcs1 \af0\afs24\alang1025 \ltrch\fcs0 \fs24\lang2057\langfe1033\cgrid\langnp2057\langfenp1033 {\rtlch\fcs1 \af1\afs18 \ltrch\fcs0 \f1\fs18\cf1\lang2057\langfe2057\langfenp2057\insrsid6695929 \cell
+}\pard \ltrpar\qc \li0\ri0\widctlpar\intbl\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin0\pararsid16338741 {\rtlch\fcs1 \af1\afs18 \ltrch\fcs0 \f1\fs18\lang2057\langfe2057\langfenp2057\insrsid6695929 \cell }\pard \ltrpar
+\ql \li0\ri0\sa200\sl276\slmult1\widctlpar\intbl\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin0 {\rtlch\fcs1 \af1 \ltrch\fcs0 \f1\fs20\insrsid6695929 \trowd \irow0\irowband0\ltrrow
+\ts11\trgaph108\trleft-108\trkeep\trftsWidth1\trftsWidthB3\trftsWidthA3\trpaddl108\trpaddr108\trpaddfl3\trpaddfr3\tblind0\tblindtype3 \clvertalc\clbrdrt\brdrtbl \clbrdrl\brdrtbl \clbrdrb\brdrtbl \clbrdrr\brdrtbl
+\cltxlrtb\clftsWidth3\clwWidth5508\clshdrawnil \cellx5400\clvertalc\clbrdrt\brdrtbl \clbrdrl\brdrtbl \clbrdrb\brdrtbl \clbrdrr\brdrtbl \cltxlrtb\clftsWidth3\clwWidth4912\clshdrawnil \cellx10312\row \ltrrow}\trowd \irow1\irowband1\lastrow \ltrrow
+\ts11\trgaph108\trleft-108\trkeep\trftsWidth1\trftsWidthB3\trftsWidthA3\trpaddl108\trpaddr108\trpaddfl3\trpaddfr3\tblind0\tblindtype3 \clvertalc\clbrdrt\brdrtbl \clbrdrl\brdrtbl \clbrdrb\brdrtbl \clbrdrr\brdrtbl
+\cltxlrtb\clftsWidth3\clwWidth10420\clshdrawnil \cellx10312\pard \ltrpar\ql \li0\ri0\widctlpar\intbl\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin0 {\rtlch\fcs1 \af1\afs18 \ltrch\fcs0 \f1\fs8\lang2057\langfe2057\langfenp2057\insrsid6695929
+\cell }\pard \ltrpar\ql \li0\ri0\sa200\sl276\slmult1\widctlpar\intbl\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin0 {\rtlch\fcs1 \af1 \ltrch\fcs0 \f1\fs20\insrsid6695929 \trowd \irow1\irowband1\lastrow \ltrrow
+\ts11\trgaph108\trleft-108\trkeep\trftsWidth1\trftsWidthB3\trftsWidthA3\trpaddl108\trpaddr108\trpaddfl3\trpaddfr3\tblind0\tblindtype3 \clvertalc\clbrdrt\brdrtbl \clbrdrl\brdrtbl \clbrdrb\brdrtbl \clbrdrr\brdrtbl
+\cltxlrtb\clftsWidth3\clwWidth10420\clshdrawnil \cellx10312\row }\pard \ltrpar\qj \li0\ri0\widctlpar\tx0\wrapdefault\aspalpha\aspnum\faauto\outlinelevel0\adjustright\rin0\lin0\itap0\pararsid15429861 {\rtlch\fcs1 \af0 \ltrch\fcs0 \f171\fs20\insrsid2504751
+Example text to extract from RTF.}{\rtlch\fcs1 \af0 \ltrch\fcs0 \f171\fs20\insrsid15429861
+\par
+\par
+\par }{\rtlch\fcs1 \af0 \ltrch\fcs0 \b\f171\fs20\ul\insrsid15429861
+\par }{\rtlch\fcs1 \af0 \ltrch\fcs0 \b\f171\fs20\ul\insrsid15429861\charrsid4947815
+\par }{\*\themedata 504b030414000600080000002100828abc13fa0000001c020000130000005b436f6e74656e745f54797065735d2e786d6cac91cb6ac3301045f785fe83d0b6d8
+72ba28a5d8cea249777d2cd20f18e4b12d6a8f843409c9df77ecb850ba082d74231062ce997b55ae8fe3a00e1893f354e9555e6885647de3a8abf4fbee29bbd7
+2a3150038327acf409935ed7d757e5ee14302999a654e99e393c18936c8f23a4dc072479697d1c81e51a3b13c07e4087e6b628ee8cf5c4489cf1c4d075f92a0b
+44d7a07a83c82f308ac7b0a0f0fbf90c2480980b58abc733615aa2d210c2e02cb04430076a7ee833dfb6ce62e3ed7e14693e8317d8cd0433bf5c60f53fea2fe7
+065bd80facb647e9e25c7fc421fd2ddb526b2e9373fed4bb902e182e97b7b461e6bfad3f010000ffff0300504b030414000600080000002100a5d6a7e7c00000
+00360100000b0000005f72656c732f2e72656c73848fcf6ac3300c87ef85bd83d17d51d2c31825762fa590432fa37d00e1287f68221bdb1bebdb4fc7060abb08
+84a4eff7a93dfeae8bf9e194e720169aaa06c3e2433fcb68e1763dbf7f82c985a4a725085b787086a37bdbb55fbc50d1a33ccd311ba548b63095120f88d94fbc
+52ae4264d1c910d24a45db3462247fa791715fd71f989e19e0364cd3f51652d73760ae8fa8c9ffb3c330cc9e4fc17faf2ce545046e37944c69e462a1a82fe353
+bd90a865aad41ed0b5b8f9d6fd010000ffff0300504b0304140006000800000021006b799616830000008a0000001c0000007468656d652f7468656d652f7468
+656d654d616e616765722e786d6c0ccc4d0ac3201040e17da17790d93763bb284562b2cbaebbf600439c1a41c7a0d29fdbd7e5e38337cedf14d59b4b0d592c9c
+070d8a65cd2e88b7f07c2ca71ba8da481cc52c6ce1c715e6e97818c9b48d13df49c873517d23d59085adb5dd20d6b52bd521ef2cdd5eb9246a3d8b4757e8d3f7
+29e245eb2b260a0238fd010000ffff0300504b03041400060008000000210096b5ade296060000501b0000160000007468656d652f7468656d652f7468656d65
+312e786d6cec594f6fdb3614bf0fd87720746f6327761a07758ad8b19b2d4d1bc46e871e698996d850a240d2497d1bdae38001c3ba618715d86d87615b8116d8
+a5fb34d93a6c1dd0afb0475292c5585e9236d88aad3e2412f9e3fbff1e1fa9abd7eec70c1d1221294fda5efd72cd4324f1794093b0eddd1ef62fad79482a9c04
+98f184b4bd2991deb58df7dfbb8ad755446282607d22d771db8b944ad79796a40fc3585ee62949606ecc458c15bc8a702910f808e8c66c69b9565b5d8a314d3c
+94e018c8de1a8fa94fd05093f43672e23d06af89927ac06762a049136785c10607758d9053d965021d62d6f6804fc08f86e4bef210c352c144dbab999fb7b471
+7509af678b985ab0b6b4ae6f7ed9ba6c4170b06c788a705430adf71bad2b5b057d03606a1ed7ebf5babd7a41cf00b0ef83a6569632cd467faddec9699640f671
+9e76b7d6ac355c7c89feca9cccad4ea7d36c65b258a206641f1b73f8b5da6a6373d9c11b90c537e7f08dce66b7bbeae00dc8e257e7f0fd2badd5868b37a088d1
+e4600ead1ddaef67d40bc898b3ed4af81ac0d76a197c86826828a24bb318f3442d8ab518dfe3a20f000d6458d104a9694ac6d88728eee2782428d60cf03ac1a5
+193be4cbb921cd0b495fd054b5bd0f530c1931a3f7eaf9f7af9e3f45c70f9e1d3ff8e9f8e1c3e3073f5a42ceaa6d9c84e5552fbffdeccfc71fa33f9e7ef3f2d1
+17d57859c6fffac327bffcfc793510d26726ce8b2f9ffcf6ecc98baf3efdfdbb4715f04d814765f890c644a29be408edf3181433567125272371be15c308d3f2
+8acd249438c19a4b05fd9e8a1cf4cd296699771c393ac4b5e01d01e5a30a787d72cf1178108989a2159c77a2d801ee72ce3a5c545a6147f32a99793849c26ae6
+6252c6ed637c58c5bb8b13c7bfbd490a75330f4b47f16e441c31f7184e140e494214d273fc80900aedee52ead87597fa824b3e56e82e451d4c2b4d32a423279a
+668bb6690c7e9956e90cfe766cb37b077538abd27a8b1cba48c80acc2a841f12e698f13a9e281c57911ce298950d7e03aba84ac8c154f8655c4f2af074481847
+bd804859b5e696007d4b4edfc150b12addbecba6b18b148a1e54d1bc81392f23b7f84137c2715a851dd0242a633f900710a218ed715505dfe56e86e877f0034e
+16bafb0e258ebb4faf06b769e888340b103d3311da9750aa9d0a1cd3e4efca31a3508f6d0c5c5c398602f8e2ebc71591f5b616e24dd893aa3261fb44f95d843b
+5974bb5c04f4edafb95b7892ec1108f3f98de75dc97d5772bdff7cc95d94cf672db4b3da0a6557f70db629362d72bcb0431e53c6066acac80d699a6409fb44d0
+8741bdce9c0e4971624a2378cceaba830b05366b90e0ea23aaa241845368b0eb9e2612ca8c742851ca251ceccc70256d8d87265dd96361531f186c3d9058edf2
+c00eafe8e1fc5c509031bb4d680e9f39a3154de0accc56ae644441edd76156d7429d995bdd88664a9dc3ad50197c38af1a0c16d684060441db02565e85f3b966
+0d0713cc48a0ed6ef7dedc2dc60b17e92219e180643ed27acffba86e9c94c78ab90980d8a9f0913ee49d62b512b79626fb06dccee2a432bbc60276b9f7dec44b
+7904cfbca4f3f6443ab2a49c9c2c41476dafd55c6e7ac8c769db1bc399161ee314bc2e75cf8759081743be1236ec4f4d6693e5336fb672c5dc24a8c33585b5fb
+9cc24e1d4885545b58463634cc5416022cd19cacfccb4d30eb45296023fd35a458598360f8d7a4003bbaae25e331f155d9d9a5116d3bfb9a95523e51440ca2e0
+088dd844ec6370bf0e55d027a012ae264c45d02f708fa6ad6da6dce29c255df9f6cae0ec38666984b372ab5334cf640b37795cc860de4ae2816e95b21be5ceaf
+8a49f90b52a51cc6ff3355f47e0237052b81f6800fd7b802239daf6d8f0b1571a8426944fdbe80c6c1d40e8816b88b8569082ab84c36ff0539d4ff6dce591a26
+ade1c0a7f669880485fd484582903d284b26fa4e2156cff62e4b9265844c4495c495a9157b440e091bea1ab8aaf7760f4510eaa69a6465c0e04ec69ffb9e65d0
+28d44d4e39df9c1a52ecbd3607fee9cec7263328e5d661d3d0e4f62f44acd855ed7ab33cdf7bcb8ae889599bd5c8b3029895b6825696f6af29c239b75a5bb1e6
+345e6ee6c28117e73586c1a2214ae1be07e93fb0ff51e133fb65426fa843be0fb515c187064d0cc206a2fa926d3c902e907670048d931db4c1a44959d366ad93
+b65abe595f70a75bf03d616c2dd959fc7d4e6317cd99cbcec9c58b34766661c7d6766ca1a9c1b327531486c6f941c638c67cd22a7f75e2a37be0e82db8df9f30
+254d30c1372581a1f51c983c80e4b71ccdd28dbf000000ffff0300504b0304140006000800000021000dd1909fb60000001b010000270000007468656d652f74
+68656d652f5f72656c732f7468656d654d616e616765722e786d6c2e72656c73848f4d0ac2301484f78277086f6fd3ba109126dd88d0add40384e4350d363f24
+51eced0dae2c082e8761be9969bb979dc9136332de3168aa1a083ae995719ac16db8ec8e4052164e89d93b64b060828e6f37ed1567914b284d262452282e3198
+720e274a939cd08a54f980ae38a38f56e422a3a641c8bbd048f7757da0f19b017cc524bd62107bd5001996509affb3fd381a89672f1f165dfe514173d9850528
+a2c6cce0239baa4c04ca5bbabac4df000000ffff0300504b01022d0014000600080000002100828abc13fa0000001c0200001300000000000000000000000000
+000000005b436f6e74656e745f54797065735d2e786d6c504b01022d0014000600080000002100a5d6a7e7c0000000360100000b000000000000000000000000
+002b0100005f72656c732f2e72656c73504b01022d00140006000800000021006b799616830000008a0000001c00000000000000000000000000140200007468
+656d652f7468656d652f7468656d654d616e616765722e786d6c504b01022d001400060008000000210096b5ade296060000501b000016000000000000000000
+00000000d10200007468656d652f7468656d652f7468656d65312e786d6c504b01022d00140006000800000021000dd1909fb60000001b010000270000000000
+00000000000000009b0900007468656d652f7468656d652f5f72656c732f7468656d654d616e616765722e786d6c2e72656c73504b050600000000050005005d010000960a00000000}
+{\*\colorschememapping 3c3f786d6c2076657273696f6e3d22312e302220656e636f64696e673d225554462d3822207374616e64616c6f6e653d22796573223f3e0d0a3c613a636c724d
+617020786d6c6e733a613d22687474703a2f2f736368656d61732e6f70656e786d6c666f726d6174732e6f72672f64726177696e676d6c2f323030362f6d6169
+6e22206267313d226c743122207478313d22646b3122206267323d226c743222207478323d22646b322220616363656e74313d22616363656e74312220616363
+656e74323d22616363656e74322220616363656e74333d22616363656e74332220616363656e74343d22616363656e74342220616363656e74353d22616363656e74352220616363656e74363d22616363656e74362220686c696e6b3d22686c696e6b2220666f6c486c696e6b3d22666f6c486c696e6b222f3e}
+{\*\latentstyles\lsdstimax267\lsdlockeddef0\lsdsemihiddendef1\lsdunhideuseddef1\lsdqformatdef0\lsdprioritydef99{\lsdlockedexcept \lsdsemihidden0 \lsdunhideused0 \lsdqformat1 \lsdpriority0 \lsdlocked0 Normal;
+\lsdsemihidden0 \lsdunhideused0 \lsdqformat1 \lsdpriority9 \lsdlocked0 heading 1;\lsdsemihidden0 \lsdunhideused0 \lsdqformat1 \lsdpriority9 \lsdlocked0 heading 2;\lsdqformat1 \lsdpriority9 \lsdlocked0 heading 3;
+\lsdsemihidden0 \lsdunhideused0 \lsdqformat1 \lsdpriority9 \lsdlocked0 heading 4;\lsdqformat1 \lsdpriority9 \lsdlocked0 heading 5;\lsdqformat1 \lsdpriority9 \lsdlocked0 heading 6;\lsdqformat1 \lsdpriority9 \lsdlocked0 heading 7;
+\lsdqformat1 \lsdpriority9 \lsdlocked0 heading 8;\lsdqformat1 \lsdpriority9 \lsdlocked0 heading 9;\lsdpriority39 \lsdlocked0 toc 1;\lsdpriority39 \lsdlocked0 toc 2;\lsdpriority39 \lsdlocked0 toc 3;\lsdpriority39 \lsdlocked0 toc 4;
+\lsdpriority39 \lsdlocked0 toc 5;\lsdpriority39 \lsdlocked0 toc 6;\lsdpriority39 \lsdlocked0 toc 7;\lsdpriority39 \lsdlocked0 toc 8;\lsdpriority39 \lsdlocked0 toc 9;\lsdqformat1 \lsdpriority35 \lsdlocked0 caption;
+\lsdsemihidden0 \lsdunhideused0 \lsdqformat1 \lsdpriority10 \lsdlocked0 Title;\lsdpriority1 \lsdlocked0 Default Paragraph Font;\lsdsemihidden0 \lsdunhideused0 \lsdqformat1 \lsdpriority11 \lsdlocked0 Subtitle;
+\lsdsemihidden0 \lsdunhideused0 \lsdqformat1 \lsdpriority22 \lsdlocked0 Strong;\lsdsemihidden0 \lsdunhideused0 \lsdqformat1 \lsdpriority20 \lsdlocked0 Emphasis;\lsdsemihidden0 \lsdunhideused0 \lsdpriority59 \lsdlocked0 Table Grid;
+\lsdunhideused0 \lsdlocked0 Placeholder Text;\lsdsemihidden0 \lsdunhideused0 \lsdqformat1 \lsdpriority1 \lsdlocked0 No Spacing;\lsdsemihidden0 \lsdunhideused0 \lsdpriority60 \lsdlocked0 Light Shading;
+\lsdsemihidden0 \lsdunhideused0 \lsdpriority61 \lsdlocked0 Light List;\lsdsemihidden0 \lsdunhideused0 \lsdpriority62 \lsdlocked0 Light Grid;\lsdsemihidden0 \lsdunhideused0 \lsdpriority63 \lsdlocked0 Medium Shading 1;
+\lsdsemihidden0 \lsdunhideused0 \lsdpriority64 \lsdlocked0 Medium Shading 2;\lsdsemihidden0 \lsdunhideused0 \lsdpriority65 \lsdlocked0 Medium List 1;\lsdsemihidden0 \lsdunhideused0 \lsdpriority66 \lsdlocked0 Medium List 2;
+\lsdsemihidden0 \lsdunhideused0 \lsdpriority67 \lsdlocked0 Medium Grid 1;\lsdsemihidden0 \lsdunhideused0 \lsdpriority68 \lsdlocked0 Medium Grid 2;\lsdsemihidden0 \lsdunhideused0 \lsdpriority69 \lsdlocked0 Medium Grid 3;
+\lsdsemihidden0 \lsdunhideused0 \lsdpriority70 \lsdlocked0 Dark List;\lsdsemihidden0 \lsdunhideused0 \lsdpriority71 \lsdlocked0 Colorful Shading;\lsdsemihidden0 \lsdunhideused0 \lsdpriority72 \lsdlocked0 Colorful List;
+\lsdsemihidden0 \lsdunhideused0 \lsdpriority73 \lsdlocked0 Colorful Grid;\lsdsemihidden0 \lsdunhideused0 \lsdpriority60 \lsdlocked0 Light Shading Accent 1;\lsdsemihidden0 \lsdunhideused0 \lsdpriority61 \lsdlocked0 Light List Accent 1;
+\lsdsemihidden0 \lsdunhideused0 \lsdpriority62 \lsdlocked0 Light Grid Accent 1;\lsdsemihidden0 \lsdunhideused0 \lsdpriority63 \lsdlocked0 Medium Shading 1 Accent 1;\lsdsemihidden0 \lsdunhideused0 \lsdpriority64 \lsdlocked0 Medium Shading 2 Accent 1;
+\lsdsemihidden0 \lsdunhideused0 \lsdpriority65 \lsdlocked0 Medium List 1 Accent 1;\lsdunhideused0 \lsdlocked0 Revision;\lsdsemihidden0 \lsdunhideused0 \lsdqformat1 \lsdpriority34 \lsdlocked0 List Paragraph;
+\lsdsemihidden0 \lsdunhideused0 \lsdqformat1 \lsdpriority29 \lsdlocked0 Quote;\lsdsemihidden0 \lsdunhideused0 \lsdqformat1 \lsdpriority30 \lsdlocked0 Intense Quote;\lsdsemihidden0 \lsdunhideused0 \lsdpriority66 \lsdlocked0 Medium List 2 Accent 1;
+\lsdsemihidden0 \lsdunhideused0 \lsdpriority67 \lsdlocked0 Medium Grid 1 Accent 1;\lsdsemihidden0 \lsdunhideused0 \lsdpriority68 \lsdlocked0 Medium Grid 2 Accent 1;\lsdsemihidden0 \lsdunhideused0 \lsdpriority69 \lsdlocked0 Medium Grid 3 Accent 1;
+\lsdsemihidden0 \lsdunhideused0 \lsdpriority70 \lsdlocked0 Dark List Accent 1;\lsdsemihidden0 \lsdunhideused0 \lsdpriority71 \lsdlocked0 Colorful Shading Accent 1;\lsdsemihidden0 \lsdunhideused0 \lsdpriority72 \lsdlocked0 Colorful List Accent 1;
+\lsdsemihidden0 \lsdunhideused0 \lsdpriority73 \lsdlocked0 Colorful Grid Accent 1;\lsdsemihidden0 \lsdunhideused0 \lsdpriority60 \lsdlocked0 Light Shading Accent 2;\lsdsemihidden0 \lsdunhideused0 \lsdpriority61 \lsdlocked0 Light List Accent 2;
+\lsdsemihidden0 \lsdunhideused0 \lsdpriority62 \lsdlocked0 Light Grid Accent 2;\lsdsemihidden0 \lsdunhideused0 \lsdpriority63 \lsdlocked0 Medium Shading 1 Accent 2;\lsdsemihidden0 \lsdunhideused0 \lsdpriority64 \lsdlocked0 Medium Shading 2 Accent 2;
+\lsdsemihidden0 \lsdunhideused0 \lsdpriority65 \lsdlocked0 Medium List 1 Accent 2;\lsdsemihidden0 \lsdunhideused0 \lsdpriority66 \lsdlocked0 Medium List 2 Accent 2;\lsdsemihidden0 \lsdunhideused0 \lsdpriority67 \lsdlocked0 Medium Grid 1 Accent 2;
+\lsdsemihidden0 \lsdunhideused0 \lsdpriority68 \lsdlocked0 Medium Grid 2 Accent 2;\lsdsemihidden0 \lsdunhideused0 \lsdpriority69 \lsdlocked0 Medium Grid 3 Accent 2;\lsdsemihidden0 \lsdunhideused0 \lsdpriority70 \lsdlocked0 Dark List Accent 2;
+\lsdsemihidden0 \lsdunhideused0 \lsdpriority71 \lsdlocked0 Colorful Shading Accent 2;\lsdsemihidden0 \lsdunhideused0 \lsdpriority72 \lsdlocked0 Colorful List Accent 2;\lsdsemihidden0 \lsdunhideused0 \lsdpriority73 \lsdlocked0 Colorful Grid Accent 2;
+\lsdsemihidden0 \lsdunhideused0 \lsdpriority60 \lsdlocked0 Light Shading Accent 3;\lsdsemihidden0 \lsdunhideused0 \lsdpriority61 \lsdlocked0 Light List Accent 3;\lsdsemihidden0 \lsdunhideused0 \lsdpriority62 \lsdlocked0 Light Grid Accent 3;
+\lsdsemihidden0 \lsdunhideused0 \lsdpriority63 \lsdlocked0 Medium Shading 1 Accent 3;\lsdsemihidden0 \lsdunhideused0 \lsdpriority64 \lsdlocked0 Medium Shading 2 Accent 3;\lsdsemihidden0 \lsdunhideused0 \lsdpriority65 \lsdlocked0 Medium List 1 Accent 3;
+\lsdsemihidden0 \lsdunhideused0 \lsdpriority66 \lsdlocked0 Medium List 2 Accent 3;\lsdsemihidden0 \lsdunhideused0 \lsdpriority67 \lsdlocked0 Medium Grid 1 Accent 3;\lsdsemihidden0 \lsdunhideused0 \lsdpriority68 \lsdlocked0 Medium Grid 2 Accent 3;
+\lsdsemihidden0 \lsdunhideused0 \lsdpriority69 \lsdlocked0 Medium Grid 3 Accent 3;\lsdsemihidden0 \lsdunhideused0 \lsdpriority70 \lsdlocked0 Dark List Accent 3;\lsdsemihidden0 \lsdunhideused0 \lsdpriority71 \lsdlocked0 Colorful Shading Accent 3;
+\lsdsemihidden0 \lsdunhideused0 \lsdpriority72 \lsdlocked0 Colorful List Accent 3;\lsdsemihidden0 \lsdunhideused0 \lsdpriority73 \lsdlocked0 Colorful Grid Accent 3;\lsdsemihidden0 \lsdunhideused0 \lsdpriority60 \lsdlocked0 Light Shading Accent 4;
+\lsdsemihidden0 \lsdunhideused0 \lsdpriority61 \lsdlocked0 Light List Accent 4;\lsdsemihidden0 \lsdunhideused0 \lsdpriority62 \lsdlocked0 Light Grid Accent 4;\lsdsemihidden0 \lsdunhideused0 \lsdpriority63 \lsdlocked0 Medium Shading 1 Accent 4;
+\lsdsemihidden0 \lsdunhideused0 \lsdpriority64 \lsdlocked0 Medium Shading 2 Accent 4;\lsdsemihidden0 \lsdunhideused0 \lsdpriority65 \lsdlocked0 Medium List 1 Accent 4;\lsdsemihidden0 \lsdunhideused0 \lsdpriority66 \lsdlocked0 Medium List 2 Accent 4;
+\lsdsemihidden0 \lsdunhideused0 \lsdpriority67 \lsdlocked0 Medium Grid 1 Accent 4;\lsdsemihidden0 \lsdunhideused0 \lsdpriority68 \lsdlocked0 Medium Grid 2 Accent 4;\lsdsemihidden0 \lsdunhideused0 \lsdpriority69 \lsdlocked0 Medium Grid 3 Accent 4;
+\lsdsemihidden0 \lsdunhideused0 \lsdpriority70 \lsdlocked0 Dark List Accent 4;\lsdsemihidden0 \lsdunhideused0 \lsdpriority71 \lsdlocked0 Colorful Shading Accent 4;\lsdsemihidden0 \lsdunhideused0 \lsdpriority72 \lsdlocked0 Colorful List Accent 4;
+\lsdsemihidden0 \lsdunhideused0 \lsdpriority73 \lsdlocked0 Colorful Grid Accent 4;\lsdsemihidden0 \lsdunhideused0 \lsdpriority60 \lsdlocked0 Light Shading Accent 5;\lsdsemihidden0 \lsdunhideused0 \lsdpriority61 \lsdlocked0 Light List Accent 5;
+\lsdsemihidden0 \lsdunhideused0 \lsdpriority62 \lsdlocked0 Light Grid Accent 5;\lsdsemihidden0 \lsdunhideused0 \lsdpriority63 \lsdlocked0 Medium Shading 1 Accent 5;\lsdsemihidden0 \lsdunhideused0 \lsdpriority64 \lsdlocked0 Medium Shading 2 Accent 5;
+\lsdsemihidden0 \lsdunhideused0 \lsdpriority65 \lsdlocked0 Medium List 1 Accent 5;\lsdsemihidden0 \lsdunhideused0 \lsdpriority66 \lsdlocked0 Medium List 2 Accent 5;\lsdsemihidden0 \lsdunhideused0 \lsdpriority67 \lsdlocked0 Medium Grid 1 Accent 5;
+\lsdsemihidden0 \lsdunhideused0 \lsdpriority68 \lsdlocked0 Medium Grid 2 Accent 5;\lsdsemihidden0 \lsdunhideused0 \lsdpriority69 \lsdlocked0 Medium Grid 3 Accent 5;\lsdsemihidden0 \lsdunhideused0 \lsdpriority70 \lsdlocked0 Dark List Accent 5;
+\lsdsemihidden0 \lsdunhideused0 \lsdpriority71 \lsdlocked0 Colorful Shading Accent 5;\lsdsemihidden0 \lsdunhideused0 \lsdpriority72 \lsdlocked0 Colorful List Accent 5;\lsdsemihidden0 \lsdunhideused0 \lsdpriority73 \lsdlocked0 Colorful Grid Accent 5;
+\lsdsemihidden0 \lsdunhideused0 \lsdpriority60 \lsdlocked0 Light Shading Accent 6;\lsdsemihidden0 \lsdunhideused0 \lsdpriority61 \lsdlocked0 Light List Accent 6;\lsdsemihidden0 \lsdunhideused0 \lsdpriority62 \lsdlocked0 Light Grid Accent 6;
+\lsdsemihidden0 \lsdunhideused0 \lsdpriority63 \lsdlocked0 Medium Shading 1 Accent 6;\lsdsemihidden0 \lsdunhideused0 \lsdpriority64 \lsdlocked0 Medium Shading 2 Accent 6;\lsdsemihidden0 \lsdunhideused0 \lsdpriority65 \lsdlocked0 Medium List 1 Accent 6;
+\lsdsemihidden0 \lsdunhideused0 \lsdpriority66 \lsdlocked0 Medium List 2 Accent 6;\lsdsemihidden0 \lsdunhideused0 \lsdpriority67 \lsdlocked0 Medium Grid 1 Accent 6;\lsdsemihidden0 \lsdunhideused0 \lsdpriority68 \lsdlocked0 Medium Grid 2 Accent 6;
+\lsdsemihidden0 \lsdunhideused0 \lsdpriority69 \lsdlocked0 Medium Grid 3 Accent 6;\lsdsemihidden0 \lsdunhideused0 \lsdpriority70 \lsdlocked0 Dark List Accent 6;\lsdsemihidden0 \lsdunhideused0 \lsdpriority71 \lsdlocked0 Colorful Shading Accent 6;
+\lsdsemihidden0 \lsdunhideused0 \lsdpriority72 \lsdlocked0 Colorful List Accent 6;\lsdsemihidden0 \lsdunhideused0 \lsdpriority73 \lsdlocked0 Colorful Grid Accent 6;\lsdsemihidden0 \lsdunhideused0 \lsdqformat1 \lsdpriority19 \lsdlocked0 Subtle Emphasis;
+\lsdsemihidden0 \lsdunhideused0 \lsdqformat1 \lsdpriority21 \lsdlocked0 Intense Emphasis;\lsdsemihidden0 \lsdunhideused0 \lsdqformat1 \lsdpriority31 \lsdlocked0 Subtle Reference;
+\lsdsemihidden0 \lsdunhideused0 \lsdqformat1 \lsdpriority32 \lsdlocked0 Intense Reference;\lsdsemihidden0 \lsdunhideused0 \lsdqformat1 \lsdpriority33 \lsdlocked0 Book Title;\lsdpriority37 \lsdlocked0 Bibliography;
+\lsdqformat1 \lsdpriority39 \lsdlocked0 TOC Heading;}}{\*\datastore 0105000002000000180000004d73786d6c322e534158584d4c5265616465722e352e3000000000000000000000060000
+d0cf11e0a1b11ae1000000000000000000000000000000003e000300feff090006000000000000000000000001000000010000000000000000100000feffffff00000000feffffff0000000000000000ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
+ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
+ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
+ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
+fffffffffffffffffdfffffffeffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
+ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
+ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
+ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
+ffffffffffffffffffffffffffffffff52006f006f007400200045006e00740072007900000000000000000000000000000000000000000000000000000000000000000000000000000000000000000016000500ffffffffffffffffffffffffec69d9888b8b3d4c859eaf6cd158be0f0000000000000000000000004077
+60480b5dd101feffffff00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000ffffffffffffffffffffffff00000000000000000000000000000000000000000000000000000000
+00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000ffffffffffffffffffffffff0000000000000000000000000000000000000000000000000000
+000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000ffffffffffffffffffffffff000000000000000000000000000000000000000000000000
+0000000000000000000000000000000000000000000000000105000000000000}}
\ No newline at end of file
[13/13] tika git commit: Merge remote-tracking branch 'origin/2.x'
into 2.x
Posted by ta...@apache.org.
Merge remote-tracking branch 'origin/2.x' into 2.x
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/e1498edb
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/e1498edb
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/e1498edb
Branch: refs/heads/2.x
Commit: e1498edbbb49c40c0d67c46e469c3db18012e0ae
Parents: aa5f60d cf96323
Author: tballison <ta...@mitre.org>
Authored: Mon Mar 21 21:19:03 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Mon Mar 21 21:19:03 2016 -0400
----------------------------------------------------------------------
.../java/org/apache/tika/parser/external/ExternalParser.java | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
----------------------------------------------------------------------
[08/13] tika git commit: TIKA-1855 -- first pass. Need to turn back
on the forbidden-apis testCheck. More clean up remains.
Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parser-modules/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-modules/pom.xml b/tika-parser-modules/pom.xml
index 88205ca..b95d8a2 100644
--- a/tika-parser-modules/pom.xml
+++ b/tika-parser-modules/pom.xml
@@ -150,32 +150,6 @@
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-dependency-plugin</artifactId>
- <version>2.10</version>
- <executions>
- <execution>
- <id>unpack</id>
- <phase>compile</phase>
- <goals>
- <goal>unpack</goal>
- </goals>
- <configuration>
- <artifactItems>
- <artifactItem>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-test-resources</artifactId>
- <version>${project.version}</version>
- <type>test-jar</type>
- <overWrite>true</overWrite>
- <outputDirectory>${project.build.testOutputDirectory}</outputDirectory>
- </artifactItem>
- </artifactItems>
- </configuration>
- </execution>
- </executions>
- </plugin>
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<executions>
<execution>
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parser-modules/tika-parser-advanced-module/src/test/java/org/apache/tika/parser/ner/NamedEntityParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-advanced-module/src/test/java/org/apache/tika/parser/ner/NamedEntityParserTest.java b/tika-parser-modules/tika-parser-advanced-module/src/test/java/org/apache/tika/parser/ner/NamedEntityParserTest.java
index fff644a..63e75a4 100644
--- a/tika-parser-modules/tika-parser-advanced-module/src/test/java/org/apache/tika/parser/ner/NamedEntityParserTest.java
+++ b/tika-parser-modules/tika-parser-advanced-module/src/test/java/org/apache/tika/parser/ner/NamedEntityParserTest.java
@@ -16,26 +16,26 @@
*/
package org.apache.tika.parser.ner;
-import org.apache.tika.Tika;
-import org.apache.tika.config.TikaConfig;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.ner.opennlp.OpenNLPNERecogniser;
-import org.apache.tika.parser.ner.regex.RegexNERecogniser;
-import org.junit.Test;
+import static org.junit.Assert.assertTrue;
import java.io.ByteArrayInputStream;
import java.nio.charset.Charset;
import java.util.Arrays;
import java.util.HashSet;
-import static org.junit.Assert.assertTrue;
+import org.apache.tika.Tika;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ner.opennlp.OpenNLPNERecogniser;
+import org.apache.tika.parser.ner.regex.RegexNERecogniser;
+import org.junit.Test;
/**
*Test case for {@link NamedEntityParser}
*/
public class NamedEntityParserTest {
- public static final String CONFIG_FILE = "tika-config.xml";
+ public static final String CONFIG_FILE = "tika-config-for-ner.xml";
@Test
public void testParse() throws Exception {
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parser-modules/tika-parser-advanced-module/src/test/java/org/apache/tika/parser/ner/regex/RegexNERecogniserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-advanced-module/src/test/java/org/apache/tika/parser/ner/regex/RegexNERecogniserTest.java b/tika-parser-modules/tika-parser-advanced-module/src/test/java/org/apache/tika/parser/ner/regex/RegexNERecogniserTest.java
index 57c2162..257fea8 100644
--- a/tika-parser-modules/tika-parser-advanced-module/src/test/java/org/apache/tika/parser/ner/regex/RegexNERecogniserTest.java
+++ b/tika-parser-modules/tika-parser-advanced-module/src/test/java/org/apache/tika/parser/ner/regex/RegexNERecogniserTest.java
@@ -16,11 +16,7 @@
*/
package org.apache.tika.parser.ner.regex;
-import org.apache.tika.Tika;
-import org.apache.tika.config.TikaConfig;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.ner.NamedEntityParser;
-import org.junit.Test;
+import static org.junit.Assert.assertTrue;
import java.io.ByteArrayInputStream;
import java.nio.charset.StandardCharsets;
@@ -28,7 +24,12 @@ import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
-import static org.junit.Assert.assertTrue;
+import org.apache.tika.Tika;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ner.NamedEntityParser;
+import org.apache.tika.parser.ner.NamedEntityParserTest;
+import org.junit.Test;
public class RegexNERecogniserTest {
@@ -38,7 +39,7 @@ public class RegexNERecogniserTest {
String text = "Hey, Lets meet on this Sunday or MONDAY because i am busy on Saturday";
System.setProperty(NamedEntityParser.SYS_PROP_NER_IMPL, RegexNERecogniser.class.getName());
- Tika tika = new Tika(new TikaConfig(NamedEntityParser.class.getResourceAsStream("tika-config.xml")));
+ Tika tika = new Tika(new TikaConfig(NamedEntityParserTest.class.getResourceAsStream("tika-config-for-ner.xml")));
Metadata md = new Metadata();
tika.parse(new ByteArrayInputStream(text.getBytes(StandardCharsets.UTF_8)), md);
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parser-modules/tika-parser-advanced-module/src/test/resources/org/apache/tika/parser/ner/regex/ner-regex.txt
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-advanced-module/src/test/resources/org/apache/tika/parser/ner/regex/ner-regex.txt b/tika-parser-modules/tika-parser-advanced-module/src/test/resources/org/apache/tika/parser/ner/regex/ner-regex.txt
new file mode 100644
index 0000000..e6fa39e
--- /dev/null
+++ b/tika-parser-modules/tika-parser-advanced-module/src/test/resources/org/apache/tika/parser/ner/regex/ner-regex.txt
@@ -0,0 +1,17 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+WEEK_DAY=(?i)((sun)|(mon)|(tues)|(thurs)|(fri)|((sat)(ur)?))(day)?
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parser-modules/tika-parser-advanced-module/src/test/resources/org/apache/tika/parser/ner/tika-config-for-ner.xml
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-advanced-module/src/test/resources/org/apache/tika/parser/ner/tika-config-for-ner.xml b/tika-parser-modules/tika-parser-advanced-module/src/test/resources/org/apache/tika/parser/ner/tika-config-for-ner.xml
new file mode 100644
index 0000000..267c399
--- /dev/null
+++ b/tika-parser-modules/tika-parser-advanced-module/src/test/resources/org/apache/tika/parser/ner/tika-config-for-ner.xml
@@ -0,0 +1,27 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <parsers>
+ <parser class="org.apache.tika.parser.ner.NamedEntityParser">
+ <mime>text/plain</mime>
+ <mime>text/html</mime>
+ <mime>application/xhtml+xml</mime>
+ </parser>
+ </parsers>
+
+</properties>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parser-modules/tika-parser-database-module/src/test/java/org/apache/tika/parser/jdbc/SQLite3ParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-database-module/src/test/java/org/apache/tika/parser/jdbc/SQLite3ParserTest.java b/tika-parser-modules/tika-parser-database-module/src/test/java/org/apache/tika/parser/jdbc/SQLite3ParserTest.java
index 7ea27fa..d394c61 100644
--- a/tika-parser-modules/tika-parser-database-module/src/test/java/org/apache/tika/parser/jdbc/SQLite3ParserTest.java
+++ b/tika-parser-modules/tika-parser-database-module/src/test/java/org/apache/tika/parser/jdbc/SQLite3ParserTest.java
@@ -20,10 +20,14 @@ package org.apache.tika.parser.jdbc;
import static java.nio.charset.StandardCharsets.UTF_8;
import static org.junit.Assert.assertEquals;
+import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardCopyOption;
import java.util.ArrayList;
import java.util.List;
@@ -41,13 +45,27 @@ import org.apache.tika.parser.Parser;
import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.sax.BodyContentHandler;
-import org.apache.tika.sax.ToXMLContentHandler;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
import org.junit.Test;
import org.xml.sax.ContentHandler;
public class SQLite3ParserTest extends TikaTest {
private final static String TEST_FILE_NAME = "testSqlite3b.db";
- private final static String TEST_FILE1 = "/test-documents/" + TEST_FILE_NAME;
+ static Path tmp = null;
+ @BeforeClass
+ public static void createTMPFile() throws IOException {
+ tmp = Files.createTempFile("sqlite-", "");
+ Files.copy(
+ TikaTest.class.getClassLoader().getResourceAsStream("test-documents/"+TEST_FILE_NAME),
+ tmp, StandardCopyOption.REPLACE_EXISTING);
+
+ }
+
+ @AfterClass
+ public static void deleteTMPFile() throws IOException {
+ Files.delete(tmp);
+ }
@Test
public void testBasic() throws Exception {
@@ -56,18 +74,20 @@ public class SQLite3ParserTest extends TikaTest {
//test different types of input streams
//actual inputstream, memory buffered bytearray and literal file
InputStream[] streams = new InputStream[3];
- streams[0] = getResourceAsStream(TEST_FILE1);
+ streams[0] = getTestDocumentAsStream(TEST_FILE_NAME);
ByteArrayOutputStream bos = new ByteArrayOutputStream();
- IOUtils.copy(getResourceAsStream(TEST_FILE1), bos);
+ IOUtils.copy(getTestDocumentAsStream(TEST_FILE_NAME), bos);
streams[1] = new ByteArrayInputStream(bos.toByteArray());
- streams[2] = TikaInputStream.get(getResourceAsFile(TEST_FILE1));
+ streams[2] = TikaInputStream.get(tmp);
int tests = 0;
+ ParseContext context = new ParseContext();
+ context.set(Parser.class, p);
for (InputStream stream : streams) {
Metadata metadata = new Metadata();
metadata.set(Metadata.RESOURCE_NAME_KEY, TEST_FILE_NAME);
//1) getXML closes the stream
//2) getXML runs recursively on the contents, so the embedded docs should show up
- XMLResult result = getXML(stream, p, metadata);
+ XMLResult result = getXML(stream, p, metadata, context);
String x = result.xml;
//first table name
assertContains("<table name=\"my_table1\"><thead><tr>\t<th>INT_COL</th>", x);
@@ -106,7 +126,7 @@ public class SQLite3ParserTest extends TikaTest {
ContentHandler handler = new BodyContentHandler(-1);
ParseContext ctx = new ParseContext();
ctx.set(Parser.class, p);
- try (InputStream stream = getResourceAsStream(TEST_FILE1)) {
+ try (InputStream stream = getTestDocumentAsStream(TEST_FILE_NAME)) {
p.parse(stream, handler, metadata, ctx);
}
String s = handler.toString();
@@ -118,14 +138,11 @@ public class SQLite3ParserTest extends TikaTest {
//to handle embedded documents
@Test
public void testNotAddingEmbeddedParserToParseContext() throws Exception {
- Parser p = new AutoDetectParser();
- InputStream is = getResourceAsStream(TEST_FILE1);
Metadata metadata = new Metadata();
metadata.set(Metadata.RESOURCE_NAME_KEY, TEST_FILE_NAME);
- ContentHandler handler = new ToXMLContentHandler();
- p.parse(is, handler, metadata, new ParseContext());
- String xml = handler.toString();
+ XMLResult r = getXML(TEST_FILE_NAME, new AutoDetectParser(), new Metadata(), new ParseContext());
+ String xml = r.xml;
//just includes headers for embedded documents
assertContains("<table name=\"my_table1\"><thead><tr>", xml);
assertContains("<td><span type=\"blob\" column_name=\"BYTES_COL\" row_number=\"0\"><div class=\"package-entry\"><h1>BYTES_COL_0.doc</h1>", xml);
@@ -143,7 +160,7 @@ public class SQLite3ParserTest extends TikaTest {
RecursiveParserWrapper wrapper =
new RecursiveParserWrapper(p, new BasicContentHandlerFactory(
BasicContentHandlerFactory.HANDLER_TYPE.BODY, -1));
- InputStream is = getResourceAsStream(TEST_FILE1);
+ InputStream is = getTestDocumentAsStream(TEST_FILE_NAME);
Metadata metadata = new Metadata();
metadata.set(Metadata.RESOURCE_NAME_KEY, TEST_FILE_NAME);
wrapper.parse(is, new BodyContentHandler(-1), metadata, new ParseContext());
@@ -176,7 +193,7 @@ public class SQLite3ParserTest extends TikaTest {
ParserContainerExtractor ex = new ParserContainerExtractor();
ByteCopyingHandler byteCopier = new ByteCopyingHandler();
- InputStream is = getResourceAsStream(TEST_FILE1);
+ InputStream is = getTestDocumentAsStream(TEST_FILE_NAME);
Metadata metadata = new Metadata();
metadata.set(Metadata.RESOURCE_NAME_KEY, TEST_FILE_NAME);
ex.extract(TikaInputStream.get(is), ex, byteCopier);
@@ -217,9 +234,12 @@ public class SQLite3ParserTest extends TikaTest {
//4x word files, two docs and two docxs
//4x png files, the same image embedded in each of the doc and docx
+ //not clear why we get an exception on reset if we try
+ //to get the test file directly
ParserContainerExtractor ex = new ParserContainerExtractor();
InputStreamResettingHandler byteCopier = new InputStreamResettingHandler();
- InputStream is = getResourceAsStream(TEST_FILE1);
+ InputStream is = new BufferedInputStream(
+ getResourceAsStream("/test-documents/"+TEST_FILE_NAME));
Metadata metadata = new Metadata();
metadata.set(Metadata.RESOURCE_NAME_KEY, TEST_FILE_NAME);
ex.extract(TikaInputStream.get(is), ex, byteCopier);
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmExtraction.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmExtraction.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmExtraction.java
index 90a3c1a..5f53870 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmExtraction.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/chm/TestChmExtraction.java
@@ -20,11 +20,8 @@ import static java.nio.charset.StandardCharsets.ISO_8859_1;
import static org.junit.Assert.assertTrue;
import java.io.ByteArrayInputStream;
-import java.io.File;
-import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
-import java.net.URL;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
@@ -34,6 +31,7 @@ import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.regex.Pattern;
+import org.apache.tika.TikaTest;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
@@ -45,7 +43,7 @@ import org.apache.tika.sax.BodyContentHandler;
import org.junit.Test;
import org.xml.sax.SAXException;
-public class TestChmExtraction {
+public class TestChmExtraction extends TikaTest {
private final Parser parser = new ChmParser();
@@ -196,12 +194,19 @@ public class TestChmExtraction {
@Test
public void test_TIKA_1446() throws Exception {
- URL chmDir = TestChmExtraction.class.getResource("/test-documents/chm/");
- File chmFolder = new File(chmDir.toURI());
- for (String fileName : chmFolder.list()) {
- File file = new File(chmFolder, fileName);
- InputStream stream = new FileInputStream(file);
- testingChm(stream);
+ String[] chemFiles = {
+ "admin.chm",
+ "cmak_ops.CHM",
+ "comexp.CHM",
+ "gpedit.CHM",
+ "IMJPCL.CHM",
+ "IMJPCLE.CHM",
+ "IMTCEN.CHM",
+ "tcpip.CHM",
+ "wmicontrol.CHM"
+ };
+ for (String fileName : chemFiles) {
+ testingChm(getTestDocumentAsStream("chm/"+fileName));
}
}
}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
index 4f5bfcd..4b92e88 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
@@ -16,8 +16,6 @@
*/
package org.apache.tika.parser.microsoft;
-import static org.apache.tika.TikaTest.assertContains;
-import static org.apache.tika.TikaTest.assertNotContained;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
@@ -25,6 +23,7 @@ import static org.junit.Assert.fail;
import java.io.InputStream;
import java.util.Locale;
+import org.apache.tika.TikaTest;
import org.apache.tika.detect.DefaultDetector;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.EncryptedDocumentException;
@@ -41,155 +40,139 @@ import org.apache.tika.sax.BodyContentHandler;
import org.junit.Test;
import org.xml.sax.ContentHandler;
-public class ExcelParserTest {
+public class ExcelParserTest extends TikaTest {
@Test
@SuppressWarnings("deprecation") // Checks legacy Tika-1.0 style metadata keys
public void testExcelParser() throws Exception {
- try (InputStream input = ExcelParserTest.class.getResourceAsStream(
- "/test-documents/testEXCEL.xls")) {
- Metadata metadata = new Metadata();
- ContentHandler handler = new BodyContentHandler();
- ParseContext context = new ParseContext();
- context.set(Locale.class, Locale.US);
- new OfficeParser().parse(input, handler, metadata, context);
-
- assertEquals(
- "application/vnd.ms-excel",
- metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("Simple Excel document", metadata.get(TikaCoreProperties.TITLE));
- assertEquals("Keith Bennett", metadata.get(TikaCoreProperties.CREATOR));
- assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
-
- // Mon Oct 01 17:13:56 BST 2007
- assertEquals("2007-10-01T16:13:56Z", metadata.get(TikaCoreProperties.CREATED));
- assertEquals("2007-10-01T16:13:56Z", metadata.get(Metadata.CREATION_DATE));
- // Mon Oct 01 17:31:43 BST 2007
- assertEquals("2007-10-01T16:31:43Z", metadata.get(TikaCoreProperties.MODIFIED));
- assertEquals("2007-10-01T16:31:43Z", metadata.get(Metadata.DATE));
+ ParseContext context = new ParseContext();
+ context.set(Locale.class, Locale.US);
+ XMLResult r = getXML("testEXCEL.xls", new OfficeParser(), new Metadata(), context);
+
+ assertEquals(
+ "application/vnd.ms-excel",
+ r.metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("Simple Excel document", r.metadata.get(TikaCoreProperties.TITLE));
+ assertEquals("Keith Bennett", r.metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals("Keith Bennett", r.metadata.get(Metadata.AUTHOR));
+
+ // Mon Oct 01 17:13:56 BST 2007
+ assertEquals("2007-10-01T16:13:56Z", r.metadata.get(TikaCoreProperties.CREATED));
+ assertEquals("2007-10-01T16:13:56Z", r.metadata.get(Metadata.CREATION_DATE));
+
+ // Mon Oct 01 17:31:43 BST 2007
+ assertEquals("2007-10-01T16:31:43Z", r.metadata.get(TikaCoreProperties.MODIFIED));
+ assertEquals("2007-10-01T16:31:43Z", r.metadata.get(Metadata.DATE));
+
+ String content = r.xml;
+ assertContains("Sample Excel Worksheet", content);
+ assertContains("Numbers and their Squares", content);
+ assertContains("<tr>\t<td />\t<td>Number</td>\t<td>Square", content);
+ assertContains("9", content);
+ assertNotContained("9.0", content);
+ assertContains("196", content);
+ assertNotContained("196.0", content);
- String content = handler.toString();
- assertContains("Sample Excel Worksheet", content);
- assertContains("Numbers and their Squares", content);
- assertContains("\t\tNumber\tSquare", content);
- assertContains("9", content);
- assertNotContained("9.0", content);
- assertContains("196", content);
- assertNotContained("196.0", content);
- }
}
@Test
public void testExcelParserFormatting() throws Exception {
- try (InputStream input = ExcelParserTest.class.getResourceAsStream(
- "/test-documents/testEXCEL-formats.xls")) {
- Metadata metadata = new Metadata();
- ParseContext context = new ParseContext();
- context.set(Locale.class, Locale.US);
- ContentHandler handler = new BodyContentHandler();
- new OfficeParser().parse(input, handler, metadata, context);
-
- assertEquals(
- "application/vnd.ms-excel",
- metadata.get(Metadata.CONTENT_TYPE));
+ ParseContext context = new ParseContext();
+ context.set(Locale.class, Locale.US);
+ XMLResult r = getXML("testEXCEL-formats.xls", new OfficeParser(), new Metadata(), context);
+
+ assertEquals(
+ "application/vnd.ms-excel",
+ r.metadata.get(Metadata.CONTENT_TYPE));
+
+ String content = r.xml;
+
+ // Number #,##0.00
+ assertContains("1,599.99", content);
+ assertContains("-1,599.99", content);
+
+ // Currency $#,##0.00;[Red]($#,##0.00)
+ assertContains("$1,599.99", content);
+ assertContains("($1,599.99)", content);
+
+ // Scientific 0.00E+00
+ // poi <=3.8beta1 returns 1.98E08, newer versions return 1.98+E08
+ assertTrue(content.contains("1.98E08") || content.contains("1.98E+08"));
+ assertTrue(content.contains("-1.98E08") || content.contains("-1.98E+08"));
+
+ // Percentage.
+ assertContains("2.50%", content);
+ // Excel rounds up to 3%, but that requires Java 1.6 or later
+ if (System.getProperty("java.version").startsWith("1.5")) {
+ assertContains("2%", content);
+ } else {
+ assertContains("3%", content);
+ }
- String content = handler.toString();
+ // Time Format: h:mm
+ assertContains("6:15", content);
+ assertContains("18:15", content);
- // Number #,##0.00
- assertContains("1,599.99", content);
- assertContains("-1,599.99", content);
-
- // Currency $#,##0.00;[Red]($#,##0.00)
- assertContains("$1,599.99", content);
- assertContains("($1,599.99)", content);
-
- // Scientific 0.00E+00
- // poi <=3.8beta1 returns 1.98E08, newer versions return 1.98+E08
- assertTrue(content.contains("1.98E08") || content.contains("1.98E+08"));
- assertTrue(content.contains("-1.98E08") || content.contains("-1.98E+08"));
-
- // Percentage.
- assertContains("2.50%", content);
- // Excel rounds up to 3%, but that requires Java 1.6 or later
- if (System.getProperty("java.version").startsWith("1.5")) {
- assertContains("2%", content);
- } else {
- assertContains("3%", content);
- }
+ // Date Format: d-mmm-yy
+ assertContains("17-May-07", content);
- // Time Format: h:mm
- assertContains("6:15", content);
- assertContains("18:15", content);
+ // Date Format: m/d/yy
+ assertContains("10/3/09", content);
- // Date Format: d-mmm-yy
- assertContains("17-May-07", content);
+ // Date/Time Format: m/d/yy h:mm
+ assertContains("1/19/08 4:35", content);
- // Date Format: m/d/yy
- assertContains("10/3/09", content);
+ // Fraction (2.5): # ?/?
+ assertContains("2 1/2", content);
- // Date/Time Format: m/d/yy h:mm
- assertContains("1/19/08 4:35", content);
- // Fraction (2.5): # ?/?
- assertContains("2 1/2", content);
+ // Below assertions represent outstanding formatting issues to be addressed
+ // they are included to allow the issues to be progressed with the Apache POI
+ // team - See TIKA-103.
+ /*************************************************************************
+ // Custom Number (0 "dollars and" .00 "cents")
+ assertContains("19 dollars and .99 cents", content);
- // Below assertions represent outstanding formatting issues to be addressed
- // they are included to allow the issues to be progressed with the Apache POI
- // team - See TIKA-103.
+ // Custom Number ("At" h:mm AM/PM "on" dddd mmmm d"," yyyy)
+ assertContains("At 4:20 AM on Thursday May 17, 2007", content);
+ **************************************************************************/
- /*************************************************************************
- // Custom Number (0 "dollars and" .00 "cents")
- assertContains("19 dollars and .99 cents", content);
- // Custom Number ("At" h:mm AM/PM "on" dddd mmmm d"," yyyy)
- assertContains("At 4:20 AM on Thursday May 17, 2007", content);
- **************************************************************************/
-
- }
}
@Test
public void testExcelParserPassword() throws Exception {
- try (InputStream input = ExcelParserTest.class.getResourceAsStream(
- "/test-documents/testEXCEL_protected_passtika.xls")) {
- Metadata metadata = new Metadata();
- ContentHandler handler = new BodyContentHandler();
- ParseContext context = new ParseContext();
- context.set(Locale.class, Locale.US);
- new OfficeParser().parse(input, handler, metadata, context);
+ try {
+ XMLResult r = getXML("testEXCEL_protected_passtika.xls");
fail("Document is encrypted, shouldn't parse");
} catch (EncryptedDocumentException e) {
// Good
}
// Try again, this time with the password
- try (InputStream input = ExcelParserTest.class.getResourceAsStream(
- "/test-documents/testEXCEL_protected_passtika.xls")) {
- Metadata metadata = new Metadata();
- ContentHandler handler = new BodyContentHandler();
- ParseContext context = new ParseContext();
- context.set(Locale.class, Locale.US);
- context.set(PasswordProvider.class, new PasswordProvider() {
- @Override
- public String getPassword(Metadata metadata) {
- return "tika";
- }
- });
- new OfficeParser().parse(input, handler, metadata, context);
-
- assertEquals(
- "application/vnd.ms-excel",
- metadata.get(Metadata.CONTENT_TYPE));
-
- assertEquals(null, metadata.get(TikaCoreProperties.TITLE));
- assertEquals("Antoni", metadata.get(TikaCoreProperties.CREATOR));
- assertEquals("2011-11-25T09:52:48Z", metadata.get(TikaCoreProperties.CREATED));
+ ParseContext context = new ParseContext();
+ context.set(Locale.class, Locale.US);
+ context.set(PasswordProvider.class, new PasswordProvider() {
+ @Override
+ public String getPassword(Metadata metadata) {
+ return "tika";
+ }
+ });
+ XMLResult r = getXML("testEXCEL_protected_passtika.xls", new OfficeParser(), new Metadata(), context);
+
+ assertEquals(
+ "application/vnd.ms-excel",
+ r.metadata.get(Metadata.CONTENT_TYPE));
+
+ assertEquals(null, r.metadata.get(TikaCoreProperties.TITLE));
+ assertEquals("Antoni", r.metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals("2011-11-25T09:52:48Z", r.metadata.get(TikaCoreProperties.CREATED));
+
+ String content = r.xml;
+ assertContains("This is an Encrypted Excel spreadsheet", content);
+ assertNotContained("9.0", content);
- String content = handler.toString();
- assertContains("This is an Encrypted Excel spreadsheet", content);
- assertNotContained("9.0", content);
- }
}
/**
@@ -197,70 +180,48 @@ public class ExcelParserTest {
*/
@Test
public void testExcelParserCharts() throws Exception {
- try (InputStream input = ExcelParserTest.class.getResourceAsStream(
- "/test-documents/testEXCEL-charts.xls")) {
- Metadata metadata = new Metadata();
- ParseContext context = new ParseContext();
- context.set(Locale.class, Locale.US);
- ContentHandler handler = new BodyContentHandler();
- new OfficeParser().parse(input, handler, metadata, context);
- assertEquals(
- "application/vnd.ms-excel",
- metadata.get(Metadata.CONTENT_TYPE));
+ XMLResult r = getXML("testEXCEL-charts.xls", new OfficeParser());
+ assertEquals(
+ "application/vnd.ms-excel",
+ r.metadata.get(Metadata.CONTENT_TYPE));
- String content = handler.toString();
+ String content = r.xml;
+
+ // The first sheet has a pie chart
+ assertContains("charttabyodawg", content);
+ assertContains("WhamPuff", content);
+
+ // The second sheet has a bar chart and some text
+ assertContains("Sheet1", content);
+ assertContains("Test Excel Spreasheet", content);
+ assertContains("foo", content);
+ assertContains("bar", content);
+ assertContains("fizzlepuff", content);
+ assertContains("whyaxis", content);
+ assertContains("eksaxis", content);
+
+ // The third sheet has some text
+ assertContains("Sheet2", content);
+ assertContains("dingdong", content);
- // The first sheet has a pie chart
- assertContains("charttabyodawg", content);
- assertContains("WhamPuff", content);
-
- // The second sheet has a bar chart and some text
- assertContains("Sheet1", content);
- assertContains("Test Excel Spreasheet", content);
- assertContains("foo", content);
- assertContains("bar", content);
- assertContains("fizzlepuff", content);
- assertContains("whyaxis", content);
- assertContains("eksaxis", content);
-
- // The third sheet has some text
- assertContains("Sheet2", content);
- assertContains("dingdong", content);
- }
}
@Test
public void testJXL() throws Exception {
- try (InputStream input = ExcelParserTest.class.getResourceAsStream(
- "/test-documents/jxl.xls")) {
- Metadata metadata = new Metadata();
- ContentHandler handler = new BodyContentHandler(-1);
- ParseContext context = new ParseContext();
- context.set(Locale.class, Locale.US);
- new OfficeParser().parse(input, handler, metadata, context);
- assertEquals(
- "application/vnd.ms-excel",
- metadata.get(Metadata.CONTENT_TYPE));
- String content = handler.toString();
- assertContains("Number Formats", content);
- }
+ XMLResult r = getXML("jxl.xls", new OfficeParser());
+ assertEquals(
+ "application/vnd.ms-excel",
+ r.metadata.get(Metadata.CONTENT_TYPE));
+ assertContains("Number Formats", r.xml);
+
}
@Test
public void testWorksSpreadsheet70() throws Exception {
- try (InputStream input = ExcelParserTest.class.getResourceAsStream(
- "/test-documents/testWORKSSpreadsheet7.0.xlr")) {
- Metadata metadata = new Metadata();
- ContentHandler handler = new BodyContentHandler(-1);
- ParseContext context = new ParseContext();
- context.set(Locale.class, Locale.US);
- new OfficeParser().parse(input, handler, metadata, context);
-
- String content = handler.toString();
- assertContains("Microsoft Works", content);
- }
+ assertContains("Microsoft Works",
+ getXML("testWORKSSpreadsheet7.0.xlr", new OfficeParser()).xml);
}
/**
@@ -278,8 +239,7 @@ public class ExcelParserTest {
// Should be detected correctly
MediaType type;
- try (InputStream input = ExcelParserTest.class.getResourceAsStream(
- "/test-documents/testEXCEL.xlsb")) {
+ try (InputStream input = getTestDocumentAsStream("testEXCEL.xlsb")) {
type = detector.detect(input, m);
assertEquals("application/vnd.ms-excel.sheet.binary.macroenabled.12", type.toString());
}
@@ -291,15 +251,8 @@ public class ExcelParserTest {
assertEquals(false, (new OOXMLParser()).getSupportedTypes(new ParseContext()).contains(type));
// AutoDetectParser doesn't break on it
- try (InputStream input = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL.xlsb")) {
- ContentHandler handler = new BodyContentHandler(-1);
- ParseContext context = new ParseContext();
- context.set(Locale.class, Locale.US);
- parser.parse(input, handler, m, context);
+ assertContains("<body />", getXML("testEXCEL.xlsb").xml);
- String content = handler.toString();
- assertEquals("", content);
- }
}
/**
@@ -315,7 +268,7 @@ public class ExcelParserTest {
// First try detection of Excel 5
m = new Metadata();
m.add(Metadata.RESOURCE_NAME_KEY, "excel_5.xls");
- try (InputStream input = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL_5.xls")) {
+ try (InputStream input = getTestDocumentAsStream("testEXCEL_5.xls")) {
type = detector.detect(input, m);
assertEquals("application/vnd.ms-excel", type.toString());
}
@@ -323,7 +276,7 @@ public class ExcelParserTest {
// Now Excel 95
m = new Metadata();
m.add(Metadata.RESOURCE_NAME_KEY, "excel_95.xls");
- try (InputStream input = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL_95.xls")) {
+ try (InputStream input = getTestDocumentAsStream("testEXCEL_95.xls")) {
type = detector.detect(input, m);
assertEquals("application/vnd.ms-excel", type.toString());
}
@@ -337,7 +290,7 @@ public class ExcelParserTest {
// Parse the Excel 5 file
m = new Metadata();
- try (InputStream input = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL_5.xls")) {
+ try (InputStream input = getTestDocumentAsStream("testEXCEL_5.xls")) {
ContentHandler handler = new BodyContentHandler(-1);
ParseContext context = new ParseContext();
context.set(Locale.class, Locale.US);
@@ -364,7 +317,7 @@ public class ExcelParserTest {
// Parse the Excel 95 file
m = new Metadata();
- try (InputStream input = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL_95.xls")) {
+ try (InputStream input = getTestDocumentAsStream("testEXCEL_95.xls")) {
ContentHandler handler = new BodyContentHandler(-1);
ParseContext context = new ParseContext();
context.set(Locale.class, Locale.US);
@@ -388,16 +341,11 @@ public class ExcelParserTest {
*/
@Test
public void testCustomProperties() throws Exception {
- Metadata metadata = new Metadata();
-
- try (InputStream input = ExcelParserTest.class.getResourceAsStream(
- "/test-documents/testEXCEL_custom_props.xls")) {
- ContentHandler handler = new BodyContentHandler(-1);
- ParseContext context = new ParseContext();
- context.set(Locale.class, Locale.US);
- new OfficeParser().parse(input, handler, metadata, context);
- }
+ ParseContext context = new ParseContext();
+ context.set(Locale.class, Locale.US);
+ XMLResult r = getXML("testEXCEL_custom_props.xls", new OfficeParser(), new Metadata(), context);
+ Metadata metadata = r.metadata;
assertEquals("application/vnd.ms-excel", metadata.get(Metadata.CONTENT_TYPE));
assertEquals("", metadata.get(TikaCoreProperties.CREATOR));
assertEquals("", metadata.get(TikaCoreProperties.MODIFIER));
@@ -413,31 +361,30 @@ public class ExcelParserTest {
@Test
public void testHeaderAndFooterExtraction() throws Exception {
- try (InputStream input = ExcelParserTest.class.getResourceAsStream(
- "/test-documents/testEXCEL_headers_footers.xls")) {
- Metadata metadata = new Metadata();
- ContentHandler handler = new BodyContentHandler();
- ParseContext context = new ParseContext();
- context.set(Locale.class, Locale.UK);
- new OfficeParser().parse(input, handler, metadata, context);
+ ParseContext context = new ParseContext();
+ context.set(Locale.class, Locale.UK);
+
+ XMLResult r = getXML("testEXCEL_headers_footers.xls", new OfficeParser(),
+ new Metadata(), context);
+
+ Metadata metadata = r.metadata;
+ assertEquals(
+ "application/vnd.ms-excel",
+ metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("Internal spreadsheet", metadata.get(TikaCoreProperties.TITLE));
+ assertEquals("Aeham Abushwashi", metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals("Aeham Abushwashi", metadata.get(Metadata.AUTHOR));
+
+ String content = r.xml;
+ assertContains("John Smith1", content);
+ assertContains("John Smith50", content);
+ assertContains("1 Corporate HQ", content);
+ assertContains("Header - Corporate Spreadsheet", content);
+ assertContains("Header - For Internal Use Only", content);
+ assertContains("Header - Author: John Smith", content);
+ assertContains("Footer - Corporate Spreadsheet", content);
+ assertContains("Footer - For Internal Use Only", content);
+ assertContains("Footer - Author: John Smith", content);
- assertEquals(
- "application/vnd.ms-excel",
- metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("Internal spreadsheet", metadata.get(TikaCoreProperties.TITLE));
- assertEquals("Aeham Abushwashi", metadata.get(TikaCoreProperties.CREATOR));
- assertEquals("Aeham Abushwashi", metadata.get(Metadata.AUTHOR));
-
- String content = handler.toString();
- assertContains("John Smith1", content);
- assertContains("John Smith50", content);
- assertContains("1 Corporate HQ", content);
- assertContains("Header - Corporate Spreadsheet", content);
- assertContains("Header - For Internal Use Only", content);
- assertContains("Header - Author: John Smith", content);
- assertContains("Footer - Corporate Spreadsheet", content);
- assertContains("Footer - For Internal Use Only", content);
- assertContains("Footer - Author: John Smith", content);
- }
}
}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
index 8a7c202..3cfda82 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
@@ -17,9 +17,11 @@
package org.apache.tika.parser.odf;
import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardCopyOption;
import org.apache.tika.TikaTest;
import org.apache.tika.io.TikaInputStream;
@@ -27,7 +29,6 @@ import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.OfficeOpenXMLCore;
import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.opendocument.OpenOfficeParser;
@@ -50,270 +51,235 @@ public class ODFParserTest extends TikaTest {
@Test
public void testOO3() throws Exception {
for (Parser parser : getParsers()) {
- try (InputStream input = ODFParserTest.class.getResourceAsStream(
- "/test-documents/testODFwithOOo3.odt")) {
- Metadata metadata = new Metadata();
- ContentHandler handler = new BodyContentHandler();
- parser.parse(input, handler, metadata, new ParseContext());
-
- assertEquals(
- "application/vnd.oasis.opendocument.text",
- metadata.get(Metadata.CONTENT_TYPE));
-
- String content = handler.toString();
- assertContains("Tika is part of the Lucene project.", content);
- assertContains("Solr", content);
- assertContains("one embedded", content);
- assertContains("Rectangle Title", content);
- assertContains("a blue background and dark border", content);
- }
+ XMLResult r = getXML("testODFwithOOo3.odt", parser);
+ assertEquals(
+ "application/vnd.oasis.opendocument.text",
+ r.metadata.get(Metadata.CONTENT_TYPE));
+
+ String content = r.xml;
+ assertContains("Tika is part of the Lucene project.", content);
+ assertContains("Solr", content);
+ assertContains("one embedded", content);
+ assertContains("Rectangle Title", content);
+ assertContains("a blue background and dark border", content);
+
}
}
@Test
public void testOO2() throws Exception {
- for (Parser parser : getParsers()) {
- try (InputStream input = ODFParserTest.class.getResourceAsStream(
- "/test-documents/testOpenOffice2.odt")) {
- Metadata metadata = new Metadata();
- ContentHandler handler = new BodyContentHandler();
- parser.parse(input, handler, metadata, new ParseContext());
-
- assertEquals(
- "application/vnd.oasis.opendocument.text",
- metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("en-US", metadata.get(Metadata.LANGUAGE));
- assertEquals("PT1M7S", metadata.get(Metadata.EDIT_TIME));
- assertEquals(
- "NeoOffice/2.2$Unix OpenOffice.org_project/680m18$Build-9161",
- metadata.get("generator"));
-
- // Check date metadata, both old-style and new-style
- assertEquals("2007-09-14T11:07:10", metadata.get(TikaCoreProperties.MODIFIED));
- assertEquals("2007-09-14T11:07:10", metadata.get(Metadata.MODIFIED));
- assertEquals("2007-09-14T11:07:10", metadata.get(Metadata.DATE));
- assertEquals("2007-09-14T11:06:08", metadata.get(TikaCoreProperties.CREATED));
- assertEquals("2007-09-14T11:06:08", metadata.get(Metadata.CREATION_DATE));
-
- // Check the document statistics
- assertEquals("1", metadata.get(Office.PAGE_COUNT));
- assertEquals("1", metadata.get(Office.PARAGRAPH_COUNT));
- assertEquals("14", metadata.get(Office.WORD_COUNT));
- assertEquals("78", metadata.get(Office.CHARACTER_COUNT));
- assertEquals("0", metadata.get(Office.TABLE_COUNT));
- assertEquals("0", metadata.get(Office.OBJECT_COUNT));
- assertEquals("0", metadata.get(Office.IMAGE_COUNT));
-
- // Check the Tika-1.0 style document statistics
- assertEquals("1", metadata.get(Metadata.PAGE_COUNT));
- assertEquals("1", metadata.get(Metadata.PARAGRAPH_COUNT));
- assertEquals("14", metadata.get(Metadata.WORD_COUNT));
- assertEquals("78", metadata.get(Metadata.CHARACTER_COUNT));
- assertEquals("0", metadata.get(Metadata.TABLE_COUNT));
- assertEquals("0", metadata.get(Metadata.OBJECT_COUNT));
- assertEquals("0", metadata.get(Metadata.IMAGE_COUNT));
-
- // Check the very old style statistics (these will be removed shortly)
- assertEquals("0", metadata.get("nbTab"));
- assertEquals("0", metadata.get("nbObject"));
- assertEquals("0", metadata.get("nbImg"));
- assertEquals("1", metadata.get("nbPage"));
- assertEquals("1", metadata.get("nbPara"));
- assertEquals("14", metadata.get("nbWord"));
- assertEquals("78", metadata.get("nbCharacter"));
-
- // Custom metadata tags present but without values
- assertEquals(null, metadata.get("custom:Info 1"));
- assertEquals(null, metadata.get("custom:Info 2"));
- assertEquals(null, metadata.get("custom:Info 3"));
- assertEquals(null, metadata.get("custom:Info 4"));
-
- String content = handler.toString();
- assertTrue(content.contains(
- "This is a sample Open Office document,"
- + " written in NeoOffice 2.2.1 for the Mac."));
- }
- }
- }
-
- /**
- * Similar to {@link #testXMLParser()}, but using a different
- * OO2 file with different metadata in it
- */
- @Test
- public void testOO2Metadata() throws Exception {
- try (InputStream input = ODFParserTest.class.getResourceAsStream(
- "/test-documents/testOpenOffice2.odf")) {
- Metadata metadata = new Metadata();
- ContentHandler handler = new BodyContentHandler();
- new OpenDocumentParser().parse(input, handler, metadata);
-
- assertEquals(
- "application/vnd.oasis.opendocument.formula",
- metadata.get(Metadata.CONTENT_TYPE));
- assertEquals(null, metadata.get(TikaCoreProperties.MODIFIED));
- assertEquals("2006-01-27T11:55:22", metadata.get(Metadata.CREATION_DATE));
- assertEquals("The quick brown fox jumps over the lazy dog",
- metadata.get(TikaCoreProperties.TITLE));
- assertEquals("Gym class featuring a brown fox and lazy dog",
- metadata.get(TikaCoreProperties.DESCRIPTION));
- assertEquals("Gym class featuring a brown fox and lazy dog",
- metadata.get(OfficeOpenXMLCore.SUBJECT));
- assertEquals("Gym class featuring a brown fox and lazy dog",
- metadata.get(Metadata.SUBJECT));
- assertEquals("PT0S", metadata.get(Metadata.EDIT_TIME));
- assertEquals("1", metadata.get("editing-cycles"));
- assertEquals(
- "OpenOffice.org/2.2$Win32 OpenOffice.org_project/680m14$Build-9134",
- metadata.get("generator"));
- assertEquals("Pangram, fox, dog", metadata.get(Metadata.KEYWORDS));
-
- // User defined metadata
- assertEquals("Text 1", metadata.get("custom:Info 1"));
- assertEquals("2", metadata.get("custom:Info 2"));
- assertEquals("false", metadata.get("custom:Info 3"));
- assertEquals("true", metadata.get("custom:Info 4"));
-
- // No statistics present
- assertEquals(null, metadata.get(Metadata.PAGE_COUNT));
- assertEquals(null, metadata.get(Metadata.PARAGRAPH_COUNT));
- assertEquals(null, metadata.get(Metadata.WORD_COUNT));
- assertEquals(null, metadata.get(Metadata.CHARACTER_COUNT));
- assertEquals(null, metadata.get(Metadata.TABLE_COUNT));
- assertEquals(null, metadata.get(Metadata.OBJECT_COUNT));
- assertEquals(null, metadata.get(Metadata.IMAGE_COUNT));
- assertEquals(null, metadata.get("nbTab"));
- assertEquals(null, metadata.get("nbObject"));
- assertEquals(null, metadata.get("nbImg"));
- assertEquals(null, metadata.get("nbPage"));
- assertEquals(null, metadata.get("nbPara"));
- assertEquals(null, metadata.get("nbWord"));
- assertEquals(null, metadata.get("nbCharacter"));
-
- // Note - contents of maths files not currently supported
- String content = handler.toString();
- assertEquals("", content);
- }
- }
-
- /**
- * Similar to {@link #testXMLParser()}, but using an OO3 file
- */
- @Test
- public void testOO3Metadata() throws Exception {
- try (InputStream input = ODFParserTest.class.getResourceAsStream(
- "/test-documents/testODFwithOOo3.odt")) {
- Metadata metadata = new Metadata();
- ContentHandler handler = new BodyContentHandler();
- new OpenDocumentParser().parse(input, handler, metadata);
-
+ for (Parser parser : getParsers()) {
+ XMLResult r = getXML("testOpenOffice2.odt", parser);
+ Metadata metadata = r.metadata;
assertEquals(
"application/vnd.oasis.opendocument.text",
metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("2009-10-05T21:22:38", metadata.get(TikaCoreProperties.MODIFIED));
- assertEquals("2009-10-05T19:04:01", metadata.get(TikaCoreProperties.CREATED));
- assertEquals("2009-10-05T19:04:01", metadata.get(Metadata.CREATION_DATE));
- assertEquals("Apache Tika", metadata.get(TikaCoreProperties.TITLE));
- assertEquals("Test document", metadata.get(OfficeOpenXMLCore.SUBJECT));
- assertEquals("Test document", metadata.get(Metadata.SUBJECT));
- assertEquals("A rather complex document", metadata.get(TikaCoreProperties.DESCRIPTION));
- assertEquals("Bart Hanssens", metadata.get(TikaCoreProperties.CREATOR));
- assertEquals("Bart Hanssens", metadata.get("initial-creator"));
- assertEquals("2", metadata.get("editing-cycles"));
- assertEquals("PT02H03M24S", metadata.get(Metadata.EDIT_TIME));
+ assertEquals("en-US", metadata.get(Metadata.LANGUAGE));
+ assertEquals("PT1M7S", metadata.get(Metadata.EDIT_TIME));
assertEquals(
- "OpenOffice.org/3.1$Unix OpenOffice.org_project/310m19$Build-9420",
+ "NeoOffice/2.2$Unix OpenOffice.org_project/680m18$Build-9161",
metadata.get("generator"));
- assertEquals("Apache, Lucene, Tika", metadata.get(Metadata.KEYWORDS));
- // User defined metadata
- assertEquals("Bart Hanssens", metadata.get("custom:Editor"));
- assertEquals(null, metadata.get("custom:Info 2"));
- assertEquals(null, metadata.get("custom:Info 3"));
- assertEquals(null, metadata.get("custom:Info 4"));
+ // Check date metadata, both old-style and new-style
+ assertEquals("2007-09-14T11:07:10", metadata.get(TikaCoreProperties.MODIFIED));
+ assertEquals("2007-09-14T11:07:10", metadata.get(Metadata.MODIFIED));
+ assertEquals("2007-09-14T11:07:10", metadata.get(Metadata.DATE));
+ assertEquals("2007-09-14T11:06:08", metadata.get(TikaCoreProperties.CREATED));
+ assertEquals("2007-09-14T11:06:08", metadata.get(Metadata.CREATION_DATE));
// Check the document statistics
- assertEquals("2", metadata.get(Office.PAGE_COUNT));
- assertEquals("13", metadata.get(Office.PARAGRAPH_COUNT));
- assertEquals("54", metadata.get(Office.WORD_COUNT));
- assertEquals("351", metadata.get(Office.CHARACTER_COUNT));
+ assertEquals("1", metadata.get(Office.PAGE_COUNT));
+ assertEquals("1", metadata.get(Office.PARAGRAPH_COUNT));
+ assertEquals("14", metadata.get(Office.WORD_COUNT));
+ assertEquals("78", metadata.get(Office.CHARACTER_COUNT));
assertEquals("0", metadata.get(Office.TABLE_COUNT));
- assertEquals("2", metadata.get(Office.OBJECT_COUNT));
+ assertEquals("0", metadata.get(Office.OBJECT_COUNT));
assertEquals("0", metadata.get(Office.IMAGE_COUNT));
// Check the Tika-1.0 style document statistics
- assertEquals("2", metadata.get(Metadata.PAGE_COUNT));
- assertEquals("13", metadata.get(Metadata.PARAGRAPH_COUNT));
- assertEquals("54", metadata.get(Metadata.WORD_COUNT));
- assertEquals("351", metadata.get(Metadata.CHARACTER_COUNT));
+ assertEquals("1", metadata.get(Metadata.PAGE_COUNT));
+ assertEquals("1", metadata.get(Metadata.PARAGRAPH_COUNT));
+ assertEquals("14", metadata.get(Metadata.WORD_COUNT));
+ assertEquals("78", metadata.get(Metadata.CHARACTER_COUNT));
assertEquals("0", metadata.get(Metadata.TABLE_COUNT));
- assertEquals("2", metadata.get(Metadata.OBJECT_COUNT));
+ assertEquals("0", metadata.get(Metadata.OBJECT_COUNT));
assertEquals("0", metadata.get(Metadata.IMAGE_COUNT));
- // Check the old style statistics (these will be removed shortly)
+ // Check the very old style statistics (these will be removed shortly)
assertEquals("0", metadata.get("nbTab"));
- assertEquals("2", metadata.get("nbObject"));
+ assertEquals("0", metadata.get("nbObject"));
assertEquals("0", metadata.get("nbImg"));
- assertEquals("2", metadata.get("nbPage"));
- assertEquals("13", metadata.get("nbPara"));
- assertEquals("54", metadata.get("nbWord"));
- assertEquals("351", metadata.get("nbCharacter"));
+ assertEquals("1", metadata.get("nbPage"));
+ assertEquals("1", metadata.get("nbPara"));
+ assertEquals("14", metadata.get("nbWord"));
+ assertEquals("78", metadata.get("nbCharacter"));
+
+ // Custom metadata tags present but without values
+ assertEquals(null, metadata.get("custom:Info 1"));
+ assertEquals(null, metadata.get("custom:Info 2"));
+ assertEquals(null, metadata.get("custom:Info 3"));
+ assertEquals(null, metadata.get("custom:Info 4"));
+
+ assertContains(
+ "This is a sample Open Office document,"
+ + " written in NeoOffice 2.2.1 for the Mac.",
+ r.xml);
- String content = handler.toString();
- assertTrue(content.contains(
- "Apache Tika Tika is part of the Lucene project."
- ));
}
}
+ /**
+ * Similar to {@link #testXMLParser()}, but using a different
+ * OO2 file with different metadata in it
+ */
@Test
- public void testODPMasterFooter() throws Exception {
- try (InputStream input = ODFParserTest.class.getResourceAsStream(
- "/test-documents/testMasterFooter.odp")) {
- Metadata metadata = new Metadata();
- ContentHandler handler = new BodyContentHandler();
- new AutoDetectParser().parse(input, handler, metadata);
+ public void testOO2Metadata() throws Exception {
+ XMLResult r = getXML("testOpenOffice2.odf", new OpenDocumentParser());
+ Metadata metadata = r.metadata;
+ assertEquals(
+ "application/vnd.oasis.opendocument.formula",
+ metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals(null, metadata.get(TikaCoreProperties.MODIFIED));
+ assertEquals("2006-01-27T11:55:22", metadata.get(Metadata.CREATION_DATE));
+ assertEquals("The quick brown fox jumps over the lazy dog",
+ metadata.get(TikaCoreProperties.TITLE));
+ assertEquals("Gym class featuring a brown fox and lazy dog",
+ metadata.get(TikaCoreProperties.DESCRIPTION));
+ assertEquals("Gym class featuring a brown fox and lazy dog",
+ metadata.get(OfficeOpenXMLCore.SUBJECT));
+ assertEquals("Gym class featuring a brown fox and lazy dog",
+ metadata.get(Metadata.SUBJECT));
+ assertEquals("PT0S", metadata.get(Metadata.EDIT_TIME));
+ assertEquals("1", metadata.get("editing-cycles"));
+ assertEquals(
+ "OpenOffice.org/2.2$Win32 OpenOffice.org_project/680m14$Build-9134",
+ metadata.get("generator"));
+ assertEquals("Pangram, fox, dog", metadata.get(Metadata.KEYWORDS));
+
+ // User defined metadata
+ assertEquals("Text 1", metadata.get("custom:Info 1"));
+ assertEquals("2", metadata.get("custom:Info 2"));
+ assertEquals("false", metadata.get("custom:Info 3"));
+ assertEquals("true", metadata.get("custom:Info 4"));
+
+ // No statistics present
+ assertEquals(null, metadata.get(Metadata.PAGE_COUNT));
+ assertEquals(null, metadata.get(Metadata.PARAGRAPH_COUNT));
+ assertEquals(null, metadata.get(Metadata.WORD_COUNT));
+ assertEquals(null, metadata.get(Metadata.CHARACTER_COUNT));
+ assertEquals(null, metadata.get(Metadata.TABLE_COUNT));
+ assertEquals(null, metadata.get(Metadata.OBJECT_COUNT));
+ assertEquals(null, metadata.get(Metadata.IMAGE_COUNT));
+ assertEquals(null, metadata.get("nbTab"));
+ assertEquals(null, metadata.get("nbObject"));
+ assertEquals(null, metadata.get("nbImg"));
+ assertEquals(null, metadata.get("nbPage"));
+ assertEquals(null, metadata.get("nbPara"));
+ assertEquals(null, metadata.get("nbWord"));
+ assertEquals(null, metadata.get("nbCharacter"));
+
+ // Note - contents of maths files not currently supported
+ assertContains("<body />", r.xml);
- String content = handler.toString();
- assertContains("Master footer is here", content);
- }
- }
+ }
+ /**
+ * Similar to {@link #testXMLParser()}, but using an OO3 file
+ */
@Test
- public void testODTFooter() throws Exception {
- try (InputStream input = ODFParserTest.class.getResourceAsStream(
- "/test-documents/testFooter.odt")) {
- Metadata metadata = new Metadata();
- ContentHandler handler = new BodyContentHandler();
- new AutoDetectParser().parse(input, handler, metadata);
+ public void testOO3Metadata() throws Exception {
+ XMLResult r = getXML("testODFwithOOo3.odt", new OpenDocumentParser());
+ Metadata metadata = r.metadata;
+ assertEquals(
+ "application/vnd.oasis.opendocument.text",
+ metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("2009-10-05T21:22:38", metadata.get(TikaCoreProperties.MODIFIED));
+ assertEquals("2009-10-05T19:04:01", metadata.get(TikaCoreProperties.CREATED));
+ assertEquals("2009-10-05T19:04:01", metadata.get(Metadata.CREATION_DATE));
+ assertEquals("Apache Tika", metadata.get(TikaCoreProperties.TITLE));
+ assertEquals("Test document", metadata.get(OfficeOpenXMLCore.SUBJECT));
+ assertEquals("Test document", metadata.get(Metadata.SUBJECT));
+ assertEquals("A rather complex document", metadata.get(TikaCoreProperties.DESCRIPTION));
+ assertEquals("Bart Hanssens", metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals("Bart Hanssens", metadata.get("initial-creator"));
+ assertEquals("2", metadata.get("editing-cycles"));
+ assertEquals("PT02H03M24S", metadata.get(Metadata.EDIT_TIME));
+ assertEquals(
+ "OpenOffice.org/3.1$Unix OpenOffice.org_project/310m19$Build-9420",
+ metadata.get("generator"));
+ assertEquals("Apache, Lucene, Tika", metadata.get(Metadata.KEYWORDS));
+
+ // User defined metadata
+ assertEquals("Bart Hanssens", metadata.get("custom:Editor"));
+ assertEquals(null, metadata.get("custom:Info 2"));
+ assertEquals(null, metadata.get("custom:Info 3"));
+ assertEquals(null, metadata.get("custom:Info 4"));
+
+ // Check the document statistics
+ assertEquals("2", metadata.get(Office.PAGE_COUNT));
+ assertEquals("13", metadata.get(Office.PARAGRAPH_COUNT));
+ assertEquals("54", metadata.get(Office.WORD_COUNT));
+ assertEquals("351", metadata.get(Office.CHARACTER_COUNT));
+ assertEquals("0", metadata.get(Office.TABLE_COUNT));
+ assertEquals("2", metadata.get(Office.OBJECT_COUNT));
+ assertEquals("0", metadata.get(Office.IMAGE_COUNT));
+
+ // Check the Tika-1.0 style document statistics
+ assertEquals("2", metadata.get(Metadata.PAGE_COUNT));
+ assertEquals("13", metadata.get(Metadata.PARAGRAPH_COUNT));
+ assertEquals("54", metadata.get(Metadata.WORD_COUNT));
+ assertEquals("351", metadata.get(Metadata.CHARACTER_COUNT));
+ assertEquals("0", metadata.get(Metadata.TABLE_COUNT));
+ assertEquals("2", metadata.get(Metadata.OBJECT_COUNT));
+ assertEquals("0", metadata.get(Metadata.IMAGE_COUNT));
+
+ // Check the old style statistics (these will be removed shortly)
+ assertEquals("0", metadata.get("nbTab"));
+ assertEquals("2", metadata.get("nbObject"));
+ assertEquals("0", metadata.get("nbImg"));
+ assertEquals("2", metadata.get("nbPage"));
+ assertEquals("13", metadata.get("nbPara"));
+ assertEquals("54", metadata.get("nbWord"));
+ assertEquals("351", metadata.get("nbCharacter"));
+
+ assertContains(
+ "Tika is part of the Lucene project.", r.xml);
- String content = handler.toString();
- assertContains("Here is some text...", content);
- assertContains("Here is some text on page 2", content);
- assertContains("Here is footer text", content);
- }
- }
+
+ }
+
+ @Test
+ public void testODPMasterFooter() throws Exception {
+ assertContains("Master footer is here",
+ getXML("testMasterFooter.odp").xml);
+ }
+
+ @Test
+ public void testODTFooter() throws Exception {
+ XMLResult r = getXML("testFooter.odt");
+ assertContains("Here is some text...", r.xml);
+ assertContains("Here is some text on page 2", r.xml);
+ assertContains("Here is footer text", r.xml);
+ }
@Test
public void testODSFooter() throws Exception {
- try (InputStream input = ODFParserTest.class.getResourceAsStream(
- "/test-documents/testFooter.ods")) {
- Metadata metadata = new Metadata();
- ContentHandler handler = new BodyContentHandler();
- new AutoDetectParser().parse(input, handler, metadata);
+ assertContains("Here is a footer in the center area",
+ getXML("testFooter.ods").xml);
- String content = handler.toString();
- assertContains("Here is a footer in the center area", content);
- }
}
@Test
public void testFromFile() throws Exception {
- try (TikaInputStream tis = TikaInputStream.get(this.getClass().getResource(
- "/test-documents/testODFwithOOo3.odt"))) {
- assertEquals(true, tis.hasFile());
- OpenDocumentParser parser = new OpenDocumentParser();
+ OpenDocumentParser parser = new OpenDocumentParser();
+ Path tmp = null;
+ try {
+ tmp = Files.createTempFile("test-odf-", ".odt");
+ Files.copy(getTestDocumentAsStream("testODFwithOOo3.odt"), tmp,
+ StandardCopyOption.REPLACE_EXISTING);
Metadata metadata = new Metadata();
+ TikaInputStream tis = TikaInputStream.get(tmp, metadata);
+ assertEquals(true, tis.hasFile());
ContentHandler handler = new BodyContentHandler();
parser.parse(tis, handler, metadata, new ParseContext());
@@ -323,25 +289,20 @@ public class ODFParserTest extends TikaTest {
String content = handler.toString();
assertContains("Tika is part of the Lucene project.", content);
+ } finally {
+ Files.delete(tmp);
}
}
-
+
@Test
public void testNPEFromFile() throws Exception {
- OpenDocumentParser parser = new OpenDocumentParser();
- try (TikaInputStream tis = TikaInputStream.get(this.getClass().getResource(
- "/test-documents/testNPEOpenDocument.odt"))) {
- Metadata metadata = new Metadata();
- ContentHandler handler = new BodyContentHandler();
- parser.parse(tis, handler, metadata, new ParseContext());
+ XMLResult r = getXML("testNPEOpenDocument.odt", new OpenDocumentParser());
+ assertEquals(
+ "application/vnd.oasis.opendocument.text",
+ r.metadata.get(Metadata.CONTENT_TYPE));
- assertEquals(
- "application/vnd.oasis.opendocument.text",
- metadata.get(Metadata.CONTENT_TYPE));
+ assertContains("primero hay que generar un par de claves", r.xml);
- String content = handler.toString();
- assertContains("primero hay que generar un par de claves", content);
- }
}
// TIKA-1063: Test basic style support.
@@ -359,20 +320,17 @@ public class ODFParserTest extends TikaTest {
//TIKA-1600: Test that null pointer doesn't break parsing.
@Test
public void testNullStylesInODTFooter() throws Exception {
- Parser parser = new OpenDocumentParser();
- try (InputStream input = ODFParserTest.class.getResourceAsStream("/test-documents/testODT-TIKA-6000.odt")) {
- Metadata metadata = new Metadata();
- ContentHandler handler = new BodyContentHandler();
- parser.parse(input, handler, metadata, new ParseContext());
- assertEquals("application/vnd.oasis.opendocument.text", metadata.get(Metadata.CONTENT_TYPE));
+ XMLResult r = getXML("testODT-TIKA-6000.odt", new OpenDocumentParser(), new Metadata(), new ParseContext());
- String content = handler.toString();
+ assertEquals("application/vnd.oasis.opendocument.text", r.metadata.get(Metadata.CONTENT_TYPE));
+
+ String content = r.xml;
+
+ assertContains("Utilisation de ce document", content);
+ assertContains("Copyright and License", content);
+ assertContains("Changer la langue", content);
+ assertContains("La page d’accueil permet de faire une recherche simple", content);
- assertContains("Utilisation de ce document", content);
- assertContains("Copyright and License", content);
- assertContains("Changer la langue", content);
- assertContains("La page d’accueil permet de faire une recherche simple", content);
- }
}
}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
index 365de77..dc75be5 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
@@ -17,15 +17,11 @@
package org.apache.tika.parser.rtf;
import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;
-import java.io.File;
-import java.io.FileInputStream;
import java.io.InputStream;
-import java.io.StringWriter;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
@@ -49,7 +45,6 @@ import org.apache.tika.parser.Parser;
import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.sax.BodyContentHandler;
-import org.apache.tika.sax.WriteOutContentHandler;
import org.junit.Test;
import org.xml.sax.ContentHandler;
@@ -62,117 +57,98 @@ public class RTFParserTest extends TikaTest {
@Test
public void testBasicExtraction() throws Exception {
- File file = getResourceAsFile("/test-documents/testRTF.rtf");
- Metadata metadata = new Metadata();
- StringWriter writer = new StringWriter();
- tika.getParser().parse(
- new FileInputStream(file),
- new WriteOutContentHandler(writer),
- metadata,
- new ParseContext());
- String content = writer.toString();
-
- assertEquals("application/rtf", metadata.get(Metadata.CONTENT_TYPE));
- assertEquals(1, metadata.getValues(Metadata.CONTENT_TYPE).length);
- assertContains("Test", content);
- assertContains("indexation Word", content);
+ XMLResult r = getXML("testRTF.rtf");
+ assertEquals("application/rtf", r.metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals(1, r.metadata.getValues(Metadata.CONTENT_TYPE).length);
+ assertContains("Test", r.xml);
+ assertContains("indexation Word", r.xml);
}
@Test
public void testUmlautSpacesExtraction2() throws Exception {
- String content = getText("testRTFUmlautSpaces2.rtf");
- content = content.replaceAll("\\s+", "");
- assertEquals("\u00DCbersicht", content);
+ assertContains("<p>\u00DCbersicht</p>",
+ getXML("testRTFUmlautSpaces2.rtf").xml);
}
@Test
public void testUnicodeUCNControlWordCharacterDoublingExtraction() throws Exception {
- String content = getText("testRTFUnicodeUCNControlWordCharacterDoubling.rtf");
+ XMLResult r = getXML("testRTFUnicodeUCNControlWordCharacterDoubling.rtf");
- assertContains("\u5E74", content);
- assertContains("\u5ff5", content);
- assertContains("0 ", content);
- assertContains("abc", content);
- assertFalse("Doubled character \u5E74", content.contains("\u5E74\u5E74"));
+ assertContains("\u5E74", r.xml);
+ assertContains("\u5ff5", r.xml);
+ assertContains("0 ", r.xml);
+ assertContains("abc", r.xml);
+ assertNotContained("\u5E74\u5E74", r.xml);
}
@Test
public void testHexEscapeInsideWord() throws Exception {
- String content = getText("testRTFHexEscapeInsideWord.rtf");
- assertContains("ESP\u00cdRITO", content);
+ XMLResult r = getXML("testRTFHexEscapeInsideWord.rtf");
+ assertContains("ESP\u00cdRITO", r.xml);
}
@Test
public void testWindowsCodepage1250() throws Exception {
- String content = getText("testRTFWindowsCodepage1250.rtf");
- assertContains("za\u017c\u00f3\u0142\u0107 g\u0119\u015bl\u0105 ja\u017a\u0144", content);
- assertContains("ZA\u017b\u00d3\u0141\u0106 G\u0118\u015aL\u0104 JA\u0179\u0143", content);
+ XMLResult r = getXML("testRTFWindowsCodepage1250.rtf");
+ assertContains("za\u017c\u00f3\u0142\u0107 g\u0119\u015bl\u0105 ja\u017a\u0144", r.xml);
+ assertContains("ZA\u017b\u00d3\u0141\u0106 G\u0118\u015aL\u0104 JA\u0179\u0143", r.xml);
}
@Test
public void testTableCellSeparation() throws Exception {
- File file = getResourceAsFile("/test-documents/testRTFTableCellSeparation.rtf");
- String content = tika.parseToString(file);
- content = content.replaceAll("\\s+", " ");
- assertContains("a b c d \u00E4 \u00EB \u00F6 \u00FC", content);
+ String content = getXML("testRTFTableCellSeparation.rtf").xml;
+ content = content.replaceAll("(\\s|<\\/?p>)+", " ");
assertContains("a b c d \u00E4 \u00EB \u00F6 \u00FC", content);
}
@Test
public void testTableCellSeparation2() throws Exception {
- String content = getText("testRTFTableCellSeparation2.rtf");
+ String content = getXML("testRTFTableCellSeparation2.rtf").xml.replaceAll("\\s+", " ");
// TODO: why do we insert extra whitespace...?
- content = content.replaceAll("\\s+", " ");
- assertContains("Station Fax", content);
+ assertContains("Station</p> <p>Fax", content);
}
@Test
public void testWordPadCzechCharactersExtraction() throws Exception {
- File file = getResourceAsFile("/test-documents/testRTFWordPadCzechCharacters.rtf");
- String s1 = tika.parseToString(file);
- assertTrue(s1.contains("\u010Cl\u00E1nek t\u00FDdne"));
- assertTrue(s1.contains("starov\u011Bk\u00E9 \u017Eidovsk\u00E9 n\u00E1bo\u017Eensk\u00E9 texty"));
+ XMLResult r = getXML("testRTFWordPadCzechCharacters.rtf");
+ assertContains("\u010Cl\u00E1nek t\u00FDdne", r.xml);
+ assertContains("starov\u011Bk\u00E9 \u017Eidovsk\u00E9 n\u00E1bo\u017Eensk\u00E9 texty", r.xml);
}
@Test
public void testWord2010CzechCharactersExtraction() throws Exception {
- File file = getResourceAsFile("/test-documents/testRTFWord2010CzechCharacters.rtf");
- String s1 = tika.parseToString(file);
- assertTrue(s1.contains("\u010Cl\u00E1nek t\u00FDdne"));
- assertTrue(s1.contains("starov\u011Bk\u00E9 \u017Eidovsk\u00E9 n\u00E1bo\u017Eensk\u00E9 texty"));
+ XMLResult r = getXML("testRTFWord2010CzechCharacters.rtf");
+ assertContains("\u010Cl\u00E1nek t\u00FDdne", r.xml);
+ assertContains("starov\u011Bk\u00E9 \u017Eidovsk\u00E9 n\u00E1bo\u017Eensk\u00E9 texty", r.xml);
}
@Test
public void testMS932Extraction() throws Exception {
- File file = getResourceAsFile("/test-documents/testRTF-ms932.rtf");
- String s1 = tika.parseToString(file);
-
+ XMLResult r = getXML("testRTF-ms932.rtf");
// Hello in Japanese
- assertTrue(s1.contains("\u3053\u3093\u306b\u3061\u306f"));
+ assertContains("\u3053\u3093\u306b\u3061\u306f", r.xml);
// Verify title, since it was also encoded with MS932:
- Result r = getResult("testRTF-ms932.rtf");
+ r = getXML("testRTF-ms932.rtf");
assertEquals("\u30bf\u30a4\u30c8\u30eb", r.metadata.get(TikaCoreProperties.TITLE));
}
@Test
public void testUmlautSpacesExtraction() throws Exception {
- File file = getResourceAsFile("/test-documents/testRTFUmlautSpaces.rtf");
- String s1 = tika.parseToString(file);
- assertTrue(s1.contains("\u00DCbersicht"));
+ XMLResult r = getXML("testRTFUmlautSpaces.rtf");
+ assertContains("\u00DCbersicht", r.xml);
}
@Test
public void testGothic() throws Exception {
- String content = getText("testRTFUnicodeGothic.rtf");
- assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A", content);
+ XMLResult r = getXML("testRTFUnicodeGothic.rtf");
+ assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A", r.xml);
}
@Test
public void testJapaneseText() throws Exception {
- Result r = getResult("testRTFJapanese.rtf");
- String content = r.text;
+ XMLResult r = getXML("testRTFJapanese.rtf");
// Verify title -- this title uses upr escape inside
// title info field:
@@ -183,17 +159,17 @@ public class RTFParserTest extends TikaTest {
assertEquals("StarWriter", r.metadata.get(TikaCoreProperties.COMMENTS));
// Special version of (GHQ)
- assertContains("\uff08\uff27\uff28\uff31\uff09", content);
+ assertContains("\uff08\uff27\uff28\uff31\uff09", r.xml);
// 6 other characters
- assertContains("\u6771\u4eac\u90fd\u4e09\u9df9\u5e02", content);
+ assertContains("\u6771\u4eac\u90fd\u4e09\u9df9\u5e02", r.xml);
}
@Test
public void testMaxLength() throws Exception {
- File file = getResourceAsFile("/test-documents/testRTFJapanese.rtf");
Metadata metadata = new Metadata();
- InputStream stream = TikaInputStream.get(file, metadata);
+ InputStream stream = TikaInputStream.get(
+ getTestDocumentAsStream("testRTFJapanese.rtf"));
// Test w/ default limit:
Tika localTika = new Tika();
@@ -204,7 +180,7 @@ public class RTFParserTest extends TikaTest {
// Test setting max length on the instance:
localTika.setMaxStringLength(200);
- stream = TikaInputStream.get(file, metadata);
+ stream = TikaInputStream.get(getTestDocumentAsStream("testRTFJapanese.rtf"));
content = localTika.parseToString(stream, metadata);
// parseToString closes for convenience:
@@ -212,7 +188,7 @@ public class RTFParserTest extends TikaTest {
assertTrue(content.length() <= 200);
// Test setting max length per-call:
- stream = TikaInputStream.get(file, metadata);
+ stream = TikaInputStream.get(getTestDocumentAsStream("testRTFJapanese.rtf"));
content = localTika.parseToString(stream, metadata, 100);
// parseToString closes for convenience:
//stream.close();
@@ -221,14 +197,14 @@ public class RTFParserTest extends TikaTest {
@Test
public void testTextWithCurlyBraces() throws Exception {
- String content = getText("testRTFWithCurlyBraces.rtf");
- assertContains("{ some text inside curly brackets }", content);
+ XMLResult r = getXML("testRTFWithCurlyBraces.rtf");
+ assertContains("{ some text inside curly brackets }", r.xml);
}
@Test
public void testControls() throws Exception {
- Result r = getResult("testRTFControls.rtf");
- String content = r.text;
+ XMLResult r = getXML("testRTFControls.rtf");
+ String content = r.xml;
assertContains("Thiswordhasanem\u2014dash", content);
assertContains("Thiswordhasanen\u2013dash", content);
assertContains("Thiswordhasanon\u2011breakinghyphen", content);
@@ -241,8 +217,8 @@ public class RTFParserTest extends TikaTest {
@Test
public void testInvalidUnicode() throws Exception {
- Result r = getResult("testRTFInvalidUnicode.rtf");
- String content = r.text;
+ XMLResult r = getXML("testRTFInvalidUnicode.rtf");
+ String content = r.xml;
assertContains("Unpaired hi \ufffd here", content);
assertContains("Unpaired lo \ufffd here", content);
assertContains("Mismatched pair \ufffd\ufffd here", content);
@@ -250,8 +226,8 @@ public class RTFParserTest extends TikaTest {
@Test
public void testVarious() throws Exception {
- Result r = getResult("testRTFVarious.rtf");
- String content = r.text;
+ XMLResult r = getXML("testRTFVarious.rtf");
+ String content = r.xml;
assertContains("Footnote appears here", content);
assertContains("This is a footnote.", content);
assertContains("This is the header text.", content);
@@ -267,10 +243,10 @@ public class RTFParserTest extends TikaTest {
assertContains("(Kramer)", content);
// Table
- assertContains("Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 Col 2 Row 2 Col 3", content.replaceAll("\\s+", " "));
+ assertContains("Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 Col 2 Row 2 Col 3", content.replaceAll("(\\s|<\\/?p>)+", " "));
// 2-columns
- assertContains("Row 1 column 1 Row 2 column 1 Row 1 column 2 Row 2 column 2", content.replaceAll("\\s+", " "));
+ assertContains("Row 1 column 1 Row 2 column 1 Row 1 column 2 Row 2 column 2", content.replaceAll("(\\s|<\\/?p>)+", " "));
assertContains("This is a hyperlink", content);
assertContains("Here is a list:", content);
for (int row = 1; row <= 3; row++) {
@@ -393,17 +369,13 @@ public class RTFParserTest extends TikaTest {
// TIKA-1192
@Test
public void testListOverride() throws Exception {
- Result r = getResult("testRTFListOverride.rtf");
- String content = r.text;
- assertContains("Body", content);
+ assertContains("Body", getXML("testRTFListOverride.rtf").xml);
}
// TIKA-1305
@Test
public void testCorruptListOverride() throws Exception {
- Result r = getResult("testRTFCorruptListOverride.rtf");
- String content = r.text;
- assertContains("apple", content);
+ assertContains("apple", getXML("testRTFCorruptListOverride.rtf").xml);
}
// TIKA-1010
@@ -565,31 +537,4 @@ public class RTFParserTest extends TikaTest {
assertEquals(2, tracker.filenames.size());
}
- private Result getResult(String filename) throws Exception {
- File file = getResourceAsFile("/test-documents/" + filename);
-
- Metadata metadata = new Metadata();
- StringWriter writer = new StringWriter();
- tika.getParser().parse(
- new FileInputStream(file),
- new WriteOutContentHandler(writer),
- metadata,
- new ParseContext());
- String content = writer.toString();
- return new Result(content, metadata);
- }
-
- private String getText(String filename) throws Exception {
- return getResult(filename).text;
- }
-
- private static class Result {
- public final String text;
- public final Metadata metadata;
-
- public Result(String text, Metadata metadata) {
- this.text = text;
- this.metadata = metadata;
- }
- }
}
[03/13] tika git commit: TIKA-1855 -- first pass. Need to turn back
on the forbidden-apis testCheck. More clean up remains.
Posted by ta...@apache.org.
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
----------------------------------------------------------------------
diff --git a/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java b/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
index 0521032..2830b5a 100644
--- a/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
+++ b/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
@@ -69,7 +69,7 @@ public class TikaResourceTest extends CXFTestBase {
Response response = WebClient.create(endPoint + TIKA_PATH)
.type("application/msword")
.accept("text/plain")
- .put(ClassLoader.getSystemResourceAsStream(TEST_DOC));
+ .put(getTestDocumentAsStream(TEST_DOC));
String responseMsg = getStringFromInputStream((InputStream) response
.getEntity());
assertTrue(responseMsg.contains("test"));
@@ -90,7 +90,7 @@ public class TikaResourceTest extends CXFTestBase {
Response response = WebClient.create(endPoint + TIKA_PATH)
.type("application/vnd.ms-excel")
.accept("text/plain")
- .put(ClassLoader.getSystemResourceAsStream("password.xls"));
+ .put(getTestDocumentAsStream("password.xls"));
assertEquals(UNPROCESSEABLE, response.getStatus());
}
@@ -100,7 +100,7 @@ public class TikaResourceTest extends CXFTestBase {
Response response = WebClient.create(endPoint + TIKA_PATH)
.type("application/msword")
.accept("text/html")
- .put(ClassLoader.getSystemResourceAsStream(TEST_DOC));
+ .put(getTestDocumentAsStream(TEST_DOC));
String responseMsg = getStringFromInputStream((InputStream) response
.getEntity());
assertTrue(responseMsg.contains("test"));
@@ -113,7 +113,7 @@ public class TikaResourceTest extends CXFTestBase {
Response response = WebClient.create(endPoint + TIKA_PATH)
.type("application/vnd.ms-excel")
.accept("text/html")
- .put(ClassLoader.getSystemResourceAsStream("password.xls"));
+ .put(getTestDocumentAsStream("password.xls"));
assertEquals(UNPROCESSEABLE, response.getStatus());
}
@@ -123,7 +123,7 @@ public class TikaResourceTest extends CXFTestBase {
Response response = WebClient.create(endPoint + TIKA_PATH)
.type("application/msword")
.accept("text/xml")
- .put(ClassLoader.getSystemResourceAsStream(TEST_DOC));
+ .put(getTestDocumentAsStream(TEST_DOC));
String responseMsg = getStringFromInputStream((InputStream) response
.getEntity());
assertTrue(responseMsg.contains("test"));
@@ -134,7 +134,7 @@ public class TikaResourceTest extends CXFTestBase {
Response response = WebClient.create(endPoint + TIKA_PATH)
.type("application/vnd.ms-excel")
.accept("text/xml")
- .put(ClassLoader.getSystemResourceAsStream("password.xls"));
+ .put(getTestDocumentAsStream("password.xls"));
assertEquals(UNPROCESSEABLE, response.getStatus());
}
@@ -143,7 +143,8 @@ public class TikaResourceTest extends CXFTestBase {
public void testSimpleWordMultipartXML() throws Exception {
ClassLoader.getSystemResourceAsStream(TEST_DOC);
Attachment attachmentPart =
- new Attachment("myworddoc", "application/msword", ClassLoader.getSystemResourceAsStream(TEST_DOC));
+ new Attachment("myworddoc", "application/msword",
+ getTestDocumentAsStream(TEST_DOC));
WebClient webClient = WebClient.create(endPoint + TIKA_PATH + "/form");
Response response = webClient.type("multipart/form-data")
.accept("text/xml")
@@ -161,7 +162,7 @@ public class TikaResourceTest extends CXFTestBase {
//first try text
Response response = WebClient.create(endPoint + TIKA_PATH)
.accept("text/plain")
- .put(ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC));
+ .put(getTestDocumentAsStream(TEST_RECURSIVE_DOC));
String responseMsg = getStringFromInputStream((InputStream) response
.getEntity());
assertTrue(responseMsg.contains("Course of human events"));
@@ -169,7 +170,7 @@ public class TikaResourceTest extends CXFTestBase {
//now go for xml -- different call than text
response = WebClient.create(endPoint + TIKA_PATH)
.accept("text/xml")
- .put(ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC));
+ .put(getTestDocumentAsStream(TEST_RECURSIVE_DOC));
responseMsg = getStringFromInputStream((InputStream) response
.getEntity());
assertTrue(responseMsg.contains("Course of human events"));
@@ -185,9 +186,9 @@ public class TikaResourceTest extends CXFTestBase {
Response response = WebClient.create(endPoint + TIKA_PATH)
.type("application/rtf")
.accept("text/plain")
- .put(ClassLoader.getSystemResourceAsStream("testRTF_npeFromWMFInTikaServer.rtf"));
+ .put(getTestDocumentAsStream("testRTF_npeFromWMFInTikaServer.rtf"));
String responseMsg = getStringFromInputStream((InputStream) response
.getEntity());
- assertTrue(responseMsg.contains("Example text"));
+ assertContains("Example text", responseMsg);
}
}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-server/src/test/java/org/apache/tika/server/UnpackerResourceTest.java
----------------------------------------------------------------------
diff --git a/tika-server/src/test/java/org/apache/tika/server/UnpackerResourceTest.java b/tika-server/src/test/java/org/apache/tika/server/UnpackerResourceTest.java
index b883c96..a61bf52 100644
--- a/tika-server/src/test/java/org/apache/tika/server/UnpackerResourceTest.java
+++ b/tika-server/src/test/java/org/apache/tika/server/UnpackerResourceTest.java
@@ -87,7 +87,7 @@ public class UnpackerResourceTest extends CXFTestBase {
public void testDocWAV() throws Exception {
Response response = WebClient.create(endPoint + UNPACKER_PATH)
.type(APPLICATION_MSWORD).accept("application/zip")
- .put(ClassLoader.getSystemResourceAsStream(TEST_DOC_WAV));
+ .put(getTestDocumentAsStream(TEST_DOC_WAV));
Map<String, String> data = readZipArchive((InputStream) response.getEntity());
assertEquals(WAV1_MD5, data.get(WAV1_NAME));
@@ -99,7 +99,7 @@ public class UnpackerResourceTest extends CXFTestBase {
public void testDocWAVText() throws Exception {
Response response = WebClient.create(endPoint + ALL_PATH)
.type(APPLICATION_MSWORD).accept("application/zip")
- .put(ClassLoader.getSystemResourceAsStream(TEST_DOC_WAV));
+ .put(getTestDocumentAsStream(TEST_DOC_WAV));
Map<String, String> data = readZipArchive((InputStream) response.getEntity());
assertEquals(WAV1_MD5, data.get(WAV1_NAME));
@@ -111,7 +111,7 @@ public class UnpackerResourceTest extends CXFTestBase {
public void testDocPicture() throws Exception {
Response response = WebClient.create(endPoint + UNPACKER_PATH)
.type(APPLICATION_MSWORD).accept("application/zip")
- .put(ClassLoader.getSystemResourceAsStream(TEST_DOC_WAV));
+ .put(getTestDocumentAsStream(TEST_DOC_WAV));
Map<String, String> data = readZipArchive((InputStream) response.getEntity());
@@ -122,7 +122,7 @@ public class UnpackerResourceTest extends CXFTestBase {
public void testDocPictureNoOle() throws Exception {
Response response = WebClient.create(endPoint + UNPACKER_PATH)
.type(APPLICATION_MSWORD).accept("application/zip")
- .put(ClassLoader.getSystemResourceAsStream("2pic.doc"));
+ .put(getTestDocumentAsStream("2pic.doc"));
Map<String, String> data = readZipArchive((InputStream) response.getEntity());
assertEquals(JPG2_MD5, data.get(JPG2_NAME));
@@ -132,7 +132,7 @@ public class UnpackerResourceTest extends CXFTestBase {
public void testImageDOCX() throws Exception {
Response response = WebClient.create(endPoint + UNPACKER_PATH)
.accept("application/zip").put(
- ClassLoader.getSystemResourceAsStream(TEST_DOCX_IMAGE));
+ getTestDocumentAsStream(TEST_DOCX_IMAGE));
Map<String, String> data = readZipArchive((InputStream) response.getEntity());
assertEquals(DOCX_IMAGE1_MD5, data.get(DOCX_IMAGE1_NAME));
@@ -144,7 +144,7 @@ public class UnpackerResourceTest extends CXFTestBase {
Response response = WebClient.create(endPoint + UNPACKER_PATH)
.type("xxx/xxx")
.accept("*/*")
- .put(ClassLoader.getSystemResourceAsStream(TEST_DOC_WAV));
+ .put(getTestDocumentAsStream(TEST_DOC_WAV));
assertEquals(415, response.getStatus());
}
@@ -154,7 +154,7 @@ public class UnpackerResourceTest extends CXFTestBase {
String TEST_DOCX_EXE = "2exe.docx";
Response response = WebClient.create(endPoint + UNPACKER_PATH)
.accept("application/zip")
- .put(ClassLoader.getSystemResourceAsStream(TEST_DOCX_EXE));
+ .put(getTestDocumentAsStream(TEST_DOCX_EXE));
Map<String, String> data = readZipArchive((InputStream) response.getEntity());
@@ -166,7 +166,7 @@ public class UnpackerResourceTest extends CXFTestBase {
public void testImageXSL() throws Exception {
Response response = WebClient.create(endPoint + UNPACKER_PATH)
.accept("application/zip")
- .put(ClassLoader.getSystemResourceAsStream("pic.xls"));
+ .put(getTestDocumentAsStream("pic.xls"));
Map<String, String> data = readZipArchive((InputStream) response.getEntity());
assertEquals(XSL_IMAGE1_MD5, data.get("0.jpg"));
@@ -177,7 +177,7 @@ public class UnpackerResourceTest extends CXFTestBase {
public void testTarDocPicture() throws Exception {
Response response = WebClient.create(endPoint + UNPACKER_PATH)
.type(APPLICATION_MSWORD).accept("application/x-tar")
- .put(ClassLoader.getSystemResourceAsStream(TEST_DOC_WAV));
+ .put(getTestDocumentAsStream(TEST_DOC_WAV));
Map<String, String> data = readArchiveFromStream(new TarArchiveInputStream((InputStream) response.getEntity()));
@@ -189,7 +189,7 @@ public class UnpackerResourceTest extends CXFTestBase {
Response response = WebClient.create(endPoint + ALL_PATH)
.header(CONTENT_TYPE, APPLICATION_XML)
.accept("application/zip")
- .put(ClassLoader.getSystemResourceAsStream("test.doc"));
+ .put(getTestDocumentAsStream("test.doc"));
String responseMsg = readArchiveText((InputStream) response.getEntity());
assertNotNull(responseMsg);
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-server/src/test/resources/2exe.docx
----------------------------------------------------------------------
diff --git a/tika-server/src/test/resources/2exe.docx b/tika-server/src/test/resources/2exe.docx
deleted file mode 100644
index 64cfbe1..0000000
Binary files a/tika-server/src/test/resources/2exe.docx and /dev/null differ
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-server/src/test/resources/2pic.doc
----------------------------------------------------------------------
diff --git a/tika-server/src/test/resources/2pic.doc b/tika-server/src/test/resources/2pic.doc
deleted file mode 100644
index 75c53b3..0000000
Binary files a/tika-server/src/test/resources/2pic.doc and /dev/null differ
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-server/src/test/resources/2pic.docx
----------------------------------------------------------------------
diff --git a/tika-server/src/test/resources/2pic.docx b/tika-server/src/test/resources/2pic.docx
deleted file mode 100644
index fe424e4..0000000
Binary files a/tika-server/src/test/resources/2pic.docx and /dev/null differ
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-server/src/test/resources/CDEC_WEATHER_2010_03_02
----------------------------------------------------------------------
diff --git a/tika-server/src/test/resources/CDEC_WEATHER_2010_03_02 b/tika-server/src/test/resources/CDEC_WEATHER_2010_03_02
deleted file mode 100644
index c50e8e7..0000000
--- a/tika-server/src/test/resources/CDEC_WEATHER_2010_03_02
+++ /dev/null
@@ -1,98 +0,0 @@
-Station ID Start Date Date Time Temp Cond Depth DO Flow WXT510P Latitude Longitude
-SMN 03/02/2010 03/01/2010 23:00 14.5 791.00 53.00 7.5 1460 37.347214 -120.976181
-SMN 03/02/2010 03/01/2010 23:15 14.5 790.00 52.99 7.5 1450 37.347214 -120.976181
-SMN 03/02/2010 03/01/2010 23:30 14.5 788.00 53.03 7.4 1480 37.347214 -120.976181
-SMN 03/02/2010 03/01/2010 23:45 14.5 790.00 53.03 7.4 1480 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 00:00 14.5 785.00 53.02 7.4 1470 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 00:15 14.5 786.00 53.00 7.4 1460 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 00:30 14.5 790.00 53.04 7.4 1480 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 00:45 14.5 792.00 53.02 7.3 1470 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 01:00 14.5 786.00 53.03 7.3 1480 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 01:15 14.5 787.00 53.03 7.3 1480 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 01:30 14.5 791.00 53.03 7.3 1480 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 01:45 14.5 789.00 53.04 7.3 1480 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 02:00 14.5 794.00 53.06 7.2 1490 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 02:15 14.4 801.00 53.06 7.2 1490 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 02:30 14.4 802.00 53.04 7.2 1480 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 02:45 14.4 803.00 53.07 7.2 1500 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 03:00 14.4 802.00 53.06 7.2 1490 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 03:15 14.4 803.00 53.08 7.2 1500 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 03:30 14.4 806.00 53.06 7.2 1490 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 03:45 14.4 807.00 53.08 7.1 1500 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 04:00 14.4 810.00 53.09 7.1 1510 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 04:15 14.4 810.00 53.10 7.1 1520 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 04:30 14.3 808.00 53.11 7.1 1520 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 04:45 14.3 810.00 53.11 7.1 1520 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 05:00 14.3 813.00 53.11 7.0 1520 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 05:15 14.3 811.00 53.11 7.0 1520 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 05:30 14.3 810.00 53.10 7.0 1520 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 05:45 14.3 805.00 53.12 7.0 1530 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 06:00 14.2 806.00 53.10 7.0 1520 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 06:15 14.2 805.00 53.12 7.0 1530 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 06:30 14.2 808.00 53.14 6.9 1540 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 06:45 14.2 809.00 53.14 6.9 1540 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 07:00 14.2 803.00 53.13 6.9 1530 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 07:15 14.2 807.00 53.13 6.9 1530 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 07:30 14.2 805.00 53.14 6.9 1540 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 07:45 14.2 811.00 53.14 6.9 1540 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 08:00 14.2 815.00 53.15 6.9 1540 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 08:15 14.3 817.00 53.13 6.9 1530 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 08:30 14.3 817.00 53.15 6.9 1540 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 08:45 14.3 811.00 53.16 6.8 1550 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 09:00 14.3 810.00 53.17 6.9 1560 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 09:15 14.3 809.00 53.16 6.9 1550 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 09:30 14.3 813.00 53.18 6.9 1560 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 09:45 14.3 813.00 53.17 6.9 1560 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 10:00 14.3 813.00 53.19 6.9 1570 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 10:15 14.3 820.00 53.17 6.9 1560 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 10:30 14.3 818.00 53.18 6.9 1560 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 10:45 14.3 821.00 53.19 6.9 1570 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 11:00 14.3 821.00 53.18 6.9 1560 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 11:15 14.3 825.00 53.18 6.9 1560 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 11:30 14.3 827.00 53.17 6.9 1560 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 11:45 14.3 825.00 53.18 6.9 1560 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 12:00 14.3 829.00 53.19 6.9 1570 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 12:15 14.4 831.00 53.20 6.9 1570 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 12:30 14.4 837.00 53.20 7.0 1570 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 12:45 14.4 835.00 53.20 7.0 1570 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 13:00 14.5 837.00 53.21 7.0 1580 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 13:15 14.5 837.00 53.20 7.0 1570 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 13:30 14.5 842.00 53.20 7.0 1570 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 13:45 14.5 848.00 53.22 7.0 1590 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 14:00 14.5 850.00 53.20 7.0 1570 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 14:15 14.5 851.00 53.20 7.0 1570 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 14:30 14.5 849.00 53.20 7.1 1570 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 14:45 14.6 858.00 53.20 7.1 1570 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 15:00 14.6 869.00 53.20 7.1 1570 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 15:15 14.6 868.00 53.22 7.1 1590 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 15:30 14.5 868.00 53.23 7.1 1590 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 15:45 14.5 869.00 53.22 7.1 1590 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 16:00 14.5 873.00 53.22 7.2 1590 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 16:15 14.5 877.00 53.23 7.2 1590 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 16:30 14.5 884.00 53.23 7.2 1590 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 16:45 14.5 887.00 53.23 7.2 1590 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 17:00 14.5 889.00 53.22 7.2 1590 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 17:15 14.5 891.00 53.25 7.2 1600 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 17:30 14.4 893.00 53.24 7.2 1600 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 17:45 14.4 896.00 53.23 7.2 1590 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 18:00 14.4 896.00 53.24 7.2 1600 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 18:15 14.4 895.00 53.24 7.2 1600 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 18:30 14.4 899.00 53.25 7.2 1600 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 18:45 14.3 901.00 53.25 7.2 1600 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 19:00 14.3 899.00 53.24 7.2 1600 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 19:15 14.3 911.00 53.25 7.2 1600 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 19:30 14.3 914.00 53.26 7.2 1610 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 19:45 14.3 913.00 53.22 7.2 1590 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 20:00 14.3 914.00 53.24 7.2 1600 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 20:15 14.2 915.00 53.22 7.3 1590 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 20:30 14.2 917.00 53.24 7.2 1600 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 20:45 14.2 919.00 53.24 7.2 1600 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 21:00 14.2 919.00 53.23 7.2 1590 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 21:15 14.2 923.00 53.21 7.2 1580 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 21:30 14.2 920.00 53.24 7.2 1600 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 21:45 14.2 927.00 53.25 7.2 1600 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 22:00 14.2 929.00 53.23 7.2 1590 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 22:15 14.1 927.00 53.25 7.2 1600 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 22:30 14.1 931.00 53.22 7.2 1590 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 22:45 14.1 931.00 53.25 7.2 1600 760 37.347214 -120.976181
-SMN 03/02/2010 03/02/2010 23:00 14.1 937.00 53.23 7.2 1590 760 37.347214 -120.976181
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-server/src/test/resources/Doc1_ole.doc
----------------------------------------------------------------------
diff --git a/tika-server/src/test/resources/Doc1_ole.doc b/tika-server/src/test/resources/Doc1_ole.doc
deleted file mode 100644
index 953fe78..0000000
Binary files a/tika-server/src/test/resources/Doc1_ole.doc and /dev/null differ
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-server/src/test/resources/english.txt
----------------------------------------------------------------------
diff --git a/tika-server/src/test/resources/english.txt b/tika-server/src/test/resources/english.txt
deleted file mode 100644
index 5e3d20e..0000000
--- a/tika-server/src/test/resources/english.txt
+++ /dev/null
@@ -1 +0,0 @@
-This is English!
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-server/src/test/resources/foo.csv
----------------------------------------------------------------------
diff --git a/tika-server/src/test/resources/foo.csv b/tika-server/src/test/resources/foo.csv
deleted file mode 100644
index 0f48f3e..0000000
--- a/tika-server/src/test/resources/foo.csv
+++ /dev/null
@@ -1,4 +0,0 @@
-foo,bar,baz
-123,"abc def",-987
-456,"qwertyuiop",98765
-789,"qawsedrft",3.14159
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-server/src/test/resources/french.txt
----------------------------------------------------------------------
diff --git a/tika-server/src/test/resources/french.txt b/tika-server/src/test/resources/french.txt
deleted file mode 100644
index 678e6c2..0000000
--- a/tika-server/src/test/resources/french.txt
+++ /dev/null
@@ -1 +0,0 @@
-c’est comme ci comme ça
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-server/src/test/resources/mime/custom-mimetypes.xml
----------------------------------------------------------------------
diff --git a/tika-server/src/test/resources/mime/custom-mimetypes.xml b/tika-server/src/test/resources/mime/custom-mimetypes.xml
deleted file mode 100644
index 78cf392..0000000
--- a/tika-server/src/test/resources/mime/custom-mimetypes.xml
+++ /dev/null
@@ -1,24 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<mime-info>
- <mime-type type="application/evil">
- <glob pattern="*.evil"/>
- <sub-class-of type="text/plain"/>
- </mime-type>
-</mime-info>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-server/src/test/resources/mock/null_pointer.xml
----------------------------------------------------------------------
diff --git a/tika-server/src/test/resources/mock/null_pointer.xml b/tika-server/src/test/resources/mock/null_pointer.xml
deleted file mode 100644
index 80043c0..0000000
--- a/tika-server/src/test/resources/mock/null_pointer.xml
+++ /dev/null
@@ -1,25 +0,0 @@
-<?xml version="1.0" encoding="UTF-8" ?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one
- or more contributor license agreements. See the NOTICE file
- distributed with this work for additional information
- regarding copyright ownership. The ASF licenses this file
- to you under the Apache License, Version 2.0 (the
- "License"); you may not use this file except in compliance
- with the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing,
- software distributed under the License is distributed on an
- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- KIND, either express or implied. See the License for the
- specific language governing permissions and limitations
- under the License.
--->
-
-<mock>
- <metadata action="add" name="author">Nikolai Lobachevsky</metadata>
- <write element="p">some content</write>
- <throw class="java.lang.NullPointerException">null pointer message</throw>
-</mock>
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-server/src/test/resources/org/apache/tika/mime/custom-mimetypes.xml
----------------------------------------------------------------------
diff --git a/tika-server/src/test/resources/org/apache/tika/mime/custom-mimetypes.xml b/tika-server/src/test/resources/org/apache/tika/mime/custom-mimetypes.xml
new file mode 100644
index 0000000..b3ddc83
--- /dev/null
+++ b/tika-server/src/test/resources/org/apache/tika/mime/custom-mimetypes.xml
@@ -0,0 +1,24 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<mime-info>
+ <mime-type type="application/evil">
+ <glob pattern="*.evil"/>
+ <sub-class-of type="text/plain"/>
+ </mime-type>
+</mime-info>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-server/src/test/resources/password.xls
----------------------------------------------------------------------
diff --git a/tika-server/src/test/resources/password.xls b/tika-server/src/test/resources/password.xls
deleted file mode 100644
index a6ad86a..0000000
Binary files a/tika-server/src/test/resources/password.xls and /dev/null differ
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-server/src/test/resources/pic.xls
----------------------------------------------------------------------
diff --git a/tika-server/src/test/resources/pic.xls b/tika-server/src/test/resources/pic.xls
deleted file mode 100644
index 6798ae2..0000000
Binary files a/tika-server/src/test/resources/pic.xls and /dev/null differ
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-server/src/test/resources/pic.xlsx
----------------------------------------------------------------------
diff --git a/tika-server/src/test/resources/pic.xlsx b/tika-server/src/test/resources/pic.xlsx
deleted file mode 100644
index 9cc155a..0000000
Binary files a/tika-server/src/test/resources/pic.xlsx and /dev/null differ
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-server/src/test/resources/test.doc
----------------------------------------------------------------------
diff --git a/tika-server/src/test/resources/test.doc b/tika-server/src/test/resources/test.doc
deleted file mode 100644
index 93198c8..0000000
Binary files a/tika-server/src/test/resources/test.doc and /dev/null differ
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-server/src/test/resources/testRTF_npeFromWMFInTikaServer.rtf
----------------------------------------------------------------------
diff --git a/tika-server/src/test/resources/testRTF_npeFromWMFInTikaServer.rtf b/tika-server/src/test/resources/testRTF_npeFromWMFInTikaServer.rtf
deleted file mode 100644
index a5870e5..0000000
--- a/tika-server/src/test/resources/testRTF_npeFromWMFInTikaServer.rtf
+++ /dev/null
@@ -1,235 +0,0 @@
-{\rtf1\adeflang1025\ansi\ansicpg1252\uc1\adeff0\deff0\stshfdbch0\stshfloch0\stshfhich0\stshfbi0\deflang2057\deflangfe2057\themelang2057\themelangfe0\themelangcs0{\fonttbl{\f0\fbidi \froman\fcharset0\fprq2{\*\panose 02020603050405020304}Times New Roman;}{\f1\fbidi \fswiss\fcharset0\fprq2{\*\panose 020b0604020202020204}Arial;}
-{\f34\fbidi \froman\fcharset0\fprq2{\*\panose 02040503050406030204}Cambria Math;}{\f171\fbidi \froman\fcharset0\fprq2{\*\panose 02040602050305030304}Book Antiqua;}
-{\f318\fbidi \fswiss\fcharset0\fprq2{\*\panose 020b0603030504020204}Humnst777 BT{\*\falt Lucida Sans Unicode};}{\flomajor\f31500\fbidi \froman\fcharset0\fprq2{\*\panose 02020603050405020304}Times New Roman;}
-{\fdbmajor\f31501\fbidi \froman\fcharset0\fprq2{\*\panose 02020603050405020304}Times New Roman;}{\fhimajor\f31502\fbidi \froman\fcharset0\fprq2{\*\panose 02040503050406030204}Cambria;}
-{\fbimajor\f31503\fbidi \froman\fcharset0\fprq2{\*\panose 02020603050405020304}Times New Roman;}{\flominor\f31504\fbidi \froman\fcharset0\fprq2{\*\panose 02020603050405020304}Times New Roman;}
-{\fdbminor\f31505\fbidi \froman\fcharset0\fprq2{\*\panose 02020603050405020304}Times New Roman;}{\fhiminor\f31506\fbidi \fswiss\fcharset0\fprq2{\*\panose 020f0502020204030204}Calibri;}
-{\fbiminor\f31507\fbidi \froman\fcharset0\fprq2{\*\panose 02020603050405020304}Times New Roman;}{\f319\fbidi \froman\fcharset238\fprq2 Times New Roman CE;}{\f320\fbidi \froman\fcharset204\fprq2 Times New Roman Cyr;}
-{\f322\fbidi \froman\fcharset161\fprq2 Times New Roman Greek;}{\f323\fbidi \froman\fcharset162\fprq2 Times New Roman Tur;}{\f324\fbidi \froman\fcharset177\fprq2 Times New Roman (Hebrew);}{\f325\fbidi \froman\fcharset178\fprq2 Times New Roman (Arabic);}
-{\f326\fbidi \froman\fcharset186\fprq2 Times New Roman Baltic;}{\f327\fbidi \froman\fcharset163\fprq2 Times New Roman (Vietnamese);}{\f329\fbidi \fswiss\fcharset238\fprq2 Arial CE;}{\f330\fbidi \fswiss\fcharset204\fprq2 Arial Cyr;}
-{\f332\fbidi \fswiss\fcharset161\fprq2 Arial Greek;}{\f333\fbidi \fswiss\fcharset162\fprq2 Arial Tur;}{\f334\fbidi \fswiss\fcharset177\fprq2 Arial (Hebrew);}{\f335\fbidi \fswiss\fcharset178\fprq2 Arial (Arabic);}
-{\f336\fbidi \fswiss\fcharset186\fprq2 Arial Baltic;}{\f337\fbidi \fswiss\fcharset163\fprq2 Arial (Vietnamese);}{\f659\fbidi \froman\fcharset238\fprq2 Cambria Math CE;}{\f660\fbidi \froman\fcharset204\fprq2 Cambria Math Cyr;}
-{\f662\fbidi \froman\fcharset161\fprq2 Cambria Math Greek;}{\f663\fbidi \froman\fcharset162\fprq2 Cambria Math Tur;}{\f666\fbidi \froman\fcharset186\fprq2 Cambria Math Baltic;}{\f667\fbidi \froman\fcharset163\fprq2 Cambria Math (Vietnamese);}
-{\f2029\fbidi \froman\fcharset238\fprq2 Book Antiqua CE;}{\f2030\fbidi \froman\fcharset204\fprq2 Book Antiqua Cyr;}{\f2032\fbidi \froman\fcharset161\fprq2 Book Antiqua Greek;}{\f2033\fbidi \froman\fcharset162\fprq2 Book Antiqua Tur;}
-{\f2036\fbidi \froman\fcharset186\fprq2 Book Antiqua Baltic;}{\flomajor\f31508\fbidi \froman\fcharset238\fprq2 Times New Roman CE;}{\flomajor\f31509\fbidi \froman\fcharset204\fprq2 Times New Roman Cyr;}
-{\flomajor\f31511\fbidi \froman\fcharset161\fprq2 Times New Roman Greek;}{\flomajor\f31512\fbidi \froman\fcharset162\fprq2 Times New Roman Tur;}{\flomajor\f31513\fbidi \froman\fcharset177\fprq2 Times New Roman (Hebrew);}
-{\flomajor\f31514\fbidi \froman\fcharset178\fprq2 Times New Roman (Arabic);}{\flomajor\f31515\fbidi \froman\fcharset186\fprq2 Times New Roman Baltic;}{\flomajor\f31516\fbidi \froman\fcharset163\fprq2 Times New Roman (Vietnamese);}
-{\fdbmajor\f31518\fbidi \froman\fcharset238\fprq2 Times New Roman CE;}{\fdbmajor\f31519\fbidi \froman\fcharset204\fprq2 Times New Roman Cyr;}{\fdbmajor\f31521\fbidi \froman\fcharset161\fprq2 Times New Roman Greek;}
-{\fdbmajor\f31522\fbidi \froman\fcharset162\fprq2 Times New Roman Tur;}{\fdbmajor\f31523\fbidi \froman\fcharset177\fprq2 Times New Roman (Hebrew);}{\fdbmajor\f31524\fbidi \froman\fcharset178\fprq2 Times New Roman (Arabic);}
-{\fdbmajor\f31525\fbidi \froman\fcharset186\fprq2 Times New Roman Baltic;}{\fdbmajor\f31526\fbidi \froman\fcharset163\fprq2 Times New Roman (Vietnamese);}{\fhimajor\f31528\fbidi \froman\fcharset238\fprq2 Cambria CE;}
-{\fhimajor\f31529\fbidi \froman\fcharset204\fprq2 Cambria Cyr;}{\fhimajor\f31531\fbidi \froman\fcharset161\fprq2 Cambria Greek;}{\fhimajor\f31532\fbidi \froman\fcharset162\fprq2 Cambria Tur;}
-{\fhimajor\f31535\fbidi \froman\fcharset186\fprq2 Cambria Baltic;}{\fhimajor\f31536\fbidi \froman\fcharset163\fprq2 Cambria (Vietnamese);}{\fbimajor\f31538\fbidi \froman\fcharset238\fprq2 Times New Roman CE;}
-{\fbimajor\f31539\fbidi \froman\fcharset204\fprq2 Times New Roman Cyr;}{\fbimajor\f31541\fbidi \froman\fcharset161\fprq2 Times New Roman Greek;}{\fbimajor\f31542\fbidi \froman\fcharset162\fprq2 Times New Roman Tur;}
-{\fbimajor\f31543\fbidi \froman\fcharset177\fprq2 Times New Roman (Hebrew);}{\fbimajor\f31544\fbidi \froman\fcharset178\fprq2 Times New Roman (Arabic);}{\fbimajor\f31545\fbidi \froman\fcharset186\fprq2 Times New Roman Baltic;}
-{\fbimajor\f31546\fbidi \froman\fcharset163\fprq2 Times New Roman (Vietnamese);}{\flominor\f31548\fbidi \froman\fcharset238\fprq2 Times New Roman CE;}{\flominor\f31549\fbidi \froman\fcharset204\fprq2 Times New Roman Cyr;}
-{\flominor\f31551\fbidi \froman\fcharset161\fprq2 Times New Roman Greek;}{\flominor\f31552\fbidi \froman\fcharset162\fprq2 Times New Roman Tur;}{\flominor\f31553\fbidi \froman\fcharset177\fprq2 Times New Roman (Hebrew);}
-{\flominor\f31554\fbidi \froman\fcharset178\fprq2 Times New Roman (Arabic);}{\flominor\f31555\fbidi \froman\fcharset186\fprq2 Times New Roman Baltic;}{\flominor\f31556\fbidi \froman\fcharset163\fprq2 Times New Roman (Vietnamese);}
-{\fdbminor\f31558\fbidi \froman\fcharset238\fprq2 Times New Roman CE;}{\fdbminor\f31559\fbidi \froman\fcharset204\fprq2 Times New Roman Cyr;}{\fdbminor\f31561\fbidi \froman\fcharset161\fprq2 Times New Roman Greek;}
-{\fdbminor\f31562\fbidi \froman\fcharset162\fprq2 Times New Roman Tur;}{\fdbminor\f31563\fbidi \froman\fcharset177\fprq2 Times New Roman (Hebrew);}{\fdbminor\f31564\fbidi \froman\fcharset178\fprq2 Times New Roman (Arabic);}
-{\fdbminor\f31565\fbidi \froman\fcharset186\fprq2 Times New Roman Baltic;}{\fdbminor\f31566\fbidi \froman\fcharset163\fprq2 Times New Roman (Vietnamese);}{\fhiminor\f31568\fbidi \fswiss\fcharset238\fprq2 Calibri CE;}
-{\fhiminor\f31569\fbidi \fswiss\fcharset204\fprq2 Calibri Cyr;}{\fhiminor\f31571\fbidi \fswiss\fcharset161\fprq2 Calibri Greek;}{\fhiminor\f31572\fbidi \fswiss\fcharset162\fprq2 Calibri Tur;}
-{\fhiminor\f31575\fbidi \fswiss\fcharset186\fprq2 Calibri Baltic;}{\fhiminor\f31576\fbidi \fswiss\fcharset163\fprq2 Calibri (Vietnamese);}{\fbiminor\f31578\fbidi \froman\fcharset238\fprq2 Times New Roman CE;}
-{\fbiminor\f31579\fbidi \froman\fcharset204\fprq2 Times New Roman Cyr;}{\fbiminor\f31581\fbidi \froman\fcharset161\fprq2 Times New Roman Greek;}{\fbiminor\f31582\fbidi \froman\fcharset162\fprq2 Times New Roman Tur;}
-{\fbiminor\f31583\fbidi \froman\fcharset177\fprq2 Times New Roman (Hebrew);}{\fbiminor\f31584\fbidi \froman\fcharset178\fprq2 Times New Roman (Arabic);}{\fbiminor\f31585\fbidi \froman\fcharset186\fprq2 Times New Roman Baltic;}
-{\fbiminor\f31586\fbidi \froman\fcharset163\fprq2 Times New Roman (Vietnamese);}}{\colortbl;\red0\green0\blue0;\red0\green0\blue255;\red0\green255\blue255;\red0\green255\blue0;\red255\green0\blue255;\red255\green0\blue0;\red255\green255\blue0;
-\red255\green255\blue255;\red0\green0\blue128;\red0\green128\blue128;\red0\green128\blue0;\red128\green0\blue128;\red128\green0\blue0;\red128\green128\blue0;\red128\green128\blue128;\red192\green192\blue192;}{\*\defchp \fs22 }{\*\defpap
-\ql \li0\ri0\sa200\sl276\slmult1\widctlpar\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 }\noqfpromote {\stylesheet{\ql \li0\ri0\widctlpar\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \rtlch\fcs1 \af0\afs24\alang1025
-\ltrch\fcs0 \fs24\lang2057\langfe1033\cgrid\langnp2057\langfenp1033 \snext0 \sqformat \spriority0 Normal;}{\s1\ql \li0\ri0\keepn\widctlpar\tx5670\tx8222\wrapdefault\faauto\outlinelevel0\rin0\lin0\itap0 \rtlch\fcs1 \ab\af1\afs24\alang1025 \ltrch\fcs0
-\b\f1\fs20\lang2057\langfe1033\cgrid\langnp2057\langfenp1033 \sbasedon0 \snext0 \slink15 \sqformat heading 1;}{\s2\ql \li0\ri0\keepn\widctlpar\wrapdefault\aspalpha\aspnum\faauto\outlinelevel1\adjustright\rin0\lin0\itap0 \rtlch\fcs1
-\ab\af1\afs28\alang1025 \ltrch\fcs0 \b\f1\fs28\lang2057\langfe2057\cgrid\langnp2057\langfenp2057 \sbasedon0 \snext0 \slink16 \sqformat heading 2;}{\s4\ql \li5670\ri0\keepn\widctlpar\tx5670\tx7371\wrapdefault\faauto\outlinelevel3\rin0\lin5670\itap0
-\rtlch\fcs1 \ab\af1\afs16\alang1025 \ltrch\fcs0 \b\f1\fs16\lang2057\langfe2057\cgrid\langnp2057\langfenp2057 \sbasedon0 \snext0 \slink17 \sqformat heading 4;}{\*\cs10 \additive \ssemihidden Default Paragraph Font;}{\*
-\ts11\tsrowd\trftsWidthB3\trpaddl108\trpaddr108\trpaddfl3\trpaddft3\trpaddfb3\trpaddfr3\tblind0\tblindtype3\tscellwidthfts0\tsvertalt\tsbrdrt\tsbrdrl\tsbrdrb\tsbrdrr\tsbrdrdgl\tsbrdrdgr\tsbrdrh\tsbrdrv \ql \li0\ri0\sa200\sl276\slmult1
-\widctlpar\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \rtlch\fcs1 \af0\afs22\alang1025 \ltrch\fcs0 \fs22\lang2057\langfe2057\cgrid\langnp2057\langfenp2057 \snext11 \ssemihidden \sunhideused \sqformat Normal Table;}{\*\cs15 \additive
-\rtlch\fcs1 \ab\af31503\afs32 \ltrch\fcs0 \b\fs32\lang0\langfe1033\kerning32\loch\f31502\hich\af31502\dbch\af31501\langnp0\langfenp1033 \sbasedon10 \slink1 \slocked \spriority9 Heading 1 Char;}{\*\cs16 \additive \rtlch\fcs1 \ab\ai\af31503\afs28
-\ltrch\fcs0 \b\i\fs28\lang0\langfe1033\loch\f31502\hich\af31502\dbch\af31501\langnp0\langfenp1033 \sbasedon10 \slink2 \slocked \ssemihidden \spriority9 Heading 2 Char;}{\*\cs17 \additive \rtlch\fcs1 \ab\af31507\afs28 \ltrch\fcs0
-\b\fs28\lang0\langfe1033\loch\f31506\hich\af31506\dbch\af31505\langnp0\langfenp1033 \sbasedon10 \slink4 \slocked \ssemihidden \spriority9 Heading 4 Char;}{\s18\ql \li0\ri0\widctlpar
-\tqc\tx4153\tqr\tx8306\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \rtlch\fcs1 \af0\afs20\alang1025 \ltrch\fcs0 \fs22\lang2057\langfe1033\cgrid\langnp2057\langfenp1033 \sbasedon0 \snext18 \slink19 header;}{\*\cs19 \additive \rtlch\fcs1
-\af0\afs24 \ltrch\fcs0 \fs24\lang0\langfe1033\langnp0\langfenp1033 \sbasedon10 \slink18 \slocked \ssemihidden Header Char;}{\s20\ql \li0\ri0\widctlpar\tx3402\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \rtlch\fcs1
-\ab\af1\afs24\alang1025 \ltrch\fcs0 \b\f1\fs20\lang2057\langfe1033\cgrid\langnp2057\langfenp1033 \sbasedon0 \snext20 \slink21 Body Text;}{\*\cs21 \additive \rtlch\fcs1 \af0\afs24 \ltrch\fcs0 \fs24\lang0\langfe1033\langnp0\langfenp1033
-\sbasedon10 \slink20 \slocked \ssemihidden Body Text Char;}{\s22\ql \li0\ri0\widctlpar\tqc\tx4153\tqr\tx8306\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \rtlch\fcs1 \af0\afs24\alang1025 \ltrch\fcs0
-\fs24\lang2057\langfe1033\cgrid\langnp2057\langfenp1033 \sbasedon0 \snext22 \slink23 \styrsid14506524 footer;}{\*\cs23 \additive \rtlch\fcs1 \af0\afs24 \ltrch\fcs0 \fs24\lang0\langfe1033\langnp0\langfenp1033 \sbasedon10 \slink22 \slocked \ssemihidden
-Footer Char;}}{\*\rsidtbl \rsid69694\rsid615335\rsid817088\rsid1394934\rsid1968554\rsid2362503\rsid2504751\rsid2508965\rsid3497332\rsid3954968\rsid4262707\rsid4459777\rsid4947815\rsid5249973\rsid5375126\rsid5768946\rsid6625584\rsid6695929\rsid7547824
-\rsid7568219\rsid7681002\rsid7756842\rsid8788056\rsid9179382\rsid9185548\rsid9589441\rsid9716173\rsid10108489\rsid10158374\rsid10170376\rsid10447577\rsid10506307\rsid10508481\rsid11937854\rsid12735407\rsid14506524\rsid15223573\rsid15351889\rsid15429861
-\rsid15800823\rsid16209942\rsid16329808\rsid16338741\rsid16531520}{\mmathPr\mmathFont34\mbrkBin0\mbrkBinSub0\msmallFrac0\mdispDef1\mlMargin0\mrMargin0\mdefJc1\mwrapIndent1440\mintLim0\mnaryLim1}{\info{\title Cardiff}{\author A Other}
-{\operator Ian Williams}{\creatim\yr2016\mo2\dy1\hr16\min12}{\revtim\yr2016\mo2\dy1\hr16\min12}{\version2}{\edmins2}{\nofpages1}{\nofwords6}{\nofchars37}{\*\company Cardiff}{\nofcharsws42}{\vern32773}}{\*\xmlnstbl {\xmlns1 http://schemas.microsoft.com/off
-ice/word/2003/wordml}}\paperw11906\paperh16838\margl851\margr851\margt567\margb794\gutter0\ltrsect
-\widowctrl\ftnbj\aenddoc\trackmoves1\trackformatting1\donotembedsysfont1\relyonvml0\donotembedlingdata0\grfdocevents0\validatexml1\showplaceholdtext0\ignoremixedcontent0\saveinvalidxml0\showxmlerrors1\noxlattoyen
-\expshrtn\noultrlspc\dntblnsbdb\nospaceforul\formshade\horzdoc\dgmargin\dghspace120\dgvspace180\dghorigin851\dgvorigin567\dghshow2\dgvshow1
-\jexpand\viewkind1\viewscale100\pgbrdrhead\pgbrdrfoot\splytwnine\ftnlytwnine\htmautsp\nolnhtadjtbl\useltbaln\alntblind\lytcalctblwd\lyttblrtgr\lnbrkrule\nojkernpunct\rsidroot15429861 \fet0{\*\wgrffmtfilter 013f}\ilfomacatclnup0{\*\template
-C:\\PMS\\DOCUMENT\\gplnew.dot}{\*\ftnsep \ltrpar \pard\plain \ltrpar\ql \li0\ri0\widctlpar\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \rtlch\fcs1 \af0\afs24\alang1025 \ltrch\fcs0 \fs24\lang2057\langfe1033\cgrid\langnp2057\langfenp1033
-{\rtlch\fcs1 \af0 \ltrch\fcs0 \insrsid5249973 \chftnsep
-\par }}{\*\ftnsepc \ltrpar \pard\plain \ltrpar\ql \li0\ri0\widctlpar\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \rtlch\fcs1 \af0\afs24\alang1025 \ltrch\fcs0 \fs24\lang2057\langfe1033\cgrid\langnp2057\langfenp1033 {\rtlch\fcs1 \af0
-\ltrch\fcs0 \insrsid5249973 \chftnsepc
-\par }}{\*\aftnsep \ltrpar \pard\plain \ltrpar\ql \li0\ri0\widctlpar\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \rtlch\fcs1 \af0\afs24\alang1025 \ltrch\fcs0 \fs24\lang2057\langfe1033\cgrid\langnp2057\langfenp1033 {\rtlch\fcs1 \af0
-\ltrch\fcs0 \insrsid5249973 \chftnsep
-\par }}{\*\aftnsepc \ltrpar \pard\plain \ltrpar\ql \li0\ri0\widctlpar\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \rtlch\fcs1 \af0\afs24\alang1025 \ltrch\fcs0 \fs24\lang2057\langfe1033\cgrid\langnp2057\langfenp1033 {\rtlch\fcs1 \af0
-\ltrch\fcs0 \insrsid5249973 \chftnsepc
-\par }}\ltrpar \sectd \ltrsect\linex0\headery709\footery709\colsx708\endnhere\sectlinegrid360\sectdefaultcl\sectrsid3497332\sftnbj {\headerl \ltrpar \pard\plain \ltrpar\s18\ql \li0\ri0\widctlpar
-\tqc\tx4153\tqr\tx8306\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \rtlch\fcs1 \af0\afs20\alang1025 \ltrch\fcs0 \fs22\lang2057\langfe1033\cgrid\langnp2057\langfenp1033 {\rtlch\fcs1 \af0 \ltrch\fcs0 \insrsid2504751
-\par }}{\headerr \ltrpar \pard\plain \ltrpar\s18\ql \li0\ri0\widctlpar\tqc\tx4153\tqr\tx8306\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \rtlch\fcs1 \af0\afs20\alang1025 \ltrch\fcs0 \fs22\lang2057\langfe1033\cgrid\langnp2057\langfenp1033 {
-\rtlch\fcs1 \af0 \ltrch\fcs0 \insrsid2504751
-\par }}{\footerl \ltrpar \pard\plain \ltrpar\s22\ql \li0\ri0\widctlpar\tqc\tx4153\tqr\tx8306\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \rtlch\fcs1 \af0\afs24\alang1025 \ltrch\fcs0 \fs24\lang2057\langfe1033\cgrid\langnp2057\langfenp1033 {
-\rtlch\fcs1 \af0 \ltrch\fcs0 \insrsid2504751
-\par }}{\footerr \ltrpar \pard\plain \ltrpar\s22\qr \li-284\ri0\widctlpar\tqc\tx4153\tqr\tx8306\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin-284\itap0\pararsid14506524 \rtlch\fcs1 \af0\afs24\alang1025 \ltrch\fcs0
-\fs24\lang2057\langfe1033\cgrid\langnp2057\langfenp1033 {\rtlch\fcs1 \af0\afs16 \ltrch\fcs0 \f318\fs16\insrsid1968554 {\pict{\*\picprop\shplid1026{\sp{\sn shapeType}{\sv 1}}{\sp{\sn fFlipH}{\sv 0}}
-{\sp{\sn fFlipV}{\sv 0}}{\sp{\sn fillColor}{\sv 3355443}}{\sp{\sn fRecolorFillAsPicture}{\sv 0}}{\sp{\sn fUseShapeAnchor}{\sv 0}}{\sp{\sn fFilled}{\sv 1}}{\sp{\sn fLine}{\sv 0}}{\sp{\sn pctHR}{\sv 0}}{\sp{\sn alignHR}{\sv 1}}{\sp{\sn dxHeightHR}{\sv 20}}
-{\sp{\sn dxWidthHR}{\sv 10943}}{\sp{\sn fLayoutInCell}{\sv 1}}{\sp{\sn fStandardHR}{\sv 1}}{\sp{\sn fNoshadeHR}{\sv 1}}{\sp{\sn fHorizRule}{\sv 1}}{\sp{\sn fLayoutInCell}{\sv 1}}}\picscalex1094\picscaley4\piccropl0\piccropr0\piccropt0\piccropb0
-\picw1764\pich882\picwgoal1000\pichgoal500\wmetafile8\bliptag667904020\blipupi71{\*\blipuid 27cf68149ca99ab95f958a7b62da888e}010009000003dd02000006001202000000001202000026060f001a04574d464301000000000001003e050000000001000000f803000000000000f80300000100
-00006c000000ffffffffffffffff111100002c00000000000000000000003e480000b900000020454d4600000100f80300001d00000003000000000000000000
-000000000000981200009f1a0000ca0000002101000000000000000000000000000023130300f6660400160000000c000000180000000a000000100000000000
-0000000000000900000010000000111100002c000000250000000c0000000e000080250000000c0000000e000080120000000c00000001000000520000007001
-000001000000a4ffffff000000000000000000000000900100000000000004400022430061006c00690062007200690000000000000000000000000000000000
-0000000000000000000000000000000000000000000000000000000000000000000000001900304e19001000000094511900144f1900fa4e5966945119008c4e
-190010000000fc4f190078511900cc4e5966945119008c4e1900200000008a790c5f8c4e19009451190020000000ffffffffdc008900057a0c5fffffffffffff
-0180ffff01809f020180ffffffff00420000000800000008000018f18d1001000000000000005802000025000000372e90010000020f0502020204030204ff02
-00e0ffac004001000000000000009f01000000000000430061006c006900620072000000000020ebf70486d759667a68466dbc008900306c8000c04e19009832
-055f1f00000001000000fc4e1900fc4e1900907b035f1f000000244f1900dc0089006476000800000000250000000c00000001000000250000000c0000000100
-0000250000000c00000001000000180000000c0000000000000254000000540000000000000000000000350000002b000000010000005fcc87405eb387400000
-000057000000010000004c000000040000000000000000000000111100002c00000050000000200000003600000046000000280000001c000000474449430200
-0000ffffffffffffffff111100002c000000000000002100000008000000620000000c0000000100000024000000240000000000803e00000000000000000000
-803e000000000000000002000000270000001800000002000000000000003333330000000000250000000c00000002000000250000000c000000080000805600
-000030000000ffffffffffffffff111100002c00000005000000fefffefffeffad004144ad004144fefffefffeff250000000c00000007000080250000000c00
-000000000080240000002400000000008040000000000000000000008040000000000000000002000000220000000c000000ffffffff46000000140000000800
-00004744494303000000250000000c0000000e000080250000000c0000000e0000800e0000001400000000000000100000001400000004000000030108000500
-00000b0200000000050000000c0205000c02040000002e0118001c000000fb020200010000000000bc02000000000102022253797374656d003f00003f3f0000
-0000000000000000000001003f3f3f3f3f00040000002d010000040000002d01000004000000020101001c000000fb02f5ff0000000000009001000000000440
-002243616c6962726900000000000000000000000000000000000000000000000000040000002d010100040000002d010100040000002d010100050000000902
-000000020d000000320a0a00000001000400000000000e02050020000600030000001e0007000000fc020000333333000000040000002d01020008000000fa02
-050000000000ffffff00040000002d0103000e0000002403050000000000000005000e0205000e0200000000000008000000fa0200000000000000000000040000002d01040007000000fc020000ffffff000000040000002d010500040000002701ffff040000002d010000040000002d010000030000000000}}{
-\rtlch\fcs1 \af0 \ltrch\fcs0 \f318\ul\insrsid5249973
-\par }\pard \ltrpar\s22\ql \li-284\ri0\widctlpar\tx3075\tx3119\tx7655\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin-284\itap0\pararsid16338741 {\rtlch\fcs1 \af0\afs14 \ltrch\fcs0 \b\f318\fs14\insrsid2504751 Example footer}{\rtlch\fcs1 \af0\afs14
-\ltrch\fcs0 \f318\fs14\insrsid5249973
-\par }}{\headerf \ltrpar \pard\plain \ltrpar\s18\ql \li0\ri0\widctlpar\tqc\tx4153\tqr\tx8306\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \rtlch\fcs1 \af0\afs20\alang1025 \ltrch\fcs0 \fs22\lang2057\langfe1033\cgrid\langnp2057\langfenp1033 {
-\rtlch\fcs1 \af0 \ltrch\fcs0 \insrsid2504751
-\par }}{\footerf \ltrpar \pard\plain \ltrpar\s22\ql \li0\ri0\widctlpar\tqc\tx4153\tqr\tx8306\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \rtlch\fcs1 \af0\afs24\alang1025 \ltrch\fcs0 \fs24\lang2057\langfe1033\cgrid\langnp2057\langfenp1033 {
-\rtlch\fcs1 \af0 \ltrch\fcs0 \insrsid2504751
-\par }}{\*\pnseclvl1\pnucrm\pnstart1\pnindent720\pnhang {\pntxta .}}{\*\pnseclvl2\pnucltr\pnstart1\pnindent720\pnhang {\pntxta .}}{\*\pnseclvl3\pndec\pnstart1\pnindent720\pnhang {\pntxta .}}{\*\pnseclvl4\pnlcltr\pnstart1\pnindent720\pnhang {\pntxta )}}
-{\*\pnseclvl5\pndec\pnstart1\pnindent720\pnhang {\pntxtb (}{\pntxta )}}{\*\pnseclvl6\pnlcltr\pnstart1\pnindent720\pnhang {\pntxtb (}{\pntxta )}}{\*\pnseclvl7\pnlcrm\pnstart1\pnindent720\pnhang {\pntxtb (}{\pntxta )}}{\*\pnseclvl8
-\pnlcltr\pnstart1\pnindent720\pnhang {\pntxtb (}{\pntxta )}}{\*\pnseclvl9\pnlcrm\pnstart1\pnindent720\pnhang {\pntxtb (}{\pntxta )}}\ltrrow\trowd \irow0\irowband0\ltrrow
-\ts11\trgaph108\trleft-108\trkeep\trftsWidth1\trftsWidthB3\trftsWidthA3\trpaddl108\trpaddr108\trpaddfl3\trpaddfr3\tblind0\tblindtype3 \clvertalc\clbrdrt\brdrtbl \clbrdrl\brdrtbl \clbrdrb\brdrtbl \clbrdrr\brdrtbl
-\cltxlrtb\clftsWidth3\clwWidth5508\clshdrawnil \cellx5400\clvertalc\clbrdrt\brdrtbl \clbrdrl\brdrtbl \clbrdrb\brdrtbl \clbrdrr\brdrtbl \cltxlrtb\clftsWidth3\clwWidth4912\clshdrawnil \cellx10312\pard\plain \ltrpar\ql \li0\ri0\widctlpar\intbl
-\tx5670\tx8222\wrapdefault\faauto\rin0\lin0 \rtlch\fcs1 \af0\afs24\alang1025 \ltrch\fcs0 \fs24\lang2057\langfe1033\cgrid\langnp2057\langfenp1033 {\rtlch\fcs1 \af1\afs18 \ltrch\fcs0 \f1\fs18\cf1\lang2057\langfe2057\langfenp2057\insrsid6695929 \cell
-}\pard \ltrpar\qc \li0\ri0\widctlpar\intbl\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin0\pararsid16338741 {\rtlch\fcs1 \af1\afs18 \ltrch\fcs0 \f1\fs18\lang2057\langfe2057\langfenp2057\insrsid6695929 \cell }\pard \ltrpar
-\ql \li0\ri0\sa200\sl276\slmult1\widctlpar\intbl\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin0 {\rtlch\fcs1 \af1 \ltrch\fcs0 \f1\fs20\insrsid6695929 \trowd \irow0\irowband0\ltrrow
-\ts11\trgaph108\trleft-108\trkeep\trftsWidth1\trftsWidthB3\trftsWidthA3\trpaddl108\trpaddr108\trpaddfl3\trpaddfr3\tblind0\tblindtype3 \clvertalc\clbrdrt\brdrtbl \clbrdrl\brdrtbl \clbrdrb\brdrtbl \clbrdrr\brdrtbl
-\cltxlrtb\clftsWidth3\clwWidth5508\clshdrawnil \cellx5400\clvertalc\clbrdrt\brdrtbl \clbrdrl\brdrtbl \clbrdrb\brdrtbl \clbrdrr\brdrtbl \cltxlrtb\clftsWidth3\clwWidth4912\clshdrawnil \cellx10312\row \ltrrow}\trowd \irow1\irowband1\lastrow \ltrrow
-\ts11\trgaph108\trleft-108\trkeep\trftsWidth1\trftsWidthB3\trftsWidthA3\trpaddl108\trpaddr108\trpaddfl3\trpaddfr3\tblind0\tblindtype3 \clvertalc\clbrdrt\brdrtbl \clbrdrl\brdrtbl \clbrdrb\brdrtbl \clbrdrr\brdrtbl
-\cltxlrtb\clftsWidth3\clwWidth10420\clshdrawnil \cellx10312\pard \ltrpar\ql \li0\ri0\widctlpar\intbl\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin0 {\rtlch\fcs1 \af1\afs18 \ltrch\fcs0 \f1\fs8\lang2057\langfe2057\langfenp2057\insrsid6695929
-\cell }\pard \ltrpar\ql \li0\ri0\sa200\sl276\slmult1\widctlpar\intbl\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin0 {\rtlch\fcs1 \af1 \ltrch\fcs0 \f1\fs20\insrsid6695929 \trowd \irow1\irowband1\lastrow \ltrrow
-\ts11\trgaph108\trleft-108\trkeep\trftsWidth1\trftsWidthB3\trftsWidthA3\trpaddl108\trpaddr108\trpaddfl3\trpaddfr3\tblind0\tblindtype3 \clvertalc\clbrdrt\brdrtbl \clbrdrl\brdrtbl \clbrdrb\brdrtbl \clbrdrr\brdrtbl
-\cltxlrtb\clftsWidth3\clwWidth10420\clshdrawnil \cellx10312\row }\pard \ltrpar\qj \li0\ri0\widctlpar\tx0\wrapdefault\aspalpha\aspnum\faauto\outlinelevel0\adjustright\rin0\lin0\itap0\pararsid15429861 {\rtlch\fcs1 \af0 \ltrch\fcs0 \f171\fs20\insrsid2504751
-Example text to extract from RTF.}{\rtlch\fcs1 \af0 \ltrch\fcs0 \f171\fs20\insrsid15429861
-\par
-\par
-\par }{\rtlch\fcs1 \af0 \ltrch\fcs0 \b\f171\fs20\ul\insrsid15429861
-\par }{\rtlch\fcs1 \af0 \ltrch\fcs0 \b\f171\fs20\ul\insrsid15429861\charrsid4947815
-\par }{\*\themedata 504b030414000600080000002100828abc13fa0000001c020000130000005b436f6e74656e745f54797065735d2e786d6cac91cb6ac3301045f785fe83d0b6d8
-72ba28a5d8cea249777d2cd20f18e4b12d6a8f843409c9df77ecb850ba082d74231062ce997b55ae8fe3a00e1893f354e9555e6885647de3a8abf4fbee29bbd7
-2a3150038327acf409935ed7d757e5ee14302999a654e99e393c18936c8f23a4dc072479697d1c81e51a3b13c07e4087e6b628ee8cf5c4489cf1c4d075f92a0b
-44d7a07a83c82f308ac7b0a0f0fbf90c2480980b58abc733615aa2d210c2e02cb04430076a7ee833dfb6ce62e3ed7e14693e8317d8cd0433bf5c60f53fea2fe7
-065bd80facb647e9e25c7fc421fd2ddb526b2e9373fed4bb902e182e97b7b461e6bfad3f010000ffff0300504b030414000600080000002100a5d6a7e7c00000
-00360100000b0000005f72656c732f2e72656c73848fcf6ac3300c87ef85bd83d17d51d2c31825762fa590432fa37d00e1287f68221bdb1bebdb4fc7060abb08
-84a4eff7a93dfeae8bf9e194e720169aaa06c3e2433fcb68e1763dbf7f82c985a4a725085b787086a37bdbb55fbc50d1a33ccd311ba548b63095120f88d94fbc
-52ae4264d1c910d24a45db3462247fa791715fd71f989e19e0364cd3f51652d73760ae8fa8c9ffb3c330cc9e4fc17faf2ce545046e37944c69e462a1a82fe353
-bd90a865aad41ed0b5b8f9d6fd010000ffff0300504b0304140006000800000021006b799616830000008a0000001c0000007468656d652f7468656d652f7468
-656d654d616e616765722e786d6c0ccc4d0ac3201040e17da17790d93763bb284562b2cbaebbf600439c1a41c7a0d29fdbd7e5e38337cedf14d59b4b0d592c9c
-070d8a65cd2e88b7f07c2ca71ba8da481cc52c6ce1c715e6e97818c9b48d13df49c873517d23d59085adb5dd20d6b52bd521ef2cdd5eb9246a3d8b4757e8d3f7
-29e245eb2b260a0238fd010000ffff0300504b03041400060008000000210096b5ade296060000501b0000160000007468656d652f7468656d652f7468656d65
-312e786d6cec594f6fdb3614bf0fd87720746f6327761a07758ad8b19b2d4d1bc46e871e698996d850a240d2497d1bdae38001c3ba618715d86d87615b8116d8
-a5fb34d93a6c1dd0afb0475292c5585e9236d88aad3e2412f9e3fbff1e1fa9abd7eec70c1d1221294fda5efd72cd4324f1794093b0eddd1ef62fad79482a9c04
-98f184b4bd2991deb58df7dfbb8ad755446282607d22d771db8b944ad79796a40fc3585ee62949606ecc458c15bc8a702910f808e8c66c69b9565b5d8a314d3c
-94e018c8de1a8fa94fd05093f43672e23d06af89927ac06762a049136785c10607758d9053d965021d62d6f6804fc08f86e4bef210c352c144dbab999fb7b471
-7509af678b985ab0b6b4ae6f7ed9ba6c4170b06c788a705430adf71bad2b5b057d03606a1ed7ebf5babd7a41cf00b0ef83a6569632cd467faddec9699640f671
-9e76b7d6ac355c7c89feca9cccad4ea7d36c65b258a206641f1b73f8b5da6a6373d9c11b90c537e7f08dce66b7bbeae00dc8e257e7f0fd2badd5868b37a088d1
-e4600ead1ddaef67d40bc898b3ed4af81ac0d76a197c86826828a24bb318f3442d8ab518dfe3a20f000d6458d104a9694ac6d88728eee2782428d60cf03ac1a5
-193be4cbb921cd0b495fd054b5bd0f530c1931a3f7eaf9f7af9e3f45c70f9e1d3ff8e9f8e1c3e3073f5a42ceaa6d9c84e5552fbffdeccfc71fa33f9e7ef3f2d1
-17d57859c6fffac327bffcfc793510d26726ce8b2f9ffcf6ecc98baf3efdfdbb4715f04d814765f890c644a29be408edf3181433567125272371be15c308d3f2
-8acd249438c19a4b05fd9e8a1cf4cd296699771c393ac4b5e01d01e5a30a787d72cf1178108989a2159c77a2d801ee72ce3a5c545a6147f32a99793849c26ae6
-6252c6ed637c58c5bb8b13c7bfbd490a75330f4b47f16e441c31f7184e140e494214d273fc80900aedee52ead87597fa824b3e56e82e451d4c2b4d32a423279a
-668bb6690c7e9956e90cfe766cb37b077538abd27a8b1cba48c80acc2a841f12e698f13a9e281c57911ce298950d7e03aba84ac8c154f8655c4f2af074481847
-bd804859b5e696007d4b4edfc150b12addbecba6b18b148a1e54d1bc81392f23b7f84137c2715a851dd0242a633f900710a218ed715505dfe56e86e877f0034e
-16bafb0e258ebb4faf06b769e888340b103d3311da9750aa9d0a1cd3e4efca31a3508f6d0c5c5c398602f8e2ebc71591f5b616e24dd893aa3261fb44f95d843b
-5974bb5c04f4edafb95b7892ec1108f3f98de75dc97d5772bdff7cc95d94cf672db4b3da0a6557f70db629362d72bcb0431e53c6066acac80d699a6409fb44d0
-8741bdce9c0e4971624a2378cceaba830b05366b90e0ea23aaa241845368b0eb9e2612ca8c742851ca251ceccc70256d8d87265dd96361531f186c3d9058edf2
-c00eafe8e1fc5c509031bb4d680e9f39a3154de0accc56ae644441edd76156d7429d995bdd88664a9dc3ad50197c38af1a0c16d684060441db02565e85f3b966
-0d0713cc48a0ed6ef7dedc2dc60b17e92219e180643ed27acffba86e9c94c78ab90980d8a9f0913ee49d62b512b79626fb06dccee2a432bbc60276b9f7dec44b
-7904cfbca4f3f6443ab2a49c9c2c41476dafd55c6e7ac8c769db1bc399161ee314bc2e75cf8759081743be1236ec4f4d6693e5336fb672c5dc24a8c33585b5fb
-9cc24e1d4885545b58463634cc5416022cd19cacfccb4d30eb45296023fd35a458598360f8d7a4003bbaae25e331f155d9d9a5116d3bfb9a95523e51440ca2e0
-088dd844ec6370bf0e55d027a012ae264c45d02f708fa6ad6da6dce29c255df9f6cae0ec38666984b372ab5334cf640b37795cc860de4ae2816e95b21be5ceaf
-8a49f90b52a51cc6ff3355f47e0237052b81f6800fd7b802239daf6d8f0b1571a8426944fdbe80c6c1d40e8816b88b8569082ab84c36ff0539d4ff6dce591a26
-ade1c0a7f669880485fd484582903d284b26fa4e2156cff62e4b9265844c4495c495a9157b440e091bea1ab8aaf7760f4510eaa69a6465c0e04ec69ffb9e65d0
-28d44d4e39df9c1a52ecbd3607fee9cec7263328e5d661d3d0e4f62f44acd855ed7ab33cdf7bcb8ae889599bd5c8b3029895b6825696f6af29c239b75a5bb1e6
-345e6ee6c28117e73586c1a2214ae1be07e93fb0ff51e133fb65426fa843be0fb515c187064d0cc206a2fa926d3c902e907670048d931db4c1a44959d366ad93
-b65abe595f70a75bf03d616c2dd959fc7d4e6317cd99cbcec9c58b34766661c7d6766ca1a9c1b327531486c6f941c638c67cd22a7f75e2a37be0e82db8df9f30
-254d30c1372581a1f51c983c80e4b71ccdd28dbf000000ffff0300504b0304140006000800000021000dd1909fb60000001b010000270000007468656d652f74
-68656d652f5f72656c732f7468656d654d616e616765722e786d6c2e72656c73848f4d0ac2301484f78277086f6fd3ba109126dd88d0add40384e4350d363f24
-51eced0dae2c082e8761be9969bb979dc9136332de3168aa1a083ae995719ac16db8ec8e4052164e89d93b64b060828e6f37ed1567914b284d262452282e3198
-720e274a939cd08a54f980ae38a38f56e422a3a641c8bbd048f7757da0f19b017cc524bd62107bd5001996509affb3fd381a89672f1f165dfe514173d9850528
-a2c6cce0239baa4c04ca5bbabac4df000000ffff0300504b01022d0014000600080000002100828abc13fa0000001c0200001300000000000000000000000000
-000000005b436f6e74656e745f54797065735d2e786d6c504b01022d0014000600080000002100a5d6a7e7c0000000360100000b000000000000000000000000
-002b0100005f72656c732f2e72656c73504b01022d00140006000800000021006b799616830000008a0000001c00000000000000000000000000140200007468
-656d652f7468656d652f7468656d654d616e616765722e786d6c504b01022d001400060008000000210096b5ade296060000501b000016000000000000000000
-00000000d10200007468656d652f7468656d652f7468656d65312e786d6c504b01022d00140006000800000021000dd1909fb60000001b010000270000000000
-00000000000000009b0900007468656d652f7468656d652f5f72656c732f7468656d654d616e616765722e786d6c2e72656c73504b050600000000050005005d010000960a00000000}
-{\*\colorschememapping 3c3f786d6c2076657273696f6e3d22312e302220656e636f64696e673d225554462d3822207374616e64616c6f6e653d22796573223f3e0d0a3c613a636c724d
-617020786d6c6e733a613d22687474703a2f2f736368656d61732e6f70656e786d6c666f726d6174732e6f72672f64726177696e676d6c2f323030362f6d6169
-6e22206267313d226c743122207478313d22646b3122206267323d226c743222207478323d22646b322220616363656e74313d22616363656e74312220616363
-656e74323d22616363656e74322220616363656e74333d22616363656e74332220616363656e74343d22616363656e74342220616363656e74353d22616363656e74352220616363656e74363d22616363656e74362220686c696e6b3d22686c696e6b2220666f6c486c696e6b3d22666f6c486c696e6b222f3e}
-{\*\latentstyles\lsdstimax267\lsdlockeddef0\lsdsemihiddendef1\lsdunhideuseddef1\lsdqformatdef0\lsdprioritydef99{\lsdlockedexcept \lsdsemihidden0 \lsdunhideused0 \lsdqformat1 \lsdpriority0 \lsdlocked0 Normal;
-\lsdsemihidden0 \lsdunhideused0 \lsdqformat1 \lsdpriority9 \lsdlocked0 heading 1;\lsdsemihidden0 \lsdunhideused0 \lsdqformat1 \lsdpriority9 \lsdlocked0 heading 2;\lsdqformat1 \lsdpriority9 \lsdlocked0 heading 3;
-\lsdsemihidden0 \lsdunhideused0 \lsdqformat1 \lsdpriority9 \lsdlocked0 heading 4;\lsdqformat1 \lsdpriority9 \lsdlocked0 heading 5;\lsdqformat1 \lsdpriority9 \lsdlocked0 heading 6;\lsdqformat1 \lsdpriority9 \lsdlocked0 heading 7;
-\lsdqformat1 \lsdpriority9 \lsdlocked0 heading 8;\lsdqformat1 \lsdpriority9 \lsdlocked0 heading 9;\lsdpriority39 \lsdlocked0 toc 1;\lsdpriority39 \lsdlocked0 toc 2;\lsdpriority39 \lsdlocked0 toc 3;\lsdpriority39 \lsdlocked0 toc 4;
-\lsdpriority39 \lsdlocked0 toc 5;\lsdpriority39 \lsdlocked0 toc 6;\lsdpriority39 \lsdlocked0 toc 7;\lsdpriority39 \lsdlocked0 toc 8;\lsdpriority39 \lsdlocked0 toc 9;\lsdqformat1 \lsdpriority35 \lsdlocked0 caption;
-\lsdsemihidden0 \lsdunhideused0 \lsdqformat1 \lsdpriority10 \lsdlocked0 Title;\lsdpriority1 \lsdlocked0 Default Paragraph Font;\lsdsemihidden0 \lsdunhideused0 \lsdqformat1 \lsdpriority11 \lsdlocked0 Subtitle;
-\lsdsemihidden0 \lsdunhideused0 \lsdqformat1 \lsdpriority22 \lsdlocked0 Strong;\lsdsemihidden0 \lsdunhideused0 \lsdqformat1 \lsdpriority20 \lsdlocked0 Emphasis;\lsdsemihidden0 \lsdunhideused0 \lsdpriority59 \lsdlocked0 Table Grid;
-\lsdunhideused0 \lsdlocked0 Placeholder Text;\lsdsemihidden0 \lsdunhideused0 \lsdqformat1 \lsdpriority1 \lsdlocked0 No Spacing;\lsdsemihidden0 \lsdunhideused0 \lsdpriority60 \lsdlocked0 Light Shading;
-\lsdsemihidden0 \lsdunhideused0 \lsdpriority61 \lsdlocked0 Light List;\lsdsemihidden0 \lsdunhideused0 \lsdpriority62 \lsdlocked0 Light Grid;\lsdsemihidden0 \lsdunhideused0 \lsdpriority63 \lsdlocked0 Medium Shading 1;
-\lsdsemihidden0 \lsdunhideused0 \lsdpriority64 \lsdlocked0 Medium Shading 2;\lsdsemihidden0 \lsdunhideused0 \lsdpriority65 \lsdlocked0 Medium List 1;\lsdsemihidden0 \lsdunhideused0 \lsdpriority66 \lsdlocked0 Medium List 2;
-\lsdsemihidden0 \lsdunhideused0 \lsdpriority67 \lsdlocked0 Medium Grid 1;\lsdsemihidden0 \lsdunhideused0 \lsdpriority68 \lsdlocked0 Medium Grid 2;\lsdsemihidden0 \lsdunhideused0 \lsdpriority69 \lsdlocked0 Medium Grid 3;
-\lsdsemihidden0 \lsdunhideused0 \lsdpriority70 \lsdlocked0 Dark List;\lsdsemihidden0 \lsdunhideused0 \lsdpriority71 \lsdlocked0 Colorful Shading;\lsdsemihidden0 \lsdunhideused0 \lsdpriority72 \lsdlocked0 Colorful List;
-\lsdsemihidden0 \lsdunhideused0 \lsdpriority73 \lsdlocked0 Colorful Grid;\lsdsemihidden0 \lsdunhideused0 \lsdpriority60 \lsdlocked0 Light Shading Accent 1;\lsdsemihidden0 \lsdunhideused0 \lsdpriority61 \lsdlocked0 Light List Accent 1;
-\lsdsemihidden0 \lsdunhideused0 \lsdpriority62 \lsdlocked0 Light Grid Accent 1;\lsdsemihidden0 \lsdunhideused0 \lsdpriority63 \lsdlocked0 Medium Shading 1 Accent 1;\lsdsemihidden0 \lsdunhideused0 \lsdpriority64 \lsdlocked0 Medium Shading 2 Accent 1;
-\lsdsemihidden0 \lsdunhideused0 \lsdpriority65 \lsdlocked0 Medium List 1 Accent 1;\lsdunhideused0 \lsdlocked0 Revision;\lsdsemihidden0 \lsdunhideused0 \lsdqformat1 \lsdpriority34 \lsdlocked0 List Paragraph;
-\lsdsemihidden0 \lsdunhideused0 \lsdqformat1 \lsdpriority29 \lsdlocked0 Quote;\lsdsemihidden0 \lsdunhideused0 \lsdqformat1 \lsdpriority30 \lsdlocked0 Intense Quote;\lsdsemihidden0 \lsdunhideused0 \lsdpriority66 \lsdlocked0 Medium List 2 Accent 1;
-\lsdsemihidden0 \lsdunhideused0 \lsdpriority67 \lsdlocked0 Medium Grid 1 Accent 1;\lsdsemihidden0 \lsdunhideused0 \lsdpriority68 \lsdlocked0 Medium Grid 2 Accent 1;\lsdsemihidden0 \lsdunhideused0 \lsdpriority69 \lsdlocked0 Medium Grid 3 Accent 1;
-\lsdsemihidden0 \lsdunhideused0 \lsdpriority70 \lsdlocked0 Dark List Accent 1;\lsdsemihidden0 \lsdunhideused0 \lsdpriority71 \lsdlocked0 Colorful Shading Accent 1;\lsdsemihidden0 \lsdunhideused0 \lsdpriority72 \lsdlocked0 Colorful List Accent 1;
-\lsdsemihidden0 \lsdunhideused0 \lsdpriority73 \lsdlocked0 Colorful Grid Accent 1;\lsdsemihidden0 \lsdunhideused0 \lsdpriority60 \lsdlocked0 Light Shading Accent 2;\lsdsemihidden0 \lsdunhideused0 \lsdpriority61 \lsdlocked0 Light List Accent 2;
-\lsdsemihidden0 \lsdunhideused0 \lsdpriority62 \lsdlocked0 Light Grid Accent 2;\lsdsemihidden0 \lsdunhideused0 \lsdpriority63 \lsdlocked0 Medium Shading 1 Accent 2;\lsdsemihidden0 \lsdunhideused0 \lsdpriority64 \lsdlocked0 Medium Shading 2 Accent 2;
-\lsdsemihidden0 \lsdunhideused0 \lsdpriority65 \lsdlocked0 Medium List 1 Accent 2;\lsdsemihidden0 \lsdunhideused0 \lsdpriority66 \lsdlocked0 Medium List 2 Accent 2;\lsdsemihidden0 \lsdunhideused0 \lsdpriority67 \lsdlocked0 Medium Grid 1 Accent 2;
-\lsdsemihidden0 \lsdunhideused0 \lsdpriority68 \lsdlocked0 Medium Grid 2 Accent 2;\lsdsemihidden0 \lsdunhideused0 \lsdpriority69 \lsdlocked0 Medium Grid 3 Accent 2;\lsdsemihidden0 \lsdunhideused0 \lsdpriority70 \lsdlocked0 Dark List Accent 2;
-\lsdsemihidden0 \lsdunhideused0 \lsdpriority71 \lsdlocked0 Colorful Shading Accent 2;\lsdsemihidden0 \lsdunhideused0 \lsdpriority72 \lsdlocked0 Colorful List Accent 2;\lsdsemihidden0 \lsdunhideused0 \lsdpriority73 \lsdlocked0 Colorful Grid Accent 2;
-\lsdsemihidden0 \lsdunhideused0 \lsdpriority60 \lsdlocked0 Light Shading Accent 3;\lsdsemihidden0 \lsdunhideused0 \lsdpriority61 \lsdlocked0 Light List Accent 3;\lsdsemihidden0 \lsdunhideused0 \lsdpriority62 \lsdlocked0 Light Grid Accent 3;
-\lsdsemihidden0 \lsdunhideused0 \lsdpriority63 \lsdlocked0 Medium Shading 1 Accent 3;\lsdsemihidden0 \lsdunhideused0 \lsdpriority64 \lsdlocked0 Medium Shading 2 Accent 3;\lsdsemihidden0 \lsdunhideused0 \lsdpriority65 \lsdlocked0 Medium List 1 Accent 3;
-\lsdsemihidden0 \lsdunhideused0 \lsdpriority66 \lsdlocked0 Medium List 2 Accent 3;\lsdsemihidden0 \lsdunhideused0 \lsdpriority67 \lsdlocked0 Medium Grid 1 Accent 3;\lsdsemihidden0 \lsdunhideused0 \lsdpriority68 \lsdlocked0 Medium Grid 2 Accent 3;
-\lsdsemihidden0 \lsdunhideused0 \lsdpriority69 \lsdlocked0 Medium Grid 3 Accent 3;\lsdsemihidden0 \lsdunhideused0 \lsdpriority70 \lsdlocked0 Dark List Accent 3;\lsdsemihidden0 \lsdunhideused0 \lsdpriority71 \lsdlocked0 Colorful Shading Accent 3;
-\lsdsemihidden0 \lsdunhideused0 \lsdpriority72 \lsdlocked0 Colorful List Accent 3;\lsdsemihidden0 \lsdunhideused0 \lsdpriority73 \lsdlocked0 Colorful Grid Accent 3;\lsdsemihidden0 \lsdunhideused0 \lsdpriority60 \lsdlocked0 Light Shading Accent 4;
-\lsdsemihidden0 \lsdunhideused0 \lsdpriority61 \lsdlocked0 Light List Accent 4;\lsdsemihidden0 \lsdunhideused0 \lsdpriority62 \lsdlocked0 Light Grid Accent 4;\lsdsemihidden0 \lsdunhideused0 \lsdpriority63 \lsdlocked0 Medium Shading 1 Accent 4;
-\lsdsemihidden0 \lsdunhideused0 \lsdpriority64 \lsdlocked0 Medium Shading 2 Accent 4;\lsdsemihidden0 \lsdunhideused0 \lsdpriority65 \lsdlocked0 Medium List 1 Accent 4;\lsdsemihidden0 \lsdunhideused0 \lsdpriority66 \lsdlocked0 Medium List 2 Accent 4;
-\lsdsemihidden0 \lsdunhideused0 \lsdpriority67 \lsdlocked0 Medium Grid 1 Accent 4;\lsdsemihidden0 \lsdunhideused0 \lsdpriority68 \lsdlocked0 Medium Grid 2 Accent 4;\lsdsemihidden0 \lsdunhideused0 \lsdpriority69 \lsdlocked0 Medium Grid 3 Accent 4;
-\lsdsemihidden0 \lsdunhideused0 \lsdpriority70 \lsdlocked0 Dark List Accent 4;\lsdsemihidden0 \lsdunhideused0 \lsdpriority71 \lsdlocked0 Colorful Shading Accent 4;\lsdsemihidden0 \lsdunhideused0 \lsdpriority72 \lsdlocked0 Colorful List Accent 4;
-\lsdsemihidden0 \lsdunhideused0 \lsdpriority73 \lsdlocked0 Colorful Grid Accent 4;\lsdsemihidden0 \lsdunhideused0 \lsdpriority60 \lsdlocked0 Light Shading Accent 5;\lsdsemihidden0 \lsdunhideused0 \lsdpriority61 \lsdlocked0 Light List Accent 5;
-\lsdsemihidden0 \lsdunhideused0 \lsdpriority62 \lsdlocked0 Light Grid Accent 5;\lsdsemihidden0 \lsdunhideused0 \lsdpriority63 \lsdlocked0 Medium Shading 1 Accent 5;\lsdsemihidden0 \lsdunhideused0 \lsdpriority64 \lsdlocked0 Medium Shading 2 Accent 5;
-\lsdsemihidden0 \lsdunhideused0 \lsdpriority65 \lsdlocked0 Medium List 1 Accent 5;\lsdsemihidden0 \lsdunhideused0 \lsdpriority66 \lsdlocked0 Medium List 2 Accent 5;\lsdsemihidden0 \lsdunhideused0 \lsdpriority67 \lsdlocked0 Medium Grid 1 Accent 5;
-\lsdsemihidden0 \lsdunhideused0 \lsdpriority68 \lsdlocked0 Medium Grid 2 Accent 5;\lsdsemihidden0 \lsdunhideused0 \lsdpriority69 \lsdlocked0 Medium Grid 3 Accent 5;\lsdsemihidden0 \lsdunhideused0 \lsdpriority70 \lsdlocked0 Dark List Accent 5;
-\lsdsemihidden0 \lsdunhideused0 \lsdpriority71 \lsdlocked0 Colorful Shading Accent 5;\lsdsemihidden0 \lsdunhideused0 \lsdpriority72 \lsdlocked0 Colorful List Accent 5;\lsdsemihidden0 \lsdunhideused0 \lsdpriority73 \lsdlocked0 Colorful Grid Accent 5;
-\lsdsemihidden0 \lsdunhideused0 \lsdpriority60 \lsdlocked0 Light Shading Accent 6;\lsdsemihidden0 \lsdunhideused0 \lsdpriority61 \lsdlocked0 Light List Accent 6;\lsdsemihidden0 \lsdunhideused0 \lsdpriority62 \lsdlocked0 Light Grid Accent 6;
-\lsdsemihidden0 \lsdunhideused0 \lsdpriority63 \lsdlocked0 Medium Shading 1 Accent 6;\lsdsemihidden0 \lsdunhideused0 \lsdpriority64 \lsdlocked0 Medium Shading 2 Accent 6;\lsdsemihidden0 \lsdunhideused0 \lsdpriority65 \lsdlocked0 Medium List 1 Accent 6;
-\lsdsemihidden0 \lsdunhideused0 \lsdpriority66 \lsdlocked0 Medium List 2 Accent 6;\lsdsemihidden0 \lsdunhideused0 \lsdpriority67 \lsdlocked0 Medium Grid 1 Accent 6;\lsdsemihidden0 \lsdunhideused0 \lsdpriority68 \lsdlocked0 Medium Grid 2 Accent 6;
-\lsdsemihidden0 \lsdunhideused0 \lsdpriority69 \lsdlocked0 Medium Grid 3 Accent 6;\lsdsemihidden0 \lsdunhideused0 \lsdpriority70 \lsdlocked0 Dark List Accent 6;\lsdsemihidden0 \lsdunhideused0 \lsdpriority71 \lsdlocked0 Colorful Shading Accent 6;
-\lsdsemihidden0 \lsdunhideused0 \lsdpriority72 \lsdlocked0 Colorful List Accent 6;\lsdsemihidden0 \lsdunhideused0 \lsdpriority73 \lsdlocked0 Colorful Grid Accent 6;\lsdsemihidden0 \lsdunhideused0 \lsdqformat1 \lsdpriority19 \lsdlocked0 Subtle Emphasis;
-\lsdsemihidden0 \lsdunhideused0 \lsdqformat1 \lsdpriority21 \lsdlocked0 Intense Emphasis;\lsdsemihidden0 \lsdunhideused0 \lsdqformat1 \lsdpriority31 \lsdlocked0 Subtle Reference;
-\lsdsemihidden0 \lsdunhideused0 \lsdqformat1 \lsdpriority32 \lsdlocked0 Intense Reference;\lsdsemihidden0 \lsdunhideused0 \lsdqformat1 \lsdpriority33 \lsdlocked0 Book Title;\lsdpriority37 \lsdlocked0 Bibliography;
-\lsdqformat1 \lsdpriority39 \lsdlocked0 TOC Heading;}}{\*\datastore 0105000002000000180000004d73786d6c322e534158584d4c5265616465722e352e3000000000000000000000060000
-d0cf11e0a1b11ae1000000000000000000000000000000003e000300feff090006000000000000000000000001000000010000000000000000100000feffffff00000000feffffff0000000000000000ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
-ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
-ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
-ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
-fffffffffffffffffdfffffffeffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
-ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
-ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
-ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
-ffffffffffffffffffffffffffffffff52006f006f007400200045006e00740072007900000000000000000000000000000000000000000000000000000000000000000000000000000000000000000016000500ffffffffffffffffffffffffec69d9888b8b3d4c859eaf6cd158be0f0000000000000000000000004077
-60480b5dd101feffffff00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000ffffffffffffffffffffffff00000000000000000000000000000000000000000000000000000000
-00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000ffffffffffffffffffffffff0000000000000000000000000000000000000000000000000000
-000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000ffffffffffffffffffffffff000000000000000000000000000000000000000000000000
-0000000000000000000000000000000000000000000000000105000000000000}}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-server/src/test/resources/test_recursive_embedded.docx
----------------------------------------------------------------------
diff --git a/tika-server/src/test/resources/test_recursive_embedded.docx b/tika-server/src/test/resources/test_recursive_embedded.docx
deleted file mode 100644
index cd562cb..0000000
Binary files a/tika-server/src/test/resources/test_recursive_embedded.docx and /dev/null differ
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-test-resources/pom.xml
----------------------------------------------------------------------
diff --git a/tika-test-resources/pom.xml b/tika-test-resources/pom.xml
index a39fc62..30d41e0 100644
--- a/tika-test-resources/pom.xml
+++ b/tika-test-resources/pom.xml
@@ -16,11 +16,6 @@
<dependencies>
<dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-core</artifactId>
- <version>${project.version}</version>
- </dependency>
- <dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
</dependency>
@@ -83,6 +78,4 @@
</plugin>
</plugins>
</build>
-
-
</project>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-test-resources/src/test/resources/org/apache/tika/parser/ner/regex/ner-regex.txt
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/org/apache/tika/parser/ner/regex/ner-regex.txt b/tika-test-resources/src/test/resources/org/apache/tika/parser/ner/regex/ner-regex.txt
deleted file mode 100644
index e6fa39e..0000000
--- a/tika-test-resources/src/test/resources/org/apache/tika/parser/ner/regex/ner-regex.txt
+++ /dev/null
@@ -1,17 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-WEEK_DAY=(?i)((sun)|(mon)|(tues)|(thurs)|(fri)|((sat)(ur)?))(day)?
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-test-resources/src/test/resources/org/apache/tika/parser/ner/tika-config.xml
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/org/apache/tika/parser/ner/tika-config.xml b/tika-test-resources/src/test/resources/org/apache/tika/parser/ner/tika-config.xml
deleted file mode 100644
index 267c399..0000000
--- a/tika-test-resources/src/test/resources/org/apache/tika/parser/ner/tika-config.xml
+++ /dev/null
@@ -1,27 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<properties>
- <parsers>
- <parser class="org.apache.tika.parser.ner.NamedEntityParser">
- <mime>text/plain</mime>
- <mime>text/html</mime>
- <mime>application/xhtml+xml</mime>
- </parser>
- </parsers>
-
-</properties>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-test-resources/src/test/resources/test-documents/2exe.docx
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/2exe.docx b/tika-test-resources/src/test/resources/test-documents/2exe.docx
new file mode 100644
index 0000000..64cfbe1
Binary files /dev/null and b/tika-test-resources/src/test/resources/test-documents/2exe.docx differ
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-test-resources/src/test/resources/test-documents/2pic.doc
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/2pic.doc b/tika-test-resources/src/test/resources/test-documents/2pic.doc
new file mode 100644
index 0000000..75c53b3
Binary files /dev/null and b/tika-test-resources/src/test/resources/test-documents/2pic.doc differ
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-test-resources/src/test/resources/test-documents/2pic.docx
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/2pic.docx b/tika-test-resources/src/test/resources/test-documents/2pic.docx
new file mode 100644
index 0000000..fe424e4
Binary files /dev/null and b/tika-test-resources/src/test/resources/test-documents/2pic.docx differ
http://git-wip-us.apache.org/repos/asf/tika/blob/aa5f60d7/tika-test-resources/src/test/resources/test-documents/CDEC_WEATHER_2010_03_02
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/CDEC_WEATHER_2010_03_02 b/tika-test-resources/src/test/resources/test-documents/CDEC_WEATHER_2010_03_02
new file mode 100644
index 0000000..c50e8e7
--- /dev/null
+++ b/tika-test-resources/src/test/resources/test-documents/CDEC_WEATHER_2010_03_02
@@ -0,0 +1,98 @@
+Station ID Start Date Date Time Temp Cond Depth DO Flow WXT510P Latitude Longitude
+SMN 03/02/2010 03/01/2010 23:00 14.5 791.00 53.00 7.5 1460 37.347214 -120.976181
+SMN 03/02/2010 03/01/2010 23:15 14.5 790.00 52.99 7.5 1450 37.347214 -120.976181
+SMN 03/02/2010 03/01/2010 23:30 14.5 788.00 53.03 7.4 1480 37.347214 -120.976181
+SMN 03/02/2010 03/01/2010 23:45 14.5 790.00 53.03 7.4 1480 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 00:00 14.5 785.00 53.02 7.4 1470 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 00:15 14.5 786.00 53.00 7.4 1460 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 00:30 14.5 790.00 53.04 7.4 1480 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 00:45 14.5 792.00 53.02 7.3 1470 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 01:00 14.5 786.00 53.03 7.3 1480 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 01:15 14.5 787.00 53.03 7.3 1480 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 01:30 14.5 791.00 53.03 7.3 1480 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 01:45 14.5 789.00 53.04 7.3 1480 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 02:00 14.5 794.00 53.06 7.2 1490 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 02:15 14.4 801.00 53.06 7.2 1490 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 02:30 14.4 802.00 53.04 7.2 1480 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 02:45 14.4 803.00 53.07 7.2 1500 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 03:00 14.4 802.00 53.06 7.2 1490 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 03:15 14.4 803.00 53.08 7.2 1500 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 03:30 14.4 806.00 53.06 7.2 1490 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 03:45 14.4 807.00 53.08 7.1 1500 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 04:00 14.4 810.00 53.09 7.1 1510 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 04:15 14.4 810.00 53.10 7.1 1520 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 04:30 14.3 808.00 53.11 7.1 1520 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 04:45 14.3 810.00 53.11 7.1 1520 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 05:00 14.3 813.00 53.11 7.0 1520 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 05:15 14.3 811.00 53.11 7.0 1520 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 05:30 14.3 810.00 53.10 7.0 1520 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 05:45 14.3 805.00 53.12 7.0 1530 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 06:00 14.2 806.00 53.10 7.0 1520 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 06:15 14.2 805.00 53.12 7.0 1530 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 06:30 14.2 808.00 53.14 6.9 1540 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 06:45 14.2 809.00 53.14 6.9 1540 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 07:00 14.2 803.00 53.13 6.9 1530 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 07:15 14.2 807.00 53.13 6.9 1530 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 07:30 14.2 805.00 53.14 6.9 1540 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 07:45 14.2 811.00 53.14 6.9 1540 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 08:00 14.2 815.00 53.15 6.9 1540 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 08:15 14.3 817.00 53.13 6.9 1530 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 08:30 14.3 817.00 53.15 6.9 1540 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 08:45 14.3 811.00 53.16 6.8 1550 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 09:00 14.3 810.00 53.17 6.9 1560 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 09:15 14.3 809.00 53.16 6.9 1550 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 09:30 14.3 813.00 53.18 6.9 1560 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 09:45 14.3 813.00 53.17 6.9 1560 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 10:00 14.3 813.00 53.19 6.9 1570 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 10:15 14.3 820.00 53.17 6.9 1560 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 10:30 14.3 818.00 53.18 6.9 1560 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 10:45 14.3 821.00 53.19 6.9 1570 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 11:00 14.3 821.00 53.18 6.9 1560 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 11:15 14.3 825.00 53.18 6.9 1560 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 11:30 14.3 827.00 53.17 6.9 1560 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 11:45 14.3 825.00 53.18 6.9 1560 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 12:00 14.3 829.00 53.19 6.9 1570 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 12:15 14.4 831.00 53.20 6.9 1570 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 12:30 14.4 837.00 53.20 7.0 1570 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 12:45 14.4 835.00 53.20 7.0 1570 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 13:00 14.5 837.00 53.21 7.0 1580 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 13:15 14.5 837.00 53.20 7.0 1570 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 13:30 14.5 842.00 53.20 7.0 1570 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 13:45 14.5 848.00 53.22 7.0 1590 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 14:00 14.5 850.00 53.20 7.0 1570 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 14:15 14.5 851.00 53.20 7.0 1570 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 14:30 14.5 849.00 53.20 7.1 1570 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 14:45 14.6 858.00 53.20 7.1 1570 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 15:00 14.6 869.00 53.20 7.1 1570 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 15:15 14.6 868.00 53.22 7.1 1590 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 15:30 14.5 868.00 53.23 7.1 1590 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 15:45 14.5 869.00 53.22 7.1 1590 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 16:00 14.5 873.00 53.22 7.2 1590 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 16:15 14.5 877.00 53.23 7.2 1590 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 16:30 14.5 884.00 53.23 7.2 1590 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 16:45 14.5 887.00 53.23 7.2 1590 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 17:00 14.5 889.00 53.22 7.2 1590 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 17:15 14.5 891.00 53.25 7.2 1600 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 17:30 14.4 893.00 53.24 7.2 1600 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 17:45 14.4 896.00 53.23 7.2 1590 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 18:00 14.4 896.00 53.24 7.2 1600 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 18:15 14.4 895.00 53.24 7.2 1600 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 18:30 14.4 899.00 53.25 7.2 1600 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 18:45 14.3 901.00 53.25 7.2 1600 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 19:00 14.3 899.00 53.24 7.2 1600 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 19:15 14.3 911.00 53.25 7.2 1600 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 19:30 14.3 914.00 53.26 7.2 1610 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 19:45 14.3 913.00 53.22 7.2 1590 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 20:00 14.3 914.00 53.24 7.2 1600 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 20:15 14.2 915.00 53.22 7.3 1590 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 20:30 14.2 917.00 53.24 7.2 1600 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 20:45 14.2 919.00 53.24 7.2 1600 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 21:00 14.2 919.00 53.23 7.2 1590 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 21:15 14.2 923.00 53.21 7.2 1580 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 21:30 14.2 920.00 53.24 7.2 1600 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 21:45 14.2 927.00 53.25 7.2 1600 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 22:00 14.2 929.00 53.23 7.2 1590 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 22:15 14.1 927.00 53.25 7.2 1600 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 22:30 14.1 931.00 53.22 7.2 1590 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 22:45 14.1 931.00 53.25 7.2 1600 760 37.347214 -120.976181
+SMN 03/02/2010 03/02/2010 23:00 14.1 937.00 53.23 7.2 1590 760 37.347214 -120.976181