You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by th...@apache.org on 2016/07/16 19:48:26 UTC

[10/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/src/test/resources/nutch.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/test/resources/nutch.html b/nutch-plugins/parse-tika/src/test/resources/nutch.html
new file mode 100644
index 0000000..0aa7c98
--- /dev/null
+++ b/nutch-plugins/parse-tika/src/test/resources/nutch.html
@@ -0,0 +1,519 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html>
+<head>
+<META http-equiv="Content-Type" content="text/html; charset=UTF-8">
+<meta content="Apache Forrest" name="Generator">
+<meta name="Forrest-version" content="0.8">
+<meta name="Forrest-skin-name" content="lucene">
+<title>Welcome to Nutch!</title>
+<link type="text/css" href="skin/basic.css" rel="stylesheet">
+<link media="screen" type="text/css" href="skin/screen.css" rel="stylesheet">
+<link media="print" type="text/css" href="skin/print.css" rel="stylesheet">
+<link type="text/css" href="skin/profile.css" rel="stylesheet">
+<script src="skin/getBlank.js" language="javascript" type="text/javascript"></script><script src="skin/getMenu.js" language="javascript" type="text/javascript"></script><script src="skin/fontsize.js" language="javascript" type="text/javascript"></script>
+<link rel="shortcut icon" href="images/favicon.ico">
+</head>
+<body onload="init()">
+<script type="text/javascript">ndeSetTextSize();</script>
+<div id="top">
+<!--+
+    |breadtrail
+    +-->
+<div class="breadtrail">
+<a href="http://www.apache.org/">Apache</a> &gt; <a href="http://lucene.apache.org/">Lucene</a> &gt; <a href="http://lucene.apache.org/nutch/">Nutch</a><script src="skin/breadcrumbs.js" language="JavaScript" type="text/javascript"></script>
+</div>
+<!--+
+    |header
+    +-->
+<div class="header">
+<!--+
+    |start group logo
+    +-->
+<div class="grouplogo">
+<a href="http://lucene.apache.org/"><img class="logoImage" alt="Lucene" src="images/lucene_green_150.gif" title="Apache Lucene"></a>
+</div>
+<!--+
+    |end group logo
+    +-->
+<!--+
+    |start Project Logo
+    +-->
+<div class="projectlogo">
+<a href="http://lucene.apache.org/nutch/"><img class="logoImage" alt="Nutch" src="images/nutch-logo.gif" title="Open Source Web Search Software"></a>
+</div>
+<!--+
+    |end Project Logo
+    +-->
+<!--+
+    |start Search
+    +-->
+<div class="searchbox">
+<form action="http://search.lucidimagination.com/p:nutch" method="get" class="roundtopsmall">
+<input onFocus="getBlank (this, 'Search the site with Solr');" size="25" name="q" id="query" type="text" value="Search the site with Solr">&nbsp; 
+                    <input name="Search" value="Search" type="submit">
+</form>
+<div style="position: relative; top: -5px; left: -10px">Powered by <a href="http://www.lucidimagination.com" style="color: #033268">Lucid Imagination</a>
+</div>
+</div>
+<!--+
+    |end search
+    +-->
+<!--+
+    |start Tabs
+    +-->
+<ul id="tabs">
+<li class="current">
+<a class="selected" href="index.html">Main</a>
+</li>
+<li>
+<a class="unselected" href="http://wiki.apache.org/nutch/">Wiki</a>
+</li>
+<li>
+<a class="unselected" href="http://issues.apache.org/jira/browse/Nutch">Jira</a>
+</li>
+</ul>
+<!--+
+    |end Tabs
+    +-->
+</div>
+</div>
+<div id="main">
+<div id="publishedStrip">
+<!--+
+    |start Subtabs
+    +-->
+<div id="level2tabs"></div>
+<!--+
+    |end Endtabs
+    +-->
+<script type="text/javascript"><!--
+document.write("Last Published: " + document.lastModified);
+//  --></script>
+</div>
+<!--+
+    |breadtrail
+    +-->
+<div class="breadtrail">
+
+             &nbsp;
+           </div>
+<!--+
+    |start Menu, mainarea
+    +-->
+<!--+
+    |start Menu
+    +-->
+<div id="menu">
+<div onclick="SwitchMenu('menu_selected_1.1', 'skin/')" id="menu_selected_1.1Title" class="menutitle" style="background-image: url('skin/images/chapter_open.gif');">Project</div>
+<div id="menu_selected_1.1" class="selectedmenuitemgroup" style="display: block;">
+<div class="menupage">
+<div class="menupagetitle">News</div>
+</div>
+<div class="menuitem">
+<a href="about.html">About</a>
+</div>
+<div class="menuitem">
+<a href="credits.html">Credits</a>
+</div>
+<div class="menuitem">
+<a href="http://www.cafepress.com/nutch/">Buy Stuff</a>
+</div>
+</div>
+<div onclick="SwitchMenu('menu_1.2', 'skin/')" id="menu_1.2Title" class="menutitle">Documentation</div>
+<div id="menu_1.2" class="menuitemgroup">
+<div class="menuitem">
+<a href="http://wiki.apache.org/nutch/FAQ">FAQ</a>
+</div>
+<div class="menuitem">
+<a href="http://wiki.apache.org/nutch/">Wiki</a>
+</div>
+<div class="menuitem">
+<a href="tutorial.html">Tutorial (0.7.2)</a>
+</div>
+<div class="menuitem">
+<a href="tutorial8.html">Tutorial (0.8.x)</a>
+</div>
+<div class="menuitem">
+<a href="bot.html">Robot     </a>
+</div>
+<div class="menuitem">
+<a href="i18n.html">i18n</a>
+</div>
+<div class="menuitem">
+<a href="apidocs-1.0/index.html">API Docs (1.0)</a>
+</div>
+<div class="menuitem">
+<a href="apidocs-0.9/index.html">API Docs (0.9)</a>
+</div>
+<div class="menuitem">
+<a href="apidocs-0.8.x/index.html">API Docs (0.8.x)</a>
+</div>
+<div class="menuitem">
+<a href="apidocs/index.html">API Docs (0.7.2)</a>
+</div>
+<div class="menuitem">
+<a href="http://lucene.zones.apache.org:8080/hudson/job/Nutch-Nightly/ws/trunk/build/docs/api/index.html">API Docs (nightly)</a>
+</div>
+</div>
+<div onclick="SwitchMenu('menu_1.3', 'skin/')" id="menu_1.3Title" class="menutitle">Resources</div>
+<div id="menu_1.3" class="menuitemgroup">
+<div class="menuitem">
+<a href="release/">Download</a>
+</div>
+<div class="menuitem">
+<a href="nightly.html">Nightly builds</a>
+</div>
+<div class="menuitem">
+<a href="mailing_lists.html">Mailing Lists</a>
+</div>
+<div class="menuitem">
+<a href="issue_tracking.html">Issue Tracking</a>
+</div>
+<div class="menuitem">
+<a href="version_control.html">Version Control</a>
+</div>
+</div>
+<div onclick="SwitchMenu('menu_1.4', 'skin/')" id="menu_1.4Title" class="menutitle">Related Projects</div>
+<div id="menu_1.4" class="menuitemgroup">
+<div class="menuitem">
+<a href="http://lucene.apache.org/java/">Lucene Java</a>
+</div>
+<div class="menuitem">
+<a href="http://lucene.apache.org/hadoop/">Hadoop</a>
+</div>
+<div class="menuitem">
+<a href="http://incubator.apache.org/solr/">Solr</a>
+</div>
+</div>
+<div id="credit">
+<hr>
+<a href="http://forrest.apache.org/"><img border="0" title="Built with Apache Forrest" alt="Built with Apache Forrest - logo" src="images/built-with-forrest-button.png" style="width: 88px;height: 31px;"></a>
+</div>
+<div id="roundbottom">
+<img style="display: none" class="corner" height="15" width="15" alt="" src="skin/images/rc-b-l-15-1body-2menu-3menu.png"></div>
+<!--+
+  |alternative credits
+  +-->
+<div id="credit2"></div>
+</div>
+<!--+
+    |end Menu
+    +-->
+<!--+
+    |start content
+    +-->
+<div id="content">
+<div title="Portable Document Format" class="pdflink">
+<a class="dida" href="index.pdf"><img alt="PDF -icon" src="skin/images/pdfdoc.gif" class="skin"><br>
+        PDF</a>
+</div>
+<h1>Welcome to Nutch!</h1>
+<div id="minitoc-area">
+<ul class="minitoc">
+<li>
+<a href="#News">News</a>
+<ul class="minitoc">
+<li>
+<a href="#14+August+2009+-+Lucene+at+US+ApacheCon">14 August 2009 - Lucene at US ApacheCon</a>
+</li>
+<li>
+<a href="#23+March+2009+-+Apache+Nutch+1.0+Released">23 March 2009 - Apache Nutch 1.0 Released</a>
+</li>
+<li>
+<a href="#09+February+2009+-+Lucene+at+ApacheCon+Europe+2009+in%0A%09%09%09Amsterdam">09 February 2009 - Lucene at ApacheCon Europe 2009 in
+			Amsterdam</a>
+</li>
+<li>
+<a href="#2+April+2007%3A+Nutch+0.9+Released">2 April 2007: Nutch 0.9 Released</a>
+</li>
+<li>
+<a href="#24+September+2006%3A+Nutch+0.8.1+Released">24 September 2006: Nutch 0.8.1 Released</a>
+</li>
+<li>
+<a href="#25+July+2006%3A+Nutch+0.8+Released">25 July 2006: Nutch 0.8 Released</a>
+</li>
+<li>
+<a href="#31+March+2006%3A+Nutch+0.7.2+Released">31 March 2006: Nutch 0.7.2 Released</a>
+</li>
+<li>
+<a href="#1+October+2005%3A+Nutch+0.7.1+Released">1 October 2005: Nutch 0.7.1 Released</a>
+</li>
+<li>
+<a href="#17+August+2005%3A+Nutch+0.7+Released">17 August 2005: Nutch 0.7 Released</a>
+</li>
+<li>
+<a href="#June+2005%3A+Nutch+graduates+from+Incubator">June 2005: Nutch graduates from Incubator</a>
+</li>
+<li>
+<a href="#January+2005%3A+Nutch+Joins+Apache+Incubator">January 2005: Nutch Joins Apache Incubator</a>
+</li>
+<li>
+<a href="#September+2004%3A+Creative+Commons+launches+Nutch-based+Search">September 2004: Creative Commons launches Nutch-based Search</a>
+</li>
+<li>
+<a href="#September+2004%3A+Oregon+State+University+switches+to+Nutch">September 2004: Oregon State University switches to Nutch</a>
+</li>
+</ul>
+</li>
+</ul>
+</div> 
+
+    
+<a name="N1000D"></a><a name="News"></a>
+<h2 class="h3">News</h2>
+<div class="section">
+<a name="N10013"></a><a name="14+August+2009+-+Lucene+at+US+ApacheCon"></a>
+<h3 class="h4">14 August 2009 - Lucene at US ApacheCon</h3>
+<p>
+        
+<a href="http://www.us.apachecon.com/c/acus2009/" title="ApacheCon US 2009">
+            <img alt="ApacheCon Logo" class="float-right" src="http://www.apache.org/events/current-event-125x125.png">
+        </a>
+        ApacheCon US is once again in the Bay Area and Lucene is coming
+        along for the ride! The Lucene community has planned two full
+        days of talks, plus a meetup and the usual bevy of training.
+        With a well-balanced mix of first time and veteran ApacheCon
+        speakers, the
+        <a href="http://www.us.apachecon.com/c/acus2009/schedule#lucene">Lucene track</a>
+        at ApacheCon US promises to have something for everyone. Be sure
+        not to miss:
+    </p>
+<p> Training:</p>
+<ul>
+        
+<li>
+            
+<a href="http://www.us.apachecon.com/c/acus2009/sessions/437">Lucene Boot Camp</a>
+            - A two day training session, Nov. 2nd &amp; 3rd
+        </li>
+        
+<li>
+            
+<a href="http://www.us.apachecon.com/c/acus2009/sessions/375">Solr Day</a>
+            - A one day training session, Nov. 2nd
+        </li>
+    
+</ul>
+<p>Thursday, Nov. 5th</p>
+<ul>
+        
+<li>
+            
+<a href="http://www.us.apachecon.com/c/acus2009/sessions/428">Introduction to the Lucene Ecosystem
+            </a>
+            - Grant Ingersoll @ 9:00
+        </li>
+        
+<li>
+            
+<a href="http://www.us.apachecon.com/c/acus2009/sessions/461">Lucene Basics and New Features</a>
+            - Michael Busch @ 10:00
+        </li>
+        
+<li>
+            
+<a href="http://www.us.apachecon.com/c/acus2009/sessions/331">Apache Solr: Out of the Box</a>
+            - Chris Hostetter @ 14:00
+        </li>
+        
+<li>
+            
+<a href="http://www.us.apachecon.com/c/acus2009/sessions/427">Introduction to Nutch</a>
+            - Andrzej Bialecki @ 15:00
+        </li>
+        
+<li>
+            
+<a href="http://www.us.apachecon.com/c/acus2009/sessions/430">Lucene and Solr Performance Tuning</a>
+            - Mark Miller @ 16:30
+        </li>
+    
+</ul>
+<p>Friday, Nov. 6th</p>
+<ul>
+        
+<li>
+            
+<a href="http://www.us.apachecon.com/c/acus2009/sessions/332">Implementing an Information Retrieval
+                Framework for an Organizational Repository</a>
+            - Sithu D Sudarsan @ 9:00
+        </li>
+        
+<li>
+            
+<a href="http://www.us.apachecon.com/c/acus2009/sessions/333">Apache Mahout - Going from raw data to
+                Information</a>
+            - Isabel Drost @ 10:00
+        </li>
+        
+<li>
+            
+<a href="http://www.us.apachecon.com/c/acus2009/sessions/334">MIME Magic with Apache Tika</a>
+            - Jukka Zitting @ 11:30
+        </li>
+        
+<li>
+            
+<a href="http://www.us.apachecon.com/c/acus2009/sessions/335">Building Intelligent Search Applications
+                with the Lucene Ecosystem</a>
+            - Ted Dunning @ 14:00
+        </li>
+        
+<li>
+            
+<a href="http://www.us.apachecon.com/c/acus2009/sessions/462">Realtime Search</a>
+            - Jason Rutherglen @ 15:00
+        </li>
+    
+</ul>
+<a name="N10091"></a><a name="23+March+2009+-+Apache+Nutch+1.0+Released"></a>
+<h3 class="h4">23 March 2009 - Apache Nutch 1.0 Released</h3>
+<p>The 1.0 release of Nutch is now available. This release includes several major feature improvements
+      such as new indexing framework, new scoring framework, Apache Solr integration just to mention a few.
+      See <a href="http://www.apache.org/dist/lucene/nutch/CHANGES-1.0.txt">
+      list of changes</a>  made in this version. The release is available
+      <a href="http://lucene.apache.org/nutch/release/">here</a>.</p>
+<a name="N100A3"></a><a name="09+February+2009+-+Lucene+at+ApacheCon+Europe+2009+in%0A%09%09%09Amsterdam"></a>
+<h3 class="h4">09 February 2009 - Lucene at ApacheCon Europe 2009 in
+			Amsterdam</h3>
+<p>
+			
+<a href="http://www.eu.apachecon.com/c/aceu2009/" title="ApacheCon EU 2009">
+				<img alt="ApacheCon EU 2009 Logo" class="float-right" src="http://www.eu.apachecon.com/page_attachments/0000/0115/125x125_basic.gif">
+			</a>
+
+			Lucene will be extremely well represented at
+			<a href="http://www.eu.apachecon.com/c/aceu2009/">ApacheCon EU 2009</a>
+			in Amsterdam, Netherlands this March 23-27, 2009:
+		</p>
+<ul>
+			
+<li>
+				
+<a href="http://eu.apachecon.com/c/aceu2009/sessions/197">Lucene Boot Camp</a>
+				- A two day training session, March 23 &amp; 24th</li>
+                
+<li>
+<a href="http://eu.apachecon.com/c/aceu2009/sessions/201">Solr Boot Camp</a> - A one day training session, March 24th</li>
+                
+<li>
+<a href="http://eu.apachecon.com/c/aceu2009/sessions/136">Introducing Apache Mahout</a> - Grant Ingersoll. March 25th @ 10:30</li>
+                
+<li>
+<a href="http://eu.apachecon.com/c/aceu2009/sessions/137">Lucene/Solr Case Studies</a> - Erik Hatcher. March 25th @ 11:30</li>
+                
+<li>
+<a href="http://eu.apachecon.com/c/aceu2009/sessions/138">Advanced Indexing Techniques with Apache Lucene</a> - Michael Busch. March 25th @ 14:00</li>  
+                   
+<li>
+<a href="http://eu.apachecon.com/c/aceu2009/sessions/251">Apache Solr - A Case Study</a> - Uri Boness. March 26th @ 17:30</li>
+           
+<li>
+<a href="http://eu.apachecon.com/c/aceu2009/sessions/250">Best of breed - httpd, forrest, solr and droids</a> - Thorsten Scherler. March 27th @ 17:30</li>
+           
+<li>
+<a href="http://eu.apachecon.com/c/aceu2009/sessions/165">Apache Droids - an intelligent standalone robot framework</a> - Thorsten Scherler. March 26th @ 15:00</li>
+
+               
+</ul>
+<a name="N100EF"></a><a name="2+April+2007%3A+Nutch+0.9+Released"></a>
+<h3 class="h4">2 April 2007: Nutch 0.9 Released</h3>
+<p>The 0.9 release of Nutch is now available. This is the second release of Nutch
+      based entirely on the underlying Hadoop platform. This release includes several critical
+      bug fixes, as well as key speedups described in more detail at 
+      <a href="http://blog.foofactory.fi/2007/03/twice-speed-half-size.html">Sami Siren's blog</a>.
+      See <a href="http://www.apache.org/dist/lucene/nutch/CHANGES-0.9.txt">
+      list of changes</a>  made in this version. The release is available
+      <a href="http://lucene.apache.org/nutch/release/">here</a>.</p>
+<a name="N10105"></a><a name="24+September+2006%3A+Nutch+0.8.1+Released"></a>
+<h3 class="h4">24 September 2006: Nutch 0.8.1 Released</h3>
+<p>The 0.8.1 release of Nutch is now available. This is a maintenance release for the 0.8 branch fixing many serious bugs found in version 0.8.
+      See <a href="http://www.apache.org/dist/lucene/nutch/CHANGES-0.8.1.txt">
+      list of changes</a>  made in this version. The release is available
+      <a href="http://lucene.apache.org/nutch/release/">here</a>.</p>
+<a name="N10117"></a><a name="25+July+2006%3A+Nutch+0.8+Released"></a>
+<h3 class="h4">25 July 2006: Nutch 0.8 Released</h3>
+<p>The 0.8 release of Nutch is now available. This is the first release of Nutch based on
+      the Hadoop architecture. See <a href="http://svn.apache.org/viewvc/lucene/nutch/tags/release-0.8/CHANGES.txt?view=markup">
+      CHANGES.txt</a> for list of changes made in this version. The release is available
+      <a href="http://lucene.apache.org/nutch/release/">here</a>.</p>
+<a name="N10129"></a><a name="31+March+2006%3A+Nutch+0.7.2+Released"></a>
+<h3 class="h4">31 March 2006: Nutch 0.7.2 Released</h3>
+<p>The 0.7.2 release of Nutch is now available. This is a bug fix release for 0.7 branch. See
+      <a href="http://svn.apache.org/viewcvs.cgi/lucene/nutch/branches/branch-0.7/CHANGES.txt?rev=390158">
+      CHANGES.txt</a> for details. The release is available
+      <a href="http://lucene.apache.org/nutch/release/">here</a>.</p>
+<a name="N1013B"></a><a name="1+October+2005%3A+Nutch+0.7.1+Released"></a>
+<h3 class="h4">1 October 2005: Nutch 0.7.1 Released</h3>
+<p>The 0.7.1 release of Nutch is now available. This is a bug fix release. See
+      <a href="http://svn.apache.org/viewcvs.cgi/lucene/nutch/branches/branch-0.7/CHANGES.txt?rev=292986">
+      CHANGES.txt</a> for details. The release is available
+      <a href="http://lucene.apache.org/nutch/release/">here</a>.</p>
+<a name="N1014D"></a><a name="17+August+2005%3A+Nutch+0.7+Released"></a>
+<h3 class="h4">17 August 2005: Nutch 0.7 Released</h3>
+<p>This is the first Nutch release as an Apache Lucene sub-project. See 
+      <a href="http://svn.apache.org/viewcvs.cgi/lucene/nutch/trunk/CHANGES.txt?rev=233150">
+      CHANGES.txt</a> for details. The release is available 
+      <a href="http://lucene.apache.org/nutch/release/">here</a>.</p>
+<a name="N1015F"></a><a name="June+2005%3A+Nutch+graduates+from+Incubator"></a>
+<h3 class="h4">June 2005: Nutch graduates from Incubator</h3>
+<p>Nutch has now graduated from the Apache incubator, and is now
+      a Subproject of Lucene.</p>
+<a name="N10169"></a><a name="January+2005%3A+Nutch+Joins+Apache+Incubator"></a>
+<h3 class="h4">January 2005: Nutch Joins Apache Incubator</h3>
+<p>Nutch is a two-year-old open source project, previously
+        hosted at Sourceforge and backed by its own non-profit
+        organization. The non-profit was founded in order to assign
+        copyright, so that we could retain the right to change the
+        license. We have now determined that the Apache license is the
+        appropriate license for Nutch and no longer require the
+        overhead of an independent non-profit organization. Nutch's
+        board of directors and its developers were both polled and
+        supported the move to the Apache foundation.</p>
+<a name="N10173"></a><a name="September+2004%3A+Creative+Commons+launches+Nutch-based+Search"></a>
+<h3 class="h4">September 2004: Creative Commons launches Nutch-based Search</h3>
+<p>Creative Commons unveiled a beta version of its search
+      engine, which scours the web for text, images, audio, and video
+      free to re-use on certain terms &mdash; a search refinement offered by
+      no other company or organization.</p>
+<p>See the <a href="http://creativecommons.org/press-releases/entry/5064">Creative
+      Commons Press Release</a> for more details.</p>
+<a name="N10184"></a><a name="September+2004%3A+Oregon+State+University+switches+to+Nutch"></a>
+<h3 class="h4">September 2004: Oregon State University switches to Nutch</h3>
+<p>Oregon State University is converting its searching
+      infrastructure from Google&trade; to the open source project
+      Nutch. The effort to replace Google&trade; will realize
+      significant cost savings for Oregon State University, while
+      promoting both the Nutch Search Engine and transparency in
+      search engine use and management.</p>
+<p>For more details see the announcement by OSU's <a href="http://osuosl.org/news_folder/nutch">Open Source
+      Lab</a>.</p>
+</div>
+
+  
+</div>
+<!--+
+    |end content
+    +-->
+<div class="clearboth">&nbsp;</div>
+</div>
+<div id="footer">
+<!--+
+    |start bottomstrip
+    +-->
+<div class="lastmodified">
+<script type="text/javascript"><!--
+document.write("Last Published: " + document.lastModified);
+//  --></script>
+</div>
+<div class="copyright">
+        Copyright &copy;
+         2006 <a href="http://www.apache.org/licenses/">The Apache Software Foundation.</a>
+</div>
+<div id="logos"></div>
+<!--+
+    |end bottomstrip
+    +-->
+</div>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/src/test/resources/nutch_logo_tm.gif
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/test/resources/nutch_logo_tm.gif b/nutch-plugins/parse-tika/src/test/resources/nutch_logo_tm.gif
new file mode 100644
index 0000000..0545a60
Binary files /dev/null and b/nutch-plugins/parse-tika/src/test/resources/nutch_logo_tm.gif differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/src/test/resources/ootest.odt
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/test/resources/ootest.odt b/nutch-plugins/parse-tika/src/test/resources/ootest.odt
new file mode 100644
index 0000000..e36e389
Binary files /dev/null and b/nutch-plugins/parse-tika/src/test/resources/ootest.odt differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/src/test/resources/ootest.sxw
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/test/resources/ootest.sxw b/nutch-plugins/parse-tika/src/test/resources/ootest.sxw
new file mode 100644
index 0000000..260b1c2
Binary files /dev/null and b/nutch-plugins/parse-tika/src/test/resources/ootest.sxw differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/src/test/resources/ootest.txt
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/test/resources/ootest.txt b/nutch-plugins/parse-tika/src/test/resources/ootest.txt
new file mode 100644
index 0000000..685f89a
--- /dev/null
+++ b/nutch-plugins/parse-tika/src/test/resources/ootest.txt
@@ -0,0 +1,30 @@
+\ufeffAbcedfg				?????
+Abcdefg
+Abcdefg
+abcdefg
+
+
+
+
+
+
+
+
+
+
+ http://www.openoffice.org
+
+Title
+Col1
+Col2
+Col3
+head
+Cell1
+Cell2
+Cel3
+total
+TOTAL
+TOTAL
+TOTAL
+
+Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Integer a leo in lacus malesuada ornare. Mauris sagittis. Nam vestibulum. Nunc gravida vestibulum augue. Praesent sed lectus quis lectus adipiscing bibendum. Sed nulla. Duis posuere justo eget urna. Proin lorem orci, vestibulum ut, consequat molestie, eleifend a, nibh. Mauris sed lacus. Etiam blandit tincidunt neque. Cras ac sapien. Duis erat. 

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/src/test/resources/pdftest.pdf
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/test/resources/pdftest.pdf b/nutch-plugins/parse-tika/src/test/resources/pdftest.pdf
new file mode 100644
index 0000000..e7c6e62
--- /dev/null
+++ b/nutch-plugins/parse-tika/src/test/resources/pdftest.pdf
@@ -0,0 +1,157 @@
+%PDF-1.2 
+%\ufffd\ufffd\ufffd\ufffd
+ 
+9 0 obj
+<<
+/Length 10 0 R
+/Filter /FlateDecode 
+>>
+stream
+H\ufffd\u0350\ufffdJ\ufffd0\ufffd\ufffd \ufffd\ufffd{\ufffd\ufffdf\ufffd$M\ufffd\ufffdn\ufffd-\ufffd\ufffd\ufffd[&je\ufffd\ufffd\ufffd\u06e4\ufffd~\ufffd$\ufffd\ufffd\ufffd}\ufffd\u0245\ufffdIj\ufffd\ufffd\ufffds\ufffd\ufffd\ufffd\ufffd~\ufffdX\ufffd-],\ufffd\ufffd$Y\ufffd\ufffd\ufffd)\ufffd'N\ufffdu\ufffd1!\ufffd\ufffd\ufffdV\ufffd?\ufffd\ufffd?
+\ufffdb1Rbb\ufffd\u0489\ufffdH\ufffd[\ufffd\ufffdTD:#\ufffd&\u062d\ufffd\ufffdX\ufffd\ufffd\ufffdi\ufffd$qnf\ufffd\ufffd\ufffd\ufffd\ufffd]\ufffd\ufffd\ufffd\ufffd\ufffd\ufffda\ufffd\ufffd{\ufffd\ufffd\u0623\ufffd\ufffd\ufffdq|J\ufffdLs]\ufffdQ\ufffdI\ufffd\ufffdj\ufffd%\ufffd\ufffd9\ufffd\ufffd`\ufffd\u09ba\ufffd\ufffdU\ufffdite\ufffdz\ufffd$\ufffd\ufffd\ufffd\ufffdOeB\ufffd\u0112\u04af\ufffdR\ufffd\ufffd@z\u0717\ufffd\ufffd\ufffdg\ufffd\ufffd\ufffd<\ufffd\ufffd\ufffd
+endstream
+endobj
+10 0 obj
+246
+endobj
+4 0 obj
+<<
+/Type /Page
+/Parent 5 0 R
+/Resources <<
+/Font <<
+/F0 6 0 R 
+/F1 7 0 R 
+>>
+/ProcSet 2 0 R
+>>
+/Contents 9 0 R
+>>
+endobj
+6 0 obj
+<<
+/Type /Font
+/Subtype /TrueType
+/Name /F0
+/BaseFont /Arial
+/Encoding /WinAnsiEncoding
+>>
+endobj
+7 0 obj
+<<
+/Type /Font
+/Subtype /TrueType
+/Name /F1
+/BaseFont /BookAntiqua,Bold
+/FirstChar 31
+/LastChar 255
+/Widths [ 750 250 278 402 606 500 889 833 227 333 333 444 606 250 333 250 
+296 500 500 500 500 500 500 500 500 500 500 250 250 606 606 606 
+444 747 778 667 722 833 611 556 833 833 389 389 778 611 1000 833 
+833 611 833 722 611 667 778 778 1000 667 667 667 333 606 333 606 
+500 333 500 611 444 611 500 389 556 611 333 333 611 333 889 611 
+556 611 611 389 444 333 611 556 833 500 556 500 310 606 310 606 
+750 500 750 333 500 500 1000 500 500 333 1000 611 389 1000 750 750 
+750 750 278 278 500 500 606 500 1000 333 998 444 389 833 750 750 
+667 250 278 500 500 606 500 606 500 333 747 438 500 606 333 747 
+500 400 549 361 361 333 576 641 250 333 361 488 500 889 890 889 
+444 778 778 778 778 778 778 1000 722 611 611 611 611 389 389 389 
+389 833 833 833 833 833 833 833 606 833 778 778 778 778 667 611 
+611 500 500 500 500 500 500 778 444 500 500 500 500 333 333 333 
+333 556 611 556 556 556 556 556 549 556 611 611 611 611 556 611 
+556 ]
+/Encoding /WinAnsiEncoding
+/FontDescriptor 8 0 R
+>>
+endobj
+8 0 obj
+<<
+/Type /FontDescriptor
+/FontName /BookAntiqua,Bold
+/Flags 16418
+/FontBBox [ -250 -260 1236 930 ]
+/MissingWidth 750
+/StemV 146
+/StemH 146
+/ItalicAngle 0
+/CapHeight 930
+/XHeight 651
+/Ascent 930
+/Descent 260
+/Leading 210
+/MaxWidth 1030
+/AvgWidth 460
+>>
+endobj
+2 0 obj
+[ /PDF /Text  ]
+endobj
+5 0 obj
+<<
+/Kids [4 0 R ]
+/Count 1
+/Type /Pages
+/MediaBox [ 0 0 612 792 ]
+>>
+endobj
+1 0 obj
+<<
+/Creator (1725.fm)
+/CreationDate (1-Jan-3 18:15PM)
+/Title (1725.PDF)
+/Author (Unknown)
+/Producer (Acrobat PDFWriter 3.02 for Windows)
+/Keywords ()
+/Subject ()
+>>
+endobj
+3 0 obj
+<<
+/Pages 5 0 R
+/Type /Catalog
+/DefaultGray 11 0 R
+/DefaultRGB  12 0 R
+>>
+endobj
+11 0 obj
+[/CalGray
+<<
+/WhitePoint [0.9505 1 1.0891 ]
+/Gamma 0.2468 
+>>
+]
+endobj
+12 0 obj
+[/CalRGB
+<<
+/WhitePoint [0.9505 1 1.0891 ]
+/Gamma [0.2468 0.2468 0.2468 ]
+/Matrix [0.4361 0.2225 0.0139 0.3851 0.7169 0.0971 0.1431 0.0606 0.7141 ]
+>>
+]
+endobj
+xref
+0 13
+0000000000 65535 f
+0000002172 00000 n
+0000002046 00000 n
+0000002363 00000 n
+0000000375 00000 n
+0000002080 00000 n
+0000000518 00000 n
+0000000633 00000 n
+0000001760 00000 n
+0000000021 00000 n
+0000000352 00000 n
+0000002460 00000 n
+0000002548 00000 n
+trailer
+<<
+/Size 13
+/Root 3 0 R
+/Info 1 0 R
+/ID [<47149510433dd4882f05f8c124223734><47149510433dd4882f05f8c124223734>]
+>>
+startxref
+2726
+%%EOF

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/src/test/resources/rsstest.rss
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/test/resources/rsstest.rss b/nutch-plugins/parse-tika/src/test/resources/rsstest.rss
new file mode 100644
index 0000000..6c4ae48
--- /dev/null
+++ b/nutch-plugins/parse-tika/src/test/resources/rsstest.rss
@@ -0,0 +1,37 @@
+<?xml version="1.0" encoding="ISO-8859-1" ?>
+<!--
+	Licensed to the Apache Software Foundation (ASF) under one or more
+	contributor license agreements.  See the NOTICE file distributed with
+	this work for additional information regarding copyright ownership.
+	The ASF licenses this file to You under the Apache License, Version 2.0
+	(the "License"); you may not use this file except in compliance with
+	the License.  You may obtain a copy of the License at
+	
+	http://www.apache.org/licenses/LICENSE-2.0
+	
+	Unless required by applicable law or agreed to in writing, software
+	distributed under the License is distributed on an "AS IS" BASIS,
+	WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	See the License for the specific language governing permissions and
+	limitations under the License.
+-->
+<rss version="0.91">
+    <channel>
+      <title>TestChannel</title>
+      <link>http://test.channel.com/</link> 
+      <description>Sample RSS File for Junit test</description> 
+      <language>en-us</language>
+      
+      <item>
+        <title>Home Page of Chris Mattmann</title>
+        <link>http://www-scf.usc.edu/~mattmann/</link>
+        <description>Chris Mattmann's home page</description>
+      </item>
+
+      <item>
+        <title>Awesome Open Source Search Engine</title> 
+        <link>http://www.nutch.org/</link> 
+        <description>Yup, that's what it is</description> 
+      </item>
+   </channel>
+</rss>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/src/test/resources/test.rtf
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/test/resources/test.rtf b/nutch-plugins/parse-tika/src/test/resources/test.rtf
new file mode 100644
index 0000000..c67a6c8
--- /dev/null
+++ b/nutch-plugins/parse-tika/src/test/resources/test.rtf
@@ -0,0 +1,17 @@
+{\rtf1\ansi\deff1\adeflang1025
+{\fonttbl{\f0\froman\fprq2\fcharset0 Times;}{\f1\froman\fprq2\fcharset0 Times New Roman;}{\f2\fmodern\fprq1\fcharset0 Courier New;}{\f3\froman\fprq2\fcharset0 Times New Roman;}{\f4\fnil\fprq2\fcharset0 Interface User;}{\f5\fnil\fprq2\fcharset0 Lucidasans;}{\f6\fnil\fprq0\fcharset0 Lucidasans;}}
+{\colortbl;\red0\green0\blue0;\red0\green0\blue128;\red128\green128\blue128;}
+{\stylesheet{\s1\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\af5\afs24\lang255\ltrch\dbch\af4\afs24\langfe255\loch\f1\fs24\lang1033\snext1 Default;}
+{\s2\sa120\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\afs24\lang255\ltrch\dbch\afs24\langfe255\loch\f1\fs24\lang1033\sbasedon1\snext2 Text body;}
+{\s3\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\af6\afs24\lang255\ltrch\dbch\af1\afs24\langfe255\loch\f1\fs24\lang1033\sbasedon2\snext3 List;}
+{\s4\sb120\sa120\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\af6\afs20\lang255\ai\ltrch\dbch\afs20\langfe255\ai\loch\f1\fs20\lang1033\i\sbasedon1\snext4 Caption;}
+{\s5\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\rtlch\af6\afs24\lang255\ltrch\dbch\afs24\langfe255\loch\f1\fs24\lang1033\sbasedon1\snext5 Index;}
+{\*\cs7\cf0\rtlch\af2\afs24\lang255\ltrch\dbch\af2\afs24\langfe255\loch\f2\fs24\lang1033 Teletype;}
+{\*\cs8\cf2\ul\rtlch\afs24\lang255\ltrch\dbch\afs24\langfe255\loch\fs24\lang1033 Internet Link;}
+}
+{\info{\title test rft document}{\subject tests}{\creatim\yr2004\mo9\dy20\hr19\min36}{\revtim\yr1601\mo1\dy1\hr0\min0}{\printim\yr1601\mo1\dy1\hr0\min0}{\comment StarWriter}{\vern6450}}\deftab709
+{\*\pgdsctbl
+{\pgdsc0\pgdscuse195\pgwsxn11905\pghsxn16837\marglsxn1800\margrsxn1800\margtsxn1440\margbsxn1440\pgdscnxt0 Default;}}
+{\*\pgdscno0}\paperh16837\paperw11905\margl1800\margr1800\margt1440\margb1440\sectd\sbknone\pgwsxn11905\pghsxn16837\marglsxn1800\margrsxn1800\margtsxn1440\margbsxn1440\ftnbj\ftnstart1\ftnrstcont\ftnnar\aenddoc\aftnrstcont\aftnstart1\aftnnrlc
+\pard\plain \ltrpar\s1\cf0{\*\hyphen2\hyphlead2\hyphtrail2\hyphmax0}\ql\rtlch\af5\afs24\lang255\ltrch\dbch\af4\afs24\langfe255\loch\f1\fs24\lang1033{\loch\f2\fs24\lang1033\i0\b0\*\cs7\cf0\rtlch\ltrch\dbch\loch\f2\fs24\lang1033 The quick brown fox jumps over the lazy dog}
+\par }
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-tika/src/test/resources/word97.doc
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-tika/src/test/resources/word97.doc b/nutch-plugins/parse-tika/src/test/resources/word97.doc
new file mode 100644
index 0000000..4d012da
Binary files /dev/null and b/nutch-plugins/parse-tika/src/test/resources/word97.doc differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-zip/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-zip/build.xml b/nutch-plugins/parse-zip/build.xml
new file mode 100644
index 0000000..991ce31
--- /dev/null
+++ b/nutch-plugins/parse-zip/build.xml
@@ -0,0 +1,38 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="parse-zip" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+  <!-- Deploy the plugins the unit test depends on. -->
+  <target name="deps-test">
+    <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
+    <ant target="deploy" inheritall="false" dir="../protocol-file"/>
+   <!-- <ant target="deploy" inheritall="false" dir="../parse-text"/>-->
+  </target>
+
+  <!-- Copy the zip fixtures for the JUnit test. In this layout they live in
+       src/test/resources; the former sample/ directory is not part of it. -->
+  <mkdir dir="${build.test}/data" />
+  <copy todir="${build.test}/data">
+    <fileset dir="src/test/resources">
+      <include name="*.zip" />
+    </fileset>
+  </copy>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-zip/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-zip/ivy.xml b/nutch-plugins/parse-zip/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/parse-zip/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+  <!-- NOTE(review): verify that this relative path still resolves from nutch-plugins/parse-zip/. -->
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+  <!-- Empty: this module declares no dependencies of its own. -->
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-zip/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-zip/plugin.xml b/nutch-plugins/parse-zip/plugin.xml
new file mode 100644
index 0000000..35ec0eb
--- /dev/null
+++ b/nutch-plugins/parse-zip/plugin.xml
@@ -0,0 +1,46 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="parse-zip"
+   name="Zip Parse Plug-in"
+   version="1.0.0"
+   provider-name="nutch.org">
+   <!-- The plugin's own jar, exported with the wildcard pattern "*". -->
+   <runtime>
+      <library name="parse-zip.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+   <!-- The only declared prerequisite is the nutch-extensionpoints plugin. -->
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+   <!-- Registers ZipParser at the Parser extension point (contentType application/zip, pathSuffix zip). -->
+   <extension id="org.apache.nutch.parse.zip"
+              name="ZipParser" 
+              point="org.apache.nutch.parse.Parser">
+
+      <implementation id="org.apache.nutch.parse.zip.ZipParser" 
+                      class="org.apache.nutch.parse.zip.ZipParser">
+        <parameter name="contentType" value="application/zip"/>
+        <parameter name="pathSuffix"  value="zip"/>
+      </implementation>
+      
+   </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-zip/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-zip/pom.xml b/nutch-plugins/parse-zip/pom.xml
new file mode 100644
index 0000000..b30b9a1
--- /dev/null
+++ b/nutch-plugins/parse-zip/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+    <!-- groupId and version are not set in this file and so come from the parent POM. -->
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>parse-zip</artifactId>
+    <packaging>jar</packaging>
+
+    <name>parse-zip</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+    <!-- NOTE(review): no dependencies are declared here; presumably the parent POM supplies them. Confirm. -->
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-zip/src/main/java/org/apache/nutch/parse/zip/ZipParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-zip/src/main/java/org/apache/nutch/parse/zip/ZipParser.java b/nutch-plugins/parse-zip/src/main/java/org/apache/nutch/parse/zip/ZipParser.java
new file mode 100644
index 0000000..f441fd0
--- /dev/null
+++ b/nutch-plugins/parse-zip/src/main/java/org/apache/nutch/parse/zip/ZipParser.java
@@ -0,0 +1,144 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.zip;
+
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseResult;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.parse.Parser;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.hadoop.conf.Configuration;
+
+/**
+ * ZipParser class based on MSPowerPointParser class by Stephan Strittmatter.
+ * Nutch parse plugin for zip files - Content Type : application/zip
+ */
+public class ZipParser implements Parser {
+
+  private static final Logger LOG = LoggerFactory.getLogger(ZipParser.class);
+  private Configuration conf;
+
+  /** Creates a new instance of ZipParser */
+  public ZipParser() {
+  }
+
+  /**
+   * Parses a zip archive by delegating to {@link ZipTextExtractor}.
+   * 
+   * @param content fetched archive; a Content-Length in its metadata, when
+   *          present, must equal the number of content bytes
+   * @return extracted text and outlinks with an empty title, or an empty
+   *         failed result if the content is truncated or extraction throws
+   */
+  public ParseResult getParse(final Content content) {
+    String resultText = null;
+    List<Outlink> outLinksList = new ArrayList<Outlink>();
+
+    try {
+      final String contentLen = content.getMetadata().get(
+          Response.CONTENT_LENGTH);
+      final byte[] contentInBytes = content.getContent();
+      LOG.debug("ziplen: {}", contentLen);
+      // Compare lengths only when a Content-Length was reported: parsing a
+      // missing (null) value would throw and fail an otherwise valid archive.
+      if (contentLen != null
+          && contentInBytes.length != Integer.parseInt(contentLen)) {
+        return new ParseStatus(ParseStatus.FAILED,
+            ParseStatus.FAILED_TRUNCATED, "Content truncated at "
+                + contentInBytes.length
+                + " bytes. Parser can't handle incomplete zip file.")
+            .getEmptyParseResult(content.getUrl(), getConf());
+      }
+      resultText = new ZipTextExtractor(getConf()).extractText(
+          new ByteArrayInputStream(contentInBytes), content.getUrl(),
+          outLinksList);
+    } catch (Exception e) {
+      return new ParseStatus(ParseStatus.FAILED,
+          "Can't be handled as Zip document. " + e).getEmptyParseResult(
+          content.getUrl(), getConf());
+    }
+
+    if (resultText == null) {
+      resultText = "";
+    }
+
+    // No title is extracted from the archive, so an empty one is reported.
+    final Outlink[] outlinks = outLinksList.toArray(new Outlink[0]);
+    final ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "",
+        outlinks, content.getMetadata());
+    LOG.trace("Zip file parsed sucessfully !!");
+    return ParseResult.createParseResult(content.getUrl(), new ParseImpl(
+        resultText, parseData));
+  }
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  /** Parses the zip file named by args[0] and prints its text and data. */
+  public static void main(String[] args) throws IOException {
+    if (args.length < 1) {
+      System.out.println("ZipParser <zip_file>");
+      System.exit(1);
+    }
+    File file = new File(args[0]);
+    String url = "file:"+file.getCanonicalPath();
+    // available()/read() may cover only part of the file: read it fully.
+    byte[] bytes = new byte[(int) file.length()];
+    java.io.DataInputStream in = new java.io.DataInputStream(
+        new FileInputStream(file));
+    try {
+      in.readFully(bytes);
+    } finally {
+      in.close();
+    }
+    Configuration conf = NutchConfiguration.create();
+    ZipParser parser = new ZipParser();
+    parser.setConf(conf);
+    Metadata meta = new Metadata();
+    meta.add(Response.CONTENT_LENGTH, ""+file.length());
+    ParseResult parseResult = parser.getParse(new Content(url, url, bytes,
+        "application/zip", meta, conf));
+    Parse p = parseResult.get(url);
+    System.out.println(parseResult.size());
+    System.out.println("Parse Text:");
+    System.out.println(p.getText());
+    System.out.println("Parse Data:");
+    System.out.println(p.getData());
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-zip/src/main/java/org/apache/nutch/parse/zip/ZipTextExtractor.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-zip/src/main/java/org/apache/nutch/parse/zip/ZipTextExtractor.java b/nutch-plugins/parse-zip/src/main/java/org/apache/nutch/parse/zip/ZipTextExtractor.java
new file mode 100644
index 0000000..b454727
--- /dev/null
+++ b/nutch-plugins/parse-zip/src/main/java/org/apache/nutch/parse/zip/ZipTextExtractor.java
@@ -0,0 +1,120 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.zip;
+
+// JDK imports
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.List;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipInputStream;
+import java.net.URL;
+
+// SLF4J logging imports
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+
+// Nutch imports
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.protocol.Content;
+import org.apache.tika.Tika;
+
+/**
+ * Extracts text and outlinks from the file entries of a zip archive.
+ * 
+ * @author Rohit Kulkarni & Ashish Vaidya
+ */
+public class ZipTextExtractor {
+
+  public static final Logger LOG = LoggerFactory
+      .getLogger(ZipTextExtractor.class);
+
+  private Configuration conf;
+
+  /** Creates a new instance of ZipTextExtractor */
+  public ZipTextExtractor(Configuration conf) {
+    this.conf = conf;
+  }
+
+  /**
+   * Parses every file entry whose name contains a dot.
+   * @param input zip data; it is read to the end but not closed here
+   * @param url URL of the archive, used as prefix of the entry URLs
+   * @param outLinksList receives the outlinks of all parsed entries
+   * @return entry name, a space, entry text and a space per parsed entry
+   * @throws IOException if the archive cannot be read
+   */
+  public String extractText(InputStream input, String url,
+      List<Outlink> outLinksList) throws IOException {
+    StringBuilder resultText = new StringBuilder();
+    ZipInputStream zin = new ZipInputStream(input);
+    Tika tika = new Tika();
+    ParseUtil parseUtil = new ParseUtil(this.conf);
+    ZipEntry entry;
+
+    while ((entry = zin.getNextEntry()) != null) {
+      if (entry.isDirectory()) {
+        continue;
+      }
+      String fname = entry.getName();
+      String newurl = url + "/" + fname;
+      String base = new URL(newurl).toString();
+      if (fname.lastIndexOf('.') == -1) {
+        continue; // no extension to derive a content type from
+      }
+      // entry.getSize() may be -1 (unknown), so read up to the entry's end.
+      byte[] b = readEntry(zin);
+      String contentType = tika.detect(fname);
+      try {
+        Metadata metadata = new Metadata();
+        metadata.set(Response.CONTENT_LENGTH, Integer.toString(b.length));
+        metadata.set(Response.CONTENT_TYPE, contentType);
+        Content content = new Content(newurl, base, b, contentType,
+            metadata, this.conf);
+        Parse parse = parseUtil.parse(content).get(content.getUrl());
+        for (Outlink link : parse.getData().getOutlinks()) {
+          outLinksList.add(new Outlink(link.getToUrl(), link.getAnchor()));
+        }
+        resultText.append(fname + " " + parse.getText() + " ");
+      } catch (ParseException e) {
+        LOG.info("fetch okay, but can't parse {}, reason: {}", fname,
+            e.getMessage());
+      }
+    }
+    return resultText.toString();
+  }
+
+  /** Reads the current zip entry to its end and returns its bytes. */
+  private static byte[] readEntry(InputStream in) throws IOException {
+    java.io.ByteArrayOutputStream out = new java.io.ByteArrayOutputStream();
+    byte[] buffer = new byte[4096];
+    for (int n = in.read(buffer); n != -1; n = in.read(buffer)) {
+      out.write(buffer, 0, n);
+    }
+    return out.toByteArray();
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-zip/src/main/java/org/apache/nutch/parse/zip/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-zip/src/main/java/org/apache/nutch/parse/zip/package-info.java b/nutch-plugins/parse-zip/src/main/java/org/apache/nutch/parse/zip/package-info.java
new file mode 100644
index 0000000..fc81ee1
--- /dev/null
+++ b/nutch-plugins/parse-zip/src/main/java/org/apache/nutch/parse/zip/package-info.java
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Parse ZIP files: embedded files are recursively passed to appropriate parsers.
+ */
+package org.apache.nutch.parse.zip;
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-zip/src/test/java/org/apache/nutch/parse/zip/TestZipParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-zip/src/test/java/org/apache/nutch/parse/zip/TestZipParser.java b/nutch-plugins/parse-zip/src/test/java/org/apache/nutch/parse/zip/TestZipParser.java
new file mode 100644
index 0000000..17e386a
--- /dev/null
+++ b/nutch-plugins/parse-zip/src/test/java/org/apache/nutch/parse/zip/TestZipParser.java
@@ -0,0 +1,71 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.zip;
+
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.parse.ParseException;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Based on Unit tests for MSWordParser by John Xing
+ * 
+ * @author Rohit Kulkarni & Ashish Vaidya
+ */
+public class TestZipParser {
+
+  private String fileSeparator = System.getProperty("file.separator");
+  // Directory the sample files must be copied to; taken from the "test.data"
+  // system property, defaulting to the working directory when it is not set.
+  private String sampleDir = System.getProperty("test.data", ".");
+
+  private String[] sampleFiles = { "test.zip" };
+
+  // Entry name and entry text, each followed by a single space.
+  private String expectedText = "textfile.txt This is text file number 1 ";
+
+  /**
+   * Fetches each sample archive through the protocol chosen for its file: URL,
+   * parses it with the parse-zip extension and compares the extracted text.
+   */
+  @Test
+  public void testIt() throws ProtocolException, ParseException {
+    Configuration conf = NutchConfiguration.create();
+    for (String sampleFile : sampleFiles) {
+      String urlString = "file:" + sampleDir + fileSeparator + sampleFile;
+
+      Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
+      Content content = protocol.getProtocolOutput(new Text(urlString),
+          new CrawlDatum()).getContent();
+      Parse parse = new ParseUtil(conf).parseByExtensionId("parse-zip",
+          content).get(content.getUrl());
+      // assertEquals reports expected and actual text when they differ
+      Assert.assertEquals(expectedText, parse.getText());
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parse-zip/src/test/resources/test.zip
----------------------------------------------------------------------
diff --git a/nutch-plugins/parse-zip/src/test/resources/test.zip b/nutch-plugins/parse-zip/src/test/resources/test.zip
new file mode 100644
index 0000000..0c649d2
Binary files /dev/null and b/nutch-plugins/parse-zip/src/test/resources/test.zip differ

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parsefilter-naivebayes/build-ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parsefilter-naivebayes/build-ivy.xml b/nutch-plugins/parsefilter-naivebayes/build-ivy.xml
new file mode 100644
index 0000000..22bee5f
--- /dev/null
+++ b/nutch-plugins/parsefilter-naivebayes/build-ivy.xml
@@ -0,0 +1,54 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="parsefilter-naivebayes" default="deps-jar" xmlns:ivy="antlib:org.apache.ivy.ant">
+    <!-- Load environment variables as env.* so that the IVY_HOME check below can match. -->
+    <property environment="env" />
+    <property name="ivy.install.version" value="2.1.0" />
+    <condition property="ivy.home" value="${env.IVY_HOME}">
+      <isset property="env.IVY_HOME" />
+    </condition>
+    <property name="ivy.home" value="${user.home}/.ant" />
+    <property name="ivy.checksums" value="" />
+    <property name="ivy.jar.dir" value="${ivy.home}/lib" />
+    <property name="ivy.jar.file" value="${ivy.jar.dir}/ivy.jar" />
+
+    <target name="download-ivy" unless="offline">
+        <mkdir dir="${ivy.jar.dir}"/>
+        <!-- download Ivy from Maven Central (HTTPS) so that it can be used even without any special installation -->
+        <get src="https://repo1.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/ivy-${ivy.install.version}.jar"
+             dest="${ivy.jar.file}" usetimestamp="true"/>
+    </target>
+
+    <target name="init-ivy" depends="download-ivy">
+      <!-- try to load ivy here from ivy home, in case the user has not already dropped
+              it into ant's lib dir (note that the latter copy will always take precedence).
+              We will not fail as long as local lib dir exists (it may be empty) and
+              ivy is in at least one of ant's lib dir or the local lib dir. -->
+        <path id="ivy.lib.path">
+            <fileset dir="${ivy.jar.dir}" includes="*.jar"/>
+        </path>
+        <taskdef resource="org/apache/ivy/ant/antlib.xml"
+                 uri="antlib:org.apache.ivy.ant" classpathref="ivy.lib.path"/>
+    </target>
+
+  <!-- Default target: retrieves the resolved artifacts into lib/. -->
+  <target name="deps-jar" depends="init-ivy">
+    <ivy:retrieve pattern="lib/[artifact]-[revision].[ext]"/>
+  </target>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parsefilter-naivebayes/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parsefilter-naivebayes/build.xml b/nutch-plugins/parsefilter-naivebayes/build.xml
new file mode 100644
index 0000000..6fb7a9d
--- /dev/null
+++ b/nutch-plugins/parsefilter-naivebayes/build.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="parsefilter-naivebayes" default="jar-core">
+  <!-- No targets are defined here; the default jar-core has to come from the imported file. -->
+  <import file="../build-plugin.xml"/>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parsefilter-naivebayes/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parsefilter-naivebayes/ivy.xml b/nutch-plugins/parsefilter-naivebayes/ivy.xml
new file mode 100644
index 0000000..08cca2c
--- /dev/null
+++ b/nutch-plugins/parsefilter-naivebayes/ivy.xml
@@ -0,0 +1,49 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+  <!-- NOTE(review): verify that this relative path still resolves from nutch-plugins/parsefilter-naivebayes/. -->
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+    <!-- NOTE(review): mahout-math is 0.10.1 while mahout-core is 0.9; confirm the version mix is intended. -->
+    <dependency org="org.apache.mahout" name="mahout-math" rev="0.10.1" />
+    <dependency org="org.apache.mahout" name="mahout-core" rev="0.9" >
+      <exclude org="org.apache.mrunit" name="mrunit"/>
+    </dependency>
+    <dependency org="org.apache.lucene" name="lucene-core" rev="5.5.0" />
+    <dependency org="org.apache.lucene" name="lucene-analyzers-common" rev="5.5.0" />
+
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parsefilter-naivebayes/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parsefilter-naivebayes/plugin.xml b/nutch-plugins/parsefilter-naivebayes/plugin.xml
new file mode 100644
index 0000000..ac15041
--- /dev/null
+++ b/nutch-plugins/parsefilter-naivebayes/plugin.xml
@@ -0,0 +1,56 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="parsefilter-naivebayes"
+   name="Naive Bayes Parse Filter"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <!--
+     Jars that make up the plugin at runtime. Only the plugin's own jar
+     exports its classes (export name="*"); the remaining entries are
+     listed without an export element. The file names carry explicit
+     versions, so they have to be updated whenever a dependency revision
+     changes.
+   -->
+   <runtime>
+      <library name="parsefilter-naivebayes.jar">
+         <export name="*"/>
+      </library>
+      <library name="commons-cli-2.0-mahout.jar"/>
+      <library name="commons-lang3-3.1.jar"/>
+      <library name="commons-math3-3.2.jar"/>
+      <library name="guava-14.0.1.jar"/>
+      <library name="jackson-core-asl-1.9.12.jar"/>
+      <library name="jackson-mapper-asl-1.9.12.jar"/>
+      <library name="lucene-analyzers-common-5.5.0.jar"/>
+      <library name="lucene-core-5.5.0.jar"/>
+      <library name="mahout-core-0.9.jar"/>
+      <library name="mahout-math-0.10.1.jar"/>
+      <library name="slf4j-api-1.7.12.jar"/>
+      <library name="solr-commons-csv-3.5.0.jar"/>
+      <library name="t-digest-3.1.jar"/>
+      <library name="xmlpull-1.1.3.1.jar"/>
+      <library name="xpp3_min-1.1.4c.jar"/>
+      <library name="xstream-1.4.4.jar"/> 
+   </runtime>
+
+   <!-- The plugin depends on the nutch-extensionpoints plugin. -->
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <!--
+     Registers NaiveBayesParseFilter as an implementation of the
+     org.apache.nutch.parse.HtmlParseFilter extension point.
+   -->
+   <extension id="org.apache.nutch.htmlparsefilter.naivebayes"
+        name="Nutch Parser Filter" point="org.apache.nutch.parse.HtmlParseFilter">
+      <implementation id="NaiveBayesHTMLParseFilter" 
+        class="org.apache.nutch.parsefilter.naivebayes.NaiveBayesParseFilter"/>
+   </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parsefilter-naivebayes/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parsefilter-naivebayes/pom.xml b/nutch-plugins/parsefilter-naivebayes/pom.xml
new file mode 100644
index 0000000..0a99e47
--- /dev/null
+++ b/nutch-plugins/parsefilter-naivebayes/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <!--
+      Child module of nutch-plugins. No groupId or version is declared in this
+      POM, so both are inherited from the parent referenced below.
+    -->
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>parsefilter-naivebayes</artifactId>
+    <packaging>jar</packaging>
+
+    <name>parsefilter-naivebayes</name>
+    <url>http://nutch.apache.org</url>
+
+    <!--
+      NOTE(review): this POM declares no dependencies of its own; anything the
+      plugin sources need is presumably provided by the parent POM. Confirm.
+    -->
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parsefilter-naivebayes/src/main/java/org/apache/nutch/parsefilter/naivebayes/Classify.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parsefilter-naivebayes/src/main/java/org/apache/nutch/parsefilter/naivebayes/Classify.java b/nutch-plugins/parsefilter-naivebayes/src/main/java/org/apache/nutch/parsefilter/naivebayes/Classify.java
new file mode 100644
index 0000000..d755ff6
--- /dev/null
+++ b/nutch-plugins/parsefilter-naivebayes/src/main/java/org/apache/nutch/parsefilter/naivebayes/Classify.java
@@ -0,0 +1,120 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parsefilter.naivebayes;
+
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.HashMap;
+import java.io.InputStreamReader;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+
+/**
+ * Two-class text classifier backed by the "naivebayes-model" file.
+ *
+ * The model is read lazily on the first call to {@link #classify(String)} and
+ * kept in static fields afterwards. Class "0" means irrelevant, class "1"
+ * means relevant.
+ */
+public class Classify {
+
+  /** Vocabulary size, read from the first line of the model file. */
+  private static int uniquewords_size = 0;
+
+  // Statistics of class "0" (irrelevant) as stored in the model file: number
+  // of examples, number of words and the word:frequency map.
+  private static int numof_ir = 0;
+  private static int numwords_ir = 0;
+  private static HashMap<String, Integer> wordfreq_ir = null;
+
+  // The same statistics for class "1" (relevant).
+  private static int numof_r = 0;
+  private static int numwords_r = 0;
+  private static HashMap<String, Integer> wordfreq_r = null;
+
+  /** Set to true once the model file has been read completely. */
+  private static boolean ismodel = false;
+
+  /**
+   * Parses a flattened map of the form "word:count,word:count,...".
+   *
+   * @param line
+   *          the flattened map; an empty string yields an empty map
+   * @return a map from word to its count
+   * @throws NumberFormatException
+   *           if a count is not an integer
+   * @throws ArrayIndexOutOfBoundsException
+   *           if a non-empty field contains no ":" separator
+   */
+  public static HashMap<String, Integer> unflattenToHashmap(String line) {
+    HashMap<String, Integer> dict = new HashMap<String, Integer>();
+
+    // An empty line encodes an empty map. Splitting it would produce a single
+    // empty field without a ":" and fail on the index lookup below.
+    if (line.isEmpty()) {
+      return dict;
+    }
+
+    for (String field : line.split(",")) {
+      if (field.isEmpty()) {
+        continue;
+      }
+      // split once per field instead of once per accessed element
+      String[] pair = field.split(":");
+      dict.put(pair[0], Integer.valueOf(pair[1]));
+    }
+
+    return dict;
+  }
+
+  /**
+   * Reads the "naivebayes-model" file from the default file system into the
+   * static model fields. The reader is closed even when parsing fails, and
+   * the model is only marked as loaded after every line has been read.
+   */
+  private static void loadModel() throws IOException {
+    Configuration configuration = new Configuration();
+    FileSystem fs = FileSystem.get(configuration);
+
+    BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(
+        fs.open(new Path("naivebayes-model"))));
+    try {
+      uniquewords_size = Integer.valueOf(bufferedReader.readLine());
+      // the line in front of each class section holds the class label; skip it
+      bufferedReader.readLine();
+
+      numof_ir = Integer.valueOf(bufferedReader.readLine());
+      numwords_ir = Integer.valueOf(bufferedReader.readLine());
+      wordfreq_ir = unflattenToHashmap(bufferedReader.readLine());
+      bufferedReader.readLine();
+      numof_r = Integer.valueOf(bufferedReader.readLine());
+      numwords_r = Integer.valueOf(bufferedReader.readLine());
+      wordfreq_r = unflattenToHashmap(bufferedReader.readLine());
+
+      ismodel = true;
+    } finally {
+      bufferedReader.close();
+    }
+  }
+
+  /**
+   * Classifies a text as relevant or irrelevant.
+   *
+   * The text is reduced to the letters a-z and spaces, lower-cased and split
+   * on single spaces before the per-class scores are accumulated.
+   *
+   * @param line
+   *          the text to classify
+   * @return "0" if the irrelevant class scores strictly higher, otherwise "1"
+   * @throws IOException
+   *           if the model file cannot be opened or read
+   */
+  public static String classify(String line) throws IOException {
+
+    String[] linearray = line.replaceAll("[^a-zA-Z ]", "").toLowerCase()
+        .split(" ");
+
+    // read the model file on first use only
+    if (!ismodel) {
+      loadModel();
+    }
+
+    // the denominators do not depend on the word, so compute them once
+    double logtotal_ir = Math.log(numwords_ir + uniquewords_size);
+    double logtotal_r = Math.log(numwords_r + uniquewords_size);
+
+    double prob_ir = 0;
+    double prob_r = 0;
+
+    // NOTE(review): the constant 1 is added outside the logarithm, for every
+    // word and for both classes, so it cancels out in the final comparison.
+    // Standard add-one smoothing would use log(count + 1) instead. The formula
+    // is kept as it was so that results do not change; confirm the intent.
+    for (String word : linearray) {
+      if (wordfreq_ir.containsKey(word))
+        prob_ir += Math.log(wordfreq_ir.get(word)) + 1 - logtotal_ir;
+      else
+        prob_ir += 1 - logtotal_ir;
+
+      if (wordfreq_r.containsKey(word))
+        prob_r += Math.log(wordfreq_r.get(word)) + 1 - logtotal_r;
+      else
+        prob_r += 1 - logtotal_r;
+    }
+
+    // add the class priors
+    prob_ir += Math.log(numof_ir) - Math.log(numof_ir + numof_r);
+    prob_r += Math.log(numof_r) - Math.log(numof_ir + numof_r);
+
+    // ties go to the relevant class
+    return (prob_ir > prob_r) ? "0" : "1";
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parsefilter-naivebayes/src/main/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parsefilter-naivebayes/src/main/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java b/nutch-plugins/parsefilter-naivebayes/src/main/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java
new file mode 100644
index 0000000..30810ae
--- /dev/null
+++ b/nutch-plugins/parsefilter-naivebayes/src/main/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java
@@ -0,0 +1,197 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parsefilter.naivebayes;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.w3c.dom.DocumentFragment;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.nutch.parse.HTMLMetaTags;
+import org.apache.nutch.parse.HtmlParseFilter;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseResult;
+import org.apache.nutch.protocol.Content;
+
+import java.io.Reader;
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.util.ArrayList;
+
+/**
+ * Html Parse filter that classifies the outlinks from the parseresult as
+ * relevant or irrelevant based on the parseText's relevancy (using a training
+ * file where you can give positive and negative example texts, see the
+ * description of parsefilter.naivebayes.trainfile). If the page is found
+ * irrelevant, each outlink gets a second chance: it is kept if its URL
+ * contains any of the words from the list given in
+ * parsefilter.naivebayes.wordlist. CAUTION: Set the parser.timeout to -1 or a
+ * bigger value than 30, when using this classifier.
+ */
+public class NaiveBayesParseFilter implements HtmlParseFilter {
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(NaiveBayesParseFilter.class);
+
+  /** Property naming the training file (read as a configuration resource). */
+  public static final String TRAINFILE_MODELFILTER = "parsefilter.naivebayes.trainfile";
+  /** Property naming the word list file (read as a configuration resource). */
+  public static final String DICTFILE_MODELFILTER = "parsefilter.naivebayes.wordlist";
+
+  private Configuration conf;
+  private String inputFilePath;
+  private String dictionaryFile;
+  // words that give an outlink of an irrelevant page a second chance
+  private ArrayList<String> wordlist = new ArrayList<String>();
+
+  /**
+   * Classifies the page text, treating a classification failure as
+   * "irrelevant".
+   *
+   * @param text
+   *          the parsed text of the page
+   * @return true if the text is classified as relevant, false if it is
+   *         classified as irrelevant or an IOException occurred
+   */
+  public boolean filterParse(String text) {
+
+    try {
+      return classify(text);
+    } catch (IOException e) {
+      LOG.error("Error occured while classifying:: " + text + " ::"
+          + StringUtils.stringifyException(e));
+    }
+
+    return false;
+  }
+
+  /**
+   * @param url
+   *          the URL of an outlink
+   * @return true if the URL contains any word of the configured word list
+   */
+  public boolean filterUrl(String url) {
+
+    return containsWord(url, wordlist);
+
+  }
+
+  /**
+   * @param text
+   *          the text to classify
+   * @return true if the classifier labels the text "1" (relevant)
+   * @throws IOException
+   *           if the model cannot be read
+   */
+  public boolean classify(String text) throws IOException {
+    return Classify.classify(text).equals("1");
+  }
+
+  /**
+   * Trains the model from the configured training file, unless a
+   * "naivebayes-model" file already exists on the configured file system.
+   */
+  public void train() throws Exception {
+    // check if the model file exists, if it does then don't train
+    if (!FileSystem.get(conf).exists(new Path("naivebayes-model"))) {
+      LOG.info("Training the Naive Bayes Model");
+      Train.start(inputFilePath);
+    } else {
+      LOG.info("Model file already exists. Skipping training.");
+    }
+  }
+
+  /**
+   * @return true if url contains at least one entry of wordlist as a
+   *         substring
+   */
+  public boolean containsWord(String url, ArrayList<String> wordlist) {
+    for (String word : wordlist) {
+      if (url.contains(word)) {
+        return true;
+      }
+    }
+
+    return false;
+  }
+
+  /**
+   * Reads the plugin properties, loads the word list and triggers training.
+   *
+   * @throws IllegalArgumentException
+   *           if either property is unset or blank, or if the word list
+   *           cannot be found as a configuration resource
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    inputFilePath = conf.get(TRAINFILE_MODELFILTER);
+    dictionaryFile = conf.get(DICTFILE_MODELFILTER);
+    if (inputFilePath == null || inputFilePath.trim().length() == 0
+        || dictionaryFile == null || dictionaryFile.trim().length() == 0) {
+      // name the properties exactly as they have to be set
+      String message = "ParseFilter: NaiveBayes: trainfile or wordlist not set in the "
+          + TRAINFILE_MODELFILTER + " or " + DICTFILE_MODELFILTER;
+      LOG.error(message);
+      throw new IllegalArgumentException(message);
+    }
+
+    // The word list is read as a configuration resource, so its presence is
+    // checked the same way. The former FileSystem.exists() test looked in a
+    // different place and was inverted: it reported "not found" exactly when
+    // the files did exist there.
+    Reader reader = conf.getConfResourceAsReader(dictionaryFile);
+    if (reader == null) {
+      String message = "ParseFilter: NaiveBayes: " + dictionaryFile
+          + " not found!";
+      LOG.error(message);
+      throw new IllegalArgumentException(message);
+    }
+
+    BufferedReader br = new BufferedReader(reader);
+    try {
+      String currentLine;
+      while ((currentLine = br.readLine()) != null) {
+        // a blank entry would match every URL in containsWord(), skip it
+        if (currentLine.trim().length() == 0) {
+          continue;
+        }
+        wordlist.add(currentLine);
+      }
+    } catch (IOException e) {
+      LOG.error(StringUtils.stringifyException(e));
+    } finally {
+      try {
+        br.close();
+      } catch (IOException e) {
+        LOG.error(StringUtils.stringifyException(e));
+      }
+    }
+
+    // problems with the training file surface here and are only logged
+    try {
+      train();
+    } catch (Exception e) {
+
+      LOG.error("Error occured while training:: "
+          + StringUtils.stringifyException(e));
+
+    }
+
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  /**
+   * Keeps all outlinks of a page classified as relevant. For a page
+   * classified as irrelevant, only the outlinks whose URL contains a word
+   * from the word list are kept; the others are removed from the parse data.
+   *
+   * @return the given parseResult, with the outlinks possibly reduced
+   */
+  @Override
+  public ParseResult filter(Content content, ParseResult parseResult,
+      HTMLMetaTags metaTags, DocumentFragment doc) {
+
+    Parse parse = parseResult.get(content.getUrl());
+
+    String url = content.getBaseUrl();
+    String text = parse.getText();
+
+    if (filterParse(text)) {
+      LOG.info("ParseFilter: NaiveBayes: Page found relevant:: " + url);
+      return parseResult;
+    }
+
+    // kick in the second tier if parent page found irrelevant
+    LOG.info("ParseFilter: NaiveBayes: Page found irrelevant:: " + url);
+    LOG.info("Checking outlinks");
+
+    ArrayList<Outlink> keptOutlinks = new ArrayList<Outlink>();
+    for (Outlink outlink : parse.getData().getOutlinks()) {
+      String toUrl = outlink.getToUrl();
+      LOG.info("ParseFilter: NaiveBayes: Outlink to check:: " + toUrl);
+      if (filterUrl(toUrl)) {
+        keptOutlinks.add(outlink);
+        LOG.info("ParseFilter: NaiveBayes: found relevant");
+      } else {
+        LOG.info("ParseFilter: NaiveBayes: found irrelevant");
+      }
+    }
+    parse.getData().setOutlinks(
+        keptOutlinks.toArray(new Outlink[keptOutlinks.size()]));
+
+    return parseResult;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parsefilter-naivebayes/src/main/java/org/apache/nutch/parsefilter/naivebayes/Train.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parsefilter-naivebayes/src/main/java/org/apache/nutch/parsefilter/naivebayes/Train.java b/nutch-plugins/parsefilter-naivebayes/src/main/java/org/apache/nutch/parsefilter/naivebayes/Train.java
new file mode 100644
index 0000000..19a6911
--- /dev/null
+++ b/nutch-plugins/parsefilter-naivebayes/src/main/java/org/apache/nutch/parsefilter/naivebayes/Train.java
@@ -0,0 +1,148 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parsefilter.naivebayes;
+
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.util.HashMap;
+import java.util.HashSet;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+
+/**
+ * Builds the "naivebayes-model" file from a tab-separated training file.
+ *
+ * Each training line has the form "label&lt;TAB&gt;text". The label "0" marks
+ * an irrelevant example; every other label is counted as relevant.
+ */
+public class Train {
+
+  /**
+   * Removes the first occurrence of tomatch from line.
+   *
+   * @return line without the first occurrence of tomatch, or line unchanged
+   *         if tomatch does not occur
+   */
+  public static String replacefirstoccuranceof(String tomatch, String line) {
+
+    int index = line.indexOf(tomatch);
+    if (index == -1) {
+      return line;
+    } else {
+      return line.substring(0, index)
+          + line.substring(index + tomatch.length());
+    }
+
+  }
+
+  /**
+   * Increments the counter stored under key, starting at 1 for a new key.
+   * The empty string is ignored.
+   */
+  public static void updateHashMap(HashMap<String, Integer> dict, String key) {
+    if (!key.equals("")) {
+      Integer count = dict.get(key);
+      dict.put(key, (count == null) ? 1 : count + 1);
+    }
+  }
+
+  /**
+   * Serializes a map as "key:value,key:value,..." in the iteration order of
+   * its key set.
+   *
+   * @return the flattened map; the empty string for an empty map (this case
+   *         used to fail with a StringIndexOutOfBoundsException)
+   */
+  public static String flattenHashMap(HashMap<String, Integer> dict) {
+    StringBuilder result = new StringBuilder();
+
+    for (String key : dict.keySet()) {
+      // separator goes in front of every entry but the first, so there is no
+      // trailing comma to strip afterwards
+      if (result.length() > 0) {
+        result.append(',');
+      }
+      result.append(key).append(':').append(dict.get(key));
+    }
+
+    return result.toString();
+  }
+
+  /**
+   * Reads the training file and writes the model file "naivebayes-model" to
+   * the default file system, overwriting an existing one.
+   *
+   * @param filepath
+   *          name of the training file, resolved as a configuration resource
+   * @throws IOException
+   *           if the training file cannot be found or read, or the model
+   *           file cannot be written
+   */
+  public static void start(String filepath) throws IOException {
+
+    // two classes 0/irrelevant and 1/relevant
+
+    // calculate the total number of instances/examples per class, word count in
+    // each class and for each class a word:frequency map
+
+    int numof_ir = 0;
+    int numof_r = 0;
+    int numwords_ir = 0;
+    int numwords_r = 0;
+    HashSet<String> uniquewords = new HashSet<String>();
+    HashMap<String, Integer> wordfreq_ir = new HashMap<String, Integer>();
+    HashMap<String, Integer> wordfreq_r = new HashMap<String, Integer>();
+
+    Configuration configuration = new Configuration();
+    FileSystem fs = FileSystem.get(configuration);
+
+    // getConfResourceAsReader() returns null for an unknown resource; report
+    // that explicitly instead of failing with a NullPointerException
+    java.io.Reader source = configuration.getConfResourceAsReader(filepath);
+    if (source == null) {
+      throw new IOException("Training file not found: " + filepath);
+    }
+
+    BufferedReader bufferedReader = new BufferedReader(source);
+    try {
+      String line;
+      while ((line = bufferedReader.readLine()) != null) {
+
+        // the label is everything in front of the first tab
+        String target = line.split("\t")[0];
+
+        line = replacefirstoccuranceof(target + "\t", line);
+
+        String[] linearray = line.replaceAll("[^a-zA-Z ]", "").toLowerCase()
+            .split(" ");
+
+        // update the data structures
+        if (target.equals("0")) {
+          numof_ir += 1;
+          numwords_ir += linearray.length;
+          for (String word : linearray) {
+            uniquewords.add(word);
+            updateHashMap(wordfreq_ir, word);
+          }
+        } else {
+          numof_r += 1;
+          numwords_r += linearray.length;
+          for (String word : linearray) {
+            uniquewords.add(word);
+            updateHashMap(wordfreq_r, word);
+          }
+        }
+
+      }
+    } finally {
+      bufferedReader.close();
+    }
+
+    // assemble the whole model before the file is created, so that a failure
+    // up to this point cannot leave a truncated model file behind
+    StringBuilder model = new StringBuilder();
+    model.append(String.valueOf(uniquewords.size())).append("\n");
+    model.append("0\n");
+    model.append(String.valueOf(numof_ir)).append("\n");
+    model.append(String.valueOf(numwords_ir)).append("\n");
+    model.append(flattenHashMap(wordfreq_ir)).append("\n");
+    model.append("1\n");
+    model.append(String.valueOf(numof_r)).append("\n");
+    model.append(String.valueOf(numwords_r)).append("\n");
+    model.append(flattenHashMap(wordfreq_r)).append("\n");
+
+    // write the model file
+
+    Path path = new Path("naivebayes-model");
+
+    Writer writer = new BufferedWriter(new OutputStreamWriter(fs.create(path,
+        true)));
+    try {
+      writer.write(model.toString());
+    } finally {
+      writer.close();
+    }
+
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parsefilter-naivebayes/src/main/java/org/apache/nutch/parsefilter/naivebayes/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/parsefilter-naivebayes/src/main/java/org/apache/nutch/parsefilter/naivebayes/package-info.java b/nutch-plugins/parsefilter-naivebayes/src/main/java/org/apache/nutch/parsefilter/naivebayes/package-info.java
new file mode 100644
index 0000000..6a892be
--- /dev/null
+++ b/nutch-plugins/parsefilter-naivebayes/src/main/java/org/apache/nutch/parsefilter/naivebayes/package-info.java
@@ -0,0 +1,28 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Html Parse filter that classifies the outlinks from the parseresult as
+ * relevant or irrelevant based on the parseText's relevancy (using a training
+ * file where you can give positive and negative example texts see the
+ * description of parsefilter.naivebayes.trainfile) and if found irrelevant
+ * it gives the link a second chance if it contains any of the words from the
+ * list given in parsefilter.naivebayes.wordlist. CAUTION: Set the
+ * parser.timeout to -1 or a bigger value than 30, when using this classifier.
+ */
+package org.apache.nutch.parsefilter.naivebayes;
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parsefilter-regex/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parsefilter-regex/build.xml b/nutch-plugins/parsefilter-regex/build.xml
new file mode 100644
index 0000000..14d1127
--- /dev/null
+++ b/nutch-plugins/parsefilter-regex/build.xml
@@ -0,0 +1,27 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<!-- Ant build of the parsefilter-regex plugin; the default target is jar-core. -->
+<project name="parsefilter-regex" default="jar-core">
+
+  <!-- The targets themselves come from the imported build-plugin.xml. -->
+  <import file="../build-plugin.xml"/>
+
+  <!-- for junit test -->
+  <!--
+    These two tasks are declared outside of any target: they create
+    ${build.test}/data and copy the contents of the local "data" directory
+    into it.
+  -->
+  <mkdir dir="${build.test}/data"/>
+  <copy todir="${build.test}/data">
+    <fileset dir="data" />
+  </copy>
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parsefilter-regex/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/parsefilter-regex/ivy.xml b/nutch-plugins/parsefilter-regex/ivy.xml
new file mode 100644
index 0000000..ed4cbc3
--- /dev/null
+++ b/nutch-plugins/parsefilter-regex/ivy.xml
@@ -0,0 +1,37 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <!-- Module identity: the module name is taken from the Ant project name. -->
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <!-- Configurations are not declared here; they come from the included file. -->
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <!-- This module declares no dependencies element, only its own artifact. -->
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>  
+</ivy-module>