You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2014/05/20 12:18:32 UTC

svn commit: r1596180 - in /manifoldcf/branches/CONNECTORS-916: ./ connectors/amazoncloudsearch/ connectors/amazoncloudsearch/connector/src/main/java/org/apache/manifoldcf/agents/output/amazoncloudsearch/

Author: kwright
Date: Tue May 20 10:18:32 2014
New Revision: 1596180

URL: http://svn.apache.org/r1596180
Log:
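Latest patch: bump POI to 3.10-beta2 and add downloads/includes for additional libraries (mime4j, pdfbox, poi-ooxml, etc.); in the Amazon CloudSearch connector, replace HtmlParser with AutoDetectParser and use ManifoldCF.hash(documentURI) as the document ID; remove the explicit tagsoup dependency from the connector pom.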

Modified:
    manifoldcf/branches/CONNECTORS-916/build.xml
    manifoldcf/branches/CONNECTORS-916/connectors/amazoncloudsearch/build.xml
    manifoldcf/branches/CONNECTORS-916/connectors/amazoncloudsearch/connector/src/main/java/org/apache/manifoldcf/agents/output/amazoncloudsearch/AmazonCloudSearchConnector.java
    manifoldcf/branches/CONNECTORS-916/connectors/amazoncloudsearch/pom.xml

Modified: manifoldcf/branches/CONNECTORS-916/build.xml
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-916/build.xml?rev=1596180&r1=1596179&r2=1596180&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-916/build.xml (original)
+++ manifoldcf/branches/CONNECTORS-916/build.xml Tue May 20 10:18:32 2014
@@ -1015,7 +1015,7 @@ Use Apache Forrest version forrest-0.9-d
         <mkdir dir="lib"/>
         <antcall target="download-via-maven"><param name="target" value="lib"/>
             <param name="project-path" value="org/apache/poi"/>
-            <param name="artifact-version" value="3.7"/>
+            <param name="artifact-version" value="3.10-beta2"/>
             <param name="artifact-name" value="poi"/>
             <param name="artifact-type" value="jar"/>
         </antcall>
@@ -1591,6 +1591,174 @@ Use Apache Forrest version forrest-0.9-d
             <param name="artifact-name" value="tagsoup"/>
             <param name="artifact-type" value="jar"/>
         </antcall>
+        <antcall target="download-via-maven"><param name="target" value="lib"/>
+            <param name="project-path" value="org/apache/james"/>
+            <param name="artifact-name" value="apache-mime4j-core"/>
+            <param name="artifact-version" value="0.7.2"/>
+            <param name="artifact-type" value="jar"/>
+        </antcall>
+        <antcall target="download-via-maven"><param name="target" value="lib"/>
+            <param name="project-path" value="org/apache/james"/>
+            <param name="artifact-name" value="apache-mime4j-dom"/>
+            <param name="artifact-version" value="0.7.2"/>
+            <param name="artifact-type" value="jar"/>
+        </antcall>
+        <antcall target="download-via-maven"><param name="target" value="lib"/>
+            <param name="project-path" value="org/gagravarr"/>
+            <param name="artifact-name" value="vorbis-java-tika"/>
+            <param name="artifact-version" value="0.1"/>
+            <param name="artifact-type" value="jar"/>
+        </antcall>
+        <antcall target="download-via-maven"><param name="target" value="lib"/>
+            <param name="project-path" value="org/apache/poi"/>
+            <param name="artifact-name" value="poi-scratchpad"/>
+            <param name="artifact-version" value="3.10-beta2"/>
+            <param name="artifact-type" value="jar"/>
+        </antcall>
+        <antcall target="download-via-maven"><param name="target" value="lib"/>
+            <param name="project-path" value="com/drewnoakes"/>
+            <param name="artifact-name" value="metadata-extractor"/>
+            <param name="artifact-version" value="2.6.2"/>
+            <param name="artifact-type" value="jar"/>
+        </antcall>
+        <antcall target="download-via-maven"><param name="target" value="lib"/>
+            <param name="project-path" value="org/aspectj"/>
+            <param name="artifact-name" value="aspectjrt"/>
+            <param name="artifact-version" value="1.6.11"/>
+            <param name="artifact-type" value="jar"/>
+        </antcall>
+        <antcall target="download-via-maven"><param name="target" value="lib"/>
+            <param name="project-path" value="com/uwyn"/>
+            <param name="artifact-name" value="jhighlight"/>
+            <param name="artifact-version" value="1.0"/>
+            <param name="artifact-type" value="jar"/>
+        </antcall>
+        <antcall target="download-via-maven"><param name="target" value="lib"/>
+            <param name="project-path" value="org/apache/xmlbeans"/>
+            <param name="artifact-name" value="xmlbeans"/>
+            <param name="artifact-version" value="2.3.0"/>
+            <param name="artifact-type" value="jar"/>
+        </antcall>
+        <antcall target="download-via-maven"><param name="target" value="lib"/>
+            <param name="project-path" value="org/bouncycastle"/>
+            <param name="artifact-name" value="bcprov-jdk15"/>
+            <param name="artifact-version" value="1.45"/>
+            <param name="artifact-type" value="jar"/>
+        </antcall>
+        <antcall target="download-via-maven"><param name="target" value="lib"/>
+            <param name="project-path" value="org/tukaani"/>
+            <param name="artifact-name" value="xz"/>
+            <param name="artifact-version" value="1.2"/>
+            <param name="artifact-type" value="jar"/>
+        </antcall>
+        <antcall target="download-via-maven"><param name="target" value="lib"/>
+            <param name="project-path" value="org/apache/pdfbox"/>
+            <param name="artifact-name" value="jempbox"/>
+            <param name="artifact-version" value="1.8.4"/>
+            <param name="artifact-type" value="jar"/>
+        </antcall>
+        <antcall target="download-via-maven"><param name="target" value="lib"/>
+            <param name="project-path" value="org/apache/pdfbox"/>
+            <param name="artifact-name" value="pdfbox"/>
+            <param name="artifact-version" value="1.8.4"/>
+            <param name="artifact-type" value="jar"/>
+        </antcall>
+        <antcall target="download-via-maven"><param name="target" value="lib"/>
+            <param name="project-path" value="org/apache/pdfbox"/>
+            <param name="artifact-name" value="fontbox"/>
+            <param name="artifact-version" value="1.8.4"/>
+            <param name="artifact-type" value="jar"/>
+        </antcall>
+        <antcall target="download-via-maven"><param name="target" value="lib"/>
+            <param name="project-path" value="org/apache/commons"/>
+            <param name="artifact-name" value="commons-compress"/>
+            <param name="artifact-version" value="1.5"/>
+            <param name="artifact-type" value="jar"/>
+        </antcall>
+        <antcall target="download-via-maven"><param name="target" value="lib"/>
+            <param name="project-path" value="com/googlecode/juniversalchardet"/>
+            <param name="artifact-name" value="juniversalchardet"/>
+            <param name="artifact-version" value="1.0.3"/>
+            <param name="artifact-type" value="jar"/>
+        </antcall>
+        <antcall target="download-via-maven"><param name="target" value="lib"/>
+            <param name="project-path" value="edu/ucar"/>
+            <param name="artifact-name" value="netcdf"/>
+            <param name="artifact-version" value="4.2-min"/>
+            <param name="artifact-type" value="jar"/>
+        </antcall>
+        <antcall target="download-via-maven"><param name="target" value="lib"/>
+            <param name="project-path" value="de/l3s/boilerpipe"/>
+            <param name="artifact-name" value="boilerpipe"/>
+            <param name="artifact-version" value="1.1.0"/>
+            <param name="artifact-type" value="jar"/>
+        </antcall>
+        <antcall target="download-via-maven"><param name="target" value="lib"/>
+            <param name="project-path" value="dom4j"/>
+            <param name="artifact-name" value="dom4j"/>
+            <param name="artifact-version" value="1.6.1"/>
+            <param name="artifact-type" value="jar"/>
+        </antcall>
+        <antcall target="download-via-maven"><param name="target" value="lib"/>
+            <param name="project-path" value="org/ow2/asm"/>
+            <param name="artifact-name" value="asm-debug-all"/>
+            <param name="artifact-version" value="4.1"/>
+            <param name="artifact-type" value="jar"/>
+        </antcall>
+        <antcall target="download-via-maven"><param name="target" value="lib"/>
+            <param name="project-path" value="com/adobe/xmp"/>
+            <param name="artifact-name" value="xmpcore"/>
+            <param name="artifact-version" value="5.1.2"/>
+            <param name="artifact-type" value="jar"/>
+        </antcall>
+        <antcall target="download-via-maven"><param name="target" value="lib"/>
+            <param name="project-path" value="org/gagravarr"/>
+            <param name="artifact-name" value="vorbis-java-core"/>
+            <param name="artifact-version" value="0.1"/>
+            <param name="artifact-type" value="jar"/>
+        </antcall>
+        <antcall target="download-via-maven"><param name="target" value="lib"/>
+            <param name="project-path" value="org/apache/poi"/>
+            <param name="artifact-name" value="poi-ooxml"/>
+            <param name="artifact-version" value="3.10-beta2"/>
+            <param name="artifact-type" value="jar"/>
+        </antcall>
+        <antcall target="download-via-maven"><param name="target" value="lib"/>
+            <param name="project-path" value="org/apache/poi"/>
+            <param name="artifact-name" value="poi-ooxml-schemas"/>
+            <param name="artifact-version" value="3.10-beta2"/>
+            <param name="artifact-type" value="jar"/>
+        </antcall>
+        <antcall target="download-via-maven"><param name="target" value="lib"/>
+            <param name="project-path" value="org/bouncycastle"/>
+            <param name="artifact-name" value="bcmail-jdk15"/>
+            <param name="artifact-version" value="1.45"/>
+            <param name="artifact-type" value="jar"/>
+        </antcall>
+        <antcall target="download-via-maven"><param name="target" value="lib"/>
+            <param name="project-path" value="jdom"/>
+            <param name="artifact-name" value="jdom"/>
+            <param name="artifact-version" value="1.0"/>
+            <param name="artifact-type" value="jar"/>
+        </antcall>
+        <antcall target="download-via-maven"><param name="target" value="lib"/>
+            <param name="project-path" value="org/apache/geronimo/specs"/>
+            <param name="artifact-name" value="geronimo-stax-api_1.0_spec"/>
+            <param name="artifact-version" value="1.0.1"/>
+            <param name="artifact-type" value="jar"/>
+        </antcall>
+        <antcall target="download-via-maven"><param name="target" value="lib"/>
+            <param name="project-path" value="rome"/>
+            <param name="artifact-name" value="rome"/>
+            <param name="artifact-version" value="0.9"/>
+            <param name="artifact-type" value="jar"/>
+        </antcall>
+        <antcall target="download-via-maven"><param name="target" value="lib"/>
+            <param name="project-path" value="com/googlecode/mp4parser"/>
+            <param name="artifact-name" value="isoparser"/>
+            <param name="artifact-version" value="1.0-RC-1"/>
+            <param name="artifact-type" value="jar"/>
+        </antcall>
     </target>
 	
     <target name="download-jackson">

Modified: manifoldcf/branches/CONNECTORS-916/connectors/amazoncloudsearch/build.xml
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-916/connectors/amazoncloudsearch/build.xml?rev=1596180&r1=1596179&r2=1596180&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-916/connectors/amazoncloudsearch/build.xml (original)
+++ manifoldcf/branches/CONNECTORS-916/connectors/amazoncloudsearch/build.xml Tue May 20 10:18:32 2014
@@ -41,6 +41,38 @@
             <include name="tika-parsers*.jar"/>
             <include name="tagsoup*.jar"/>
             <include name="poi*.jar"/>
+            <include name="vorbis-java-tika*.jar"/>
+            <include name="vorbis-java-core*.jar"/>
+            <include name="netcdf*.jar"/>
+            <include name="apache-mime4j-core*.jar"/>
+            <include name="apache-mime4j-dom*.jar"/>
+            <include name="commons-compress*.jar"/>
+            <include name="commons-codec*.jar"/>
+            <include name="pdfbox*.jar"/>
+            <include name="fontbox*.jar"/>
+            <include name="jempbox*.jar"/>
+            <include name="commons-logging*.jar"/>
+            <include name="bcmail-jdk15*.jar"/>
+            <include name="bcprov-jdk15*.jar"/>
+            <include name="poi-scratchpad*.jar"/>
+            <include name="poi-ooxml*.jar"/>
+            <include name="poi-ooxml-schemas*.jar"/>
+            <include name="xmlbeans*.jar"/>
+            <include name="dom4j*.jar"/>
+            <include name="geronimo-stax-api_1.0_spec*.jar"/>
+            <include name="asm-debug-all*.jar"/>
+            <include name="isoparser*.jar"/>
+            <include name="aspectjrt*.jar"/>
+            <include name="metadata-extractor*.jar"/>
+            <include name="xmpcore*.jar"/>
+            <include name="xml-apis*.jar"/>
+            <include name="boilerpipe*.jar"/>
+            <include name="rome*.jar"/>
+            <include name="jdom*.jar"/>
+            <include name="xercesImpl*.jar"/>
+            <include name="vorbis-java-core*.jar"/>
+            <include name="juniversalchardet*.jar"/>
+            <include name="jhighlight*.jar"/>
         </fileset>
     </path>
 
@@ -56,6 +88,38 @@
                 <include name="tika-parsers*.jar"/>
                 <include name="tagsoup*.jar"/>
                 <include name="poi*.jar"/>
+                <include name="vorbis-java-tika*.jar"/>
+                <include name="vorbis-java-core*.jar"/>
+                <include name="netcdf*.jar"/>
+                <include name="apache-mime4j-core*.jar"/>
+                <include name="apache-mime4j-dom*.jar"/>
+                <include name="commons-compress*.jar"/>
+                <include name="commons-codec*.jar"/>
+                <include name="pdfbox*.jar"/>
+                <include name="fontbox*.jar"/>
+                <include name="jempbox*.jar"/>
+                <include name="commons-logging*.jar"/>
+                <include name="bcmail-jdk15*.jar"/>
+                <include name="bcprov-jdk15*.jar"/>
+                <include name="poi-scratchpad*.jar"/>
+                <include name="poi-ooxml*.jar"/>
+                <include name="poi-ooxml-schemas*.jar"/>
+                <include name="xmlbeans*.jar"/>
+                <include name="dom4j*.jar"/>
+                <include name="geronimo-stax-api_1.0_spec*.jar"/>
+                <include name="asm-debug-all*.jar"/>
+                <include name="isoparser*.jar"/>
+                <include name="aspectjrt*.jar"/>
+                <include name="metadata-extractor*.jar"/>
+                <include name="xmpcore*.jar"/>
+                <include name="xml-apis*.jar"/>
+                <include name="boilerpipe*.jar"/>
+                <include name="rome*.jar"/>
+                <include name="jdom*.jar"/>
+                <include name="xercesImpl*.jar"/>
+                <include name="vorbis-java-core*.jar"/>
+                <include name="juniversalchardet*.jar"/>
+                <include name="jhighlight*.jar"/>
             </fileset>
         </copy>
     </target>

Modified: manifoldcf/branches/CONNECTORS-916/connectors/amazoncloudsearch/connector/src/main/java/org/apache/manifoldcf/agents/output/amazoncloudsearch/AmazonCloudSearchConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-916/connectors/amazoncloudsearch/connector/src/main/java/org/apache/manifoldcf/agents/output/amazoncloudsearch/AmazonCloudSearchConnector.java?rev=1596180&r1=1596179&r2=1596180&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-916/connectors/amazoncloudsearch/connector/src/main/java/org/apache/manifoldcf/agents/output/amazoncloudsearch/AmazonCloudSearchConnector.java (original)
+++ manifoldcf/branches/CONNECTORS-916/connectors/amazoncloudsearch/connector/src/main/java/org/apache/manifoldcf/agents/output/amazoncloudsearch/AmazonCloudSearchConnector.java Tue May 20 10:18:32 2014
@@ -59,14 +59,15 @@ import org.apache.manifoldcf.core.interf
 import org.apache.manifoldcf.core.interfaces.IPostParameters;
 import org.apache.manifoldcf.core.interfaces.IPasswordMapperActivity;
 import org.apache.manifoldcf.core.interfaces.SpecificationNode;
-import org.apache.manifoldcf.crawler.system.Logging;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.parser.html.HtmlParser;
-import org.apache.tika.sax.BodyContentHandler;
-import org.xml.sax.ContentHandler;
+import org.apache.manifoldcf.core.system.ManifoldCF;
+import org.apache.manifoldcf.crawler.system.Logging;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
 import com.fasterxml.jackson.core.JsonFactory;
@@ -355,7 +356,7 @@ public class AmazonCloudSearchConnector 
       SDFModel model = new SDFModel();
       Document doc = model.new Document();
       doc.setType("add");
-      doc.setId(documentURI);
+      doc.setId(ManifoldCF.hash(documentURI));
       
       HashMap fields = new HashMap();
       Metadata metadata = extractBinaryFile(document, fields);
@@ -440,7 +441,7 @@ public class AmazonCloudSearchConnector 
     
     //extract body text and metadata fields from binary file.
     InputStream is = document.getBinaryStream();
-    Parser parser = new HtmlParser(); //TODO
+    Parser parser = new AutoDetectParser();
     ContentHandler handler = new BodyContentHandler();
     Metadata metadata = new Metadata();
     parser.parse(is, handler, metadata, new ParseContext());

Modified: manifoldcf/branches/CONNECTORS-916/connectors/amazoncloudsearch/pom.xml
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-916/connectors/amazoncloudsearch/pom.xml?rev=1596180&r1=1596179&r2=1596180&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-916/connectors/amazoncloudsearch/pom.xml (original)
+++ manifoldcf/branches/CONNECTORS-916/connectors/amazoncloudsearch/pom.xml Tue May 20 10:18:32 2014
@@ -227,14 +227,8 @@
 	  <groupId>org.apache.tika</groupId>
 	  <artifactId>tika-parsers</artifactId>
 	  <version>1.5</version>
-    </dependency>      
-    <dependency>
-      <groupId>org.ccil.cowan.tagsoup</groupId>
-      <artifactId>tagsoup</artifactId>
-      <version>1.2.1</version>
     </dependency>
     
-    
     <!-- Testing dependencies -->
     
     <dependency>