You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2014/05/20 12:18:32 UTC
svn commit: r1596180 - in /manifoldcf/branches/CONNECTORS-916: ./
connectors/amazoncloudsearch/
connectors/amazoncloudsearch/connector/src/main/java/org/apache/manifoldcf/agents/output/amazoncloudsearch/
Author: kwright
Date: Tue May 20 10:18:32 2014
New Revision: 1596180
URL: http://svn.apache.org/r1596180
Log:
Latest patch
Modified:
manifoldcf/branches/CONNECTORS-916/build.xml
manifoldcf/branches/CONNECTORS-916/connectors/amazoncloudsearch/build.xml
manifoldcf/branches/CONNECTORS-916/connectors/amazoncloudsearch/connector/src/main/java/org/apache/manifoldcf/agents/output/amazoncloudsearch/AmazonCloudSearchConnector.java
manifoldcf/branches/CONNECTORS-916/connectors/amazoncloudsearch/pom.xml
Modified: manifoldcf/branches/CONNECTORS-916/build.xml
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-916/build.xml?rev=1596180&r1=1596179&r2=1596180&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-916/build.xml (original)
+++ manifoldcf/branches/CONNECTORS-916/build.xml Tue May 20 10:18:32 2014
@@ -1015,7 +1015,7 @@ Use Apache Forrest version forrest-0.9-d
<mkdir dir="lib"/>
<antcall target="download-via-maven"><param name="target" value="lib"/>
<param name="project-path" value="org/apache/poi"/>
- <param name="artifact-version" value="3.7"/>
+ <param name="artifact-version" value="3.10-beta2"/>
<param name="artifact-name" value="poi"/>
<param name="artifact-type" value="jar"/>
</antcall>
@@ -1591,6 +1591,174 @@ Use Apache Forrest version forrest-0.9-d
<param name="artifact-name" value="tagsoup"/>
<param name="artifact-type" value="jar"/>
</antcall>
+ <antcall target="download-via-maven"><param name="target" value="lib"/>
+ <param name="project-path" value="org/apache/james"/>
+ <param name="artifact-name" value="apache-mime4j-core"/>
+ <param name="artifact-version" value="0.7.2"/>
+ <param name="artifact-type" value="jar"/>
+ </antcall>
+ <antcall target="download-via-maven"><param name="target" value="lib"/>
+ <param name="project-path" value="org/apache/james"/>
+ <param name="artifact-name" value="apache-mime4j-dom"/>
+ <param name="artifact-version" value="0.7.2"/>
+ <param name="artifact-type" value="jar"/>
+ </antcall>
+ <antcall target="download-via-maven"><param name="target" value="lib"/>
+ <param name="project-path" value="org/gagravarr"/>
+ <param name="artifact-name" value="vorbis-java-tika"/>
+ <param name="artifact-version" value="0.1"/>
+ <param name="artifact-type" value="jar"/>
+ </antcall>
+ <antcall target="download-via-maven"><param name="target" value="lib"/>
+ <param name="project-path" value="org/apache/poi"/>
+ <param name="artifact-name" value="poi-scratchpad"/>
+ <param name="artifact-version" value="3.10-beta2"/>
+ <param name="artifact-type" value="jar"/>
+ </antcall>
+ <antcall target="download-via-maven"><param name="target" value="lib"/>
+ <param name="project-path" value="com/drewnoakes"/>
+ <param name="artifact-name" value="metadata-extractor"/>
+ <param name="artifact-version" value="2.6.2"/>
+ <param name="artifact-type" value="jar"/>
+ </antcall>
+ <antcall target="download-via-maven"><param name="target" value="lib"/>
+ <param name="project-path" value="org/aspectj"/>
+ <param name="artifact-name" value="aspectjrt"/>
+ <param name="artifact-version" value="1.6.11"/>
+ <param name="artifact-type" value="jar"/>
+ </antcall>
+ <antcall target="download-via-maven"><param name="target" value="lib"/>
+ <param name="project-path" value="com/uwyn"/>
+ <param name="artifact-name" value="jhighlight"/>
+ <param name="artifact-version" value="1.0"/>
+ <param name="artifact-type" value="jar"/>
+ </antcall>
+ <antcall target="download-via-maven"><param name="target" value="lib"/>
+ <param name="project-path" value="org/apache/xmlbeans"/>
+ <param name="artifact-name" value="xmlbeans"/>
+ <param name="artifact-version" value="2.3.0"/>
+ <param name="artifact-type" value="jar"/>
+ </antcall>
+ <antcall target="download-via-maven"><param name="target" value="lib"/>
+ <param name="project-path" value="org/bouncycastle"/>
+ <param name="artifact-name" value="bcprov-jdk15"/>
+ <param name="artifact-version" value="1.45"/>
+ <param name="artifact-type" value="jar"/>
+ </antcall>
+ <antcall target="download-via-maven"><param name="target" value="lib"/>
+ <param name="project-path" value="org/tukaani"/>
+ <param name="artifact-name" value="xz"/>
+ <param name="artifact-version" value="1.2"/>
+ <param name="artifact-type" value="jar"/>
+ </antcall>
+ <antcall target="download-via-maven"><param name="target" value="lib"/>
+ <param name="project-path" value="org/apache/pdfbox"/>
+ <param name="artifact-name" value="jempbox"/>
+ <param name="artifact-version" value="1.8.4"/>
+ <param name="artifact-type" value="jar"/>
+ </antcall>
+ <antcall target="download-via-maven"><param name="target" value="lib"/>
+ <param name="project-path" value="org/apache/pdfbox"/>
+ <param name="artifact-name" value="pdfbox"/>
+ <param name="artifact-version" value="1.8.4"/>
+ <param name="artifact-type" value="jar"/>
+ </antcall>
+ <antcall target="download-via-maven"><param name="target" value="lib"/>
+ <param name="project-path" value="org/apache/pdfbox"/>
+ <param name="artifact-name" value="fontbox"/>
+ <param name="artifact-version" value="1.8.4"/>
+ <param name="artifact-type" value="jar"/>
+ </antcall>
+ <antcall target="download-via-maven"><param name="target" value="lib"/>
+ <param name="project-path" value="org/apache/commons"/>
+ <param name="artifact-name" value="commons-compress"/>
+ <param name="artifact-version" value="1.5"/>
+ <param name="artifact-type" value="jar"/>
+ </antcall>
+ <antcall target="download-via-maven"><param name="target" value="lib"/>
+ <param name="project-path" value="com/googlecode/juniversalchardet"/>
+ <param name="artifact-name" value="juniversalchardet"/>
+ <param name="artifact-version" value="1.0.3"/>
+ <param name="artifact-type" value="jar"/>
+ </antcall>
+ <antcall target="download-via-maven"><param name="target" value="lib"/>
+ <param name="project-path" value="edu/ucar"/>
+ <param name="artifact-name" value="netcdf"/>
+ <param name="artifact-version" value="4.2-min"/>
+ <param name="artifact-type" value="jar"/>
+ </antcall>
+ <antcall target="download-via-maven"><param name="target" value="lib"/>
+ <param name="project-path" value="de/l3s/boilerpipe"/>
+ <param name="artifact-name" value="boilerpipe"/>
+ <param name="artifact-version" value="1.1.0"/>
+ <param name="artifact-type" value="jar"/>
+ </antcall>
+ <antcall target="download-via-maven"><param name="target" value="lib"/>
+ <param name="project-path" value="dom4j"/>
+ <param name="artifact-name" value="dom4j"/>
+ <param name="artifact-version" value="1.6.1"/>
+ <param name="artifact-type" value="jar"/>
+ </antcall>
+ <antcall target="download-via-maven"><param name="target" value="lib"/>
+ <param name="project-path" value="org/ow2/asm"/>
+ <param name="artifact-name" value="asm-debug-all"/>
+ <param name="artifact-version" value="4.1"/>
+ <param name="artifact-type" value="jar"/>
+ </antcall>
+ <antcall target="download-via-maven"><param name="target" value="lib"/>
+ <param name="project-path" value="com/adobe/xmp"/>
+ <param name="artifact-name" value="xmpcore"/>
+ <param name="artifact-version" value="5.1.2"/>
+ <param name="artifact-type" value="jar"/>
+ </antcall>
+ <antcall target="download-via-maven"><param name="target" value="lib"/>
+ <param name="project-path" value="org/gagravarr"/>
+ <param name="artifact-name" value="vorbis-java-core"/>
+ <param name="artifact-version" value="0.1"/>
+ <param name="artifact-type" value="jar"/>
+ </antcall>
+ <antcall target="download-via-maven"><param name="target" value="lib"/>
+ <param name="project-path" value="org/apache/poi"/>
+ <param name="artifact-name" value="poi-ooxml"/>
+ <param name="artifact-version" value="3.10-beta2"/>
+ <param name="artifact-type" value="jar"/>
+ </antcall>
+ <antcall target="download-via-maven"><param name="target" value="lib"/>
+ <param name="project-path" value="org/apache/poi"/>
+ <param name="artifact-name" value="poi-ooxml-schemas"/>
+ <param name="artifact-version" value="3.10-beta2"/>
+ <param name="artifact-type" value="jar"/>
+ </antcall>
+ <antcall target="download-via-maven"><param name="target" value="lib"/>
+ <param name="project-path" value="org/bouncycastle"/>
+ <param name="artifact-name" value="bcmail-jdk15"/>
+ <param name="artifact-version" value="1.45"/>
+ <param name="artifact-type" value="jar"/>
+ </antcall>
+ <antcall target="download-via-maven"><param name="target" value="lib"/>
+ <param name="project-path" value="jdom"/>
+ <param name="artifact-name" value="jdom"/>
+ <param name="artifact-version" value="1.0"/>
+ <param name="artifact-type" value="jar"/>
+ </antcall>
+ <antcall target="download-via-maven"><param name="target" value="lib"/>
+ <param name="project-path" value="org/apache/geronimo/specs"/>
+ <param name="artifact-name" value="geronimo-stax-api_1.0_spec"/>
+ <param name="artifact-version" value="1.0.1"/>
+ <param name="artifact-type" value="jar"/>
+ </antcall>
+ <antcall target="download-via-maven"><param name="target" value="lib"/>
+ <param name="project-path" value="rome"/>
+ <param name="artifact-name" value="rome"/>
+ <param name="artifact-version" value="0.9"/>
+ <param name="artifact-type" value="jar"/>
+ </antcall>
+ <antcall target="download-via-maven"><param name="target" value="lib"/>
+ <param name="project-path" value="com/googlecode/mp4parser"/>
+ <param name="artifact-name" value="isoparser"/>
+ <param name="artifact-version" value="1.0-RC-1"/>
+ <param name="artifact-type" value="jar"/>
+ </antcall>
</target>
<target name="download-jackson">
Modified: manifoldcf/branches/CONNECTORS-916/connectors/amazoncloudsearch/build.xml
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-916/connectors/amazoncloudsearch/build.xml?rev=1596180&r1=1596179&r2=1596180&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-916/connectors/amazoncloudsearch/build.xml (original)
+++ manifoldcf/branches/CONNECTORS-916/connectors/amazoncloudsearch/build.xml Tue May 20 10:18:32 2014
@@ -41,6 +41,38 @@
<include name="tika-parsers*.jar"/>
<include name="tagsoup*.jar"/>
<include name="poi*.jar"/>
+ <include name="vorbis-java-tika*.jar"/>
+ <include name="vorbis-java-core*.jar"/>
+ <include name="netcdf*.jar"/>
+ <include name="apache-mime4j-core*.jar"/>
+ <include name="apache-mime4j-dom*.jar"/>
+ <include name="commons-compress*.jar"/>
+ <include name="commons-codec*.jar"/>
+ <include name="pdfbox*.jar"/>
+ <include name="fontbox*.jar"/>
+ <include name="jempbox*.jar"/>
+ <include name="commons-logging*.jar"/>
+ <include name="bcmail-jdk15*.jar"/>
+ <include name="bcprov-jdk15*.jar"/>
+ <include name="poi-scratchpad*.jar"/>
+ <include name="poi-ooxml*.jar"/>
+ <include name="poi-ooxml-schemas*.jar"/>
+ <include name="xmlbeans*.jar"/>
+ <include name="dom4j*.jar"/>
+ <include name="geronimo-stax-api_1.0_spec*.jar"/>
+ <include name="asm-debug-all*.jar"/>
+ <include name="isoparser*.jar"/>
+ <include name="aspectjrt*.jar"/>
+ <include name="metadata-extractor*.jar"/>
+ <include name="xmpcore*.jar"/>
+ <include name="xml-apis*.jar"/>
+ <include name="boilerpipe*.jar"/>
+ <include name="rome*.jar"/>
+ <include name="jdom*.jar"/>
+ <include name="xercesImpl*.jar"/>
+ <include name="vorbis-java-core*.jar"/>
+ <include name="juniversalchardet*.jar"/>
+ <include name="jhighlight*.jar"/>
</fileset>
</path>
@@ -56,6 +88,38 @@
<include name="tika-parsers*.jar"/>
<include name="tagsoup*.jar"/>
<include name="poi*.jar"/>
+ <include name="vorbis-java-tika*.jar"/>
+ <include name="vorbis-java-core*.jar"/>
+ <include name="netcdf*.jar"/>
+ <include name="apache-mime4j-core*.jar"/>
+ <include name="apache-mime4j-dom*.jar"/>
+ <include name="commons-compress*.jar"/>
+ <include name="commons-codec*.jar"/>
+ <include name="pdfbox*.jar"/>
+ <include name="fontbox*.jar"/>
+ <include name="jempbox*.jar"/>
+ <include name="commons-logging*.jar"/>
+ <include name="bcmail-jdk15*.jar"/>
+ <include name="bcprov-jdk15*.jar"/>
+ <include name="poi-scratchpad*.jar"/>
+ <include name="poi-ooxml*.jar"/>
+ <include name="poi-ooxml-schemas*.jar"/>
+ <include name="xmlbeans*.jar"/>
+ <include name="dom4j*.jar"/>
+ <include name="geronimo-stax-api_1.0_spec*.jar"/>
+ <include name="asm-debug-all*.jar"/>
+ <include name="isoparser*.jar"/>
+ <include name="aspectjrt*.jar"/>
+ <include name="metadata-extractor*.jar"/>
+ <include name="xmpcore*.jar"/>
+ <include name="xml-apis*.jar"/>
+ <include name="boilerpipe*.jar"/>
+ <include name="rome*.jar"/>
+ <include name="jdom*.jar"/>
+ <include name="xercesImpl*.jar"/>
+ <include name="vorbis-java-core*.jar"/>
+ <include name="juniversalchardet*.jar"/>
+ <include name="jhighlight*.jar"/>
</fileset>
</copy>
</target>
Modified: manifoldcf/branches/CONNECTORS-916/connectors/amazoncloudsearch/connector/src/main/java/org/apache/manifoldcf/agents/output/amazoncloudsearch/AmazonCloudSearchConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-916/connectors/amazoncloudsearch/connector/src/main/java/org/apache/manifoldcf/agents/output/amazoncloudsearch/AmazonCloudSearchConnector.java?rev=1596180&r1=1596179&r2=1596180&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-916/connectors/amazoncloudsearch/connector/src/main/java/org/apache/manifoldcf/agents/output/amazoncloudsearch/AmazonCloudSearchConnector.java (original)
+++ manifoldcf/branches/CONNECTORS-916/connectors/amazoncloudsearch/connector/src/main/java/org/apache/manifoldcf/agents/output/amazoncloudsearch/AmazonCloudSearchConnector.java Tue May 20 10:18:32 2014
@@ -59,14 +59,15 @@ import org.apache.manifoldcf.core.interf
import org.apache.manifoldcf.core.interfaces.IPostParameters;
import org.apache.manifoldcf.core.interfaces.IPasswordMapperActivity;
import org.apache.manifoldcf.core.interfaces.SpecificationNode;
-import org.apache.manifoldcf.crawler.system.Logging;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.parser.html.HtmlParser;
-import org.apache.tika.sax.BodyContentHandler;
-import org.xml.sax.ContentHandler;
+import org.apache.manifoldcf.core.system.ManifoldCF;
+import org.apache.manifoldcf.crawler.system.Logging;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import com.fasterxml.jackson.core.JsonFactory;
@@ -355,7 +356,7 @@ public class AmazonCloudSearchConnector
SDFModel model = new SDFModel();
Document doc = model.new Document();
doc.setType("add");
- doc.setId(documentURI);
+ doc.setId(ManifoldCF.hash(documentURI));
HashMap fields = new HashMap();
Metadata metadata = extractBinaryFile(document, fields);
@@ -440,7 +441,7 @@ public class AmazonCloudSearchConnector
//extract body text and metadata fields from binary file.
InputStream is = document.getBinaryStream();
- Parser parser = new HtmlParser(); //TODO
+ Parser parser = new AutoDetectParser();
ContentHandler handler = new BodyContentHandler();
Metadata metadata = new Metadata();
parser.parse(is, handler, metadata, new ParseContext());
Modified: manifoldcf/branches/CONNECTORS-916/connectors/amazoncloudsearch/pom.xml
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-916/connectors/amazoncloudsearch/pom.xml?rev=1596180&r1=1596179&r2=1596180&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-916/connectors/amazoncloudsearch/pom.xml (original)
+++ manifoldcf/branches/CONNECTORS-916/connectors/amazoncloudsearch/pom.xml Tue May 20 10:18:32 2014
@@ -227,14 +227,8 @@
<groupId>org.apache.tika</groupId>
<artifactId>tika-parsers</artifactId>
<version>1.5</version>
- </dependency>
- <dependency>
- <groupId>org.ccil.cowan.tagsoup</groupId>
- <artifactId>tagsoup</artifactId>
- <version>1.2.1</version>
</dependency>
-
<!-- Testing dependencies -->
<dependency>