You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2014/06/21 15:36:56 UTC
svn commit: r1604371 [2/2] - in /manifoldcf/trunk/connectors: ./
amazoncloudsearch/connector/src/main/java/org/apache/manifoldcf/agents/output/amazoncloudsearch/
amazoncloudsearch/connector/src/main/native2ascii/org/apache/manifoldcf/agents/output/amaz...
Added: manifoldcf/trunk/connectors/documentfilter/pom.xml
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/documentfilter/pom.xml?rev=1604371&view=auto
==============================================================================
--- manifoldcf/trunk/connectors/documentfilter/pom.xml (added)
+++ manifoldcf/trunk/connectors/documentfilter/pom.xml Sat Jun 21 13:36:55 2014
@@ -0,0 +1,361 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <parent>
+ <groupId>org.apache.manifoldcf</groupId>
+ <artifactId>mcf-connectors</artifactId>
+ <version>1.7-SNAPSHOT</version>
+ </parent>
+ <modelVersion>4.0.0</modelVersion>
+
+ <properties>
+ <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+ <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
+ </properties>
+
+ <artifactId>mcf-documentfilter-connector</artifactId>
+ <name>ManifoldCF - Connectors - Document Filter</name>
+
+ <build>
+ <defaultGoal>integration-test</defaultGoal>
+ <sourceDirectory>${basedir}/connector/src/main/java</sourceDirectory>
+ <testSourceDirectory>${basedir}/connector/src/test/java</testSourceDirectory>
+ <resources>
+ <resource>
+ <directory>${basedir}/connector/src/main/native2ascii</directory>
+ <includes>
+ <include>**/*.properties</include>
+ </includes>
+ </resource>
+ <resource>
+ <directory>${basedir}/connector/src/main/resources</directory>
+ <includes>
+ <include>**/*.html</include>
+ <include>**/*.js</include>
+ </includes>
+ </resource>
+ </resources>
+ <testResources>
+ <testResource>
+ <directory>${basedir}/connector/src/test/resources</directory>
+ </testResource>
+ </testResources>
+
+ <plugins>
+
+ <plugin>
+ <groupId>org.codehaus.mojo</groupId>
+ <artifactId>native2ascii-maven-plugin</artifactId>
+ <version>1.0-beta-1</version>
+ <configuration>
+ <workDir>target/classes</workDir>
+ </configuration>
+ <executions>
+ <execution>
+ <id>native2ascii-utf8</id>
+ <goals>
+ <goal>native2ascii</goal>
+ </goals>
+ <configuration>
+ <encoding>UTF8</encoding>
+ <includes>
+ <include>**/*.properties</include>
+ </includes>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
+
+ <!-- Test plugin configuration -->
+ <plugin>
+ <artifactId>maven-dependency-plugin</artifactId>
+ <executions>
+ <execution>
+ <id>copy-war</id>
+ <phase>generate-resources</phase>
+ <goals>
+ <goal>copy</goal>
+ </goals>
+ <configuration>
+ <outputDirectory>target/dependency</outputDirectory>
+ <artifactItems>
+ <artifactItem>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>mcf-api-service</artifactId>
+ <version>${project.version}</version>
+ <type>war</type>
+ <overWrite>false</overWrite>
+ <destFileName>mcf-api-service.war</destFileName>
+ </artifactItem>
+ <artifactItem>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>mcf-authority-service</artifactId>
+ <version>${project.version}</version>
+ <type>war</type>
+ <overWrite>false</overWrite>
+ <destFileName>mcf-authority-service.war</destFileName>
+ </artifactItem>
+ <artifactItem>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>mcf-crawler-ui</artifactId>
+ <version>${project.version}</version>
+ <type>war</type>
+ <overWrite>false</overWrite>
+ <destFileName>mcf-crawler-ui.war</destFileName>
+ </artifactItem>
+ </artifactItems>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
+
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-surefire-plugin</artifactId>
+ <configuration>
+ <excludes>
+ <exclude>**/*Postgresql*.java</exclude>
+ <exclude>**/*MySQL*.java</exclude>
+ </excludes>
+ <forkMode>always</forkMode>
+ <workingDirectory>target/test-output</workingDirectory>
+ </configuration>
+ </plugin>
+
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-failsafe-plugin</artifactId>
+ <version>2.12.3</version>
+ <configuration>
+ <skipTests>${skipITs}</skipTests>
+ <systemPropertyVariables>
+ <crawlerWarPath>../dependency/mcf-crawler-ui.war</crawlerWarPath>
+ <authorityserviceWarPath>../dependency/mcf-authority-service.war</authorityserviceWarPath>
+ <apiWarPath>../dependency/mcf-api-service.war</apiWarPath>
+ </systemPropertyVariables>
+ <excludes>
+ <exclude>**/*Postgresql*.java</exclude>
+ <exclude>**/*MySQL*.java</exclude>
+ </excludes>
+ <forkMode>always</forkMode>
+ <workingDirectory>target/test-output</workingDirectory>
+ </configuration>
+ <executions>
+ <execution>
+ <id>integration-test</id>
+ <goals>
+ <goal>integration-test</goal>
+ </goals>
+ </execution>
+ <execution>
+ <id>verify</id>
+ <goals>
+ <goal>verify</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
+
+ </plugins>
+ </build>
+
+ <dependencies>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>mcf-core</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>mcf-agents</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>mcf-ui-core</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+
+ <!-- Testing dependencies -->
+
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <version>${junit.version}</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>mcf-core</artifactId>
+ <version>${project.version}</version>
+ <type>test-jar</type>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>mcf-agents</artifactId>
+ <version>${project.version}</version>
+ <type>test-jar</type>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>mcf-pull-agent</artifactId>
+ <version>${project.version}</version>
+ <type>jar</type>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>mcf-pull-agent</artifactId>
+ <version>${project.version}</version>
+ <type>test-jar</type>
+ <scope>test</scope>
+ </dependency>
+
+ <dependency>
+ <groupId>postgresql</groupId>
+ <artifactId>postgresql</artifactId>
+ <version>${postgresql.version}</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.hsqldb</groupId>
+ <artifactId>hsqldb</artifactId>
+ <version>${hsqldb.version}</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.derby</groupId>
+ <artifactId>derby</artifactId>
+ <version>${derby.version}</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>mysql</groupId>
+ <artifactId>mysql-connector-java</artifactId>
+ <version>${mysql.version}</version>
+ <scope>test</scope>
+ </dependency>
+
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>mcf-api-service</artifactId>
+ <version>${project.version}</version>
+ <type>war</type>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>mcf-authority-service</artifactId>
+ <version>${project.version}</version>
+ <type>war</type>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>mcf-crawler-ui</artifactId>
+ <version>${project.version}</version>
+ <type>war</type>
+ <scope>test</scope>
+ </dependency>
+
+ <dependency>
+ <groupId>org.eclipse.jetty</groupId>
+ <artifactId>jetty-server</artifactId>
+ <version>${jetty.version}</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.eclipse.jetty</groupId>
+ <artifactId>jetty-util</artifactId>
+ <version>${jetty.version}</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.eclipse.jetty</groupId>
+ <artifactId>jetty-webapp</artifactId>
+ <version>${jetty.version}</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.eclipse.jetty</groupId>
+ <artifactId>jetty-servlet</artifactId>
+ <version>${jetty.version}</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.eclipse.jetty</groupId>
+ <artifactId>jetty-http</artifactId>
+ <version>${jetty.version}</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.eclipse.jetty</groupId>
+ <artifactId>jetty-io</artifactId>
+ <version>${jetty.version}</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.eclipse.jetty</groupId>
+ <artifactId>jetty-security</artifactId>
+ <version>${jetty.version}</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.eclipse.jetty</groupId>
+ <artifactId>jetty-continuation</artifactId>
+ <version>${jetty.version}</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.eclipse.jetty</groupId>
+ <artifactId>jetty-xml</artifactId>
+ <version>${jetty.version}</version>
+ <scope>test</scope>
+ </dependency>
+
+ <dependency>
+ <groupId>org.mortbay.jetty</groupId>
+ <artifactId>jsp-api-2.1-glassfish</artifactId>
+ <version>${glassfish.version}</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.mortbay.jetty</groupId>
+ <artifactId>jsp-2.1-glassfish</artifactId>
+ <version>${glassfish.version}</version>
+ <scope>test</scope>
+ </dependency>
+
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-api</artifactId>
+ <version>${slf4j.version}</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-simple</artifactId>
+ <version>${slf4j.version}</version>
+ <scope>test</scope>
+ </dependency>
+
+ </dependencies>
+</project>
Propchange: manifoldcf/trunk/connectors/documentfilter/pom.xml
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: manifoldcf/trunk/connectors/documentfilter/pom.xml
------------------------------------------------------------------------------
svn:keywords = Id
Modified: manifoldcf/trunk/connectors/pom.xml
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/pom.xml?rev=1604371&r1=1604370&r2=1604371&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/pom.xml (original)
+++ manifoldcf/trunk/connectors/pom.xml Sat Jun 21 13:36:55 2014
@@ -62,6 +62,7 @@
<module>amazoncloudsearch</module>
<module>forcedmetadata</module>
<module>tika</module>
+ <module>documentfilter</module>
</modules>
</project>
Modified: manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformer/tika/TikaExtractor.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformer/tika/TikaExtractor.java?rev=1604371&r1=1604370&r2=1604371&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformer/tika/TikaExtractor.java (original)
+++ manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformer/tika/TikaExtractor.java Sat Jun 21 13:36:55 2014
@@ -49,8 +49,8 @@ public class TikaExtractor extends org.a
protected static final String[] activitiesList = new String[]{ACTIVITY_EXTRACT};
- /** We handle up to a megabyte in memory; after that we go to disk. */
- protected static final long inMemoryMaximumFile = 1000000;
+ /** We handle up to 64K in memory; after that we go to disk. */
+ protected static final long inMemoryMaximumFile = 65536;
/** Return a list of activities that this connector generates.
* The connector does NOT need to be connected before this method is called.
@@ -81,6 +81,56 @@ public class TikaExtractor extends org.a
return sp.toPackedString();
}
+ // We intercept checks pertaining to the document format and send modified checks further down
+
+ /** Detect if a mime type is acceptable or not. This method is used to determine whether it makes sense to fetch a document
+ * in the first place.
+ *@param pipelineDescription is the document's pipeline version string, for this connection.
+ *@param mimeType is the mime type of the document.
+ *@param checkActivity is an object including the activities that can be performed by this method.
+ *@return true if the mime type can be accepted by this connector.
+ */
+ public boolean checkMimeTypeIndexable(String pipelineDescription, String mimeType, IOutputCheckActivity checkActivity)
+ throws ManifoldCFException, ServiceInterruption
+ {
+ // We should see what Tika will transform
+ // MHL
+ // Do a downstream check
+ return checkActivity.checkMimeTypeIndexable("text/plain;charset=utf-8");
+ }
+
+ /** Pre-determine whether a document (passed here as a File object) is acceptable or not. This method is
+ * used to determine whether a document needs to be actually transferred. This hook is provided mainly to support
+ * search engines that only handle a small set of accepted file types.
+ *@param pipelineDescription is the document's pipeline version string, for this connection.
+ *@param localFile is the local file to check.
+ *@param checkActivity is an object including the activities that can be done by this method.
+ *@return true if the file is acceptable, false if not.
+ */
+ @Override
+ public boolean checkDocumentIndexable(String pipelineDescription, File localFile, IOutputCheckActivity checkActivity)
+ throws ManifoldCFException, ServiceInterruption
+ {
+ // Document contents are not germane anymore, unless it looks like Tika won't accept them.
+ // Not sure how to check that...
+ return true;
+ }
+
+ /** Pre-determine whether a document's length is acceptable. This method is used
+ * to determine whether to fetch a document in the first place.
+ *@param pipelineDescription is the document's pipeline version string, for this connection.
+ *@param length is the length of the document.
+ *@param checkActivity is an object including the activities that can be done by this method.
+ *@return true if the file is acceptable, false if not.
+ */
+ @Override
+ public boolean checkLengthIndexable(String pipelineDescription, long length, IOutputCheckActivity checkActivity)
+ throws ManifoldCFException, ServiceInterruption
+ {
+ // Always true
+ return true;
+ }
+
/** Add (or replace) a document in the output data store using the connector.
* This method presumes that the connector object has been configured, and it is thus able to communicate with the output data store should that be
* necessary.
@@ -101,6 +151,10 @@ public class TikaExtractor extends org.a
public int addOrReplaceDocumentWithException(String documentURI, String pipelineDescription, RepositoryDocument document, String authorityNameString, IOutputAddActivity activities)
throws ManifoldCFException, ServiceInterruption, IOException
{
+ // First, make sure downstream pipeline will now accept text/plain;charset=utf-8
+ if (!activities.checkMimeTypeIndexable("text/plain;charset=utf-8"))
+ return DOCUMENTSTATUS_REJECTED;
+
SpecPacker sp = new SpecPacker(pipelineDescription);
// Tika's API reads from an input stream and writes to an output Writer.
@@ -188,6 +242,10 @@ public class TikaExtractor extends org.a
activities.recordActivity(new Long(startTime), ACTIVITY_EXTRACT, length, documentURI,
resultCode, description);
}
+
+ // Check to be sure downstream pipeline will accept document of specified length
+ if (!activities.checkLengthIndexable(ds.getBinaryLength()))
+ return DOCUMENTSTATUS_REJECTED;
// Parsing complete!
// Create a copy of Repository Document