You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2014/06/21 15:36:56 UTC

svn commit: r1604371 [2/2] - in /manifoldcf/trunk/connectors: ./ amazoncloudsearch/connector/src/main/java/org/apache/manifoldcf/agents/output/amazoncloudsearch/ amazoncloudsearch/connector/src/main/native2ascii/org/apache/manifoldcf/agents/output/amaz...

Added: manifoldcf/trunk/connectors/documentfilter/pom.xml
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/documentfilter/pom.xml?rev=1604371&view=auto
==============================================================================
--- manifoldcf/trunk/connectors/documentfilter/pom.xml (added)
+++ manifoldcf/trunk/connectors/documentfilter/pom.xml Sat Jun 21 13:36:55 2014
@@ -0,0 +1,361 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <parent>
+    <groupId>org.apache.manifoldcf</groupId>
+    <artifactId>mcf-connectors</artifactId>
+    <version>1.7-SNAPSHOT</version>
+  </parent>
+  <modelVersion>4.0.0</modelVersion>
+
+  <properties>
+    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
+  </properties>
+
+  <artifactId>mcf-documentfilter-connector</artifactId>
+  <name>ManifoldCF - Connectors - Document Filter</name>
+
+  <build>
+    <defaultGoal>integration-test</defaultGoal>
+    <sourceDirectory>${basedir}/connector/src/main/java</sourceDirectory>
+    <testSourceDirectory>${basedir}/connector/src/test/java</testSourceDirectory>
+    <resources>
+      <resource>
+        <directory>${basedir}/connector/src/main/native2ascii</directory>
+        <includes>
+          <include>**/*.properties</include>
+        </includes>
+      </resource>
+      <resource>
+        <directory>${basedir}/connector/src/main/resources</directory>
+        <includes>
+          <include>**/*.html</include>
+          <include>**/*.js</include>
+        </includes>
+      </resource>
+    </resources> 
+    <testResources>
+      <testResource>
+        <directory>${basedir}/connector/src/test/resources</directory>
+      </testResource>
+    </testResources>
+
+    <plugins>
+
+      <plugin>
+        <groupId>org.codehaus.mojo</groupId>
+        <artifactId>native2ascii-maven-plugin</artifactId>
+        <version>1.0-beta-1</version>
+        <configuration>
+            <workDir>target/classes</workDir>
+        </configuration>
+        <executions>
+            <execution>
+                <id>native2ascii-utf8</id>
+                <goals>
+                    <goal>native2ascii</goal>
+                </goals>
+                <configuration>
+                    <encoding>UTF8</encoding>
+                    <includes>
+                      <include>**/*.properties</include>
+                    </includes>
+                </configuration>
+            </execution>
+        </executions>
+      </plugin>
+
+      <!-- Test plugin configuration -->
+      <plugin>
+        <artifactId>maven-dependency-plugin</artifactId>
+        <executions>
+           <execution>
+            <id>copy-war</id>
+            <phase>generate-resources</phase>
+            <goals>
+              <goal>copy</goal>
+            </goals>
+            <configuration>
+              <outputDirectory>target/dependency</outputDirectory>
+              <artifactItems>
+                <artifactItem>
+                  <groupId>${project.groupId}</groupId>
+                  <artifactId>mcf-api-service</artifactId>
+                  <version>${project.version}</version>
+                  <type>war</type>
+                  <overWrite>false</overWrite>
+                  <destFileName>mcf-api-service.war</destFileName>
+                </artifactItem>
+                <artifactItem>
+                  <groupId>${project.groupId}</groupId>
+                  <artifactId>mcf-authority-service</artifactId>
+                  <version>${project.version}</version>
+                  <type>war</type>
+                  <overWrite>false</overWrite>
+                  <destFileName>mcf-authority-service.war</destFileName>
+                </artifactItem>
+                <artifactItem>
+                  <groupId>${project.groupId}</groupId>
+                  <artifactId>mcf-crawler-ui</artifactId>
+                  <version>${project.version}</version>
+                  <type>war</type>
+                  <overWrite>false</overWrite>
+                  <destFileName>mcf-crawler-ui.war</destFileName>
+                </artifactItem>
+              </artifactItems>
+            </configuration>
+          </execution>
+        </executions>
+      </plugin>
+
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-surefire-plugin</artifactId>
+        <configuration>
+          <excludes>
+            <exclude>**/*Postgresql*.java</exclude>
+            <exclude>**/*MySQL*.java</exclude>
+          </excludes>
+          <forkMode>always</forkMode>
+          <workingDirectory>target/test-output</workingDirectory>
+        </configuration>
+      </plugin>
+
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-failsafe-plugin</artifactId>
+        <version>2.12.3</version>
+        <configuration>
+          <skipTests>${skipITs}</skipTests>
+          <systemPropertyVariables>
+            <crawlerWarPath>../dependency/mcf-crawler-ui.war</crawlerWarPath>
+            <authorityserviceWarPath>../dependency/mcf-authority-service.war</authorityserviceWarPath>
+            <apiWarPath>../dependency/mcf-api-service.war</apiWarPath>
+          </systemPropertyVariables>
+          <excludes>
+            <exclude>**/*Postgresql*.java</exclude>
+            <exclude>**/*MySQL*.java</exclude>
+          </excludes>
+          <forkMode>always</forkMode>
+          <workingDirectory>target/test-output</workingDirectory>
+        </configuration>
+        <executions>
+          <execution>
+            <id>integration-test</id>
+            <goals>
+              <goal>integration-test</goal>
+            </goals>
+          </execution>
+          <execution>
+            <id>verify</id>
+            <goals>
+              <goal>verify</goal>
+            </goals>
+          </execution>
+        </executions>
+      </plugin>
+
+    </plugins>
+  </build>
+  
+  <dependencies>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>mcf-core</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>mcf-agents</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>mcf-ui-core</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    
+    <!-- Testing dependencies -->
+    
+    <dependency>
+      <groupId>junit</groupId>
+      <artifactId>junit</artifactId>
+      <version>${junit.version}</version>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>mcf-core</artifactId>
+      <version>${project.version}</version>
+      <type>test-jar</type>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>mcf-agents</artifactId>
+      <version>${project.version}</version>
+      <type>test-jar</type>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>mcf-pull-agent</artifactId>
+      <version>${project.version}</version>
+      <type>jar</type>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>mcf-pull-agent</artifactId>
+      <version>${project.version}</version>
+      <type>test-jar</type>
+      <scope>test</scope>
+    </dependency>
+
+    <dependency>
+      <groupId>postgresql</groupId>
+      <artifactId>postgresql</artifactId>
+      <version>${postgresql.version}</version>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.hsqldb</groupId>
+      <artifactId>hsqldb</artifactId>
+      <version>${hsqldb.version}</version>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.derby</groupId>
+      <artifactId>derby</artifactId>
+      <version>${derby.version}</version>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>mysql</groupId>
+      <artifactId>mysql-connector-java</artifactId>
+      <version>${mysql.version}</version>
+      <scope>test</scope>
+    </dependency>
+
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>mcf-api-service</artifactId>
+      <version>${project.version}</version>
+      <type>war</type>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>mcf-authority-service</artifactId>
+      <version>${project.version}</version>
+      <type>war</type>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>mcf-crawler-ui</artifactId>
+      <version>${project.version}</version>
+      <type>war</type>
+      <scope>test</scope>
+    </dependency>
+
+    <dependency>
+      <groupId>org.eclipse.jetty</groupId>
+      <artifactId>jetty-server</artifactId>
+      <version>${jetty.version}</version>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.eclipse.jetty</groupId>
+      <artifactId>jetty-util</artifactId>
+      <version>${jetty.version}</version>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.eclipse.jetty</groupId>
+      <artifactId>jetty-webapp</artifactId>
+      <version>${jetty.version}</version>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.eclipse.jetty</groupId>
+      <artifactId>jetty-servlet</artifactId>
+      <version>${jetty.version}</version>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.eclipse.jetty</groupId>
+      <artifactId>jetty-http</artifactId>
+      <version>${jetty.version}</version>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.eclipse.jetty</groupId>
+      <artifactId>jetty-io</artifactId>
+      <version>${jetty.version}</version>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.eclipse.jetty</groupId>
+      <artifactId>jetty-security</artifactId>
+      <version>${jetty.version}</version>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.eclipse.jetty</groupId>
+      <artifactId>jetty-continuation</artifactId>
+      <version>${jetty.version}</version>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.eclipse.jetty</groupId>
+      <artifactId>jetty-xml</artifactId>
+      <version>${jetty.version}</version>
+      <scope>test</scope>
+    </dependency>
+
+    <dependency>
+      <groupId>org.mortbay.jetty</groupId>
+      <artifactId>jsp-api-2.1-glassfish</artifactId>
+      <version>${glassfish.version}</version>
+      <scope>test</scope>
+    </dependency>    
+    <dependency>
+      <groupId>org.mortbay.jetty</groupId>
+      <artifactId>jsp-2.1-glassfish</artifactId>
+      <version>${glassfish.version}</version>
+      <scope>test</scope>
+    </dependency>
+    
+    <dependency>
+      <groupId>org.slf4j</groupId>
+      <artifactId>slf4j-api</artifactId>
+      <version>${slf4j.version}</version>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.slf4j</groupId>
+      <artifactId>slf4j-simple</artifactId>
+      <version>${slf4j.version}</version>
+      <scope>test</scope>
+    </dependency>
+
+  </dependencies>
+</project>

Propchange: manifoldcf/trunk/connectors/documentfilter/pom.xml
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: manifoldcf/trunk/connectors/documentfilter/pom.xml
------------------------------------------------------------------------------
    svn:keywords = Id

Modified: manifoldcf/trunk/connectors/pom.xml
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/pom.xml?rev=1604371&r1=1604370&r2=1604371&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/pom.xml (original)
+++ manifoldcf/trunk/connectors/pom.xml Sat Jun 21 13:36:55 2014
@@ -62,6 +62,7 @@
     <module>amazoncloudsearch</module>
     <module>forcedmetadata</module>
     <module>tika</module>
+    <module>documentfilter</module>
   </modules>
 
 </project>

Modified: manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformer/tika/TikaExtractor.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformer/tika/TikaExtractor.java?rev=1604371&r1=1604370&r2=1604371&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformer/tika/TikaExtractor.java (original)
+++ manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformer/tika/TikaExtractor.java Sat Jun 21 13:36:55 2014
@@ -49,8 +49,8 @@ public class TikaExtractor extends org.a
 
   protected static final String[] activitiesList = new String[]{ACTIVITY_EXTRACT};
   
-  /** We handle up to a megabyte in memory; after that we go to disk. */
-  protected static final long inMemoryMaximumFile = 1000000;
+  /** We handle up to 64K in memory; after that we go to disk. */
+  protected static final long inMemoryMaximumFile = 65536;
   
   /** Return a list of activities that this connector generates.
   * The connector does NOT need to be connected before this method is called.
@@ -81,6 +81,56 @@ public class TikaExtractor extends org.a
     return sp.toPackedString();
   }
 
+  // We intercept checks pertaining to the document format and send modified checks further down
+  
+  /** Detect if a mime type is acceptable or not.  This method is used to determine whether it makes sense to fetch a document
+  * in the first place.
+  *@param pipelineDescription is the document's pipeline version string, for this connection.
+  *@param mimeType is the mime type of the document.
+  *@param checkActivity is an object including the activities that can be performed by this method.
+  *@return true if the mime type can be accepted by this connector.
+  */
+  public boolean checkMimeTypeIndexable(String pipelineDescription, String mimeType, IOutputCheckActivity checkActivity)
+    throws ManifoldCFException, ServiceInterruption
+  {
+    // We should see what Tika will transform
+    // MHL
+    // Do a downstream check
+    return checkActivity.checkMimeTypeIndexable("text/plain;charset=utf-8");
+  }
+
+  /** Pre-determine whether a document (passed here as a File object) is acceptable or not.  This method is
+  * used to determine whether a document needs to be actually transferred.  This hook is provided mainly to support
+  * search engines that only handle a small set of accepted file types.
+  *@param pipelineDescription is the document's pipeline version string, for this connection.
+  *@param localFile is the local file to check.
+  *@param checkActivity is an object including the activities that can be done by this method.
+  *@return true if the file is acceptable, false if not.
+  */
+  @Override
+  public boolean checkDocumentIndexable(String pipelineDescription, File localFile, IOutputCheckActivity checkActivity)
+    throws ManifoldCFException, ServiceInterruption
+  {
+    // Document contents are not germane anymore, unless it looks like Tika won't accept them.
+    // Not sure how to check that...
+    return true;
+  }
+
+  /** Pre-determine whether a document's length is acceptable.  This method is used
+  * to determine whether to fetch a document in the first place.
+  *@param pipelineDescription is the document's pipeline version string, for this connection.
+  *@param length is the length of the document.
+  *@param checkActivity is an object including the activities that can be done by this method.
+  *@return true if the file is acceptable, false if not.
+  */
+  @Override
+  public boolean checkLengthIndexable(String pipelineDescription, long length, IOutputCheckActivity checkActivity)
+    throws ManifoldCFException, ServiceInterruption
+  {
+    // Always true
+    return true;
+  }
+
   /** Add (or replace) a document in the output data store using the connector.
   * This method presumes that the connector object has been configured, and it is thus able to communicate with the output data store should that be
   * necessary.
@@ -101,6 +151,10 @@ public class TikaExtractor extends org.a
   public int addOrReplaceDocumentWithException(String documentURI, String pipelineDescription, RepositoryDocument document, String authorityNameString, IOutputAddActivity activities)
     throws ManifoldCFException, ServiceInterruption, IOException
   {
+    // First, make sure downstream pipeline will now accept text/plain;charset=utf-8
+    if (!activities.checkMimeTypeIndexable("text/plain;charset=utf-8"))
+      return DOCUMENTSTATUS_REJECTED;
+
     SpecPacker sp = new SpecPacker(pipelineDescription);
 
     // Tika's API reads from an input stream and writes to an output Writer.
@@ -188,6 +242,10 @@ public class TikaExtractor extends org.a
         activities.recordActivity(new Long(startTime), ACTIVITY_EXTRACT, length, documentURI,
           resultCode, description);
       }
+      
+      // Check to be sure downstream pipeline will accept document of specified length
+      if (!activities.checkLengthIndexable(ds.getBinaryLength()))
+        return DOCUMENTSTATUS_REJECTED;
         
       // Parsing complete!
       // Create a copy of Repository Document