You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2014/10/28 12:38:06 UTC

svn commit: r1634850 - in /manifoldcf/trunk: ./ connectors/tika/ connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/

Author: kwright
Date: Tue Oct 28 11:38:06 2014
New Revision: 1634850

URL: http://svn.apache.org/r1634850
Log:
First part of CONNECTORS-1088: implement the functionality

Modified:
    manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaConfig.java
    manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaExtractor.java
    manifoldcf/trunk/connectors/tika/pom.xml
    manifoldcf/trunk/pom.xml

Modified: manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaConfig.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaConfig.java?rev=1634850&r1=1634849&r2=1634850&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaConfig.java (original)
+++ manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaConfig.java Tue Oct 28 11:38:06 2014
@@ -30,6 +30,7 @@ public class TikaConfig {
   public static final String NODE_FIELDMAP = "fieldmap";
   public static final String NODE_KEEPMETADATA = "keepAllMetadata";
   public static final String NODE_IGNORETIKAEXCEPTION = "ignoreException";
+  public static final String NODE_BOILERPLATEPROCESSOR = "boilerplateprocessor";
   public static final String ATTRIBUTE_SOURCE = "source";
   public static final String ATTRIBUTE_TARGET = "target";
   public static final String ATTRIBUTE_VALUE = "value";

Modified: manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaExtractor.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaExtractor.java?rev=1634850&r1=1634849&r2=1634850&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaExtractor.java (original)
+++ manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaExtractor.java Tue Oct 28 11:38:06 2014
@@ -32,6 +32,8 @@ import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.sax.BodyContentHandler;
 import org.apache.tika.metadata.TikaMetadataKeys;
+import org.apache.tika.parser.html.BoilerpipeContentHandler;
+import de.l3s.boilerpipe.BoilerpipeExtractor;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
@@ -164,6 +166,8 @@ public class TikaExtractor extends org.a
 
     SpecPacker sp = new SpecPacker(pipelineDescription.getSpecification());
 
+    BoilerpipeExtractor extractorClassInstance = sp.getExtractorClassInstance();
+    
     // Tika's API reads from an input stream and writes to an output Writer.
     // Since a RepositoryDocument includes readers and inputstreams exclusively, AND all downstream
     // processing needs to occur in a ManifoldCF thread, we have some constraints on the architecture we need to get this done:
@@ -217,6 +221,8 @@ public class TikaExtractor extends org.a
             // Use tika to parse stuff
             Parser parser = new AutoDetectParser();
             ContentHandler handler = new BodyContentHandler(w);
+            if (extractorClassInstance != null)
+              handler = new BoilerpipeContentHandler(handler, extractorClassInstance);
             ParseContext pc = new ParseContext();
             try
             {
@@ -752,10 +758,12 @@ public class TikaExtractor extends org.a
     private final Map<String,String> sourceTargets = new HashMap<String,String>();
     private final boolean keepAllMetadata;
     private final boolean ignoreTikaException;
+    private final String extractorClassName;
     
     public SpecPacker(Specification os) {
       boolean keepAllMetadata = true;
       boolean ignoreTikaException = true;
+      String extractorClassName = null;
       for (int i = 0; i < os.getChildCount(); i++) {
         SpecificationNode sn = os.getChild(i);
         
@@ -773,10 +781,13 @@ public class TikaExtractor extends org.a
         } else if (sn.getType().equals(TikaConfig.NODE_IGNORETIKAEXCEPTION)) {
           String value = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
           ignoreTikaException = Boolean.parseBoolean(value);
+        } else if (sn.getType().equals(TikaConfig.NODE_BOILERPLATEPROCESSOR)) {
+          extractorClassName = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
         }
       }
       this.keepAllMetadata = keepAllMetadata;
       this.ignoreTikaException = ignoreTikaException;
+      this.extractorClassName = extractorClassName;
     }
     
     public String toPackedString() {
@@ -814,6 +825,14 @@ public class TikaExtractor extends org.a
       else
         sb.append('-');
 
+      if (extractorClassName != null)
+      {
+        sb.append('+');
+        sb.append(extractorClassName);
+      }
+      else
+        sb.append('-');
+      
       return sb.toString();
     }
     
@@ -828,6 +847,40 @@ public class TikaExtractor extends org.a
     public boolean ignoreTikaException() {
       return ignoreTikaException;
     }
+    
+    /** Instantiate the Boilerpipe extractor named in the specification, if any.
+    * The class is looked up through the class loader that loaded
+    * BoilerpipeExtractor and is created via its no-argument constructor.
+    *@return a new extractor instance, or null if no boilerplate processor
+    *  class name was specified.
+    *@throws ManifoldCFException if the class cannot be found, is not a
+    *  BoilerpipeExtractor, or cannot be instantiated.
+    */
+    public BoilerpipeExtractor getExtractorClassInstance()
+      throws ManifoldCFException {
+      if (extractorClassName == null)
+        return null;
+      try {
+        ClassLoader loader = BoilerpipeExtractor.class.getClassLoader();
+        // asSubclass() verifies the type up front, so a misconfigured class name
+        // yields a specific error instead of a cast failure after construction.
+        Class<? extends BoilerpipeExtractor> extractorClass =
+          loader.loadClass(extractorClassName).asSubclass(BoilerpipeExtractor.class);
+        return extractorClass.newInstance();
+      } catch (ClassNotFoundException e) {
+        throw new ManifoldCFException("Boilerpipe extractor class '"+extractorClassName+"' not found: "+e.getMessage(),e);
+      } catch (ClassCastException e) {
+        throw new ManifoldCFException("Class '"+extractorClassName+"' is not a Boilerpipe extractor: "+e.getMessage(),e);
+      } catch (InstantiationException e) {
+        throw new ManifoldCFException("Boilerpipe extractor class '"+extractorClassName+"' could not be instantiated: "+e.getMessage(),e);
+      } catch (IllegalAccessException e) {
+        throw new ManifoldCFException("Boilerpipe extractor class '"+extractorClassName+"' has no accessible no-argument constructor: "+e.getMessage(),e);
+      } catch (Exception e) {
+        // newInstance() propagates whatever the constructor itself throws.
+        throw new ManifoldCFException("Boilerpipe extractor class '"+extractorClassName+"' exception on instantiation: "+e.getMessage(),e);
+      }
+    }
+
   }
 
 }

Modified: manifoldcf/trunk/connectors/tika/pom.xml
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/tika/pom.xml?rev=1634850&r1=1634849&r2=1634850&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/tika/pom.xml (original)
+++ manifoldcf/trunk/connectors/tika/pom.xml Tue Oct 28 11:38:06 2014
@@ -212,14 +212,24 @@
       <version>2.3.2</version>
     </dependency>
     <dependency>
-	  <groupId>com.fasterxml.jackson.core</groupId>
-	  <artifactId>jackson-annotations</artifactId>
-	  <version>2.3.0</version>
+        <groupId>com.fasterxml.jackson.core</groupId>
+        <artifactId>jackson-annotations</artifactId>
+        <version>2.3.0</version>
     </dependency>
     <dependency>
-	  <groupId>org.apache.tika</groupId>
-	  <artifactId>tika-parsers</artifactId>
-	  <version>${tika.version}</version>
+        <groupId>org.apache.tika</groupId>
+        <artifactId>tika-core</artifactId>
+        <version>${tika.version}</version>
+    </dependency>
+    <dependency>
+        <groupId>org.apache.tika</groupId>
+        <artifactId>tika-parsers</artifactId>
+        <version>${tika.version}</version>
+    </dependency>
+    <dependency>
+        <groupId>de.l3s.boilerpipe</groupId>
+        <artifactId>boilerpipe</artifactId>
+        <version>${boilerpipe.version}</version>
     </dependency>
     
     <!-- Testing dependencies -->

Modified: manifoldcf/trunk/pom.xml
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/pom.xml?rev=1634850&r1=1634849&r2=1634850&view=diff
==============================================================================
--- manifoldcf/trunk/pom.xml (original)
+++ manifoldcf/trunk/pom.xml Tue Oct 28 11:38:06 2014
@@ -81,6 +81,7 @@
     <xmlbeans.version>2.6.0</xmlbeans.version>
     <poi.version>3.11-beta2</poi.version>
     <tika.version>1.6</tika.version>
+    <boilerpipe.version>1.1.0</boilerpipe.version>
   </properties>
 
   <modules>