You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2014/10/28 12:38:06 UTC
svn commit: r1634850 - in /manifoldcf/trunk: ./ connectors/tika/
connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/
Author: kwright
Date: Tue Oct 28 11:38:06 2014
New Revision: 1634850
URL: http://svn.apache.org/r1634850
Log:
First part of CONNECTORS-1088: implement the functionality
Modified:
manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaConfig.java
manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaExtractor.java
manifoldcf/trunk/connectors/tika/pom.xml
manifoldcf/trunk/pom.xml
Modified: manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaConfig.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaConfig.java?rev=1634850&r1=1634849&r2=1634850&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaConfig.java (original)
+++ manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaConfig.java Tue Oct 28 11:38:06 2014
@@ -30,6 +30,7 @@ public class TikaConfig {
public static final String NODE_FIELDMAP = "fieldmap";
public static final String NODE_KEEPMETADATA = "keepAllMetadata";
public static final String NODE_IGNORETIKAEXCEPTION = "ignoreException";
+ public static final String NODE_BOILERPLATEPROCESSOR = "boilerplateprocessor";
public static final String ATTRIBUTE_SOURCE = "source";
public static final String ATTRIBUTE_TARGET = "target";
public static final String ATTRIBUTE_VALUE = "value";
Modified: manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaExtractor.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaExtractor.java?rev=1634850&r1=1634849&r2=1634850&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaExtractor.java (original)
+++ manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaExtractor.java Tue Oct 28 11:38:06 2014
@@ -32,6 +32,8 @@ import org.apache.tika.parser.Parser;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.metadata.TikaMetadataKeys;
+import org.apache.tika.parser.html.BoilerpipeContentHandler;
+import de.l3s.boilerpipe.BoilerpipeExtractor;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -164,6 +166,8 @@ public class TikaExtractor extends org.a
SpecPacker sp = new SpecPacker(pipelineDescription.getSpecification());
+ BoilerpipeExtractor extractorClassInstance = sp.getExtractorClassInstance();
+
// Tika's API reads from an input stream and writes to an output Writer.
// Since a RepositoryDocument includes readers and inputstreams exclusively, AND all downstream
// processing needs to occur in a ManifoldCF thread, we have some constraints on the architecture we need to get this done:
@@ -217,6 +221,8 @@ public class TikaExtractor extends org.a
// Use tika to parse stuff
Parser parser = new AutoDetectParser();
ContentHandler handler = new BodyContentHandler(w);
+ if (extractorClassInstance != null)
+ handler = new BoilerpipeContentHandler(handler, extractorClassInstance);
ParseContext pc = new ParseContext();
try
{
@@ -752,10 +758,12 @@ public class TikaExtractor extends org.a
private final Map<String,String> sourceTargets = new HashMap<String,String>();
private final boolean keepAllMetadata;
private final boolean ignoreTikaException;
+ private final String extractorClassName;
public SpecPacker(Specification os) {
boolean keepAllMetadata = true;
boolean ignoreTikaException = true;
+ String extractorClassName = null;
for (int i = 0; i < os.getChildCount(); i++) {
SpecificationNode sn = os.getChild(i);
@@ -773,10 +781,13 @@ public class TikaExtractor extends org.a
} else if (sn.getType().equals(TikaConfig.NODE_IGNORETIKAEXCEPTION)) {
String value = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
ignoreTikaException = Boolean.parseBoolean(value);
+ } else if (sn.getType().equals(TikaConfig.NODE_BOILERPLATEPROCESSOR)) {
+ extractorClassName = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
}
}
this.keepAllMetadata = keepAllMetadata;
this.ignoreTikaException = ignoreTikaException;
+ this.extractorClassName = extractorClassName;
}
public String toPackedString() {
@@ -814,6 +825,14 @@ public class TikaExtractor extends org.a
else
sb.append('-');
+ if (extractorClassName != null)
+ {
+ sb.append('+');
+ sb.append(extractorClassName);
+ }
+ else
+ sb.append('-');
+
return sb.toString();
}
@@ -828,6 +847,24 @@ public class TikaExtractor extends org.a
public boolean ignoreTikaException() {
return ignoreTikaException;
}
+
+ /**
+  * Creates an instance of the Boilerpipe extractor class named by the
+  * "boilerplateprocessor" specification node.
+  *
+  * The class is looked up through the class loader that loaded
+  * {@link BoilerpipeExtractor} and is instantiated through its no-argument
+  * constructor.
+  *
+  * @return a new extractor instance, or null when no extractor class name was configured.
+  * @throws ManifoldCFException if the class cannot be found, is not a
+  *         BoilerpipeExtractor, or cannot be instantiated.
+  */
+ public BoilerpipeExtractor getExtractorClassInstance()
+   throws ManifoldCFException {
+   if (extractorClassName == null)
+     return null;
+   try {
+     ClassLoader loader = BoilerpipeExtractor.class.getClassLoader();
+     // Verify the type BEFORE instantiating, so that a misconfigured class name
+     // never gets its constructor run and the failure is reported precisely.
+     Class<? extends BoilerpipeExtractor> extractorClass =
+       loader.loadClass(extractorClassName).asSubclass(BoilerpipeExtractor.class);
+     return extractorClass.newInstance();
+   } catch (ClassNotFoundException e) {
+     throw new ManifoldCFException("Boilerpipe extractor class '"+extractorClassName+"' not found: "+e.getMessage(),e);
+   } catch (ClassCastException e) {
+     throw new ManifoldCFException("Boilerpipe extractor class '"+extractorClassName+"' is not a BoilerpipeExtractor: "+e.getMessage(),e);
+   } catch (InstantiationException e) {
+     throw new ManifoldCFException("Boilerpipe extractor class '"+extractorClassName+"' could not be instantiated: "+e.getMessage(),e);
+   } catch (Exception e) {
+     // Covers IllegalAccessException and anything thrown by the extractor's constructor.
+     throw new ManifoldCFException("Boilerpipe extractor class '"+extractorClassName+"' exception on instantiation: "+e.getMessage(),e);
+   }
+ }
+
}
}
Modified: manifoldcf/trunk/connectors/tika/pom.xml
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/tika/pom.xml?rev=1634850&r1=1634849&r2=1634850&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/tika/pom.xml (original)
+++ manifoldcf/trunk/connectors/tika/pom.xml Tue Oct 28 11:38:06 2014
@@ -212,14 +212,24 @@
<version>2.3.2</version>
</dependency>
<dependency>
- <groupId>com.fasterxml.jackson.core</groupId>
- <artifactId>jackson-annotations</artifactId>
- <version>2.3.0</version>
+ <groupId>com.fasterxml.jackson.core</groupId>
+ <artifactId>jackson-annotations</artifactId>
+ <version>2.3.0</version>
</dependency>
<dependency>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-parsers</artifactId>
- <version>${tika.version}</version>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>${tika.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parsers</artifactId>
+ <version>${tika.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>de.l3s.boilerpipe</groupId>
+ <artifactId>boilerpipe</artifactId>
+ <version>${boilerpipe.version}</version>
</dependency>
<!-- Testing dependencies -->
Modified: manifoldcf/trunk/pom.xml
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/pom.xml?rev=1634850&r1=1634849&r2=1634850&view=diff
==============================================================================
--- manifoldcf/trunk/pom.xml (original)
+++ manifoldcf/trunk/pom.xml Tue Oct 28 11:38:06 2014
@@ -81,6 +81,7 @@
<xmlbeans.version>2.6.0</xmlbeans.version>
<poi.version>3.11-beta2</poi.version>
<tika.version>1.6</tika.version>
+ <boilerpipe.version>1.1.0</boilerpipe.version>
</properties>
<modules>