You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2016/01/27 09:56:36 UTC
svn commit: r1726966 -
/manifoldcf/branches/CONNECTORS-1270/connectors/opennlp/connector/src/main/java/org/apache/manifoldcf/agents/transformation/opennlp/OpenNlpExtractor.java
Author: kwright
Date: Wed Jan 27 08:56:36 2016
New Revision: 1726966
URL: http://svn.apache.org/viewvc?rev=1726966&view=rev
Log:
Add mime type check
Modified:
manifoldcf/branches/CONNECTORS-1270/connectors/opennlp/connector/src/main/java/org/apache/manifoldcf/agents/transformation/opennlp/OpenNlpExtractor.java
Modified: manifoldcf/branches/CONNECTORS-1270/connectors/opennlp/connector/src/main/java/org/apache/manifoldcf/agents/transformation/opennlp/OpenNlpExtractor.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1270/connectors/opennlp/connector/src/main/java/org/apache/manifoldcf/agents/transformation/opennlp/OpenNlpExtractor.java?rev=1726966&r1=1726965&r2=1726966&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-1270/connectors/opennlp/connector/src/main/java/org/apache/manifoldcf/agents/transformation/opennlp/OpenNlpExtractor.java (original)
+++ manifoldcf/branches/CONNECTORS-1270/connectors/opennlp/connector/src/main/java/org/apache/manifoldcf/agents/transformation/opennlp/OpenNlpExtractor.java Wed Jan 27 08:56:36 2016
@@ -26,6 +26,8 @@ import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
+import java.util.Set;
+import java.util.HashSet;
import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.sentdetect.SentenceDetector;
@@ -34,6 +36,7 @@ import opennlp.tools.util.Span;
import org.apache.commons.io.IOUtils;
import org.apache.manifoldcf.agents.interfaces.IOutputAddActivity;
+import org.apache.manifoldcf.agents.interfaces.IOutputCheckActivity;
import org.apache.manifoldcf.agents.interfaces.RepositoryDocument;
import org.apache.manifoldcf.agents.interfaces.ServiceInterruption;
import org.apache.manifoldcf.agents.system.Logging;
@@ -260,6 +263,33 @@ public class OpenNlpExtractor extends Ba
}
}
+ private final static Set<String> acceptableMimeTypes = new HashSet<String>();
+ static
+ {
+   // Plain-text content types, spelled in lower case without whitespace, that the mime type check accepts.
+   for (String plainTextType : new String[]{"text/plain", "text/plain;charset=utf-8", "text/plain;charset=ascii", "text/plain;charset=us-ascii"}) {
+     acceptableMimeTypes.add(plainTextType);
+   }
+ }
+
+ /** Detect if a mime type is acceptable or not. This method is used to determine whether it makes sense to fetch a document
+ * in the first place. The comparison is case-insensitive; a null mime type is never acceptable.
+ *@param pipelineDescription is the document's pipeline version string, for this connection.
+ *@param mimeType is the mime type of the document; a null value is rejected rather than dereferenced.
+ *@param checkActivity is an object including the activities that can be performed by this method.
+ *@return true if the mime type is in the accepted set and the superclass check also accepts it.
+ */
+ @Override
+ public boolean checkMimeTypeIndexable(VersionContext pipelineDescription, String mimeType, IOutputCheckActivity checkActivity)
+   throws ManifoldCFException, ServiceInterruption
+ {
+   if (mimeType == null || !acceptableMimeTypes.contains(mimeType.toLowerCase(Locale.ROOT))) {
+     return false;
+   }
+   // Do a downstream check too
+   return super.checkMimeTypeIndexable(pipelineDescription, mimeType, checkActivity);
+ }
+
// ////////////////////////
// UI Methods
// ////////////////////////