You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2017/05/11 19:55:29 UTC
svn commit: r1794885 - in
/manifoldcf/branches/CONNECTORS-1425/connectors/tikaservice/connector/src/main:
java/org/apache/manifoldcf/agents/transformation/tikaservice/
native2ascii/org/apache/manifoldcf/agents/transformation/tikaservice/
Author: kwright
Date: Thu May 11 19:55:29 2017
New Revision: 1794885
URL: http://svn.apache.org/viewvc?rev=1794885&view=rev
Log:
Major code changes
Modified:
manifoldcf/branches/CONNECTORS-1425/connectors/tikaservice/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tikaservice/TikaConfig.java
manifoldcf/branches/CONNECTORS-1425/connectors/tikaservice/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tikaservice/TikaExtractor.java
manifoldcf/branches/CONNECTORS-1425/connectors/tikaservice/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tikaservice/common_en_US.properties
Modified: manifoldcf/branches/CONNECTORS-1425/connectors/tikaservice/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tikaservice/TikaConfig.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1425/connectors/tikaservice/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tikaservice/TikaConfig.java?rev=1794885&r1=1794884&r2=1794885&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-1425/connectors/tikaservice/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tikaservice/TikaConfig.java (original)
+++ manifoldcf/branches/CONNECTORS-1425/connectors/tikaservice/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tikaservice/TikaConfig.java Thu May 11 19:55:29 2017
@@ -24,7 +24,10 @@ package org.apache.manifoldcf.agents.tra
public class TikaConfig {
// Configuration parameters
- // None
+ public static final String PARAM_TIKAHOSTNAME = "tikaHostname";
+ public static final String PARAM_TIKAPORT = "tikaPort";
+ public static final String TIKAHOSTNAME_DEFAULT = "localhost";
+ public static final String TIKAPORT_DEFAULT = "9998";
// Specification nodes and values
public static final String NODE_FIELDMAP = "fieldmap";
@@ -33,16 +36,8 @@ public class TikaConfig {
public static final String NODE_WRITELIMIT = "writeLimit";
public static final int WRITELIMIT_DEFAULT = -1;
public static final String NODE_IGNORETIKAEXCEPTION = "ignoreException";
- public static final String NODE_BOILERPLATEPROCESSOR = "boilerplateprocessor";
public static final String ATTRIBUTE_SOURCE = "source";
public static final String ATTRIBUTE_TARGET = "target";
public static final String ATTRIBUTE_VALUE = "value";
- public static final String TIKAHOSTNAME_DEFAULT = "localhost";
- public static final int TIKAPORT_DEFAULT = 9998;
- public static final String NODE_TIKAHOSTNAME = "tikaHostname";
- public static final String NODE_TIKAPORT = "tikaPort";
- public static final String NODE_TIKASERVER = "tikaServer";
- public static final long TIKARETRY_DEFAULT = 10000;
- public static final String NODE_TIKARETRY = "tikaRetry";
}
Modified: manifoldcf/branches/CONNECTORS-1425/connectors/tikaservice/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tikaservice/TikaExtractor.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1425/connectors/tikaservice/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tikaservice/TikaExtractor.java?rev=1794885&r1=1794884&r2=1794885&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-1425/connectors/tikaservice/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tikaservice/TikaExtractor.java (original)
+++ manifoldcf/branches/CONNECTORS-1425/connectors/tikaservice/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tikaservice/TikaExtractor.java Thu May 11 19:55:29 2017
@@ -28,6 +28,8 @@ import org.apache.http.client.HttpClient
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpPut;
import org.apache.http.entity.InputStreamEntity;
+import org.apache.http.conn.HttpClientConnectionManager;
+import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.manifoldcf.agents.interfaces.*;
import org.apache.manifoldcf.agents.system.Logging;
@@ -58,20 +60,183 @@ import org.xml.sax.SAXException;
public class TikaExtractor extends org.apache.manifoldcf.agents.transformation.BaseTransformationConnector {
public static final String _rcsid = "@(#)$Id$";
+ private static final String EDIT_CONFIGURATION_JS = "editConfiguration.js";
+ private static final String EDIT_CONFIGURATION_SERVER_HTML = "editConfiguration_Server.html";
+ private static final String VIEW_CONFIGURATION_HTML = "viewConfiguration.html";
private static final String EDIT_SPECIFICATION_JS = "editSpecification.js";
- private static final String EDIT_SPECIFICATION_TIKATYPE_HTML = "editSpecification_TikaType.html";
private static final String EDIT_SPECIFICATION_FIELDMAPPING_HTML = "editSpecification_FieldMapping.html";
private static final String EDIT_SPECIFICATION_EXCEPTIONS_HTML = "editSpecification_Exceptions.html";
- private static final String EDIT_SPECIFICATION_BOILERPLATE_HTML = "editSpecification_Boilerplate.html";
private static final String VIEW_SPECIFICATION_HTML = "viewSpecification.html";
protected static final String ACTIVITY_EXTRACT = "extract";
protected static final String[] activitiesList = new String[] { ACTIVITY_EXTRACT };
-
+ protected final static long sessionExpirationInterval = 300000L;
+
/** We handle up to 64K in memory; after that we go to disk. */
protected static final long inMemoryMaximumFile = 65536;
+ // Raw parameters
+
+ /** Tika host name */
+ private String tikaHostname = null;
+
+ /** Tika port */
+ private String tikaPortString = null;
+
+ // Computed parameters
+
+ /** Session timeout */
+ private long sessionTimeout = -1L;
+
+ /** Tika port */
+ private int tikaPort = -1;
+
+ /** Connection manager */
+ private HttpClientConnectionManager connectionManager = null;
+
+ /** Httpclient instance */
+ private HttpClient httpClient = null;
+
+ /** HttpHost */
+ private HttpHost tikaHost = null;
+
+ // Static data
+
+ /** Metadata URI */
+ protected final static URI metaURI;
+ /** Content URI */
+ protected final static URI contentURI;
+
+ static {
+ try {
+ metaURI = new URI("/meta");
+ contentURI = new URI("/tika");
+ } catch (URISyntaxException e) {
+ throw new RuntimeException(e.getMessage());
+ }
+ }
+
+s /** Connect.
+ *@param configParameters is the set of configuration parameters, which
+ * in this case describe the root directory.
+ */
+ @Override
+ public void connect(ConfigParams configParameters)
+ {
+ super.connect(configParameters);
+ tikaHostname = configParameters.getParameter(TikaConfig.PARAM_TIKAHOSTNAME);
+ tikaPortString = configParameters.getParameter(TikaConfig.PARAM_TIKAPORT);
+ }
+
+ /** Close the connection. Call this before discarding the repository connector.
+ */
+ @Override
+ public void disconnect()
+ throws ManifoldCFException
+ {
+ expireSession();
+
+ super.disconnect();
+ }
+
+ /** This method is periodically called for all connectors that are connected but not
+ * in active use.
+ */
+ @Override
+ public void poll()
+ throws ManifoldCFException
+ {
+ if (System.currentTimeMillis() >= sessionTimeout)
+ {
+ expireSession();
+ }
+ if (connectionManager != null)
+ connectionManager.closeIdleConnections(60000L,TimeUnit.MILLISECONDS);
+ }
+
+ /** This method is called to assess whether to count this connector instance should
+ * actually be counted as being connected.
+ *@return true if the connector instance is actually connected.
+ */
+ @Override
+ public boolean isConnected()
+ {
+ return sessionTimeout != -1L;
+ }
+
+ /** Set up a session */
+ protected void getSession()
+ throws ManifoldCFException
+ {
+ if (sessionTimeout == -1L)
+ {
+ this.tikaPort = Integer.parseInt(tikaPortString);
+
+ final int connectionTimeout = 60000;
+ final int socketTimeout = 900000;
+
+ final PoolingHttpClientConnectionManager poolingConnectionManager = new PoolingHttpClientConnectionManager(RegistryBuilder.<ConnectionSocketFactory>create()
+ .register("http", PlainConnectionSocketFactory.getSocketFactory())
+ .register("https", myFactory)
+ .build());
+ poolingConnectionManager.setDefaultMaxPerRoute(1);
+ poolingConnectionManager.setValidateAfterInactivity(2000);
+ poolingConnectionManager.setDefaultSocketConfig(SocketConfig.custom()
+ .setTcpNoDelay(true)
+ .setSoTimeout(socketTimeout)
+ .build());
+
+ this.connectionManager = poolingConnectionManager;
+
+ final RequestConfig.Builder requestBuilder = RequestConfig.custom()
+ .setCircularRedirectsAllowed(true)
+ .setSocketTimeout(socketTimeout)
+ .setExpectContinueEnabled(false)
+ .setConnectTimeout(connectionTimeout)
+ .setConnectionRequestTimeout(socketTimeout);
+
+ final HttpClientBuilder builder = HttpClients.custom()
+ .setConnectionManager(connectionManager)
+ .disableAutomaticRetries()
+ .setDefaultRequestConfig(requestBuilder.build());
+ builder.setRequestExecutor(new HttpRequestExecutor(socketTimeout))
+ .setRedirectStrategy(new DefaultRedirectStrategy());
+ this.httpClient = builder.build();
+
+ this.tikaHost = new HttpHost(tikaHostname, tikaPort);
+
+ }
+ sessionTimeout = System.currentTimeMillis() + sessionExpirationInterval;
+ }
+
+ /** Expire the current session */
+ protected void expireSession()
+ throws ManifoldCFException
+ {
+ tikaHostname = null;
+ tikaPortString = null;
+ tikaPort = -1;
+ httpClient = null;
+ tikaHost = null;
+ if (connectionManager != null)
+ connectionManager.shutdown();
+ connectionManager = null;
+ sessionTimeout = -1L;
+ }
+
+ /** Test the connection. Returns a string describing the connection integrity.
+ *@return the connection's status as a displayable string.
+ */
+ @Override
+ public String check()
+ throws ManifoldCFException
+ {
+ getSession();
+ // MHL
+ return super.check();
+ }
+
/**
* Return a list of activities that this connector generates. The connector
* does NOT need to be connected before this method is called.
@@ -83,6 +248,103 @@ public class TikaExtractor extends org.a
return activitiesList;
}
+ /** Output the configuration header section.
+ * This method is called in the head section of the connector's configuration page. Its purpose is to add the required tabs to the list, and to output any
+ * javascript methods that might be needed by the configuration editing HTML.
+ *@param threadContext is the local thread context.
+ *@param out is the output to which any HTML should be sent.
+ *@param parameters are the configuration parameters, as they currently exist, for this connection being configured.
+ *@param tabsArray is an array of tab names. Add to this array any tab names that are specific to the connector.
+ */
+ @Override
+ public void outputConfigurationHeader(IThreadContext threadContext, IHTTPOutput out,
+ Locale locale, ConfigParams parameters, List<String> tabsArray)
+ throws ManifoldCFException, IOException
+ {
+ tabsArray.add(Messages.getString(locale,"TikaExtractor.TikaServerTabName"));
+ Messages.outputResourceWithVelocity(out,locale,EDIT_CONFIGURATION_JS,null);
+ }
+
+ /** Output the configuration body section.
+ * This method is called in the body section of the connector's configuration page. Its purpose is to present the required form elements for editing.
+ * The coder can presume that the HTML that is output from this configuration will be within appropriate <html>, <body>, and <form> tags. The name of the
+ * form is "editconnection".
+ *@param threadContext is the local thread context.
+ *@param out is the output to which any HTML should be sent.
+ *@param parameters are the configuration parameters, as they currently exist, for this connection being configured.
+ *@param tabName is the current tab name.
+ */
+ @Override
+ public void outputConfigurationBody(IThreadContext threadContext, IHTTPOutput out,
+ Locale locale, ConfigParams parameters, String tabName)
+ throws ManifoldCFException, IOException
+ {
+ Map<String,Object> velocityContext = new HashMap<String,Object>();
+ velocityContext.put("TabName",tabName);
+ fillInServerTab(velocityContext,out,parameters);
+ Messages.outputResourceWithVelocity(out,locale,EDIT_CONFIGURATION_SERVER_HTML,velocityContext);
+ }
+
+
+ /** Process a configuration post.
+ * This method is called at the start of the connector's configuration page, whenever there is a possibility that form data for a connection has been
+ * posted. Its purpose is to gather form information and modify the configuration parameters accordingly.
+ * The name of the posted form is "editconnection".
+ *@param threadContext is the local thread context.
+ *@param variableContext is the set of variables available from the post, including binary file post information.
+ *@param parameters are the configuration parameters, as they currently exist, for this connection being configured.
+ *@return null if all is well, or a string error message if there is an error that should prevent saving of the connection (and cause a redirection to an error page).
+ */
+ @Override
+ public String processConfigurationPost(IThreadContext threadContext, IPostParameters variableContext,
+ Locale locale, ConfigParams parameters)
+ throws ManifoldCFException
+ {
+ String tikaHostname = variableContext.getParameter("tikaHostname");
+
+ if (tikaHostname != null)
+ parameters.setParameter(TikaConfig.PARAM_TIKAHOSTNAME,tikaHostname);
+
+ String tikaPort = variableContext.getParameter("tikaPort");
+ if (tikaPort != null)
+ parameters.setParameter(TikaConfig.PARAM_TIKAPORT,tikaPort);
+
+ return null;
+ }
+
+ /** View configuration.
+ * This method is called in the body section of the connector's view configuration page. Its purpose is to present the connection information to the user.
+ * The coder can presume that the HTML that is output from this configuration will be within appropriate <html> and <body> tags.
+ *@param threadContext is the local thread context.
+ *@param out is the output to which any HTML should be sent.
+ *@param parameters are the configuration parameters, as they currently exist, for this connection being configured.
+ */
+ @Override
+ public void viewConfiguration(IThreadContext threadContext, IHTTPOutput out,
+ Locale locale, ConfigParams parameters)
+ throws ManifoldCFException, IOException
+ {
+ Map<String,Object> velocityContext = new HashMap<String,Object>();
+ fillInServerTab(velocityContext,out,parameters);
+ Messages.outputResourceWithVelocity(out,locale,VIEW_CONFIGURATION_HTML,velocityContext);
+ }
+
+ protected static void fillInServerTab(Map<String,Object> velocityContext, IHTTPOutput out, ConfigParams parameters)
+ throws ManifoldCFException
+ {
+ String tikaHostname = parameters.getParameter(TikaConfig.PARAM_TIKAHOSTNAME);
+ if (tikaHostname == null)
+ tikaHostname = TikaConfig.TIKAHOSTNAME_DEFAULT;
+
+ String tikaPort = parameters.getParameter(TikaConfig.PARAM_TIKAPORT);
+ if (tikaPort == null)
+ tikaPort = TikaConfig.TIKAPORT_DEFAULT;
+
+ // Fill in context
+ velocityContext.put("TIKAHOSTNAME", tikaHostname);
+ velocityContext.put("TIKAPORT", tikaPort);
+ }
+
/**
* Get an output version string, given an output specification. The output
* version string is used to uniquely describe the pertinent details of the
@@ -229,15 +491,14 @@ public class TikaExtractor extends org.a
SpecPacker sp = new SpecPacker(pipelineDescription.getSpecification());
+ getSession();
+
// Tika server variables
String mime = "";
InputStream tikaServerIs = null;
- int retry = 0;
HttpResponse response = null;
IOException tikaServerDownException = null;
- BoilerpipeExtractor extractorClassInstance = sp.getExtractorClassInstance();
-
// Tika's API reads from an input stream and writes to an output Writer.
// Since a RepositoryDocument includes readers and inputstreams exclusively,
// AND all downstream
@@ -291,194 +552,121 @@ public class TikaExtractor extends org.a
Long length = null;
try {
- if (sp.tikaServer) {
- try {
- final HttpClient client = HttpClientBuilder.create().build();
- final HttpHost tikaHost = new HttpHost(sp.tikaHostname, sp.tikaPort);
-
- // Make a copy of the original stream as it needs to be sent two
- // times to Tika
- // one for the metadata and one for the content
- IOUtils.copy(document.getBinaryStream(), ds.getOutputStream());
-
- // Metadata
- HttpPut httpPut = new HttpPut(sp.metaURI);
- if (!mime.isEmpty()) {
- httpPut.addHeader("Content-Type", mime);
- }
- httpPut.addHeader("Accept", "application/json");
- HttpEntity entity = new InputStreamEntity(ds.getInputStream());
- httpPut.setEntity(entity);
- while (retry < 3 && response == null) {
- try {
- response = client.execute(tikaHost, httpPut);
- tikaServerDownException = null;
- } catch (IOException e) {
- tikaServerDownException = e;
- retry++;
- if (retry < 3) {
- try {
- Thread.sleep(sp.tikaRetry);
- } catch (InterruptedException e1) {
- // Should not happen
- }
- }
- }
- }
- if (tikaServerDownException != null) {
- throw tikaServerDownException;
- }
- int responseCode = response.getStatusLine().getStatusCode();
- if (response.getStatusLine().getStatusCode() == 200 || response.getStatusLine().getStatusCode() == 204) {
- tikaServerIs = response.getEntity().getContent();
- try {
- final BufferedReader br = new BufferedReader(new InputStreamReader(tikaServerIs));
- final JSONParser parser = new JSONParser();
- JSONObject metaJson;
- final StringBuilder sb = new StringBuilder();
- String output;
- while ((output = br.readLine()) != null) {
- sb.append(output);
- }
- metaJson = (JSONObject) parser.parse(sb.toString());
- for (Object key : metaJson.keySet()) {
- metadata.add(key.toString(), metaJson.get(key).toString());
- }
- } finally {
- tikaServerIs.close();
- }
- } else {
- activities.noDocument();
- if (responseCode == 422) {
- resultCode = "TIKASERVERREJECTS";
- description = "Tika Server rejected document with the following reason: "
- + response.getStatusLine().getReasonPhrase();
- handleTikaServerRejects(description);
- } else {
- resultCode = "TIKASERVERERROR";
- description = "Tika Server failed to parse document with the following error: "
- + response.getStatusLine().getReasonPhrase();
- handleTikaServerError(description);
- }
- return DOCUMENTSTATUS_REJECTED;
- }
+ try {
- // Content
- httpPut = new HttpPut(sp.contentURI);
- if (!mime.isEmpty()) {
- httpPut.addHeader("Content-Type", mime);
- }
- httpPut.addHeader("Accept", "text/plain");
- entity = new InputStreamEntity(ds.getInputStream());
- httpPut.setEntity(entity);
-
- // Retry mecanism
- retry = 0;
- response = null;
- while (retry < 3 && response == null) {
- try {
- response = client.execute(tikaHost, httpPut);
- tikaServerDownException = null;
- } catch (IOException e) {
- tikaServerDownException = e;
- retry++;
- if (retry < 3) {
- try {
- Thread.sleep(sp.tikaRetry);
- } catch (InterruptedException e1) {
- // Should not happen
- }
- }
- }
- }
- if (tikaServerDownException != null) {
- throw tikaServerDownException;
- }
-
- responseCode = response.getStatusLine().getStatusCode();
- if (response.getStatusLine().getStatusCode() == 200 || response.getStatusLine().getStatusCode() == 204) {
- tikaServerIs = response.getEntity().getContent();
- try {
- ds.close();
- ds = new FileDestinationStorage();
- IOUtils.copyLarge(tikaServerIs, ds.getOutputStream(), 0L, sp.writeLimit);
- length = new Long(ds.getBinaryLength());
- } finally {
- tikaServerIs.close();
+ // Make a copy of the original stream as it needs to be sent two
+ // times to Tika
+ // one for the metadata and one for the content
+ IOUtils.copy(document.getBinaryStream(), ds.getOutputStream());
+
+ // Metadata
+ HttpPut httpPut = new HttpPut(metaURI);
+ if (!mime.isEmpty()) {
+ httpPut.addHeader("Content-Type", mime);
+ }
+ httpPut.addHeader("Accept", "application/json");
+ HttpEntity entity = new InputStreamEntity(ds.getInputStream());
+ httpPut.setEntity(entity);
+ try {
+ response = client.execute(tikaHost, httpPut);
+ } catch (IOExceptione e) {
+ // Retry 3 times, 10000 ms between retries, and abort if doesn't work
+ final long currentTime = System.currentTimeMillis();
+ throw new ServiceInterruption("Tika down, retrying: "+e.getMessage(),e,currentTime + 10000L,
+ -1L,3,true);
+ }
+ int responseCode = response.getStatusLine().getStatusCode();
+ if (response.getStatusLine().getStatusCode() == 200 || response.getStatusLine().getStatusCode() == 204) {
+ tikaServerIs = response.getEntity().getContent();
+ try {
+ final BufferedReader br = new BufferedReader(new InputStreamReader(tikaServerIs));
+ final JSONParser parser = new JSONParser();
+ JSONObject metaJson;
+ final StringBuilder sb = new StringBuilder();
+ String output;
+ while ((output = br.readLine()) != null) {
+ sb.append(output);
}
- } else {
- activities.noDocument();
- if (responseCode == 422) {
- resultCode = "TIKASERVERREJECTS";
- description = "Tika Server rejected document with the following reason: "
- + response.getStatusLine().getReasonPhrase();
- handleTikaServerRejects(description);
- } else {
- resultCode = "TIKASERVERERROR";
- description = "Tika Server failed to parse document with the following error: "
- + response.getStatusLine().getReasonPhrase();
- handleTikaServerError(description);
+ metaJson = (JSONObject) parser.parse(sb.toString());
+ for (Object key : metaJson.keySet()) {
+ metadata.add(key.toString(), metaJson.get(key).toString());
}
- return DOCUMENTSTATUS_REJECTED;
+ } finally {
+ tikaServerIs.close();
}
-
- } catch (IOException | ParseException e) {
- resultCode = "TIKASERVERRESPONSEISSUE";
- description = e.getMessage();
- int rval;
- if (e instanceof IOException) {
- rval = handleTikaServerException((IOException) e);
+ } else {
+ activities.noDocument();
+ if (responseCode == 422) {
+ resultCode = "TIKASERVERREJECTS";
+ description = "Tika Server rejected document with the following reason: "
+ + response.getStatusLine().getReasonPhrase();
+ handleTikaServerRejects(description);
} else {
- rval = handleTikaServerException((ParseException) e);
+ resultCode = "TIKASERVERERROR";
+ description = "Tika Server failed to parse document with the following error: "
+ + response.getStatusLine().getReasonPhrase();
+ handleTikaServerError(description);
}
- if (rval == DOCUMENTSTATUS_REJECTED) {
- activities.noDocument();
- }
- return rval;
+ return DOCUMENTSTATUS_REJECTED;
}
- } else {
- OutputStream os = ds.getOutputStream();
+ // Content
+ httpPut = new HttpPut(sp.contentURI);
+ if (!mime.isEmpty()) {
+ httpPut.addHeader("Content-Type", mime);
+ }
+ httpPut.addHeader("Accept", "text/plain");
+ entity = new InputStreamEntity(ds.getInputStream());
+ httpPut.setEntity(entity);
try {
- Writer w = new OutputStreamWriter(os, "utf-8");
+ response = client.execute(tikaHost, httpPut);
+ } catch (IOException e) {
+ // Retry 3 times, 10000 ms between retries, and abort if doesn't work
+ final long currentTime = System.currentTimeMillis();
+ throw new ServiceInterruption("Tika down, retrying: "+e.getMessage(),e,currentTime + 10000L,
+ -1L,3,true);
+ }
+
+ responseCode = response.getStatusLine().getStatusCode();
+ if (response.getStatusLine().getStatusCode() == 200 || response.getStatusLine().getStatusCode() == 204) {
+ tikaServerIs = response.getEntity().getContent();
try {
- // Use tika to parse stuff
- ContentHandler handler = TikaParser.newWriteOutBodyContentHandler(w, sp.writeLimit());
- if (extractorClassInstance != null)
- handler = new BoilerpipeContentHandler(handler, extractorClassInstance);
- try {
- TikaParser.parse(document.getBinaryStream(), metadata, handler);
- } catch (TikaException e) {
- if (sp.ignoreTikaException()) {
- resultCode = e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
- description = e.getMessage();
- } else {
- resultCode = "TIKAREJECTION";
- description = e.getMessage();
- int rval = handleTikaException(e);
- if (rval == DOCUMENTSTATUS_REJECTED)
- activities.noDocument();
- return rval;
- }
- } catch (SAXException e) {
- resultCode = e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
- description = e.getMessage();
- int rval = handleSaxException(e);
- if (rval == DOCUMENTSTATUS_REJECTED)
- activities.noDocument();
- return rval;
- } catch (IOException e) {
- resultCode = e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
- description = e.getMessage();
- throw e;
- }
+ ds.close();
+ ds = new FileDestinationStorage();
+ IOUtils.copyLarge(tikaServerIs, ds.getOutputStream(), 0L, sp.writeLimit);
+ length = new Long(ds.getBinaryLength());
} finally {
- w.flush();
+ tikaServerIs.close();
}
- } finally {
- os.close();
- length = new Long(ds.getBinaryLength());
+ } else {
+ activities.noDocument();
+ if (responseCode == 422) {
+ resultCode = "TIKASERVERREJECTS";
+ description = "Tika Server rejected document with the following reason: "
+ + response.getStatusLine().getReasonPhrase();
+ handleTikaServerRejects(description);
+ } else {
+ resultCode = "TIKASERVERERROR";
+ description = "Tika Server failed to parse document with the following error: "
+ + response.getStatusLine().getReasonPhrase();
+ handleTikaServerError(description);
+ }
+ return DOCUMENTSTATUS_REJECTED;
+ }
+
+ } catch (IOException | ParseException e) {
+ resultCode = "TIKASERVERRESPONSEISSUE";
+ description = e.getMessage();
+ int rval;
+ if (e instanceof IOException) {
+ rval = handleTikaServerException((IOException) e);
+ } else {
+ rval = handleTikaServerException((ParseException) e);
}
+ if (rval == DOCUMENTSTATUS_REJECTED) {
+ activities.noDocument();
+ }
+ return rval;
}
if (!activities.checkLengthIndexable(ds.getBinaryLength())) {
@@ -591,16 +779,12 @@ public class TikaExtractor extends org.a
Map<String, Object> paramMap = new HashMap<String, Object>();
paramMap.put("SEQNUM", Integer.toString(connectionSequenceNumber));
- tabsArray.add(Messages.getString(locale, "TikaExtractor.TikaTypeTabName"));
tabsArray.add(Messages.getString(locale, "TikaExtractor.FieldMappingTabName"));
tabsArray.add(Messages.getString(locale, "TikaExtractor.ExceptionsTabName"));
- tabsArray.add(Messages.getString(locale, "TikaExtractor.BoilerplateTabName"));
// Fill in the specification header map, using data from all tabs.
- fillInTikaTypeSpecificationMap(paramMap, os);
fillInFieldMappingSpecificationMap(paramMap, os);
fillInExceptionsSpecificationMap(paramMap, os);
- fillInBoilerplateSpecificationMap(paramMap, os);
Messages.outputResourceWithVelocity(out, locale, EDIT_SPECIFICATION_JS, paramMap);
}
@@ -637,15 +821,11 @@ public class TikaExtractor extends org.a
paramMap.put("SELECTEDNUM", Integer.toString(actualSequenceNumber));
// Fill in the field mapping tab data
- fillInTikaTypeSpecificationMap(paramMap, os);
fillInFieldMappingSpecificationMap(paramMap, os);
fillInExceptionsSpecificationMap(paramMap, os);
- fillInBoilerplateSpecificationMap(paramMap, os);
- Messages.outputResourceWithVelocity(out, locale, EDIT_SPECIFICATION_TIKATYPE_HTML, paramMap);
Messages.outputResourceWithVelocity(out, locale, EDIT_SPECIFICATION_FIELDMAPPING_HTML, paramMap);
Messages.outputResourceWithVelocity(out, locale, EDIT_SPECIFICATION_EXCEPTIONS_HTML, paramMap);
- Messages.outputResourceWithVelocity(out, locale, EDIT_SPECIFICATION_BOILERPLATE_HTML, paramMap);
}
/**
@@ -767,24 +947,6 @@ public class TikaExtractor extends org.a
os.addChild(os.getChildCount(), node);
}
- x = variableContext.getParameter(seqPrefix + "boilerplateclassname");
- if (x != null) {
- int i = 0;
- while (i < os.getChildCount()) {
- SpecificationNode node = os.getChild(i);
- if (node.getType().equals(TikaConfig.NODE_BOILERPLATEPROCESSOR))
- os.removeChild(i);
- else
- i++;
- }
-
- if (x.length() > 0) {
- SpecificationNode node = new SpecificationNode(TikaConfig.NODE_BOILERPLATEPROCESSOR);
- node.setAttribute(TikaConfig.ATTRIBUTE_VALUE, x);
- os.addChild(os.getChildCount(), node);
- }
- }
-
x = variableContext.getParameter(seqPrefix + "tikaserver");
if (x != null) {
int i = 0;
@@ -863,10 +1025,8 @@ public class TikaExtractor extends org.a
paramMap.put("SEQNUM", Integer.toString(connectionSequenceNumber));
// Fill in the map with data from all tabs
- fillInTikaTypeSpecificationMap(paramMap, os);
fillInFieldMappingSpecificationMap(paramMap, os);
fillInExceptionsSpecificationMap(paramMap, os);
- fillInBoilerplateSpecificationMap(paramMap, os);
Messages.outputResourceWithVelocity(out, locale, VIEW_SPECIFICATION_HTML, paramMap);
@@ -942,17 +1102,6 @@ public class TikaExtractor extends org.a
paramMap.put("IGNORETIKAEXCEPTIONS", ignoreTikaExceptions);
}
- protected static void fillInBoilerplateSpecificationMap(Map<String, Object> paramMap, Specification os) {
- String boilerplateClassName = "";
- for (int i = 0; i < os.getChildCount(); i++) {
- SpecificationNode sn = os.getChild(i);
- if (sn.getType().equals(TikaConfig.NODE_BOILERPLATEPROCESSOR)) {
- boilerplateClassName = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
- }
- }
- paramMap.put("BOILERPLATECLASSNAME", boilerplateClassName);
- }
-
protected static int handleTikaException(TikaException e)
throws IOException, ManifoldCFException, ServiceInterruption {
// MHL - what does Tika throw if it gets an IOException reading the stream??
@@ -1128,33 +1277,12 @@ public class TikaExtractor extends org.a
private final boolean lowerNames;
private final int writeLimit;
private final boolean ignoreTikaException;
- private final String extractorClassName;
- private URI metaURI;
- private URI contentURI;
- private final String tikaHostname;
- private final int tikaPort;
- private final boolean tikaServer;
- private final long tikaRetry;
public SpecPacker(Specification os) {
boolean keepAllMetadata = true;
boolean lowerNames = false;
int writeLimit = TikaConfig.WRITELIMIT_DEFAULT;
boolean ignoreTikaException = true;
- String extractorClassName = null;
- String tikaHostname = TikaConfig.TIKAHOSTNAME_DEFAULT;
- int tikaPort = TikaConfig.TIKAPORT_DEFAULT;
- boolean tikaServer = false;
- long tikaRetry = TikaConfig.TIKARETRY_DEFAULT;
- try {
- metaURI = new URI("/meta");
- contentURI = new URI("/tika");
- } catch (URISyntaxException e) {
- // Should be impossible
- metaURI = null;
- contentURI = null;
- }
-
for (int i = 0; i < os.getChildCount(); i++) {
SpecificationNode sn = os.getChild(i);
@@ -1278,14 +1406,6 @@ public class TikaExtractor extends org.a
return sb.toString();
}
- public URI metaURI() {
- return metaURI;
- }
-
- public URI contentURI() {
- return contentURI;
- }
-
public String getMapping(String source) {
return sourceTargets.get(source);
}
Modified: manifoldcf/branches/CONNECTORS-1425/connectors/tikaservice/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tikaservice/common_en_US.properties
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1425/connectors/tikaservice/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tikaservice/common_en_US.properties?rev=1794885&r1=1794884&r2=1794885&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-1425/connectors/tikaservice/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tikaservice/common_en_US.properties (original)
+++ manifoldcf/branches/CONNECTORS-1425/connectors/tikaservice/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tikaservice/common_en_US.properties Thu May 11 19:55:29 2017
@@ -15,22 +15,9 @@
TikaExtractor.TikaHostname=Tika hostname:
TikaExtractor.TikaPort=Tika port:
-TikaExtractor.TikaRetry=Retry interval (ms):
-TikaExtractor.TikaParsersSelected=Tika Parsers
-TikaExtractor.TikaServerSelected=Tika Server
-TikaExtractor.TikaTypeTabName=Tika type
-TikaExtractor.TikaType=Tika type:
+TikaExtractor.TikaServerTabName=Tika server
TikaExtractor.FieldMappingTabName=Field mapping
TikaExtractor.ExceptionsTabName=Exceptions
-TikaExtractor.BoilerplateTabName=Boilerplate
-TikaExtractor.BoilerplateExtractorColon=Boilerplate extractor:
-TikaExtractor.NoExtractionSelected=-- No extraction selected --
-TikaExtractor.ExtractArticles=Extract articles
-TikaExtractor.ExtractArticleSentences=Extract article sentences
-TikaExtractor.BasicExtraction=Basic general-purpose extraction
-TikaExtractor.ExtractEverything=Extract everything
-TikaExtractor.ExtractLargestTextComponent=Extract the largest text component of the document
-TikaExtractor.ExtractNumWords=Extract based on number of words per block
TikaExtractor.FieldMappings=Field mappings:
TikaExtractor.MetadataFieldName=Metadata field name
TikaExtractor.FinalFieldName=Final field name