You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2017/05/12 02:38:08 UTC

svn commit: r1794910 - /manifoldcf/branches/CONNECTORS-1425/connectors/tikaservice/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tikaservice/TikaExtractor.java

Author: kwright
Date: Fri May 12 02:38:08 2017
New Revision: 1794910

URL: http://svn.apache.org/viewvc?rev=1794910&view=rev
Log:
Get it to build

Modified:
    manifoldcf/branches/CONNECTORS-1425/connectors/tikaservice/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tikaservice/TikaExtractor.java

Modified: manifoldcf/branches/CONNECTORS-1425/connectors/tikaservice/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tikaservice/TikaExtractor.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1425/connectors/tikaservice/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tikaservice/TikaExtractor.java?rev=1794910&r1=1794909&r2=1794910&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-1425/connectors/tikaservice/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tikaservice/TikaExtractor.java (original)
+++ manifoldcf/branches/CONNECTORS-1425/connectors/tikaservice/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tikaservice/TikaExtractor.java Fri May 12 02:38:08 2017
@@ -31,6 +31,19 @@ import org.apache.http.entity.InputStrea
 import org.apache.http.conn.HttpClientConnectionManager;
 import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
 import org.apache.http.impl.client.HttpClientBuilder;
+import org.apache.http.protocol.HttpRequestExecutor;
+import org.apache.http.impl.client.DefaultRedirectStrategy;
+import org.apache.http.impl.client.HttpClients;
+import org.apache.http.client.config.RequestConfig;
+import org.apache.http.config.SocketConfig;
+import org.apache.http.config.RegistryBuilder;
+import org.apache.http.conn.socket.ConnectionSocketFactory;
+import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
+import org.apache.http.conn.socket.PlainConnectionSocketFactory;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaMetadataKeys;
+
 import org.apache.manifoldcf.agents.interfaces.*;
 import org.apache.manifoldcf.agents.system.Logging;
 
@@ -38,20 +51,12 @@ import java.io.*;
 import java.net.URI;
 import java.net.URISyntaxException;
 import java.util.*;
+import java.util.concurrent.TimeUnit;
 
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaMetadataKeys;
-import org.apache.tika.parser.html.BoilerpipeContentHandler;
 import org.json.simple.JSONObject;
 import org.json.simple.parser.JSONParser;
 import org.json.simple.parser.ParseException;
 
-import de.l3s.boilerpipe.BoilerpipeExtractor;
-
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
 /**
  * This connector works as a transformation connector, but does nothing other
  * than logging.
@@ -117,7 +122,7 @@ public class TikaExtractor extends org.a
     }
   }
   
-s  /** Connect.
+  /** Connect.
   *@param configParameters is the set of configuration parameters, which
   * in this case describe the root directory.
   */
@@ -178,7 +183,7 @@ s  /** Connect.
 
       final PoolingHttpClientConnectionManager poolingConnectionManager = new PoolingHttpClientConnectionManager(RegistryBuilder.<ConnectionSocketFactory>create()
         .register("http", PlainConnectionSocketFactory.getSocketFactory())
-        .register("https", myFactory)
+        //.register("https", myFactory)
         .build());
       poolingConnectionManager.setDefaultMaxPerRoute(1);
       poolingConnectionManager.setValidateAfterInactivity(2000);
@@ -568,8 +573,8 @@ s  /** Connect.
           HttpEntity entity = new InputStreamEntity(ds.getInputStream());
           httpPut.setEntity(entity);
           try {
-            response = client.execute(tikaHost, httpPut);
-          } catch (IOExceptione e) {
+            response = this.httpClient.execute(tikaHost, httpPut);
+          } catch (IOException e) {
             // Retry 3 times, 10000 ms between retries, and abort if doesn't work
             final long currentTime = System.currentTimeMillis();
             throw new ServiceInterruption("Tika down, retrying: "+e.getMessage(),e,currentTime + 10000L,
@@ -611,7 +616,7 @@ s  /** Connect.
           }
 
           // Content
-          httpPut = new HttpPut(sp.contentURI);
+          httpPut = new HttpPut(contentURI);
           if (!mime.isEmpty()) {
             httpPut.addHeader("Content-Type", mime);
           }
@@ -619,7 +624,7 @@ s  /** Connect.
           entity = new InputStreamEntity(ds.getInputStream());
           httpPut.setEntity(entity);
           try {
-            response = client.execute(tikaHost, httpPut);
+            response = this.httpClient.execute(tikaHost, httpPut);
           } catch (IOException e) {
             // Retry 3 times, 10000 ms between retries, and abort if doesn't work
             final long currentTime = System.currentTimeMillis();
@@ -947,59 +952,6 @@ s  /** Connect.
       os.addChild(os.getChildCount(), node);
     }
 
-    x = variableContext.getParameter(seqPrefix + "tikaserver");
-    if (x != null) {
-      int i = 0;
-      while (i < os.getChildCount()) {
-        SpecificationNode node = os.getChild(i);
-        if (node.getType().equals(TikaConfig.NODE_TIKASERVER) || node.getType().equals(TikaConfig.NODE_TIKAHOSTNAME)
-            || node.getType().equals(TikaConfig.NODE_TIKAPORT) || node.getType().equals(TikaConfig.NODE_TIKARETRY))
-          os.removeChild(i);
-        else
-          i++;
-      }
-
-      SpecificationNode node = new SpecificationNode(TikaConfig.NODE_TIKASERVER);
-      String tikaServer = variableContext.getParameter(seqPrefix + "tikaserver");
-      if (tikaServer != null) {
-        node.setAttribute(TikaConfig.ATTRIBUTE_VALUE, tikaServer);
-      } else {
-        node.setAttribute(TikaConfig.ATTRIBUTE_VALUE, "false");
-      }
-      // Add the new tikaserver config parameter
-      os.addChild(os.getChildCount(), node);
-
-      SpecificationNode node2 = new SpecificationNode(TikaConfig.NODE_TIKAHOSTNAME);
-      String tikaHostname = variableContext.getParameter(seqPrefix + "tikahostname");
-      if (tikaHostname != null) {
-        node2.setAttribute(TikaConfig.ATTRIBUTE_VALUE, tikaHostname);
-      } else {
-        node2.setAttribute(TikaConfig.ATTRIBUTE_VALUE, "");
-      }
-      // Add the new tikahostname config parameter
-      os.addChild(os.getChildCount(), node2);
-
-      SpecificationNode node3 = new SpecificationNode(TikaConfig.NODE_TIKAPORT);
-      String tikaPort = variableContext.getParameter(seqPrefix + "tikaport");
-      if (tikaPort != null) {
-        node3.setAttribute(TikaConfig.ATTRIBUTE_VALUE, tikaPort);
-      } else {
-        node3.setAttribute(TikaConfig.ATTRIBUTE_VALUE, "");
-      }
-      // Add the new tikaport config parameter
-      os.addChild(os.getChildCount(), node3);
-
-      SpecificationNode node4 = new SpecificationNode(TikaConfig.NODE_TIKARETRY);
-      String tikaRetry = variableContext.getParameter(seqPrefix + "tikaretry");
-      if (tikaRetry != null) {
-        node4.setAttribute(TikaConfig.ATTRIBUTE_VALUE, tikaRetry);
-      } else {
-        node4.setAttribute(TikaConfig.ATTRIBUTE_VALUE, "");
-      }
-      // Add the new tikaport config parameter
-      os.addChild(os.getChildCount(), node4);
-    }
-
     return null;
   }
 
@@ -1032,29 +984,6 @@ s  /** Connect.
 
   }
 
-  protected static void fillInTikaTypeSpecificationMap(Map<String, Object> paramMap, Specification os) {
-    String tikaServer = "false";
-    String tikaHostname = TikaConfig.TIKAHOSTNAME_DEFAULT;
-    String tikaPort = String.valueOf(TikaConfig.TIKAPORT_DEFAULT);
-    String tikaRetry = String.valueOf(TikaConfig.TIKARETRY_DEFAULT);
-    for (int i = 0; i < os.getChildCount(); i++) {
-      SpecificationNode sn = os.getChild(i);
-      if (sn.getType().equals(TikaConfig.NODE_TIKASERVER)) {
-        tikaServer = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
-      } else if (sn.getType().equals(TikaConfig.NODE_TIKAHOSTNAME)) {
-        tikaHostname = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
-      } else if (sn.getType().equals(TikaConfig.NODE_TIKAPORT)) {
-        tikaPort = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
-      } else if (sn.getType().equals(TikaConfig.NODE_TIKARETRY)) {
-        tikaRetry = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
-      }
-    }
-    paramMap.put("TIKASERVER", tikaServer);
-    paramMap.put("TIKAHOSTNAME", tikaHostname);
-    paramMap.put("TIKAPORT", tikaPort);
-    paramMap.put("TIKARETRY", tikaRetry);
-  }
-
   protected static void fillInFieldMappingSpecificationMap(Map<String, Object> paramMap, Specification os) {
     // Prep for field mappings
     List<Map<String, String>> fieldMappings = new ArrayList<Map<String, String>>();
@@ -1102,13 +1031,6 @@ s  /** Connect.
     paramMap.put("IGNORETIKAEXCEPTIONS", ignoreTikaExceptions);
   }
 
-  protected static int handleTikaException(TikaException e)
-      throws IOException, ManifoldCFException, ServiceInterruption {
-    // MHL - what does Tika throw if it gets an IOException reading the stream??
-    Logging.ingest.warn("Tika: Tika exception extracting: " + e.getMessage(), e);
-    return DOCUMENTSTATUS_REJECTED;
-  }
-
   protected static int handleTikaServerRejects(String reason)
       throws IOException, ManifoldCFException, ServiceInterruption {
     // MHL - what does Tika throw if it gets an IOException reading the stream??
@@ -1137,12 +1059,6 @@ s  /** Connect.
     return DOCUMENTSTATUS_REJECTED;
   }
 
-  protected static int handleSaxException(SAXException e) throws IOException, ManifoldCFException, ServiceInterruption {
-    // MHL - what does this mean?
-    Logging.ingest.warn("Tika: SAX exception extracting: " + e.getMessage(), e);
-    return DOCUMENTSTATUS_REJECTED;
-  }
-
   protected static int handleIOException(IOException e) throws ManifoldCFException {
     // IOException reading from our local storage...
     if (e instanceof InterruptedIOException)
@@ -1310,47 +1226,12 @@ s  /** Connect.
         } else if (sn.getType().equals(TikaConfig.NODE_IGNORETIKAEXCEPTION)) {
           String value = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
           ignoreTikaException = Boolean.parseBoolean(value);
-        } else if (sn.getType().equals(TikaConfig.NODE_BOILERPLATEPROCESSOR)) {
-          extractorClassName = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
-        } else if (sn.getType().equals(TikaConfig.NODE_TIKAHOSTNAME)) {
-          String value = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
-          if (value.length() == 0) {
-            tikaHostname = TikaConfig.TIKAHOSTNAME_DEFAULT;
-          } else {
-            tikaHostname = value;
-          }
-        } else if (sn.getType().equals(TikaConfig.NODE_TIKAPORT)) {
-          String value = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
-          if (value.length() == 0) {
-            tikaPort = TikaConfig.TIKAPORT_DEFAULT;
-          } else {
-            tikaPort = Integer.parseInt(value);
-          }
-        } else if (sn.getType().equals(TikaConfig.NODE_TIKASERVER)) {
-          String value = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
-          if (value.length() == 0) {
-            tikaServer = false;
-          } else {
-            tikaServer = Boolean.parseBoolean(value);
-          }
-        } else if (sn.getType().equals(TikaConfig.NODE_TIKARETRY)) {
-          String value = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
-          if (value.length() == 0) {
-            tikaRetry = TikaConfig.TIKARETRY_DEFAULT;
-          } else {
-            tikaRetry = Long.parseLong(value);
-          }
         }
       }
       this.keepAllMetadata = keepAllMetadata;
       this.lowerNames = lowerNames;
       this.writeLimit = writeLimit;
       this.ignoreTikaException = ignoreTikaException;
-      this.extractorClassName = extractorClassName;
-      this.tikaHostname = tikaHostname;
-      this.tikaPort = tikaPort;
-      this.tikaServer = tikaServer;
-      this.tikaRetry = tikaRetry;
     }
 
     public String toPackedString() {
@@ -1397,12 +1278,6 @@ s  /** Connect.
       else
         sb.append('-');
 
-      if (extractorClassName != null) {
-        sb.append('+');
-        sb.append(extractorClassName);
-      } else
-        sb.append('-');
-
       return sb.toString();
     }
 
@@ -1426,23 +1301,6 @@ s  /** Connect.
       return ignoreTikaException;
     }
 
-    public BoilerpipeExtractor getExtractorClassInstance() throws ManifoldCFException {
-      if (extractorClassName == null)
-        return null;
-      try {
-        ClassLoader loader = BoilerpipeExtractor.class.getClassLoader();
-        Class extractorClass = loader.loadClass(extractorClassName);
-        java.lang.reflect.Field f = extractorClass.getField("INSTANCE");
-        return (BoilerpipeExtractor) f.get(null);
-      } catch (ClassNotFoundException e) {
-        throw new ManifoldCFException(
-            "Boilerpipe extractor class '" + extractorClassName + "' not found: " + e.getMessage(), e);
-      } catch (Exception e) {
-        throw new ManifoldCFException(
-            "Boilerpipe extractor class '" + extractorClassName + "' exception on instantiation: " + e.getMessage(), e);
-      }
-    }
-
   }
 
 }