You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2014/06/25 14:10:00 UTC

svn commit: r1605363 - in /manifoldcf/branches/CONNECTORS-981/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr: HttpPoster.java SolrConnector.java

Author: kwright
Date: Wed Jun 25 12:10:00 2014
New Revision: 1605363

URL: http://svn.apache.org/r1605363
Log:
HttpPoster now complete, given SOLR-6199

Modified:
    manifoldcf/branches/CONNECTORS-981/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/HttpPoster.java
    manifoldcf/branches/CONNECTORS-981/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/SolrConnector.java

Modified: manifoldcf/branches/CONNECTORS-981/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/HttpPoster.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-981/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/HttpPoster.java?rev=1605363&r1=1605362&r2=1605363&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-981/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/HttpPoster.java (original)
+++ manifoldcf/branches/CONNECTORS-981/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/HttpPoster.java Wed Jun 25 12:10:00 2014
@@ -36,6 +36,7 @@ import java.net.MalformedURLException;
 import java.util.*;
 import java.util.regex.*;
 
+import org.apache.http.Consts;
 import org.apache.http.auth.AuthScope;
 import org.apache.http.auth.Credentials;
 import org.apache.http.auth.UsernamePasswordCredentials;
@@ -83,25 +84,29 @@ public class HttpPoster
   protected SolrServer solrServer = null;
   
   // Action URI pieces
-  private String postUpdateAction;
-  private String postRemoveAction;
-  private String postStatusAction;
+  private final String postUpdateAction;
+  private final String postRemoveAction;
+  private final String postStatusAction;
   
   // Attribute names
-  private String allowAttributeName;
-  private String denyAttributeName;
-  private String idAttributeName;
-  private String modifiedDateAttributeName;
-  private String createdDateAttributeName;
-  private String indexedDateAttributeName;
-  private String fileNameAttributeName;
-  private String mimeTypeAttributeName;
+  private final String allowAttributeName;
+  private final String denyAttributeName;
+  private final String idAttributeName;
+  private final String modifiedDateAttributeName;
+  private final String createdDateAttributeName;
+  private final String indexedDateAttributeName;
+  private final String fileNameAttributeName;
+  private final String mimeTypeAttributeName;
+  private final String contentAttributeName;
+  
+  // Whether we use extract/update handler or not
+  private final boolean useExtractUpdateHandler;
   
   // Document max length
-  private Long maxDocumentLength;
+  private final Long maxDocumentLength;
 
   // Commit-within flag
-  private String commitWithin;
+  private final String commitWithin;
 
   // Constants we need
   private static final String LITERAL = "literal.";
@@ -119,9 +124,9 @@ public class HttpPoster
     String updatePath, String removePath, String statusPath,
     String allowAttributeName, String denyAttributeName, String idAttributeName,
     String modifiedDateAttributeName, String createdDateAttributeName, String indexedDateAttributeName,
-    String fileNameAttributeName, String mimeTypeAttributeName,
+    String fileNameAttributeName, String mimeTypeAttributeName, String contentAttributeName,
     Long maxDocumentLength,
-    String commitWithin)
+    String commitWithin, boolean useExtractUpdateHandler)
     throws ManifoldCFException
   {
     // These are the paths to the handlers in Solr that deal with the actions we need to do
@@ -139,6 +144,8 @@ public class HttpPoster
     this.indexedDateAttributeName = indexedDateAttributeName;
     this.fileNameAttributeName = fileNameAttributeName;
     this.mimeTypeAttributeName = mimeTypeAttributeName;
+    this.contentAttributeName = contentAttributeName;
+    this.useExtractUpdateHandler = useExtractUpdateHandler;
     
     this.maxDocumentLength = maxDocumentLength;
     
@@ -165,9 +172,9 @@ public class HttpPoster
     String realm, String userID, String password,
     String allowAttributeName, String denyAttributeName, String idAttributeName,
     String modifiedDateAttributeName, String createdDateAttributeName, String indexedDateAttributeName,
-    String fileNameAttributeName, String mimeTypeAttributeName,
+    String fileNameAttributeName, String mimeTypeAttributeName, String contentAttributeName,
     IKeystoreManager keystoreManager, Long maxDocumentLength,
-    String commitWithin)
+    String commitWithin, boolean useExtractUpdateHandler)
     throws ManifoldCFException
   {
     // These are the paths to the handlers in Solr that deal with the actions we need to do
@@ -185,6 +192,8 @@ public class HttpPoster
     this.indexedDateAttributeName = indexedDateAttributeName;
     this.fileNameAttributeName = fileNameAttributeName;
     this.mimeTypeAttributeName = mimeTypeAttributeName;
+    this.contentAttributeName = contentAttributeName;
+    this.useExtractUpdateHandler = useExtractUpdateHandler;
     
     this.maxDocumentLength = maxDocumentLength;
 
@@ -507,14 +516,13 @@ public class HttpPoster
    * @param document is the document structure to ingest.
    * @param arguments are the configuration arguments to pass in the post.  Key is argument name, value is a list of the argument values.
    * @param keepAllMetadata
-   * @param useExtractUpdateHandler is true if the extract update handler should be used.
    * @param authorityNameString is the name of the governing authority for this document's acls, or null if none.
    * @param activities is the activities object, so we can report what's happening.   @return true if the ingestion was successful, or false if the ingestion is illegal.
   * @throws ManifoldCFException, ServiceInterruption
   */
   public boolean indexPost(String documentURI,
     RepositoryDocument document, Map arguments, Map<String, List<String>> sourceTargets,
-    boolean keepAllMetadata, boolean useExtractUpdateHandler, String authorityNameString, IOutputAddActivity activities)
+    boolean keepAllMetadata, String authorityNameString, IOutputAddActivity activities)
     throws ManifoldCFException, ServiceInterruption
   {
     if (Logging.ingest.isDebugEnabled())
@@ -546,7 +554,7 @@ public class HttpPoster
     try
     {
       IngestThread t = new IngestThread(documentURI,document,arguments,keepAllMetadata,sourceTargets,
-                                        aclsMap,denyAclsMap,commitWithin,useExtractUpdateHandler);
+                                        aclsMap,denyAclsMap);
       try
       {
         t.start();
@@ -831,9 +839,7 @@ public class HttpPoster
     protected final Map<String,List<String>> sourceTargets;
     protected final Map<String,String[]> aclsMap;
     protected final Map<String,String[]> denyAclsMap;
-    protected final String commitWithin;
     protected final boolean keepAllMetadata;
-    protected final boolean useExtractUpdateHandler;
     
     protected Long activityStart = null;
     protected Long activityBytes = null;
@@ -845,8 +851,7 @@ public class HttpPoster
 
     public IngestThread(String documentURI, RepositoryDocument document,
       Map<String, List<String>> arguments, boolean keepAllMetadata, Map<String, List<String>> sourceTargets,
-      Map<String,String[]> aclsMap, Map<String,String[]> denyAclsMap,
-      String commitWithin, boolean useExtractUpdateHandler)
+      Map<String,String[]> aclsMap, Map<String,String[]> denyAclsMap)
     {
       super();
       setDaemon(true);
@@ -856,9 +861,7 @@ public class HttpPoster
       this.aclsMap = aclsMap;
       this.denyAclsMap = denyAclsMap;
       this.sourceTargets = sourceTargets;
-      this.commitWithin = commitWithin;
       this.keepAllMetadata=keepAllMetadata;
-      this.useExtractUpdateHandler=useExtractUpdateHandler;
     }
 
     public void run()
@@ -885,7 +888,7 @@ public class HttpPoster
           }
           else
           {
-            currentSolrDoc = buildSolrDocument();
+            currentSolrDoc = buildSolrDocument( length, is );
           }
 
           // Fire off the request.
@@ -968,13 +971,31 @@ public class HttpPoster
       }
     }
 
-    private SolrInputDocument buildSolrDocument( )
+    private SolrInputDocument buildSolrDocument( long length, InputStream is )
       throws IOException
     {
       SolrInputDocument outputDoc = new SolrInputDocument();
 
       // Write the id field
       outputDoc.addField( idAttributeName, documentURI );
+      
+      if (contentAttributeName != null)
+      {
+        // Copy the content into a string.  This is a bad thing to do, but we have no choice given SolrJ architecture at this time.
+        // We enforce a size limit upstream.
+        Reader r = new InputStreamReader(is, Consts.UTF_8);
+        StringBuilder sb = new StringBuilder((int)length);
+        char[] buffer = new char[65536];
+        while (true)
+        {
+          int amt = r.read(buffer,0,buffer.length);
+          if (amt == -1)
+            break;
+          sb.append(buffer,0,amt);
+        }
+        outputDoc.addField( contentAttributeName, sb.toString() );
+      }
+      
       // Write the rest of the attributes
       if ( modifiedDateAttributeName != null )
       {

Modified: manifoldcf/branches/CONNECTORS-981/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/SolrConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-981/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/SolrConnector.java?rev=1605363&r1=1605362&r2=1605363&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-981/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/SolrConnector.java (original)
+++ manifoldcf/branches/CONNECTORS-981/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/SolrConnector.java Wed Jun 25 12:10:00 2014
@@ -207,6 +207,9 @@ public class SolrConnector extends org.a
       if (mimeTypeAttributeName == null || mimeTypeAttributeName.length() == 0)
         mimeTypeAttributeName = null;
 
+      String contentAttributeName = "content";	// ??? -- should be settable
+      boolean useExtractUpdateHandler = true;   // ???
+      
       String commits = params.getParameter(SolrConfig.PARAM_COMMITS);
       if (commits == null || commits.length() == 0)
         commits = "true";
@@ -313,8 +316,8 @@ public class SolrConnector extends org.a
             updatePath,removePath,statusPath,realm,userID,password,
             allowAttributeName,denyAttributeName,idAttributeName,
             modifiedDateAttributeName,createdDateAttributeName,indexedDateAttributeName,
-            fileNameAttributeName,mimeTypeAttributeName,
-            keystoreManager,maxDocumentLength,commitWithin);
+            fileNameAttributeName,mimeTypeAttributeName,contentAttributeName,
+            keystoreManager,maxDocumentLength,commitWithin,useExtractUpdateHandler);
           
         }
         catch (NumberFormatException e)
@@ -368,8 +371,8 @@ public class SolrConnector extends org.a
             updatePath,removePath,statusPath,
             allowAttributeName,denyAttributeName,idAttributeName,
             modifiedDateAttributeName,createdDateAttributeName,indexedDateAttributeName,
-            fileNameAttributeName,mimeTypeAttributeName,
-            maxDocumentLength,commitWithin);
+            fileNameAttributeName,mimeTypeAttributeName,contentAttributeName,
+            maxDocumentLength,commitWithin,useExtractUpdateHandler);
           
         }
         catch (NumberFormatException e)
@@ -710,7 +713,7 @@ public class SolrConnector extends org.a
     getSession();
 
     // Now, go off and call the ingest API.
-    if (poster.indexPost(documentURI,document,args,sourceTargets,keepAllMetadata,true,authorityNameString,activities))
+    if (poster.indexPost(documentURI,document,args,sourceTargets,keepAllMetadata,authorityNameString,activities))
       return DOCUMENTSTATUS_ACCEPTED;
     return DOCUMENTSTATUS_REJECTED;
   }