You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2014/06/24 14:49:29 UTC

svn commit: r1605070 - in /manifoldcf/branches/CONNECTORS-981/connectors/solr/connector/src/main: java/org/apache/manifoldcf/agents/output/solr/ native2ascii/org/apache/manifoldcf/agents/output/solr/

Author: kwright
Date: Tue Jun 24 12:49:29 2014
New Revision: 1605070

URL: http://svn.apache.org/r1605070
Log:
Partial commit of Alessandro's patch, including HttpPoster put into a form where a diff is actually possible.

Modified:
    manifoldcf/branches/CONNECTORS-981/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/HttpPoster.java
    manifoldcf/branches/CONNECTORS-981/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/SolrConfig.java
    manifoldcf/branches/CONNECTORS-981/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/SolrConnector.java
    manifoldcf/branches/CONNECTORS-981/connectors/solr/connector/src/main/native2ascii/org/apache/manifoldcf/agents/output/solr/common_en_US.properties

Modified: manifoldcf/branches/CONNECTORS-981/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/HttpPoster.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-981/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/HttpPoster.java?rev=1605070&r1=1605069&r2=1605070&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-981/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/HttpPoster.java (original)
+++ manifoldcf/branches/CONNECTORS-981/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/HttpPoster.java Tue Jun 24 12:49:29 2014
@@ -55,6 +55,7 @@ import org.apache.solr.common.params.Mod
 import org.apache.solr.common.util.ContentStream;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.client.solrj.impl.HttpClientUtil;
+import org.apache.solr.common.SolrInputDocument;
 
 
 /**
@@ -506,13 +507,14 @@ public class HttpPoster
    * @param document is the document structure to ingest.
    * @param arguments are the configuration arguments to pass in the post.  Key is argument name, value is a list of the argument values.
    * @param keepAllMetadata
-   *@param authorityNameString is the name of the governing authority for this document's acls, or null if none.
+   * @param useExtractUpdateHandler is true if the extract update handler should be used.
+   * @param authorityNameString is the name of the governing authority for this document's acls, or null if none.
    * @param activities is the activities object, so we can report what's happening.   @return true if the ingestion was successful, or false if the ingestion is illegal.
   * @throws ManifoldCFException, ServiceInterruption
   */
   public boolean indexPost(String documentURI,
     RepositoryDocument document, Map arguments, Map<String, List<String>> sourceTargets,
-    boolean keepAllMetadata, String authorityNameString, IOutputAddActivity activities)
+    boolean keepAllMetadata, boolean useExtractUpdateHandler, String authorityNameString, IOutputAddActivity activities)
     throws ManifoldCFException, ServiceInterruption
   {
     if (Logging.ingest.isDebugEnabled())
@@ -544,7 +546,7 @@ public class HttpPoster
     try
     {
       IngestThread t = new IngestThread(documentURI,document,arguments,keepAllMetadata,sourceTargets,
-                                        aclsMap,denyAclsMap,commitWithin);
+                                        aclsMap,denyAclsMap,commitWithin,useExtractUpdateHandler);
       try
       {
         t.start();
@@ -803,6 +805,18 @@ public class HttpPoster
     }
   }
   
+  /**
+    * Add the allow and deny tokens of one ACL type to a SolrInputDocument, in fields named by appending aclType to the allow/deny attribute names.
+    */
+  protected void writeACLsInSolrDoc( SolrInputDocument inputDoc, String aclType, String[] acl, String[] denyAcl )
+  {
+    String metadataACLName = allowAttributeName + aclType;
+    inputDoc.addField( metadataACLName, acl );
+
+    String metadataDenyACLName = denyAttributeName + aclType;
+    inputDoc.addField( metadataDenyACLName, denyAcl );
+  }
+
   /** Killable thread that does ingestions.
   * Java 1.5 stopped permitting thread interruptions to abort socket waits.  As a result, it is impossible to get threads to shutdown cleanly that are doing
   * such waits.  So, the places where this happens are segregated in their own threads so that they can be just abandoned.
@@ -819,6 +833,7 @@ public class HttpPoster
     protected final Map<String,String[]> denyAclsMap;
     protected final String commitWithin;
     protected final boolean keepAllMetadata;
+    protected final boolean useExtractUpdateHandler;
     
     protected Long activityStart = null;
     protected Long activityBytes = null;
@@ -831,7 +846,7 @@ public class HttpPoster
     public IngestThread(String documentURI, RepositoryDocument document,
       Map<String, List<String>> arguments, boolean keepAllMetadata, Map<String, List<String>> sourceTargets,
       Map<String,String[]> aclsMap, Map<String,String[]> denyAclsMap,
-      String commitWithin)
+      String commitWithin, boolean useExtractUpdateHandler)
     {
       super();
       setDaemon(true);
@@ -843,6 +858,7 @@ public class HttpPoster
       this.sourceTargets = sourceTargets;
       this.commitWithin = commitWithin;
       this.keepAllMetadata=keepAllMetadata;
+      this.useExtractUpdateHandler=useExtractUpdateHandler;
     }
 
     public void run()
@@ -860,90 +876,34 @@ public class HttpPoster
         // Open a socket to ingest, and to the response stream to get the post result
         try
         {
+          SolrInputDocument currentSolrDoc = new SolrInputDocument();
           ContentStreamUpdateRequest contentStreamUpdateRequest = new ContentStreamUpdateRequest(postUpdateAction);
-          
-          ModifiableSolrParams out = new ModifiableSolrParams();
-          
-          // Write the id field
-          writeField(out,LITERAL+idAttributeName,documentURI);
-          // Write the rest of the attributes
-          if (modifiedDateAttributeName != null)
-          {
-            Date date = document.getModifiedDate();
-            if (date != null)
-              // Write value
-              writeField(out,LITERAL+modifiedDateAttributeName,DateParser.formatISO8601Date(date));
-          }
-          if (createdDateAttributeName != null)
-          {
-            Date date = document.getCreatedDate();
-            if (date != null)
-              // Write value
-              writeField(out,LITERAL+createdDateAttributeName,DateParser.formatISO8601Date(date));
-          }
-          if (indexedDateAttributeName != null)
-          {
-            Date date = document.getIndexingDate();
-            if (date != null)
-              // Write value
-              writeField(out,LITERAL+indexedDateAttributeName,DateParser.formatISO8601Date(date));
-          }
-          if (fileNameAttributeName != null)
-          {
-            String fileName = document.getFileName();
-            if (fileName != null)
-              writeField(out,LITERAL+fileNameAttributeName,fileName);
-          }
-          if (mimeTypeAttributeName != null)
+          if ( useExtractUpdateHandler )
           {
-            String mimeType = document.getMimeType();
-            if (mimeType != null)
-              writeField(out,LITERAL+mimeTypeAttributeName,mimeType);
+            buildExtractUpdateHandlerRequest( length, is, contentType, contentName,
+              contentStreamUpdateRequest );
           }
-          
-          // Write the access token information
-          // Both maps have the same keys.
-          Iterator<String> typeIterator = aclsMap.keySet().iterator();
-          while (typeIterator.hasNext())
-          {
-            String aclType = typeIterator.next();
-            writeACLs(out,aclType,aclsMap.get(aclType),denyAclsMap.get(aclType));
-          }
-
-          // Write the arguments
-          for (String name : arguments.keySet())
+          else
           {
-            List<String> values = arguments.get(name);
-            writeField(out,name,values);
+            currentSolrDoc = buildSolrDocument();
           }
 
-          // Write the metadata, each in a field by itself
-           buildSolrParamsFromMetadata(out);
-             
-          // These are unnecessary now in the case of non-solrcloud setups, because we overrode the SolrJ posting method to use multipart.
-          //writeField(out,LITERAL+"stream_size",String.valueOf(length));
-          //writeField(out,LITERAL+"stream_name",document.getFileName());
-          
-          // General hint for Tika
-          if (document.getFileName() != null)
-            writeField(out,"resource.name",document.getFileName());
-          
-          // Write the commitWithin parameter
-          if (commitWithin != null)
-            writeField(out,COMMITWITHIN_METADATA,commitWithin);
-
-          contentStreamUpdateRequest.setParams(out);
-          
-          contentStreamUpdateRequest.addContentStream(new RepositoryDocumentStream(is,length,contentType,contentName));
-
           // Fire off the request.
           // Note: I need to know whether the document has been permanently rejected or not, but we currently have
           // no means to determine that.  Analysis of SolrServerExceptions that have been thrown is likely needed.
           try
           {
             readFromDocumentStreamYet = true;
-            UpdateResponse response = contentStreamUpdateRequest.process(solrServer);
-            
+            UpdateResponse response;
+            if ( useExtractUpdateHandler )
+            {
+              response = contentStreamUpdateRequest.process( solrServer );
+            }
+            else
+            {
+              response = solrServer.add( currentSolrDoc );
+            }
+
             // Successful completion
             activityStart = new Long(fullStartTime);
             activityBytes = new Long(length);
@@ -1008,6 +968,156 @@ public class HttpPoster
       }
     }
 
+    private SolrInputDocument buildSolrDocument( )
+      throws IOException
+    {
+      SolrInputDocument outputDoc = new SolrInputDocument();
+
+      // Write the id field
+      outputDoc.addField( idAttributeName, documentURI );
+      // Write the rest of the attributes
+      if ( modifiedDateAttributeName != null )
+      {
+        Date date = document.getModifiedDate();
+        if ( date != null )
+        {
+          outputDoc.addField( modifiedDateAttributeName, DateParser.formatISO8601Date( date ) );
+        }
+      }
+      if ( createdDateAttributeName != null )
+      {
+        Date date = document.getCreatedDate();
+        if ( date != null )
+        {
+          outputDoc.addField( createdDateAttributeName, DateParser.formatISO8601Date( date ) );
+        }
+
+      }
+      if ( indexedDateAttributeName != null )
+      {
+        Date date = document.getIndexingDate();
+        if ( date != null )
+        {
+          outputDoc.addField( indexedDateAttributeName, DateParser.formatISO8601Date( date ) );
+        }
+      }
+      if ( fileNameAttributeName != null )
+      {
+        String fileName = document.getFileName();
+        if ( fileName != null )
+        {
+          outputDoc.addField( fileNameAttributeName, fileName );
+        }
+      }
+      if ( mimeTypeAttributeName != null )
+      {
+        String mimeType = document.getMimeType();
+        if ( mimeType != null )
+        {
+          outputDoc.addField( mimeTypeAttributeName, mimeType );
+        }
+      }
+
+      Iterator<String> typeIterator = aclsMap.keySet().iterator();
+      while (typeIterator.hasNext())
+      {
+        String aclType = typeIterator.next();
+        writeACLsInSolrDoc(outputDoc,aclType,aclsMap.get(aclType),denyAclsMap.get(aclType));
+      }
+
+      // Write the arguments
+      for ( String name : arguments.keySet() )
+      {
+        List<String> values = arguments.get( name );
+        outputDoc.addField( name, values );
+      }
+
+      // Write the metadata, each in a field by itself
+      buildSolrParamsFromMetadata( outputDoc );
+
+      return outputDoc;
+    }
+
+    private void buildExtractUpdateHandlerRequest( long length, InputStream is, String contentType,
+      String contentName,
+      ContentStreamUpdateRequest contentStreamUpdateRequest )
+      throws IOException
+    {
+      ModifiableSolrParams out = new ModifiableSolrParams();
+          
+      // Write the id field
+      writeField(out,LITERAL+idAttributeName,documentURI);
+      // Write the rest of the attributes
+      if (modifiedDateAttributeName != null)
+      {
+        Date date = document.getModifiedDate();
+        if (date != null)
+          // Write value
+          writeField(out,LITERAL+modifiedDateAttributeName,DateParser.formatISO8601Date(date));
+      }
+      if (createdDateAttributeName != null)
+      {
+        Date date = document.getCreatedDate();
+        if (date != null)
+          // Write value
+          writeField(out,LITERAL+createdDateAttributeName,DateParser.formatISO8601Date(date));
+      }
+      if (indexedDateAttributeName != null)
+      {
+        Date date = document.getIndexingDate();
+        if (date != null)
+          // Write value
+          writeField(out,LITERAL+indexedDateAttributeName,DateParser.formatISO8601Date(date));
+      }
+      if (fileNameAttributeName != null)
+      {
+        String fileName = document.getFileName();
+        if (fileName != null)
+          writeField(out,LITERAL+fileNameAttributeName,fileName);
+      }
+      if (mimeTypeAttributeName != null)
+      {
+        String mimeType = document.getMimeType();
+        if (mimeType != null)
+          writeField(out,LITERAL+mimeTypeAttributeName,mimeType);
+      }
+          
+      // Write the access token information
+      // Both maps have the same keys.
+      Iterator<String> typeIterator = aclsMap.keySet().iterator();
+      while (typeIterator.hasNext())
+      {
+        String aclType = typeIterator.next();
+        writeACLs(out,aclType,aclsMap.get(aclType),denyAclsMap.get(aclType));
+      }
+
+      // Write the arguments
+      for (String name : arguments.keySet())
+      {
+        List<String> values = arguments.get(name);
+        writeField(out,name,values);
+      }
+
+      // Write the metadata, each in a field by itself
+      buildSolrParamsFromMetadata(out);
+             
+      // These are unnecessary now in the case of non-solrcloud setups, because we overrode the SolrJ posting method to use multipart.
+      //writeField(out,LITERAL+"stream_size",String.valueOf(length));
+      //writeField(out,LITERAL+"stream_name",document.getFileName());
+          
+      // General hint for Tika
+      if (document.getFileName() != null)
+        writeField(out,"resource.name",document.getFileName());
+          
+      // Write the commitWithin parameter
+      if (commitWithin != null)
+        writeField(out,COMMITWITHIN_METADATA,commitWithin);
+
+      contentStreamUpdateRequest.setParams(out);
+          
+      contentStreamUpdateRequest.addContentStream(new RepositoryDocumentStream(is,length,contentType,contentName));
+    }
+
     /**
       * builds the solr parameter maps for the update request.
       * For each mapping expressed is applied the renaming for the metadata field name.
@@ -1044,6 +1154,34 @@ public class HttpPoster
       }
     }
 
+    private void buildSolrParamsFromMetadata(SolrInputDocument outputDocument) throws IOException
+    {
+      if (this.keepAllMetadata)
+      {
+        Iterator<String> iter = document.getFields();
+        while (iter.hasNext())
+        {
+          String fieldName = iter.next();
+          List<String> mappings = sourceTargets.get(fieldName);
+          if (mappings != null)
+            for (String newFieldName : mappings)
+              applySingleMapping(fieldName, outputDocument, newFieldName);
+          else // no mapping for this field: because keepAllMetadata is true, add it under its original name.
+            applySingleMapping(fieldName, outputDocument, fieldName);
+        }
+      }
+      else
+      {
+        // keepAllMetadata is false: add only the fields that have a mapping in sourceTargets
+        for (String originalFieldName : sourceTargets.keySet())
+        {
+          List<String> mapping = sourceTargets.get(originalFieldName);
+          for (String newFieldName : mapping)
+            applySingleMapping(originalFieldName, outputDocument, newFieldName);
+        }
+      }
+    }
+
     private void applySingleMapping(String originalFieldName, ModifiableSolrParams out, String newFieldName) throws IOException {
       if(newFieldName != null && !newFieldName.isEmpty()) {
         if (newFieldName.toLowerCase(Locale.ROOT).equals(idAttributeName.toLowerCase(Locale.ROOT))) {
@@ -1054,6 +1192,16 @@ public class HttpPoster
       }
     }
 
+    private void applySingleMapping(String originalFieldName, SolrInputDocument outputDocument, String newFieldName) throws IOException {
+      if(newFieldName != null && !newFieldName.isEmpty()) {
+        if (newFieldName.toLowerCase(Locale.ROOT).equals(idAttributeName.toLowerCase(Locale.ROOT))) {
+          newFieldName = ID_METADATA;
+        }
+        String[] values = document.getFieldAsStrings(originalFieldName);
+        outputDocument.addField( newFieldName, values );
+      }
+    }
+
     public void finishUp()
       throws InterruptedException, SolrServerException, IOException
     {

Modified: manifoldcf/branches/CONNECTORS-981/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/SolrConfig.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-981/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/SolrConfig.java?rev=1605070&r1=1605069&r2=1605070&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-981/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/SolrConfig.java (original)
+++ manifoldcf/branches/CONNECTORS-981/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/SolrConfig.java Tue Jun 24 12:49:29 2014
@@ -132,4 +132,9 @@ public class SolrConfig
    */
   public static final String NODE_KEEPMETADATA = "keepAllMetadata";
 
+    /**
+     * Node specifying whether the Extract Update handler is used
+     */
+    public static final String NODE_EXTRACTUPDATE = "useExtractUpdateHandler";
+
 }

Modified: manifoldcf/branches/CONNECTORS-981/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/SolrConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-981/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/SolrConnector.java?rev=1605070&r1=1605069&r2=1605070&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-981/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/SolrConnector.java (original)
+++ manifoldcf/branches/CONNECTORS-981/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/SolrConnector.java Tue Jun 24 12:49:29 2014
@@ -710,7 +710,7 @@ public class SolrConnector extends org.a
     getSession();
 
     // Now, go off and call the ingest API.
-    if (poster.indexPost(documentURI,document,args,sourceTargets,keepAllMetadata,authorityNameString,activities))
+    if (poster.indexPost(documentURI,document,args,sourceTargets,keepAllMetadata,true,authorityNameString,activities))
       return DOCUMENTSTATUS_ACCEPTED;
     return DOCUMENTSTATUS_REJECTED;
   }

Modified: manifoldcf/branches/CONNECTORS-981/connectors/solr/connector/src/main/native2ascii/org/apache/manifoldcf/agents/output/solr/common_en_US.properties
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-981/connectors/solr/connector/src/main/native2ascii/org/apache/manifoldcf/agents/output/solr/common_en_US.properties?rev=1605070&r1=1605069&r2=1605070&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-981/connectors/solr/connector/src/main/native2ascii/org/apache/manifoldcf/agents/output/solr/common_en_US.properties (original)
+++ manifoldcf/branches/CONNECTORS-981/connectors/solr/connector/src/main/native2ascii/org/apache/manifoldcf/agents/output/solr/common_en_US.properties Tue Jun 24 12:49:29 2014
@@ -15,6 +15,7 @@
 
 SolrConnector.Arguments=Arguments
 SolrConnector.KeepAllMetadata=Keep all metadata:
+SolrConnector.UseExtractUpdateHandler=Use the Extract Update Handler:
 SolrConnector.Arguments2=Arguments:
 SolrConnector.Commits=Commits
 SolrConnector.Documents=Documents