You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2014/06/24 14:49:29 UTC
svn commit: r1605070 - in /manifoldcf/branches/CONNECTORS-981/connectors/solr/connector/src/main: java/org/apache/manifoldcf/agents/output/solr/ native2ascii/org/apache/manifoldcf/agents/output/solr/
Author: kwright
Date: Tue Jun 24 12:49:29 2014
New Revision: 1605070
URL: http://svn.apache.org/r1605070
Log:
Partial commit of Alessandro's patch, including HttpPoster put into a form where a diff is actually possible.
Modified:
manifoldcf/branches/CONNECTORS-981/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/HttpPoster.java
manifoldcf/branches/CONNECTORS-981/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/SolrConfig.java
manifoldcf/branches/CONNECTORS-981/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/SolrConnector.java
manifoldcf/branches/CONNECTORS-981/connectors/solr/connector/src/main/native2ascii/org/apache/manifoldcf/agents/output/solr/common_en_US.properties
Modified: manifoldcf/branches/CONNECTORS-981/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/HttpPoster.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-981/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/HttpPoster.java?rev=1605070&r1=1605069&r2=1605070&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-981/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/HttpPoster.java (original)
+++ manifoldcf/branches/CONNECTORS-981/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/HttpPoster.java Tue Jun 24 12:49:29 2014
@@ -55,6 +55,7 @@ import org.apache.solr.common.params.Mod
import org.apache.solr.common.util.ContentStream;
import org.apache.solr.common.SolrException;
import org.apache.solr.client.solrj.impl.HttpClientUtil;
+import org.apache.solr.common.SolrInputDocument;
/**
@@ -506,13 +507,14 @@ public class HttpPoster
* @param document is the document structure to ingest.
* @param arguments are the configuration arguments to pass in the post. Key is argument name, value is a list of the argument values.
* @param keepAllMetadata
- *@param authorityNameString is the name of the governing authority for this document's acls, or null if none.
+ * @param useExtractUpdateHandler is true if the extract update handler should be used.
+ * @param authorityNameString is the name of the governing authority for this document's acls, or null if none.
* @param activities is the activities object, so we can report what's happening. @return true if the ingestion was successful, or false if the ingestion is illegal.
* @throws ManifoldCFException, ServiceInterruption
*/
public boolean indexPost(String documentURI,
RepositoryDocument document, Map arguments, Map<String, List<String>> sourceTargets,
- boolean keepAllMetadata, String authorityNameString, IOutputAddActivity activities)
+ boolean keepAllMetadata, boolean useExtractUpdateHandler, String authorityNameString, IOutputAddActivity activities)
throws ManifoldCFException, ServiceInterruption
{
if (Logging.ingest.isDebugEnabled())
@@ -544,7 +546,7 @@ public class HttpPoster
try
{
IngestThread t = new IngestThread(documentURI,document,arguments,keepAllMetadata,sourceTargets,
- aclsMap,denyAclsMap,commitWithin);
+ aclsMap,denyAclsMap,commitWithin,useExtractUpdateHandler);
try
{
t.start();
@@ -803,6 +805,18 @@ public class HttpPoster
}
}
+ /**
+ * Output an acl level in a SolrInputDocument
+ */
+ protected void writeACLsInSolrDoc( SolrInputDocument inputDoc, String aclType, String[] acl, String[] denyAcl )
+ {
+ String metadataACLName = allowAttributeName + aclType;
+ inputDoc.addField( metadataACLName, acl );
+
+ String metadataDenyACLName = denyAttributeName + aclType;
+ inputDoc.addField( metadataDenyACLName, denyAcl );
+ }
+
/** Killable thread that does ingestions.
* Java 1.5 stopped permitting thread interruptions to abort socket waits. As a result, it is impossible to get threads to shutdown cleanly that are doing
* such waits. So, the places where this happens are segregated in their own threads so that they can be just abandoned.
@@ -819,6 +833,7 @@ public class HttpPoster
protected final Map<String,String[]> denyAclsMap;
protected final String commitWithin;
protected final boolean keepAllMetadata;
+ protected final boolean useExtractUpdateHandler;
protected Long activityStart = null;
protected Long activityBytes = null;
@@ -831,7 +846,7 @@ public class HttpPoster
public IngestThread(String documentURI, RepositoryDocument document,
Map<String, List<String>> arguments, boolean keepAllMetadata, Map<String, List<String>> sourceTargets,
Map<String,String[]> aclsMap, Map<String,String[]> denyAclsMap,
- String commitWithin)
+ String commitWithin, boolean useExtractUpdateHandler)
{
super();
setDaemon(true);
@@ -843,6 +858,7 @@ public class HttpPoster
this.sourceTargets = sourceTargets;
this.commitWithin = commitWithin;
this.keepAllMetadata=keepAllMetadata;
+ this.useExtractUpdateHandler=useExtractUpdateHandler;
}
public void run()
@@ -860,90 +876,34 @@ public class HttpPoster
// Open a socket to ingest, and to the response stream to get the post result
try
{
+ SolrInputDocument currentSolrDoc = new SolrInputDocument();
ContentStreamUpdateRequest contentStreamUpdateRequest = new ContentStreamUpdateRequest(postUpdateAction);
-
- ModifiableSolrParams out = new ModifiableSolrParams();
-
- // Write the id field
- writeField(out,LITERAL+idAttributeName,documentURI);
- // Write the rest of the attributes
- if (modifiedDateAttributeName != null)
- {
- Date date = document.getModifiedDate();
- if (date != null)
- // Write value
- writeField(out,LITERAL+modifiedDateAttributeName,DateParser.formatISO8601Date(date));
- }
- if (createdDateAttributeName != null)
- {
- Date date = document.getCreatedDate();
- if (date != null)
- // Write value
- writeField(out,LITERAL+createdDateAttributeName,DateParser.formatISO8601Date(date));
- }
- if (indexedDateAttributeName != null)
- {
- Date date = document.getIndexingDate();
- if (date != null)
- // Write value
- writeField(out,LITERAL+indexedDateAttributeName,DateParser.formatISO8601Date(date));
- }
- if (fileNameAttributeName != null)
- {
- String fileName = document.getFileName();
- if (fileName != null)
- writeField(out,LITERAL+fileNameAttributeName,fileName);
- }
- if (mimeTypeAttributeName != null)
+ if ( useExtractUpdateHandler )
{
- String mimeType = document.getMimeType();
- if (mimeType != null)
- writeField(out,LITERAL+mimeTypeAttributeName,mimeType);
+ buildExtractUpdateHandlerRequest( length, is, contentType, contentName,
+ contentStreamUpdateRequest );
}
-
- // Write the access token information
- // Both maps have the same keys.
- Iterator<String> typeIterator = aclsMap.keySet().iterator();
- while (typeIterator.hasNext())
- {
- String aclType = typeIterator.next();
- writeACLs(out,aclType,aclsMap.get(aclType),denyAclsMap.get(aclType));
- }
-
- // Write the arguments
- for (String name : arguments.keySet())
+ else
{
- List<String> values = arguments.get(name);
- writeField(out,name,values);
+ currentSolrDoc = buildSolrDocument();
}
- // Write the metadata, each in a field by itself
- buildSolrParamsFromMetadata(out);
-
- // These are unnecessary now in the case of non-solrcloud setups, because we overrode the SolrJ posting method to use multipart.
- //writeField(out,LITERAL+"stream_size",String.valueOf(length));
- //writeField(out,LITERAL+"stream_name",document.getFileName());
-
- // General hint for Tika
- if (document.getFileName() != null)
- writeField(out,"resource.name",document.getFileName());
-
- // Write the commitWithin parameter
- if (commitWithin != null)
- writeField(out,COMMITWITHIN_METADATA,commitWithin);
-
- contentStreamUpdateRequest.setParams(out);
-
- contentStreamUpdateRequest.addContentStream(new RepositoryDocumentStream(is,length,contentType,contentName));
-
// Fire off the request.
// Note: I need to know whether the document has been permanently rejected or not, but we currently have
// no means to determine that. Analysis of SolrServerExceptions that have been thrown is likely needed.
try
{
readFromDocumentStreamYet = true;
- UpdateResponse response = contentStreamUpdateRequest.process(solrServer);
-
+ UpdateResponse response;
+ if ( useExtractUpdateHandler )
+ {
+ response = contentStreamUpdateRequest.process( solrServer );
+ }
+ else
+ {
+ response = solrServer.add( currentSolrDoc );
+ }
+
// Successful completion
activityStart = new Long(fullStartTime);
activityBytes = new Long(length);
@@ -1008,6 +968,156 @@ public class HttpPoster
}
}
+ private SolrInputDocument buildSolrDocument( )
+ throws IOException
+ {
+ SolrInputDocument outputDoc = new SolrInputDocument();
+
+ // Write the id field
+ outputDoc.addField( idAttributeName, documentURI );
+ // Write the rest of the attributes
+ if ( modifiedDateAttributeName != null )
+ {
+ Date date = document.getModifiedDate();
+ if ( date != null )
+ {
+ outputDoc.addField( modifiedDateAttributeName, DateParser.formatISO8601Date( date ) );
+ }
+ }
+ if ( createdDateAttributeName != null )
+ {
+ Date date = document.getCreatedDate();
+ if ( date != null )
+ {
+ outputDoc.addField( createdDateAttributeName, DateParser.formatISO8601Date( date ) );
+ }
+
+ }
+ if ( indexedDateAttributeName != null )
+ {
+ Date date = document.getIndexingDate();
+ if ( date != null )
+ {
+ outputDoc.addField( indexedDateAttributeName, DateParser.formatISO8601Date( date ) );
+ }
+ }
+ if ( fileNameAttributeName != null )
+ {
+ String fileName = document.getFileName();
+ if ( fileName != null )
+ {
+ outputDoc.addField( fileNameAttributeName, fileName );
+ }
+ }
+ if ( mimeTypeAttributeName != null )
+ {
+ String mimeType = document.getMimeType();
+ if ( mimeType != null )
+ {
+ outputDoc.addField( mimeTypeAttributeName, mimeType );
+ }
+ }
+
+ Iterator<String> typeIterator = aclsMap.keySet().iterator();
+ while (typeIterator.hasNext())
+ {
+ String aclType = typeIterator.next();
+ writeACLsInSolrDoc(outputDoc,aclType,aclsMap.get(aclType),denyAclsMap.get(aclType));
+ }
+
+ // Write the arguments
+ for ( String name : arguments.keySet() )
+ {
+ List<String> values = arguments.get( name );
+ outputDoc.addField( name, values );
+ }
+
+ // Write the metadata, each in a field by itself
+ buildSolrParamsFromMetadata( outputDoc );
+
+ return outputDoc;
+ }
+
+ private void buildExtractUpdateHandlerRequest( long length, InputStream is, String contentType,
+ String contentName,
+ ContentStreamUpdateRequest contentStreamUpdateRequest )
+ throws IOException
+ {
+ ModifiableSolrParams out = new ModifiableSolrParams();
+
+ // Write the id field
+ writeField(out,LITERAL+idAttributeName,documentURI);
+ // Write the rest of the attributes
+ if (modifiedDateAttributeName != null)
+ {
+ Date date = document.getModifiedDate();
+ if (date != null)
+ // Write value
+ writeField(out,LITERAL+modifiedDateAttributeName,DateParser.formatISO8601Date(date));
+ }
+ if (createdDateAttributeName != null)
+ {
+ Date date = document.getCreatedDate();
+ if (date != null)
+ // Write value
+ writeField(out,LITERAL+createdDateAttributeName,DateParser.formatISO8601Date(date));
+ }
+ if (indexedDateAttributeName != null)
+ {
+ Date date = document.getIndexingDate();
+ if (date != null)
+ // Write value
+ writeField(out,LITERAL+indexedDateAttributeName,DateParser.formatISO8601Date(date));
+ }
+ if (fileNameAttributeName != null)
+ {
+ String fileName = document.getFileName();
+ if (fileName != null)
+ writeField(out,LITERAL+fileNameAttributeName,fileName);
+ }
+ if (mimeTypeAttributeName != null)
+ {
+ String mimeType = document.getMimeType();
+ if (mimeType != null)
+ writeField(out,LITERAL+mimeTypeAttributeName,mimeType);
+ }
+
+ // Write the access token information
+ // Both maps have the same keys.
+ Iterator<String> typeIterator = aclsMap.keySet().iterator();
+ while (typeIterator.hasNext())
+ {
+ String aclType = typeIterator.next();
+ writeACLs(out,aclType,aclsMap.get(aclType),denyAclsMap.get(aclType));
+ }
+
+ // Write the arguments
+ for (String name : arguments.keySet())
+ {
+ List<String> values = arguments.get(name);
+ writeField(out,name,values);
+ }
+
+ // Write the metadata, each in a field by itself
+ buildSolrParamsFromMetadata(out);
+
+ // These are unnecessary now in the case of non-solrcloud setups, because we overrode the SolrJ posting method to use multipart.
+ //writeField(out,LITERAL+"stream_size",String.valueOf(length));
+ //writeField(out,LITERAL+"stream_name",document.getFileName());
+
+ // General hint for Tika
+ if (document.getFileName() != null)
+ writeField(out,"resource.name",document.getFileName());
+
+ // Write the commitWithin parameter
+ if (commitWithin != null)
+ writeField(out,COMMITWITHIN_METADATA,commitWithin);
+
+ contentStreamUpdateRequest.setParams(out);
+
+ contentStreamUpdateRequest.addContentStream(new RepositoryDocumentStream(is,length,contentType,contentName));
+ }
+
/**
* builds the solr parameter maps for the update request.
* For each mapping expressed is applied the renaming for the metadata field name.
@@ -1044,6 +1154,34 @@ public class HttpPoster
}
}
+ private void buildSolrParamsFromMetadata(SolrInputDocument outputDocument) throws IOException
+ {
+ if (this.keepAllMetadata)
+ {
+ Iterator<String> iter = document.getFields();
+ while (iter.hasNext())
+ {
+ String fieldName = iter.next();
+ List<String> mappings = sourceTargets.get(fieldName);
+ if (mappings != null)
+ for (String newFieldName : mappings)
+ applySingleMapping(fieldName, outputDocument, newFieldName);
+ else // the fields not mentioned in the mapping are added only if we have set the keep all metadata=true.
+ applySingleMapping(fieldName, outputDocument, fieldName);
+ }
+ }
+ else
+ {
+ //don't keep all the metadata but only the ones in sourceTargets
+ for (String originalFieldName : sourceTargets.keySet())
+ {
+ List<String> mapping = sourceTargets.get(originalFieldName);
+ for (String newFieldName : mapping)
+ applySingleMapping(originalFieldName, outputDocument, newFieldName);
+ }
+ }
+ }
+
private void applySingleMapping(String originalFieldName, ModifiableSolrParams out, String newFieldName) throws IOException {
if(newFieldName != null && !newFieldName.isEmpty()) {
if (newFieldName.toLowerCase(Locale.ROOT).equals(idAttributeName.toLowerCase(Locale.ROOT))) {
@@ -1054,6 +1192,16 @@ public class HttpPoster
}
}
+ private void applySingleMapping(String originalFieldName, SolrInputDocument outputDocument, String newFieldName) throws IOException {
+ if(newFieldName != null && !newFieldName.isEmpty()) {
+ if (newFieldName.toLowerCase(Locale.ROOT).equals(idAttributeName.toLowerCase(Locale.ROOT))) {
+ newFieldName = ID_METADATA;
+ }
+ String[] values = document.getFieldAsStrings(originalFieldName);
+ outputDocument.addField( newFieldName, values );
+ }
+ }
+
public void finishUp()
throws InterruptedException, SolrServerException, IOException
{
Modified: manifoldcf/branches/CONNECTORS-981/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/SolrConfig.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-981/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/SolrConfig.java?rev=1605070&r1=1605069&r2=1605070&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-981/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/SolrConfig.java (original)
+++ manifoldcf/branches/CONNECTORS-981/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/SolrConfig.java Tue Jun 24 12:49:29 2014
@@ -132,4 +132,9 @@ public class SolrConfig
*/
public static final String NODE_KEEPMETADATA = "keepAllMetadata";
+ /**
+ * Node describing the use of Extract Update handler
+ */
+ public static final String NODE_EXTRACTUPDATE = "useExtractUpdateHandler";
+
}
Modified: manifoldcf/branches/CONNECTORS-981/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/SolrConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-981/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/SolrConnector.java?rev=1605070&r1=1605069&r2=1605070&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-981/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/SolrConnector.java (original)
+++ manifoldcf/branches/CONNECTORS-981/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/SolrConnector.java Tue Jun 24 12:49:29 2014
@@ -710,7 +710,7 @@ public class SolrConnector extends org.a
getSession();
// Now, go off and call the ingest API.
- if (poster.indexPost(documentURI,document,args,sourceTargets,keepAllMetadata,authorityNameString,activities))
+ if (poster.indexPost(documentURI,document,args,sourceTargets,keepAllMetadata,true,authorityNameString,activities))
return DOCUMENTSTATUS_ACCEPTED;
return DOCUMENTSTATUS_REJECTED;
}
Modified: manifoldcf/branches/CONNECTORS-981/connectors/solr/connector/src/main/native2ascii/org/apache/manifoldcf/agents/output/solr/common_en_US.properties
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-981/connectors/solr/connector/src/main/native2ascii/org/apache/manifoldcf/agents/output/solr/common_en_US.properties?rev=1605070&r1=1605069&r2=1605070&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-981/connectors/solr/connector/src/main/native2ascii/org/apache/manifoldcf/agents/output/solr/common_en_US.properties (original)
+++ manifoldcf/branches/CONNECTORS-981/connectors/solr/connector/src/main/native2ascii/org/apache/manifoldcf/agents/output/solr/common_en_US.properties Tue Jun 24 12:49:29 2014
@@ -15,6 +15,7 @@
SolrConnector.Arguments=Arguments
SolrConnector.KeepAllMetadata=Keep all metadata:
+SolrConnector.UseExtractUpdateHandler=Use the Extract Update Handler:
SolrConnector.Arguments2=Arguments:
SolrConnector.Commits=Commits
SolrConnector.Documents=Documents